diff --git a/.github/actions/install-deps/action.yml b/.github/actions/install-deps/action.yml
new file mode 100644
index 000000000..1f13767d9
--- /dev/null
+++ b/.github/actions/install-deps/action.yml
@@ -0,0 +1,31 @@
+name: "Install dependencies"
+description: "Installs dependencies on GitHub Actions runners"
+
+inputs:
+  os:
+    description: 'Runner OS'
+    required: true
+
+runs:
+  using: "composite"
+  steps:
+    - name: Verify ubuntu only
+      shell: bash
+      run: |
+        if ! echo ${{ inputs.os }} | grep -q "ubuntu"; then
+          echo "${{ inputs.os }} does not seem to be ubuntu"; exit 1
+        fi
+    - name: Assert requested os exists in dependencies
+      shell: bash
+      run: |
+        if ! jq -e ".\"${{ inputs.os }}\" != null" $GITHUB_ACTION_PATH/dependencies.json; then
+          echo "${{ inputs.os }} does not exist as a supported os in $GITHUB_ACTION_PATH/dependencies.json"; exit 1
+        fi
+    - name: Retrieve and install pkg deps based on OS
+      id: retrieve-pkg-deps
+      shell: bash
+      run: |
+        DEPENDENCIES=$(jq -r --arg os "${{ inputs.os }}" '.[$os] | .[]' $GITHUB_ACTION_PATH/dependencies.json)
+        echo $DEPENDENCIES
+        sudo apt update
+        sudo apt install -y $DEPENDENCIES
diff --git a/.github/actions/install-deps/dependencies.json b/.github/actions/install-deps/dependencies.json
new file mode 100644
index 000000000..2faab1678
--- /dev/null
+++ b/.github/actions/install-deps/dependencies.json
@@ -0,0 +1,12 @@
+{
+  "ubuntu-22.04": [
+    "software-properties-common",
+    "build-essential",
+    "python3.10-venv",
+    "libyaml-cpp-dev",
+    "libboost-all-dev",
+    "libsndfile1",
+    "libhwloc-dev",
+    "libzmq3-dev"
+  ]
+}
diff --git a/.github/workflows/build-artifacts.yml b/.github/workflows/build-artifacts.yml
new file mode 100644
index 000000000..5e52e7e8f
--- /dev/null
+++ b/.github/workflows/build-artifacts.yml
@@ -0,0 +1,24 @@
+name: Build artifacts
+
+on:
+  workflow_dispatch:
+  workflow_call:
+
+env:
+  PYTHON_VERSION: "python3.10"
+
+jobs:
+  build-artifacts:
+    strategy:
+      matrix:
+        arch: ["grayskull"]
+    runs-on: ubuntu-22.04
+    steps:
+      - uses: actions/checkout@v4
+      - uses: ./.github/actions/install-deps
+        with:
+          os: ubuntu-22.04
+      - name: Update submodule
+        run: git submodule update --init --recursive
+      - name: Build for ${{ matrix.arch }}
+        run: source env_for_silicon.sh
\ No newline at end of file
diff --git a/.github/workflows/post-commit-workflow.yml b/.github/workflows/post-commit-workflow.yml
new file mode 100644
index 000000000..ceb7d58da
--- /dev/null
+++ b/.github/workflows/post-commit-workflow.yml
@@ -0,0 +1,13 @@
+name: Post commit workflow
+
+on:
+  workflow_dispatch:
+  workflow_call:
+  push:
+    branches:
+      - main
+
+jobs:
+  build-artifacts:
+    uses: ./.github/workflows/build-artifacts.yml
+    secrets: inherit
\ No newline at end of file
diff --git a/.github/workflows/pull-request-workflow.yml b/.github/workflows/pull-request-workflow.yml
new file mode 100644
index 000000000..c5fbe7958
--- /dev/null
+++ b/.github/workflows/pull-request-workflow.yml
@@ -0,0 +1,13 @@
+name: Pull request workflow
+
+on:
+  workflow_dispatch:
+  workflow_call:
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  build-artifacts:
+    uses: ./.github/workflows/build-artifacts.yml
+    secrets: inherit
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 690531c0f..4dff4f040 100644
--- a/.gitignore
+++ b/.gitignore
@@ -48,132 +48,8 @@ bp_data
 third_party/llvm
 device_images
 generated_modules
+build_deps/
 
 # ClangD
 compile_commands.json
 
 # Exclude LFS files to keep the public repo small
-third_party/budabackend/common_lib/libboost_filesystem.so.1.65.1
-third_party/budabackend/common_lib/libboost_regex.so.1.65.1 -third_party/budabackend/common_lib/libboost_system.so.1.65.1 -third_party/budabackend/common_lib/libboost_thread.so.1.65.1 -third_party/budabackend/common_lib/libicudata.so.60 -third_party/budabackend/common_lib/libicui18n.so.60 -third_party/budabackend/common_lib/libicuuc.so.60 -third_party/budabackend/common_lib/libsqlite3.so.0 -third_party/budabackend/common_lib/libz.so.1 -third_party/budabackend/dbd/docs-md/images/debuda-buda.png -third_party/budabackend/dbd/docs-md/images/debuda-debuda-server.png -third_party/budabackend/dbd/docs-md/images/debuda-export-db.png -third_party/budabackend/dbd/docs-md/images/debuda-speed-dial.png -third_party/budabackend/dbd/docs-md/images/debuda-start-and-repl.png -third_party/budabackend/dbd/docs-md/images/debuda.png -third_party/budabackend/dbd/docs-md/images/tenstorrent-pdf-titlepage.png -third_party/budabackend/device/bin/silicon/grayskull/clkctl -third_party/budabackend/device/bin/silicon/grayskull/init -third_party/budabackend/device/bin/silicon/grayskull/pm_en.ttx -third_party/budabackend/device/bin/silicon/grayskull/setup_interface -third_party/budabackend/device/bin/silicon/grayskull/tt-script -third_party/budabackend/device/bin/silicon/reset-chip -third_party/budabackend/device/bin/silicon/tensix-reset -third_party/budabackend/device/bin/silicon/wormhole/boot -third_party/budabackend/device/bin/silicon/wormhole/create-ethernet-map -third_party/budabackend/device/bin/silicon/wormhole/flash-spi -third_party/budabackend/device/bin/silicon/wormhole/get-wormhole-interfaces -third_party/budabackend/device/bin/silicon/wormhole/imx8-reset -third_party/budabackend/device/bin/silicon/wormhole/load-eth-fw -third_party/budabackend/device/bin/silicon/wormhole/noc-overlay-dump -third_party/budabackend/device/bin/silicon/wormhole/setup-interface -third_party/budabackend/device/bin/silicon/wormhole/temp -third_party/budabackend/device/bin/silicon/wormhole/tt-flash -third_party/budabackend/device/bin/silicon/wormhole/tt-script -third_party/budabackend/docs/public/images/1.png -third_party/budabackend/docs/public/images/10.png -third_party/budabackend/docs/public/images/perf_ui_candlestick.png -third_party/budabackend/docs/public/images/11.png -third_party/budabackend/docs/public/images/perf_ui_wft.png -third_party/budabackend/docs/public/images/12.png -third_party/budabackend/docs/public/images/13.png -third_party/budabackend/docs/public/images/14.png -third_party/budabackend/docs/public/images/16.png -third_party/budabackend/docs/public/images/17.png -third_party/budabackend/docs/public/images/18.png -third_party/budabackend/docs/public/images/2.png -third_party/budabackend/docs/public/images/20.png -third_party/budabackend/docs/public/images/21.png -third_party/budabackend/docs/public/images/22.png -third_party/budabackend/docs/public/images/23.png -third_party/budabackend/docs/public/images/24.png -third_party/budabackend/docs/public/images/25.png -third_party/budabackend/docs/public/images/26.png -third_party/budabackend/docs/public/images/27.png -third_party/budabackend/docs/public/images/28.png -third_party/budabackend/docs/public/images/29.png -third_party/budabackend/docs/public/images/3.png -third_party/budabackend/docs/public/images/30.png -third_party/budabackend/docs/public/images/31.png -third_party/budabackend/docs/public/images/32.png -third_party/budabackend/docs/public/images/33.png -third_party/budabackend/docs/public/images/34.png -third_party/budabackend/docs/public/images/35.png 
-third_party/budabackend/docs/public/images/36.png -third_party/budabackend/docs/public/images/37.png -third_party/budabackend/docs/public/images/4.png -third_party/budabackend/docs/public/images/5.png -third_party/budabackend/docs/public/images/6.png -third_party/budabackend/docs/public/images/7.png -third_party/budabackend/docs/public/images/perf_ui_local_select.png -third_party/budabackend/docs/public/images/8.png -third_party/budabackend/docs/public/images/9.png -third_party/budabackend/docs/public/images/Screen_Shot_2023-05-24_at_11.56.12_PM.png -third_party/budabackend/docs/public/images/bfp-efficiency.png -third_party/budabackend/docs/public/images/bfp2_blocks.png -third_party/budabackend/docs/public/images/bfp4_blocks.png -third_party/budabackend/docs/public/images/bfp8_blocks.png -third_party/budabackend/docs/public/images/cropped-favicon-32x32.png -third_party/budabackend/docs/public/images/data-formats.png -third_party/budabackend/docs/public/images/logo.png -third_party/budabackend/docs/public/images/perf_ui_device.png -third_party/budabackend/docs/public/images/perf_ui_diff.png -third_party/budabackend/docs/public/images/perf_ui_front_page.png -third_party/budabackend/docs/public/images/perf_ui_host.png -third_party/budabackend/docs/public/images/perf_ui_inputs.png -third_party/budabackend/docs/public/images/perf_ui_select_test.png -third_party/budabackend/docs/public/images/perf_ui_select_workspace.png -third_party/budabackend/docs/public/images/tt_logo.png -third_party/budabackend/docs/public/images/tt_logo.svg -third_party/budabackend/docs/public/images/unpack_math_pack.png -third_party/budabackend/loader/tests/reference_tensor_conv_bin/expected_tensor_for_conv_bfp8_s2.bin -third_party/budabackend/loader/tests/reference_tensor_conv_bin/expected_tensor_for_conv_fp16b_s2.bin -third_party/budabackend/loader/tests/reference_tensor_conv_bin/expected_tensor_for_conv_fp32_s2.bin -third_party/budabackend/loader/tests/reference_tensor_conv_bin_mt/input_tensor_for_conv_fp16b.0.bin -third_party/budabackend/loader/tests/reference_tensor_conv_bin_mt/input_tensor_for_conv_fp16b_s1.0.bin -third_party/budabackend/loader/tests/reference_tensor_conv_bin_mt/input_tensor_for_conv_fp32.0.bin -third_party/budabackend/perf_lib/graph_tests/grayskull/inference/bert_large_hifi3_fp16b.yaml -third_party/budabackend/perf_lib/graph_tests/grayskull/inference/bert_large_lofi_bfp8b.yaml -third_party/budabackend/verif/graph_tests/netlists/pregenerated/group_1.zip -third_party/budabackend/verif/graph_tests/netlists/pregenerated/group_2.zip -third_party/budabackend/verif/graph_tests/netlists/pregenerated/group_3.zip -third_party/budabackend/verif/graph_tests/netlists/pregenerated/group_4.zip -third_party/budabackend/verif/graph_tests/netlists/pregenerated/group_5.zip -third_party/budabackend/verif/graph_tests/netlists/pregenerated/group_6.zip -third_party/budabackend/verif/graph_tests/netlists/pregenerated/group_7.zip -third_party/budabackend/verif/graph_tests/netlists/t5_large_wormhole_b0.yaml -third_party/budabackend/verif/graph_tests/netlists/wormhole_b0/netlist_bert_12x_encoder_1x_whb0_base_training.yaml -third_party/budabackend/verif/graph_tests/netlists/wormhole_b0/netlist_bert_24x_encoder_1x_whb0_large_training.yaml -third_party/budabackend/verif/multichip_tests/wh_multichip/large_cluster/falcon_60l_8chip.yaml -third_party/budabackend/verif/multichip_tests/wh_multichip/large_cluster/midbloom_inference_12chip_lab68.yaml 
-third_party/budabackend/verif/multichip_tests/wh_multichip/large_cluster/midbloom_inference_32chip_lab78.no_snakeplace.yaml -third_party/budabackend/verif/multichip_tests/wh_multichip/large_cluster/midbloom_inference_32chip_lab78.yaml -third_party/budabackend/verif/multichip_tests/wh_multichip/large_cluster/midbloom_inference_4chip_jb11.yaml -third_party/budabackend/verif/multichip_tests/wh_multichip/large_cluster/midbloom_inference_8chip_jb11.yaml -third_party/budabackend/verif/multichip_tests/wh_multichip/netlist_bert_concurrent_24x_encoder_2x_wh_large_inference.yaml -third_party/budabackend/verif/pipegen_tests/netlists/grayskull/nightly/baseline.zip -third_party/budabackend/verif/pipegen_tests/netlists/grayskull/push/baseline.zip -third_party/budabackend/verif/pipegen_tests/netlists/wormhole_b0/nightly/baseline.zip -third_party/budabackend/verif/pipegen_tests/netlists/wormhole_b0/push/baseline.zip -third_party/budabackend/verif/template_netlist/netlists/multi_tm_tests/weekly/wormhole_b0/test_dram_input_matmul_3tms_and_reblock.zip -third_party/budabackend/verif/template_netlist/netlists/test_datacopy_matmul_2tms_and_reblock_pregenerated.zip -third_party/budabackend/verif/template_netlist/netlists/test_datacopy_matmul_3tms_and_reblock_000_force_grayskull.zip -third_party/budabackend/verif/template_netlist/netlists/test_dram_input_matmul_3tms_and_reblock_grayskull.zip -third_party/budabackend/verif/tm_tests/directed/bert_large_inference_hifi.yaml -third_party/budabackend/verif/tm_tests/directed/packer_mcast.tar.gz diff --git a/.gitlab-ci.perf.yml b/.gitlab-ci.perf.yml new file mode 100644 index 000000000..d683dc410 --- /dev/null +++ b/.gitlab-ci.perf.yml @@ -0,0 +1,44 @@ +include: + - .gitlab-ci.wheels.yml + + # PyBuda repo, Grayskull e150 + - ci/gitlab-test-lists/.gitlab-ci.grayskull_e150_perf_bfp8_b_nightly.yml + - ci/gitlab-test-lists/.gitlab-ci.grayskull_e150_perf_fp16_nightly.yml + - ci/gitlab-test-lists/.gitlab-ci.grayskull_e150_perf_release_nightly.yml + + # PyBuda repo, Grayskull e75 + - ci/gitlab-test-lists/.gitlab-ci.grayskull_e75_perf_bfp8_b_nightly.yml + - ci/gitlab-test-lists/.gitlab-ci.grayskull_e75_perf_fp16_nightly.yml + - ci/gitlab-test-lists/.gitlab-ci.grayskull_e75_perf_release_nightly.yml + + # PyBuda repo, Wormhole B0 + - ci/gitlab-test-lists/.gitlab-ci.wormhole_b0_silicon_perf_bfp8_b_nightly.yml + - ci/gitlab-test-lists/.gitlab-ci.wormhole_b0_silicon_perf_fp16_nightly.yml + - ci/gitlab-test-lists/.gitlab-ci.wormhole_b0_silicon_perf_release_nightly.yml + + - ci/gitlab-test-lists/.gitlab-ci.wormhole_b0_silicon_perf_bfp8_b_manual.yml + - ci/gitlab-test-lists/.gitlab-ci.wormhole_b0_silicon_perf_fp16_manual.yml + - ci/gitlab-test-lists/.gitlab-ci.wormhole_b0_silicon_perf_release_manual.yml + + # Benchmarking repo + # Grayskull e75, Grayskull e150, Wormhole B0 + - ci/gitlab-test-lists/benchmarking/.gitlab-ci.wormhole_b0_silicon_perf_release_public.yml + - ci/gitlab-test-lists/benchmarking/.gitlab-ci.grayskull_e75_perf_release_public.yml + - ci/gitlab-test-lists/benchmarking/.gitlab-ci.grayskull_e150_perf_release_public.yml + +# Dissable other jobs from .gitlab-ci.wheels.yml +pybuda-gs-latest-bbe-wheel: + rules: + - if: ($CI_PIPELINE_SOURCE == "push" && $CI_COMMIT_MESSAGE !~ /\[no_ci_perf/) + +pybuda-wh-b0-latest-bbe-wheel: + rules: + - if: ($CI_PIPELINE_SOURCE == "push" && $CI_COMMIT_MESSAGE !~ /\[no_ci_perf/) + +pybuda-gs-unittests: + rules: + - if: ($CI_PIPELINE_SOURCE == "push" && $CI_COMMIT_MESSAGE !~ /\[no_ci_perf/) + +pybuda-wh-b0-unittests: + rules: + - if: 
($CI_PIPELINE_SOURCE == "push" && $CI_COMMIT_MESSAGE !~ /\[no_ci_perf/) \ No newline at end of file diff --git a/.gitmodules b/.gitmodules index 58e4da86d..8b0954ebb 100644 --- a/.gitmodules +++ b/.gitmodules @@ -1,12 +1,13 @@ [submodule "third_party/tvm"] path = third_party/tvm - url = ../tt-tvm.git -[submodule "third_party/budabackend"] - path = third_party/budabackend - url = ../tt-budabackend.git + url = https://github.com/tenstorrent/tt-tvm [submodule "third_party/pybind11"] path = third_party/pybind11 - url = https://github.com/pybind/pybind11 -[submodule "third_party/public-tt-buda"] - path = third_party/public-tt-buda - url = ../tt-buda-demos.git + url = https://github.com/pybind/pybind11.git +[submodule "third_party/buda-model-demos"] + path = third_party/buda-model-demos + url = https://github.com/tenstorrent/tt-buda-demos + +[submodule "third_party/tt-mlir"] + path = third_party/tt-mlir + url = git@github.com:tenstorrent/tt-mlir.git diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 000000000..f12eafca7 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,51 @@ +cmake_minimum_required(VERSION 3.20.0) +project(pybuda LANGUAGES CXX) + +find_program(CLANG_17 clang++-17) +find_program(CLANG clang) +if(CLANG_17 AND CLANG) + message(STATUS "Found Clang-17 here: ${CLANG_17}") + set(CMAKE_CXX_COMPILER "${CLANG_17}") + set(CMAKE_C_COMPILER "${CLANG}") +else() + message(WARNING "Clang++-17 or clang not found!!!") +endif() + +set(CMAKE_CXX_STANDARD 17) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) + +list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake") +include(Utils) + +check_required_env_var(PYBUDA_PYTHON_VERSION) +check_required_env_var(PYBUDA_TOOLCHAIN_DIR) +check_required_env_var(PYBUDA_VENV_DIR) + +set(PYBUDA_PYTHON_VERSION $ENV{PYBUDA_PYTHON_VERSION}) +set(PYBUDA_TOOLCHAIN_DIR $ENV{PYBUDA_TOOLCHAIN_DIR}) +set(PYBUDA_VENV_DIR $ENV{PYBUDA_VENV_DIR}) + +find_package(Python COMPONENTS REQUIRED Interpreter Development) + +set(TTMLIR_TOOLCHAIN_DIR $ENV{TTMLIR_TOOLCHAIN_DIR}) +set(TTMLIR_VENV_DIR $ENV{TTMLIR_VENV_DIR}) + +if (NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE Release) +endif() + +add_compile_options(-Wall -Wextra -Wpedantic -Werror -Wno-unused-parameter) + +set(PYBUDA_CSRC_WARNINGS -Wall -Wextra -Wno-pragmas -Wno-unused-parameter) +set(CFLAGS_NO_WARN -DFMT_HEADER_ONLY) +set(PYBUDA_CSRC_CFLAGS ${CFLAGS_NO_WARN} ${PYBUDA_CSRC_WARNINGS} -DUTILS_LOGGER_PYTHON_OSTREAM_REDIRECT=1) + +set(CONFIG_LDFLAGS "") + +set(STATIC_LIB_FLAGS -fPIC) +set(SHARED_LIB_FLAGS -fPIC) + +add_subdirectory(third_party) +add_subdirectory(pybuda) diff --git a/Makefile b/Makefile index 757d5bd1d..79a5e78be 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,14 @@ .SUFFIXES: +OS ?= $(shell uname) + +ifeq ($(OS),Linux) MAKEFLAGS := --jobs=$(shell nproc) --output-sync=target +else ifeq ($(OS),Darwin) +MAKEFLAGS := --jobs=$(shell sysctl -n hw.physicalcpu) --output-sync=target +else +$(error "Unknown OS: $(OS)") +endif # Setup CONFIG, DEVICE_RUNNER, and out/build dirs first CONFIG ?= assert @@ -12,9 +20,11 @@ PREFIX ?= $(OUT) CONFIG_CFLAGS = CONFIG_LDFLAGS = +CONFIG_CXXFLAGS = ifeq ($(CONFIG), release) CONFIG_CFLAGS += -O3 +CONFIG_CXXFLAGS = -fvisibility-inlines-hidden else ifeq ($(CONFIG), ci) # significantly smaller artifacts CONFIG_CFLAGS += -O3 -DDEBUG -Werror else ifeq ($(CONFIG), assert) @@ -46,16 +56,21 @@ TESTDIR = $(OUT)/test DOCSDIR = $(OUT)/docs SUBMODULESDIR = $(OUT)/submodules +# Python version +PYTHON_VERSION ?= python3.8 
+PYTHON_INCLUDES = $(shell $(PYTHON_VERSION)-config --includes) +PYTHON_LDFLAGS = $(shell $(PYTHON_VERSION)-config --ldflags) + # Top level flags, compiler, defines etc. #WARNINGS ?= -Wall -Wextra WARNINGS ?= -Wdelete-non-virtual-dtor -Wreturn-type -Wswitch -Wuninitialized -Wno-unused-parameter -CC ?= gcc -CXX ?= g++ -CFLAGS_NO_WARN ?= -MMD -I. $(CONFIG_CFLAGS) -mavx2 -DBUILD_DIR=\"$(OUT)\" -I$(INCDIR) -DFMT_HEADER_ONLY -Ithird_party/fmt -Ithird_party/pybind11/include +CC ?= clang +CXX ?= clang++ +CFLAGS_NO_WARN ?= -MMD -I. $(CONFIG_CFLAGS) -DBUILD_DIR=\"$(OUT)\" -I$(INCDIR) -DFMT_HEADER_ONLY -Ithird_party/fmt -Ithird_party/pybind11/include $(PYTHON_INCLUDES) CFLAGS ?= $(CFLAGS_NO_WARN) $(WARNINGS) -CXXFLAGS ?= --std=c++17 -fvisibility-inlines-hidden -maes -mavx -LDFLAGS ?= $(CONFIG_LDFLAGS) -Wl,-rpath,$(PREFIX)/lib -L$(LIBDIR) -Ldevice/lib +CXXFLAGS ?= --std=c++17 $(CONFIG_CXXFLAGS) +LDFLAGS ?= $(CONFIG_LDFLAGS) -Wl,-rpath,$(PREFIX)/lib -L$(LIBDIR) -v SHARED_LIB_FLAGS = -shared -fPIC STATIC_LIB_FLAGS = -fPIC ifeq ($(findstring clang,$(CC)),clang) @@ -66,11 +81,16 @@ WARNINGS += -Wmaybe-uninitialized LDFLAGS += -lstdc++ endif GIT_COMMON_DIR=$(shell git rev-parse --git-common-dir) -SUBMODULES=$(wildcard $(GIT_COMMON_DIR)/modules/third_party/*) -SUBMODULES_UPDATED=$(addprefix $(SUBMODULESDIR)/, $(SUBMODULES:$(GIT_COMMON_DIR)/modules/%=%.checkout)) +SUBMODULES=$(shell git submodule status | grep -o "third_party/[^ ]*") +SUBMODULES_UPDATED=$(addprefix $(SUBMODULESDIR)/, $(SUBMODULES:%=%.checkout)) SKIP_BBE_UPDATE ?= 0 SKIP_SUBMODULE_UPDATE ?= $(SKIP_BBE_UPDATE) +ifeq ($(EMULATION_DEVICE_EN), 1) + TENSIX_EMULATION_ZEBU = $(TENSIX_EMULATION_ROOT)/zebu + TENSIX_EMULATION_ZCUI_WORK = $(TENSIX_EMULATION_ROOT)/targets/tensix_2x2_1dram_BH/zcui.work +endif + all: update_submodules build ; # These must be in dependency order (enforces no circular deps) @@ -78,34 +98,45 @@ include python_env/module.mk include pybuda/module.mk include docs/public/module.mk -update_submodules: $(SUBMODULES_UPDATED) ; +update_submodules: $(SUBMODULES_UPDATED) emulation_device_links ; -$(SUBMODULESDIR)/%.checkout: $(SUBMODULESDIR)/% +$(SUBMODULESDIR)/%.checkout: @mkdir -p $(dir $@) +ifeq ($(SKIP_SUBMODULE_UPDATE), 0) + git submodule update --init --recursive $(@:$(SUBMODULESDIR)/%.checkout=%) + #git -C $(@:$(SUBMODULESDIR)/%.checkout=%) submodule foreach --recursive git lfs install || true + #git -C $(@:$(SUBMODULESDIR)/%.checkout=%) submodule foreach --recursive git lfs pull + #git -C $(@:$(SUBMODULESDIR)/%.checkout=%) submodule foreach --recursive git lfs checkout HEAD +endif touch $@ -$(SUBMODULESDIR)/%: $(GIT_COMMON_DIR)/modules/%/HEAD - @mkdir -p $(dir $@) -ifeq ($(SKIP_SUBMODULE_UPDATE), 0) - git submodule update --init --recursive $(@:$(SUBMODULESDIR)/%=%) - git -C $(@:$(SUBMODULESDIR)/%=%) submodule foreach --recursive git lfs install || true - git -C $(@:$(SUBMODULESDIR)/%=%) submodule foreach --recursive git lfs pull - git -C $(@:$(SUBMODULESDIR)/%=%) submodule foreach --recursive git lfs checkout HEAD +emulation_device_links: +ifeq ($(EMULATION_DEVICE_EN), 1) + @echo "Linking and copying emulation device files..." + ln -sf $(TENSIX_EMULATION_ZCUI_WORK) $(OUT)/. 
+ ln -sf $(TENSIX_EMULATION_ZCUI_WORK) ../ + cp -f $(TENSIX_EMULATION_ZEBU)/scripts/designFeatures ./ endif - touch -r $^ $@ - touch $@.checkout build: pybuda third_party/tvm ; third_party/tvm: $(SUBMODULESDIR)/third_party/tvm.build ; -$(SUBMODULESDIR)/third_party/tvm.build: python_env $(SUBMODULESDIR)/third_party/tvm.checkout - bash -c "source $(PYTHON_ENV)/bin/activate && ./third_party/tvm/install.sh" +$(SUBMODULESDIR)/third_party/tvm.build: $(PYTHON_ENV) $(SUBMODULESDIR)/third_party/tvm.checkout + bash -c "source $(PYTHON_ENV_ROOT)/bin/activate && ./third_party/tvm/install.sh" touch $@ -clean: third_party/budabackend/clean +build_tests: pybuda/csrc/graph_lib/tests pybuda/csrc/passes/tests ; + +run_tests: build_tests + @echo "Running tests..." + $(TESTDIR)/pybuda/csrc/graph_lib/tests/graphlib_unit_tests + $(TESTDIR)/pybuda/csrc/passes/tests/passes_unit_tests + +clean: rm -rf $(OUT) rm -rf third_party/tvm/build + rm -rf build_deps/ clean_no_python: find $(OUT)/ -maxdepth 1 -mindepth 1 -type d -not -name 'python_env' -print0 | xargs -0 -I {} rm -Rf {} @@ -136,7 +167,7 @@ build_tvm: third_party/tvm ; .PHONY: stubs stubs: pip install mypy - stubgen -m pybuda._C -m pybuda._C.autograd -m pybuda._C.balancer -m pybuda._C.graph -m pybuda._C.backend_api -m pybuda._C.pattern_matcher -m pybuda._C.scheduler -m pybuda._C.torch_device -o pybuda + stubgen -m pybuda._C -m pybuda._C.autograd -m pybuda._C.graph -m pybuda._C.torch_device -o pybuda # Cleaning PyBuda and BBE artifacts .PHONY: clean_tt diff --git a/README.debug.md b/README.debug.md deleted file mode 100644 index 73e040fed..000000000 --- a/README.debug.md +++ /dev/null @@ -1,115 +0,0 @@ - -*PyBuda monitors many environment variables to modify default behavior. These can be used to debug or analyze problems.* - -## Overrides - * PYBUDA\_BUILD\_DIR: Override the build directory for the compiler. - * LOGURU\_LEVEL: set Python logger level - default is DEBUG, valid values are INFO, DEBUG, TRACE, NONE - * LOGGER\_LEVEL: set C++ logger level - values are the same as Python logger - * PYBUDA\_DEVMODE: set to make Golden/Sequential default run mode if one isn't specified explicitly - * PYBUDA\_PROFILE: enable Python profiler - * PYBUDA\_ASSERT\_UNSUPPORTED\_HW\_OP: assert if an unsupported op is found - * PYBUDA\_BALANCER\_PLACER\_DATA: prints balancer/placer visual info, prints chip op group info - * PYBUDA\_BALANCER\_POLICY\_TYPE: override balancer policy - * PYBUDA\_SCHEDULER\_POLICY: override scheduler policy - * PYBUDA\_BALANCER\_ONE\_ROW: limit placement to one row - * PYBUDA\_ENABLE\_T\_STREAMING: enable t-streaming (i.e. streaming ops with small output buffers) - * PYBUDA\_ENABLE\_TVM\_CACHE: Cache tvm graphs instead of re-compiling - * PYBUDA\_FORCE\_FULL\_COMPILE\_DEPTH: Force each test to run to compile depth "FULL" - * PYBUDA\_RELOAD\_GENERATED\_MODULES: Reload previously generated modules instead of recompiling through tvm. - * PYBUDA\_SKIP\_L1\_USAGE\_VALIDATION: allows ops to use more L1 than available - * PYBUDA\_ENFORCE\_SAME\_UBLOCK\_OPERANDS: ??? - * PYBUDA\_VERIFY\_NET2PIPE: verify produced netlist using net2pipe - * PYBUDA\_CI\_DIR: ??? - * PYTEST\_CURRENT\_TEST: ??? 
- * PYBUDA\_CI\_CAPTURE\_TENSORS: save tensors used in the test so they can be used in stand-alone back-end tests - * PYBUDA\_FORCE\_SEQUENTIAL: override test/script to run everything in sequential mode - * PYBUDA\_TRACE\_SHUTDOWN: show stack trace on shutdown due to error - * PYBUDA\_OVERRIDE\_NUM\_CHIPS: force the number of chips to use, instead of the auto-detected number - * PYBUDA\_DISABLE\_DYNAMIC\_DRAM: disable dynamic allocation of e2e queues in inference - * PYBUDA\_DISABLE\_FORK\_JOIN\_BUF: disable fork-join buffering - * PYBUDA\_DISABLE\_FORK\_JOIN\_NOPS: don't insert nops if there's not enough buffering. Just add what's available in L1. This should only be used for debug. - * PYBUDA\_FORK\_JOIN\_DEBUG\_INFO: print debug logs related to fork-join buffering - * PYBUDA\_FORK\_JOIN\_DEBUG\_FORK\_NAME: filter debug logs (generated by PYBUDA\_FORK\_JOIN\_DEBUG\_INFO) by fork node name - * PYBUDA\_FORK\_JOIN\_DEBUG\_JOIN\_NAME: filter debug logs (generated by PYBUDA\_FORK\_JOIN\_DEBUG\_INFO) by join node name - * PYBUDA\_FORK\_JOIN\_SKIP\_EXPANDING\_BUFFERS: don't expand buffers in L1 - this will cause algorithm to add buffering nops/queues any time a fork-join needs to be buffered. - * PYBUDA\_FORK\_JOIN\_EXPAND\_OUTPUT\_BUFFERS: expand only output buffers (instead of input buffers) for fork-join buffering - * GOLDEN\_WORMHOLE: run Golden with Wormhole as target device instead of Grayskull (default) - * SHOW\_ALL\_FAILS: don't assert on the first data mismatch, but show all fails before failing the test - * PYBUDA\_EXP\_APPROX: run exp in approximate mode - * PYBUDA\_VERIFY\_RESULTS\_OFF\_BY\_DEFAULT: disable result verification (tensor comparison of processed and golden module done via forward pass) - * PYBUDA\_ENABLE\_STABLE\_SOFTMAX: enable stable Softmax (disabled by default) - * EVAL\_DEBUG: prints inputs/outputs during module evaluation - * TT\_BACKEND\_GOLDEN\_QUANTIZE: ??? - * PYBUDA\_RESET\_DEV\_BEFORE\_TEST: resets device between tests (pytest must be called with --forked in order to work) - * PYBUDA\_PERF\_SIMULATOR: run performance simulator to estimate performance of the model - * PYBUDA\_PERF\_SIMULATOR\_LOG: dump log of all events in perf simulator (will slow down the run) - * PYBUDA\_PERF\_SIMULATOR\_TRACE: create trace file to be loaded into routeagui - * PYBUDA\_OP\_PERF: dump op\_perf.csv file with op grid choices and estimated cycle counts - * PYBUDA\_BENCHMARK\_NO\_RESET\_ON\_ERROR: from the comments seems that it doesn't work, should we remove this one? - * PYBUDA\_SKIP\_BACKEND\_COMPILE: configure backend device to run in DeviceMode.RunOnly, picking up build binaries from previous run - * PYBUDA\_PLACER\_BWD\_GROUPS: use bwd groups when placing so that fwd and bwd ops are placed together - * PYBUDA\_TRIPLET\_PLACEMENT: try to place bwd groups in "triplet" placement strategy - * PYBUDA\_EXP\_APPROX: force exp and exponent in gelu\_derivative to run in approximate mode (i.e. faster, but less accurate) - * PYBUDA\_AMP\_LEVEL: configure the AMP (Automatic Mixed Precision) optimization level. 
- * PYBUDA\_NO\_FUSE\_MATMUL\_BIAS: disable fusing of matmul+add into matmul - * PYBUDA\_ENABLE\_OUTPUT\_QUEUES\_ON\_HOST: configures whether whether output queues are placed on HOST (default: true) - * PYBUDA\_FORCE\_VERIFY\_ALL: ensure that verification is run after each compile stage, overrides VerifyCondig.disabled() - * PYBUDA\_VERIFY\_POST\_AUTOGRAD\_PASSES: verify graph after post autograd passes, unless the verify config is VeifyConfig.disabled() - * PYBUDA\_VERIFY\_POST\_PLACER: verify graph after post placer pass, unless the verify config is VeifyConfig.disabled() - * PYBUDA\_GALAXY\_LINEAR\_ROUTE: place graphs sequentially in a snake route around the Galaxy modules - * PYBUDA\_NEBULA\_GALAXY\_PLACER: only place output nop on mmio chip for untilizing - * PYBUDA\_ENABLE\_AUTO\_TRANSPOSE: configures whether auto-transpose is enabled while op placement (default: false) - * PYBUDA\_MINIMIZE\_REMOTE\_DRAM\_QUEUES: configures behaviour for data forking to remote chips - create single e2e queue on producer or e2e queue per consumer chip (default) - * PYBUDA\_SPARSE\_MM\_ENCODING\_ESTIMATES\_OFF: when on, turns off estimation logic for in0/in2 for sparse mm, but gets slower - * PYBUDA\_REBLOCK\_INPUT\_ACT: when enabled, we reblock input activations to the smallest grid across all users instead of forcing 1x1. (default: disabled) - * PYBUDA\_DUMP\_MIXED\_PRECISION: when on, dump json with a per-op info about fidelity, data-formats (default: off). Default directory: reportify dump directory. - * PYBUDA\_FRACTURIZATION\_DISABLE: disables kernel fracturing for convolutions - * PYBUDA\_PRESTRIDE\_DISABLE: disables prestriding transform for convs - * PYBUDA\_LEGALIZER\_DETAILED\_DEBUGGING: when on provides detailed debugging information and statistics about legalizer OpModel selection process including GraphSolver. Works only in DEBUG(default: off). - * PYBUDA\_LEGALIZER\_DEBUG\_NODE\_NAME: used together with legalizer detailed debugging to narrow down debugging info to single node. Works only in DEBUG(default: off). - * PYBUDA\_GRAPHSOLVER\_SELF\_CUT\_TYPE: Override for graph_solver_self_cut_type in BalancerConfig. Valid values: None, ConsumerOperandDataEdgesFirst, ProducerUserDataEdgesFirst, FastCut. When switched on(not None) graphsolver will cut edges for which it cannot produce valid paths. (default: None) - * PYBUDA\_MAX\_GRAPH\_CUT\_RETRY: Override for default_resolve_retry_count_self_cutting in GraphSolver::resolve. This sets the max retry step if GraphSolver self cut is turned on. - * PYBUDA\_REPLACE\_INF\_IN\_TVM\_PARAMS: Replace -inf and inf values from TVM parameters during PyBuda code generation. - * PYBUDA\_FORCE\_ALLOW\_FRACTURING: All convs will be candidates for fracturing - this does NOT mean that all convs will fracture (search for is_kernel_fracturing_candidate) - * PYBUDA\_DISABLE\_FUSE\_TAGS: Specify a list of ops (comma delimited) by original_op_type/op_type that will be exempt from fusion (e.g. PYBUDA\_DISABLE\_FUSE\_TAGS="reciprocal,softmax"). - * PYBUDA\_SINGLE\_OP\_EPOCHS: Place every single op on a new epoch. - * PYBUDA\_FORK\_JOIN\_BUF\_QUEUES: Turn on adding buffering queues instead of nops in fork joins that need a lot of buffering (have one path much larger than the other). - * PYBUDA\_RESNET\_BUFF\_QUEUE\_OVERRIDE: Turn off adding buffering queues in graph solver cut. Temporal fix for ResNet perf. - * PYBUDA\_OVERRIDE\_DEVICE\_YAML: Override the soc device descriptor to compile against different device configurations. 
- * PYBUDA\_DISABLE\_INTERACTIVE\_PLACER: Override balancer policy not to use Interactive placer and to fallback to legacy placer instead. (default: 0/False) - * PYBUDA\_DISABLE\_INTERACTIVE\_FJ\_BUFFERING: Override balancer policy not to use inlined fork-join buffering. (default: 0/False) - * PYBUDA\_DISABLE\_PADDING\_PASS\: Disable running of padding pass. - * PYBUDA\_PADDING\_PASS\_ELEMENT\_WISE: In padding pass pad elementwise ops. - * PYBUDA\_PADDING\_PASS\_MATMUL: In padding pass pad matmul ops. - * PYBUDA\_PADDING\_PASS\_SPARSE\_MATMUL: In padding pass pad sparse matmul ops. Needs to have matmul ops enabled for padding too in order to enable this. - * PYBUDA\_PADDING\_PASS\_BUFFER\_QUEUE": Enable padding pass, insert buffer queue - * PYBUDA\_ENABLE\_STOCHASTIC\_ROUNDING": Enable stochastic rounding for all supported ops. - * PYBUDA\_PADDING\_PASS\_CONCAT": Enable padding pass, for concatenate operation - * PYBUDA\_FORCE\_CONV\_MULTI\_OP\_FRACTURE: Forces all convs to be fractured (during decompose pass) according to heuristic defined in `pybuda/pybuda/op/eval/pybuda/convolution.py`. - * PYBUDA\_COLLECT\_CONSTRAINT\_INFO: Enables constraint info collection on every graphsolver resolve. - * PYBUDA\_GRAPHSOLVER\_FAST: Enables partial re-resolve on cut and buffer, much faster at cost of not enabling all possible valid OpModels. - * NUM\_EXEC\_LOOP\_ITERATIONS: For single temporal epoch tests, you can specify a # here that will rerun the epoch the specified # of times. Each rerun is initiated by FW rather than requiring host interaction, to improve performance. - * PYBUDA\_PADDING\_PASS\_DISABLE\_BUDA\_OP: Disable padding logic that uses buda implementation for pad and unpad. - * PYBUDA\_ENABLE\_ETH\_SERIALIZATION: Enable the ethernet stream reduction pass, using the ethernet datacopy op to implement the stream reduction - * PYBUDA\_ENABLE\_ETH\_DATACOPY\_SERIALIZATION: Enable the ethernet stream reduction pass, using the tensix datacopy/nop op to implement the stream reduction. Will only insert datacopy ops if there are free tensix cores - * PYBUDA\_SUPRESS\_T\_FACTOR\_MM: Enables a condition in calculate_op_model in legalizer that limits the t factor of sparse/dense matmul ops to be less than the flag's value. Valid values: any positive int value (eg. 16) - * PYBUDA\_AMP\_LIGHT: Enable a "light" version of mixed precision to minimize accuracy impact (default: 0/False; 1: bfp8/hifi2, 2: bfp4/hifi2, 3: bfp4/LoFi) - * PYBUDA\_GRAPH\_NAME\_SUFFIX: Suffix to add to the graph name (helps to generate unique netlist names) - * PYBUDA\_DISABLE\_L1\_ACCUMULATE: Flag for disabling and debugging L1 accumaulation feature. - * PYBUDA\_OVERRIDE\_VETO: Used to Add/Remove/Update general and env var based compiler configurations. - * PYBUDA\_DISABLE\_REPORTIFY\_DUMP: Disable generating reportify graph. - * PYBUDA\_DISABLE\_CAP\_SPARSE\_MM\_FIDELITY: Disables an optimization to cap the fidelity phases of sparse matmul to at most HiFi2. - * PYBUDA\_DISABLE\_EXPLICIT\_DRAM\_IO: Disables the FE from programming netlist attribute `input_dram_io_buf_size_tiles`. Instead the FE will leave this attribute as `0` which implicitly means that the backend will handle the allocation of this buffer. - * PYBUDA\_CONCAT\_ON\_HOST: Lower concatenate ops on output nodes into runtime transforms so that they're done on host. - * PYBUDA\_BALANCER\_LEGACY\_CYCLES\_CALC: Use kernel cycles instead of limiter cycles(kernel + mem BW) for estimation. 
- * PYBUDA\_OP\_MODEL\_COMPARE\_VERSION: Version of op model comparision function. Can be used to compare effect of different comparison logic on performance.
- * PYBUDA\_RIBBON1\_PREPASS\_ENABLED: Whether to use or not suboptimal opmodel invalidation prepass. Default value is False.
- * PYBUDA\_RIBBON2\_OPTIMIZATION\_ITERATIONS: Number of optimization iterations in Ribbon2 balancing policy. Default value is 0.
- * PYBUDA\_RIBBON2\_DISABLE\_CLEANUP\_BUF\_NOPS: Disable cleanup of unneeded buffering nops in Ribbon2. (default: 0/False)
-
-## Temp overrides
-* PYBUDA\_TEMP\_ENABLE\_NEW\_SPARSE\_ESTIMATES: Apply new formula to estimate the cycle count of sparse matmul ops (currently only support LoFi and HiFi2 fidelities)
-* PYBUDA\_TEMP\_SCALE\_ESTIMATE\_ARGS: Scale counts of non-zero tiles, ublocks and strips to reflect the numbers that would end up on a single core, since BBE estimates always assume grid_size [1,1].
-* PYBUDA\_TEMP\_ELT\_UNARY\_ESTIMATES\_LEGACY: Force legacy path of calculating execution cycles for eltwise unary ops - instead of calling into BBE, use hand-crafted FE-side logic
-* PYBUDA\_TEMP\_ENABLE\_NEW\_FUSED\_ESTIMATES: Apply new formula to estimate the cycle count of fused ops. The formula calls BBE to estimate each subop and sums up the results.
-* PYBUDA\_LEGACY\_KERNEL\_BROADCAST: Use legacy kernel broadcast detection path. Will detect fewer kernel broadcasts, and will oftentimes use more tiles (longer KBs).
\ No newline at end of file
diff --git a/README.md b/README.md
index 6509de5ed..fe1d360e3 100644
--- a/README.md
+++ b/README.md
@@ -1,32 +1,28 @@
-# TT-Buda
-
-## Introduction
-
-The TT-Buda software stack can compile AI/ML models from several different frameworks such as PyTorch and Tensorflow, and execute them in many different ways on Tenstorrent hardware.
-
-**Note on terminology:**
-
-TT-Buda is the official Tenstorrent AI/ML compiler stack and PyBuda is the Python interface for TT-Buda. PyBuda allows users to access and utilize TT-Buda's features directly from Python. This includes directly importing model architectures and weights from PyTorch, TensorFlow, ONNX, and TFLite.
-
-## Model Demos
-
-Model demos are now part of a separate repo:
-
-https://github.com/tenstorrent/tt-buda-demos
-
-## Docs
-
-See: [Docs](https://docs.tenstorrent.com/tenstorrent/v/tt-buda)
-
-## Build
-
-https://docs.tenstorrent.com/tenstorrent/v/tt-buda/installation
-
-## Env setup
-
-Set `LD_LIBRARY_PATH` to the location of `third_party/budabackend/build/lib` - preferrably the absolute path to allow scripts to find them from anywhere.
-
-## Silicon
-
-See README.silicon.md for details on how to run on silicon.
-
+### Building dependencies
+* `cmake`
+* `clang`
+* `Ninja` - sudo apt-get install ninja-build
+
+### Building environment
+This is a one-off step. It will pull all dependencies needed for tt-forge.
+
+* `git submodule update --init --recursive -f`
+* `source env/activate`
+* `cmake -B env/build env`
+* `cmake --build env/build`
+
+### Build tt-forge
+* `source env/activate`
+* `cmake -G Ninja -B build .`
+* `cmake --build build`
+
+### Cleanup
+* `rm -rf build` - to clean up tt-forge build artifacts.
+* `./clean_all.sh` - to clean up all build artifacts (tt-forge/tvm/tt-mlir/tt-metal). This will not remove toolchain dependencies.
+
+### Environment variables:
+* `TTMLIR_TOOLCHAIN_DIR` - points to toolchain dir where dependencies of TTMLIR will be installed.
If not defined it defaults to /opt/ttmlir-toolchain +* `TTMLIR_VENV_DIR` - points to virtual environment directory of TTMLIR.If not defined it defaults to /opt/ttmlir-toolchain/venv +* `PYBUDA_TOOLCHAIN_DIR` - points to toolchain dir where dependencies of PyBuda will be installed. If not defined it defaults to /opt/pybuda-toolchain +* `PYBUDA_VENV_DIR` - points to virtual environment directory of tt-forge. If not defined it defaults to /opt/pybuda-toolchain/venv +* `PYBUDA_PYTHON_VERSION` - set to override python version. If not defined it defaults to python3.10 diff --git a/README.silicon.md b/README.silicon.md deleted file mode 100644 index f2eb254a1..000000000 --- a/README.silicon.md +++ /dev/null @@ -1,25 +0,0 @@ - -# How to run pybuda on silicon - -## Docker - -To create a docker, run `bin/run-docker.sh` from `third_party/budabackend`. Then, enter it use `docker exec -u $USER -it special-$USER bash`. - -## Env - -Grayskull and Wormhole machines require slightly different setups, so there are setup scripts for each. To build everything run the appropriate script: - -* `source env_for_silicon.sh` (Grayskull) - -or - -* `source env_for_wormhole.sh` (Wormhole) - -Now, you should be able to run pybuda pytests and python scripts. - -## Run - -For example, try: - -`pytest -svv pybuda/test/backend/test_silicon.py::test_basic_training[Grayskull-acc1-mbc1-microbatch1-s1]` - diff --git a/bisect.sh b/bisect.sh deleted file mode 100755 index e9d72d88a..000000000 --- a/bisect.sh +++ /dev/null @@ -1,46 +0,0 @@ -#!/bin/bash - -function show_usage(){ - echo "Usage bisect.sh GOOD_REV BAD_REV TEST_COMMAND_0" - exit 1 -} -if [ ! $# -eq 3 ] -then - show_usage -fi - -git fetch - -if ! git cat-file -e $1 2> /dev/null -then - echo "$1 not valid git rev" - show_usage -fi - -if ! git cat-file -e $2 2> /dev/null -then - echo "$2 not valid git rev" - show_usage -fi - -echo "Running auto_bisect with passing revision: $1, failing revision $2" -echo "Test commands:" - -for ((i = 3; i <= $#; i++ )); do - printf '%s\n' " ${!i}" -done - -read -n1 -s -r -p $'Press c to continue, q to quit\n' key - -if [ "$key" = 'q' ] -then - echo "Exiting" - exit 0 -fi - -git bisect start -git bisect bad $2 -git bisect good $1 -git bisect run bash -c ". compile_and_run_test.sh $3" -git bisect log -git bisect reset \ No newline at end of file diff --git a/clean_all.sh b/clean_all.sh new file mode 100755 index 000000000..6818a1b3c --- /dev/null +++ b/clean_all.sh @@ -0,0 +1,6 @@ +rm -rf build +rm -rf env/build +rm -rf third_party/tt-mlir/build +rm -rf third_party/tt-mlir/env/build +rm -rf third_party/tt-mlir/third_party/tt-metal +rm -rf third_party/tvm/build diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake new file mode 100644 index 000000000..da33ba8e7 --- /dev/null +++ b/cmake/Utils.cmake @@ -0,0 +1,17 @@ +### Utility functions for pybuda ### + +### Check if an environment variable exists ### +function(check_env_variable_internal VARIABLE_NAME ret) + if(NOT DEFINED ENV{${VARIABLE_NAME}}) + set(${ret} "false" PARENT_SCOPE) + endif() +endfunction() + +### Check if an environment variable exists ### +function(check_required_env_var VARIABLE_NAME) + set(VARIABLE_EXISTS "true") + check_env_variable_internal(${VARIABLE_NAME} VARIABLE_EXISTS) + if(NOT ${VARIABLE_EXISTS}) + message(FATAL_ERROR "${VARIABLE_NAME} does not exist. 
Did you run source env/activate?") + endif() +endfunction() diff --git a/compile_and_run_test.sh b/compile_and_run_test.sh deleted file mode 100755 index a28439e9e..000000000 --- a/compile_and_run_test.sh +++ /dev/null @@ -1,33 +0,0 @@ -#!/bin/bash - -export BACKEND_ARCH_NAME=wormhole_b0 -git reset --hard -git submodule update --init --recursive -make pybuda -if [ ! $? -eq 0 ] -then - echo "Make pybuda failed" - make clean_no_python - cd third_party/budabackend - git clean -fxd - cd ../.. - make pybuda - if [ ! $? -eq 0 ] - then - echo "Clean build failed bad rev 1" - return 1 - fi -fi -source build/python_env/bin/activate -source third_party/tvm/install.sh -source third_party/tvm/enable.sh -echo "Evaluating $*" -eval "$*" -if [ ! $? -eq 0 ] -then - echo "Test failed" - exit 1 -fi - -echo "All passed" -exit 0 diff --git a/compile_flags.txt b/compile_flags.txt index f66eece24..b2630aca4 100644 --- a/compile_flags.txt +++ b/compile_flags.txt @@ -13,4 +13,3 @@ -Ithird_party/json -Ipybuda/csrc -Ithird_party/fmt --Ithird_party/budabackend diff --git a/docs/CI/.gitlab-ci.build-docs.yml b/docs/CI/.gitlab-ci.build-docs.yml index 63335c462..b92211213 100644 --- a/docs/CI/.gitlab-ci.build-docs.yml +++ b/docs/CI/.gitlab-ci.build-docs.yml @@ -5,7 +5,7 @@ - docs/public/md needs: - pybuda-wh-b0-wheel - - seeded-venv + - !reference [.common_deps, needs] tags: - 8-core script: @@ -25,8 +25,10 @@ build-docs-main: extends: .build-docs rules: - if: $CI_COMMIT_TAG - when: always + - if: $CI_COMMIT_REF_NAME == "main" + when: manual - when: never + allow_failure: true script: - !reference [.build-docs, script] - cp -r ../docs/public/md/* pybuda @@ -38,7 +40,6 @@ build-docs-staging: extends: .build-docs rules: - if: $CI_COMMIT_BRANCH == "main" - when: always script: - !reference [.build-docs, script] - git checkout staging @@ -46,4 +47,37 @@ build-docs-staging: - git add pybuda - git commit -m "update docs from pipeline $CI_PIPELINE_ID" - git push - \ No newline at end of file + +# Job for building and pushing markdown to the new docsite repo +# Note: the repo name will need to be updated when the docsite goes live, as the name will be changed from docs-test +build-docs-docsite: + stage: docs + artifacts: + paths: + - docs/public/md + needs: + - pybuda-wh-b0-wheel + - !reference [.common_deps, needs] + tags: + - 8-core + rules: + - if: $CI_COMMIT_BRANCH == "main" && $CI_COMMIT_TAG =~ /^v[0-9]+\.[0-9]+\.[0-9]+$/ + script: + - !reference [.common_prep, script] + - pip3 install sphinx + - pip3 install sphinx-markdown-builder + - sphinx-build -M markdown docs/public docs/public/md + - cp -r docs/public/images docs/public/md/markdown + - git config --global user.email "tenstorrent-github-bot@tenstorrent.com" + - git config --global user.name "tenstorrent-github-bot" + - git clone https://tenstorrent-github-bot:${GITHUB_BOT_TOKEN}@github.com/tenstorrent/docs-test.git + - cd docs-test + - git remote set-url origin https://tenstorrent-github-bot:${GITHUB_BOT_TOKEN}@github.com/tenstorrent/docs-test.git + - git checkout main + - pip install -r requirements.txt + - cp -r ../docs/public/md/* pybuda + - python update_tags.py pybuda $$CI_COMMIT_TAG + - git add . 
+ - git commit -m "update pybuda docs from pipeline $CI_PIPELINE_ID with tag $CI_COMMIT_TAG" + - git tag -a $CI_COMMIT_TAG -m "pybuda documentation version $CI_COMMIT_TAG" + - git push && git push --tags diff --git a/docs/public/api.rst b/docs/public/api.rst index 21de6a121..5336bc92f 100644 --- a/docs/public/api.rst +++ b/docs/public/api.rst @@ -21,10 +21,6 @@ Python Runtime API .. automodule:: pybuda :members: run_inference, run_training, shutdown, initialize_pipeline, run_forward, run_backward, run_optimizer, get_parameter_checkpoint, get_parameter_gradients, update_device_parameters -C++ Runtime API -****************** - -The BUDA Backend used by Python Runtime can be optionally used stand-alone to run pre-compiled TTI models. The API reference for stand-alone BUDA Backend Runtime can be found `here `_. Configuration and Placement *************************** diff --git a/docs/public/images/tt_buda_w_logo.png b/docs/public/images/tt_buda_w_logo.png new file mode 100644 index 000000000..553d8a194 Binary files /dev/null and b/docs/public/images/tt_buda_w_logo.png differ diff --git a/docs/public/images/tt_logo.png b/docs/public/images/tt_logo.png new file mode 100644 index 000000000..191e7f857 Binary files /dev/null and b/docs/public/images/tt_logo.png differ diff --git a/docs/public/installation.rst b/docs/public/installation.rst index 27d292d7c..cfa7d589e 100644 --- a/docs/public/installation.rst +++ b/docs/public/installation.rst @@ -19,7 +19,7 @@ Prerequisites OS Compatibility ---------------- -Presently, Tenstorrent software is only supported on the **Ubuntu 20.04 LTS (Focal Fossa)** operating system. +Currently, Tenstorrent software is fully supported and tested on **Ubuntu 22.04 LTS (Jammy Jellyfish)**. Software is also functional on **Ubuntu 20.04 LTS (Focal Fossa)**, but we don't extensively test to guarantee there are no regressions on this OS version. Download ******** @@ -63,6 +63,7 @@ To install a PyBUDA release, follow these steps: source env/bin/activate * Step 4. Pip install PyBuda and TVM + If you have downloaded the latest release wheel files, you can install them directly with pip. .. code-block:: bash @@ -70,8 +71,11 @@ If you have downloaded the latest release wheel files, you can install them dire pip install pybuda-.whl tvm-.whl To compile PyBUDA from source, follow these steps: + * Step 1. Clone PyBUDA from https://github.com/tenstorrent/tt-buda/ + * Step 2. Update submodules + .. code-block:: bash cd tt-buda @@ -80,6 +84,7 @@ To compile PyBUDA from source, follow these steps: * Step 3. Compile. PyBUDA's make system will automatically create the needed venv .. code-block:: bash + make source build/python_env/bin/activate @@ -162,8 +167,14 @@ You may need to append each ``apt-get`` command with ``sudo`` if you do not have apt-get install -y python3.8-venv libboost-all-dev libgoogle-glog-dev libgl1-mesa-glx ruby apt-get install -y build-essential clang-6.0 libhdf5-serial-dev libzmq3-dev +Environment variable PYTHON_VERSION sets path to the Python executable. This is used by the build system to determine which version of Python to use and where to look for Python.h. +For example on Ubuntu 22.04, PYTHON_VERSION should be set to "python3.10", while on Ubuntu 20.04 it should be set to "python3.8". 
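+
+As a minimal sketch (following the Makefile-based flow described above; adjust the value to match the Python version installed on your system), the variable can be set directly on the make invocation:
+
+.. code-block:: bash
+
+   # Build against the Python 3.10 interpreter and headers (Ubuntu 22.04)
+   PYTHON_VERSION=python3.10 make
+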
+ Additional PyBUDA Compile Dependencies ------------------------------ +-------------------------------------- + +OS Level Dependencies +^^^^^^^^^^^^^^^^^^^^^ Additional dependencies to compile PyBUDA from source after running `Backend Compiler Dependencies <#backend-compiler-dependencies>`_ @@ -172,9 +183,36 @@ You may need to append each ``apt-get`` command with ``sudo`` if you do not have .. code-block:: bash apt-get install -y libyaml-cpp-dev python3-pip sudo git git-lfs - apt-get install -y wget cmake cmake-data libgtest-dev libgmock-dev + apt-get install -y wget cmake cmake-data pip3 install pyyaml +Package Level Dependencies +^^^^^^^^^^^^^^^^^^^^^^^^^^ + +In addition, if you intend to utilize ``torchvision`` for your model development, we strongly recommend using the ``torchvision`` version built with CXX11 ABI, that ensures optimal compatibility with PyBUDA. + +When building PyBUDA from source, the recommended version of ``torchvision`` is built and installed by default. + +.. note:: + + For your convenience, the ``torchvision`` wheel file is already included in the PyBUDA release bundle. This means that if you're using the release bundle, you won't need to build ``torchvision`` from source unless you want to use a different version or need to modify the source code. Simply install the provided wheel file using pip to add ``torchvision`` to your Python environment. + + Here's an example of how you can install the ``torchvision`` wheel file: + + .. code-block:: bash + + pip install /path/to/your/wheel/file/torchvision*.whl + + Replace ``/path/to/your/wheel/file/torchvision*.whl`` with the actual path to the ``torchvision`` wheel file in the PyBUDA release bundle. + +.. note:: + + To run the existing unit tests of PyBUDA components, e.g. after compiling it from source, you need to install the following packages. + + .. code-block:: bash + + apt-get install -y wget libgtest-dev libgmock-dev + TT-SMI ------ diff --git a/docs/public/module.mk b/docs/public/module.mk index df363655d..78bd4db4c 100644 --- a/docs/public/module.mk +++ b/docs/public/module.mk @@ -8,7 +8,7 @@ docs/public: $(DOCS_PUBLIC_DIR) .PHONY: foo -$(DOCS_PUBLIC_DIR): $(DOCS_PUBLIC_BUILD_SCRIPT) $(DOCS_PUBLIC_SRCS) python_env foo +$(DOCS_PUBLIC_DIR): $(DOCS_PUBLIC_BUILD_SCRIPT) $(DOCS_PUBLIC_SRCS) $(PYTHON_ENV) foo LD_LIBRARY_PATH=$(LD_LIBRARY_PATH):$(LIBDIR) \ PYTHON_ENV=$(PYTHON_ENV) \ BUILDER=$(DOCS_PUBLIC_SPHINX_BUILDER) \ @@ -19,7 +19,7 @@ $(DOCS_PUBLIC_DIR): $(DOCS_PUBLIC_BUILD_SCRIPT) $(DOCS_PUBLIC_SRCS) python_env f docs/public/publish: docs/public rsync --delete -avz $(DOCS_PUBLIC_DIR)/html/ yyz-webservice-02:/var/www/html/docs/pybuda-docs -docs/pdf: python_env foo +docs/pdf: $(PYTHON_ENV) foo LD_LIBRARY_PATH=$(LD_LIBRARY_PATH):$(LIBDIR) \ PYTHON_ENV=$(PYTHON_ENV) \ BUILDER=latexpdf \ diff --git a/docs/public/pybuda/training.rst b/docs/public/pybuda/training.rst index 0ac5eb659..4c9a8794d 100644 --- a/docs/public/pybuda/training.rst +++ b/docs/public/pybuda/training.rst @@ -22,7 +22,7 @@ by a CPU device calculating the loss and backpropagating it to Grayskull. 
# Create a device and model, and place it tt0 = TTDevice("tt0", devtype=devtype) - matmul0 = BudaLin("matmul0") + matmul0 = BudaLinear("matmul0") tt0.place_module(matmul0) act_dim = (1, 1, 32, 32) diff --git a/docs/public/user_guide.rst b/docs/public/user_guide.rst index d6e7f1500..8a1d1b836 100644 --- a/docs/public/user_guide.rst +++ b/docs/public/user_guide.rst @@ -231,18 +231,31 @@ Such a dictionary can also be pushed back onto the device using :py:func:`update TensTorrent Device Image (TTI): Saving/Loading ********************************************** -A Tenstorrent Device Image (TTI) is a standalone zip/archive file that snapshots the :py:class:`TTDevice` configuration, -the compiled models/modules placed and any backend build files. There are multiple benefits with the usage a TTI archive: +A Tenstorrent Image (TTI) is a standalone archive file that captures the entire compiled state of a +model. The contents of the archive include device configuration, compiler configuration, compiled model artifacts, +backend build files (e.g. overlay and risc binaries), model parameter tensors. There can be multiple advantages +with leveraging the usage of a TTI archive: 1) Offline target compilation of models on arbitrary device targets (i.e. target device does not have to be present/available on the machine to compile and save a TTI). 2) Loading a TTI archive allows the user to skip any long front-end and backend compilations of models onto the device and directly begin executing the graph/module that was packaged in the `*.tti` after pushing inputs to queues. +3) TTI archives can be shared and loaded across different machines and environments. +4) When we save a TTI archive, we can configure the serialization format for the model parameters. This can be useful for + scenarios where the user wants to save the model parameters in a tilized-binary format to avoid tilizing during model inference. + By default the serialization format is pickle. To configure for alternate serialization formats, the user can set either: + `PYBUDA_TTI_BACKEND_FORMAT=1` or `PYBUDA_TTI_BACKEND_TILIZED_FORMAT=1` environment variables. -We can save a TTI archive by invoking the `compile_to_image` method on :py:class:`TTDevice` +For example, from a machine without a silicon device, we can save a TTI archive intended to be deployed on a silicon device. +We need to configure the device type and architecture of the target device and compile the model to a TTI archive. +This can be done by invoking the `compile_to_image` method on :py:class:`TTDevice`. .. code-block:: python - tt0 = pybuda.TTDevice("tt0",arch=BackendDevice.Grayskull, devtype=BackendType.Silicon) + tt0 = pybuda.TTDevice( + name="tt0", + arch=BackendDevice.Wormhole_B0, + devtype=BackendType.Silicon + ) tt0.place_module(...) device_img: TTDeviceImage = tt0.compile_to_image( img_path="device_images/tt0.tti", @@ -256,14 +269,19 @@ This will create the archive file `device_images/tt0.tti`. The contents of a TTI .. code-block:: /unzipped_tti_directory - ├── device.json # json file capturing device state + ├── device.json # Device state and compiled model metadata ├── .yaml # netlist yaml - ├── backend_build_binaries # backend build files from tt_build/ - │ ├── blob_init + ├── compile_and_runtime_config.json # compiler and runtime configurations + ├── backend_build_binaries # backend build binaries + │ ├── device_desc.yaml + │ ├── cluster_desc.yaml │ ├── brisc - │ ├── ... 
- ├── *tensor*.pkl # pickled constant/parameter tensors - ├── *module*.pkl # pickled PyBuda module object + │ ├── erisc + │ ├── nrisc + │ ├── hlks + │ ├── epoch_programs + ├── tensors # directory containing serialized tensors + ├── module_files # Python file containing the PybudaModule of the model To load the TTI archive and inspect the contents: @@ -291,11 +309,10 @@ The :py:class:`TTDeviceImage::info()` method provides a su - chip_ids: [0] - backend device type: BackendType.Silicon - grid size: [10, 12] - - harvested rows: 0 + - harvested rows: [0] Compilation Graph State... - training: False - - modules: ['bert_encoder'] - ordered input shapes: [[1, 128, 128], [1, 1, 128, 128]] - ordered targets shapes: [] @@ -312,8 +329,35 @@ We can now configure :py:class:`TTDevice` by using our image ob output_q = pybuda.run_inference() -Create TTI Targeting Row-Harvested Silicon Devices -************************************************** +Create TTI: Targeting Supported Silicon Devices +*********************************************** + +In the example above, we saved a TTI file targeting a silicon device with default configuration (unharvested). There +are also convenience labels available that can be used to target specific silicon devices in our supported product spec. +The current support available is: {gs_e150, gs_e300, wh_n150, wh_n300}. + +To target a specific silicon device, we can set the device type and architecture using :py:func:`set_configuration_options`. + + +.. code-block:: python + + pybuda.set_configuration_options(device_config="wh_n150") + + tt0 = pybuda.TTDevice( + name="tt0", + arch=BackendDevice.Wormhole_B0, + devtype=BackendType.Silicon + ) + tt0.place_module(...) + device_img: TTDeviceImage = tt0.compile_to_image( + img_path="device_images/tt0.tti", + training=training, + sample_inputs=(...), + ) + + +Create TTI: Targeting Custom Row-Harvested Silicon Devices +********************************************************* We can also save a TTI file targeting a machine with silicon devices with harvested rows offline. The only difference from the above is we need to manually induce the harvested rows before saving TTI. @@ -348,32 +392,65 @@ Accordingly, part of the TTI file slightly changes as well: Note that only rows 1-5 and 7-11 are harvestable, and TTI loading will raise an error if the manually harvested rows in TTI does not match with that of the loaded silicon device. +Create TTI: Targeting Custom Device Descriptor +************************************************** + +We can also save a TTI file targeting a machine with silicon devices with custom device descriptor (specified with file-path). +This can be done by setting the device descriptor using :py:func:`set_configuration_options` with `backend_device_descriptor_path` argument. + +.. code-block:: python + + pybuda.set_configuration_options(backend_device_descriptor_path="/wormhole_b0_4x6.yaml") + + tt0 = pybuda.TTDevice("tt0",arch=BackendDevice.Wormhole_B0, devtype=BackendType.Silicon) + tt0.place_module(...) + device_img: TTDeviceImage = tt0.compile_to_image( + img_path="device_images/tt0.tti", + training=training, + sample_inputs=(...), + ) + +The device-descriptor used during the offline compilation process will be embedded in the TTI-archive. +This device-descriptor will be used to configure the device during the TTI-loading process. + + Embedded TTI Loading ******************** -Here's an example of loading a TTI model from c++ for environments that do not have a packaged python interpreter. 
+Here's an example of loading a generic TTI model from C++ for environments that do not have a packaged Python interpreter. .. code-block:: cpp #include #include #include + #include #include "tt_backend.hpp" #include "tt_backend_api.hpp" #include "tt_backend_api_types.hpp" + #include "io_utils.h" - // Populate a queue descriptor from a queue name - tt::tt_dram_io_desc get_queue_descriptor(const std::shared_ptr &backend, const std::string &queue_name); + namespace fs = std::filesystem; - // Populate a tensor descriptor from a queue name - tt::tt_PytorchTensorDesc get_tensor_descriptor(const std::string &name); + int main(int argc, char **argv) { + + if (argc <= 1) { + throw std::runtime_error("TTI path not specified on the command line"); + } + else if (argc > 3) { + throw std::runtime_error("Incorrect number of arguments specified to inference harness. Supported args: TTI_PATH NUM_INFERENCE_LOOPS"); + } - int main(int argc, char** argv) - { // Define path to pre-compiled model and output artifacts - std::string output_path = "tt_build/base_encoders"; - std::string model_path = "base_encoders.tti"; + std::string output_path = "tt_build/test_standalone_runtime"; + fs::create_directories(output_path); + uint32_t inference_loops = 1; + std::string model_path = argv[1]; // eg. "/home_mnt/software/spatial2/backend/binaries/CI_TTI_TEST_BINARIES_WH/bert.tti" + + if (argc == 3) { + inference_loops = std::stoi(argv[2]); + } // Create a pre-compiled model object and a backend object from it using default config std::shared_ptr model = std::make_shared(model_path, output_path); @@ -388,23 +465,32 @@ Here's an example of loading a TTI model from c++ for environments that do not h } // The following code must execute between initialize() and finish() - { + for (uint32_t i = 0; i < inference_loops; i++) { // - Push a microbatch of inputs to device for (const std::string &name : model->get_graph_input_names()) { - tt::tt_dram_io_desc io_desc = get_queue_descriptor(backend, name); - tt::tt_PytorchTensorDesc tensor_desc = get_tensor_descriptor(name); - + tt::tt_dram_io_desc io_desc = tt::io::utils::get_queue_descriptor(backend, name); + tt::tt_PytorchTensorDesc tensor_desc = tt::io::utils::get_tensor_descriptor(name, model, io_desc); + // Fill the tensor descriptor with data. We choose to allocate dummy memory using the TT backend for this tensor. + // The user is free to use previously allocated memory, or use the backend to allocate memory that is then filled with actual data. 
+ tt::io::utils::fill_tensor_with_data(name, tensor_desc); // DMA the input tensor from host to device assert(tt::backend::push_input(io_desc, tensor_desc, false, 1) == tt::DEVICE_STATUS_CODE::Success); + // Optional: Host memory management + // - free releases storage on host (tensor data freed), since host is done with pushing data for this activation + // - The user can choose not to free this memory and use it even after the data is in device DRAM + std::cout << "Pushed Input tensor " << name << " data ptr: " << tensor_desc.ptr << std::endl; + assert(tt::backend::free_tensor(tensor_desc) == tt::DEVICE_STATUS_CODE::Success); } // - Run inference program, p_loop_count is the number of microbatches executed std::map program_parameters = {{"$p_loop_count", "1"}}; - backend->run_program("run_fwd_0", program_parameters); - + for (const auto& prog_name : backend -> get_programs()) { + assert(backend->run_program(prog_name, program_parameters) == tt::DEVICE_STATUS_CODE::Success); + } + // - Pop a microbatch of outputs from device for (const std::string &name : model->get_graph_output_names()) { - tt::tt_dram_io_desc io_desc = get_queue_descriptor(backend, name); + tt::tt_dram_io_desc io_desc = tt::io::utils::get_queue_descriptor(backend, name); tt::tt_PytorchTensorDesc tensor_desc = {}; // passed into get_tensor below to be populated // DMA the output tensor from device to host @@ -416,11 +502,11 @@ Here's an example of loading a TTI model from c++ for environments that do not h // Host memory management // - free releases storage on host (tensor data freed), host is done with the output data - std::cout << "Output tensor " << name << " data ptr: " << tensor_desc.ptr << std::endl; + // - The user can choose not to free this memory and use it for downstream tasks + std::cout << "Got Output tensor " << name << " data ptr: " << tensor_desc.ptr << std::endl; assert(tt::backend::free_tensor(tensor_desc) == tt::DEVICE_STATUS_CODE::Success); } } - // - Teardown the backend if (backend->finish() != tt::DEVICE_STATUS_CODE::Success) { throw std::runtime_error("Failed to shutdown device"); @@ -428,56 +514,6 @@ Here's an example of loading a TTI model from c++ for environments that do not h return 0; } - // Populate an queue descriptor from a queue name - tt::tt_dram_io_desc get_queue_descriptor(const std::shared_ptr &backend, const std::string &queue_name) { - tt::tt_dram_io_desc queue_desc = backend->get_queue_descriptor(queue_name); - // (optional) maps device address to a contiguous user-space address in tt::tt_dram_io_desc::bufq_mapping - // - push_input will use this mapping for memcpy-based fast DMA if it exists - // - push_input will use user-mode driver for DMA if mapping does not exist - tt::backend::translate_addresses(queue_desc); - return queue_desc; - } - - // Populate a tensor descriptor from raw data + metadata - template - tt::tt_PytorchTensorDesc to_tensor_descptor( - const T *array, - unsigned int w_dim, - unsigned int z_dim, - unsigned int r_dim, - unsigned int c_dim, - tt::DataFormat format, - unsigned int dim = tt::PY_TENSOR_DIMS) { - tt::tt_PytorchTensorDesc tensor_desc; - tensor_desc.owner = tt::OWNERSHIP::Backend; - tensor_desc.ptr = array; - tensor_desc.itemsize = sizeof(T); - tensor_desc.format = format; - tensor_desc.shape = {w_dim, z_dim, r_dim, c_dim}; - tensor_desc.dim = dim; - - tensor_desc.strides[3] = sizeof(T); - tensor_desc.strides[2] = c_dim * tensor_desc.strides[3]; - tensor_desc.strides[1] = r_dim * tensor_desc.strides[2]; - tensor_desc.strides[0] = z_dim * 
tensor_desc.strides[1]; - return tensor_desc; - } - - // Populate a tensor descriptor from a queue name - tt::tt_PytorchTensorDesc get_tensor_descriptor(const std::string &name) { - // The following code is an example for BERT base encoder input: - // - activation: [microbatch, channels = 1, height = 128, width = 768] - // - atten_mask: [microbatch, channels = 1, height = 32, width = 128] - if (name == "input_1") { - static std::vector tensor_data(128 * 1 * 128 * 768, 0); - return to_tensor_descptor(tensor_data.data(), 128, 1, 128, 768, tt::DataFormat::Float16_b); - } else if (name == "attention_mask") { - static std::vector tensor_data(128 * 1 * 32 * 128, 0); - return to_tensor_descptor(tensor_data.data(), 128, 1, 32, 128, tt::DataFormat::Float16_b); - } - throw std::runtime_error("Tensor is not a valid input"); - } - Pybuda Automatic Mixed Precision -------------------------------------- @@ -626,6 +662,137 @@ However, ``num_chips`` and ``chip_ids`` parameters can be used to select a subse See :py:class:`TTDevice` for more details. +Pybuda Multi-Model Support (Embedded Applications Only) +------------------------------------------------------- + +Introduction +******************* + +PyBuda allows users to merge several models into a single Tenstorrent Device Image, with minimal workflow overhead. The TTI can then be consumed by the C++ Backend and run on a Tenstorrent Device. + +A typical process to generate and execute a Multi-Model workload is as follows: + +**Compilation: Either Offline or on a Tenstorrent Device** + +#. Generate TTIs for each model in the workload. +#. Run the Model-Merging tool to consolidate all models into a single TTI. + +**Execution: On a Tenstorrent Device** + +#. Spawn an application using the C++ backend APIs to deploy the workload contained in the TTI. An example application is provided in the `Embedded TTI Loading` section. + +Fusing multiple independent models is well tested with several State of the Art models (including ViT, Mobilenet, ResNet50 ...). Supporting pipelined models is currently under active development. + +Below, we describe the APIs and associated tools used to fuse models without any dependencies. + +Usage +***** + +Pybuda exposes two entry points for users to run the Model Merging Tool: + +#. Command Line Interface to specify the list of models to merge along with optional arguments. These include parameters enabling/disabling certain optimizations. +#. Python API to be consumed by user applications. Usage of this API is very similar to the Command Line Tool. + +**Command Line Interface** + +.. code-block:: bash + + python3 pybuda/pybuda/tools/tti_merge.py [-h] [-mbl {dirname}] + [-mdl {models}] [-a {arch}] + [-mml {filename}] [-scr] + [-dqo] + +The following arguments are available when using `tti_merge.py` + +.. list-table:: Table 1. TT-SMI optional arguments. 
+ :header-rows: 1 + + * - Argument + - Function + * - -h, --help + - Show help message and exit + * - -mbl, --model_binaries_location + - Relative path to where model TTIs are stored [Required] + * - -mdl, --models + - List of models to be merged (names must match TTI filenames) [Required] + * - -a, --arch + - Target Tenstorrent Architecture (default = wormhole_b0) [Optional] + * - -mml, --merged_model_location + - Relative path to where the Multi-Model TTI will be emitted (default = merged_model.tti) [Optional] + * - -scr, --skip_channel_reallocation + - Disable memory optimization that switches channels for queues when OOM during memory allocation (default = False) [Optional] + * - -dqo, --dynamic_queue_overlap_off + - Disable memory optimization allowing dynamic queues to overlap in memory channels (default = False) [Optional] + +As an example, given the following directory structure in the Pybuda root directory: + +.. code-block:: bash + + device_images_to_merge/ + ├-- bert_large.tti + ├-- deit.tti + ├-- hrnet.tti + ├-- inception.tti + ├-- mobilenet_v1.tti + ├-- mobilenet_v2.tti + ├-- mobilenet_v3.tti + ├-- resnet.tti + ├-- unet.tti + ├-- vit.tti + +The following command will generate a Multi-Model TTI (with memory optimizations enabled) and store it in `multi_model_workload.tti`: + +.. code-block:: bash + + python3 pybuda/pybuda/tools/tti_merge.py -mbl device_images_to_merge/ -mdl bert_large deit hrnet inception mobilenet_v1 mobilenet_v2 mobilenet_v3 resnet unet vit -mml multi_model_workload.tti + +**Python API** + +This API provides the same functionality as the command line interface, for cases where the Model Merging step needs to be automated. + +.. code-block:: python + + # API Declaration + def merge_models(model_bin_location, models, arch = "wormhole_b0", merged_model_location = "", switch_chans_if_capacity_hit = True, overlap_dynamic_queues = True) + +Here the arguments `switch_chans_if_capacity_hit` and `overlap_dynamic_queues` correspond to memory optimizations, which are enabled by default. + +The following Python code generates a Multi-Model TTI in a manner identical to the command listed in the previous section: + +.. code-block:: python + + from pybuda.tools.tti_merge import merge_models + + model_binary_loc = "device_images_to_merge" + models_to_merge = ["bert_large", "deit", "hrnet", "inception", "mobilenet_v1", "mobilenet_v2", "mobilenet_v3", "resnet", "unet", "vit"] + target_arch = "wormhole_b0" + merged_model_location = "multi_model_workload.tti" + + # Individual Model Generation Code Goes Here + + merge_models(model_binary_loc, models_to_merge, target_arch, merged_model_location) + +**Memory Profiler** + +During the model fusion process, the API presented above is responsible for performing memory reallocation. Users may be interested in the memory footprint of the fused model (both Device and Host DRAM). + +To fulfill this requirement, the tool reports memory utilization post-reallocation. An example using a model compiled for Wormhole (with 6 Device and up to 4 Host DRAM channels) is provided below. + +..
code-block:: bash + +Displaying memory footprint per DRAM channel (MB): +0 : 161.17 +1 : 511.12 +2 : 577.51 +3 : 200.27 +4 : 204.41 +5 : 339.57 +Displaying memory footprint per Host channel (MB): +0 : 132.88 +1 : 0.0 +2 : 0.0 +3 : 0.0 + TT-SMI ------ diff --git a/env/CMakeLists.txt b/env/CMakeLists.txt new file mode 100644 index 000000000..66cef1a2d --- /dev/null +++ b/env/CMakeLists.txt @@ -0,0 +1,38 @@ +cmake_minimum_required(VERSION 3.20.0) +project(pybuda-toolchain LANGUAGES CXX) + +# Get parent directory of current source directory +get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY) + +# Include Utils +list(APPEND CMAKE_MODULE_PATH ${PARENT_DIR}/cmake) +include(Utils) + +# Check if the environment variable is set, if not error out +check_required_env_var(PYBUDA_PYTHON_VERSION) +check_required_env_var(PYBUDA_TOOLCHAIN_DIR) +check_required_env_var(PYBUDA_VENV_DIR) + +set(PYBUDA_PYTHON_VERSION $ENV{PYBUDA_PYTHON_VERSION}) +set(PYBUDA_TOOLCHAIN_DIR $ENV{PYBUDA_TOOLCHAIN_DIR}) +set(PYBUDA_VENV_DIR $ENV{PYBUDA_VENV_DIR}) + +set(TTMLIR_TOOLCHAIN_DIR $ENV{TTMLIR_TOOLCHAIN_DIR}) +set(TTMLIR_VENV_DIR $ENV{TTMLIR_VENV_DIR}) + +if (NOT EXISTS "${PYBUDA_TOOLCHAIN_DIR}") + message( FATAL_ERROR "The directory ${PYBUDA_TOOLCHAIN_DIR} does not exist. Please create it before running this script.\n sudo mkdir -p ${PYBUDA_TOOLCHAIN_DIR}\n sudo chown -R $ENV{USER} ${PYBUDA_TOOLCHAIN_DIR}") +endif() + +if (NOT EXISTS "${TTMLIR_TOOLCHAIN_DIR}") + message( FATAL_ERROR "The directory ${TTMLIR_TOOLCHAIN_DIR} does not exist. Please create it before running this script.\n sudo mkdir -p ${TTMLIR_TOOLCHAIN_DIR}\n sudo chown -R $ENV{USER} ${TTMLIR_TOOLCHAIN_DIR}") +endif() + +# Install all python dependencies +add_custom_target(python-venv + ALL COMMAND + PYBUDA_PYTHON_VERSION=${PYBUDA_PYTHON_VERSION} + PYBUDA_TOOLCHAIN_DIR=${PYBUDA_TOOLCHAIN_DIR} + CURRENT_SOURCE_DIR=${CMAKE_CURRENT_SOURCE_DIR} + PYBUDA_VENV_DIR=${PYBUDA_VENV_DIR} + bash ${CMAKE_CURRENT_SOURCE_DIR}/create_venv.sh) diff --git a/env/README.md b/env/README.md new file mode 100644 index 000000000..b36f90214 --- /dev/null +++ b/env/README.md @@ -0,0 +1,9 @@ +This directory contains all environment dependencies for the project. +All dependencies are installed to /opt/pybuda-toolchain. + +load_env_variables.sh - Script to load the environment variables for the project. + - Its used when downloading dependencies for the project. + - Its also used when activating env for project. This is important since it contains path to different lib/inc paths. + +Dependencies: +(Python.3.10)[https://www.python.org/downloads/release/python-3100/] - Version of python which is compatible with the project. diff --git a/env/activate b/env/activate new file mode 100644 index 000000000..fc5a44af2 --- /dev/null +++ b/env/activate @@ -0,0 +1,38 @@ +#!/usr/bin/env bash + +# NOTE +# These env variables are stored in this file because they are needed by both +# toolchain project and the pybuda project. This file is sourced by both projects. +# For tt-mlir we have similar env variables but since we build toolchain and binaries of tt-mlir +# using install.sh from tt-mlir project, we only define them in pybuda project top level cmake file. 
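+# +# Example usage (an assumed workflow, for illustration only): source this file from the +# project root so that the toolchain paths and the Python venv configured below are picked up: +# source env/activate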
+ +# Set PYBUDA_TOOLCHAIN_DIR to a default value if not already set +export PYBUDA_TOOLCHAIN_DIR="${PYBUDA_TOOLCHAIN_DIR:-/opt/pybuda-toolchain}" + +# Set PYTHON_VERSION +if [[ "$(uname)" == "Linux" ]]; then +export PYBUDA_PYTHON_VERSION="${PYBUDA_PYTHON_VERSION:-python3.10}" +elif [[ "$(uname)" == "Darwin" ]]; then +export PYBUDA_PYTHON_VERSION="${PYBUDA_PYTHON_VERSION:-/usr/bin/python3}" +else +echo "Error: Unsupported OS $(uname)" +fi + +# Set PYBUDA_VENV_DIR to a default value if not already set +export PYBUDA_VENV_DIR="${PYBUDA_VENV_DIR:-${PYBUDA_TOOLCHAIN_DIR}/venv}" + +[ -f $PYBUDA_VENV_DIR/bin/activate ] && source $PYBUDA_VENV_DIR/bin/activate + +if [ -n "$PROJECT_ROOT" ]; then + export TT_METAL_HOME="$PROJECT_ROOT/third_party/tt-mlir/third_party/tt-metal/src/tt-metal" +else + export TT_METAL_HOME="$(pwd)/third_party/tt-mlir/third_party/tt-metal/src/tt-metal" +fi + +export TTMLIR_TOOLCHAIN_DIR="${TTMLIR_TOOLCHAIN_DIR:-/opt/ttmlir-toolchain}" + +export TTMLIR_VENV_DIR="${TTMLIR_VENV_DIR:-${TTMLIR_TOOLCHAIN_DIR}/venv}" + +export TTMLIR_ENV_ACTIVATED=1 + +export ARCH_NAME="${ARCH_NAME:-wormhole_b0}" diff --git a/python_env/core_requirements.txt b/env/core_requirements.txt similarity index 85% rename from python_env/core_requirements.txt rename to env/core_requirements.txt index 374743db8..5c64dad27 100644 --- a/python_env/core_requirements.txt +++ b/env/core_requirements.txt @@ -23,21 +23,22 @@ matplotlib==3.5.1 # They are both requirements of datasets. Multiprocess is however not fixed. # TODO: Remove when datasets is upgraded multiprocess==0.70.13 -mxnet==1.9.1 +# Newer versions of mxnet cause issue with AnyJSONManager static object destruction (segfault) +mxnet==1.6.0 networkx==2.8.5 numpy==1.23.1 -onnx==1.14.1 -onnxruntime==1.15.0 +onnx>=1.15.0 +onnxruntime>=1.16.3 opencv-python-headless==4.6.0.66 # This is needed to avoid issue https://yyz-gitlab.local.tenstorrent.com/devops/devops/-/issues/95 pandas==1.5.3 prettytable==3.0.0 protobuf==3.20.3 pybind11==2.6.2 -pyinstrument==4.1.1 -scipy==1.8.0 -tensorflow-cpu==2.13.0 -tensorboard==2.13.0 +pyinstrument>=4.1.1 +scipy>=1.8.0 +tensorflow==2.13 +tensorboard==2.13 tf2onnx==1.15.1 transformers==4.35.2 # To avoid warning during the import @@ -45,5 +46,5 @@ requests==2.28.2 urllib3==1.26.14 tflite==2.10.0 ultralytics==8.0.145 -keras==2.13.1 -pytorch_forecasting==1.0.0 +keras>=2.13.1 +#pytorch_forecasting>=1.0.0 diff --git a/env/create_venv.sh b/env/create_venv.sh new file mode 100644 index 000000000..4a592f6b9 --- /dev/null +++ b/env/create_venv.sh @@ -0,0 +1,37 @@ +set -e + +if [[ -z "$PYBUDA_PYTHON_VERSION" ]]; then + echo "PYBUDA_PYTHON_VERSION environment variable is not set" + exit 1 +fi + +if [[ -z "$PYBUDA_TOOLCHAIN_DIR" ]]; then + echo "PYBUDA_TOOLCHAIN_DIR environment variable is not set" + exit 1 +fi + +if [[ -z "$PYBUDA_VENV_DIR" ]]; then + echo "PYBUDA_VENV_DIR environment variable is not set" + exit 1 +fi + +if [[ -z "$CURRENT_SOURCE_DIR" ]]; then + echo "CURRENT_SOURCE_DIR environment variable is not set" + exit 1 +fi + +# Torch requires a specific version of wheel to be installed +# which depends on the platform +if [[ "$(uname)" == "Darwin" ]]; then + REQUIREMENTS_FILE="$CURRENT_SOURCE_DIR/mac_requirements.txt" +else + # TODO test on linux + REQUIREMENTS_FILE="$CURRENT_SOURCE_DIR/linux_requirements.txt" +fi + +$PYBUDA_PYTHON_VERSION -m venv $PYBUDA_VENV_DIR +unset LD_PRELOAD +source $PYBUDA_VENV_DIR/bin/activate +$PYBUDA_PYTHON_VERSION -m pip install --upgrade pip +pip3 install wheel==0.37.1 +pip3 install -r 
$REQUIREMENTS_FILE -f https://download.pytorch.org/whl/cpu/torch_stable.html diff --git a/python_env/dist_requirements.txt b/env/dist_requirements.txt similarity index 53% rename from python_env/dist_requirements.txt rename to env/dist_requirements.txt index b60831429..9dc705e6c 100644 --- a/python_env/dist_requirements.txt +++ b/env/dist_requirements.txt @@ -1,5 +1,6 @@ -r core_requirements.txt torch @ https://download.pytorch.org/whl/cpu-cxx11-abi/torch-2.1.0%2Bcpu.cxx11.abi-cp38-cp38-linux_x86_64.whl ; python_version=='3.8' torch @ https://download.pytorch.org/whl/cpu-cxx11-abi/torch-2.1.0%2Bcpu.cxx11.abi-cp310-cp310-linux_x86_64.whl ; python_version=='3.10' -torchvision @ https://download.pytorch.org/whl/cpu/torchvision-0.16.0%2Bcpu-cp38-cp38-linux_x86_64.whl ; python_version=='3.8' -torchvision @ https://download.pytorch.org/whl/cpu/torchvision-0.16.0%2Bcpu-cp310-cp310-linux_x86_64.whl ; python_version=='3.10' +torch==2.3.1 ; python_version=='3.11' +# Custom torchversion with ABI +torchvision==0.16.0+fbb4cc5 \ No newline at end of file diff --git a/python_env/requirements.txt b/env/linux_requirements.txt similarity index 83% rename from python_env/requirements.txt rename to env/linux_requirements.txt index dddbc8592..3fbf8b7cf 100644 --- a/python_env/requirements.txt +++ b/env/linux_requirements.txt @@ -3,7 +3,8 @@ bitarray==2.5.1 clang-format==14.0.3 -diffusers==0.14.0 +diffusers==0.27.2 +optimum==1.19.2 hydra-core IPython==8.8.0 nvidia-ml-py3==7.352.0 @@ -17,7 +18,7 @@ sacremoses==0.0.53 seaborn scikit-image==0.20.0 # For DenseNet 121 HF XRay model segmentation_models_pytorch==0.3.2 -sentencepiece==0.1.96 +#sentencepiece==0.1.96 subword-nmt==0.3.8 tensorflow-hub==0.12.0 timm==0.6.12 @@ -25,12 +26,16 @@ yolov5==7.0.9 # The CPU versions of torch and torch visions are used due to their size being # several GB smaller which made a large impact on the performance of CI # (through build artifacts and caching) -torchvision==0.16.0+cpu torch @ https://download.pytorch.org/whl/cpu-cxx11-abi/torch-2.1.0%2Bcpu.cxx11.abi-cp38-cp38-linux_x86_64.whl ; python_version=='3.8' torch @ https://download.pytorch.org/whl/cpu-cxx11-abi/torch-2.1.0%2Bcpu.cxx11.abi-cp310-cp310-linux_x86_64.whl ; python_version=='3.10' +torch==2.3.1 ; python_version=='3.11' torchxrayvision==0.0.39 vgg_pytorch==0.3.0 librosa==0.10.0.post2 soundfile==0.12.1 accelerate==0.23.0 -python-gitlab==4.2.0 +python-gitlab==4.4.0 +deepdiff==6.7.1 +tabulate==0.9.0 +opencv-contrib-python==4.9.0.80 +yolov6detect==0.4.1 diff --git a/env/mac_requirements.txt b/env/mac_requirements.txt new file mode 100644 index 000000000..f691e557c --- /dev/null +++ b/env/mac_requirements.txt @@ -0,0 +1,38 @@ +# First include all requirements from the Distribution build +-r core_requirements.txt + +bitarray==2.5.1 +clang-format==14.0.3 +diffusers==0.27.2 +optimum==1.19.2 +hydra-core +IPython==8.8.0 +nvidia-ml-py3==7.352.0 +omegaconf +pytest==6.2.4 +pytest-timeout==2.0.1 +pytest-xdist==2.5.0 +pytorchcv==0.0.67 +sacrebleu==2.1.0 +sacremoses==0.0.53 +seaborn +scikit-image==0.20.0 # For DenseNet 121 HF XRay model +segmentation_models_pytorch==0.3.2 +#sentencepiece==0.1.96 +subword-nmt==0.3.8 +tensorflow-hub==0.12.0 +timm==0.6.12 +yolov5==7.0.9 +torch @ https://download.pytorch.org/whl/cpu/torch-2.1.0-cp310-none-macosx_11_0_arm64.whl ; python_version=='3.10' +torch @ https://download.pytorch.org/whl/cpu/torch-2.1.0-cp38-none-macosx_11_0_arm64.whl ; python_version=='3.8' +torch==2.3.1 ; python_version=='3.11' +torchxrayvision==0.0.39 +vgg_pytorch==0.3.0 
+librosa==0.10.0.post2 +soundfile==0.12.1 +accelerate==0.23.0 +python-gitlab==4.4.0 +deepdiff==6.7.1 +tabulate==0.9.0 +opencv-contrib-python==4.9.0.80 +yolov6detect==0.4.1 diff --git a/python_env/requirements_ext.txt b/env/requirements_ext.txt similarity index 100% rename from python_env/requirements_ext.txt rename to env/requirements_ext.txt diff --git a/env_for_silicon.sh b/env_for_silicon.sh deleted file mode 100755 index e6e80c556..000000000 --- a/env_for_silicon.sh +++ /dev/null @@ -1,11 +0,0 @@ -export BACKEND_ARCH_NAME=grayskull -export ARCH_NAME=grayskull - -if command -v bear >/dev/null 2>&1; then - bear make -else - make -fi -source build/python_env/bin/activate -source third_party/tvm/enable.sh -set +e diff --git a/env_for_wormhole.sh b/env_for_wormhole.sh deleted file mode 100644 index d45cabd26..000000000 --- a/env_for_wormhole.sh +++ /dev/null @@ -1,13 +0,0 @@ -export LD_LIBRARY_PATH=third_party/confidential_tenstorrent_modules/versim/wormhole/lib:third_party/confidential_tenstorrent_modules/versim/wormhole/lib/ext -export BACKEND_ARCH_NAME=wormhole -export ARCH_NAME=wormhole - -if command -v bear >/dev/null 2>&1; then - bear make -else - make -fi -source build/python_env/bin/activate -source third_party/tvm/install.sh -source third_party/tvm/enable.sh -set +e diff --git a/env_for_wormhole_b0.sh b/env_for_wormhole_b0.sh deleted file mode 100755 index e91dd8fcd..000000000 --- a/env_for_wormhole_b0.sh +++ /dev/null @@ -1,12 +0,0 @@ -export LD_LIBRARY_PATH=versim/wormhole_b0/lib:versim/wormhole_b0/lib/ext -export BACKEND_ARCH_NAME=wormhole_b0 -export ARCH_NAME=wormhole_b0 - -if command -v bear >/dev/null 2>&1; then - bear make -else - make -fi -source build/python_env/bin/activate -source third_party/tvm/enable.sh -set +e diff --git a/pybuda/CMakeLists.txt b/pybuda/CMakeLists.txt new file mode 100644 index 000000000..8b0df77fd --- /dev/null +++ b/pybuda/CMakeLists.txt @@ -0,0 +1,2 @@ +add_subdirectory(csrc) +add_subdirectory(pybuda) diff --git a/pybuda/csrc/CMakeLists.txt b/pybuda/csrc/CMakeLists.txt new file mode 100644 index 000000000..81e9490e0 --- /dev/null +++ b/pybuda/csrc/CMakeLists.txt @@ -0,0 +1,139 @@ +set(CMAKE_PREFIX_PATH + ${Python_SITELIB}/torch + ${CMAKE_PREFIX_PATH}) +find_package(Torch REQUIRED) + +set(TT_MLIR_ROOT_DIR ${CMAKE_SOURCE_DIR}/third_party/tt-mlir) +set(TTMLIR_INCLUDE_DIRS + ${TT_MLIR_ROOT_DIR}/include + ${TT_MLIR_ROOT_DIR}/build/include + ${TT_MLIR_ROOT_DIR}/runtime/include) + +set(PYBUDA_CSRC_INCLUDES + ${CMAKE_CURRENT_SOURCE_DIR} + ${CMAKE_SOURCE_DIR} + ${CMAKE_SOURCE_DIR}/third_party + ${CMAKE_SOURCE_DIR}/third_party/fmt + ${CMAKE_SOURCE_DIR}/third_party/pybind11/include + ${CMAKE_SOURCE_DIR}/third_party/json + ${CMAKE_SOURCE_DIR}/third_party/tt-mlir/build/include + ${CMAKE_SOURCE_DIR}/third_party/tt-mlir/runtime/include + ${CMAKE_SOURCE_DIR}/third_party/tt-mlir/include + ${TTMLIR_TOOLCHAIN_DIR}/include + ${Python3_INCLUDE_DIRS} + ${TTMLIR_INCLUDE_DIRS} +) + +include_directories(${PYBUDA_CSRC_INCLUDES}) +# This is workaround for utils/assert.hpp using ##__VA_ARGS__ which is not supported by clang +include_directories(SYSTEM ${CMAKE_SOURCE_DIR}) +include_directories(SYSTEM ${TORCH_INCLUDE_DIRS}) + +add_subdirectory(graph_lib) +add_subdirectory(autograd) +add_subdirectory(shared_utils) +add_subdirectory(backend_api) +add_subdirectory(reportify) +add_subdirectory(runtime) +add_subdirectory(tt_torch_device) + +### pybuda_csrc_objs ### + +file(GLOB CPP_FILES + "pybuda_bindings.cpp" + "buda_passes.cpp" + "passes/*.cpp" + "lower_to_buda/common.cpp" +) + 
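+# Note: file(GLOB ...) collects these sources at configure time, so CMake must be +# re-run to pick up newly added pass source files.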
+add_library(pybuda_csrc_objs OBJECT ${CPP_FILES}) +target_compile_options(pybuda_csrc_objs PRIVATE ${STATIC_LIB_FLAGS} ${PYBUDA_CSRC_CFLAGS}) +add_dependencies(pybuda_csrc_objs build_tt_mlir) + +### End of pybuda_csrc_objs ### + +######## pybuda_csrc ######## + +set(TTMLIR_LIB_DIR "${CMAKE_SOURCE_DIR}/third_party/tt-mlir/build/lib") +set(TTRUNTIME_LIB_DIR "${CMAKE_SOURCE_DIR}/third_party/tt-mlir/build/runtime/lib") +set(METAL_LIB_DIR "${CMAKE_SOURCE_DIR}/third_party/tt-mlir/third_party/tt-metal/src/tt-metal-build/lib") +set(TORCH_LIB_DIR "${PYBUDA_VENV_DIR}/lib/${PYBUDA_PYTHON_VERSION}/site-packages/torch/lib") + +add_library(pybuda_csrc SHARED) + +set(METAL_LIB_DIR "${CMAKE_SOURCE_DIR}/third_party/tt-mlir/third_party/tt-metal/src/tt-metal-build/lib") + +# Because _ttnn library doesn't have lib prefix, this is workaround to make linking work +add_library(ttnn SHARED IMPORTED) +set_property(TARGET ttnn PROPERTY IMPORTED_LOCATION "${METAL_LIB_DIR}/_ttnn.so") + +target_link_libraries(pybuda_csrc PRIVATE + graph + autograd + shared_utils + backend_api + reportify + tt_torch_device + runtime + pybuda_csrc_objs + + # NOTE: ordering of the libraries will affect the linking + LLVM + MLIR + TTNNTargetFlatbuffer + MLIRTTDialect + MLIRTTIRDialect + MLIRTTNNDialect + MLIRTTKernelDialect + MLIRTTMetalDialect + MLIRTTIRTransforms + MLIRTTNNTransforms + MLIRTTIRAnalysis + MLIRTTNNPipelines + TTMLIRTTNNToEmitC + TTRuntime + TTRuntimeTTNN + tt_metal + device + tt_eager + ttnn + tt_metal + device + tt_eager + flatbuffers + xml2 + curses + z + m + torch_python + c10 + ${PYBUDA_PYTHON_VERSION} + ${TORCH_LIBRARIES} +) + +target_compile_options(pybuda_csrc PRIVATE + ${PYBUDA_CSRC_CFLAGS} + ${CXXFLAGS} + ${SHARED_LIB_FLAGS} +) + +target_link_directories(pybuda_csrc PRIVATE + ${TTMLIR_TOOLCHAIN_DIR}/lib + ${TTMLIR_LIB_DIR} + ${TTRUNTIME_LIB_DIR} + ${METAL_LIB_DIR} + ${TORCH_LIB_DIR}) + +### End of pybuda_csrc ### + +#### Copy python module extension to pybuda directory #### +add_custom_target(run_after_pybuda_csrc ALL + COMMAND mkdir -p ${PYBUDA_VENV_DIR}/lib/${PYBUDA_PYTHON_VERSION}/site-packages/pybuda + COMMAND cp $ ${PYBUDA_VENV_DIR}/lib/${PYBUDA_PYTHON_VERSION}/site-packages/pybuda/_C.so + COMMAND touch -r $ ${PYBUDA_VENV_DIR}/lib/${PYBUDA_PYTHON_VERSION}/site-packages/pybuda/_C.so + COMMAND ln -sf ${PYBUDA_VENV_DIR}/lib/${PYBUDA_PYTHON_VERSION}/site-packages/pybuda/_C.so ${CMAKE_SOURCE_DIR}/pybuda/pybuda/_C.so + COMMENT "Running run_after_pybuda_csrc to copy the python module extension to pybuda directory" + USES_TERMINAL +) + +add_dependencies(run_after_pybuda_csrc pybuda_csrc) diff --git a/pybuda/csrc/autograd/CMakeLists.txt b/pybuda/csrc/autograd/CMakeLists.txt new file mode 100644 index 000000000..7f35e9dd9 --- /dev/null +++ b/pybuda/csrc/autograd/CMakeLists.txt @@ -0,0 +1,8 @@ +add_library(autograd + STATIC + autograd.cpp + binding.cpp + python_bindings.cpp) + +target_compile_options(autograd PRIVATE ${STATIC_LIB_FLAGS} ${PYBUDA_CSRC_CFLAGS}) + diff --git a/pybuda/csrc/autograd/binding.cpp b/pybuda/csrc/autograd/binding.cpp index 8a203eef9..793c084e5 100644 --- a/pybuda/csrc/autograd/binding.cpp +++ b/pybuda/csrc/autograd/binding.cpp @@ -2,7 +2,6 @@ // // SPDX-License-Identifier: Apache-2.0 #include "autograd/binding.hpp" -#include "passes/fuse_ops.hpp" #include @@ -25,84 +24,10 @@ std::tuple> get_op_shape(OpType type, std::vect return std::make_tuple(s, ret[1].cast>()); } -std::tuple> get_fused_op_shape(tt::graphlib::BudaOpNode *op, std::vector &operands) -{ - std::unordered_map buffers; - 
std::vector dim_broadcasts; - std::optional dest; - for (auto schedule : op->get_fused_op()->get_schedules()) - { - for (auto sub_op : schedule.ops) - { - std::vector sub_op_inputs; - for (tt::FusedSubOpInput i : sub_op.inputs) - { - if (i.type == tt::FusedSubOpInput::InputType::INPUT) { - TT_ASSERT(i.index < operands.size(), "Refering to input that doesn't exist for fused op"); - sub_op_inputs.push_back(operands.at(i.index)); - } - else if (i.type == tt::FusedSubOpInput::InputType::DEST) { - TT_ASSERT(dest.has_value(), "Reading from dest that has not value"); - sub_op_inputs.push_back(dest.value()); - dest = std::nullopt; - } - else { - auto it = buffers.find(i.index); - TT_ASSERT(it != buffers.end(), "Referring to intermediate buffer that doesn't exist"); - sub_op_inputs.push_back(it->second); - } - - // All inputs to the fused op are already properly broadcasted. - // But for the sub-op inputs which are outputs of previously executed sub-ops, - // we need to apply broadcast. - // NOTE: We don't need to apply tile broadcasts for shape calculation, since each - // input is at least the size of a tile. - if (i.type != tt::FusedSubOpInput::InputType::INPUT - && i.has_broadcast()) - { - Shape operand_shape = sub_op_inputs.back(); - - int broadcast_dim = i.broadcast.first; - int broadcast_factor = i.broadcast.second; - - OpType broadcast_op = OpType("broadcast", {broadcast_dim, broadcast_factor}, {}); - std::vector shapes = {operand_shape}; - std::tuple> shape_data = get_op_shape(broadcast_op, shapes, true); - - operand_shape = std::get<0>(shape_data); - - sub_op_inputs.pop_back(); - sub_op_inputs.emplace_back(operand_shape); - } - } - - Shape result; - std::vector broadcast; - tie(result, broadcast) = get_op_shape(sub_op.op_type, sub_op_inputs, true); - - if (sub_op.output_type == tt::FusedSubOp::OutputType::OUTPUT) - return std::make_pair(result, dim_broadcasts); - - else if (sub_op.output_type == tt::FusedSubOp::OutputType::DEST) - dest = result; - - else { - // intermed - if (buffers.count((std::uint32_t)sub_op.output_buffer) == 0) - buffers.insert(std::make_pair((std::uint32_t)sub_op.output_buffer, result)); - else - buffers[(std::uint32_t)sub_op.output_buffer] = result; - } - } - } - TT_THROW("Evaluated the full fused op, but haven't reached the output shape."); - return std::make_pair(Shape(), std::vector{}); -} - NodeContext insert_backward( autograd_context context, OpType type, - int operand, + int operand, const std::vector &inputs, NodeContext output, NodeContext gradient) diff --git a/pybuda/csrc/autograd/binding.hpp b/pybuda/csrc/autograd/binding.hpp index 29818f7af..a8c517bd0 100644 --- a/pybuda/csrc/autograd/binding.hpp +++ b/pybuda/csrc/autograd/binding.hpp @@ -23,7 +23,6 @@ using TileDim = tt::TileDim; std::tuple> get_op_shape( OpType type, std::vector &operands, bool is_buda, TileDim tile_dim = TileDim::Dim32x32); -std::tuple> get_fused_op_shape(tt::graphlib::BudaOpNode *op, std::vector &operands); inline Shape get_tm_shape(OpType type, Shape operand, bool is_buda) { Shape shape; diff --git a/pybuda/csrc/autograd/module.mk b/pybuda/csrc/autograd/module.mk index 255de133c..57f24bb81 100644 --- a/pybuda/csrc/autograd/module.mk +++ b/pybuda/csrc/autograd/module.mk @@ -13,7 +13,7 @@ PYBUDA_CSRC_AUTOGRAD_INCLUDES = $(PYBUDA_CSRC_INCLUDES) pybuda/csrc/autograd: $(PYBUDA_CSRC_AUTOGRAD) -$(PYBUDA_CSRC_AUTOGRAD): $(PYBUDA_CSRC_AUTOGRAD_OBJS) $(PYBUDA_CSRC_GRAPH_LIB) +$(PYBUDA_CSRC_AUTOGRAD): $(PYBUDA_CSRC_AUTOGRAD_OBJS) @mkdir -p $(LIBDIR) ar rcs $@ $^ diff --git 
a/pybuda/csrc/backend_api/CMakeLists.txt b/pybuda/csrc/backend_api/CMakeLists.txt new file mode 100644 index 000000000..8fb8f8479 --- /dev/null +++ b/pybuda/csrc/backend_api/CMakeLists.txt @@ -0,0 +1,9 @@ +add_library(backend_api + STATIC + arch_type.cpp + backend_api.cpp) + +target_link_libraries(backend_api shared_utils) +add_dependencies(backend_api shared_utils) +target_compile_options(backend_api PRIVATE ${STATIC_LIB_FLAGS} ${PYBUDA_CSRC_CFLAGS}) + diff --git a/pybuda/csrc/backend_api/arch_type.cpp b/pybuda/csrc/backend_api/arch_type.cpp new file mode 100644 index 000000000..3827ea3e8 --- /dev/null +++ b/pybuda/csrc/backend_api/arch_type.cpp @@ -0,0 +1,49 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 +#include +#include "arch_type.hpp" +#include "shared_utils/string_extension.hpp" + +namespace tt{ + std::string to_string_arch(ARCH arch) + { + switch (arch) + { + case ARCH::GRAYSKULL: + return "GRAYSKULL"; + case ARCH::WORMHOLE_B0: + return "WORMHOLE_B0"; + case ARCH::BLACKHOLE: + return "BLACKHOLE"; + default: + throw std::runtime_error("Unsupported ARCH enum: " + std::to_string(static_cast(arch))); + } + } + + std::string to_string_arch_lower(ARCH arch) + { + return tt::utils::to_lower_string(to_string_arch(arch)); + } + + ARCH to_arch_type(const std::string& arch_string) + { + std::string arch_string_lower = tt::utils::to_upper_string(arch_string); + if (arch_string_lower == "GRAYSKULL") + { + return ARCH::GRAYSKULL; + } + else if (arch_string_lower == "WORMHOLE_B0") + { + return ARCH::WORMHOLE_B0; + } + else if (arch_string_lower == "BLACKHOLE") + { + return ARCH::BLACKHOLE; + } + else + { + throw std::runtime_error("Unsuported tt::ARCH string: " + arch_string_lower); + } + } +} \ No newline at end of file diff --git a/pybuda/csrc/backend_api/arch_type.hpp b/pybuda/csrc/backend_api/arch_type.hpp new file mode 100644 index 000000000..80d9fc085 --- /dev/null +++ b/pybuda/csrc/backend_api/arch_type.hpp @@ -0,0 +1,24 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include + +namespace tt { +enum class ARCH +{ + JAWBRIDGE = 0, + GRAYSKULL = 1, + WORMHOLE = 2, + WORMHOLE_B0 = 3, + BLACKHOLE = 4, + Invalid = 0xFF, +}; + +std::string to_string_arch(ARCH ar); +std::string to_string_arch_lower(ARCH arch); +ARCH to_arch_type(const std::string& arch_string); +} + + diff --git a/pybuda/csrc/backend_api/backend_api.cpp b/pybuda/csrc/backend_api/backend_api.cpp index 97e404ed6..773ec4e4d 100644 --- a/pybuda/csrc/backend_api/backend_api.cpp +++ b/pybuda/csrc/backend_api/backend_api.cpp @@ -1,169 +1,32 @@ // SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC // // SPDX-License-Identifier: Apache-2.0 + #include -#include "yaml-cpp/yaml.h" -#include "backend_api/backend_api.hpp" #include "backend_api/device_config.hpp" #include "utils/assert.hpp" -#include "netlist/tt_backend.hpp" -#include "netlist/tt_backend_api.hpp" -#include "common/env_lib.hpp" - namespace tt { +template +constexpr std::false_type false_type_t{}; + template T DeviceConfig::get(std::string const ¶m, const bool system_level_command) const { - std::string key_value = (system_level_command) ? 
("system-" + param) : (arch_name + "-" + param); - std::string value; - if (system_level_command and this->cached_system_level_params.size() > 0) - { - value = this->cached_system_level_params.at(key_value); - } - else - { - value = ::tt::backend::get_backend_param( - key_value, - this->device_yaml, - this->cluster_config_yaml, - this->runtime_params_yaml, - this->store_backend_db_to_yaml); - } - - if constexpr (std::is_same_v) - { - return value; - } - else if constexpr (std::is_same_v) - { - return std::stoul(value, 0, 0); - } - else if constexpr (std::is_same_v) + TT_ASSERT(false); + if constexpr (std::is_same_v) { - return std::stoi(value, 0, 0); - } - else if constexpr (std::is_same_v) - { - return std::stoull(value, 0, 0); - } - else if constexpr (std::is_same_v) - { - return static_cast(std::stoi(value, 0, 0)); - } - else if constexpr (std::is_same_v) - { - auto delimeter = value.find("-"); - auto x_str = value.substr(0, delimeter); - auto y_str = value.substr(delimeter + 1, std::string::npos); - return CoreCoord(std::stoi(x_str, 0, 0), std::stoi(y_str, 0, 0)); + return CoreCoord(1, 1); } else if constexpr (std::is_same_v) { - auto delimeter = value.find("-"); - auto c_str = value.substr(0, delimeter); - auto r_str = value.substr(delimeter + 1, std::string::npos); - return DeviceGrid(std::stoi(r_str, 0, 0), std::stoi(c_str, 0, 0)); - } - else if constexpr (std::is_same_v) - { - return value; - } - else if constexpr (std::is_same_v>) - { - // Chips with mmio are serialized separated by a dash (eg. '1-2-3') - std::vector chips_with_mmio; - - // Split string and extract chip ids - size_t delimeter = 0; - while((delimeter = value.find("-")) != std::string::npos) - { - std::string curr_str = value.substr(0, delimeter); - chips_with_mmio.push_back(std::stoi(curr_str, 0, 0)); - value.erase(0, delimeter + 1); - } - - return chips_with_mmio; - } - else if constexpr (std::is_same_v>) - { - // Chip locations are serialized separated by a dash (eg. '0,0,0,-1,1,0,-') - std::unordered_map chip_locations; - std::vector temporary_buffer; - - // Split string into temporary buffer for additional processing - size_t delimeter = 0; - while((delimeter = value.find("-")) != std::string::npos) - { - std::string curr_str = value.substr(0, delimeter); - temporary_buffer.push_back(curr_str); - value.erase(0, delimeter + 1); - } - - // Loop through temporary buffer and extract information - for (std::string chip_location : temporary_buffer) - { - // Split string into chip id and chip location portions - size_t delimeter = 0; - std::vector extracted_values; - while((delimeter = chip_location.find(",")) != std::string::npos) - { - std::string curr_str = chip_location.substr(0, delimeter); - extracted_values.push_back(std::stoi(curr_str, 0, 0)); - chip_location.erase(0, delimeter + 1); - } - - // Add chip location to map - chip_locations.insert( - {extracted_values.at(0), - EthCoord( - extracted_values.at(1), extracted_values.at(2), extracted_values.at(3), extracted_values.at(4))}); - } - - return chip_locations; - } - else if constexpr (std::is_same_v>>>) - { - // Ethernet connections are serialized separated by a dash (eg. 
'0,0,1,0,-0,1,1,1,-') - std::unordered_map>> ethernet_connections; - std::vector temporary_buffer; - - // Split string into temporary buffer for additional processing - size_t delimeter = 0; - while((delimeter = value.find("-")) != std::string::npos) - { - std::string curr_str = value.substr(0, delimeter); - temporary_buffer.push_back(curr_str); - value.erase(0, delimeter + 1); - } - - // Loop through temporary buffer and extract information - for (std::string eth_connection : temporary_buffer) - { - // Split string and collect values - size_t delimeter = 0; - std::vector extracted_values; - while((delimeter = eth_connection.find(",")) != std::string::npos) - { - std::string curr_str = eth_connection.substr(0, delimeter); - extracted_values.push_back(std::stoul(curr_str, 0, 0)); - eth_connection.erase(0, delimeter + 1); - } - - // Add values to map - if(ethernet_connections.find(extracted_values[0]) == ethernet_connections.end()) { - ethernet_connections[extracted_values[0]] = {}; - } - ethernet_connections[extracted_values[0]][extracted_values[1]] = std::tuple(extracted_values[2], extracted_values[3]); - } - - return ethernet_connections; + return DeviceGrid(1, 1); } else { - static_assert(false_type_t, "No specialization for type"); + return T(); } } @@ -183,573 +46,10 @@ template std::unordered_map load_cached_sys_param(std::string yaml_file) { std::unordered_map cache; - YAML::Node reference_param_desc = YAML::LoadFile(yaml_file); - for (auto it = reference_param_desc["system_level_params"].begin(); it != reference_param_desc["system_level_params"].end(); it++) - cache[(it -> first).as()] = (it -> second).as(); return cache; } -void DeviceConfig::load_system_level_params() -{ - auto silicon_devices = tt::backend::detect_available_devices(); - if (silicon_devices.size() == 0) // compute machine - this->cached_system_level_params = load_cached_sys_param(this->runtime_params_yaml); -} - -std::vector DeviceConfig::get_harvested_cfg() const -{ - auto silicon_devices = tt::backend::detect_available_devices(); - if (silicon_devices.size() == 0 and this->runtime_params_yaml.empty()) - return std::vector(chip_ids.size(), 0); // assume same harvesting-config among all chips for non-silicon backend - - std::vector ret; - for (auto i : chip_ids) - { - std::string cmd = "device"; - cmd += std::to_string(i); - cmd += "-harvesting_mask"; - uint32_t num = get(cmd, true); - ret.push_back(num); - } - return ret; -} +void DeviceConfig::load_system_level_params() { TT_ASSERT(false); } +std::unordered_map DeviceConfig::get_harvested_cfg() const { TT_ASSERT(false); return {}; } } // namespace tt - -namespace tt::backend_api -{ -using tt_backend_config = tt::tt_backend_config; -using tt_compile_result = tt::tt_compile_result; - -void python_handle_refchange(const void *handle_ptr, bool allocate) -{ - py::handle handle((PyObject *)handle_ptr); - if (allocate) - handle.inc_ref(); - else - handle.dec_ref(); -} - -void BackendModule(py::module &m_backend) { - - - py::class_(m_backend, "BackendConfig") - .def(py::init([]( - tt::DEVICE backend_type, - tt::ARCH backend_device, - tt::DEVICE_MODE device_mode, - int opt_level, - const std::string &output_dir, - const std::string &soc_descriptor_path, - const std::string &cluster_descriptor_path) { - - auto cfg = tt_backend_config{ - .type = backend_type, - .arch = backend_device, - .mode = device_mode, - .output_dir = output_dir, - .soc_descriptor_path = soc_descriptor_path, - .cluster_descriptor_path = cluster_descriptor_path}; - - char *env_opt_level = 
getenv("TT_BACKEND_OPT_LEVEL"); - if (env_opt_level) { - cfg.optimization_level = atoi(env_opt_level); - } - else { - cfg.optimization_level = opt_level; - } - if (backend_type == tt::DEVICE::Golden) { - cfg.ignore_data_format_precision = true; // run backend at full precision by default (on Golden) - } - return cfg; - })) - .def("set_golden_ignore_df_precision", [](tt_backend_config &self, bool ignore_data_format_precision) { - self.ignore_data_format_precision = ignore_data_format_precision; - }) - .def("set_performance_trace_args", [](tt_backend_config &self, std::string args) { - self.perf_desc_args = args; - }) - .def("set_runtime_args", [](tt_backend_config &self, std::string args) { - self.runtime_args = args; - }); - - m_backend.def("get_golden_config", []() { - tt_backend_config cfg = {tt::DEVICE::Golden, tt::ARCH::GRAYSKULL}; - return cfg; - }); - - py::enum_(m_backend, "BackendType") - .value("Golden", tt::DEVICE::Golden) - .value("Model", tt::DEVICE::Model) - .value("Silicon", tt::DEVICE::Silicon) - .value("NoBackend", tt::DEVICE::Invalid) - .def_static("from_string", &tt::get_device_from_string) - .def("to_json", [](const tt::DEVICE backend_type) { - switch (backend_type) - { - case tt::DEVICE::Golden: return "Golden"; - case tt::DEVICE::Model: return "Model"; - case tt::DEVICE::Silicon: return "Silicon"; - case tt::DEVICE::Invalid: return "Invalid"; - default: break; - } - return "Invalid"; - }) - .def("from_json", [](std::string const& encoded) { - static std::unordered_map decode = { - {"Golden", tt::DEVICE::Golden}, - {"Model", tt::DEVICE::Model}, - {"Silicon", tt::DEVICE::Silicon}, - {"NoBackend", tt::DEVICE::Invalid}, - }; - return decode.at(encoded); - }); - - py::enum_(m_backend, "IOType") - .value("Queue", tt::IO_TYPE::Queue) - .value("RandomAccess", tt::IO_TYPE::RandomAccess) - .value("Invalid", tt::IO_TYPE::Invalid); - - py::enum_(m_backend, "IOLayout") - .value("Tilized", tt::IO_LAYOUT::Tilized) - .value("Flat", tt::IO_LAYOUT::Flat) - .value("Invalid", tt::IO_LAYOUT::Invalid); - - py::enum_(m_backend, "BackendDevice") - .value("Grayskull", tt::ARCH::GRAYSKULL) - .value("Wormhole", tt::ARCH::WORMHOLE) - .value("Wormhole_B0", tt::ARCH::WORMHOLE_B0) - .value("Invalid", tt::ARCH::Invalid) - .def("to_string", &tt::get_string_lowercase) - .def_static("from_string", &tt::get_arch_from_string) - .def("to_json", [](const tt::ARCH backend_device) { - switch (backend_device) - { - case tt::ARCH::GRAYSKULL: return "Grayskull"; - case tt::ARCH::WORMHOLE: return "Wormhole"; - case tt::ARCH::WORMHOLE_B0: return "Wormhole_B0"; - case tt::ARCH::Invalid: return "Invalid"; - default: break; - } - return "Invalid"; - }) - .def("from_json", [](std::string const& encoded) { - static std::unordered_map decode = { - {"Grayskull", tt::ARCH::GRAYSKULL}, - {"Wormhole", tt::ARCH::WORMHOLE}, - {"Wormhole_B0", tt::ARCH::WORMHOLE_B0}, - {"Invalid", tt::ARCH::Invalid}, - }; - return decode.at(encoded); - }); - - py::enum_(m_backend, "DeviceMode") - .value("CompileAndRun", tt::DEVICE_MODE::CompileAndRun) - .value("CompileOnly", tt::DEVICE_MODE::CompileOnly) - .value("RunOnly", tt::DEVICE_MODE::RunOnly) - .def( - "to_json", - [](tt::DEVICE_MODE d) { - switch (d) - { - case tt::DEVICE_MODE::CompileAndRun: return "CompileAndRun"; - case tt::DEVICE_MODE::CompileOnly: return "CompileOnly"; - case tt::DEVICE_MODE::RunOnly: return "RunOnly"; - default: break; - } - return "Invalid"; - }) - .def("from_json", [](std::string const &encoded) { - static std::unordered_map decode = { - {"CompileAndRun", 
tt::DEVICE_MODE::CompileAndRun}, - {"CompileOnly", tt::DEVICE_MODE::CompileOnly}, - {"RunOnly", tt::DEVICE_MODE::RunOnly}, - }; - return decode.at(encoded); - }); - - py::class_(m_backend, "StrideDescriptor") - .def(py::init<>()) - .def_readwrite("xy_offsets", &tt::Stride::xy_offsets) - .def_readwrite("stride", &tt::Stride::stride) - .def(py::pickle( - [](const tt::Stride &s) { // __getstate__ - return py::make_tuple( - s.xy_offsets, - s.stride); - }, - [](py::tuple t) { // __setstate__ - if (t.size() != 2) - { - throw std::runtime_error("Invalid state for tt::Stride"); - } - - tt::Stride s; - - s.xy_offsets = t[0].cast>>(); - s.stride = t[1].cast(); - - return s; - } - )); - - py::class_(m_backend, "DramIODesc") - .def_property_readonly("name", [](tt::tt_dram_io_desc &self) { return self.queue_name; }) - .def_property_readonly("data_format", [](tt::tt_dram_io_desc &self) { return self.bufq_target_format; }) - .def_readwrite("bufq_grid_dim_r", &tt::tt_dram_io_desc::bufq_grid_dim_r) - .def_readwrite("bufq_grid_dim_c", &tt::tt_dram_io_desc::bufq_grid_dim_c) - .def_readwrite("ublock_rt", &tt::tt_dram_io_desc::ublock_rt) - .def_readwrite("ublock_ct", &tt::tt_dram_io_desc::ublock_ct) - .def_readwrite("mblock_m", &tt::tt_dram_io_desc::mblock_m) - .def_readwrite("mblock_n", &tt::tt_dram_io_desc::mblock_n) - .def_readwrite("tile_height", &tt::tt_dram_io_desc::tile_height) - .def_readwrite("tile_width", &tt::tt_dram_io_desc::tile_width) - .def_readwrite("t", &tt::tt_dram_io_desc::t) - .def_readwrite("hstack_factor", &tt::tt_dram_io_desc::hstack_factor) - .def_readwrite("vstack_factor", &tt::tt_dram_io_desc::vstack_factor) - .def_readwrite("stack_row_major", &tt::tt_dram_io_desc::stack_row_major) - .def_readwrite("s_descriptor", &tt::tt_dram_io_desc::s_descriptor) - .def_readwrite("input_count", &tt::tt_dram_io_desc::input_count) - .def_readwrite("netlist_path", &tt::tt_dram_io_desc::netlist_path) - .def(py::pickle( - [](const tt::tt_dram_io_desc &p) { // __getstate__ - return py::make_tuple( - p.netlist_path, - p.queue_name, - p.bufq_grid_dim_r, - p.bufq_grid_dim_c, - p.bufq_num_slots, - p.ublock_rt, - p.ublock_ct, - p.mblock_m, - p.mblock_n, - p.tile_height, - p.tile_width, - p.t, - p.input_count, - p.hstack_factor, - p.vstack_factor, - p.stack_row_major, - p.bufq_target_format, - p.bufq_start_addr_channel, - p.bufq_entry_size, - p.io_type, - p.s_descriptor, - p.backend_type, - p.layout); - }, - [](py::tuple t) { // __setstate__ - if (t.size() != 23) - throw std::runtime_error("tt::tt_dram_io_desc: Invalid state!"); - - tt::tt_dram_io_desc p; - p.netlist_path = t[0].cast(); - p.queue_name = t[1].cast(); - p.bufq_grid_dim_r = t[2].cast(); - p.bufq_grid_dim_c = t[3].cast(); - p.bufq_num_slots = t[4].cast(); - p.ublock_rt = t[5].cast(); - p.ublock_ct = t[6].cast(); - p.mblock_m = t[7].cast(); - p.mblock_n = t[8].cast(); - p.tile_height = t[9].cast(); - p.tile_width = t[10].cast(); - p.t = t[11].cast(); - p.input_count = t[12].cast(); - p.hstack_factor = t[13].cast(); - p.vstack_factor = t[14].cast(); - p.stack_row_major = t[15].cast(); - p.bufq_target_format = t[16].cast(); - p.bufq_start_addr_channel = t[17].cast>>(); - p.bufq_entry_size = t[18].cast(); - p.io_type = t[19].cast(); - p.s_descriptor = t[20].cast(); - p.backend_type = t[21].cast(); - p.layout = t[22].cast(); - - TT_ASSERT( - tt::backend::translate_addresses(p) == tt::DEVICE_STATUS_CODE::Success, - "Failed to translate addresses for " + p.queue_name); - return p; - })); - - py::class_(m_backend, "PytorchTensorDesc", 
py::buffer_protocol()) - .def(py::init([]() { - return tt_PytorchTensorDesc(); - })) - .def(py::init([](py::object pytorch_tensor, std::uint32_t itemsize, tt::DataFormat format, - std::uint32_t dim, - std::array shape, - std::array strides) { - - auto ptr = pytorch_tensor.attr("data_ptr")().cast(); - py::handle handle = pytorch_tensor.release(); - - return tt_PytorchTensorDesc( - (void *)ptr, itemsize, format, shape, strides, dim, (void*)handle.ptr(), python_handle_refchange); - })) - .def(py::init([](void *buffer, std::uint32_t itemsize, tt::DataFormat format, - std::uint32_t dim, - std::array shape, - std::array strides) { - - return tt_PytorchTensorDesc(buffer, itemsize, format, shape, strides, dim); - })) - .def_readwrite("itemsize", &tt_PytorchTensorDesc::itemsize) - .def_readwrite("format", &tt_PytorchTensorDesc::format) - .def_readwrite("shape", &tt_PytorchTensorDesc::shape) - .def_readwrite("strides", &tt_PytorchTensorDesc::strides) - .def_readwrite("dim", &tt_PytorchTensorDesc::dim) - .def("print", [](tt::tt_PytorchTensorDesc &self) { - std::cout << "Descriptor: ptr=" << (std::uint64_t)self.ptr << - ", itemsize=" << self.itemsize << - ", format =" << (int)self.format << - ", dim =" << self.dim << - ", shape =" << self.shape[0] << "," << self.shape[1] << "," << self.shape[2] << "," << self.shape[3] << - ", strides =" << self.strides[0] << "," << self.strides[1] << "," << self.strides[2] << "," << self.strides[3] << std::endl; - }) - .def_buffer([](tt::tt_PytorchTensorDesc &desc) -> py::buffer_info { - - // Mostly irrelevant since we'll be turning this into torch tensor with its - // own format. However, this could cause numpy to interpret the data wrong - std::string data_format = py::format_descriptor::format(); - return py::buffer_info( - const_cast(desc.ptr), - desc.itemsize, - data_format, - 4, - desc.shape, - desc.strides); - }) - .def(py::pickle( - [](const tt::tt_PytorchTensorDesc &t) { // __getstate__ - return py::make_tuple( - reinterpret_cast(t.ptr), - t.itemsize, - t.format, - t.shape, - t.strides, - t.dim); - }, - [](py::tuple t) { // __setstate__ - if (t.size() != 6) - throw std::runtime_error("tt::tt_PytorchTensorDesc: Invalid state!"); - - tt::tt_PytorchTensorDesc p; - p.ptr = reinterpret_cast(t[0].cast()); - p.itemsize = t[1].cast(); - p.format = t[2].cast(); - p.shape = t[3].cast>(); - p.strides = t[4].cast>(); - p.dim = t[5].cast(); - return p; - } - )); - - py::class_(m_backend, "TilizedTensorDesc") - .def(py::init<>()) - .def_readwrite("num_buffers", &tt::tt_TilizedTensorDesc::num_buffers) - .def_readwrite("buf_size_bytes", &tt::tt_TilizedTensorDesc::buf_size_bytes) - .def_readwrite("format", &tt::tt_TilizedTensorDesc::format) - .def("print", [](tt::tt_TilizedTensorDesc &self) { - std::cout << "Descriptor: ptr=" << (std::uint64_t)self.ptr << - ", num_buffers=" << self.num_buffers << - ", buf_size_bytes=" << (int)self.buf_size_bytes << - ", format =" << self.format; - }) - .def(py::pickle( - [](const tt::tt_TilizedTensorDesc &t) { // __getstate__ - return py::make_tuple( - t.num_buffers, - t.buf_size_bytes, - t.format); - }, - [](py::tuple t) { // __setstate__ - if (t.size() != 3) - throw std::runtime_error("tt::tt_TilizedTensorDesc: Invalid state!"); - - return tt::tt_TilizedTensorDesc( - nullptr, - t[0].cast(), - t[1].cast(), - t[2].cast() - ); - } - )); - - - py::class_(m_backend, "BackendDeviceDesc") - .def(py::init<>()) - .def_readonly("arch", ¶m::DeviceDesc::arch) - .def_readonly("soc_desc_yaml", ¶m::DeviceDesc::soc_desc_yaml) - .def_readonly("mmio", 
¶m::DeviceDesc::mmio) - .def_readonly("harvesting_mask", ¶m::DeviceDesc::harvesting_mask); - - py::class_(m_backend, "OpModelDesc") - .def(py::init<>()) - .def_readwrite("type", &tt_op_model_desc::type) - .def_readwrite("arch", &tt_op_model_desc::arch) - .def_readwrite("data_format", &tt_op_model_desc::data_format) - .def_readwrite("math_fidelity", &tt_op_model_desc::math_fidelity) - .def_readwrite("t", &tt_op_model_desc::t) - .def_readwrite("mblock_m", &tt_op_model_desc::mblock_m) - .def_readwrite("mblock_n", &tt_op_model_desc::mblock_n) - .def_readwrite("ublock_rt", &tt_op_model_desc::ublock_rt) - .def_readwrite("ublock_ct", &tt_op_model_desc::ublock_ct) - .def_readwrite("mblock_k", &tt_op_model_desc::mblock_k) - .def_readwrite("ublock_kt", &tt_op_model_desc::ublock_kt) - .def_readwrite("sparse_indices", &tt_op_model_desc::sparse_indices) - .def_readwrite("sparse_nz_ublocks", &tt_op_model_desc::sparse_nz_ublocks) - .def_readwrite("sparse_nz_strips", &tt_op_model_desc::sparse_nz_strips) - .def_readwrite("approx_mode", &tt_op_model_desc::approx_mode) - .def_readwrite("op_attr", &tt_op_model_desc::op_attr) - .def_readwrite("reduce_z", &tt_op_model_desc::reduce_z); - - py::enum_(m_backend, "BackendStatusCode") - .value("Success", tt::DEVICE_STATUS_CODE::Success) - .value("RuntimeError", tt::DEVICE_STATUS_CODE::RuntimeError) - .value("TimeoutError", tt::DEVICE_STATUS_CODE::TimeoutError); - - py::enum_(m_backend, "BackendCompileFailure") - .value("BriscCompile", tt::COMPILE_FAILURE::BriscCompile) - .value("EriscCompile",tt::COMPILE_FAILURE::EriscCompile) - .value("NriscCompile",tt::COMPILE_FAILURE::NriscCompile) - .value("Net2Pipe",tt::COMPILE_FAILURE::Net2Pipe) - .value("PipeGen",tt::COMPILE_FAILURE::PipeGen) - .value("BlobGen",tt::COMPILE_FAILURE::BlobGen) - .value("L1Size",tt::COMPILE_FAILURE::L1Size) - .value("OverlaySize",tt::COMPILE_FAILURE::OverlaySize) - .value("Invalid",tt::COMPILE_FAILURE::Invalid); - - py::class_(m_backend, "BackendCompileResult") - .def(py::init<>()) - .def_readwrite("success", &tt_compile_result::success) - .def_readwrite("failure_type", &tt_compile_result::failure_type) - .def_readwrite("failure_message", &tt_compile_result::failure_message) - .def_readwrite("failure_target",&tt_compile_result::failure_target) - .def_readwrite("device_id",&tt_compile_result::device_id) - .def_readwrite("temporal_epoch_id", &tt_compile_result::temporal_epoch_id) - .def_readwrite("logical_core_x",&tt_compile_result::logical_core_x) - .def_readwrite("logical_core_y", &tt_compile_result::logical_core_y) - .def_readwrite("extra_size_bytes", &tt_compile_result::extra_size_bytes); - - - py::class_>(m_backend, "BackendApi") - .def(py::init(py::overload_cast(&tt_backend::create))) - .def("initialize", py::overload_cast<>(&tt_backend::initialize), py::call_guard()) - .def("initialize", py::overload_cast(&tt_backend::initialize), py::call_guard()) - .def("finish", &tt_backend::finish) - .def("run_program", &tt_backend::run_program, py::call_guard()) - .def("wait_for_idle", &tt_backend::wait_for_idle, py::call_guard()) - - .def("get_queue_descriptor", &tt_backend::get_queue_descriptor); - - // Explicitly release the backend pointer - m_backend.def("release_backend_ptr", [](std::shared_ptr backend) { - backend.reset(); - }); - - m_backend.def( - "clear_backend_param_cache", - &tt::backend::clear_backend_param_cache_v2); - - m_backend.def("get_op_model_execution_cycles", &tt::backend::get_op_model_execution_cycles); - m_backend.def("get_op_model_param", &tt::backend::get_op_model_param); - - 
m_backend.def( - "push_input", - py::overload_cast< - const tt::tt_dram_io_desc&, - const tt::tt_PytorchTensorDesc&, - const bool, const int, const int>(&tt::backend::push_input), py::call_guard()); - m_backend.def( - "push_input", - py::overload_cast< - const tt::tt_dram_io_desc&, - const tt::tt_TilizedTensorDesc&, - const int, const int>(&tt::backend::push_input), py::call_guard()); - m_backend.def("pop_output", &tt::backend::pop_output, py::call_guard()); - m_backend.def("get_output", &tt::backend::get_output, py::call_guard()); - m_backend.def("free_tensor", &tt::backend::free_tensor); - m_backend.def("free_tensor", &tt::backend::free_tensor); - m_backend.def("tilize_tensor", &tt::backend::tilize_tensor); - m_backend.def("binarize_tensor", &tt::backend::binarize_tensor); - m_backend.def("binarize_tensor", &tt::backend::binarize_tensor); - m_backend.def("debinarize_tensor", &tt::backend::debinarize_tensor); - m_backend.def("debinarize_tensor", &tt::backend::debinarize_tensor); - - m_backend.def( - "get_io_size_in_bytes", - &tt::backend::get_io_size_in_bytes, - py::arg("data_formati"), - py::arg("is_untilizesd"), - py::arg("ublock_ct"), - py::arg("ublock_rt"), - py::arg("mblock_m"), - py::arg("mblock_n"), - py::arg("t"), - py::arg("entries"), - py::arg("tile_height") = 32, - py::arg("tile_width") = 32); - m_backend.def("get_next_aligned_address", &tt::backend::get_next_aligned_address); - - m_backend.def("translate_addresses", &tt::backend::translate_addresses, py::call_guard()); - - m_backend.def( - "detect_available_silicon_devices", &tt::backend::detect_available_devices, py::arg("only_detect_mmio") = true); - m_backend.def( - "get_device_descs_for_available_devices", - &tt::backend::get_device_descs_for_available_devices, - py::arg("out_dir") = std::string("./tt_build")); - - m_backend.def( - "get_custom_device_desc", - &tt::backend::get_custom_device_desc, - py::arg("arch") = tt::ARCH::Invalid, - py::arg("mmio") = false, - py::arg("harvesting_mask") = 0u, - py::arg("grid_dim") = std::make_pair(0, 0), - py::arg("out_dir") = std::string("./tt_build")); - m_backend.def("get_device_cluster_yaml", &tt::backend::get_device_cluster_yaml_v2, py::arg("out_dir")); - m_backend.def("initialize_child_process", &tt::backend::initialize_child_process); - m_backend.def("finish_child_process", &tt::backend::finish_child_process); - m_backend.def("load_cached_sys_param", &tt::load_cached_sys_param); - - py::class_(m_backend, "DeviceGrid") - .def(py::init>()) - .def_readonly("r", &DeviceGrid::r) - .def_readonly("c", &DeviceGrid::c); - - py::class_(m_backend, "DeviceConfig") - .def(py::init< - std::string, - std::string, - std::string, - std::string, - std::string, - bool, - std::vector>()) - .def(py::init< - std::string, - std::string, - std::string, - std::string, - std::string, - bool, - std::vector>>()) - .def("get_harvested_cfg", &DeviceConfig::get_harvested_cfg) - .def("get_ethernet_connections", &DeviceConfig::get_ethernet_connections) - .def("get_dram_backend_reserved_max", &DeviceConfig::get_dram_backend_reserved_max) - .def("get_host_memory_channel_start_address", &DeviceConfig::get_host_memory_channel_start_address) - .def("get_host_memory_num_channels", &DeviceConfig::get_host_memory_num_channels) - .def("get_host_memory_channel_size", &DeviceConfig::get_host_memory_channel_size) - .def_property_readonly( - "arch", [](DeviceConfig const &dc) -> tt::ARCH { return get_arch_from_string(dc.arch_name); }) - .def_readonly("arch_name", &DeviceConfig::arch_name) - .def_readonly("device_yaml", 
&DeviceConfig::device_yaml) - .def_readonly("cluster_config_yaml", &DeviceConfig::cluster_config_yaml) - .def_readonly("backend_type", &DeviceConfig::backend_type) - .def_readonly("grid_size", &DeviceConfig::grid_size) - .def_readonly("chip_ids", &DeviceConfig::chip_ids); -} -} // namespace tt::backend_api diff --git a/pybuda/csrc/backend_api/backend_api.hpp b/pybuda/csrc/backend_api/backend_api.hpp deleted file mode 100644 index 4bf536a33..000000000 --- a/pybuda/csrc/backend_api/backend_api.hpp +++ /dev/null @@ -1,16 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include -#include -#include -namespace py = pybind11; - -namespace tt { -namespace backend_api { - -void BackendModule(py::module &m_backend); - -} -} - diff --git a/pybuda/csrc/backend_api/device_config.hpp b/pybuda/csrc/backend_api/device_config.hpp index ada27b5d6..2ddf39681 100644 --- a/pybuda/csrc/backend_api/device_config.hpp +++ b/pybuda/csrc/backend_api/device_config.hpp @@ -12,6 +12,7 @@ #include #include +#include "arch_type.hpp" #include "utils/assert.hpp" #include "utils/env.hpp" #include "utils/logger.hpp" @@ -25,6 +26,7 @@ struct DeviceGrid DeviceGrid(int r, int c) : r(r), c(c) {} DeviceGrid(std::pair p) : r(p.first), c(p.second) {} + int size() const { return r * c; } }; struct CoreCoord @@ -54,6 +56,7 @@ struct EthCoord struct DeviceConfig { std::string arch_name; + ARCH arch; std::string device_yaml; std::string cluster_config_yaml; std::string runtime_params_yaml; @@ -92,12 +95,14 @@ struct DeviceConfig store_backend_db_to_yaml(store_backend_db_to_yaml), grid_size(get("t6-grid_size", false)) { + arch = to_arch_type(arch_name); + // Constructor - used only by unittesting. if (skip_backend_queries) return; // Get backend related parameters - if (this->is_wormhole()) + if (this->is_wormhole_b0()) { // Load and cache system-level params if needed if (this->backend_type == "silicon") @@ -208,19 +213,25 @@ struct DeviceConfig } } - inline bool is_grayskull() const { return arch_name.find("grayskull") != std::string::npos; } - inline bool is_wormhole() const { return arch_name.find("wormhole") != std::string::npos; } - inline bool is_wormhole_b0() const { return arch_name.find("wormhole_b0") != std::string::npos; } + // Get if the device is a blackhole + inline bool is_blackhole() const { return arch == ARCH::BLACKHOLE; } + + // Get if the device is a wormhole_b0 + // During the onboarding process of the blackhole architecture, + // we temporarily treat it as equivalent to the Wormhole_b0 architecture. 
+ inline bool is_wormhole_b0() const { return arch == ARCH::WORMHOLE_B0 || is_blackhole(); } + + // Get if the device is a grayskull + inline bool is_grayskull() const { return arch == ARCH::GRAYSKULL; } template T get(std::string const ¶m, const bool system_level_command) const; void load_system_level_params(); - std::vector get_harvested_cfg() const; + std::unordered_map get_harvested_cfg() const; - std::size_t get_dst_size() const { return get("t6-dst_size", false); } std::size_t get_clock_freq() const { - return 1000000000; // tenstorrent/budabackend#1912 + return 1000000000; } std::uint32_t get_host_memory_num_channels() const { @@ -246,27 +257,15 @@ struct DeviceConfig static size_t overlay_blob_extra_size = env_as("TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE", 0); return overlay_blob_extra_size; } - std::size_t get_l1_backend_reserved_size() const - { - // BBE will account for extra blob size (TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE) in the reserved size - // - auto reserved_size = get("t6-l1_backend_reserved_size", false); - static auto extra_l1_margin = env_as("PYBUDA_EXTRA_L1_MARGIN"); - if (reserved_size < (std::uint32_t)extra_l1_margin) - return 0; - - return reserved_size - extra_l1_margin; - } - std::size_t get_l1_usable_size() const { return get_l1_size() - get_l1_backend_reserved_size(); } + std::size_t get_l1_usable_size() const { return get_l1_size(); } std::size_t get_l1_dram_io_backend_reserved_size() const { // Get this number from DB query: - // tenstorrent/budabackend#1979 return 100 * 1024; } std::size_t get_noc_bandwidth_bytes_per_cycle() const { - return 32; // tenstorrent/budabackend#1912 + return 32; } std::uint32_t get_dram_num_channels() const { return get("dram-num_channels", false); } std::uint32_t get_dram_num_subchannels() const @@ -297,7 +296,8 @@ struct DeviceConfig } CoreCoord get_dram_core_coord(std::uint32_t channel, std::uint32_t subchannel) const { - if (is_grayskull()) + // Emulation device has only one dram channel + if (is_grayskull() || this->backend_type == "emulation") { return get("dram-core_xy_chan" + std::to_string(channel), false); } @@ -440,13 +440,6 @@ inline std::ostream& operator<<(std::ostream& os, DeviceConfig const& device_con os << indent << ".runtime_params_yaml = " << device_config.runtime_params_yaml << "," << std::endl; os << indent << ".grid_size = {" << device_config.grid_size.r << ", " << device_config.grid_size.c << "}" << "," << std::endl; - os << indent << ".get_dst_size = " << device_config.get_dst_size() << "," << std::endl; - os << indent << ".get_l1_size = " << device_config.get_l1_size() << "," << std::endl; - os << indent << ".get_l1_backend_reserved_size = " << device_config.get_l1_backend_reserved_size() << "," - << std::endl; - os << indent << ".get_l1_usable_size = " << device_config.get_l1_usable_size() << "," << std::endl; - os << indent << ".get_dram_num_channels = " << device_config.get_dram_num_channels() << "," << std::endl; - os << indent << ".get_dram_channel_capacity = " << device_config.get_dram_channel_capacity() << "," << std::endl; os << indent << ".supports_fp32_accumulation = " << device_config.supports_fp32_accumulation() << "," << std::endl; os << indent << ".supports_stochastic_rounding = " << device_config.supports_stochastic_rounding() << "," << std::endl; diff --git a/pybuda/csrc/backend_api/module.mk b/pybuda/csrc/backend_api/module.mk index b2f79cfcf..4605c1a95 100644 --- a/pybuda/csrc/backend_api/module.mk +++ b/pybuda/csrc/backend_api/module.mk @@ -1,21 +1,9 @@ # Every variable in subdir must 
be prefixed with subdir (emulating a namespace) -BACKEND_CONFIG ?= release -BACKEND_ARCH_NAME ?= grayskull -BACKEND_CC ?= gcc -BACKEND_CXX ?= g++ - -BACKEND_INCLUDES = -Ithird_party/budabackend - -BUDABACKEND_LIBDIR = third_party/budabackend/build/lib -BUDABACKEND_LIB = $(BUDABACKEND_LIBDIR)/libtt.so -BUDABACKEND_DEVICE = $(BUDABACKEND_LIBDIR)/libdevice.so -BUDABACKEND_NET2PIPE = third_party/budabackend/build/bin/net2pipe -BUDABACKEND_PIPEGEN = third_party/budabackend/build/bin/pipegen2 - PYBUDA_CSRC_BACKENDAPI_LIB = $(LIBDIR)/libbackend_api.a PYBUDA_CSRC_BACKENDAPI_SRCS += \ - pybuda/csrc/backend_api/backend_api.cpp + pybuda/csrc/backend_api/backend_api.cpp \ + pybuda/csrc/backend_api/arch_type.cpp PYBUDA_CSRC_BACKENDAPI_INCLUDES = $(PYBUDA_CSRC_INCLUDES) $(BACKEND_INCLUDES) @@ -24,32 +12,10 @@ PYBUDA_CSRC_BACKENDAPI_DEPS = $(addprefix $(OBJDIR)/, $(PYBUDA_CSRC_BACKENDAPI_S -include $(PYBUDA_CSRC_BACKENDAPI_DEPS) -third_party/budabackend/clean: - #ROOT=third_party/budabackend $(MAKE) -C third_party/budabackend clean - cd third_party/budabackend; rm -rf build - -third_party/budabackend: $(SUBMODULESDIR)/third_party/budabackend.build ; - -DEVICE_VERSIM_INSTALL_ROOT ?= third_party/budabackend -$(SUBMODULESDIR)/third_party/budabackend.build: $(SUBMODULESDIR)/third_party/budabackend.checkout - CC=$(BACKEND_CC) CXX=$(BACKEND_CXX) CONFIG=$(BACKEND_CONFIG) ARCH_NAME=$(BACKEND_ARCH_NAME) DEVICE_VERSIM_INSTALL_ROOT=$(DEVICE_VERSIM_INSTALL_ROOT) ROOT=$(PWD)/third_party/budabackend $(MAKE) -C third_party/budabackend backend build_hw - touch $@ - -.PHONY: third_party/budabackend/netlist_analyzer -third_party/budabackend/netlist_analyzer: - CONFIG=$(BACKEND_CONFIG) ARCH_NAME=$(BACKEND_ARCH_NAME) DEVICE_VERSIM_INSTALL_ROOT=$(DEVICE_VERSIM_INSTALL_ROOT) ROOT=$(PWD)/third_party/budabackend $(MAKE) -C third_party/budabackend netlist_analyzer/tests - -$(BUDABACKEND_DEVICE): third_party/budabackend ; -$(BUDABACKEND_LIB): third_party/budabackend ; -$(BUDABACKEND_NET2PIPE): third_party/budabackend ; -$(BUDABACKEND_PIPEGEN): third_party/budabackend ; - -third_party/budabackend/src/net2pipe: $(BUDABACKEND_NET2PIPE) $(BUDABACKEND_PIPEGEN) ; - # Each module has a top level target as the entrypoint which must match the subdir name -pybuda/csrc/backend_api: $(PYBUDA_CSRC_BACKENDAPI_LIB) $(BUDABACKEND_LIB) $(BUDABACKEND_DEVICE) ; +pybuda/csrc/backend_api: $(PYBUDA_CSRC_BACKENDAPI_LIB) $(PYBUDA_CSRC_SHARED_UTILS_LIB) ; -$(PYBUDA_CSRC_BACKENDAPI_LIB): $(PYBUDA_CSRC_BACKENDAPI_OBJS) $(BUDABACKEND_LIB) $(BUDABACKEND_DEVICE) +$(PYBUDA_CSRC_BACKENDAPI_LIB): $(PYBUDA_CSRC_BACKENDAPI_OBJS) @mkdir -p $(LIBDIR) ar rcs $@ $^ diff --git a/pybuda/csrc/balancer/balancer.cpp b/pybuda/csrc/balancer/balancer.cpp deleted file mode 100644 index b58f67d36..000000000 --- a/pybuda/csrc/balancer/balancer.cpp +++ /dev/null @@ -1,652 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "balancer/balancer.hpp" - -#include -#include -#include -#include -#include - -#include "balancer/balancer_cache_collection.hpp" -#include "balancer/legalizer/legalizer.hpp" -#include "balancer/policies/policies.hpp" -#include "balancer/policies/policy_types.hpp" -#include "balancer/policies/policy_utils.hpp" -#include "balancer/python_interface.hpp" -#include "graph_lib/node_types.hpp" -#include "passes/passes_utils.hpp" -#include "placer/epoch_placer.hpp" -#include "placer/placer.hpp" -#include "python_bindings_common.hpp" - -using NodeType = tt::graphlib::NodeType; - -namespace tt::balancer -{ - 
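The DeviceConfig changes earlier in this diff replace substring checks on arch_name with an ARCH enum filled in by to_arch_type(); arch_type.hpp/arch_type.cpp are new in this change but are not shown in this hunk, so the following is only a rough sketch of what such a mapping could look like. The members GRAYSKULL, WORMHOLE_B0 and BLACKHOLE appear in the diff; everything else here (the Invalid member, the matching rules) is an assumption:

#include <algorithm>
#include <cctype>
#include <string>

enum class ARCH { GRAYSKULL, WORMHOLE_B0, BLACKHOLE, Invalid };

// Map a backend arch string such as "wormhole_b0" to the enum DeviceConfig caches.
inline ARCH to_arch_type(std::string name)
{
    std::transform(name.begin(), name.end(), name.begin(),
                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
    if (name.find("wormhole_b0") != std::string::npos) return ARCH::WORMHOLE_B0;
    if (name.find("grayskull") != std::string::npos) return ARCH::GRAYSKULL;
    if (name.find("blackhole") != std::string::npos) return ARCH::BLACKHOLE;
    return ARCH::Invalid;
}

With the enum cached once in the constructor, is_grayskull()/is_wormhole_b0()/is_blackhole() become simple comparisons, and is_wormhole_b0() can temporarily report true for BLACKHOLE during onboarding, as the comment in device_config.hpp above explains.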
-std::ostream& operator<<(std::ostream& stream, PolicyType policy_type) -{ - switch (policy_type) - { - case PolicyType::MaximizeTMinimizeGrid: stream << "PolicyType::MaximizeTMinimizeGrid"; break; - case PolicyType::MinimizeGrid: stream << "PolicyType::MinimizeGrid"; break; - case PolicyType::Random: stream << "PolicyType::Random"; break; - case PolicyType::NLP: stream << "PolicyType::NLP"; break; - case PolicyType::CNN: stream << "PolicyType::CNN"; break; - case PolicyType::Ribbon: stream << "PolicyType::Ribbon"; break; - default: stream << "PolicyType::Unknown"; break; - } - return stream; -} - -std::ostream& operator<<(std::ostream& os, BalancerConfig const& config) -{ - os << "BalancerConfig{" << std::endl; - os << " .device_config.arch_name = " << config.device_config.arch_name << std::endl; - os << " .device_config.device_yaml = " << config.device_config.device_yaml << std::endl; - os << " .policy_type = " << config.policy_type << std::endl; - os << " .random_policy_seed = " << config.random_policy_seed << std::endl; - os << " .num_chips = " << config.chip_ids.size() << std::endl; - os << " .skip_l1_usage_validation = " << config.skip_l1_usage_validation << std::endl; - os << " .enable_t_streaming = " << config.enable_t_streaming << std::endl; - os << "}"; - return os; -} - -static std::vector cut_edges(BalancerConfig const& config, graphlib::Graph const* graph) -{ - // Prevent duplicates coming from config. - // - std::unordered_set edges; - - for (auto const& epoch : config.op_names_to_epoch_break) - { - for (auto const& op_name : epoch) - { - if (not graph->has_node_with_name(op_name)) - continue; - graphlib::Node* consumer = graph->get_node_by_name(op_name); - for (auto edge : graph->operand_data_edges(consumer)) - { - if (graph->node_by_id(edge.producer_node_id)->node_type() != NodeType::kBudaOp) - continue; - - if (edges.count(edge) == 0) - { - edges.insert(edge); - } - } - } - } - - return std::vector(edges.begin(), edges.end()); -} - -legalizer::GraphSolver get_graph_solver( - BalancerConfig const& config, - std::shared_ptr cache_collection, - graphlib::Graph* graph, - LegalOpModels const& legal_op_models, - bool use_op_model_recalculation) -{ - if (config.device_config.is_grayskull()) - { - return legalizer::GraphSolver::create( - graph, legal_op_models, config, cache_collection, use_op_model_recalculation, cut_edges(config, graph)); - } - else if (config.device_config.is_wormhole()) - { - return legalizer::GraphSolver::create( - graph, legal_op_models, config, cache_collection, use_op_model_recalculation, cut_edges(config, graph)); - } - - log_fatal("Unknown device arch name: {}", config.device_config.arch_name); -} - -static void add_broadcasts_for_sparse_inputs_0_2( - graphlib::Graph const* graph, graphlib::Node const* node, OpModel const& op_model) -{ - // If fracture factor > 1, inputs have already been properly sized to target eltwise pipes scenario - // The reason they can't be handled here as well is that fractured convs have different inputs 0 and - // 2 for each c-dim core - if (op_model.fracture_factor > 1) - { - return; - } - - // No need to broadcast in this case - if (op_model.grid_shape.c == 1) - { - return; - } - - std::vector in_edges = graph->operand_data_edges(node); - TT_ASSERT(in_edges.size() == 3 || in_edges.size() == 4); // 4 with bias - - std::shared_ptr input0_edge_attrs = graph->get_edge_attributes(in_edges[0]); - std::shared_ptr input2_edge_attrs = graph->get_edge_attributes(in_edges[2]); - - std::vector input0_edge_tms = 
input0_edge_attrs->get_tms(); - std::vector input2_edge_tms = input2_edge_attrs->get_tms(); - - // Append tms - tt::graphlib::OpType broadcast_to_append = graphlib::OpType("broadcast", {3, op_model.grid_shape.c, true}, {}); - - input0_edge_tms.push_back(broadcast_to_append); - input2_edge_tms.push_back(broadcast_to_append); - - input0_edge_attrs->set_tms(input0_edge_tms); - input2_edge_attrs->set_tms(input2_edge_tms); -} - -static void insert_sparse_fracturing_tms( - graphlib::Graph const* graph, graphlib::Node const* node, OpModel const& op_model) -{ - // Out - std::vector out_edges = graph->user_data_edges(node); - TT_ASSERT(out_edges.size() == 1); - std::shared_ptr out_edge_attrs = graph->get_edge_attributes(out_edges[0]); - std::vector out_edge_tms = out_edge_attrs->get_tms(); - - std::vector prepend_tms = { - // Instead of adding hslice(f) + vstack(f), we could just add an hslice(f) and divide the existing vslice's - // split factor by f (where f = fracture_factor), but the optimize_tms() pass does this for us - graphlib::OpType("hslice", {op_model.fracture_factor}, {}), - graphlib::OpType("vstack", {op_model.fracture_factor}, {})}; - out_edge_tms.insert(out_edge_tms.begin(), prepend_tms.begin(), prepend_tms.end()); - out_edge_attrs->set_tms(out_edge_tms); - - // In1 - std::vector in_edges = graph->operand_data_edges(node); - TT_ASSERT(in_edges.size() == 3); - std::shared_ptr in1_edge_attrs = graph->get_edge_attributes(in_edges[1]); - std::vector in1_edge_tms = in1_edge_attrs->get_tms(); - - std::vector append_tms = { - graphlib::OpType("broadcast", {3, op_model.fracture_factor, true}, {})}; - in1_edge_tms.insert(in1_edge_tms.end(), append_tms.begin(), append_tms.end()); - in1_edge_attrs->set_tms(in1_edge_tms); -} - -static void insert_sparse_buffer_op_tms( - Graph const* graph, graphlib::OpNode* op, balancer::OpModel const& op_model, int fracture_factor) -{ - TT_ASSERT(op_model.has_sparse_buffer()); - TT_ASSERT(fracture_factor == 1); - - if (op_model.grid_shape.r == 1) - return; - - auto users = graph->user_data_edges(op); - graphlib::BudaOpNode* user = dynamic_cast(graph->node_by_id(users[0].consumer_node_id)); - bool is_reduce_z = user->op_name() == "reduce" and std::get(user->buda_attrs().at("dim")) == "z"; - bool is_matmul = user->is_matmul() and not user->is_sparse_matmul(); - - if (not is_reduce_z and not is_matmul) - return; - - TT_ASSERT(users.size() == 1, "Unsupported, multiple users with different views"); - - auto edge_attrs = graph->get_edge_attributes(users[0]); - auto& tms = edge_attrs->get_tms(); - if (tms.empty()) - return; - - TT_ASSERT(not is_matmul or tms.size() == 2, op->name(), user->name(), tms.size()); - TT_ASSERT(not is_reduce_z or tms.size() == 1, op->name(), user->name(), tms.size()); - if (is_matmul) - { - TT_ASSERT(tms[0].op == "vslice"); - TT_ASSERT(tms[1].op == "hstack"); - int vslice_factor = std::get(tms[0].attr[0]); - int hstack_factor = std::get(tms[1].attr[0]); - TT_ASSERT(vslice_factor == hstack_factor); - TT_ASSERT(vslice_factor > 1); - int factor = vslice_factor; - int grid_r = op_model.grid_shape.r; - tms.clear(); - tms.push_back(graphlib::OpType("vslice", {grid_r * factor}, {})); - tms.push_back(graphlib::OpType("hstack", {factor}, {})); - tms.push_back(graphlib::OpType("vstack", {grid_r}, {})); - } - else if (is_reduce_z) - { - TT_ASSERT(false, "This path results in illegal tms, cannot have slice after stack"); - TT_ASSERT(tms[0].op == "vslice"); - int vslice_factor = std::get(tms[0].attr[0]); - TT_ASSERT(vslice_factor > 1); - int factor = 
vslice_factor; - int grid_r = op_model.grid_shape.r; - tms.clear(); - tms.push_back(graphlib::OpType("vslice", {grid_r * factor}, {})); - tms.push_back(graphlib::OpType("hstack", {factor}, {})); - tms.push_back(graphlib::OpType("vstack", {grid_r}, {})); - tms.push_back(graphlib::OpType("hslice", {factor}, {})); - } -} - -// Layout dataflow reorders the output buffer of sparse matmul in a way -// such that each row of cores between a sparse/consumer pair has a 1to1 -// mapping of tiles and avoids inefficient gathers. This function erases -// the existing TMs along this path and replaces them with "per row core" -// equivalent set of TMs. This often results in more complicated TMs, but -// much simpler pipes -static void insert_sparse_dataflow_tms( - graphlib::Graph const* graph, graphlib::Node const* node, OpModel const& op_model) -{ - for (Edge user : graph->user_data_edges(node)) - { - auto& tms = graph->get_edge_attributes(user)->get_tms(); - TT_ASSERT(tms.size() >= 1 and tms.size() <= 3); - - bool needs_stack = false; - int row_slice = op_model.grid_shape.r * op_model.block_shape().rt(); - bool backwards = tms.front().op == "transpose"; - int factor = 0; - - if (backwards) - { - TT_ASSERT(tms.size() >= 2); - TT_ASSERT(tms[1].op == "hslice"); - factor = std::get(tms[1].attr[0]); - if (tms.size() > 2) - { - TT_ASSERT(tms[2].op == "vstack"); - TT_ASSERT(factor == std::get(tms[2].attr[0])); - needs_stack = true; - } - } - else - { - TT_ASSERT(tms[0].op == "vslice"); - factor = std::get(tms[0].attr[0]); - if (tms.size() > 1) - { - TT_ASSERT(tms[1].op == "hstack"); - TT_ASSERT(factor == std::get(tms[1].attr[0])); - needs_stack = true; - } - } - TT_ASSERT(factor > 1); - TT_ASSERT(row_slice > 1); - - tms.clear(); - - if (backwards) - { - tms.push_back(graphlib::OpType("transpose", {}, {}, {{"dim0", 2}, {"dim1", 3}, {"z_dim_slice", -1}})); - tms.push_back(graphlib::OpType("hslice", {row_slice}, {})); - tms.push_back(graphlib::OpType("vstack", {factor}, {})); - tms.push_back(graphlib::OpType("hstack", {row_slice / factor}, {})); - if (not needs_stack) - tms.push_back(graphlib::OpType("vslice", {factor}, {})); - } - else - { - tms.push_back(graphlib::OpType("vslice", {row_slice}, {})); - tms.push_back(graphlib::OpType("hstack", {factor}, {})); - tms.push_back(graphlib::OpType("vstack", {row_slice / factor}, {})); - if (not needs_stack) - tms.push_back(graphlib::OpType("hslice", {factor}, {})); - } - optimize_tms(tms); - } -} - -void print_perf_input_data( - tt::sparse::EncodingTiles const& buda_indices_all_rows, int sparse_tile_ptr_bits, balancer::OpModel const& op_model) -{ - constexpr int TILE_DIM = tt::sparse::TILE_DIM; - using IndexType = std::remove_extent_t; - const int ublock_tile_index_bytes = 16 - sparse_tile_ptr_bits; - const int grid_r = buda_indices_all_rows.size(); - - fmt::print("~~ Node: {}\n", op_model.buda_op_node->name()); - fmt::print("~~~~ t: {}\n", op_model.t_stream_factor.t()); - fmt::print("~~~~ mblock_m: {}\n", op_model.block_shape().mblock_m); - fmt::print("~~~~ mblock_n: {}\n", op_model.block_shape().mblock_n); - fmt::print("~~~~ ublock_rt: {}\n", op_model.ublock_shape().rt); - fmt::print("~~~~ ublock_ct: {}\n", op_model.ublock_shape().ct); - fmt::print("~~~~ u_kt: {}\n", op_model.input_buffers[0].block_shape.ublock.ct); - fmt::print( - "~~~~ m_k (total strips): {}\n", - op_model.op_shape.inputs[1].rt / op_model.input_buffers[1].block_shape.ublock.rt); - - for (int curr_r = 0; curr_r < grid_r; curr_r++) - { - std::vector buda_indices = buda_indices_all_rows[curr_r]; - 
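// The per-row encodings walked below are back-to-back strip_info_struct records packed into
// 32x32 index tiles. For each strip the loop counts one non-zero strip when nz_ublocks > 0,
// accumulates nz_ublocks, and for every ublock reads one index word whose bits above
// sparse_tile_ptr_bits give the number of non-zero tiles in that ublock (0 encodes the
// maximum, 1 << ublock_tile_index_bytes); it then skips that many index entries to reach the
// next ublock header, continuing until last_strip_in_tile is set.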
std::uint8_t const* base_ptr = reinterpret_cast(buda_indices.data()); - TT_ASSERT((int)buda_indices.size() % (TILE_DIM * TILE_DIM) == 0); - - int cnt_nz_tiles = 0; - int cnt_nz_ublocks = 0; - int cnt_nz_strips = 0; - - for (int tile_id = 0; tile_id < (int)(buda_indices.size() / (TILE_DIM * TILE_DIM)); ++tile_id) - { - tt::sparse::strip_info_struct const* info = reinterpret_cast( - base_ptr + tile_id * (TILE_DIM * TILE_DIM * sizeof(std::uint32_t))); - - bool done = false; - while (not done) - { - if (info->f.nz_ublocks > 0) - { - cnt_nz_strips++; - } - - cnt_nz_ublocks += info->f.nz_ublocks; - - int i = 0; - for (int ublock_i = 0; ublock_i < info->f.nz_ublocks; ++ublock_i) - { - IndexType encoded = info->f.index_array[i++]; - IndexType nz_tiles_in_ublock = encoded >> sparse_tile_ptr_bits; - nz_tiles_in_ublock = - (nz_tiles_in_ublock == 0u) ? (1u << ublock_tile_index_bytes) : nz_tiles_in_ublock; - cnt_nz_tiles += nz_tiles_in_ublock; - i += nz_tiles_in_ublock; - } - - done = info->f.last_strip_in_tile; - info = reinterpret_cast( - reinterpret_cast(info) + sizeof(tt::sparse::strip_info_struct) + - i * sizeof(IndexType)); - } - } - - fmt::print("~~~~~~ grid_r {}\n", curr_r); - fmt::print("~~~~~~~~ cnt_nz_tiles: {}\n", cnt_nz_tiles); - fmt::print("~~~~~~~~ cnt_nz_ublocks: {}\n", cnt_nz_ublocks); - fmt::print("~~~~~~~~ cnt_nz_strips: {}\n", cnt_nz_strips); - } -} - -void update_ops_on_selected_op_models(graphlib::Graph const* graph, OpModels const& op_models) -{ - for (Node* node : graph->nodes()) - { - if (node->node_type() == NodeType::kBudaOp) - { - graphlib::OpNode* op = node->as(); - graphlib::OpType type = op->op_type(); - if (op->is_sparse_matmul()) - { - TT_LOG_ASSERT(op_models.count(node) > 0, "Missing op model for node: {}", node->name()); - balancer::OpModel op_model = op_models.at(node); - - int grid_r = op_model.grid_shape.r; - int u_rt = op_model.output_buffers[0].block_shape.ublock.rt; - int u_kt = op_model.input_buffers[1].block_shape.ublock.rt; - int u_ct = op_model.output_buffers[0].block_shape.ublock.ct; - bool has_buffer_op = op_model.has_sparse_buffer(); - bool force_buffer_op_layout = env_as("PYBUDA_FORCE_SPARSE_BUFFER_LAYOUT"); - bool buffer_op_layout = has_buffer_op or force_buffer_op_layout; - const sparse::SparseBUDA& sparse_buda = - graph->data_operands(node)[0]->as()->get_sparse_buda(); - auto layout = sparse::SparseBUDA::create_layout( - buffer_op_layout, op_model.t_stream_factor.dir.z_major(), op_model.fracture_factor); - - std::string visualize_sparse_path = - env_as("PYBUDA_VISUALIZE_SPARSE") ? 
"sparse_" + op->name() + ".png" : ""; - auto [sparse, encodings, sparse_s, encodings_s, num_strips_per_row] = - sparse_buda.get_sparse_tiles_and_encodings( - grid_r, - op_model.t_stream_factor.r, - op_model.t_stream_factor.c, - u_rt, - u_kt, - op_model.fracture_factor, - layout, - visualize_sparse_path); - int sparse_tile_ptr_bits = - sparse_buda.get_sparse_tile_ptr_bits(grid_r, op_model.t_stream_factor.r, u_rt); - int sparse_ublock_idx_bits = - sparse_buda.get_sparse_ublock_idx_bits(grid_r, op_model.t_stream_factor.r, u_rt); - - if (env_as("PYBUDA_SPARSE_PRINT_PERF_INPUT_DATA")) - { - print_perf_input_data(encodings, sparse_tile_ptr_bits, op_model); - } - - auto sparse_utils_module = py::module_::import("pybuda.op.eval.sparse_utils"); - py::function shapeify = sparse_utils_module.attr("shapeify_sparse_tiles_and_encodings"); - - // Overwrite input tensors - auto [sp, enc] = shapeify(sparse, encodings, grid_r, op_model.fracture_factor) - .cast>(); - graphlib::ConstantInputNode* cin0 = graph->data_operands(node)[0]->as(); - graphlib::ConstantInputNode* cin2 = graph->data_operands(node)[2]->as(); - cin0->set_tensor_handle(make_shared_py_object(sp)); - cin2->set_tensor_handle(make_shared_py_object(enc)); - - // tenstorrent/budabackend#1234 - // tenstorrent/pybuda#504 - // Due to BBE-imposed constraint, we can't have more that 2 operands multicasting - // BBE changed behavior so that inputs 0&2 use eltwise style pipes instead of row multicast - // On FE side, we add a broadcast in c-dim to account for this diff - add_broadcasts_for_sparse_inputs_0_2(graph, node, op_model); - - // Overwrite op attributes - auto op_attrs = op->op_attrs(); - TT_ASSERT(op_attrs.size() == 15); - op_attrs[2] = sparse_tile_ptr_bits; - op_attrs[7] = op_model.fracture_factor; - op_attrs[8] = u_rt; - op_attrs[9] = u_kt; - op_attrs[10] = u_ct; - op_attrs[11] = op_model.grid_shape.c; - op_attrs[12] = op_model.t_stream_factor.r; - op_attrs[13] = op_model.t_stream_factor.c; - op_attrs[14] = sparse_ublock_idx_bits; - op->overwrite_op_attrs(op_attrs); - - // Overwrite buda attributes - auto buda_attrs = op->buda_attrs(); - buda_attrs["num_sparse_tiles"] = - static_cast(sparse_s[3] / sparse::TILE_DIM / op_model.fracture_factor); - buda_attrs["num_index_tiles"] = - static_cast(encodings_s[3] / sparse::TILE_DIM / op_model.fracture_factor); - buda_attrs["sparse_tile_ptr_bits"] = sparse_tile_ptr_bits; - buda_attrs["sparse_ublock_idx_bits"] = sparse_ublock_idx_bits; - buda_attrs["fracture_factor"] = op_model.fracture_factor; - if (has_buffer_op) - { - TT_ASSERT((op_model.grid_shape.c % 2) == 0); - std::vector num_nz_strips; - int grid_c = op_model.grid_shape.c / 2; - int grid_volume = op_model.grid_shape.r * grid_c; - num_nz_strips.resize(grid_volume); - for (int i = 0; i < grid_volume; ++i) num_nz_strips[i] = num_strips_per_row[i / grid_c]; - buda_attrs["num_nz_strips"] = num_nz_strips; - buda_attrs["act_buffered"] = true; - } - op->overwrite_buda_attrs(buda_attrs); - - // Overwrite op attributes - auto attr = op->op_attrs(); - std::get(attr[2]) = sparse_tile_ptr_bits; - op->overwrite_op_attrs(attr); - - log_trace(LogBalancer, " Sparse node {}:", node->name()); - log_trace(LogBalancer, " {}", op_model.grid_shape); - log_trace( - LogBalancer, " Num sparse tiles per core: {}:", std::get(buda_attrs["num_sparse_tiles"])); - log_trace( - LogBalancer, " Num index tiles per core: {}:", std::get(buda_attrs["num_index_tiles"])); - - graph->data_operands(node)[0]->set_shape(graphlib::Shape::create_buda(sparse_s)); - 
graph->data_operands(node)[2]->set_shape(graphlib::Shape::create_buda(encodings_s)); - - if (op_model.fracture_factor > 1) - { - // Update node shape to account for fracture factor - tt::graphlib::Shape shape = node->shape().canonical(); - tt::graphlib::Shape new_shape = graphlib::Shape::create_buda( - shape.as_vector()[0], - shape.as_vector()[1], - shape.as_vector()[2] / op_model.fracture_factor, - shape.as_vector()[3] * op_model.fracture_factor); - TT_ASSERT(shape.volume() == new_shape.volume()); - node->set_shape(new_shape); - - // Insert tms to account for fracture factor - insert_sparse_fracturing_tms(graph, node, op_model); - } - - log_trace(LogBalancer, "Sparse layout {}: {}", op->name(), layout); - switch (layout) - { - case sparse::SparseBUDA::Layout::BufferOp: - { - insert_sparse_buffer_op_tms(graph, op, op_model, op_model.fracture_factor); - break; - } - case sparse::SparseBUDA::Layout::ZMajorDataflow: - { - TT_ASSERT(op_model.fracture_factor == 1); - insert_sparse_dataflow_tms(graph, op, op_model); - break; - } - default: - { - break; - } - } - } - else if (type.op == "embedding") - { - balancer::OpModel const& op_model = op_models.at(node); - - auto* embedding_table = graph->data_operands(op)[0]->as(); - embedding_table->set_layout(BudaQueueLayout::Flat); - - // Reconfigure shape for raw tilized layout - auto* embedding_indices = graph->data_operands(op)[1]->as(); - auto indices_shape = embedding_indices->shape(); - TT_ASSERT(indices_shape[-2] == graphlib::Shape::BUDA_TILE_DIM); - indices_shape[-2] = indices_shape[-2] * op_model.grid_shape.r; - indices_shape[-1] = graphlib::align_up_tile( - indices_shape[-1] / (op_model.grid_shape.r * graphlib::Shape::BUDA_TILE_DIM)); - embedding_indices->set_shape(indices_shape); - - // Convert num_indices to be per core - int num_indices = std::get(type.buda_attrs.at("num_indices")); - num_indices = graphlib::align_up_tile(num_indices); - TT_ASSERT(num_indices % op_model.grid_shape.r == 0); - std::get(type.buda_attrs.at("num_indices")) = num_indices / op_model.grid_shape.r; - - op->change_op_type(type); - } - else if (type.op == "dropout") - { - // Overwrite op attributes - TT_LOG_ASSERT(op_models.count(node) > 0, "Missing op model for node: {}", node->name()); - balancer::OpModel op_model = op_models.at(node); - - auto attr = op->op_attrs(); - attr[5] = op_model.t_stream_factor.r; - attr[6] = op_model.t_stream_factor.c; - attr[7] = op_model.t_stream_factor.dir.r(); - attr[8] = op_model.t_stream_factor.dir.z_major(); - op->overwrite_op_attrs(attr); - } - else if (type.op == "splice") - { - // Update op attributes - TT_LOG_ASSERT(op_models.count(node) > 0, "Missing op model for node: {}", node->name()); - balancer::OpModel op_model = op_models.at(node); - graphlib::UBlockOrder ublock_order = get_output_ublock_order(graph, node); - op->py_attr( - "update_ranges", - (ublock_order == graphlib::UBlockOrder::R), // ublock_is_row_order - op_model.ublock_shape().rt, - op_model.ublock_shape().ct, - op_model.grid_shape.r, - op_model.grid_shape.c, - op_model.t_stream_factor.r, - op_model.t_stream_factor.c); - } - else if (type.op == "tilizer") - { - auto* input = graph->data_operands(op)[0]->as(); - input->set_layout(BudaQueueLayout::Flat); - } - } - } -} - -static void insert_input_queues( - placer::PlacerSolution& placer_solution, const Graph* graph, const OpModelMap& op_models) -{ - // Add input queues to the placer solution - for (auto [node_name, op_model] : op_models) - { - Node* node = graph->get_node_by_name(node_name); - switch 
(node->node_type()) - { - case NodeType::kInput: - { - placer_solution.input_queue_to_grid_shape.insert( - {node_name, - tt::placer::GridShape( - (std::uint32_t)op_model.grid_shape.r, (std::uint32_t)op_model.grid_shape.c)}); - break; - } - default: break; - } - } -} - -static std::tuple balancer_passes( - Graph* graph, - BalancerConfig& config, - std::shared_ptr cache_collection, - std::optional& placer_solution) -{ - log_debug(LogBalancer, "{}", config); - LegalOpModels valid_op_models = legalizer::get_legal_op_models(graph, config, cache_collection); - - auto graph_solver = get_graph_solver(config, cache_collection, graph, valid_op_models); - - legalizer::GraphSolverSolution graph_solver_solution = run_policy(graph, config, graph_solver, placer_solution); - - update_ops_on_selected_op_models(graph, graph_solver_solution.selected_op_models); - - auto ret = legalizer::resolve_block_shapes(graph, config, graph_solver_solution); - - if (placer_solution.has_value()) - insert_input_queues(placer_solution.value(), graph, std::get<0>(ret)); - - return ret; -} - -std::shared_ptr run_balancer_and_placer( - Graph* graph, BalancerConfig& config, std::shared_ptr cache_collection) -{ - log_info("Running Balancer with Policy: {}", config.policy_type); - PROFILE_SCOPE(); - - // New epoch-by-epoch placement loop - if (config.epoch_by_epoch) - return placer::run_epoch_placer(&graph, config, cache_collection); - - std::optional opt_placer_solution = std::nullopt; - auto const& [op_models, block_shape_map, output_host_tms, cut_edges] = - balancer_passes(graph, config, cache_collection, opt_placer_solution); - - TT_ASSERT( - graph->virtual_node_count() == 0, - "After balancer passes are complete we should not have virtual nodes in graph anymore."); - - auto placer_solution = - opt_placer_solution.has_value() ? 
opt_placer_solution.value() : run_placer(graph, config, op_models); - dump_balancer_placer_data( - graph, config.chip_ids, placer_solution, op_models, std::cout, config.device_config.arch_name); - - return std::make_shared(placer_solution, op_models, block_shape_map, output_host_tms, cut_edges); -} - -} // namespace tt::balancer diff --git a/pybuda/csrc/balancer/balancer.hpp b/pybuda/csrc/balancer/balancer.hpp deleted file mode 100644 index d9e212ab7..000000000 --- a/pybuda/csrc/balancer/balancer.hpp +++ /dev/null @@ -1,61 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "backend_api/device_config.hpp" -#include "balancer/balancer_cache_collection.hpp" -#include "balancer/balancer_config.hpp" -#include "balancer/balancer_utils.hpp" -#include "balancer/legalizer/graph_solver.hpp" -#include "balancer/policies/policy_types.hpp" -#include "balancer/types.hpp" -#include "graph_lib/graph.hpp" -#include "placer/chip_id_assignment.hpp" -#include "placer/placer.hpp" -#include "scheduler/scheduler.hpp" -#include "utils/env.hpp" -#include "utils/logger.hpp" -#include "utils/result.hpp" - -using Graph = tt::graphlib::Graph; - -namespace tt::balancer -{ - -struct BalancerSolution -{ - placer::PlacerSolution placer_solution; - OpModelMap op_models; - BlockShapeMap block_shapes; - OutputHostTMMap output_host_tms; - CutEdges graph_solver_cut_edges; - - BalancerSolution( - placer::PlacerSolution const& placer_solution, - OpModelMap const& op_models, - BlockShapeMap const& block_shapes, - OutputHostTMMap const& output_host_tms, - CutEdges const& graph_solver_cut_edges) : - placer_solution(placer_solution), - op_models(op_models), - block_shapes(block_shapes), - output_host_tms(output_host_tms), - graph_solver_cut_edges(graph_solver_cut_edges) - { - } -}; - -std::shared_ptr run_balancer_and_placer( - Graph* graph, BalancerConfig& config, std::shared_ptr cache_collection); - -legalizer::GraphSolver get_graph_solver( - BalancerConfig const& config, - std::shared_ptr cache_collection, - graphlib::Graph* graph, - LegalOpModels const& legal_op_models, - bool use_op_model_recalculation = true); - -void update_ops_on_selected_op_models(graphlib::Graph const* graph, OpModels const& op_models); - -}; // namespace tt::balancer diff --git a/pybuda/csrc/balancer/balancer_cache_collection.hpp b/pybuda/csrc/balancer/balancer_cache_collection.hpp deleted file mode 100644 index fb206b45f..000000000 --- a/pybuda/csrc/balancer/balancer_cache_collection.hpp +++ /dev/null @@ -1,80 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "balancer/types.hpp" - -namespace tt::balancer -{ - -// typename std::enable_if>::value, int>::type - -// Generic template to check whether a type is a map in stdlib -// -template -struct is_std_map -{ - static constexpr bool value = false; -}; - -// Specialization for std::unordered_map -// -template -struct is_std_map> -{ - static constexpr bool value = true; -}; - -// Specialization for std::map -// -template -struct is_std_map> -{ - static constexpr bool value = true; -}; - -// Returns (very) approximate size of map in bytes -// -template -typename std::enable_if::value, int>::type get_map_size_bytes_approx(T map) -{ - // sizeof all keys and values + sizeof map overhead - // - return (sizeof(typename T::key_type) + sizeof(typename T::mapped_type)) * map.size() + sizeof map; -} - -// Container for various caches used throughout the 
balancing process. -// Currently a very simple structure with a lot of space for improvements, to be done as needed... -// -struct BalancerCacheCollection -{ - std::unordered_map pipe_to_kb_len_cache; // Cache Pipe object to kernel broadcast len - std::unordered_map pipe_to_resource_usage_cache; // Cache Pipe object to ResourceUsage - - BalancerCacheCollection() { log_debug(tt::LogBalancer, "BalancerCacheCollection: Cache collection initialized"); } - - ~BalancerCacheCollection() - { - log_debug(tt::LogBalancer, "BalancerCacheCollection: Cache collection destroyed"); - - // Stats - // - log_debug(tt::LogBalancer, "BalancerCacheCollection: Cache collection stats:"); - - log_debug(tt::LogBalancer, " pipe_to_kb_len_cache size (elems): {}", pipe_to_kb_len_cache.size()); - log_debug( - tt::LogBalancer, - " pipe_to_kb_len_cache size approx (bytes): {} b", - get_map_size_bytes_approx(pipe_to_kb_len_cache)); - - log_debug( - tt::LogBalancer, " pipe_to_resource_usage_cache size (elems): {}", pipe_to_resource_usage_cache.size()); - log_debug( - tt::LogBalancer, - " pipe_to_resource_usage_cache size approx (bytes): {} b", - get_map_size_bytes_approx(pipe_to_resource_usage_cache)); - } -}; - -} // namespace tt::balancer diff --git a/pybuda/csrc/balancer/balancer_config.hpp b/pybuda/csrc/balancer/balancer_config.hpp deleted file mode 100644 index 3d00228ae..000000000 --- a/pybuda/csrc/balancer/balancer_config.hpp +++ /dev/null @@ -1,142 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "backend_api/device_config.hpp" -#include "balancer/legalizer/graph_solver_types.hpp" -#include "balancer/types.hpp" -#include "placer/chip_id_assignment.hpp" -#include "policies/policy_types.hpp" -#include "scheduler/scheduler.hpp" - -namespace tt::balancer -{ - -struct OpOverride -{ - std::optional> grid_shape; - std::optional force_dram_parameters; - std::string t_stream_dir = ""; - std::optional> t_stream_shape; - std::optional fracture_factor; - std::optional u_kt; - std::optional> input_buffer_multiplier; - std::optional output_buffer_multiplier; - - void apply( - FactorizedShape& grid_pars, - bool& force_dram_parameters_out, - std::vector& t_stream_dirs, - FactorizedShape& overridden_streaming_pars, - bool& enable_t_streaming, - const std::string& op_name); - - std::optional get_fracture_factor(); - std::optional get_u_kt(); -}; - -struct BalancerConfig -{ - DeviceConfig device_config; - scheduler::SchedulerConfig scheduler_config; - PolicyType policy_type; - int random_policy_seed; - std::vector chip_ids; - placer::ChipPlacementPolicy chip_placement_policy; - bool default_dram_parameters; - bool skip_l1_usage_validation; - bool enable_t_streaming; - bool manual_t_streaming; - bool input_queues_on_host; - bool output_queues_on_host; - std::unordered_map op_overrides; - std::vector> op_names_to_epoch_break; - std::vector> op_names_to_chip_break; - placer::OpToChipIdAssignment op_to_chip_id_assignment; - std::unordered_map op_name_to_placer_overrides; - bool enable_auto_transposing_placement; - bool epoch_by_epoch; - legalizer::GraphSolverSelfCutType graph_solver_self_cut_type; - bool use_interactive_placer; - bool enable_enumerate_u_kt; - bool enable_single_buffer_fallback; - std::uint32_t fork_join_tiles_treshold = 1000; - - BalancerConfig( - const DeviceConfig& device_config, - scheduler::SchedulerConfig scheduler_config, - PolicyType policy_type, - int random_policy_seed, - const std::vector& chip_ids, - placer::ChipPlacementPolicy 
chip_placement_policy, - bool default_dram_parameters, - bool skip_l1_usage_validation, - bool enable_t_streaming, - bool manual_t_streaming, - bool input_queues_on_host, - bool output_queues_on_host, - std::unordered_map op_overrides, - const std::vector>& op_names_to_epoch_break, - const std::vector>& op_names_to_chip_break, - const placer::OpToChipIdAssignment& op_to_chip_id_assignment, - const std::unordered_map& op_name_to_placer_overrides, - bool enable_auto_transposing_placement, - legalizer::GraphSolverSelfCutType graph_solver_self_cut_type, - bool use_interactive_placer, - bool enable_enumerate_u_kt, - bool enable_single_buffer_fallback) : - device_config(device_config), - scheduler_config(scheduler_config), - policy_type(policy_type), - random_policy_seed(random_policy_seed), - chip_ids(chip_ids), - chip_placement_policy(chip_placement_policy), - default_dram_parameters(default_dram_parameters), - skip_l1_usage_validation(skip_l1_usage_validation), - enable_t_streaming(enable_t_streaming), - manual_t_streaming(manual_t_streaming), - input_queues_on_host(input_queues_on_host), - output_queues_on_host(output_queues_on_host), - op_overrides(op_overrides), - op_names_to_epoch_break(op_names_to_epoch_break), - op_names_to_chip_break(op_names_to_chip_break), - op_to_chip_id_assignment(op_to_chip_id_assignment), - op_name_to_placer_overrides(op_name_to_placer_overrides), - enable_auto_transposing_placement(enable_auto_transposing_placement), - graph_solver_self_cut_type(graph_solver_self_cut_type), - use_interactive_placer(use_interactive_placer), - enable_enumerate_u_kt(enable_enumerate_u_kt), - enable_single_buffer_fallback(enable_single_buffer_fallback) - { - epoch_by_epoch = - env_as("PYBUDA_EPOCH_BY_EPOCH_PLACER"); // temporary env switch to turn on epoch by epoch placement - } - - // Constructor - used only by unittesting. - // - BalancerConfig( - const DeviceConfig& device_config, - PolicyType policy_type, - placer::ChipPlacementPolicy chip_placement_policy = placer::ChipPlacementPolicy::MMIO_LAST) : - device_config(device_config), - policy_type(policy_type), - chip_ids(device_config.chip_ids), - chip_placement_policy(chip_placement_policy), - graph_solver_self_cut_type(legalizer::GraphSolverSelfCutType::None) - { - // If unit tests specify policy which use IP, mark it as used. 
- // - use_interactive_placer = can_use_interactive_placer(policy_type); - } - - inline std::optional get_op_override(std::string const& op_name) const - { - auto op_override = op_overrides.find(op_name); - if (op_override == op_overrides.end()) - return {}; - return op_override->second; - } -}; - -} // namespace tt::balancer diff --git a/pybuda/csrc/balancer/balancer_utils.cpp b/pybuda/csrc/balancer/balancer_utils.cpp deleted file mode 100644 index de736627c..000000000 --- a/pybuda/csrc/balancer/balancer_utils.cpp +++ /dev/null @@ -1,714 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "balancer_utils.hpp" - -#include -#include - -#include "passes/t_stream.hpp" -#include "utils/hash_combine.hpp" -#include "utils/logger.hpp" - -using NodeType = tt::graphlib::NodeType; -using UBlockOrder = tt::graphlib::UBlockOrder; - -namespace tt::balancer -{ -OpShape get_op_shape( - Graph const* graph, - Node const* node, - GridShape grid_shape, - int u_rt, - int u_kt, - TStreamFactor t_stream_factor, - int fracture_factor) -{ - std::vector producer_shapes; - std::vector input_shapes; - std::vector output_shapes; - - const graphlib::OpNode* op_node = node->as(); - - if (op_node->is_sparse_matmul()) - { - std::vector data_operands = graph->data_operands(node); - graphlib::ConstantInputNode* cin = data_operands[0]->as(); - const sparse::SparseBUDA& sparse_buda = cin->get_sparse_buda(); - - int sparse_ct = -1; - int encodings_ct = -1; - - // TODO: Estimates need fixing (include fracture_factor into calculation) - - // Whether to fully calculate num tiles for in0/in2 (slower), or estimate them (faster) - if (env_as("PYBUDA_SPARSE_MM_ENCODING_ESTIMATES_OFF")) - { - // Full - auto [sparse_t, encodings_t, sparse_s, encodings_s, num_strips_per_row] = - sparse_buda.get_sparse_tiles_and_encodings(grid_shape.r, t_stream_factor.r, t_stream_factor.c); - sparse_ct = (int)sparse_s[3] / 32; - encodings_ct = (int)encodings_s[3] / 32; - } - else - { - // Estimate - sparse_ct = sparse_buda.get_sparse_tiles_per_core_general(grid_shape.r, t_stream_factor.r); - encodings_ct = sparse_buda.get_encoding_tiles_per_core_general(grid_shape.r, t_stream_factor.r, u_rt, u_kt); - } - - // Lines below call the function that gets actual encodings, and compare to estimates - good for verification - // auto [sparse_t, encodings_t, sparse_s, encodings_s] = - // sparse_buda.get_sparse_tiles_and_encodings(grid_shape.r); - // int encodings_ct_slow = (int)encodings_s[3] / 32; - // TT_ASSERT( - // encodings_ct == encodings_ct_slow, fmt::format("left = {}, right = {}", encodings_ct, - // encodings_ct_slow)); - - // in0 - input_shapes.emplace_back(1, 1, grid_shape.r, sparse_ct); - - // in1 - graphlib::Edge in1_edge = graph->operand_data_edges(node).at(1); - graphlib::Shape in1_shape = data_operands[1]->shape(); - std::vector tms = graph->get_edge_attributes(in1_edge)->get_tms(); - insert_t_stream_tms( - node->as(), tms, t_stream_factor, TStreamFactor{}, in1_edge.consumer_input_port_id); - input_shapes.emplace_back(post_tms_shape(in1_shape, tms)); - - // in2 - input_shapes.emplace_back(1, 1, grid_shape.r, encodings_ct); - - // out - graphlib::Shape out_shape = node->shape().canonical(); - graphlib::Shape new_shape = graphlib::Shape::create_buda( - out_shape.as_vector()[0], - out_shape.as_vector()[1], - out_shape.as_vector()[2] / fracture_factor, - out_shape.as_vector()[3] * fracture_factor); - output_shapes.emplace_back(new_shape); - - producer_shapes.resize(3); - producer_shapes[0] 
= input_shapes[0]; - producer_shapes[1] = in1_shape; - producer_shapes[2] = input_shapes[2]; - return OpShape(producer_shapes, input_shapes, output_shapes); - } - else - { - for (graphlib::Edge edge : graph->operand_data_edges(node)) - { - graphlib::Shape producer_shape = graph->node_by_id(edge.producer_node_id)->shape(); - producer_shapes.emplace_back(producer_shape); - std::vector tms = graph->get_edge_attributes(edge)->get_tms(); - insert_t_stream_tms( - node->as(), tms, t_stream_factor, TStreamFactor{}, edge.consumer_input_port_id); - input_shapes.emplace_back(post_tms_shape(producer_shape, tms)); - } - return OpShape(producer_shapes, input_shapes, {TensorShape(node->shape())}); - } -} - -std::vector calculate_t_streaming_tms(Graph const*, Node const*, OpModel const& op_model) -{ - if (op_model.t_stream_factor.none()) - { - return {}; - } - - std::vector tms; - if (op_model.t_stream_factor.dir.r()) - { - if (op_model.t_stream_factor.r > 1) - { - tms.push_back(graphlib::OpType("vslice", {op_model.t_stream_factor.r}, {})); - } - if (op_model.t_stream_factor.c > 1) - { - tms.push_back(graphlib::OpType("hslice", {op_model.t_stream_factor.c}, {})); - } - } - else - { - if (op_model.t_stream_factor.c > 1) - { - tms.push_back(graphlib::OpType("hslice", {op_model.t_stream_factor.c}, {})); - } - if (op_model.t_stream_factor.r > 1) - { - tms.push_back(graphlib::OpType("vslice", {op_model.t_stream_factor.r}, {})); - } - } - return tms; -} - -std::pair map_inverse_tms( - CanCoord coord, TensorShape shape, std::vector const& tms) -{ - for (auto iter = tms.rbegin(); iter != tms.rend(); ++iter) - { - graphlib::OpType const& tm = *iter; - // TODO: this string comparison is a little ridiculous, we should adopt union graphlib::TM for faster TM eval - // but for now it makes this routine ~10x faster and measurable in constraint solving time - switch (tm.op[0]) - { - case 'b': // broadcast - { - if (tm.op[1] == 'u') // buda_pad - { - int rt = std::get(tm.attr[0]); - int ct = std::get(tm.attr[1]); - if (tm.op[5] == 'p') // pad - { - shape.rt -= rt; - shape.ct -= ct; - coord.rt = std::min(coord.rt, shape.rt - 1); - coord.ct = std::min(coord.ct, shape.ct - 1); - } - else // unpad - { - shape.rt += rt; - shape.ct += ct; - } - } - else - { - int dim = std::get(tm.attr[0]); - int factor = std::get(tm.attr[1]); - switch (dim) - { - case 1: - { - shape.z /= factor; - coord.t %= shape.z; - break; - } - case 2: - { - shape.rt /= factor; - coord.rt %= shape.rt; - break; - } - case 3: - { - shape.ct /= factor; - coord.ct %= shape.ct; - break; - } - default: - { - log_fatal(LogBalancer, "Unsupported broadcast dim: {}", dim); - break; - } - } - } - break; - } - case 'v': - { - if (tm.op[2] == 'l') // vslice - { - int factor = std::get(tm.attr[0]); - TT_ASSERT(shape.z % factor == 0); - coord.rt = (coord.t % factor) * shape.rt + coord.rt; - coord.t /= factor; - shape.z /= factor; - shape.rt *= factor; - TT_ASSERT(coord.t < shape.z); - TT_ASSERT(coord.rt < shape.rt); - } - else // vstack - { - int factor = std::get(tm.attr[0]); - TT_ASSERT(shape.rt % factor == 0); - shape.rt /= factor; - shape.z *= factor; - coord.t = coord.t * factor + coord.rt / shape.rt; - coord.rt %= shape.rt; - TT_ASSERT(coord.t < shape.z); - TT_ASSERT(coord.rt < shape.rt); - } - break; - } - case 'h': - { - if (tm.op[2] == 'l') // hslice - { - int factor = std::get(tm.attr[0]); - TT_ASSERT(shape.z % factor == 0); - coord.ct = (coord.t % factor) * shape.ct + coord.ct; - coord.t /= factor; - shape.z /= factor; - shape.ct *= factor; - TT_ASSERT(coord.t 
< shape.z); - TT_ASSERT(coord.ct < shape.ct); - } - else // hstack - { - int factor = std::get(tm.attr[0]); - TT_ASSERT(shape.ct % factor == 0); - shape.ct /= factor; - shape.z *= factor; - coord.t = coord.t * factor + coord.ct / shape.ct; - coord.ct %= shape.ct; - TT_ASSERT(coord.t < shape.z); - TT_ASSERT(coord.ct < shape.ct); - } - break; - } - case 't': // transpose - { - if (tm.op[1] == 'i') // tile_broadcast - break; - std::swap(shape.rt, shape.ct); - std::swap(coord.rt, coord.ct); - break; - } - default: - { - TT_ASSERT(false, "Unhandled case map_inverse_tms"); - break; - } - } - } - - return std::make_pair(coord, shape); -} - -// Checks whether a pattern of length cycle_len exists in the vector of linear coordinates -// -bool is_pattern(std::vector const& vec, int cycle_len) -{ - int vec_len = vec.size(); - if (vec_len < cycle_len) - { - return false; - } - - for (int i = 0; i < vec_len - cycle_len; i++) - { - if (vec[i] != vec[i + cycle_len]) - return false; - } - - return true; -} - -// Calculates addresses of tiles of producer, that a single consumer core receives, and returns the length of the -// repeating pattern, if one exists. Otherwise returns 0. -// -int detect_repetitive_pattern(std::unordered_map* const kb_cache, Pipe const& pipe) -{ - TensorShape consumer_shape = pipe.consumer_layout.shape(); - - const int block_volume = pipe.consumer_layout.block_shape.volume(); - const int block_volume_no_t = pipe.consumer_layout.block_shape.volume_no_t(); - - log_trace(LogKernelBroadcast, " block_volume: {:15}", block_volume); - log_trace(LogKernelBroadcast, " block_volume_no_t: {:10}", block_volume_no_t); - - // We can only check for the first core - if there is a pattern, it will be the same on all cores - // - constexpr int grid_r = 0; - constexpr int grid_c = 0; - - // Structure to keep track of producer addresses that a single consumer core sees - // - std::vector producer_addresses; - producer_addresses.resize(block_volume); - - // Generate addresses of producer tiles that a single consumer core sees - // - for (int block_offset = 0; block_offset < block_volume; ++block_offset) - { - // Walk the consumer tile order linearly - // - LinCoord consumer_linear(grid_r, grid_c, block_offset); - CanCoord consumer_coord = pipe.consumer_layout.map(consumer_linear); - - // Map consumer tile position to producer tile origin - // - auto [producer_coord, p_shape] = map_inverse_tms(consumer_coord, consumer_shape, pipe.tms); - LinCoord producer_linear = pipe.producer_layout.map(producer_coord); - - producer_addresses[block_offset] = producer_linear; - } - - // Check if the producer addresses are repetitive - // - // Lower bound: 1 - // this bound can be improved, but there's several edge cases, let's do it only if compile time is an - // issue - // Upper bound: block_volume_no_t - // no need to go above a single mblock - // - for (int pattern_candidate = 1; pattern_candidate <= block_volume_no_t; pattern_candidate++) - { - if (is_pattern(producer_addresses, pattern_candidate)) - { - log_trace(LogKernelBroadcast, " found pattern of len: {:10}", pattern_candidate); - if (kb_cache) - { - kb_cache->insert({pipe, pattern_candidate}); - } - return pattern_candidate; - } - } - - // We haven't found a pattern, add cache entry and return 0 length - // - log_trace(LogKernelBroadcast, " pattern not found..."); - if (kb_cache) - { - kb_cache->insert({pipe, 0}); - } - return 0; // no pattern -} - -inline int ordered(GridCoord coord, GridShape shape) { return shape.c * coord.r + coord.c; } - -ResourceUsage 
get_edge_resource_usage(std::unordered_map& pipe_to_ru_cache, Pipe pipe) -{ - auto match = pipe_to_ru_cache.find(pipe); - if (match != pipe_to_ru_cache.end()) - { - return match->second; - } - - ResourceUsage usage; - - struct ProducerPhase - { - LinCoord prev; - int first_t_phases = 0; - int first_repeat = 0; - int contiguous = 1; - int phases = 0; - }; - - TensorShape consumer_shape = pipe.consumer_layout.shape(); - LinCoord prev_producer_linear; - SmallVector unique_consumer_grids; - unique_consumer_grids.resize(pipe.producer_layout.grid_shape.volume()); - SmallVector unique_producer_grids; - unique_producer_grids.resize(pipe.consumer_layout.grid_shape.volume()); - SmallVector producer_phases; - producer_phases.resize(pipe.producer_layout.grid_shape.volume()); - - GridShape consumer_grid = pipe.consumer_layout.grid_shape; - int block_volume = pipe.consumer_layout.block_shape.volume(); - int producer_block_volume = pipe.producer_layout.block_shape.volume(); - - bool monotonic_producer_ts = true; - for (int grid_r = 0; grid_r < consumer_grid.r; ++grid_r) - { - for (int grid_c = 0; grid_c < consumer_grid.c; ++grid_c) - { - GridCoord consumer_grid_coord(grid_r, grid_c); - int consumer_core_phases = 0; - int first_t_consumer_core_phases = 0; - int prev_producer_t = 0; - for (int block_offset = 0; block_offset < block_volume; ++block_offset) - { - // Walk the consumer tile order linearly - LinCoord consumer_linear(grid_r, grid_c, block_offset); - CanCoord consumer_coord = pipe.consumer_layout.map(consumer_linear); - // Map consumer tile position to producer tile origin - auto [producer_coord, p_shape] = map_inverse_tms(consumer_coord, consumer_shape, pipe.tms); - LinCoord producer_linear = pipe.producer_layout.map(producer_coord); - // Check if this tile comes from the same grid coordinate, if not we need a new phase - bool consumer_contiguous = prev_producer_linear.next().grid_coord() == producer_linear.grid_coord(); - consumer_core_phases += int(not consumer_contiguous); - prev_producer_linear = producer_linear; - - // Check if we go backwards in t - if (producer_coord.t == 0) - first_t_consumer_core_phases = consumer_core_phases; - monotonic_producer_ts &= prev_producer_t <= producer_coord.t; - prev_producer_t = producer_coord.t; - - int producer_grid_idx = ordered(producer_linear.grid_coord(), pipe.producer_layout.grid_shape); - int consumer_grid_idx = ordered(consumer_grid_coord, pipe.consumer_layout.grid_shape); - std::uint64_t& consumer_grid_mask = unique_consumer_grids[producer_grid_idx]; - if (consumer_grid_idx < 64) - consumer_grid_mask |= (1llu << std::uint64_t(consumer_grid_idx)); - else - consumer_grid_mask |= (consumer_grid_mask + 1llu); - - std::uint64_t& producer_grid_mask = unique_producer_grids[consumer_grid_idx]; - if (producer_grid_idx < 64) - producer_grid_mask |= (1llu << std::uint64_t(producer_grid_idx)); - else - producer_grid_mask |= (producer_grid_mask + 1llu); - - // Calculate producer phases - ProducerPhase& producer_phase = producer_phases[producer_grid_idx]; - bool producer_contiguous = (producer_phase.prev.next() == producer_linear and consumer_contiguous); - producer_phase.contiguous += int(producer_contiguous); - producer_phase.phases += int(not producer_contiguous); - if (not producer_phase.first_repeat and producer_phase.contiguous == producer_block_volume) - producer_phase.first_repeat = producer_phase.phases; - if (producer_coord.t == 0) - producer_phase.first_t_phases = producer_phase.phases; - producer_phase.prev = (not producer_phase.prev.valid() or 
producer_phase.prev.next() == producer_linear) - ? producer_linear - : LinCoord{}; - } - - // If the tile read order never went backwards in t, this can be turned into a loop - if (monotonic_producer_ts) - consumer_core_phases = first_t_consumer_core_phases; - - usage.consumer_phases = std::max(usage.consumer_phases, consumer_core_phases); - } - } - - for (std::uint64_t mask : unique_consumer_grids) - { - usage.producer_fan_out = std::max(usage.producer_fan_out, __builtin_popcountll(mask)); - } - - for (std::uint64_t mask : unique_producer_grids) - { - usage.consumer_fan_in = std::max(usage.consumer_fan_in, __builtin_popcountll(mask)); - } - - for (ProducerPhase producer_phase : producer_phases) - { - usage.producer_phases = std::max( - usage.producer_phases, - producer_phase.first_repeat ? producer_phase.first_repeat - : monotonic_producer_ts ? producer_phase.first_t_phases - : producer_phase.phases); - } - - usage.producer_phases *= std::min(2, pipe.producer_out_buf_mb); - usage.consumer_phases *= std::min(2, pipe.producer_out_buf_mb); - - pipe_to_ru_cache.insert({pipe, usage}); - return usage; -} - -ResourceUsage get_edge_resource_usage( - Graph const* graph, - std::unordered_map& pipe_to_ru_cache, - graphlib::Edge edge, - OpModel const& producer_op_model, - OpModel const& consumer_op_model, - bool is_queue) -{ - graphlib::Node const* producer_node = graph->node_by_id(edge.producer_node_id); - graphlib::OpNode const* consumer_node = - dynamic_cast(graph->node_by_id(edge.consumer_node_id)); - - auto edge_attr = graph->get_edge_attributes(edge); - auto tms = edge_attr->get_tms(); - - insert_t_stream_tms( - consumer_node, - tms, - consumer_op_model.t_stream_factor, - producer_op_model.t_stream_factor, - edge.consumer_input_port_id, - is_queue); - - GridShape producer_grid_shape = producer_op_model.grid_shape; - BlockShape producer_block_shape = producer_op_model.output_buffers[0].block_shape.canonical(); - if (producer_op_model.fracture_factor > 1) - { - TT_ASSERT(producer_grid_shape.c % producer_op_model.fracture_factor == 0); - producer_grid_shape.r *= producer_op_model.fracture_factor; - producer_grid_shape.c /= producer_op_model.fracture_factor; - } - - GridShape consumer_grid_shape = consumer_op_model.grid_shape; - BlockShape consumer_block_shape = - consumer_op_model.input_buffers[edge.consumer_input_port_id].block_shape.canonical(); - if (consumer_node->is_matmul()) - consumer_grid_shape = edge.consumer_input_port_id == 0 ? GridShape(consumer_grid_shape.r, 1) - : GridShape(1, consumer_grid_shape.c); - - Pipe pipe( - TileLayout( - producer_grid_shape, - producer_block_shape, - get_output_ublock_order(graph, producer_node), - producer_op_model.padding), - producer_op_model.output_buffers[0].buffer_factor, - tms, - TileLayout( - consumer_grid_shape, consumer_block_shape, edge_attr->get_ublock_order(), consumer_op_model.padding)); - - try - { - return get_edge_resource_usage(pipe_to_ru_cache, pipe); - } - catch (...) 
- { - log_error("{} -> {}[{}]", producer_node->name(), consumer_node->name(), edge.consumer_input_port_id); - log_error("producer {}", producer_op_model); - log_error("consumer {}", consumer_op_model); - log_error("pipe:\n{}", pipe); - throw; - } -} - -ResourceUsage get_edge_resource_usage_simple( - Graph const*, - graphlib::Edge edge, - OpModel const& producer_op_model, - OpModel const& consumer_op_model, - bool is_queue) -{ - ResourceUsage usage; - - if (is_queue) - { - auto producer_grid_shape = producer_op_model.grid_shape; - bool matmul_lhs = consumer_op_model.buda_op_node->is_matmul() and edge.consumer_input_port_id == 0; - bool matmul_rhs = consumer_op_model.buda_op_node->is_matmul() and edge.consumer_input_port_id == 1; - int fork_factor_r = - matmul_rhs ? producer_grid_shape.r : round_up_div(producer_grid_shape.r, consumer_op_model.grid_shape.r); - int fork_factor_c = - matmul_lhs ? producer_grid_shape.c : round_up_div(producer_grid_shape.c, consumer_op_model.grid_shape.c); - usage.consumer_fan_in = fork_factor_r * fork_factor_c; - } - else - { - int producer_grid_volume = producer_op_model.grid_shape.volume(); - int consumer_grid_volume = 0; - if (consumer_op_model.buda_op_node->is_matmul() and edge.consumer_input_port_id < 2) - { - consumer_grid_volume = - (edge.consumer_input_port_id == 1) ? consumer_op_model.grid_shape.c : consumer_op_model.grid_shape.r; - } - else - { - consumer_grid_volume = consumer_op_model.grid_shape.volume(); - } - usage.producer_fan_out = round_up_div(consumer_grid_volume, producer_grid_volume); - } - return usage; -} - -std::tuple get_sparse_matmul_metadata(balancer::OpModel const& op_model) -{ - int grid_r = op_model.grid_shape.r; - int u_rt = op_model.output_buffers[0].block_shape.ublock.rt; - int u_kt = op_model.input_buffers[1].block_shape.ublock.rt; - int t_factor_c = op_model.t_stream_factor.c; - int t_factor_r = op_model.t_stream_factor.r; - const sparse::SparseBUDA& sparse_buda = *(op_model.sparse_buda); - auto layout = sparse::SparseBUDA::create_layout( - op_model.has_sparse_buffer() or env_as("PYBUDA_FORCE_SPARSE_BUFFER_LAYOUT"), - op_model.t_stream_factor.dir.z_major(), - op_model.fracture_factor); - int bcast_factor = sparse_buda.bcast_factor; - int zdim = sparse_buda.sparse_zs.size(); - - // Initialize tiles/ublocks/strips counters - int sum_nz_tiles = 0; - int sum_nz_ublocks = 0; - int sum_nz_strips = 0; - constexpr int TILE_DIM = tt::sparse::TILE_DIM; - - struct CounterEntry - { - std::unordered_set rt_ct_cmb; - std::unordered_set ubc_ubr_cmb; - std::unordered_set ubc_idxs; - int smallest_rt; - CounterEntry() : smallest_rt(INT_MAX){}; - }; - - std::vector counters; - int slice_count = grid_r * t_factor_r; - - // Iterate through all sparse tensors - for (int z = 0; z < zdim; z++) - { - auto sparse = sparse_buda.sparse_zs[z]; - - // Take stats of the SparseCOO - int dflow_factor = (layout == sparse::SparseBUDA::Layout::ZMajorDataflow) - ? (sparse.rt() / grid_r / t_factor_r / bcast_factor) - : 1; - int num_slices = (layout == tt::sparse::SparseBUDA::Layout::Default) - ?
grid_r * t_factor_r - : grid_r * t_factor_r * bcast_factor * dflow_factor; - std::int64_t slice_height = sparse.shape[0] / num_slices; - - std::vector ret(num_slices); - for (size_t idx = 0; idx < sparse.rows.size(); idx++) - { - // Count nonzero tiles/ublocks/strips in the SparseCOO - int ret_slice_idx = -1, rt = -1; - if (layout == tt::sparse::SparseBUDA::Layout::Default) - { - ret_slice_idx = sparse.rows[idx] / slice_height; - rt = (sparse.rows[idx] % slice_height) / TILE_DIM; - } - else if (layout == tt::sparse::SparseBUDA::Layout::ZMajor) - { - int slice_idx = sparse.rows[idx] / slice_height; - int inner_idx = (slice_idx / slice_count) * grid_r + (slice_idx % grid_r); - int slice_inner_idx = inner_idx % bcast_factor; - ret_slice_idx = (slice_idx % (grid_r * t_factor_r)) / grid_r * grid_r + (inner_idx / bcast_factor); - int new_rows = (sparse.rows[idx] % slice_height) + slice_height * slice_inner_idx; - rt = new_rows / TILE_DIM; - } - else - { - TT_ASSERT( - layout == sparse::SparseBUDA::Layout::BufferOp or - layout == sparse::SparseBUDA::Layout::ZMajorDataflow); - if (layout == sparse::SparseBUDA::Layout::ZMajorDataflow and - ((sparse.rt() / grid_r / t_factor_r) % bcast_factor != 0)) - continue; - - int slice_idx = sparse.rows[idx] / slice_height; - int inner_idx = (slice_idx % (dflow_factor * grid_r)) * bcast_factor + - (slice_idx / (dflow_factor * grid_r * t_factor_r)); - int slice_inner_idx = inner_idx % (bcast_factor * dflow_factor); - ret_slice_idx = (slice_idx / (dflow_factor * grid_r)) % t_factor_r * grid_r + - (inner_idx / (bcast_factor * dflow_factor)); - int new_rows = (sparse.rows[idx] % slice_height) + slice_height * slice_inner_idx; - rt = new_rows / TILE_DIM; - } - int ct = sparse.cols[idx] / TILE_DIM; - int ubr_idx = rt / u_rt; - int ubc_idx = ct / u_kt; - uint64_t rt_ct_key = (uint64_t(rt) << 32) | (ct & 0x0FFFF); - uint64_t ubc_ubr_key = (uint64_t(ubc_idx) << 32) | (ubr_idx & 0x0FFFF); - - // Add the metadata to counting struct - CounterEntry& e = ret[ret_slice_idx]; - e.rt_ct_cmb.insert(rt_ct_key); - e.ubc_ubr_cmb.insert(ubc_ubr_key); - e.ubc_idxs.insert(ubc_idx); - if (rt < ret[ret_slice_idx].smallest_rt) - ret[ret_slice_idx].smallest_rt = rt; - } - - // Count tiles, ublocks, strips - for (int idx = 0; idx < slice_count; idx++) - { - const CounterEntry& e = ret[idx]; - sum_nz_tiles += e.rt_ct_cmb.size(); - sum_nz_ublocks += e.ubc_ubr_cmb.size(); - sum_nz_strips += e.ubc_idxs.size(); - if (e.smallest_rt >= 1 and e.smallest_rt < INT_MAX) - { - sum_nz_tiles++; - sum_nz_ublocks++; - } - } - } - - sum_nz_tiles *= t_factor_c; - sum_nz_ublocks *= t_factor_c; - sum_nz_strips *= t_factor_c; - return std::make_tuple<>(sum_nz_tiles, sum_nz_ublocks, sum_nz_strips); -} - -} // namespace tt::balancer diff --git a/pybuda/csrc/balancer/balancer_utils.hpp b/pybuda/csrc/balancer/balancer_utils.hpp deleted file mode 100644 index b21c6408d..000000000 --- a/pybuda/csrc/balancer/balancer_utils.hpp +++ /dev/null @@ -1,157 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include - -#include "balancer/python_interface.hpp" -#include "balancer/types.hpp" -#include "graph_lib/graph.hpp" -#include "graph_lib/node.hpp" -#include "graph_lib/utils.hpp" -#include "placer/placer.hpp" -#include "types.hpp" -#include "utils/assert.hpp" - -using Graph = tt::graphlib::Graph; -using Node = tt::graphlib::Node; -using Edge = tt::graphlib::Edge; -using DataFormat = tt::DataFormat; - -namespace tt::balancer -{ -inline std::size_t 
tile_size_bytes(DataFormat data_format, bool include_header_padding = true) -{ - std::size_t size = 0xbadface; - std::size_t header_padding_size = include_header_padding ? 32 : 0; - switch (data_format) - { - // clang-format off - case DataFormat::Float32 : size = 32*32*4 + header_padding_size /* header and pad */ ; break; - case DataFormat::Float16 : size = 32*32*2 + header_padding_size /* header and pad */ ; break; - case DataFormat::Bfp8 : size = 32*32 + 64 + header_padding_size /* header and pad */ ; break; - case DataFormat::Bfp4 : size = 512 + 64 + header_padding_size /* header and pad */ ; break; - case DataFormat::Bfp2 : size = 256 + 64 + header_padding_size /* header and pad */ ; break; - case DataFormat::Float16_b: size = 32*32*2 + header_padding_size /* header and pad */ ; break; - case DataFormat::Bfp8_b : size = 32*32 + 64 + header_padding_size /* header and pad */ ; break; - case DataFormat::Bfp4_b : size = 512 + 64 + header_padding_size /* header and pad */ ; break; - case DataFormat::Bfp2_b : size = 256 + 64 + header_padding_size /* header and pad */ ; break; - case DataFormat::Lf8 : size = 32*32 + header_padding_size /* header and pad */ ; break; - case DataFormat::UInt16 : size = 32*32*2 + header_padding_size /* header and pad */ ; break; - case DataFormat::Int8 : size = 32*32 + header_padding_size /* header and pad */ ; break; - case DataFormat::Int32 : size = 32*32*4 + header_padding_size /* header and pad */ ; break; - case DataFormat::RawUInt8 : size = 32*32 + header_padding_size /* header and pad */ ; break; - case DataFormat::RawUInt16: size = 32*32*2 + header_padding_size /* header and pad */ ; break; - case DataFormat::RawUInt32: size = 32*32*4 + header_padding_size /* header and pad */ ; break; - case DataFormat::Invalid : size = 0xbadface ; break; - // clang-format on - } - return size; -} - -template -T round_up_div(T n, T d) -{ - return (n + d - 1) / d; -} - -inline std::size_t calculate_dst_size_tiles( - std::size_t dst_size_bytes, DataFormat accumulate_df, int tile_volume, int num_buffers = 2) -{ - TT_ASSERT(is_valid_accumulate_df(accumulate_df)); - std::size_t available_dst_size = dst_size_bytes / num_buffers; // half-dst for double-buffering - std::size_t bytes_per_datum = data_format_byte_size(accumulate_df); - TT_ASSERT(bytes_per_datum == 2 or bytes_per_datum == 4); - std::size_t bytes_per_tile = tile_volume * bytes_per_datum; - return round_up_div(available_dst_size, bytes_per_tile); -} - -template -std::string round_float(const T num, int precision) -{ - static_assert(std::is_same::value || std::is_same::value, "Only floats and doubles allowed!"); - TT_ASSERT(!(precision < 0), "precision can't be negative!"); - - std::stringstream ss; - ss << std::fixed << std::setprecision(precision) << num; - return ss.str(); -} - -template -inline T gcd(T a, T b) -{ - T r = 1; - for (T i = std::min(a, b); i > 0; --i) - { - if (((a % i) == 0) and ((b % i) == 0)) - { - r = i; - break; - } - } - return r; -} - -inline bool divisible_either_direction(int a, int b) { return ((a % b) == 0) || ((b % a) == 0); } - -OpShape get_op_shape( - Graph const *graph, - Node const *node, - GridShape grid_shape = {1, 1}, - int u_kt = 1, - int u_rt = 1, - TStreamFactor t_stream_factor = {}, - int fracture_factor = 1); -std::vector calculate_undo_t_streaming_tms( - Graph const *graph, Node const *node, OpModel const &op_model); -std::vector calculate_t_streaming_tms( - Graph const *graph, Node const *node, OpModel const &op_model); - -std::pair map_inverse_tms( - CanCoord coord, 
TensorShape shape, std::vector const &tms); - -int detect_repetitive_pattern(std::unordered_map *const kb_cache, Pipe const &pipe); - -ResourceUsage get_edge_resource_usage(std::unordered_map &pipe_to_ru_cache, Pipe pipe); - -// This path does a full tile order check to ensure that the pipe doesn't violate any HW constraints -ResourceUsage get_edge_resource_usage( - Graph const *graph, - std::unordered_map &pipe_to_ru_cache, - graphlib::Edge edge, - OpModel const &producer_op_model, - OpModel const &consumer_op_model, - bool is_queue = false); - -// This path uses the old path, super simple heuristic based check that really only enforces some grid forking -// constraints -ResourceUsage get_edge_resource_usage_simple( - Graph const *graph, - graphlib::Edge edge, - OpModel const &producer_op_model, - OpModel const &consumer_op_model, - bool is_queue = false); - -std::tuple get_sparse_matmul_metadata(balancer::OpModel const &grid); - -} // namespace tt::balancer - -namespace std -{ -template <> -struct hash -{ - std::size_t operator()(const tt::balancer::BlockShape &block_shape) const - { - std::size_t seed = 0; - tt::hash_combine(seed, static_cast(block_shape.tblock_m)); - tt::hash_combine(seed, static_cast(block_shape.tblock_n)); - tt::hash_combine(seed, static_cast(block_shape.mblock_m)); - tt::hash_combine(seed, static_cast(block_shape.mblock_n)); - tt::hash_combine(seed, static_cast(block_shape.ublock.rt)); - tt::hash_combine(seed, static_cast(block_shape.ublock.ct)); - return seed; - } -}; -} // namespace std diff --git a/pybuda/csrc/balancer/exceptions.hpp b/pybuda/csrc/balancer/exceptions.hpp deleted file mode 100644 index 542a03e8e..000000000 --- a/pybuda/csrc/balancer/exceptions.hpp +++ /dev/null @@ -1,132 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include -#include -#include -#include -#include -namespace tt::balancer -{ - -// New failure reason can be defined after initial NoFailure value but prior to MaxFailureReason.
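// [Editor's note] Illustrative sketch, not part of the deleted file: the failure-reason
// table defined just below uses the X-macro idiom, where a single macro table is expanded
// once into an enum and once into a parallel array of description strings, so both stay in
// sync when a new entry is added. The names in this sketch (Color, COLOR_TABLE, ColorDesc)
// are hypothetical and exist only for illustration.
#include <cstdio>
#include <string>

#define COLOR_TABLE   \
    X(Red, "red")     \
    X(Green, "green") \
    X(Blue, "blue")

#define X(name, desc) name,
enum Color
{
    COLOR_TABLE
    ColorCount
};
#undef X

#define X(name, desc) desc,
static std::string ColorDesc[] = {COLOR_TABLE};
#undef X

int main()
{
    // Each enum value indexes its own description string; prints "green".
    std::printf("%s\n", ColorDesc[Green].c_str());
    return 0;
}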
-// -#define OpModelFailureReasons \ - X(NoFailure, "No Failure - Valid OpModel") \ - X(IllegalStackForGrid, "Illegal stack for grid dimension") \ - X(UserAccessPreventsStreaming, "User access prevents streaming") \ - X(IllegalStreaming, "Illegal streaming") \ - X(L1UsageOverMaxLimit, "L1 usage > L1 Max") \ - X(ExceededDramChannelCapacity, "Exceeded DRAM channel capacity") \ - X(InputBufferAllocationFailure, "Failed to allocate input buffers") \ - X(PaddingConstraintsNotSatisfied, "Padding constraints not satisfied") \ - X(MaxFailureReason, "") - -#define X(a, b) a, -enum OpModelFailureReason -{ - OpModelFailureReasons -}; -#undef X - -#define X(a, b) b, -static std::string OpModelFailureReasonMessages[] = {OpModelFailureReasons}; -#undef X - -class BudaOpNodeLegalizerFailureInfo -{ - private: - std::uint32_t opModelFailureCountByType[MaxFailureReason]; - - public: - BudaOpNodeLegalizerFailureInfo() : opModelFailureCountByType() {} - void recordOpModelFailure(OpModelFailureReason failureReason) - { - TT_ASSERT(failureReason < MaxFailureReason, "Invalid failure reason."); - opModelFailureCountByType[failureReason]++; - } - - std::uint32_t getOpModelFailureCountByType(OpModelFailureReason failureReason) const - { - return opModelFailureCountByType[failureReason]; - } - - std::string toString() const - { - std::string result = "Op model failure counts by type: \n"; - - for (int i = NoFailure; i < MaxFailureReason; i++) - { - result += OpModelFailureReasonMessages[i] + ": " + std::to_string(opModelFailureCountByType[i]) + "\n"; - } - - return result; - } -}; - -struct BalancerError : public std::exception -{ - struct NodeExceedsMaxOpForks - { - std::int64_t max_forks; - std::int64_t node_id; - - bool specific_node() const { return node_id != -1; } - - NodeExceedsMaxOpForks(std::int64_t max_forks, std::int64_t node_id = -1) : - max_forks(max_forks), node_id(node_id) - { - } - }; - - struct InputBroadcastExceedsMaxGridForks - { - std::int64_t input_node_id; - - InputBroadcastExceedsMaxGridForks(std::int64_t input_node_id) : input_node_id(input_node_id) {} - }; - - struct DRAMWriterNOPNeeded - { - const std::string src; - bool transpose; - - DRAMWriterNOPNeeded(const std::string& src, bool transpose) : src(src), transpose(transpose) {} - }; - - struct NoValidGrid - { - std::unordered_map nodes_without_legal_op_model; - - NoValidGrid( - std::unordered_map&& nodes_without_legal_op_model) : - nodes_without_legal_op_model(std::move(nodes_without_legal_op_model)) - { - } - }; - - // Generic unrecoverable error - struct Fatal - { - std::string message; - Fatal(const std::string& message) : message(message) {} - }; - - using Type = std::variant< - std::monostate, - NodeExceedsMaxOpForks, - InputBroadcastExceedsMaxGridForks, - DRAMWriterNOPNeeded, - NoValidGrid, - Fatal>; - - std::string message; - Type type; - - BalancerError(std::string const& message, Type type = std::monostate{}) : message(message), type(type) {} - virtual char const* what() const noexcept override { return message.c_str(); } -}; -} // namespace tt::balancer diff --git a/pybuda/csrc/balancer/legalizer/constraints.cpp b/pybuda/csrc/balancer/legalizer/constraints.cpp deleted file mode 100644 index 8d403f761..000000000 --- a/pybuda/csrc/balancer/legalizer/constraints.cpp +++ /dev/null @@ -1,391 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "balancer/legalizer/constraints.hpp" - -#include "balancer/balancer_utils.hpp" -#include "balancer/exceptions.hpp" -#include 
"balancer/legalizer/graph_solver.hpp" -#include "graph_lib/node_types.hpp" -#include "graph_lib/utils.hpp" -#include "passes/t_stream.hpp" -#include "utils/logger.hpp" - -namespace tt::balancer::legalizer -{ -static bool block_compatible( - OpModel const& producer, - OpModel const& consumer, - graphlib::Edge edge, - graphlib::UBlockOrder ublock_order, - bool has_transpose, - bool consumer_needs_matmul_streaming) -{ - TStreamFactor producer_t_stream_factor = - has_transpose ? TStreamFactor::Transposed(producer.t_stream_factor) : producer.t_stream_factor; - - if (producer_t_stream_factor.none()) - return true; - - tt::graphlib::NodeId node_id = edge.consumer_node_id; - TensorShape producer_shape = producer.effective_input_buffer_shape_for_user.at(node_id); - BlockShape consumer_block_shape = consumer.input_buffers[edge.consumer_input_port_id].block_shape; - int consumer_shape_rt = consumer_block_shape.rt() * consumer.grid_shape.r; - int consumer_shape_ct = consumer_block_shape.ct() * consumer.grid_shape.c; - UBlockShape consumer_ublock = consumer.input_buffers[edge.consumer_input_port_id].block_shape.ublock; - - bool producer_anti_ublock_order = not producer_t_stream_factor.dir.is_ublock_order(ublock_order); - bool ublock_order_preserved = false; - if (producer_anti_ublock_order or consumer_needs_matmul_streaming) - { - ublock_order_preserved = - (ublock_order == graphlib::UBlockOrder::R) - ? ((producer_shape.rt == consumer_ublock.rt) or (producer_shape.ct == consumer_shape_ct)) - : ((producer_shape.ct == consumer_ublock.ct) or (producer_shape.rt == consumer_shape_rt)); - } - else // ublock order == eltwise style streaming - { - // Either you've sliced all the way down to a single ublock, or you're not streaming in anti-ublock dimension - ublock_order_preserved = (ublock_order == graphlib::UBlockOrder::R) - ? 
((producer_shape.rt == consumer_ublock.rt) or (producer_t_stream_factor.c == 1)) - : ((producer_shape.ct == consumer_ublock.ct) or (producer_t_stream_factor.r == 1)); - } - - return ((producer_shape.rt % consumer_ublock.rt) == 0) and ((producer_shape.ct % consumer_ublock.ct) == 0) and - ublock_order_preserved; -} - -static bool double_counted(graphlib::Graph const* graph, graphlib::Edge edge) -{ - auto operand_edges = graph->operand_data_edges(graph->node_by_id(edge.consumer_node_id)); - auto smallest_port_id = edge.consumer_input_port_id; - for (graphlib::Edge operand : operand_edges) - { - if (operand.producer_node_id == edge.producer_node_id) - { - smallest_port_id = std::min(smallest_port_id, operand.consumer_input_port_id); - } - } - return smallest_port_id != edge.consumer_input_port_id; -} - -static bool legal_t_stream_dirs( - TStreamFactor producer_t_stream_factor, - TStreamFactor consumer_t_stream_factor, - bool producer_has_z, - bool consumes_rz_major) -{ - auto producer_dir = producer_t_stream_factor.dir; - auto consumer_dir = consumer_t_stream_factor.dir; - - // TMs with pattern vslice(N) -> hstack(N) force RZ -> R streaming - // If stream direction is identical this is always legal - // If the producer has a z then the consumer must handle same streaming dir - // otherwise only their primary streaming directions need to match - if (consumes_rz_major) - return producer_dir == TStreamDir::RZ and consumer_dir == TStreamDir::R; - else if (producer_dir == consumer_dir) - return true; - else if (producer_has_z and producer_dir.z_major()) - return producer_dir == consumer_dir; - else if (not producer_t_stream_factor.none() and not consumer_t_stream_factor.none()) - return producer_dir.primary_dir_compatible(consumer_dir); - return true; -} - -static bool legal_matmul_streaming( - TStreamFactor producer_t_stream_factor, - TStreamFactor consumer_t_stream_factor, - graphlib::Edge edge, - bool consumer_is_matmul) -{ - if (not consumer_is_matmul) - return true; - - bool lhs_matmul = edge.consumer_input_port_id == 0; - bool rhs_matmul = edge.consumer_input_port_id == 1; - bool both_streaming = producer_t_stream_factor.is_streaming() and consumer_t_stream_factor.is_streaming(); - - if (both_streaming and rhs_matmul and consumer_t_stream_factor.dir.r()) - { - return false; - } - - if (both_streaming and lhs_matmul and consumer_t_stream_factor.dir.c()) - { - return false; - } - - return true; -} - -static ConstraintFailureReason legal_t_streaming( - OpModel const& producer, - OpModel const& consumer, - graphlib::Edge edge, - graphlib::UBlockOrder ublock_order, - bool has_transpose, - bool consumer_needs_matmul_streaming) -{ - TStreamFactor producer_t_stream_factor = - has_transpose ? 
TStreamFactor::Transposed(producer.t_stream_factor) : producer.t_stream_factor; - TStreamFactor consumer_t_stream_factor = consumer.t_stream_factor; - - if (producer_t_stream_factor.none() and consumer_t_stream_factor.none()) - { - return NoConstraintFailure; // early out if we're not streaming - } - - if (not producer_t_stream_factor.compatible_consumer( - consumer_t_stream_factor, consumer.is_sparse_matmul, consumer.consumes_rz_major)) - { - return TStreamIncompatibleConsumer; - } - - bool producer_has_z = consumer.op_shape.inputs[edge.consumer_input_port_id].z > consumer_t_stream_factor.t(); - if (not legal_t_stream_dirs( - producer_t_stream_factor, consumer_t_stream_factor, producer_has_z, consumer.consumes_rz_major)) - { - return TStreamIllegalTStreamDir; - } - - bool producer_streaming = producer_t_stream_factor.is_streaming(); - bool consumer_streaming = consumer_t_stream_factor.is_streaming(); - - // We must ensure block shape is compatible through the entire stream - if (not block_compatible(producer, consumer, edge, ublock_order, has_transpose, consumer_needs_matmul_streaming)) - { - return TStreamBlockIncompatible; - } - - // This might be able to be relaxed in the future, but for now, disallow anti-ublock-order stacking - auto orig_producer_t_stream_factor = producer.t_stream_factor; // non-transposed - bool producer_anti_ublock_order = not orig_producer_t_stream_factor.dir.is_ublock_order(ublock_order); - if (producer_streaming and not consumer_streaming and producer_anti_ublock_order) - { - return TStreamAntiUblockOrderStacking; - } - - // If matmul is streaming then producer can only stream from respective side LHS=TStreamDir::R, RHS=TStreamDir::C - if (not legal_matmul_streaming( - producer_t_stream_factor, consumer_t_stream_factor, edge, consumer_needs_matmul_streaming)) - { - return TStreamMatmulStreamingSide; - } - - // Stack / grid divsilibility constraints - bool lhs_matmul = consumer.op_type() == "matmul" and edge.consumer_input_port_id == 0; - bool rhs_matmul = consumer.op_type() == "matmul" and edge.consumer_input_port_id == 1; - int stack_factor_r = producer_t_stream_factor.r / consumer_t_stream_factor.r; - int stack_factor_c = producer_t_stream_factor.c / consumer_t_stream_factor.c; - if (not rhs_matmul and stack_factor_r and not divisible_either_direction(stack_factor_r, consumer.grid_shape.r)) - { - return TStreamNotDivisableRow; - } - - if (not lhs_matmul and stack_factor_c and not divisible_either_direction(stack_factor_c, consumer.grid_shape.c)) - { - return TStreamNotDivisableColumn; - } - - return NoConstraintFailure; -} - -std::pair GrayskullConstraint::queue_to_op_cost( - graphlib::Graph const* graph, - graphlib::Edge edge, - std::optional queue_producer_op_model, - OpModel const& consumer) -{ - // This is the input edge case - graphlib::Node const* producer_node = graph->node_by_id(edge.producer_node_id); - graphlib::InputNode const* input = dynamic_cast(producer_node); - - if (input and (input->is_constant() or input->is_parameter() or input->is_optimizer_parameter())) - { - // We can reblock parameters and constants to have the same grid shape as consumer - // Therefore 1 to 1 mapping - return std::make_pair(EdgeCost(0, 0, 0, 1, &device_config, nullptr, &consumer), NoConstraintFailure); - } - else if (queue_producer_op_model) - { - ResourceUsage usage = - resource_usage_fallback_mode - ? 
get_edge_resource_usage_simple(graph, edge, *queue_producer_op_model, consumer, true) - : get_edge_resource_usage( - graph, - balancer_cache_collection->pipe_to_resource_usage_cache, - edge, - *queue_producer_op_model, - consumer, - true); - ConstraintFailureReason constraint_failure = - (usage.consumer_fan_in > EdgeCost::kMaxDRAMInQueues) ? ExceedsDRAMInQueues : NoConstraintFailure; - return std::make_pair( - EdgeCost(0, 0, 0, usage.consumer_fan_in, &device_config, nullptr, &consumer), constraint_failure); - } - else if (consumer.op_type() == "matmul") - { - return std::make_pair( - EdgeCost( - 0, - 0, - 0, - edge.consumer_input_port_id == 0 ? consumer.grid_shape.r : consumer.grid_shape.c, - &device_config, - nullptr, - &consumer), - NoConstraintFailure); - } - else - { - // If there is no queue_producer_op_model this is likely coming from an activation on 1x1 grid - return std::make_pair(EdgeCost(0, 0, 0, 1, &device_config, nullptr, &consumer), NoConstraintFailure); - } -} - -std::pair GrayskullConstraint::op_to_op_cost( - graphlib::Graph const* graph, graphlib::Edge edge, OpModel const& producer, OpModel const& consumer) -{ - graphlib::Node const* producer_node = graph->node_by_id(edge.producer_node_id); - if (producer_node->node_type() != graphlib::NodeType::kInput && - (graph->user_data_edges(producer_node).size() > EdgeCost::kMaxDRAMOutQueues)) - { - throw BalancerError( - fmt::format("Node exceeds kMaxDRAMOutQueues {}", producer_node->name()), - BalancerError::NodeExceedsMaxOpForks(EdgeCost::kMaxDRAMOutQueues)); - } - - graphlib::OpNode const* consumer_node = - dynamic_cast(graph->node_by_id(edge.consumer_node_id)); - TT_ASSERT(consumer_node); - bool consumer_needs_matmul_streaming = consumer_node->is_matmul() and not consumer_node->is_sparse_matmul(); - bool is_double_counted = double_counted(graph, edge); - auto edge_attr = graph->get_edge_attributes(edge); - bool has_transpose = edge_attr->has_tm("transpose"); - - ConstraintFailureReason tStreamFailureReason = legal_t_streaming( - producer, consumer, edge, edge_attr->get_ublock_order(), has_transpose, consumer_needs_matmul_streaming); - - if (NoConstraintFailure != tStreamFailureReason) - { - return std::make_pair(EdgeCost(0, 0, 0, 0, &device_config), tStreamFailureReason); - } - - // Hack, for convs there are slice/stack tms that immediately follow a sparse matmul which - // causes hangs on silicon if we then go into matmul streaming. - graphlib::OpNode const* producer_op_node = dynamic_cast(producer_node); - if (producer.t_stream_factor.none() and not consumer.t_stream_factor.none() and producer_op_node != nullptr and - producer_op_node->is_sparse_matmul() and consumer_needs_matmul_streaming) - { - return std::make_pair(EdgeCost(0, 0, 0, 0, &device_config), ConvSliceStackMatmulStreamingHang); - } - - if (is_double_counted) - { - return std::make_pair( - EdgeCost(0, 0, 0, 0, &device_config, &producer, &consumer), - NoConstraintFailure); // This edge has a sibling that connects the same pair of nodes - } - - // Calculate edge cost - ResourceUsage usage = - resource_usage_fallback_mode - ? 
get_edge_resource_usage_simple(graph, edge, producer, consumer) - : get_edge_resource_usage( - graph, balancer_cache_collection->pipe_to_resource_usage_cache, edge, producer, consumer); - - return std::make_pair( - EdgeCost( - usage.producer_fan_out, - usage.producer_phases, - usage.consumer_phases, - 0, - &device_config, - &producer, - &consumer), - NoConstraintFailure); -} - -std::pair GrayskullConstraint::op_to_queue_cost( - graphlib::Graph const* graph, - graphlib::Edge edge, - OpModel const& producer, - std::optional queue_consumer_op_model) -{ - if (graph->node_by_id(edge.consumer_node_id)->node_type() == graphlib::NodeType::kQueue) - { - // As queue is inheriting OpModels from producer we need to make them tightly 1-1 coupled, - // as this will cause invalid OpModel elimination propagation work properly from Queue to Producer and vice - // versa. - // - OpModel* consumer_op_model = queue_consumer_op_model.has_value() ? &queue_consumer_op_model.value() : nullptr; - - if (consumer_op_model && consumer_op_model->id == producer.id) - { - // If there is an OpModel for consumer queue, then the ids must match - return std::make_pair( - EdgeCost(1, 0, 0, 0, &device_config, &producer, consumer_op_model), NoConstraintFailure); - } - else - { - return std::make_pair(EdgeCost(0, 0, 0, 0, &device_config, &producer, nullptr), OpToQueueMapping); - } - } - else - { - return std::make_pair( - EdgeCost(1, 0, 0, 0, &device_config, &producer, nullptr), - NoConstraintFailure); // This is the output edge case - } -} - -#ifdef DEBUG -std::string EdgeConstraintDebugInfo::toString(const Graph* graph) const -{ - std::string result = "Edge constraint failure counts by type: \n"; - - for (int i = NoConstraintFailure; i < MaxConstraintFailureReason; i++) - { - result += ConstraintFailureReasonDesc[i] + ": " + std::to_string(constraintFailureCountByType[i]) + "\n"; - } - - if (nullptr != graph && eliminatingEdges.size() > 0) - { - result += "Edge elimination data for valid paths:\n"; - for (const std::pair& pair : eliminatingEdges) - { - graphlib::NodeId producer_node_id = std::get<0>(pair.first); - graphlib::NodeId consumer_node_id = std::get<2>(pair.first); - - if (0 == producer_node_id) - { - result += "Subgraph consistency update eliminated: " + std::to_string(pair.second) + " valid paths.\n"; - } - else - { - std::string producerNodeName = graph->node_by_id(producer_node_id)->name(); - std::string consumerNodeName = graph->node_by_id(consumer_node_id)->name(); - result += "Edge " + producerNodeName + " -> " + consumerNodeName + - " eliminated: " + std::to_string(pair.second) + " valid paths.\n"; - } - } - } - - return result; -} - -void EdgeConstraintDebugInfo::addEliminatingEdge(const graphlib::EdgeUniqueId edge) -{ - if (eliminatingEdges.count(edge) == 0) - { - eliminatingEdges.insert(std::make_pair(edge, 1)); - } - else - { - eliminatingEdges[edge]++; - } -} -#endif - -} // namespace tt::balancer::legalizer diff --git a/pybuda/csrc/balancer/legalizer/constraints.hpp b/pybuda/csrc/balancer/legalizer/constraints.hpp deleted file mode 100644 index 9142994fa..000000000 --- a/pybuda/csrc/balancer/legalizer/constraints.hpp +++ /dev/null @@ -1,341 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include - -#include "backend_api/device_config.hpp" -#include "balancer/balancer_cache_collection.hpp" -#include "balancer/types.hpp" -#include "graph_lib/graph.hpp" -#include "graph_lib/node.hpp" -#include "graph_lib/node_types.hpp" -#include 
"utils/assert.hpp" -#include "utils/env.hpp" - -namespace tt::graphlib -{ -class Graph; -enum class EdgeType; -using EdgeUniqueId = std::tuple; -} // namespace tt::graphlib - -namespace tt::balancer::legalizer -{ -class GraphSolver; - -// New constraint failure reason can be defined after initial NoConstraintFailure value but prior to -// MaxConstraintFailureReason. -// -// clang-format off -#define EdgeConstraintFailureReasons \ - X (NoConstraintFailure, "No Failure - Valid path") \ - X (Failed, "Failed") \ - X (TStreamIncompatibleConsumer, "Tstream consumer not compatible with producer TStream factor") \ - X (TStreamIllegalTStreamDir, "TStream direction illegal") \ - X (TStreamPipegenDoubleCounted, "TStream pipe cannot scatter to more than 4 destinations due to constraints with forking") \ - X (TStreamBlockIncompatible, "TStream block shape incompatible") \ - X (TStreamAntiUblockOrderStacking, "TStream anti ublock order stacking is dissallowed") \ - X (TStreamMatmulStreamingSide, "TStream if matmul is streaming then producer can only stream from respective side LHS=TStreamDir::R, RHS=TStreamDir::C") \ - X (TStreamNotDivisableRow, "TStream stack/grid not divisible - Row") \ - X (TStreamNotDivisableColumn, "TStream stack/grid not divisible - Column") \ - X (ConvSliceStackMatmulStreamingHang, "For convs there are slice/stack tms that immediately follow a sparse matmul which causes hangs on silicon if we then go into matmul streaming") \ - X (MaxCostExceeded, "Cost exceeded, cost too high") \ - X (EdgePathRemovedByPriorEdgeElimination, "Valid path removed by edge elimination") \ - X (OpToQueueMapping, "Special case for generating valid Op to Queue paths") \ - X (ExceedsDRAMInQueues, "Exceeds the maximum number of DRAM input queues per core") \ - X (MaxConstraintFailureReason, "") -// clang-format on - -#define X(a, b) a, -enum ConstraintFailureReason -{ - EdgeConstraintFailureReasons -}; -#undef X - -#define X(a, b) b, -static std::string ConstraintFailureReasonDesc[] = {EdgeConstraintFailureReasons}; -#undef X - -#ifdef DEBUG -class EdgeConstraintDebugInfo -{ - private: - int constraintFailureCountByType[MaxConstraintFailureReason]; - std::unordered_map eliminatingEdges; - - public: - EdgeConstraintDebugInfo() : constraintFailureCountByType() {} - void recordEdgeConstraintFailure(ConstraintFailureReason failureReason) - { - TT_ASSERT(failureReason < MaxConstraintFailureReason, "Invalid failure reason."); - constraintFailureCountByType[failureReason]++; - } - - int getConstraintFailureCountByType(ConstraintFailureReason failureReason) const - { - return constraintFailureCountByType[failureReason]; - } - - void addEliminatingEdge(const graphlib::EdgeUniqueId edge); - std::string toString(const graphlib::Graph* = nullptr) const; -}; -#endif - -struct EdgeCost -{ - // TODO: Read all of these constants from DeviceConfig - // tenstorrent/budabackend#2345 - static constexpr int kMaxBytesPerPhase = 38; - static constexpr int kBackendReservedInQueues = 1; - static constexpr int kMaxDRAMInQueues = 40 - kBackendReservedInQueues; - static constexpr int kMaxDRAMOutQueues = 8; - static constexpr int kMaxFanOutStreams = 16; - int kMaxStreamPhasesProducer = 0; - int kMaxStreamPhasesConsumer = 0; - - std::uint16_t fan_out_streams = 0; - std::uint16_t producer_stream_phases = 0; - std::uint16_t consumer_stream_phases = 0; - std::uint16_t consumer_dram_in_queues = 0; - - EdgeCost() = default; - - EdgeCost( - std::uint16_t fan_out_streams, - std::uint16_t producer_stream_phases, - std::uint16_t 
consumer_stream_phases, - std::uint16_t consumer_dram_in_queues, - const DeviceConfig* device_config = nullptr, - const OpModel* producer = nullptr, - const OpModel* consumer = nullptr) : - fan_out_streams(fan_out_streams), - producer_stream_phases(producer_stream_phases), - consumer_stream_phases(consumer_stream_phases), - consumer_dram_in_queues(consumer_dram_in_queues) - { - kMaxStreamPhasesProducer = calculate_max_stream_phases(producer, device_config); - kMaxStreamPhasesConsumer = calculate_max_stream_phases(consumer, device_config); - } - - int calculate_max_stream_phases(const OpModel* op_model, const DeviceConfig* device_config) - { - // Set default overlay size to BBE reserved space (64kb) - // If there is space in given op model (producer or consumer), assume double the space, but don't edit op model - // yet, we do that after we select the op models, as the usage of the space is dependent on the - // producer/consumer op models. - // We divide all the space by 2 because we want to give half to producers and half to consumers. - - if (not op_model) - { - return 0; // Default value should be 0 as queues can be on the ends of these edges - } - - // All calls with op_model should have device_config provided - TT_ASSERT(device_config); - TT_ASSERT(op_model->overlay_size == 0, "Expected overlay size to be set do default value of 0."); - - // TODO: read from device config - // tenstorrent/budabackend#2344 - // - constexpr int default_overlay_size = 1 << 16; // 64kB - constexpr int overlay_size_to_add = 1 << 16; // 64kB - - // If there is a global override, use that one instead - static int global_overlay_size_to_add = device_config->get_overlay_blob_extra_size(); - if (global_overlay_size_to_add) - { - return ((default_overlay_size + global_overlay_size_to_add) / kMaxBytesPerPhase) / 2; - } - - int available_l1_space = device_config->get_l1_usable_size() - op_model->get_l1_memory_usage(); - if (available_l1_space >= overlay_size_to_add) - { - return ((default_overlay_size + overlay_size_to_add) / kMaxBytesPerPhase) / 2; - } - else - { - return (default_overlay_size / kMaxBytesPerPhase) / 2; - } - } - - bool exceeded() const - { - return (fan_out_streams > kMaxFanOutStreams) or (producer_stream_phases > kMaxStreamPhasesProducer) or - (consumer_stream_phases > kMaxStreamPhasesConsumer) or (consumer_dram_in_queues > kMaxDRAMInQueues); - } - - static EdgeCost producer_sum(EdgeCost a, EdgeCost b) - { - return EdgeCost( - a.fan_out_streams + b.fan_out_streams, a.producer_stream_phases + b.producer_stream_phases, 0, 0); - } - - static EdgeCost consumer_sum(EdgeCost a, EdgeCost b) - { - return EdgeCost( - 0, - 0, - a.consumer_stream_phases + b.consumer_stream_phases, - a.consumer_dram_in_queues + b.consumer_dram_in_queues); - } - - static EdgeCost sum_fan_out_streams(EdgeCost a, EdgeCost b) - { - return EdgeCost(a.fan_out_streams + b.fan_out_streams, 0, 0, 0); - } - - static EdgeCost sum_producer_stream_phases(EdgeCost a, EdgeCost b) - { - auto ret = EdgeCost(0, a.producer_stream_phases + b.producer_stream_phases, 0, 0); - ret.kMaxStreamPhasesProducer = std::min(a.kMaxStreamPhasesProducer, b.kMaxStreamPhasesProducer); - - return ret; - } - - static EdgeCost sum_consumer_stream_phases(EdgeCost a, EdgeCost b) - { - auto ret = EdgeCost(0, 0, a.consumer_stream_phases + b.consumer_stream_phases, 0); - ret.kMaxStreamPhasesConsumer = std::min(a.kMaxStreamPhasesConsumer, b.kMaxStreamPhasesConsumer); - - return ret; - } - - static EdgeCost sum_consumer_dram_in_queues(EdgeCost a, EdgeCost b) - { - 
return EdgeCost(0, 0, 0, a.consumer_dram_in_queues + b.consumer_dram_in_queues); - } - - static bool sort_fan_out_streams(EdgeCost a, EdgeCost b) - { - if (a.fan_out_streams == b.fan_out_streams) // Defer to producer_stream_phases if eq - { - return a.producer_stream_phases < b.producer_stream_phases; - } - return a.fan_out_streams < b.fan_out_streams; - } - - static bool sort_producer_stream_phases(EdgeCost a, EdgeCost b) - { - if (a.producer_stream_phases == b.producer_stream_phases) // Defer to fan_out_streams if eq - { - return a.fan_out_streams < b.fan_out_streams; - } - return a.producer_stream_phases < b.producer_stream_phases; - } - - static bool sort_consumer_stream_phases(EdgeCost a, EdgeCost b) - { - if (a.consumer_stream_phases == b.consumer_stream_phases) // Defer to consumer_dram_in_queuesif eq - { - return a.consumer_dram_in_queues < b.consumer_dram_in_queues; - } - return a.consumer_stream_phases < b.consumer_stream_phases; - } - - static bool sort_consumer_dram_in_queues(EdgeCost a, EdgeCost b) - { - if (a.consumer_dram_in_queues == b.consumer_dram_in_queues) // Defer to consumer_stream_phases if eq - { - return a.consumer_stream_phases < b.consumer_stream_phases; - } - return a.consumer_dram_in_queues < b.consumer_dram_in_queues; - } - - static std::array, 2> producer_cost_fns() - { - return { - std::make_pair(sort_fan_out_streams, sum_fan_out_streams), - std::make_pair(sort_producer_stream_phases, sum_producer_stream_phases)}; - } - - static std::array, 2> consumer_cost_fns() - { - return { - std::make_pair(sort_consumer_stream_phases, sum_consumer_stream_phases), - std::make_pair(sort_consumer_dram_in_queues, sum_consumer_dram_in_queues)}; - } - - static std::array, 4> cost_fns() - { - return { - std::make_pair(sort_fan_out_streams, sum_fan_out_streams), - std::make_pair(sort_producer_stream_phases, sum_producer_stream_phases), - std::make_pair(sort_consumer_stream_phases, sum_fan_out_streams), - std::make_pair(sort_consumer_dram_in_queues, sum_consumer_dram_in_queues)}; - } -}; - -// -// Constraint interface -// -struct Constraint -{ - // Reference to device config, whose lifetime is managed outside of this class. However, this class isn't expected - // to outlive device config. 
- const DeviceConfig& device_config; - std::shared_ptr balancer_cache_collection; - bool resource_usage_fallback_mode = false; - - Constraint(const DeviceConfig& device_config, std::shared_ptr balancer_cache_collection) : - device_config(device_config), balancer_cache_collection(balancer_cache_collection) - { - resource_usage_fallback_mode = env_as("PYBUDA_RESOURCE_USAGE_FALLBACK_MODE"); - } - - virtual std::pair queue_to_op_cost( - graphlib::Graph const* graph, - graphlib::Edge edge, - std::optional queue_producer_op_model, - OpModel const& consumer) = 0; - - virtual std::pair op_to_op_cost( - graphlib::Graph const* graph, graphlib::Edge edge, OpModel const& producer, OpModel const& consumer) = 0; - - virtual std::pair op_to_queue_cost( - graphlib::Graph const* graph, - graphlib::Edge edge, - OpModel const& producer, - std::optional queue_consumer_op_model) = 0; - - virtual ~Constraint() {} -}; - -struct GrayskullConstraint : public Constraint -{ - GrayskullConstraint( - const DeviceConfig& device_config, std::shared_ptr balancer_cache_collection) : - Constraint(device_config, balancer_cache_collection) - { - } - - virtual std::pair queue_to_op_cost( - graphlib::Graph const* graph, - graphlib::Edge edge, - std::optional queue_producer_op_model, - OpModel const& consumer) override; - - virtual std::pair op_to_op_cost( - graphlib::Graph const* graph, graphlib::Edge edge, OpModel const& producer, OpModel const& consumer) override; - - virtual std::pair op_to_queue_cost( - graphlib::Graph const* graph, - graphlib::Edge edge, - OpModel const& producer, - std::optional queue_consumer_op_model) override; -}; - -using WormholeConstraint = GrayskullConstraint; - -inline std::ostream& operator<<(std::ostream& os, EdgeCost const& cost) -{ - os << "EdgeCost{.fan_out_streams = " << cost.fan_out_streams - << ", .producer_stream_phases = " << cost.producer_stream_phases - << ", .consumer_stream_phases = " << cost.consumer_stream_phases - << ", .consumer_dram_in_queues = " << cost.consumer_dram_in_queues << "}"; - return os; -} - -} // namespace tt::balancer::legalizer diff --git a/pybuda/csrc/balancer/legalizer/graph_solver.cpp b/pybuda/csrc/balancer/legalizer/graph_solver.cpp deleted file mode 100644 index 92ebc1e2b..000000000 --- a/pybuda/csrc/balancer/legalizer/graph_solver.cpp +++ /dev/null @@ -1,2163 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "balancer/legalizer/graph_solver.hpp" - -#include - -#include "balancer/balancer_utils.hpp" -#include "balancer/legalizer/constraints.hpp" -#include "balancer/legalizer/legalizer.hpp" -#include "graph_lib/node.hpp" -#include "graph_lib/node_types.hpp" -#include "reportify/reportify.hpp" -#include "utils/assert.hpp" - -namespace tt::balancer::legalizer -{ -GraphSolver::Bitset GraphSolver::kBitsetAll = ~kBitsetNone; - -#ifdef DEBUG -void log_op_model_info(Node* producer_node, Node* consumer_node) -{ - graphlib::BudaOpNode* producer_op_node = dynamic_cast(producer_node); - graphlib::BudaOpNode* consumer_op_node = dynamic_cast(consumer_node); - - if (producer_op_node) - { - log_debug( - LogGraphSolver, - "OpModel failure statistics for producer node: {} {} {}", - producer_op_node->name(), - producer_op_node->get_type(), - producer_op_node->shape()); - log_debug(LogGraphSolver, producer_op_node->leg_debug_info->toString().c_str()); - } - - if (consumer_op_node) - { - log_debug( - LogGraphSolver, - "OpModel failure statistics for consumer node: {} {} {}", - consumer_node->name(), - 
consumer_node->get_type(), - consumer_node->shape()); - log_debug(LogGraphSolver, consumer_op_node->leg_debug_info->toString().c_str()); - } -} -#endif - -void GraphSolver::reset(bool partial_reset_allowed) -{ - path_sets.clear(); - path_set_ids.clear(); - failure_reasons.clear(); - if (!partial_reset_allowed) - { - bitsets.clear(); - bitset_ids.clear(); - } - -#ifdef DEBUG - if (env_as("PYBUDA_LEGALIZER_DETAILED_DEBUGGING")) - { - // Cleanup debug data in OpModels. - // - for (auto& mapElem : shared_data->legal_op_models) - { - std::vector& opModels = mapElem.second; - - for (OpModel& model : opModels) - { - model.eliminating_edge = graphlib::EdgeUniqueId(); - } - } - - for (const auto& verElem : op_model_recompute_version) - { - std::vector& opModels = shared_data->recomputed_legal_op_models.at(verElem.first)[verElem.second]; - - for (OpModel& model : opModels) - { - model.eliminating_edge = graphlib::EdgeUniqueId(); - } - } - } -#endif -} - -std::pair cost_fn( - Constraint* constraint, - graphlib::Graph const* graph, - graphlib::Edge const& edge, - std::vector const& producer_op_models, - std::vector const& consumer_op_models, - std::uint64_t producer_id, - std::uint64_t consumer_id) -{ - graphlib::Node const* producer = graph->node_by_id(edge.producer_node_id); - graphlib::Node const* consumer = graph->node_by_id(edge.consumer_node_id); - static OpModel null_op_model; - if (producer->node_type() != graphlib::NodeType::kBudaOp) - { - TT_ASSERT(consumer_id < consumer_op_models.size()); - return constraint->queue_to_op_cost( - graph, - edge, - producer_op_models.size() > 0 ? producer_op_models[producer_id] : std::optional{}, - consumer_op_models[consumer_id]); - } - else if (consumer->node_type() != graphlib::NodeType::kBudaOp) - { - TT_ASSERT(producer_id < producer_op_models.size()); - return constraint->op_to_queue_cost( - graph, - edge, - producer_op_models[producer_id], - consumer_op_models.size() > 0 ? consumer_op_models[consumer_id] : std::optional{}); - } - else - { - TT_ASSERT(producer_id < producer_op_models.size()); - TT_ASSERT(consumer_id < consumer_op_models.size()); - return constraint->op_to_op_cost(graph, edge, producer_op_models[producer_id], consumer_op_models[consumer_id]); - } -} - -// Generate unique id of op model pair. -// -static std::uint64_t get_op_model_pair_id(const std::uint64_t prod_om_id, const std::uint64_t cons_om_id) -{ - return (size_t(prod_om_id) << 32llu) | size_t(cons_om_id); -} - -bool GraphSolver::resolve_step(const bool self_cut_allowed) -{ -#ifdef DEBUG - EdgeConstraintDebugInfo graph_constraint_debug_info; - bool enable_legalizer_detailed_debugging = env_as("PYBUDA_LEGALIZER_DETAILED_DEBUGGING"); - std::string node_name_edge_debug = env_as("PYBUDA_LEGALIZER_DEBUG_NODE_NAME"); - bool collect_failure_reasons = env_as("PYBUDA_COLLECT_CONSTRAINT_INFO"); -#endif - - Constraint* constraint = shared_data->constraint.get(); - NodePathsProcessor node_processor; - std::vector nodes = graphlib::topological_sort(*graph); - bitsets.reserve(nodes.size()); - bitset_ids.reserve(nodes.size()); - op_disabled_bitset_cache.reserve(nodes.size()); - selected_op_models.reserve(nodes.size()); - bool fast_cut_used = false; // Self-cutting is performed in a single graphsolver pass, followed by one more final - // graphsolver resolution. 
- bool consumer_op_model_exceeds = false; - bool producer_op_model_exceeds = false; - - std::vector self_cut_disabled_on_subgraphs = env_as_vector("PYBUDA_DISABLE_SELF_CUT_FOR_SUBGRAPHS"); - - for (graphlib::Node* consumer_node : nodes) - { - Bitset* consumer_bitset = get_or_insert_bitset(consumer_node->id(), kBitsetAll); - std::vector const& consumer_op_models = get_legal_op_models(consumer_node); - - for (graphlib::Edge edge : graph->operand_data_edges(consumer_node)) - { -#ifdef DEBUG - EdgeConstraintDebugInfo edge_constraint_debug_info; -#endif - // With virtual queue processing on cut edges within GraphTraversalContext, - // we shouldn't be processing cut edges anymore unless we are in fast cut. - // - if (cut_edges.count(edge) > 0) - { - TT_ASSERT(edges_to_ignore.count(edge) > 0, "Cut edge must be ignored!"); - TT_ASSERT(fast_cut_used, "If we are processing cut edge this must be fast cut!"); - - continue; - } - - graphlib::Node* producer_node = graph->node_by_id(edge.producer_node_id); - Bitset* producer_bitset = get_or_insert_bitset(producer_node->id(), kBitsetAll); - std::vector const& producer_op_models = get_legal_op_models(producer_node); - - TT_ASSERT(not(consumer_op_models.empty() and producer_op_models.empty())); - - if (consumer_op_models.size() > kNumBitsetBits) - { - consumer_op_model_exceeds = true; - log_trace( - LogGraphSolver, - "Consumer op models [{}] exceeds kNumBitsetBits [{}] node {}", - consumer_op_models.size(), - kNumBitsetBits, - consumer_node->name()); - } - - if (producer_op_models.size() > kNumBitsetBits) - { - producer_op_model_exceeds = true; - log_trace( - LogGraphSolver, - "Producer op models [{}] exceeds kNumBitsetBits [{}] node {}", - producer_op_models.size(), - kNumBitsetBits, - producer_node->name()); - } - - PathSet::Paths paths; - Bitset edge_producer_bitset = kBitsetNone; - Bitset edge_consumer_bitset = kBitsetNone; - std::uint64_t producer_count = std::min(kNumBitsetBits, std::max(1lu, producer_op_models.size())); - std::uint64_t consumer_count = std::min(kNumBitsetBits, std::max(1lu, consumer_op_models.size())); - bool cacheable = producer_node->node_type() == graphlib::NodeType::kBudaOp and - consumer_node->node_type() == graphlib::NodeType::kBudaOp; - for (std::uint64_t producer_id = 0; producer_id < producer_count; ++producer_id) - { - // If the producer cannot accomodate this path, continue. - // Also if this is not the OpModel we selected, continue. - // - if (!producer_bitset->test(producer_id)) - continue; - - for (std::uint64_t consumer_id = 0; consumer_id < consumer_count; ++consumer_id) - { - // If the consumer cannot accomodate this path, continue. - // Also if this is not the OpModel we selected, continue. - // - if (!consumer_bitset->test(consumer_id)) - continue; - - // Load constraint check result from cache for Op-Op verification if possible, - // otherwise populate cache. 
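// [Editor's note, not part of the deleted file] The cache key computed just below via
// get_op_model_pair_id() packs the producer and consumer OpModel ids into one 64-bit value,
// e.g. ids 7 and 9 give (7ull << 32) | 9ull == 0x0000000700000009; the key is collision-free
// as long as each id fits in 32 bits.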
- // - std::uint64_t pair_id = 0; - std::unordered_map>:: - const_iterator cache_it; - EdgeCost cost; - ConstraintFailureReason constraint_failure_reason; - - if (cacheable) - { - pair_id = get_op_model_pair_id( - producer_op_models[producer_id].id.id, consumer_op_models[consumer_id].id.id); - cache_it = shared_data->constraint_result_cache.find(pair_id); - } - - if (!cacheable or cache_it == shared_data->constraint_result_cache.end()) - { - std::tie(cost, constraint_failure_reason) = cost_fn( - constraint, graph, edge, producer_op_models, consumer_op_models, producer_id, consumer_id); - if (cacheable) - { - shared_data->constraint_result_cache.try_emplace(pair_id, cost, constraint_failure_reason); - } - } - else - { - std::tie(cost, constraint_failure_reason) = cache_it->second; - } - - if (NoConstraintFailure == constraint_failure_reason) - { - if (not cost.exceeded()) - { - TT_ASSERT(producer_id <= std::numeric_limits::max()); - TT_ASSERT(consumer_id <= std::numeric_limits::max()); - paths.push_back(Path(producer_id, consumer_id, cost)); - edge_producer_bitset.set(producer_id); - edge_consumer_bitset.set(consumer_id); - } - else - { - constraint_failure_reason = MaxCostExceeded; - } - } -#ifdef DEBUG - else if ( - collect_failure_reasons and not producer_op_models.empty() and not consumer_op_models.empty()) - { - std::string key = fmt::format( - "{}:{}", producer_op_models[producer_id].id.id, consumer_op_models[consumer_id].id.id); - failure_reasons.insert({key, constraint_failure_reason}); - } - - edge_constraint_debug_info.recordEdgeConstraintFailure(constraint_failure_reason); - graph_constraint_debug_info.recordEdgeConstraintFailure(constraint_failure_reason); -#endif - } - } - -#ifdef DEBUG - if (enable_legalizer_detailed_debugging) - { - compute_edge_elimination_debug_info( - edge, - producer_bitset, - consumer_bitset, - edge_producer_bitset, - edge_consumer_bitset, - const_cast&>(producer_op_models), - const_cast&>(consumer_op_models), - producer_count, - consumer_count, - edge_constraint_debug_info, - graph_constraint_debug_info); - } -#endif - - if (paths.empty() or ((*producer_bitset & edge_producer_bitset) == 0) or - ((*consumer_bitset & edge_consumer_bitset) == 0)) - { -#ifdef DEBUG - // If we fail print whole graph edge constraint statistics, and statistics for this edge. - // If you enable detailed legalizer debugging "PYBUDA_LEGALIZER_DETAILED_DEBUGGING" - // you will also get edge elimination stats and stats of OpModels for both nodes on this edge. - // - log_debug(LogGraphSolver, "Constraint failure statistics for whole graph:"); - log_debug(LogGraphSolver, graph_constraint_debug_info.toString().c_str()); - log_debug( - LogGraphSolver, - "Constraint failure statistics for egde: {} -> {}", - producer_node->name(), - consumer_node->name()); - log_debug(LogGraphSolver, edge_constraint_debug_info.toString(graph).c_str()); - if (enable_legalizer_detailed_debugging) - { - log_op_model_info(producer_node, consumer_node); - } -#endif - - // No valid paths found for this edge, lets try self-cutting if enabled. 
- // - if (GraphSolverSelfCutType::None != balancer_config.graph_solver_self_cut_type and self_cut_allowed and - producer_node->node_type() == graphlib::NodeType::kBudaOp and - consumer_node->node_type() == graphlib::NodeType::kBudaOp and - (std::find( - self_cut_disabled_on_subgraphs.begin(), - self_cut_disabled_on_subgraphs.end(), - graph->get_subgraph_id_for_node(producer_node->id())) == self_cut_disabled_on_subgraphs.end())) - { - fast_cut_used = self_cut(producer_node, consumer_node); - - if (fast_cut_used) - { - *consumer_bitset = kBitsetAll; - continue; - } - else - { - return false; - } - } - - throw_error_for_edge(edge); - } - -#ifdef DEBUG - if (enable_legalizer_detailed_debugging) - { - if (node_name_edge_debug == producer_node->name() or node_name_edge_debug == consumer_node->name() or - node_name_edge_debug.empty()) - { - log_debug( - LogGraphSolver, - "Constraint failure statistics for egde: {} -> {}", - producer_node->name(), - consumer_node->name()); - log_debug(LogGraphSolver, edge_constraint_debug_info.toString(graph).c_str()); - - if (!node_name_edge_debug.empty()) - { - log_op_model_info(producer_node, consumer_node); - } - } - } -#endif - if (!is_subset(*producer_bitset, edge_producer_bitset) && !fast_cut_used) - { - node_processor.add_node(producer_node); - } - - *producer_bitset &= edge_producer_bitset; - *consumer_bitset &= edge_consumer_bitset; - TT_ASSERT(path_set_ids.find(edge) == path_set_ids.end()); - PathSetId path_set_id = (PathSetId)path_sets.size(); - path_sets.emplace_back( - bitset_ids[producer_node->id()], bitset_ids[consumer_node->id()], producer_node, consumer_node, paths); - path_set_ids.emplace(edge, path_set_id); - } - - if (!fast_cut_used) - { - node_processor.process(this); - } - } - - if (consumer_op_model_exceeds) - { - log_warning( - LogGraphSolver, - "Consumer op models exceed kNumBitsetBits requirements for some nodes, check trace for detail"); - } - - if (producer_op_model_exceeds) - { - log_warning( - LogGraphSolver, - "Producer op models exceed kNumBitsetBits requirements for some nodes, check trace for detail"); - } - - if (fast_cut_used) - { - return false; - } - -#ifdef DEBUG - log_debug(LogGraphSolver, "Constraint failure statistics for whole graph:"); - log_debug(LogGraphSolver, graph_constraint_debug_info.toString().c_str()); -#endif - - for (graphlib::Node* node : nodes) - { - // No need to expand root as we are calling for all nodes anyway. - // - update_solver(node, false /* expand_root */); - } - - resolve_step_postprocess(nodes); - - return true; -} - -// Used for tweaking output of graphsolver resolve. -// -void GraphSolver::resolve_step_postprocess(const std::vector& nodes) -{ - // Invalidate streaming into output if possible(enabled by default). - // - invalidate_streaming_into_output(nodes); - - // Invalidate suboptimal op models according to invalidation strategy. - // - if (suboptimal_opmodel_invalidation_strategy) - { - invalidate_suboptimal_op_models(nodes); - } -} - -// Self-cutting is used when we cannot find valid path for an edge due to constraints. Then as last resort so that we -// can resolve this graph we mark this edge as a virtual one and in later phase we will place a queue in its place -// between producer and consumer. Returns whether fast cut should be used - instead of resolving whole graph after each -// cut, compute all cuts first(approximation which may lead to more cuts). 
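// [Editor's note] Minimal sketch, not part of the deleted file: the cut-and-retry flow
// described in the comment above, reduced to a toy. ToySolver and its members are
// hypothetical stand-ins; the real GraphSolver marks the failing edge as cut, inserts a
// virtual queue, recomputes legal op models, and resolves again (see resolve() further below).
struct ToySolver
{
    int unresolved_edges = 3;  // edges that currently have no valid producer/consumer path

    // Returns true once every edge has a valid path; when self-cutting is allowed,
    // "cut" one failing edge so the next attempt works on a simpler graph.
    bool resolve_step(bool self_cut_allowed)
    {
        if (unresolved_edges == 0)
            return true;
        if (self_cut_allowed)
            --unresolved_edges;
        return false;
    }
};

// Mirrors the retry loop in GraphSolver::resolve(): self-cutting is allowed on every
// attempt except the final one.
inline bool resolve_with_retries(ToySolver& solver, int max_retry_step)
{
    for (int retry_step = 1; retry_step <= max_retry_step; ++retry_step)
        if (solver.resolve_step(retry_step < max_retry_step))
            return true;
    return false;
}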
-// -bool GraphSolver::self_cut(graphlib::Node* producer_node, graphlib::Node* consumer_node) -{ - std::unordered_set nodes_to_legalize; - bool use_fast_cut = false; - - switch (balancer_config.graph_solver_self_cut_type) - { - case FastCut: use_fast_cut = true; [[fallthrough]]; - case ConsumerOperandDataEdgesFirst: - { - log_debug( - LogGraphSolver, - "Constraint failure - trying to resolve by self-cutting all operand data edges for node {}", - consumer_node->name()); - - for (graphlib::Edge edge : graph->operand_data_edges(consumer_node)) - { - graphlib::Node* producer_node = graph->node_by_id(edge.producer_node_id); - - if (producer_node->node_type() == graphlib::NodeType::kBudaOp) - { - TT_ASSERT(cut_edges.count(edge) == 0, "Same edge should not be cut twice!"); - TT_ASSERT( - selected_op_models.count(producer_node) == 0 or selected_op_models.count(consumer_node) == 0, - "At least one node affected by CUT must not be SET!"); - if (selected_op_models.count(producer_node) == 0) - { - nodes_to_legalize.insert(producer_node); - } - - if (selected_op_models.count(consumer_node) == 0) - { - nodes_to_legalize.insert(consumer_node); - } - - cut_edges.insert(std::make_pair(edge, true /* self cutting edge */)); - - // Insert virtual queue on cut edge. - // - insert_virtual_queue(edge, producer_node, consumer_node); - } - } - } - break; - - case ProducerUserDataEdgesFirst: - { - log_debug( - LogGraphSolver, - "Constraint failure - trying to resolve by self-cutting all user data edges for node {}", - producer_node->name()); - - for (graphlib::Edge edge : graph->user_data_edges(producer_node)) - { - graphlib::Node* consumer_node = graph->node_by_id(edge.consumer_node_id); - - if (consumer_node->node_type() == graphlib::NodeType::kBudaOp) - { - TT_ASSERT(cut_edges.count(edge) == 0, "Same edge should not be cut twice!"); - TT_ASSERT( - selected_op_models.count(producer_node) == 0 or selected_op_models.count(consumer_node) == 0, - "At least one node affected by CUT must not be SET!"); - if (selected_op_models.count(producer_node) == 0) - { - nodes_to_legalize.insert(producer_node); - } - - if (selected_op_models.count(consumer_node) == 0) - { - nodes_to_legalize.insert(consumer_node); - } - - cut_edges.insert(std::make_pair(edge, true /* self cutting edge */)); - - // Insert virtual queue on cut edge. - // - insert_virtual_queue(edge, producer_node, consumer_node); - } - } - } - break; - - case None: - default: TT_ASSERT(false, "Invalid self cut type!"); break; - } - - // Recalculate OpModels for nodes affected by queue insertion. - // - recompute_legal_op_models_on_cut(nodes_to_legalize); - return use_fast_cut; -} - -void GraphSolver::resolve(bool partial_reset_allowed) -{ - PROFILE_SCOPE(); - graphlib::GraphTraversalContext graph_solver_graph_context(graph, &virtual_nodes, &edges_to_ignore); - int default_resolve_retry_count_self_cutting = 20; - if (env_as("PYBUDA_MAX_GRAPH_CUT_RETRY", 0)) - { - default_resolve_retry_count_self_cutting = env_as("PYBUDA_MAX_GRAPH_CUT_RETRY"); - } - const int max_retry_step = GraphSolverSelfCutType::None != balancer_config.graph_solver_self_cut_type - ? default_resolve_retry_count_self_cutting - : 1; - int retry_step = 1; - bool resolved = false; - - do - { - // Reset GraphSolver to default state. - // - reset(partial_reset_allowed); - - // Try to resolve graph(currently retry is used only by self-cutting mechanism). - // Self-cutting means graph will cut edge for which it cannot produce valid paths and retry resolve again. 
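The resolve loop above resets state, attempts a resolve step, and retries up to a bound that can be overridden through an environment variable, allowing self-cutting on every attempt except the last. A simplified sketch of that retry pattern, with hypothetical helper names in place of `reset()` and `resolve_step()`:

```cpp
// Hedged sketch of the resolve/retry loop: not the real implementations,
// just the control flow with an env-configurable retry bound.
#include <cstdlib>
#include <iostream>
#include <string>

int max_retries_from_env(const char* var, int fallback)
{
    const char* value = std::getenv(var);
    return value ? std::stoi(value) : fallback;
}

// Stand-ins for reset() / resolve_step().
void reset_state() {}
bool try_resolve_step(int attempt, bool /*self_cut_allowed*/) { return attempt >= 3; }

int main()
{
    const int max_retry_step = max_retries_from_env("MAX_GRAPH_CUT_RETRY", 20);
    int retry_step = 1;
    bool resolved = false;

    do
    {
        reset_state();
        // Allow cutting on every attempt except the last one, mirroring
        // resolve_step(retry_step < max_retry_step) in the listing above.
        resolved = try_resolve_step(retry_step, retry_step < max_retry_step);
        ++retry_step;
    } while (!resolved && retry_step <= max_retry_step);

    std::cout << (resolved ? "resolved" : "failed") << "\n";
    return 0;
}
```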
- // - resolved = resolve_step(retry_step < max_retry_step); - retry_step++; - } while (!resolved and retry_step <= max_retry_step); - - if (!resolved and env_as("PYBUDA_COLLECT_CONSTRAINT_INFO")) - { - update_constraint_info(); - } - - TT_ASSERT(resolved, "Graph is either resolved or error is thrown from resolve_step."); - -#ifdef DEBUG - if (cut_edges.size() > 0) - { - log_debug(LogGraphSolver, "Graph is resolved with cut edges: "); - for (auto& it : cut_edges) - { - const Edge& edge = it.first; - std::string producerNodeName = graph->node_by_id(edge.producer_node_id)->name(); - std::string consumerNodeName = graph->node_by_id(edge.consumer_node_id)->name(); - std::string selfCutEdge = it.second ? "Self-cut " : ""; - log_debug(LogGraphSolver, "{}Edge: {} -> {}", selfCutEdge, producerNodeName, consumerNodeName); - } - } -#endif -} - -std::vector GraphSolver::get_epoch_type_switch_cut_edges() -{ - std::vector epoch_to_epoch_cuts; - for (graphlib::Node* consumer : graphlib::topological_sort(*graph)) - { - if (consumer->node_type() != graphlib::NodeType::kBudaOp) - continue; - for (graphlib::Edge edge : graph->operand_data_edges(consumer)) - { - graphlib::Node* producer = graph->node_by_id(edge.producer_node_id); - if (producer->node_type() != graphlib::NodeType::kBudaOp) - continue; - if (producer->get_epoch_type() != consumer->get_epoch_type()) - { - epoch_to_epoch_cuts.push_back(edge); - } - } - } - return epoch_to_epoch_cuts; -} - -GraphSolver::GraphSolver( - graphlib::Graph* graph, - std::unique_ptr&& constraint, - LegalOpModels const& legal_op_models, - BalancerConfig const& balancer_config, - std::shared_ptr balancer_cache_collection, - std::vector const& in_cut_edges, - bool use_op_model_recalculation_on_cut, - bool resolve_on_create) : - graph(graph), - shared_data(std::make_shared(std::move(constraint), legal_op_models)), - balancer_config(balancer_config), - balancer_cache_collection(balancer_cache_collection), - use_op_model_recalculation_on_cut(use_op_model_recalculation_on_cut) -{ - std::size_t num_edges = graph->operands_map().size(); - uint op_model_pairs_per_edge_estimate = 256; - path_sets.reserve(num_edges); - path_set_ids.reserve(num_edges); - shared_data->constraint_result_cache.reserve(num_edges * op_model_pairs_per_edge_estimate); - - single_core_ip_mode = - balancer_config.device_config.grid_size.r * balancer_config.device_config.grid_size.c == 1 and - balancer_config.use_interactive_placer and - (balancer_config.policy_type == PolicyType::NLP || balancer_config.policy_type == PolicyType::Ribbon); - - if (env_as("PYBUDA_COLLECT_CONSTRAINT_INFO")) - { - constraint_info_ptr = std::make_shared(); - } - - if (resolve_on_create) - { - std::vector initial_cuts = get_epoch_type_switch_cut_edges(); - initial_cuts.insert(initial_cuts.end(), in_cut_edges.begin(), in_cut_edges.end()); - - if (initial_cuts.empty()) - resolve(); - else - cut(initial_cuts, true /*epoch_cut*/); - } -} - -GraphSolver::PathSet& GraphSolver::get_path_set(const graphlib::Edge& edge) -{ - TT_ASSERT( - path_set_ids.find(edge) != path_set_ids.end(), - graph->node_by_id(edge.producer_node_id)->name(), - graph->node_by_id(edge.consumer_node_id)->name()); - return path_sets[path_set_ids.at(edge)]; -} - -GraphSolver::PathSet const& GraphSolver::get_path_set(const graphlib::Edge& edge) const -{ - return path_sets[path_set_ids.at(edge)]; -} - -GraphSolver::PathSet* GraphSolver::get_path_set_pt(const graphlib::Edge& edge) -{ - if (path_set_ids.count(edge) > 0) - { - return &path_sets[path_set_ids.at(edge)]; - } - 
else - { - return nullptr; - } -} - -SmallVector GraphSolver::get_operand_path_sets_pts(graphlib::Node const* node) -{ - SmallVector operand_path_sets; - for (auto edge : graph->operand_data_edges(node)) - { - PathSet* el = get_path_set_pt(edge); - if (nullptr != el) - { - operand_path_sets.push_back(el); - } - } - - return operand_path_sets; -} - -SmallVector GraphSolver::get_user_path_sets_pts(graphlib::Node const* node) -{ - SmallVector user_path_sets; - for (auto edge : graph->user_data_edges(node)) - { - PathSet* el = get_path_set_pt(edge); - if (nullptr != el) - { - user_path_sets.push_back(el); - } - } - - return user_path_sets; -} - -void GraphSolver::log_bitset(graphlib::Node const* node, const Bitset& set) const -{ - const std::vector& op_models = get_legal_op_models(node); - log_debug( - LogGraphSolver, " {} {}:{}", node->name(), node->node_type(), !op_models.empty() ? "" : " No op models"); - - TT_ASSERT( - set.count() <= op_models.size(), - "set.count() = {} is NOT <= op_models.size() = {}", - set.count(), - op_models.size()); - for (std::uint64_t i = 0; i < op_models.size() and i < set.size(); i++) - { - if (set[i]) - log_debug( - LogGraphSolver, " [{}] {} {}", i, op_models.at(i).grid_shape, op_models.at(i).t_stream_factor); - } -} - -template -void GraphSolver::handle_cumulative_paths_error( - PathSet const& path_set, - const Bitset& debug_snapshot, - graphlib::Node const* producer, - graphlib::Node const* consumer) -{ - auto user_data_edges = graph->user_data_edges(producer); - if (not kOperand and user_data_edges.size() > 2) - { - throw BalancerError( - fmt::format("Node forks {}", producer->name()), - BalancerError::NodeExceedsMaxOpForks(round_up_div(user_data_edges.size(), std::size_t{2}), producer->id())); - } - - // - // Fail with error message - // - const std::string msg = - fmt::format("Could not reconcile constraints: path[{} -> {}]", producer->name(), consumer->name()); - log_debug(LogGraphSolver, "{}", msg); - - if (kOperand) - { - log_debug(LogGraphSolver, " Offending Producer info:"); - log_bitset(producer, path_set.get_producer_set(bitsets)); - - log_debug(LogGraphSolver, " Consumer info:"); - log_bitset(consumer, debug_snapshot); - } - else - { - log_debug(LogGraphSolver, " Producer info:"); - log_bitset(producer, debug_snapshot); - - log_debug(LogGraphSolver, " Offending Consumer info:"); - log_bitset(consumer, path_set.get_consumer_set(bitsets)); - } - - update_constraint_info(); - reportify::dump_constraints(graph->name(), this); - - throw BalancerError(msg, BalancerError::Fatal(msg)); -} - -template -bool GraphSolver::apply_cumulative_costs( - SmallVector const& path_sets, - graphlib::Node const* node, - CostFns cost_fns) -{ - // cull out user paths who's sum exceeds the max cost - bool path_changed = false; - SmallVector debug_snapshot; - for (int i = 0; i < (int)path_sets.size(); ++i) - { - debug_snapshot.push_back( - kOperand ? path_sets[i]->get_consumer_set(bitsets) : path_sets[i]->get_producer_set(bitsets)); - } - - for (auto [sort_fn, sum_fn] : cost_fns()) - { - for (int i = 0; i < (int)path_sets.size(); ++i) - { - auto path_set_i = path_sets[i]; - - auto max_path = path_set_i->max_cost(sort_fn); - if (not max_path) - { - if (kOperand) - handle_cumulative_paths_error( - *path_set_i, debug_snapshot[i], path_set_i->get_producer_node(), node); - else - handle_cumulative_paths_error( - *path_set_i, debug_snapshot[i], node, path_set_i->get_consumer_node()); - } - - std::uint16_t max_path_index = kOperand ? 
max_path->consumer_id : max_path->producer_id; - EdgeCost total_cost = max_path->cost; - bool valid = true; - - for (int j = 0; j < (int)path_sets.size(); ++j) - { - auto path_set_j = path_sets[j]; - - if (i == j) - continue; - - auto min_path = path_set_j->min_cost(sort_fn, max_path_index); - if (not min_path) - { - valid = false; - break; - } - - total_cost = sum_fn(total_cost, min_path->cost); - } - - if (total_cost.exceeded() or not valid) - { - path_changed |= path_set_i->erase(max_path, bitsets); - --i; // retry - } - } - } - - return path_changed; -} - -void add_operands_and_users( - const graphlib::Graph* graph, - const graphlib::Node* node, - std::vector& needs_update, - const graphlib::Node* ignore_node = nullptr) -{ - for (graphlib::Node* node_o : graph->data_operands(node)) - { - if (node_o == ignore_node) - continue; - - needs_update.push_back(node_o); - } - - for (graphlib::Node* node_u : graph->data_users(node)) - { - if (node_u == ignore_node) - continue; - - needs_update.push_back(node_u); - } -} - -void GraphSolver::handle_no_paths_left_on_update( - bool invoked_by_set, const std::string& root_node_name, const std::string& current_node_name) -{ - // We ended-up in a situation without valid solution due to circular dependency. - // - if (invoked_by_set) - { - // Invoking resolve again will use self-cut to try to resolve this issue. - // - log_debug( - LogGraphSolver, - "Update solver failed for root node {} on node {}. Invoking re-resolve!", - root_node_name, - current_node_name); - return resolve(); - } - else - { - // Already in resolve, we can only error out at this point. - // - const std::string msg = fmt::format( - "Update solver failed to reconcile constraints invoked for root node {} on node {}!", - root_node_name, - current_node_name); - throw BalancerError(msg, BalancerError::Fatal(msg)); - } -} - -void GraphSolver::update_solver(graphlib::Node const* root, bool expand_root, bool invoked_by_set) -{ - TT_ASSERT( - graph->virtual_node_count() == 0 or graph->is_graph_traversal_context_set(), - "We have virtual nodes and no graph traversal context set - this could lead to unexpected results in GS graph " - "resolution."); - std::vector needs_update = {root}; - - if (expand_root) - { - auto operand_path_sets = get_operand_path_sets_pts(root); - auto user_path_sets = get_user_path_sets_pts(root); - - for (auto path_set : operand_path_sets) - { - path_set->update(bitsets); - } - - for (auto path_set : user_path_sets) - { - path_set->update(bitsets); - } - } - - if (expand_root) - { - // When node bitsets are updated(set of valid op models), we need to update paths for all operands and users. 
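The update logic that follows is essentially a worklist fixed-point computation: start from a root node, re-evaluate its edges, and enqueue a neighbour whenever its state actually changed, until nothing changes anymore. The sketch below shows only that generic pattern on a toy graph with an invented "refine" rule; it is an analogy, not the balancer's cost model.

```cpp
// Generic worklist-to-fixed-point sketch (illustrative graph and rule only).
#include <iostream>
#include <map>
#include <vector>

using Node = int;

int main()
{
    // Tiny undirected dependency graph: 0 - 1 - 2 - 3
    std::map<Node, std::vector<Node>> neighbours = {
        {0, {1}}, {1, {0, 2}}, {2, {1, 3}}, {3, {2}}};

    // Each node holds a bound; the toy rule says a node may exceed a
    // neighbour's value by at most 1 (a stand-in for "paths must stay valid").
    std::map<Node, int> value = {{0, 0}, {1, 10}, {2, 10}, {3, 10}};

    std::vector<Node> worklist = {0};
    while (!worklist.empty())
    {
        Node n = worklist.back();
        worklist.pop_back();

        for (Node m : neighbours[n])
        {
            int bound = value[n] + 1;
            if (value[m] > bound)
            {
                value[m] = bound;       // state changed...
                worklist.push_back(m);  // ...so its neighbours must be revisited
            }
        }
    }

    for (auto [n, v] : value) std::cout << "node " << n << " -> " << v << "\n";
    return 0;
}
```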
- // - add_operands_and_users(graph, root, needs_update); - } - - while (not needs_update.empty()) - { - auto node = needs_update.back(); - - auto operand_path_sets = get_operand_path_sets_pts(node); - auto user_path_sets = get_user_path_sets_pts(node); - - bool path_changed = false; - - // Cumulative cost of operand edges coming into this consumer - path_changed |= apply_cumulative_costs(operand_path_sets, node, EdgeCost::consumer_cost_fns); - // Cumulative cost of user edges going out of this producer - path_changed |= apply_cumulative_costs(user_path_sets, node, EdgeCost::producer_cost_fns); - - bool edge_changed = false; - for (auto path_set : operand_path_sets) - { - bool producers_changed = path_set->update(bitsets); - - if (path_set->empty(bitsets)) - { - return handle_no_paths_left_on_update(invoked_by_set, root->name(), node->name()); - } - - if (path_changed or producers_changed) - { - const Node* producer_node = path_set->get_producer_node(); - needs_update.push_back(producer_node); - if (producers_changed) - { - add_operands_and_users(graph, producer_node, needs_update, node); - } - - edge_changed = true; - } - } - - for (auto path_set : user_path_sets) - { - bool consumers_changed = path_set->update(bitsets); - if (path_set->empty(bitsets)) - { - return handle_no_paths_left_on_update(invoked_by_set, root->name(), node->name()); - } - - if (path_changed or consumers_changed) - { - const Node* consumer_node = path_set->get_consumer_node(); - needs_update.push_back(consumer_node); - if (consumers_changed) - { - add_operands_and_users(graph, consumer_node, needs_update, node); - } - - edge_changed = true; - } - } - - if (not edge_changed) - needs_update.pop_back(); - } -} - -GraphSolver::Bitset* GraphSolver::get_bitset(graphlib::NodeId node_id) { return &bitsets[bitset_ids.at(node_id)]; } - -GraphSolver::Bitset const* GraphSolver::get_bitset(graphlib::NodeId node_id) const -{ - return &bitsets[bitset_ids.at(node_id)]; -} - -GraphSolver::Bitset* GraphSolver::get_or_insert_bitset(graphlib::NodeId node_id, const Bitset& init) -{ - auto match = bitset_ids.find(node_id); - if (match == bitset_ids.end()) - { - BitsetId bitset_id = bitsets.size(); - bitset_ids.insert({node_id, bitset_id}); - auto tmp = bitsets.data(); - const auto disabled_bitset = op_disabled_bitset_cache.find(node_id); - if (disabled_bitset == op_disabled_bitset_cache.end()) - { - bitsets.push_back(init); - } - else - { - bitsets.push_back(init & ~disabled_bitset->second); - } - - TT_ASSERT(tmp == bitsets.data(), "bitsets reallocated, pointers invalid"); - return &bitsets.back(); - } - else - { - return &bitsets[match->second]; - } -} - -void GraphSolver::throw_error_for_edge(graphlib::Edge edge) -{ - graphlib::Node* producer = graph->node_by_id(edge.producer_node_id); - graphlib::Node* consumer = graph->node_by_id(edge.consumer_node_id); - if (producer->node_type() == graphlib::NodeType::kInput and producer->shape().rt() == 1 and - producer->shape().ct() == 1 and producer->shape().z() == 1 and producer->shape().w() == 1 and - graph->get_edge_attributes(edge)->has_broadcast_dims()) - { - // Single tile broadcast case - throw BalancerError( - fmt::format("Input exceeds max grid forks: {}", producer->name()), - BalancerError::InputBroadcastExceedsMaxGridForks(producer->id())); - } - else - { - update_constraint_info(); - reportify::dump_constraints(graph->name(), this); - throw BalancerError( - fmt::format("Could not satisfy all constraints for edge: {} -> {}", producer->name(), consumer->name())); - } -} - -// Returns 
vector of legal OpModels for passed in node by merging legal OpModels of inserted NOP nodes -// and legal OpModels of non-modified nodes from shared data. -// -const std::vector& GraphSolver::get_legal_op_models(graphlib::Node const* node) const -{ - static std::vector null_op_models; - - // For Queue take its producer OpModels. - // - if (node->node_type() == graphlib::NodeType::kQueue) - { - node = graph->data_operands(node).back(); - } - - const auto recomputed_version_it = op_model_recompute_version.find(node); - - if (recomputed_version_it != op_model_recompute_version.end()) - { - return shared_data->recomputed_legal_op_models.at(node)[recomputed_version_it->second]; - } - - const auto legal_it = shared_data->legal_op_models.find(node); - - if (legal_it != shared_data->legal_op_models.end()) - { - return legal_it->second; - } - - return null_op_models; -} - -GraphSolver::RemainingOpModels GraphSolver::at(graphlib::Node const* node) const -{ - auto op_models = RemainingOpModels(get_legal_op_models(node), *get_bitset(node->id())); - TT_ASSERT(op_models.begin() != op_models.end()); - return op_models; -} - -void GraphSolver::set(graphlib::Node const* node, OpModel const& op_model, bool skip_update) -{ - TT_LOG_ASSERT(selected_op_models.count(node) == 0, "OpModel has already been selected for node {}!", node->name()); - graphlib::GraphTraversalContext graph_solver_graph_context(graph, &virtual_nodes, &edges_to_ignore); - - selected_op_models.emplace(node, op_model); - if (skip_update) // don't worry about setting legal vs. not, just keep track of what we have in here - return; - - auto const& op_models = get_legal_op_models(node); - TT_ASSERT(!op_models.empty()); - std::size_t selection = op_models.size(); - for (std::size_t i = 0; i < op_models.size(); ++i) - { - if (op_models[i] == op_model) - { - selection = i; - break; - } - } - - Bitset* node_bitset = get_bitset(node->id()); - - TT_LOG_ASSERT(selection != op_models.size(), "OpModel not found in legal OpModels for node {}!", node->name()); - TT_LOG_ASSERT((*node_bitset)[selection], "Selection not in legal OpModel set"); - - node_bitset->reset(); - node_bitset->set(selection); - op_disabled_bitset_cache[node->id()] = ~(*node_bitset); - - // If placing on single core grid, don't update the solver as it will overconstraint and waste time modeling op-op - // connections, since we are cutting and re-resolving anyway after each placed op(op-queue-op). - // - if (single_core_ip_mode) - { - return; - } - - update_solver(node, true /*expand_root*/, true /*invoked_by_set*/); -} - -// Given current epoch ops, runs the overlay model to determine the amount of memory used for each of the ops. Where -// needed, it adds extra overlay memory to op (in its OpModel) via overlay_size attribute. 
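Setting an op model on a node amounts to locating the chosen candidate's index, collapsing the node's bitset to that single bit, and remembering the complement so later re-resolves start with every other candidate already disabled. A small sketch of that bitset collapse, using illustrative types in place of OpModel:

```cpp
// Hedged sketch of the "set" operation on a node's op-model bitset.
#include <bitset>
#include <iostream>
#include <unordered_map>
#include <vector>

constexpr std::size_t kNumBits = 8;
using Bitset = std::bitset<kNumBits>;

int main()
{
    std::vector<int> legal_options = {10, 20, 30, 40};  // stand-in for legal OpModels
    Bitset node_bitset("00001111");                     // all four currently valid
    std::unordered_map<int /*node id*/, Bitset> disabled_cache;

    int chosen = 30;
    std::size_t selection = legal_options.size();
    for (std::size_t i = 0; i < legal_options.size(); ++i)
        if (legal_options[i] == chosen) { selection = i; break; }

    if (selection != legal_options.size() && node_bitset[selection])
    {
        node_bitset.reset();
        node_bitset.set(selection);          // exactly one option survives
        disabled_cache[42] = ~node_bitset;   // node id 42: everything else disabled
    }

    std::cout << "bitset after set(): " << node_bitset << "\n";
    return 0;
}
```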
-// -OpModels* GraphSolver::get_selected_op_models_for_buffering( - std::unordered_set const& current_epoch_ops) -{ - // If fallback (simple) mode is on, we don't need to model ovelay and can just return the selected op models - // - if (this->shared_data->constraint->resource_usage_fallback_mode) - { - return &selected_op_models; - } - - // If global overlay blob extra size is set, we don't need to model overlay blob memory footprint as it's accounted - // for by BBE reserved space - // - if (this->shared_data->constraint->device_config.get_overlay_blob_extra_size()) - { - return &selected_op_models; - } - - // TODO: Read these value from device config - // tenstorrent/budabackend#2345 - // - static constexpr int kPhases32kb = 32 * 1024 / 38; // 862 - static constexpr int kPhases64kb = 64 * 1024 / 38; // 1724 - - for (const tt::graphlib::Node* node : current_epoch_ops) - { - TT_ASSERT(node->node_type() == graphlib::NodeType::kBudaOp); - - std::vector data_operand_edges = graph->operand_data_edges(node); - std::vector data_user_edges = graph->user_data_edges(node); - - // Check usage with producers - int total_producer_phases = 0; - for (tt::graphlib::Edge e : data_operand_edges) - { - Node* producer = graph->node_by_id(e.producer_node_id); - if (producer->node_type() != graphlib::NodeType::kBudaOp or not current_epoch_ops.count(producer)) - { - continue; - } - - ResourceUsage ru = get_edge_resource_usage( - graph, - balancer_cache_collection->pipe_to_resource_usage_cache, - e, - selected_op_models.at(producer), - selected_op_models.at(node)); - // From the perspective of the edge, we're interested in the consumer phases, but from the perspective of - // the current node, those are producer-side phases - total_producer_phases += ru.consumer_phases; - } - - // Check usage with consumers - int total_consumer_phases = 0; - for (Edge e : data_user_edges) - { - Node* consumer = graph->node_by_id(e.consumer_node_id); - if (consumer->node_type() != graphlib::NodeType::kBudaOp or not current_epoch_ops.count(consumer)) - { - continue; - } - - ResourceUsage ru = get_edge_resource_usage( - graph, - balancer_cache_collection->pipe_to_resource_usage_cache, - e, - selected_op_models.at(node), - selected_op_models.at(consumer)); - // From the perspective of the edge, we're interested in the producer phases, but from the perspective of - // the current node, those are consumer-side phases - total_consumer_phases += ru.producer_phases; - } - - // We confirm that the total phases for both producers and consumers are within the limits. If this asserts, it - // probably means we didn't do a good job in graph solver when calculating EdgeCosts - // - TT_ASSERT(total_producer_phases <= kPhases64kb, "Node {} exceeds 64kb phases with producers", node->name()); - TT_ASSERT(total_consumer_phases <= kPhases64kb, "Node {} exceeds 64kb phases with consumers", node->name()); - - if (total_producer_phases > kPhases32kb || total_consumer_phases > kPhases32kb) - { - // In this case, we need to add extra overlay memory to the op model - // - selected_op_models.at(node).overlay_size = 128 * 1024; - } - } - - return &selected_op_models; -} - -// Returns GraphSolverSolution. -// FINISH also performs graph modification in case buffer was used(there are NOPs inserted by this instance of GS) -// they will be no longer virtual and edges which were marked for removal will be removed. -// Therefore it is important that Balancer pass is completed by invoking finish on chosen GS instance. 
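The overlay-blob sizing below works off a per-op phase budget: with roughly 38 bytes per phase, a 32 KB blob holds 862 phases and a 64 KB blob holds 1724; an op whose producer- or consumer-side phase count exceeds the 32 KB budget gets its overlay blob bumped to 128 KB. A quick numeric sketch of that check (the 38-byte phase size and the 128 KB bump come from the listing; the sample phase counts are made up):

```cpp
// Numeric sketch of the phase-budget check used for overlay blob sizing.
#include <iostream>

int main()
{
    constexpr int kPhaseBytes = 38;
    constexpr int kPhases32kb = 32 * 1024 / kPhaseBytes;  // 862
    constexpr int kPhases64kb = 64 * 1024 / kPhaseBytes;  // 1724

    int total_producer_phases = 950;  // hypothetical usage measured for one op
    int total_consumer_phases = 400;

    bool within_hard_limit =
        total_producer_phases <= kPhases64kb && total_consumer_phases <= kPhases64kb;
    bool needs_bigger_overlay =
        total_producer_phases > kPhases32kb || total_consumer_phases > kPhases32kb;

    int overlay_size = needs_bigger_overlay ? 128 * 1024 : 0;  // 0 = keep the default blob

    std::cout << "32kb budget = " << kPhases32kb << " phases, 64kb budget = "
              << kPhases64kb << " phases\n";
    std::cout << "within hard limit: " << within_hard_limit
              << ", overlay_size: " << overlay_size << " bytes\n";
    return 0;
}
```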
-// -GraphSolverSolution const GraphSolver::finish() -{ - // Assert in condition finish was already called(potentially from another instance of GS) as this could lead to - // data inconsistency in graph structure due to virtual nodes being persisted from two different sources. - // - TT_ASSERT(!shared_data->graph_solving_finished, "Finish already called on another instance of GraphSolver!"); - - // Validate that all nodes have been assigned an OpModel - // with a consistent state in op_disabled_bitset_cache. - // - for (const auto& [node, op_model] : selected_op_models) - { - const auto it = op_disabled_bitset_cache.find(node->id()); - - // Every selected node must be represented in op_disabled_bitset_cache. - // - TT_ASSERT(it != op_disabled_bitset_cache.end()); - - // Every selected node must have exactly one bit unset in op_disabled_bitset_cache. - // - TT_ASSERT(it->second.count() == it->second.size() - 1); - } - - // Prevent cleanup of nodes added by this GS. - // - for (std::shared_ptr node_container : virtual_nodes_management) - { - node_container->remove_from_graph = false; - } - - // Persist changes made via this instance of GraphSolver. - // Remove virtual queues. - // - for (const Node* node : virtual_nodes) - { - if (node->node_type() != graphlib::NodeType::kQueue) - { - graph->mark_node_persisted(node); - } - else - { - graph->remove_node(node); - } - } - - // Remove edges marked for removal. - // - for (Edge edge : edges_pending_removal) - { - graph->remove_edge(edge); - } - - update_constraint_info(); - reportify::dump_constraints(graph->name(), this); - shared_data->graph_solving_finished = true; - - return GraphSolverSolution(selected_op_models, cut_edges); -} - -// Will return legal OpModels from shared data, but only for persisted common nodes without buffered nodes/NOPs. -// -LegalOpModels const& GraphSolver::legal_op_models_no_buffering() const { return shared_data->legal_op_models; } - -// Will cut passed in edges in a graph and call resolve to recompute GraphSolver. -// -void GraphSolver::cut(std::vector const& edges, bool epoch_cut) -{ - TT_ASSERT(edges.size() > 0, "At least one edge needs to be passed in for cutting!"); - std::unordered_set nodes_to_legalize; - bool partial_reset_allowed = env_as("PYBUDA_GRAPHSOLVER_FAST") and bitsets.size() > 0; - - for (Edge edge : edges) - { - Node* src = graph->node_by_id(edge.producer_node_id); - Node* dest = graph->node_by_id(edge.consumer_node_id); - -#ifdef DEBUG - // Cutting between non-op nodes will make GraphSolver sad. - // - TT_ASSERT(dest->node_type() == graphlib::NodeType::kBudaOp, "Only cutting between BudaOps is supported!"); - TT_ASSERT(src->node_type() == graphlib::NodeType::kBudaOp, "Only cutting between BudaOps is supported!"); -#endif - TT_ASSERT(cut_edges.count(edge) == 0, "Same edge should not be cut twice!"); - TT_LOG_ASSERT( - selected_op_models.count(src) == 0 or selected_op_models.count(dest) == 0, - "At least one node affected by CUT must not be SET! {} -> {}", - src->name(), - dest->name()); - cut_edges.insert(std::make_pair(edge, false /* self cutting edge */)); - if (selected_op_models.count(src) == 0) - { - nodes_to_legalize.insert(src); - - if (partial_reset_allowed) - { - *get_bitset(src->id()) = kBitsetAll; - } - } - - if (selected_op_models.count(dest) == 0) - { - nodes_to_legalize.insert(dest); - if (partial_reset_allowed) - { - *get_bitset(dest->id()) = kBitsetAll; - } - } - - // Insert virtual queue on cut edge. 
- // - insert_virtual_queue(edge, src, dest, epoch_cut); - } - - // Recalculate OpModels for nodes affected by queue insertion - // and resolve whole graph again. - // - recompute_legal_op_models_on_cut(nodes_to_legalize); - resolve(partial_reset_allowed); -} - -void GraphSolver::insert_virtual_queue(graphlib::Edge& edge, const Node* src, const Node* dest, bool is_e2e_queue) -{ - TT_ASSERT(edge.edge_type == graphlib::EdgeType::kData, "Cut only data edges!"); - graphlib::Node* queue_node = nullptr; - uintptr_t gs_unique = (uintptr_t) static_cast(this); - std::string queue_name = "virtual_queue_" + src->name() + "_" + dest->name() + "_" + - std::to_string(edge.consumer_input_port_id) + "_" + std::to_string(gs_unique); - - // Insert virtual dummy queue on this edge. - // - if (!is_e2e_queue) - { - queue_node = graph->add_node( - graphlib::create_node(queue_name, 1 /* num_entries */), - graph->get_subgraph_id_for_node(src->id())); - } - else - { - // cross_epoch_type and cross_chip_type will be properly recalculated at a later phase - // since these virtual dummy queues will be replaced with proper ones in post placer pass. - // - queue_node = graph->add_node( - graphlib::create_node( - queue_name, false /*cross_epoch_type*/, false /*cross_chip_type*/), - graph->get_subgraph_id_for_node(src->id())); - } - - queue_node->set_shape(graph->node_by_id(edge.producer_node_id)->shape()); - queue_node->set_output_df(graph->node_by_id(edge.producer_node_id)->output_df()); - queue_node->set_epoch_type(dest->get_epoch_type()); - - Edge node_to_q_edge( - edge.producer_node_id, edge.producer_output_port_id, queue_node->id(), 0, graphlib::EdgeType::kData); - graph->add_edge(node_to_q_edge); - graph->get_edge_attributes(node_to_q_edge)->set_ublock_order(graph->get_edge_attributes(edge)->get_ublock_order()); - - graphlib::Edge q_to_node_edge = - Edge(queue_node->id(), 0, edge.consumer_node_id, edge.consumer_input_port_id, graphlib::EdgeType::kData); - graph->add_edge(q_to_node_edge); - graph->copy_edge_attributes(edge, q_to_node_edge); - - // Going forward this edge will be ignored within GraphTraversalContext. - // - edges_to_ignore.insert(edge); - - // Register inserted queue as virtual. - // - register_virtual_node(queue_node); -} - -void GraphSolver::recompute_legal_op_models_on_cut(std::unordered_set& nodes_to_legalize) -{ - if (use_op_model_recalculation_on_cut) - { - TT_ASSERT(nodes_to_legalize.size() > 0, "At least one node must be specified for legal op models recompute!"); - graphlib::GraphTraversalContext graph_solver_graph_context(graph, &virtual_nodes, &edges_to_ignore); - - recompute_legal_op_models(nodes_to_legalize); - } -} - -void GraphSolver::recompute_legal_op_models(std::unordered_set& nodes_to_legalize) -{ - // Call Legalizer and calculate OpModels for inserted NOP nodes. At the same time recalculate OpModels for - // affected non virtual nodes, ie nodes which are in direct connection with inserted NOP nodes/virtual queues. - // - LegalOpModels recomputed_legal_op_models = - legalizer::get_legal_op_models(graph, balancer_config, balancer_cache_collection, &nodes_to_legalize); - - // Insert or update legal OpModels in recomputed_legal_op_models. Assert if SET was already invoked for them. - // - for (auto& it : recomputed_legal_op_models) - { - // Reset disabled op model cache as op models are being recomputed. 
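Inserting a virtual queue re-routes data through a new node while the original producer-to-consumer edge is merely hidden from traversal rather than deleted, so other GraphSolver instances still see the unmodified graph. The sketch below shows only that re-wiring idea with a toy edge list; it does not use graphlib types.

```cpp
// Illustrative sketch (not graphlib): split an edge by routing through an
// inserted queue node and hiding, rather than deleting, the original edge.
#include <iostream>
#include <set>
#include <string>
#include <tuple>
#include <vector>

struct Edge
{
    std::string producer, consumer;
    bool operator<(const Edge& o) const
    {
        return std::tie(producer, consumer) < std::tie(o.producer, o.consumer);
    }
};

int main()
{
    std::vector<Edge> edges = {{"matmul_0", "add_1"}};
    std::set<Edge> edges_to_ignore;  // the traversal context skips these

    Edge cut = edges.front();
    std::string queue_name = "virtual_queue_" + cut.producer + "_" + cut.consumer;

    edges.push_back({cut.producer, queue_name});
    edges.push_back({queue_name, cut.consumer});
    edges_to_ignore.insert(cut);  // keep the edge so the shared graph is untouched

    for (const Edge& e : edges)
        if (!edges_to_ignore.count(e))
            std::cout << e.producer << " -> " << e.consumer << "\n";
    return 0;
}
```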
- // - op_disabled_bitset_cache.erase(it.first->id()); - - auto recomputed_it = this->shared_data->recomputed_legal_op_models.find(it.first); - if (recomputed_it == this->shared_data->recomputed_legal_op_models.end()) - { - std::vector> versioned_recomputed_legal_op_models; - versioned_recomputed_legal_op_models.push_back(std::move(it.second)); - this->shared_data->recomputed_legal_op_models.emplace( - it.first, std::move(versioned_recomputed_legal_op_models)); - op_model_recompute_version.emplace(it.first, 0); - } - else - { - recomputed_it->second.push_back(std::move(it.second)); - op_model_recompute_version[it.first] = recomputed_it->second.size() - 1; - } - } -} - -// BUFFER is used to directly insert NOPs in GraphSolving phase on vector of provided graph edges according to -// attributes specified in BufferInfo. NOPs are inserted only for this instance of GS and are not visible globally until -// FINISH is invoked for GS instance which is accepted as a solution. This is achieved via introduction of concept of -// virtual nodes and GraphTraversalContext allowing separate instances of GS to work independently with their own -// version of the graph. -// -// RETURNS vector of nodes which were inserted plus modified nodes in case SET needs to be called again for them. -// Note that for every edge, producer-consumer node pair, selected op models will be reset for both producer and -// consumer as OpModels for them will be recalculated. If SET was already invoked for those nodes, selected op models -// will be reset, and they will be returned as part of the results along with NOP nodes so that SET can be invoked again -// for them. -// -std::vector GraphSolver::buffer(std::vector& buffer_edges) -{ - graphlib::GraphTraversalContext graph_solver_graph_context(graph, &virtual_nodes, &edges_to_ignore); - bool partial_reset_allowed = env_as("PYBUDA_GRAPHSOLVER_FAST") and bitsets.size() > 0; - std::vector inserted_nodes; - std::unordered_set nodes_to_legalize; - auto op_name = [](Node* src, Node* dest, std::uint32_t buffer_index) - { - return "buffer_" + std::to_string(buffer_index) + "_" + std::to_string(src->id()) + "_" + - std::to_string(dest->id()); - }; - for (BufferInfo buff_info : buffer_edges) - { - TT_ASSERT(buff_info.nop_count > 0, "NOP insertion count must be higher than 0!"); - - Node* dest = graph->node_by_id(buff_info.edge.consumer_node_id); - Node* src = graph->node_by_id(buff_info.edge.producer_node_id); - TT_ASSERT( - graph->is_node_visible(dest), - "Node invisible for this instance of GS, probably owned by different instance of GS!"); - TT_ASSERT( - graph->is_node_visible(src), - "Node invisible for this instance of GS, probably owned by different instance of GS!"); - Node* original_dest = dest; - - std::size_t buffer_index = 0; - - // Besides legalizing newly inserted NOP nodes, re-legalize nodes which will be connected to them - // as having NOP in between can potentially produce more valid/different OpModels. - // Do this only for nodes which are not SET already, this is up to Balancer policy to decide/control. 
- // - if (selected_op_models.count(src) == 0) - { - nodes_to_legalize.insert(src); - if (partial_reset_allowed) - { - *get_bitset(src->id()) = kBitsetAll; - } - } - - if (selected_op_models.count(dest) == 0) - { - nodes_to_legalize.insert(dest); - if (partial_reset_allowed) - { - *get_bitset(dest->id()) = kBitsetAll; - } - } - - for (int nop_count = 0; nop_count < buff_info.nop_count; nop_count++) - { - graphlib::BudaOpNode* buffer_nop = nullptr; - while (graph->has_node_with_name(op_name(src, original_dest, buffer_index))) buffer_index++; - - for (graphlib::Edge e : graph->get_edges(src, dest)) - { - if (e.edge_type != graphlib::EdgeType::kData) - continue; - - if (buffer_nop == nullptr) - { - buffer_nop = graph->add_node( - graphlib::create_node(op_name(src, original_dest, buffer_index), "nop"), - graph->get_subgraph_id_for_node(src->id())); - buffer_nop->set_shape(src->shape()); - buffer_nop->set_buffering_op(true); - buffer_nop->set_epoch_type(original_dest->get_epoch_type()); - buffer_nop->set_output_df(src->output_df()); - auto src_buda_op = dynamic_cast(src); - if (src_buda_op != nullptr) - { - buffer_nop->set_intermediate_df(src_buda_op->intermediate_df()); - buffer_nop->set_accumulate_df(src_buda_op->accumulate_df()); - buffer_nop->set_math_fidelity(src_buda_op->math_fidelity()); - } - - register_virtual_node(buffer_nop); - nodes_to_legalize.insert(buffer_nop); - inserted_nodes.push_back(buffer_nop); - } - - auto [edge0, edge1] = graphlib::insert_node_on_edge( - graph, e, buffer_nop, false /*inherit_consumer_attrs*/, false /*remove_edge*/); - - // Edge cannot be removed right away from the graph as we will affect global state - // for all GS instances and end up modifing common graph by GS instance that may end up discarded. - // Thats why we collect list of edges which are pending for removal and removing them in FINISH method - // when balancing is complete. - // - edges_to_ignore.insert(e); - edges_pending_removal.push_back(e); - - log_trace( - LogGraphCompiler, - "Inserted buffer nop node {} between {} and {}", - buffer_nop->name(), - src->name(), - dest->name()); - - // Move TMs to edge1. - // - auto& tms = graph->get_edge_attributes(edge0)->get_tms(); - - // TODO Should we do this by default, should hoist_tms remain? - // - if (not buff_info.hoist_tms) - { - // Not hoisting tms, move them to edge1. - // - graph->get_edge_attributes(edge1)->set_tms(tms); - graph->get_edge_attributes(edge0)->set_tms(std::vector{}); - } - - dest = buffer_nop; - } - } - } - - recompute_legal_op_models(nodes_to_legalize); - // We need to resolve again with new nodes, edges and OpModels. - // - resolve(partial_reset_allowed); - - return inserted_nodes; -} - -// REGISTER_VIRTUAL_NODE marks inserted node as virtual, and tracks it in virtual_nodes and -// virtual_nodes_management for auto removal from graph in case this GS gets discarded. -// -void GraphSolver::register_virtual_node(graphlib::Node* virtual_node) -{ - graph->mark_node_virtual(virtual_node); - virtual_nodes.insert(virtual_node); - virtual_nodes_management.emplace_back(std::make_shared(virtual_node, graph)); -} - -// Set GraphTraversalContext of this GS instance externally wherever it is needed for graph operations -// set in context of this GS instance. -// -std::unique_ptr GraphSolver::get_graph_traversal_context() -{ - return std::make_unique(graph, &virtual_nodes, &edges_to_ignore); -} - -// Similar to above but for epoch traversal context based on passed in nodes. 
-// -std::unique_ptr GraphSolver::get_graph_epoch_traversal_context( - const std::unordered_set* epoch_nodes) -{ - return std::make_unique(graph, epoch_nodes, &virtual_nodes, &edges_to_ignore); -} - -// Suboptimal op models invalidation according to provided invalidation -// strategy(GraphSolverOpModelInvalidationStrategy). -// -void GraphSolver::invalidate_suboptimal_op_models(int invalidation_strategy) -{ - if (!env_as("PYBUDA_BALANCER_PREPASS_DISABLED")) - { - this->suboptimal_opmodel_invalidation_strategy = invalidation_strategy; - invalidate_suboptimal_op_models(graphlib::topological_sort(*graph)); - } -} - -void GraphSolver::invalidate_suboptimal_op_models(const std::vector& nodes) -{ - for (GraphSolverOpModelInvalidationStrategyTier tier : {FirstTier, SecondTier}) - { - for (const graphlib::Node* node : nodes) - { - if (node->node_type() == graphlib::NodeType::kBudaOp) - { - const graphlib::BudaOpNode* op_node = static_cast(node); - invalidate_suboptimal_op_models_for_op(op_node, tier); - } - } - } -} - -void GraphSolver::invalidate_streaming_into_output(const std::vector& nodes) -{ - for (graphlib::Node* node : nodes) - { - // Try to eliminate streaming into output if possible. - // - if (node->node_type() == graphlib::NodeType::kOutput) - { - for (graphlib::Node* operand_node : graph->data_operands(node)) - { - if (operand_node->node_type() == graphlib::NodeType::kBudaOp) - { - bool no_stream_output_valid = false; - const graphlib::BudaOpNode* op_node = static_cast(operand_node); - - // Op model already selected for this node, skip. - // - if (selected_op_models.count(op_node) > 0) - { - continue; - } - - const std::vector& op_models = get_legal_op_models(op_node); - if (op_models.size() == 1) - { - continue; - } - - Bitset* node_bitset = get_bitset(op_node->id()); - std::uint32_t op_model_count = std::min(kNumBitsetBits, std::max(1lu, op_models.size())); - - for (size_t index = 0; index < op_model_count; index++) - { - if (!node_bitset->test(index)) - { - continue; - } - - if (op_models[index].t_stream_factor.none()) - { - no_stream_output_valid = true; - break; - } - } - - // At least one valid non stream option present. Eliminate streaming ones. - // - if (no_stream_output_valid) - { - Bitset discarded_op_models_bitset; - bool stream_option_eliminated = false; - - for (std::size_t index = 0; index < op_model_count; index++) - { - if (node_bitset->test(index) and !op_models[index].t_stream_factor.none()) - { - discarded_op_models_bitset.set(index); - stream_option_eliminated = true; - } - } - - // We eliminated at least one op_model. Update bitset and solver. - // Also update op_disabled_bitset_cache for this op node, to speed up future resolves. - // - if (stream_option_eliminated) - { - *node_bitset &= ~discarded_op_models_bitset; - auto it = op_disabled_bitset_cache.find(op_node->id()); - - if (it == op_disabled_bitset_cache.end()) - { - op_disabled_bitset_cache.emplace(op_node->id(), discarded_op_models_bitset); - } - else - { - it->second |= discarded_op_models_bitset; - } - - update_solver(operand_node); - } - } - } - } - } - } -} - -void GraphSolver::invalidate_suboptimal_op_models_for_op( - const graphlib::BudaOpNode* op_node, GraphSolverOpModelInvalidationStrategyTier tier) -{ - // Op model already selected for this node, skip. 
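The streaming invalidation above checks whether an op feeding a graph output still has at least one non-streamed op model available; if so, every streamed candidate is masked out of the node's bitset. A compact sketch of that pruning rule with illustrative data:

```cpp
// Hedged sketch of "prefer non-streamed op models into outputs": if a
// non-streamed candidate survives, mask out every streamed one.
#include <bitset>
#include <iostream>
#include <vector>

constexpr std::size_t kNumBits = 8;
using Bitset = std::bitset<kNumBits>;

int main()
{
    // true = this op model streams its output (t_stream_factor != none)
    std::vector<bool> streams = {true, false, true, false};
    Bitset valid("00001111");

    bool non_streamed_available = false;
    for (std::size_t i = 0; i < streams.size(); ++i)
        if (valid[i] && !streams[i]) { non_streamed_available = true; break; }

    if (non_streamed_available)
    {
        Bitset discarded;
        for (std::size_t i = 0; i < streams.size(); ++i)
            if (valid[i] && streams[i]) discarded.set(i);
        valid &= ~discarded;  // keep only the non-streamed candidates
    }

    std::cout << "valid after pruning: " << valid << "\n";
    return 0;
}
```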
- // - if (selected_op_models.count(op_node) > 0) - { - return; - } - - const std::vector& op_models = get_legal_op_models(op_node); - - if (op_models.size() == 1) - { - return; - } - - Bitset* node_bitset = get_bitset(op_node->id()); - std::uint32_t op_model_count = std::min(kNumBitsetBits, std::max(1lu, op_models.size())); - - switch (tier) - { - case FirstTier: - { - if (op_node->is_matmul_not_sparse()) - { - if (suboptimal_opmodel_invalidation_strategy & ((int)DenseMatmulPrologue | (int)DenseMatmulBetterUkt)) - { - uint32_t disabled_op_models = 0; - uint32_t discarded_op_models = 0; - Bitset discarded_op_models_bitset; - - int max_ukt = 0; - bool has_one_valid_prologue = false; - - // First screen what is available for pruning. - // - for (size_t i = 0; i < op_model_count; i++) - { - if (!node_bitset->test(i)) - { - disabled_op_models++; - continue; - } - - bool has_prologue = op_models[i].parameter_buffers[1]; - has_one_valid_prologue |= has_prologue; - - int u_kt = op_models[i].input_buffers[0].block_shape.ublock.ct; - int m_k = op_models[i].op_shape.inputs[0].ct / u_kt; - - if ((m_k * u_kt >= 8) and u_kt > max_ukt) - { - max_ukt = u_kt; - } - } - - if (has_one_valid_prologue and suboptimal_opmodel_invalidation_strategy & (int)DenseMatmulPrologue) - { - max_ukt = 0; - for (size_t i = 0; i < op_model_count; i++) - { - if (!node_bitset->test(i)) - { - continue; - } - - bool has_prologue = op_models[i].parameter_buffers[1]; - if (!has_prologue) - { - discarded_op_models_bitset.set(i); - discarded_op_models++; - continue; - } - - int u_kt = op_models[i].input_buffers[0].block_shape.ublock.ct; - int m_k = op_models[i].op_shape.inputs[0].ct / u_kt; - - if ((m_k * u_kt >= 8) and u_kt > max_ukt) - { - max_ukt = u_kt; - } - } - } - - int ukt_limit = std::min(max_ukt, 4); - - if (ukt_limit > 1 and suboptimal_opmodel_invalidation_strategy & (int)DenseMatmulBetterUkt) - { - for (size_t i = 0; i < op_model_count; i++) - { - if (!node_bitset->test(i) or discarded_op_models_bitset.test(i)) - { - continue; - } - - int u_kt = op_models[i].input_buffers[0].block_shape.ublock.ct; - int m_k = op_models[i].op_shape.inputs[0].ct / u_kt; - if ((m_k * u_kt >= 8) && u_kt < ukt_limit) - { - discarded_op_models_bitset.set(i); - discarded_op_models++; - continue; - } - } - } - - if (discarded_op_models > 0 and discarded_op_models + disabled_op_models < op_model_count) - { - *node_bitset &= ~discarded_op_models_bitset; - auto it = op_disabled_bitset_cache.find(op_node->id()); - - if (it == op_disabled_bitset_cache.end()) - { - op_disabled_bitset_cache.emplace(op_node->id(), discarded_op_models_bitset); - } - else - { - it->second |= discarded_op_models_bitset; - } - - update_solver(op_node); - } - } - } - } - break; - - case SecondTier: - { - if (op_node->is_sparse_matmul()) - { - if (suboptimal_opmodel_invalidation_strategy & (int)MatmulSparseDenseGridPairing) - { - uint32_t disabled_op_models = 0; - uint32_t discarded_op_models = 0; - Bitset discarded_op_models_bitset; - - for (size_t i = 0; i < op_model_count; i++) - { - if (!node_bitset->test(i)) - { - disabled_op_models++; - continue; - } - - if (op_models[i].grid_shape.c != 1) - { - discarded_op_models_bitset.set(i); - discarded_op_models++; - continue; - } - } - - if (discarded_op_models > 0 and discarded_op_models + disabled_op_models < op_model_count) - { - *node_bitset &= ~discarded_op_models_bitset; - auto it = op_disabled_bitset_cache.find(op_node->id()); - - if (it == op_disabled_bitset_cache.end()) - { - 
op_disabled_bitset_cache.emplace(op_node->id(), discarded_op_models_bitset); - } - else - { - it->second |= discarded_op_models_bitset; - } - - update_solver(op_node); - } - - PathSet* sparse_to_dense_pathset = get_user_path_sets_pts(op_node)[0]; - const graphlib::BudaOpNode* consumer = - dynamic_cast(sparse_to_dense_pathset->get_consumer_node()); - if (!consumer or !consumer->should_pair_with_sparse(op_node, graph)) - { - return; - } - - bool can_prune_paths = false; - const std::vector& dense_op_models = get_legal_op_models(consumer); - for (const auto& path : sparse_to_dense_pathset->get_paths()) - { - if (op_models[path.producer_id].grid_shape.r == dense_op_models[path.consumer_id].grid_shape.r) - { - can_prune_paths = true; - break; - } - } - - if (can_prune_paths) - { - PathSet::Paths* paths = sparse_to_dense_pathset->get_paths_pt(); - for (size_t i = 0; i < paths->size(); i++) - { - if (op_models[(*paths)[i].producer_id].grid_shape.r != - dense_op_models[(*paths)[i].consumer_id].grid_shape.r) - { - (*paths)[i] = paths->back(); - paths->pop_back(); - i--; - } - } - - update_solver(consumer); - } - } - } - } - break; - - default: TT_ASSERT("Invalid/undefined tier!"); - } -} - -#ifdef DEBUG -// Computes and logs if there are valid connections for this edge among paths -// that were discarded by previously computed edges(edge eliminated by disabling some OpModels). -// -void GraphSolver::compute_edge_elimination_debug_info( - Edge& edge, - Bitset* producer_bitset, - Bitset* consumer_bitset, - Bitset& edge_producer_bitset, - Bitset& edge_consumer_bitset, - std::vector& producer_op_models_debug, - std::vector& consumer_op_models_debug, - std::uint64_t producer_count, - std::uint64_t consumer_count, - EdgeConstraintDebugInfo& edge_constraint_debug_info, - EdgeConstraintDebugInfo& graph_constraint_debug_info) -{ - Constraint* constraint = shared_data->constraint.get(); - Bitset eliminatedProducers = *producer_bitset ^ edge_producer_bitset; - Bitset eliminatedConsumers = *consumer_bitset ^ edge_consumer_bitset; - - // Propagate edge elimination for path eliminated but valid prod-consumer combinations. - // - for (std::uint64_t producer_id = 0; producer_id < producer_count; ++producer_id) - { - for (std::uint64_t consumer_id = 0; consumer_id < consumer_count; ++consumer_id) - { - bool producerNodeOpModelDisabled = (*producer_bitset)[producer_id] == 0; - bool consumerNodeOpModelDisabled = (*consumer_bitset)[consumer_id] == 0; - bool producerNodeOpModelEliminatedByCurrentEdge = eliminatedProducers[producer_id] != 0; - bool consumerNodeOpModelEliminatedByCurrentEdge = eliminatedConsumers[consumer_id] != 0; - - // We already went through these non disabled paths. 
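The sparse/dense pairing pass above keeps only paths on which the sparse matmul and its paired dense matmul agree on the number of grid rows, removing the rest with the same swap-with-back-and-pop idiom PathSet uses. A small illustrative sketch, with made-up grid shapes:

```cpp
// Illustrative sketch of pruning paths whose endpoints disagree on grid rows.
#include <cstdint>
#include <iostream>
#include <vector>

struct Path { std::uint16_t producer_id, consumer_id; };

int main()
{
    std::vector<int> producer_grid_r = {1, 2, 4};  // grid rows per producer op model
    std::vector<int> consumer_grid_r = {2, 4};     // grid rows per consumer op model
    std::vector<Path> paths = {{0, 0}, {1, 0}, {1, 1}, {2, 1}};

    for (int i = 0; i < (int)paths.size(); ++i)
    {
        if (producer_grid_r[paths[i].producer_id] != consumer_grid_r[paths[i].consumer_id])
        {
            paths[i] = paths.back();  // O(1) removal, order does not matter
            paths.pop_back();
            --i;                      // re-check the element swapped into slot i
        }
    }

    for (const Path& p : paths)
        std::cout << "kept path " << p.producer_id << " -> " << p.consumer_id << "\n";
    return 0;
}
```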
- // - if (!producerNodeOpModelDisabled && !consumerNodeOpModelDisabled) - { - continue; - } - - auto [cost, constraint_failure_reason] = cost_fn( - constraint, graph, edge, producer_op_models_debug, consumer_op_models_debug, producer_id, consumer_id); - - if (NoConstraintFailure == constraint_failure_reason) - { - if (not cost.exceeded()) - { - TT_ASSERT(producer_id <= std::numeric_limits::max()); - TT_ASSERT(consumer_id <= std::numeric_limits::max()); - - if (producerNodeOpModelDisabled && !producer_op_models_debug.empty()) - { - if (!consumerNodeOpModelDisabled) - { - if (consumerNodeOpModelEliminatedByCurrentEdge && !consumer_op_models_debug.empty()) - { - consumer_op_models_debug[consumer_id].eliminating_edge = - producer_op_models_debug[producer_id].eliminating_edge; - } - } - - edge_constraint_debug_info.addEliminatingEdge( - producer_op_models_debug[producer_id].eliminating_edge); - } - - if (consumerNodeOpModelDisabled && !consumer_op_models_debug.empty()) - { - if (!producerNodeOpModelDisabled) - { - if (producerNodeOpModelEliminatedByCurrentEdge && !producer_op_models_debug.empty()) - { - producer_op_models_debug[producer_id].eliminating_edge = - consumer_op_models_debug[consumer_id].eliminating_edge; - } - } - - edge_constraint_debug_info.addEliminatingEdge( - consumer_op_models_debug[consumer_id].eliminating_edge); - } - - edge_constraint_debug_info.recordEdgeConstraintFailure(EdgePathRemovedByPriorEdgeElimination); - graph_constraint_debug_info.recordEdgeConstraintFailure(EdgePathRemovedByPriorEdgeElimination); - } - } - } - } - - // Mark OpModels eliminated by current edge which are not already eliminated by ancestor egdes. - // - for (std::uint64_t producer_id = 0; producer_id < producer_count; ++producer_id) - { - if (eliminatedProducers[producer_id] != 0) - { - if (0 == std::get<0>(producer_op_models_debug[producer_id].eliminating_edge)) - { - producer_op_models_debug[producer_id].eliminating_edge = edge.unique_id(); - } - } - } - - for (std::uint64_t consumer_id = 0; consumer_id < consumer_count; ++consumer_id) - { - if (eliminatedConsumers[consumer_id] != 0) - { - if (0 == std::get<0>(consumer_op_models_debug[consumer_id].eliminating_edge)) - { - consumer_op_models_debug[consumer_id].eliminating_edge = edge.unique_id(); - } - } - } -} -#endif - -void GraphSolver::update_constraint_info() -{ - PROFILE_SCOPE(); - - if (!env_as("PYBUDA_COLLECT_CONSTRAINT_INFO")) - return; - - auto create_edge_name = [](graphlib::Edge edge) - { - return fmt::format( - "{}@{}:{}@{}", - edge.producer_node_id, - edge.producer_output_port_id, - edge.consumer_node_id, - edge.consumer_input_port_id); - }; - - int num_pages = (graph->num_nodes() + ConstraintInfo::kPageSize - 1) / ConstraintInfo::kPageSize; - auto nodes = graphlib::topological_sort(*graph); - bool recomputed = false; - - // Needs to change if the number of nodes changed or if GS owner instance changed. 
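The constraint report below is paginated: nodes in topological order are split into fixed-size pages, and each node records its (page index, offset) pair so it can later be looked up by name. A short sketch of that layout using the page size from the header (the node names are invented):

```cpp
// Sketch of the ConstraintInfo page layout: fixed-size pages plus a
// name -> (page, slot) index.
#include <algorithm>
#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

int main()
{
    constexpr int kPageSize = 20;
    std::vector<std::string> nodes(47);
    for (std::size_t i = 0; i < nodes.size(); ++i) nodes[i] = "op_" + std::to_string(i);

    int num_pages = (static_cast<int>(nodes.size()) + kPageSize - 1) / kPageSize;  // ceil -> 3
    std::unordered_map<std::string, std::pair<int, int>> node_name_to_page;

    for (int page = 0; page < num_pages; ++page)
    {
        int begin = page * kPageSize;
        int end = std::min((page + 1) * kPageSize, static_cast<int>(nodes.size()));
        for (int i = begin; i < end; ++i)
            node_name_to_page[nodes[i]] = {page, i % kPageSize};
    }

    auto [page, offset] = node_name_to_page["op_42"];
    std::cout << num_pages << " pages; op_42 lives at page " << page
              << ", slot " << offset << "\n";
    return 0;
}
```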
- // - if (constraint_info_ptr->node_id_to_name.size() != nodes.size() or - constraint_info_ptr->needs_to_be_recomputed(this)) - { - constraint_info_ptr->graph_name = graph->name(); - constraint_info_ptr->pages.clear(); - constraint_info_ptr->pages.resize(num_pages); - constraint_info_ptr->node_name_to_page.clear(); - recomputed = true; - - int page_idx = 0; - for (ConstraintInfo::Page& page : constraint_info_ptr->pages) - { - page.node_id_order.reserve(ConstraintInfo::kPageSize); - for (int i = page_idx * ConstraintInfo::kPageSize; - i < std::min((page_idx + 1) * ConstraintInfo::kPageSize, (int)nodes.size()); - ++i) - { - auto* node = nodes[i]; - - page.node_id_order.push_back(node->id()); - constraint_info_ptr->node_id_to_name.insert({std::to_string(node->id()), node->name()}); - constraint_info_ptr->node_name_to_page.insert( - {node->name(), std::make_pair(page_idx, i % ConstraintInfo::kPageSize)}); - - auto const& op_models = get_legal_op_models(node); - if (!op_models.empty()) - { - auto& node_op_model_ids = page.node_id_to_op_model_ids[std::to_string(node->id())]; - for (auto const& op_model : op_models) - { - page.id_to_op_models.insert({std::to_string(op_model.id.id), op_model}); - node_op_model_ids.push_back(op_model.id.id); - } - } - } - page_idx += 1; - } - - constraint_info_ptr->gs_owner_cache = this; - } - - // Clear existing info and overwrite with latest resolve - int page_idx = 0; - for (ConstraintInfo::Page& page : constraint_info_ptr->pages) - { - for (int i = page_idx * ConstraintInfo::kPageSize; - i < std::min((page_idx + 1) * ConstraintInfo::kPageSize, (int)nodes.size()); - ++i) - { - auto* node = nodes[i]; - - // Do not update nodes that have already been selected for - if (not recomputed and selected_op_models.find(node) != selected_op_models.end()) - continue; - - auto const& consumer_op_models = get_legal_op_models(node); - for (graphlib::Edge edge : graph->operand_data_edges(node)) - { - auto* producer = graph->node_by_id(edge.producer_node_id); - auto const& producer_op_models = get_legal_op_models(producer); - auto match = path_set_ids.find(edge); - if (match != path_set_ids.end()) - { - std::string edge_name = create_edge_name(edge); - auto& paths = page.edge_to_path_sets[edge_name]; - PathSet const& path_set = get_path_set(edge); - paths.clear(); - paths.reserve(path_set.get_paths().size()); - for (Path const& path : path_set.get_paths()) - { - paths.push_back(std::make_tuple(path.producer_id, path.consumer_id)); - } - } - - std::uint64_t producer_count = std::min(kNumBitsetBits, producer_op_models.size()); - std::uint64_t consumer_count = std::min(kNumBitsetBits, consumer_op_models.size()); - for (std::uint64_t producer_id = 0; producer_id < producer_count; ++producer_id) - { - for (std::uint64_t consumer_id = 0; consumer_id < consumer_count; ++consumer_id) - { - std::string key = fmt::format( - "{}:{}", producer_op_models[producer_id].id.id, consumer_op_models[consumer_id].id.id); - auto match = failure_reasons.find(key); - if (match != failure_reasons.end()) - page.failure_reason_ids[key] = match->second; - } - } - } - } - - page_idx += 1; - } - - constraint_info_ptr->op_model_selection.clear(); - constraint_info_ptr->op_model_selection.reserve(selected_op_models.size()); - for (auto const& [node, op_model] : selected_op_models) - { - constraint_info_ptr->op_model_selection.push_back(op_model.id.id); - } -} -} // namespace tt::balancer::legalizer diff --git a/pybuda/csrc/balancer/legalizer/graph_solver.hpp b/pybuda/csrc/balancer/legalizer/graph_solver.hpp 
deleted file mode 100644 index d4f5fe824..000000000 --- a/pybuda/csrc/balancer/legalizer/graph_solver.hpp +++ /dev/null @@ -1,541 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include -#include -#include -#include - -#include "balancer/balancer_cache_collection.hpp" -#include "balancer/balancer_config.hpp" -#include "balancer/exceptions.hpp" -#include "balancer/legalizer/constraints.hpp" -#include "balancer/legalizer/graph_solver_types.hpp" -#include "balancer/types.hpp" -#include "graph_lib/graph.hpp" -#include "graph_lib/node.hpp" -#include "graph_lib/node_types.hpp" -#include "graph_lib/utils.hpp" -#include "utils/logger.hpp" -#include "utils/profile.hpp" -#include "utils/small_vector.hpp" - -namespace tt::balancer::legalizer -{ - -struct GraphSolverSolution -{ - OpModels selected_op_models; - CutEdges cut_edges; - - GraphSolverSolution() = default; - GraphSolverSolution(const OpModels& selected_op_models, const CutEdges& cut_edges) : - selected_op_models(selected_op_models), cut_edges(cut_edges) - { - } -}; - -inline GraphSolverSelfCutType graph_solver_self_cut_type_from_string(std::string const& s) -{ - if ("None" == s) - return GraphSolverSelfCutType::None; - else if ("ConsumerOperandDataEdgesFirst" == s) - return GraphSolverSelfCutType::ConsumerOperandDataEdgesFirst; - else if ("ProducerUserDataEdgesFirst" == s) - return GraphSolverSelfCutType::ProducerUserDataEdgesFirst; - else if ("FastCut" == s) - return GraphSolverSelfCutType::FastCut; - - log_error(LogGraphSolver, "Failed to parse graph solver self cut type from string: {}", s); - log_error(LogGraphSolver, "Falling back to GraphSolverSelfCutType::None"); - - return GraphSolverSelfCutType::None; -} - -inline std::string graph_solver_self_cut_type_to_string(GraphSolverSelfCutType gssct) -{ - switch (gssct) - { - case GraphSolverSelfCutType::None: return "None"; - case GraphSolverSelfCutType::ConsumerOperandDataEdgesFirst: return "ConsumerOperandDataEdgesFirst"; - case GraphSolverSelfCutType::ProducerUserDataEdgesFirst: return "ProducerUserDataEdgesFirst"; - case GraphSolverSelfCutType::FastCut: return "FastCut"; - default: break; - } - - return "Unknown"; -} - -class GraphSolver -{ - private: - static constexpr std::size_t kNumBitsetBits = 1024; - using Bitset = std::bitset; - static Bitset kBitsetAll; - static constexpr Bitset kBitsetNone = Bitset{}; - - public: - struct ConstraintInfo - { - static constexpr int kPageSize = 20; - - struct Page - { - std::vector node_id_order; - std::unordered_map id_to_op_models; - std::unordered_map> node_id_to_op_model_ids; - std::unordered_map>> edge_to_path_sets; - std::unordered_map failure_reason_ids; - }; - - std::string graph_name; - std::vector pages; - std::vector op_model_selection; - std::unordered_map node_id_to_name; - std::unordered_map> node_name_to_page; - GraphSolver* gs_owner_cache = nullptr; - - bool needs_to_be_recomputed(const GraphSolver* graph_solver) const { return graph_solver != gs_owner_cache; } - }; - - struct RemainingOpModels - { - class Iterator : public std::iterator - { - std::uint64_t i = 0; - std::vector const* p = nullptr; - Bitset mask = 0; - - private: - void next_valid() - { - if (mask == Bitset{}) - { - i = p->size(); - return; - } - - while (mask.any() and not mask[i]) ++i; - mask.reset(i); - } - - public: - Iterator(std::vector const* p, const Bitset& mask, std::uint64_t i = 0) : i(i), p(p), mask(mask) - { - next_valid(); - } - - Iterator& operator++() - { - 
next_valid(); - return *this; - } - - Iterator operator++(int) - { - auto r = *this; - next_valid(); - return r; - } - - bool operator==(Iterator other) const { return (p == other.p) and (i == other.i); } - bool operator!=(Iterator other) const { return not(*this == other); } - reference operator*() const { return (*p)[i]; } - }; - - RemainingOpModels(std::vector const& p, const Bitset& mask) : p(&p), mask(mask) {} - - Iterator begin() const { return Iterator(p, mask); } - Iterator end() const { return Iterator(p, 0, std::min(kNumBitsetBits, p->size())); } - size_t size() const { return mask.count(); } - - std::vector const* p = nullptr; - Bitset mask = 0; - }; - - LegalOpModels const& legal_op_models_no_buffering() const; - GraphSolverSolution const finish(); - void recompute_legal_op_models_on_cut(std::unordered_set& nodes_to_legalize); - void recompute_legal_op_models(std::unordered_set& nodes_to_legalize); - std::vector buffer(std::vector& buffer_edges); - - private: - static Bitset bitset(std::uint64_t bit) - { - Bitset b; - b.set(bit); - return b; - } - // is `a` a subset of `b` - static bool is_subset(const Bitset& a, const Bitset& b) { return a == (a & b); } - - using PathSetId = int; - using BitsetId = int; - - class NodePathsProcessor - { - public: - void add_node(const graphlib::Node* node) - { - if (control_set.count(node) == 0) - { - queue.push_back(node); - control_set.insert(node); - } - } - - void process(GraphSolver* graph_solver) - { - while (!queue.empty()) - { - const graphlib::Node* node = queue.back(); - queue.pop_back(); - control_set.erase(node); - - auto operand_path_sets = graph_solver->get_operand_path_sets_pts(node); - auto user_path_sets = graph_solver->get_user_path_sets_pts(node); - for (auto path_set : operand_path_sets) - { - path_set->update_node_processor(graph_solver->bitsets, this); - } - for (auto path_set : user_path_sets) - { - path_set->update_node_processor(graph_solver->bitsets, this); - } - } - } - - private: - std::vector queue; - std::unordered_set control_set; - }; - - struct Path - { - std::uint16_t producer_id = 0; - std::uint16_t consumer_id = 0; - EdgeCost cost; - - Path() = default; - Path(std::uint16_t producer_id, std::uint16_t consumer_id, EdgeCost cost) : - producer_id(producer_id), consumer_id(consumer_id), cost(cost) - { - } - }; - - class PathSet - { - public: - using Paths = SmallVector; - - PathSet( - BitsetId producer_set_id, - BitsetId consumer_set_id, - graphlib::Node* producer_node, - graphlib::Node* consumer_node, - Paths const& paths) : - producer_set_id(producer_set_id), - consumer_set_id(consumer_set_id), - producer_node(producer_node), - consumer_node(consumer_node), - paths(paths) - { - } - - Bitset get_producer_set(const std::vector& bitsets) const { return bitsets[producer_set_id]; } - Bitset get_consumer_set(const std::vector& bitsets) const { return bitsets[consumer_set_id]; } - - Paths const& get_paths() const { return paths; } - Paths* get_paths_pt() { return &paths; } - - template - typename Paths::ConstIterator max_cost(F f) const - { - typename Paths::ConstIterator result = nullptr; - - for (auto iter = paths.begin(); iter != paths.end(); ++iter) - { - if (!result or f(result->cost, iter->cost)) - { - result = iter; - } - } - - return result; - } - - template - typename Paths::ConstIterator min_cost(F f, const std::uint16_t index) const - { - typename Paths::ConstIterator result = nullptr; - - for (auto iter = paths.begin(); iter != paths.end(); ++iter) - { - if ((is_operand and index == iter->consumer_id) or 
(!is_operand and index == iter->producer_id)) - { - if (!result or f(iter->cost, result->cost)) - { - result = iter; - } - } - } - - return result; - } - - bool erase(typename Paths::ConstIterator pos, std::vector& bitsets) - { - *const_cast(pos) = paths.back(); - paths.pop_back(); - return update(bitsets); - } - - bool empty(const std::vector& bitsets) const - { - return paths.empty() or (bitsets[producer_set_id] == 0) or (bitsets[consumer_set_id] == 0); - } - - bool update(std::vector& bitsets) - { - Bitset valid_producer_set = 0; - Bitset valid_consumer_set = 0; - Bitset producer = bitsets[producer_set_id]; - Bitset consumer = bitsets[consumer_set_id]; - - for (std::size_t i = 0; i < paths.size(); i++) - { - Path const& path = paths[i]; - if (consumer[path.consumer_id] and producer[path.producer_id]) - { - valid_producer_set.set(path.producer_id); - valid_consumer_set.set(path.consumer_id); - } - else - { - paths[i] = paths.back(); - paths.pop_back(); - i--; - } - } - - bool is_producer_sub = is_subset(producer, valid_producer_set); - bool is_consumer_sub = is_subset(consumer, valid_consumer_set); - bool unchanged = is_producer_sub and is_consumer_sub; - - if (!unchanged) - { - bitsets[producer_set_id] &= valid_producer_set; - bitsets[consumer_set_id] &= valid_consumer_set; - } - - return not unchanged; - } - - void update_node_processor(std::vector& bitsets, NodePathsProcessor* node_processor) - { - Bitset valid_producer_set = 0; - Bitset valid_consumer_set = 0; - Bitset producer = bitsets[producer_set_id]; - Bitset consumer = bitsets[consumer_set_id]; - for (std::size_t i = 0; i < paths.size(); i++) - { - Path const& path = paths[i]; - if (consumer[path.consumer_id] and producer[path.producer_id]) - { - valid_producer_set.set(path.producer_id); - valid_consumer_set.set(path.consumer_id); - } - else - { - paths[i] = paths.back(); - paths.pop_back(); - i--; - } - } - - if (!is_subset(producer, valid_producer_set)) - { - node_processor->add_node(consumer_node); - node_processor->add_node(producer_node); - bitsets[producer_set_id] &= valid_producer_set; - } - - if (!is_subset(consumer, valid_consumer_set)) - { - node_processor->add_node(producer_node); - node_processor->add_node(consumer_node); - bitsets[consumer_set_id] &= valid_consumer_set; - } - } - - const graphlib::Node* get_producer_node() const { return producer_node; } - const graphlib::Node* get_consumer_node() const { return consumer_node; } - - private: - private: - BitsetId producer_set_id = -1; - BitsetId consumer_set_id = -1; - graphlib::Node* producer_node = nullptr; - graphlib::Node* consumer_node = nullptr; - Paths paths; - }; - - const std::vector& get_legal_op_models(graphlib::Node const* node) const; - void reset(bool partial_reset_allowed = false); - void invalidate_suboptimal_op_models(const std::vector& nodes); - void invalidate_streaming_into_output(const std::vector& nodes); - void invalidate_suboptimal_op_models_for_op( - const graphlib::BudaOpNode* node, GraphSolverOpModelInvalidationStrategyTier tier); - - struct SharedData - { - public: - std::unique_ptr constraint; - std::unordered_map> - constraint_result_cache; - - private: - LegalOpModels legal_op_models; - bool graph_solving_finished; - std::unordered_map>> - recomputed_legal_op_models; - - public: - SharedData(std::unique_ptr&& constraint, LegalOpModels const& legal_op_models) : - constraint(std::move(constraint)), legal_op_models(legal_op_models), graph_solving_finished(false) - { - } - - friend const std::vector& 
GraphSolver::get_legal_op_models(graphlib::Node const* node) const; - friend void GraphSolver::reset(bool partial_reset_allowed); - friend LegalOpModels const& GraphSolver::legal_op_models_no_buffering() const; - friend GraphSolverSolution const GraphSolver::finish(); - friend void GraphSolver::recompute_legal_op_models(std::unordered_set& nodes_to_legalize); - friend std::vector GraphSolver::buffer(std::vector& buffer_edges); - }; - - PathSet& get_path_set(const graphlib::Edge& edge); - PathSet const& get_path_set(const graphlib::Edge& edge) const; - - PathSet* get_path_set_pt(const graphlib::Edge& edge); - SmallVector get_operand_path_sets_pts(graphlib::Node const* node); - SmallVector get_user_path_sets_pts(graphlib::Node const* node); - - void log_bitset(graphlib::Node const* node, const Bitset& set) const; - template - void handle_cumulative_paths_error( - PathSet const& path_set, - const Bitset& debug_snapshot, - graphlib::Node const* producer, - graphlib::Node const* consumer); - template - bool apply_cumulative_costs( - tt::SmallVector const& path_sets, graphlib::Node const* node, CostFns cost_fns); - void handle_no_paths_left_on_update( - bool invoked_by_set, const std::string& root_node_name, const std::string& current_node_name); - void update_solver(graphlib::Node const* root, bool expand_root = true, bool invoked_by_set = false); - - Bitset* get_bitset(graphlib::NodeId node_id); - Bitset const* get_bitset(graphlib::NodeId node_id) const; - Bitset* get_or_insert_bitset(graphlib::NodeId node_id, const Bitset& init); - - void throw_error_for_edge(graphlib::Edge edge); - void resolve(bool partial_reset_allowed = false); - bool resolve_step(const bool self_cut_allowed); - std::vector get_epoch_type_switch_cut_edges(); - void update_constraint_info(); - bool self_cut(graphlib::Node* producer_node, graphlib::Node* consumer_node); - void register_virtual_node(graphlib::Node* buffer_nop); - void insert_virtual_queue( - graphlib::Edge& edge, const graphlib::Node* src, const graphlib::Node* dest, bool is_e2e_queue = false); - void resolve_step_postprocess(const std::vector& nodes); - - GraphSolver( - graphlib::Graph* graph, - std::unique_ptr&& constraint, - LegalOpModels const& legal_op_models, - BalancerConfig const& balancer_config, - std::shared_ptr balancer_cache_collection, - std::vector const& cut_edges, - bool use_op_model_recalculation_on_cut, - bool resolve_on_create = true); - - public: - template - static GraphSolver create( - graphlib::Graph* graph, - LegalOpModels const& legal_op_models, - BalancerConfig const& balancer_config, - std::shared_ptr balancer_cache_collection, - bool use_op_model_recalculation_on_cut, - std::vector const& cut_edges = {}, - bool resolve_on_create = true) - { - return GraphSolver( - graph, - std::make_unique(balancer_config.device_config, balancer_cache_collection), - legal_op_models, - balancer_config, - balancer_cache_collection, - cut_edges, - use_op_model_recalculation_on_cut, - resolve_on_create); - } - - RemainingOpModels at(graphlib::Node const* node) const; - const BalancerConfig& get_balancer_config() const { return balancer_config; } - std::shared_ptr get_balancer_cache_collection() const { return balancer_cache_collection; } - OpModels* get_selected_op_models_for_buffering( - std::unordered_set const& current_epoch_ops); - void set(graphlib::Node const* node, OpModel const& op_model, bool skip_update = false); - void cut(std::vector const& edge, bool epoch_cut = false); - std::unique_ptr get_graph_traversal_context(); - std::unique_ptr 
get_graph_epoch_traversal_context( - const std::unordered_set* epoch_nodes); - const CutEdges& get_cut_edges() const { return cut_edges; } - const OpModels& get_selected_op_models() const { return selected_op_models; } - void invalidate_suboptimal_op_models(int invalidation_strategy); -#ifdef DEBUG - void compute_edge_elimination_debug_info( - graphlib::Edge& edge, - Bitset* producer_bitset, - Bitset* consumer_bitset, - Bitset& edge_producer_bitset, - Bitset& edge_consumer_bitset, - std::vector& producer_op_models_debug, - std::vector& consumer_op_models_debug, - std::uint64_t producer_count, - std::uint64_t consumer_count, - EdgeConstraintDebugInfo& edge_constraint_debug_info, - EdgeConstraintDebugInfo& graph_constraint_debug_info); -#endif - ConstraintInfo const& get_constraint_info() const { return *constraint_info_ptr.get(); } - - private: - graphlib::Graph* graph; - std::shared_ptr shared_data; - BalancerConfig const& balancer_config; - std::shared_ptr balancer_cache_collection; - std::vector path_sets; - std::vector bitsets; - OpModels selected_op_models; - CutEdges cut_edges; - std::unordered_map op_model_recompute_version; - std::unordered_set virtual_nodes; - std::unordered_set edges_to_ignore; - std::vector edges_pending_removal; - std::vector> virtual_nodes_management; - std::unordered_map path_set_ids; - std::unordered_map bitset_ids; - std::unordered_map op_disabled_bitset_cache; - bool use_op_model_recalculation_on_cut; - std::unordered_map failure_reasons; - std::shared_ptr constraint_info_ptr; - int suboptimal_opmodel_invalidation_strategy = 0; - bool single_core_ip_mode; -}; - -} // namespace tt::balancer::legalizer diff --git a/pybuda/csrc/balancer/legalizer/graph_solver_types.hpp b/pybuda/csrc/balancer/legalizer/graph_solver_types.hpp deleted file mode 100644 index 78a768e00..000000000 --- a/pybuda/csrc/balancer/legalizer/graph_solver_types.hpp +++ /dev/null @@ -1,46 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "graph_lib/edge.hpp" - -namespace tt::balancer::legalizer -{ - -// Simple struct describing how NOPs should be inserted in graph. 
-// -struct BufferInfo -{ - graphlib::Edge edge; - int nop_count; - bool hoist_tms; - - BufferInfo(const graphlib::Edge& edge, int nop_count, bool hoist_tms) : - edge(edge), nop_count(nop_count), hoist_tms(hoist_tms) - { - } -}; - -enum GraphSolverOpModelInvalidationStrategy -{ - MatmulSparseDenseGridPairing = 1, - DenseMatmulPrologue = 1 << 1, - DenseMatmulBetterUkt = 1 << 2, -}; - -enum GraphSolverOpModelInvalidationStrategyTier -{ - FirstTier = 1, - SecondTier -}; - -enum GraphSolverSelfCutType -{ - None = 0, - ConsumerOperandDataEdgesFirst, - ProducerUserDataEdgesFirst, - FastCut -}; - -} // namespace tt::balancer::legalizer \ No newline at end of file diff --git a/pybuda/csrc/balancer/legalizer/legalizer.cpp b/pybuda/csrc/balancer/legalizer/legalizer.cpp deleted file mode 100644 index d58e19f43..000000000 --- a/pybuda/csrc/balancer/legalizer/legalizer.cpp +++ /dev/null @@ -1,3163 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "balancer/legalizer/legalizer.hpp" - -#include - -#include "autograd/binding.hpp" -#include "balancer/balancer.hpp" -#include "balancer/balancer_cache_collection.hpp" -#include "balancer/balancer_utils.hpp" -#include "balancer/exceptions.hpp" -#include "balancer/python_interface.hpp" -#include "balancer/types.hpp" -#include "graph_lib/defines.hpp" -#include "graph_lib/node.hpp" -#include "graph_lib/node_types.hpp" -#include "graph_lib/utils.hpp" -#include "passes/fuse_ops.hpp" -#include "passes/print_graph.hpp" -#include "passes/t_stream.hpp" -#include "shared_utils/sparse_matmul_utils.hpp" -#include "utils/logger.hpp" - -using NodeType = tt::graphlib::NodeType; -using PortId = tt::graphlib::PortId; -using Edge = tt::graphlib::Edge; - -namespace tt::balancer -{ - -std::ostream& operator<<(std::ostream& os, OpOverride const& op_override) -{ - os << "OpOverride{"; - if (op_override.grid_shape) - os << " .grid_shape = (" << op_override.grid_shape->first << ", " << op_override.grid_shape->second << ")"; - if (op_override.force_dram_parameters) - os << " .force_dram_parameters = true"; - if (not op_override.t_stream_dir.empty()) - os << " .t_stream_dir = " << op_override.t_stream_dir; - if (op_override.t_stream_shape) - os << " .t_stream_shape = (" << op_override.t_stream_shape->first << ", " << op_override.t_stream_shape->second - << ")"; - os << " }"; - return os; -} - -void OpOverride::apply( - FactorizedShape& grid_pars, - bool& force_dram_parameters_out, - std::vector& t_stream_dirs, - FactorizedShape& overridden_streaming_pars, - bool& enable_t_streaming, - const std::string& op_name) -{ - log_debug(LogBalancer, " {}", *this); - - if (grid_shape) - { - auto [r, c] = grid_shape.value(); - grid_pars = grid_pars & FactorizedShape(Parallelization(r, c)); - if (grid_pars.empty()) - { - log_fatal( - LogBalancer, - "Illegal grid shape chosen for op '{}' override, grid_shape: {}", - op_name, - GridShape(r, c)); - } - } - - if (force_dram_parameters.has_value()) - { - force_dram_parameters_out = force_dram_parameters.value(); - } - - if (t_stream_shape.has_value()) - { - auto [r, c] = t_stream_shape.value(); - overridden_streaming_pars = FactorizedShape(FactorizedInt::Constant(r), FactorizedInt::Constant(c)); - enable_t_streaming = true; - } - - if (t_stream_dir == "r") - t_stream_dirs = {TStreamDir::R}; - else if (t_stream_dir == "c") - t_stream_dirs = {TStreamDir::C}; - else if (t_stream_dir == "rz") - t_stream_dirs = {TStreamDir::RZ}; - else if (t_stream_dir == "cz") - t_stream_dirs = 
{TStreamDir::CZ}; - else if (t_stream_dir == "n") - enable_t_streaming = false; -} - -std::optional OpOverride::get_fracture_factor() -{ - if (this->fracture_factor.has_value()) - { - return this->fracture_factor.value(); - } - - return {}; -} - -std::optional OpOverride::get_u_kt() -{ - if (this->u_kt.has_value()) - { - return this->u_kt; - } - - return {}; -} - -} // namespace tt::balancer - -namespace tt::balancer::legalizer -{ -static bool sparse_buffer_legal(Graph const* graph, graphlib::BudaOpNode const* op_node) -{ - if (not op_node->is_sparse_matmul()) - return false; - - auto users = graph->data_users(op_node); - if (users.size() > 1) - return false; - - auto user = dynamic_cast(users.front()); - bool user_is_reduce_z = - user and (user->op_name() == "reduce" and std::get(user->buda_attrs().at("dim")) == "z"); - return not user_is_reduce_z; -} - -static bool edge_tms_consume_rz_major(Graph const* graph, graphlib::Edge edge) -{ - // Must come from sparse matmul - // Must do vslice + hstack pattern - Node* node = graph->node_by_id(edge.producer_node_id); - if (node->node_type() == NodeType::kQueue) - { - node = graph->data_operands(node).back(); - } - - graphlib::OpNode* op = dynamic_cast(node); - if (not(op and op->is_sparse_matmul())) - return false; - - auto const& tms = graph->get_edge_attributes(edge)->get_tms(); - if (tms.empty()) - return false; - - int internal_slice_stack_factor = 1; - for (auto const& tm : tms) - { - if (tm.op == "vslice") - { - internal_slice_stack_factor *= std::get(tm.attr[0]); - } - else if (tm.op == "hstack") - { - if (internal_slice_stack_factor % std::get(tm.attr[0]) == 0) - internal_slice_stack_factor /= std::get(tm.attr[0]); - else - internal_slice_stack_factor = 0; - } - else - { - internal_slice_stack_factor = 0; - } - } - return internal_slice_stack_factor == 1; -} - -static std::vector get_legal_streaming_dirs(Graph const* graph, graphlib::BudaOpNode const* op_node) -{ - auto operands = graph->operand_data_edges(op_node); - bool has_z = std::any_of( - operands.begin(), operands.end(), [graph](Edge edge) { return post_tms_shape(graph, edge).z() > 1; }); - bool tms_consume_rz_major = std::any_of( - operands.begin(), operands.end(), [graph](Edge edge) { return edge_tms_consume_rz_major(graph, edge); }); - bool sparse_matmul_bcast_factor = - (op_node->is_sparse_matmul() and - graph->data_operands(op_node)[0]->as()->get_sparse_buda().bcast_factor > 1); - bool is_reduce_z = graphlib::is_reduce_z(op_node); - if (is_reduce_z or tms_consume_rz_major) - return {TStreamDir::R}; - else if (has_z or sparse_matmul_bcast_factor) - return {TStreamDir::R, TStreamDir::C, TStreamDir::RZ}; - else - return {TStreamDir::R, TStreamDir::C}; -} - -static FactorizedInt get_fracture_factorization( - graphlib::Graph const* graph, graphlib::BudaOpNode const* op_node, std::optional op_override) -{ - bool fracturization_disable = env_as("PYBUDA_FRACTURIZATION_DISABLE"); - if (fracturization_disable) - return FactorizedInt(1); - - if (not op_node->is_sparse_matmul()) - return FactorizedInt(1); - - FactorizedInt fracture_factorization( - graph->data_operands(op_node)[0]->as()->get_sparse_buda().fracture_factor); - - if (op_override) - { - if (auto fracture_factor = op_override->get_fracture_factor()) - { - fracture_factorization = - fracture_factorization & FactorizedInt(FactorizedInt::Constant(fracture_factor.value())); - - if (fracture_factorization.empty()) - { - log_fatal( - LogBalancer, "Illegal fracture factor chose for override, factor: {}", fracture_factor.value()); - 
} - } - } - - return fracture_factorization; -} - -std::optional get_output_buffer_override( - graphlib::BudaOpNode const* op_node, std::optional op_override) -{ - if (op_override and op_override->output_buffer_multiplier) - { - log_warning( - LogBalancer, - "Internal Override: User is overriding output buffer factor for op {} to {}", - op_node->op_name(), - op_override->output_buffer_multiplier.value()); - return op_override->output_buffer_multiplier.value(); - } - return {}; -} - -std::map get_min_input_buffer_multiplier_overrides(std::optional op_override) -{ - if (op_override and op_override->input_buffer_multiplier) - { - return op_override->input_buffer_multiplier.value(); - } - return {}; -} - -static int get_u_kt(std::optional op_override) -{ - if (op_override) - { - if (auto u_kt = op_override->get_u_kt()) - { - return u_kt.value(); - } - } - - return 0; -} - -static int get_output_buffer_factor( - graphlib::BudaOpNode const*, int calculated_user_buffer_factor, std::optional output_buffer_factor_override) -{ - int output_buffer_factor = calculated_user_buffer_factor * 2; // double buffer - if (output_buffer_factor_override) - { - output_buffer_factor = output_buffer_factor_override.value(); - } - return output_buffer_factor; -} - -static std::pair calculate_streaming_pars( - Graph const* graph, - graphlib::BudaOpNode const* op_node, - Parallelization grid_par, - FactorizedShape all_pars, - TStreamDir dir, - int fracture_factor, - bool sparse_buffer_enable) -{ - bool is_reduce_z = graphlib::is_reduce_z(op_node); - int operand_z_dim = graph->operands(op_node)[0]->shape().z(); - - if (is_reduce_z and operand_z_dim != 1) - return std::make_pair(FactorizedShape(1, 1), LegalSparseUKts{}); - - if (op_node->is_embedding()) - return std::make_pair(FactorizedShape(1, 1), LegalSparseUKts{}); - - if (op_node->is_sparse_matmul() and dir.r()) - { - // Get lhs sparse tensor - sparse::SparseBUDA& sparse_buda = - graph->data_operands(op_node)[0]->as()->get_sparse_buda(); - std::vector& sparse_zs = sparse_buda.sparse_zs; - auto layout = sparse::SparseBUDA::create_layout(sparse_buffer_enable, dir.z_major(), fracture_factor); - int bcast_factor = (layout == sparse::SparseBUDA::Layout::ZMajor) ? 
sparse_buda.bcast_factor : 1; - - // Each potential t needs to evenly divide output's r-dim but also in1's r-dim (in0's c-dim) - std::vector operands = graph->operand_data_edges(op_node); - TT_ASSERT(operands.size() == 3); - graphlib::Shape rhs_shape = post_tms_shape(graph, operands[1]); - FactorizedInt inner_dim(rhs_shape.rt()); - FactorizedInt total_t = all_pars.r / FactorizedInt::Constant(grid_par.r * bcast_factor); - - LegalSparseUKts r_para_to_legal_u_kts = sparse::SparseBUDA::get_par_t_values( - grid_par.r, - total_t.get_factors(), - sparse_zs, - inner_dim.get_factors(), - sparse_buda.bcast_factor, - fracture_factor, - layout); - - std::vector r_para; - r_para.reserve(r_para_to_legal_u_kts.size()); - for (auto const& [para, u_kts] : r_para_to_legal_u_kts) r_para.push_back(para); - std::sort(r_para.begin(), r_para.end()); - - log_trace(LogBalancer, "Streaming dir: {} -- Found t values: {} // {}", dir, fmt::join(r_para, ", "), total_t); - - return std::make_pair(FactorizedShape(FactorizedInt(r_para.begin(), r_para.end()), 1), r_para_to_legal_u_kts); - } - else if (op_node->is_sparse_matmul() and dir.c() and fracture_factor > 1) - { - return std::make_pair(FactorizedShape(1, 1), LegalSparseUKts{}); - } - - FactorizedInt r(FactorizedInt::FactorRange(grid_par.r, all_pars.r.get_max_factor())); - FactorizedInt c(FactorizedInt::FactorRange(grid_par.c, all_pars.c.get_max_factor())); - r = r / FactorizedInt::Constant(grid_par.r); - c = c / FactorizedInt::Constant(grid_par.c); - - if (dir.r()) - { - c = 1; - } - else if (dir.c()) - { - r = 1; - } - - if (op_node->is_fused_op()) - { - auto fused_op = op_node->get_fused_op(); - if (fused_op->has_matmul_op()) - { - r = 1; - c = 1; - } - - if (fused_op->has_broadcast_c()) - { - c = 1; - } - - if (fused_op->has_reduce_op()) - { - std::uint32_t dim = fused_op->get_reduce_dim(); - if (dim == 2) - r = 1; - else if (dim == 3) - c = 1; - } - } - - return std::make_pair(FactorizedShape(r, c), LegalSparseUKts{}); -} - -static bool streaming_unsupported_op(const std::string& op_type_name) -{ - // TODO Consider adding enum for all OPs and converting to switch-case. 
- // - if ("embedding" == op_type_name) - { - return true; - } - - return false; -} - -static std::pair calculate_streaming_pars( - Graph const* graph, - graphlib::BudaOpNode const* op_node, - Parallelization grid_par, - FactorizedShape all_pars, - TStreamDir dir, - FactorizedShape overridden_pars, - bool enable_t_streaming, - int fracture_factor, - bool sparse_buffer_enable) -{ - if (not enable_t_streaming or streaming_unsupported_op(op_node->op_type().op)) - { - return std::make_pair(FactorizedShape(1, 1), LegalSparseUKts{}); - } - - auto [streaming_pars, legal_sparse_u_kts] = - calculate_streaming_pars(graph, op_node, grid_par, all_pars, dir, fracture_factor, sparse_buffer_enable); - - if (not overridden_pars.empty()) - streaming_pars = streaming_pars & overridden_pars; - - return std::make_pair(streaming_pars, legal_sparse_u_kts); -} - -static std::vector enumerate_factored_u_kts(OpModel const& op_model, int user_overriden_u_kt, bool enabled) -{ - if (not enabled) - return {}; - - // If u_kt is user-overriden, then don't test all possible u_kts - if (user_overriden_u_kt > 0) - return {}; - - if (op_model.op_type() != "matmul") - return {}; - - auto factors = FactorizedInt(op_model.input_buffers[1].block_shape.ublock.rt).get_factors(); - TT_ASSERT(not factors.empty()); - factors.pop_back(); // The initial op model holds the last factor - return factors; -} - -// Remove legacy path, once fork/join hangs are removed: -// tenstorrent/pybuda#1697 -static UBlockShape calculate_ublock_shape_legacy( - OpShape op_shape, - Parallelization par, - std::size_t dst_size_tiles, - UBlockOrder ublock_order, - const OpType& op_type, - bool is_splice, - bool is_sparse_matmul, - bool is_embedding, - bool is_tilize) -{ - // 2 * 4 = Half Dest - constexpr int kMaxUBlockR = 2; - constexpr int kMaxUBlockC = 4; - auto max_pot_multiple = [](int a) -> int { return (1 << __builtin_ctz(a)); }; - auto is_pot = [](int a) { return (a & (a - 1)) == 0; }; - - int max_ublock_volume = (int)dst_size_tiles; - TT_ASSERT(is_pot(max_ublock_volume)); - TT_ASSERT(is_pot(kMaxUBlockR)); - TT_ASSERT(is_pot(kMaxUBlockC)); - - UBlockShape ublock; - - TensorShape tensor = op_shape.outputs[0]; - TT_ASSERT(tensor.rt % par.r == 0); - TT_ASSERT(tensor.ct % par.c == 0); - int block_rt = tensor.rt / par.r; - int block_ct = tensor.ct / par.c; - - if (is_splice) - { - for (auto input : op_shape.inputs) - { - block_rt = gcd(input.rt, block_rt); - block_ct = gcd(input.ct, block_ct); - } - - // Splice ublock size must be a factor of length and stride - int dim = op_type.get_attr_as("dim"); - for (auto [index, num_tile_length, num_tile_stride] : - op_type.get_attr_as>>("canonical_ranges")) - { - if (dim == 2) - { - if (index > 0) - block_rt = gcd(index, block_rt); - block_rt = gcd(index + num_tile_length, block_rt); - block_rt = gcd(index + num_tile_stride, block_rt); - } - else if (dim == 3) - { - if (index > 0) - block_ct = gcd(index, block_ct); - block_ct = gcd(index + num_tile_length, block_ct); - block_ct = gcd(index + num_tile_stride, block_ct); - } - } - } - - if (is_sparse_matmul) - { - // For sparse matmul we use a different ublock heurisctic to always maximize its volume - int r_major_ublock_r = FactorizedInt(block_rt).get_nearest_factor_le(max_ublock_volume); - int r_major_ublock_c = FactorizedInt(block_ct).get_nearest_factor_le(max_ublock_volume / r_major_ublock_r); - int c_major_ublock_c = FactorizedInt(block_ct).get_nearest_factor_le(max_ublock_volume); - int c_major_ublock_r = 
FactorizedInt(block_rt).get_nearest_factor_le(max_ublock_volume / c_major_ublock_c); - bool r_major = (r_major_ublock_r * r_major_ublock_c) > (c_major_ublock_r * c_major_ublock_c); - ublock.rt = r_major ? r_major_ublock_r : c_major_ublock_r; - ublock.ct = r_major ? r_major_ublock_c : c_major_ublock_c; - return ublock; - } - - int max_ublock_r = is_embedding || is_tilize ? 1 : std::min(kMaxUBlockR, max_ublock_volume); - int max_ublock_c = std::min(kMaxUBlockC, max_ublock_volume); - - // Maximize ublock, precidence to anti-ublock order - if (ublock_order == UBlockOrder::C) - { - ublock.rt = std::min(max_pot_multiple(block_rt), max_ublock_r); - ublock.ct = std::min({max_pot_multiple(block_ct), max_ublock_volume / ublock.rt, max_ublock_c}); - } - else - { - ublock.ct = std::min(max_pot_multiple(block_ct), max_ublock_c); - ublock.rt = std::min({max_pot_multiple(block_rt), max_ublock_volume / ublock.ct, max_ublock_r}); - } - - TT_ASSERT(block_rt % ublock.rt == 0); - TT_ASSERT(block_ct % ublock.ct == 0); - return ublock; -} - -static std::pair> calculate_ublock_shape_legacy( - OpShape op_shape, - Parallelization total_par, - std::size_t dst_size_tiles, - UBlockOrder ublock_order, - graphlib::BudaOpNode const* op_node) -{ - UBlockShape ublock = calculate_ublock_shape_legacy( - op_shape, - total_par, - dst_size_tiles, - ublock_order, - op_node->op_type(), - op_node->op_name() == "splice", - op_node->is_sparse_matmul(), - op_node->is_embedding(), - op_node->is_tilize()); - - std::unordered_map fused_op_ublock_shape; - if (op_node->is_fused_op()) - { - auto fused_op = op_node->get_fused_op(); - for (auto const& sch : fused_op->get_schedules()) - { - for (auto const& op : sch.ops) - { - fused_op_ublock_shape.insert(std::make_pair( - op.name, - calculate_ublock_shape_legacy( - op.op_shape, total_par, dst_size_tiles, ublock_order, op.op_type, false, false, false, false))); - } - } - } - - return std::make_pair(ublock, fused_op_ublock_shape); -} - -static FactorizedShape calculate_ublock_shape( - OpShape op_shape, Parallelization par, std::size_t dst_size_tiles, const OpType& op_type) -{ - TensorShape tensor = op_shape.outputs[0]; - TT_ASSERT(tensor.rt % par.r == 0); - TT_ASSERT(tensor.ct % par.c == 0); - int block_rt = tensor.rt / par.r; - int block_ct = tensor.ct / par.c; - int max_ublock_volume = static_cast(dst_size_tiles); - TT_ASSERT(max_ublock_volume > 0); - - if (op_type.op == "splice") - { - for (auto input : op_shape.inputs) - { - block_rt = gcd(input.rt, block_rt); - block_ct = gcd(input.ct, block_ct); - } - - // Splice ublock size must be a factor of length and stride - int dim = op_type.get_attr_as("dim"); - for (auto [index, num_tile_length, num_tile_stride] : - op_type.get_attr_as>>("canonical_ranges")) - { - if (dim == 2) - { - block_rt = gcd(index, block_rt); - block_rt = gcd(index + num_tile_length, block_rt); - block_rt = gcd(index + num_tile_stride, block_rt); - } - else if (dim == 3) - { - block_ct = gcd(index, block_ct); - block_ct = gcd(index + num_tile_length, block_ct); - block_ct = gcd(index + num_tile_stride, block_ct); - } - } - } - - bool is_embedding = op_type.op == "embedding"; - bool is_tilize = op_type.op == "tilizer"; - int max_ublock_r = (is_embedding or is_tilize) ? 
1 : max_ublock_volume; - int max_ublock_c = max_ublock_volume; - max_ublock_r = FactorizedInt(block_rt).get_nearest_factor_le(max_ublock_r); - max_ublock_c = FactorizedInt(block_ct).get_nearest_factor_le(max_ublock_c); - return FactorizedShape(max_ublock_r, max_ublock_c); -} - -static std::pair> calculate_ublock_shape( - OpShape op_shape, - Parallelization total_par, - std::size_t dst_size_tiles, - UBlockOrder ublock_order, - graphlib::BudaOpNode const* op_node) -{ - if (env_as("PYBUDA_LEGACY_UBLOCK_SHAPE")) - return calculate_ublock_shape_legacy(op_shape, total_par, dst_size_tiles, ublock_order, op_node); - - FactorizedShape ublock_factors = calculate_ublock_shape(op_shape, total_par, dst_size_tiles, op_node->op_type()); - - // All subops + top level op of fused op must have the same ublock shape - if (op_node->is_fused_op()) - { - auto fused_op = op_node->get_fused_op(); - for (auto const& sch : fused_op->get_schedules()) - { - for (auto const& op : sch.ops) - { - FactorizedShape sub_op_ublock_factors = - calculate_ublock_shape(op.op_shape, total_par, dst_size_tiles, op.op_type); - ublock_factors = ublock_factors & sub_op_ublock_factors; - } - } - } - - // always maximize its volume - UBlockShape ublock(1, 1); - for (auto candidate : ublock_factors) - { - if (candidate.volume() > (int)dst_size_tiles) - continue; - - // It's generally better to bias one dimension, either r major or c major so that back to back ops with - // similar tensor shapes are more likely end up with the same ublock shape and reduce reblocking - // Arbitrarily bias r-major, i.e. wider ublocks - bool r_major_bias = (candidate.volume() == ublock.volume() and candidate.c > ublock.ct); - if (candidate.volume() > ublock.volume() or r_major_bias) - ublock = UBlockShape(candidate.r, candidate.c); - } - - // All subops + top level op of fused op must have the same ublock shape - std::unordered_map fused_op_ublock_shape; - if (op_node->is_fused_op()) - { - auto fused_op = op_node->get_fused_op(); - for (auto const& sch : fused_op->get_schedules()) - { - for (auto const& op : sch.ops) - { - fused_op_ublock_shape[op.name] = ublock; - } - } - } - - return std::make_pair(ublock, fused_op_ublock_shape); -} - -static std::tuple calculate_user_buffer_factor( - Graph const* graph, graphlib::BudaOpNode const* op_node, UBlockOrder ublock_order, OpModel op_model) -{ - // - // Returns a tuple (factor, can_stream, is_legal_stack_for_grid) - // Used as a multiplier on the mblock to denote how many mblocks we need to buffer - // if (can_stream == true) then we are allowed to slice the mblock into t - // - - bool can_stream = true; - bool is_legal_stack_for_grid = true; - TStreamFactor t_stream_factor = op_model.t_stream_factor; - std::vector operands = graph->operand_data_edges(op_node); - for (Edge operand : operands) - { - auto edge_attrs = graph->get_edge_attributes(operand); - int hstack_factor = 1; - int vstack_factor = 1; - int hslice_factor = 1; - int vslice_factor = 1; - - for (graphlib::OpType const& tm : edge_attrs->get_tms()) - { - if (tm.op == "hslice") - { - int slice_factor = std::get(tm.attr[0]); - hslice_factor *= slice_factor; - } - else if (tm.op == "vslice") - { - int slice_factor = std::get(tm.attr[0]); - vslice_factor *= slice_factor; - } - else if (tm.op == "hstack") - { - int stack_factor = std::get(tm.attr[0]); - hstack_factor *= stack_factor; - } - else if (tm.op == "vstack") - { - int stack_factor = std::get(tm.attr[0]); - vstack_factor *= stack_factor; - } - } - - int total_stack_factor = hstack_factor * 
vstack_factor; - int total_slice_factor = hslice_factor * vslice_factor; - int stack_factor = total_stack_factor / total_slice_factor; - if (stack_factor > 1) - { - can_stream &= - t_stream_factor.dir.z_major() or - (divisible_either_direction(vstack_factor, vslice_factor * t_stream_factor.r) and - divisible_either_direction(hstack_factor, hslice_factor * t_stream_factor.c) and - (not op_node->is_matmul() or - ((op_node->is_matmul() and operand.consumer_input_port_id == 0 and t_stream_factor.dir.r()) or - (op_node->is_matmul() and operand.consumer_input_port_id == 1 and t_stream_factor.dir.c())))); - - int grid_dim = (t_stream_factor.dir == TStreamDir::R) ? op_model.grid_shape.r : op_model.grid_shape.c; - is_legal_stack_for_grid &= divisible_either_direction(total_stack_factor, grid_dim); - } - } - - std::vector users = graph->user_data_edges(op_node); - int buffer_factor = 1; - for (Edge user : users) - { - graphlib::Node* user_node = graph->node_by_id(user.consumer_node_id); - - // For now, disable streaming for loopback, it can cause the gradient queue and parameter - // to stream differently and therefore have a different shape. We need to support consteval - // on gradient queues or some other solution. - if (user_node->node_type() == graphlib::NodeType::kInput) - { - can_stream = false; - } - - // Only applies to users on the same epoch - if (user_node->get_epoch_type() != op_node->get_epoch_type()) - continue; - - // Can always stream through queues - if (user_node->node_type() == graphlib::NodeType::kQueue) - continue; - - auto shape = op_node->shape(); - auto edge_attrs = graph->get_edge_attributes(user); - int total_stack_factor = 1; - int total_slice_factor = 1; - bool needs_stack_factor = false; - bool needs_slice_factor = false; - - for (graphlib::OpType const& tm : edge_attrs->get_tms()) - { - if (tm.op == "hslice" or tm.op == "vslice") - { - int slice_factor = std::get(tm.attr[0]); - needs_slice_factor |= - ((tm.op == "hslice" and (ublock_order == UBlockOrder::R or t_stream_factor.dir.r()) and - shape.rt() > 1) or - (tm.op == "vslice" and (ublock_order == UBlockOrder::C or t_stream_factor.dir.c()) and - shape.ct() > 1)); - can_stream &= - t_stream_factor.dir.z_major() or - (not needs_slice_factor and divisible_either_direction(slice_factor, t_stream_factor.t())); - total_slice_factor *= slice_factor; - } - else if (tm.op == "hstack" or tm.op == "vstack") - { - int stack_factor = std::get(tm.attr[0]); - needs_stack_factor |= - ((tm.op == "hstack" and (ublock_order == UBlockOrder::R or t_stream_factor.dir.r()) and - shape.rt() > 1) or - (tm.op == "vstack" and (ublock_order == UBlockOrder::C or t_stream_factor.dir.c()) and - shape.ct() > 1)); - can_stream &= - t_stream_factor.dir.z_major() or - (not needs_stack_factor and divisible_either_direction(stack_factor, t_stream_factor.t())); - total_stack_factor *= stack_factor; - } - else if (tm.op == "transpose") - { - can_stream &= (t_stream_factor.r == 1 or t_stream_factor.c == 1); - auto producer_ublock_order = ublock_order; - auto consumer_ublock_order = edge_attrs->get_ublock_order(); - - // Check if user feeds graph output queue - auto consumer_users = graph->users(user_node); - bool feeds_graph_output_queue = - consumer_users.size() == 1 and consumer_users[0]->node_type() == graphlib::NodeType::kOutput; - bool producer_is_one_tile_wide_or_tall = (shape.rt() == 1 or shape.ct() == 1); - - // Ublock order needs to swap through transpose except for directly feeding graph output queue or matmul - // (matmul requires ublock order) 
- can_stream &= (producer_ublock_order != consumer_ublock_order) or producer_is_one_tile_wide_or_tall or - feeds_graph_output_queue; - } - else if (tm.op == "broadcast") - { - int dim = std::get(tm.attr[0]); - if (dim == 3 and ublock_order == UBlockOrder::C) - { - can_stream = false; - } - else if (dim == 2 and ublock_order == UBlockOrder::R) - { - can_stream = false; - } - else - { - // Cannot stream period if bcast on z - can_stream = false; - } - } - else if (tm.op == "buda_unpad") - { - int r_pad = std::get(tm.attr[0]); - int c_pad = std::get(tm.attr[1]); - if ((r_pad and t_stream_factor.dir.c()) or (c_pad and t_stream_factor.dir.r())) - { - can_stream = false; - } - } - - shape = ::get_tm_shape(tm, shape, true); - } - - if (user_node->as()->has_tag("padding_nop")) - { - can_stream = false; - } - - auto is_partial_datacopy_edge = [](Edge e) { return (e.edge_type == graphlib::EdgeType::kPartialDataCopy); }; - std::vector partial_datacopy_edges = graph->user_edges(user_node, is_partial_datacopy_edge); - if (user_node->node_type() == graphlib::NodeType::kOutput and partial_datacopy_edges.empty()) - { - // Host runtime outputs cannot support undoing z major order - can_stream &= not t_stream_factor.dir.z_major() and not env_as("PYBUDA_DISABLE_STREAM_OUTPUT"); - } - - int stack_factor = (total_stack_factor / total_slice_factor) * needs_stack_factor; - int slice_factor = (total_slice_factor / total_stack_factor) * needs_slice_factor; - if (stack_factor > 1 or slice_factor > 1) - can_stream = false; - - buffer_factor = std::max(buffer_factor, stack_factor); - } - - return std::make_tuple(buffer_factor, can_stream, is_legal_stack_for_grid); -} - -static bool legal_t_streaming(BlockShape block_shape, TStreamFactor t_stream_factor, UBlockOrder ublock_order) -{ - if (t_stream_factor.none()) - return true; - - if (t_stream_factor.dir.is_ublock_order(ublock_order)) - { - return t_stream_factor.dir.r() ? ((block_shape.mblock_m == 1) or (t_stream_factor.c == 1)) - : ((block_shape.mblock_n == 1) or (t_stream_factor.r == 1)); - } - else - { - // For now this is over-constrained, this is allowed to simply be `return true`. - // Below is to reduce combinations / probably reduces chances of limiting matmul - return t_stream_factor.dir.r() ? (t_stream_factor.c == 1) : (t_stream_factor.r == 1); - } -} - -static std::vector calculate_output_buffer_models_for_grid( - OpShape op_shape, - GridShape selected_grid, - UBlockShape ublock, - TStreamFactor t_stream_factor, - int output_buffer_factor, - bool is_gradient_op, - DataFormat output_df) -{ - std::vector output_buffers; - - for (int output_idx = 0; output_idx < (int)op_shape.outputs.size(); ++output_idx) - { - // Block shape is determined by the combination of - // (how much we parallelized into the grid) * (how much we parallelized into t) - int par_r = selected_grid.r * t_stream_factor.r; - int par_c = selected_grid.c * t_stream_factor.c; - int par_t = t_stream_factor.t(); - TT_ASSERT(par_r and par_c and par_t, par_r, par_c, par_t); - // All outputs need to have the same dim & parallelization - TensorShape output = op_shape.outputs[output_idx]; - BlockShape block_shape(output, par_r, par_c, par_t, ublock); - output_buffers.emplace_back(block_shape, is_gradient_op ? 
1 : output_buffer_factor, output_df); - } - - return output_buffers; -} - -static std::vector calculate_parameter_buffer_models_for_grid( - OpShape const& op_shape, - std::vector const& operands, - GridShape selected_grid, - bool force_dram_parameters) -{ - std::vector parameter_buffers; - parameter_buffers.resize(operands.size()); - for (int input_idx = 0; input_idx < (int)operands.size(); ++input_idx) - { - graphlib::InputNode* input_node = dynamic_cast(operands[input_idx]); - if (input_node and - (input_node->is_parameter() or input_node->is_optimizer_parameter() or input_node->is_constant()) and - not force_dram_parameters) - { - TensorShape const& parameter_shape = op_shape.producer_shapes[input_idx]; - int grid_r = FactorizedInt(parameter_shape.rt).get_nearest_factor_le(selected_grid.r); - int grid_c = - FactorizedInt(parameter_shape.ct) - .get_nearest_factor_le(selected_grid.c); // TODO: Should we do this for sparse_matmul in0 and in2? - TT_ASSERT(parameter_shape.rt % grid_r == 0); - TT_ASSERT(parameter_shape.ct % grid_c == 0); - BlockShape parameter_block_shape(parameter_shape, grid_r, grid_c, 1, UBlockShape(1, 1)); - parameter_buffers[input_idx] = BufferModel(parameter_block_shape, 1, operands[input_idx]->output_df()); - } - } - return parameter_buffers; -} - -static std::vector calculate_intermediate_buffer_models_for_grid( - graphlib::BudaOpNode const* op, - BufferModel const& output_buffer, - FusedOp const* fused_op, - std::unordered_map const& fused_op_ublock_shape) -{ - std::vector intermediate_buffers; - bool intermediate_alias_output = op->intermediate_df() == output_buffer.data_format; - bool is_reduce_z = graphlib::is_reduce_z(op); - bool needs_intermediate_buffer_allocation = - ((op->is_gradient_op() or op->is_matmul() or is_reduce_z) and not intermediate_alias_output); - if (fused_op) - { - std::vector mapped_intermediate_buffers; - for (FusedSchedule const& schedule : fused_op->get_schedules()) - { - for (FusedSubOp const& op : schedule.ops) - { - // TODO: do we need to handle dest here? 
- if ((op.output_type == FusedSubOp::OutputType::INTERMED) && - std::find( - mapped_intermediate_buffers.begin(), mapped_intermediate_buffers.end(), op.output_buffer) == - mapped_intermediate_buffers.end()) - { - TT_ASSERT(op.op_shape.outputs.size() == 1); - mapped_intermediate_buffers.push_back(op.output_buffer); - BlockShape block_shape(1, 1, 1, fused_op_ublock_shape.at(op.name)); // ublock buffered - intermediate_buffers.emplace_back(block_shape, 1, op.output_df); - } - } - } - } - else if (needs_intermediate_buffer_allocation) - { - intermediate_buffers.emplace_back(output_buffer.block_shape, 1, op->intermediate_df()); - } - return intermediate_buffers; -} - -static std::pair calculate_input_multiplier( - const std::map& input_multiplier_overrides, - std::uint32_t operand, - const TensorShape& shape, - const UBlockShape& ublock) -{ - constexpr std::uint32_t DEFAULT_INPUT_MULTIPLIER = 2; - if (auto input_multiplier_override_it = input_multiplier_overrides.find(operand); - input_multiplier_override_it != input_multiplier_overrides.end()) - { - log_debug( - LogBalancer, - "Using input multiplier override for operand {}: {}", - operand, - input_multiplier_override_it->second); - return { - std::max(input_multiplier_override_it->second, DEFAULT_INPUT_MULTIPLIER) * shape.ct * shape.rt / - ublock.volume(), - true}; - } - return {DEFAULT_INPUT_MULTIPLIER, false}; -} - -static TensorShape calculate_effective_input_buffer_shape( - Graph const* graph, - graphlib::Edge edge, - TStreamFactor producer_t_stream_factor, - TStreamFactor consumer_t_stream_factor) -{ - TT_ASSERT( - producer_t_stream_factor.none() or consumer_t_stream_factor.none(), - "This function only handles one or the other not both"); - - auto shape = graph->node_by_id(edge.producer_node_id)->shape(); - auto edge_attrs = graph->get_edge_attributes(edge); - - // Special eval that clamps if we over slice - auto tm_shape = [](graphlib::OpType tm, graphlib::Shape const& shape) -> graphlib::Shape - { - if (tm.op == "vslice" and - (std::get(tm.attr[0]) > (int)shape.rt() or ((int)shape.rt() % std::get(tm.attr[0]) != 0))) - { - std::get(tm.attr[0]) = (int)shape.rt(); - } - if (tm.op == "hslice" and - (std::get(tm.attr[0]) > (int)shape.ct() or ((int)shape.ct() % std::get(tm.attr[0]) != 0))) - { - std::get(tm.attr[0]) = (int)shape.ct(); - } - return ::get_tm_shape(tm, shape, true); - }; - - shape = tm_shape(graphlib::OpType("vslice", {producer_t_stream_factor.r}, {}), shape); - shape = tm_shape(graphlib::OpType("hslice", {producer_t_stream_factor.c}, {}), shape); - - int internal_slice_stack_factor = 1; - for (graphlib::OpType const& tm : edge_attrs->get_tms()) - { - bool eval_tm = true; - - if (tm.op == "hslice" or tm.op == "vslice") - { - int slice_factor = std::get(tm.attr[0]); - internal_slice_stack_factor *= slice_factor; - } - else if (tm.op == "hstack" or tm.op == "vstack") - { - int stack_factor = std::get(tm.attr[0]); - eval_tm = ((internal_slice_stack_factor % stack_factor) == 0); - if (eval_tm) - internal_slice_stack_factor /= stack_factor; - } - - if (eval_tm) - shape = tm_shape(tm, shape); - } - - shape = tm_shape(graphlib::OpType("vslice", {consumer_t_stream_factor.r}, {}), shape); - shape = tm_shape(graphlib::OpType("hslice", {consumer_t_stream_factor.c}, {}), shape); - - return shape; -} - -static std::unordered_map calculate_effective_input_buffer_shapes_for_users( - graphlib::Graph const* graph, graphlib::Node const* node, TStreamFactor t_stream_factor) -{ - std::unordered_map effective_input_buffer_shape_for_user; - for 
(auto edge : graph->user_data_edges(node)) - { - effective_input_buffer_shape_for_user[edge.consumer_node_id] = - calculate_effective_input_buffer_shape(graph, edge, t_stream_factor, TStreamFactor()); - } - return effective_input_buffer_shape_for_user; -} - -static int calculate_max_u_kt_sparse( - graphlib::Graph const* graph, graphlib::BudaOpNode const* op_node, int u_kt_override) -{ - TT_ASSERT(op_node->is_sparse_matmul()); - if (u_kt_override) - return u_kt_override; - - std::vector operands = graph->operand_data_edges(op_node); - TT_ASSERT(operands.size() == 3); - TensorShape rhs_shape = - calculate_effective_input_buffer_shape(graph, operands[1], TStreamFactor(), TStreamFactor()); - return rhs_shape.rt; -} - -static int calculate_max_u_kt( - graphlib::Graph const* graph, graphlib::BudaOpNode const* op_node, TStreamFactor t_stream_factor, int u_kt_override) -{ - TT_ASSERT(op_node->is_matmul()); - if (u_kt_override) - return u_kt_override; - std::vector operands = graph->operand_data_edges(op_node); - TT_ASSERT(operands.size() >= 2); - TensorShape lhs_shape = calculate_effective_input_buffer_shape( - graph, operands[0], TStreamFactor(), t_stream_factor.dir.r() ? t_stream_factor : TStreamFactor()); - TensorShape rhs_shape = calculate_effective_input_buffer_shape( - graph, operands[1], TStreamFactor(), t_stream_factor.dir.c() ? t_stream_factor : TStreamFactor()); - return int(std::min(lhs_shape.ct, rhs_shape.rt)); -} - -static std::vector calculate_matmul_input_buffer_models_for_l1_budget( - graphlib::Graph const* graph, - const graphlib::BudaOpNode* op_node, - GridShape grid_shape, - OpShape op_shape, - std::vector const& operands, - BlockShape const& output_block_shape, - TStreamFactor const& t_stream_factor, - std::size_t input_l1_buffer_space, - int, - std::unordered_map, - LegalSparseUKts const&, - int u_kt_override, - const std::map& input_multiplier_overrides) -{ - TT_ASSERT(op_shape.inputs.size() >= 2 && op_shape.inputs.size() <= 4); - TT_ASSERT(operands.size() >= 2 && operands.size() <= 4); - std::vector input_buffers; - DataFormat input0_df = operands[0]->output_df(); - DataFormat input1_df = operands[1]->output_df(); - TensorShape input0 = op_shape.inputs[0]; - TensorShape input1 = op_shape.inputs[1]; - int max_u_kt = calculate_max_u_kt(graph, op_node, t_stream_factor, u_kt_override); - - auto buda_attrs = op_node->buda_attrs(); - bool has_requant = buda_attrs.find("requant") != buda_attrs.end() and std::get(buda_attrs.at("requant")); - std::optional fused_bias; - std::optional fused_requant; - - if (has_requant) - { - TT_ASSERT(operands.size() >= 3); - TensorShape input2 = op_shape.inputs[2]; - DataFormat input2_df = operands[2]->output_df(); - UBlockShape input2_ublock = output_block_shape.ublock; - BlockShape input2_block_shape(input2, GridShape(1, grid_shape.c), 1, 1, input2_ublock); - auto [input_buffer_multiplier, override_enabled] = - calculate_input_multiplier(input_multiplier_overrides, 2, input2, input2_ublock); - - if (operands.size() == 3) - { - // Just dequant - fused_requant = BufferModel(input2_block_shape, input_buffer_multiplier, input2_df, override_enabled); - - // Carve out fused operand before calculating u_kt - if (fused_requant->size_bytes() <= input_l1_buffer_space) - input_l1_buffer_space -= fused_requant->size_bytes(); - } - else if (operands.size() == 4) - { - // Bias + dequant - fused_bias = BufferModel(input2_block_shape, input_buffer_multiplier, input2_df, override_enabled); - - // Carve out fused operand before calculating u_kt - if 
(fused_bias->size_bytes() <= input_l1_buffer_space) - input_l1_buffer_space -= fused_bias->size_bytes(); - - TensorShape input3 = op_shape.inputs[3]; - DataFormat input3_df = operands[3]->output_df(); - UBlockShape input3_ublock = output_block_shape.ublock; - BlockShape input3_block_shape(input3, GridShape(1, grid_shape.c), 1, 1, input3_ublock); - auto [input_buffer_multiplier, override_enabled] = - calculate_input_multiplier(input_multiplier_overrides, 3, input3, input3_ublock); - fused_requant = BufferModel(input3_block_shape, input_buffer_multiplier, input3_df, override_enabled); - - // Carve out fused operand before calculating u_kt - if (fused_requant->size_bytes() <= input_l1_buffer_space) - input_l1_buffer_space -= fused_requant->size_bytes(); - } - } - else - { - if (operands.size() == 3) - { - // fused bias - TensorShape input2 = op_shape.inputs[2]; - DataFormat input2_df = operands[2]->output_df(); - UBlockShape input2_ublock = output_block_shape.ublock; - BlockShape input2_block_shape(input2, GridShape(1, grid_shape.c), 1, 1, input2_ublock); - auto [input_buffer_multiplier, override_enabled] = - calculate_input_multiplier(input_multiplier_overrides, 2, input2, input2_ublock); - fused_bias = BufferModel(input2_block_shape, input_buffer_multiplier, input2_df, override_enabled); - - // Carve out fused operand before calculating u_kt - if (fused_bias->size_bytes() <= input_l1_buffer_space) - input_l1_buffer_space -= fused_bias->size_bytes(); - } - } - - // 1 outer strip of tiles per input double buffered - int input0_outer_dim_bytes = - output_block_shape.mblock_m * output_block_shape.ublock.rt * tile_size_bytes(input0_df) * 2; - int input1_outer_dim_bytes = - output_block_shape.mblock_n * output_block_shape.ublock.ct * tile_size_bytes(input1_df) * 2; - - bool minimize_op0 = false; - bool minimize_op1 = false; - - if (!env_as("PYBUDA_DISABLE_MIN_MATMUL_BUFFER")) - { - // Minimize one of the buffers - whichever strip is bigger if input1 is alowed through switch - if ((input1_outer_dim_bytes > input0_outer_dim_bytes) && env_as("PYBUDA_MIN_MATMUL_BUFFER_ALLOW_IN1")) - { - input1_outer_dim_bytes /= output_block_shape.mblock_n; - minimize_op1 = true; - } - else - { - input0_outer_dim_bytes /= output_block_shape.mblock_m; - minimize_op0 = true; - } - } - std::size_t k_factor = - std::max(std::size_t(1), input_l1_buffer_space / (input0_outer_dim_bytes + input1_outer_dim_bytes)); - - TT_ASSERT(k_factor <= INT_MAX); - int u_kt = FactorizedInt(max_u_kt).get_nearest_factor_le(static_cast(k_factor)); - - UBlockShape input0_ublock(output_block_shape.ublock.rt, u_kt); - UBlockShape input1_ublock(u_kt, output_block_shape.ublock.ct); - - BlockShape input0_block_shape( - input0, GridShape(grid_shape.r, 1), minimize_op0 ? 1 : output_block_shape.mblock_m, 1, input0_ublock); - BlockShape input1_block_shape( - input1, GridShape(1, grid_shape.c), 1, minimize_op1 ? 
1 : output_block_shape.mblock_n, input1_ublock); - - TT_ASSERT(u_kt != 0); - auto [input0_buffer_multiplier, override_enabled0] = - calculate_input_multiplier(input_multiplier_overrides, 0, input0, input0_ublock); - input_buffers.emplace_back(input0_block_shape, input0_buffer_multiplier, input0_df, override_enabled0); - input_buffers[0].minimize_input_buffer = minimize_op0; - - auto [input1_buffer_multiplier, override_enabled1] = - calculate_input_multiplier(input_multiplier_overrides, 1, input1, input1_ublock); - input_buffers.emplace_back(input1_block_shape, input1_buffer_multiplier, input1_df, override_enabled1); - input_buffers[1].minimize_input_buffer = minimize_op1; - - if (fused_bias) - { - fused_bias->buffer_factor = std::max((std::uint32_t)fused_bias->buffer_factor / u_kt, (std::uint32_t)2); - input_buffers.push_back(*fused_bias); - } - if (fused_requant) - { - fused_requant->buffer_factor = std::max((std::uint32_t)fused_requant->buffer_factor / u_kt, (std::uint32_t)2); - input_buffers.push_back(*fused_requant); - } - - return input_buffers; -} - -static std::vector calculate_sparse_matmul_input_buffer_models_for_l1_budget( - graphlib::Graph const* graph, - const graphlib::BudaOpNode* op_node, - GridShape grid_shape, - OpShape op_shape, - std::vector const& operands, - BlockShape const& output_block_shape, - TStreamFactor const& t_stream_factor, - std::size_t input_l1_buffer_space, - int fracture_factor, - std::unordered_map, - LegalSparseUKts const& legal_sparse_u_kts, - int u_kt_override, - const std::map& input_multiplier_overrides) -{ - DataFormat input0_df = operands[0]->output_df(); - DataFormat input1_df = operands[1]->output_df(); - DataFormat input2_df = operands[2]->output_df(); - TensorShape input0 = op_shape.inputs[0]; - TensorShape input1 = op_shape.inputs[1]; - TensorShape input2 = op_shape.inputs[2]; - - TT_ASSERT(op_shape.inputs.size() == 3); - TT_ASSERT(operands.size() == 3); - - BlockShape sparse_block_shape(input0, GridShape(grid_shape.r, 1), 1, 1, UBlockShape(1, 1)); - BlockShape index_block_shape(input2, GridShape(grid_shape.r, 1), 1, 1, UBlockShape(1, 1)); - - BufferModel buffer_model0 = BufferModel(sparse_block_shape, 1, input0_df); - BufferModel buffer_model2 = BufferModel(index_block_shape, 1, input2_df); - - // Sparse MM will access the parameter buffer allocation for in0 so no need to allocate an input buffer for it. 
We - // will keep the input buffer info for other bits of code to reference shapes - buffer_model0.l1_size_tiles = 0; - - int leftover_l1_space = input_l1_buffer_space - buffer_model0.size_bytes() - buffer_model2.size_bytes(); - - // Calculate max u_kt given leftover l1 space - auto [input1_buffer_multiplier, override_enabled] = - calculate_input_multiplier(input_multiplier_overrides, 1, input1, UBlockShape(1, output_block_shape.ublock.ct)); - BlockShape input_block_shape_ukt1( - input1, - GridShape(1, grid_shape.c / fracture_factor), - 1, - output_block_shape.mblock_n, - UBlockShape(1, output_block_shape.ublock.ct)); - int min_buffer_mem = - BufferModel(input_block_shape_ukt1, input1_buffer_multiplier, input1_df, override_enabled).size_bytes(); - - // Find max u_kt given input dims - int max_u_kt_dimensionwise = calculate_max_u_kt_sparse(graph, op_node, u_kt_override); - TT_ASSERT(max_u_kt_dimensionwise > 0); - - // Additional limits on u_kt (memory- and encoding- imposed constraints) - const sparse::SparseBUDA& sparse_buda = - graph->data_operands(op_node)[0]->as()->get_sparse_buda(); - int max_u_kt_memorywise = (int)(leftover_l1_space / min_buffer_mem); // sparse op - if (env_as("PYBUDA_SPARSE_BUFFER_ENABLE")) - { - max_u_kt_memorywise = std::min( - max_u_kt_memorywise, - (int)((input_l1_buffer_space + BufferModel(output_block_shape, 2, op_node->output_df()).size_bytes()) / (min_buffer_mem * 2)) // buffer op, where both input and output grow with u_kt - ); - } - int max_u_kt_encodingwise = sparse_buda.get_max_u_kt(grid_shape.r, t_stream_factor.r, output_block_shape.ublock.rt); - std::size_t k_factor = std::max(1, max_u_kt_memorywise); - TT_ASSERT(k_factor <= INT_MAX); - auto legal_factors = FactorizedInt(max_u_kt_dimensionwise); - TT_ASSERT(not legal_factors.empty()); - - TT_ASSERT(not t_stream_factor.is_streaming_r() or not legal_sparse_u_kts.empty()); - if (t_stream_factor.is_streaming_r()) - { - std::vector const& legal_u_kts = legal_sparse_u_kts.at(t_stream_factor.r); - TT_ASSERT(not legal_u_kts.empty()); - legal_factors = legal_factors & FactorizedInt(legal_u_kts.begin(), legal_u_kts.end()); - if (legal_factors.empty()) - return {}; - } - - if (static_cast(k_factor) < legal_factors.get_min_factor()) - return {}; - int u_kt_memorywise = legal_factors.get_nearest_factor_le(static_cast(k_factor)); // Limit u_kt by memory - - if (static_cast(max_u_kt_encodingwise) < legal_factors.get_min_factor()) - return {}; - int u_kt_encodingwise = - legal_factors.get_nearest_factor_le(static_cast(max_u_kt_encodingwise)); // Limit u_kt by encoding limits - - // u_kt is now the min of memory/encoding limits - int u_kt = std::max(1, std::min(u_kt_memorywise, u_kt_encodingwise)); - - // Recreate buffer model with new u_kt - BlockShape input_block_shape = BlockShape( - input1, - GridShape(1, grid_shape.c / fracture_factor), - 1, - output_block_shape.mblock_n, - UBlockShape(u_kt, output_block_shape.ublock.ct)); - BufferModel buffer_model1 = BufferModel(input_block_shape, input1_buffer_multiplier, input1_df); - - return {buffer_model0, buffer_model1, buffer_model2}; -} - -static std::vector calculate_depthwise_input_buffer_models_for_l1_budget( - graphlib::Graph const*, - const graphlib::BudaOpNode*, - GridShape grid_shape, - OpShape op_shape, - std::vector const& operands, - BlockShape const& output_block_shape, - TStreamFactor const&, - std::size_t input_l1_buffer_space, - int, - std::unordered_map, - LegalSparseUKts const&, - int, - const std::map& input_multiplier_overrides) -{ - 
TT_ASSERT(op_shape.inputs.size() >= 2 && op_shape.inputs.size() <= 3); - TT_ASSERT(operands.size() >= 2 && operands.size() <= 3); - std::vector input_buffers; - DataFormat input0_df = operands[0]->output_df(); - DataFormat input1_df = operands[1]->output_df(); - TensorShape input0 = op_shape.inputs[0]; - TensorShape input1 = op_shape.inputs[1]; - int u_kt = 1; // HLK-imposed limit - - std::optional fused_bias; - if (operands.size() == 3) - { - // fused bias - TensorShape input2 = op_shape.inputs[2]; - DataFormat input2_df = operands[2]->output_df(); - UBlockShape input2_ublock = output_block_shape.ublock; - BlockShape input2_block_shape(input2, GridShape(1, grid_shape.c), 1, 1, input2_ublock); - auto [input_buffer_multiplier, override_enabled] = - calculate_input_multiplier(input_multiplier_overrides, 2, input2, input2_ublock); - fused_bias = BufferModel(input2_block_shape, input_buffer_multiplier, input2_df, override_enabled); - - // Carve out fused operand before calculating u_kt - if (fused_bias->size_bytes() <= input_l1_buffer_space) - input_l1_buffer_space -= fused_bias->size_bytes(); - } - - // HLK doesn't support minimizing op1, but op0 is okay - bool minimize_op0 = not env_as("PYBUDA_DISABLE_MIN_DEPTHWISE_BUFFER", false); - - UBlockShape input0_ublock(output_block_shape.ublock.rt, output_block_shape.ublock.ct); - UBlockShape input1_ublock(u_kt, output_block_shape.ublock.ct); - - BlockShape input0_block_shape( - input0, GridShape(grid_shape.r, 1), minimize_op0 ? 1 : output_block_shape.mblock_m, 1, input0_ublock); - BlockShape input1_block_shape(input1, GridShape(1, grid_shape.c), 1, output_block_shape.mblock_n, input1_ublock); - - auto [input0_buffer_multiplier, override_enabled0] = - calculate_input_multiplier(input_multiplier_overrides, 0, input0, input0_ublock); - input_buffers.emplace_back(input0_block_shape, input0_buffer_multiplier, input0_df, override_enabled0); - input_buffers[0].minimize_input_buffer = minimize_op0; - - auto [input1_buffer_multiplier, override_enabled1] = - calculate_input_multiplier(input_multiplier_overrides, 1, input1, input1_ublock); - input_buffers.emplace_back(input1_block_shape, input1_buffer_multiplier, input1_df, override_enabled1); - input_buffers[1].minimize_input_buffer = false; // HLK-imposed limit - - if (fused_bias) - { - fused_bias->buffer_factor = std::max((std::uint32_t)fused_bias->buffer_factor / u_kt, (std::uint32_t)2); - input_buffers.push_back(*fused_bias); - } - - return input_buffers; -} - -static std::vector calculate_eltwise_input_buffer_models_for_l1_budget( - graphlib::Graph const*, - const graphlib::BudaOpNode*, - GridShape grid_shape, - OpShape op_shape, - std::vector const& operands, - BlockShape const& output_block_shape, - TStreamFactor const&, - std::size_t, - int, - std::unordered_map, - LegalSparseUKts const&, - int, - const std::map& input_multiplier_overrides) -{ - TT_ASSERT(op_shape.inputs.size() == operands.size()); - - std::vector input_buffers; - for (int input_idx = 0; input_idx < (int)op_shape.inputs.size(); ++input_idx) - { - TensorShape const& input = op_shape.inputs[input_idx]; - BlockShape input_block_shape(input, grid_shape, 1, 1, output_block_shape.ublock); - auto [input_buffer_multiplier, override_enabled] = - calculate_input_multiplier(input_multiplier_overrides, input_idx, input, output_block_shape.ublock); - input_buffers.emplace_back( - input_block_shape, input_buffer_multiplier, operands[input_idx]->output_df(), override_enabled); - } - return input_buffers; -} - -static std::vector 
calculate_reduce_input_buffer_models_for_l1_budget( - graphlib::Graph const* graph, - const graphlib::BudaOpNode* op_node, - GridShape grid_shape, - OpShape op_shape, - std::vector const& operands, - BlockShape const& output_block_shape, - TStreamFactor const& t_stream_factor, - std::size_t input_l1_buffer_space, - int, - std::unordered_map, - LegalSparseUKts const&, - int, - const std::map& input_multiplier_overrides) -{ - const graphlib::OpType& op_type = op_node->op_type(); - TT_ASSERT(op_type.op == "reduce"); - TT_ASSERT(op_shape.inputs.size() == operands.size()); - TT_ASSERT(op_shape.inputs.size() == 1); - - if (std::get(op_type.buda_attrs.at("dim")) == "z") - { - return calculate_eltwise_input_buffer_models_for_l1_budget( - graph, - op_node, - grid_shape, - op_shape, - operands, - output_block_shape, - t_stream_factor, - input_l1_buffer_space, - 1, - {}, - {}, - 0, - input_multiplier_overrides); - } - - auto calc_u_kt = [](std::size_t input_l1_buffer_space, - int reduce_dim_tiles, - int non_reduce_dim_ublock_tiles, - DataFormat df) -> int - { - std::size_t non_reduce_dim_bytes = non_reduce_dim_ublock_tiles * tile_size_bytes(df); - std::size_t k_factor = std::max(std::size_t(1), input_l1_buffer_space / (non_reduce_dim_bytes * 2)); - int u_kt = FactorizedInt(reduce_dim_tiles).get_nearest_factor_le(static_cast(k_factor)); - return u_kt; - }; - - graphlib::Node* operand = operands[0]; - TensorShape input = op_shape.inputs[0]; - UBlockShape input_ublock; - if (std::get(op_type.buda_attrs.at("dim")) == "r") - { - int u_kt = calc_u_kt(input_l1_buffer_space, input.rt, output_block_shape.ublock.ct, operand->output_df()); - input_ublock = UBlockShape(u_kt, output_block_shape.ublock.ct); - } - else - { - TT_ASSERT(std::get(op_type.buda_attrs.at("dim")) == "c"); - int u_kt = calc_u_kt(input_l1_buffer_space, input.ct, output_block_shape.ublock.rt, operand->output_df()); - input_ublock = UBlockShape(output_block_shape.ublock.rt, u_kt); - } - - BlockShape input_block_shape(input, grid_shape, 1, 1, input_ublock); - auto [input_buffer_multiplier, override_enabled] = - calculate_input_multiplier(input_multiplier_overrides, 0, input, input_ublock); - return {BufferModel(input_block_shape, input_buffer_multiplier, operand->output_df(), override_enabled)}; -} - -static std::vector calculate_embedding_input_buffer_models_for_l1_budget( - graphlib::Graph const*, - const graphlib::BudaOpNode* op_node, - GridShape grid_shape, - OpShape op_shape, - std::vector const& operands, - BlockShape const& output_block_shape, - TStreamFactor const&, - std::size_t, - int, - std::unordered_map, - LegalSparseUKts const&, - int, - const std::map&) -{ - const graphlib::OpType& op_type = op_node->op_type(); - TT_ASSERT(op_type.op == "embedding"); - TT_ASSERT(op_shape.inputs.size() == operands.size()); - TT_ASSERT(op_shape.inputs.size() == 2); - - BlockShape embedding_table_block_shape( - op_shape.inputs[0], GridShape(1, grid_shape.c), 1, 1, UBlockShape(1, output_block_shape.ublock.ct)); - BufferModel embedding_table(embedding_table_block_shape, 2, operands[1]->output_df()); - - TensorShape indices_shape = op_shape.inputs[1]; - TT_ASSERT((indices_shape.ct % grid_shape.r) == 0); - indices_shape.rt = indices_shape.rt * grid_shape.r; - indices_shape.ct = indices_shape.ct / grid_shape.r; - BlockShape indices_block_shape(indices_shape, GridShape(grid_shape.r, 1), 1, 1, UBlockShape(1, 1)); - BufferModel indices(indices_block_shape, 2, operands[1]->output_df()); - - return {embedding_table, indices}; -} - -static std::vector 
calculate_fused_input_buffer_models_for_l1_budget( - graphlib::Graph const*, - const graphlib::BudaOpNode* op_node, - GridShape grid_shape, - OpShape op_shape, - std::vector const& operands, - BlockShape const&, - TStreamFactor const&, - std::size_t, - int, - std::unordered_map fused_op_ublock_shape, - LegalSparseUKts const&, - int, - const std::map& input_multiplier_overrides) -{ - TT_ASSERT(op_shape.inputs.size() == operands.size()); - - std::vector input_buffers; - std::vector visited(op_shape.inputs.size(), false); - for (FusedSchedule const& schedule : op_node->get_fused_op()->get_schedules()) - { - for (FusedSubOp const& sub_op : schedule.ops) - { - // Id of input for this specific sub op. - int input_id = -1; - for (FusedSubOpInput const& sub_input : sub_op.inputs) - { - input_id++; - - if (sub_input.type != FusedSubOpInput::InputType::INPUT) - continue; - - TT_ASSERT(sub_input.index < op_shape.inputs.size()); - TT_ASSERT(visited[sub_input.index] == false); - - if (visited[sub_input.index]) - continue; - visited[sub_input.index] = true; - - TensorShape const& input = op_shape.inputs[sub_input.index]; - UBlockShape ublock_shape = fused_op_ublock_shape.at(sub_op.name); - - // Special case for fused matmul op. - if (sub_op.op_type == "matmul") - { - int u_kt = 1; // The only legal u_kt for matmul sub-ops - if (input_id == 0) - ublock_shape.ct = u_kt; - else - ublock_shape.rt = u_kt; - } - - BlockShape input_block_shape(input, grid_shape, 1, 1, ublock_shape); - auto [input_buffer_multiplier, override_enabled] = - calculate_input_multiplier(input_multiplier_overrides, sub_input.index, input, ublock_shape); - input_buffers.emplace_back( - input_block_shape, - input_buffer_multiplier, - operands[sub_input.index]->output_df(), - override_enabled); - } - } - } - TT_ASSERT(input_buffers.size() == op_shape.inputs.size()); - - return input_buffers; -} - -template -static std::vector calculate_input_buffer_models( - graphlib::Graph const* graph, const graphlib::BudaOpNode* op_node, Args... 
args) -{ - if (op_node->is_fused_op()) - { - return calculate_fused_input_buffer_models_for_l1_budget(graph, op_node, args...); - } - else if (op_node->is_matmul() and not op_node->is_sparse_matmul() and not op_node->is_depthwise_matmul()) - { - return calculate_matmul_input_buffer_models_for_l1_budget(graph, op_node, args...); - } - else if (op_node->is_sparse_matmul()) - { - return calculate_sparse_matmul_input_buffer_models_for_l1_budget(graph, op_node, args...); - } - else if (op_node->is_depthwise_matmul()) - { - return calculate_depthwise_input_buffer_models_for_l1_budget(graph, op_node, args...); - } - else if (op_node->op_name() == "reduce") - { - return calculate_reduce_input_buffer_models_for_l1_budget(graph, op_node, args...); - } - else if (op_node->op_name() == "embedding") - { - return calculate_embedding_input_buffer_models_for_l1_budget(graph, op_node, args...); - } - - return calculate_eltwise_input_buffer_models_for_l1_budget(graph, op_node, args...); -} - -// Returns the length of the pattern for kernel broadcast, by creating the pipe for the consumer and producer, and -// analyzing the addresses of tiles being sent -// -int get_kernel_broadcast_len( - std::unordered_map* const kb_cache, - graphlib::Graph const* graph, - balancer::OpModel const& op_model, - graphlib::Edge const& edge, - graphlib::EdgeAttributes* edge_attr, - std::vector const& tms) -{ - log_trace(LogKernelBroadcast, " get_kernel_broadcast_len, operand index: {}", edge.consumer_input_port_id); - - graphlib::Node* producer = graph->node_by_id(edge.producer_node_id); - - // Don't kernel broadcast if producer is a buda op - // - if (producer->node_type() == graphlib::NodeType::kBudaOp) - { - return 0; - } - - // Don't kernel broadcast from producers with shape.z > 1 - // - if (producer->shape().z() > 1) - { - // This constraint is imposed by net2pipe, the test below errors out if this constraint is removed - // - // pybuda/test/test_constraints.py::test_stream_stacking_transpose - // ERROR: TM ERROR (producer = input_1_mm0, consumer = mm0): with kernel broadcast that's not per-t, - // producer must have t = 1 and buf_size_mb = 1 or 2 - // - return 0; - } - - // Don't kernel broadcast from buffering and e2e queues - // - if (producer->node_type() == graphlib::NodeType::kQueue and - (producer->as()->queue_type() == graphlib::QueueNodeType::Buffering or - producer->as()->queue_type() == graphlib::QueueNodeType::EpochToEpoch)) - { - // This helps us avoid user inserted queues and virtual queues, all of which can end up having t > 1 - // - return 0; - } - - // If there's no broadcasts, there's nothing to kernel broadcast - // - if (not std::any_of(tms.begin(), tms.end(), [](auto const& op_type) { return op_type.op == "broadcast"; })) - { - return 0; - } - - // If a producer is a single tile or if each consumer core "sees" a single tile from producer, we can return early - // out and return a pattern length of 1 - // There are some other edge cases where the pattern will be of length 1, but those are harder to detect for early - // out scenarios. 
We delegate the responsibility of finding those to pattern detection - // - if (producer->shape().is_single_tile()) - { - return 1; - } - - // Producer cannot be a buda op - // OpModel must belong to a buda op - // - TT_ASSERT(producer->node_type() != graphlib::NodeType::kBudaOp); - TT_ASSERT(op_model.buda_op_node); - - // Producer tile layout is not relevant, so we simplify by using default/trivial values - // - TileLayout producer_tile_layout = TileLayout( - GridShape(1, 1), // this can always be 1x1 - BlockShape(producer->shape(), 1, 1, 1, UBlockShape(1, 1)).canonical(), // doesn't matter what the dims are - graphlib::UBlockOrder::R, // this doesn't matter, so always set to R - Padding()); // queues don't have padding on them - - // As matmul doesn't have eltwise-like pipes, but rather multicasts in some dimensions, we need to adjust the grid - // shape of the consumer to not divide a dimension like an eltwise-style pipe would - // - GridShape consumer_grid_shape = op_model.grid_shape; - if (op_model.buda_op_node->is_matmul()) - { - consumer_grid_shape = edge.consumer_input_port_id == 0 ? GridShape(consumer_grid_shape.r, 1) - : GridShape(1, consumer_grid_shape.c); - } - - // Consumer tile layout - describes the input buffer - // - TileLayout consumer_tile_layout = TileLayout( - consumer_grid_shape, - op_model.input_buffers[edge.consumer_input_port_id].block_shape.canonical(), - edge_attr->get_ublock_order(), - Padding()); // padding isn't relevant for input buffer of a consumer op which has padding set - - // Create the pipe - // - Pipe pipe( - producer_tile_layout, - 1, // producer_out_buf_mb is not relevant for kernel broadcast - tms, - consumer_tile_layout); - - // Check if pipe exists in cache - // - if (kb_cache) - { - auto match = kb_cache->find(pipe); - if (match != kb_cache->end()) - { - log_trace(LogKernelBroadcast, " Found in cache - len: {}", match->second); - return match->second; - } - } - - int pattern_len = detect_repetitive_pattern(kb_cache, pipe); - - return pattern_len; -} - -static void try_promote_kernel_broadcast_inputs( - std::unordered_map* const kb_cache, - graphlib::Graph const* graph, - graphlib::OpNode const* op_node, - std::size_t l1_usable_size, - OpModel& op_model) -{ - // Check if kernel broadcasting is disabled - // - static const bool disable_kernel_broadcast = env_as("PYBUDA_DISABLE_KERNEL_BROADCAST"); - if (disable_kernel_broadcast) - { - return; - } - - // Embedding, tilize and reduce ops don't support kernel broadcasting - // - if (op_node->is_embedding() || op_node->is_tilize() || op_node->is_reduce()) - { - return; - } - - log_trace( - LogKernelBroadcast, - "try_promote_kernel_broadcast_inputs, op: {}, op model id: {:8}", - op_node->name(), - op_model.id.id); - - // Check each edge for kernel broadcasting - // - for (graphlib::Edge const& edge : graph->operand_data_edges(op_node)) - { - // Sparse matmul's in0 is always fully prologued - // - if (op_node->is_sparse_matmul() and edge.consumer_input_port_id == 0) - { - continue; - } - - graphlib::Node const* producer = graph->node_by_id(edge.producer_node_id); - - auto attr = graph->get_edge_attributes(edge); - auto tms = attr->get_tms(); - insert_t_stream_tms(op_node, tms, op_model.t_stream_factor, TStreamFactor{}, edge.consumer_input_port_id); - - static const bool use_legacy_kernel_broadcast_path = env_as("PYBUDA_LEGACY_KERNEL_BROADCAST"); - if (use_legacy_kernel_broadcast_path) - { - log_trace(LogKernelBroadcast, " Using legacy path..."); - if (not tms_support_kernel_broadcast( - 
producer->shape(), tms, attr->get_ublock_order(), op_model.block_shape().ublock.ct)) - continue; - - bool single_tile = producer->shape().is_single_tile(); - TensorShape shape = post_tms_shape(producer->shape(), tms, graphlib::ignore_broadcast_tm_evaluator); - int input_idx = edge.consumer_input_port_id; - bool is_prologue = bool(op_model.parameter_buffers[input_idx]); - int per_core_rt = round_up_div( - shape.rt, (op_node->is_matmul() and edge.consumer_input_port_id == 1) ? 1 : op_model.grid_shape.r); - int per_core_ct = round_up_div( - shape.ct, (op_node->is_matmul() and edge.consumer_input_port_id == 0) ? 1 : op_model.grid_shape.c); - UBlockShape ublock = single_tile ? UBlockShape(1, 1) : op_model.input_buffers[input_idx].block_shape.ublock; - int t = shape.z; - int mblock_m = round_up_div(per_core_rt, ublock.rt); - int mblock_n = round_up_div(per_core_ct, ublock.ct); - BlockShape block_shape(t, mblock_m, mblock_n, ublock); - BufferModel l1_buffer_model(block_shape, is_prologue ? 1 : 2, producer->output_df()); - // Kernel always wants programming like it is single buffered - BufferModel kernel_buffer_model(block_shape, 1, producer->output_df()); - - static const bool include_t = - use_legacy_kernel_broadcast_path; // we don't actually want to include t in the size calculation, but - // we did use it in legacy path, keeping it for bwd compatibility - std::size_t current_input_size = - op_model.input_buffers[input_idx].size_bytes() + op_model.parameter_buffers[input_idx].size_bytes(); - TT_ASSERT(current_input_size <= op_model.get_l1_memory_usage()); - std::size_t adjusted_memory_usage = - op_model.get_l1_memory_usage() - current_input_size + l1_buffer_model.size_bytes(include_t); - if (adjusted_memory_usage <= l1_usable_size) - { - // Clobber the input/param buffer's allocation size with adjusted kernel broadcast size / zero to - // reflect their new L1 footprint. 
Leave the blocking information intact so canonical form checks work - // as is - op_model.input_buffers[input_idx].kernel_broadcast_tiles = kernel_buffer_model.size_tiles(include_t); - op_model.input_buffers[input_idx].l1_size_tiles = l1_buffer_model.size_tiles(include_t); - op_model.parameter_buffers[input_idx].l1_size_tiles = 0; - } - - continue; - } - - // Default kernel broadcast path (non-legacy) - - // Get kernel_broadcast len (0 if no pattern) - // - int kb_len = get_kernel_broadcast_len(kb_cache, graph, op_model, edge, attr.get(), tms); - if (not kb_len) - { - continue; - } - - const int input_idx = edge.consumer_input_port_id; - - // This is the number of tiles that a single consumer core will need to "see" from producer op in order to - // produce a single mblock of output - // - const int producer_tiles_single_mblock = - op_model.input_buffers[input_idx].block_shape.canonical().volume_no_t(); - - // kb_len should be *no bigger* than producer_tiles_single_mblock - // - TT_ASSERT( - kb_len <= producer_tiles_single_mblock, - "kb_len: {}, producer_tiles_single_mblock: {}", - kb_len, - producer_tiles_single_mblock); - - // If kernel broadcast fits into L1, set it on the input buffer - // - int kb_mem_footprint = kb_len * tile_size_bytes(producer->output_df()); - std::size_t current_input_size = - op_model.input_buffers[input_idx].size_bytes() + op_model.parameter_buffers[input_idx].size_bytes(); - TT_ASSERT(current_input_size <= op_model.get_l1_memory_usage()); - std::size_t adjusted_memory_usage = op_model.get_l1_memory_usage() - current_input_size + kb_mem_footprint; - if (adjusted_memory_usage <= l1_usable_size) - { - // Change the l1_size_tiles property of input buffer to reflect memory footprint of kernel broadcast. - // Additionally, change prologue buffer to 0 since it is no longer needed. - // Leaving the blocking information intact so canonical form checks work as is. - // - op_model.input_buffers[input_idx].kernel_broadcast_tiles = kb_len; - op_model.input_buffers[input_idx].l1_size_tiles = kb_len; - op_model.parameter_buffers[input_idx].l1_size_tiles = 0; - - log_trace( - LogKernelBroadcast, - " Kernel broadcast detected on op {}, op model id: {}, operand id {}, kernel broadcast length: {}", - op_node->name(), - op_model.id.id, - input_idx, - kb_len); - - TT_ASSERT(op_model.get_l1_memory_usage() == adjusted_memory_usage); - } - } -} - -static std::optional find_max_parameter_buffer_l1_user(OpModel const& op_model, bool is_sparse_matmul) -{ - if (is_sparse_matmul) - { - // Only encodings can be streamed from dram, sparse tiles cannot - constexpr int encodings_parameter_index = 2; - return (op_model.parameter_buffers.size() > 2 and op_model.parameter_buffers[2]) - ? std::optional(encodings_parameter_index) - : std::nullopt; - } - - std::size_t max = 0; - int max_idx = 0; - for (int i = 0; i < (int)op_model.parameter_buffers.size(); ++i) - { - auto const& parameter_buffer = op_model.parameter_buffers[i]; - if (parameter_buffer and max < parameter_buffer.size_bytes(true)) - { - max = parameter_buffer.size_bytes(true); - max_idx = i; - } - } - - return max ? 
std::optional<int>(max_idx) : std::nullopt;
-}
-
-static std::vector<BufferModel> upsize_output_buffer(
-    graphlib::Graph const* graph,
-    std::vector<BufferModel> output_buffers,
-    std::size_t l1_remaining_size,
-    bool is_gradient_op)
-{
-    int factor = int(l1_remaining_size / output_buffers[0].size_bytes());
-    if (factor < 2 or is_gradient_op)
-        return output_buffers;
-    int microbatch = graph->get_microbatch();
-    factor = FactorizedInt(microbatch).get_nearest_factor_le(factor);
-    if (divisible_either_direction(output_buffers[0].block_shape.t, factor))
-        output_buffers[0].buffer_factor *= factor;
-    return output_buffers;
-}
-
-static std::pair<OpModelFailureReason, std::string> validate_memory_requirements(
-    OpModel const& op_model, std::size_t l1_usable_size, std::size_t dram_channel_capacity)
-{
-    std::size_t buffer_usage_bytes = op_model.get_l1_memory_usage();
-    if (buffer_usage_bytes > l1_usable_size)
-    {
-        return std::make_pair(
-            L1UsageOverMaxLimit, fmt::format("L1 Usage[{}] > L1 Max[{}]", buffer_usage_bytes, l1_usable_size));
-    }
-
-    for (BufferModel const& dram_buffer : op_model.dram_buffers)
-    {
-        constexpr bool include_t = true;
-        if (dram_buffer.size_bytes(include_t) > dram_channel_capacity)
-        {
-            return make_pair(
-                ExceededDramChannelCapacity,
-                fmt::format(
-                    "Exceeded DRAM channel capacity: Buffer Usage[{}] DRAM Channel[{}]",
-                    dram_buffer.size_bytes(include_t),
-                    dram_channel_capacity));
-        }
-    }
-
-    return std::make_pair(NoFailure, "");
-}
-
-static bool unpadding_producer_macroblock(Graph const* graph, graphlib::OpNode const* op_node, BlockShape block_shape)
-{
-    // Constraint 1. Unpadding must be less than the producer grid's macroblock size in both r & c dimensions.
-    //   - unpad_rt < producer_mb_r * producer_ub_r and unpad_ct < producer_mb_c * producer_ub_c
-    //   - This constraint ensures that every core within the producer grid's output kernel buffer is popped.
-    //     Otherwise, data will backpressure and the system will hang.
-    //   - This should be a rational constraint because if the data is never read, we should never be producing the
-    //     data.
-
-    // Extract macroblock and microblock sizes from the block shape
-    int producer_mb_r = block_shape.mblock_m;
-    int producer_mb_c = block_shape.mblock_n;
-    int producer_ub_r = block_shape.ublock.rt;
-    int producer_ub_c = block_shape.ublock.ct;
-
-    // Iterate through outgoing edges to get unpad nodes for the particular op node
-    std::vector<graphlib::Edge> outgoing_edges = graph->user_data_edges(op_node);
-    for (graphlib::Edge outgoing_edge : outgoing_edges)
-    {
-        vector<OpType> tms = graph->get_edge_attributes(outgoing_edge)->get_tms();
-        for (OpType op_type : tms)
-        {
-            if (op_type.op == "buda_unpad")
-            {
-                int unpad_rt = std::get<int>(op_type.buda_attrs["rt"]);
-                int unpad_ct = std::get<int>(op_type.buda_attrs["ct"]);
-                if (unpad_rt >= producer_mb_r * producer_ub_r || unpad_ct >= producer_mb_c * producer_ub_c)
-                {
-                    return false;
-                }
-
-                // We break on first buda_unpad node because we assume that there is only one buda_unpad TM per outgoing
-                // edge
-                break;
-            }
-        }
-    }
-
-    return true;
-}
-
-static bool padding_consumer_macroblock(Graph const* graph, graphlib::OpNode const* op_node, BlockShape block_shape)
-{
-    // Padding must be less than the consumer grid's macroblock size in both r & c dimensions.
-    //   - pad_rt < consumer_mb_r * consumer_ub_r and pad_ct < consumer_mb_c * consumer_ub_c
-    //   - This constraint ensures that every core in the consumer grid is producing some functional data.
- // - This should be a rational constraint because if we are padding more than an additional macro block, - // this means that we are using cores to compute only padding and that we could have satisfied divisibility - // constraints with a smaller padding. - - // Extract macroblock and microblock sizes from the block shape - int consumer_mb_r = block_shape.mblock_m; - int consumer_mb_c = block_shape.mblock_n; - int consumer_ub_r = block_shape.ublock.rt; - int consumer_ub_c = block_shape.ublock.ct; - - // Iterate through incoming edges to get pad nodes for the particular op node - std::vector incoming_edges = graph->operand_data_edges(op_node); - for (graphlib::Edge incoming_edge : incoming_edges) - { - vector tms = graph->get_edge_attributes(incoming_edge)->get_tms(); - for (OpType op_type : tms) - { - if (op_type.op == "buda_pad") - { - // Extract padding from the op type - int pad_rt = std::get(op_type.buda_attrs["rt"]); - int pad_ct = std::get(op_type.buda_attrs["ct"]); - if (pad_rt >= consumer_mb_r * consumer_ub_r || pad_ct >= consumer_mb_c * consumer_ub_c) - { - return false; - } - - // We break on first buda_pad node because we expect only one buda_pad TM per op node edge - break; - } - } - } - - return true; -} - -static bool padding_multiple_pre_stack(Graph const* graph, graphlib::OpNode const* op_node) -{ - // If stacking TMs without full t buffering are used on the data transformation path, - // padding must be a multiple of the pre-stacked dimension in the dimension of the stack. - // - pad_rt % pre_stack_rt == 0 if vstack - // - pad_ct % pre_stack_ct == 0 if hstack - // - This is a hard constraint due to the nature of the underlying output scatter pipes that implement - // stacking. - - // Iterate through incoming edges to get pad nodes for the particular op node - // with aim to check if pad is a multiple of the pre-stacked dimension. 
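// Worked example (illustrative numbers only) of the padding constraints described above: for a consumer
// block with mblock 2x4 and ublock 2x1 tiles, buda_pad amounts must satisfy pad_rt < 2*2 and
// pad_ct < 4*1, and if a vstack with a pre-stack row dimension of 3 tiles precedes the pad, pad_rt must
// also be a multiple of 3. All names and values here are assumptions made for the sake of the example.
#include <cassert>

int main()
{
    const int mb_r = 2, mb_c = 4, ub_r = 2, ub_c = 1;  // hypothetical consumer macroblock/microblock
    const int pad_rt = 3, pad_ct = 2;                  // hypothetical buda_pad attributes
    const int pre_stack_rt = 3;                        // hypothetical pre-vstack row tiles

    assert(pad_rt < mb_r * ub_r && pad_ct < mb_c * ub_c);  // consumer macroblock constraint
    assert(pad_rt % pre_stack_rt == 0);                    // multiple-of-pre-stack constraint
    return 0;
}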
- std::vector incoming_edges = graph->operand_data_edges(op_node); - for (graphlib::Edge incoming_edge : incoming_edges) - { - // Get producer node and its shape - graphlib::NodeId incoming_node_id = incoming_edge.producer_node_id; - Node* incoming_node = graph->node_by_id(incoming_node_id); - Shape shape = incoming_node->shape(); // init shape of the operation before PAD - - // Init pre-stack dimension values - int pre_stack_rt = 1; - int pre_stack_ct = 1; - - vector tms = graph->get_edge_attributes(incoming_edge)->get_tms(); - // Iterate through the all TMs - for (int i = 0; i < (int)tms.size() - 1; i++) - { - shape = ::get_tm_shape(tms[i], shape, true); - - // Check if TM sequence contains stack operation - if (tms[i].op == "vstack" || tms[i].op == "hstack") - { - // Check if padding is after stack - if (tms[i + 1].op == "buda_pad") - { - std::vector shape_vect = shape.as_vector(); - std::uint32_t shape_size = shape.size(); - if (tms[i].op == "vstack") - { - pre_stack_rt = shape[shape_size - 1]; - } - else if (tms[i].op == "hstack") - { - pre_stack_ct = shape[shape_size - 2]; - } - - // Check if the pad is a multiple of the pre-stacked dimension - int pad_rt = std::get(tms[i + 1].buda_attrs["rt"]); - int pad_ct = std::get(tms[i + 1].buda_attrs["ct"]); - if (pad_rt % pre_stack_rt != 0 || pad_ct % pre_stack_ct != 0) - { - return false; - } - } - } - } - } - - return true; -} - -// Padding constraints -static bool check_padding_constraints(Graph const* graph, graphlib::OpNode const* op_node, BlockShape block_shape) -{ - bool padding_constraints_satisfied = true; - - // Constraint 1 - padding_constraints_satisfied &= unpadding_producer_macroblock(graph, op_node, block_shape); - // Constraint 2 - padding_constraints_satisfied &= padding_consumer_macroblock(graph, op_node, block_shape); - // Constraint 3 - padding_constraints_satisfied &= padding_multiple_pre_stack(graph, op_node); - - return padding_constraints_satisfied; -} - -static std::pair calculate_op_model_impl( - Graph const* graph, - std::shared_ptr cache_collection, - graphlib::BudaOpNode const* op_node, - GridShape selected_grid, - TStreamFactor t_stream_factor, - UBlockOrder ublock_order, - bool force_dram_parameters, - std::size_t dst_size_tiles, - std::size_t l1_usable_size, - std::size_t dram_channel_capacity, - std::string& customFailureMessage, - int fracture_factor, - bool sparse_buffer_enable, - LegalSparseUKts const& legal_sparse_u_kts, - int u_kt_override, - std::map const& min_input_buffer_factor_overrides, - std::optional output_buffer_factor_override, - bool fallback_single_buffer) -{ - OpModel op_model; - - op_model.grid_shape = selected_grid; - // if sparse matmul, op shape depends on ublock shape (u_rt and u_kt for sparse) - - // it needs to be updated after they have been chosen - op_model.op_shape = get_op_shape(graph, op_node, selected_grid, 1, 1, t_stream_factor, fracture_factor); - op_model.buda_op_node = op_node; - op_model.data_format = op_node->output_df(); - op_model.t_stream_factor = t_stream_factor; - op_model.fracture_factor = fracture_factor; - auto [pad_rt, pad_ct] = graphlib::get_padding(graph, op_node); - op_model.padding = Padding(pad_rt, pad_ct); - - int streaming_threshold = env_as("PYBUDA_SUPRESS_T_FACTOR_MM", 0); - if (streaming_threshold and op_node->is_matmul()) - { - if (t_stream_factor.t() > streaming_threshold) - return std::make_pair(op_model, IllegalStreaming); - } - - TT_ASSERT(op_model.op_shape.outputs.size() == 1, "Currently we only support 1 output for 1 ublock shape below"); - 
Parallelization total_par(selected_grid.r * t_stream_factor.r, selected_grid.c * t_stream_factor.c); - UBlockShape ublock; - std::tie(ublock, op_model.fused_op_ublock_shape) = - calculate_ublock_shape(op_model.op_shape, total_par, dst_size_tiles, ublock_order, op_node); - - // Calculate output_buffer_factor (buf_size_mb) - auto [calculated_user_buffer_factor, user_access_allows_streaming, is_legal_stack_for_grid] = - calculate_user_buffer_factor(graph, op_node, ublock_order, op_model); - - if (not is_legal_stack_for_grid) - return std::make_pair(op_model, IllegalStackForGrid); - - int output_buffer_factor = - get_output_buffer_factor(op_node, calculated_user_buffer_factor, output_buffer_factor_override); - - if (not op_model.t_stream_factor.none() and not user_access_allows_streaming) - { - return std::make_pair(op_model, UserAccessPreventsStreaming); - } - - op_model.effective_input_buffer_shape_for_user = - calculate_effective_input_buffer_shapes_for_users(graph, op_node, t_stream_factor); - - // Calculate output buffer shape - op_model.output_buffers = calculate_output_buffer_models_for_grid( - op_model.op_shape, - selected_grid, - ublock, - op_model.t_stream_factor, - output_buffer_factor, - op_node->is_gradient_op(), - op_node->output_df()); - - if (not legal_t_streaming(op_model.output_buffers[0].block_shape, op_model.t_stream_factor, ublock_order)) - return std::make_pair(op_model, IllegalStreaming); - - // Calculate parameter buffer shapes - std::vector operands = graph->data_operands(op_node); - op_model.parameter_buffers = - calculate_parameter_buffer_models_for_grid(op_model.op_shape, operands, selected_grid, force_dram_parameters); - - // Calculate intermediate buffer shapes - TT_ASSERT(op_model.output_buffers.size() == 1); - op_model.intermediate_buffers = calculate_intermediate_buffer_models_for_grid( - op_node, op_model.output_buffers[0], op_model.fused_op().get(), op_model.fused_op_ublock_shape); - - // Calculate input buffer shapes - - // After output + parameters have been allocated try to pick input buffers that'll fit in the remaining space - // Try with parameters in L1 first time around, then fallback to streaming the parameters if we can't fit - // Note: sparse matmul must be able to fit in0 and in2 as parameters - BlockShape const& output_block_shape = op_model.output_buffers[0].block_shape; - - bool padding_constraints_satisfied = check_padding_constraints(graph, op_node, output_block_shape); - if (not padding_constraints_satisfied) - return std::make_pair(op_model, PaddingConstraintsNotSatisfied); - - int fallback_loop_count = op_node->is_sparse_matmul() ? 
3 : op_model.num_parameter_buffers() + 2; - for (int i = 0; i < fallback_loop_count; ++i) - { - bool try_fallback = (i >= 1); - std::optional potential_max_l1_user = - find_max_parameter_buffer_l1_user(op_model, op_node->is_sparse_matmul()); - if (try_fallback and potential_max_l1_user) - { - int max_l1_user = *potential_max_l1_user; - if (op_model.dram_buffers.empty()) - op_model.dram_buffers.resize(op_model.parameter_buffers.size()); - op_model.dram_buffers[max_l1_user] = op_model.parameter_buffers[max_l1_user]; - op_model.parameter_buffers[max_l1_user] = BufferModel{}; - log_trace( - LogBalancer, - "{}: cannot fit parameters in L1, fallback to streaming at input index[{}] usage[{}/{}]", - op_node->name(), - max_l1_user, - op_model.get_l1_memory_usage(), - l1_usable_size); - } - else if (try_fallback and not output_buffer_factor_override and fallback_single_buffer) - { - TT_ASSERT(op_model.output_buffers[0].buffer_factor % 2 == 0); - TT_ASSERT(op_model.output_buffers[0].l1_size_tiles % 2 == 0); - op_model.output_buffers[0].buffer_factor /= 2; - op_model.output_buffers[0].l1_size_tiles /= 2; - fallback_single_buffer = false; - log_trace( - LogBalancer, - "{}: cannot fit output buffer in L1, fallback to single buffer usage[{}/{}]", - op_node->name(), - op_model.get_l1_memory_usage(), - l1_usable_size); - } - - if (op_model.get_l1_memory_usage() >= l1_usable_size) - continue; - - std::size_t input_l1_buffer_space = l1_usable_size - op_model.get_l1_memory_usage(); - op_model.input_buffers = calculate_input_buffer_models( - graph, - op_node, - op_model.grid_shape, - op_model.op_shape, - operands, - output_block_shape, - t_stream_factor, - input_l1_buffer_space, - fracture_factor, - op_model.fused_op_ublock_shape, - legal_sparse_u_kts, - u_kt_override, - min_input_buffer_factor_overrides); - - if (op_model.input_buffers.empty()) - break; // Change this to continue, causes fallout: - // tenstorrent/pybuda#1243 - - if (op_node->is_sparse_matmul()) - { - // in2 operand's shape of sparse matmul depends on chosen u_rt and u_kt, we can update it here - // after both u_rt and u_kt have been chosen - op_model.op_shape = get_op_shape( - graph, - op_node, - selected_grid, - ublock.rt, - op_model.input_buffers[1].block_shape.ublock.rt, - t_stream_factor, - fracture_factor); - } - - try_promote_kernel_broadcast_inputs( - &cache_collection->pipe_to_kb_len_cache, graph, op_node, l1_usable_size, op_model); - - if (op_model.get_l1_memory_usage() <= l1_usable_size) - break; - } - - if (op_model.input_buffers.empty()) - return std::make_pair(op_model, InputBufferAllocationFailure); - - if (env_as("PYBUDA_ENABLE_OUTPUT_BUFFER_UPSIZING")) - { - std::size_t l1_remaining_size = l1_usable_size - op_model.get_l1_memory_usage(); - op_model.output_buffers = - upsize_output_buffer(graph, op_model.output_buffers, l1_remaining_size, op_node->is_gradient_op()); - } - - auto operand_edges = graph->operand_data_edges(op_node); - bool is_reduce_z = graphlib::is_reduce_z(op_node); - op_model.is_sparse_matmul = op_node->is_sparse_matmul(); - op_model.sparse_buffer = op_node->is_sparse_matmul() and sparse_buffer_enable; - op_model.consumes_rz_major = std::any_of( - operand_edges.begin(), - operand_edges.end(), - [graph](Edge edge) { return edge_tms_consume_rz_major(graph, edge); }) or - is_reduce_z; - - if (op_model.sparse_buffer) - { - // When sparse buffer is enabled we use 2x columns and divide the op into 2 - // The first half of the grid is used for the buffer op, the second half for - // the sparse mm itself - 
op_model.grid_shape.c *= 2; - } - if (op_model.is_sparse_matmul) - { - const sparse::SparseBUDA& sparse_buda = - graph->data_operands(op_node)[0]->as()->get_sparse_buda(); - op_model.sparse_indices = sparse_buda.sparse_indices.size(); - op_model.sparse_buda = &sparse_buda; - } - - auto [failedOpMemoryRequirementReason, customMemoryReaquirementReasonMessage] = - validate_memory_requirements(op_model, l1_usable_size, dram_channel_capacity); - - customFailureMessage = customMemoryReaquirementReasonMessage; - return std::make_pair(op_model, failedOpMemoryRequirementReason); -} - -std::pair calculate_op_model( - Graph const* graph, - std::shared_ptr cache_collection, - graphlib::BudaOpNode const* op_node, - GridShape selected_grid, - TStreamFactor t_stream_factor, - UBlockOrder ublock_order, - bool force_dram_parameters, - std::size_t dst_size_tiles, - std::size_t l1_usable_size, - std::size_t dram_channel_capacity, - std::string& customFailureMessage, - int fracture_factor, - bool sparse_buffer_enable, - LegalSparseUKts const& legal_sparse_u_kts, - int u_kt_override, - std::map const& min_input_buffer_factor_overrides, - std::optional output_buffer_factor_override, - bool fallback_single_buffer) -{ - OpModel op_model; - OpModelFailureReason failure_reason; - bool retry = true; - while (retry) - { - retry = false; - std::tie(op_model, failure_reason) = calculate_op_model_impl( - graph, - cache_collection, - op_node, - selected_grid, - t_stream_factor, - ublock_order, - force_dram_parameters, - dst_size_tiles, - l1_usable_size, - dram_channel_capacity, - customFailureMessage, - fracture_factor, - sparse_buffer_enable, - legal_sparse_u_kts, - u_kt_override, - min_input_buffer_factor_overrides, - output_buffer_factor_override, - fallback_single_buffer); - - if ((failure_reason == L1UsageOverMaxLimit or failure_reason == InputBufferAllocationFailure) and - dst_size_tiles > 1) - { - dst_size_tiles /= 2; - retry = true; - } - } - return std::make_pair(op_model, failure_reason); -} - -// Calculate legal OpModels for a graph. -// Optionally override can be passed in via nodes_to_legalize to only calculate OpModels for specified set of nodes. 
-// -LegalOpModels get_legal_op_models( - Graph const* graph, - BalancerConfig const& config, - std::shared_ptr cache_collection, - std::unordered_set* nodes_to_legalize) -{ - PROFILE_SCOPE(); -#ifdef DEBUG - BudaOpNodeLegalizerFailureInfo op_graph_debug_info; - bool enable_legalizer_detailed_debugging = env_as("PYBUDA_LEGALIZER_DETAILED_DEBUGGING"); - std::string node_name_leg_debug = env_as("PYBUDA_LEGALIZER_DEBUG_NODE_NAME"); -#endif - - std::unordered_map nodes_without_legal_op_model; - LegalOpModels valid_op_models; - FactorizedShape device_grid( - FactorizedInt::Factorial(config.device_config.grid_size.r), - FactorizedInt::Factorial(config.device_config.grid_size.c)); - // Sparse buffer op takes 2x columns, so cut device core grid c in half - FactorizedShape sparse_buffer_device_grid( - FactorizedInt::Factorial(config.device_config.grid_size.r), - FactorizedInt::Factorial(config.device_config.grid_size.c / 2)); - - // Nebula is harvested in Nebula+Galaxy setup, but the device_grid is for - // unharvested galaxy - FactorizedShape harvested_device_grid( - FactorizedInt::Factorial(config.device_config.get_harvested_nebula_galaxy_grid().r), - FactorizedInt::Factorial(config.device_config.get_harvested_nebula_galaxy_grid().c)); - - for (Node* node : tt::graphlib::topological_sort(*graph)) - { - if (node->node_type() != NodeType::kBudaOp) - { - continue; - } - - graphlib::BudaOpNode const* op_node = static_cast(node); - - if (nullptr != nodes_to_legalize and nodes_to_legalize->count(node) == 0) - { - continue; - } - - BudaOpNodeLegalizerFailureInfo failure_info; - -#ifdef DEBUG - graphlib::BudaOpNode* debug_op_node = nullptr; - if (enable_legalizer_detailed_debugging) - { - debug_op_node = const_cast(op_node); - debug_op_node->leg_debug_info = std::make_shared(); - } -#endif - - auto op_override = config.get_op_override(node->name()); - FactorizedInt fracture_factorization = get_fracture_factorization(graph, op_node, op_override); - std::optional output_buffer_override = get_output_buffer_override(op_node, op_override); - std::map input_buffer_multipliers = - get_min_input_buffer_multiplier_overrides(op_override); - int user_overriden_u_kt = get_u_kt(op_override); - UBlockOrder ublock_order = get_output_ublock_order(graph, op_node); - bool sparse_buffer_enable = env_as("PYBUDA_SPARSE_BUFFER_ENABLE") and sparse_buffer_legal(graph, op_node); - bool fallback_single_buffer = config.enable_single_buffer_fallback; - // Support for full dst mode was removed by backend: - // tenstorrent/budabackend#1543 - // Follow up for re-enablement: - // tenstorrent/budabackend#2098 - bool full_dst_mode = false and op_node->is_sparse_matmul() and env_as("PYBUDA_MAXIMIZE_SPARSE_UBLOCK"); - std::size_t dst_size_tiles = calculate_dst_size_tiles( - config.device_config.get_dst_size(), - op_node->accumulate_df(), - op_node->shape().get_tile_volume(), - full_dst_mode ? 
1 : 2); - - std::vector valid_grids; - for (int fracture_factor : fracture_factorization.get_factors()) - { - // all_pars can extend beyond the device grid, used to express t-streaming - auto all_pars = FactorizedShape(get_parallelization(graph, op_node, fracture_factor, sparse_buffer_enable)); - all_pars.c = all_pars.c.keep_factors_divisible_by( - FactorizedInt::Constant(fracture_factor)); // remove invalid factors - // TODO: each op's parallelization() should define FactorizedShape, instead of returning a 2-tuple, in order - // to avoid having the line above (which is specific to sparse mm) - auto grid_pars = all_pars & device_grid; - bool force_dram_parameters = config.default_dram_parameters; - FactorizedShape overridden_streaming_pars; - - if (op_node->is_sparse_matmul() and sparse_buffer_enable) - { - grid_pars = grid_pars & sparse_buffer_device_grid; - } - - // output ops will be placed on Nebula hence they should fit a harvested grid - if (env_as("PYBUDA_NEBULA_GALAXY_PLACER")) - { - auto consumers = graph->users(op_node); - bool feeds_graph_output_queue = std::any_of( - consumers.begin(), - consumers.end(), - [](Node* n) { return n->node_type() == graphlib::NodeType::kOutput; }); - if (feeds_graph_output_queue) - { - grid_pars = grid_pars & harvested_device_grid; - } - } - - std::vector streaming_dirs = get_legal_streaming_dirs(graph, op_node); - - log_debug(LogBalancer, "Calculate legal op models for node {} {}:", node->name(), node->get_type()); - - bool override_enable_t_streaming = not config.manual_t_streaming; - if (auto op_override = config.get_op_override(node->name())) - op_override->apply( - grid_pars, - force_dram_parameters, - streaming_dirs, - overridden_streaming_pars, - override_enable_t_streaming, - node->name()); - - bool enable_t_streaming = config.enable_t_streaming and override_enable_t_streaming and - (not node->as()->has_tag("padding_nop")); - - log_trace(LogBalancer, " Grids:"); - for (Parallelization grid_par : grid_pars) - { - bool did_non_streaming = false; - for (auto streaming_dir : streaming_dirs) - { - auto [streaming_pars, legal_sparse_u_kts] = calculate_streaming_pars( - graph, - op_node, - grid_par, - all_pars, - streaming_dir, - overridden_streaming_pars, - enable_t_streaming, - fracture_factor, - sparse_buffer_enable); - - for (auto streaming_par : streaming_pars) - { - if (did_non_streaming and streaming_par == Parallelization(1, 1)) - continue; // We already covered this case with TStreamDir::R, i.e. 
non-streaming - did_non_streaming |= (streaming_par == Parallelization(1, 1)); - - std::string customFailureMessage; - - auto [op_model, failure_reason] = calculate_op_model( - graph, - cache_collection, - op_node, - grid_par, - TStreamFactor(streaming_dir, streaming_par), - ublock_order, - force_dram_parameters, - dst_size_tiles, - config.device_config.get_l1_usable_size(), - config.device_config.get_dram_channel_capacity(), - customFailureMessage, - fracture_factor, - sparse_buffer_enable, - legal_sparse_u_kts, - user_overriden_u_kt, - input_buffer_multipliers, - output_buffer_override, - fallback_single_buffer); - - if (NoFailure == failure_reason) - { - valid_grids.push_back(op_model); - - for (int u_kt_override : - enumerate_factored_u_kts(op_model, user_overriden_u_kt, config.enable_enumerate_u_kt)) - { - auto [factored_u_kt_op_model, factored_u_kt_failure_reason] = calculate_op_model( - graph, - cache_collection, - op_node, - grid_par, - TStreamFactor(streaming_dir, streaming_par), - ublock_order, - force_dram_parameters, - dst_size_tiles, - config.device_config.get_l1_usable_size(), - config.device_config.get_dram_channel_capacity(), - customFailureMessage, - fracture_factor, - sparse_buffer_enable, - legal_sparse_u_kts, - u_kt_override, - input_buffer_multipliers, - output_buffer_override, - fallback_single_buffer); - if (factored_u_kt_failure_reason == NoFailure) - valid_grids.push_back(factored_u_kt_op_model); - } - - log_trace( - LogBalancer, - " {} {:<32} {} Legalizer Valid", - op_node->name(), - GridShape(grid_par), - TStreamFactor(streaming_dir, streaming_par)); - log_trace(LogBalancer, " L1: {:<16}", op_model.get_l1_memory_usage()); - log_trace( - LogBalancer, - " Cycles: {:<16}", - op_model.get_execution_cycles(config.device_config.arch_name)); - log_trace(LogBalancer, "{}", op_model); - } - else - { - log_trace( - LogBalancer, - " {} {:<26} {} Legalizer Failed: {}", - op_node->name(), - GridShape(grid_par), - TStreamFactor(streaming_dir, streaming_par), - customFailureMessage.empty() ? 
OpModelFailureReasonMessages[failure_reason] - : customFailureMessage); - log_trace(LogBalancer, "{}", op_model); - failure_info.recordOpModelFailure(failure_reason); - } - -#ifdef DEBUG - if (enable_legalizer_detailed_debugging) - { - debug_op_node->leg_debug_info->recordOpModelFailure(failure_reason); - } - - op_graph_debug_info.recordOpModelFailure(failure_reason); -#endif - } - } - } - } - -#ifdef DEBUG - if (enable_legalizer_detailed_debugging) - { - if (node_name_leg_debug == node->name() or node_name_leg_debug.empty()) - { - log_debug( - LogBalancer, - "OpModel failure statistics for node: {} {} {}", - node->name(), - node->get_type(), - node->shape()); - log_debug(LogBalancer, debug_op_node->leg_debug_info->toString().c_str()); - } - } -#endif - - log_debug(LogBalancer, "Total op models for node: {} {}", node->name(), valid_grids.size()); - if (valid_grids.empty()) - { - nodes_without_legal_op_model.emplace(node, failure_info); - log_warning( - LogBalancer, "No valid grids found for node: {} {} {}", node->name(), node->get_type(), node->shape()); - } - valid_op_models.emplace(node, valid_grids); - } - -#ifdef DEBUG - log_debug(LogBalancer, "OpModel failure statistics for whole graph:"); - log_debug(LogBalancer, op_graph_debug_info.toString().c_str()); -#endif - if (nodes_without_legal_op_model.size() > 0) - { - std::size_t nodes_without_legal_op_model_count = nodes_without_legal_op_model.size(); - throw BalancerError( - fmt::format("{} Nodes have no valid grids, exiting", nodes_without_legal_op_model_count), - BalancerError::NoValidGrid(std::move(nodes_without_legal_op_model))); - } - - return valid_op_models; -} - -static OpModel create_input_queue_op_model( - TensorShape input_shape, GridShape grid_shape, BlockShape block_shape, DataFormat data_format, bool prologue) -{ - BufferModel input_buffer_model; - input_buffer_model.block_shape = block_shape; - input_buffer_model.buffer_factor = 1; - input_buffer_model.l1_size_tiles = input_buffer_model.block_shape.volume(); - input_buffer_model.data_format = data_format; - - OpModel input_op_model; - input_op_model.grid_shape = grid_shape; - input_op_model.op_shape.outputs.push_back(input_shape); - input_op_model.output_buffers.push_back(input_buffer_model); - input_op_model.data_format = data_format; - input_op_model.input_prologue = prologue; - - return input_op_model; -} - -static void resolve_input_queue_block_shapes(Graph const* graph, BalancerConfig const& config, OpModelMap& op_models) -{ - auto compatible_queue_grid_for_users = [](TensorShape const& input_shape, - std::vector const& users, - bool parameter = false) -> GridShape - { - GridShape grid_shape = users[0]->grid_shape; - for (OpModel const* user_op_model : users) - { - GridShape user_grid_shape = user_op_model->grid_shape; - bool user_is_matmul = (user_op_model->op_type() == "matmul"); - grid_shape.r = std::min(grid_shape.r, user_grid_shape.r); - grid_shape.c = std::min( - grid_shape.c, - (user_is_matmul && !parameter) - ? 
1 - : user_grid_shape.c); // for matmul, only one column reads, so giving it more only hurts it - } - - int grid_r = FactorizedInt(input_shape.rt).get_nearest_factor_le(grid_shape.r); - int grid_c = FactorizedInt(input_shape.ct).get_nearest_factor_le(grid_shape.c); - return GridShape(grid_r, grid_c); - }; - - auto compatible_queue_ublock_for_users = [](TensorShape const& input_shape, - GridShape grid_shape, - std::vector const& user_edges, - std::vector const& users) -> UBlockShape - { - TT_ASSERT(not user_edges.empty()); - TT_ASSERT(not users.empty()); - TT_ASSERT(user_edges.size() == users.size()); - // For now just take the first user, unclear what's best for all users - graphlib::Edge user_edge = user_edges.front(); - OpModel const* user_op_model = users.front(); - UBlockShape ublock = user_op_model->input_buffers[user_edge.consumer_input_port_id].block_shape.ublock; - - // Clamp ublock to tensor shape, needed if bcasting - TT_ASSERT((input_shape.rt % grid_shape.r) == 0); - TT_ASSERT((input_shape.ct % grid_shape.c) == 0); - int par_r = input_shape.rt / grid_shape.r; - int par_c = input_shape.ct / grid_shape.c; - ublock.rt = gcd(ublock.rt, par_r); - ublock.ct = gcd(ublock.ct, par_c); - - return ublock; - }; - - // when enabled, we won't force the input-activations to be blocked to 1x1 - bool enable_reblock_input_activations = env_as("PYBUDA_REBLOCK_INPUT_ACT"); - const std::uint32_t reblock_input_max_size = - 64; // reblock small inputs smaller than this, regardless of enable switch - - for (Node* node : graph->nodes()) - { - switch (node->node_type()) - { - case NodeType::kInput: - { - static constexpr int kMaxPrefetchBufStreams = 24; - - GridShape grid_shape; - BlockShape block_shape; - graphlib::Shape shape = node->shape(); - TensorShape input_shape(shape); - graphlib::InputNode* input = dynamic_cast(node); - std::vector data_loopback = graph->data_operands(node); - std::vector user_edges = graph->user_data_edges(node); - std::vector users; - std::vector prologue_users; - - auto is_partial_datacopy_edge = [](Edge e) - { return (e.edge_type == graphlib::EdgeType::kPartialDataCopy); }; - std::vector partial_datacopy_edges = - graph->operand_edges(node, is_partial_datacopy_edge); - for (auto edge : user_edges) - { - graphlib::Node* user = graph->node_by_id(edge.consumer_node_id); - OpModel const& user_op_model = op_models.at(user->name()); - users.push_back(&user_op_model); - if (user_op_model.parameter_buffers[edge.consumer_input_port_id]) - { - prologue_users.push_back(&user_op_model); - } - } - TT_ASSERT(not users.empty(), "Node {} has no users", node->name()); - bool all_users_prologue = prologue_users.size() == users.size(); - bool is_embedding_table = - input->is_parameter() and - graph->node_by_id(user_edges.front().consumer_node_id)->as()->is_embedding(); - - auto users_tilize = graph->data_users(input); - - bool is_tilize_op_input = std::any_of( - users_tilize.begin(), - users_tilize.end(), - [](auto* n) - { - graphlib::OpNode* op_node = dynamic_cast(n); - return op_node->is_tilize(); - }); - // - // Each branch must initialize grid_shape and block_shape - // - if (is_embedding_table || is_tilize_op_input) - { - TT_ASSERT(!is_embedding_table || users.size() == 1); - TT_ASSERT(!is_embedding_table || user_edges.size() == 1); - TT_ASSERT(user_edges.front().consumer_input_port_id == 0); - // Embedding table constraints - // - prologue = false - // - grid_r must = 1 for now - // - grid_c must = op.grid_c - // - mblock_m must = [1, 1] - all_users_prologue = false; - OpModel const& 
op_model = *users.front(); - grid_shape.r = 1; - grid_shape.c = op_model.grid_shape.c; - TT_ASSERT(input->shape().ct() % grid_shape.c == 0); - - if (is_embedding_table) - { - block_shape = - BlockShape(1, 1, 1, UBlockShape(input->shape().rt(), input->shape().ct() / grid_shape.c)); - } - else if (is_tilize_op_input) - { - block_shape = BlockShape(input_shape, 1, 1, 1, UBlockShape(1, op_model.ublock_shape().ct)); - } - } - - else if (not partial_datacopy_edges.empty()) - { - // op model for partial datacopy inputs is determined by output that feeds it - auto* output_node = graph->node_by_id(partial_datacopy_edges.front().producer_node_id); - auto output_operands = graph->data_operands(output_node); - TT_ASSERT(output_operands.size() == 1); - auto* writeback_op = output_operands.front(); - OpModel const& op_model = op_models.at(writeback_op->name()); - grid_shape = op_model.grid_shape; - block_shape = op_model.block_shape(); - for (auto edge : partial_datacopy_edges) - { - auto other_output = graph->node_by_id(edge.producer_node_id); - auto other_writeback_op = graph->data_operands(other_output).front(); - OpModel const& other_op_model = op_models.at(other_writeback_op->name()); - TT_ASSERT( - other_op_model.grid_shape == grid_shape, - "Partial datacopy grid shape mismatch on {} and {}", - writeback_op->name(), - other_output->name()); - bool block_shapes_match = other_op_model.block_shape().mblock_m == block_shape.mblock_m and - other_op_model.block_shape().mblock_n == block_shape.mblock_n and - other_op_model.block_shape().ublock == block_shape.ublock; - TT_ASSERT( - block_shapes_match, - "Partial datacopy block shape mismatch on (note, t's don't have to match)", - writeback_op->name(), - other_op_model.block_shape(), - other_output->name(), - block_shape); - } - - // Update read-view with t multiplier - TT_ASSERT(node->shape().volume() % output_node->shape().volume() == 0); - size_t multiplier = node->shape().volume() / output_node->shape().volume(); - block_shape.t *= multiplier; - } - else if (not data_loopback.empty()) - { - // If an optimizer node writes to this input (kDataLoopback) then we need to inherit its blockshape - auto node = data_loopback[0]; - if (node->node_type() == NodeType::kOutput) - { - node = graph->data_operands(node)[0]; - } - OpModel const& op_model = op_models.at(node->name()); - grid_shape = op_model.grid_shape; - block_shape = op_model.block_shape(); - - // Users need to be at least as big as the optimizer op writing to it because otherwise the - // parameters wouldn't be able to fit on their core grid. This can be enforced by the balancer - // policies, but for now we assert. 
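// Standalone sketch of the "compatible queue grid" idea used by compatible_queue_grid_for_users above:
// take the smallest consumer grid in each dimension, then snap it down to the nearest factor of the
// queue's tile dimensions so the queue blocks evenly. The plain factor scan and the Grid struct are
// illustrative assumptions; the production code works with FactorizedInt and also special-cases matmul users.
#include <algorithm>
#include <vector>

struct Grid { int r; int c; };

static int nearest_factor_le(int n, int cap)
{
    int best = 1;
    for (int f = 1; f <= std::min(n, cap); ++f)
        if (n % f == 0)
            best = f;
    return best;
}

static Grid compatible_queue_grid(int queue_rt, int queue_ct, std::vector<Grid> const& user_grids)
{
    Grid g = user_grids.front();
    for (Grid const& u : user_grids)
    {
        g.r = std::min(g.r, u.r);
        g.c = std::min(g.c, u.c);
    }
    return Grid{nearest_factor_le(queue_rt, g.r), nearest_factor_le(queue_ct, g.c)};
}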
- for (OpModel const* user_op_model : prologue_users) - { - GridShape user_grid_shape = user_op_model->grid_shape; - if (user_grid_shape.r < grid_shape.r or user_grid_shape.c < grid_shape.c) - { - log_debug( - LogBalancer, - "Optimizer grid for input exceeds consumer op grid dims: {} optimizer({}) user({})", - node->name(), - grid_shape, - user_grid_shape); - log_debug(LogBalancer, " Fallback to stream parameters: {}", node->name()); - all_users_prologue = false; - } - } - } - else if (input and (input->is_parameter() or input->is_optimizer_parameter() or input->is_constant())) - { - // If it's a parameter, we need the grid shape of the smallest consumer grid dims - grid_shape = compatible_queue_grid_for_users(input_shape, users, true /*parameter*/); - UBlockShape ublock = compatible_queue_ublock_for_users(input_shape, grid_shape, user_edges, users); - block_shape = BlockShape(input_shape, grid_shape.r, grid_shape.c, 1, ublock); - - // Test to make sure that after placing all ops that reference this prologue buffer still fit in L1 - // Fallback to streaming the param buffer - if (all_users_prologue) - { - int idx = 0; - for (OpModel const* user_op_model_ptr : prologue_users) - { - // Take a copy to test if we fit in L1 with updated parameter grid blocking - OpModel user_op_model = *user_op_model_ptr; - Edge edge = user_edges[idx++]; - - // Only replace the parameter buffer model if not kernel broadcast, we've - // already determined that the entire buffer can fit in this core's L1 - bool is_kernel_broadcast = - user_op_model.input_buffers[edge.consumer_input_port_id].kernel_broadcast_tiles > 0; - if (not is_kernel_broadcast) - user_op_model.parameter_buffers[edge.consumer_input_port_id] = - BufferModel(block_shape, 1, graph->node_by_id(edge.producer_node_id)->output_df()); - - bool out_of_memory = - user_op_model.get_l1_memory_usage() > config.device_config.get_l1_usable_size(); - int num_prefetch_streams = 0; - auto user = graph->node_by_id(edge.consumer_node_id); - for (auto operand_edge : graph->operand_data_edges(user)) - { - if (user_op_model.parameter_buffers[operand_edge.consumer_input_port_id]) - { - auto operand_shape = graph->node_by_id(operand_edge.producer_node_id)->shape(); - std::vector operand_users; - for (auto operand_user_node : - graph->data_users(graph->node_by_id(operand_edge.producer_node_id))) - { - operand_users.push_back(&op_models.at(operand_user_node->name())); - } - auto operand_grid_shape = compatible_queue_grid_for_users( - operand_shape, operand_users, true /*parameter*/); - - num_prefetch_streams += - (round_up_div(user_op_model.grid_shape.r, operand_grid_shape.r) * - round_up_div(user_op_model.grid_shape.c, operand_grid_shape.c)); - } - } - bool out_of_prefetch_streams = num_prefetch_streams > kMaxPrefetchBufStreams; - - if (out_of_memory or out_of_prefetch_streams) - { - // tenstorrent/pybuda#390 - // TT_ASSERT(prologue_users.size() > 1, "Single user should alway fit in L1, unless op - // model calculation changed"); - - log_debug( - LogBalancer, - "Smallest consumer grid shape forces other parameter consumer to fall out of L1, " - "prologue_users[{}] out_of_memory[{}] out_of_prefetch_streams[{}]", - prologue_users.size(), - out_of_memory, - out_of_prefetch_streams); - log_debug(LogBalancer, " Fallback to stream parameters: {}", node->name()); - all_users_prologue = false; - break; - } - } - } - } - else if ( - (enable_reblock_input_activations or - (node->shape().rt() * node->shape().ct() <= reblock_input_max_size)) and - input and input->is_activation()) 
- { - // If it's activation, we'll arbitrarily pick the smallest grid shape - grid_shape = compatible_queue_grid_for_users(input_shape, users); - UBlockShape ublock = compatible_queue_ublock_for_users(input_shape, grid_shape, user_edges, users); - block_shape = BlockShape(input_shape, grid_shape.r, grid_shape.c, 1, ublock); - } - else - { - // We can choose anything for ordinary input, so 1x1 grid/ublock for now (to support bcast shapes) - grid_shape = GridShape(1, 1); - block_shape = BlockShape(input_shape, grid_shape.r, grid_shape.c, 1, UBlockShape(1, 1)); - - bool exceeds_dram_channel_size = (block_shape.volume() * tile_size_bytes(node->output_df())) > - config.device_config.get_dram_channel_capacity(); - if (exceeds_dram_channel_size) - { - FactorizedShape legal_grid_shapes = FactorizedShape(input_shape.rt, input_shape.ct); - FactorizedShape::Iterator legal_grid_shapes_iter = legal_grid_shapes.begin(); - bool init = true; - while (exceeds_dram_channel_size and legal_grid_shapes_iter != legal_grid_shapes.end()) - { - if (init) - { - grid_shape = compatible_queue_grid_for_users(input_shape, users); - init = false; - } - else - { - grid_shape = GridShape(*legal_grid_shapes_iter++); - } - - block_shape = BlockShape(input_shape, grid_shape.r, grid_shape.c, 1, UBlockShape(1, 1)); - exceeds_dram_channel_size = (block_shape.volume() * tile_size_bytes(node->output_df())) > - config.device_config.get_dram_channel_capacity(); - } - - TT_ASSERT( - not exceeds_dram_channel_size, - "Could not find queue grid size large enough to fit queue into dram"); - } - } - - OpModel op_model = create_input_queue_op_model( - input_shape, grid_shape, block_shape, node->output_df(), all_users_prologue); - op_models.emplace(node->name(), op_model); - break; - } - default: break; - } - } -} - -std::tuple resolve_block_shapes( - Graph const* graph, BalancerConfig const& config, GraphSolverSolution const& graph_solver_solution) -{ - log_debug(LogBalancer, "Resolve block shapes:"); - OpModelMap op_models; - OutputHostTMMap output_host_tms; - - for (Node* node : graph->nodes()) - { - if (node->node_type() != NodeType::kBudaOp) - { - continue; - } - TT_LOG_ASSERT( - graph_solver_solution.selected_op_models.count(node) > 0, "Missing op model for node {}", node->name()); - op_models.emplace(node->name(), graph_solver_solution.selected_op_models.at(node)); - } - - resolve_input_queue_block_shapes(graph, config, op_models); - - BlockShapeMap block_shape_map; - for (Node* node : tt::graphlib::topological_sort(*graph)) - { - auto is_partial_datacopy_edge = [](Edge e) { return (e.edge_type == graphlib::EdgeType::kPartialDataCopy); }; - std::vector partial_datacopy_operand_edges = - graph->operand_edges(node, is_partial_datacopy_edge); - - BlockShape block_shape; - switch (node->node_type()) - { - case NodeType::kInput: - { - block_shape = op_models.at(node->name()).block_shape(); - break; - } - case NodeType::kOutput: - { - // Scale the block based on the operand's grid shape, since output queue is always on one "core" (host) - std::vector operands = graph->data_operands(node); - TT_ASSERT(operands.size() == 1); - Node* operand = operands[0]; - OpModel const& operand_op_model = op_models.at(operand->name()); - BlockShape operand_block_shape = operand_op_model.block_shape(); - GridShape operand_grid = operand_op_model.grid_shape; - - block_shape = operand_block_shape; - std::vector partial_datacopy_edges = graph->user_edges(node, is_partial_datacopy_edge); - - if (not operand_op_model.t_stream_factor.none()) - { - OutputHostTM 
tm; - tm.hstack_factor = operand_op_model.t_stream_factor.c; - tm.vstack_factor = operand_op_model.t_stream_factor.r; - tm.row_major = operand_op_model.t_stream_factor.dir.r(); - output_host_tms.emplace(node->name(), tm); - if (not tm.row_major or tm.hstack_factor > 1) - { - node->as()->set_untilize(false); - } - } - - if (config.output_queues_on_host and node->as()->untilize() and - partial_datacopy_edges.empty()) - { - block_shape.mblock_m *= (operand_grid.r); - block_shape.mblock_n *= (operand_grid.c); - } - - log_debug(LogBalancer, " kOutput {:64} {} inherit: {}", node->name(), block_shape, operand->name()); - break; - } - case NodeType::kQueue: - { - std::vector operands = graph->data_operands(node); - TT_ASSERT(operands.size() == 1); - Node* operand = operands[0]; - OpModel const& operand_op_model = op_models.at(operand->name()); - block_shape = operand_op_model.block_shape(); - if (not operand_op_model.t_stream_factor.none()) - { - OutputHostTM tm; - tm.hstack_factor = operand_op_model.t_stream_factor.c; - tm.vstack_factor = operand_op_model.t_stream_factor.r; - tm.row_major = operand_op_model.t_stream_factor.dir.r(); - output_host_tms.emplace(node->name(), tm); - } - log_debug(LogBalancer, " kQueue {:64} {} inherit: {}", node->name(), block_shape, operand->name()); - break; - } - case NodeType::kBudaOp: - { - OpModel& op_model = op_models.at(node->name()); - block_shape = op_model.block_shape(); - break; - } - case NodeType::kBudaNaryTM: - { - break; - } - default: - { - TT_ASSERT(false, "Unhandled node_type", node->node_type()); - break; - } - } - - log_debug(LogBalancer, " {:64} {} {}", node->name(), block_shape, node->shape()); - block_shape_map.emplace(node->name(), block_shape); - } - - return std::make_tuple(op_models, block_shape_map, output_host_tms, graph_solver_solution.cut_edges); -} - -} // namespace tt::balancer::legalizer diff --git a/pybuda/csrc/balancer/legalizer/legalizer.hpp b/pybuda/csrc/balancer/legalizer/legalizer.hpp deleted file mode 100644 index 22433bfaf..000000000 --- a/pybuda/csrc/balancer/legalizer/legalizer.hpp +++ /dev/null @@ -1,60 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include -#include -#include -#include - -#include "balancer/balancer.hpp" -#include "balancer/balancer_cache_collection.hpp" -#include "balancer/balancer_utils.hpp" -#include "balancer/types.hpp" -#include "graph_lib/graph.hpp" -#include "graph_lib/node.hpp" - -namespace tt::balancer -{ -struct BalancerConfig; -} // namespace tt::balancer - -namespace tt::balancer::legalizer -{ -using UBlockOrder = tt::graphlib::UBlockOrder; -using LegalSparseUKts = std::unordered_map>; - -LegalOpModels get_legal_op_models( - Graph const* graph, - BalancerConfig const& config, - std::shared_ptr cache_collection, - std::unordered_set* nodes_to_legalize = nullptr); - -OpModels resolve_fork_grids(Graph const* graph, BalancerConfig const& config, OpModels selected_op_models); - -std::tuple resolve_block_shapes( - Graph const* graph, BalancerConfig const& config, GraphSolverSolution const& graph_solver_solution); - -std::pair calculate_op_model( - Graph const* graph, - std::shared_ptr cache_collection, - graphlib::BudaOpNode const* op_node, - GridShape selected_grid, - TStreamFactor t_stream_factor, - UBlockOrder ublock_order, - bool force_dram_parameters, - std::size_t dst_size, - std::size_t l1_usable_size, - std::size_t dram_channel_capacity, - std::string& customFailureMessage, - int fracture_factor = 1, - 
bool sparse_buffer_enable = false, - LegalSparseUKts const& = {}, - int u_kt_override = 0, - std::map const& min_input_buffer_factor_overrides = {}, - std::optional output_buffer_factor_override = {}, - bool fallback_single_buffer = false); - -} // namespace tt::balancer::legalizer diff --git a/pybuda/csrc/balancer/module.mk b/pybuda/csrc/balancer/module.mk deleted file mode 100644 index 3a1ccc1f9..000000000 --- a/pybuda/csrc/balancer/module.mk +++ /dev/null @@ -1,36 +0,0 @@ -# Every variable in subdir must be prefixed with subdir (emulating a namespace) - -PYBUDA_CSRC_BALANCER_LIB = $(LIBDIR)/libbalancer.a -PYBUDA_CSRC_BALANCER_SRCS += \ - pybuda/csrc/balancer/balancer.cpp \ - pybuda/csrc/balancer/balancer_utils.cpp \ - pybuda/csrc/balancer/legalizer/constraints.cpp \ - pybuda/csrc/balancer/legalizer/graph_solver.cpp \ - pybuda/csrc/balancer/legalizer/legalizer.cpp \ - pybuda/csrc/balancer/types.cpp \ - pybuda/csrc/balancer/python_bindings.cpp \ - $(wildcard pybuda/csrc/balancer/policies/*.cpp) - -PYBUDA_CSRC_BALANCER_INCLUDES = $(PYBUDA_CSRC_INCLUDES) - -PYBUDA_CSRC_BALANCER_OBJS = $(addprefix $(OBJDIR)/, $(PYBUDA_CSRC_BALANCER_SRCS:.cpp=.o)) -PYBUDA_CSRC_BALANCER_DEPS = $(addprefix $(OBJDIR)/, $(PYBUDA_CSRC_BALANCER_SRCS:.cpp=.d)) - --include $(PYBUDA_CSRC_BALANCER_DEPS) - -PYBUDA_CSRC_LOGGER_SRC = third_party/budabackend/src/net2pipe/src/net2pipe_logger.cpp -PYBUDA_CSRC_LOGGER_OBJ = $(addprefix $(OBJDIR)/, $(PYBUDA_CSRC_LOGGER_SRC:.cpp=.o)) -PYBUDA_CSRC_LOGGER_DEP = $(addprefix $(OBJDIR)/, $(PYBUDA_CSRC_LOGGER_SRC:.cpp=.d)) - --include $(PYBUDA_CSRC_LOGGER_DEP) - -# Each module has a top level target as the entrypoint which must match the subdir name -pybuda/csrc/balancer: $(PYBUDA_CSRC_BALANCER_LIB) - -$(PYBUDA_CSRC_BALANCER_LIB): $(PYBUDA_CSRC_PLACER_LIB) $(PYBUDA_CSRC_BALANCER_OBJS) $(PYBUDA_CSRC_GRAPH_LIB) - @mkdir -p $(LIBDIR) - ar rcs $@ $^ - -$(OBJDIR)/pybuda/csrc/balancer/%.o: pybuda/csrc/balancer/%.cpp - @mkdir -p $(@D) - $(CXX) $(PYBUDA_CSRC_CFLAGS) $(CXXFLAGS) $(STATIC_LIB_FLAGS) $(PYBUDA_CSRC_BALANCER_INCLUDES) -c -o $@ $< diff --git a/pybuda/csrc/balancer/output_host_tm_types.hpp b/pybuda/csrc/balancer/output_host_tm_types.hpp deleted file mode 100644 index 0a41fad78..000000000 --- a/pybuda/csrc/balancer/output_host_tm_types.hpp +++ /dev/null @@ -1,18 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -namespace tt::balancer -{ -struct OutputHostTM -{ - int hstack_factor = 1; - int vstack_factor = 1; - bool row_major = true; -}; - - -using OutputHostTMMap = std::unordered_map; - -} // namespace tt::balancer diff --git a/pybuda/csrc/balancer/policies/policies.cpp b/pybuda/csrc/balancer/policies/policies.cpp deleted file mode 100644 index d880fa85e..000000000 --- a/pybuda/csrc/balancer/policies/policies.cpp +++ /dev/null @@ -1,134 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "balancer/policies/policies.hpp" - -#include "balancer/balancer.hpp" -#include "balancer/policies/policy_cnn.hpp" -#include "balancer/policies/policy_maximize_t_minimize_grid.hpp" -#include "balancer/policies/policy_minimize_grid.hpp" -#include "balancer/policies/policy_nlp.hpp" -#include "balancer/policies/policy_random.hpp" -#include "balancer/policies/policy_ribbon.hpp" - -using Graph = tt::graphlib::Graph; -using Node = tt::graphlib::Node; -using Edge = tt::graphlib::Edge; -using DataFormat = tt::DataFormat; -using Schedule = std::vector; - -namespace 
tt::balancer -{ -legalizer::GraphSolverSolution run_policy( - Graph const *graph, - BalancerConfig &config, - legalizer::GraphSolver &graph_solver, - std::optional &placer_solution) -{ - TT_ASSERT( - !config.use_interactive_placer or can_use_interactive_placer(config.policy_type), - "Interactive_placer is not currently supported by this policy!"); - - legalizer::GraphSolverSolution graph_solver_solution; - - switch (config.policy_type) - { - case PolicyType::MaximizeTMinimizeGrid: - { - graph_solver_solution = run_policy_maximize_t_minimize_grid(graph, config, graph_solver); - break; - } - case PolicyType::MinimizeGrid: - { - graph_solver_solution = run_policy_minimize_grid(graph, config, graph_solver); - break; - } - case PolicyType::Random: - { - TT_ASSERT(config.use_interactive_placer); - graph_solver_solution = run_policy_random(graph, config, graph_solver, placer_solution); - break; - } - case PolicyType::NLP: - { - // Use newest policy version if using interactive placer. - // - if (config.use_interactive_placer) - { - graph_solver_solution = run_policy_nlp_v2(graph, config, graph_solver, placer_solution); - } - // Fallback to legacy policy version if not using interactive placer. - // - else - { - graph_solver_solution = run_policy_nlp(graph, config, graph_solver); - } - break; - } - case PolicyType::CNN: - { - graph_solver_solution = run_policy_cnn(graph, config, graph_solver); - break; - } - case PolicyType::Ribbon: - { - // There is no implementation without interactive placer. - // - if (!config.use_interactive_placer) - { - TT_THROW( - "Ribbon policy has to use interactive placer! Enable interactive placer or switch to other " - "balancing policy."); - } - - // Ribbon2 is not default yet until it's been tested across all models and large blobs are handled. - bool use_ribbon2 = env_as("PYBUDA_RIBBON2", false); - if (use_ribbon2) - { - graph_solver_solution = run_policy_ribbon2(graph, config, graph_solver, placer_solution); - } - else - { - graph_solver_solution = run_policy_ribbon(graph, config, graph_solver, placer_solution); - } - break; - } - default: - { - log_fatal("Unsupported policy_type {}", config.policy_type); - return {}; - } - } - - // If we used interactive placer, we should have a placer solution and vice versa. - // - TT_ASSERT( - placer_solution.has_value() == config.use_interactive_placer, - "Interactive placer usage not properly defined for chosen policy type(can_use_interactive_placer)!"); - - return graph_solver_solution; -} - -// Does policy support using interactive placer or not.
-// -bool can_use_interactive_placer(PolicyType policy_type) -{ - switch (policy_type) - { - case PolicyType::MaximizeTMinimizeGrid: - case PolicyType::MinimizeGrid: - case PolicyType::CNN: return false; - - case PolicyType::Random: - case PolicyType::NLP: - case PolicyType::Ribbon: return true; - - default: TT_ASSERT("Undefined interactive placer usage for policy!"); - } - - return false; -} - -} // namespace tt::balancer - -// Include this hpp to include all policies diff --git a/pybuda/csrc/balancer/policies/policies.hpp b/pybuda/csrc/balancer/policies/policies.hpp deleted file mode 100644 index 4dba08385..000000000 --- a/pybuda/csrc/balancer/policies/policies.hpp +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include - -#include "balancer/legalizer/graph_solver.hpp" -#include "balancer/types.hpp" -#include "graph_lib/graph.hpp" -#include "graph_lib/node.hpp" - -namespace tt::balancer -{ -struct BalancerConfig; -} // namespace tt::balancer - -namespace tt::balancer -{ -legalizer::GraphSolverSolution run_policy( - graphlib::Graph const* graph, - BalancerConfig& config, - legalizer::GraphSolver& graph_solver, - std::optional& placer_solution); -} // namespace tt::balancer diff --git a/pybuda/csrc/balancer/policies/policy_cnn.cpp b/pybuda/csrc/balancer/policies/policy_cnn.cpp deleted file mode 100644 index a948be173..000000000 --- a/pybuda/csrc/balancer/policies/policy_cnn.cpp +++ /dev/null @@ -1,221 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "balancer/policies/policy_cnn.hpp" - -#include -#include - -#include "balancer/balancer.hpp" -#include "passes/fuse_ops.hpp" -#include "utils/logger.hpp" - -using Graph = tt::graphlib::Graph; -using Node = tt::graphlib::Node; -using NodeType = tt::graphlib::NodeType; -using Edge = tt::graphlib::Edge; -using DataFormat = tt::DataFormat; - -namespace tt::balancer -{ - -namespace -{ -std::unordered_map find_min_prologue_volumes( - Graph const* graph, const std::vector& topo_sort, legalizer::GraphSolver& graph_solver) -{ - // Until back-end can reblock on prologue, we have to ensure that shared parameters (i.e. parameters read by - // multiple matmuls) are read by ops with the same grid sizes.
Otherwise, the queue will have to be blocked for the - // smaller grid, and then the op that needs a bigger grid will no longer fit in L1 - - std::unordered_map min_param_grid_volume; - - // Search for matmul parameters - for (Node* node : topo_sort) - { - if ((node->node_type() != NodeType::kBudaOp) || (node->as()->op_type().op != "matmul")) - continue; - - // Find minimum valid for prologue - auto grids = graph_solver.at(node); - bool found_prologue = false; - std::uint32_t min_volume = 100000; - for (auto grid : grids) - { - bool has_prologue = grid.parameter_buffers[1]; - std::uint32_t volume = grid.grid_shape.volume(); - if (has_prologue && (!found_prologue || (min_volume > volume))) - { - min_volume = volume; - found_prologue = true; - } - } - Node* param_node = graph->data_operands(node)[1]; - auto it = min_param_grid_volume.find(param_node); - // Record max of all the min volumes - if (found_prologue && ((it == min_param_grid_volume.end()) || (it->second < min_volume))) - { - min_param_grid_volume[param_node] = min_volume; - log_debug( - LogBalancer, - "Setting minimum prologue volume on {} to {} due to {}", - param_node->name(), - min_volume, - node->name()); - found_prologue = true; - } - } - - return min_param_grid_volume; -} -} // namespace - -legalizer::GraphSolverSolution run_policy_cnn( - graphlib::Graph const* graph, BalancerConfig const& config, legalizer::GraphSolver& graph_solver, std::uint32_t) -{ - log_debug(LogBalancer, "Starting CNN balancing"); - auto topo_sort = tt::graphlib::topological_sort(*graph); - - // Get min prologue volume that fits for each parameter - std::unordered_map min_param_grid_volume = - find_min_prologue_volumes(graph, topo_sort, graph_solver); - - // Pick a grid for each node. - for (Node* node : topo_sort) - { - if (node->node_type() != NodeType::kBudaOp) - continue; - - std::string op_type = node->as()->op_type().op; - bool conv_matmul = (op_type == "matmul") && !node->as()->is_sparse_matmul(); - bool sparse_matmul = (op_type == "matmul") && node->as()->is_sparse_matmul(); - std::uint32_t min_prologue_volume = 0; // min volume needed to remain prologue after other checks - auto it = min_param_grid_volume.end(); - - if (conv_matmul) - it = min_param_grid_volume.find(graph->data_operands(node)[1]); - else if (sparse_matmul) - it = min_param_grid_volume.find(graph->data_operands(node)[2]); - - if (it != min_param_grid_volume.end()) - min_prologue_volume = it->second; - - // Find the largest row grid that works - auto grids = graph_solver.at(node); - std::uint32_t target_rows = 0; - for (auto grid : grids) - { - // std::cout << "Looking for max row for " << node->name() << ": " << grid << std::endl; - if ((std::uint32_t)grid.grid_shape.r > target_rows) - target_rows = grid.grid_shape.r; - } - - TT_ASSERT(target_rows > 0); - - using pick = std::pair; - std::unordered_map closest_distance; - pick default_pick = {0, *grids.begin()}; - closest_distance["best"] = default_pick; - closest_distance["failed_prologue"] = default_pick; - closest_distance["bad_rows"] = default_pick; - closest_distance["bad_rows_failed_prologue"] = default_pick; - closest_distance["too_slow"] = default_pick; - closest_distance["too_slow_failed_prologue"] = default_pick; - for (auto grid : grids) - { - std::uint32_t execution_cycles = grid.get_execution_cycles(config.device_config.arch_name); - log_trace( - LogBalancer, - "Policy CNN considering {}: {}", - node->name(), - grid); - - pick current_test_pick = {execution_cycles, grid}; - - bool needs_prologue = (op_type == 
"matmul"); // others don't really matter, prologues are tiny - bool has_prologue = false; - if (needs_prologue) - { - if (node->as()->is_sparse_matmul()) - { - TT_ASSERT(grid.parameter_buffers.size() == 3); - has_prologue = grid.parameter_buffers[0] && grid.parameter_buffers[2]; - - if (grid.grid_shape.volume() > 40) - continue; // TODO: this cases a pipegen error of too many DRAM readers - } - else - { - TT_ASSERT(grid.parameter_buffers.size() > 1); - has_prologue = grid.parameter_buffers[1]; - } - } - - bool prologue_ok = - !needs_prologue || (has_prologue && ((std::uint32_t)grid.grid_shape.volume() >= min_prologue_volume)); - - // Check and save the pick if it's better, in the right category - // clang-format off - std::string category = - ((std::uint32_t)grid.grid_shape.r == target_rows) - ? prologue_ok ? "best" : "failed_prologue" - : prologue_ok ? "bad_rows" : "bad_rows_failed_prologue"; - // clang-format on - - /*if (execution_cycles > target_cycles) - { - // Invalid, unless we really have nothing else, in which case we'll pick the fastest - category = prologue_ok ? "too_slow" : "too_slow_failed_prologue"; - if ( (execution_cycles < closest_distance[category].first) || (closest_distance[category].first == 0)) - closest_distance[category] = current_test_pick; - } - else*/ - if (execution_cycles > closest_distance[category].first) - { - closest_distance[category] = current_test_pick; - } - - log_trace( - LogBalancer, - " Node {} grid {}: cat={}, cycles={}, closest_distance for category={}", - node->name(), - grid.grid_shape, - category, - execution_cycles, - closest_distance[category].first); - } - - // Pick the grid. TODO: failed prologue is not always worse than prologue - it only is now where dram access is - // too slow to be useful If we model the cycles with dram access accurately, we could pick no-prologue as the - // best choice - auto picked_grid = *grids.begin(); - for (std::string category : std::vector{ - "best", - "bad_rows", - "too_slow", - "failed_prologue", - "bad_rows_failed_prologue", - "too_slow_failed_prologue"}) - { - if (closest_distance[category].first != 0) - { - picked_grid = closest_distance[category].second; - break; - } - } - - graph_solver.set(node, picked_grid); - log_debug( - LogBalancer, - "Selected grid for node {} is {}, {}, cycles {}", - node->name(), - picked_grid.grid_shape, - picked_grid.t_stream_factor, - picked_grid.get_execution_cycles(config.device_config.arch_name)); - } - - return graph_solver.finish(); -} - -} // namespace tt::balancer - diff --git a/pybuda/csrc/balancer/policies/policy_cnn.hpp b/pybuda/csrc/balancer/policies/policy_cnn.hpp deleted file mode 100644 index e7a62b460..000000000 --- a/pybuda/csrc/balancer/policies/policy_cnn.hpp +++ /dev/null @@ -1,28 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include - -#include "balancer/legalizer/graph_solver.hpp" -#include "balancer/types.hpp" -#include "graph_lib/graph.hpp" -#include "graph_lib/node.hpp" - -namespace tt::balancer -{ -struct BalancerConfig; -} // namespace tt::balancer - -namespace tt::balancer -{ - -legalizer::GraphSolverSolution run_policy_cnn( - graphlib::Graph const* graph, - BalancerConfig const&, - legalizer::GraphSolver& graph_solver, - std::uint32_t target_cycles = 0); - -} // namespace tt::balancer diff --git a/pybuda/csrc/balancer/policies/policy_manager.cpp b/pybuda/csrc/balancer/policies/policy_manager.cpp deleted file mode 100644 index 7530e8d54..000000000 --- 
a/pybuda/csrc/balancer/policies/policy_manager.cpp +++ /dev/null @@ -1,700 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "balancer/policies/policy_manager.hpp" - -#include "balancer/policies/policy_utils.hpp" -#include "placer/lower_to_placer.hpp" - -using NodeType = tt::graphlib::NodeType; - -namespace tt::balancer -{ - -// Create policy manager and initialize with GS instance and schedule. -// -PolicyManager::PolicyManager( - graphlib::Graph const* graph, - BalancerConfig const& config, - legalizer::GraphSolver& graph_solver, - bool ribbon_policy) : - graph(graph), - config(config), - interactive_placer(graph, config), - interactive_placer_tester(graph, config), - op_names_to_epoch_break(config.op_names_to_epoch_break), - op_names_to_chip_break(config.op_names_to_chip_break), - ribbon_policy(ribbon_policy) -{ - graph_solver_main = std::make_unique(graph_solver); - graph_solver_epoch_snapshot = nullptr; - graph_solver_buffering_snapshot = nullptr; - traversal_context = graph_solver_main->get_graph_traversal_context(); - - std::tie(scheduled_ops, epoch_break_ops, chip_break_ops) = - policy_run_scheduler(graph, config, processed_nodes, processed_schedule, op_names_to_epoch_break, op_names_to_chip_break); - op_nodes_to_process = scheduled_ops.size(); - - if (ribbon_policy) - { - std::tie(next_ribbon_change, current_matmul_dim_r) = get_next_ribbon_change_op(graph, 0, scheduled_ops); - current_ribbon_size = pick_ribbon_size( - 0, next_ribbon_change, graph, *graph_solver_main, scheduled_ops, config.device_config.grid_size.r); - - if (current_ribbon_size == 0) - { - current_ribbon_size = 1; - } - - epoch_start_ribbon_size = current_ribbon_size; - epoch_start_matmul_dim_r = current_matmul_dim_r; - - log_debug( - LogBalancer, - "Initial ribbon size set to {}, window of ops: {}-{}", - current_ribbon_size, - 0, - next_ribbon_change); - - // Try transposing op if it doesn't fit. - // - try_transpose_op = !env_as("PYBUDA_DISABLE_RIBBON_TRANSPOSE") && config.enable_auto_transposing_placement; - } - - // Epoch snapshoting used by fj buffering or ribbon like policy for sparse-dense matmul epoch colocation. - // - if (use_interactive_fj_buffering or ribbon_policy) - { - graph_solver_epoch_snapshot = std::make_unique(*graph_solver_main); - } -} - -// SETs op_model for currently balanced op_node and performs placing on chip. -// Returns tuple (op_commited, epoch_completed, new_epoch_forced). -// If epoch is completed, you need to finish_current_epoch() to proceed. -// -std::tuple PolicyManager::commit_op(const OpModel& selected_op_model) -{ - const graphlib::BudaOpNode* op = current_op_node->as(); - log_trace(LogBalancer, "Balancing: op_node {}.", current_op_node->name()); - - bool force_new_epoch = epoch_break_ops.count(current_op_node->name()) > 0; - bool skip_op_set = false; - bool skip_op_place = false; - - bool new_epoch = - (force_new_epoch || current_epoch_type != op->get_epoch_type()) && !interactive_placer.current_epoch_empty(); - - // If transpose can help us align with current ribbon size, do it. 
- // - bool op_force_transposed = false; - std::unordered_map::iterator it; - - if (ribbon_policy and !op->is_sparse_matmul() and !buffered_op_model.has_value() and - static_cast(selected_op_model.grid_shape.r) != current_ribbon_size and - static_cast(selected_op_model.grid_shape.c) == current_ribbon_size and - interactive_placer.can_fit_on_single_epoch(selected_op_model.grid_shape.c, selected_op_model.grid_shape.r) and - interactive_placer.get_op_overrides().find(op->name()) == interactive_placer.get_op_overrides().end()) - { - std::tie(it, op_force_transposed) = - interactive_placer.get_op_overrides().emplace(op->name(), placer::PlacerOpOverride::force_op_transpose()); - } - - // Place, and figure out if it fits on the current epoch. - // - std::optional op_placement = std::nullopt; - if (!new_epoch) - { - pair_two_ops_if_possible(op, selected_op_model, op_placement, skip_op_set, skip_op_place); - - if (!skip_op_place) - { - op_placement = interactive_placer.place_op( - op->name(), - selected_op_model.grid_shape, - try_transpose_op /* enable_transpose */, - chip_break_ops.find(op->name()) != chip_break_ops.end() /* chip_break */); - } - - new_epoch = !op_placement.has_value() and !buffered_op_model; - } - - if (new_epoch && interactive_placer.current_epoch_empty()) - { - TT_THROW("Op {} doesn't fit on a single epoch", op->name()); - } - - // Make transpose op override expired so that it is not impacting rewinds(context might change and transpose is - // not wanted anymore). - // - if (op_force_transposed) - { - interactive_placer.get_op_overrides().erase(it); - } - - // Op placed, set it in graphsolver. - // - if (!new_epoch and !skip_op_set) - { - if (ribbon_policy) - { - set_op_model_for_node_ribbon(*graph_solver_main, current_op_node, selected_op_model, current_ribbon_size); - update_ribbon_size(); - } - else - { - set_op_model_for_node( - *graph_solver_main, current_op_node, selected_op_model, config.device_config.arch_name); - } - - current_epoch_ops.insert(current_op_node); - epoch_schedule.push_back(current_op_node->name()); - } - - // Return if op is commited, if current epoch is completed, and if switch to new epoch was forced. - // - return std::make_tuple(!new_epoch, new_epoch or op_index >= scheduled_ops.size(), force_new_epoch); -} - -// Try buffering and pairing two ops for optimal(atomic) placing. Currently used for sparse-dense like matmul pairs. -// -void PolicyManager::pair_two_ops_if_possible( - const graphlib::BudaOpNode* op, - const OpModel& selected_op_model, - std::optional& op_placement, - bool& skip_op_set, - bool& skip_op_place) -{ - // If we have sparse matmul followed by a pairable op(dense/depthwise matmul, reduce max) - // buffer it for atomic placement in next iteration. 
- // - if (ribbon_policy and op->is_sparse_matmul() and op_index < scheduled_ops.size()) - { - graphlib::Node* next_node = graph->get_node_by_name(scheduled_ops[op_index]); - if (next_node->node_type() == NodeType::kBudaOp and epoch_break_ops.count(scheduled_ops[op_index]) == 0 and - chip_break_ops.find(next_node->name()) == chip_break_ops.end()) - { - const graphlib::BudaOpNode* dense_matmul_op = static_cast(next_node); - if (dense_matmul_op->should_pair_with_sparse(op, graph)) - { - TT_ASSERT(!buffered_op_model.has_value()); - graph_solver_pairing_checkpoint = std::make_unique(*graph_solver_main); - set_op_model_for_node_ribbon( - *graph_solver_main, current_op_node, selected_op_model, current_ribbon_size); - buffered_op_model = selected_op_model; - skip_op_set = true; - skip_op_place = true; - } - } - } - // We have a buffered op_model check compatibility with current selected one. - // - else if (buffered_op_model.has_value()) - { - // Check if buffered + selected can fit on single epoch. - // - if (can_fit_on_single_epoch( - interactive_placer_tester, - buffered_op_model->buda_op_node->name(), - buffered_op_model->grid_shape, - op->name(), - selected_op_model.grid_shape, - try_transpose_op)) - { - skip_op_place = true; - skip_op_set = true; - - // Rowsize matches, place them as bound pair. - // - if (selected_op_model.grid_shape.r == buffered_op_model->grid_shape.r and - interactive_placer.can_fit_on_single_epoch( - buffered_op_model->grid_shape.r, - buffered_op_model->grid_shape.c + selected_op_model.grid_shape.c, - try_transpose_op /* allow_transpose */)) - { - op_placement = interactive_placer.place_two_ops_rowwise( - buffered_op_model->buda_op_node->name(), - buffered_op_model->grid_shape, - op->name(), - selected_op_model.grid_shape, - try_transpose_op, /* enable_transpose */ - chip_break_ops.find(op->name()) != chip_break_ops.end() /* chip_break */ - ); - - if (op_placement.has_value()) - { - current_epoch_ops.insert(buffered_op_model->buda_op_node); - epoch_schedule.push_back(buffered_op_model->buda_op_node->name()); - set_op_model_for_node_ribbon( - *graph_solver_main, current_op_node, selected_op_model, current_ribbon_size); - update_ribbon_size(); - current_epoch_ops.insert(op); - epoch_schedule.push_back(op->name()); - } - } - // Rowsize does not match. Still try to place them next to each other in a single epoch. - // - else - { - op_placement = interactive_placer.place_op( - buffered_op_model->buda_op_node->name(), - buffered_op_model->grid_shape, - try_transpose_op /* enable_transpose */, - chip_break_ops.find(op->name()) != chip_break_ops.end() /* chip_break */); - - if (op_placement.has_value()) - { - op_placement = interactive_placer.place_op( - op->name(), - selected_op_model.grid_shape, - try_transpose_op /* enable_transpose */, - false /* chip_break */); - - if (op_placement.has_value()) - { - current_epoch_ops.insert(buffered_op_model->buda_op_node); - epoch_schedule.push_back(buffered_op_model->buda_op_node->name()); - set_op_model_for_node_ribbon( - *graph_solver_main, current_op_node, selected_op_model, current_ribbon_size); - update_ribbon_size(); - current_epoch_ops.insert(op); - epoch_schedule.push_back(op->name()); - skip_op_set = true; - } - else - { - // Revert buffered op placement as paired op placement failed. We dont want them in separate - // epochs. - // - interactive_placer.rewind_to(buffered_op_model->buda_op_node->name()); - } - } - } - } - // If buffered + selected cannot fit on single epoch - // we need to back out and place them separately. 
Buffered first with currently selected one coming after in - // regular independent placement. - // - else - { - // Place only buffered one. - // - op_placement = interactive_placer.place_op( - buffered_op_model->buda_op_node->name(), - buffered_op_model->grid_shape, - try_transpose_op /* enable_transpose */, - chip_break_ops.find(op->name()) != chip_break_ops.end() /* chip_break */); - - if (op_placement.has_value()) - { - current_epoch_ops.insert(buffered_op_model->buda_op_node); - epoch_schedule.push_back(buffered_op_model->buda_op_node->name()); - } - } - - if (!op_placement.has_value()) - { - // Paired placement failed, revert to prebuffering pairing checkpoint. - // - traversal_context.reset(); - graph_solver_main = std::make_unique(*graph_solver_pairing_checkpoint); - traversal_context = graph_solver_main->get_graph_traversal_context(); - skip_op_set = true; - skip_op_place = true; - op_index--; - } - - buffered_op_model.reset(); - graph_solver_pairing_checkpoint = nullptr; - } -} - -// Check for ribbon size changes. -// -void PolicyManager::update_ribbon_size() -{ - TT_ASSERT(ribbon_policy); - - if (op_index == next_ribbon_change and op_index > 0 and op_index < scheduled_ops.size()) - { - std::tie(next_ribbon_change, current_matmul_dim_r) = get_next_ribbon_change_op(graph, op_index, scheduled_ops); - std::uint32_t next_ribbon_size = pick_ribbon_size( - op_index, next_ribbon_change, graph, *graph_solver_main, scheduled_ops, config.device_config.grid_size.r); - if (next_ribbon_change < scheduled_ops.size()) - log_debug(LogBalancer, "Next change at {}", scheduled_ops[next_ribbon_change]); - - // Force epoch change if ribbon size changes. In the future, we can handle this with a queue, or padding. - if ((current_ribbon_size != next_ribbon_size) && !interactive_placer.current_epoch_empty()) - { - const graphlib::Node* node = graph->get_node_by_name(scheduled_ops[op_index]); - TT_ASSERT(node->node_type() == NodeType::kBudaOp); - cut_graph_solver_ribbon(graph, node, interactive_placer, *graph_solver_main); - - current_ribbon_size = next_ribbon_size; - log_debug(LogBalancer, "Changing current ribbon size to {} at op {}", current_ribbon_size, node->name()); - } - } -} - -// Finish current epoch. Performs inline fork-join buffering if enabled. -// Returns true if epoch is finished. If balancing is not complete, a new epoch will be auto-started. -// Returns false if epoch is rewound due to buffering. State/counters/current_op_node are reset to current epoch start. -// -bool PolicyManager::finish_current_epoch() -{ - TT_ASSERT(!interactive_placer.current_epoch_empty(), "Cannot finish empty epoch!"); - bool balancing_complete = op_index >= scheduled_ops.size() and current_epoch_ops.count(current_op_node) > 0; - if (use_interactive_fj_buffering) - { - // Handle case when current epoch overflows to next one due to buffering. - // - if (!balancing_complete and !pre_buffering_epoch_ops.empty()) - { - bool buffered_epoch_overflow = handle_epoch_buffering_overflow(); - if (buffered_epoch_overflow) - return false; - } - - // If we are at the end of current epoch try buffering fork joins.
- // - bool epoch_buffered = buffer_epoch(); - if (epoch_buffered) - return false; - } - - if (!balancing_complete) - { - start_new_epoch(current_op_node->as()->get_epoch_type()); - Logger::get().log_level_type( - Logger::Level::Info, - LogBalancer, - "Balancing {}% complete.", - processed_nodes.size() * 100 / op_nodes_to_process); - } - else - { - processed_nodes.insert(current_epoch_ops.begin(), current_epoch_ops.end()); - processed_schedule.insert(processed_schedule.end(), epoch_schedule.begin(), epoch_schedule.end()); - epoch_schedule.clear(); - current_epoch_ops.clear(); - TT_ASSERT(processed_nodes.size() == op_nodes_to_process, "Not all nodes were processed!"); - Logger::get().log_level_type( - Logger::Level::Info, LogBalancer, "Balancing 100% completed!"); - } - - return true; -} - -// Starts new epoch with incoming op epoch type. Update and reset epoch related variables and counters. -// -void PolicyManager::start_new_epoch(graphlib::NodeEpochType epoch_type) -{ - last_epoch_start = op_index - 1; - cut_graph_solver_epoch(graph, interactive_placer, *graph_solver_main); - current_epoch_type = epoch_type; - interactive_placer.next_epoch(current_epoch_type); - if (use_interactive_fj_buffering or ribbon_policy) - { - // Starting new epoch, make graph solver snapshot, record processed nodes, - // clear epoch overflow, clear previous epoch nodes and clear buffering instructions. - // - if (overflow_set_for_epoch) - { - op_names_to_epoch_break.pop_back(); - epoch_break_ops = placer::lowering::tag_ops_for_epoch_break( - config.device_config.arch_name, - op_names_to_epoch_break, - op_names_to_chip_break, - scheduled_ops, - graph, - true /* use_interactive_placer */); - } - - overflow_set_for_epoch = false; - graph_solver_epoch_snapshot = std::make_unique(*graph_solver_main); - graph_solver_buffering_snapshot = nullptr; - pre_buffering_epoch_ops.clear(); - processed_nodes.insert(current_epoch_ops.begin(), current_epoch_ops.end()); - processed_schedule.insert(processed_schedule.end(), epoch_schedule.begin(), epoch_schedule.end()); - current_epoch_ops.clear(); - epoch_schedule.clear(); - inst.clear(); - - if (ribbon_policy) - { - epoch_start_ribbon_size = current_ribbon_size; - epoch_start_matmul_dim_r = current_matmul_dim_r; - } - } - - op_index--; - - // Start new epoch, place op again. - // - log_debug(LogBalancer, "Starting new epoch"); -} - -// Buffer current epoch. Returns true if epoch was buffered/graph was changed. -// -bool PolicyManager::buffer_epoch() -{ - graphlib::Graph* graph_modify = const_cast(graph); - OpModels* op_models = graph_solver_main->get_selected_op_models_for_buffering(current_epoch_ops); - std::unordered_set current_epoch_nodes = - calculate_current_epoch_nodes(graph, current_epoch_ops); - FJBufferingResult fj_buffering; - - { - // Generate buffering instructions if this epoch needs buffering. - // We are scoping down FJ buffering algorithm to subgraph by setting GraphTraversalContext - // to current epoch nodes. - // - std::unique_ptr epoch_traversal_context = - graph_solver_main->get_graph_epoch_traversal_context(¤t_epoch_nodes); - fj_buffering = insert_fork_join_buffering( - graph_modify, - nullptr /* postplacer op models */, - op_models, - config.device_config.get_l1_usable_size(), - prev_inst, - config.fork_join_tiles_treshold, - ribbon_policy ? 
&ribbon_buffering_factor : [](const tt::balancer::OpModel&) { return 1; }); - } - - inst = fj_buffering.instructions; - if (!std::get<0>(is_subset_of_instructions(inst, prev_inst))) - { - // We need to buffer, so we need to rewind the epoch and place again with buffer nodes. - // Revert graphsolver to snapshot. Release old traversal context. - // - bool graph_modified = false; - interactive_placer.rewind_epoch(); - traversal_context.reset(); - - // If we are buffering this epoch for the first time, save snapshot of current epoch nodes. - // - if (pre_buffering_epoch_ops.empty()) - { - pre_buffering_epoch_ops.insert( - pre_buffering_epoch_ops.end(), current_epoch_ops.begin(), current_epoch_ops.end()); - } - - graph_solver_main = std::make_unique( - graph_solver_buffering_snapshot ? *graph_solver_buffering_snapshot : *graph_solver_epoch_snapshot); - { - // Operate only within current epoch nodes. - // - std::unique_ptr epoch_traversal_context = - graph_solver_main->get_graph_epoch_traversal_context(¤t_epoch_nodes); - graph_modified = buffer_graph(graph_modify, inst, *graph_solver_main); - } - - // Reset current epoch nodes and traversal context to old state(snapshot). - // - current_epoch_ops.clear(); - epoch_schedule.clear(); - traversal_context = graph_solver_main->get_graph_traversal_context(); - if (graph_modified) - { - // If we added new non queue nodes we need to rerun scheduler. - // Make scheduler ignore already processed nodes. - // - std::tie(scheduled_ops, epoch_break_ops, chip_break_ops) = - policy_run_scheduler(graph, config, processed_nodes, processed_schedule, op_names_to_epoch_break, op_names_to_chip_break); - op_nodes_to_process = scheduled_ops.size() + processed_nodes.size(); - op_index = 0; - last_epoch_start = 0; - } - else - { - // No new nodes added, continue from last epoch start. - // - op_index = last_epoch_start; - } - - if (ribbon_policy) - { - std::tie(next_ribbon_change, current_matmul_dim_r) = - get_next_ribbon_change_op(graph, op_index, scheduled_ops, epoch_start_matmul_dim_r); - current_ribbon_size = epoch_start_ribbon_size; - } - - // Record new snapshot and cache buffering instructions for next buffering cycle. - // - graph_solver_buffering_snapshot = std::make_unique(*graph_solver_main); - return true; - } - - return false; -} - -// In case buffering causes current epoch to overflow, cut graph before the overflow and retry epoch with new -// buffering(likely fewer buffers as smaller number of true OPs remain). -// -bool PolicyManager::handle_epoch_buffering_overflow() -{ - // Record all ops which were present in one epoch before buffering but overflowed for the current epoch. - // - scheduler::Schedule overflowed_ops; - for (const Node* node : pre_buffering_epoch_ops) - { - if (current_epoch_ops.count(node) == 0) - { - overflowed_ops.push_back(node->name()); - } - } - - if (!overflowed_ops.empty()) - { - // If we have already set an overflow epoch break for this epoch, remove it. - // Due to additional buffering it turns out that we need to cut earlier. - // - if (overflow_set_for_epoch) - { - op_names_to_epoch_break.pop_back(); - } - - // Mark these nodes as set of epoch break nodes. This will effectively resolve fork-join buffering for - // this path with E2E queue(if one of these nodes was indeed part of the fork join). - // - op_names_to_epoch_break.push_back(overflowed_ops); - overflow_set_for_epoch = true; - - // Rewind the epoch, reset state of all counters, revert buffering, reschedule and try again. 
- // - rewind_epoch(); - - return true; - } - - return false; -} - -// Get next OP to balance. -// -const graphlib::Node* PolicyManager::get_next_op() -{ - if (op_index >= scheduled_ops.size()) - { - current_op_node = nullptr; - } - else - { - current_op_node = graph->get_node_by_name(scheduled_ops[op_index++]); - TT_ASSERT(current_op_node->node_type() == NodeType::kBudaOp); - } - - return current_op_node; -} - -// Rewinds epoch in progress from interactive placer. Reverts GS state to epoch start snapshot. -// Resets all epoch related counters. -// -void PolicyManager::rewind_epoch() -{ - TT_ASSERT(graph_solver_epoch_snapshot != nullptr, "Cannot rewind epoch without snapshot!"); - interactive_placer.rewind_epoch(); - traversal_context.reset(); - graph_solver_main = std::make_unique(*graph_solver_epoch_snapshot); - pre_buffering_epoch_ops.clear(); - current_epoch_ops.clear(); - epoch_schedule.clear(); - buffered_op_model.reset(); - graph_solver_pairing_checkpoint = nullptr; - traversal_context = graph_solver_main->get_graph_traversal_context(); - - // If epoch was buffered or overflowed(epoch break was set), we need to reschedule. - // - if (graph_solver_buffering_snapshot or overflow_set_for_epoch) - { - std::tie(scheduled_ops, epoch_break_ops, chip_break_ops) = - policy_run_scheduler(graph, config, processed_nodes, processed_schedule, op_names_to_epoch_break, op_names_to_chip_break); - op_nodes_to_process = scheduled_ops.size() + processed_nodes.size(); - - if (ribbon_policy) - { - std::tie(next_ribbon_change, current_matmul_dim_r) = - get_next_ribbon_change_op(graph, 0, scheduled_ops, epoch_start_matmul_dim_r); - } - - op_index = 0; - last_epoch_start = 0; - } - else - { - op_index = last_epoch_start; - - if (ribbon_policy) - { - std::tie(next_ribbon_change, current_matmul_dim_r) = - get_next_ribbon_change_op(graph, last_epoch_start, scheduled_ops, epoch_start_matmul_dim_r); - } - } - - if (ribbon_policy) - { - current_ribbon_size = epoch_start_ribbon_size; - } - - graph_solver_buffering_snapshot = nullptr; - inst.clear(); -} - -// Force current epoch to break at specified OP. Will automatically rewind current epoch so that new epoch break could -// be applied. -// Returns true if epoch break was successful. -// -bool PolicyManager::force_current_epoch_break(const std::string& op_name) -{ - // Can't break epoch on the first op. - // - if (scheduled_ops[last_epoch_start] == op_name) - { - return false; - } - - // If we have already set an overflow epoch break for this epoch, remove it. - // - if (overflow_set_for_epoch) - { - op_names_to_epoch_break.pop_back(); - } - - scheduler::Schedule current_epoch_break; - current_epoch_break.push_back(op_name); - op_names_to_epoch_break.push_back(current_epoch_break); - overflow_set_for_epoch = true; - - // Rewind the epoch, reset state of all counters, revert buffering, reschedule and try again. - // - rewind_epoch(); - - return true; -} - -// Commit and validate interactive placer solution. -// -tt::placer::PlacerSolution PolicyManager::commit_solution() -{ - if (use_interactive_fj_buffering or ribbon_policy) - { - // If we used fork join buffering, we rerun scheduler more than once so in order to validate - // we need to reconstruct scheduled_ops. 
- // - std::unordered_set empty_set_processed_nodes; - tt::scheduler::Schedule empty_processed_schedule; - // op_names_to_epoch_break and op_names_to_chip_break are empty by now because processed nodes are removed as the placement was done - // we want original epoch_breaks and chip_breaks, hence, we pass in config.op_names_to_epoch_break and config.op_names_to_chip_break - std::tie(scheduled_ops, epoch_break_ops, chip_break_ops) = - policy_run_scheduler(graph, config, empty_set_processed_nodes, empty_processed_schedule, config.op_names_to_epoch_break, config.op_names_to_chip_break); - } - - tt::placer::PlacerSolution placer_solution = interactive_placer.commit(chip_break_ops); - placer_solution.fork_join_buffered = use_interactive_fj_buffering; - - validate_solution(scheduled_ops, placer_solution); - - return placer_solution; -} - -} // namespace tt::balancer diff --git a/pybuda/csrc/balancer/policies/policy_manager.hpp b/pybuda/csrc/balancer/policies/policy_manager.hpp deleted file mode 100644 index c2a3dc305..000000000 --- a/pybuda/csrc/balancer/policies/policy_manager.hpp +++ /dev/null @@ -1,112 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "balancer/legalizer/graph_solver.hpp" -#include "passes/fork_join.hpp" -#include "placer/interactive_placer.hpp" - -namespace tt::balancer -{ - -// Class that decouples interactive placer and graphsolver from balancer policy logic. -// Op placement, epoch switches and inline fork-join buffering are handled by PolicyManager. -// -class PolicyManager -{ - graphlib::Graph const* graph; - BalancerConfig config; - placer::InteractivePlacer interactive_placer; - placer::InteractivePlacer interactive_placer_tester; - std::unordered_set epoch_break_ops; - std::unordered_set chip_break_ops; - scheduler::Schedule scheduled_ops; - graphlib::NodeEpochType current_epoch_type = NodeEpochType::Forward; - std::uint32_t last_epoch_start = 0; - std::uint32_t op_index = 0; - std::uint32_t op_nodes_to_process = 0; - std::unordered_set current_epoch_ops; - std::vector pre_buffering_epoch_ops; - tt::ordered_map, InsInstructionUniqueIdHash> inst; - const tt::ordered_map, InsInstructionUniqueIdHash> - prev_inst; - std::unordered_set processed_nodes; - tt::scheduler::Schedule processed_schedule; - tt::scheduler::Schedule epoch_schedule; - std::vector op_names_to_epoch_break; - std::vector op_names_to_chip_break; - bool overflow_set_for_epoch = false; - bool use_interactive_fj_buffering = !env_as("PYBUDA_DISABLE_INTERACTIVE_FJ_BUFFERING", false); - std::unique_ptr graph_solver_main; - std::unique_ptr graph_solver_epoch_snapshot; - std::unique_ptr graph_solver_buffering_snapshot; - std::unique_ptr graph_solver_pairing_checkpoint; - std::unique_ptr traversal_context; - std::optional buffered_op_model; - const Node* current_op_node = nullptr; - bool try_transpose_op = true; - - // Section for ribbon like policies. - // - bool ribbon_policy = false; - - // Number of rows in the ribbon. Should only change when matmul with a different R dim is encountered. - // Scheduler needs to ensure that we don't go to lower res and then jump back to bigger, wherever possible to avoid - // it. 
- std::uint32_t current_ribbon_size = 0; - std::uint32_t current_matmul_dim_r = 0; - std::uint32_t epoch_start_matmul_dim_r = 0; - std::uint32_t next_ribbon_change = 0; - std::uint32_t epoch_start_ribbon_size = 0; - void update_ribbon_size(); - - bool buffer_epoch(); - bool handle_epoch_buffering_overflow(); - void start_new_epoch(graphlib::NodeEpochType epoch_type); - void pair_two_ops_if_possible( - const graphlib::BudaOpNode* op, - const OpModel& selected_op_model, - std::optional& op_placement, - bool& op_already_set, - bool& skip_placing); - - public: - PolicyManager( - graphlib::Graph const* graph, - BalancerConfig const& config, - legalizer::GraphSolver& graph_solver, - bool ribbon_policy = false); - - // Main interfaces. - // - const graphlib::Node* get_next_op(); - std::tuple commit_op(const OpModel& selected_op_model); - bool finish_current_epoch(); - tt::placer::PlacerSolution commit_solution(); - void rewind_epoch(); - bool force_current_epoch_break(const std::string& op_name); - - // Graph solver interface. - // - legalizer::GraphSolver::RemainingOpModels at(const tt::graphlib::Node* node) const - { - return graph_solver_main->at(node); - } - legalizer::GraphSolverSolution finish() { return graph_solver_main->finish(); } - void invalidate_suboptimal_op_models(int invalidation_strategy) - { - graph_solver_main->invalidate_suboptimal_op_models(invalidation_strategy); - } - - // Simple getters. - // - std::uint32_t get_current_ribbon_size() const { return current_ribbon_size; } - std::uint32_t get_current_epoch_index() const { return interactive_placer.get_current_epoch_index(); } - std::uint32_t get_current_epoch_size() const { return interactive_placer.current_epoch_size(); } - - // Currently balanced OP. - // - const graphlib::Node* get_current_op() { return current_op_node; } -}; -} // namespace tt::balancer diff --git a/pybuda/csrc/balancer/policies/policy_maximize_t_minimize_grid.cpp b/pybuda/csrc/balancer/policies/policy_maximize_t_minimize_grid.cpp deleted file mode 100644 index 128e75ece..000000000 --- a/pybuda/csrc/balancer/policies/policy_maximize_t_minimize_grid.cpp +++ /dev/null @@ -1,69 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "balancer/policies/policy_maximize_t_minimize_grid.hpp" - -#include "balancer/balancer.hpp" -#include "utils/logger.hpp" - -using Graph = tt::graphlib::Graph; -using Node = tt::graphlib::Node; -using NodeType = tt::graphlib::NodeType; -using Edge = tt::graphlib::Edge; -using DataFormat = tt::DataFormat; - -namespace tt::balancer -{ -template -void run_policy_maximize_t_minimize_grid(Graph const* graph, legalizer::GraphSolver& graph_solver, Fn fn) -{ - for (Node* node : tt::graphlib::topological_sort(*graph, fn)) - { - if (node->node_type() != NodeType::kBudaOp) - continue; - - auto legal_op_models = graph_solver.at(node); - std::vector op_models(legal_op_models.begin(), legal_op_models.end()); - std::sort( - op_models.begin(), - op_models.end(), - [](OpModel const& a, OpModel const& b) -> bool - { - if (a.t_stream_factor.t() == b.t_stream_factor.t()) - { - int perimeter_a = a.grid_shape.r + a.grid_shape.c; - int perimeter_b = b.grid_shape.r + b.grid_shape.c; - - if (perimeter_a == perimeter_b) - { - return a.grid_shape.r < b.grid_shape.r; - } - - return perimeter_a < perimeter_b; - } - - return a.t_stream_factor.t() > b.t_stream_factor.t(); - }); - graph_solver.set(node, op_models.front()); - log_debug(LogBalancer, "Selected max t min grid for node: {}", node->name()); 
- log_debug(LogBalancer, " {} {}", op_models.front().grid_shape, op_models.front().t_stream_factor); - } -} - -legalizer::GraphSolverSolution run_policy_maximize_t_minimize_grid( - Graph const* graph, BalancerConfig const&, legalizer::GraphSolver& graph_solver) -{ - std::string node_name_filter = env_as("PYBUDA_BALANCER_MAXIMIZE_T_FILTER"); - auto filter = [&node_name_filter](Node const* n) - { - if (node_name_filter.empty()) - return true; - return n->name().find(node_name_filter) != std::string::npos; - }; - auto not_filter = [filter](Node const* n) { return not filter(n); }; - run_policy_maximize_t_minimize_grid(graph, graph_solver, filter); - run_policy_maximize_t_minimize_grid(graph, graph_solver, not_filter); - return graph_solver.finish(); -} - -} // namespace tt::balancer diff --git a/pybuda/csrc/balancer/policies/policy_maximize_t_minimize_grid.hpp b/pybuda/csrc/balancer/policies/policy_maximize_t_minimize_grid.hpp deleted file mode 100644 index 88e6a3ace..000000000 --- a/pybuda/csrc/balancer/policies/policy_maximize_t_minimize_grid.hpp +++ /dev/null @@ -1,24 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include - -#include "balancer/legalizer/graph_solver.hpp" -#include "balancer/types.hpp" -#include "graph_lib/graph.hpp" -#include "graph_lib/node.hpp" - -namespace tt::balancer -{ -struct BalancerConfig; -} // namespace tt::balancer - -namespace tt::balancer -{ -legalizer::GraphSolverSolution run_policy_maximize_t_minimize_grid( - graphlib::Graph const* graph, BalancerConfig const&, legalizer::GraphSolver& graph_solver); - -} // namespace tt::balancer diff --git a/pybuda/csrc/balancer/policies/policy_minimize_grid.cpp b/pybuda/csrc/balancer/policies/policy_minimize_grid.cpp deleted file mode 100644 index d4fe36245..000000000 --- a/pybuda/csrc/balancer/policies/policy_minimize_grid.cpp +++ /dev/null @@ -1,43 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "balancer/policies/policy_minimize_grid.hpp" - -#include "balancer/balancer.hpp" -#include "utils/logger.hpp" - -using Graph = tt::graphlib::Graph; -using Node = tt::graphlib::Node; -using NodeType = tt::graphlib::NodeType; -using Edge = tt::graphlib::Edge; -using DataFormat = tt::DataFormat; - -namespace tt::balancer { -legalizer::GraphSolverSolution run_policy_minimize_grid(Graph const* graph, BalancerConfig const&, legalizer::GraphSolver& graph_solver) -{ - for (Node* node : tt::graphlib::topological_sort(*graph)) { - if (node->node_type() != NodeType::kBudaOp) - continue; - - auto legal_op_models = graph_solver.at(node); - std::vector op_models(legal_op_models.begin(), legal_op_models.end()); - std::sort( - op_models.begin(), - op_models.end(), - [](OpModel const& a, OpModel const& b) -> bool - { - int perimeter_a = a.grid_shape.r + a.grid_shape.c; - int perimeter_b = b.grid_shape.r + b.grid_shape.c; - if (perimeter_a == perimeter_b) - return a.grid_shape.r < b.grid_shape.r; - return perimeter_a < perimeter_b; - }); - graph_solver.set(node, op_models.front()); - log_debug(LogBalancer, "Selected minimum grid for node: {}", node->name()); - log_debug(LogBalancer, " {} {}", op_models.front().grid_shape, op_models.front().t_stream_factor); - } - - return graph_solver.finish(); -} - -} // namespace tt::balancer diff --git a/pybuda/csrc/balancer/policies/policy_minimize_grid.hpp b/pybuda/csrc/balancer/policies/policy_minimize_grid.hpp deleted file mode 100644 index 
039828ce7..000000000 --- a/pybuda/csrc/balancer/policies/policy_minimize_grid.hpp +++ /dev/null @@ -1,22 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include - -#include "balancer/legalizer/graph_solver.hpp" -#include "balancer/types.hpp" -#include "graph_lib/graph.hpp" -#include "graph_lib/node.hpp" - -namespace tt::balancer { -struct BalancerConfig; -} // namespace tt::balancer - -namespace tt::balancer { -legalizer::GraphSolverSolution run_policy_minimize_grid( - graphlib::Graph const* graph, BalancerConfig const&, legalizer::GraphSolver& graph_solver); - -} // namespace tt::balancer diff --git a/pybuda/csrc/balancer/policies/policy_nlp.cpp b/pybuda/csrc/balancer/policies/policy_nlp.cpp deleted file mode 100644 index 426301c2e..000000000 --- a/pybuda/csrc/balancer/policies/policy_nlp.cpp +++ /dev/null @@ -1,209 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "balancer/policies/policy_nlp.hpp" - -#include -#include - -#include "balancer/policies/policy_utils.hpp" -#include "graph_lib/node_types.hpp" -#include "utils/logger.hpp" - -using Graph = tt::graphlib::Graph; -using Node = tt::graphlib::Node; -using NodeType = tt::graphlib::NodeType; -using Edge = tt::graphlib::Edge; -using DataFormat = tt::DataFormat; - -namespace tt::balancer -{ - -// Return balancing target_cycles, i.e. the number of cycles all ops should be below, but as close as possible. -std::uint32_t calculate_target_cycles( - graphlib::Graph const* graph, legalizer::GraphSolver& graph_solver, std::string const& arch_name) -{ - auto topo_sort = tt::graphlib::topological_sort(*graph); - - // Get min prologue volume that fits for each parameter - std::unordered_map min_param_grid_volume = - find_min_prologue_volumes(graph, topo_sort, graph_solver); - - return get_matmul_target_cycles(graph, topo_sort, graph_solver, min_param_grid_volume, arch_name); -} - -legalizer::GraphSolverSolution run_policy_nlp( - graphlib::Graph const* graph, - BalancerConfig const& config, - legalizer::GraphSolver& graph_solver, - std::uint32_t target_cycles) -{ - (void)config; - log_debug(LogBalancer, "Starting NLP balancing."); - std::vector topo_sort = tt::graphlib::topological_sort(*graph); - - // Get min prologue volume that fits for each parameter - std::unordered_map min_param_grid_volume = - find_min_prologue_volumes(graph, topo_sort, graph_solver); - - if (target_cycles == 0) - { - if (auto manual_target_cycles = env_as_optional("PYBUDA_NLP_MANUAL_TARGET")) - { - target_cycles = *manual_target_cycles; - log_info(LogBalancer, "Manual override of target cycles to {}", target_cycles); - } - else - { - target_cycles = get_matmul_target_cycles( - graph, topo_sort, graph_solver, min_param_grid_volume, config.device_config.arch_name); - } - } - - bool skip_small_ukt = env_as("PYBUDA_SKIP_SMALL_UKT", false); - - // Pick OpModel for each node. 
- // - for (Node* node : topo_sort) - { - if (node->node_type() != NodeType::kBudaOp) - continue; - - const graphlib::BudaOpNode* op = node->as(); - std::string op_type = op->op_type().op; - std::uint32_t min_prologue_volume = 0; // min volume needed to remain prologue after other checks - auto it = min_param_grid_volume.end(); - if (graph->data_operands(node).size() > 1) - it = min_param_grid_volume.find(graph->data_operands(node)[1]); - - if (it != min_param_grid_volume.end()) - min_prologue_volume = it->second; - - // Find the actual smallest grid, with matching target rows, if possible - auto op_models = graph_solver.at(node); - const OpShape& op_shape = (*graph_solver.at(node).begin()).op_shape; - std::uint32_t target_rows = std::uint32_t(op_shape.outputs.at(0).rt / 6); - if (target_rows == 0) - target_rows = 1; - - using pick = std::pair; - std::unordered_map closest_distance; - pick default_pick = {0, *op_models.begin()}; - closest_distance["best"] = default_pick; - closest_distance["failed_prologue"] = default_pick; - closest_distance["bad_rows"] = default_pick; - closest_distance["bad_rows_failed_prologue"] = default_pick; - closest_distance["too_slow"] = default_pick; - closest_distance["too_slow_failed_prologue"] = default_pick; - for (auto op_model : op_models) - { - std::uint32_t execution_cycles = op_model.get_execution_cycles(config.device_config.arch_name); - - if ((op_type == "matmul") && skip_small_ukt) - { - balancer::BlockShape input0_block_shape = op_model.input_buffers[0].block_shape; - int u_kt = input0_block_shape.ublock.ct; - int m_k = op_model.op_shape.inputs[0].ct / input0_block_shape.ublock.ct; - - if ((u_kt < 4) && (m_k > 1)) - continue; // Skip bad u_kt settings. TODO: have a second pass that disables this if nothing is - // found - } - - pick current_test_pick = {execution_cycles, op_model}; - - bool needs_prologue = (op_type == "matmul") && // others don't really matter, prologues are tiny - // it needs a prologue if there's either a dram or parameter - // buffer for the second operand - (((op_model.parameter_buffers.size() > 1) && op_model.parameter_buffers[1]) || - ((op_model.dram_buffers.size() > 1) && op_model.dram_buffers[1])); - - // if ( (op_type == "matmul") && !needs_prologue) - if ((op_type == "matmul") && node->as()->is_gradient_op()) - { - // Matmul with two non-prologue operands, it's going to be slower than usual - // execution_cycles *= 2; - } - - bool has_prologue = false; - - if (needs_prologue) - { - if (node->as()->is_sparse_matmul()) - { - TT_ASSERT(op_model.parameter_buffers.size() == 3); - has_prologue = op_model.parameter_buffers[0] && op_model.parameter_buffers[2]; - } - else - { - TT_ASSERT(op_model.parameter_buffers.size() > 1); - has_prologue = op_model.parameter_buffers[1]; - } - } - - bool prologue_ok = !needs_prologue || - (has_prologue && ((std::uint32_t)op_model.grid_shape.volume() >= min_prologue_volume)); - - // Check and save the pick if it's better, in the right category - - // Matching target rows - or target columns, in which case we can transpose the op on placement - // For now, let's not transpose matmuls, that could get dangerous. - bool matching_rows = ((std::uint32_t)op_model.grid_shape.r == target_rows) || - ((op_model.grid_shape.c < op_model.grid_shape.r) && - ((std::uint32_t)op_model.grid_shape.c == target_rows) && (op_type != "matmul")); - - // clang-format off - std::string category = - matching_rows ? - prologue_ok ? "best" : "failed_prologue" - : prologue_ok ? 
"bad_rows" : "bad_rows_failed_prologue"; - // clang-format on - - if (execution_cycles > target_cycles) - { - // Invalid, unless we really have nothing else, in which case we'll pick the fastest - category = prologue_ok ? "too_slow" : "too_slow_failed_prologue"; - if ((execution_cycles < closest_distance[category].first) || (closest_distance[category].first == 0)) - closest_distance[category] = current_test_pick; - } - else if (execution_cycles > closest_distance[category].first) - { - closest_distance[category] = current_test_pick; - } - - log_trace( - LogBalancer, - " Node {} grid {}: cat={}, cycles={}, closest_distance for category={}", - node->name(), - op_model.grid_shape, - category, - execution_cycles, - closest_distance[category].first); - } - - // Pick the grid. TODO: failed prologue is not always worse than prologue - it only is now where dram access is - // too slow to be useful If we model the cycles with dram access accurately, we could pick no-prologue as the - // best choice - auto selected_op_model = *op_models.begin(); - for (std::string category : std::vector{ - "best", - "bad_rows", - "too_slow", - "failed_prologue", - "bad_rows_failed_prologue", - "too_slow_failed_prologue"}) - { - if (closest_distance[category].first != 0) - { - selected_op_model = closest_distance[category].second; - break; - } - } - - set_op_model_for_node(graph_solver, node, selected_op_model, config.device_config.arch_name); - } - - return graph_solver.finish(); -} - -} // namespace tt::balancer diff --git a/pybuda/csrc/balancer/policies/policy_nlp.hpp b/pybuda/csrc/balancer/policies/policy_nlp.hpp deleted file mode 100644 index 4ae076219..000000000 --- a/pybuda/csrc/balancer/policies/policy_nlp.hpp +++ /dev/null @@ -1,248 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include - -#include "balancer/legalizer/graph_solver.hpp" -#include "balancer/types.hpp" -#include "graph_lib/graph.hpp" -#include "graph_lib/node.hpp" - -namespace tt::balancer -{ -struct BalancerConfig; -} // namespace tt::balancer - -namespace tt::balancer -{ - -legalizer::GraphSolverSolution run_policy_nlp( - graphlib::Graph const* graph, - BalancerConfig const&, - legalizer::GraphSolver& graph_solver, - std::uint32_t target_cycles = 0); - -legalizer::GraphSolverSolution run_policy_nlp_v2( - graphlib::Graph const* graph, - BalancerConfig const& config, - legalizer::GraphSolver& graph_solver, - std::optional& placer_solution, - std::uint32_t target_cycles = 0); - -std::uint32_t calculate_target_cycles( - graphlib::Graph const* graph, legalizer::GraphSolver& graph_solver, std::string const& arch_name); - -template -std::unordered_map find_min_prologue_volumes( - graphlib::Graph const* graph, const std::vector& topo_sort, T& graph_solver) -{ - // Until back-end can reblock on prologue, we have to ensure that shared parameters (i.e. parameters read by - // multiple matmuls) are reads by ops with the same grid sizes. 
Otherwise, the queue will have to be blocked for the - // smaller grid, and then the op that needs a bigger grid will no longer fit in L1 - - std::unordered_map min_param_grid_volume; - - // Search for matmul parameters - for (graphlib::Node* node : topo_sort) - { - if ((node->node_type() != graphlib::NodeType::kBudaOp) || - (node->as()->op_type().op != "matmul")) - continue; - - // Find minimum valid for prologue - auto grids = graph_solver.at(node); - bool found_prologue = false; - std::uint32_t min_volume = 100000; - for (auto grid : grids) - { - bool has_prologue = grid.parameter_buffers[1]; - std::uint32_t volume = grid.grid_shape.volume(); - if (has_prologue && (!found_prologue || (min_volume > volume))) - { - min_volume = volume; - found_prologue = true; - } - } - graphlib::Node* param_node = graph->data_operands(node)[1]; - auto it = min_param_grid_volume.find(param_node); - // Record max of all the min volumes - if (found_prologue && ((it == min_param_grid_volume.end()) || (it->second < min_volume))) - { - min_param_grid_volume[param_node] = min_volume; - log_debug( - LogBalancer, - "Setting minimum prologue volume on {} to {} due to {}", - param_node->name(), - min_volume, - node->name()); - found_prologue = true; - } - } - - return min_param_grid_volume; -} - -template -std::uint32_t get_matmul_target_cycles( - graphlib::Graph const* graph, - const std::vector& topo_sort, - T& graph_solver, - const std::unordered_map& min_param_grid_volume, - std::string const& arch_name) -{ - // Aim for the biggest block while fitting all parameters in L1... if possible. - // To start, find the slowest cycle count for each matmul in which parameters fit. - std::uint32_t slowest_matmul_cycles = UINT32_MAX; - - std::vector topo_matmuls; - for (graphlib::Node* node : topo_sort) - { - if (node->node_type() != graphlib::NodeType::kBudaOp) - continue; - - std::string op_type = node->as()->op_type().op; - if (op_type != "matmul") - continue; - - if (node->as()->is_sparse_matmul()) - continue; // for now, ignore sparse matmuls - - topo_matmuls.push_back(node); - } - - std::uint32_t min_cycles_filter = 0; - for (graphlib::Node* node : topo_matmuls) - { - // Find the largest cycle count of the fastest we can do in the target rows. This is the minimum - // cycle count we can allow. 
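Editor's note: the remainder of the deleted get_matmul_target_cycles (continuing below) boils down to a two-step derivation: take the largest "fastest achievable at target rows" cycle count across matmuls as a lower filter (scaled by a 0.85 margin), then pick the smallest qualifying "smallest L1-fitting grid" cycle count, add 20% headroom, and clamp to [45000, 125000]. A self-contained sketch under those assumptions; the `MatmulCandidate` struct and its fields are hypothetical summaries of the per-node data the real code derives from op models.

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Hypothetical per-matmul summary: the fastest cycle count achievable at the target
// row count, and the cycle count of the smallest grid that still fits parameters in L1.
struct MatmulCandidate
{
    std::uint32_t fastest_cycles;
    std::uint32_t smallest_fitting_cycles;
};

std::uint32_t derive_target_cycles(const std::vector<MatmulCandidate>& matmuls)
{
    // Min-cycles filter: the largest "fastest achievable" value, with a margin.
    std::uint32_t min_cycles_filter = 0;
    for (const auto& m : matmuls)
        min_cycles_filter = std::max(min_cycles_filter, m.fastest_cycles);
    min_cycles_filter = static_cast<std::uint32_t>(0.85f * min_cycles_filter);

    // Among matmuls passing the filter, take the smallest "smallest fitting grid" cycles.
    std::uint32_t slowest_matmul_cycles = UINT32_MAX;
    for (const auto& m : matmuls)
        if (m.smallest_fitting_cycles >= min_cycles_filter)
            slowest_matmul_cycles = std::min(slowest_matmul_cycles, m.smallest_fitting_cycles);

    // Add headroom and keep the result in a sane range.
    std::uint32_t target = (slowest_matmul_cycles == UINT32_MAX)
                               ? 45000u
                               : static_cast<std::uint32_t>(1.2f * slowest_matmul_cycles);
    return std::clamp<std::uint32_t>(target, 45000u, 125000u);
}
```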
- - std::uint32_t min_prologue_volume = 0; // min volume needed to remain prologue after other checks - auto it = min_param_grid_volume.find(graph->data_operands(node)[1]); - if (it != min_param_grid_volume.end()) - min_prologue_volume = it->second; - - const OpShape& op_shape = (*graph_solver.at(node).begin()).op_shape; - std::uint32_t target_rows = std::uint32_t(op_shape.outputs.at(0).rt / 6); - if (target_rows == 0) - target_rows = 1; - - std::uint32_t fastest_cycles = UINT32_MAX; - - auto grids = graph_solver.at(node); - for (auto grid : grids) - { - if ((std::uint32_t)grid.grid_shape.volume() < min_prologue_volume) - continue; - - if ((std::uint32_t)grid.grid_shape.r != target_rows) - continue; - - // Skip the extrmely small shapes, as they are very inefficient at the moment - // TODO: add these as config options - std::string op_type = node->as()->op_type().op; - if (op_type == "matmul") - { - if (op_shape.outputs.at(0).ct / grid.grid_shape.c < 3) - continue; - - if (op_shape.outputs.at(0).rt / grid.grid_shape.r < 4) - continue; - } - - std::uint32_t cycles = grid.get_execution_cycles(arch_name); - if (cycles < fastest_cycles) - fastest_cycles = cycles; - } - - if ((fastest_cycles != UINT32_MAX) && (fastest_cycles > min_cycles_filter)) - { - min_cycles_filter = fastest_cycles; - log_debug(LogBalancer, "Setting min cycle filter to {} due to {}", min_cycles_filter, node->name()); - } - } - - float min_cycles_margin = 0.85; - min_cycles_filter = 1.0 * min_cycles_filter * min_cycles_margin; - log_debug(LogBalancer, "Final min cycle filter is {} after margin {}", min_cycles_filter, min_cycles_filter); - - for (graphlib::Node* node : topo_matmuls) - { - std::uint32_t min_prologue_volume = 0; // min volume needed to remain prologue after other checks - auto it = min_param_grid_volume.find(graph->data_operands(node)[1]); - if (it != min_param_grid_volume.end()) - min_prologue_volume = it->second; - - // Find the actual smallest grid, with matching target rows, if possible - const OpShape& op_shape = (*graph_solver.at(node).begin()).op_shape; - std::uint32_t target_rows = std::uint32_t(op_shape.outputs.at(0).rt / 6); - if (target_rows == 0) - target_rows = 1; - - std::uint32_t smallest_grid_volume = UINT32_MAX; - std::uint32_t smallest_grid_cycles; - std::uint32_t smallest_grid_volume_bad_rows = UINT32_MAX; // backup in case we can't find the right target rows - std::uint32_t smallest_grid_cycles_bad_rows = 0; - - auto grids = graph_solver.at(node); - for (auto grid : grids) - { - if ((std::uint32_t)grid.grid_shape.volume() < min_prologue_volume) - continue; - - std::uint32_t cycles = grid.get_execution_cycles(arch_name); - - if ((std::uint32_t)grid.grid_shape.r == target_rows) - { - if ((std::uint32_t)grid.grid_shape.volume() < smallest_grid_volume) - { - smallest_grid_volume = grid.grid_shape.volume(); - smallest_grid_cycles = cycles; - } - } - else if ((std::uint32_t)grid.grid_shape.volume() < smallest_grid_volume_bad_rows) - { - smallest_grid_volume_bad_rows = grid.grid_shape.volume(); - smallest_grid_cycles_bad_rows = cycles; - } - } - - if (smallest_grid_volume == UINT32_MAX && smallest_grid_volume_bad_rows == UINT32_MAX) - { - log_warning( - LogBalancer, - "Matmul {} has no grid for which we can fit parameters in L1. Performance might suffer.", - node->name()); - } - else - { - std::uint32_t cycles = - (smallest_grid_volume < UINT32_MAX) ? 
smallest_grid_cycles : smallest_grid_cycles_bad_rows; - // std::cout << "Node " << node->name() << " target cycles: " << smallest_grid_volume << ": " - // << smallest_grid_cycles << ", bad rows: " << smallest_grid_volume_bad_rows << ": " - // << smallest_grid_cycles_bad_rows << std::endl; - if ((cycles >= min_cycles_filter) && (cycles < slowest_matmul_cycles)) - { - slowest_matmul_cycles = cycles; - // std::cout << "Setting slowest matmul cycles to " << cycles << " because of " << node->name() - // << std::endl; - } - } - } - - float margin = 1.2; - std::uint32_t target_cycles = 1.0 * slowest_matmul_cycles * margin; - - // Set a reasonable range until this is more robust - if (target_cycles < 45000) - target_cycles = 45000; - if (target_cycles > 125000) - target_cycles = 125000; - - log_info(LogBalancer, "Based on NLP matmul analysis, target cycle count is set to {}", target_cycles); - return target_cycles; -} - -} // namespace tt::balancer \ No newline at end of file diff --git a/pybuda/csrc/balancer/policies/policy_nlp_2.cpp b/pybuda/csrc/balancer/policies/policy_nlp_2.cpp deleted file mode 100644 index 85f630bf7..000000000 --- a/pybuda/csrc/balancer/policies/policy_nlp_2.cpp +++ /dev/null @@ -1,302 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include -#include - -#include "balancer/policies/policy_utils.hpp" -#include "balancer/policies/policy_manager.hpp" -#include "balancer/policies/policy_nlp.hpp" -#include "graph_lib/node_types.hpp" - -using Graph = tt::graphlib::Graph; -using Node = tt::graphlib::Node; -using NodeType = tt::graphlib::NodeType; -using Edge = tt::graphlib::Edge; -using DataFormat = tt::DataFormat; - -namespace tt::balancer -{ - -bool is_small_ukt(OpModel op_model) -{ - balancer::BlockShape input0_block_shape = op_model.input_buffers[0].block_shape; - int u_kt = input0_block_shape.ublock.ct; - int m_k = op_model.op_shape.inputs[0].ct / input0_block_shape.ublock.ct; - - if ((u_kt < 4) && (m_k > 1)) - return true; - - return false; -} - -bool is_small_grid_size(OpModel op_model, int limit_r, int limit_c) -{ - if (op_model.grid_shape.r <= limit_r and op_model.grid_shape.c <= limit_c) - return true; - - return false; -} - -legalizer::GraphSolverSolution run_policy_nlp_v2( - graphlib::Graph const* graph, - BalancerConfig const& config, - legalizer::GraphSolver& graph_solver, - std::optional& placer_solution, - std::uint32_t target_cycles) -{ - (void)config; - log_debug(LogBalancer, "Starting NLP balancing."); - log_debug(LogBalancer, "Using interactive placer."); - - PolicyManager policy_manager(graph, config, graph_solver); - bool epoch_completed = false; - std::vector topo_sort = tt::graphlib::topological_sort(*graph); - - // Get min prologue volume that fits for each parameter - std::unordered_map min_param_grid_volume = - find_min_prologue_volumes(graph, topo_sort, policy_manager); - - if (target_cycles == 0) - { - if (auto manual_target_cycles = env_as_optional("PYBUDA_NLP_MANUAL_TARGET")) - { - target_cycles = *manual_target_cycles; - log_info(LogBalancer, "Manual override of target cycles to {}", target_cycles); - } - else - { - target_cycles = get_matmul_target_cycles( - graph, topo_sort, policy_manager, min_param_grid_volume, config.device_config.arch_name); - } - } - std::vector target_cycles_per_subgraph = env_as_vector("PYBUDA_NLP_MANUAL_TARGET_PER_SUBGRAPH"); - std::map target_cycles_per_subgraph_map; - int default_target_cycles = target_cycles; - if (not target_cycles_per_subgraph.empty()) - { - 
for (size_t i = 0; i < target_cycles_per_subgraph.size(); i += 2) - { - target_cycles_per_subgraph_map[target_cycles_per_subgraph[i]] = target_cycles_per_subgraph[i + 1]; - } - log_info(LogBalancer, "Target cycles per subgraph: {}", target_cycles_per_subgraph_map); - } - - bool skip_small_ukt = env_as("PYBUDA_SKIP_SMALL_UKT", false); - std::vector limit_grid_shape_per_subgraph = env_as_vector("PYBUDA_LIMIT_GRID_SHAPE_PER_SUBGRAPH"); - bool skip_large_grid = false; - if (limit_grid_shape_per_subgraph.size() == 0) - { - limit_grid_shape_per_subgraph = {0, 0, 0}; - } - unsigned int subgraph_id = limit_grid_shape_per_subgraph[0]; - int limit_r = limit_grid_shape_per_subgraph[1]; - int limit_c = limit_grid_shape_per_subgraph[2]; - if (limit_r > 0 or limit_c > 0) - { - skip_large_grid = true; - } - - // Pick OpModel for each node. - // - while (const graphlib::Node* node = policy_manager.get_next_op()) - { - const graphlib::BudaOpNode* op = node->as(); - std::string op_type = op->op_type().op; - std::uint32_t min_prologue_volume = 0; // min volume needed to remain prologue after other checks - auto it = min_param_grid_volume.end(); - if (graph->data_operands(node).size() > 1) - it = min_param_grid_volume.find(graph->data_operands(node)[1]); - - if (it != min_param_grid_volume.end()) - min_prologue_volume = it->second; - - // Find the actual smallest grid, with matching target rows, if possible - auto op_models = policy_manager.at(node); - const OpShape& op_shape = (*policy_manager.at(node).begin()).op_shape; - std::uint32_t target_rows = std::uint32_t(op_shape.outputs.at(0).rt / 6); - if (target_rows == 0) - target_rows = 1; - - using pick = std::pair; - std::unordered_map closest_distance; - pick default_pick = {0, *op_models.begin()}; - closest_distance["best"] = default_pick; - closest_distance["failed_prologue"] = default_pick; - closest_distance["bad_rows"] = default_pick; - closest_distance["bad_rows_failed_prologue"] = default_pick; - closest_distance["too_slow"] = default_pick; - closest_distance["too_slow_failed_prologue"] = default_pick; - - bool skip_large_grid_for_subgraph = (skip_large_grid & (subgraph_id == graph->get_subgraph_id_for_node(node->id()))); - if (not target_cycles_per_subgraph_map.empty()) - { - if (target_cycles_per_subgraph_map.find(graph->get_subgraph_id_for_node(node->id())) != target_cycles_per_subgraph_map.end()) - { - target_cycles = target_cycles_per_subgraph_map[graph->get_subgraph_id_for_node(node->id())]; - } - else - { - target_cycles = default_target_cycles; - } - } - bool available_not_small_ukt = false; - bool available_small_grid = false; - - if (skip_large_grid_for_subgraph or skip_small_ukt) - { - for (auto op_model : op_models) - { - if (op_type == "matmul") - { - if (not is_small_ukt(op_model)) - { - available_not_small_ukt = true; - } - } - if (is_small_grid_size(op_model, limit_r, limit_c)) - { - available_small_grid = true; - } - if (available_not_small_ukt && available_small_grid) - { - continue; - } - } - } - - for (auto op_model : op_models) - { - std::uint32_t execution_cycles = op_model.get_execution_cycles(config.device_config.arch_name); - - if ((op_type == "matmul") && skip_small_ukt && available_not_small_ukt) - { - if (is_small_ukt(op_model)) - continue; - } - if (available_small_grid and skip_large_grid_for_subgraph) - { - if (not is_small_grid_size(op_model, limit_r, limit_c)) - continue; - } - - pick current_test_pick = {execution_cycles, op_model}; - - bool needs_prologue = (op_type == "matmul") && // others don't really matter, 
prologues are tiny - // it needs a prologue if there's either a dram or parameter - // buffer for the second operand - (((op_model.parameter_buffers.size() > 1) && op_model.parameter_buffers[1]) || - ((op_model.dram_buffers.size() > 1) && op_model.dram_buffers[1])); - - // if ( (op_type == "matmul") && !needs_prologue) - if ((op_type == "matmul") && node->as()->is_gradient_op()) - { - // Matmul with two non-prologue operands, it's going to be slower than usual - // execution_cycles *= 2; - } - - bool has_prologue = false; - - if (needs_prologue) - { - if (node->as()->is_sparse_matmul()) - { - TT_ASSERT(op_model.parameter_buffers.size() == 3); - has_prologue = op_model.parameter_buffers[0] && op_model.parameter_buffers[2]; - } - else - { - TT_ASSERT(op_model.parameter_buffers.size() > 1); - has_prologue = op_model.parameter_buffers[1]; - } - } - - bool prologue_ok = !needs_prologue || - (has_prologue && ((std::uint32_t)op_model.grid_shape.volume() >= min_prologue_volume)); - - // Check and save the pick if it's better, in the right category - - // Matching target rows - or target columns, in which case we can transpose the op on placement - // For now, let's not transpose matmuls, that could get dangerous. - bool matching_rows = ((std::uint32_t)op_model.grid_shape.r == target_rows) || - ((op_model.grid_shape.c < op_model.grid_shape.r) && - ((std::uint32_t)op_model.grid_shape.c == target_rows) && (op_type != "matmul")); - - // clang-format off - std::string category = - matching_rows ? - prologue_ok ? "best" : "failed_prologue" - : prologue_ok ? "bad_rows" : "bad_rows_failed_prologue"; - // clang-format on - - if (execution_cycles > target_cycles) - { - // Invalid, unless we really have nothing else, in which case we'll pick the fastest - category = prologue_ok ? "too_slow" : "too_slow_failed_prologue"; - if ((execution_cycles < closest_distance[category].first) || (closest_distance[category].first == 0)) - closest_distance[category] = current_test_pick; - } - else - { - // Check if we are close to target cycles, and if so, base pick preference on other attributes. - // Currently try to pick biggest m block. We may extend this logic in future. - // - if (close_to_target(closest_distance[category].first, target_cycles)) - { - if (close_to_target(execution_cycles, target_cycles)) - { - if (op_model.block_shape().volume_no_t() > closest_distance[category].second.block_shape().volume_no_t()) - closest_distance[category] = current_test_pick; - } - } - else if (execution_cycles > closest_distance[category].first) - { - closest_distance[category] = current_test_pick; - } - } - - log_trace( - LogBalancer, - " Node {} grid {}: cat={}, cycles={}, closest_distance for category={}", - node->name(), - op_model.grid_shape, - category, - execution_cycles, - closest_distance[category].first); - } - - // Pick the grid. 
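Editor's note: both deleted NLP policies bucket every candidate OpModel into a named category ("best", "bad_rows", "too_slow", "failed_prologue", ...) and then, as the code just below does, take the first category in a fixed priority order that actually recorded a pick. A simplified sketch of that fallback pattern; `Candidate` is a stand-in for the (cycles, OpModel) pair used in the original.

```cpp
#include <cstdint>
#include <optional>
#include <string>
#include <unordered_map>
#include <vector>

struct Candidate
{
    std::uint32_t cycles = 0;  // 0 means "no pick recorded for this category"
    int grid_id = -1;          // simplified stand-in for an OpModel
};

// Return the first category, in priority order, that recorded a real candidate.
std::optional<Candidate> pick_by_category(
    const std::unordered_map<std::string, Candidate>& closest_distance)
{
    static const std::vector<std::string> priority = {
        "best",
        "bad_rows",
        "too_slow",
        "failed_prologue",
        "bad_rows_failed_prologue",
        "too_slow_failed_prologue"};

    for (const std::string& category : priority)
    {
        auto it = closest_distance.find(category);
        if (it != closest_distance.end() && it->second.cycles != 0)
            return it->second;
    }
    return std::nullopt;  // caller falls back to the first legal OpModel
}
```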
TODO: failed prologue is not always worse than prologue - it only is now where dram access is - // too slow to be useful If we model the cycles with dram access accurately, we could pick no-prologue as the - // best choice - auto selected_op_model = *op_models.begin(); - for (std::string category : std::vector{ - "best", - "bad_rows", - "too_slow", - "failed_prologue", - "bad_rows_failed_prologue", - "too_slow_failed_prologue"}) - { - if (closest_distance[category].first != 0) - { - selected_op_model = closest_distance[category].second; - break; - } - } - - std::tie(std::ignore, epoch_completed, std::ignore) = policy_manager.commit_op(selected_op_model); - - // If we're done with the epoch, finish it. - // - if (epoch_completed) - { - policy_manager.finish_current_epoch(); - } - } - - placer_solution = policy_manager.commit_solution(); - - return policy_manager.finish(); -} - -} // namespace tt::balancer diff --git a/pybuda/csrc/balancer/policies/policy_random.cpp b/pybuda/csrc/balancer/policies/policy_random.cpp deleted file mode 100644 index dae2c35e5..000000000 --- a/pybuda/csrc/balancer/policies/policy_random.cpp +++ /dev/null @@ -1,66 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "balancer/policies/policy_random.hpp" - -#include -#include - -#include "balancer/policies/policy_manager.hpp" -#include "balancer/policies/policy_utils.hpp" -#include "graph_lib/node_types.hpp" - -using Graph = tt::graphlib::Graph; -using Node = tt::graphlib::Node; -using NodeType = tt::graphlib::NodeType; -using Edge = tt::graphlib::Edge; -using DataFormat = tt::DataFormat; - -namespace tt::balancer -{ - -legalizer::GraphSolverSolution run_policy_random( - graphlib::Graph const* graph, - BalancerConfig const& config, - legalizer::GraphSolver& graph_solver, - std::optional& placer_solution) - -{ - (void)config; - log_debug(LogBalancer, "Starting Random balancing."); - - PolicyManager policy_manager(graph, config, graph_solver); - bool epoch_completed = false; - - std::mt19937 rand_gen(config.random_policy_seed); - - // Pick OpModel for each node. - // - while (const graphlib::Node* node = policy_manager.get_next_op()) - { - auto op_models = policy_manager.at(node); - - std::uniform_int_distribution d(0, op_models.size() - 1); - int random = d(rand_gen); - - auto op_model = op_models.begin(); - for (int i = 0; i < random; ++i) - { - ++op_model; - } - std::tie(std::ignore, epoch_completed, std::ignore) = policy_manager.commit_op(*op_model); - - // If we're done with the epoch, finish it. 
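Editor's note: the deleted run_policy_random above selects a uniformly random OpModel per op from a container that only supports forward iteration, advancing the iterator in a manual loop. A minimal sketch of the same idea using std::advance; the helper name `pick_random` is invented for illustration.

```cpp
#include <iterator>
#include <random>

// Pick a uniformly random element from any non-empty forward-iterable range.
// The original code advances the iterator in a hand-written loop; std::advance is equivalent.
template <typename Container>
auto pick_random(const Container& options, std::mt19937& rand_gen) -> decltype(*options.begin())
{
    std::uniform_int_distribution<std::size_t> d(0, options.size() - 1);
    auto it = options.begin();
    std::advance(it, d(rand_gen));
    return *it;
}

// Usage: a fixed seed (as with config.random_policy_seed) makes the "random" policy reproducible.
// std::mt19937 rand_gen(seed);
// const auto& op_model = pick_random(op_models, rand_gen);
```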
- // - if (epoch_completed) - { - policy_manager.finish_current_epoch(); - } - } - - placer_solution = policy_manager.commit_solution(); - - return policy_manager.finish(); -} - -} // namespace tt::balancer diff --git a/pybuda/csrc/balancer/policies/policy_random.hpp b/pybuda/csrc/balancer/policies/policy_random.hpp deleted file mode 100644 index 176064a73..000000000 --- a/pybuda/csrc/balancer/policies/policy_random.hpp +++ /dev/null @@ -1,28 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include - -#include "balancer/legalizer/graph_solver.hpp" -#include "balancer/types.hpp" -#include "graph_lib/graph.hpp" -#include "graph_lib/node.hpp" - -namespace tt::balancer -{ -struct BalancerConfig; -} // namespace tt::balancer - -namespace tt::balancer -{ - -legalizer::GraphSolverSolution run_policy_random( - graphlib::Graph const* graph, - BalancerConfig const& config, - legalizer::GraphSolver& graph_solver, - std::optional& placer_solution); - -} // namespace tt::balancer diff --git a/pybuda/csrc/balancer/policies/policy_ribbon.cpp b/pybuda/csrc/balancer/policies/policy_ribbon.cpp deleted file mode 100644 index fa39770a8..000000000 --- a/pybuda/csrc/balancer/policies/policy_ribbon.cpp +++ /dev/null @@ -1,78 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "balancer/policies/policy_ribbon.hpp" - -#include "balancer/policies/policy_manager.hpp" - -namespace tt::balancer -{ -// Return true if all of node's sources have been scheduled already -/* -bool ok_to_schedule_next( - const scheduler::Schedule &scheduled_ops, std::uint32_t scheduled_so_far, const Graph *graph, Node *node) -{ - for (Node *operand : graph->data_operands(node)) - { - if (operand->node_type() != graphlib::kBudaOp) - continue; - - auto it = std::find(scheduled_ops.begin(), scheduled_ops.end(), operand->name()); - std::cout << "ok to schedule? " << node->name() << ", operand: " << operand->name() - << ", delta: " << (it - scheduled_ops.begin()) << ", so far: " << scheduled_so_far << std::endl; - if (it - scheduled_ops.begin() > scheduled_so_far) - return false; - } - return true; -} -*/ - -legalizer::GraphSolverSolution run_policy_ribbon( - graphlib::Graph const *graph, - const BalancerConfig &config, - legalizer::GraphSolver &graph_solver, - std::optional &placer_solution) -{ - log_info(LogBalancer, "Starting Ribbon balancing."); - PolicyManager policy_manager(graph, config, graph_solver, true /*ribbon_policy*/); - if (env_as("PYBUDA_RIBBON1_PREPASS_ENABLED", false)) - { - policy_manager.invalidate_suboptimal_op_models(legalizer::MatmulSparseDenseGridPairing | legalizer::DenseMatmulPrologue | legalizer::DenseMatmulBetterUkt); - } - - bool epoch_completed = false; - std::unordered_set validated_cache; // list of op model IDs that have been validated to be ok, so we - // don't have to validate them again - const int target_cycles = env_as("PYBUDA_RIBBON_TARGET_CYCLES", 45000); - - // Pick op models. - // - while (const graphlib::Node *node = policy_manager.get_next_op()) - { - const graphlib::BudaOpNode *op = node->as(); - - const auto &selected_op_model = select_best_op_model_ribbon( - policy_manager, - op, - policy_manager.get_current_ribbon_size(), - config, - graph, - validated_cache, - target_cycles); - - std::tie(std::ignore, epoch_completed, std::ignore) = policy_manager.commit_op(selected_op_model); - - // If we're done with the epoch, finish it. 
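Editor's note: the ribbon policies thread a `validated_cache` of op-model IDs through their helpers so that the expensive sparse-matmul encoding check runs at most once per OpModel. A generic memoization sketch under that assumption; the ID type and the `expensive_validation` placeholder are hypothetical, not PyBuda APIs.

```cpp
#include <cstdint>
#include <unordered_set>

using OpModelId = std::uint64_t;  // stand-in for the balancer's op model ID type

// Placeholder for an expensive check (e.g. "can this sparse matmul be encoded?").
bool expensive_validation(OpModelId id) { return id % 2 == 0; }

// Run the expensive check at most once per op model; only successes are cached,
// mirroring the "validated to be ok" comment in the deleted code.
bool validate_cached(OpModelId id, std::unordered_set<OpModelId>& validated_cache)
{
    if (validated_cache.count(id))
        return true;  // already known good

    if (!expensive_validation(id))
        return false;  // failures are re-checked on the next call in this sketch

    validated_cache.insert(id);
    return true;
}
```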
- // - if (epoch_completed) - { - policy_manager.finish_current_epoch(); - } - } - - placer_solution = policy_manager.commit_solution(); - - return policy_manager.finish(); -} - -} // namespace tt::balancer diff --git a/pybuda/csrc/balancer/policies/policy_ribbon.hpp b/pybuda/csrc/balancer/policies/policy_ribbon.hpp deleted file mode 100644 index 0a7ce3c09..000000000 --- a/pybuda/csrc/balancer/policies/policy_ribbon.hpp +++ /dev/null @@ -1,95 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include - -#include "balancer/policies/policy_utils.hpp" -#include "utils/logger.hpp" - -namespace tt::balancer -{ -struct BalancerConfig; -} // namespace tt::balancer - -namespace tt::balancer -{ -legalizer::GraphSolverSolution run_policy_ribbon( - graphlib::Graph const *graph, - const BalancerConfig &, - legalizer::GraphSolver &graph_solver, - std::optional &placer_solution); - -legalizer::GraphSolverSolution run_policy_ribbon2( - graphlib::Graph const *graph, - const BalancerConfig &, - legalizer::GraphSolver &graph_solver, - std::optional &placer_solution); - -class RibbonSolution -{ - public: - struct OpModelPair - { - OpModel model; - const graphlib::BudaOpNode *op; - }; - - std::unordered_set current_epoch_nodes; - std::unordered_set current_epoch_ops; - - private: - std::uint32_t ribbon_size; - std::vector ops; - float utilization; - const DeviceConfig *device_config; - const Graph *graph; - int dram_readers_core_count; - int dram_writers_core_count; - - float evaluate() const; - void recalc_nodes(); - - public: - RibbonSolution( - std::uint32_t ribbon_size, - const DeviceConfig *device_config, - std::vector &ops, - const Graph *graph) : - ribbon_size(ribbon_size), ops(ops), utilization(0.0f), device_config(device_config), graph(graph) - { - recalc_nodes(); - utilization = evaluate(); - } - - void update_model(std::uint32_t index, const OpModel &model) - { - ops[index].model = model; - recalc_nodes(); - utilization = evaluate(); - } - - void set_op_count(std::size_t op_count) - { - ops.resize(op_count); - recalc_nodes(); - utilization = evaluate(); - } - - void print() const; - float get_score() const { return utilization; } - const DeviceConfig *get_device_config() const { return device_config; } - const std::vector &get_ops() const { return ops; } - std::uint32_t get_ribbon_size() const { return ribbon_size; } - const std::unordered_set& get_current_epoch_ops() { return current_epoch_ops; } - const std::unordered_set& get_current_epoch_nodes() { return current_epoch_nodes; } -}; - -bool validate_sparse_matmul_model( - const graphlib::BudaOpNode *op, - const OpModel &op_model, - const graphlib::Graph *graph, - std::unordered_set &validated_cache); -} // namespace tt::balancer \ No newline at end of file diff --git a/pybuda/csrc/balancer/policies/policy_ribbon2.cpp b/pybuda/csrc/balancer/policies/policy_ribbon2.cpp deleted file mode 100644 index b877ce7a5..000000000 --- a/pybuda/csrc/balancer/policies/policy_ribbon2.cpp +++ /dev/null @@ -1,1189 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include -#include -#include -#include -#include -#include -#include -#include - -#include "balancer/policies/policy_ribbon.hpp" -#include "graph_lib/utils.hpp" -#include "passes/fork_join.hpp" -#include "placer/interactive_placer.hpp" -#include "placer/lower_to_placer.hpp" -#include "placer/placer.hpp" -#include "scheduler/scheduler.hpp" -#include 
"scheduler/utils.hpp" -#include "utils/assert.hpp" -#include "utils/logger.hpp" - -using NodeType = tt::graphlib::NodeType; - -namespace tt::balancer -{ - -float RibbonSolution::evaluate() const -{ - float pipeline_cycles = 0; - const int non_matmul_penalty = 128; - for (auto &op : ops) - { - // We have full epoch candidate. Recalculate impact on DRAM BW. - // - int cycles = get_limiter_cycles( - op.model, graph, *device_config, dram_readers_core_count + dram_writers_core_count, ¤t_epoch_nodes); - - if (cycles > pipeline_cycles) - pipeline_cycles = cycles; - } - - log_trace(LogBalancer, "RIBBON2: pipeline_cycles = {}", pipeline_cycles); - - float used_cores = 0; - float utilization = 0; - for (auto &op : ops) - { - std::uint32_t cores = op.model.grid_shape.volume(); - used_cores += cores; - - if (op.op->is_matmul_not_sparse()) - { - utilization += cores * (op.model.get_execution_cycles(device_config->arch_name, true) / pipeline_cycles); - } - else if (!env_as("PYBUDA_RIBBON2_DISABLE_NON_MATMUL_UTIL", 0) and !op.op->is_buffering_op()) - { - utilization += cores * (op.model.get_execution_cycles(device_config->arch_name, true) / pipeline_cycles) / - non_matmul_penalty; - } - } - - log_trace( - LogBalancer, - "RIBBON2: pipeline_cycles = {}, used_cores = {}, utilization = {}", - pipeline_cycles, - used_cores, - utilization); - - return utilization; -} - -void RibbonSolution::print() const -{ - for (auto &op : ops) - { - log_trace( - LogBalancer, - "RIBBON2: (ribbon={}) {}: {}", - ribbon_size, - op.op->name(), - get_limiter_cycles(op.model, graph, *device_config)); - } -} - -void RibbonSolution::recalc_nodes() -{ - dram_readers_core_count = 0; - dram_writers_core_count = 0; - current_epoch_ops.clear(); - current_epoch_nodes.clear(); - for (const auto &op : ops) - { - current_epoch_ops.insert(op.op); - } - current_epoch_nodes = calculate_current_epoch_nodes(graph, current_epoch_ops); - - for (const auto &op : ops) - { - std::vector data_operands = graph->operand_data_edges(op.model.buda_op_node); - std::vector data_users = graph->user_data_edges(op.model.buda_op_node); - - for (const Edge &edge : data_operands) - { - bool producer_is_queue = - graph->node_by_id(edge.producer_node_id)->node_type() == tt::graphlib::NodeType::kQueue || - graph->node_by_id(edge.producer_node_id)->node_type() == tt::graphlib::NodeType::kInput; - - if (producer_is_queue and !op.model.parameter_buffers[edge.consumer_input_port_id]) - { - dram_readers_core_count += op.model.get_input_grid_shape(edge.consumer_input_port_id).volume(); - } - } - - for (const Edge &edge : data_users) - { - const tt::graphlib::Node *user_node = graph->node_by_id(edge.consumer_node_id); - bool consumer_is_queue = user_node->node_type() == tt::graphlib::NodeType::kQueue || - user_node->node_type() == tt::graphlib::NodeType::kOutput || - current_epoch_nodes.count(user_node) == 0; - - if (consumer_is_queue) - { - dram_writers_core_count += op.model.grid_shape.volume(); - } - } - } -} - -OpModel get_closest_op_model( - const legalizer::GraphSolver &graph_solver_snapshot, - const RibbonSolution::OpModelPair &op, - const DeviceConfig *device_config, - const graphlib::Graph *graph, - std::unordered_set &validated_cache) -{ - std::optional closest_model = std::nullopt; - bool is_sparse_matmul = op.op->is_sparse_matmul(); - for (auto op_model : graph_solver_snapshot.at(op.op)) - { - if (is_sparse_matmul) - { - if (!validate_sparse_matmul_model(op.op, op_model, graph, validated_cache)) - { - continue; - } - } - - // try to set the same op model as 
before, if possible. If not, then pick the closest one - if (op_model == op.model) - { - closest_model = op_model; - break; - } - if (!closest_model.has_value()) - { - closest_model = op_model; - } - else - { - auto my_delta = std::abs( - get_limiter_cycles(op_model, graph, *device_config) - - get_limiter_cycles(op.model, graph, *device_config)); - auto best_delta = std::abs( - get_limiter_cycles(*closest_model, graph, *device_config) - - get_limiter_cycles(op.model, graph, *device_config)); - - if (my_delta < best_delta) - { - closest_model = op_model; - } - else if (my_delta == best_delta) - { - // Prefer the same shape - if (op.model.grid_shape == op_model.grid_shape) - { - closest_model = op_model; - } - } - } - } - TT_ASSERT(closest_model.has_value()); - return closest_model.value(); -} - -// Optimize a solution by iteratively bumping up grids of the slowest ops, as long as that -// improves the utilization of the epoch. We ideally try to stick to the same ribbon size, but if -// that's not possible, we'll bump up the grid to anything available that's slightly better than -// the current grid. -RibbonSolution optimize_solution( - RibbonSolution &solution, - const legalizer::GraphSolver &graph_solver, - placer::InteractivePlacer &interactive_placer, - const graphlib::Graph *graph, - std::unordered_set &validated_cache, - std::uint32_t max_iterations) -{ - log_trace(LogBalancer, "RIBBON2: optimize solution, score {}, coming in:", solution.get_score()); - solution.print(); - - RibbonSolution best_solution = solution; - - std::uint32_t iterations = 0; - std::uint32_t bad_iterations = 0; // number of iterations in a row that made thing worse - const DeviceConfig *device_config = solution.get_device_config(); // save some typing - while ((bad_iterations < 3) && (iterations < max_iterations)) - { - // Find the slowest cycle count - float slowest_cycles = 0; - for (auto &op : best_solution.get_ops()) - { - float cycles = get_limiter_cycles(op.model, graph, *device_config); - if (cycles > slowest_cycles) - slowest_cycles = cycles; - } - - // Now go through the models, and bump up the ones that are slowest - auto graph_solver_snapshot = std::make_unique(graph_solver); - auto new_solution = best_solution; - auto target_cycles = 0.9 * slowest_cycles; - std::vector blacklisted_models; // models from previous bad iterations that shouldn't be tried again - log_trace(LogBalancer, "RIBBON2: target_cycles = {}", target_cycles); - for (std::size_t op_index = 0; op_index < new_solution.get_ops().size(); op_index++) - { - auto &op = new_solution.get_ops()[op_index]; - bool is_sparse_matmul = op.op->is_sparse_matmul(); - float cycles = get_limiter_cycles(op.model, graph, *device_config); - if (cycles < target_cycles) - { - log_trace(LogBalancer, "RIBBON2: op {} is fast enough", op.op->name()); - auto closest_model = - get_closest_op_model(*graph_solver_snapshot, op, device_config, graph, validated_cache); - graph_solver_snapshot->set(op.op, closest_model); - if (!(closest_model == op.model)) - { - log_trace( - LogBalancer, - "RIBBON2: had to change the grid to {} with cycles {}", - closest_model.grid_shape, - get_limiter_cycles(closest_model, graph, *device_config)); - new_solution.update_model(op_index, closest_model); - } - } - else - { - // Bump up the grid - // Ideally, use the same ribbon size first - log_trace(LogBalancer, "RIBBON2: op {} is too slow, bumping up grid", op.op->name()); - std::optional new_op_model = std::nullopt; - for (bool same_ribbon : {true, false}) - { - // Check for the case 
where none of the grids can have prologue, and then waive it - bool waive_prologue = true; - for (const auto &op_model : graph_solver_snapshot->at(op.op)) - if (prologue_ok(op_model)) - { - waive_prologue = false; - break; - } - - for (const auto &op_model : graph_solver_snapshot->at(op.op)) - { - log_trace( - LogBalancer, - "RIBBON2: trying grid {} with cycles {}, for same ribbon {}", - op_model.grid_shape, - get_limiter_cycles(op_model, graph, *device_config), - same_ribbon); - if (is_sparse_matmul) - { - log_trace( - LogBalancer, - "RIBBON2: trying sparse_matmul grid {} with cycles {}, u_kt = {}", - op_model.grid_shape, - get_limiter_cycles(op_model, graph, *device_config), - op_model.input_buffers.at(1).block_shape.ublock.rt); - } - - if (std::find(blacklisted_models.begin(), blacklisted_models.end(), op_model) != - blacklisted_models.end()) - { - log_trace(LogBalancer, "RIBBON2: skipping blacklisted op_model"); - continue; - } - - if (!waive_prologue && !prologue_ok(op_model)) - continue; - - if (same_ribbon && (op_model.grid_shape.r != (int)new_solution.get_ribbon_size())) - continue; - - if (get_limiter_cycles(op_model, graph, *device_config) >= slowest_cycles) - continue; - - // Find the slowest improvement over the current op_model, to reduce drastic changes - if (!new_op_model.has_value() || // nothing has been picked - - // current best is improvement, but not +10% - ((get_limiter_cycles(*new_op_model, graph, *device_config) >= target_cycles) && - (get_limiter_cycles(op_model, graph, *device_config) < target_cycles)) || - - // pick slower improvement - (get_limiter_cycles(*new_op_model, graph, *device_config) < - get_limiter_cycles(op_model, graph, *device_config))) - { - bool op_ok = true; - if (is_sparse_matmul) - { - // Make sure that this sparse model can be encoded correctly - op_ok = validate_sparse_matmul_model(op.op, op_model, graph, validated_cache); - } - - if (op_ok) - { - new_op_model = op_model; - log_trace( - LogBalancer, - "RIBBON2: setting new grid for {}: {} with cycles {}", - op.op->name(), - op_model.grid_shape, - get_limiter_cycles(op_model, graph, *device_config)); - } - } - } - if (same_ribbon && new_op_model.has_value()) - break; // don't try changing the ribbon size, since we found an improvement with the same - // ribbon - } - - // If we found a larger grid, then use it - if (new_op_model.has_value()) - { - log_trace( - LogBalancer, - "RIBBON2: bumping up {} from {} to {}", - op.op->name(), - op.model.grid_shape, - new_op_model->grid_shape); - new_solution.update_model(op_index, new_op_model.value()); - graph_solver_snapshot->set(op.op, new_op_model.value()); - blacklisted_models.push_back(new_op_model.value()); // record in case this bump ended up being bad - } - else - { - // We haven't found anything better, set the same (or closest legal) - auto closest_model = - get_closest_op_model(*graph_solver_snapshot, op, device_config, graph, validated_cache); - new_solution.update_model(op_index, closest_model); - graph_solver_snapshot->set(op.op, closest_model); - } - } - } - - // We need to place this new solution to see how much of it actually fits - std::size_t placed_ops = 0; - for (std::size_t i = 0; i < new_solution.get_ops().size(); i++) - { - auto &op = new_solution.get_ops()[i]; - std::optional op_placement; - int placing_step = 1; - - const RibbonSolution::OpModelPair *next_op = - i < new_solution.get_ops().size() - 1 ? &new_solution.get_ops()[i + 1] : nullptr; - - // Special case for sparse-dense matmul pairing. 
We want to always place them atomically together if - // possible. - // - if (next_op and - can_bind_sparse_dense_matmul_pair( - graph, op.op, op.model, next_op->op, next_op->model, interactive_placer, true /*allow_transpose*/)) - { - op_placement = interactive_placer.place_two_ops_rowwise( - op.op->name(), op.model.grid_shape, next_op->op->name(), next_op->model.grid_shape, true); - - placing_step = 2; - i++; - } - else - { - op_placement = interactive_placer.place_op(op.op->name(), op.model.grid_shape, true); - } - - if (op_placement.has_value()) - { - placed_ops += placing_step; - } - else - { - break; - } - } - interactive_placer.rewind_epoch(); // rewind, we were just testing what fits - if (placed_ops < new_solution.get_ops().size()) - { - // Trim the solution - new_solution.set_op_count(placed_ops); - log_trace(LogBalancer, "RIBBON2: trimmed solution to {} ops", placed_ops); - } - - if (new_solution.get_score() > best_solution.get_score()) - { - best_solution = new_solution; - bad_iterations = 0; - blacklisted_models.clear(); - log_trace(LogBalancer, "RIBBON2: improved to {}", best_solution.get_score()); - } - else - { - bad_iterations++; - log_trace(LogBalancer, "RIBBON2: solution got worse, bad iterations in a row = {}", bad_iterations); - } - iterations++; - } - - log_trace(LogBalancer, "RIBBON2: optimized solution with score {}:", best_solution.get_score()); - best_solution.print(); - return best_solution; -} - -bool handle_fork_join_nop_overflow( - graphlib::Graph const *graph, - const BalancerConfig &config, - std::vector> &op_names_to_epoch_break, - RibbonSolution &solution, - std::unique_ptr &pre_buffered_solution, - std::unique_ptr &graph_solver, - std::unique_ptr &pre_buffered_graph_snapshot, - std::unordered_set &epoch_break_ops, - std::uint32_t &placed_op_index, - scheduler::Schedule &scheduled_ops, - const std::unordered_set &processed_nodes, - const tt::scheduler::Schedule &processed_schedule, - std::unique_ptr &traversal_context, - std::uint32_t &nodes_to_process, - std::uint32_t current_epoch, - std::vector &fork_and_join_nodes, - bool &epoch_breaks_added) -{ - const bool cleanup_buffering_nops = !env_as("PYBUDA_RIBBON2_DISABLE_CLEANUP_BUF_NOPS", 0); - if (!cleanup_buffering_nops) - { - return false; - } - - if (pre_buffered_graph_snapshot.get() == nullptr) - { - return false; - } - - // Fork-join buffering for this epoch was added in previous iteration. - // Check if added buffering caused any of the fork-joins to split into two epochs. - // If that is the case, there is no point in keeping the added nops for buffering. - - // Get all ops in current epoch. - std::unordered_set ops_in_curr_epoch; - for (auto &op : solution.get_ops()) - { - ops_in_curr_epoch.insert(op.op); - } - - // Check if all fork and join nodes are in this epoch. - bool needs_epoch_break = false; - for (auto node : fork_and_join_nodes) - { - if (!ops_in_curr_epoch.count(node)) - { - needs_epoch_break = true; - } - } - - if (!needs_epoch_break) - { - return false; - } - - log_debug(LogBalancer, "Detected fork-join split due to buffering in epoch {}.", current_epoch); - - // Get all ops which we wanted to place in this epoch (pre_buffered_solution) and make explicit epoch breaks - // for all of the ops which didn't fit. - scheduler::Schedule epoch_break; - for (auto &op : pre_buffered_solution->get_ops()) - { - // We don't mark nops for epoch break, since they won't exist when we revert the graph to the pre-buffered - // snapshot. 
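Editor's note: the optimize_solution loop above is a hill climb with a "patience" counter: keep the best-scoring epoch solution, and stop after three consecutive non-improving iterations or a hard iteration cap. A generic sketch of that accept/reject structure; the `try_improve` and `score` callbacks are placeholders for the grid-bumping and utilization evaluation done by the real code.

```cpp
#include <functional>

template <typename Solution>
Solution optimize_with_patience(
    Solution best,
    const std::function<Solution(const Solution&)>& try_improve,  // e.g. bump up the slowest grids
    const std::function<float(const Solution&)>& score,           // e.g. epoch utilization
    unsigned max_iterations,
    unsigned patience = 3)
{
    unsigned iterations = 0;
    unsigned bad_iterations = 0;  // consecutive iterations that did not improve the score

    while (bad_iterations < patience && iterations < max_iterations)
    {
        Solution candidate = try_improve(best);
        if (score(candidate) > score(best))
        {
            best = candidate;
            bad_iterations = 0;  // reset on improvement
        }
        else
        {
            ++bad_iterations;  // tolerate a few regressions before giving up
        }
        ++iterations;
    }
    return best;
}
```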
- if (!ops_in_curr_epoch.count(op.op) and !op.op->is_buffering_op()) - { - epoch_break.push_back(op.op->name()); - } - } - - if (epoch_breaks_added) - { - op_names_to_epoch_break.pop_back(); - } - - op_names_to_epoch_break.push_back(epoch_break); - epoch_breaks_added = true; - pre_buffered_solution.reset(); - - // Since we can no longer fit all of the pre-buffered ops on a single epoch, - // undo the buffering, reschedule everything (with explicit epoch breaks added) and continue to search for a - // solution. This takes care of cases where we leave unnecessary fork-join buffering which spans multiple epochs. - graph_solver = std::move(pre_buffered_graph_snapshot); - - traversal_context.reset(); - traversal_context = graph_solver->get_graph_traversal_context(); - - std::tie(scheduled_ops, epoch_break_ops) = - policy_run_scheduler(graph, config, processed_nodes, processed_schedule, op_names_to_epoch_break); - - placed_op_index = 0; - nodes_to_process = processed_nodes.size() + scheduled_ops.size(); - fork_and_join_nodes.clear(); - - return true; -} - -// Try to insert fork join buffering, and then apply solution to the graph solver. -// If graph has changed due to new ops, functions doesn't apply the solution and -// returns false. It is expected that the parent will then re-solve the epoch and -// call this again. -bool apply_solution( - graphlib::Graph const *graph, - const BalancerConfig &config, - std::vector> &op_names_to_epoch_break, - RibbonSolution &solution, - std::unique_ptr &graph_solver, - std::unique_ptr &graph_solver_epoch_snapshot, - placer::InteractivePlacer &interactive_placer, - std::unordered_set &epoch_break_ops, - scheduler::Schedule &scheduled_ops, - std::unordered_set &processed_nodes, - tt::scheduler::Schedule &processed_schedule, - std::uint32_t &placed_op_index, - std::unique_ptr &traversal_context, - const tt::ordered_map, InsInstructionUniqueIdHash> - &prev_inst, - std::uint32_t &nodes_to_process, - std::vector &fork_and_join_nodes) -{ - // Apply the solution to the graph solver so that we can extract the pointer to its models and - // buffer them appropriately. Otherwise, we will be buffering a local copy of models in the solution, - // which will eventually get discarded. - - TT_LOG_ASSERT(solution.get_ops().size() > 0, "Solution should have at least one op placed"); - for (auto &op : solution.get_ops()) - { - log_trace(LogBalancer, "RIBBON2: Graph solver set for {} with grid {}", op.op->name(), op.model.grid_shape); - graph_solver->set(op.op, op.model); - } - OpModels *op_models = graph_solver->get_selected_op_models_for_buffering(solution.get_current_epoch_ops()); - - graphlib::Graph *graph_modify = const_cast(graph); - FJBufferingResult fj_buffering; - { - // Generate buffering instructions if this epoch needs buffering. - // We are scoping down FJ buffering algorithm to subgraph by setting GraphTraversalContext - // to current epoch nodes. - // - std::unique_ptr epoch_traversal_context = - graph_solver->get_graph_epoch_traversal_context(&solution.get_current_epoch_nodes()); - fj_buffering = insert_fork_join_buffering( - graph_modify, - nullptr /* postplacer op models */, - op_models, - config.device_config.get_l1_usable_size(), - prev_inst, - config.fork_join_tiles_treshold, - &ribbon_buffering_factor); - - for (auto &fj : fj_buffering.nop_buffered_fjs) - { - // Extract all fork and join nodes of nop buffered fork-joins. 
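Editor's note: the overflow handling above keeps a unique_ptr snapshot of the solver taken before fork-join buffering, and rolls back by moving that snapshot into the working pointer and rebuilding derived state (traversal context, schedule). A stripped-down sketch of that rollback pattern; `SolverState` is a hypothetical stand-in for the GraphSolver.

```cpp
#include <memory>
#include <utility>

// Simplified stand-in for the solver state that buffering may mutate.
struct SolverState
{
    int chosen_models = 0;
};

// Roll back to the pre-buffering snapshot, if one exists. The caller then
// re-derives anything that referenced the old state (traversal context, schedules).
void rollback_to_snapshot(
    std::unique_ptr<SolverState>& working,
    std::unique_ptr<SolverState>& snapshot)
{
    if (!snapshot)
        return;  // nothing to roll back to

    working = std::move(snapshot);  // discard the buffered state, keep the pre-buffered copy
}
```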
- fork_and_join_nodes.push_back(fj.first[0]); - fork_and_join_nodes.push_back(fj.first.back()); - } - } - - if (!std::get<0>(is_subset_of_instructions(fj_buffering.instructions, prev_inst))) - { - // We need to buffer, so we need to rewind the epoch and place again with buffer nodes. - // Revert graphsolver to snapshot. Release old traversal context. - // - - bool graph_modified = false; - log_trace(LogBalancer, "RIBBON2: buffering required, reverting to snapshot"); - graph_solver = std::make_unique( - *graph_solver_epoch_snapshot); // reset to epoch snapshot to clear the set op models - { - // Operate only within current epoch nodes. - std::unique_ptr epoch_traversal_context = - graph_solver->get_graph_epoch_traversal_context(&solution.get_current_epoch_nodes()); - graph_modified = buffer_graph(graph_modify, fj_buffering.instructions, *graph_solver); - } - - // Reset current epoch nodes and traversal context to old state(snapshot). - // - traversal_context.reset(); - traversal_context = graph_solver->get_graph_traversal_context(); - - if (graph_modified) - { - // If we added new non queue nodes we need to rerun scheduler, and re-create the ribbon solution. - // For most ops, we should be able to find the same op model, and for the others we'll have to pick - // a new one. Those should only be nops, though. - - std::tie(scheduled_ops, epoch_break_ops) = - policy_run_scheduler(graph, config, processed_nodes, processed_schedule, op_names_to_epoch_break); - placed_op_index = 0; // we've reset the scheduled ops - nodes_to_process = processed_nodes.size() + scheduled_ops.size(); - } - - return false; - } - - log_trace(LogBalancer, "RIBBON2: Applying solution with score: {}", solution.get_score()); - solution.print(); - - // Create a map for quicker retrieval as we go through the schedule - std::unordered_map op_name_to_model; - for (auto &op : solution.get_ops()) - { - log_trace(LogBalancer, "RIBBON2: emplacing op {}", op.op->name()); - op_name_to_model.emplace(op.op->name(), op); - } - - std::uint32_t solution_ops_placed = 0; - while (placed_op_index < scheduled_ops.size()) - { - graphlib::Node *node = graph->get_node_by_name(scheduled_ops[placed_op_index]); - TT_ASSERT(node->node_type() == NodeType::kBudaOp); - - const graphlib::BudaOpNode *op = static_cast(node); - auto it = op_name_to_model.find(scheduled_ops[placed_op_index]); - TT_ASSERT(it != op_name_to_model.end(), "Model for {} is missing", scheduled_ops[placed_op_index]); - std::optional op_placement; - bool sparse_dense_pair = false; - - // Special case for sparse-dense matmul pairing. We want to always place them atomically together. 
- // - if (op->is_sparse_matmul() and solution_ops_placed < solution.get_ops().size() - 1) - { - graphlib::Node *next_node = graph->get_node_by_name(scheduled_ops[placed_op_index + 1]); - const graphlib::BudaOpNode *dense_matmul_op = static_cast(next_node); - auto it_dense = op_name_to_model.find(scheduled_ops[placed_op_index + 1]); - - if (can_bind_sparse_dense_matmul_pair( - graph, - op, - it->second.model, - dense_matmul_op, - it_dense->second.model, - interactive_placer, - true /*allow_transpose*/)) - { - sparse_dense_pair = true; - op_placement = interactive_placer.place_two_ops_rowwise( - op->name(), - it->second.model.grid_shape, - dense_matmul_op->name(), - it_dense->second.model.grid_shape, - true); - - if (op_placement.has_value()) - { - processed_nodes.insert(op); - processed_schedule.emplace_back(op->name()); - placed_op_index++; - solution_ops_placed++; - op = dense_matmul_op; - } - } - } - - if (!sparse_dense_pair) - { - op_placement = - interactive_placer.place_op(scheduled_ops[placed_op_index], it->second.model.grid_shape, true); - } - - TT_ASSERT(op_placement.has_value(), "Failed to re-place the solution on op {}", scheduled_ops[placed_op_index]); - log_trace(LogBalancer, "RIBBON2: placed {}", scheduled_ops[placed_op_index]); - processed_nodes.insert(op); - processed_schedule.emplace_back(op->name()); - placed_op_index++; - solution_ops_placed++; - - if (solution_ops_placed == solution.get_ops().size()) - { - // We've placed all the ops in the solution, so we're done - break; - } - } - - cut_graph_solver_epoch(graph, interactive_placer, *graph_solver); - return true; -} - -legalizer::GraphSolverSolution run_policy_ribbon2( - graphlib::Graph const *graph, - const BalancerConfig &config, - legalizer::GraphSolver &graph_solver, - std::optional &placer_solution) -{ - // - // Ribbon2 policy - // - // Balancer works epoch by epoch, and tries to optimize each epoch for the maximum matmul utilization. It explores - // all possible ribbon sizes for each epoch to generate an initial set of solutions. Grids are picked based on some - // heuristics, trying to stick to ribbon size, fit in prologue, and so on. - // - // Then, each of the solutions is optimized by iteratively bumping up grids of the slowest ops, as long as that - // improves the utilization of the epoch. Once this is exhausted, all solutions are compared and the highest - // utilization is picked as the best for the epoch. - // - // At that point, fork-join buffering is added, and epoch is applied to graph solver and interactive placer. - // - // The utilization of the epoch is based on sum of matmul utilizations on each core, where the utilization is - // calculated as "theoretical lowest cycles / longest op in epoch cycles", where theoretical cycles in the - // number of cycles it would take at 100% utilization. - // - // Limitations: - // - // - Ribbon2 does not force sparse and dense matmuls to be on the same epoch. This was previous done as a - // performance herustic, but it is not necessary in most situations. Accurate modeling of DRAM bandwidths could - // allow balancer to make an optimal decision without this heuristic. - // - Because the optimal solution will have a much more random selection of grids vs. a clean ribbon, the blob sizes - // are likely to grow much larger for some ops. For example, resnet epoch 1, at the moment, needs 77KB of extra - // blob space, mobilenet v2 330kb! However, once backend is given this space, resnet is significantly faster - // than with the original ribbon. 
Going forward, accurate tile modeling (currently worked on by Nick) will allow - // us to predict blob sizes better and add space only to cores that need it (or avoid combinations that create - // large blobs). - // - Only one ribbon size is set per epoch. Having multiple ribbon sizes per epoch could explode the search space, - // and make the algorithm impractical. Because ribbon size is only used to seed the initial solution before - // optimization (which is free to change it), this appears to work well enough in limited testing. - // - Success is heavily dependent on accurate modeling of the backend cycles. This isn't necessarily a limitation, - // of the algorithm itself, but because modeling is not completely accurate in all situations, Ribbon2 can - // make bad choices. Resnet epoch0 is a good example, where sparse matmuls are estimate to run 5-6x slower than - // they actually do, and the chosen solution is far from ideal. - // - Each epoch takes longer to solve, due to the nature of the algorithm. None of it is particularly compute- - // intensive, but for a very large model, it could add up. - // - Ribbon2 gives up on optimizing an epoch after changes don't increase utilization. However, it could be a case - // of a local minimum, and further iterations could continue to optimize. However, letting it always run for 10+ - // iterations would add a lot to the runtime, and many of those searches will not be fruitful. Some kind of a - // heuristic to decide when to continue would be helpful. - // - Ribbon2 arbitrarily stops after 10 iterations of optimizations of a particular solution. Further testing is - // needed to see if this is reasonable. - // - // Future improvements: - // - // - Convolution fracturing decision is made before Ribbon2 runs. However, letting the balancer determine which - // convolutions would benefit from fracturing would allow us to make better decisions. - // - We could apply fork join buffering on each candidate solution, but due to the added complexity of graph changes - // and cuts, it is likely going to slow down the alogorithm too much to make it practical. Evaluation is needed to - // see if this would yield better solutions. - // - Seed the initial epoch solution with multiple ribbon sizes and queues to break between dimension changes. - // - This is a greedy algorithm which tries to optimize each epoch as it goes. However, choices made in current - // epoch can affect future ones. Cross-epoch search, with epoch back-tracking is likely to yield better results - // for some models. 
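Editor's note: the policy description above defines epoch utilization as the sum, over cores, of theoretical-best cycles divided by the epoch's longest op, with non-matmul ops discounted by a large penalty (this is what RibbonSolution::evaluate in the deleted policy_ribbon2.cpp computes). A small illustrative sketch with made-up numbers; the struct fields are simplified summaries of the real op-model queries.

```cpp
#include <algorithm>
#include <vector>

struct EpochOp
{
    int cores;             // grid volume
    float kernel_cycles;   // theoretical cycles at 100% utilization
    float limiter_cycles;  // cycles including DRAM/NOC limiters
    bool is_matmul;
};

// Mirrors the shape of the evaluate() logic: utilization is summed per core,
// relative to the slowest op in the epoch; non-matmuls are discounted heavily.
float epoch_utilization(const std::vector<EpochOp>& ops, float non_matmul_penalty = 128.0f)
{
    float pipeline_cycles = 0.0f;
    for (const auto& op : ops)
        pipeline_cycles = std::max(pipeline_cycles, op.limiter_cycles);
    if (pipeline_cycles <= 0.0f)
        return 0.0f;

    float utilization = 0.0f;
    for (const auto& op : ops)
    {
        float per_core = op.kernel_cycles / pipeline_cycles;
        utilization += op.is_matmul ? op.cores * per_core
                                    : op.cores * per_core / non_matmul_penalty;
    }
    return utilization;
}

// Hypothetical numbers: a 4-core matmul at 80k/100k cycles contributes 4 * 0.8 = 3.2,
// while a 2-core eltwise op at 50k/100k contributes only 2 * 0.5 / 128 ≈ 0.008.
```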
- // - - log_info(LogBalancer, "Starting Ribbon2 balancing"); - placer::InteractivePlacer interactive_placer(graph, config); - placer::InteractivePlacer ip_fittment_tester(graph, config); - std::unordered_set epoch_break_ops; - scheduler::Schedule scheduled_ops; - graphlib::NodeEpochType current_epoch_type = NodeEpochType::Forward; - std::vector pre_buffering_epoch_nodes; - const tt::ordered_map, InsInstructionUniqueIdHash> - prev_inst; - std::unordered_set processed_nodes; - std::vector op_names_to_epoch_break = config.op_names_to_epoch_break; - tt::scheduler::Schedule processed_schedule; - - std::unique_ptr graph_solver_main = std::make_unique(graph_solver); - std::unique_ptr traversal_context = - graph_solver_main->get_graph_traversal_context(); - std::tie(scheduled_ops, epoch_break_ops) = - policy_run_scheduler(graph, config, processed_nodes, processed_schedule, op_names_to_epoch_break); - - std::unordered_set validated_cache; // list of op model IDs that have been validated to be ok, so we - // don't have to validate them again - const int target_cycles = env_as("PYBUDA_RIBBON_TARGET_CYCLES", 95000); - const int max_iterations = env_as("PYBUDA_RIBBON2_OPTIMIZATION_ITERATIONS", 0); - - TT_ASSERT(config.op_names_to_chip_break.size() == 0, "Ribbon2 policy does not process chip breaks"); - - std::uint32_t epoch = 0; - bool done = false; - std::uint32_t placed_op_index = 0; - std::uint32_t nodes_to_process = scheduled_ops.size(); - std::unique_ptr - pre_buffered_graph_snapshot; // snapshot before any fork-join buffering - // graph modifications were made for the current epoch - std::unique_ptr pre_buffered_solution; // current epoch solution before the last fork-join - // buffering attempt - used to check if added buffering - // caused any fork-join to split accross epochs - std::vector - fork_and_join_nodes; // fork and join nodes of every nop-buffered fork-join in current epoch. - bool epoch_breaks_added = false; - - graph_solver_main->invalidate_suboptimal_op_models( - legalizer::MatmulSparseDenseGridPairing | legalizer::DenseMatmulPrologue | legalizer::DenseMatmulBetterUkt); - - while (!done) - { - // Try placing an epoch for each ribbon size, and figure out the score for each - std::vector solutions; - std::exception_ptr first_error = nullptr; - bool first_error_is_fatal = false; - - // Per-epoch overrides - const int force_target_cycles = - env_as((std::string("PYBUDA_RIBBON2_TARGET_CYCLES_FOR_EPOCH") + std::to_string(epoch)).c_str(), 0); - const int epoch_target_cycles = (force_target_cycles != 0) ? force_target_cycles : target_cycles; - - const int force_optimization_iterations = env_as( - (std::string("PYBUDA_RIBBON2_OPTIMIZATION_ITERATIONS_FOR_EPOCH") + std::to_string(epoch)).c_str(), -1); - const int epoch_max_iterations = - (force_optimization_iterations != -1) ? 
force_optimization_iterations : max_iterations; - - const int force_ribbon = - env_as((std::string("PYBUDA_RIBBON2_RIBBON_FOR_EPOCH") + std::to_string(epoch)).c_str(), 0); - - log_debug( - LogBalancer, - "Epoch {} settings: target_cycles={}, max_iterations={}, force_ribbon={}", - epoch, - epoch_target_cycles, - epoch_max_iterations, - force_ribbon); - - for (std::uint32_t ribbon_size = 1; ribbon_size <= (std::uint32_t)config.device_config.grid_size.r; - ribbon_size++) - { - // Per epoch ribbon size override - if (force_ribbon != 0 && (int)ribbon_size != force_ribbon) - { - continue; - } - - try - { - auto graph_solver_epoch_snapshot = std::make_unique(*graph_solver_main); - std::vector selected_models; - - // Pick op models - for (std::uint32_t op_index = placed_op_index; op_index < scheduled_ops.size(); op_index++) - { - graphlib::Node *node = graph->get_node_by_name(scheduled_ops[op_index]); - if (node->node_type() != NodeType::kBudaOp) - continue; - - const graphlib::BudaOpNode *op = node->as(); - - // check if there is a forced break at this op - bool new_epoch = (op_index > placed_op_index) && ((epoch_break_ops.count(node->name()) > 0) || - (current_epoch_type != op->get_epoch_type())); - - if (!new_epoch) - { - // Pick the best op model. - // - auto selected_op_model = select_best_op_model_ribbon( - *graph_solver_epoch_snapshot, - op, - ribbon_size, - config, - graph, - validated_cache, - epoch_target_cycles); - log_trace( - LogBalancer, - "RIBBON2: (epoch={}, op_index={}, ribbon={}) {} best grid: {}, cycles: {} ", - epoch, - op_index, - ribbon_size, - node->name(), - selected_op_model.grid_shape, - get_limiter_cycles(selected_op_model, graph, config.device_config)); - std::optional op_placement; - bool sparse_dense_pair = false; - bool op_already_set = false; - - // Special case for sparse matmuls. Try to pair them with the next op if preferable(sparse-dense - // like pairs, see should_pair_with_sparse()). - // - if (op->is_sparse_matmul() and op_index < scheduled_ops.size() - 1) - { - graphlib::Node *next_node = graph->get_node_by_name(scheduled_ops[op_index + 1]); - if (next_node->node_type() == NodeType::kBudaOp) - { - const graphlib::BudaOpNode *dense_matmul_op = - static_cast(next_node); - if (dense_matmul_op->should_pair_with_sparse(op, graph)) - { - graph_solver_epoch_snapshot->set(op, selected_op_model); - op_already_set = true; - - auto selected_op_model_dense = select_best_op_model_ribbon( - *graph_solver_epoch_snapshot, - dense_matmul_op, - ribbon_size, - config, - graph, - validated_cache, - epoch_target_cycles); - - // Place pair atomically in case row size matches and we can fit on a single epoch. - // - if (selected_op_model_dense.grid_shape.r == selected_op_model.grid_shape.r and - interactive_placer.can_fit_on_single_epoch( - selected_op_model.grid_shape.r, - selected_op_model.grid_shape.c + selected_op_model_dense.grid_shape.c, - true /* allow_transpose */)) - { - sparse_dense_pair = true; - op_placement = interactive_placer.place_two_ops_rowwise( - op->name(), - selected_op_model.grid_shape, - dense_matmul_op->name(), - selected_op_model_dense.grid_shape, - true); - } - // Row size doesn't match, still try placing them within the same epoch if possible. 
- // - else if (can_fit_on_single_epoch( - ip_fittment_tester, - op->name(), - selected_op_model.grid_shape, - dense_matmul_op->name(), - selected_op_model_dense.grid_shape)) - { - sparse_dense_pair = true; - op_placement = interactive_placer.place_op( - op->name(), selected_op_model.grid_shape, true /* enable_transpose */); - - if (op_placement.has_value()) - { - op_placement = interactive_placer.place_op( - dense_matmul_op->name(), - selected_op_model_dense.grid_shape, - true /* enable_transpose */); - } - } - - // Pair has been placed, mark opmodels, and skip next op as it is already selected - // and set. - // - if (op_placement.has_value()) - { - selected_models.push_back({selected_op_model, op}); - selected_models.push_back({selected_op_model_dense, dense_matmul_op}); - graph_solver_epoch_snapshot->set(dense_matmul_op, selected_op_model_dense); - op_index++; - } - } - } - } - - if (!sparse_dense_pair) - { - op_placement = interactive_placer.place_op(op->name(), selected_op_model.grid_shape, true); - } - - new_epoch = !op_placement.has_value() || (op_index == scheduled_ops.size() - 1); - - if (op_placement.has_value()) - { - if (!sparse_dense_pair) - { - selected_models.push_back({selected_op_model, op}); - if (!op_already_set) - { - graph_solver_epoch_snapshot->set(op, selected_op_model); - } - } - } - else - { - log_trace(LogBalancer, "RIBBON2: Doesn't fit, starting new epoch"); - } - } - - if (new_epoch || (placed_op_index + op_index == scheduled_ops.size() - 1)) - { - TT_ASSERT(!new_epoch || selected_models.size() > 0); - // Record the solution - RibbonSolution new_solution(ribbon_size, &config.device_config, selected_models, graph); - - // Check if the same solution was provided by another ribbon - bool found_same_solution = false; - for (auto &s : solutions) - { - if ((s.get_score() != new_solution.get_score()) || - (s.get_ops().size() != selected_models.size())) - continue; - - bool same = true; - for (std::size_t i = 0; i < s.get_ops().size(); i++) - { - if (!(s.get_ops()[i].model.id == selected_models[i].model.id)) - { - same = false; - break; - } - } - - if (same) - { - found_same_solution = true; - break; - } - } - if (!found_same_solution) - { - solutions.push_back(new_solution); - } - - interactive_placer.rewind_epoch(); - break; - } - } - } - catch (const BalancerError &e) - { - log_debug( - LogBalancer, - "Encountered BalancerException while trying ribbon size {}: {}", - ribbon_size, - e.what()); - - bool fatal_exception = std::holds_alternative(e.type); - if ((first_error == nullptr) || (first_error_is_fatal && !fatal_exception)) - { - first_error = std::current_exception(); - first_error_is_fatal = fatal_exception; - } - - interactive_placer.rewind_epoch(); - } - } - - if (solutions.size() == 0) - { - log_debug(LogBalancer, "No solution found, throwing first error encountered"); - TT_ASSERT(first_error != nullptr); - std::rethrow_exception(first_error); - } - - log_trace(LogBalancer, "RIBBON2: (epoch={}) number of solutions: {}", epoch, solutions.size()); - auto best_solution = solutions[0]; - for (auto &s : solutions) - { - try - { - auto optimized_solution = optimize_solution( - s, *graph_solver_main, interactive_placer, graph, validated_cache, epoch_max_iterations); - if (optimized_solution.get_score() > best_solution.get_score()) - { - best_solution = optimized_solution; - } - } - catch (const BalancerError &e) - { - log_debug(LogBalancer, "Encountered BalancerException while optimizing solution: {}", e.what()); - // Use the unoptimized solution - if 
(s.get_score() > best_solution.get_score()) - { - best_solution = s; - } - } - } - - bool rescheduled = handle_fork_join_nop_overflow( - graph, - config, - op_names_to_epoch_break, - best_solution, - pre_buffered_solution, - graph_solver_main, - pre_buffered_graph_snapshot, - epoch_break_ops, - placed_op_index, - scheduled_ops, - processed_nodes, - processed_schedule, - traversal_context, - nodes_to_process, - epoch, - fork_and_join_nodes, - epoch_breaks_added); - - if (rescheduled) - { - // We have a new schedule, restart search. - continue; - } - - // Insert fj buffering as needed, and apply the solution to the main graph solver and placer - std::unique_ptr graph_solver_snapshot = - std::make_unique(*graph_solver_main); - bool applied = apply_solution( - graph, - config, - op_names_to_epoch_break, - best_solution, - graph_solver_main, - graph_solver_snapshot, - interactive_placer, - epoch_break_ops, - scheduled_ops, - processed_nodes, - processed_schedule, - placed_op_index, - traversal_context, - prev_inst, - nodes_to_process, - fork_and_join_nodes); - - if (applied) - { - if (placed_op_index >= scheduled_ops.size()) - { - Logger::get().log_level_type( - Logger::Level::Info, LogBalancer, "Balancing 100% completed!"); - break; - } - else - { - Logger::get().log_level_type( - Logger::Level::Info, - LogBalancer, - "Balancing {}% complete.", - processed_nodes.size() * 100 / nodes_to_process); - } - - epoch++; - - graphlib::Node *next_node = graph->get_node_by_name(scheduled_ops[placed_op_index]); - current_epoch_type = next_node->get_epoch_type(); - interactive_placer.next_epoch(current_epoch_type); - - if (epoch_breaks_added) - { - // Remove previously added epoch breaks, since we have successfully applied the solution. - // - // We also need to remove coresponding 'epoch break op' generated by the scheduler based on - // epoch breaks we've added (in op_names_to_epoch_break). This is done because the chosen - // epoch solution might not contain all nodes up to 'epoch break op' - and in that case - // the next epoch created will be broken again on the 'epoch break op', which is not necessary - // in our case and can cause perf degradation. - op_names_to_epoch_break.pop_back(); - epoch_break_ops = placer::lowering::tag_ops_for_epoch_break( - config.device_config.arch_name, - op_names_to_epoch_break, - config.op_names_to_chip_break, - scheduled_ops, - graph, - true /* use_interactive_placer */); - epoch_breaks_added = false; - } - - pre_buffered_graph_snapshot.reset(); - pre_buffered_solution.reset(); - fork_and_join_nodes.clear(); - } - else - { - if (pre_buffered_graph_snapshot.get() == nullptr) - { - // The solution hasn't been applied, which means fork-join buffering has been added for this epoch. - // Save pre buffered state in case we need to revert to it. 
- log_debug(LogBalancer, "Saving pre_buffered graph snapshot."); - pre_buffered_graph_snapshot = std::move(graph_solver_snapshot); - } - - pre_buffered_solution = std::make_unique(best_solution); - } - } - - placer_solution = interactive_placer.commit(); - placer_solution.value().fork_join_buffered = true; - validate_solution(scheduled_ops, placer_solution.value()); - - return graph_solver_main->finish(); -} -} // namespace tt::balancer diff --git a/pybuda/csrc/balancer/policies/policy_types.hpp b/pybuda/csrc/balancer/policies/policy_types.hpp deleted file mode 100644 index e2bb4e4a0..000000000 --- a/pybuda/csrc/balancer/policies/policy_types.hpp +++ /dev/null @@ -1,23 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -namespace tt::balancer -{ - -// Update can_use_interactive_placer() when adding new policy type. -// -enum class PolicyType -{ - MaximizeTMinimizeGrid, - MinimizeGrid, - Random, - NLP, - CNN, - Ribbon -}; - -bool can_use_interactive_placer(PolicyType policy_type); - -} // namespace tt::balancer diff --git a/pybuda/csrc/balancer/policies/policy_utils.cpp b/pybuda/csrc/balancer/policies/policy_utils.cpp deleted file mode 100644 index 7b8ce79f2..000000000 --- a/pybuda/csrc/balancer/policies/policy_utils.cpp +++ /dev/null @@ -1,1240 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "balancer/policies/policy_utils.hpp" - -#include -#include - -#include "balancer/legalizer/legalizer.hpp" -#include "passes/fork_join.hpp" -#include "placer/dram.hpp" -#include "placer/interactive_placer.hpp" -#include "placer/lower_to_placer.hpp" -#include "scheduler/scheduler.hpp" -#include "shared_utils/placement_printer.hpp" -#include "shared_utils/pretty_table.hpp" - -using Graph = tt::graphlib::Graph; -using Node = tt::graphlib::Node; -using NodeType = tt::graphlib::NodeType; -using Edge = tt::graphlib::Edge; -using DataFormat = tt::DataFormat; - -namespace tt::balancer -{ - -OpModelMap to_op_model_map(OpModels const &selected_op_models) -{ - OpModelMap op_model_map; - for (auto const &[node, op_model] : selected_op_models) - { - op_model_map.insert({node->name(), op_model}); - } - return op_model_map; -} - -placer::PlacerSolution run_placer( - Graph const *graph, const BalancerConfig &config, OpModelMap const &selected_op_models) -{ - std::unordered_map op_to_grid_shape; - std::unordered_map input_queue_to_grid_shape; - for (auto [node_name, op_model] : selected_op_models) - { - Node *node = graph->get_node_by_name(node_name); - switch (node->node_type()) - { - case NodeType::kInput: - { - input_queue_to_grid_shape.insert( - {node_name, - tt::placer::GridShape( - (std::uint32_t)op_model.grid_shape.r, (std::uint32_t)op_model.grid_shape.c)}); - break; - } - case NodeType::kBudaOp: - { - op_to_grid_shape.insert( - {node_name, - tt::placer::GridShape( - (std::uint32_t)op_model.grid_shape.r, (std::uint32_t)op_model.grid_shape.c)}); - break; - } - default: break; - } - } - - scheduler::Schedule scheduled_ops = run_scheduler(config.scheduler_config, graph); - - placer::PlacerConfig placer_config = { - .chip_ids = config.chip_ids, - .chip_placement_policy = config.chip_placement_policy, - .device_config = config.device_config, - .device_grid = - placer::GridShape((uint32_t)config.device_config.grid_size.r, (uint32_t)config.device_config.grid_size.c), - .contains_recompute = graph->contains_recompute_nodes(), - .output_queues_on_host = config.output_queues_on_host, - 
.strategy = placer::PlacementStrategy::LeftToRight, - .op_to_grid_shape = op_to_grid_shape, - .input_queue_to_grid_shape = input_queue_to_grid_shape, - .op_to_epoch_type = placer::lowering::get_op_to_epoch_type_mapping(graph, scheduled_ops), - .op_to_grad_op = placer::lowering::get_op_to_grad_op_mapping(graph, scheduled_ops), - .op_to_recompute_op = placer::lowering::get_op_to_recompute_mapping(graph, scheduled_ops), - .ops_tagged_for_chip_id_break = placer::lowering::tag_ops_for_chip_break( - config.device_config.arch_name, - config.op_names_to_chip_break, - scheduled_ops, - graph, - config.use_interactive_placer), - .ops_tagged_for_epoch_break = placer::lowering::tag_ops_for_epoch_break( - config.device_config.arch_name, - config.op_names_to_epoch_break, - config.op_names_to_chip_break, - scheduled_ops, - graph, - config.use_interactive_placer), - .ops_tagged_for_temporal_epoch_break = placer::lowering::tag_ops_for_temporal_epoch_break( - graph, scheduled_ops, config.op_name_to_placer_overrides), - .fwd_to_bwd_nodes = placer::lowering::get_fwd_to_bwd_nodes(graph), - .fwd_to_opt_nodes = placer::lowering::get_fwd_to_opt_nodes(graph, scheduled_ops), - .output_ops = placer::lowering::get_output_nodes(graph), - .op_to_chip_id_assignment = config.op_to_chip_id_assignment, - .op_to_overrides = config.op_name_to_placer_overrides, - .enable_auto_transposing_placement = config.enable_auto_transposing_placement, - }; - - // NB: We can avoid introducing both core-graph-lib and autograd modules in as dependencies - // if we move the lowering code (relevant dependencies on both packages) here. Alternatively - // only have lowering.hpp/cpp files depend on core-graph-lib/autograd - placer::PlacerSolution solution = placer::placer(placer_config, scheduled_ops); - - // Visualize placement - if (env_as("PYBUDA_BALANCER_PLACER_DATA")) - { - const std::string placement_dir_path = "bp_data"; - std::experimental::filesystem::create_directory(placement_dir_path); - std::string file_name = placement_dir_path + "/" + (graph->name().empty() ? 
"noname" : graph->name()) + "_" + - policy_to_string(config.policy_type) + ".txt"; - std::ofstream of(file_name); - dump_balancer_placer_data( - graph, config.chip_ids, solution, selected_op_models, of, config.device_config.arch_name); - } - - return solution; -} - -std::vector get_num_epochs_per_node_epoch_type(Graph const *graph, tt::placer::PlacerSolution placer_solution) -{ - (void)graph; - constexpr int NUM_EPOCH_TYPES = 3; - constexpr std::array epoch_types = { - NodeEpochType::Forward, NodeEpochType::Backward, NodeEpochType::Optimizer}; - - std::vector num_epochs_per_node_type(NUM_EPOCH_TYPES, 0); - std::unordered_map> epoch_to_op_names; - - for (uint i = 0; i < placer_solution.num_epochs; i++) - { - epoch_to_op_names.emplace(i, std::vector()); - } - - for (auto kvp : placer_solution.name_to_op_placement) - { - epoch_to_op_names.at(kvp.second.epoch_id()).push_back(kvp.first); - } - - for (int i = 0; i < NUM_EPOCH_TYPES; ++i) - { - num_epochs_per_node_type[i] = placer_solution.num_temporal_epochs(epoch_types[i]); - } - - // Pop opt and bwd if not training mode - - while (num_epochs_per_node_type.back() == 0) - { - num_epochs_per_node_type.pop_back(); - } - - return num_epochs_per_node_type; -} - -void dump_balancer_placer_data( - Graph const *graph, - std::vector chip_ids, - tt::placer::PlacerSolution const &placer_solution, - OpModelMap const &op_model_map, - std::ostream &of, - const std::string &arch_name) -{ - if (not env_as("PYBUDA_BALANCER_PLACER_DATA")) - return; - - // Create some supporting structures - std::unordered_map op_name_to_id_map; - for (std::pair kvp : placer_solution.name_to_op_placement) - { - op_name_to_id_map.emplace(kvp.first, graph->get_node_by_name(kvp.first)->id()); - } - - std::vector> sorted_op_id_name_pairs; - std::transform( - op_name_to_id_map.begin(), - op_name_to_id_map.end(), - std::back_inserter(sorted_op_id_name_pairs), - [](const std::pair &kvp) { return kvp; }); - - std::sort( - sorted_op_id_name_pairs.begin(), - sorted_op_id_name_pairs.end(), - [](const auto &lhs, const auto &rhs) { return lhs.second < rhs.second; }); - - // Create mapping of op id to new set of ids that are in [0, N) - std::unordered_map original_id_to_visualized_id; - int new_id = 0; - for (std::pair kvp : sorted_op_id_name_pairs) - { - original_id_to_visualized_id.emplace(kvp.second, new_id); - new_id++; - } - - // Placer doesn't have access to graph and PlacerSolution is NodeEpochType-agnostic, so printer will be called here - // Whether we're training or not, should be read from compiler config, but hack it for now - uint node_epoch_types_count = graph->contains_bwd_nodes() ? 3 : 1; - std::vector epochs_per_epoch_type = get_num_epochs_per_node_epoch_type(graph, placer_solution); - - tt::utils::PlacementPrinter::DeviceType dev_type = (arch_name == "grayskull") - ? 
tt::utils::PlacementPrinter::DeviceType::Grayskull - : tt::utils::PlacementPrinter::DeviceType::Wormhole; - - std::uint32_t max_chip_id = 0; - for (std::uint32_t chip_id : chip_ids) - { - max_chip_id = std::max(max_chip_id, chip_id); - } - - tt::utils::PlacementPrinter printer(dev_type, node_epoch_types_count, epochs_per_epoch_type, max_chip_id + 1); - - for (auto &kvp : placer_solution.name_to_op_placement) - { - std::string name = kvp.first; - tt::placer::OpPlacement opPlacement = kvp.second; - - auto coords = opPlacement.placed_cores; - - printer.fillRectangle( - placer_solution.temporal_epoch_id(name), - opPlacement.chip_id, - coords.start.row, - coords.start.col, - coords.end.row, - coords.end.col, - original_id_to_visualized_id.at(op_name_to_id_map[name]) // prints id for visualization - ); - } - - of << printer.generatePlacementString(); - - // Print op data - tt::utils::PrettyTable table; - table.add_row( - {"Visual id", - "Op id", - "Op name", - "Op type", - "Grid (RxC)", - "Cores", - "Cycles", - "mblock (t)", - "ublock (u_kt)", - "Data fmt", - "Math fdlty", - "L1 mem (kb)"}); - - for (auto &kvp : sorted_op_id_name_pairs) - { - const std::string op_name = kvp.first; - const int op_id = kvp.second; - - // Since op type is of format "BudaOp::matmul", we remove the prefix - std::string op_type = graph->node_by_id(op_id)->get_type(); - TT_ASSERT(op_type.substr(0, 8) == "BudaOp::", "Op not a buda op!"); - op_type = op_type.substr(8); - - std::string placed_core_shapes; - int placed_cores_volume = 0; - tt::placer::CoordRange coord_range = placer_solution.name_to_op_placement.at(op_name).placed_cores; - placed_core_shapes += " " + std::to_string(coord_range.size_r()) + "x" + std::to_string(coord_range.size_c()); - placed_cores_volume += coord_range.size_r() * coord_range.size_c(); - - const OpModel &op_model = op_model_map.at(op_name); - - std::string execution_cycles = std::to_string(op_model.get_execution_cycles(arch_name)); - std::string memory_used_kb = round_float(op_model.get_l1_memory_usage() / 1024.f, 2); - std::string mblock = std::to_string(op_model.block_shape().mblock_m) + "x" + - std::to_string(op_model.block_shape().mblock_n) + " " + - std::to_string(op_model.block_shape().t); - std::string ublock = - std::to_string(op_model.block_shape().ublock.rt) + "x" + std::to_string(op_model.block_shape().ublock.ct); - std::string data_format = ((std::stringstream &)(std::stringstream() << op_model.data_format)).str(); - std::string math_fidelity = ((std::stringstream &)(std::stringstream() << op_model.math_fidelity())).str(); - - table.add_row({ - std::to_string(original_id_to_visualized_id.at(op_id)), - std::to_string(op_id), - op_name, - op_type, - placed_core_shapes, - std::to_string(placed_cores_volume), - execution_cycles, - mblock, - ublock, - data_format, - math_fidelity, - memory_used_kb, - }); - } - - of << table.generate_table_string(tt::utils::PrettyTable::Format::Pretty) << std::endl; - - int epoch_id = 0; - int total_cost = 0; - std::vector epoch_costs = calculate_epoch_costs(placer_solution, op_model_map, arch_name); - fmt::print(of, "Epoch costs:\n"); - for (EpochCost epoch_cost : epoch_costs) - { - fmt::print(of, " {}: {} cycles\n", epoch_id++, epoch_cost.setup_cycles + epoch_cost.runtime_cycles); - total_cost += epoch_cost.setup_cycles + epoch_cost.runtime_cycles; - } - fmt::print(of, " Total: {} cycles\n", total_cost); - - // TODO: print graph of ops to file stream - // Consider graphviz: - // - - // 
https://stackoverflow.com/questions/9181183/how-to-print-a-boost-graph-in-graphviz-with-one-of-the-properties-displayed - // - https://stackoverflow.com/questions/33301493/network-graph-visualisation -} - -std::vector calculate_epoch_costs( - placer::PlacerSolution const &placer_solution, OpModelMap const &selected_op_models, std::string const &arch_name) -{ - std::vector epoch_costs; - epoch_costs.resize(placer_solution.num_epochs); - for (auto const &[node, placement] : placer_solution.name_to_op_placement) - { - OpModel const &op_model = selected_op_models.at(node); - epoch_costs[placement.epoch_id()].runtime_cycles = - std::max(epoch_costs[placement.epoch_id()].runtime_cycles, op_model.get_execution_cycles(arch_name)); - } - return epoch_costs; -} - -void epoch_or_chip_break_remove_processed_nodes( - const Graph *graph, - std::vector &op_names_to_epoch_or_chip_break, - const std::unordered_set &processed_nodes) -{ - if (processed_nodes.empty()) - { - return; - } - - auto it = op_names_to_epoch_or_chip_break.begin(); - while (it != op_names_to_epoch_or_chip_break.end()) - { - auto &op_names = *it; - auto op_names_it = op_names.begin(); - bool delete_op_names = false; - while (op_names_it != op_names.end()) - { - auto &op_name = *op_names_it; - auto node = graph->get_node_by_name(op_name); - if (processed_nodes.find(node) != processed_nodes.end()) - { - delete_op_names = true; - break; - } - else - { - ++op_names_it; - } - } - - if (delete_op_names) - { - it = op_names_to_epoch_or_chip_break.erase(it); - } - else - { - ++it; - } - } -} - -std::pair> policy_run_scheduler( - graphlib::Graph const *graph, - BalancerConfig const &config, - const std::unordered_set &processed_nodes, - const tt::scheduler::Schedule &processed_schedule, - std::vector &op_names_to_epoch_break) -{ - std::vector op_names_to_chip_break; - const auto [scheduled_ops, epoch_break_ops, chip_break_ops] = policy_run_scheduler( - graph, config, processed_nodes, processed_schedule, op_names_to_epoch_break, op_names_to_chip_break); - return make_pair(std::move(scheduled_ops), std::move(epoch_break_ops)); -} - -std::tuple, std::unordered_set> policy_run_scheduler( - graphlib::Graph const *graph, - BalancerConfig const &config, - const std::unordered_set &processed_nodes, - const tt::scheduler::Schedule &processed_schedule, - std::vector &op_names_to_epoch_break, - std::vector &op_names_to_chip_break) -{ - scheduler::SchedulerConfig scheduler_config = config.scheduler_config; - if (processed_nodes.size() > 0) - { - TT_ASSERT(processed_nodes.size() == processed_schedule.size()); - scheduler_config.ignored_nodes = &processed_nodes; - scheduler_config.scheduler_constraints.push_back(processed_schedule); - } - - scheduler::Schedule scheduled_ops = run_scheduler(scheduler_config, graph); - - epoch_or_chip_break_remove_processed_nodes(graph, op_names_to_epoch_break, processed_nodes); - epoch_or_chip_break_remove_processed_nodes(graph, op_names_to_chip_break, processed_nodes); - std::unordered_set epoch_break_ops = placer::lowering::tag_ops_for_epoch_break( - config.device_config.arch_name, - op_names_to_epoch_break, - op_names_to_chip_break, - scheduled_ops, - graph, - config.use_interactive_placer); - std::unordered_set chip_break_ops = placer::lowering::tag_ops_for_chip_break( - config.device_config.arch_name, op_names_to_chip_break, scheduled_ops, graph, config.use_interactive_placer); - - return make_tuple(std::move(scheduled_ops), std::move(epoch_break_ops), std::move(chip_break_ops)); -} - -// Cuts OPs in current epoch from 
rest of the graph. -// -void cut_graph_solver_epoch( - const graphlib::Graph *graph, placer::InteractivePlacer &placer, legalizer::GraphSolver &graph_solver) -{ - // Only cut edges from ops that have been placed already - balancer::CutEdges const &already_cut_edges = graph_solver.get_cut_edges(); - std::vector const ¤t_epoch_ops = placer.current_epoch_ops(); - std::vector edges_to_cut; - for (auto const &op_name : current_epoch_ops) - { - for (auto const &edge : graph->user_data_edges(graph->get_node_by_name(op_name))) - { - auto *user = graph->node_by_id(edge.consumer_node_id); - if (user->node_type() != graphlib::NodeType::kBudaOp) - continue; - - if (already_cut_edges.find(edge) != already_cut_edges.end()) - continue; - - if (std::find(current_epoch_ops.begin(), current_epoch_ops.end(), user->name()) != current_epoch_ops.end()) - continue; - - edges_to_cut.push_back(edge); - } - } - - if (edges_to_cut.size() > 0) - { - graph_solver.cut(edges_to_cut, true /*epoch_cut*/); - } -} - -// Validate that all ops in scheduled_ops have been placed in placer_solution. -// -void validate_solution(const scheduler::Schedule &scheduled_ops, const placer::PlacerSolution &placer_solution) -{ - if (placer_solution.name_to_op_placement.size() < scheduled_ops.size()) - { - log_error(LogBalancer, "Some ops haven't been placed:"); - for (std::size_t i = 0; i < scheduled_ops.size(); i++) - { - if (placer_solution.name_to_op_placement.count(scheduled_ops[i]) == 0) - { - log_error(LogBalancer, " - {}", scheduled_ops[i]); - } - } - TT_THROW("Failed to place all ops."); - } -} - -// Merge buffering queues and ops for total current epoch nodes. -// Most balancer policies will track and work with op nodes only -// but for setting proper traversal contexts we need other nodes as well. -// -std::unordered_set calculate_current_epoch_nodes( - const Graph *graph, const std::unordered_set ¤t_epoch_ops) -{ - std::unordered_set current_epoch_nodes(current_epoch_ops); - - for (const Node *op_node : current_epoch_ops) - { - for (Node *node : graph->data_operands(op_node)) - { - if (node->node_type() == NodeType::kQueue and current_epoch_ops.count(graph->data_operands(node)[0]) > 0) - { - TT_ASSERT(node->as()->is_buffering()); - current_epoch_nodes.insert(node); - } - } - } - - return current_epoch_nodes; -} - -// Invoke SET of selected op_model on graphsolver instance for given node. 
-// -void set_op_model_for_node( - legalizer::GraphSolver &graph_solver, - const graphlib::Node *node, - const OpModel &selected_op_model, - std::string const &arch_name) -{ - graph_solver.set(node, selected_op_model); - log_debug( - LogBalancer, - "Selected grid for node {} is {}, {}, {}, cycles {}", - node->name(), - selected_op_model.grid_shape, - selected_op_model.t_stream_factor, - selected_op_model.output_buffers[0].block_shape.ublock, - selected_op_model.get_execution_cycles(arch_name)); -} - -void set_op_model_for_node_ribbon( - legalizer::GraphSolver &graph_solver, - const graphlib::Node *op, - const OpModel &selected_op_model, - std::uint32_t current_ribbon_size) -{ - log_trace( - LogBalancer, - "Selected grid for op {}: {}, {}, t-stream: {}, current_ribon={}", - op->name(), - selected_op_model.grid_shape.r, - selected_op_model.grid_shape.c, - selected_op_model.t_stream_factor, - current_ribbon_size); - graph_solver.set(op, selected_op_model); -} - -int ribbon_buffering_factor(const OpModel &op_model) { return op_model.grid_shape.r; } - -void cut_graph_solver_ribbon( - const graphlib::Graph *graph, - const graphlib::Node *op, - placer::InteractivePlacer &placer, - legalizer::GraphSolver &graph_solver) -{ - CutEdges pre_cut_edges = graph_solver.get_cut_edges(); - - // Only cut edges from ops that have been placed already - std::vector edges_to_cut; - for (auto &edge : graph->operand_data_edges(op)) - { - if (placer.op_placed(graph->node_by_id(edge.producer_node_id)->name()) && pre_cut_edges.count(edge) == 0) - { - edges_to_cut.push_back(edge); - } - } - - if (edges_to_cut.size() > 0) - { - log_debug(LogBalancer, "Cutting {} edges to {}", edges_to_cut.size(), op->name()); - graph_solver.cut(edges_to_cut); - } -} - -bool is_matmul(const graphlib::BudaOpNode *op) -{ - if (!op->is_matmul_not_sparse()) - return false; - - if (op->has_tag("reduce_r") || op->has_tag("reduce_c")) - return false; - - return true; -} - -bool prologue_ok(const OpModel &op_model) -{ - bool needs_prologue = op_model.buda_op_node->is_matmul(); // others don't matter much, as they are small - bool has_prologue = false; - if (needs_prologue) - { - if (op_model.buda_op_node->is_sparse_matmul()) - { - TT_ASSERT(op_model.parameter_buffers.size() == 3); - has_prologue = op_model.parameter_buffers[0] && op_model.parameter_buffers[2]; - } - else if (op_model.buda_op_node->is_dense_matmul()) - { - TT_ASSERT(op_model.parameter_buffers.size() > 1); - has_prologue = op_model.parameter_buffers[1]; - } - else - { - has_prologue = op_model.parameter_buffers.size() > 1 and op_model.parameter_buffers[1]; - } - } - - bool prologue_ok = !needs_prologue || has_prologue; - - return prologue_ok; -} - -bool ukt_ok(const OpModel &op_model) -{ - if (op_model.buda_op_node->is_matmul_not_sparse()) - { - return op_model.input_buffers[0].block_shape.ublock.ct >= 4; - } - else if (op_model.buda_op_node->is_sparse_matmul()) - { - return op_model.input_buffers[1].block_shape.ublock.rt >= 4; - } - - return true; -} - -bool mblock_size_ok(const OpModel &op_model) -{ - if (op_model.block_shape().t > 1) - { - return op_model.block_shape().volume_no_t() >= 8; - } - - return true; -} - -bool close_to_target_exec_cycles(int kernel_exec_cycles, int limiter_cycles, int target) -{ - return (limiter_cycles < target) && (kernel_exec_cycles > target * 0.8); -} - -// OpModel preference comparison function. Returns true if candidate is better than current pick. 
-// -bool is_candidate_better_than_current( - const OpModel ¤t, - const OpModel &candidate, - const Graph *graph, - int ribbon_size, - int target_exec_cycles, - const DeviceConfig &device_config) -{ - TT_ASSERT(current.buda_op_node == candidate.buda_op_node); - - // Op model compare version. If making major changes increment version and put the newest behaviour under that - // version. - // - int op_model_compare_version = env_as("PYBUDA_OP_MODEL_COMPARE_VERSION", 2); - - if (std::abs(ribbon_size - candidate.grid_shape.r) < std::abs(ribbon_size - current.grid_shape.r)) - { - return true; - } - else if (std::abs(ribbon_size - candidate.grid_shape.r) == std::abs(ribbon_size - current.grid_shape.r)) - { - // If both are same diff from target ribbon size, prefer smaller one. - // It makes smaller "disturbance" to targeted ribbon and uses smaller number of cores. - // - if (candidate.grid_shape.r != current.grid_shape.r) - { - return candidate.grid_shape.r < current.grid_shape.r; - } - - bool candidate_prologue_ok = prologue_ok(candidate); - bool current_prologue_ok = prologue_ok(current); - - if (candidate_prologue_ok > current_prologue_ok) - { - return true; - } - else if (candidate_prologue_ok == current_prologue_ok) - { - int current_cycles = get_limiter_cycles(current, graph, device_config); - int candidate_cycles = get_limiter_cycles(candidate, graph, device_config); - - // Both op_models are within target. Prefer smaller number of columns. - // - if (candidate_cycles <= target_exec_cycles and current_cycles <= target_exec_cycles) - { - if (candidate.grid_shape.c < current.grid_shape.c) - { - return true; - } - else if (candidate.grid_shape.c > current.grid_shape.c) - { - return false; - } - } - - bool ukt_ok_candidate = ukt_ok(candidate); - bool ukt_ok_current = ukt_ok(current); - - if (ukt_ok_candidate > ukt_ok_current) - { - return true; - } - else if (ukt_ok_candidate == ukt_ok_current) - { - bool mblock_size_ok_candidate = mblock_size_ok(candidate); - bool mblock_size_ok_current = mblock_size_ok(current); - if (mblock_size_ok_candidate > mblock_size_ok_current) - { - return true; - } - else if (mblock_size_ok_candidate == mblock_size_ok_current) - { - // (1) if both are close to target, pick the one with the largest block (volume_no_t) - // (2) if only one is close to target, pick that one - // (3) if both are far from target, pick the one that is closer to target (in terms of execution - // cycles) - - int current_exec_cycles = current.get_execution_cycles(device_config.arch_name); - int candidate_exec_cycles = candidate.get_execution_cycles(device_config.arch_name); - float current_exec_util = (float)current_exec_cycles / (float)current_cycles; - float candidate_exec_util = (float)candidate_exec_cycles / (float)candidate_cycles; - - if (op_model_compare_version == 2) - { - if (close_to_target_exec_cycles(current_exec_cycles, current_cycles, target_exec_cycles)) - { - if (close_to_target_exec_cycles( - candidate_exec_cycles, candidate_cycles, target_exec_cycles)) - { - if (candidate.block_shape().volume_no_t() > current.block_shape().volume_no_t()) - { - return true; - } - else if (candidate.block_shape().volume_no_t() == current.block_shape().volume_no_t()) - { - if (candidate_exec_util > current_exec_util) - { - return true; - } - } - } - } - else if (close_to_target_exec_cycles( - candidate_exec_cycles, candidate_cycles, target_exec_cycles)) - { - return true; - } - else - { - if (candidate_cycles <= target_exec_cycles) - { - if (current_cycles > target_exec_cycles) - { - return 
true; - } - else - { - if (candidate.block_shape().volume_no_t() > current.block_shape().volume_no_t()) - { - return true; - } - else if ( - candidate.block_shape().volume_no_t() == current.block_shape().volume_no_t()) - { - if (candidate_exec_util > current_exec_util) - { - return true; - } - } - } - } - else if (candidate_cycles < current_cycles) - { - return true; - } - } - } - else if (op_model_compare_version == 1) - { - if (close_to_target(current_cycles, target_exec_cycles)) - { - if (close_to_target(candidate_cycles, target_exec_cycles)) - { - if (candidate.block_shape().volume_no_t() > current.block_shape().volume_no_t()) - { - return true; - } - } - } - else if (close_to_target(candidate_cycles, target_exec_cycles)) - { - return true; - } - else if ( - std::abs(target_exec_cycles - candidate_cycles) < - std::abs(target_exec_cycles - current_cycles)) - { - return true; - } - } - } - } - } - } - - return false; -} - -bool validate_sparse_matmul_model( - const graphlib::BudaOpNode *op, - const OpModel &op_model, - const graphlib::Graph *graph, - std::unordered_set &validated_cache) -{ - if (validated_cache.count(op_model.id.id) > 0) - return true; - - TT_ASSERT(op->is_sparse_matmul()); - - int grid_r = op_model.grid_shape.r; - int u_rt = op_model.output_buffers[0].block_shape.ublock.rt; - int u_kt = op_model.input_buffers[1].block_shape.ublock.rt; - bool has_buffer_op = op_model.has_sparse_buffer(); - bool force_buffer_op_layout = env_as("PYBUDA_FORCE_SPARSE_BUFFER_LAYOUT"); - bool buffer_op_layout = has_buffer_op or force_buffer_op_layout; - const sparse::SparseBUDA &sparse_buda = - graph->data_operands(op)[0]->as()->get_sparse_buda(); - auto layout = sparse::SparseBUDA::create_layout( - buffer_op_layout, op_model.t_stream_factor.dir.z_major(), op_model.fracture_factor); - - std::string visualize_sparse_path = ""; - try - { - auto [sparse, encodings, sparse_s, encodings_s, num_strips_per_row] = - sparse_buda.get_sparse_tiles_and_encodings( - grid_r, - op_model.t_stream_factor.r, - op_model.t_stream_factor.c, - u_rt, - u_kt, - op_model.fracture_factor, - layout, - visualize_sparse_path); - } - catch (...) - { - log_trace(LogBalancer, "RIBBON2: Rejecting sparse matmul that can't be encoded: {}", op->name()); - return false; // we can't encode this model - } - validated_cache.insert(op_model.id.id); - return true; -} - -bool can_fit_on_single_epoch( - tt::placer::InteractivePlacer &ip_fittment_tester, - const std::string &op_name_1, - const tt::balancer::GridShape &op_shape_1, - const std::string &op_name_2, - const tt::balancer::GridShape &op_shape_2, - bool enable_transpose) -{ - TT_ASSERT(ip_fittment_tester.current_epoch_empty(), "Test placer epoch must be empty!"); - std::optional test_placement; - - test_placement = ip_fittment_tester.place_op(op_name_1, op_shape_1, enable_transpose); - - TT_ASSERT(test_placement.has_value(), "Single op must always fit!"); - - test_placement = ip_fittment_tester.place_op(op_name_2, op_shape_2, enable_transpose); - - ip_fittment_tester.rewind_epoch(); - return test_placement.has_value(); -} - -// Pick ribbon size for a given window of ops. The assumption is that all of them have the same r/c image dimension. -// -std::uint32_t pick_ribbon_size( - std::uint32_t start_index, - std::uint32_t end_index, // end is not inclusive - const Graph *graph, - const legalizer::GraphSolver &graph_solver, - const std::vector &scheduled_ops, - std::uint32_t device_rows) -{ - // set some tile limits. 
Min number ensures big enough blocks to keep perf running reasonably, and max avoids - // blob sizes from exploding. - std::uint32_t min_tile_height = env_as("PYBUDA_RIBBON_MIN_TILE_HEIGHT", 1); - std::uint32_t max_tile_height = env_as("PYBUDA_RIBBON_MAX_TILE_HEIGHT", 200); - - // pick smallest legal ribbon - bool minimize_ribbon = !env_as("PYBUDA_RIBBON_MAXIMIZE"); - - bool skip_streaming = env_as("PYBUDA_RIBBON_SKIP_STREAMING"); - - // override the max ribon size - std::uint32_t max_ribbon_size = std::min(env_as("PYBUDA_RIBBON_MAX_HEIGHT", device_rows), (int)device_rows); - - // Try to find a ribbon size that work for all ops in the ribbon - std::unordered_set candidates; - std::unordered_map> - valid_map; // map of ribbons that are valid for each op - for (std::uint32_t i = 1; i <= max_ribbon_size; i++) candidates.insert(i); - - log_trace(LogBalancer, "Starting ribbon size search for {} ops", end_index - start_index); - for (std::uint32_t i = start_index; i < end_index; i++) - { - graphlib::BudaOpNode *op = graph->get_node_by_name(scheduled_ops[i])->as(); - log_trace(LogBalancer, " Checking op {}", op->name()); - for (auto grid : graph_solver.at(op)) - { - if (skip_streaming && (grid.t_stream_factor.r > 1)) - continue; - - log_trace( - LogBalancer, - " - Grid: {}, t-stream: {}, block shape rt: {}", - grid.grid_shape, - grid.t_stream_factor, - grid.block_shape().rt()); - if (prologue_ok(grid) && ((std::uint32_t)grid.block_shape().rt() >= min_tile_height) && - ((std::uint32_t)grid.block_shape().rt() <= max_tile_height)) - { - log_trace(LogBalancer, " - valid"); - valid_map[i].insert(grid.grid_shape.r); - } - } - - std::unordered_set to_erase; - for (auto it : candidates) - if (valid_map[i].count(it) == 0) - to_erase.insert(it); - for (auto it : to_erase) candidates.erase(it); - - if (candidates.empty()) - break; // stop searching, we don't have anything - } - - // If there are candidates available, pick smallest / largest - if (!candidates.empty()) - { - return minimize_ribbon ? *std::min_element(candidates.begin(), candidates.end()) - : *std::max_element(candidates.begin(), candidates.end()); - } - - // std::cout << "No valid ribbon size found, looking for partials" << std::endl; - // TT_THROW("No valid ribbon size found"); // TODO: handle this case... right now, it hangs - - // No candidates available for everything. Need to find the best choice, so that everyone at least fits under - // some ribbon size and nobody goes beyond it - std::vector partial_candidates; - if (minimize_ribbon) - for (std::uint32_t i = 1; i <= max_ribbon_size; i++) partial_candidates.push_back(i); - else - for (std::uint32_t i = max_ribbon_size; i > 0; i--) partial_candidates.push_back(i); - - // For each candidate, find if all ops would fit in something equal or smaller, and then take that. - for (auto candidate : partial_candidates) - { - // At least one op should fit on this ribbon, otherwise it's not a real choice - bool one_match = false; - for (std::uint32_t i = start_index; i < end_index; i++) - { - if (valid_map[i].count(candidate) > 0) - { - one_match = true; - break; - } - } - - if (!one_match) - continue; - - bool all_ok = true; - for (std::uint32_t i = start_index; i < end_index; i++) - { - bool ok = false; - for (std::uint32_t ribbon = 1; ribbon <= candidate; ribbon++) - { - if (valid_map[i].count(ribbon) > 0) - { - ok = true; - break; - } - } - if (!ok) - { - all_ok = false; - break; - } - } - - if (all_ok) - return candidate; - } - - return 1; // we couldn't find anything... 
so we'll just have to pick smallest legal values -} - -// Return the index of the next op that should change the ribbon size. It's either matmul or sparse -// matmul feeding it. Size of the array returned if no more changes found. -// In case we are recomputing within current ribbon, pass in current_matmul_dim_r from previous computation. -// -std::pair get_next_ribbon_change_op( - const graphlib::Graph *graph, - std::uint32_t current_index, - const std::vector &scheduled_ops, - std::uint32_t current_matmul_dim_r) -{ - for (std::uint32_t i = current_index; i < scheduled_ops.size(); i++) - { - graphlib::Node *node = graph->get_node_by_name(scheduled_ops[i]); - - if (node->node_type() != NodeType::kBudaOp) - continue; - - const graphlib::BudaOpNode *op = node->as(); - if (!is_matmul(op)) - continue; - - std::uint32_t dim_r = op->shape().rt(); - if (current_matmul_dim_r == 0) - { - current_matmul_dim_r = dim_r; - continue; - } - - if (dim_r == current_matmul_dim_r) - continue; - - // Matmul with different row shape. Let's see if there's a sparse matmul feeding it - for (Node *operand : graph->data_operands(op)) - { - // Skip through buffering queue. - // - if (operand->node_type() == NodeType::kQueue) - { - if (operand->as()->is_buffering()) - { - auto data_operands = graph->data_operands(operand); - TT_ASSERT(data_operands.size() == 1); - operand = data_operands.back(); - } - } - - if (operand->node_type() != NodeType::kBudaOp) - continue; - - if (operand->as()->is_sparse_matmul()) - { - // Find the index. Should be a quick search back. - for (int sparse_i = i - 1; sparse_i >= 0; sparse_i--) - { - if (operand->name() == scheduled_ops[sparse_i]) - { - return std::make_pair(sparse_i, current_matmul_dim_r); - } - } - } - - // No sparse matmul, switch on matmul itself - return std::make_pair(i, current_matmul_dim_r); - } - } - - // No change until the end - return std::make_pair(scheduled_ops.size(), current_matmul_dim_r); -} - -// Can we bind sparse matmul and matmul and place them atomically together in a single block. -// -bool can_bind_sparse_dense_matmul_pair( - const Graph *graph, - const graphlib::BudaOpNode *sparse_op, - OpModel const &sparse_op_model, - const graphlib::BudaOpNode *dense_op, - OpModel const &dense_op_model, - placer::InteractivePlacer const &interactive_placer, - bool allow_transpose) -{ - return sparse_op and sparse_op->is_sparse_matmul() and dense_op and - dense_op->should_pair_with_sparse(sparse_op, graph) and - sparse_op_model.grid_shape.r == dense_op_model.grid_shape.r and - interactive_placer.can_fit_on_single_epoch( - sparse_op_model.grid_shape.r, - sparse_op_model.grid_shape.c + dense_op_model.grid_shape.c, - allow_transpose) and - dense_op == graph->data_users(sparse_op)[0]; -} - -// Test whether provided value is within specified range from the target execution cycles. 
-// -bool close_to_target(std::uint32_t test, std::uint32_t target) { return (test < target) && (test > target * 0.8); } - -int get_limiter_cycles( - const OpModel &op_model, - const Graph *graph, - const DeviceConfig &device_config, - const int dram_access_core_count, - const std::unordered_set *current_epoch_nodes, - bool invalidate_cached) -{ - const float inefficency_divider = 2.0; - const float subchannel_oversub_coeff = 1.5; - TT_ASSERT(op_model.buda_op_node); - int kernel_cycles = op_model.get_execution_cycles(device_config.arch_name, false, invalidate_cached); - - if (env_as("PYBUDA_BALANCER_LEGACY_CYCLES_CALC", false)) - { - return kernel_cycles; - } - - std::vector data_operands = graph->operand_data_edges(op_model.buda_op_node); - std::vector data_users = graph->user_data_edges(op_model.buda_op_node); - - // Use half of theoretical max for better average estimate for now. - // - float noc_bw = static_cast(device_config.get_noc_bandwidth_bytes_per_cycle()) / inefficency_divider; - float dram_bw_divider = std::max( - inefficency_divider, - std::ceil(dram_access_core_count / (device_config.get_dram_num_channels() * device_config.get_dram_num_subchannels() / - subchannel_oversub_coeff))); - - // API is currently returning wrong value for WH - // tenstorrent/budabackend#2423 - // - float dram_bw = device_config.is_wormhole() - ? 20.4 / dram_bw_divider - : static_cast(device_config.get_dram_bandwidth_bytes_per_cycle()) / dram_bw_divider; - int memory_read_cycles = 0; - - for (const Edge &edge : data_operands) - { - bool producer_is_queue = graph->node_by_id(edge.producer_node_id)->node_type() == NodeType::kQueue || - graph->node_by_id(edge.producer_node_id)->node_type() == NodeType::kInput; - - if (producer_is_queue and !op_model.parameter_buffers[edge.consumer_input_port_id]) - { - memory_read_cycles = std::max( - memory_read_cycles, - static_cast(op_model.input_buffers[edge.consumer_input_port_id].total_size_bytes() / dram_bw)); - } - else - { - memory_read_cycles = std::max( - memory_read_cycles, - static_cast(op_model.input_buffers[edge.consumer_input_port_id].total_size_bytes() / noc_bw)); - } - } - - int memory_write_cycles = 0; - - for (const Edge &edge : data_users) - { - const tt::graphlib::Node *user_node = graph->node_by_id(edge.consumer_node_id); - bool consumer_is_queue = user_node->node_type() == NodeType::kQueue || - user_node->node_type() == NodeType::kOutput || - (nullptr != current_epoch_nodes && current_epoch_nodes->count(user_node) == 0); - - if (consumer_is_queue) - { - memory_write_cycles = std::max( - memory_write_cycles, - static_cast(op_model.output_buffers[edge.producer_output_port_id].total_size_bytes() / dram_bw)); - } - else - { - memory_write_cycles = std::max( - memory_write_cycles, - static_cast(op_model.output_buffers[edge.producer_output_port_id].total_size_bytes() / noc_bw)); - } - } - - return std::max({kernel_cycles, memory_read_cycles, memory_write_cycles}); -} - -bool is_output_write_to_dram_over_target( - const OpModel &op_model, const DeviceConfig &device_config, const int target_exec_cycles) -{ - int memory_write_cycles = 0; - - // API is currently returning wrong value for WH - // tenstorrent/budabackend#2423 - // - float dram_bw = device_config.is_wormhole() - ? 
20.4 / 2 - : static_cast(device_config.get_dram_bandwidth_bytes_per_cycle()) / 2; - - for (const BufferModel &output_buffer : op_model.output_buffers) - { - memory_write_cycles = - std::max(memory_write_cycles, static_cast(output_buffer.total_size_bytes() / dram_bw)); - } - - return memory_write_cycles > target_exec_cycles; -} - -// Depending on insertion instructions insert NOPs or queues directly into GraphSolver. -// -bool buffer_graph( - Graph *graph, - tt::ordered_map, InsInstructionUniqueIdHash> &inst, - legalizer::GraphSolver &graph_solver) -{ - vector buffer_info; - vector edges_to_cut; - bool graph_modified = false; - - for (auto it : inst) - { - if (it.second->instr_type == InsructionType::NopInstruction) - { - NopInsertionInstruction *nopInsertInst = static_cast(it.second.get()); - for (graphlib::Edge edge : graph->get_edges( - graph->get_node_by_name(nopInsertInst->src), graph->get_node_by_name(nopInsertInst->dest))) - { - if (edge.edge_type != graphlib::EdgeType::kData) - { - continue; - } - - buffer_info.emplace_back(edge, nopInsertInst->nop_count, nopInsertInst->hoist_tms); - } - } - else if (it.second->instr_type == InsructionType::QueueInstruction) - { - QueueInsertionInstruction *qInsertInst = static_cast(it.second.get()); - std::function edge_filter = [qInsertInst](Edge edge) - { return edge.consumer_input_port_id == qInsertInst->input_id.value(); }; - std::vector operand_edges = - graph->operand_data_edges(graph->get_node_by_name(qInsertInst->dest), edge_filter); - TT_ASSERT(operand_edges.size() == 1, "Expected exactly one operand edge per queue instruction!"); - edges_to_cut.push_back(operand_edges[0]); - } - else - { - TT_THROW("Unexpected insertion instruction type!"); - } - } - - if (buffer_info.size() > 0) - { - auto result = graph_solver.buffer(buffer_info); - graph_modified = true; - TT_ASSERT(result.size() > 0, "Expected buffering to occur but nothing was buffered!"); - } - - if (edges_to_cut.size() > 0) - { - graph_solver.cut(edges_to_cut); - } - - return graph_modified; -} - -} // namespace tt::balancer diff --git a/pybuda/csrc/balancer/policies/policy_utils.hpp b/pybuda/csrc/balancer/policies/policy_utils.hpp deleted file mode 100644 index 8fc6713a9..000000000 --- a/pybuda/csrc/balancer/policies/policy_utils.hpp +++ /dev/null @@ -1,283 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include -#include -#include -#include - -#include "balancer/balancer.hpp" -#include "balancer/balancer_utils.hpp" -#include "balancer/policies/policy_types.hpp" -#include "utils/logger.hpp" - -namespace tt::placer -{ -class InteractivePlacer; -} - -namespace tt -{ -// last bool in InsInstructionUniqueId represents instruction field mergeable. 
Tells if nop is mergeable (when two nops -// have the same producer and are flagged as mergeable, they are merged into one that feeds all of their consummers) -using InsInstructionUniqueId = std::tuple; -struct InsInstructionUniqueIdHash; -struct InsertionInstruction; -} // namespace tt - -namespace tt::balancer -{ - -struct EpochCost -{ - int setup_cycles = 0; - int runtime_cycles = 0; -}; - -OpModelMap to_op_model_map(OpModels const& selected_op_models); - -placer::PlacerSolution run_placer( - Graph const* graph, const BalancerConfig& config, OpModelMap const& selected_op_models); - -void dump_balancer_placer_data( - Graph const* graph, - std::vector chip_ids, - tt::placer::PlacerSolution const& placer_solution, - OpModelMap const& op_model_map, - std::ostream& of, - const std::string& arch_name); - -std::vector calculate_epoch_costs( - placer::PlacerSolution const& placer_solution, OpModelMap const& selected_op_models, std::string const& arch_name); - -inline int epoch_costs_sum(std::vector const& epoch_costs) -{ - int sum = 0; - for (EpochCost cost : epoch_costs) sum += cost.setup_cycles + cost.runtime_cycles; - return sum; -} - -inline PolicyType policy_from_string(std::string const& s) -{ - if (s == "MaximizeTMinimizeGrid") - return PolicyType::MaximizeTMinimizeGrid; - else if (s == "MinimizeGrid") - return PolicyType::MinimizeGrid; - else if (s == "Random") - return PolicyType::Random; - else if (s == "NLP") - return PolicyType::NLP; - else if (s == "CNN") - return PolicyType::CNN; - else if (s == "Ribbon") - return PolicyType::Ribbon; - else if (s == "default") // default policy - return PolicyType::NLP; - - log_error(LogBalancer, "Failed to parse policy from string: {}", s); - log_error(LogBalancer, "Falling back to PolicyType::MinimizeGrid"); - - return PolicyType::MinimizeGrid; -} - -inline std::string policy_to_string(PolicyType p) -{ - switch (p) - { - case PolicyType::MinimizeGrid: return "MinimizeGrid"; - case PolicyType::Random: return "Random"; - case PolicyType::NLP: return "NLP"; - case PolicyType::CNN: return "CNN"; - case PolicyType::Ribbon: return "Ribbon"; - default: break; - } - return "Unknown"; -} - -void epoch_or_chip_break_remove_processed_nodes( - const Graph* graph, - std::vector>& op_names_to_epoch_or_chip_break, - const std::unordered_set& processed_nodes); - -std::pair> policy_run_scheduler( - graphlib::Graph const* graph, - BalancerConfig const& config, - const std::unordered_set& processed_nodes, - const tt::scheduler::Schedule& processed_schedule, - std::vector& op_names_to_epoch_break); - -std::tuple, std::unordered_set, std::unordered_set> policy_run_scheduler( - graphlib::Graph const* graph, - BalancerConfig const& config, - const std::unordered_set& processed_nodes, - const tt::scheduler::Schedule& processed_schedule, - std::vector>& op_names_to_epoch_break, - std::vector& op_names_to_chip_break); - -int get_limiter_cycles( - const OpModel& op_model, - const Graph* graph, - const DeviceConfig& device_config, - const int dram_access_core_count = 0, - const std::unordered_set* current_epoch_nodes = nullptr, - bool invalidate_cached = false); - -bool is_output_write_to_dram_over_target( - const OpModel& op_model, const DeviceConfig& device_config, const int target_exec_cycles); - -void cut_graph_solver_epoch( - const graphlib::Graph* graph, placer::InteractivePlacer& placer, legalizer::GraphSolver& graph_solver); - -void validate_solution(const std::vector& scheduled_ops, const placer::PlacerSolution& placer_solution); - -std::unordered_set 
calculate_current_epoch_nodes( - const Graph* graph, const std::unordered_set& current_epoch_ops); - -void set_op_model_for_node( - legalizer::GraphSolver& graph_solver, - const graphlib::Node* node, - const OpModel& selected_op_model, - std::string const& arch_name); - -void set_op_model_for_node_ribbon( - legalizer::GraphSolver& graph_solver, - const graphlib::Node* op, - const OpModel& selected_op_model, - std::uint32_t current_ribbon_size); - -int ribbon_buffering_factor(const OpModel& op_model); - -bool is_matmul(const graphlib::BudaOpNode* op); - -bool prologue_ok(const OpModel& op_model); -bool ukt_ok(const OpModel& op_model); -bool mblock_size_ok(const OpModel& op_model); -bool is_candidate_better_than_current( - const OpModel& current, - const OpModel& candidate, - const Graph* graph, - int ribbon_size, - int target_exec_cycles, - const DeviceConfig& device_config); - -std::uint32_t pick_ribbon_size( - std::uint32_t start_index, - std::uint32_t end_index, - const Graph* graph, - const legalizer::GraphSolver& graph_solver, - const std::vector& scheduled_ops, - std::uint32_t device_rows); - -void cut_graph_solver_ribbon( - const graphlib::Graph* graph, - const graphlib::Node* op, - placer::InteractivePlacer& placer, - legalizer::GraphSolver& graph_solver); - -std::pair get_next_ribbon_change_op( - const graphlib::Graph* graph, - std::uint32_t current_index, - const std::vector& scheduled_ops, - std::uint32_t current_matmul_dim_r = 0); - -bool can_bind_sparse_dense_matmul_pair( - const Graph* graph, - const graphlib::BudaOpNode* sparse_op, - OpModel const& sparse_op_model, - const graphlib::BudaOpNode* dense_op, - OpModel const& dense_op_model, - placer::InteractivePlacer const& interactive_placer, - bool allow_transpose); - -bool close_to_target(std::uint32_t test, std::uint32_t target); - -bool validate_sparse_matmul_model( - const graphlib::BudaOpNode* op, - const OpModel& op_model, - const graphlib::Graph* graph, - std::unordered_set& validated_cache); - -bool can_fit_on_single_epoch( - tt::placer::InteractivePlacer& ip_fittment_tester, - const std::string& op_name_1, - const tt::balancer::GridShape& op_shape_1, - const std::string& op_name_2, - const tt::balancer::GridShape& op_shape_2, - bool enable_transpose = true); - -template -OpModel select_best_op_model_ribbon( - const T& current_graph_solver, - const graphlib::BudaOpNode* op, - std::uint32_t current_ribbon_size, - const BalancerConfig& config, - const graphlib::Graph* graph, - std::unordered_set& validated_cache, - const int target_cycles) -{ - auto op_models = current_graph_solver.at(op); - bool is_sparse_matmul = op->is_sparse_matmul(); - const OpModel* selected_op_model = nullptr; - log_trace(LogBalancer, " Selecting best op_model for {}. Choices:", op->name()); - - for (const auto& op_model : op_models) - { - log_trace( - LogBalancer, - " Examining Grid: {}, {}, stream: {}", - op_model.grid_shape.r, - op_model.grid_shape.c, - op_model.t_stream_factor); - - // If it is sparse matmul op skip op model that can't be encoded. - if (is_sparse_matmul) - { - if (!validate_sparse_matmul_model(op, op_model, graph, validated_cache)) - { - log_trace( - LogBalancer, - " Invalid sparse matmul op model. Grid: {}, {}, stream: {}", - op_model.grid_shape.r, - op_model.grid_shape.c, - op_model.t_stream_factor); - - continue; - } - } - - // If it is first valid op model select it. 
- if (nullptr == selected_op_model) - { - selected_op_model = &op_model; - continue; - } - - // If we already have valid op model selected compare it with new one and select better. - if (is_candidate_better_than_current( - *selected_op_model, - op_model, - graph, - current_ribbon_size, - target_cycles, - config.device_config)) - { - selected_op_model = &op_model; - } - } - - TT_ASSERT(nullptr != selected_op_model, "No valid op_models for operation: ", op->name()); - - return *selected_op_model; -} - -bool buffer_graph( - Graph* graph, - tt::ordered_map< - tt::InsInstructionUniqueId, - std::shared_ptr, - tt::InsInstructionUniqueIdHash>& inst, - legalizer::GraphSolver& graph_solver); -} // namespace tt::balancer diff --git a/pybuda/csrc/balancer/python_bindings.cpp b/pybuda/csrc/balancer/python_bindings.cpp deleted file mode 100644 index 22701dcf2..000000000 --- a/pybuda/csrc/balancer/python_bindings.cpp +++ /dev/null @@ -1,552 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "balancer/python_bindings.hpp" - -#include - -#include "balancer/balancer.hpp" -#include "balancer/exceptions.hpp" -#include "balancer/policies/policy_utils.hpp" -#include "balancer/python_interface.hpp" -#include "balancer/balancer_utils.hpp" -#include "graph_lib/utils.hpp" -#include "placer/placer.hpp" -#include "passes/fuse_ops.hpp" - -#include "third_party/json/json.hpp" - -using namespace tt::balancer; - -template -inline std::optional> pair_as_array(std::optional> const& p) -{ - if (not p) - return std::nullopt; - return std::array{p->first, p->second}; -} - -template -inline std::optional> array_as_pair(std::optional> const& p) -{ - if (not p) - return std::nullopt; - return std::make_pair((*p)[0], (*p)[1]); -} - -void BalancerModule(py::module &m_balancer) { - py::class_>(m_balancer, "BalancerSolution") - .def_readonly("placer_solution", &BalancerSolution::placer_solution) - .def_readonly("op_models", &BalancerSolution::op_models) - .def_readonly("output_host_tms", &BalancerSolution::output_host_tms) - .def("cut_edges_as_override", [](BalancerSolution const& s, tt::graphlib::Graph* graph) { - std::vector> edges; - edges.reserve(s.graph_solver_cut_edges.size()); - for (auto [edge, a] : s.graph_solver_cut_edges) - { - edges.push_back(std::make_tuple( - graph->node_by_id(edge.producer_node_id)->name(), - graph->node_by_id(edge.consumer_node_id)->name(), - edge.consumer_input_port_id)); - } - return edges; - }); - - py::enum_(m_balancer, "PolicyType") - .value("MinimizeGrid", PolicyType::MinimizeGrid) - .value("Random", PolicyType::Random) - .value("NLP", PolicyType::NLP) - .value("CNN", PolicyType::CNN) - .value("Ribbon", PolicyType::Ribbon) - .export_values(); - - py::class_(m_balancer, "GridShape") - .def_readonly("r", &GridShape::r) - .def_readonly("c", &GridShape::c) - .def("__eq__", [](GridShape const& a, GridShape const& b) { return a == b; }) - .def( - "__eq__", - [](GridShape const& a, std::pair const& b) { return a == GridShape(b.first, b.second); }) - .def("__repr__", [](GridShape const& a) { - std::stringstream ss; - ss << a; - return ss.str(); - }); - - using OpOverrideTypes = std::variant, std::string, std::optional, std::optional>, std::optional>>; - py::class_(m_balancer, "OpOverride") - .def(py::init<>()) - .def_readwrite("grid_shape", &OpOverride::grid_shape) - .def_readwrite("force_dram_parameters", &OpOverride::force_dram_parameters) - .def_readwrite("t_stream_dir", &OpOverride::t_stream_dir) - .def_readwrite("t_stream_shape", 
&OpOverride::t_stream_shape) - .def_readwrite("fracture_factor", &OpOverride::fracture_factor) - .def_readwrite("u_kt", &OpOverride::u_kt) - .def_readwrite("input_buffer_multiplier", &OpOverride::input_buffer_multiplier) - .def_readwrite("output_buffer_multiplier", &OpOverride::output_buffer_multiplier) - .def(py::pickle( - [](const OpOverride& p) { // __getstate__ - return py::make_tuple( - p.grid_shape, - p.force_dram_parameters, - p.t_stream_dir, - p.t_stream_shape, - p.fracture_factor, - p.u_kt, - p.input_buffer_multiplier, - p.output_buffer_multiplier); - }, - [](py::tuple t) { // __setstate__ - if (t.size() != 8) - { - throw std::runtime_error("OpOverride: Invalid state!"); - } - - OpOverride p; - p.grid_shape = t[0].cast>>(); - p.force_dram_parameters = t[1].cast>(); - p.t_stream_dir = t[2].cast(); - p.t_stream_shape = t[3].cast>>(); - p.fracture_factor = t[4].cast>(); - p.u_kt = t[5].cast>(); - p.input_buffer_multiplier = t[6].cast>>(); - p.output_buffer_multiplier = t[7].cast>(); - return p; - })) - .def( - "to_json", - [](OpOverride const& op_override) - { - std::unordered_map d; - d["grid_shape"] = pair_as_array(op_override.grid_shape); - d["force_dram_parameters"] = op_override.force_dram_parameters; - d["t_stream_dir"] = op_override.t_stream_dir; - d["t_stream_shape"] = pair_as_array(op_override.t_stream_shape); - d["fracture_factor"] = op_override.fracture_factor; - d["u_kt"] = op_override.u_kt; - d["input_buffer_multiplier"] = op_override.input_buffer_multiplier; - d["output_buffer_multiplier"] = op_override.output_buffer_multiplier; - return d; - }) - .def( - "from_json", - [](std::unordered_map const& d) - { - OpOverride op_override; - if (auto match = d.find("grid_shape"); - match != d.end() && std::holds_alternative>>(match->second)) - op_override.grid_shape = array_as_pair(std::get>>(match->second)); - if (auto match = d.find("force_dram_parameters"); match != d.end()) - op_override.force_dram_parameters = std::get>(match->second); - if (auto match = d.find("t_stream_dir"); match != d.end()) - op_override.t_stream_dir = std::get(match->second); - if (auto match = d.find("t_stream_shape"); - match != d.end() && std::holds_alternative>>(match->second)) - op_override.t_stream_shape = - array_as_pair(std::get>>(match->second)); - if (auto match = d.find("fracture_factor"); - match != d.end() and std::holds_alternative>(match->second)) - op_override.fracture_factor = std::get>(match->second); - if (auto match = d.find("u_kt"); - match != d.end() and std::holds_alternative>(match->second)) - op_override.fracture_factor = std::get>(match->second); - if (auto match = d.find("input_buffer_multiplier"); - match != d.end() && - std::holds_alternative>>(match->second)) - op_override.input_buffer_multiplier = - std::get>>(match->second); - if (auto match = d.find("output_buffer_multiplier"); - match != d.end() && std::holds_alternative>(match->second)) - op_override.output_buffer_multiplier = std::get>(match->second); - return op_override; - }); - - py::enum_(m_balancer, "ChipPlacementPolicy") - .value("MMIO_LAST", tt::placer::ChipPlacementPolicy::MMIO_LAST) - .value("SNAKE", tt::placer::ChipPlacementPolicy::SNAKE) - .export_values(); - - py::enum_(m_balancer, "GraphSolverSelfCutType") - .value("None", legalizer::GraphSolverSelfCutType::None) - .value("ConsumerOperandDataEdgesFirst", legalizer::GraphSolverSelfCutType::ConsumerOperandDataEdgesFirst) - .value("ProducerUserDataEdgesFirst", legalizer::GraphSolverSelfCutType::ProducerUserDataEdgesFirst) - .value("FastCut", 
legalizer::GraphSolverSelfCutType::FastCut) - .export_values(); - - py::class_(m_balancer, "BalancerConfig") - .def( - py::init< - tt::DeviceConfig, - tt::scheduler::SchedulerConfig, - PolicyType, - int, - std::vector, - tt::placer::ChipPlacementPolicy, - bool, - bool, - bool, - bool, - bool, - bool, - std::unordered_map, - std::vector>, - std::vector>, - std::unordered_map, - std::unordered_map, - bool, - legalizer::GraphSolverSelfCutType, - bool, - bool, - bool>(), - py::arg("device_config"), - py::arg("scheduler_config"), - py::arg("policy_type") = PolicyType::NLP, - py::arg("random_policy_seed") = 0, - py::arg("chip_ids") = std::vector{0}, - py::arg("chip_placement_policy") = tt::placer::ChipPlacementPolicy::MMIO_LAST, - py::arg("default_dram_parameters") = false, - py::arg("skip_l1_usage_validation") = false, - py::arg("enable_t_streaming") = false, - py::arg("manual_t_streaming") = false, - py::arg("input_queues_on_host") = true, - py::arg("output_queues_on_host") = true, - py::arg("op_overrides") = std::unordered_map{}, - py::arg("op_names_to_epoch_break") = std::vector>{}, - py::arg("op_names_to_chip_break") = std::vector>{}, - py::arg("op_names_to_chip_id_assignment") = std::unordered_map{}, - py::arg("op_name_to_placer_overrides") = std::unordered_map{}, - py::arg("enable_auto_transposing_placement") = false, - py::arg("graph_solver_self_cut_type") = legalizer::GraphSolverSelfCutType::None, - py::arg("use_interactive_placer") = true, - py::arg("enable_enumerate_u_kt") = true, - py::arg("enable_single_buffer_fallback") = false) - .def_readwrite("device_config", &BalancerConfig::device_config) - .def_readwrite("scheduler_config", &BalancerConfig::scheduler_config) - .def_readwrite("policy_type", &BalancerConfig::policy_type) - .def_readwrite("random_policy_seed", &BalancerConfig::random_policy_seed) - .def_readwrite("chip_ids", &BalancerConfig::chip_ids) - .def_readwrite("default_dram_parameters", &BalancerConfig::default_dram_parameters) - .def_readwrite("skip_l1_usage_validation", &BalancerConfig::skip_l1_usage_validation) - .def_readwrite("enable_t_streaming", &BalancerConfig::enable_t_streaming) - .def_readwrite("manual_t_streaming", &BalancerConfig::manual_t_streaming) - .def_readwrite("input_queues_on_host", &BalancerConfig::input_queues_on_host) - .def_readwrite("output_queues_on_host", &BalancerConfig::output_queues_on_host) - .def_readwrite("op_overrides", &BalancerConfig::op_overrides) - .def_readwrite("op_names_to_epoch_break", &BalancerConfig::op_names_to_epoch_break) - .def_readwrite("op_names_to_chip_break", &BalancerConfig::op_names_to_chip_break) - .def_readwrite("op_names_to_chip_id_assignment", &BalancerConfig::op_names_to_chip_break) - .def_readwrite("op_name_to_placer_overrides", &BalancerConfig::op_name_to_placer_overrides) - .def_readwrite("enable_auto_transposing_placement", &BalancerConfig::enable_auto_transposing_placement) - .def_readwrite("graph_solver_self_cut_type", &BalancerConfig::graph_solver_self_cut_type) - .def_readwrite("use_interactive_placer", &BalancerConfig::use_interactive_placer) - .def_readwrite("enable_enumerate_u_kt", &BalancerConfig::enable_enumerate_u_kt) - .def_readwrite("enable_single_buffer_fallback", &BalancerConfig::enable_single_buffer_fallback) - .def_readwrite("fork_join_tiles_treshold", &BalancerConfig::fork_join_tiles_treshold); - - py::class_(m_balancer, "TensorShape") - .def_readonly("w", &TensorShape::w) - .def_readonly("z", &TensorShape::z) - .def_readonly("rt", &TensorShape::rt) - .def_readonly("ct", &TensorShape::ct) - 
.def("__getitem__", [](TensorShape const& shape, int i) { return shape[i]; }) - .def("__setitem__", [](TensorShape& shape, int i, int val) { shape[i] = val; }) - .def( - "__repr__", - [](TensorShape const& a) - { - std::stringstream ss; - ss << a; - return ss.str(); - }); - - py::class_(m_balancer, "OpShape") - .def( - py::init( - [](std::vector> const& input_shapes, - std::tuple const& output_shape, - bool scalar_dims = true) - { - auto tile_dim = [scalar_dims](int scalar_or_tile) - { return scalar_dims ? (scalar_or_tile / tt::graphlib::Shape::BUDA_TILE_DIM) : scalar_or_tile; }; - std::vector tensor_input_shapes; - for (auto [w, z, r, c] : input_shapes) - tensor_input_shapes.emplace_back(w, z, tile_dim(r), tile_dim(c)); - auto [w, z, r, c] = output_shape; - return OpShape( - tensor_input_shapes, tensor_input_shapes, {TensorShape(w, z, tile_dim(r), tile_dim(c))}); - }), - py::arg("input_shapes"), - py::arg("output_shape"), - py::arg("scalar_dims") = true) - .def_readonly("inputs", &OpShape::inputs) - .def_readonly("outputs", &OpShape::outputs) - .def( - "__repr__", - [](OpShape const& a) - { - std::stringstream ss; - ss << a; - return ss.str(); - }); - - py::class_(m_balancer, "UBlockShape") - .def_readonly("rt", &UBlockShape::rt) - .def_readonly("ct", &UBlockShape::ct) - .def("volume", [](UBlockShape const& a) { return a.volume(); }) - .def("__eq__", [](UBlockShape const& a, UBlockShape const& b) { return a == b; }) - .def( - "__eq__", - [](UBlockShape const& a, std::pair const& b) { return a.rt == b.first and a.ct == b.second; }) - .def("__repr__", [](UBlockShape const& a) { - std::stringstream ss; - ss << a; - return ss.str(); - }); - - py::class_(m_balancer, "BlockShape") - .def_readonly("t", &BlockShape::t) - .def_readonly("mblock_m", &BlockShape::mblock_m) - .def_readonly("mblock_n", &BlockShape::mblock_n) - .def_readonly("ublock", &BlockShape::ublock) - .def("volume", &BlockShape::volume) - .def("buffer_tiles", &BlockShape::buffer_tiles) - .def("__eq__", [](BlockShape const& a, BlockShape const& b) { return a == b; }) - .def("__repr__", [](BlockShape const& a) { - std::stringstream ss; - ss << a; - return ss.str(); - }); - - py::enum_(m_balancer, "TStreamDir") - .value("R", TStreamDir::Value::R) - .value("C", TStreamDir::Value::C) - .value("RZ", TStreamDir::Value::RZ) - .value("CZ", TStreamDir::Value::CZ) - .export_values(); - - py::class_(m_balancer, "TStreamFactor") - .def_readonly("r", &TStreamFactor::r) - .def_readonly("c", &TStreamFactor::c) - .def_property_readonly("dir", [](TStreamFactor const& a) { return a.dir.v; }) - .def("__eq__", [](TStreamFactor const& a, TStreamFactor const& b) { return a == b; }) - .def("__repr__", [](TStreamFactor const& a) { - std::stringstream ss; - ss << a; - return ss.str(); - }); - - py::class_(m_balancer, "BufferModel") - .def_readonly("block_shape", &BufferModel::block_shape) - .def_readonly("l1_size_tiles", &BufferModel::l1_size_tiles) - .def_readonly("data_format", &BufferModel::data_format) - .def("__repr__", [](BufferModel const& a) { - std::stringstream ss; - ss << a; - return ss.str(); - }); - - py::class_(m_balancer, "OpModel") - .def_readonly("grid_shape", &OpModel::grid_shape) - .def_readonly("op_shape", &OpModel::op_shape) - .def("op_type", &OpModel::op_type) - .def("buda_op_attrs", &OpModel::buda_op_attrs) - .def("get_reduce_dim", &OpModel::get_reduce_dim) - .def_readonly("data_format", &OpModel::data_format) - .def("math_fidelity", &OpModel::math_fidelity) - .def_readonly("t_stream_factor", &OpModel::t_stream_factor) - 
.def_readonly("fracture_factor", &OpModel::fracture_factor) - .def_readonly("sparse_indices", &OpModel::sparse_indices) - .def_readonly("input_buffers", &OpModel::input_buffers) - .def_readonly("output_buffers", &OpModel::output_buffers) - .def_readonly("parameter_buffers", &OpModel::parameter_buffers) - .def_readonly("is_sparse_matmul", &OpModel::is_sparse_matmul) - .def_readonly("nz_tiles", &OpModel::nz_tiles) - .def_readonly("nz_ublocks", &OpModel::nz_ublocks) - .def_readonly("nz_strips", &OpModel::nz_strips) - .def("block_shape", &OpModel::block_shape) - .def("__repr__", [](OpModel const& a) { - std::stringstream ss; - ss << a; - return ss.str(); - }); - - py::class_(m_balancer, "FusedSubOpModel") - .def_readonly("type", &FusedSubOpModel::type) - .def_readonly("mblock_m", &FusedSubOpModel::mblock_m) - .def_readonly("mblock_n", &FusedSubOpModel::mblock_n) - .def_readonly("ublock_rt", &FusedSubOpModel::ublock_rt) - .def_readonly("ublock_ct", &FusedSubOpModel::ublock_ct) - .def_readonly("mblock_k", &FusedSubOpModel::mblock_k) - .def_readonly("ublock_kt", &FusedSubOpModel::ublock_kt) - .def_readonly("reduce_dim", &FusedSubOpModel::reduce_dim) - .def_readonly("has_dest_input", &FusedSubOpModel::has_dest_input) - .def_readonly("has_dest_output", &FusedSubOpModel::has_dest_output) - .def("__repr__", [](FusedSubOpModel const& a) { - std::stringstream ss; - ss << a; - return ss.str(); - }); - - using OutputHostTMTypes = std::variant; - py::class_(m_balancer, "OutputHostTM") - .def_readonly("hstack_factor", &OutputHostTM::hstack_factor) - .def_readonly("vstack_factor", &OutputHostTM::vstack_factor) - .def_readonly("row_major", &OutputHostTM::row_major) - .def(py::pickle( - [](const OutputHostTM &p) { // __getstate__ - return py::make_tuple( - p.hstack_factor, - p.vstack_factor, - p.row_major - ); - }, - [](py::tuple t) { // __setstate__ - if (t.size() != 3) - throw std::runtime_error("OutputHostTM: Invalid state!"); - - OutputHostTM p = { - .hstack_factor = t[0].cast(), - .vstack_factor = t[1].cast(), - .row_major = t[2].cast() - }; - - return p; - })) - .def( - "to_json", - [](OutputHostTM const& output_host_tm) { - std::unordered_map d; - d["hstack_factor"] = output_host_tm.hstack_factor; - d["vstack_factor"] = output_host_tm.vstack_factor; - d["row_major"] = output_host_tm.row_major; - return d; - }) - .def_static("from_json", [](std::unordered_map const& d) { - OutputHostTM output_host_tm; - if (auto match = d.find("hstack_factor"); - match != d.end() && std::holds_alternative(match->second)) - output_host_tm.hstack_factor = std::get(match->second); - if (auto match = d.find("vstack_factor"); match != d.end()) - output_host_tm.vstack_factor = std::get(match->second); - if (auto match = d.find("row_major"); match != d.end()) - output_host_tm.row_major = std::get(match->second); - return output_host_tm; - }); - - py::class_(m_balancer, "Constant").def(py::init(), py::arg("value")); - - py::class_(m_balancer, "FactorizedInt") - .def(py::init<>()) - .def(py::init(), py::arg("max_val")) - .def(py::init>(), py::arg("range")) - .def(py::init(), py::arg("constant")) - .def("__and__", &FactorizedInt::operator&) - .def("__or__", &FactorizedInt::operator|) - .def("__sub__", &FactorizedInt::operator-) - .def("__mul__", &FactorizedInt::operator*) - .def("__div__", &FactorizedInt::operator/) - .def_property_readonly("factors", &FactorizedInt::get_factors) - .def_property_readonly("max_factor", &FactorizedInt::get_max_factor); - - m_balancer.def( - "policy_from_string", &policy_from_string, "Returns policy 
type from string", py::arg("policy_type_str")); - - m_balancer.def( - "graph_solver_self_cut_type_from_string", - &legalizer::graph_solver_self_cut_type_from_string, - "Returns graph solver self cut type from string", - py::arg("graph_solver_self_cut_type_from_string_str")); - - m_balancer.def( - "can_use_interactive_placer", - &can_use_interactive_placer, - "Returns whether provided policy can use interactive placer", - py::arg("policy_type")); - - m_balancer.def( - "chip_placement_policy_from_string", - &tt::placer::chip_placement_policy_from_string, - "Returns how chip ids will be ordered in placement", - py::arg("chip_placement_policy_str")); - -} - -// python_interface.hpp implementation -namespace tt::balancer -{ - -std::pair get_parallelization( - Graph const* graph, OpNode const* node, int fracture_factor, bool sparse_buffer_enable) -{ - auto eval_module = py::module_::import("pybuda.op.eval.buda"); - py::function pybuda_parallelization = eval_module.attr("get_f_pybuda_parallelization")(node->op_type_ptr()); - - auto op_shape = get_op_shape(graph, node); - if ( (node->node_type() == graphlib::kBudaOp) && node->as()->is_fused_op()) - { - // For the purposes of parallelization, the output op shape is the greatest common divisor of outputs - // of all fused op shapes - auto fused_op = node->as()->get_fused_op(); - - const auto &schedules = fused_op->get_schedules(); - balancer::TensorShape gcd_shape = schedules[0].ops[0].op_shape.outputs[0]; - for (const auto &schedule : schedules) - for (const auto &op: schedule.ops) - { - gcd_shape.w = gcd(gcd_shape.w, op.op_shape.outputs[0].w); - gcd_shape.z = gcd(gcd_shape.z, op.op_shape.outputs[0].z); - gcd_shape.rt = gcd(gcd_shape.rt, op.op_shape.outputs[0].rt); - gcd_shape.ct = gcd(gcd_shape.ct, op.op_shape.outputs[0].ct); - - // We need to take all fused op post tm inputs into account too. 
- for (std::uint32_t input_id = 0; input_id < op.inputs.size(); input_id++) - { - if (op.inputs[input_id].type != FusedSubOpInput::InputType::INPUT) - continue; - - gcd_shape.w = gcd(gcd_shape.w, op.op_shape.inputs[input_id].w); - gcd_shape.z = gcd(gcd_shape.z, op.op_shape.inputs[input_id].z); - gcd_shape.rt = gcd(gcd_shape.rt, op.op_shape.inputs[input_id].rt); - gcd_shape.ct = gcd(gcd_shape.ct, op.op_shape.inputs[input_id].ct); - } - } - - log_trace(LogBalancer, "OpOverriding output shape for parallelization of {} to ({},{},{},{})", - node->name(), - gcd_shape.w, - gcd_shape.z, - gcd_shape.rt, - gcd_shape.ct); - op_shape.outputs[0] = gcd_shape; - } - else if (node->as()->is_sparse_matmul() and sparse_buffer_enable) - { - int bcast_factor = - graph->data_operands(node)[0]->as()->get_sparse_buda().bcast_factor; - auto [r, c] = pybuda_parallelization(op_shape, fracture_factor).cast>(); - TT_ASSERT((r % bcast_factor) == 0); - return std::make_pair(r / bcast_factor, c); - } - return pybuda_parallelization(op_shape, fracture_factor).cast>(); -} - -int get_execution_cycles(std::string const& arch_name, OpModel const& op_model, bool theoretical, std::vector const& sub_op_models) -{ - auto eval_module = py::module_::import("pybuda.op.eval.buda"); - py::function pybuda_op_execution_cycles = - eval_module.attr("get_f_pybuda_execution_cycles")(op_model.buda_op_node->op_type_ptr()); - if (op_model.buda_op_node->op_type() == "matmul") - { - // Theoretical execution cycles are only applicable to matmuls - return pybuda_op_execution_cycles(arch_name, op_model, theoretical).cast(); - } - - if (op_model.fused_op() != nullptr) - { - return pybuda_op_execution_cycles(arch_name, op_model, sub_op_models).cast(); - } - - return pybuda_op_execution_cycles(arch_name, op_model).cast(); -} - -} // namespace tt::balancer diff --git a/pybuda/csrc/balancer/python_bindings.hpp b/pybuda/csrc/balancer/python_bindings.hpp deleted file mode 100644 index a98544573..000000000 --- a/pybuda/csrc/balancer/python_bindings.hpp +++ /dev/null @@ -1,10 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include -#include -#include -namespace py = pybind11; - -void BalancerModule(py::module &m_balancer); - diff --git a/pybuda/csrc/balancer/python_interface.hpp b/pybuda/csrc/balancer/python_interface.hpp deleted file mode 100644 index 903516826..000000000 --- a/pybuda/csrc/balancer/python_interface.hpp +++ /dev/null @@ -1,26 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include -#include - -#include "balancer/types.hpp" -#include "graph_lib/graph.hpp" -#include "graph_lib/node.hpp" -#include "graph_lib/node_types.hpp" -#include "lower_to_buda/common.hpp" - -using Graph = tt::graphlib::Graph; -using Node = tt::graphlib::Node; -using OpNode = tt::graphlib::OpNode; - -namespace tt::balancer { - -// Defined by buda_bindings_balancer.cpp and tbd by test harness -std::pair get_parallelization(Graph const* graph, OpNode const* node, int fracture_factor, bool sparse_buffer_enable); -int get_execution_cycles(std::string const& arch_name, OpModel const& op_model, bool theoretical = false, std::vector const& sub_op_models = {}); - -} // namespace tt::balancer diff --git a/pybuda/csrc/balancer/tests/json/ghostnet_subgraph.json b/pybuda/csrc/balancer/tests/json/ghostnet_subgraph.json deleted file mode 100644 index 0f4f7b921..000000000 --- a/pybuda/csrc/balancer/tests/json/ghostnet_subgraph.json +++ 
/dev/null @@ -1 +0,0 @@ -{"flags": ["FastCut"], "node_names":{"10725":"conv2d_14.dc.matmul.8","10735":"multiply_21","10750":"input_1_add_15_fork_clone1895","10751":"add_26","10772":"input_1_add_15","10777":"lc.input_tensor.conv2d_29.dc.sparse_matmul.10.dc.sparse_matmul.1.0","10778":"lc.input_tensor.conv2d_29.dc.sparse_matmul.10.dc.sparse_matmul.1.1","10779":"conv2d_29.dc.sparse_matmul.10.dc.sparse_matmul.1.lc2","10782":"conv2d_29.dc.matmul.12","10791":"blocks.0.0.ghost1.cheap_operation.0.weight","10794":"multiply_36","10809":"input_1_add_30_fork_clone1898","10810":"add_41","10831":"input_1_add_30","10834":"concatenate_43.dc.concatenate.2","10836":"lc.input_tensor.concatenate_43.dc.sparse_matmul.5.0","10837":"lc.input_tensor.concatenate_43.dc.sparse_matmul.5.1","10838":"concatenate_43.dc.sparse_matmul.5.lc2","10843":"conv2d_44.dc.matmul.8","10850":"blocks.0.0.ghost2.primary_conv.0.weight","10853":"multiply_51","15745":"conv2d_44.dc.matmul.8_transpose_nop_0"},"edges":[{"producer":10725,"consumer":10735,"input_port":0,"paths":[[0,0],[0,9],[0,16],[0,18],[0,24],[0,26],[1,1],[1,10],[1,16],[1,19],[1,24],[1,27],[2,2],[2,11],[2,16],[2,20],[2,24],[2,28],[3,3],[3,12],[3,16],[3,21],[3,24],[3,29],[4,4],[4,13],[4,16],[4,22],[4,24],[4,30],[5,5],[5,14],[5,16],[5,23],[5,24],[6,6],[6,15],[6,16],[6,24],[8,8],[8,16],[8,17],[8,24],[8,25],[9,0],[9,9],[9,16],[9,18],[9,24],[9,26],[10,1],[10,10],[10,16],[10,19],[10,24],[10,27],[11,2],[11,11],[11,16],[11,20],[11,24],[11,28],[12,3],[12,12],[12,16],[12,21],[12,24],[12,29],[13,4],[13,13],[13,16],[13,22],[13,24],[13,30],[14,5],[14,14],[14,16],[14,23],[14,24],[15,6],[15,15],[15,16],[15,24],[17,8],[17,16],[17,17],[17,24],[17,25],[18,0],[18,9],[18,16],[18,18],[18,24],[18,26],[19,1],[19,10],[19,16],[19,19],[19,24],[19,27],[20,2],[20,11],[20,16],[20,20],[20,24],[20,28],[21,3],[21,12],[21,16],[21,21],[21,24],[21,29],[22,4],[22,13],[22,16],[22,22],[22,24],[22,30],[23,5],[23,14],[23,16],[23,23],[23,24],[25,8],[25,16],[25,17],[25,24],[25,25],[26,0],[26,9],[26,16],[26,18],[26,24],[26,26],[27,1],[27,10],[27,16],[27,19],[27,24],[27,27],[28,2],[28,11],[28,16],[28,20],[28,24],[28,28],[29,3],[29,12],[29,16],[29,21],[29,24],[29,29],[30,4],[30,13],[30,16],[30,22],[30,24],[30,30]]},{"producer":10735,"consumer":10751,"input_port":0,"paths":[[0,0],[0,9],[0,16],[0,18],[0,24],[0,26],[1,1],[1,10],[1,16],[1,19],[1,24],[1,27],[2,2],[2,11],[2,16],[2,20],[2,24],[2,28],[3,3],[3,12],[3,16],[3,21],[3,24],[3,29],[4,4],[4,13],[4,16],[4,22],[4,24],[4,30],[5,5],[5,14],[5,16],[5,23],[5,24],[6,6],[6,15],[6,16],[6,24],[8,8],[8,16],[8,17],[8,24],[8,25],[9,0],[9,9],[9,16],[9,18],[9,24],[9,26],[10,1],[10,10],[10,16],[10,19],[10,24],[10,27],[11,2],[11,11],[11,16],[11,20],[11,24],[11,28],[12,3],[12,12],[12,16],[12,21],[12,24],[12,29],[13,4],[13,13],[13,16],[13,22],[13,24],[13,30],[14,5],[14,14],[14,16],[14,23],[14,24],[15,6],[15,15],[15,16],[15,24],[16,0],[16,1],[16,2],[16,3],[16,4],[16,5],[16,6],[16,7],[16,8],[16,9],[16,10],[16,11],[16,12],[16,13],[16,14],[16,15],[16,16],[16,17],[16,18],[16,19],[16,20],[16,21],[16,22],[16,23],[16,24],[16,25],[16,26],[16,27],[16,28],[16,29],[16,30],[17,8],[17,16],[17,17],[17,24],[17,25],[18,0],[18,9],[18,16],[18,18],[18,24],[18,26],[19,1],[19,10],[19,16],[19,19],[19,24],[19,27],[20,2],[20,11],[20,16],[20,20],[20,24],[20,28],[21,3],[21,12],[21,16],[21,21],[21,24],[21,29],[22,4],[22,13],[22,16],[22,22],[22,24],[22,30],[23,5],[23,14],[23,16],[23,23],[23,24],[24,0],[24,1],[24,2],[24,3],[24,4],[24,5],[24,6],[24,7],[24,8],[24,9],[24,10],[24,11],[24,12],[24,13],[24,14],[24,15],[24,16]
,[24,17],[24,18],[24,19],[24,20],[24,21],[24,22],[24,23],[24,24],[24,25],[24,26],[24,27],[24,28],[24,29],[24,30],[25,8],[25,16],[25,17],[25,24],[25,25],[26,0],[26,9],[26,16],[26,18],[26,24],[26,26],[27,1],[27,10],[27,16],[27,19],[27,24],[27,27],[28,2],[28,11],[28,16],[28,20],[28,24],[28,28],[29,3],[29,12],[29,16],[29,21],[29,24],[29,29],[30,4],[30,13],[30,16],[30,22],[30,24],[30,30]]},{"producer":10750,"consumer":10735,"input_port":1,"paths":[[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,8],[0,9],[0,10],[0,11],[0,12],[0,13],[0,14],[0,15],[0,16],[0,17],[0,18],[0,19],[0,20],[0,21],[0,22],[0,23],[0,24],[0,25],[0,26],[0,27],[0,28],[0,29],[0,30]]},{"producer":10751,"consumer":10779,"input_port":1,"paths":[[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7],[0,8],[0,9],[0,10],[0,11],[0,12],[0,13],[0,14],[0,15],[0,16],[0,17],[0,18],[0,19],[0,20],[0,21],[0,22],[0,23],[0,24],[0,25],[0,26],[0,27],[0,28],[0,29],[0,30],[0,31],[0,32],[0,33],[0,34],[0,35],[0,36],[0,37],[0,38],[0,39],[0,40],[0,41],[0,42],[0,43],[0,44],[0,45],[0,46],[0,47],[0,48],[0,49],[0,50],[0,51],[0,52],[0,53],[0,54],[0,55],[0,56],[0,57],[0,58],[0,59],[0,60],[0,61],[0,62],[0,63],[0,64],[0,65],[0,66],[0,67],[0,68],[0,69],[0,70],[0,71],[0,72],[1,0],[1,1],[1,2],[1,3],[1,4],[1,5],[1,6],[1,7],[1,8],[1,9],[1,10],[1,11],[1,12],[1,13],[1,14],[1,15],[1,16],[1,17],[1,18],[1,19],[1,20],[1,21],[1,22],[1,23],[1,24],[1,25],[1,26],[1,27],[1,28],[1,29],[1,30],[1,31],[1,32],[1,33],[1,34],[1,35],[1,36],[1,37],[1,38],[1,39],[1,40],[1,41],[1,42],[1,43],[1,44],[1,45],[1,46],[1,47],[1,48],[1,49],[1,50],[1,51],[1,52],[1,53],[1,54],[1,55],[1,56],[1,57],[1,58],[1,59],[1,60],[1,61],[1,62],[1,63],[1,64],[1,65],[1,66],[1,67],[1,68],[1,69],[1,70],[1,71],[1,72],[2,0],[2,1],[2,2],[2,3],[2,4],[2,5],[2,7],[2,8],[2,9],[2,10],[2,11],[2,12],[2,14],[2,15],[2,16],[2,17],[2,18],[2,19],[2,20],[2,21],[2,22],[2,23],[2,24],[2,26],[2,27],[2,28],[2,29],[2,30],[2,31],[2,33],[2,34],[2,35],[2,36],[2,37],[2,38],[2,40],[2,41],[2,42],[2,43],[2,44],[2,45],[2,46],[2,47],[2,48],[2,49],[2,50],[2,51],[2,53],[2,54],[2,55],[2,56],[2,57],[2,58],[2,60],[2,61],[2,62],[2,63],[2,64],[2,65],[2,67],[2,68],[2,69],[2,70],[2,71],[2,72],[3,1],[3,2],[3,3],[3,4],[3,5],[3,7],[3,8],[3,9],[3,10],[3,11],[3,14],[3,15],[3,16],[3,17],[3,20],[3,21],[3,22],[3,23],[3,24],[3,26],[3,27],[3,28],[3,29],[3,30],[3,33],[3,34],[3,35],[3,36],[3,37],[3,40],[3,41],[3,42],[3,43],[3,44],[3,47],[3,48],[3,49],[3,50],[3,51],[3,53],[3,54],[3,55],[3,56],[3,57],[3,60],[3,61],[3,62],[3,63],[3,64],[3,67],[3,68],[3,69],[3,70],[3,71],[4,1],[4,2],[4,3],[4,4],[4,7],[4,8],[4,9],[4,10],[4,14],[4,15],[4,16],[4,20],[4,21],[4,22],[4,23],[4,26],[4,27],[4,28],[4,29],[4,33],[4,34],[4,35],[4,36],[4,40],[4,41],[4,42],[4,43],[4,47],[4,48],[4,49],[4,50],[4,53],[4,54],[4,55],[4,56],[4,60],[4,61],[4,62],[4,63],[4,67],[4,68],[4,69],[4,70],[5,1],[5,2],[5,3],[5,7],[5,8],[5,9],[5,14],[5,15],[5,20],[5,21],[5,22],[5,26],[5,27],[5,28],[5,33],[5,34],[5,35],[5,40],[5,41],[5,42],[5,47],[5,48],[5,49],[5,53],[5,54],[5,55],[5,60],[5,61],[5,62],[5,67],[5,68],[5,69],[6,1],[6,2],[6,7],[6,8],[6,14],[6,20],[6,21],[6,26],[6,27],[6,33],[6,34],[6,40],[6,41],[6,47],[6,48],[6,53],[6,54],[6,60],[6,61],[6,67],[6,68],[7,1],[7,7],[7,20],[7,26],[7,33],[7,40],[7,47],[7,53],[7,60],[7,67],[8,0],[8,1],[8,2],[8,3],[8,4],[8,5],[8,6],[8,7],[8,8],[8,9],[8,10],[8,11],[8,12],[8,13],[8,14],[8,15],[8,16],[8,17],[8,18],[8,19],[8,20],[8,21],[8,22],[8,23],[8,24],[8,25],[8,26],[8,27],[8,28],[8,29],[8,30],[8,31],[8,32],[8,33],[8,34],[8,35],[8,36],[8,37],[8,38],[8,39],[8,40],[8,41],[8,42],[8,43],[8,44],
[8,45],[8,46],[8,47],[8,48],[8,49],[8,50],[8,51],[8,52],[8,53],[8,54],[8,55],[8,56],[8,57],[8,58],[8,59],[8,60],[8,61],[8,62],[8,63],[8,64],[8,65],[8,66],[8,67],[8,68],[8,69],[8,70],[8,71],[8,72],[9,0],[9,1],[9,2],[9,3],[9,4],[9,5],[9,6],[9,7],[9,8],[9,9],[9,10],[9,11],[9,12],[9,13],[9,14],[9,15],[9,16],[9,17],[9,18],[9,19],[9,20],[9,21],[9,22],[9,23],[9,24],[9,25],[9,26],[9,27],[9,28],[9,29],[9,30],[9,31],[9,32],[9,33],[9,34],[9,35],[9,36],[9,37],[9,38],[9,39],[9,40],[9,41],[9,42],[9,43],[9,44],[9,45],[9,46],[9,47],[9,48],[9,49],[9,50],[9,51],[9,52],[9,53],[9,54],[9,55],[9,56],[9,57],[9,58],[9,59],[9,60],[9,61],[9,62],[9,63],[9,64],[9,65],[9,66],[9,67],[9,68],[9,69],[9,70],[9,71],[9,72],[10,0],[10,1],[10,2],[10,3],[10,4],[10,5],[10,6],[10,7],[10,8],[10,9],[10,10],[10,11],[10,12],[10,13],[10,14],[10,15],[10,16],[10,17],[10,18],[10,19],[10,20],[10,21],[10,22],[10,23],[10,24],[10,25],[10,26],[10,27],[10,28],[10,29],[10,30],[10,31],[10,32],[10,33],[10,34],[10,35],[10,36],[10,37],[10,38],[10,39],[10,40],[10,41],[10,42],[10,43],[10,44],[10,45],[10,46],[10,47],[10,48],[10,49],[10,50],[10,51],[10,52],[10,53],[10,54],[10,55],[10,56],[10,57],[10,58],[10,59],[10,60],[10,61],[10,62],[10,63],[10,64],[10,65],[10,66],[10,67],[10,68],[10,69],[10,70],[10,71],[10,72],[11,0],[11,1],[11,2],[11,3],[11,4],[11,5],[11,7],[11,8],[11,9],[11,10],[11,11],[11,12],[11,14],[11,15],[11,16],[11,17],[11,18],[11,19],[11,20],[11,21],[11,22],[11,23],[11,24],[11,26],[11,27],[11,28],[11,29],[11,30],[11,31],[11,33],[11,34],[11,35],[11,36],[11,37],[11,38],[11,40],[11,41],[11,42],[11,43],[11,44],[11,45],[11,46],[11,47],[11,48],[11,49],[11,50],[11,51],[11,53],[11,54],[11,55],[11,56],[11,57],[11,58],[11,60],[11,61],[11,62],[11,63],[11,64],[11,65],[11,67],[11,68],[11,69],[11,70],[11,71],[11,72],[12,1],[12,2],[12,3],[12,4],[12,5],[12,7],[12,8],[12,9],[12,10],[12,11],[12,14],[12,15],[12,16],[12,17],[12,20],[12,21],[12,22],[12,23],[12,24],[12,26],[12,27],[12,28],[12,29],[12,30],[12,33],[12,34],[12,35],[12,36],[12,37],[12,40],[12,41],[12,42],[12,43],[12,44],[12,47],[12,48],[12,49],[12,50],[12,51],[12,53],[12,54],[12,55],[12,56],[12,57],[12,60],[12,61],[12,62],[12,63],[12,64],[12,67],[12,68],[12,69],[12,70],[12,71],[13,1],[13,2],[13,3],[13,4],[13,7],[13,8],[13,9],[13,10],[13,14],[13,15],[13,16],[13,20],[13,21],[13,22],[13,23],[13,26],[13,27],[13,28],[13,29],[13,33],[13,34],[13,35],[13,36],[13,40],[13,41],[13,42],[13,43],[13,47],[13,48],[13,49],[13,50],[13,53],[13,54],[13,55],[13,56],[13,60],[13,61],[13,62],[13,63],[13,67],[13,68],[13,69],[13,70],[14,1],[14,2],[14,3],[14,7],[14,8],[14,9],[14,14],[14,15],[14,20],[14,21],[14,22],[14,26],[14,27],[14,28],[14,33],[14,34],[14,35],[14,40],[14,41],[14,42],[14,47],[14,48],[14,49],[14,53],[14,54],[14,55],[14,60],[14,61],[14,62],[14,67],[14,68],[14,69],[15,1],[15,2],[15,7],[15,8],[15,14],[15,20],[15,21],[15,26],[15,27],[15,33],[15,34],[15,40],[15,41],[15,47],[15,48],[15,53],[15,54],[15,60],[15,61],[15,67],[15,68],[16,0],[16,1],[16,2],[16,3],[16,4],[16,5],[16,6],[16,7],[16,8],[16,9],[16,10],[16,11],[16,12],[16,13],[16,14],[16,15],[16,16],[16,17],[16,18],[16,19],[16,20],[16,21],[16,22],[16,23],[16,24],[16,25],[16,26],[16,27],[16,28],[16,29],[16,30],[16,31],[16,32],[16,33],[16,34],[16,35],[16,36],[16,37],[16,38],[16,39],[16,40],[16,41],[16,42],[16,43],[16,44],[16,45],[16,46],[16,47],[16,48],[16,49],[16,50],[16,51],[16,52],[16,53],[16,54],[16,55],[16,56],[16,57],[16,58],[16,59],[16,60],[16,61],[16,62],[16,63],[16,64],[16,65],[16,66],[16,67],[16,68],[16,69],[16,70],[16,71],[16,72],[17,0],[17,1],[17,2],[17
,3],[17,4],[17,5],[17,6],[17,7],[17,8],[17,9],[17,10],[17,11],[17,12],[17,13],[17,14],[17,15],[17,16],[17,17],[17,18],[17,19],[17,20],[17,21],[17,22],[17,23],[17,24],[17,25],[17,26],[17,27],[17,28],[17,29],[17,30],[17,31],[17,32],[17,33],[17,34],[17,35],[17,36],[17,37],[17,38],[17,39],[17,40],[17,41],[17,42],[17,43],[17,44],[17,45],[17,46],[17,47],[17,48],[17,49],[17,50],[17,51],[17,52],[17,53],[17,54],[17,55],[17,56],[17,57],[17,58],[17,59],[17,60],[17,61],[17,62],[17,63],[17,64],[17,65],[17,66],[17,67],[17,68],[17,69],[17,70],[17,71],[17,72],[18,0],[18,1],[18,2],[18,3],[18,4],[18,5],[18,6],[18,7],[18,8],[18,9],[18,10],[18,11],[18,12],[18,13],[18,14],[18,15],[18,16],[18,17],[18,18],[18,19],[18,20],[18,21],[18,22],[18,23],[18,24],[18,25],[18,26],[18,27],[18,28],[18,29],[18,30],[18,31],[18,32],[18,33],[18,34],[18,35],[18,36],[18,37],[18,38],[18,39],[18,40],[18,41],[18,42],[18,43],[18,44],[18,45],[18,46],[18,47],[18,48],[18,49],[18,50],[18,51],[18,52],[18,53],[18,54],[18,55],[18,56],[18,57],[18,58],[18,59],[18,60],[18,61],[18,62],[18,63],[18,64],[18,65],[18,66],[18,67],[18,68],[18,69],[18,70],[18,71],[18,72],[19,0],[19,1],[19,2],[19,3],[19,4],[19,5],[19,6],[19,7],[19,8],[19,9],[19,10],[19,11],[19,12],[19,13],[19,14],[19,15],[19,16],[19,17],[19,18],[19,19],[19,20],[19,21],[19,22],[19,23],[19,24],[19,25],[19,26],[19,27],[19,28],[19,29],[19,30],[19,31],[19,32],[19,33],[19,34],[19,35],[19,36],[19,37],[19,38],[19,39],[19,40],[19,41],[19,42],[19,43],[19,44],[19,45],[19,46],[19,47],[19,48],[19,49],[19,50],[19,51],[19,52],[19,53],[19,54],[19,55],[19,56],[19,57],[19,58],[19,59],[19,60],[19,61],[19,62],[19,63],[19,64],[19,65],[19,66],[19,67],[19,68],[19,69],[19,70],[19,71],[19,72],[20,0],[20,1],[20,2],[20,3],[20,4],[20,5],[20,7],[20,8],[20,9],[20,10],[20,11],[20,12],[20,14],[20,15],[20,16],[20,17],[20,18],[20,19],[20,20],[20,21],[20,22],[20,23],[20,24],[20,26],[20,27],[20,28],[20,29],[20,30],[20,31],[20,33],[20,34],[20,35],[20,36],[20,37],[20,38],[20,40],[20,41],[20,42],[20,43],[20,44],[20,45],[20,46],[20,47],[20,48],[20,49],[20,50],[20,51],[20,53],[20,54],[20,55],[20,56],[20,57],[20,58],[20,60],[20,61],[20,62],[20,63],[20,64],[20,65],[20,67],[20,68],[20,69],[20,70],[20,71],[20,72],[21,1],[21,2],[21,3],[21,4],[21,5],[21,7],[21,8],[21,9],[21,10],[21,11],[21,14],[21,15],[21,16],[21,17],[21,20],[21,21],[21,22],[21,23],[21,24],[21,26],[21,27],[21,28],[21,29],[21,30],[21,33],[21,34],[21,35],[21,36],[21,37],[21,40],[21,41],[21,42],[21,43],[21,44],[21,47],[21,48],[21,49],[21,50],[21,51],[21,53],[21,54],[21,55],[21,56],[21,57],[21,60],[21,61],[21,62],[21,63],[21,64],[21,67],[21,68],[21,69],[21,70],[21,71],[22,1],[22,2],[22,3],[22,4],[22,7],[22,8],[22,9],[22,10],[22,14],[22,15],[22,16],[22,20],[22,21],[22,22],[22,23],[22,26],[22,27],[22,28],[22,29],[22,33],[22,34],[22,35],[22,36],[22,40],[22,41],[22,42],[22,43],[22,47],[22,48],[22,49],[22,50],[22,53],[22,54],[22,55],[22,56],[22,60],[22,61],[22,62],[22,63],[22,67],[22,68],[22,69],[22,70],[23,1],[23,2],[23,3],[23,7],[23,8],[23,9],[23,14],[23,15],[23,20],[23,21],[23,22],[23,26],[23,27],[23,28],[23,33],[23,34],[23,35],[23,40],[23,41],[23,42],[23,47],[23,48],[23,49],[23,53],[23,54],[23,55],[23,60],[23,61],[23,62],[23,67],[23,68],[23,69],[24,0],[24,1],[24,2],[24,3],[24,4],[24,5],[24,6],[24,7],[24,8],[24,9],[24,10],[24,11],[24,12],[24,13],[24,14],[24,15],[24,16],[24,17],[24,18],[24,19],[24,20],[24,21],[24,22],[24,23],[24,24],[24,25],[24,26],[24,27],[24,28],[24,29],[24,30],[24,31],[24,32],[24,33],[24,34],[24,35],[24,36],[24,37],[24,38],[24,39],[24,40],[24,41],[24,42],[
24,43],[24,44],[24,45],[24,46],[24,47],[24,48],[24,49],[24,50],[24,51],[24,52],[24,53],[24,54],[24,55],[24,56],[24,57],[24,58],[24,59],[24,60],[24,61],[24,62],[24,63],[24,64],[24,65],[24,66],[24,67],[24,68],[24,69],[24,70],[24,71],[24,72],[25,0],[25,1],[25,2],[25,3],[25,4],[25,5],[25,6],[25,7],[25,8],[25,9],[25,10],[25,11],[25,12],[25,13],[25,14],[25,15],[25,16],[25,17],[25,18],[25,19],[25,20],[25,21],[25,22],[25,23],[25,24],[25,25],[25,26],[25,27],[25,28],[25,29],[25,30],[25,31],[25,32],[25,33],[25,34],[25,35],[25,36],[25,37],[25,38],[25,39],[25,40],[25,41],[25,42],[25,43],[25,44],[25,45],[25,46],[25,47],[25,48],[25,49],[25,50],[25,51],[25,52],[25,53],[25,54],[25,55],[25,56],[25,57],[25,58],[25,59],[25,60],[25,61],[25,62],[25,63],[25,64],[25,65],[25,66],[25,67],[25,68],[25,69],[25,70],[25,71],[25,72],[26,0],[26,1],[26,2],[26,3],[26,4],[26,5],[26,6],[26,7],[26,8],[26,9],[26,10],[26,11],[26,12],[26,13],[26,14],[26,15],[26,16],[26,17],[26,18],[26,19],[26,20],[26,21],[26,22],[26,23],[26,24],[26,25],[26,26],[26,27],[26,28],[26,29],[26,30],[26,31],[26,32],[26,33],[26,34],[26,35],[26,36],[26,37],[26,38],[26,39],[26,40],[26,41],[26,42],[26,43],[26,44],[26,45],[26,46],[26,47],[26,48],[26,49],[26,50],[26,51],[26,52],[26,53],[26,54],[26,55],[26,56],[26,57],[26,58],[26,59],[26,60],[26,61],[26,62],[26,63],[26,64],[26,65],[26,66],[26,67],[26,68],[26,69],[26,70],[26,71],[26,72],[27,0],[27,1],[27,2],[27,3],[27,4],[27,5],[27,6],[27,7],[27,8],[27,9],[27,10],[27,11],[27,12],[27,13],[27,14],[27,15],[27,16],[27,17],[27,18],[27,19],[27,20],[27,21],[27,22],[27,23],[27,24],[27,25],[27,26],[27,27],[27,28],[27,29],[27,30],[27,31],[27,32],[27,33],[27,34],[27,35],[27,36],[27,37],[27,38],[27,39],[27,40],[27,41],[27,42],[27,43],[27,44],[27,45],[27,46],[27,47],[27,48],[27,49],[27,50],[27,51],[27,52],[27,53],[27,54],[27,55],[27,56],[27,57],[27,58],[27,59],[27,60],[27,61],[27,62],[27,63],[27,64],[27,65],[27,66],[27,67],[27,68],[27,69],[27,70],[27,71],[27,72],[28,0],[28,1],[28,2],[28,3],[28,4],[28,5],[28,7],[28,8],[28,9],[28,10],[28,11],[28,12],[28,14],[28,15],[28,16],[28,17],[28,18],[28,19],[28,20],[28,21],[28,22],[28,23],[28,24],[28,26],[28,27],[28,28],[28,29],[28,30],[28,31],[28,33],[28,34],[28,35],[28,36],[28,37],[28,38],[28,40],[28,41],[28,42],[28,43],[28,44],[28,45],[28,46],[28,47],[28,48],[28,49],[28,50],[28,51],[28,53],[28,54],[28,55],[28,56],[28,57],[28,58],[28,60],[28,61],[28,62],[28,63],[28,64],[28,65],[28,67],[28,68],[28,69],[28,70],[28,71],[28,72],[29,1],[29,2],[29,3],[29,4],[29,5],[29,7],[29,8],[29,9],[29,10],[29,11],[29,14],[29,15],[29,16],[29,17],[29,20],[29,21],[29,22],[29,23],[29,24],[29,26],[29,27],[29,28],[29,29],[29,30],[29,33],[29,34],[29,35],[29,36],[29,37],[29,40],[29,41],[29,42],[29,43],[29,44],[29,47],[29,48],[29,49],[29,50],[29,51],[29,53],[29,54],[29,55],[29,56],[29,57],[29,60],[29,61],[29,62],[29,63],[29,64],[29,67],[29,68],[29,69],[29,70],[29,71],[30,1],[30,2],[30,3],[30,4],[30,7],[30,8],[30,9],[30,10],[30,14],[30,15],[30,16],[30,20],[30,21],[30,22],[30,23],[30,26],[30,27],[30,28],[30,29],[30,33],[30,34],[30,35],[30,36],[30,40],[30,41],[30,42],[30,43],[30,47],[30,48],[30,49],[30,50],[30,53],[30,54],[30,55],[30,56],[30,60],[30,61],[30,62],[30,63],[30,67],[30,68],[30,69],[30,70]]},{"producer":10751,"consumer":10834,"input_port":0,"paths":[[0,7],[0,15],[0,23],[1,0],[1,8],[1,16],[1,24],[2,1],[2,9],[2,17],[2,25],[3,2],[3,10],[3,18],[3,26],[4,3],[4,11],[4,19],[4,27],[5,4],[5,12],[5,20],[6,5],[6,13],[7,6],[8,14],[8,22],[9,7],[9,15],[9,23],[10,0],[10,8],[10,16],[10,24],[11,1],[11,9],[11,17],[11,25],[1
2,2],[12,10],[12,18],[12,26],[13,3],[13,11],[13,19],[13,27],[14,4],[14,12],[14,20],[15,5],[15,13],[16,0],[16,1],[16,2],[16,3],[16,4],[16,5],[16,6],[16,7],[16,8],[16,9],[16,10],[16,11],[16,12],[16,13],[16,14],[16,15],[16,16],[16,17],[16,18],[16,19],[16,20],[16,21],[16,22],[16,23],[16,24],[16,25],[16,26],[16,27],[17,14],[17,22],[18,7],[18,15],[18,23],[19,0],[19,8],[19,16],[19,24],[20,1],[20,9],[20,17],[20,25],[21,2],[21,10],[21,18],[21,26],[22,3],[22,11],[22,19],[22,27],[23,4],[23,12],[23,20],[24,0],[24,1],[24,2],[24,3],[24,4],[24,5],[24,6],[24,7],[24,8],[24,9],[24,10],[24,11],[24,12],[24,13],[24,14],[24,15],[24,16],[24,17],[24,18],[24,19],[24,20],[24,21],[24,22],[24,23],[24,24],[24,25],[24,26],[24,27],[25,14],[25,22],[26,7],[26,15],[26,23],[27,0],[27,8],[27,16],[27,24],[28,1],[28,9],[28,17],[28,25],[29,2],[29,10],[29,18],[29,26],[30,3],[30,11],[30,19],[30,27]]},{"producer":10772,"consumer":10751,"input_port":1,"paths":[[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7],[0,8],[0,9],[0,10],[0,11],[0,12],[0,13],[0,14],[0,15],[0,16],[0,17],[0,18],[0,19],[0,20],[0,21],[0,22],[0,23],[0,24],[0,25],[0,26],[0,27],[0,28],[0,29],[0,30]]},{"producer":10777,"consumer":10779,"input_port":0,"paths":[[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7],[0,8],[0,9],[0,10],[0,11],[0,12],[0,13],[0,14],[0,15],[0,16],[0,17],[0,18],[0,19],[0,20],[0,21],[0,22],[0,23],[0,24],[0,25],[0,26],[0,27],[0,28],[0,29],[0,30],[0,31],[0,32],[0,33],[0,34],[0,35],[0,36],[0,37],[0,38],[0,39],[0,40],[0,41],[0,42],[0,43],[0,44],[0,45],[0,46],[0,47],[0,48],[0,49],[0,50],[0,51],[0,52],[0,53],[0,54],[0,55],[0,56],[0,57],[0,58],[0,59],[0,60],[0,61],[0,62],[0,63],[0,64],[0,65],[0,66],[0,67],[0,68],[0,69],[0,70],[0,71],[0,72]]},{"producer":10778,"consumer":10779,"input_port":2,"paths":[[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7],[0,8],[0,9],[0,10],[0,11],[0,12],[0,13],[0,14],[0,15],[0,16],[0,17],[0,18],[0,19],[0,20],[0,21],[0,22],[0,23],[0,24],[0,25],[0,26],[0,27],[0,28],[0,29],[0,30],[0,31],[0,32],[0,33],[0,34],[0,35],[0,36],[0,37],[0,38],[0,39],[0,40],[0,41],[0,42],[0,43],[0,44],[0,45],[0,46],[0,47],[0,48],[0,49],[0,50],[0,51],[0,52],[0,53],[0,54],[0,55],[0,56],[0,57],[0,58],[0,59],[0,60],[0,61],[0,62],[0,63],[0,64],[0,65],[0,66],[0,67],[0,68],[0,69],[0,70],[0,71],[0,72]]},{"producer":10779,"consumer":10782,"input_port":0,"paths":[[0,7],[0,8],[0,9],[0,32],[0,33],[0,34],[0,58],[0,59],[0,60],[0,82],[0,83],[0,84],[1,7],[1,8],[1,9],[1,32],[1,33],[1,34],[1,58],[1,59],[1,60],[1,82],[1,83],[1,84],[2,7],[2,8],[2,9],[2,32],[2,33],[2,34],[2,58],[2,59],[2,60],[2,82],[2,83],[2,84],[3,7],[3,8],[3,9],[3,32],[3,33],[3,34],[3,58],[3,59],[3,60],[3,82],[3,83],[3,84],[4,7],[4,8],[4,9],[4,32],[4,33],[4,34],[4,58],[4,59],[4,60],[4,82],[4,83],[4,84],[5,7],[5,8],[5,9],[5,32],[5,33],[5,34],[5,58],[5,59],[5,60],[5,82],[5,83],[5,84],[6,10],[6,11],[6,12],[6,35],[6,36],[6,37],[6,61],[6,62],[6,63],[6,85],[6,86],[6,87],[7,10],[7,11],[7,12],[7,35],[7,36],[7,37],[7,61],[7,62],[7,63],[7,85],[7,86],[7,87],[8,10],[8,11],[8,12],[8,35],[8,36],[8,37],[8,61],[8,62],[8,63],[8,85],[8,86],[8,87],[9,10],[9,11],[9,12],[9,35],[9,36],[9,37],[9,61],[9,62],[9,63],[9,85],[9,86],[9,87],[10,10],[10,11],[10,12],[10,35],[10,36],[10,37],[10,61],[10,62],[10,63],[10,85],[10,86],[10,87],[11,10],[11,11],[11,12],[11,35],[11,36],[11,37],[11,61],[11,62],[11,63],[11,85],[11,86],[11,87],[12,10],[12,11],[12,12],[12,35],[12,36],[12,37],[12,61],[12,62],[12,63],[12,85],[12,86],[12,87],[13,13],[13,14],[13,15],[13,38],[13,39],[13,40],[13,64],[13,65],[13,66],[14,13],[14,14],[14,15],[14,38],[14,39],[14,40],[
14,64],[14,65],[14,66],[15,13],[15,14],[15,15],[15,38],[15,39],[15,40],[15,64],[15,65],[15,66],[16,13],[16,14],[16,15],[16,38],[16,39],[16,40],[16,64],[16,65],[16,66],[17,13],[17,14],[17,15],[17,38],[17,39],[17,40],[17,64],[17,65],[17,66],[18,13],[18,14],[18,15],[18,38],[18,39],[18,40],[18,64],[18,65],[18,66],[19,4],[19,5],[19,6],[19,29],[19,30],[19,31],[19,55],[19,56],[19,57],[19,79],[19,80],[19,81],[20,4],[20,5],[20,6],[20,29],[20,30],[20,31],[20,55],[20,56],[20,57],[20,79],[20,80],[20,81],[21,4],[21,5],[21,6],[21,29],[21,30],[21,31],[21,55],[21,56],[21,57],[21,79],[21,80],[21,81],[22,4],[22,5],[22,6],[22,29],[22,30],[22,31],[22,55],[22,56],[22,57],[22,79],[22,80],[22,81],[23,4],[23,5],[23,6],[23,29],[23,30],[23,31],[23,55],[23,56],[23,57],[23,79],[23,80],[23,81],[24,4],[24,5],[24,6],[24,29],[24,30],[24,31],[24,55],[24,56],[24,57],[24,79],[24,80],[24,81],[25,7],[25,8],[25,9],[25,32],[25,33],[25,34],[25,58],[25,59],[25,60],[25,82],[25,83],[25,84],[26,7],[26,8],[26,9],[26,32],[26,33],[26,34],[26,58],[26,59],[26,60],[26,82],[26,83],[26,84],[27,7],[27,8],[27,9],[27,32],[27,33],[27,34],[27,58],[27,59],[27,60],[27,82],[27,83],[27,84],[28,7],[28,8],[28,9],[28,32],[28,33],[28,34],[28,58],[28,59],[28,60],[28,82],[28,83],[28,84],[29,7],[29,8],[29,9],[29,32],[29,33],[29,34],[29,58],[29,59],[29,60],[29,82],[29,83],[29,84],[30,7],[30,8],[30,9],[30,32],[30,33],[30,34],[30,58],[30,59],[30,60],[30,82],[30,83],[30,84],[31,7],[31,8],[31,9],[31,32],[31,33],[31,34],[31,58],[31,59],[31,60],[31,82],[31,83],[31,84],[32,10],[32,11],[32,12],[32,35],[32,36],[32,37],[32,61],[32,62],[32,63],[32,85],[32,86],[32,87],[33,10],[33,11],[33,12],[33,35],[33,36],[33,37],[33,61],[33,62],[33,63],[33,85],[33,86],[33,87],[34,10],[34,11],[34,12],[34,35],[34,36],[34,37],[34,61],[34,62],[34,63],[34,85],[34,86],[34,87],[35,10],[35,11],[35,12],[35,35],[35,36],[35,37],[35,61],[35,62],[35,63],[35,85],[35,86],[35,87],[36,10],[36,11],[36,12],[36,35],[36,36],[36,37],[36,61],[36,62],[36,63],[36,85],[36,86],[36,87],[37,10],[37,11],[37,12],[37,35],[37,36],[37,37],[37,61],[37,62],[37,63],[37,85],[37,86],[37,87],[38,10],[38,11],[38,12],[38,35],[38,36],[38,37],[38,61],[38,62],[38,63],[38,85],[38,86],[38,87],[39,13],[39,14],[39,15],[39,38],[39,39],[39,40],[39,64],[39,65],[39,66],[40,13],[40,14],[40,15],[40,38],[40,39],[40,40],[40,64],[40,65],[40,66],[41,13],[41,14],[41,15],[41,38],[41,39],[41,40],[41,64],[41,65],[41,66],[42,13],[42,14],[42,15],[42,38],[42,39],[42,40],[42,64],[42,65],[42,66],[43,13],[43,14],[43,15],[43,38],[43,39],[43,40],[43,64],[43,65],[43,66],[44,13],[44,14],[44,15],[44,38],[44,39],[44,40],[44,64],[44,65],[44,66],[45,13],[45,14],[45,15],[45,38],[45,39],[45,40],[45,64],[45,65],[45,66],[46,1],[46,2],[46,3],[46,26],[46,27],[46,28],[46,52],[46,53],[46,54],[46,76],[46,77],[46,78],[47,1],[47,2],[47,3],[47,26],[47,27],[47,28],[47,52],[47,53],[47,54],[47,76],[47,77],[47,78],[48,1],[48,2],[48,3],[48,26],[48,27],[48,28],[48,52],[48,53],[48,54],[48,76],[48,77],[48,78],[49,1],[49,2],[49,3],[49,26],[49,27],[49,28],[49,52],[49,53],[49,54],[49,76],[49,77],[49,78],[50,1],[50,2],[50,3],[50,26],[50,27],[50,28],[50,52],[50,53],[50,54],[50,76],[50,77],[50,78],[51,1],[51,2],[51,3],[51,26],[51,27],[51,28],[51,52],[51,53],[51,54],[51,76],[51,77],[51,78],[52,4],[52,5],[52,6],[52,29],[52,30],[52,31],[52,55],[52,56],[52,57],[52,79],[52,80],[52,81],[53,4],[53,5],[53,6],[53,29],[53,30],[53,31],[53,55],[53,56],[53,57],[53,79],[53,80],[53,81],[54,4],[54,5],[54,6],[54,29],[54,30],[54,31],[54,55],[54,56],[54,57],[54,79],[54,80],[54,81],[55,4],[55,5],[55,6],[
55,29],[55,30],[55,31],[55,55],[55,56],[55,57],[55,79],[55,80],[55,81],[56,4],[56,5],[56,6],[56,29],[56,30],[56,31],[56,55],[56,56],[56,57],[56,79],[56,80],[56,81],[57,4],[57,5],[57,6],[57,29],[57,30],[57,31],[57,55],[57,56],[57,57],[57,79],[57,80],[57,81],[58,4],[58,5],[58,6],[58,29],[58,30],[58,31],[58,55],[58,56],[58,57],[58,79],[58,80],[58,81],[59,7],[59,8],[59,9],[59,32],[59,33],[59,34],[59,58],[59,59],[59,60],[59,82],[59,83],[59,84],[60,7],[60,8],[60,9],[60,32],[60,33],[60,34],[60,58],[60,59],[60,60],[60,82],[60,83],[60,84],[61,7],[61,8],[61,9],[61,32],[61,33],[61,34],[61,58],[61,59],[61,60],[61,82],[61,83],[61,84],[62,7],[62,8],[62,9],[62,32],[62,33],[62,34],[62,58],[62,59],[62,60],[62,82],[62,83],[62,84],[63,7],[63,8],[63,9],[63,32],[63,33],[63,34],[63,58],[63,59],[63,60],[63,82],[63,83],[63,84],[64,7],[64,8],[64,9],[64,32],[64,33],[64,34],[64,58],[64,59],[64,60],[64,82],[64,83],[64,84],[65,7],[65,8],[65,9],[65,32],[65,33],[65,34],[65,58],[65,59],[65,60],[65,82],[65,83],[65,84],[66,10],[66,11],[66,12],[66,35],[66,36],[66,37],[66,61],[66,62],[66,63],[66,85],[66,86],[66,87],[67,10],[67,11],[67,12],[67,35],[67,36],[67,37],[67,61],[67,62],[67,63],[67,85],[67,86],[67,87],[68,10],[68,11],[68,12],[68,35],[68,36],[68,37],[68,61],[68,62],[68,63],[68,85],[68,86],[68,87],[69,10],[69,11],[69,12],[69,35],[69,36],[69,37],[69,61],[69,62],[69,63],[69,85],[69,86],[69,87],[70,10],[70,11],[70,12],[70,35],[70,36],[70,37],[70,61],[70,62],[70,63],[70,85],[70,86],[70,87],[71,10],[71,11],[71,12],[71,35],[71,36],[71,37],[71,61],[71,62],[71,63],[71,85],[71,86],[71,87],[72,10],[72,11],[72,12],[72,35],[72,36],[72,37],[72,61],[72,62],[72,63],[72,85],[72,86],[72,87]]},{"producer":10782,"consumer":10794,"input_port":0,"paths":[[1,1],[1,18],[1,35],[1,50],[2,1],[2,18],[2,35],[2,50],[3,1],[3,18],[3,35],[3,50],[4,2],[4,19],[4,36],[4,51],[5,2],[5,19],[5,36],[5,51],[6,2],[6,19],[6,36],[6,51],[7,3],[7,20],[7,37],[7,52],[8,3],[8,20],[8,37],[8,52],[9,3],[9,20],[9,37],[9,52],[10,4],[10,21],[10,38],[10,53],[11,4],[11,21],[11,38],[11,53],[12,4],[12,21],[12,38],[12,53],[13,5],[13,22],[13,39],[14,5],[14,22],[14,39],[15,5],[15,22],[15,39],[26,1],[26,18],[26,35],[26,50],[27,1],[27,18],[27,35],[27,50],[28,1],[28,18],[28,35],[28,50],[29,2],[29,19],[29,36],[29,51],[30,2],[30,19],[30,36],[30,51],[31,2],[31,19],[31,36],[31,51],[32,3],[32,20],[32,37],[32,52],[33,3],[33,20],[33,37],[33,52],[34,3],[34,20],[34,37],[34,52],[35,4],[35,21],[35,38],[35,53],[36,4],[36,21],[36,38],[36,53],[37,4],[37,21],[37,38],[37,53],[38,5],[38,22],[38,39],[39,5],[39,22],[39,39],[40,5],[40,22],[40,39],[52,1],[52,18],[52,35],[52,50],[53,1],[53,18],[53,35],[53,50],[54,1],[54,18],[54,35],[54,50],[55,2],[55,19],[55,36],[55,51],[56,2],[56,19],[56,36],[56,51],[57,2],[57,19],[57,36],[57,51],[58,3],[58,20],[58,37],[58,52],[59,3],[59,20],[59,37],[59,52],[60,3],[60,20],[60,37],[60,52],[61,4],[61,21],[61,38],[61,53],[62,4],[62,21],[62,38],[62,53],[63,4],[63,21],[63,38],[63,53],[64,5],[64,22],[64,39],[65,5],[65,22],[65,39],[66,5],[66,22],[66,39],[76,1],[76,18],[76,35],[76,50],[77,1],[77,18],[77,35],[77,50],[78,1],[78,18],[78,35],[78,50],[79,2],[79,19],[79,36],[79,51],[80,2],[80,19],[80,36],[80,51],[81,2],[81,19],[81,36],[81,51],[82,3],[82,20],[82,37],[82,52],[83,3],[83,20],[83,37],[83,52],[84,3],[84,20],[84,37],[84,52],[85,4],[85,21],[85,38],[85,53],[86,4],[86,21],[86,38],[86,53],[87,4],[87,21],[87,38],[87,53]]},{"producer":10791,"consumer":10782,"input_port":1,"paths":[[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7],[0,8],[0,9],[0,10],[0,11],[0,12],[0,13],[0,14],[0,15],[0,26
],[0,27],[0,28],[0,29],[0,30],[0,31],[0,32],[0,33],[0,34],[0,35],[0,36],[0,37],[0,38],[0,39],[0,40],[0,52],[0,53],[0,54],[0,55],[0,56],[0,57],[0,58],[0,59],[0,60],[0,61],[0,62],[0,63],[0,64],[0,65],[0,66],[0,76],[0,77],[0,78],[0,79],[0,80],[0,81],[0,82],[0,83],[0,84],[0,85],[0,86],[0,87]]},{"producer":10794,"consumer":10810,"input_port":0,"paths":[[1,1],[1,18],[1,35],[1,50],[2,2],[2,19],[2,36],[2,51],[3,3],[3,20],[3,37],[3,52],[4,4],[4,21],[4,38],[4,53],[5,5],[5,22],[5,39],[18,1],[18,18],[18,35],[18,50],[19,2],[19,19],[19,36],[19,51],[20,3],[20,20],[20,37],[20,52],[21,4],[21,21],[21,38],[21,53],[22,5],[22,22],[22,39],[35,1],[35,18],[35,35],[35,50],[36,2],[36,19],[36,36],[36,51],[37,3],[37,20],[37,37],[37,52],[38,4],[38,21],[38,38],[38,53],[39,5],[39,22],[39,39],[50,1],[50,18],[50,35],[50,50],[51,2],[51,19],[51,36],[51,51],[52,3],[52,20],[52,37],[52,52],[53,4],[53,21],[53,38],[53,53]]},{"producer":10809,"consumer":10794,"input_port":1,"paths":[[0,1],[0,2],[0,3],[0,4],[0,5],[0,18],[0,19],[0,20],[0,21],[0,22],[0,35],[0,36],[0,37],[0,38],[0,39],[0,50],[0,51],[0,52],[0,53]]},{"producer":10810,"consumer":10834,"input_port":1,"paths":[[1,0],[1,8],[1,16],[1,24],[2,1],[2,9],[2,17],[2,25],[3,2],[3,10],[3,18],[3,26],[4,3],[4,11],[4,19],[4,27],[5,4],[5,12],[5,20],[18,0],[18,8],[18,16],[18,24],[19,1],[19,9],[19,17],[19,25],[20,2],[20,10],[20,18],[20,26],[21,3],[21,11],[21,19],[21,27],[22,4],[22,12],[22,20],[35,0],[35,8],[35,16],[35,24],[36,1],[36,9],[36,17],[36,25],[37,2],[37,10],[37,18],[37,26],[38,3],[38,11],[38,19],[38,27],[39,4],[39,12],[39,20],[50,0],[50,8],[50,16],[50,24],[51,1],[51,9],[51,17],[51,25],[52,2],[52,10],[52,18],[52,26],[53,3],[53,11],[53,19],[53,27]]},{"producer":10831,"consumer":10810,"input_port":1,"paths":[[0,1],[0,2],[0,3],[0,4],[0,5],[0,18],[0,19],[0,20],[0,21],[0,22],[0,35],[0,36],[0,37],[0,38],[0,39],[0,50],[0,51],[0,52],[0,53]]},{"producer":10834,"consumer":10838,"input_port":1,"paths":[[0,0],[1,0],[2,0],[3,0],[4,0],[8,0],[9,0],[10,0],[11,0],[12,0],[16,0],[17,0],[18,0],[19,0],[20,0],[24,0],[25,0],[26,0],[27,0]]},{"producer":10836,"consumer":10838,"input_port":0,"paths":[[0,0]]},{"producer":10837,"consumer":10838,"input_port":2,"paths":[[0,0]]},{"producer":10838,"consumer":15745,"input_port":0,"paths":[[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7],[0,8],[0,9],[0,10],[0,11],[0,12],[0,13],[0,14],[0,15],[0,16],[0,17],[0,18],[0,19],[0,20],[0,21],[0,22],[0,23],[0,24],[0,25],[0,26],[0,27],[0,28],[0,29],[0,30],[0,31],[0,32],[0,33],[0,34],[0,35],[0,36],[0,37],[0,38],[0,39],[0,40],[0,41],[0,42],[0,43],[0,44],[0,45],[0,46],[0,47],[0,48],[0,49],[0,50],[0,51],[0,52],[0,53],[0,54],[0,55],[0,56],[0,57],[0,58],[0,59]]},{"producer":10843,"consumer":10853,"input_port":0,"paths":[[0,0],[0,9],[0,16],[0,18],[0,24],[0,26],[1,1],[1,10],[1,16],[1,19],[1,24],[1,27],[2,2],[2,11],[2,16],[2,20],[2,24],[2,28],[3,3],[3,12],[3,16],[3,21],[3,24],[3,29],[4,4],[4,13],[4,16],[4,22],[4,24],[4,30],[5,5],[5,14],[5,16],[5,23],[5,24],[6,6],[6,15],[6,16],[6,24],[7,7],[8,8],[8,16],[8,17],[8,24],[8,25],[9,0],[9,9],[9,16],[9,18],[9,24],[9,26],[10,1],[10,10],[10,16],[10,19],[10,24],[10,27],[11,2],[11,11],[11,16],[11,20],[11,24],[11,28],[12,3],[12,12],[12,16],[12,21],[12,24],[12,29],[13,4],[13,13],[13,16],[13,22],[13,24],[13,30],[14,5],[14,14],[14,16],[14,23],[14,24],[15,6],[15,15],[15,16],[15,24],[16,0],[16,1],[16,2],[16,3],[16,4],[16,5],[16,6],[16,7],[16,8],[16,9],[16,10],[16,11],[16,12],[16,13],[16,14],[16,15],[16,16],[16,17],[16,18],[16,19],[16,20],[16,21],[16,22],[16,23],[16,24],[16,25],[16,26],[16,27],[16,28],[16,
29],[16,30],[17,8],[17,16],[17,17],[17,24],[17,25],[18,0],[18,9],[18,16],[18,18],[18,24],[18,26],[19,1],[19,10],[19,16],[19,19],[19,24],[19,27],[20,2],[20,11],[20,16],[20,20],[20,24],[20,28],[21,3],[21,12],[21,16],[21,21],[21,24],[21,29],[22,4],[22,13],[22,16],[22,22],[22,24],[22,30],[23,5],[23,14],[23,16],[23,23],[23,24],[24,0],[24,1],[24,2],[24,3],[24,4],[24,5],[24,6],[24,7],[24,8],[24,9],[24,10],[24,11],[24,12],[24,13],[24,14],[24,15],[24,16],[24,17],[24,18],[24,19],[24,20],[24,21],[24,22],[24,23],[24,24],[24,25],[24,26],[24,27],[24,28],[24,29],[24,30],[25,8],[25,16],[25,17],[25,24],[25,25],[26,0],[26,9],[26,16],[26,18],[26,24],[26,26],[27,1],[27,10],[27,16],[27,19],[27,24],[27,27],[28,2],[28,11],[28,16],[28,20],[28,24],[28,28],[29,3],[29,12],[29,16],[29,21],[29,24],[29,29],[30,4],[30,13],[30,16],[30,22],[30,24],[30,30]]},{"producer":10850,"consumer":10843,"input_port":1,"paths":[[0,0],[0,1],[0,2],[0,3],[0,4],[0,5],[0,6],[0,7],[0,8],[0,9],[0,10],[0,11],[0,12],[0,13],[0,14],[0,15],[0,16],[0,17],[0,18],[0,19],[0,20],[0,21],[0,22],[0,23],[0,24],[0,25],[0,26],[0,27],[0,28],[0,29],[0,30]]},{"producer":15745,"consumer":10843,"input_port":0,"paths":[[0,0],[0,9],[0,18],[0,26],[1,1],[1,10],[1,19],[1,27],[2,2],[2,11],[2,20],[2,28],[3,3],[3,12],[3,21],[3,29],[4,4],[4,13],[4,22],[4,30],[5,5],[5,14],[5,23],[6,6],[6,15],[7,7],[8,16],[8,24],[9,16],[9,24],[10,16],[10,24],[11,16],[11,24],[12,16],[12,24],[13,16],[13,24],[14,16],[14,24],[16,8],[16,17],[16,25],[17,0],[17,9],[17,18],[17,26],[18,1],[18,10],[18,19],[18,27],[19,2],[19,11],[19,20],[19,28],[20,3],[20,12],[20,21],[20,29],[21,4],[21,13],[21,22],[21,30],[22,5],[22,14],[22,23],[23,6],[23,15],[24,16],[24,24],[25,16],[25,24],[26,16],[26,24],[27,16],[27,24],[28,16],[28,24],[29,16],[29,24],[30,16],[30,24],[31,16],[31,24],[32,0],[32,1],[32,2],[32,3],[32,4],[32,5],[32,6],[32,7],[32,8],[32,9],[32,10],[32,11],[32,12],[32,13],[32,14],[32,15],[32,16],[32,17],[32,18],[32,19],[32,20],[32,21],[32,22],[32,23],[32,24],[32,25],[32,26],[32,27],[32,28],[32,29],[32,30],[33,8],[33,17],[33,25],[34,0],[34,9],[34,18],[34,26],[35,1],[35,10],[35,19],[35,27],[36,2],[36,11],[36,20],[36,28],[37,3],[37,12],[37,21],[37,29],[38,4],[38,13],[38,22],[38,30],[39,5],[39,14],[39,23],[40,16],[40,24],[41,16],[41,24],[42,16],[42,24],[43,16],[43,24],[44,16],[44,24],[45,16],[45,24],[46,16],[46,24],[47,0],[47,1],[47,2],[47,3],[47,4],[47,5],[47,6],[47,7],[47,8],[47,9],[47,10],[47,11],[47,12],[47,13],[47,14],[47,15],[47,16],[47,17],[47,18],[47,19],[47,20],[47,21],[47,22],[47,23],[47,24],[47,25],[47,26],[47,27],[47,28],[47,29],[47,30],[48,8],[48,17],[48,25],[49,0],[49,9],[49,18],[49,26],[50,1],[50,10],[50,19],[50,27],[51,2],[51,11],[51,20],[51,28],[52,3],[52,12],[52,21],[52,29],[53,4],[53,13],[53,22],[53,30],[54,16],[54,24],[55,16],[55,24],[56,16],[56,24],[57,16],[57,24],[58,16],[58,24],[59,16],[59,24]]}]} diff --git a/pybuda/csrc/balancer/tests/module.mk b/pybuda/csrc/balancer/tests/module.mk deleted file mode 100644 index cb859203c..000000000 --- a/pybuda/csrc/balancer/tests/module.mk +++ /dev/null @@ -1,48 +0,0 @@ -PYBUDA_CSRC_BALANCER_TESTS = $(TESTDIR)/pybuda/csrc/balancer/tests/balancer_unit_tests -PYBUDA_CSRC_BALANCER_TESTS_SRCS = \ - $(wildcard pybuda/csrc/balancer/tests/*.cpp) \ - -PYBUDA_CSRC_BALANCER_TESTS_INCLUDES = -Ipybuda/csrc/graph_lib $(PYBUDA_CSRC_BALANCER_INCLUDES) -PYBUDA_CSRC_BALANCER_TESTS_LDFLAGS = -lgtest -lgtest_main -lpthread -l$(PYTHON_VERSION) -lm - -PYBUDA_CSRC_BALANCER_TESTS_OBJS = $(addprefix $(OBJDIR)/, $(PYBUDA_CSRC_BALANCER_TESTS_SRCS:.cpp=.o)) 
-PYBUDA_CSRC_BALANCER_TESTS_DEPS = $(addprefix $(OBJDIR)/, $(PYBUDA_CSRC_BALANCER_TESTS_SRCS:.cpp=.d)) - -# BBE TILE MAPS -PYBUDA_CSRC_TILE_MAPS_LIB = $(LIBDIR)/libnet2pipe_tile_maps.a -PYBUDA_CSRC_TILE_MAPS_INCLUDES = \ - $(INCDIR)/net2pipe/tile_maps.h \ - $(INCDIR)/net2pipe/tile_maps_common.h \ - $(INCDIR)/net2pipe/net2pipe_logger.h -PYBUDA_CSRC_TILE_MAPS_SRC = third_party/budabackend/src/net2pipe/src/tile_maps.cpp -PYBUDA_CSRC_TILE_MAPS_OBJ = $(addprefix $(OBJDIR)/, $(PYBUDA_CSRC_TILE_MAPS_SRC:.cpp=.o)) -PYBUDA_CSRC_TILE_MAPS_DEP = $(addprefix $(OBJDIR)/, $(PYBUDA_CSRC_TILE_MAPS_SRC:.cpp=.d)) - --include $(PYBUDA_CSRC_TILE_MAPS_DEP) --include $(PYBUDA_CSRC_BALANCER_TESTS_DEPS) - -pybuda/csrc/balancer/tests: $(PYBUDA_CSRC_BALANCER_TESTS) - -$(INCDIR)/net2pipe/%.h: third_party/budabackend/src/net2pipe/inc/%.h - @mkdir -p $(@D) - cp $^ $@ - -$(PYBUDA_CSRC_TILE_MAPS_OBJ): $(PYBUDA_CSRC_TILE_MAPS_INCLUDES) $(PYBUDA_CSRC_TILE_MAPS_SRC) - @mkdir -p $(@D) - $(CXX) $(CXXFLAGS) $(STATIC_LIB_FLAGS) -I$(INCDIR)/net2pipe -c -o $@ $(PYBUDA_CSRC_TILE_MAPS_SRC) - -$(PYBUDA_CSRC_LOGGER_OBJ): $(PYBUDA_CSRC_TILE_MAPS_INCLUDES) $(PYBUDA_CSRC_LOGGER_SRC) - @mkdir -p $(@D) - $(CXX) $(CXXFLAGS) $(STATIC_LIB_FLAGS) -I$(INCDIR)/net2pipe -c -o $@ $(PYBUDA_CSRC_LOGGER_SRC) - -$(PYBUDA_CSRC_TILE_MAPS_LIB): $(PYBUDA_CSRC_TILE_MAPS_OBJ) $(PYBUDA_CSRC_LOGGER_OBJ) - @mkdir -p $(LIBDIR) - ar rcs $@ $^ - -$(PYBUDA_CSRC_BALANCER_TESTS): $(PYBUDA_CSRC_BALANCER_TESTS_OBJS) $(PYBUDA_CSRC_LIB) $(PYBUDA_CSRC_TILE_MAPS_LIB) - @mkdir -p $(@D) - $(CXX) $(PYBUDA_CSRC_CFLAGS) $(CXXFLAGS) -o $@ $^ $(LDFLAGS) $(PYBUDA_CSRC_BALANCER_TESTS_LDFLAGS) - -$(OBJDIR)/pybuda/csrc/balancer/tests/%.o: pybuda/csrc/balancer/tests/%.cpp $(PYBUDA_CSRC_TILE_MAPS_INCLUDES) - @mkdir -p $(@D) - $(CXX) $(PYBUDA_CSRC_CFLAGS) $(CXXFLAGS) $(PYBUDA_CSRC_BALANCER_TESTS_INCLUDES) -c -o $@ $< diff --git a/pybuda/csrc/balancer/tests/test_balancer.cpp b/pybuda/csrc/balancer/tests/test_balancer.cpp deleted file mode 100644 index c922eea5b..000000000 --- a/pybuda/csrc/balancer/tests/test_balancer.cpp +++ /dev/null @@ -1,14 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include -#include - -#include - -int main(int argc, char **argv) -{ - ::testing::InitGoogleTest(&argc, argv); - pybind11::scoped_interpreter guard{}; - return RUN_ALL_TESTS(); -} diff --git a/pybuda/csrc/balancer/tests/test_balancer_utils.cpp b/pybuda/csrc/balancer/tests/test_balancer_utils.cpp deleted file mode 100644 index bd001f40d..000000000 --- a/pybuda/csrc/balancer/tests/test_balancer_utils.cpp +++ /dev/null @@ -1,42 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "test_balancer_utils.hpp" - -#include "passes/passes_utils.hpp" -#include "passes/pre_placer_buda_passes.hpp" - -namespace tt::test -{ -std::unique_ptr prepare_graph_for_legalizer(Graph *graph) -{ - std::unique_ptr lowered_graph = lower_to_buda_ops(graph); - recalculate_shapes(lowered_graph.get()); - calculate_ublock_order(lowered_graph.get()); - - return lowered_graph; -} - -balancer::BalancerConfig create_balancer_config( - Arch arch, - std::optional< std::vector > device_chip_ids, - balancer::PolicyType policy_type, - std::string cluster_config_yaml, - std::string runtime_params_yaml, - placer::ChipPlacementPolicy chip_placement_policy) -{ - return balancer::BalancerConfig(create_device_config( - arch, - device_chip_ids, - cluster_config_yaml, - runtime_params_yaml), - policy_type, - 
chip_placement_policy); -} - -std::shared_ptr create_balancer_cache_collection() -{ - return std::make_shared(); -} - -} // namespace tt::test diff --git a/pybuda/csrc/balancer/tests/test_balancer_utils.hpp b/pybuda/csrc/balancer/tests/test_balancer_utils.hpp deleted file mode 100644 index 2098d523e..000000000 --- a/pybuda/csrc/balancer/tests/test_balancer_utils.hpp +++ /dev/null @@ -1,34 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once -#include -#include -#include -#include "balancer/balancer.hpp" -#include "balancer/balancer_cache_collection.hpp" -#include "test/common.hpp" - -namespace tt::graphlib -{ -class Graph; -} - -namespace tt::balancer -{ -struct BalancerConfig; -} - -namespace tt::test -{ -std::unique_ptr prepare_graph_for_legalizer(tt::graphlib::Graph *graph); -balancer::BalancerConfig create_balancer_config( - Arch arch = Arch::Grayskull, - std::optional< std::vector > device_chip_ids = std::nullopt, - balancer::PolicyType policy_type = balancer::PolicyType::Ribbon, - std::string cluster_config_yaml = "", - std::string runtime_params_yaml = "", - placer::ChipPlacementPolicy chip_placement_policy = placer::ChipPlacementPolicy::MMIO_LAST); - -std::shared_ptr create_balancer_cache_collection(); -} // namespace tt::test diff --git a/pybuda/csrc/balancer/tests/test_graph_solver.cpp b/pybuda/csrc/balancer/tests/test_graph_solver.cpp deleted file mode 100644 index 19316eb61..000000000 --- a/pybuda/csrc/balancer/tests/test_graph_solver.cpp +++ /dev/null @@ -1,1204 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include -#include - -#include "balancer/balancer_cache_collection.hpp" -#include "balancer/legalizer/graph_solver.hpp" -#include "balancer/legalizer/legalizer.hpp" -#include "gtest/gtest.h" -#include "json.hpp" -#include "test/common.hpp" -#include "test_balancer_utils.hpp" - -namespace tt::test -{ -using namespace balancer; - -struct UnitTestConstraint : public legalizer::Constraint -{ - UnitTestConstraint( - const DeviceConfig& device_config, - std::shared_ptr balancer_cache_collection) : - Constraint(device_config, balancer_cache_collection) - { - } - - virtual std::pair queue_to_op_cost( - graphlib::Graph const*, graphlib::Edge, std::optional, OpModel const&) override - { - return std::make_pair(legalizer::EdgeCost{}, legalizer::NoConstraintFailure); - } - - virtual std::pair op_to_queue_cost( - graphlib::Graph const*, graphlib::Edge, OpModel const&, std::optional) override - { - return std::make_pair(legalizer::EdgeCost{}, legalizer::NoConstraintFailure); - } - - virtual std::pair op_to_op_cost( - graphlib::Graph const*, graphlib::Edge, OpModel const& producer, OpModel const& consumer) override - { - if (producer.op_model_valid_pair_id.count(consumer.id.id) == 0 or - consumer.op_model_valid_pair_id.count(producer.id.id) == 0) - { - return std::make_pair(legalizer::EdgeCost{}, legalizer::Failed); - } - - return std::make_pair(legalizer::EdgeCost{}, legalizer::NoConstraintFailure); - } -}; - -struct JsonTest -{ - using Path = std::pair; - - struct Edge - { - int producer; - int consumer; - int input_port; - std::vector paths; - }; - - std::vector edges; - std::unordered_map node_id_to_name; - std::vector flags; - - bool has_flag(std::string const& flag) const { return std::find(flags.begin(), flags.end(), flag) != flags.end(); } -}; - -void from_json(const nlohmann::json& j, JsonTest::Edge& e) -{ - j.at("producer").get_to(e.producer); - 
j.at("consumer").get_to(e.consumer); - j.at("input_port").get_to(e.input_port); - j.at("paths").get_to(e.paths); -} - -void from_json(const nlohmann::json& j, JsonTest& t) -{ - j.at("edges").get_to(t.edges); - if (j.contains("node_names")) - j.at("node_names").get_to(t.node_id_to_name); - if (j.contains("flags")) - j.at("flags").get_to(t.flags); -} - -template -static void cross(std::vector& as, std::vector& bs, Fn fn) -{ - for (auto& a : as) - for (auto& b : bs) fn(a, b); -} - -// OpModel Association functions -static void a2b(OpModel& a, OpModel& b) { b.op_model_valid_pair_id.insert(a.id.id); } -static void b2a(OpModel& a, OpModel& b) { a2b(b, a); } -static void both(OpModel& a, OpModel& b) -{ - a2b(a, b); - b2a(a, b); -} - -struct GraphSolverResolveSanity : testing::Test -{ - std::unique_ptr graph; - - void SetUp() override - { - graph = std::make_unique(graphlib::IRLevel::IR_PYBUDA); - - graphlib::Shape shape = graphlib::Shape::create({1, 1, 512, 160}); - - auto in0_a = create_input(*graph, "in0_a", graphlib::Shape::create({1, 1, shape[2], 256})); - auto in0_b = create_input(*graph, "in0_b", graphlib::Shape::create({1, 1, 256, shape[3]})); - auto matmul0 = add_node(*graph, "matmul0", "matmul", {}, {in0_a, in0_b}); - - auto in1_a = create_input(*graph, "in1_a", graphlib::Shape::create({1, 1, shape[2], 128})); - auto in1_b = create_input(*graph, "in1_b", graphlib::Shape::create({1, 1, 128, shape[3]})); - auto matmul1 = add_node(*graph, "matmul1", "matmul", {}, {in1_a, in1_b}); - - auto add = add_node(*graph, "add", "add", {}, {matmul0, matmul1}); - - create_output(*graph, "out0", add); - - graph = prepare_graph_for_legalizer(graph.get()); - } -}; - -TEST_F(GraphSolverResolveSanity, resolve) -{ - balancer::BalancerConfig balancer_config = create_balancer_config(); - std::shared_ptr cache_collection = create_balancer_cache_collection(); - balancer::LegalOpModels valid_op_models = - balancer::legalizer::get_legal_op_models(graph.get(), balancer_config, cache_collection); - EXPECT_EQ(valid_op_models.size(), 3); - legalizer::GraphSolver graph_solver = - get_graph_solver(balancer_config, cache_collection, graph.get(), valid_op_models); - - auto topo_sort = tt::graphlib::topological_sort(*graph); - - // Simple - just set first available. - // - for (Node* node : topo_sort) - { - if (node->node_type() != graphlib::NodeType::kBudaOp) - { - continue; - } - - auto opmodels = graph_solver.at(node); - graph_solver.set(node, *opmodels.begin()); - - // After calling SET, we expect having only one OpModel available for this node. - // - auto opmodels_after_set = graph_solver.at(node); - EXPECT_EQ(opmodels_after_set.mask.count(), 1); - } - - balancer::legalizer::GraphSolverSolution solution = graph_solver.finish(); - - EXPECT_EQ(solution.cut_edges.size(), 0); - EXPECT_EQ(solution.selected_op_models.size(), 3); -} - -TEST_F(GraphSolverResolveSanity, resolve_no_streaming_output) -{ - balancer::BalancerConfig balancer_config = create_balancer_config(); - std::shared_ptr cache_collection = create_balancer_cache_collection(); - balancer_config.enable_t_streaming = true; - balancer::LegalOpModels valid_op_models = - balancer::legalizer::get_legal_op_models(graph.get(), balancer_config, cache_collection); - EXPECT_EQ(valid_op_models.size(), 3); - legalizer::GraphSolver graph_solver = - get_graph_solver(balancer_config, cache_collection, graph.get(), valid_op_models); - - auto topo_sort = tt::graphlib::topological_sort(*graph); - - // Validate that there is no op_model allowing streaming into output. 
- // - for (Node* node : topo_sort) - { - if (node->node_type() == graphlib::NodeType::kOutput) - { - for (auto output_op : graph->data_operands(node)) - { - auto opmodels = graph_solver.at(output_op); - - for (auto op_model : opmodels) - { - EXPECT_TRUE(op_model.t_stream_factor.none()); - } - } - } - } - - // Simple - just set first available. - // - for (Node* node : topo_sort) - { - if (node->node_type() != graphlib::NodeType::kBudaOp) - { - continue; - } - - auto opmodels = graph_solver.at(node); - graph_solver.set(node, *opmodels.begin()); - - // After calling SET, we expect having only one OpModel available for this node. - // - auto opmodels_after_set = graph_solver.at(node); - EXPECT_EQ(opmodels_after_set.mask.count(), 1); - } - - balancer::legalizer::GraphSolverSolution solution = graph_solver.finish(); - - EXPECT_EQ(solution.cut_edges.size(), 0); - EXPECT_EQ(solution.selected_op_models.size(), 3); -} - -TEST_F(GraphSolverResolveSanity, graphsolverforking) -{ - using balancer::legalizer::GraphSolver; - using balancer::legalizer::GraphSolverSolution; - balancer::BalancerConfig balancer_config = create_balancer_config(); - std::shared_ptr cache_collection = create_balancer_cache_collection(); - balancer::LegalOpModels valid_op_models = - balancer::legalizer::get_legal_op_models(graph.get(), balancer_config, cache_collection); - EXPECT_EQ(valid_op_models.size(), 3); - GraphSolver graph_solver = get_graph_solver(balancer_config, cache_collection, graph.get(), valid_op_models); - - // Make a clone snapshot. - // - GraphSolver graph_solver_fork = graph_solver; - - auto topo_sort = tt::graphlib::topological_sort(*graph); - - { - // Clone of a clone, within limited scope on purpose. - // - GraphSolver graph_solver_fork_2 = graph_solver_fork; - bool flipflop = false; // Alternate between first and last. - - // Simple - just set first available. - // - for (Node* node : topo_sort) - { - if (node->node_type() != graphlib::NodeType::kBudaOp) - { - continue; - } - - auto opmodels = graph_solver.at(node); - graph_solver.set(node, *opmodels.begin()); - - auto opmodels2 = graph_solver_fork_2.at(node); - if (flipflop) - { - int opModelCount = opmodels2.mask.count(); - int currentModel = 0; - auto it = opmodels2.begin(); - while (currentModel < opModelCount - 1) - { - it++; - currentModel++; - } - - graph_solver_fork_2.set(node, *it); - } - else - { - graph_solver_fork_2.set(node, *opmodels2.begin()); - } - - flipflop ^= true; - - // After calling SET, we expect having only one OpModel available for this node. - // - auto opmodels_after_set = graph_solver.at(node); - EXPECT_EQ(opmodels_after_set.mask.count(), 1); - auto opmodels_after_set2 = graph_solver_fork_2.at(node); - EXPECT_EQ(opmodels_after_set2.mask.count(), 1); - } - - EXPECT_EQ(graph_solver_fork_2.get_cut_edges().size(), 0); - EXPECT_EQ(graph_solver_fork_2.get_selected_op_models().size(), 3); - } - - EXPECT_EQ(graph_solver.get_cut_edges().size(), 0); - EXPECT_EQ(graph_solver.get_selected_op_models().size(), 3); - - // Invoking SET on original graph solver should not impact forks. - // - EXPECT_EQ(graph_solver_fork.get_selected_op_models().size(), 0); - - // This time set last available. 
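The loop that follows (and several more like it in these tests) advances a forward-only op-model iterator to its last element with a hand-rolled counter, since the range only exposes begin()/end() plus a mask count. A self-contained equivalent of that idiom, with a std::forward_list standing in for the op-model range:

#include <forward_list>
#include <iostream>
#include <iterator>

int main()
{
    // Forward-only range, as with the op-model ranges returned by GraphSolver::at().
    std::forward_list<int> op_models = {10, 20, 30, 40};

    // Walk once to count, then step to the last element; same effect as the manual while loop.
    auto count = std::distance(op_models.begin(), op_models.end());
    auto last = std::next(op_models.begin(), count - 1);

    std::cout << *last << "\n";  // prints 40
}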
- // - for (Node* node : topo_sort) - { - if (node->node_type() != graphlib::NodeType::kBudaOp) - { - continue; - } - - auto opmodels = graph_solver_fork.at(node); - int opModelCount = opmodels.mask.count(); - int currentModel = 0; - auto it = opmodels.begin(); - while (currentModel < opModelCount - 1) - { - it++; - currentModel++; - } - - graph_solver_fork.set(node, *it); - - // After calling SET, we expect having only one OpModel available for this node. - // - auto opmodels_after_set = graph_solver_fork.at(node); - EXPECT_EQ(opmodels_after_set.mask.count(), 1); - } - - GraphSolverSolution secondary_solution = graph_solver_fork.finish(); - EXPECT_EQ(secondary_solution.cut_edges.size(), 0); - EXPECT_EQ(secondary_solution.selected_op_models.size(), 3); -} - -TEST_F(GraphSolverResolveSanity, graphsolverforking_cut) -{ - using balancer::legalizer::GraphSolver; - using balancer::legalizer::GraphSolverSolution; - balancer::BalancerConfig balancer_config = create_balancer_config(); - std::shared_ptr cache_collection = create_balancer_cache_collection(); - balancer::LegalOpModels valid_op_models = - balancer::legalizer::get_legal_op_models(graph.get(), balancer_config, cache_collection); - EXPECT_EQ(valid_op_models.size(), 3); - GraphSolver graph_solver = get_graph_solver(balancer_config, cache_collection, graph.get(), valid_op_models); - - GraphSolver graph_solver_fork = graph_solver; - - auto topo_sort = tt::graphlib::topological_sort(*graph); - bool should_cut = true; - int edges_cut = 0; - - // For original just set first available, for fork lets cut a bit and set last. - // - for (Node* node : topo_sort) - { - if (node->node_type() != graphlib::NodeType::kBudaOp) - { - continue; - } - - auto opmodels = graph_solver.at(node); - graph_solver.set(node, *opmodels.begin()); - bool made_cut = false; - Node* cutNode = nullptr; - - if (should_cut) - { - std::vector edges_to_cut; - for (auto& edge : graph->user_data_edges(node)) - { - Node* consumerNode = graph->node_by_id(edge.consumer_node_id); - if (consumerNode->node_type() == graphlib::NodeType::kBudaOp and (!cutNode || cutNode == consumerNode)) - { - should_cut = edges_cut < 2; - edges_to_cut.push_back(edge); - cutNode = consumerNode; - } - } - - if (edges_to_cut.size() > 0) - { - edges_cut += edges_to_cut.size(); - graph_solver_fork.cut(edges_to_cut); - made_cut = true; - } - } - - auto opmodels2 = graph_solver_fork.at(node); - int opModel2Count = opmodels2.mask.count(); - if (made_cut and node == cutNode) - { - // If we have cut out this node from rest of the graph, we expect: - // 1. All op models are available - // 2. More op models compared to non-cut graph version - // - EXPECT_EQ(opModel2Count, opmodels2.p->size()); - EXPECT_GT(opModel2Count, opmodels.mask.count()); - } - - int currentModel = 0; - auto it = opmodels2.begin(); - while (currentModel < opModel2Count - 1) - { - it++; - currentModel++; - } - - graph_solver_fork.set(node, *it); - - // After calling SET, we expect having only one OpModel available for this node. 
- // - auto opmodels_after_set = graph_solver.at(node); - EXPECT_EQ(opmodels_after_set.mask.count(), 1); - } - - EXPECT_EQ(graph_solver.get_cut_edges().size(), 0); - EXPECT_EQ(graph_solver.get_selected_op_models().size(), 3); - - EXPECT_EQ(graph->virtual_node_count(), edges_cut); - auto visible_global_graph = tt::graphlib::topological_sort(*graph); - EXPECT_EQ(visible_global_graph.size(), topo_sort.size()); - GraphSolverSolution solution2 = graph_solver_fork.finish(); - EXPECT_EQ(graph->virtual_node_count(), 0); - EXPECT_EQ(solution2.cut_edges.size(), edges_cut); - EXPECT_EQ(solution2.selected_op_models.size(), 3); -} - -TEST_F(GraphSolverResolveSanity, graphsolverforking_cut_all_forks) -{ - using balancer::legalizer::GraphSolver; - using balancer::legalizer::GraphSolverSolution; - balancer::BalancerConfig balancer_config = create_balancer_config(); - std::shared_ptr cache_collection = create_balancer_cache_collection(); - balancer::LegalOpModels valid_op_models = - balancer::legalizer::get_legal_op_models(graph.get(), balancer_config, cache_collection); - EXPECT_EQ(valid_op_models.size(), 3); - GraphSolver graph_solver = get_graph_solver(balancer_config, cache_collection, graph.get(), valid_op_models); - - GraphSolver graph_solver_fork = graph_solver; - - auto topo_sort = tt::graphlib::topological_sort(*graph); - bool should_cut = true; - int edges_cut = 0; - - // For original just set first available, for fork lets cut a bit and set last. - // - for (Node* node : topo_sort) - { - if (node->node_type() != graphlib::NodeType::kBudaOp) - { - continue; - } - - auto opmodels = graph_solver.at(node); - graph_solver.set(node, *opmodels.begin()); - bool made_cut = false; - Node* cutNode = nullptr; - - if (should_cut) - { - std::vector edges_to_cut; - for (auto& edge : graph->user_data_edges(node)) - { - Node* consumerNode = graph->node_by_id(edge.consumer_node_id); - if (consumerNode->node_type() == graphlib::NodeType::kBudaOp and (!cutNode || cutNode == consumerNode)) - { - should_cut = edges_cut < 2; - edges_to_cut.push_back(edge); - cutNode = consumerNode; - } - } - - if (edges_to_cut.size() > 0) - { - edges_cut += edges_to_cut.size(); - graph_solver_fork.cut(edges_to_cut); - graph_solver.cut(edges_to_cut); - made_cut = true; - } - } - - auto opmodels2 = graph_solver_fork.at(node); - int opModel2Count = opmodels2.mask.count(); - if (made_cut and node == cutNode) - { - // If we have cut out this node from rest of the graph, we expect: - // 1. All op models are available - // 2. More op models compared to non-cut graph version - // - EXPECT_EQ(opModel2Count, opmodels2.p->size()); - EXPECT_GT(opModel2Count, opmodels.mask.count()); - } - - int currentModel = 0; - auto it = opmodels2.begin(); - while (currentModel < opModel2Count - 1) - { - it++; - currentModel++; - } - - graph_solver_fork.set(node, *it); - - // After calling SET, we expect having only one OpModel available for this node. 
- // - auto opmodels_after_set = graph_solver.at(node); - EXPECT_EQ(opmodels_after_set.mask.count(), 1); - } - - EXPECT_EQ(graph_solver.get_cut_edges().size(), edges_cut); - EXPECT_EQ(graph_solver.get_selected_op_models().size(), 3); - - EXPECT_EQ(graph->virtual_node_count(), edges_cut * 2); - auto visible_global_graph = tt::graphlib::topological_sort(*graph); - EXPECT_EQ(visible_global_graph.size(), topo_sort.size()); - GraphSolverSolution solution2 = graph_solver_fork.finish(); - EXPECT_EQ(graph->virtual_node_count(), edges_cut); - EXPECT_EQ(solution2.cut_edges.size(), edges_cut); - EXPECT_EQ(solution2.selected_op_models.size(), 3); -} - -TEST_F(GraphSolverResolveSanity, nop_insertion) -{ - balancer::BalancerConfig balancer_config = create_balancer_config(); - std::shared_ptr cache_collection = create_balancer_cache_collection(); - balancer::LegalOpModels valid_op_models = - balancer::legalizer::get_legal_op_models(graph.get(), balancer_config, cache_collection); - EXPECT_EQ(valid_op_models.size(), 3); - legalizer::GraphSolver graph_solver = - get_graph_solver(balancer_config, cache_collection, graph.get(), valid_op_models); - - auto topo_sort = tt::graphlib::topological_sort(*graph); - - // Simple - just set first available. - // - for (Node* node : topo_sort) - { - if (node->node_type() != graphlib::NodeType::kBudaOp) - { - continue; - } - - if (node->name() == "add") - { - vector buffer_info; - int nop_count = 1; - for (Edge edge : graph->operand_data_edges(node)) - { - buffer_info.emplace_back(balancer::legalizer::BufferInfo(edge, nop_count++, true)); - } - - // Check virtual node count is as expected before and after buffering. - // - EXPECT_EQ(graph->virtual_node_count(), 0); - std::vector buffered_nodes = graph_solver.buffer(buffer_info); - EXPECT_EQ(graph->virtual_node_count(), 3); - - for (Node* buf_node : buffered_nodes) - { - auto opmodels = graph_solver.at(buf_node); - graph_solver.set(buf_node, *opmodels.begin()); - } - } - - auto opmodels = graph_solver.at(node); - graph_solver.set(node, *opmodels.begin()); - - // After calling SET, we expect having only one OpModel available for this node. - // - auto opmodels_after_set = graph_solver.at(node); - EXPECT_EQ(opmodels_after_set.mask.count(), 1); - } - - balancer::legalizer::GraphSolverSolution solution = graph_solver.finish(); - - EXPECT_EQ(graph->virtual_node_count(), 0); - EXPECT_EQ(solution.cut_edges.size(), 0); - EXPECT_EQ(solution.selected_op_models.size(), 6); - EXPECT_EQ(graph->nodes().size(), 11); -} - -TEST_F(GraphSolverResolveSanity, nop_insertion_forking) -{ - balancer::BalancerConfig balancer_config = create_balancer_config(); - std::shared_ptr cache_collection = create_balancer_cache_collection(); - balancer::LegalOpModels valid_op_models = - balancer::legalizer::get_legal_op_models(graph.get(), balancer_config, cache_collection); - EXPECT_EQ(valid_op_models.size(), 3); - legalizer::GraphSolver graph_solver = - get_graph_solver(balancer_config, cache_collection, graph.get(), valid_op_models); - - auto topo_sort = tt::graphlib::topological_sort(*graph); - int initial_node_count = topo_sort.size(); - int nop_inserted_gs = 0; - for (Node* node : topo_sort) - { - if (node->node_type() != graphlib::NodeType::kBudaOp) - { - continue; - } - - if (node->name() == "add") - { - // Fork on buffer for operand edges of add op. 
- // - legalizer::GraphSolver gs_buffer_fork = graph_solver; - vector buffer_info; - vector buffer_info_fork; - int nop_count = 1; - int nop_inserted_gs_fork1 = 0; - int nop_inserted_gs_fork2 = 0; - - // Insert different number of NOPs on two GS forks. - // - for (Edge edge : graph->operand_data_edges(node)) - { - nop_inserted_gs += nop_count; - buffer_info.emplace_back(balancer::legalizer::BufferInfo(edge, nop_count++, false)); - nop_inserted_gs_fork1 += nop_count; - buffer_info_fork.emplace_back(balancer::legalizer::BufferInfo(edge, nop_count, false)); - } - - // Check if virtual node count and graph size are as expected before and after buffering. - // - EXPECT_EQ(graph->virtual_node_count(), 0); - EXPECT_EQ(graph->nodes().size(), initial_node_count); - std::vector buffered_nodes = graph_solver.buffer(buffer_info); - EXPECT_EQ(graph->virtual_node_count(), nop_inserted_gs); - EXPECT_EQ(graph->nodes().size(), initial_node_count + graph->virtual_node_count()); - { - // Create another GS fork after one buffering pass. - // On purpose in limited scope to test virtual node cleanup. - // - legalizer::GraphSolver gs_buffer_fork2 = graph_solver; - std::vector buffered_nodes_fork = gs_buffer_fork.buffer(buffer_info_fork); - EXPECT_EQ(graph->virtual_node_count(), nop_inserted_gs + nop_inserted_gs_fork1); - EXPECT_EQ(graph->nodes().size(), initial_node_count + graph->virtual_node_count()); - int edges_cut = 0; - - for (Node* buf_node : buffered_nodes) - { - auto opmodels = graph_solver.at(buf_node); - graph_solver.set(buf_node, *opmodels.begin()); - opmodels = gs_buffer_fork2.at(buf_node); - gs_buffer_fork2.set(buf_node, *opmodels.begin()); - } - - vector gs_buffer_fork2_buffer_info; - for (Edge edge : graph->user_data_edges(node)) - { - gs_buffer_fork2_buffer_info.emplace_back(balancer::legalizer::BufferInfo(edge, nop_count, false)); - nop_inserted_gs_fork2 += nop_count; - } - - // Invoke buffer for second fork and check nodes afterwards. - // - std::vector buffered_nodes_fork2 = gs_buffer_fork2.buffer(gs_buffer_fork2_buffer_info); - EXPECT_EQ(graph->virtual_node_count(), nop_inserted_gs + nop_inserted_gs_fork1 + nop_inserted_gs_fork2); - EXPECT_EQ(graph->nodes().size(), initial_node_count + graph->virtual_node_count()); - - { - // Test cuting of virtual edge. Also showcase for GS external usage - // of GraphTraversalContext. Here it allows traversing graph in the context - // of gs_buffer_fork2 GS instance externally. - // - auto graph_traversal_context = gs_buffer_fork2.get_graph_traversal_context(); - std::vector edges_to_cut; - for (auto& edge : graph->user_data_edges(node)) - { - edges_to_cut.push_back(edge); - } - - gs_buffer_fork2.cut(edges_to_cut); - edges_cut = edges_to_cut.size(); - } - - for (Node* buf_node : buffered_nodes_fork) - { - auto opmodels = gs_buffer_fork.at(buf_node); - gs_buffer_fork.set(buf_node, *opmodels.begin()); - } - - for (Node* buf_node : buffered_nodes_fork2) - { - auto opmodels = gs_buffer_fork2.at(buf_node); - auto it = opmodels.begin(); - int opModelCount = opmodels.mask.count(); - int currentModel = 0; - while (currentModel < opModelCount - 1) - { - it++; - currentModel++; - } - - gs_buffer_fork2.set(buf_node, *it); - } - - // Now lets compare graph traversal for each instance of GraphSolver. 
- // - { - auto graph_traversal_context = graph_solver.get_graph_traversal_context(); - auto gs_subgraph = tt::graphlib::topological_sort(*graph); - EXPECT_EQ(gs_subgraph.size(), initial_node_count + nop_inserted_gs); - } - - { - auto graph_traversal_context = gs_buffer_fork.get_graph_traversal_context(); - auto gs_subgraph = tt::graphlib::topological_sort(*graph); - EXPECT_EQ(gs_subgraph.size(), initial_node_count + nop_inserted_gs_fork1); - } - - { - auto graph_traversal_context = gs_buffer_fork2.get_graph_traversal_context(); - auto gs_subgraph = tt::graphlib::topological_sort(*graph); - EXPECT_EQ( - gs_subgraph.size(), initial_node_count + nop_inserted_gs + nop_inserted_gs_fork2 + edges_cut); - } - - auto visible_global_graph = tt::graphlib::topological_sort(*graph); - EXPECT_EQ(visible_global_graph.size(), initial_node_count); - } - - // Check if cleanup of deleted GS(gs_buffer_fork2) works as expected. - // - EXPECT_EQ(graph->virtual_node_count(), nop_inserted_gs + nop_inserted_gs_fork1); - EXPECT_EQ(graph->nodes().size(), initial_node_count + graph->virtual_node_count()); - } - - auto opmodels = graph_solver.at(node); - graph_solver.set(node, *opmodels.begin()); - - // After calling SET, we expect having only one OpModel available for this node. - // - auto opmodels_after_set = graph_solver.at(node); - EXPECT_EQ(opmodels_after_set.mask.count(), 1); - } - - balancer::legalizer::GraphSolverSolution solution = graph_solver.finish(); - - // After calling finish all virtual nodes should be gone and graph state is stable/final. - // - EXPECT_EQ(graph->virtual_node_count(), 0); - - EXPECT_EQ(solution.cut_edges.size(), 0); - EXPECT_EQ(solution.selected_op_models.size(), 6); - EXPECT_EQ(graph->nodes().size(), initial_node_count + nop_inserted_gs); -} - -// Buffer on edge then fork GS. On forked GS buffer again between buffer and persisted node. -// Then make forked GS expire. This used to cause source GS to end up with unconnected node/edge removed in forked GS -// cleanup. -// -TEST_F(GraphSolverResolveSanity, nop_insertion_forking_snapshot) -{ - balancer::BalancerConfig balancer_config = create_balancer_config(); - std::shared_ptr cache_collection = create_balancer_cache_collection(); - balancer::LegalOpModels valid_op_models = - balancer::legalizer::get_legal_op_models(graph.get(), balancer_config, cache_collection); - EXPECT_EQ(valid_op_models.size(), 3); - legalizer::GraphSolver graph_solver = - get_graph_solver(balancer_config, cache_collection, graph.get(), valid_op_models); - - auto topo_sort = tt::graphlib::topological_sort(*graph); - for (Node* node : topo_sort) - { - if (node->node_type() != graphlib::NodeType::kBudaOp) - { - continue; - } - - if (node->name() == "add") - { - legalizer::GraphSolver gs_buffer_fork = graph_solver; - vector buffer_info; - int nop_count = 1; - - for (Edge edge : graph->operand_data_edges(node)) - { - buffer_info.emplace_back(balancer::legalizer::BufferInfo(edge, nop_count, false)); - } - - gs_buffer_fork.buffer(buffer_info); - - { - legalizer::GraphSolver gs_buffer_fork_2 = gs_buffer_fork; - auto graph_traversal_context = gs_buffer_fork_2.get_graph_traversal_context(); - buffer_info.clear(); - - for (Edge edge : graph->operand_data_edges(node)) - { - buffer_info.emplace_back(balancer::legalizer::BufferInfo(edge, nop_count, false)); - } - - gs_buffer_fork_2.buffer(buffer_info); - } - - // After gs_buffer_fork_2 is gone we still need to have connected data operands. 
- // - auto graph_traversal_context = gs_buffer_fork.get_graph_traversal_context(); - EXPECT_TRUE(graph->data_operands(graph->data_operands(node).back()).back() != nullptr); - } - - auto opmodels = graph_solver.at(node); - graph_solver.set(node, *opmodels.begin()); - - // After calling SET, we expect having only one OpModel available for this node. - // - auto opmodels_after_set = graph_solver.at(node); - EXPECT_EQ(opmodels_after_set.mask.count(), 1); - } - - balancer::legalizer::GraphSolverSolution solution = graph_solver.finish(); - - // After calling finish all virtual nodes should be gone and graph state is stable/final. - // - EXPECT_EQ(graph->virtual_node_count(), 0); - - EXPECT_EQ(solution.cut_edges.size(), 0); - EXPECT_EQ(solution.selected_op_models.size(), 3); -} - -#ifdef DEBUG -struct GraphSolverResolveEdge : testing::Test -{ - std::unique_ptr graph; - - GraphSolverResolveEdge() - { - graph = std::make_unique(graphlib::IRLevel::IR_PYBUDA); - - auto in0 = create_input(*graph, "in0", graphlib::Shape::create({1, 1, 512, 256})); - auto add1 = add_node(*graph, "add1", "add", {}, {in0, in0}); - auto mul1 = add_node(*graph, "mul1", "multiply", {}, {add1, add1}); - auto add2 = add_node(*graph, "add2", "add", {}, {mul1, mul1}); - auto mul2 = add_node(*graph, "mul2", "multiply", {}, {add1, add2}); - - create_output(*graph, "out0", mul2); - - graph = prepare_graph_for_legalizer(graph.get()); - } -}; - -TEST_F(GraphSolverResolveEdge, resolveedgecase) -{ - auto topo_sort = tt::graphlib::topological_sort(*graph); - - // Set valid pairs of OpModels for GraphSolver(simulation). - // - const Node* add1 = graph->get_node_by_name("add1"); - const Node* mul1 = graph->get_node_by_name("mul1"); - const Node* add2 = graph->get_node_by_name("add2"); - const Node* mul2 = graph->get_node_by_name("mul2"); - - LegalOpModels valid_op_models = { - {add1, std::vector(2)}, - {mul1, std::vector(2)}, - {add2, std::vector(2)}, - {mul2, std::vector(2)}, - }; - - both(valid_op_models[add1][0], valid_op_models[mul1][0]); - both(valid_op_models[add1][1], valid_op_models[mul1][1]); - both(valid_op_models[mul1][0], valid_op_models[add2][0]); - both(valid_op_models[add1][1], valid_op_models[mul2][0]); - - balancer::BalancerConfig balancer_config = create_balancer_config(); - std::shared_ptr balancer_cache_collection = create_balancer_cache_collection(); - - balancer_config.graph_solver_self_cut_type = balancer::legalizer::GraphSolverSelfCutType::FastCut; - - // Legalizer recalc would change fixed OpModel valid pairs, thats why we are disabling it for this test. - // - legalizer::GraphSolver graph_solver = legalizer::GraphSolver::create( - graph.get(), valid_op_models, balancer_config, balancer_cache_collection, false /*use_op_model_recalculation*/); - - // Simple - just set first available. - // - for (Node* node : topo_sort) - { - if (node->node_type() != graphlib::NodeType::kBudaOp) - { - continue; - } - - auto opmodels = graph_solver.at(node); - graph_solver.set(node, *opmodels.begin()); - - // After calling SET, we expect having only one OpModel available for this node. 
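The both()/a2b()/cross() helpers near the top of this file, used again below to wire up valid_op_models, simply record which op models on either side of an edge may be paired. A self-contained sketch with a stripped-down stand-in OpModel that keeps only the two fields those helpers touch (id.id and op_model_valid_pair_id); the real balancer::OpModel carries much more state:

#include <cstdint>
#include <iostream>
#include <unordered_set>
#include <vector>

struct OpModel
{
    struct { std::uint64_t id = 0; } id;                       // unique op-model id
    std::unordered_set<std::uint64_t> op_model_valid_pair_id;  // ids this op model may pair with
};

template <typename Fn>
static void cross(std::vector<OpModel>& as, std::vector<OpModel>& bs, Fn fn)
{
    for (auto& a : as)
        for (auto& b : bs) fn(a, b);
}

static void a2b(OpModel& a, OpModel& b) { b.op_model_valid_pair_id.insert(a.id.id); }
static void b2a(OpModel& a, OpModel& b) { a2b(b, a); }
static void both(OpModel& a, OpModel& b) { a2b(a, b); b2a(a, b); }

int main()
{
    std::vector<OpModel> producer(2), consumer(2);
    for (std::uint64_t i = 0; i < 2; ++i) { producer[i].id.id = i; consumer[i].id.id = 100 + i; }

    // Pair every producer op model with every consumer op model, in both directions.
    cross(producer, consumer, both);

    std::cout << producer[0].op_model_valid_pair_id.count(101) << "\n";  // prints 1
}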
- // - auto opmodels_after_set = graph_solver.at(node); - EXPECT_EQ(opmodels_after_set.mask.count(), 1); - } - - EXPECT_EQ(graph->virtual_node_count(), 2); - balancer::legalizer::GraphSolverSolution solution = graph_solver.finish(); - - EXPECT_EQ(graph->virtual_node_count(), 0); - EXPECT_EQ(solution.cut_edges.size(), 2); - EXPECT_EQ(solution.selected_op_models.size(), 4); -} -#endif - -struct ForkGraph : public BudaGraphTest -{ - protected: - virtual std::vector create_graph() override - { - int r = 32; - int c = 32; - - auto a = create_activation(1, 1, r, c); - auto b = create_activation(1, 1, r, c); - - x = create_op("add", {a, b}); - f0 = create_op("void", {x}, 1, 1, r, c); - f1 = create_op("void", {x}, 1, 1, r, c); - out = create_op("add", {f0, f1}); - - return {out}; - } - - OpType* x; - OpType* f0; - OpType* f1; - OpType* out; -}; - -TEST_F(ForkGraph, non_overlapping_forks) -{ - int even_counter = 0; - int odd_counter = 1; - auto evens = [&even_counter](OpModel& a, OpModel& b) - { - if (even_counter++ % 2 == 0) - b2a(a, b); - }; - auto odds = [&odd_counter](OpModel& a, OpModel& b) - { - if (odd_counter++ % 2 == 0) - b2a(a, b); - }; - - std::vector x_op_models(8); - std::vector f0_op_models(8); - std::vector f1_op_models(8); - std::vector out_op_models(8); - - // Fork point - { - even_counter = 0; - odd_counter = 1; - cross(x_op_models, f0_op_models, odds); - cross(x_op_models, f1_op_models, evens); - } - - // Fork paths - { - cross(f0_op_models, out_op_models, b2a); - cross(f1_op_models, out_op_models, b2a); - } - - // Join point - { - even_counter = 0; - odd_counter = 1; - cross(out_op_models, f0_op_models, evens); - cross(out_op_models, f1_op_models, odds); - } - - LegalOpModels valid_op_models = { - {x, x_op_models}, - {f0, f0_op_models}, - {f1, f1_op_models}, - {out, out_op_models}, - }; - - graphlib::Graph* graph = get_graph(); - balancer::BalancerConfig balancer_config = create_balancer_config(); - std::shared_ptr balancer_cache_collection = create_balancer_cache_collection(); - - balancer_config.graph_solver_self_cut_type = balancer::legalizer::GraphSolverSelfCutType::FastCut; - legalizer::GraphSolver graph_solver = legalizer::GraphSolver::create( - graph, valid_op_models, balancer_config, balancer_cache_collection, false /*use_op_model_recalculation*/); - - for (auto* node : std::vector{x, f0, f1, out}) - { - auto op_model_range = graph_solver.at(node); - ASSERT_NE(op_model_range.begin(), op_model_range.end()); - graph_solver.set(node, *op_model_range.begin()); - } - - legalizer::GraphSolverSolution solution = graph_solver.finish(); - EXPECT_EQ(solution.selected_op_models.size(), 4); - EXPECT_EQ(graph->virtual_node_count(), 0); -} - -struct AggregateInputGraph : public BudaGraphTest, public testing::WithParamInterface -{ - struct Constraint : public legalizer::Constraint - { - Constraint( - const DeviceConfig& device_config, - std::shared_ptr balancer_cache_collection) : - legalizer::Constraint(device_config, balancer_cache_collection) - { - } - - virtual std::pair queue_to_op_cost( - graphlib::Graph const*, graphlib::Edge, std::optional, OpModel const&) override - { - return std::make_pair(legalizer::EdgeCost(0, 0, 0, 1), legalizer::NoConstraintFailure); - } - - virtual std::pair op_to_queue_cost( - graphlib::Graph const*, graphlib::Edge, OpModel const&, std::optional) override - { - return std::make_pair(legalizer::EdgeCost{}, legalizer::NoConstraintFailure); - } - - virtual std::pair op_to_op_cost( - graphlib::Graph const*, graphlib::Edge, OpModel const&, OpModel const&) 
override - { - return std::make_pair(legalizer::EdgeCost{}, legalizer::NoConstraintFailure); - } - }; - - protected: - virtual std::vector create_graph() override - { - int r = 32; - int c = 32; - - num_inputs = GetParam(); - - std::vector inputs; - for (int i = 0; i < num_inputs; ++i) inputs.push_back(create_activation(1, 1, r, c)); - - nary = create_op("void", inputs, 1, 1, r, c); - return {nary}; - } - - int num_inputs = 0; - OpType* nary; -}; - -TEST_P(AggregateInputGraph, aggregate_input_test) -{ - LegalOpModels legal_op_models = { - {nary, std::vector(num_inputs)}, - }; - - auto test = [this, &legal_op_models]() - { - legalizer::GraphSolver graph_solver = legalizer::GraphSolver::create( - this->get_graph(), - legal_op_models, - create_balancer_config(), - create_balancer_cache_collection(), - false /*use_op_model_recalculation*/); - legalizer::GraphSolverSolution solution = graph_solver.finish(); - }; - - if (num_inputs > legalizer::EdgeCost::kMaxDRAMInQueues) - { - EXPECT_THROW({ test(); }, balancer::BalancerError); - } - else - { - test(); - } -} - -INSTANTIATE_TEST_SUITE_P( - AggregateInputTest, - AggregateInputGraph, - testing::Values( - 1, - 2, - legalizer::EdgeCost::kMaxDRAMInQueues - 1, - legalizer::EdgeCost::kMaxDRAMInQueues, - legalizer::EdgeCost::kMaxDRAMInQueues + 1)); - -struct JsonGraph : public BudaGraphTest, public testing::WithParamInterface -{ - protected: - using NodeId = int; - - graphlib::Node* create_node(NodeId node_id) - { - graphlib::Node*& node = nodes[node_id]; - if (node) - return node; - - auto operand_match = operands.find(node_id); - if (operand_match == operands.end()) - { - node = create_activation(1, 1, 32, 32); - } - else - { - std::vector operand_nodes; - for (auto operand : operand_match->second) operand_nodes.push_back(create_node(operand)); - node = create_op("void", operand_nodes); - } - - auto key = std::to_string(node_id); - auto name_match = test.node_id_to_name.find(key); - node->set_name(name_match == test.node_id_to_name.end() ? 
key : name_match->second); - return node; - } - - virtual std::vector create_graph() override - { - auto json_file = GetParam(); - std::string json_dir = "./pybuda/csrc/balancer/tests/json/"; - std::ifstream input(json_dir + json_file); - nlohmann::json j = nlohmann::json::parse(input); - from_json(j, test); - - // Init operands + users first - for (auto const& edge : test.edges) - { - auto& ops = operands[edge.consumer]; - ops.resize(std::max((int)ops.size(), edge.input_port + 1)); - ops[edge.input_port] = edge.producer; - users[edge.producer].push_back(edge.consumer); - } - - // Create nodes / graph - for (auto const& edge : test.edges) - { - create_node(edge.producer); - create_node(edge.consumer); - } - - // Create outputs - std::vector outputs; - for (auto const& edge : test.edges) - { - auto user_match = users.find(edge.consumer); - if (user_match == users.end()) - { - OpType* op = dynamic_cast(nodes.at(edge.consumer)); - TT_ASSERT(op); - outputs.push_back(op); - } - } - - return outputs; - } - - JsonTest test; - std::unordered_map nodes; - std::unordered_map> operands; - std::unordered_map> users; -}; - -TEST_P(JsonGraph, json_test) -{ - auto create_op_model_id = [](NodeId node_id, int op_model_index) -> std::uint64_t - { - std::uint64_t l = static_cast(node_id); - std::uint64_t u = static_cast(op_model_index); - return (u << 32) | l; - }; - - std::unordered_map op_model_pool; - LegalOpModels legal_op_models; - for (auto const& edge : test.edges) - { - if (nodes.at(edge.producer)->node_type() == graphlib::NodeType::kInput) - continue; - - std::vector& producer_op_models = legal_op_models[nodes.at(edge.producer)]; - std::vector& consumer_op_models = legal_op_models[nodes.at(edge.consumer)]; - for (auto [source_idx, target_idx] : edge.paths) - { - auto producer_id = create_op_model_id(edge.producer, source_idx); - auto consumer_id = create_op_model_id(edge.consumer, target_idx); - - OpModel& producer_op_model = op_model_pool[producer_id]; - producer_op_model.id.id = producer_id; // override the id - OpModel& consumer_op_model = op_model_pool[consumer_id]; - consumer_op_model.id.id = consumer_id; // override the id - - both(producer_op_model, consumer_op_model); // relate them - - if (std::find(producer_op_models.begin(), producer_op_models.end(), producer_op_model) == - producer_op_models.end()) - producer_op_models.push_back(producer_op_model); - if (std::find(consumer_op_models.begin(), consumer_op_models.end(), consumer_op_model) == - consumer_op_models.end()) - consumer_op_models.push_back(consumer_op_model); - } - } - - graphlib::Graph* graph = get_graph(); - balancer::BalancerConfig balancer_config = create_balancer_config(); - std::shared_ptr balancer_cache_collection = create_balancer_cache_collection(); - balancer_config.graph_solver_self_cut_type = - test.has_flag("FastCut") ? 
legalizer::GraphSolverSelfCutType::FastCut : legalizer::GraphSolverSelfCutType::None; - legalizer::GraphSolver graph_solver = legalizer::GraphSolver::create( - graph, legal_op_models, balancer_config, balancer_cache_collection, false /*use_op_model_recalculation*/); - legalizer::GraphSolverSolution solution = graph_solver.finish(); - EXPECT_EQ(graph->virtual_node_count(), 0); - EXPECT_EQ(solution.selected_op_models.size(), 0); -} - -INSTANTIATE_TEST_SUITE_P(JsonTest, JsonGraph, testing::Values("ghostnet_subgraph.json")); -} // namespace tt::test diff --git a/pybuda/csrc/balancer/tests/test_interactive_placer.cpp b/pybuda/csrc/balancer/tests/test_interactive_placer.cpp deleted file mode 100644 index b3c4a138c..000000000 --- a/pybuda/csrc/balancer/tests/test_interactive_placer.cpp +++ /dev/null @@ -1,260 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include - -#include "graph_lib/query.hpp" -#include "gtest/gtest.h" -#include "json.hpp" -#include "placer/interactive_placer.hpp" -#include "placer/placer.hpp" -#include "test/common.hpp" -#include "test_balancer_utils.hpp" - -namespace tt::test -{ -using namespace balancer; - -struct InteractivePlacerSanity : testing::Test -{ -}; - -// Unit test for InteractivePlacer::rewind_to(const std::string &op_name). -// -TEST_F(InteractivePlacerSanity, rewind_to) -{ - balancer::BalancerConfig balancer_config = create_balancer_config(); - placer::InteractivePlacer interactive_placer(nullptr /*graph*/, balancer_config); - std::map op_to_grid_shape = { - {"op1", placer::GridShape(1, 1)}, - {"op2", placer::GridShape(1, 10)}, - {"op3_pair1", placer::GridShape(5, 1)}, - {"op4_pair2", placer::GridShape(5, 3)}, - {"op5", placer::GridShape(1, 9)}, - {"op6_pair1", placer::GridShape(5, 2)}, - {"op7_pair2", placer::GridShape(5, 2)}, - {"op8", placer::GridShape(3, 3)}}; - - std::optional> buffered_op; - std::optional op_placement; - interactive_placer.get_op_overrides().emplace("op5", placer::PlacerOpOverride::force_op_transpose()); - for (const auto& to_place : op_to_grid_shape) - { - if (to_place.first.find("pair1") != std::string::npos) - { - buffered_op = to_place; - continue; - } - - if (buffered_op.has_value()) - { - op_placement = interactive_placer.place_two_ops_rowwise( - buffered_op->first, buffered_op->second, to_place.first, to_place.second, true /* enable_transpose */); - buffered_op.reset(); - } - else - { - op_placement = interactive_placer.place_op(to_place.first, to_place.second, true /* enable_transpose */); - } - - ASSERT_TRUE(op_placement.has_value()); - } - - // Snapshot placement configuration prior to rewind. Needs to remain unchanged after rewind. - // - std::unordered_map pre_rewind_placements = - interactive_placer.get_current_name_to_op_placement(); - interactive_placer.rewind_to(std::prev(op_to_grid_shape.end())->first); - - // Verify that user overrides are preserved after rewind. - // - EXPECT_EQ(interactive_placer.get_op_overrides().at("op5"), placer::PlacerOpOverride::force_op_transpose()); - - // Verify that placement is identical for rewinded ops. 
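The create_op_model_id lambda in the JsonGraph test above builds a synthetic 64-bit op-model id by packing the op-model index into the high 32 bits and the node id into the low 32 bits. A self-contained sketch of that packing, plus the corresponding unpacking (the unpacking is not in the deleted code and is shown only to make the layout explicit):

#include <cstdint>
#include <iostream>

// High 32 bits: op-model index; low 32 bits: node id (mirrors the lambda in the JsonGraph test).
static std::uint64_t create_op_model_id(std::uint32_t node_id, std::uint32_t op_model_index)
{
    return (static_cast<std::uint64_t>(op_model_index) << 32) | node_id;
}

int main()
{
    std::uint64_t id = create_op_model_id(10838, 3);
    std::uint32_t node_id = static_cast<std::uint32_t>(id & 0xffffffffu);  // low 32 bits
    std::uint32_t op_model_index = static_cast<std::uint32_t>(id >> 32);   // high 32 bits
    std::cout << node_id << " " << op_model_index << "\n";                 // prints "10838 3"
}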
- // - for (const auto& pre_rewind_placement : pre_rewind_placements) - { - if (pre_rewind_placement.first == std::prev(op_to_grid_shape.end())->first) - { - continue; - } - - auto post_rewind_placement = - interactive_placer.get_current_name_to_op_placement().find(pre_rewind_placement.first); - ASSERT_TRUE(post_rewind_placement != interactive_placer.get_current_name_to_op_placement().end()); - EXPECT_EQ(pre_rewind_placement.second, post_rewind_placement->second); - } -} - -// Unit test for InteractivePlacer::place_op for multi_chip -// -TEST_F(InteractivePlacerSanity, chip_id_override) -{ - const std::vector chip_ids = {0, 1, 2, 3}; - balancer::BalancerConfig balancer_config = create_balancer_config(Arch::Wormhole_b0, chip_ids, balancer::PolicyType::Ribbon); - placer::InteractivePlacer interactive_placer(nullptr /*graph*/, balancer_config); - std::map op_to_grid_shape = { - {"op1", placer::GridShape(8, 8)}, - {"op2", placer::GridShape(8, 8)}, - {"op3", placer::GridShape(8, 8)}, - {"op4", placer::GridShape(8, 8)}, - }; - - interactive_placer.get_op_overrides()["op1"].chip_id = 3; - interactive_placer.get_op_overrides()["op2"].chip_id = 2; - interactive_placer.get_op_overrides()["op3"].chip_id = 1; - - for (const auto& to_place : op_to_grid_shape) - { - std::optional op_placement = interactive_placer.place_op(to_place.first, to_place.second); - ASSERT_TRUE(op_placement.has_value()); - } - - const unordered_map& name_to_op_placement = interactive_placer.get_current_name_to_op_placement(); - EXPECT_EQ(name_to_op_placement.at("op1").chip_id, 3); - EXPECT_EQ(name_to_op_placement.at("op2").chip_id, 2); - EXPECT_EQ(name_to_op_placement.at("op3").chip_id, 1); - EXPECT_EQ(name_to_op_placement.at("op4").chip_id, 0); - EXPECT_EQ(interactive_placer.get_current_epoch_index(), 3); -} - -// Unit test for ChipPlacementPolicy::SNAKE - order the given chip_ids on a snake pattern based on budabackend/cluster_desc.yaml -// -TEST_F(InteractivePlacerSanity, chip_id_galaxy_snake) -{ - const std::vector chip_ids = {1, 2, 18, 25, 19, 24, 20, 23}; - balancer::BalancerConfig balancer_config = create_balancer_config( - Arch::Wormhole_b0, - chip_ids, - balancer::PolicyType::Ribbon, - "pybuda/test/galaxy/one_shelf_eth_connections.yaml", - "pybuda/test/galaxy/one_shelf_runtime_params.yaml", - placer::ChipPlacementPolicy::SNAKE - ); - placer::InteractivePlacer interactive_placer(nullptr /*graph*/, balancer_config); - std::map op_to_grid_shape = { - {"op1", placer::GridShape(8, 8)}, - {"op2", placer::GridShape(8, 8)}, - {"op3", placer::GridShape(8, 8)}, - {"op4", placer::GridShape(8, 8)}, - {"op5", placer::GridShape(8, 8)}, - {"op6", placer::GridShape(8, 8)}, - {"op7", placer::GridShape(8, 8)}, - {"op8", placer::GridShape(8, 8)}, - }; - - for (const auto& to_place : op_to_grid_shape) - { - std::optional op_placement = interactive_placer.place_op(to_place.first, to_place.second); - ASSERT_TRUE(op_placement.has_value()); - } - - const unordered_map& name_to_op_placement = interactive_placer.get_current_name_to_op_placement(); - EXPECT_EQ(name_to_op_placement.at("op1").chip_id, 1); - EXPECT_EQ(name_to_op_placement.at("op2").chip_id, 25); - EXPECT_EQ(name_to_op_placement.at("op3").chip_id, 24); - EXPECT_EQ(name_to_op_placement.at("op4").chip_id, 23); - EXPECT_EQ(name_to_op_placement.at("op5").chip_id, 20); - EXPECT_EQ(name_to_op_placement.at("op6").chip_id, 19); - EXPECT_EQ(name_to_op_placement.at("op7").chip_id, 18); - EXPECT_EQ(name_to_op_placement.at("op8").chip_id, 2); - 
EXPECT_EQ(interactive_placer.get_current_epoch_index(), 7); -} - -struct MultiLayerGraph : public BudaGraphTest -{ - protected: - std::string layer_string(int chip_id) const { return "layer." + std::to_string(chip_id); } - - virtual std::vector create_graph() override - { - std::uint32_t seq_len = 128; - std::uint32_t embed = 128; - std::uint32_t hidden = 128; - - auto act = create_activation(shape(1, 1, seq_len, embed)); - auto w0 = create_parameter(shape(1, 1, embed, hidden)); - auto w1 = create_parameter(shape(1, 1, hidden, embed)); - auto w2 = create_parameter(shape(1, 1, hidden, embed)); - - auto e0 = create_op("matmul", {act, w0}); - auto g0 = create_op("gelu", {e0}); - auto e1 = create_op("matmul", {g0, w1}); - auto g1 = create_op("gelu", {e1}); - auto e2 = create_op("matmul", {g1, w2}); - auto g2 = create_op("gelu", {e2}); - - e0->tag("layer", layer_string(chip_ids[0])); - g0->tag("layer", layer_string(chip_ids[0])); - e1->tag("layer", layer_string(chip_ids[1])); - g1->tag("layer", layer_string(chip_ids[1])); - e2->tag("layer", layer_string(chip_ids[2])); - g2->tag("layer", layer_string(chip_ids[2])); - - return {g2}; - } - - const std::vector chip_ids = {0, 1, 2}; -}; - -// Unit test for InteractivePlacer::place_op using layer predicate -// -TEST_F(MultiLayerGraph, chip_id_layer_override) -{ - graphlib::Graph* graph = get_graph(); - - balancer::BalancerConfig balancer_config = - create_balancer_config(Arch::Wormhole_b0, chip_ids, balancer::PolicyType::Ribbon); - balancer_config.op_name_to_placer_overrides = placer::match_op_names_to_placer_overrides( - graph, - { - {graphlib::query::layer_regex("l.*\\.0"), placer::PlacerOpOverride::override_chip_id(chip_ids.at(0))}, - {graphlib::query::layer_regex("l.*\\.1"), placer::PlacerOpOverride::override_chip_id(chip_ids.at(1))}, - {graphlib::query::layer_regex("l.*\\.2"), placer::PlacerOpOverride::override_chip_id(chip_ids.at(2))}, - }); - - placer::InteractivePlacer interactive_placer(graph, balancer_config); - - for (auto* node : graphlib::topological_sort(*graph)) - { - if (node->node_type() != graphlib::NodeType::kBudaOp) - continue; - std::optional op_placement = interactive_placer.place_op(node->name(), GridShape(1, 1)); - ASSERT_TRUE(op_placement.has_value()); - } - - const unordered_map& name_to_op_placement = interactive_placer.get_current_name_to_op_placement(); - - for (auto* node : graphlib::topological_sort(*graph)) - { - if (node->node_type() != graphlib::NodeType::kBudaOp) - continue; - - int chip_id = name_to_op_placement.at(node->name()).chip_id; - std::string expected_layer_string = node->as()->tag_value("layer"); - std::string actual_layer_string = layer_string(chip_id); - ASSERT_EQ(actual_layer_string, expected_layer_string); - } -} - -TEST_F(InteractivePlacerSanity, nebula_grid_8x8) -{ - ASSERT_FALSE(getenv("PYBUDA_NEBULA_GALAXY_PLACER")); - setenv("PYBUDA_NEBULA_GALAXY_PLACER", "1", 0); - const std::vector chip_ids = {0}; - balancer::BalancerConfig balancer_config = create_balancer_config(Arch::Wormhole_b0, chip_ids, balancer::PolicyType::Ribbon); - placer::InteractivePlacer interactive_placer(nullptr /*graph*/, balancer_config); - std::map op_to_grid_shape = { - {"op1", placer::GridShape(9, 1)}, - }; - - interactive_placer.get_op_overrides()["op1"].chip_id = 0; - - std::optional op_placement = interactive_placer.place_op("op1", op_to_grid_shape["op1"]); - - // cannot fit on nebula 8x8 grid - EXPECT_EQ(op_placement.has_value(), false); - unsetenv("PYBUDA_NEBULA_GALAXY_PLACER"); -} - -} // namespace tt::test diff --git 
a/pybuda/csrc/balancer/tests/test_op_override.cpp b/pybuda/csrc/balancer/tests/test_op_override.cpp deleted file mode 100644 index b7b46a56f..000000000 --- a/pybuda/csrc/balancer/tests/test_op_override.cpp +++ /dev/null @@ -1,52 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 - -#include "balancer/balancer_config.hpp" -#include "balancer/types.hpp" -#include "gtest/gtest.h" - -namespace tt::test -{ -using OpOverride = tt::balancer::OpOverride; - -TEST(TestOpOverride, no_override1) -{ - // Test that default override doesn't change anything - tt::balancer::FactorizedShape shape = std::make_pair(10, 16); - bool force_dram_parameters_out = false; - std::vector t_stream_dirs = {}; - tt::balancer::FactorizedShape streaming_pars = std::make_pair(8, 12); - bool enable_t_streaming = false; - - OpOverride blank; - blank.apply(shape, force_dram_parameters_out, t_stream_dirs, streaming_pars, enable_t_streaming, "foo"); - - EXPECT_EQ(shape, tt::balancer::FactorizedShape(10, 16)); - EXPECT_EQ(streaming_pars, tt::balancer::FactorizedShape(8, 12)); - EXPECT_EQ(force_dram_parameters_out, false); - EXPECT_EQ(t_stream_dirs.size(), 0); - EXPECT_EQ(enable_t_streaming, false); -} - -TEST(TestOpOverride, no_override2) -{ - // Test that default override doesn't change anything - tt::balancer::FactorizedShape shape = std::make_pair(2, 4); - bool force_dram_parameters_out = true; - std::vector t_stream_dirs = {tt::balancer::TStreamDir::R}; - tt::balancer::FactorizedShape streaming_pars = std::make_pair(6, 1); - bool enable_t_streaming = true; - - OpOverride blank; - blank.apply(shape, force_dram_parameters_out, t_stream_dirs, streaming_pars, enable_t_streaming, "foo"); - - EXPECT_EQ(shape, tt::balancer::FactorizedShape(2, 4)); - EXPECT_EQ(streaming_pars, tt::balancer::FactorizedShape(6, 1)); - EXPECT_EQ(force_dram_parameters_out, true); - EXPECT_EQ(t_stream_dirs.size(), 1); - EXPECT_EQ(t_stream_dirs[0], tt::balancer::TStreamDir::R); - EXPECT_EQ(enable_t_streaming, true); -} - -} diff --git a/pybuda/csrc/balancer/tests/test_tile_layout.cpp b/pybuda/csrc/balancer/tests/test_tile_layout.cpp deleted file mode 100644 index 9039144ad..000000000 --- a/pybuda/csrc/balancer/tests/test_tile_layout.cpp +++ /dev/null @@ -1,686 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include -#include - -#include "balancer/balancer_utils.hpp" -#include "balancer/types.hpp" -#include "graph_lib/utils.hpp" -#include "gtest/gtest.h" -#include "net2pipe/tile_maps.h" -#include "utils/logger.hpp" - -namespace tt::test -{ -using namespace balancer; -using graphlib::UBlockOrder; - -static std::vector factorize(int begin, int f) -{ - std::vector factors; - for (int i = begin; i <= f; ++i) - if (i % f == 0) - factors.push_back(i); - return factors; -} - -static std::vector factorize(int f) { return factorize(1, f); } - -template -graphlib::OpType tm(std::string const& type, Attrs... attrs) -{ - return graphlib::OpType(type, {attrs...}, {}); -} - -template -graphlib::OpType vslice(Attrs... attrs) -{ - return tm("vslice", attrs...); -} - -template -graphlib::OpType hslice(Attrs... attrs) -{ - return tm("hslice", attrs...); -} - -template -graphlib::OpType vstack(Attrs... attrs) -{ - return tm("vstack", attrs...); -} - -template -graphlib::OpType hstack(Attrs... attrs) -{ - return tm("hstack", attrs...); -} - -template -graphlib::OpType broadcast(Attrs... 
attrs) -{ - return tm("broadcast", attrs...); -} - -template -graphlib::OpType tile_broadcast(Attrs... attrs) -{ - return tm("tile_broadcast", attrs...); -} - -inline graphlib::OpType transpose() -{ - return graphlib::OpType("transpose", {}, {}, {{"dim0", 2}, {"dim1", 3}, {"z_dim_slice", -1}}); -} - -template -graphlib::OpType buda_pad(Attrs... attrs) -{ - return tm("buda_pad", attrs...); -} - -template -graphlib::OpType buda_unpad(Attrs... attrs) -{ - return tm("buda_unpad", attrs...); -} - -static std::tuple get_net2pipe_resource_usage( - GridShape producer_grid_shape, - BlockShape producer_block_shape, - graphlib::UBlockOrder producer_ublock_order, - int producer_out_buf_mb, - std::vector const& tms, - std::string const& consumer_op_type, - int consumer_input_port_id, - GridShape consumer_grid_shape, - BlockShape consumer_block_shape, - graphlib::UBlockOrder consumer_ublock_order, - std::string const& producer_name = "producer", - std::string const& consumer_name = "consumer") -{ - three_d_array_tile_src_map tile_map( - producer_name, - consumer_name, - producer_block_shape.t, - producer_block_shape.ublock.rt, - producer_block_shape.ublock.ct, - producer_block_shape.mblock_m, - producer_block_shape.mblock_n, - producer_grid_shape.r, - producer_grid_shape.c, - producer_out_buf_mb / 2, - producer_ublock_order == graphlib::UBlockOrder::R); - - for (graphlib::OpType const& tm : tms) - { - if (tm.op == "tile_broadcast") - continue; - - if (tm.op == "broadcast") - { - int dim = std::get(tm.attr[0]); - std::string dims[] = {"w", "z", "r", "c"}; - tile_map = tile_map.apply_tm(dims[dim] + "_" + tm.op, {std::get(tm.attr[1])}); - } - else if (tm.op == "buda_pad") - { - int rt = std::get(tm.attr[0]); - int ct = std::get(tm.attr[1]); - tile_map = tile_map.pad(rt, ct); - } - else if (tm.op == "buda_unpad") - { - int rt = std::get(tm.attr[0]); - int ct = std::get(tm.attr[1]); - tile_map = tile_map.unpad(rt, ct); - } - else if (tm.op == "transpose") - { - tile_map = tile_map.apply_tm(tm.op, {}); - } - else - { - tile_map = tile_map.apply_tm(tm.op, {std::get(tm.attr[0])}); - } - } - - constexpr int kernel_broadcast_tiles = 0; - - consumer_to_producer_tile_map edge_tile_map; - if (consumer_op_type == "matmul") - { - if (consumer_input_port_id == 0) - { - edge_tile_map = tile_map.get_op_matmul_row_input( - kernel_broadcast_tiles, - false /*kernel_bcast_tiles_per_t*/, - consumer_block_shape.t, - consumer_block_shape.ublock.rt, - consumer_block_shape.ublock.ct, - consumer_block_shape.mblock_m, - consumer_block_shape.mblock_n, - consumer_grid_shape.r, - consumer_grid_shape.c); - } - else - { - edge_tile_map = tile_map.get_op_matmul_col_input( - kernel_broadcast_tiles, - false /*kernel_bcast_tiles_per_t*/, - consumer_block_shape.t, - consumer_block_shape.ublock.rt, - consumer_block_shape.ublock.ct, - consumer_block_shape.mblock_m, - consumer_block_shape.mblock_n, - consumer_grid_shape.r, - consumer_grid_shape.c); - } - } - else - { - edge_tile_map = tile_map.get_op_eltwise_input( - kernel_broadcast_tiles, - false /*kernel_bcast_tiles_per_t*/, - consumer_block_shape.t, - consumer_block_shape.ublock.rt, - consumer_block_shape.ublock.ct, - consumer_block_shape.mblock_m, - consumer_block_shape.mblock_n, - consumer_grid_shape.r, - consumer_grid_shape.c, - consumer_ublock_order == graphlib::UBlockOrder::R); - } - - return std::make_tuple( - edge_tile_map.max_producer_core_fan_out(), - edge_tile_map.max_producer_core_phases(), - edge_tile_map.max_consumer_core_phases()); -} - -void test_inverse(CanCoord c0, 
TensorShape s0, std::vector const& tms) -{ - auto [c1, s1] = map_inverse_tms(c0, s0, tms); - ASSERT_EQ(c1, c0); - ASSERT_EQ(s1, s0); -} - -TEST(TileLayoutTest, test_tm_inverses) -{ - std::vector> inverses = { - {"vslice", "vstack"}, - {"vstack", "vslice"}, - {"hslice", "hstack"}, - {"hstack", "hslice"}, - {"transpose", "transpose"}, - }; - - int max_t = 8; - int max_r = 8; - int max_c = 8; - - for (auto [from, to] : inverses) - for (int tdim = 1; tdim <= max_t; ++tdim) - for (int rdim = 1; rdim <= max_r; ++rdim) - for (int cdim = 1; cdim <= max_c; ++cdim) - for (int t = 0; t < tdim; ++t) - for (int r = 0; r < rdim; ++r) - for (int c = 0; c < cdim; ++c) - { - TensorShape shape(1, tdim, rdim, cdim); - CanCoord coord(t, r, c); - if (from == "transpose") - { - test_inverse(coord, shape, {transpose(), transpose()}); - } - else - { - int dim = (from == "vstack" or from == "hstack") ? tdim - : (from == "vslice") ? rdim - : cdim; - for (int f : factorize(dim)) - { - test_inverse(coord, shape, {tm(from, f), tm(to, f)}); - } - } - } -} - -TEST(TileLayoutTest, test_tile_layout) -{ - int max_t = 16; - int max_r = 16; - int max_c = 16; - for (int tdim = 1; tdim <= max_t; ++tdim) - for (int rdim = 1; rdim <= max_r; ++rdim) - for (int cdim = 1; cdim <= max_c; ++cdim) - for (int grid_r : factorize(rdim)) - for (int grid_c : factorize(cdim)) - for (int ublock_r : factorize(rdim / grid_r)) - for (int ublock_c : factorize(cdim / grid_c)) - for (auto ublock_order : std::vector{UBlockOrder::R, UBlockOrder::C}) - { - ASSERT_EQ(rdim % (grid_r * ublock_r), 0); - ASSERT_EQ(cdim % (grid_c * ublock_c), 0); - int m = rdim / (grid_r * ublock_r); - int n = cdim / (grid_c * ublock_c); - TileLayout layout( - GridShape(grid_r, grid_c), - BlockShape(tdim, m, n, UBlockShape(ublock_r, ublock_c)), - ublock_order); - for (int t = 0; t < tdim; ++t) - for (int r = 0; r < rdim; ++r) - for (int c = 0; c < cdim; ++c) - { - CanCoord coord(t, r, c); - LinCoord lin = layout.map(coord); - CanCoord coord1 = layout.map(lin); - ASSERT_EQ(coord, coord1); - } - } -} - -template -T sample(std::mt19937& gen, std::vector v, int start = 0) -{ - TT_ASSERT(not v.empty()); - if ((int)v.size() <= (start + 1)) - return v.front(); - std::uniform_int_distribution<> distrib(start, v.size() - 1); - return v[distrib(gen)]; -} - -static int randint(std::mt19937& gen, int from, int to) -{ - std::uniform_int_distribution<> distrib(from, to); - return distrib(gen); -} - -static std::vector rand_tm_types(std::mt19937& gen, int num_samples) -{ - static std::vector types = { - "broadcast", - "vslice", - "hslice", - "vstack", - "hstack", - "transpose", - }; - - std::vector selected; - selected.reserve(num_samples); - for (int i = 0; i < num_samples; ++i) selected.push_back(sample(gen, types)); - return selected; -} - -std::vector rand_tms(std::mt19937& gen, TensorShape shape, int num_samples) -{ - int max_bcast = 6; - auto types = rand_tm_types(gen, num_samples); - std::vector tms; - tms.reserve(types.size()); - for (auto type : types) - { - if (type == "broadcast") - { - int dim = randint(gen, 1, 3); - int factor = randint(gen, 2, max_bcast); - if (dim == 1) - shape.z *= factor; - else if (dim == 2) - shape.rt *= factor; - else if (dim == 3) - shape.ct *= factor; - if (factor > 1) - tms.push_back(tm(type, dim, factor)); - } - else if (type == "vslice") - { - int factor = sample(gen, factorize(shape.rt), 2); - TT_ASSERT(shape.rt % factor == 0); - shape.z *= factor; - shape.rt /= factor; - if (factor > 1) - tms.push_back(tm(type, factor)); - } - else if (type == 
"hslice") - { - int factor = sample(gen, factorize(shape.ct), 2); - TT_ASSERT(shape.ct % factor == 0); - shape.z *= factor; - shape.ct /= factor; - if (factor > 1) - tms.push_back(tm(type, factor)); - } - else if (type == "vstack") - { - int factor = sample(gen, factorize(shape.z), 2); - TT_ASSERT(shape.z % factor == 0); - shape.rt *= factor; - shape.z /= factor; - if (factor > 1) - tms.push_back(tm(type, factor)); - } - else if (type == "hstack") - { - int factor = sample(gen, factorize(shape.z), 2); - TT_ASSERT(shape.z % factor == 0); - shape.ct *= factor; - shape.z /= factor; - if (factor > 1) - tms.push_back(tm(type, factor)); - } - else if (type == "transpose") - { - std::swap(shape.rt, shape.ct); - tms.push_back(transpose()); - } - } - return tms; -} - -static graphlib::Shape to_shape(TensorShape shape) -{ - return graphlib::Shape::create_buda( - shape.w, shape.z, shape.rt * graphlib::Shape::BUDA_TILE_DIM, shape.ct * graphlib::Shape::BUDA_TILE_DIM); -} - -TileLayout random_layout(std::mt19937& gen, TensorShape shape) -{ - int grid_r = sample(gen, factorize(shape.rt)); - int grid_c = sample(gen, factorize(shape.ct)); - int ublock_r = sample(gen, factorize(shape.rt / grid_r)); - int ublock_c = sample(gen, factorize(shape.ct / grid_c)); - int m = shape.rt / (grid_r * ublock_r); - int n = shape.ct / (grid_c * ublock_c); - auto ublock_order = sample(gen, std::vector{UBlockOrder::R, UBlockOrder::C}); - return TileLayout( - GridShape(grid_r, grid_c), BlockShape(shape.z, m, n, UBlockShape(ublock_r, ublock_c)), ublock_order); -} - -TEST(TileLayoutTest, test_tile_layout_random) -{ - static std::mt19937 gen; - - int start_seed = 0; - int num_tests = 1024; - int max_t = 16; - int max_r = 16; - int max_c = 16; - int max_tms = 4; - - for (int seed = start_seed; seed < (num_tests + start_seed); ++seed) - { - gen.seed(seed); - - int tdim = randint(gen, 1, max_t); - int rdim = randint(gen, 1, max_r); - int cdim = randint(gen, 1, max_c); - TensorShape producer_shape(1, tdim, rdim, cdim); - auto tms = rand_tms(gen, producer_shape, randint(gen, 0, max_tms)); - TensorShape consumer_shape = graphlib::post_tms_shape(to_shape(producer_shape), tms); - - TileLayout producer_layout = random_layout(gen, producer_shape); - TileLayout consumer_layout = random_layout(gen, consumer_shape); - - for (int grid_r = 0; grid_r < consumer_layout.grid_shape.r; ++grid_r) - { - for (int grid_c = 0; grid_c < consumer_layout.grid_shape.c; ++grid_c) - { - int max_core_t = 0; - for (int address = 0; address < consumer_layout.block_shape.volume(); ++address) - { - LinCoord consumer_linear(grid_r, grid_c, address); - CanCoord consumer_coord = consumer_layout.map(consumer_linear); - auto [producer_coord, p_shape] = map_inverse_tms(consumer_coord, consumer_shape, tms); - EXPECT_EQ(p_shape, producer_shape); - LinCoord producer_linear = producer_layout.map(producer_coord); - EXPECT_GE(producer_linear.address(), 0); - - if (address < consumer_layout.block_shape.volume_no_t()) - max_core_t = std::max(max_core_t, producer_coord.t); - } - } - } - } -} - -#if 0 -TEST(TileLayoutTest, test_tile_layout_perf) -{ - static std::mt19937 gen; - - int start_seed = 0; - int num_tests = 4098; - int max_t = 16; - int max_r = 64; - int max_c = 64; - int max_tms = 4; - - volatile int* ptr = (volatile int*)malloc(sizeof(int)); - for (int seed = start_seed; seed < (num_tests + start_seed); ++seed) - { - gen.seed(seed); - - int tdim = randint(gen, 1, max_t); - int rdim = randint(gen, 1, max_r); - int cdim = randint(gen, 1, max_c); - TensorShape 
producer_shape(1, tdim, rdim, cdim); - auto tms = rand_tms(gen, producer_shape, randint(gen, 0, max_tms)); - TensorShape consumer_shape = graphlib::post_tms_shape(to_shape(producer_shape), tms); - - TileLayout producer_layout = random_layout(gen, producer_shape); - TileLayout consumer_layout = random_layout(gen, consumer_shape); - Pipe pipe(producer_layout, 2, tms, consumer_layout); - auto resource_usage = get_edge_resource_usage(pipe); - *ptr = resource_usage.producer_fan_out; - *ptr = resource_usage.consumer_fan_in; - *ptr = resource_usage.producer_phases; - *ptr = resource_usage.consumer_phases; - } -} -#endif - -class PipeTest : public ::testing::TestWithParam -{ -}; - -TEST_P(PipeTest, test_tile_layout_targeted) -{ - std::unordered_map dummy_cache; - auto pipe = GetParam(); - ResourceUsage usage = get_edge_resource_usage( - dummy_cache, Pipe(pipe.producer_layout, pipe.producer_out_buf_mb, pipe.tms, pipe.consumer_layout)); - - auto [n2p_producer_core_fan_out, n2p_producer_core_phases, n2p_consumer_core_phases] = get_net2pipe_resource_usage( - pipe.producer_layout.grid_shape, - pipe.producer_layout.block_shape, - pipe.producer_layout.ublock_order, - pipe.producer_out_buf_mb, - pipe.tms, - "eltwise", - 0, - pipe.consumer_layout.grid_shape, - pipe.consumer_layout.block_shape, - pipe.consumer_layout.ublock_order); - - EXPECT_EQ(usage.producer_fan_out, n2p_producer_core_fan_out); - // GE for now because we don't account for some special cases that can loop - EXPECT_GE(usage.producer_phases, n2p_producer_core_phases); - EXPECT_EQ(usage.consumer_phases, n2p_consumer_core_phases); - - log_debug(LogTest, "Test:"); - log_debug(LogTest, " producer:"); - log_debug(LogTest, " {}", pipe.producer_layout.grid_shape); - log_debug(LogTest, " {}", pipe.producer_layout.block_shape); - log_debug(LogTest, " {}", pipe.producer_layout.ublock_order); - log_debug(LogTest, " tms: {}", pipe.tms); - log_debug(LogTest, " consumer:"); - log_debug(LogTest, " {}", pipe.consumer_layout.grid_shape); - log_debug(LogTest, " {}", pipe.consumer_layout.block_shape); - log_debug(LogTest, " {}", pipe.consumer_layout.ublock_order); - log_debug(LogTest, "Result:"); - log_debug(LogTest, " calculated: {} {} {}", usage.producer_fan_out, usage.producer_phases, usage.consumer_phases); - log_debug( - LogTest, - " net2pipe: {} {} {}", - n2p_producer_core_fan_out, - n2p_producer_core_phases, - n2p_consumer_core_phases); - - log_debug(LogTest, " fan_in: {}", usage.consumer_fan_in); -} - -INSTANTIATE_TEST_SUITE_P( - TileLayoutTest, - PipeTest, - testing::Values( - Pipe( - TileLayout(GridShape(1, 1), BlockShape(1, 8, 8, UBlockShape(1, 1)), UBlockOrder::R), - 2, - {}, - TileLayout(GridShape(1, 1), BlockShape(1, 8, 8, UBlockShape(1, 1)), UBlockOrder::R)), - Pipe( - TileLayout(GridShape(2, 2), BlockShape(1, 16, 16, UBlockShape(1, 1)), UBlockOrder::R), - 2, - {}, - TileLayout(GridShape(1, 1), BlockShape(1, 32, 32, UBlockShape(1, 1)), UBlockOrder::R)), - - Pipe( - TileLayout(GridShape(2, 1), BlockShape(2, 32, 1, UBlockShape(1, 1)), UBlockOrder::R), - 2, - {tm("vstack", 2)}, - TileLayout(GridShape(2, 1), BlockShape(1, 64, 1, UBlockShape(1, 1)), UBlockOrder::R)), - - Pipe( - TileLayout(GridShape(6, 2), BlockShape(1, 96, 1, UBlockShape(1, 1)), UBlockOrder::R), - 2, - {tm("vslice", 12)}, - TileLayout(GridShape(1, 1), BlockShape(12, 48, 1, UBlockShape(1, 2)), UBlockOrder::R)), - - Pipe( - TileLayout(GridShape(1, 1), BlockShape(1, 1, 1, UBlockShape(1, 1)), UBlockOrder::R), - 2, - {}, - TileLayout(GridShape(1, 1), BlockShape(1, 1, 1, UBlockShape(1, 
1)), UBlockOrder::R)), - - Pipe( - TileLayout(GridShape(1, 1), BlockShape(1, 16, 16, UBlockShape(1, 1)), UBlockOrder::R), - 2, - {}, - TileLayout(GridShape(1, 1), BlockShape(1, 16, 16, UBlockShape(1, 1)), UBlockOrder::C)), - - Pipe( - TileLayout(GridShape(2, 2), BlockShape(14, 112, 1, UBlockShape(1, 1)), graphlib::UBlockOrder::R), - 2, - {vslice(16), hstack(16)}, - TileLayout(GridShape(1, 1), BlockShape(14, 14, 2, UBlockShape(1, 16)), graphlib::UBlockOrder::C)), - - Pipe( - TileLayout(GridShape(8, 8), BlockShape(1, 16, 1, UBlockShape(1, 4)), graphlib::UBlockOrder::R), - 2, - {vslice(64)}, - TileLayout(GridShape(1, 1), BlockShape(64, 2, 8, UBlockShape(1, 4)), graphlib::UBlockOrder::C)), - Pipe( - TileLayout(GridShape(1, 4), BlockShape(1, 25, 1, UBlockShape(1, 1)), graphlib::UBlockOrder::R), - 2, - {}, - TileLayout(GridShape(1, 1), BlockShape(1, 25, 2, UBlockShape(1, 2)), graphlib::UBlockOrder::C)), - Pipe( - TileLayout(GridShape(9, 2), BlockShape(1, 98, 1, UBlockShape(1, 1)), graphlib::UBlockOrder::R), - 2, - {vslice(9), hstack(9)}, - TileLayout(GridShape(7, 1), BlockShape(1, 14, 18, UBlockShape(1, 1)), graphlib::UBlockOrder::C)), - Pipe( - TileLayout(GridShape(3, 4), BlockShape(1, 75, 1, UBlockShape(1, 1)), graphlib::UBlockOrder::R), - 2, - {vslice(9), hstack(9)}, - TileLayout(GridShape(1, 1), BlockShape(1, 25, 1, UBlockShape(1, 36)), graphlib::UBlockOrder::C)), - Pipe( - TileLayout(GridShape(10, 2), BlockShape(16, 6, 1, UBlockShape(1, 1)), graphlib::UBlockOrder::R), - 2, - {hstack(16)}, - TileLayout(GridShape(1, 1), BlockShape(1, 60, 16, UBlockShape(1, 2)), graphlib::UBlockOrder::C)), - Pipe( - TileLayout(GridShape(8, 2), BlockShape(1, 64, 1, UBlockShape(1, 1)), graphlib::UBlockOrder::R), - 2, - {transpose(), hslice(512), vstack(512)}, - TileLayout(GridShape(1, 1), BlockShape(1, 64, 1, UBlockShape(16, 1)), graphlib::UBlockOrder::R)), - Pipe( - TileLayout(GridShape(4, 3), BlockShape(1, 32, 3, UBlockShape(1, 1)), graphlib::UBlockOrder::R), - 2, - {hslice(9), vstack(9)}, - TileLayout(GridShape(1, 1), BlockShape(1, 24, 1, UBlockShape(48, 1)), graphlib::UBlockOrder::R)), - Pipe( - TileLayout(GridShape(1, 1), BlockShape(1, 5, 1, UBlockShape(2, 1)), graphlib::UBlockOrder::R), - 2, - {broadcast(3, 128), vslice(5)}, - TileLayout(GridShape(1, 2), BlockShape(5, 1, 16, UBlockShape(2, 4)), graphlib::UBlockOrder::R)), - // operand[0] layernorm_251.dc.add.14 -> matmul_256 - Pipe( - TileLayout(GridShape(2, 5), BlockShape(1, 32, 1, UBlockShape(2, 2)), graphlib::UBlockOrder::C), - 2, - {broadcast(2, 10), vslice(10)}, - TileLayout(GridShape(1, 1), BlockShape(10, 64, 1, UBlockShape(2, 10)), graphlib::UBlockOrder::C)), - // operand[2] unet.down_blocks.0.attentions.0.transformer_blocks.0.ff.net.0.proj.bias_0 -> matmul_119 - Pipe( - TileLayout(GridShape(1, 1), BlockShape(1, 1, 40, UBlockShape(1, 1)), graphlib::UBlockOrder::R), - 2, - {broadcast(2, 128), hslice(40)}, - TileLayout(GridShape(1, 1), BlockShape(40, 64, 1, UBlockShape(2, 1)), graphlib::UBlockOrder::R)), - // Pipe( - // TileLayout(GridShape(1, 1), BlockShape(12, 48, 1, UBlockShape(1, 2)), graphlib::UBlockOrder::R), - // 2, - // {vstack(12)}, - // TileLayout(GridShape(6, 2), BlockShape(1, 96, 1, UBlockShape(1, 1)), graphlib::UBlockOrder::R)), - // operand[0] dc.input_tensor.resize2d_204.2 -> resize2d_204.dc.matmul.3 - Pipe( - TileLayout(GridShape(8, 1), BlockShape(1, 49, 98, UBlockShape(1, 1)), graphlib::UBlockOrder::C), - 2, - {vslice(49)}, - TileLayout(GridShape(8, 1), BlockShape(49, 1, 98, UBlockShape(1, 1)), graphlib::UBlockOrder::C)), - Pipe( - 
TileLayout(GridShape(1, 1), BlockShape(4, 4, 1, UBlockShape(1, 1)), graphlib::UBlockOrder::R), - 2, - {tile_broadcast(3), broadcast(3, 4, 0)}, - TileLayout(GridShape(1, 1), BlockShape(4, 4, 1, UBlockShape(1, 4)), graphlib::UBlockOrder::C)), - Pipe( - TileLayout(GridShape(4, 8), BlockShape(1, 4, 4, UBlockShape(1, 4)), graphlib::UBlockOrder::R), - 2, - {transpose(), vslice(128), hslice(16), vstack(2048)}, - TileLayout(GridShape(1, 1), BlockShape(1, 64, 1, UBlockShape(32, 1)), graphlib::UBlockOrder::R)), - Pipe( - TileLayout(GridShape(1, 1), BlockShape(1, 1, 1, UBlockShape(1, 1)), graphlib::UBlockOrder::R), - 2, - {broadcast(3, 2048, false)}, - TileLayout(GridShape(1, 1), BlockShape(1, 1, 2048, UBlockShape(1, 1)), graphlib::UBlockOrder::C)), - - Pipe( - TileLayout(GridShape(1, 2), BlockShape(16, 6, 1, UBlockShape(2, 1)), graphlib::UBlockOrder::R), - 32, - {hstack(16)}, - TileLayout(GridShape(1, 1), BlockShape(1, 2, 8, UBlockShape(6, 4)), graphlib::UBlockOrder::R)), - - // Padding - Pipe( - TileLayout( - GridShape(1, 1), BlockShape(1, 10, 5, UBlockShape(1, 2)), graphlib::UBlockOrder::R, Padding(2, 2)), - 2, - {buda_unpad(2, 2, 256, 256)}, - TileLayout(GridShape(1, 1), BlockShape(1, 8, 2, UBlockShape(1, 4)), graphlib::UBlockOrder::R)), - Pipe( - TileLayout( - GridShape(1, 1), BlockShape(1, 10, 5, UBlockShape(1, 2)), graphlib::UBlockOrder::R, Padding(9, 9)), - 2, - {buda_unpad(9, 9, 12, 12)}, - TileLayout(GridShape(1, 1), BlockShape(1, 1, 1, UBlockShape(1, 1)), graphlib::UBlockOrder::R)) - // Pipe( - // TileLayout(GridShape(1, 1), BlockShape(1, 1, 1, UBlockShape(1, 1)), graphlib::UBlockOrder::R), - // 2, - // {buda_pad(9, 9, 12, 12)}, - // TileLayout( - // GridShape(1, 1), BlockShape(1, 10, 5, UBlockShape(1, 2)), graphlib::UBlockOrder::R, Padding(9, 9))) - )); - -} // namespace tt::test diff --git a/pybuda/csrc/balancer/types.cpp b/pybuda/csrc/balancer/types.cpp deleted file mode 100644 index 7bab5e5a9..000000000 --- a/pybuda/csrc/balancer/types.cpp +++ /dev/null @@ -1,682 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "balancer/types.hpp" - -#include -#include - -#include "balancer/balancer_utils.hpp" -#include "balancer/python_interface.hpp" -#include "graph_lib/node_types.hpp" -#include "lower_to_buda/common.hpp" -#include "passes/fuse_ops.hpp" -#include "utils/assert.hpp" -#include "utils/logger.hpp" - -namespace tt::balancer -{ -std::uint64_t UniqueId::next_id = 0; - -TensorShape::TensorShape(graphlib::Shape const &shape) : - w((int)shape.w()), z((int)shape.z()), rt((int)shape.rt()), ct((int)shape.ct()) -{ -} - -int const &TensorShape::operator[](int i) const { return (*const_cast(this))[i]; } - -int &TensorShape::operator[](int i) -{ - if (i < 0) - i += 4; - TT_ASSERT(i <= 3); - TT_ASSERT(i >= 0); - switch (i) - { - case 0: return w; - case 1: return z; - case 2: return rt; - case 3: return ct; - default: TT_ASSERT(false); return w; - } -} - -// -// BlockShape -// - -BlockShape::BlockShape(TensorShape tensor_shape, int par_r, int par_c, int par_t, UBlockShape ublock) : - t(tensor_shape.z * par_t), - tblock_m(1), - tblock_n(1), - mblock_m((tensor_shape.rt / par_r) / ublock.rt), - mblock_n((tensor_shape.ct / par_c) / ublock.ct), - ublock(ublock) -{ - TT_LOG_ASSERT( - (tensor_shape.rt % (par_r * ublock.rt)) == 0, - "Not divisible on R: {} / ({} * {})", - tensor_shape.rt, - par_r, - ublock.rt); - TT_LOG_ASSERT( - (tensor_shape.ct % (par_c * ublock.ct)) == 0, - "Not divisible on C: {} / ({} * {})", - tensor_shape.ct, - par_c, 
- ublock.ct); - TT_ASSERT(mblock_m > 0, "Invalid ublock provided", tensor_shape, par_r, par_c, par_t, ublock); - TT_ASSERT(mblock_n > 0, "Invalid ublock provided", tensor_shape, par_r, par_c, par_t, ublock); -} - -BlockShape::BlockShape(TensorShape tensor_shape, GridShape grid_shape, int mblock_m, int mblock_n, UBlockShape ublock) : - mblock_m(mblock_m), mblock_n(mblock_n), ublock(ublock) -{ - TT_ASSERT((tensor_shape.rt % mblock_m) == 0); - TT_ASSERT((tensor_shape.ct % mblock_n) == 0); - TT_ASSERT((tensor_shape.rt % (mblock_m * ublock.rt)) == 0); - TT_ASSERT((tensor_shape.ct % (mblock_n * ublock.ct)) == 0); - TT_ASSERT((tensor_shape.rt % (grid_shape.r * mblock_m * ublock.rt)) == 0); - TT_ASSERT((tensor_shape.ct % (grid_shape.c * mblock_n * ublock.ct)) == 0); - TT_ASSERT(((tensor_shape.z * tensor_shape.rt * tensor_shape.ct) % (mblock_m * mblock_n * ublock.volume())) == 0); - - tblock_m = tensor_shape.rt / (grid_shape.r * mblock_m * ublock.rt); - tblock_n = tensor_shape.ct / (grid_shape.c * mblock_n * ublock.ct); - t = tensor_shape.z; -} - -BlockShape::BlockShape(int t, int mblock_m, int mblock_n, UBlockShape ublock) : - t(t), tblock_m(1), tblock_n(1), mblock_m(mblock_m), mblock_n(mblock_n), ublock(ublock) -{ -} - -bool BlockShape::operator==(BlockShape o) const -{ - return (t == o.t) and (tblock_m == o.tblock_m) and (tblock_n == o.tblock_n) and (mblock_m == o.mblock_m) and - (mblock_n == o.mblock_n) and (ublock == o.ublock); -} - -bool BlockShape::operator!=(BlockShape o) const { return !(*this == o); } - -int BlockShape::volume() const { return t * tblock_m * tblock_n * mblock_m * mblock_n * ublock.volume(); } -int BlockShape::volume_no_t() const { return mblock_m * mblock_n * ublock.volume(); } - -int BlockShape::buffer_tiles(int buffer_factor) const { return buffer_factor * mblock_m * mblock_n * ublock.volume(); } - -void BlockShape::set_ublock_shape(UBlockShape new_ublock) -{ - // canonicalize shape first - mblock_m *= ublock.rt; - mblock_n *= ublock.ct; - ublock = {1, 1}; - - TT_ASSERT((mblock_m % new_ublock.rt) == 0, *this, new_ublock); - TT_ASSERT((mblock_n % new_ublock.ct) == 0, *this, new_ublock); - - mblock_m /= new_ublock.rt; - mblock_n /= new_ublock.ct; - ublock = new_ublock; -} - -BudaBlocks BlockShape::as_buda_blocks() const -{ - BudaBlocks blocks; - TT_ASSERT(tblock_m == 1 and tblock_n == 1); - blocks.z = t; - blocks.ublock_rt = ublock.rt; - blocks.ublock_ct = ublock.ct; - blocks.mblock_m = mblock_m; - blocks.mblock_n = mblock_n; - return blocks; -} - -// -// BufferModel -// -std::size_t BufferModel::size_tiles(bool include_t) const { return l1_size_tiles * (include_t ? 
block_shape.t : 1); } - -std::size_t BufferModel::size_bytes(bool include_t) const -{ - return size_tiles(include_t) * tile_size_bytes(data_format); -} - -std::size_t BufferModel::single_buffered_size_tiles() const { return block_shape.buffer_tiles(1); } - -std::size_t BufferModel::single_buffered_size_bytes() const -{ - return single_buffered_size_tiles() * tile_size_bytes(data_format); -} - -std::size_t BufferModel::total_size_bytes() const { return block_shape.volume() * tile_size_bytes(data_format); } - -// -// OpModel -// -std::size_t OpModel::get_l1_memory_usage() const -{ - std::size_t usage = 0; - - for (BufferModel const &buffer_model : input_buffers) - { - usage += buffer_model.size_bytes(); - } - - for (BufferModel const &buffer_model : output_buffers) - { - bool const include_t = is_gradient_op(); - usage += buffer_model.size_bytes(include_t); - } - - for (BufferModel const &buffer_model : parameter_buffers) - { - constexpr bool include_t = true; - usage += buffer_model.size_bytes(include_t); - } - - for (BufferModel const &buffer_model : intermediate_buffers) - { - usage += buffer_model.size_bytes(); - } - - // Note: global overlay blob override (TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE) is not included in the usage - // calculation - when querying L1 space that FE can use, L1 space is reduced by the value of this global overlay - // blob override - // - static constexpr std::int32_t bbe_reserved_blob_size = 64 * 1024; // 64 kB - if (overlay_size > bbe_reserved_blob_size) - { - usage += overlay_size - bbe_reserved_blob_size; - } - - return usage; -} - -const std::string &OpModel::op_type() const { return buda_op_node->op_name(); } - -MathFidelity OpModel::math_fidelity() const -{ - return buda_op_node != nullptr ? buda_op_node->math_fidelity() : MathFidelity::Invalid; -} - -bool OpModel::is_gradient_op() const { return buda_op_node->is_gradient_op(); } - -bool OpModel::is_matmul() const { return buda_op_node->is_matmul(); } - -std::shared_ptr OpModel::fused_op() const -{ - return buda_op_node->is_fused_op() ? 
buda_op_node->get_fused_op() : nullptr; -} - -const std::string OpModel::get_reduce_dim() const -{ - if (op_type() == "reduce") - { - return std::get(buda_op_node->op_type().buda_attrs.at("dim")); - } - else - { - return ""; - } -} - -const BudaOpAttrs OpModel::buda_op_attrs() const -{ - TT_ASSERT(buda_op_node, "Called on non-buda op!"); - return buda_op_node->op_type().buda_attrs; -} - -int OpModel::get_execution_cycles_uncached(std::string const &arch_name, bool theoretical) const -{ - std::shared_ptr fused_op = this->fused_op(); - - // Calculate sparse-matmul metadata and cache the result - if (env_as("PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES", false) and this->is_sparse_matmul and - this->nz_ublocks == -1) - { - auto mf = this->math_fidelity(); - if (mf == tt::MathFidelity::HiFi2 or mf == tt::MathFidelity::LoFi) - { - auto [nz_tiles, nz_ublocks, nz_strips] = get_sparse_matmul_metadata(*this); - auto *p_this = const_cast(this); - p_this->nz_tiles = nz_tiles; - p_this->nz_ublocks = nz_ublocks; - p_this->nz_strips = nz_strips; - } - } - - if (fused_op == nullptr) - { - return tt::balancer::get_execution_cycles(arch_name, *this, theoretical); - } - - if (env_as("PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES", false)) - { - // to obtain the execution cycles for fused op, we are calculating cycles for each subop, so - // we need to prepare necessary information and pass it inside the FusedSubOpModel object - std::vector sub_op_models; - for (const auto &schedule : fused_op->get_schedules()) - { - for (const auto &sub_op : schedule.ops) - { - FusedSubOpModel &sub_op_model = sub_op_models.emplace_back(); - sub_op_model.type = sub_op.op_type.op; - - sub_op_model.mblock_m = sub_op.op_shape.outputs[0].rt / - (grid_shape.r * fused_op_ublock_shape.at(sub_op.name).rt * t_stream_factor.r); - sub_op_model.mblock_n = sub_op.op_shape.outputs[0].ct / - (grid_shape.c * fused_op_ublock_shape.at(sub_op.name).ct * t_stream_factor.c); - sub_op_model.ublock_rt = fused_op_ublock_shape.at(sub_op.name).rt; - sub_op_model.ublock_ct = fused_op_ublock_shape.at(sub_op.name).ct; - - if (sub_op_model.type == "matmul") - { - if (sub_op.inputs[1].type == FusedSubOpInput::INPUT) - { - sub_op_model.ublock_kt = input_buffers[sub_op.inputs[1].index].block_shape.ublock.rt; - sub_op_model.mblock_k = input_buffers[sub_op.inputs[1].index].block_shape.mblock_m; - } - else if (sub_op.inputs[1].type == FusedSubOpInput::INTERMED) - { - sub_op_model.ublock_kt = intermediate_buffers[sub_op.inputs[1].index].block_shape.ublock.rt; - sub_op_model.mblock_k = intermediate_buffers[sub_op.inputs[1].index].block_shape.mblock_m; - } - } - else if (sub_op_model.type == "reduce") - { - sub_op_model.reduce_dim = std::get(sub_op.get_sub_op_buda_attr().at("dim")); - } - else - { - // other ops can use dest register as input/output and it impacts the number of cycles; - // matmul and reduce cannot use dest on input or output - sub_op_model.has_dest_input = - sub_op.inputs[0].type == FusedSubOpInput::DEST || - (sub_op.inputs.size() == 2 && sub_op.inputs[1].type == FusedSubOpInput::DEST); - sub_op_model.has_dest_output = sub_op.output_type == FusedSubOp::DEST; - } - } - } - - return tt::balancer::get_execution_cycles(arch_name, *this, theoretical, sub_op_models); - } - - // Go through schedules and add up execution cycles for each op - std::uint32_t execution_cycles = 0; - // std::cout << "Fused op " << fused_op->id() << std::endl; - for (auto schedule : fused_op->get_schedules()) - { - for (auto sub_op : schedule.ops) - { - // We don't have the right op 
model for each sub op... so we'll do a quick and dirty "per tile" - // calculation. This is mostly ok, because matmuls in fused ops always have one tile * more tiles, so - // it's very eltwise-like... we can count the number of tiles and multiple with some number - - // TODO: add approx flag to OpModel - bool exp_approx = env_as("PYBUDA_EXP_APPROX"); - std::unordered_map op_weights = { - {"exp", exp_approx ? 357 : 700}, - {"gelu", 286}, - {"gelu_derivative", exp_approx ? 1500 : 3116}, - {"log", 1413}, - {"nop", 56}, - {"buffer", 56}, - {"reciprocal", exp_approx ? 606 : 915}, - {"sigmoid", 168}, - {"sqrt", 159}, - {"add", 20}, - {"multiply", 20}, - {"sub", 20}, - {"matmul", 40}, - }; - - std::uint32_t tiles = (float)sub_op.op_shape.outputs[0].ct * sub_op.op_shape.outputs[0].rt * - sub_op.op_shape.outputs[0].z / grid_shape.volume(); - - if (sub_op.op_type.op == "matmul") - { - tiles = (float)sub_op.op_shape.outputs[0].z * sub_op.op_shape.inputs[0].rt * - sub_op.op_shape.inputs[0].ct * sub_op.op_shape.inputs[1].ct / grid_shape.volume(); - } - - std::uint32_t tile_weight = 40; // some placeholder for other ops - auto it = op_weights.find(sub_op.op_type.op); - if (it != op_weights.end()) - tile_weight = it->second; - - if (sub_op.op_type.op == "matmul" || sub_op.op_type.op == "multiply") - { - switch (this->math_fidelity()) - { - case tt::MathFidelity::HiFi2: tile_weight *= 2; break; - case tt::MathFidelity::HiFi3: tile_weight *= 3; break; - case tt::MathFidelity::HiFi4: tile_weight *= 4; break; - default: break; - } - } - - // int sub_op_cycles = tt::balancer::get_execution_cycles(sub_op.op_type.op, sub_op_model); - std::uint32_t sub_op_cycles = tiles * tile_weight; - execution_cycles += sub_op_cycles; - // std::cout << " add sub_op " << sub_op.name << " / " << sub_op.op_type.op << " cycles " << sub_op_cycles - // << ", total " << execution_cycles << std::endl; - } - } - - // Multiply cycle count estimate to be conservative - std::uint32_t fused_op_cycle_multiplier = env_as("PYBUDA_FUSED_OP_MULTIPLIER", 1); - - execution_cycles *= fused_op_cycle_multiplier; - - return execution_cycles; -} - -int OpModel::get_execution_cycles(std::string const &arch_name, bool theoretical, bool invalidate_cached) const -{ - // Do not cache theoretical cycles otherwise we'd need to maintain multiple cache entries, one for w/ theoretical - // cycles and one without. Theoretical cycles is only used in a few places so it didn't seem worth the additional - // complexity to add a separate caching mechanism for it. 
- if (theoretical) - return get_execution_cycles_uncached(arch_name, theoretical); - - if (invalidate_cached) - invalidate_cached_execution_cycles(); - - if (cached_execution_cycles) - return cached_execution_cycles; - - cached_execution_cycles = get_execution_cycles_uncached(arch_name, theoretical); - return cached_execution_cycles; -} - -// -// FactorizedInt -// - -FactorizedInt::FactorizedInt(Factors max_val) : factors(factorize(1, max_val)) {} - -FactorizedInt::FactorizedInt(FactorRange r) : factors(factorize(std::max(1, r.first), r.second)) {} - -FactorizedInt::FactorizedInt(Constant s) : factors(factorize(s.v, s.v)) {} - -FactorizedInt::FactorizedInt(Factorial f) -{ - for (int i = 1; i <= f.max; ++i) - { - factors.push_back(i * f.multiplier); - } -} - -int FactorizedInt::value() const -{ - TT_ASSERT(is_singleton()); - return factors.back(); -} - -int FactorizedInt::get_min_factor() const { return factors.front(); } -int FactorizedInt::get_max_factor() const { return factors.back(); } - -int FactorizedInt::get_nearest_factor_le(int integer) const -{ - // Find nearest factor less than or equal to integer - int nearest = factors[0]; - TT_ASSERT(nearest <= integer); - for (auto factor : factors) - { - if (factor > integer) - { - break; - } - nearest = factor; - } - return nearest; -} - -std::vector const &FactorizedInt::get_factors() const { return factors; } - -FactorizedInt FactorizedInt::keep_factors_divisible_by(FactorizedInt const &other) const -{ - TT_ASSERT(other.is_singleton()); - - FactorizedInt ret{}; - - for (size_t i = 0; i < this->factors.size(); i++) - { - if (this->factors[i] % other.value() == 0) - { - ret.factors.push_back(this->factors[i]); - } - } - - return ret; -} - -FactorizedInt FactorizedInt::operator&(FactorizedInt const &other) const -{ - FactorizedInt intersection; - std::set_intersection( - factors.begin(), - factors.end(), - other.factors.begin(), - other.factors.end(), - std::back_inserter(intersection.factors)); - return intersection; -} - -FactorizedInt FactorizedInt::operator|(FactorizedInt const &other) const -{ - FactorizedInt intersection; - std::set_union( - factors.begin(), - factors.end(), - other.factors.begin(), - other.factors.end(), - std::back_inserter(intersection.factors)); - return intersection; -} - -FactorizedInt FactorizedInt::operator-(FactorizedInt const &other) const -{ - FactorizedInt intersection; - std::set_difference( - factors.begin(), - factors.end(), - other.factors.begin(), - other.factors.end(), - std::back_inserter(intersection.factors)); - return intersection; -} - -FactorizedInt FactorizedInt::operator*(FactorizedInt const &other) const -{ - TT_ASSERT(other.is_singleton(), "Currently only support singletons"); - FactorizedInt result = *this; - for (int &f : result.factors) - { - f *= other.factors.back(); - } - return result; -} - -FactorizedInt FactorizedInt::operator/(FactorizedInt const &other) const -{ - TT_ASSERT(other.is_singleton(), "Currently only support singletons"); - FactorizedInt result; - for (int f : factors) - { - if (f >= other.factors.back() and (f % other.factors.back()) == 0) - result.factors.push_back(f / other.factors.back()); - } - return result; -} - -bool FactorizedInt::operator==(FactorizedInt const &other) const { return factors == other.factors; } - -bool FactorizedInt::overlaps(FactorizedInt const &other) const -{ - auto iter_a = factors.begin(); - auto iter_b = other.factors.begin(); - while (iter_a != factors.end() and iter_b != other.factors.end()) - { - if (*iter_a == *iter_b) - return true; 
- if (*iter_a < *iter_b) - ++iter_a; - else - ++iter_b; - } - return false; -} - -bool FactorizedInt::contains(int v) const -{ - for (int f : factors) - if (v == f) - return true; - return false; -} - -bool FactorizedInt::is_singleton() const { return factors.size() == 1; } - -std::vector FactorizedInt::factorize(int min_val, int max_val) -{ - std::vector factors; - for (int i = min_val; i <= max_val; ++i) - { - if ((max_val % i) == 0 and (i % min_val) == 0) - factors.push_back(i); - } - return factors; -} - -// -// FactorizedShape -// -FactorizedShape::FactorizedShape(graphlib::Shape const &shape) : r(shape.rt()), c(shape.ct()) {} - -FactorizedShape::FactorizedShape(std::pair shape) : r(shape.first), c(shape.second) {} - -FactorizedShape::FactorizedShape(Parallelization par) : r(Constant((int)par.r)), c(Constant((int)par.c)) {} - -FactorizedShape::FactorizedShape(FactorizedInt r, FactorizedInt c) : r(r), c(c) {} - -FactorizedShape FactorizedShape::operator&(FactorizedShape const &other) const -{ - return FactorizedShape(r & other.r, c & other.c); -} - -FactorizedShape FactorizedShape::operator|(FactorizedShape const &other) const -{ - return FactorizedShape(r | other.r, c | other.c); -} - -FactorizedShape FactorizedShape::operator-(FactorizedShape const &other) const -{ - return FactorizedShape(r - other.r, c - other.c); -} - -Parallelization FactorizedShape::operator[](int idx) const -{ - std::vector const &r_factors = r.get_factors(); - std::vector const &c_factors = c.get_factors(); - int ridx = idx / (int)c_factors.size(); - int cidx = idx % (int)c_factors.size(); - return Parallelization(r_factors[ridx], c_factors[cidx]); -} - -bool FactorizedShape::operator==(FactorizedShape const &other) const { return (r == other.r) and (c == other.c); } - -bool FactorizedShape::empty() const { return r.get_factors().empty() or c.get_factors().empty(); } - -std::size_t FactorizedShape::size() const { return r.get_factors().size() * c.get_factors().size(); } - -bool FactorizedShape::is_subset_of(FactorizedShape const &other) const { return (*this & other) == *this; } - -bool FactorizedShape::is_superset_of(FactorizedShape const &other) const { return (*this & other) == other; } - -bool FactorizedShape::is_singleton() const { return r.is_singleton() and c.is_singleton(); } - -FactorizedShape::Iterator::Iterator(FactorizedShape const *p) : p(p) {} - -FactorizedShape::Iterator::Iterator(FactorizedShape const *p, int i) : i(i), p(p) {} - -FactorizedShape::Iterator &FactorizedShape::Iterator::operator++() -{ - ++i; - return *this; -} - -FactorizedShape::Iterator FactorizedShape::Iterator::operator++(int) -{ - auto retval = *this; - ++(*this); - return retval; -} - -bool FactorizedShape::Iterator::operator==(Iterator other) const { return (p == other.p) and (i == other.i); } - -bool FactorizedShape::Iterator::operator!=(Iterator other) const { return !(*this == other); } - -FactorizedShape::Iterator::reference FactorizedShape::Iterator::operator*() const { return (*p)[i]; } - -FactorizedShape::Iterator FactorizedShape::begin() const { return Iterator(this); } - -FactorizedShape::Iterator FactorizedShape::end() const { return Iterator(this, (int)size()); } - -// -// TileLayout -// -LinCoord TileLayout::operator[](int idx) const -{ - int idx_t = idx / (rt() * ct()); - int idx_rt = (idx % (rt() * ct())) / ct(); - int idx_ct = idx % ct(); - return map(CanCoord(idx_t, idx_rt, idx_ct)); -} - -LinCoord TileLayout::map(CanCoord can_coord) const -{ - auto [w, t, r, c] = can_coord; - TT_ASSERT(t < this->t()); - 
TT_ASSERT(r < this->rt()); - TT_ASSERT(c < this->ct()); - UBlockShape ublock_coord(r % block_shape.ublock.rt, c % block_shape.ublock.ct); - int m = (r / block_shape.ublock.rt) % block_shape.mblock_m; - int n = (c / block_shape.ublock.ct) % block_shape.mblock_n; - BlockShape mblock_coord(t, m, n, ublock_coord); - GridShape grid_coord(r / block_shape.rt(), c / block_shape.ct()); - int mblock_volume = block_shape.volume_no_t(); - int ublock_volume = block_shape.ublock.volume(); - int t_linear = t * mblock_volume; - int mblock_linear = (ublock_order == graphlib::UBlockOrder::R) - ? m * block_shape.mblock_n * ublock_volume + n * ublock_volume - : n * block_shape.mblock_m * ublock_volume + m * ublock_volume; - int ublock_linear = ublock_coord.rt * block_shape.ublock.ct + ublock_coord.ct; - return LinCoord(grid_coord, t_linear + mblock_linear + ublock_linear); -} - -CanCoord TileLayout::map(LinCoord lin_coord) const -{ - GridCoord grid_coord(lin_coord.grid_r(), lin_coord.grid_c()); - int offset = lin_coord.address(); - int mblock_offset = offset % block_shape.volume_no_t(); - int ublock_offset = mblock_offset % block_shape.ublock.volume(); - int num_ublocks = mblock_offset / block_shape.ublock.volume(); - int t = offset / block_shape.volume_no_t(); - int m = (ublock_order == graphlib::UBlockOrder::R) ? num_ublocks / block_shape.mblock_n - : num_ublocks % block_shape.mblock_m; - int n = (ublock_order == graphlib::UBlockOrder::R) ? num_ublocks % block_shape.mblock_n - : num_ublocks / block_shape.mblock_m; - UBlockShape ublock_coord(ublock_offset / block_shape.ublock.ct, ublock_offset % block_shape.ublock.ct); - int r = grid_coord.r * block_shape.rt() + m * block_shape.ublock.rt + ublock_coord.rt; - int c = grid_coord.c * block_shape.ct() + n * block_shape.ublock.ct + ublock_coord.ct; - return CanCoord(t, r, c); -} - -GridCoord TileLayout::grid_coord(CanCoord can_coord) const -{ - return GridCoord(can_coord.rt / block_shape.rt(), can_coord.ct / block_shape.ct()); -} - -GridCoord TileLayout::grid_coord(LinCoord lin_coord) const { return lin_coord.grid_coord(); } - -} // namespace tt::balancer diff --git a/pybuda/csrc/balancer/types.hpp b/pybuda/csrc/balancer/types.hpp deleted file mode 100644 index 0310594b9..000000000 --- a/pybuda/csrc/balancer/types.hpp +++ /dev/null @@ -1,893 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include -#include -#include -#include -#include - -#include "lower_to_buda/common.hpp" -#include "output_host_tm_types.hpp" -#include "placer/placer.hpp" -#include "shared_utils/sparse_matmul_utils.hpp" - -namespace tt::graphlib -{ -class Graph; -class Node; -class Shape; -struct Edge; -class BudaOpNode; -enum class EdgeType; -using EdgeUniqueId = std::tuple; -} // namespace tt::graphlib - -namespace tt -{ -class FusedOp; -} - -namespace tt::balancer -{ -struct UniqueId -{ - static std::uint64_t next_id; - std::uint64_t id = 0; - UniqueId() : id(next_id++) {} - UniqueId(std::uint64_t id) : id(id) {} - bool operator==(UniqueId other) const { return id == other.id; }; -}; - -struct TensorShape -{ - int w = 0; - int z = 0; - int rt = 0; - int ct = 0; - - TensorShape() = default; - TensorShape(int w, int z, int rt, int ct) : w(w), z(z), rt(rt), ct(ct) {} - TensorShape(graphlib::Shape const &shape); - inline int volume_in_tiles() const { return w * z * rt * ct; } - inline bool operator==(TensorShape o) const { return w == o.w and z == o.z and rt == o.rt and ct == o.ct; } - inline bool 
operator!=(TensorShape o) const { return not(*this == o); } - int const &operator[](int index) const; - int &operator[](int index); -}; - -struct OpShape -{ - std::vector producer_shapes; - std::vector inputs; - std::vector outputs; - - OpShape() = default; - OpShape( - std::vector const &producer_shapes, - std::vector const &inputs, - std::vector const &outputs) : - producer_shapes(producer_shapes), inputs(inputs), outputs(outputs) - { - } -}; - -struct Parallelization -{ - int r = 0; - int c = 0; - - Parallelization() = default; - Parallelization(int r, int c) : r(r), c(c) {} - - inline bool operator==(Parallelization o) const { return (r == o.r) and (c == o.c); } - inline bool operator!=(Parallelization o) const { return not(*this == o); } - inline int volume() const { return r * c; } - static Parallelization from_array(std::array array) { return Parallelization(array[0], array[1]); } -}; - -struct GridShape -{ - int r = 0; - int c = 0; - - GridShape() = default; - GridShape(int r, int c) : r(r), c(c) {} - GridShape(Parallelization p) : r(p.r), c(p.c) {} - - inline bool operator==(GridShape o) const { return (r == o.r) and (c == o.c); } - inline bool operator!=(GridShape o) const { return not(*this == o); } - inline bool square() const { return r == c; } - inline int volume() const { return r * c; } - inline GridShape transposed() const { return GridShape(c, r); } - static GridShape from_array(std::array array) { return GridShape(array[0], array[1]); } -}; - -struct UBlockShape -{ - int rt = 0; - int ct = 0; - - UBlockShape() = default; - UBlockShape(int rt, int ct) : rt(rt), ct(ct) {} - UBlockShape(std::pair shape) : rt(shape.first), ct(shape.second) {} - inline bool operator==(UBlockShape o) const { return (rt == o.rt) and (ct == o.ct); } - inline bool operator!=(UBlockShape o) const { return !(*this == o); } - inline int volume() const { return rt * ct; } -}; - -struct BlockShape -{ - int t = 0; - int tblock_m = 0; - int tblock_n = 0; - int mblock_m = 0; - int mblock_n = 0; - UBlockShape ublock; - - BlockShape() = default; - BlockShape(TensorShape tensor_shape, int par_r, int par_c, int par_t, UBlockShape ublock); - BlockShape(TensorShape tensor_shape, GridShape grid_shape, int mblock_m, int mblock_n, UBlockShape ublock); - BlockShape(int t, int mblock_m, int mblock_n, UBlockShape ublock); - bool operator==(BlockShape o) const; - bool operator!=(BlockShape o) const; - inline int m() const { return tblock_m * mblock_m; } - inline int n() const { return tblock_n * mblock_n; } - inline int rt() const { return tblock_m * mblock_m * ublock.rt; } - inline int ct() const { return tblock_n * mblock_n * ublock.ct; } - inline int buffered_rt() const { return mblock_m * ublock.rt; } - inline int buffered_ct() const { return mblock_n * ublock.ct; } - int volume() const; - int volume_no_t() const; - int buffer_tiles(int buffer_factor = 2) const; - void set_ublock_shape(UBlockShape new_ublock); - BlockShape canonical() const { return BlockShape(t, mblock_m * tblock_m, mblock_n * tblock_n, ublock); } - BudaBlocks as_buda_blocks() const; -}; - -struct TStreamDir -{ - enum Value - { - R, - C, - RZ, - CZ, - } v; - - TStreamDir(Value v) : v(v) {} - static TStreamDir Transposed(TStreamDir o) - { - Value v; - switch (o.v) - { - case R: v = C; break; - case C: v = R; break; - case RZ: v = CZ; break; - case CZ: v = RZ; break; - default: v = R; break; - } - return TStreamDir(v); - } - - inline bool operator==(TStreamDir o) const { return v == o.v; } - inline bool operator!=(TStreamDir o) const { return 
not(*this == o); } - inline bool primary_dir_compatible(TStreamDir o) const { return (r() == o.r()) and (c() == o.c()); } - inline bool is_ublock_order(graphlib::UBlockOrder ublock_order) const - { - return (r() == (ublock_order == graphlib::UBlockOrder::R)) and - (c() == (ublock_order == graphlib::UBlockOrder::C)); - } - inline bool r() const { return v == R or v == RZ; } - inline bool c() const { return v == C or v == CZ; } - inline bool z_major() const { return v == RZ or v == CZ; } -}; - -struct TStreamFactor -{ - TStreamDir dir = TStreamDir::R; - int r = 1; - int c = 1; - - TStreamFactor() = default; - TStreamFactor(TStreamDir dir, Parallelization p) : dir(dir), r(p.r), c(p.c) {} - TStreamFactor(TStreamDir dir, int r, int c) : dir(dir), r(r), c(c) {} - static TStreamFactor Transposed(TStreamFactor o) { return TStreamFactor(TStreamDir::Transposed(o.dir), o.c, o.r); } - inline int t() const { return r * c; } - inline bool none() const { return r == 1 and c == 1; } - inline bool is_streaming() const { return not none(); } - inline bool is_streaming_r() const { return is_streaming() and dir.r(); } - inline bool is_streaming_c() const { return is_streaming() and dir.c(); } - inline bool operator==(TStreamFactor o) const { return dir == o.dir and r == o.r and c == o.c; } - inline bool operator!=(TStreamFactor o) const { return not(*this == o); } - inline bool compatible_consumer(TStreamFactor consumer, bool is_sparse_mm, bool consumes_rz_major) const - { - bool allowed_none = (none() and not consumes_rz_major) or (not dir.z_major() and consumer.none()); - return allowed_none or - (dir.primary_dir_compatible(consumer.dir) and (r == consumer.r or is_sparse_mm) and c == consumer.c); - } -}; - -struct BufferModel -{ - BlockShape block_shape; - int buffer_factor = 0; - std::size_t l1_size_tiles = 0; - bool size_tiles_override = false; // set to pass l1_size_tiles to netlist - DataFormat data_format; - bool minimize_input_buffer = false; // only buffer 2 ublocks for matmul - int kernel_broadcast_tiles = 0; - - BufferModel() = default; - BufferModel(BlockShape block_shape, int buffer_factor, DataFormat data_format, bool size_tiles_override = false) : - block_shape(block_shape), - buffer_factor(buffer_factor), - l1_size_tiles(block_shape.buffer_tiles(buffer_factor)), - size_tiles_override(size_tiles_override), - data_format(data_format) - { - } - std::size_t size_tiles(bool include_t = false) const; - std::size_t size_bytes(bool include_t = false) const; - std::size_t single_buffered_size_tiles() const; - std::size_t single_buffered_size_bytes() const; - std::size_t total_size_bytes() const; - operator bool() const { return buffer_factor > 0; } -}; - -struct Padding -{ - int rt = 0; - int ct = 0; - - Padding() = default; - Padding(int rt, int ct) : rt(rt), ct(ct) {} -}; - -// Do not add new fields to OpModel as it is very perf sensitive structure. -// In case you really need to add something talk to nsmith/nobradovic. 
-// -struct OpModel -{ - UniqueId id; - GridShape grid_shape; - OpShape op_shape; - const graphlib::BudaOpNode *buda_op_node = nullptr; - DataFormat data_format; - bool input_prologue = false; - bool sparse_buffer = false; - bool is_sparse_matmul = false; - bool consumes_rz_major = false; - int nz_tiles = 0; // sparse-matmul specific - int nz_ublocks = -1; // sparse-matmul specific - int nz_strips = -1; // sparse-matmul specific - const sparse::SparseBUDA *sparse_buda = nullptr; // sparse-matmul specific - TStreamFactor t_stream_factor; - int fracture_factor; - int sparse_indices; - Padding padding; - int overlay_size = 0; // Op-level override for overlay blob size in Bytes, value 0 maps to default size, which is - // currently 65536 (64 kB) - std::vector input_buffers; - std::vector output_buffers; - std::vector parameter_buffers; - std::vector intermediate_buffers; - std::vector dram_buffers; - std::unordered_map fused_op_ublock_shape; - std::unordered_map effective_input_buffer_shape_for_user; - mutable int cached_execution_cycles = 0; -#ifdef DEBUG - graphlib::EdgeUniqueId eliminating_edge; - std::unordered_set op_model_valid_pair_id; -#endif - - inline BlockShape block_shape() const { return output_buffers.at(0).block_shape; } - inline UBlockShape ublock_shape() const { return output_buffers.at(0).block_shape.ublock; } - inline void set_ublock_shape(UBlockShape ublock) - { - for (BufferModel &output_buffer : output_buffers) - { - output_buffer.block_shape.set_ublock_shape(ublock); - } - } - std::size_t get_l1_memory_usage() const; - void invalidate_cached_execution_cycles() const { cached_execution_cycles = 0; } - int get_execution_cycles( - std::string const &arch_name, bool theoretical = false, bool invalidate_cached = false) const; - int get_output_buffer_factor() const { return output_buffers.at(0).buffer_factor; } - bool has_sparse_buffer() const { return sparse_buffer; } - bool has_parameter_buffers() const - { - return std::any_of(parameter_buffers.begin(), parameter_buffers.end(), [](auto b) { return bool(b); }); - } - int num_parameter_buffers() const - { - return std::count_if(parameter_buffers.begin(), parameter_buffers.end(), [](auto b) { return bool(b); }); - } - bool is_streaming() const { return not t_stream_factor.none(); } - Parallelization parallelization() const - { - return Parallelization(grid_shape.r * t_stream_factor.r, grid_shape.c * t_stream_factor.c); - } - - const std::string &op_type() const; - MathFidelity math_fidelity() const; - bool is_gradient_op() const; - bool is_matmul() const; - std::shared_ptr fused_op() const; - - const std::string get_reduce_dim() const; - const BudaOpAttrs buda_op_attrs() const; - - GridShape get_input_grid_shape(int input_idx) const - { - return buda_op_node->is_matmul_not_sparse() - ? (input_idx == 0 ? 
GridShape(grid_shape.r, 1) : GridShape(1, grid_shape.c)) - : grid_shape; - } - int get_input_bytes(int input_idx) const - { - return input_buffers[input_idx].total_size_bytes() * get_input_grid_shape(input_idx).volume(); - } - int get_param_bytes(int input_idx) const - { - return parameter_buffers[input_idx].total_size_bytes() * get_input_grid_shape(input_idx).volume(); - } - int get_dram_bytes(int input_idx) const - { - return dram_buffers[input_idx].total_size_bytes() * get_input_grid_shape(input_idx).volume(); - } - int get_output_bytes() const { return output_buffers[0].total_size_bytes() * grid_shape.volume(); } - int get_total_param_bytes() const - { - int total_param_bytes = 0; - for (auto const ¶meter_buffer : parameter_buffers) total_param_bytes += parameter_buffer.total_size_bytes(); - return total_param_bytes; - } - int get_total_dram_bytes() const - { - int total_dram_bytes = 0; - for (auto const &dram_buffer : dram_buffers) total_dram_bytes += dram_buffer.total_size_bytes(); - return total_dram_bytes; - } - float get_input_bytes_per_cycle(int input_idx, std::string const &arch_name) const - { - return static_cast(get_input_bytes(input_idx)) / get_execution_cycles(arch_name); - } - float get_output_bytes_per_cycle(std::string const &arch_name) const - { - return static_cast(get_output_bytes()) / get_execution_cycles(arch_name); - } - - bool operator==(OpModel const &other) const { return id == other.id; } - - TensorShape get_out_shape(bool post_t_stream = true) const - { - TensorShape out_shape = op_shape.outputs[0]; - if (post_t_stream) - { - return TensorShape( - out_shape.w, - out_shape.z * t_stream_factor.t(), - out_shape.rt / t_stream_factor.r, - out_shape.ct / t_stream_factor.c); - } - else - { - return out_shape; - } - } - - private: - int get_execution_cycles_uncached(std::string const &arch_name, bool theoretical = false) const; -}; - -using LegalOpModels = std::unordered_map>; -using OpModels = std::unordered_map; -using OpModelMap = std::unordered_map; -using BlockShapeMap = std::unordered_map; -using CutEdges = std::unordered_map; - -struct FusedSubOpModel -{ - std::string type; - int mblock_m; - int mblock_n; - int ublock_rt; - int ublock_ct; - int mblock_k = 0; - int ublock_kt = 0; - std::string reduce_dim = ""; - bool has_dest_input = false; - bool has_dest_output = false; -}; - -class FactorizedInt -{ - public: - using Factors = int; - using FactorRange = std::pair; - struct Constant - { - Constant(int v) : v(v) {} - int v; - }; - struct Factorial - { - Factorial(int max, int multiplier = 1) : max(max), multiplier(multiplier) {} - int max; - int multiplier; - }; - - public: - FactorizedInt() = default; - // Inclusive ranges - FactorizedInt(Factors max_val); - FactorizedInt(FactorRange r); - template - FactorizedInt(Iterator begin, Iterator end) : factors(begin, end) - { - } - FactorizedInt(Constant s); - FactorizedInt(Factorial f); - - int value() const; - int get_min_factor() const; - int get_max_factor() const; - int get_nearest_factor_le(int integer) const; - std::vector const &get_factors() const; - FactorizedInt keep_factors_divisible_by(FactorizedInt const &other) const; - - // Set intersection - FactorizedInt operator&(FactorizedInt const &other) const; - // Set union - FactorizedInt operator|(FactorizedInt const &other) const; - // Set difference - FactorizedInt operator-(FactorizedInt const &other) const; - - // Multiply - FactorizedInt operator*(FactorizedInt const &other) const; - // Divide - FactorizedInt operator/(FactorizedInt const &other) const; 
- - bool operator==(FactorizedInt const &other) const; - bool overlaps(FactorizedInt const &other) const; - bool contains(int v) const; - bool is_singleton() const; - inline bool empty() const { return factors.empty(); } - - private: - static std::vector factorize(int min_val, int max_val); - - private: - std::vector factors; -}; - -struct FactorizedShape -{ - using Constant = FactorizedInt::Constant; - - FactorizedInt r; - FactorizedInt c; - - FactorizedShape() = default; - FactorizedShape(graphlib::Shape const &shape); - FactorizedShape(std::pair shape); - FactorizedShape(Parallelization par); - FactorizedShape(FactorizedInt r, FactorizedInt c); - - std::pair get_min_factor() const { return std::make_pair(r.get_min_factor(), c.get_min_factor()); } - std::pair get_max_factor() const { return std::make_pair(r.get_max_factor(), c.get_max_factor()); } - - // Set intersection on r & c independently - FactorizedShape operator&(FactorizedShape const &other) const; - // Set union on r & c independently - FactorizedShape operator|(FactorizedShape const &other) const; - // Set difference on r & c independently - FactorizedShape operator-(FactorizedShape const &other) const; - // Random access into set - Parallelization operator[](int idx) const; - - bool operator==(FactorizedShape const &other) const; - bool empty() const; - std::size_t size() const; - bool is_subset_of(FactorizedShape const &other) const; - bool is_superset_of(FactorizedShape const &other) const; - bool is_singleton() const; - - class Iterator - : public std::iterator - { - int i = 0; - FactorizedShape const *p; - - public: - Iterator(FactorizedShape const *p); - Iterator(FactorizedShape const *p, int i); - - Iterator &operator++(); - Iterator operator++(int); - bool operator==(Iterator other) const; - bool operator!=(Iterator other) const; - reference operator*() const; - }; - - Iterator begin() const; - Iterator end() const; -}; - -using GridCoord = GridShape; - -struct CanCoord -{ - int w = 0; - int t = 0; - int rt = 0; - int ct = 0; - CanCoord(int w, int t, int rt, int ct) : w(w), t(t), rt(rt), ct(ct) {} - CanCoord(int t, int rt, int ct) : w(0), t(t), rt(rt), ct(ct) {} - bool operator==(CanCoord o) const { return w == o.w and t == o.t and rt == o.rt and ct == o.ct; } -}; - -union LinCoord -{ - struct - { - std::uint64_t grid_r : 16; - std::uint64_t grid_c : 16; - std::uint64_t address : 32; - } v; - std::uint64_t bits; - - LinCoord() - { - v.grid_r = std::numeric_limits::max(); - v.grid_c = std::numeric_limits::max(); - v.address = 0; - } - LinCoord(int grid_r, int grid_c, int address) - { - v.grid_r = grid_r; - v.grid_c = grid_c; - v.address = address; - } - LinCoord(GridCoord grid, int address) : LinCoord(grid.r, grid.c, address) {} - int grid_r() const { return (int)v.grid_r; } - int grid_c() const { return (int)v.grid_c; } - GridCoord grid_coord() const { return GridCoord(grid_r(), grid_c()); } - int address() const { return (int)v.address; } - LinCoord next() const - { - auto n = *this; - ++n.v.address; - return n; - } - bool operator==(LinCoord o) const { return bits == o.bits; } - bool operator!=(LinCoord o) const { return bits != o.bits; } - bool valid() const - { - return not( - v.grid_r == std::numeric_limits::max() and - v.grid_c == std::numeric_limits::max() and v.address == 0); - } - - private: - LinCoord(std::uint64_t bits) : bits(bits) {} -}; - -struct TileLayout -{ - GridShape grid_shape; - BlockShape block_shape; - graphlib::UBlockOrder ublock_order; - Padding padding; - - TileLayout( - GridShape grid_shape, 
BlockShape block_shape, graphlib::UBlockOrder ublock_order, Padding padding = Padding()) : - grid_shape(grid_shape), block_shape(block_shape), ublock_order(ublock_order), padding(padding) - { - } - - bool operator==(TileLayout const &other) const - { - return grid_shape == other.grid_shape and block_shape == other.block_shape and - ublock_order == other.ublock_order; - } - inline TensorShape shape() const { return TensorShape(w(), t(), rt(), ct()); } - inline int w() const { return 1; } - inline int t() const { return block_shape.t; } - inline int rt() const { return grid_shape.r * block_shape.rt(); } - inline int ct() const { return grid_shape.c * block_shape.ct(); } - inline int volume(bool include_t = true, bool include_padding = false) const - { - return (include_t ? t() : 1) * (rt() - int(not include_padding) * padding.rt) * - (ct() - int(not include_padding) * padding.ct); - } - LinCoord operator[](int idx) const; - LinCoord map(CanCoord can_coord) const; - CanCoord map(LinCoord lin_coord) const; - GridCoord grid_coord(CanCoord can_coord) const; - GridCoord grid_coord(LinCoord lin_coord) const; -}; - -struct Pipe -{ - TileLayout producer_layout; - TileLayout consumer_layout; - std::vector tms; - int producer_out_buf_mb; - - Pipe( - TileLayout producer_layout, - int producer_out_buf_mb, - std::vector tms, - TileLayout consumer_layout) : - producer_layout(producer_layout), - consumer_layout(consumer_layout), - tms(tms), - producer_out_buf_mb(producer_out_buf_mb) - { - } - - bool operator==(Pipe const &other) const - { - return producer_layout == other.producer_layout and consumer_layout == other.consumer_layout and - tms == other.tms and producer_out_buf_mb == other.producer_out_buf_mb; - } -}; - -struct ResourceUsage -{ - int producer_fan_out = 0; - int consumer_fan_in = 0; - int producer_phases = 0; - int consumer_phases = 0; -}; - -inline std::ostream &operator<<(std::ostream &os, CanCoord const &coord) -{ - os << "CanCoord{.w = " << coord.w << ", .t = " << coord.t << ", .rt = " << coord.rt << ", .ct = " << coord.ct - << "}"; - return os; -} - -inline std::ostream &operator<<(std::ostream &os, LinCoord const &coord) -{ - os << "LinCoord{.grid_r = " << coord.grid_r() << ", .grid_c = " << coord.grid_c() - << ", .address = " << coord.address() << "}"; - return os; -} - -inline std::ostream &operator<<(std::ostream &os, TileLayout const &layout) -{ - os << "TileLayout(GridShape(" << layout.grid_shape.r << ", " << layout.grid_shape.c << "), BlockShape(" - << layout.block_shape.t << ", " << layout.block_shape.m() << ", " << layout.block_shape.n() << ", UBlockShape(" - << layout.block_shape.ublock.rt << ", " << layout.block_shape.ublock.ct << ")), " - << (layout.ublock_order == graphlib::UBlockOrder::R ? 
"graphlib::UBlockOrder::R" : "graphlib::UBlockOrder::C"); - if (layout.padding.rt or layout.padding.ct) - os << ", Padding(" << layout.padding.rt << ", " << layout.padding.ct << ")"; - os << ")"; - return os; -} - -inline std::ostream &operator<<(std::ostream &os, Pipe const &pipe) -{ - os << "Pipe(" << pipe.producer_layout << ", " << pipe.producer_out_buf_mb << ", {"; - for (auto const &tm : pipe.tms) os << tm << ", "; - os << "}, " << pipe.consumer_layout << ")"; - return os; -} - -inline std::ostream &operator<<(std::ostream &os, TensorShape const &tensor_shape) -{ - os << "TensorShape{.w = " << tensor_shape.w << ", .z = " << tensor_shape.z << ", .rt = " << tensor_shape.rt - << ", .ct = " << tensor_shape.ct << "}"; - return os; -} - -inline std::ostream &ostream_with_indent(std::ostream &os, OpShape const &op_shape, char const *indent = "") -{ - os << indent << "OpShape{" << std::endl; - os << indent << " .inputs = {" << std::endl; - int i = 0; - for (TensorShape const &input : op_shape.inputs) - { - os << indent << " [" << i++ << "] = " << input << std::endl; - } - os << indent << " }," << std::endl; - os << indent << " .outputs = {" << std::endl; - i = 0; - for (TensorShape const &output : op_shape.outputs) - { - os << indent << " [" << i++ << "] = " << output << std::endl; - } - os << indent << " }," << std::endl; - os << indent << "}"; - return os; -} - -inline std::ostream &operator<<(std::ostream &os, OpShape const &op_shape) { return ostream_with_indent(os, op_shape); } - -inline std::ostream &operator<<(std::ostream &os, GridShape const &grid_shape) -{ - os << "GridShape{.r = " << grid_shape.r << ", .c = " << grid_shape.c << "}"; - return os; -} - -inline std::ostream &operator<<(std::ostream &os, Parallelization const &grid_shape) -{ - os << "Parallelization{.r = " << grid_shape.r << ", .c = " << grid_shape.c << "}"; - return os; -} - -inline std::ostream &operator<<(std::ostream &os, UBlockShape const &ublock) -{ - os << "UBlockShape{.rt = " << ublock.rt << ", .ct = " << ublock.ct << "}"; - return os; -} - -inline std::ostream &operator<<(std::ostream &os, BlockShape const &block_shape) -{ - os << "BlockShape{.t = " << block_shape.t << ", .mblock_m = " << (block_shape.mblock_m * block_shape.tblock_m) - << ", .mblock_n = " << (block_shape.mblock_n * block_shape.tblock_n) << ", .ublock = " << block_shape.ublock - << "}"; - return os; -} - -inline std::ostream &operator<<(std::ostream &os, TStreamDir const &dir) -{ - switch (dir.v) - { - case TStreamDir::R: os << "TStreamDir::R"; break; - case TStreamDir::C: os << "TStreamDir::C"; break; - case TStreamDir::RZ: os << "TStreamDir::RZ"; break; - case TStreamDir::CZ: os << "TStreamDir::CZ"; break; - default: os << "TStreamDir::Unknown"; break; - } - return os; -} - -inline std::ostream &operator<<(std::ostream &os, TStreamFactor const &tsf) -{ - os << "TStreamFactor{.dir = " << tsf.dir << ", .r = " << tsf.r << ", .c = " << tsf.c << "}"; - return os; -} - -inline std::ostream &operator<<(std::ostream &os, BufferModel const &buffer_model) -{ - os << "BufferModel{.block_shape = " << buffer_model.block_shape - << ", .buffer_factor = " << buffer_model.buffer_factor << ", .l1_size_tiles = " << buffer_model.l1_size_tiles - << ", .data_format = " << buffer_model.data_format << "}"; - return os; -} - -inline std::ostream &ostream_with_indent( - std::ostream &os, std::vector const &buffer_models, char const *indent = "") -{ - os << "{" << std::endl; - int i = 0; - for (BufferModel const &buffer_model : buffer_models) - { - if (not 
buffer_model) - continue; - os << indent << " [" << i++ << "] = " << buffer_model << std::endl; - } - os << indent << " }," << std::endl; - return os; -} - -inline std::ostream &ostream_with_indent(std::ostream &os, OpModel const &op_model, char const *indent = "") -{ - os << indent << "OpModel{" << std::endl; - os << indent << " .id = " << op_model.id.id << std::endl; - os << indent << " .grid_shape = " << op_model.grid_shape << std::endl; - os << indent << " .op_shape = "; - ostream_with_indent(os, op_model.op_shape, (std::string(" ") + indent).c_str()) << std::endl; - os << indent << " .op_type = " << op_model.op_type() << "," << std::endl; - os << indent << " .data_format = " << op_model.data_format << "," << std::endl; - os << indent << " .math_fidelity = " << op_model.math_fidelity() << "," << std::endl; - os << indent << " .t_stream_factor = " << op_model.t_stream_factor << "," << std::endl; - os << indent << " .fracture_factor = " << op_model.fracture_factor << "," << std::endl; - os << indent << " .cached_execution_cycles = " << op_model.cached_execution_cycles << "," << std::endl; - os << indent << " .input_buffers = "; - ostream_with_indent(os, op_model.input_buffers, indent); - os << indent << " .output_buffers = "; - ostream_with_indent(os, op_model.output_buffers, indent); - os << indent << " .parameter_buffers = "; - ostream_with_indent(os, op_model.parameter_buffers, indent); - os << indent << " .intermediate_buffers = "; - ostream_with_indent(os, op_model.intermediate_buffers, indent); - os << indent << " .dram_buffers = "; - ostream_with_indent(os, op_model.dram_buffers, indent); - os << indent << "}"; - return os; -} - -inline std::ostream &operator<<(std::ostream &os, OpModel const &op_model) { return ostream_with_indent(os, op_model); } - -inline std::ostream &operator<<(std::ostream &os, FusedSubOpModel const &sub_op_model) -{ - os << "FusedSubOpModel{" << std::endl; - os << " .type = " << sub_op_model.type << std::endl; - os << " .mblock_m = " << sub_op_model.mblock_m << std::endl; - os << " .mblock_n = " << sub_op_model.mblock_n << std::endl; - os << " .ublock_rt = " << sub_op_model.ublock_rt << std::endl; - os << " .ublock_ct = " << sub_op_model.ublock_ct << std::endl; - os << " .mblock_k = " << sub_op_model.mblock_k << std::endl; - os << " .ublock_kt = " << sub_op_model.ublock_kt << std::endl; - os << " .reduce_dim = " << sub_op_model.reduce_dim << std::endl; - os << " .has_dest_input = " << sub_op_model.has_dest_input << std::endl; - os << " .has_dest_output = " << sub_op_model.has_dest_output << std::endl; - os << "}"; - return os; -} - -inline std::ostream &operator<<(std::ostream &os, FactorizedInt const &fi) -{ - os << "{"; - for (auto i : fi.get_factors()) os << i << ", "; - os << "}"; - return os; -} - -inline std::ostream &operator<<(std::ostream &os, FactorizedShape const &gp) -{ - os << "{"; - for (Parallelization parallelization : gp) - { - os << parallelization << ", "; - } - os << "}"; - return os; -} - -inline std::ostream &operator<<(std::ostream &os, ResourceUsage const &ru) -{ - os << "ResourceUsage{.producer_fan_out=" << ru.producer_fan_out << ", .consumer_fan_in=" << ru.consumer_fan_in - << ", .producer_phases=" << ru.producer_phases << ", .consumer_phases=" << ru.consumer_phases << "}"; - return os; -} - -} // namespace tt::balancer - -namespace std -{ -template <> -struct hash<tt::balancer::TileLayout> -{ - std::size_t operator()(tt::balancer::TileLayout const &layout) const - { - std::size_t seed = 0; - // intentionally exclude edge_creation_id from the hash -
tt::hash_combine(seed, static_cast<std::size_t>(layout.grid_shape.r)); - tt::hash_combine(seed, static_cast<std::size_t>(layout.grid_shape.c)); - tt::hash_combine(seed, static_cast<std::size_t>(layout.block_shape.t)); - tt::hash_combine(seed, static_cast<std::size_t>(layout.block_shape.tblock_m)); - tt::hash_combine(seed, static_cast<std::size_t>(layout.block_shape.tblock_n)); - tt::hash_combine(seed, static_cast<std::size_t>(layout.block_shape.mblock_m)); - tt::hash_combine(seed, static_cast<std::size_t>(layout.block_shape.mblock_n)); - tt::hash_combine(seed, static_cast<std::size_t>(layout.block_shape.ublock.rt)); - tt::hash_combine(seed, static_cast<std::size_t>(layout.block_shape.ublock.ct)); - tt::hash_combine(seed, static_cast<std::size_t>(layout.ublock_order)); - return seed; - } -}; - -template <> -struct hash<tt::balancer::Pipe> -{ - std::size_t operator()(tt::balancer::Pipe const &pipe) const - { - std::size_t seed = 0; - // intentionally exclude edge_creation_id from the hash - tt::hash_combine(seed, hash<tt::balancer::TileLayout>{}(pipe.producer_layout)); - tt::hash_combine(seed, hash<tt::balancer::TileLayout>{}(pipe.consumer_layout)); - tt::hash_combine(seed, static_cast<std::size_t>(pipe.tms.size())); - tt::hash_combine(seed, static_cast<std::size_t>(pipe.producer_out_buf_mb)); - return seed; - } -}; -} // namespace std \ No newline at end of file diff --git a/pybuda/csrc/buda_passes.cpp b/pybuda/csrc/buda_passes.cpp index 2a1234262..4c5dddf0c 100644 --- a/pybuda/csrc/buda_passes.cpp +++ b/pybuda/csrc/buda_passes.cpp @@ -18,10 +18,9 @@ #include "passes/erase_consecutive_reshape.hpp" #include "passes/erase_inverse_ops.hpp" #include "passes/erase_unnecessary_4d_tm_sequence.hpp" +#include "passes/mlir_compiler.hpp" #include "passes/explicate_unsqueeze.hpp" -#include "passes/fork_join.hpp" #include "passes/fuse_conv2d_bias.hpp" -#include "passes/fuse_ops.hpp" #include "passes/fuse_pad_conv2d.hpp" #include "passes/fuse_per_channel_ops.hpp" #include "passes/fuse_redundant_tm_sequence.hpp" @@ -34,6 +33,7 @@ #include "passes/lower_concat_to_runtime_transform.hpp" #include "passes/lower_reinterpret_shape.hpp" #include "passes/lowering_context.hpp" +#include "passes/move_requantize.hpp" #include "passes/move_select_after_matmul_optional.hpp" #include "passes/pad_output_buffer.hpp" #include "passes/passes_utils.hpp" @@ -42,16 +42,8 @@ #include "passes/pre_placer_buda_passes.hpp" #include "passes/print_graph.hpp" #include "passes/replace_incommutable_patterns.hpp" -#include "passes/reproduce_subgraph.hpp" #include "passes/set_tile_dim.hpp" #include "passes/squeeze_to_reshape.hpp" -#include "passes/t_stream.hpp" -#include "perf_model/perf_model.hpp" -#include "placer/dram.hpp" -#include "placer/dram_allocator.hpp" -#include "placer/host_memory_allocator.hpp" -#include "placer/lower_to_placer.hpp" -#include "placer/utils.hpp" #include "python_bindings_common.hpp" #include "reportify/reportify.hpp" #include "utils/assert.hpp" @@ -103,7 +95,7 @@ run_post_initial_graph_passes(graphlib::Graph *graph, py::object compiler_cfg_ob return std::make_tuple(inserted_node_id_mapping, chip_id_assignments); } -void run_optimization_graph_passes(graphlib::Graph *graph, const DeviceConfig &device_config) +void run_optimization_graph_passes(graphlib::Graph *graph) { passes::print_graph(graph, "PRE OPTIMIZE"); passes::lower_concat_to_runtime_transform(graph); @@ -160,11 +152,11 @@ void run_optimization_graph_passes(graphlib::Graph *graph, const DeviceConfig &d passes::bypass_nop_tms(graph); } } + passes::move_tm_through_requantize(graph); recalculate_shapes(graph); passes::hoist_transforms_to_inputs(graph); passes::erase_consecutive_reshape(graph, true); - passes::pad_output_buffer(graph, device_config);
passes::lower_reinterpret_shape(graph); passes::bind_reshape_to_io(graph); @@ -201,9 +193,9 @@ std::vector> run_post_autograd_gra } // ********** Run pre-lowering passes ********** -void run_pre_lowering_passes(graphlib::Graph *graph) +graphlib::Graph* run_pre_lowering_passes(graphlib::Graph *graph) { - passes::print_graph(graph, "PRE_LOWERING"); + passes::print_graph(graph, "PRE_MLIR"); // Recalculate shapes, and figure out implicit broadcasts that are missing recalculate_shapes(graph); @@ -234,16 +226,15 @@ void run_pre_lowering_passes(graphlib::Graph *graph) // Fold tile broadcasts into reduce and inputs fold_tile_broadcast_ops_into_inputs(graph); fold_tile_broadcast_ops_into_reduce(graph); + + return graph; } // ********** Run lowering passes ********** -std::pair, placer::PlacerConfigUpdate> run_pre_placer_buda_passes( +std::unique_ptr run_pre_placer_buda_passes( graphlib::Graph *graph, - scheduler::SchedulerConfig scheduler_config, const DeviceConfig &device_config, std::vector chip_ids, - const placer::PredicatesToBreaks &predicates_to_chip_break, - const placer::PredicatesToBreaks &predicates_to_epoch_break, const std::vector &op_names_dont_fuse, const std::vector &op_names_manual_fuse, const passes::FractureChipIdAssignments &fracture_chip_id_assignments, @@ -256,10 +247,9 @@ std::pair, placer::PlacerConfigUpdate> run_pre_ const int amp_level, const bool enable_recompute, const bool output_queues_on_host, - const tt::ordered_map, InsInstructionUniqueIdHash> - &ins_instructions, + const bool input_queues_on_host, const std::vector> &insert_queues, - const std::vector &_properties, + std::vector amp_properties, const std::vector &op_intermediates_to_save, const bool use_interactive_placer, bool enable_device_tilize) @@ -280,21 +270,11 @@ std::pair, placer::PlacerConfigUpdate> run_pre_ // Remove nops remove_nops(lowered_graph.get()); - auto op_names_to_chip_break = placer::match_op_names_to_breaks(lowered_graph.get(), predicates_to_chip_break); - auto op_names_to_epoch_break = placer::match_op_names_to_breaks(lowered_graph.get(), predicates_to_epoch_break); - - // Fuse ops - if (enable_auto_fusing) + // Add buffer NOP between host input and ops if there are multiple ops reading from same host input. + // + if (input_queues_on_host and env_as("PYBUDA_ENABLE_HOST_INPUT_NOP_BUFFERING")) { - recalculate_shapes(lowered_graph.get()); - fuse_ops( - lowered_graph.get(), - device_config, - op_names_to_chip_break, - op_names_to_epoch_break, - op_names_dont_fuse, - op_names_manual_fuse, - amp_properties); + fix_host_inputs(lowered_graph.get()); } // Sanitize past cache IOs @@ -349,12 +329,6 @@ std::pair, placer::PlacerConfigUpdate> run_pre_ insert_partial_datacopy_tms(lowered_graph.get()); - // inserted fork-join NOPs - for (auto instruction : ins_instructions) - { - instruction.second->insert(lowered_graph.get()); - } - insert_user_defined_queues(lowered_graph.get(), insert_queues); // @@ -370,219 +344,6 @@ std::pair, placer::PlacerConfigUpdate> run_pre_ amp_level, amp_properties); - // At this point, there should be no more graph mutations. 
- placer::PlacerConfigUpdate placer_config_update = schedule_pre_placer_graph( - lowered_graph.get(), - device_config, - scheduler_config, - chip_ids, - op_names_to_chip_break, - op_names_to_epoch_break, - fracture_chip_id_assignments, - "" /* nops_remote_devices_postfix */, - use_interactive_placer); - - return std::make_pair(std::move(lowered_graph), placer_config_update); -} - -static std::vector initialize_dram_allocators(const DeviceConfig& device_config, const PostPlacerConfig& config, const std::string &graph_name, std::vector> &pre_allocated_blocks) -{ - placer::DRAMPlacementAlgorithm placement_algorithm = - env_as("PYBUDA_DRAM_CLOSEST") ? placer::CLOSEST : - env_as("PYBUDA_DRAM_PICK_CAPACITY") ? placer::GREATEST_CAPACITY : - env_as("PYBUDA_DRAM_FLIP_FLOP") ? placer::ROUND_ROBIN_FLIP_FLOP : - config.placement_algorithm; - std::vector chip_dram_allocators; - auto max_chip_id = *std::max_element(device_config.chip_ids.begin(),device_config.chip_ids.end()); - - if (pre_allocated_blocks.size() <= max_chip_id) - { - pre_allocated_blocks.resize(max_chip_id + 1); - } - - for (uint32_t chip_id = 0; chip_id <= max_chip_id; chip_id++) - { - chip_dram_allocators.emplace_back(config.dram_placer_config, graph_name, chip_id, pre_allocated_blocks[chip_id], placement_algorithm); - } - - return chip_dram_allocators; -} - -static bool tensix_datacopy_eth_link_serialization_enabled() { return env_as("PYBUDA_ENABLE_ETH_SERIALIZATION"); } -static bool eth_datacopy_link_serialization_enabled() { return env_as("PYBUDA_ENABLE_ETH_DATACOPY_SERIALIZATION"); } - -// ********** Run post-placer passes, like queue and buffer insertion ********** -PostPlacerResults run_post_placer_buda_passes( - graphlib::Graph *graph, - const std::string &graph_name, - const DeviceConfig &device_config, - placer::PlacerSolution &placer_solution, - PostPlacerConfig &config, - std::shared_ptr balancer_solution, - const tt::ordered_map, InsInstructionUniqueIdHash> - &previous_ins_instructions, - std::vector> &pre_allocated_blocks, - std::uint32_t last_host_address) -{ - set_prologue_queues(graph, balancer_solution->op_models); - - replace_recompute_with_checkpoint(graph, placer_solution); - - validate_subgraph_placement(graph, placer_solution); - - remove_buffering_queues_from_cross_epoch_edges(graph, placer_solution); - - // Insert queues between ops on different epochs. 
- insert_epoch_to_epoch_queues( - graph, - placer_solution, - {graphlib::NodeEpochType::Forward, graphlib::NodeEpochType::Backward, graphlib::Optimizer}, - balancer_solution->graph_solver_cut_edges); - - if (config.enable_t_streaming) - { - insert_t_stream_tms(graph, balancer_solution->op_models); - optimize_tms(graph); - recalculate_shapes(graph); - } - - // Set queue entry sizes based on the configuration for different types of queues - set_queue_sizes(graph, config, placer_solution); - - // Place and allocate DRAM queues - std::vector chip_dram_allocators = initialize_dram_allocators(device_config, config, graph_name, pre_allocated_blocks); - placer::HostMemoryAllocator host_memory_allocator(config.host_memory_placer_config, last_host_address); - - placer::place_host_queues( - config.host_memory_placer_config, host_memory_allocator, graph, placer_solution, *balancer_solution); - placer::place_dram_queues( - graph, - placer_solution, - *balancer_solution, - config.host_memory_placer_config, - config.dram_placer_config, - chip_dram_allocators); - - if (eth_datacopy_link_serialization_enabled()) { - TT_ASSERT(!tensix_datacopy_eth_link_serialization_enabled(), "Environment variables `PYBUDA_ENABLE_ETH_SERIALIZATION` and `PYBUDA_ENABLE_ETH_DATACOPY_SERIALIZATION` cannot be enabled at the same time."); - // eth data copy ethernet stream serialization can be run prioer to dram queue allocation - // Reduce ethernet stream usage runs after we insert queues because we may need to apply T streaming to e2e queues if they are on a remote chip - // from their producer - reduce_ethernet_stream_usage(config, graph, *(balancer_solution.get()), placer_solution, device_config); - recalculate_shapes(graph); - - // allocate the new or flattened/serialized queues - placer::place_host_queues( - config.host_memory_placer_config, host_memory_allocator, graph, placer_solution, *balancer_solution); - placer::place_dram_queues( - graph, - placer_solution, - *balancer_solution, - config.host_memory_placer_config, - config.dram_placer_config, - chip_dram_allocators); - } - - if (tensix_datacopy_eth_link_serialization_enabled()) - { - // Reduce ethernet stream usage runs after we insert queues because we may need to apply T streaming to e2e queues if they are on a remote chip - // from their producer - reduce_ethernet_stream_usage(config, graph, *(balancer_solution.get()), placer_solution, device_config); - recalculate_shapes(graph); - - // allocate the new or flattened/serialized queues - placer::place_host_queues( - config.host_memory_placer_config, host_memory_allocator, graph, placer_solution, *balancer_solution); - placer::place_dram_queues( - graph, - placer_solution, - *balancer_solution, - config.host_memory_placer_config, - config.dram_placer_config, - chip_dram_allocators); - } - - // Lower additional buda attrs post placer - post_placer_lower_buda_attrs(graph, device_config, balancer_solution->op_models); - passes::validate_post_placer_data_formats(graph, device_config); - - PostPlacerResults results; - - results.current_host_address = host_memory_allocator.get_current_allocation_address(); - std::vector> allocated_blocks; - for (auto &chip_dram_allocator : chip_dram_allocators) - { - allocated_blocks.push_back(chip_dram_allocator.get_blocks()); - } - results.allocated_blocks = allocated_blocks; - - if (!balancer_solution->placer_solution.fork_join_buffered) - { - // Add fork/join buffering, post-placement. 
- // - FJBufferingResult fj_buffering; - fj_buffering = insert_fork_join_buffering( - graph, - &balancer_solution->op_models, - nullptr /* inline op models */, - config.device_config.get_l1_usable_size(), - previous_ins_instructions, - config.fork_join_tiles_treshold - ); - - results.ins_instructions = fj_buffering.instructions; - - if (!std::get<0>(is_subset_of_instructions(results.ins_instructions, previous_ins_instructions))) - { - return results; // return here, since we'll have to redo anyway - } - } - - if (env_as("PYBUDA_UPSIZE_DRAM_INPUT")) - { - upsize_dram_input(graph, balancer_solution->op_models, config.device_config.get_l1_usable_size()); - } - - validate_multichip_queue_placements(config, graph, placer_solution); - - // Estimate model performance - results.perf_model_results = perf_model::run_performance_model(graph, graph_name, device_config, balancer_solution); - - return results; -} - -// ********** Last chance to run any non-iterative passes before netlist is generated ********** -void run_pre_netlist_generation_buda_passes( - graphlib::Graph *graph, - const std::string &graph_name, - const DeviceConfig &device_config, - std::unordered_map intermediates, - placer::PlacerSolution &placer_solution, - PostPlacerConfig &config, - std::shared_ptr balancer_solution, - std::vector> &pre_allocated_blocks, - std::uint32_t last_host_address - ) -{ - if (env_as("PYBUDA_REPRODUCE_SUBGRAPH")) - { - std::string input_name = env_as ("PYBUDA_REPRODUCE_SUBGRAPH_INPUT"); - std::string output_name = env_as ("PYBUDA_REPRODUCE_SUBGRAPH_OUTPUT"); - passes::reproduce_subgraph(graph, input_name, output_name, intermediates, balancer_solution, &placer_solution); - // cutting graph will change the shape of some queuues and add new ones, so we re-place - std::vector chip_dram_allocators = initialize_dram_allocators(device_config, config, graph_name, pre_allocated_blocks); - placer::HostMemoryAllocator host_memory_allocator(config.host_memory_placer_config, last_host_address); - - placer::place_host_queues( - config.host_memory_placer_config, host_memory_allocator, graph, placer_solution, *balancer_solution); - placer::place_dram_queues( - graph, - placer_solution, - *balancer_solution, - config.host_memory_placer_config, - config.dram_placer_config, - chip_dram_allocators); - } - return; + return lowered_graph; } } // namespace tt diff --git a/pybuda/csrc/buda_passes.hpp b/pybuda/csrc/buda_passes.hpp index 1598f6bbd..819769d75 100644 --- a/pybuda/csrc/buda_passes.hpp +++ b/pybuda/csrc/buda_passes.hpp @@ -3,44 +3,33 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once -#include "balancer/balancer.hpp" #include "graph_lib/node.hpp" #include "graph_lib/node_types.hpp" -#include "passes/fork_join.hpp" -#include "passes/fracture.hpp" #include "passes/dataformat.hpp" -#include "passes/post_placer_buda_passes.hpp" -#include "placer/chip_id_assignment.hpp" -#include "placer/dram.hpp" -#include "placer/dram_allocator.hpp" -#include "placer/placer.hpp" -#include "scheduler/scheduler.hpp" +#include "passes/fracture.hpp" #include "utils/ordered_associative_containers/ordered_map.hpp" namespace tt { using NodeId = graphlib::NodeId; using PortId = graphlib::PortId; -void lower_reshape(Graph *, graphlib::OpNode *node); // unused +void lower_reshape(graphlib::Graph *, graphlib::OpNode *node); // unused // Run post initial graph passes std::tuple>, passes::FractureChipIdAssignments> run_post_initial_graph_passes( graphlib::Graph *graph, py::object compiler_cfg_object, passes::FractureGroups const 
&fracture_groups); -void run_optimization_graph_passes(graphlib::Graph *graph, const DeviceConfig &device_config); +void run_optimization_graph_passes(graphlib::Graph *graph); std::vector> run_post_optimize_decompose_graph_passes( graphlib::Graph *graph, py::object compiler_cfg_object); std::vector> run_post_autograd_graph_passes( graphlib::Graph *graph, py::object compiler_cfg_object); // Run lowering passes -std::pair, placer::PlacerConfigUpdate> run_pre_placer_buda_passes( +std::unique_ptr run_pre_placer_buda_passes( graphlib::Graph *graph, - scheduler::SchedulerConfig scheduler_config, const DeviceConfig &device_config, std::vector chip_ids = {0}, - const placer::PredicatesToBreaks &predicates_to_chip_break = {}, - const placer::PredicatesToBreaks &predicates_to_epoch_break = {}, const std::vector &op_names_dont_fuse = {}, const std::vector &op_names_manual_fuse = {}, const passes::FractureChipIdAssignments &fracture_chip_id_assignments = {}, @@ -53,48 +42,14 @@ std::pair, placer::PlacerConfigUpdate> run_pre_ const int amp_level = 0, const bool enable_recompute = false, const bool output_queues_on_host = true, - const tt::ordered_map, InsInstructionUniqueIdHash> - &ins_instructions = {}, + const bool input_queues_on_host = true, const std::vector> &insert_queues = {}, - const std::vector &_properties = {}, + std::vector amp_properties = {}, const std::vector &op_intermediates_to_save = {}, bool use_interactive_placer = true, bool enable_device_tilize = false); -struct PostPlacerResults -{ - std::unordered_map perf_model_results; - tt::ordered_map, InsInstructionUniqueIdHash> - ins_instructions; - std::vector> allocated_blocks; - std::uint32_t current_host_address; -}; - -// Run post-placer passes, like queue and buffer insertion. Return perf model results, if applicable. 
-PostPlacerResults run_post_placer_buda_passes( - graphlib::Graph *graph, - const std::string &graph_name, - const DeviceConfig &device_config, - placer::PlacerSolution &placer_solution, - PostPlacerConfig &config, - std::shared_ptr balancer_solution, - const tt::ordered_map, InsInstructionUniqueIdHash> - &previous_ins_instructions, - std::vector> &pre_allocated_blocks, - std::uint32_t last_host_address); - -// Last minute changes before netlist generation -void run_pre_netlist_generation_buda_passes( - graphlib::Graph *graph, - const std::string &graph_name, - const DeviceConfig &device_config, - std::unordered_map intermediates, - placer::PlacerSolution &placer_solution, - PostPlacerConfig &config, - std::shared_ptr balancer_solution, - std::vector> &pre_allocated_blocks, - std::uint32_t last_host_address); -// Pre-lowering passes, last-minute changes before going to buda ops -void run_pre_lowering_passes(graphlib::Graph *graph); +// Pre-lowering passes, last-minute changes before going to MLIR +graphlib::Graph* run_pre_lowering_passes(graphlib::Graph *graph); } diff --git a/pybuda/csrc/graph_lib/CMakeLists.txt b/pybuda/csrc/graph_lib/CMakeLists.txt new file mode 100644 index 000000000..b3b0549aa --- /dev/null +++ b/pybuda/csrc/graph_lib/CMakeLists.txt @@ -0,0 +1,13 @@ +add_library(graph + STATIC + defines.cpp + edge.cpp + graph.cpp + node.cpp + node_types.cpp + shape.cpp + utils.cpp + python_bindings.cpp) + +target_compile_options(graph PRIVATE ${STATIC_LIB_FLAGS} ${PYBUDA_CSRC_CFLAGS}) + diff --git a/pybuda/csrc/graph_lib/edge.hpp b/pybuda/csrc/graph_lib/edge.hpp index 7a16f1f40..ee63d2539 100644 --- a/pybuda/csrc/graph_lib/edge.hpp +++ b/pybuda/csrc/graph_lib/edge.hpp @@ -25,6 +25,7 @@ enum class EdgeType { kAutogradOutputToLoss = 8, kAutogradInputToGradientOut = 9, kPartialDataCopy = 10, + kSubgraphLink = 11, }; using EdgeUniqueId = std::tuple; using EdgeCreationId = std::int64_t; @@ -88,6 +89,7 @@ inline std::string edge_type_to_string(const EdgeType& edge_type) case EdgeType::kAutogradInputToGradientOut: retstring = "AutogradInputToGradientOut"; break; case EdgeType::kControlLoop: retstring = "kControlLoop"; break; case EdgeType::kPartialDataCopy: retstring = "kPartialDataCopy"; break; + case EdgeType::kSubgraphLink: retstring = "kSubgraphLink"; break; default: assert(false && "Unimplemented edge_type ostream"); } return retstring; diff --git a/pybuda/csrc/graph_lib/graph.cpp b/pybuda/csrc/graph_lib/graph.cpp index 69e8e5e2a..125ade3c6 100644 --- a/pybuda/csrc/graph_lib/graph.cpp +++ b/pybuda/csrc/graph_lib/graph.cpp @@ -539,6 +539,9 @@ std::shared_ptr Graph::remove_edge(const Edge &edge) for (auto &operand_edge : operand_edges_to_remove) { this->operands_map_[edge.consumer_node_id].erase(operand_edge); } + + TT_DBG_ASSERT(operand_edges_to_remove.size(), "Edge not found in graph"); + return attr; } @@ -586,8 +589,13 @@ void Graph::update_node_name(Node *node, const std::string &new_name) { node_name_to_node_id_[new_name] = node_id; } -void Graph::register_module_inputs(const std::vector& module_inputs) { - this->ordered_module_input_node_ids_ = module_inputs; +void Graph::register_module_inputs(const std::vector& module_inputs, bool append) { + if (!append) { + this->ordered_module_input_node_ids_.clear(); + } + for (NodeId module_input : module_inputs) { + this->ordered_module_input_node_ids_.push_back(module_input); + } } std::size_t Graph::remove_module_input(NodeId input) @@ -668,9 +676,14 @@ void Graph::copy_module_targets(Graph *old_graph, const std::unordered_map& 
module_outputs, std::vector requires_grad) { +void Graph::register_module_outputs(const std::vector& module_outputs, std::vector requires_grad, bool append) { TT_ASSERT(module_outputs.size() == requires_grad.size()); - this->ordered_module_output_node_ids_ = module_outputs; + if (!append) { + this->ordered_module_output_node_ids_.clear(); + } + for (NodeId module_output : module_outputs) { + this->ordered_module_output_node_ids_.push_back(module_output); + } for (std::size_t i=0; i < module_outputs.size(); i++) { OutputNode *out = node_by_id(module_outputs[i])->as(); diff --git a/pybuda/csrc/graph_lib/graph.hpp b/pybuda/csrc/graph_lib/graph.hpp index 31a5fed3f..92ae66a74 100644 --- a/pybuda/csrc/graph_lib/graph.hpp +++ b/pybuda/csrc/graph_lib/graph.hpp @@ -18,7 +18,7 @@ #include "graph_lib/edge.hpp" // Jumping through some hoops to allow modifiable edge attributes -struct EdgeUniqueIdHash : public std::unary_function +struct EdgeUniqueIdHash { std::size_t operator()(const tt::graphlib::EdgeUniqueId &edge) const { @@ -36,11 +36,6 @@ struct EdgeUniqueIdHash : public std::unary_function &module_inputs); - void register_module_outputs(const std::vector &module_outputs, std::vector requires_grad); + void register_module_inputs(const std::vector &module_inputs, bool append = false); + void register_module_outputs(const std::vector &module_outputs, std::vector requires_grad, bool append = false); void register_module_targets(const std::vector &module_targets); void copy_module_inputs(Graph *old_graph, const std::unordered_map &old_to_new); void copy_module_outputs(Graph *old_graph, const std::unordered_map &old_to_new); @@ -329,7 +324,6 @@ class Graph std::unordered_set virtual_nodes_; friend class GraphTraversalContext; - friend class tt::balancer::legalizer::GraphSolver; }; template diff --git a/pybuda/csrc/graph_lib/module.mk b/pybuda/csrc/graph_lib/module.mk index c2ef75819..9df49e78c 100644 --- a/pybuda/csrc/graph_lib/module.mk +++ b/pybuda/csrc/graph_lib/module.mk @@ -24,7 +24,7 @@ $(PYBUDA_CSRC_GRAPH_LIB): $(PYBUDA_CSRC_GRAPH_LIB_OBJS) @mkdir -p $(LIBDIR) ar rcs $@ $^ -$(OBJDIR)/pybuda/csrc/graph_lib/%.o: pybuda/csrc/graph_lib/%.cpp +$(OBJDIR)/pybuda/csrc/graph_lib/%.o: pybuda/csrc/graph_lib/%.cpp $(PYTHON_ENV) @mkdir -p $(@D) $(CXX) $(PYBUDA_CSRC_CFLAGS) $(CXXFLAGS) $(STATIC_LIB_FLAGS) $(PYBUDA_CSRC_GRAPH_LIB_INCLUDES) -c -o $@ $< diff --git a/pybuda/csrc/graph_lib/node_types.cpp b/pybuda/csrc/graph_lib/node_types.cpp index 74bc4facb..5f298836c 100644 --- a/pybuda/csrc/graph_lib/node_types.cpp +++ b/pybuda/csrc/graph_lib/node_types.cpp @@ -23,7 +23,7 @@ template<> const TaggedNode* Node::as() const { const TaggedNode* tagged_node = dynamic_cast(this); TT_ASSERT(tagged_node != nullptr); - return dynamic_cast(this); + return tagged_node; } template<> const OpNode* Node::as() const { diff --git a/pybuda/csrc/graph_lib/node_types.hpp b/pybuda/csrc/graph_lib/node_types.hpp index b3b67a12c..cf62fb3ae 100644 --- a/pybuda/csrc/graph_lib/node_types.hpp +++ b/pybuda/csrc/graph_lib/node_types.hpp @@ -16,13 +16,9 @@ #include "graph_lib/utils.hpp" #include "lower_to_buda/common.hpp" #include "shared_utils/sparse_matmul_utils.hpp" -#include "utils/small_vector.hpp" -namespace tt::balancer { -class BudaOpNodeLegalizerFailureInfo; -} - -namespace tt { +namespace tt +{ class FusedOp; namespace graphlib { @@ -367,6 +363,18 @@ struct OpType ret += std::to_string(std::get(attr[i])) + ","; } else if (std::holds_alternative(attr[i])) { ret += std::get(attr[i]) + ","; + } else if 
(std::holds_alternative>(attr[i])) { + auto attr_val = std::get>(attr[i]); + size_t num_items = attr_val.size(); + + ret += "["; + for (size_t j = 0 ; j < num_items; ++j) + { + ret += std::to_string(attr_val[j]); + if (j < num_items-1) + ret += ", "; + } + ret += "], "; } else { TT_ASSERT(false, "Unknown alternative in Attr"); } @@ -444,6 +452,12 @@ class OpNode : public TaggedNode bool is_splice() const { return op_name() == "splice"; } bool is_tilize() const {return op_name().find("tilizer")!= std::string::npos;} bool is_reduce() const { return op_name() == "reduce"; } + bool is_add() const { return op_name() == "add"; } + bool is_maximum() const { return op_name() == "maximum"; } + bool is_quantization() const { return op_name() == "quantization"; } + bool is_dequantization() const { return op_name() == "dequantization"; } + bool is_requantization() const { return op_name() == "requantization"; } + bool is_quantization_related_op() const { return is_quantization() or is_dequantization() or is_requantization(); } bool is_dense_matmul() const { return is_matmul() and not is_sparse_matmul() and not is_depthwise_matmul(); } bool is_sparse_matmul() const { return is_matmul() and (buda_attrs().find("identity") != buda_attrs().end()); } bool is_depthwise_matmul() const { return op_name().compare("depthwise") == 0; } @@ -499,16 +513,8 @@ class BudaOpNode : public OpNode { virtual std::unique_ptr clone(std::string const& name = "") override; - void set_fused_op(std::shared_ptr fused_op) { fused_op_ = fused_op; } - bool is_fused_op() const { return fused_op_ != nullptr; } - std::shared_ptr get_fused_op() const { TT_ASSERT(fused_op_ != nullptr); return fused_op_; } - void set_buffering_op(bool buffering_op) { buffering_op_ = buffering_op; } bool is_buffering_op() const { return buffering_op_; } - - #ifdef DEBUG - std::shared_ptr leg_debug_info = nullptr; - #endif }; class BudaNaryTMNode : public Node @@ -576,6 +582,11 @@ class EdgeAttributes { return std::find_if(tms.begin(), tms.end(), [tm](OpType const &op) { return op.op == tm; }) != tms.end(); } + bool operator==(EdgeAttributes const &other) const + { + return edge_type_ == other.edge_type_ and tms == other.tms and ublock_order == other.ublock_order; + } + static std::shared_ptr create(EdgeType edge_type); // Checked casting to sub-node type diff --git a/pybuda/csrc/graph_lib/python_bindings.cpp b/pybuda/csrc/graph_lib/python_bindings.cpp index 72ff2d6cc..a60263f71 100644 --- a/pybuda/csrc/graph_lib/python_bindings.cpp +++ b/pybuda/csrc/graph_lib/python_bindings.cpp @@ -9,13 +9,11 @@ #include #include "autograd/autograd.hpp" -#include "balancer/balancer.hpp" #include "graph_lib/graph.hpp" #include "graph_lib/node_types.hpp" #include "graph_lib/query.hpp" #include "graph_lib/utils.hpp" #include "json.hpp" -#include "passes/fuse_ops.hpp" #include "pybind11_json.hpp" #include "python_bindings_common.hpp" #include "reportify/reportify.hpp" @@ -43,7 +41,6 @@ eval_graph( const std::unordered_map &intermediate_golden_tensors, const std::vector &losses, const std::vector &targets, - std::shared_ptr balancer_solution, float relative_atol, float pcc, std::string const &dump_tensors_path, @@ -72,14 +69,19 @@ void GraphModule(py::module &m_graph) .def("enable_training", &Graph::enable_training) .def("set_microbatch", &Graph::set_microbatch) .def("get_microbatch", &Graph::get_microbatch) - .def("nodes", [](const Graph &self) { - std::vector nodes = self.nodes(); - std::vector names; - std::transform(nodes.begin(), nodes.end(), std::back_inserter(names), 
[](graphlib::Node* node) { - return node->name(); - }); - return names; - }) + .def( + "nodes", + [](const Graph &self) + { + std::vector<Node *> nodes = self.nodes(); + std::vector<std::string> names; + std::transform( + nodes.begin(), + nodes.end(), + std::back_inserter(names), + [](graphlib::Node *node) { return node->name(); }); + return names; + }) .def("get_ordered_input_names", &Graph::get_ordered_input_names) .def("get_ordered_intermediate_names", &Graph::get_ordered_intermediate_names) .def("get_ordered_output_names", &Graph::get_ordered_output_names) @@ -99,8 +101,17 @@ void GraphModule(py::module &m_graph) py::arg("recurse") = false) .def("get_subgraph_id_for_node", &Graph::get_subgraph_id_for_node) .def("get_parameter_nodes", &Graph::get_parameter_nodes, py::return_value_policy::reference) - .def("register_module_inputs", &Graph::register_module_inputs) - .def("register_module_outputs", &Graph::register_module_outputs) + .def( + "register_module_inputs", + &Graph::register_module_inputs, + py::arg("module_inputs"), + py::arg("append") = false) + .def( + "register_module_outputs", + &Graph::register_module_outputs, + py::arg("module_outputs"), + py::arg("requires_grad"), + py::arg("append") = false) .def("register_module_targets", &Graph::register_module_targets) .def("get_ordered_input_shapes", &Graph::get_ordered_input_shapes) .def("get_ordered_output_shapes", &Graph::get_ordered_output_shapes) @@ -154,48 +165,9 @@ void GraphModule(py::module &m_graph) transforms.push_back(transform); } return transforms; - }) - // Return information about fused ops and their schedule. Currently used purely for test verification, - // i.e. to ensure that fusing occurred exactly in the way that was expected - .def( - "get_fused_ops", - [](Graph *graph) - { - std::vector> // schedules - >> - ret; - - for (Node *node : graph->nodes()) - { - if (node->node_type() != graphlib::kBudaOp) - continue; - graphlib::BudaOpNode *op = node->as<graphlib::BudaOpNode>(); - if (!op->is_fused_op()) - continue; - - auto f = op->get_fused_op(); - auto s = f->get_schedules(); - std::vector<std::vector<std::string>> schedules; - for (auto sch : s) - { - std::vector<std::string> schedule; - for (FusedSubOp subop : sch.ops) schedule.push_back(subop.op_type.op); - schedules.push_back(schedule); - } - - std::tuple< - std::uint32_t, // inputs - std::vector> // schedules - > - op_data = {f->get_input_count(), schedules}; - ret.push_back(op_data); - } - return ret; }); - py::class_ shape(m_graph, "Shape"); + py::class_ shape(m_graph, "Shape"); shape.def_property_readonly("v", [](Shape const &shape) { return shape[-5]; }) .def_property_readonly("w", [](Shape const &shape) { return shape[-4]; }) .def_property_readonly("z", [](Shape const &shape) { return shape[-3]; }) @@ -207,15 +179,15 @@ void GraphModule(py::module &m_graph) .def("get_tile_height", &Shape::get_tile_height) .def("get_tile_width", &Shape::get_tile_width) .def_static("create", &Shape::create, py::arg("values")) - .def_static( - "create_buda", py::overload_cast, int, int>(&Shape::create_buda)) + .def_static("create_buda", py::overload_cast, int, int>(&Shape::create_buda)) .def_static( "create_with_type_from_other", Shape::create_with_type_from_other, py::arg("other"), py::arg("values")) .def("len", [](Shape const &shape) { return shape.as_vector().size(); }) .def("__len__", [](Shape const &shape) { return shape.as_vector().size(); }) - .def("__iter__", [](Shape &shape) { - return py::make_iterator(shape.begin(), shape.end()); - }, py::keep_alive<0, 1>()) + .def( + "__iter__", + [](Shape &shape) { return py::make_iterator(shape.begin(),
shape.end()); }, + py::keep_alive<0, 1>()) .def("as_list", [](Shape const &shape) { return shape.as_vector(); }) .def("__getitem__", [](Shape const &shape, int idx) { return shape[idx]; }) .def("__setitem__", [](Shape &shape, int idx, std::uint32_t val) { shape[idx] = val; }) @@ -244,7 +216,13 @@ void GraphModule(py::module &m_graph) })) .def("__get_pickle_data", &Shape::get_pickle_data) .def_static("__create_from_pickled", &Shape::create_from_pickled) - .def("to_json", [](Shape const &shape) { json j = shape; return j; }) + .def( + "to_json", + [](Shape const &shape) + { + json j = shape; + return j; + }) .def_static("from_json", [](json j) { return j.get(); }); py::enum_(shape, "Type") @@ -495,6 +473,20 @@ void GraphModule(py::module &m_graph) input->as()->tag("dont_consteval", "true"); }); + m_graph.def("add_subgraph_io_link_edge", []( + Graph *graph, + const graphlib::NodeId start, + int out_port_id, + const graphlib::NodeId end, + int in_port_id) + { + graphlib::Edge edge(start, (graphlib::PortId)out_port_id, end, (graphlib::PortId)in_port_id, graphlib::EdgeType::kSubgraphLink); + graph->add_edge(edge); + // Disable consteval for partial datacopy inputs + graphlib::Node *input = graph->node_by_id(end); + input->as()->tag("dont_consteval", "true"); + }); + m_graph.def("create_control_edge", []( Graph *graph, const graphlib::NodeId start, @@ -525,11 +517,10 @@ void GraphModule(py::module &m_graph) const std::unordered_map &intermediate_golden_tensors, const std::vector &losses, const std::vector &targets, - std::shared_ptr balancer_solution, std::string const& dump_tensors_path, bool allow_modified_shapes) { - auto [ret, fwd_to_gradient_mapping, bwd_gradients, updated_parameter_mapping, intermediate_tensors] = eval_graph(graph, inputs, parameters, tt_device, intermediate_golden_tensors, losses, targets, balancer_solution, relative_atol, pcc, dump_tensors_path, allow_modified_shapes, true); + auto [ret, fwd_to_gradient_mapping, bwd_gradients, updated_parameter_mapping, intermediate_tensors] = eval_graph(graph, inputs, parameters, tt_device, intermediate_golden_tensors, losses, targets, relative_atol, pcc, dump_tensors_path, allow_modified_shapes, true); return intermediate_tensors; }, py::arg("graph"), @@ -541,7 +532,6 @@ void GraphModule(py::module &m_graph) py::arg("intermediate_golden_tensors") = std::unordered_map(), py::arg("losses") = std::vector(), py::arg("targets") = std::vector(), - py::arg("balancer_solution") = nullptr, py::arg("dump_tensors_path") = "", py::arg("allow_modified_shapes") = false ); @@ -557,10 +547,9 @@ void GraphModule(py::module &m_graph) const std::unordered_map &intermediate_golden_tensors, const std::vector &losses, const std::vector &targets, - std::shared_ptr balancer_solution, std::string const& dump_tensors_path, bool allow_modified_shapes) { - auto ret = eval_graph(graph, inputs, parameters, tt_device, intermediate_golden_tensors, losses, targets, balancer_solution, relative_atol, pcc, dump_tensors_path, allow_modified_shapes, false); + auto ret = eval_graph(graph, inputs, parameters, tt_device, intermediate_golden_tensors, losses, targets, relative_atol, pcc, dump_tensors_path, allow_modified_shapes, false); return std::make_tuple(std::get<0>(ret), std::get<1>(ret), std::get<2>(ret), std::get<3>(ret)); }, @@ -573,7 +562,6 @@ void GraphModule(py::module &m_graph) py::arg("intermediate_golden_tensors") = std::unordered_map(), py::arg("losses") = std::vector(), py::arg("targets") = std::vector(), - py::arg("balancer_solution") = nullptr, 
py::arg("dump_tensors_path") = "", py::arg("allow_modified_shapes") = false ); @@ -677,9 +665,9 @@ py::object eval_relu(py::object tensor, graphlib::OpType type) float relu_threshold = (type.buda_attrs.find("relu_threshold") != type.buda_attrs.end()) ? std::get(type.buda_attrs["relu_threshold"]) : 0.0; - string relu_mode = (type.buda_attrs.find("relu_mode") != type.buda_attrs.end()) - ? std::get(type.buda_attrs["relu_mode"]) - : "min"; + std::string relu_mode = (type.buda_attrs.find("relu_mode") != type.buda_attrs.end()) + ? std::get(type.buda_attrs["relu_mode"]) + : "min"; graphlib::OpType relu("relu", {relu_threshold, relu_mode}); tensor = eval_op(relu, inputs, graphlib::IRLevel::IR_PYBUDA); @@ -687,116 +675,14 @@ py::object eval_relu(py::object tensor, graphlib::OpType type) return tensor; } -py::object eval_fused_op(std::shared_ptr fused_op, std::vector inputs) +py::object eval_t_streaming_tms(py::object tensor, graphlib::Graph *graph, graphlib::Node *node, std::string const &dir) { - std::unordered_map buffers; - std::optional dest = std::nullopt; - for (auto schedule : fused_op->get_schedules()) - { - for (auto sub_op : schedule.ops) - { - std::vector sub_op_inputs; - - for (FusedSubOpInput i : sub_op.inputs) - { - if (i.type == FusedSubOpInput::InputType::INPUT) { - TT_ASSERT(i.index < inputs.size(), "Refering to input that doesn't exist for fused op"); - sub_op_inputs.push_back(inputs.at(i.index)); - } - else if (i.type == FusedSubOpInput::InputType::DEST) { - TT_ASSERT(dest.has_value()); - sub_op_inputs.push_back(dest.value()); - dest = std::nullopt; // done with reuse - } - else { - auto it = buffers.find(i.index); - TT_ASSERT(it != buffers.end(), "Referring to intermediate buffer that doesn't exist"); - sub_op_inputs.push_back(it->second); - } - - // In case the input for this sub_op is from the input buffer, - // we don't need to apply any tms (they were applied before this method). - if (i.type == FusedSubOpInput::InputType::INPUT) - continue; - - auto input = sub_op_inputs.back(); - - // Apply needed tms... - if (i.has_tile_broadcast()) - { - int tile_broadcast_dim = i.tile_broadcast.first ? 
2 : 3; - graphlib::OpType op = graphlib::OpType("tile_broadcast", {tile_broadcast_dim}); - input = eval_op(op, {input}, graphlib::IRLevel::IR_BUDA); - } - - if (i.has_broadcast()) - { - int broadcast_dim = i.broadcast.first; - int broadcast_factor = i.broadcast.second; - - graphlib::OpType op = graphlib::OpType("broadcast", {broadcast_dim, broadcast_factor}); - input = eval_op(op, {input}, graphlib::IRLevel::IR_BUDA); - } - - sub_op_inputs.pop_back(); - sub_op_inputs.emplace_back(input); - } - - py::object result = eval_op(sub_op.op_type, sub_op_inputs, graphlib::IRLevel::IR_BUDA); - - if (sub_op.output_type == FusedSubOp::OutputType::OUTPUT) - return result; - else if (sub_op.output_type == FusedSubOp::OutputType::DEST) - dest = result; - else { - // intermed buffer - if (buffers.count((std::uint32_t)sub_op.output_buffer) == 0) - buffers.insert(std::make_pair((std::uint32_t)sub_op.output_buffer, result)); - else - buffers[(std::uint32_t)sub_op.output_buffer] = result; - } - } - } - TT_THROW("Evaluated the full fused op, but haven't reached the output."); - return py::none(); -} - -py::object eval_t_streaming_tms( - py::object tensor, - graphlib::Graph *graph, - graphlib::Node *node, - std::shared_ptr balancer_solution, - std::string const &dir) -{ - if (not balancer_solution) - { - return tensor; - } - - auto match = balancer_solution->op_models.find(node->name()); - if (match != balancer_solution->op_models.end()) - { - std::vector t_streaming_tms = calculate_t_streaming_tms(graph, node, match->second); - if (not t_streaming_tms.empty()) - { - log_trace(LogEval, "{} t streaming: {}", dir, node->name()); - } - for (auto tm : t_streaming_tms) - { - tensor = eval_op(tm, {tensor}, graph->get_ir_level()); - } - } - return tensor; } -py::object eval_t_streaming_tms( - py::object tensor, - graphlib::Graph *graph, - graphlib::Node *node, - std::shared_ptr balancer_solution) +py::object eval_t_streaming_tms(py::object tensor, graphlib::Graph *graph, graphlib::Node *node) { - return eval_t_streaming_tms(tensor, graph, node, balancer_solution, "Redo"); + return eval_t_streaming_tms(tensor, graph, node, "Redo"); } py::object eval_golden_transforms(graphlib::Node *node, py::object tensor, bool eval_for_output = false) @@ -1397,7 +1283,6 @@ eval_graph( const std::unordered_map &intermediate_golden_tensors, const std::vector &losses, const std::vector &targets, - std::shared_ptr balancer_solution, float relative_atol, float pcc, std::string const &dump_tensors_path, @@ -1550,10 +1435,8 @@ eval_graph( std::vector inputs = eval_operand_tms(graph, node, node_outputs); - bool is_fused_op = (node->node_type() == graphlib::kBudaOp) && node->as()->is_fused_op(); - py::object obj = - is_fused_op ? 
eval_fused_op(node->as()->get_fused_op(), inputs) : - eval_op(op_node->op_type(), inputs, graph->get_ir_level(), false); // Don't Eval relu for intermediate checking + py::object obj = eval_op( + op_node->op_type(), inputs, graph->get_ir_level(), false); // Don't Eval relu for intermediate checking auto gradient_edges = graph->operand_edges(node, [](const auto& edge) { return edge.edge_type == graphlib::EdgeType::kAutogradFwdToGradient; }); @@ -1638,7 +1521,7 @@ eval_graph( if (operands.size() == 1) { graphlib::Node *optimizer = operands[0]; - ret = eval_t_streaming_tms(ret, graph, optimizer, balancer_solution); + ret = eval_t_streaming_tms(ret, graph, optimizer); } ret = eval_input_bw(producer, ret, is_buda); ret = eval_runtime_tensor_transform(graph, {producer}, {ret}, true).at(0); diff --git a/pybuda/csrc/graph_lib/python_bindings.hpp b/pybuda/csrc/graph_lib/python_bindings.hpp index bfb82e0d6..c89277691 100644 --- a/pybuda/csrc/graph_lib/python_bindings.hpp +++ b/pybuda/csrc/graph_lib/python_bindings.hpp @@ -1,7 +1,12 @@ // SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC // // SPDX-License-Identifier: Apache-2.0 + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments" #include +#pragma clang diagnostic pop + #include #include namespace py = pybind11; diff --git a/pybuda/csrc/graph_lib/utils.cpp b/pybuda/csrc/graph_lib/utils.cpp index ee6455941..df4628a5b 100644 --- a/pybuda/csrc/graph_lib/utils.cpp +++ b/pybuda/csrc/graph_lib/utils.cpp @@ -5,22 +5,24 @@ #include #include +#include #include #include -#include -#include "graph_lib/graph.hpp" +#include "autograd/binding.hpp" +#include "graph_lib/defines.hpp" #include "graph_lib/edge.hpp" +#include "graph_lib/graph.hpp" #include "graph_lib/node.hpp" #include "graph_lib/node_types.hpp" -#include "autograd/binding.hpp" -#include "utils/logger.hpp" #include "reportify/reportify.hpp" -#include "passes/fuse_ops.hpp" +#include "utils/logger.hpp" -namespace tt { +namespace tt +{ -namespace graphlib { +namespace graphlib +{ bool is_eltwise(const OpNode *op) { @@ -62,122 +64,93 @@ bool is_reduce_z(OpNode const *op) op->has_tag("reduce_z"); } -bool default_node_filter(Node*) { - return true; -} +bool default_node_filter(Node *) { return true; } -static bool requires_visit(const std::unordered_map& visited, NodeId node_id) { +static bool requires_visit(const std::unordered_map &visited, NodeId node_id) +{ return visited.find(node_id) == visited.end() or visited.at(node_id) == false; } -int get_row_size_from_tile_size(TileDim tile_dim) { +int get_row_size_from_tile_size(TileDim tile_dim) +{ int ret = 32; - switch (tile_dim) { - case TileDim::Dim32x32: - ret = 32; - break; - case TileDim::Dim16x32: - ret = 16; - break; - case TileDim::Dim32x16: - ret = 32; - break; - case TileDim::Dim8x32: - ret = 8; - break; - case TileDim::Dim4x32: - ret = 4; - break; - case TileDim::Dim2x32: - ret = 2; - break; - case TileDim::Dim1x32: - ret = 1; - break; - default: - TT_ASSERT(false, "Invalid tile dim"); + switch (tile_dim) + { + case TileDim::Dim32x32: ret = 32; break; + case TileDim::Dim16x32: ret = 16; break; + case TileDim::Dim32x16: ret = 32; break; + case TileDim::Dim8x32: ret = 8; break; + case TileDim::Dim4x32: ret = 4; break; + case TileDim::Dim2x32: ret = 2; break; + case TileDim::Dim1x32: ret = 1; break; + default: TT_ASSERT(false, "Invalid tile dim"); } return ret; } -int get_col_size_from_tile_size(TileDim tile_dim) { +int get_col_size_from_tile_size(TileDim tile_dim) +{ int ret = 32; - switch 
(tile_dim) { - case TileDim::Dim32x32: - ret = 32; - break; - case TileDim::Dim16x32: - ret = 32; - break; - case TileDim::Dim32x16: - ret = 16; - break; - case TileDim::Dim8x32: - ret = 32; - break; - case TileDim::Dim4x32: - ret = 32; - break; - case TileDim::Dim2x32: - ret = 32; - break; - case TileDim::Dim1x32: - ret = 32; - break; - default: - TT_ASSERT(false, "Invalid tile dim"); + switch (tile_dim) + { + case TileDim::Dim32x32: ret = 32; break; + case TileDim::Dim16x32: ret = 32; break; + case TileDim::Dim32x16: ret = 16; break; + case TileDim::Dim8x32: ret = 32; break; + case TileDim::Dim4x32: ret = 32; break; + case TileDim::Dim2x32: ret = 32; break; + case TileDim::Dim1x32: ret = 32; break; + default: TT_ASSERT(false, "Invalid tile dim"); } return ret; } -TileDim get_tile_dim_from_height_width(int tile_height, int tile_width) { +TileDim get_tile_dim_from_height_width(int tile_height, int tile_width) +{ TileDim ret = TileDim::Dim32x32; - switch(tile_height) { + switch (tile_height) + { case 32: - if (tile_width == 16) { + if (tile_width == 16) + { ret = TileDim::Dim32x16; - } else if (tile_width == 32) { + } + else if (tile_width == 32) + { ret = TileDim::Dim32x32; - } else { + } + else + { TT_ASSERT(false, "Invalid tile dim"); } break; - case 16: - ret = TileDim::Dim16x32; - break; - case 8: - ret = TileDim::Dim8x32; - break; - case 4: - ret = TileDim::Dim4x32; - break; - case 2: - ret = TileDim::Dim2x32; - break; - case 1: - ret = TileDim::Dim1x32; - break; - default: - TT_ASSERT(false, "Invalid tile dim"); + case 16: ret = TileDim::Dim16x32; break; + case 8: ret = TileDim::Dim8x32; break; + case 4: ret = TileDim::Dim4x32; break; + case 2: ret = TileDim::Dim2x32; break; + case 1: ret = TileDim::Dim1x32; break; + default: TT_ASSERT(false, "Invalid tile dim"); } return ret; } - -void validate_tile_dims(Graph *graph, graphlib::OpNode *op_node) { - +void validate_tile_dims(Graph *graph, graphlib::OpNode *op_node) +{ if (graphlib::is_eltwise_binary(op_node)) { auto srcA_tile_dim = graph->operands(op_node)[0]->shape().get_tile_dim(); auto srcB_tile_dim = graph->operands(op_node)[1]->shape().get_tile_dim(); - if (srcA_tile_dim == srcB_tile_dim){ + if (srcA_tile_dim == srcB_tile_dim) + { return; } // Canonicalize tile dim for binary op - auto srcA_tile_volume = graph->operands(op_node)[0]->shape().get_tile_height() * graph->operands(op_node)[0]->shape().get_tile_width(); - auto srcB_tile_volume = graph->operands(op_node)[1]->shape().get_tile_height() * graph->operands(op_node)[1]->shape().get_tile_width(); + auto srcA_tile_volume = graph->operands(op_node)[0]->shape().get_tile_height() * + graph->operands(op_node)[0]->shape().get_tile_width(); + auto srcB_tile_volume = graph->operands(op_node)[1]->shape().get_tile_height() * + graph->operands(op_node)[1]->shape().get_tile_width(); auto srcA_shape = graph->operands(op_node)[0]->shape(); auto srcB_shape = graph->operands(op_node)[1]->shape(); @@ -188,40 +161,54 @@ void validate_tile_dims(Graph *graph, graphlib::OpNode *op_node) { trans_shape.set_tile_dim(srcA_tile_dim); auto padded_srcB_shape = graphlib::Shape::to_buda(trans_shape); graph->operands(op_node)[1]->set_shape(padded_srcB_shape); - } else if (srcA_tile_volume < srcB_tile_volume) { + } + else if (srcA_tile_volume < srcB_tile_volume) + { graphlib::Shape trans_shape(true, Shape::Type::FREE, srcA_shape.as_vector()); trans_shape.set_tile_dim(srcB_tile_dim); auto padded_srcA_shape = graphlib::Shape::to_buda(trans_shape); graph->operands(op_node)[0]->set_shape(padded_srcA_shape); - } 
else { + } + else + { // Volume match iff 32x16 and 16x32 // Insert NOP to make sure both inputs are padded to 32x32 TT_ASSERT(false, "Volume match but tile dims don't match"); } - TT_ASSERT(graph->operands(op_node)[0]->shape().get_tile_dim() - == graph->operands(op_node)[1]->shape().get_tile_dim()); - } else if (op_node->is_matmul()) { + TT_ASSERT( + graph->operands(op_node)[0]->shape().get_tile_dim() == graph->operands(op_node)[1]->shape().get_tile_dim()); + } + else if (op_node->is_matmul()) + { // check RHS matmul, set to full tile auto rhs = graph->operands(op_node)[1]; - if (rhs->shape().get_tile_dim() != TileDim::Dim32x32) { + if (rhs->shape().get_tile_dim() != TileDim::Dim32x32) + { graphlib::Shape trans_shape(true, Shape::Type::FREE, rhs->shape().as_vector()); trans_shape.set_tile_dim(TileDim::Dim32x32); auto padded_rhs_shape = graphlib::Shape::to_buda(trans_shape); rhs->set_shape(padded_rhs_shape); } - } else if (op_node->op_type().op == "reduce") { + } + else if (op_node->op_type().op == "reduce") + { auto operand = graph->operands(op_node)[0]; - if (operand->shape().get_tile_dim() != TileDim::Dim32x32) { + if (operand->shape().get_tile_dim() != TileDim::Dim32x32) + { graphlib::Shape trans_shape(true, Shape::Type::FREE, operand->shape().as_vector()); trans_shape.set_tile_dim(TileDim::Dim32x32); auto padded_shape = graphlib::Shape::to_buda(trans_shape); operand->set_shape(padded_shape); } - } else if (op_node->op_type().op == "embedding") { - for (auto operand : graph->operands(op_node)) { - if (operand->shape().get_tile_dim() != TileDim::Dim32x32) { + } + else if (op_node->op_type().op == "embedding") + { + for (auto operand : graph->operands(op_node)) + { + if (operand->shape().get_tile_dim() != TileDim::Dim32x32) + { graphlib::Shape trans_shape(true, Shape::Type::FREE, operand->shape().as_vector()); trans_shape.set_tile_dim(TileDim::Dim32x32); auto padded_shape = graphlib::Shape::to_buda(trans_shape); @@ -233,13 +220,14 @@ void validate_tile_dims(Graph *graph, graphlib::OpNode *op_node) { return; } -std::vector> topological_generations(const Graph& graph) { - std::vector> generations; +std::vector> topological_generations(const Graph &graph) +{ + std::vector> generations; // the first step is to discover top level nodes in the graph // queue up all visible nodes - std::vector nodes = graph.nodes(); - std::queue node_queue; + std::vector nodes = graph.nodes(); + std::queue node_queue; for (Node *node : nodes) { if (graph.is_node_visible(node)) @@ -248,105 +236,122 @@ std::vector> topological_generations(const Graph& graph) { } } // vector to store top level nodes - std::vector top_level_nodes; + std::vector top_level_nodes; std::unordered_map visited{}; - std::function VisitNode = [&](Node* node) { + std::function VisitNode = [&](Node *node) + { visited[node->id()] = true; // count the number of operands of the node int num_operands = 0; - for (const Edge& operand_edge : graph.operand_edges(node)) { - if (operand_edge.edge_type == EdgeType::kDataLoopback or operand_edge.edge_type == EdgeType::kPartialDataCopy) + for (const Edge &operand_edge : graph.operand_edges(node)) + { + if (operand_edge.edge_type == EdgeType::kDataLoopback or + operand_edge.edge_type == EdgeType::kPartialDataCopy) { continue; } else if (operand_edge.edge_type == EdgeType::kControlLoop) { - continue; // not unrolling loops, just terminate + continue; // not unrolling loops, just terminate } num_operands++; NodeId predecessor_id = operand_edge.producer_node_id; - Node* predecessor_node = 
graph.node_by_id(predecessor_id); - if (requires_visit(visited, predecessor_id)) { + Node *predecessor_node = graph.node_by_id(predecessor_id); + if (requires_visit(visited, predecessor_id)) + { VisitNode(predecessor_node); } } - if (num_operands == 0) { + if (num_operands == 0) + { top_level_nodes.push_back(node); } }; // recurse through node operands until top, then stop, and add to result - while (not node_queue.empty()) { - Node* node = node_queue.front(); + while (not node_queue.empty()) + { + Node *node = node_queue.front(); - if (requires_visit(visited, node->id())) { + if (requires_visit(visited, node->id())) + { VisitNode(node); } node_queue.pop(); } // now do a BFS through nodes - std::queue bfs_queue; + std::queue bfs_queue; // also store a mapping of each node to its level (or generation) std::unordered_map node_to_level; // add top level nodes to the queue - for (Node* node : top_level_nodes) { + for (Node *node : top_level_nodes) + { bfs_queue.push(node); node_to_level[node->id()] = 0; } // iterate through the queue // store processed nodes in a set - unordered_set processed_nodes; - while (not bfs_queue.empty()) { - Node* node = bfs_queue.front(); + std::unordered_set processed_nodes; + while (not bfs_queue.empty()) + { + Node *node = bfs_queue.front(); bfs_queue.pop(); // queue eligible children of this node - for (const Edge& user_edge : graph.user_edges(node)) { + for (const Edge &user_edge : graph.user_edges(node)) + { if (user_edge.edge_type == EdgeType::kControlLoop) { - continue; // not unrolling loops, just terminate + continue; // not unrolling loops, just terminate } if (user_edge.edge_type == EdgeType::kDataLoopback or user_edge.edge_type == EdgeType::kPartialDataCopy) { continue; } NodeId user_id = user_edge.consumer_node_id; - Node* user_node = graph.node_by_id(user_id); - + Node *user_node = graph.node_by_id(user_id); + // if this node has already been processed, then skip it - if (processed_nodes.find(user_id) != processed_nodes.end()) { + if (processed_nodes.find(user_id) != processed_nodes.end()) + { continue; } // if all the operands of this node already have levels, then this node will be inserted into the queue bool all_operands_have_levels = true; unsigned level = 0; - for (const Edge& operand_edge : graph.operand_edges(user_node)) { - if (operand_edge.edge_type == EdgeType::kDataLoopback or operand_edge.edge_type == EdgeType::kPartialDataCopy) + for (const Edge &operand_edge : graph.operand_edges(user_node)) + { + if (operand_edge.edge_type == EdgeType::kDataLoopback or + operand_edge.edge_type == EdgeType::kPartialDataCopy) { continue; } else if (operand_edge.edge_type == EdgeType::kControlLoop) { - continue; // not unrolling loops, just terminate + continue; // not unrolling loops, just terminate } NodeId operand_id = operand_edge.producer_node_id; - if (node_to_level.find(operand_id) == node_to_level.end()) { + if (node_to_level.find(operand_id) == node_to_level.end()) + { all_operands_have_levels = false; break; - } else { + } + else + { level = std::max(level, node_to_level[operand_id]); } } // insert node into queue if all operands have levels - if (all_operands_have_levels) { + if (all_operands_have_levels) + { bfs_queue.push(user_node); node_to_level[user_id] = level + 1; // mark node as processed @@ -356,8 +361,10 @@ std::vector> topological_generations(const Graph& graph) { } // now that we have the levels, we can create the generations - for (auto const& [node_id, level] : node_to_level) { - if (generations.size() <= level) { + for (auto 
const &[node_id, level] : node_to_level) + { + if (generations.size() <= level) + { generations.resize(level + 1); } generations[level].push_back(graph.node_by_id(node_id)); @@ -366,8 +373,7 @@ std::vector> topological_generations(const Graph& graph) { return generations; } -std::vector top_row( - graphlib::Graph const *graph, std::vector const &nodes) +std::vector top_row(graphlib::Graph const *graph, std::vector const &nodes) { std::vector sorted_nodes; @@ -392,8 +398,7 @@ std::vector top_row( return sorted_nodes; } -std::vector bot_row( - graphlib::Graph const *graph, std::vector const &nodes) +std::vector bot_row(graphlib::Graph const *graph, std::vector const &nodes) { std::vector sorted_nodes; @@ -425,13 +430,14 @@ std::vector bot_row( return sorted_nodes; } -std::vector topological_sort(const Graph& graph, std::function node_filter, bool unroll_loops) { - std::vector result; +std::vector topological_sort(const Graph &graph, std::function node_filter, bool unroll_loops) +{ + std::vector result; std::unordered_map visited{}; std::unordered_map control_loop_edge_to_iteration; - std::vector nodes = graph.nodes(); - std::queue node_queue; + std::vector nodes = graph.nodes(); + std::queue node_queue; for (Node *node : nodes) { if (graph.is_node_visible(node)) @@ -440,48 +446,54 @@ std::vector topological_sort(const Graph& graph, std::function VisitNode = [&](Node* node) { + std::function VisitNode = [&](Node *node) + { visited[node->id()] = true; - - for (const Edge& operand_edge : graph.operand_edges(node)) { - if (operand_edge.edge_type == EdgeType::kDataLoopback or operand_edge.edge_type == EdgeType::kPartialDataCopy) + + for (const Edge &operand_edge : graph.operand_edges(node)) + { + if (operand_edge.edge_type == EdgeType::kDataLoopback or + operand_edge.edge_type == EdgeType::kPartialDataCopy) { continue; } else if (operand_edge.edge_type == EdgeType::kControlLoop) { - continue; // not unrolling loops, just terminate + continue; // not unrolling loops, just terminate } - NodeId predecessor_id = operand_edge.producer_node_id; - Node* predecessor_node = graph.node_by_id(predecessor_id); - if (requires_visit(visited, predecessor_id)) { + Node *predecessor_node = graph.node_by_id(predecessor_id); + if (requires_visit(visited, predecessor_id)) + { VisitNode(predecessor_node); } } - if (node_filter(node)) { + if (node_filter(node)) + { result.push_back(node); } if (unroll_loops) { - for (const Edge& user_edge : graph.user_edges(node)) { + for (const Edge &user_edge : graph.user_edges(node)) + { if (user_edge.edge_type == EdgeType::kControlLoop) { auto loop_attributes = EdgeAttributes::as(graph.get_edge_attributes(user_edge)); if (control_loop_edge_to_iteration.find(user_edge) == control_loop_edge_to_iteration.end()) { - control_loop_edge_to_iteration[user_edge] = 1; // initialize loop count + control_loop_edge_to_iteration[user_edge] = 1; // initialize loop count } if (control_loop_edge_to_iteration[user_edge] < loop_attributes->loop_iterations()) { // Re-enqueue nodes in the same order they were originally intended to be processed - for (Node* node : nodes) { - if (loop_attributes->is_processed_in_loop(node->id())) { + for (Node *node : nodes) + { + if (loop_attributes->is_processed_in_loop(node->id())) + { visited[node->id()] = false; node_queue.push(node); - } } } @@ -489,14 +501,14 @@ std::vector topological_sort(const Graph& graph, std::functionid())) { + if (requires_visit(visited, node->id())) + { VisitNode(node); } node_queue.pop(); @@ -519,28 +531,35 @@ std::vector 
visible_nodes(Graph const &graph, std::function reachable_nodes(const Graph *graph, Node *start, std::function node_filter, bool ancenstors_only) +std::vector reachable_nodes( + const Graph *graph, Node *start, std::function node_filter, bool ancenstors_only) { std::vector result; std::unordered_map visited{}; - - std::function VisitNode = [&](Node* node) { + + std::function VisitNode = [&](Node *node) + { visited[node->id()] = true; - - for (auto operand : graph->data_operands(node)) { - if (requires_visit(visited, operand->id())) { + + for (auto operand : graph->data_operands(node)) + { + if (requires_visit(visited, operand->id())) + { VisitNode(operand); } } if (node->node_type() != NodeType::kInput and not ancenstors_only) { - for (auto user : graph->data_users(node)) { - if (requires_visit(visited, user->id())) { + for (auto user : graph->data_users(node)) + { + if (requires_visit(visited, user->id())) + { VisitNode(user); } } } - if (node_filter(node) ) { + if (node_filter(node)) + { result.push_back(node); } }; @@ -561,12 +580,13 @@ bool check_producer_consumer(Graph *graph, Node *producer, Node *consumer, std:: // Find the longest path from the graph. Optionally look for paths that don't start from ordered inputs. // TODO: write a few unit tests -std::vector get_longest_path(const Graph *graph, bool from_inputs_only) { - +std::vector get_longest_path(const Graph *graph, bool from_inputs_only) +{ std::unordered_map cost; std::unordered_map parent_map; - if (from_inputs_only) { + if (from_inputs_only) + { // set big negative numbers on all other inputs for (Node *node : graph->nodes()) cost.emplace(std::make_pair(node, std::numeric_limits::lowest())); for (Node *node : graph->ordered_module_inputs()) cost[node] = 0; @@ -574,15 +594,17 @@ std::vector get_longest_path(const Graph *graph, bool from_inputs_only) int max_distance = std::numeric_limits::lowest(); Node *max_path_output = NULL; - for (Node *node: topological_sort(*graph)) + for (Node *node : topological_sort(*graph)) { for (Node *user : graph->data_users(node)) { - if (cost[user] < cost[node] + 1) { + if (cost[user] < cost[node] + 1) + { cost[user] = cost[node] + 1; parent_map[user] = node; } - if (cost[node] > max_distance) { + if (cost[node] > max_distance) + { max_distance = cost[node]; max_path_output = node; } @@ -601,17 +623,23 @@ std::vector get_longest_path(const Graph *graph, bool from_inputs_only) return max_path; } -std::vector get_nodes_with_indegree_zero(Graph* graph) { +std::vector get_nodes_with_indegree_zero(Graph *graph) +{ std::vector indegree_zero_nodes; - for (Node *node : graph->nodes()) { + for (Node *node : graph->nodes()) + { int num_operands = 0; - for (auto operand : graph->operands(node)) { - if (operand->node_type() != NodeType::kInput) { + for (auto operand : graph->operands(node)) + { + if (operand->node_type() != NodeType::kInput) + { num_operands++; } } - if (num_operands == 0) { - if (node->node_type() != NodeType::kInput) { + if (num_operands == 0) + { + if (node->node_type() != NodeType::kInput) + { indegree_zero_nodes.push_back(node); } } @@ -619,11 +647,15 @@ std::vector get_nodes_with_indegree_zero(Graph* graph) { return indegree_zero_nodes; } -std::vector get_nodes_with_outdegree_zero(Graph* graph) { - std::vector outdegree_zero_nodes; - for (Node* node : graph->nodes()) { - if (graph->users(node).size() == 0) { - if (node->node_type() != NodeType::kInput) { +std::vector get_nodes_with_outdegree_zero(Graph *graph) +{ + std::vector outdegree_zero_nodes; + for (Node *node : 
graph->nodes()) + { + if (graph->users(node).size() == 0) + { + if (node->node_type() != NodeType::kInput) + { outdegree_zero_nodes.push_back(node); } } @@ -631,11 +663,15 @@ std::vector get_nodes_with_outdegree_zero(Graph* graph) { return outdegree_zero_nodes; } -std::vector get_nodes_with_data_outdegree_zero(Graph* graph) { - std::vector outdegree_zero_nodes; - for (Node* node : graph->nodes()) { - if (graph->user_data_edges(node).size() == 0) { - if (node->node_type() != NodeType::kInput) { +std::vector get_nodes_with_data_outdegree_zero(Graph *graph) +{ + std::vector outdegree_zero_nodes; + for (Node *node : graph->nodes()) + { + if (graph->user_data_edges(node).size() == 0) + { + if (node->node_type() != NodeType::kInput) + { outdegree_zero_nodes.push_back(node); } } @@ -643,17 +679,15 @@ std::vector get_nodes_with_data_outdegree_zero(Graph* graph) { return outdegree_zero_nodes; } - // Insert new node on the given edge. Node attributes will be picked up from consumer node. std::pair insert_node_on_edge( - Graph *graph, - Edge &edge, - Node *node, - bool inherit_consumer_attrs, + Graph *graph, + Edge &edge, + Node *node, + bool inherit_consumer_attrs, bool remove_edge, std::uint32_t consumer_index, - bool place_tms_on_outgoing -) + bool place_tms_on_outgoing) { Node *consumer = graph->node_by_id(edge.consumer_node_id); Node *producer = graph->node_by_id(edge.producer_node_id); @@ -661,49 +695,45 @@ std::pair insert_node_on_edge( graph->copy_node_attributes(inherit_consumer_attrs ? consumer : producer, node); // Don't copy "gradient op" flag, since the last node is still the one accumulating - if ( (node->node_type() == NodeType::kBudaOp) || - (node->node_type() == NodeType::kPyOp) ) + if ((node->node_type() == NodeType::kBudaOp) || (node->node_type() == NodeType::kPyOp)) node->as()->set_gradient_op(false); // Create new edges - Edge new_edge0 = Edge( - edge.producer_node_id, - edge.producer_output_port_id, - node->id(), - consumer_index, - edge.edge_type); + Edge new_edge0 = + Edge(edge.producer_node_id, edge.producer_output_port_id, node->id(), consumer_index, edge.edge_type); - Edge new_edge1 = Edge( - node->id(), - 0, - edge.consumer_node_id, - edge.consumer_input_port_id, - edge.edge_type); + Edge new_edge1 = Edge(node->id(), 0, edge.consumer_node_id, edge.consumer_input_port_id, edge.edge_type); graph->add_edge(new_edge0); graph->add_edge(new_edge1); + + graph->copy_edge_attributes(edge, new_edge0); + graph->copy_edge_attributes(edge, new_edge1); + + // TMs should be placed only on one of the edges. + // Since we've copied all edge attributes (including TMs) to both edges, + // we need to remove TMs from one of them. 
if (not place_tms_on_outgoing) - graph->copy_edge_attributes(edge, new_edge0); + { + graph->get_edge_attributes(new_edge1)->set_tms({}); + } else - graph->copy_edge_attributes(edge, new_edge1); - graph->get_edge_attributes(new_edge1)->set_ublock_order(graph->get_edge_attributes(edge)->get_ublock_order()); + { + graph->get_edge_attributes(new_edge0)->set_tms({}); + } bool edges_added = false; - for (Edge &e : graph->operand_edges(consumer)) { + for (Edge &e : graph->operand_edges(consumer)) + { // Adjust control & autograd edges - if ( (e.edge_type != EdgeType::kData) && (e.edge_type != EdgeType::kAutogradOutputToLoss) && - (e.edge_type != EdgeType::kAutogradInputToGradientOut) && - (e.edge_type != EdgeType::kAutogradFwdToGradient) && - (e.edge_type != EdgeType::kAutogradFwdToRecompute) + if ((e.edge_type != EdgeType::kData) && (e.edge_type != EdgeType::kAutogradOutputToLoss) && + (e.edge_type != EdgeType::kAutogradInputToGradientOut) && + (e.edge_type != EdgeType::kAutogradFwdToGradient) && (e.edge_type != EdgeType::kAutogradFwdToRecompute) - ) { + ) + { edges_added = true; - graph->add_edge( - graph->node_by_id(e.producer_node_id), - node, - e.producer_output_port_id, - 0, - e.edge_type); + graph->add_edge(graph->node_by_id(e.producer_node_id), node, e.producer_output_port_id, 0, e.edge_type); } } @@ -711,21 +741,17 @@ std::pair insert_node_on_edge( // the need to go to the new op, too if (not edges_added and producer->get_epoch_type() != graphlib::NodeEpochType::Forward) { - for (Edge &e : graph->operand_edges(producer)) { + for (Edge &e : graph->operand_edges(producer)) + { // Adjust control & autograd edges - if ( (e.edge_type == EdgeType::kAutogradFwdToBwd) || - (e.edge_type == EdgeType::kAutogradFwdToOptimizer) || - (e.edge_type == EdgeType::kAutogradFwdToGradient) ) + if ((e.edge_type == EdgeType::kAutogradFwdToBwd) || (e.edge_type == EdgeType::kAutogradFwdToOptimizer) || + (e.edge_type == EdgeType::kAutogradFwdToGradient)) { - graph->add_edge( - graph->node_by_id(e.producer_node_id), - node, - e.producer_output_port_id, - 0, - e.edge_type); + graph->add_edge(graph->node_by_id(e.producer_node_id), node, e.producer_output_port_id, 0, e.edge_type); } // Move the kAutogradFwdToGradient edge, since we can only have one - if (e.edge_type == EdgeType::kAutogradFwdToGradient) { + if (e.edge_type == EdgeType::kAutogradFwdToGradient) + { graph->remove_edge(e); } } @@ -733,16 +759,12 @@ std::pair insert_node_on_edge( // If the consumer of the edge we're trying to add a node on is a "recompute-node", // we need to also create replicated fwd->recompute edges on the newly added node. // this is to keep track of which nodes are considered to be "recompute". 
- for (Edge &e : graph->operand_edges(consumer)) { + for (Edge &e : graph->operand_edges(consumer)) + { if (e.edge_type == EdgeType::kAutogradFwdToRecompute) { - Node* fwd_node_being_recompute = graph->node_by_id(e.producer_node_id); - graph->add_edge( - fwd_node_being_recompute, - node, - e.producer_output_port_id, - 0, - e.edge_type); + Node *fwd_node_being_recompute = graph->node_by_id(e.producer_node_id); + graph->add_edge(fwd_node_being_recompute, node, e.producer_output_port_id, 0, e.edge_type); } } @@ -754,53 +776,95 @@ std::pair insert_node_on_edge( return std::make_pair(new_edge0, new_edge1); } +std::tuple insert_nop_on_edge(Graph *graph, Edge &edge, const std::string &nop_name, bool is_buffering, bool hoist_tms, bool remove_edge) +{ + const Node *src = graph->node_by_id(edge.producer_node_id); + const Node *dest = graph->node_by_id(edge.consumer_node_id); + + BudaOpNode *nop = graph->add_node( + graphlib::create_node(nop_name, "nop"), + graph->get_subgraph_id_for_node(src->id())); + nop->set_shape(src->shape()); + nop->set_buffering_op(is_buffering); + + nop->set_epoch_type(dest->get_epoch_type()); + nop->set_output_df(src->output_df()); + + if (src->node_type() == NodeType::kBudaOp) + { + const BudaOpNode *src_op = src->as(); + if (src_op->op_name() != "dequantization") + { + nop->set_accumulate_df(src_op->accumulate_df()); + nop->set_intermediate_df(src_op->intermediate_df()); + nop->set_math_fidelity(src_op->math_fidelity()); + } + } + + auto [edge0, edge1] = insert_node_on_edge(graph, edge, nop, false, remove_edge, 0 /* consumer_index */, not hoist_tms); + + return std::make_tuple(nop, edge0, edge1); +} + // Copy non-data edges from old dest to new void copy_control_edges(Graph *graph, Node *old_dest, Node *new_dest) { - std::vector data_operands = graph->data_operands(old_dest); + std::vector data_operands = graph->data_operands(old_dest); Node *data_operand = data_operands.at(0); - for (Edge &e : graph->operand_edges(old_dest)) { - if (e.edge_type == EdgeType::kData) { + for (Edge &e : graph->operand_edges(old_dest)) + { + if (e.edge_type == EdgeType::kData) + { continue; } Node *new_consumer = data_operand; - if (new_consumer->node_type() != NodeType::kBudaOp) { + if (new_consumer->node_type() != NodeType::kBudaOp) + { // If `new_dest` is an OutputNode, we'll fetch it off of its data-operand since we still want to // copy this control edge over (consider kInputToGradient being connected to kOutput node) new_consumer = data_operand; } - if (new_consumer->node_type() != NodeType::kBudaOp) { + if (new_consumer->node_type() != NodeType::kBudaOp) + { continue; } - if ((e.edge_type == EdgeType::kAutogradFwdToBwd and new_consumer->get_epoch_type() != NodeEpochType::Backward) - or (e.edge_type == EdgeType::kAutogradFwdToOptimizer and new_consumer->get_epoch_type() != NodeEpochType::Optimizer)) + if ((e.edge_type == EdgeType::kAutogradFwdToBwd and + new_consumer->get_epoch_type() != NodeEpochType::Backward) or + (e.edge_type == EdgeType::kAutogradFwdToOptimizer and + new_consumer->get_epoch_type() != NodeEpochType::Optimizer)) { - // There are cases where we're trying to connect kAutogradFwdToBwd on a Fwd consumer node which doesn't make sense. + // There are cases where we're trying to connect kAutogradFwdToBwd on a Fwd consumer node which doesn't make + // sense. 
continue; } // Copy control & autograd edges graph->add_edge( - graph->node_by_id(e.producer_node_id), - new_consumer, - e.producer_output_port_id, - e.consumer_input_port_id, - e.edge_type); + graph->node_by_id(e.producer_node_id), + new_consumer, + e.producer_output_port_id, + e.consumer_input_port_id, + e.edge_type); } - for (Edge &e : graph->user_edges(old_dest)) { - if (e.edge_type == EdgeType::kData) { + for (Edge &e : graph->user_edges(old_dest)) + { + if (e.edge_type == EdgeType::kData) + { continue; } // Copy control & autograd edges - if (e.edge_type == EdgeType::kControl) { + if (e.edge_type == EdgeType::kControl) + { graph->add_edge(new_dest, graph->node_by_id(e.consumer_node_id), 0, 0, e.edge_type); - } else { + } + else + { // if it's an autograd-edge between -> consumer, we'll reassign // the edge to the producer node since `new_dest` may be an output node graph->add_edge(data_operand, graph->node_by_id(e.consumer_node_id), 0, 0, e.edge_type); @@ -816,17 +880,15 @@ void handle_control_edges_when_removing_node(Graph *graph, Node *node_being_remo operand_data_edges.size() == 1, "Tried to handle control edges, but node being removed has more than 1 operand!"); - Edge& producer_to_nbr_edge = operand_data_edges.front(); - Node* producer = graph->node_by_id(producer_to_nbr_edge.producer_node_id); + Edge &producer_to_nbr_edge = operand_data_edges.front(); + Node *producer = graph->node_by_id(producer_to_nbr_edge.producer_node_id); - auto is_not_data_edge = [](Edge e) { - return (e.edge_type != EdgeType::kData); - }; + auto is_not_data_edge = [](Edge e) { return (e.edge_type != EdgeType::kData); }; std::vector operand_edges = graph->operand_edges(node_being_removed, is_not_data_edge); std::vector user_edges = graph->user_edges(node_being_removed, is_not_data_edge); // Handle operand edges - for (Edge& o_e : operand_edges) + for (Edge &o_e : operand_edges) { if (node_being_removed->is_forward()) { @@ -835,11 +897,11 @@ void handle_control_edges_when_removing_node(Graph *graph, Node *node_being_remo for (Edge &user : graph->user_data_edges(node_being_removed)) { Edge new_edge( - o_e.producer_node_id, - o_e.producer_output_port_id, - user.consumer_node_id, - user.consumer_input_port_id, - o_e.edge_type); + o_e.producer_node_id, + o_e.producer_output_port_id, + user.consumer_node_id, + user.consumer_input_port_id, + o_e.edge_type); graph->add_edge(new_edge); } } @@ -883,7 +945,7 @@ void handle_control_edges_when_removing_node(Graph *graph, Node *node_being_remo } // Handle user edges - for (Edge& u_e : user_edges) + for (Edge &u_e : user_edges) { if (node_being_removed->is_forward()) { @@ -925,6 +987,31 @@ void handle_control_edges_when_removing_node(Graph *graph, Node *node_being_remo } } +// Creates buffering queue and adds it to the graph. Returns pointer to created queue node. +// Queue inherits shape output_df, and epoch_type from producer node. +graphlib::QueueNode *create_buffering_queue( + Graph *graph, const graphlib::Node *producer_node, const std::string name, int num_entries) +{ + TT_ASSERT(num_entries > 0, "Number of entries in queue has to be greater than 0"); + if (num_entries > graph->get_microbatch()) + { + log_warning( + "Wasting DRAM. Number of entries in queue is greater than microbatch size. 
For buffering queue the " + "theoretical maximum number of entries is equal to microbatch size."); + } + + // Create new queue + std::unique_ptr queue_node_unique = + graphlib::create_node(name, num_entries); + queue_node_unique->set_shape(producer_node->shape()); + queue_node_unique->set_output_df(producer_node->output_df()); + queue_node_unique->set_epoch_type(producer_node->get_epoch_type()); + + graphlib::QueueNode *queue = + graph->add_node(std::move(queue_node_unique), graph->get_subgraph_id_for_node(producer_node->id())); + return queue; +} + // Bypass queue, connecting its source to its destination. There has to be only one source for queue, and user is // defined by user_edge. std::unique_ptr connect_queue_src_to_queue_user(Graph *graph, Node *queue, Edge &user_edge, bool remove_queue) @@ -977,11 +1064,11 @@ std::unique_ptr bypass_node(Graph *graph, Node *node, bool remove_node, st std::vector user_tms = graph->get_edge_attributes(user)->get_tms(); Edge new_edge( - src_edge.producer_node_id, - src_edge.producer_output_port_id, - user.consumer_node_id, - user.consumer_input_port_id, - user.edge_type); + src_edge.producer_node_id, + src_edge.producer_output_port_id, + user.consumer_node_id, + user.consumer_input_port_id, + user.edge_type); graph->add_edge(new_edge); std::vector new_edge_tms; @@ -998,7 +1085,8 @@ std::unique_ptr bypass_node(Graph *graph, Node *node, bool remove_node, st handle_control_edges_when_removing_node(graph, node); OpNode *op_node = dynamic_cast(node); - if (op_node and op_node->is_gradient_op()) { + if (op_node and op_node->is_gradient_op()) + { OpNode *producer_op_node = dynamic_cast(graph->node_by_id(src_edge.producer_node_id)); if (producer_op_node) producer_op_node->set_gradient_op(); @@ -1011,27 +1099,31 @@ std::unique_ptr bypass_node(Graph *graph, Node *node, bool remove_node, st // The new node must have the same number of operands, or skip_operands must be set. 
void replace_node(Graph *graph, Node *original_node, Node *new_node, bool skip_operands) { - if (!skip_operands) { - for (Edge &operand : graph->operand_data_edges(original_node)) { + if (!skip_operands) + { + for (Edge &operand : graph->operand_data_edges(original_node)) + { Edge new_edge = Edge( - operand.producer_node_id, - operand.producer_output_port_id, - new_node->id(), - operand.consumer_input_port_id, - operand.edge_type); + operand.producer_node_id, + operand.producer_output_port_id, + new_node->id(), + operand.consumer_input_port_id, + operand.edge_type); graph->add_edge(new_edge); graph->copy_edge_attributes(operand, new_edge); } } - for (Edge &user : graph->user_edges(original_node)) { - if (user.edge_type == graphlib::EdgeType::kData) { + for (Edge &user : graph->user_edges(original_node)) + { + if (user.edge_type == graphlib::EdgeType::kData) + { Edge new_edge = Edge( - new_node->id(), - (graphlib::PortId)0, - user.consumer_node_id, - user.consumer_input_port_id, - user.edge_type); + new_node->id(), + (graphlib::PortId)0, + user.consumer_node_id, + user.consumer_input_port_id, + user.edge_type); graph->add_edge(new_edge); graph->copy_edge_attributes(user, new_edge); } @@ -1170,7 +1262,8 @@ graphlib::Node *cascade_nary_to_binary_op(graphlib::Graph *graph, graphlib::Node auto attrs_a = graph->get_edge_attributes(operand_a); auto attrs_b = graph->get_edge_attributes(operand_b); graphlib::Node *add = graph->add_node( - nary_op->clone(nary_op->name() + "_cascade_" + std::to_string(i)), graph->get_subgraph_id_for_node(nary_op->id())); + nary_op->clone(nary_op->name() + "_cascade_" + std::to_string(i)), + graph->get_subgraph_id_for_node(nary_op->id())); operand_a.consumer_input_port_id = 0; operand_a.consumer_node_id = add->id(); operand_b.consumer_input_port_id = 1; @@ -1201,7 +1294,7 @@ graphlib::Node *cascade_nary_to_binary_op(graphlib::Graph *graph, graphlib::Node bool swap_broadcast_dims(graphlib::Graph *graph, graphlib::Edge edge, int old_dim, int new_dim) { - bool swapped = false; + bool swapped = false; auto tms = graph->get_edge_attributes(edge)->get_tms(); std::vector new_tms; for (graphlib::OpType &op_type : tms) @@ -1272,7 +1365,8 @@ void handle_change_rank(graphlib::Graph *graph, graphlib::Edge edge) TT_ASSERT(inherit); // If there are 2 edges from the same producer to the same consumer (eg. eltwise binary op), // need edge_creation_id to differentiate naming. 
- std::string name = producer->name() + "_" + consumer->name() + "_" + op + std::to_string(rank) + "_" + std::to_string(edge.edge_creation_id); + std::string name = producer->name() + "_" + consumer->name() + "_" + op + std::to_string(rank) + "_" + + std::to_string(edge.edge_creation_id); graphlib::OpNode *change_rank = dynamic_cast( graph->add_node(inherit->clone(name), graph->get_subgraph_id_for_node(producer->id()))); TT_ASSERT(change_rank); @@ -1327,12 +1421,10 @@ graphlib::Edge clone_input_forking_edge(graphlib::Graph *graph, graphlib::Edge u Node *input = graph->node_by_id(user_edge.producer_node_id); TT_ASSERT(input->node_type() == NodeType::kInput); TT_ASSERT(graph->data_operands(input).empty(), "Cannot clone a loopback input"); - TT_ASSERT(graph->data_users(input).size() > 1 or allow_single_user, - "Cannot clone input that doesn't fork"); - Node *clone = - graph->add_node( - input->clone(input->name() + "_fork_clone" + std::to_string(user_edge.consumer_node_id)), - graph->get_subgraph_id_for_node(input->id())); + TT_ASSERT(graph->data_users(input).size() > 1 or allow_single_user, "Cannot clone input that doesn't fork"); + Node *clone = graph->add_node( + input->clone(input->name() + "_fork_clone" + std::to_string(user_edge.consumer_node_id)), + graph->get_subgraph_id_for_node(input->id())); auto edge_attr = graph->get_edge_attributes(user_edge); graph->remove_edge(user_edge); @@ -1349,7 +1441,8 @@ graphlib::Edge clone_input_forking_edge(graphlib::Graph *graph, graphlib::Edge u graphlib::Shape default_tm_evaluator(graphlib::OpType const &tm, graphlib::Shape shape, graphlib::IRLevel ir_level) { std::vector shapes = {shape}; - std::tuple> shape_data = get_op_shape(tm, shapes, ir_level == IRLevel::IR_BUDA, shape.get_tile_dim()); + std::tuple> shape_data = + get_op_shape(tm, shapes, ir_level == IRLevel::IR_BUDA, shape.get_tile_dim()); shape = std::get<0>(shape_data); TT_ASSERT(std::get<1>(shape_data).size() == 0, "TMs should not cause broadcasts"); return shape; @@ -1487,17 +1580,20 @@ void calculate_and_set_node_shape(Graph *graph, Node *node) // Validate / Canonicalize TileDim auto op_node = dynamic_cast(node); - if (op_node) { + if (op_node) + { validate_tile_dims(graph, op_node); } - for (graphlib::Edge &e : graph->operand_data_edges(node)) { + for (graphlib::Edge &e : graph->operand_data_edges(node)) + { auto operand_shape = graph->node_by_id(e.producer_node_id)->shape(); std::vector tms = graph->get_edge_attributes(e)->get_tms(); - for (OpType tm: tms) + for (OpType tm : tms) { std::vector shapes = {operand_shape}; - std::tuple> shape_data = get_op_shape(tm, shapes, graph->get_ir_level() == IRLevel::IR_BUDA, operand_shape.get_tile_dim()); + std::tuple> shape_data = + get_op_shape(tm, shapes, graph->get_ir_level() == IRLevel::IR_BUDA, operand_shape.get_tile_dim()); operand_shape = std::get<0>(shape_data); TT_ASSERT(std::get<1>(shape_data).size() == 0, "TMs should not cause broadcasts"); log_trace(LogGraphCompiler, " TM {} {}", tm.as_string(), operand_shape); @@ -1511,7 +1607,8 @@ void calculate_and_set_node_shape(Graph *graph, Node *node) operand_shapes.push_back(operand_shape); } - if ( (node->node_type() == graphlib::NodeType::kOutput) || (node->node_type() == graphlib::NodeType::kQueue) ) { + if ((node->node_type() == graphlib::NodeType::kOutput) || (node->node_type() == graphlib::NodeType::kQueue)) + { // Graph shape from first, and only, operand TT_ASSERT(operand_shapes.size() == 1, "Node should have exactly one operand"); node->set_shape(operand_shapes[0]); @@ -1526,23 
+1623,22 @@ void calculate_and_set_node_shape(Graph *graph, Node *node) ? dynamic_cast(node)->op_type() : dynamic_cast(node)->op_type(); - bool is_fused_op = (node->node_type() == graphlib::kBudaOp) && node->as()->is_fused_op(); - std::tuple> shape_data = - is_fused_op ? get_fused_op_shape(node->as(), operand_shapes) : get_op_shape(op_type, operand_shapes, graph->get_ir_level() == IRLevel::IR_BUDA, node->shape().get_tile_dim()); log_trace(LogGraphCompiler, " {}", std::get<0>(shape_data)); node->set_shape(std::get<0>(shape_data)); // Set broadcast attributes on edges - for (graphlib::Edge &e : graph->operand_data_edges(node)) { - - for (DimBroadcast &b : std::get<1>(shape_data)) { + for (graphlib::Edge &e : graph->operand_data_edges(node)) + { + for (DimBroadcast &b : std::get<1>(shape_data)) + { log_trace(LogGraphCompiler, " brcst {} {} {}", std::get<0>(b), std::get<1>(b), std::get<2>(b)); int operand = std::get<0>(b); - if (operand == (int)e.consumer_input_port_id) { + if (operand == (int)e.consumer_input_port_id) + { int dim = std::get<1>(b); int size = std::get<2>(b); bool const is_buda = graph->get_ir_level() == IRLevel::IR_BUDA; @@ -1595,39 +1691,118 @@ std::vector get_input_ublock_order(Graph const *graph, Node const * return ublock_order; } +tt::graphlib::Node *get_input_queue_producer(Graph const *graph, tt::graphlib::InputNode const *node) +{ + auto is_partial_datacopy_edge = [](Edge e) { return (e.edge_type == graphlib::EdgeType::kPartialDataCopy); }; + std::vector partial_datacopy_edges = graph->operand_edges(node, is_partial_datacopy_edge); + auto producers = graph->data_operands(node); + + if (not producers.empty() and not partial_datacopy_edges.empty()) + { + throw std::runtime_error("Input queue " + node->name() + " has both producer and partial datacopy edge!"); + } + else if (not producers.empty()) + { + TT_ASSERT(producers.size() == 1); + return producers[0]; + } + else if (not partial_datacopy_edges.empty()) + { + std::vector producer_edges; + for (auto edge : partial_datacopy_edges) + { + auto output_node = graph->node_by_id(edge.producer_node_id); + TT_ASSERT(graph->operand_edges(output_node).size() == 1, "Output node should only have 1 producer"); + producer_edges.push_back(graph->operand_edges(output_node).front()); + } + + // Assert all partial datacopy producer edges have the same ublock order + TT_ASSERT(std::all_of( + producer_edges.begin(), + producer_edges.end(), + [graph, producer_edges](Edge e) + { + return graph->get_edge_attributes(e)->get_ublock_order() == + graph->get_edge_attributes(producer_edges[0])->get_ublock_order(); + })); + + graphlib::OutputNode *output = + graph->node_by_id(partial_datacopy_edges[0].producer_node_id)->as(); + auto output_producer = graph->data_operands(output); + TT_ASSERT(output_producer.size() == 1); + TT_ASSERT(output_producer[0]->node_type() == graphlib::NodeType::kBudaOp); + return output_producer[0]; + } + + return nullptr; +} + +tt::graphlib::UBlockOrder get_input_queue_ublock_order(Graph const *graph, Node const *node) +{ + UBlockOrder ublock_order = UBlockOrder::R; + if (tt::graphlib::Node *producer = get_input_queue_producer(graph, node->as()); producer) + { + ublock_order = get_output_ublock_order(graph, producer); + } + else + { + std::vector consumers = graph->user_data_edges(node); + bool all_users_transpose = std::all_of( + consumers.begin(), + consumers.end(), + [graph](graphlib::Edge e) { return graph->get_edge_attributes(e)->has_tm("transpose"); }); + tt::graphlib::UBlockOrder user_ublock_order = 
graph->get_edge_attributes(consumers.front())->get_ublock_order(); + bool all_users_same_order = std::all_of( + consumers.begin(), + consumers.end(), + [graph, user_ublock_order](graphlib::Edge e) + { return user_ublock_order == graph->get_edge_attributes(e)->get_ublock_order(); }); + + tt::graphlib::UBlockOrder q_ublock_order = all_users_same_order ? user_ublock_order : graphlib::UBlockOrder::R; + ublock_order = all_users_transpose ? flip_ublock_order(q_ublock_order) : q_ublock_order; + } + + return ublock_order; +} + UBlockOrder get_output_ublock_order(Graph const *graph, Node const *node) { if (node->node_type() == graphlib::NodeType::kInput) - return UBlockOrder::R; + { + return get_input_queue_ublock_order(graph, node); + } graphlib::BudaOpNode const *op_node = dynamic_cast(node); if (op_node and op_node->op_name() == "reduce") + { return UBlockOrder::R; + } return get_input_ublock_order(graph, node).back(); } - // Insert NOP on an edge with transpose TM, then flip ublock order for better streaming // returns true if nop inserted -bool try_insert_nop_on_transpose_edge(Graph *graph, Edge &edge) +bool try_insert_nop_on_transpose_edge(Graph *graph, Edge &edge) { auto node = graph->node_by_id(edge.consumer_node_id); std::vector tms = graph->get_edge_attributes(edge)->get_tms(); - if(tms.size() > 0 && tms[tms.size()-1].op == "nop") + if (tms.size() > 0 && tms[tms.size() - 1].op == "nop") return false; // even number of transposes are ok, tiles are not transposed in the end int transposes = 0; int last_transpose = 0; - for (std::size_t i = 0; i < tms.size(); i++) { - if (tms[i].op == "transpose") { + for (std::size_t i = 0; i < tms.size(); i++) + { + if (tms[i].op == "transpose") + { transposes++; last_transpose = i; } } if (transposes % 2 == 0) - return false; // Even number of transposes cancel out + return false; // Even number of transposes cancel out // Add a NOP on the edge, and move TMs after last transpose to it graphlib::BudaOpNode *nop = graph->add_node( @@ -1642,17 +1817,20 @@ bool try_insert_nop_on_transpose_edge(Graph *graph, Edge &edge) int num_second_group = (last_transpose == 0) ? tms.size() - 1 : 1; int num_third_group = tms.size() - num_first_group - 1; graph->get_edge_attributes(new_edge0)->set_tms( - std::vector(tms.begin(), tms.begin() + num_first_group)); + std::vector(tms.begin(), tms.begin() + num_first_group)); // Flip the ublock order wrt the producer for more likely streaming graphlib::UBlockOrder producer_ublock_order = graphlib::get_output_ublock_order(graph, graph->node_by_id(new_edge0.producer_node_id)); graph->get_edge_attributes(new_edge0)->set_ublock_order(graphlib::flip_ublock_order(producer_ublock_order)); - if (num_second_group > 0) { + if (num_second_group > 0) + { // No need to add second nop if we have only 1 transpose on position 0. - if (last_transpose != 0) { - // Assign last transpose to its own edge so it could be streamed. We might need an extra Nop for this purpose. + if (last_transpose != 0) + { + // Assign last transpose to its own edge so it could be streamed. We might need an extra Nop for this + // purpose. 
graphlib::BudaOpNode *nop2 = graph->add_node( graphlib::create_node( node->name() + "_transpose_nop_2_" + std::to_string(edge.edge_creation_id), "nop"), @@ -1661,14 +1839,16 @@ bool try_insert_nop_on_transpose_edge(Graph *graph, Edge &edge) auto [mid_edge, last_edge] = graphlib::insert_node_on_edge(graph, new_edge1, nop2); graph->get_edge_attributes(mid_edge)->set_tms( - std::vector(tms.begin() + num_first_group, tms.begin() + num_first_group + 1)); + std::vector(tms.begin() + num_first_group, tms.begin() + num_first_group + 1)); - if (num_third_group > 0) { + if (num_third_group > 0) + { graph->get_edge_attributes(last_edge)->set_tms( - std::vector(tms.begin() + num_first_group + 1, tms.end())); + std::vector(tms.begin() + num_first_group + 1, tms.end())); } } - else { + else + { // Keep rest of TMs on new_edge1 if nop2 is not added. graph->get_edge_attributes(new_edge1)->set_tms( std::vector(tms.begin() + num_first_group, tms.end())); @@ -1679,26 +1859,33 @@ bool try_insert_nop_on_transpose_edge(Graph *graph, Edge &edge) } // Return a vector of pairs of optimizer parameter input nodes and optimizer key names for a given model parameter node -std::vector> get_optimizer_param_info(const Graph *graph, const Node *model_parameter) +std::vector> get_optimizer_param_info( + const Graph *graph, const Node *model_parameter) { // If autograd has run, there will be EdgeType::kAutogradFwdToOptimizer edges. We parse through this // list looking for inputs that require its tensors to be populated by the python-side optimizer obj std::vector> ret; - for (graphlib::Edge edge : graph->user_edges(model_parameter)) { - - if (edge.edge_type != graphlib::EdgeType::kAutogradFwdToOptimizer) continue; - if (graph->node_by_id(edge.consumer_node_id)->node_type() != NodeType::kInput) continue; + for (graphlib::Edge edge : graph->user_edges(model_parameter)) + { + if (edge.edge_type != graphlib::EdgeType::kAutogradFwdToOptimizer) + continue; + if (graph->node_by_id(edge.consumer_node_id)->node_type() != NodeType::kInput) + continue; graphlib::InputNode *input = graph->node_by_id(edge.consumer_node_id)->as(); - if (not input->is_optimizer_parameter()) { continue; } + if (not input->is_optimizer_parameter()) + { + continue; + } // Parse out the optimizer-param suffix string and do a lookup to get the tensor std::string optimizer_input_name = input->name(); std::string::size_type optimizer_param_idx = optimizer_input_name.rfind('.'); - TT_ASSERT(optimizer_param_idx != std::string::npos, - "Expecting optimizer node to have a '.' suffix identifier"); + TT_ASSERT( + optimizer_param_idx != std::string::npos, + "Expecting optimizer node to have a '.' 
suffix identifier"); - std::string optimizer_param_key = optimizer_input_name.substr(optimizer_param_idx+1); + std::string optimizer_param_key = optimizer_input_name.substr(optimizer_param_idx + 1); ret.push_back(std::make_pair(input, optimizer_param_key)); } return ret; @@ -1712,18 +1899,22 @@ bool is_constant_input(const Node *node) bool is_recompute(const Graph *graph, const Node *node) { - for (const Edge& edge : graph->operand_edges(node)) { - if (edge.edge_type == graphlib::EdgeType::kAutogradFwdToRecompute) { + for (const Edge &edge : graph->operand_edges(node)) + { + if (edge.edge_type == graphlib::EdgeType::kAutogradFwdToRecompute) + { return true; } } return false; } -Node* get_fwd_from_recompute(const Graph *graph, const Node *node) +Node *get_fwd_from_recompute(const Graph *graph, const Node *node) { - for (const Edge& edge : graph->operand_edges(node)) { - if (edge.edge_type == graphlib::EdgeType::kAutogradFwdToRecompute) { + for (const Edge &edge : graph->operand_edges(node)) + { + if (edge.edge_type == graphlib::EdgeType::kAutogradFwdToRecompute) + { return graph->node_by_id(edge.producer_node_id); } } @@ -1759,8 +1950,7 @@ std::unique_ptr ConstEvalGraph::promote_node( graph_updated_since_autograd = true; - Node *consteval_node = consteval_graph.add_node( - std::move(consteval_node_free), subgraph_id_); + Node *consteval_node = consteval_graph.add_node(std::move(consteval_node_free), subgraph_id_); // Promoted consteval nodes are always in the forward epoch for their respective consteval graph // ConstEvalGraph will automatically run its own autograd and insert its own, respective BW ops @@ -1786,6 +1976,7 @@ std::unique_ptr ConstEvalGraph::promote_node( for (Edge const &runtime_edge : runtime_graph->operand_data_edges(runtime_node)) { auto runtime_attr = runtime_graph->get_edge_attributes(runtime_edge); + int const_producer_id = 0; if (runtime_to_consteval_map.find(runtime_edge.producer_node_id) == runtime_to_consteval_map.end()) { @@ -1793,13 +1984,21 @@ std::unique_ptr ConstEvalGraph::promote_node( dynamic_cast(runtime_graph->node_by_id(runtime_edge.producer_node_id)); TT_ASSERT(runtime_operand, "All operands of promoted nodes must be graph inputs"); Node *consteval_operand = nullptr; + + // Only add the node if it doesn't already exist in the consteval graph if (ConstEvalGraph *nested_consteval_graph = runtime_operand->get_consteval_graph()) consteval_operand = graft(nested_consteval_graph->get_graph()); + else if (!consteval_graph.has_node_with_name(runtime_operand->name())) + consteval_operand = consteval_graph.add_node(runtime_operand->clone(), subgraph_id_); else - consteval_operand = consteval_graph.add_node( - runtime_operand->clone(), subgraph_id_); + consteval_operand = consteval_graph.get_node_by_name(runtime_operand->name()); + + // Only map the operand if it has 1 user + if (runtime_graph->user_data_edges(runtime_operand).size() > 1) + const_producer_id = consteval_operand->id(); + else if (runtime_graph->user_data_edges(runtime_operand).size() == 1) + runtime_to_consteval_map.insert({runtime_operand->id(), consteval_operand->id()}); - runtime_to_consteval_map.insert({runtime_operand->id(), consteval_operand->id()}); runtime_graph->remove_edge(runtime_edge); auto users = runtime_graph->user_edges(runtime_operand); if (users.empty()) @@ -1807,7 +2006,7 @@ std::unique_ptr ConstEvalGraph::promote_node( } Edge consteval_edge = Edge( - runtime_to_consteval_map.at(runtime_edge.producer_node_id), + const_producer_id ? 
const_producer_id : runtime_to_consteval_map.at(runtime_edge.producer_node_id), runtime_edge.producer_output_port_id, runtime_to_consteval_map.at(runtime_edge.consumer_node_id), runtime_edge.consumer_input_port_id, @@ -1839,10 +2038,8 @@ std::unique_ptr ConstEvalGraph::promote_node( } else { - consteval_output = - consteval_graph.add_node( - std::make_unique(consteval_graph.name() + ".output"), - subgraph_id_); + consteval_output = consteval_graph.add_node( + std::make_unique(consteval_graph.name() + ".output"), subgraph_id_); } Edge consteval_edge(consteval_node->id(), 0, consteval_output->id(), 0, EdgeType::kData); @@ -1887,8 +2084,7 @@ Node *ConstEvalGraph::graft(Graph *other) continue; } - Node *new_node = consteval_graph.add_node( - node->clone(), subgraph_id_); + Node *new_node = consteval_graph.add_node(node->clone(), subgraph_id_); node_id_map.insert({node->id(), new_node->id()}); } @@ -1914,7 +2110,7 @@ Node *ConstEvalGraph::graft(Graph *other) return output; } -std::unique_ptr ConstEvalGraph::clone(Node *new_runtime_input, const std::string& new_input_node_name) +std::unique_ptr ConstEvalGraph::clone(Node *new_runtime_input, const std::string &new_input_node_name) { TT_ASSERT(new_runtime_input); int unique_id = Graph::generate_unique_graph_id(); @@ -1931,13 +2127,14 @@ std::unique_ptr ConstEvalGraph::clone(Node *new_runtime_input, c // Map the old ids to cloned ones for (auto [runtime_node_id, consteval_node_id] : runtime_to_consteval_map) { - Node* consteval_node = consteval_graph.node_by_id(consteval_node_id); + Node *consteval_node = consteval_graph.node_by_id(consteval_node_id); std::string node_name = consteval_node->name(); if (consteval_node->node_type() == NodeType::kInput and new_input_node_name != "") { std::string const &old_node_name = consteval_node->name(); - cloned->consteval_graph.update_node_name(cloned->consteval_graph.get_node_by_name(old_node_name), new_input_node_name); + cloned->consteval_graph.update_node_name( + cloned->consteval_graph.get_node_by_name(old_node_name), new_input_node_name); node_name = new_input_node_name; } cloned->runtime_to_consteval_map[runtime_node_id] = cloned->consteval_graph.get_node_by_name(node_name)->id(); @@ -1991,8 +2188,8 @@ void ConstEvalGraph::autograd() bool is_consteval_capable_input_type(Node *node) { graphlib::InputNode *input = dynamic_cast(node); - return input and (input->is_parameter() or input->is_constant()) - and not node->as()->has_tag("dont_consteval"); + return input and (input->is_parameter() or input->is_constant()) and + not node->as()->has_tag("dont_consteval"); } bool is_consteval_capable_op(Graph *graph, Node *node, bool allow_forks) @@ -2039,14 +2236,14 @@ bool is_consteval_capable_op(Graph *graph, Node *node, bool allow_forks) return op->is_tm(); } -bool is_consteval_capable_input_no_operand_forks(Graph *graph, InputNode *input) +bool is_consteval_capable_input_no_operand_forks(Graph *graph, InputNode *input) { if (not is_consteval_capable_input_type(input)) return false; std::vector users = graph->data_users(input); std::vector user_edges = graph->user_data_edges(input); - + // If there is only one user then check if that op is consteval capable if (users.size() == 1) return is_consteval_capable_op(graph, users[0]) and graph->data_operands(users[0]).size() == 1; @@ -2060,12 +2257,15 @@ bool is_consteval_capable_input_no_operand_forks(Graph *graph, InputNode *input) if (not std::all_of(users.begin(), users.end(), [graph](Node *n) { return graph->data_operands(n).size() == 1; })) return false; - if (not 
std::all_of(user_edges.begin(), user_edges.end(), [graph](Edge e) { return graph->get_edge_attributes(e)->get_tms().size() == 0; })) + if (not std::all_of( + user_edges.begin(), + user_edges.end(), + [graph](Edge e) { return graph->get_edge_attributes(e)->get_tms().size() == 0; })) return false; std::vector user_ops; for (Node *user : users) - if (auto* op = dynamic_cast(user)) + if (auto *op = dynamic_cast(user)) user_ops.push_back(op); else return false; @@ -2106,12 +2306,11 @@ bool try_consteval_input_no_operand_forks(Graph *graph, InputNode *input, bool d auto consteval_graph = input->get_consteval_graph(graph, true, true); auto users = graph->data_users(input); - + // Thanks to is_consteval_capable_input(), we know that each user is identical (same op, same attrs, no edge tms) consteval_graph->promote_node(graph, users[0]); - for (uint32_t i = 1; i < users.size(); i++) - bypass_node(graph, users[i], true); + for (uint32_t i = 1; i < users.size(); i++) bypass_node(graph, users[i], true); if (dump_graph) reportify::dump_consteval_graph(graph->name(), input->name(), consteval_graph->get_graph()); @@ -2121,18 +2320,18 @@ bool try_consteval_input_no_operand_forks(Graph *graph, InputNode *input, bool d bool can_swap_operands(Graph *graph, Node *node) { - if (graph->data_operands(node).size() != 2) - return false; + if (graph->data_operands(node).size() != 2) + return false; if (node->node_type() == kBudaOp) { auto op = node->as()->op_type().op; - return ( (op != "sub") && (op != "matmul") ); + return ((op != "sub") && (op != "matmul")); } if (node->node_type() == kPyOp) { auto op = node->as()->op_type().op; - return ( (op != "sub") && (op != "matmul") ); + return ((op != "sub") && (op != "matmul")); } return false; } @@ -2153,11 +2352,14 @@ void swap_operands(Graph *graph, Node *node) } } -Edge retrieve_between_edge(Graph *graph, Node *producer, Node *consumer) { +Edge retrieve_between_edge(Graph *graph, Node *producer, Node *consumer) +{ auto producer_user_edges = graph->user_data_edges(producer); Edge *edge = nullptr; - for (auto &e : producer_user_edges) { - if (e.consumer_node_id == consumer->id()) { + for (auto &e : producer_user_edges) + { + if (e.consumer_node_id == consumer->id()) + { edge = &e; break; } @@ -2166,28 +2368,34 @@ Edge retrieve_between_edge(Graph *graph, Node *producer, Node *consumer) { return *edge; } -bool are_bcasts_between_ops(Graph *graph, Node *producer, Node *consumer) { +bool are_bcasts_between_ops(Graph *graph, Node *producer, Node *consumer) +{ auto edge = retrieve_between_edge(graph, producer, consumer); auto edge_attr = graph->get_edge_attributes(edge); return edge_attr->has_broadcast_dims(); } -bool are_different_ranked_shapes_equivalent(Shape a, Shape b) { +bool are_different_ranked_shapes_equivalent(Shape a, Shape b) +{ auto a_vec = a.as_vector(); auto b_vec = b.as_vector(); // Remove all pre 1s std::vector new_a; - for (int i = 0; i < (int)a_vec.size(); i++) { - if (a_vec[i] == 1) { + for (int i = 0; i < (int)a_vec.size(); i++) + { + if (a_vec[i] == 1) + { a_vec.erase(a_vec.begin() + i); i--; } else if (a_vec[i] > 1) break; } - for (int i = 0; i < (int)b_vec.size(); i++) { - if (b_vec[i] == 1) { + for (int i = 0; i < (int)b_vec.size(); i++) + { + if (b_vec[i] == 1) + { b_vec.erase(b_vec.begin() + i); i--; } @@ -2196,15 +2404,19 @@ bool are_different_ranked_shapes_equivalent(Shape a, Shape b) { } // Remove all post 1s - for (int i = (int)a_vec.size() - 1; i >= 0; i--) { - if (a_vec[i] == 1) { + for (int i = (int)a_vec.size() - 1; i >= 0; i--) + { + if 
(a_vec[i] == 1) + { a_vec.erase(a_vec.begin() + i); } else if (a_vec[i] > 1) break; } - for (int i = (int)b_vec.size() - 1; i >= 0; i--) { - if (b_vec[i] == 1) { + for (int i = (int)b_vec.size() - 1; i >= 0; i--) + { + if (b_vec[i] == 1) + { b_vec.erase(b_vec.begin() + i); } else if (b_vec[i] > 1) @@ -2214,13 +2426,61 @@ bool are_different_ranked_shapes_equivalent(Shape a, Shape b) { if (a_vec.size() != b_vec.size()) return false; - for (int i = 0; i < (int)a_vec.size(); i++) { + for (int i = 0; i < (int)a_vec.size(); i++) + { if (a_vec[i] != b_vec[i]) return false; } return true; } +// Check if this is a linked queue. +// Linked queues are output queues which have users nodes connected via partial data copy edges. +// +bool is_linked_queue(const graphlib::Graph *graph, const graphlib::Node *node) +{ + bool output_link_queue = node->node_type() == graphlib::NodeType::kOutput and + not graph + ->user_edges( + node, + [](graphlib::Edge e) { + return e.edge_type == graphlib::EdgeType::kPartialDataCopy or + e.edge_type == graphlib::EdgeType::kSubgraphLink; + }) + .empty(); + bool input_link_queue = node->node_type() == graphlib::NodeType::kInput and + not graph + ->operand_edges( + node, + [](graphlib::Edge e) { + return e.edge_type == graphlib::EdgeType::kPartialDataCopy or + e.edge_type == graphlib::EdgeType::kSubgraphLink; + }) + .empty(); + return output_link_queue or input_link_queue; +} + +// Check whether queue is input queue on host, meaning it's data resides on host and is accessed via PCIe. +// +bool is_input_host_queue(bool input_queues_on_host, const Graph *graph, const Node *node) +{ + bool input_on_host = + input_queues_on_host && node->as()->is_input() && + (node->as()->is_activation() or node->as()->is_loss()) && + not is_linked_queue(graph, node); + + return input_on_host; +} + +// Check whether queue is output queue on host, meaning it's data resides on host and is transferred via PCIe. 
+// +bool is_output_host_queue(bool output_queues_on_host, const Graph *graph, const Node *node) +{ + bool output_on_host = output_queues_on_host && (node->node_type() == graphlib::NodeType::kOutput) && + node->as()->untilize() && not is_linked_queue(graph, node); + return output_on_host; +} + NodeGraphContainer::~NodeGraphContainer() { if (remove_from_graph) diff --git a/pybuda/csrc/graph_lib/utils.hpp b/pybuda/csrc/graph_lib/utils.hpp index 02a12c74a..8eef32653 100644 --- a/pybuda/csrc/graph_lib/utils.hpp +++ b/pybuda/csrc/graph_lib/utils.hpp @@ -3,24 +3,26 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once +#include +#include + +#include #include #include #include -#include -#include -#include +#include "graph_lib/defines.hpp" namespace py = pybind11; +#include "graph_lib/edge.hpp" #include "graph_lib/graph.hpp" #include "graph_lib/node.hpp" -#include "graph_lib/edge.hpp" -#include "third_party/json/json.hpp" - #include "python_bindings_common.hpp" +#include "third_party/json/json.hpp" template -constexpr auto type_name(const T&) noexcept { +constexpr auto type_name(const T &) noexcept +{ std::string_view name = __PRETTY_FUNCTION__; std::string_view prefix = "auto type_name(const T &) [T = "; std::string_view suffix = "]"; @@ -29,17 +31,22 @@ constexpr auto type_name(const T&) noexcept { return name; } -namespace tt { +namespace tt +{ -namespace graphlib { +namespace graphlib +{ struct OpType; +class QueueNode; +class InputNode; +class BudaOpNode; // pass through -bool default_node_filter(Node*); +bool default_node_filter(Node *); // Checks if given opnode is element-wise class OpNode; -bool is_eltwise(const OpNode* op); +bool is_eltwise(const OpNode *op); bool is_eltwise_nary(const OpNode *op); bool is_eltwise_unary(const OpNode *op); bool is_eltwise_binary(const OpNode *op); @@ -54,7 +61,7 @@ TileDim get_tile_dim_from_height_width(int tile_height, int tile_width); std::vector topological_sort( Graph const &graph, std::function node_filter = default_node_filter, bool unroll_loops = false); -std::vector> topological_generations(const Graph& graph); +std::vector> topological_generations(const Graph &graph); // Returns vector of all visible nodes in the graph. // @@ -63,21 +70,27 @@ std::vector visible_nodes(Graph const &graph, std::function get_longest_path(const Graph *graph, bool from_inputs_only = true); -std::vector get_nodes_with_indegree_zero(Graph* graph); -std::vector get_nodes_with_outdegree_zero(Graph* graph); -std::vector get_nodes_with_data_outdegree_zero(Graph* graph); +std::vector get_nodes_with_indegree_zero(Graph *graph); +std::vector get_nodes_with_outdegree_zero(Graph *graph); +std::vector get_nodes_with_data_outdegree_zero(Graph *graph); // Insert new node on the given edge. Node attributes will be picked up from consumer node. // Returns new edges to and from the new node. std::pair insert_node_on_edge( - Graph *graph, - Edge &edge, - Node *node, - bool inherit_consumer_attrs = true, + Graph *graph, + Edge &edge, + Node *node, + bool inherit_consumer_attrs = true, bool remove_edge = true, std::uint32_t consumer_index = 0, - bool place_tms_on_outgoing = false -); + bool place_tms_on_outgoing = false); + +QueueNode *create_buffering_queue( + Graph *graph, const graphlib::Node *producer_node, const std::string name, int num_entries); + +// Creates and inserts a nop node on the given edge. +// Returns newly created node and edges. 
+std::tuple insert_nop_on_edge(Graph *graph, Edge &edge, const std::string &nop_name, bool is_buffering = false, bool hoist_tms = false, bool remove_edge = true); // Bypass queue, connecting its source to its destination. There has to be only one source for queue, and user is // defined by user_edge. Diference from bypassing node (bypass_node) is that here we can bypass some users of queue and @@ -111,16 +124,19 @@ Edge swap( std::vector subgraph(const Graph *graph, Node *producer, Node *consumer); // Return nodes reachable from a given start node, using only data edges -std::vector reachable_nodes(const Graph *graph, Node *start, std::function node_filter = default_node_filter, bool ancenstors_only = false); +std::vector reachable_nodes( + const Graph *graph, + Node *start, + std::function node_filter = default_node_filter, + bool ancenstors_only = false); -std::vector top_row( - Graph const *graph, std::vector const &nodes); +std::vector top_row(Graph const *graph, std::vector const &nodes); -std::vector bot_row( - Graph const *graph, std::vector const &nodes); +std::vector bot_row(Graph const *graph, std::vector const &nodes); // Check if there is a data dependency between two nodes(producer, consumer), return true if it exists -bool check_producer_consumer(Graph* graph, Node* producer, Node* consumer, std::function node_filter = default_node_filter); +bool check_producer_consumer( + Graph *graph, Node *producer, Node *consumer, std::function node_filter = default_node_filter); // Return a subset of graph nodes in their respective topological order // There are two ways to filter: @@ -175,7 +191,7 @@ graphlib::Node *cascade_nary_to_binary_op(graphlib::Graph *graph, graphlib::Node void convert_implicit_to_explicit_bcasts(Graph *graph, Edge edge); // Swap dimensions of any broadcast tms, return true if change made -bool swap_broadcast_dims(graphlib::Graph *graph, graphlib::Edge edge, int old_dim, int new_dim); +bool swap_broadcast_dims(graphlib::Graph *graph, graphlib::Edge edge, int old_dim, int new_dim); // Insert squeezes / unsqueezes to satisfy change in rank void handle_change_rank(graphlib::Graph *graph, graphlib::Edge edge); @@ -183,7 +199,8 @@ void handle_change_rank(graphlib::Graph *graph, graphlib::Node *node); // This function clones the input producer node and creates a new edge connection replacing // the old edge. user_edge must come from an input node, returns new edge. 
-graphlib::Edge clone_input_forking_edge(graphlib::Graph *graph, graphlib::Edge user_edge, bool allow_single_user = false); +graphlib::Edge clone_input_forking_edge( + graphlib::Graph *graph, graphlib::Edge user_edge, bool allow_single_user = false); graphlib::Shape default_tm_evaluator(graphlib::OpType const &tm, graphlib::Shape shape, graphlib::IRLevel ir_level); graphlib::Shape ignore_broadcast_tm_evaluator( @@ -210,7 +227,9 @@ bool tms_support_kernel_broadcast( // Calculate node shape from operand shapes, using python callback void calculate_and_set_node_shape(Graph *graph, Node *node); +tt::graphlib::Node *get_input_queue_producer(Graph const *graph, tt::graphlib::InputNode const *node); std::vector get_input_ublock_order(Graph const *graph, Node const *node); +tt::graphlib::UBlockOrder get_input_queue_ublock_order(Graph const *graph, Node const *node); tt::graphlib::UBlockOrder get_output_ublock_order(Graph const *graph, Node const *node); // Insert NOP on an edge with transpose TM, then flip ublock order for better streaming @@ -219,11 +238,12 @@ bool try_insert_nop_on_transpose_edge(Graph *graph, Edge &edge); // Return a vector of pairs of optimizer parameter input nodes and optimizer key names for a given model parameter node class InputNode; -std::vector> get_optimizer_param_info(const Graph *graph, const Node *model_parameter); +std::vector> get_optimizer_param_info( + const Graph *graph, const Node *model_parameter); bool is_constant_input(const Node *node); bool is_recompute(const Graph *graph, const Node *node); -Node* get_fwd_from_recompute(const Graph *graph, const Node *node); +Node *get_fwd_from_recompute(const Graph *graph, const Node *node); bool can_swap_operands(Graph *graph, Node *node); void swap_operands(Graph *graph, Node *node); @@ -243,7 +263,8 @@ std::unique_ptr try_consteval_op(Graph *graph, Node *node, bool dump_graph bool try_consteval_input_no_operand_forks(Graph *graph, InputNode *input, bool dump_graph = false); -class ConstEvalGraph { +class ConstEvalGraph +{ public: explicit ConstEvalGraph( std::string const &name, Node *runtime_input, bool promote_input, unsigned int subgraph_id, int unique_id = -1); @@ -288,15 +309,17 @@ enum class RuntimeTensorTransformType Concatenate, }; -NLOHMANN_JSON_SERIALIZE_ENUM(tt::graphlib::RuntimeTensorTransformType, { - {tt::graphlib::RuntimeTensorTransformType::NoTransform, "NoTransform"}, - {tt::graphlib::RuntimeTensorTransformType::ReinterpretShape, "ReinterpretShape"}, - {tt::graphlib::RuntimeTensorTransformType::Prestride, "Prestride"}, - {tt::graphlib::RuntimeTensorTransformType::EmbeddingIndex, "EmbeddingIndex"}, - {tt::graphlib::RuntimeTensorTransformType::ConstantInput, "ConstantInput"}, - {tt::graphlib::RuntimeTensorTransformType::Unpad, "Unpad"}, - {tt::graphlib::RuntimeTensorTransformType::Concatenate, "Concatenate"}, -}); +NLOHMANN_JSON_SERIALIZE_ENUM( + tt::graphlib::RuntimeTensorTransformType, + { + {tt::graphlib::RuntimeTensorTransformType::NoTransform, "NoTransform"}, + {tt::graphlib::RuntimeTensorTransformType::ReinterpretShape, "ReinterpretShape"}, + {tt::graphlib::RuntimeTensorTransformType::Prestride, "Prestride"}, + {tt::graphlib::RuntimeTensorTransformType::EmbeddingIndex, "EmbeddingIndex"}, + {tt::graphlib::RuntimeTensorTransformType::ConstantInput, "ConstantInput"}, + {tt::graphlib::RuntimeTensorTransformType::Unpad, "Unpad"}, + {tt::graphlib::RuntimeTensorTransformType::Concatenate, "Concatenate"}, + }); class RuntimeTensorTransform { @@ -334,11 +357,20 @@ class RuntimeTensorTransform int 
kernel_width, int concat_group, int concat_index, - int concat_dim) - : type(type), original_shape(original_shape), reinterpreted_shape(reinterpreted_shape), - unpadded_shape(unpadded_shape), stride_height(stride_height), stride_width(stride_width), - kernel_height(kernel_height), kernel_width(kernel_width), concat_group(concat_group), - concat_index(concat_index), concat_dim(concat_dim) {} + int concat_dim) : + type(type), + original_shape(original_shape), + reinterpreted_shape(reinterpreted_shape), + unpadded_shape(unpadded_shape), + stride_height(stride_height), + stride_width(stride_width), + kernel_height(kernel_height), + kernel_width(kernel_width), + concat_group(concat_group), + concat_index(concat_index), + concat_dim(concat_dim) + { + } RuntimeTensorTransform(Shape original_shape, Shape reinterpreted_shape) { @@ -386,10 +418,7 @@ class RuntimeTensorTransform this->constant_tensor = make_shared_py_object(tensor); } - py::object get_constant_input_tensor() - { - return borrow_shared_py_object(this->constant_tensor); - } + py::object get_constant_input_tensor() { return borrow_shared_py_object(this->constant_tensor); } NLOHMANN_DEFINE_TYPE_INTRUSIVE( RuntimeTensorTransform, @@ -406,13 +435,17 @@ class RuntimeTensorTransform concat_dim); private: - // Constant Input std::shared_ptr constant_tensor; - }; bool are_different_ranked_shapes_equivalent(Shape a, Shape b); +bool is_linked_queue(const Graph *graph, const Node *node); + +bool is_input_host_queue(bool input_queues_on_host, const Graph *graph, const Node *node); + +bool is_output_host_queue(bool output_queues_on_host, const Graph *graph, const Node *node); + // Wrapper graph management utility class for Node. // If remove_from_graph is set to true on destruction of NodeGraphContainer // graph->remove_node(node) will be invoked. 
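
(Illustrative sketch, not part of the patch.) The new declarations added above in pybuda/csrc/graph_lib/utils.hpp (create_buffering_queue, insert_nop_on_edge, is_linked_queue, is_input_host_queue, is_output_host_queue) are graph-editing helpers. The sketch below shows how a hypothetical buffering pass might call them. Template arguments in the declarations were lost in this rendering of the diff, so the tuple returned by insert_nop_on_edge is assumed to be (new op node, incoming edge, outgoing edge) based only on its doc comment; treat this as a sketch against assumed signatures, not a confirmed usage from the patch.

// Hypothetical pass: insert a buffering NOP in front of a consumer op on each
// of its operand data edges, skipping operands that are host-resident input
// queues. Helper signatures follow the utils.hpp declarations above; the
// return-type template arguments are assumptions.
#include <string>

#include "graph_lib/graph.hpp"
#include "graph_lib/utils.hpp"

namespace tt::graphlib
{

void buffer_operands_with_nops(Graph *graph, Node *consumer, bool input_queues_on_host)
{
    for (Edge edge : graph->operand_data_edges(consumer))
    {
        Node *producer = graph->node_by_id(edge.producer_node_id);

        // Host input queues are fed directly from the host; leave them alone
        // (assumed policy for this sketch).
        if (is_input_host_queue(input_queues_on_host, graph, producer))
            continue;

        // Per its comment, insert_nop_on_edge returns the newly created node
        // and the edges into and out of it (tuple element types assumed here).
        auto [nop, edge_in, edge_out] =
            insert_nop_on_edge(graph, edge, consumer->name() + "_buffer_nop", /*is_buffering=*/true);
        (void)edge_in;
        (void)edge_out;

        // Optionally give the new NOP a small buffering queue fed by the
        // original producer.
        create_buffering_queue(graph, producer, nop->name() + "_buffer_queue", /*num_entries=*/2);
    }
}

}  // namespace tt::graphlib
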
diff --git a/pybuda/csrc/lower_to_buda/comment.hpp b/pybuda/csrc/lower_to_buda/comment.hpp deleted file mode 100644 index d5fb8350a..000000000 --- a/pybuda/csrc/lower_to_buda/comment.hpp +++ /dev/null @@ -1,43 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include -#include -#include -#include - -namespace tt -{ - -struct Comment -{ - Comment() = default; - Comment(char const *str) : str(str) {} - Comment(std::string const& str) : str(str) {} - Comment(std::stringstream const& str) : str(str.str()) {} - - operator bool() const { return not str.empty(); } - - std::string str; -}; - -inline std::ostream &operator<<(std::ostream &os, Comment const &comment) -{ - std::string::size_type begin = 0; - std::string::size_type end = 0; - while (end != comment.str.size()) - { - end = comment.str.find('\n', begin); - if (end == std::string::npos) - end = comment.str.size(); - std::string::size_type size = end - begin; - os << "# " << std::string_view(comment.str.data() + begin, size) << std::endl; - begin = end + 1; - } - return os; -} - -} // namespace tt diff --git a/pybuda/csrc/lower_to_buda/common.hpp b/pybuda/csrc/lower_to_buda/common.hpp index fcc7c70f5..96f71d6f3 100644 --- a/pybuda/csrc/lower_to_buda/common.hpp +++ b/pybuda/csrc/lower_to_buda/common.hpp @@ -187,7 +187,13 @@ inline bool is_b_data_format(DataFormat df) inline bool is_a_data_format(DataFormat df) { - return not is_b_data_format(df); + switch(df){ + case DataFormat::Float16: + case DataFormat::Bfp8: + case DataFormat::Bfp4: + case DataFormat::Bfp2: return true; + default: return false; + } } inline DataFormat to_a_data_format(DataFormat df) diff --git a/pybuda/csrc/lower_to_buda/debug.cpp b/pybuda/csrc/lower_to_buda/debug.cpp deleted file mode 100644 index e4611569f..000000000 --- a/pybuda/csrc/lower_to_buda/debug.cpp +++ /dev/null @@ -1,122 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "lower_to_buda/debug.hpp" - -#include "backend_api/device_config.hpp" -#include "balancer/balancer_utils.hpp" -#include "balancer/types.hpp" -#include "utils/env.hpp" - -namespace tt -{ -static bool debug_info_enabled() { return env_as("PYBUDA_NETLIST_DEBUG_INFO"); } - -void to_debug_info(std::ostream& os, tt::DeviceConfig const& device_config) -{ - if (not debug_info_enabled()) - return; - os << device_config << std::endl; -} - -static void to_debug_info(std::ostream& os, balancer::BufferModel const& buffer, bool include_t = false) -{ - os << "l1_size_tiles: " << buffer.l1_size_tiles; - os << ", "; - os << "total_size_bytes: " << buffer.size_bytes(include_t); -} - -static void to_debug_info(std::ostream& os, std::vector const& buffers, bool include_t = false) -{ - for (std::size_t i = 0; i < buffers.size(); ++i) - { - if (not buffers[i]) - continue; - os << " [" << i << "] = "; - to_debug_info(os, buffers[i], include_t); - os << std::endl; - } -} - -static void to_debug_info(std::ostream& os, std::vector const& input_dram_io_buf_size_tiles) -{ - for (std::size_t i = 0; i < input_dram_io_buf_size_tiles.size(); ++i) - { - os << " [" << i << "] = " << input_dram_io_buf_size_tiles[i]; - os << std::endl; - } -} - -static void to_debug_info_tile_sizes(std::ostream& os, balancer::OpModel const& op_model) -{ - std::vector dfs; - auto collect_dfs = [&dfs](std::vector const& buffers) - { - for (balancer::BufferModel const& buffer : buffers) - { - if (std::find(dfs.begin(), dfs.end(), buffer.data_format) 
== dfs.end()) - dfs.push_back(buffer.data_format); - } - }; - collect_dfs(op_model.input_buffers); - collect_dfs(op_model.parameter_buffers); - collect_dfs(op_model.intermediate_buffers); - collect_dfs(op_model.output_buffers); - std::sort(dfs.begin(), dfs.end()); - for (auto df : dfs) - { - os << " " << df << ": " << balancer::tile_size_bytes(df) << " bytes"; - } -} - -static std::size_t get_dram_io_size_bytes( - std::vector const& buffers, std::vector const& input_dram_io_buf_size_tiles) -{ - TT_ASSERT(buffers.size() == input_dram_io_buf_size_tiles.size()); - std::size_t total_size = 0; - for (std::size_t i = 0; i < input_dram_io_buf_size_tiles.size(); ++i) - { - total_size += balancer::tile_size_bytes(buffers[i].data_format) * input_dram_io_buf_size_tiles[i]; - } - - return total_size; -} - - void to_debug_info( - std::ostream& os, - std::string const& name, - balancer::OpModel const& op_model, - std::string const& arch_name, - std::vector const& input_dram_io_buf_size_tiles) -{ - if (not debug_info_enabled()) - return; - - os << std::endl << "Debug Info: " << name << std::endl; - os << std::endl; - os << op_model; - os << std::endl; - os << std::endl; - os << "L1 Breakdown:" << std::endl; - os << " tile_sizes:" << std::endl; - to_debug_info_tile_sizes(os, op_model); - os << std::endl; - os << " input_buffers:" << std::endl; - to_debug_info(os, op_model.input_buffers); - os << " parameter_buffers:" << std::endl; - to_debug_info(os, op_model.parameter_buffers, true); - os << " intermediate_buffers:" << std::endl; - to_debug_info(os, op_model.intermediate_buffers); - os << " output_buffers:" << std::endl; - to_debug_info(os, op_model.output_buffers); - os << std::endl; - os << " dram_io_buffers:" << std::endl; - to_debug_info(os, input_dram_io_buf_size_tiles); - os << std::endl; - os << " overlay_size: " << op_model.overlay_size << std::endl; - os << std::endl; - std::size_t dram_io_size_bytes = get_dram_io_size_bytes(op_model.input_buffers, input_dram_io_buf_size_tiles); - os << "Total L1 buffer usage: " << (op_model.get_l1_memory_usage() + dram_io_size_bytes + op_model.overlay_size - 64 * 1024) << " bytes" << std::endl; - os << "Estimated cycle count: " << op_model.get_execution_cycles(arch_name) << " cycles" << std::endl; -} -} diff --git a/pybuda/csrc/lower_to_buda/debug.hpp b/pybuda/csrc/lower_to_buda/debug.hpp deleted file mode 100644 index 43d1ad8c5..000000000 --- a/pybuda/csrc/lower_to_buda/debug.hpp +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include -#include - -namespace tt -{ - -struct DeviceConfig; - -namespace balancer -{ -struct OpModel; -} - -void to_debug_info(std::ostream& os, tt::DeviceConfig const& device_config); -void to_debug_info( - std::ostream& os, - std::string const& name, - tt::balancer::OpModel const& op_model, - std::string const& arch_name, - std::vector const& input_dram_io_buf_size_tiles); -} // namespace tt diff --git a/pybuda/csrc/lower_to_buda/device.cpp b/pybuda/csrc/lower_to_buda/device.cpp deleted file mode 100644 index 007c53467..000000000 --- a/pybuda/csrc/lower_to_buda/device.cpp +++ /dev/null @@ -1,13 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "lower_to_buda/device.hpp" - -namespace tt { - -std::ostream &operator<<(std::ostream &os, BudaDevice const &d) { - return os << d.id; -} - - -} diff --git a/pybuda/csrc/lower_to_buda/device.hpp 
b/pybuda/csrc/lower_to_buda/device.hpp deleted file mode 100644 index e4152885e..000000000 --- a/pybuda/csrc/lower_to_buda/device.hpp +++ /dev/null @@ -1,18 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include - -namespace tt { - -struct BudaDevice { - int id; - - BudaDevice(int id) : id(id) {} -}; - -std::ostream &operator<<(std::ostream &os, BudaDevice const &d); - -} // namespace tt diff --git a/pybuda/csrc/lower_to_buda/fused_op.cpp b/pybuda/csrc/lower_to_buda/fused_op.cpp deleted file mode 100644 index a6cacd05b..000000000 --- a/pybuda/csrc/lower_to_buda/fused_op.cpp +++ /dev/null @@ -1,245 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 - -#include "lower_to_buda/fused_op.hpp" - -#include - -namespace tt -{ - -std::ostream &operator<<(std::ostream &os, BudaFusedOp const &op) -{ - os << op.id << ": " << std::endl; - - const std::string indent = " "; - os << indent << "inputs: " << op.input_count << std::endl; - - // Dropped support for intermed formats at the moment. We'll provide a count only. - /*os << indent << "intermed_df: ["; - - bool first = true; - for (DataFormat df : op.intermediate_buffer_df) - { - if (!first) os << ", "; - first = false; - os << df; - } - os << "]" << std::endl;*/ - os << indent << "intermediates: " << op.intermediate_buffer_df.size() << std::endl; - - os << indent << "schedules: " << std::endl; - - auto print_op = [&os](auto op, auto indent, int sch_id) - { - os << indent << " - " << op.name << "." << sch_id << ": { type: " << op.type << ", inputs: ["; - - bool first = true; - for (auto i : op.inputs) - { - if (!first) - os << ", "; - first = false; - os << i; - } - - os << "]"; - - for (auto t : op.tms) - { - if (t.second.size() == 0) - continue; - - os << ", input_" << t.first << "_tms: ["; - - bool first_tm = true; - for (auto &tm : t.second) - { - if (!first_tm) - os << ", "; - first_tm = false; - - TT_ASSERT(tm.op == "broadcast" || tm.op == "tile_broadcast"); // only supported kind here - - if (tm.op == "broadcast") - { - // User-friendly dims - os << "broadcast: {"; - assert(tm.attr.size() == 2); - switch (std::get(tm.attr[0])) - { - case 0: throw std::runtime_error("Broadcast of W not supported"); - case 1: - os << "z"; - os << ": " << std::get(tm.attr[1]) << "}"; - break; - case 2: - os << "r"; - os << ": " << std::get(tm.attr[1]) << "}"; - break; - case 3: - os << "c"; - os << ": " << std::get(tm.attr[1]) << "}"; - break; - } - continue; - } - else if (tm.op == "tile_broadcast") - { - // User-friendly dims - os << "tile_broadcast: "; - assert(tm.attr.size() == 1); - switch (std::get(tm.attr[0])) - { - case 0: - case 1: throw std::runtime_error("Tile broadcast of W/Z not supported"); - case 2: os << "r"; break; - case 3: os << "c"; break; - } - continue; - } - } - os << "]"; - } - - if (!op.attrs.empty()) - { - os << ", attributes: {"; - bool first = true; - for (auto [key, value] : op.attrs) - { - if (!first) - { - os << ", "; - } - first = false; - os << key << ": " << value; - } - os << "}"; - } - - if (op.popped_buffers.size() > 0) - { - os << ", pop: ["; - bool first = true; - for (std::uint32_t buf : op.popped_buffers) - { - if (!first) - { - os << ", "; - } - first = false; - os << "intermed" << buf; - } - os << "]"; - } - - if (op.popped_last_buffers.size() > 0) - { - os << ", pop_last: ["; - bool first = true; - for (std::uint32_t buf : op.popped_last_buffers) - { - if (!first) - { - os << ", "; - } - 
first = false; - os << "intermed" << buf; - } - os << "]"; - } - - os << ", mblock: [" << op.block_shape.first << ", " << op.block_shape.second << "]"; - os << ", ublock: [" << op.ublock_shape.first << ", " << op.ublock_shape.second << "]"; - - os << ", output: " << op.output << "}" << std::endl; - }; - - int sch_id = 0; - for (auto sch : op.schedule) - { - os << indent << " -" << std::endl; - for (auto op : sch) - { - print_op(op, indent, sch_id); - } - sch_id++; - } - return os; -} - -bool BudaFusedSubOp::equivalent(const BudaFusedSubOp &other) const -{ - if (type != other.type) - return false; - if (output != other.output) - return false; - - if (inputs.size() != other.inputs.size()) - return false; - for (std::size_t i = 0; i < inputs.size(); i++) - if (inputs[i] != other.inputs[i]) - return false; - - if (attrs != other.attrs) - return false; - - if (popped_buffers != other.popped_buffers) - return false; - - for (auto &[index, input_tms] : tms) - { - if (other.tms.count(index) == 0) - return false; - if (input_tms.size() != other.tms.at(index).size()) - return false; - - for (std::size_t i = 0; i < input_tms.size(); i++) - { - const graphlib::OpType &us = input_tms[i]; - const graphlib::OpType &them = other.tms.at(index).at(i); - if (us.op != them.op) - return false; - if (us.attr != them.attr) - return false; - } - } - - if (block_shape != other.block_shape) - return false; - - if (ublock_shape != other.ublock_shape) - return false; - - return true; -} - -// return true if two fused ops are equivalent - i.e. same except for sub op name -bool BudaFusedOp::equivalent(const BudaFusedOp &other) const -{ - if (input_count != other.input_count) - return false; - if (intermediate_buffer_df.size() != other.intermediate_buffer_df.size()) - return false; - for (std::size_t i = 0; i < intermediate_buffer_df.size(); i++) - if (intermediate_buffer_df[i] != other.intermediate_buffer_df[i]) - return false; - - if (schedule.size() != other.schedule.size()) - return false; - for (std::size_t schedule_index = 0; schedule_index < schedule.size(); schedule_index++) - { - if (schedule[schedule_index].size() != other.schedule[schedule_index].size()) - return false; - - for (std::size_t i = 0; i < schedule[schedule_index].size(); i++) - if (!schedule[schedule_index][i].equivalent(other.schedule[schedule_index][i])) - return false; - } - return true; -} - -} // namespace tt - diff --git a/pybuda/csrc/lower_to_buda/fused_op.hpp b/pybuda/csrc/lower_to_buda/fused_op.hpp deleted file mode 100644 index d99494f4d..000000000 --- a/pybuda/csrc/lower_to_buda/fused_op.hpp +++ /dev/null @@ -1,45 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include -#include - -#include "graph_lib/node_types.hpp" -#include "lower_to_buda/common.hpp" - -namespace tt -{ - -struct BudaFusedSubOp -{ - std::string name; - std::string type; - std::vector inputs; - BudaOpAttrs attrs; - std::unordered_map> tms; // per operand - std::string output; - std::vector popped_buffers; - std::vector popped_last_buffers; - std::pair block_shape; - std::pair ublock_shape; - bool equivalent(const BudaFusedSubOp &other) - const; // return true if two sub ops are equivalent - i.e. same except for op names -}; - -struct BudaFusedOp -{ - std::uint32_t id; - std::uint32_t input_count; - std::vector intermediate_buffer_df; - std::vector> schedule; - - bool equivalent(const BudaFusedOp &other) - const; // return true if two fused ops are equivalent - i.e. 
same except for sub op names -}; - -std::ostream &operator<<(std::ostream &os, BudaFusedOp const &op); - -} // namespace tt diff --git a/pybuda/csrc/lower_to_buda/graph.cpp b/pybuda/csrc/lower_to_buda/graph.cpp deleted file mode 100644 index bb58c4de7..000000000 --- a/pybuda/csrc/lower_to_buda/graph.cpp +++ /dev/null @@ -1,68 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "lower_to_buda/graph.hpp" - -#include - -#include "lower_to_buda/comment.hpp" - -namespace tt { - -std::string get_subgraph_name( - graphlib::NodeEpochType epoch_type, - int epoch_number, - const std::string& arch_name, - std::uint32_t temporal_epoch_id, - std::uint32_t subgraph_index) -{ - std::string ret; - switch (epoch_type) { - case graphlib::NodeEpochType::Forward: ret = "fwd_"; break; - case graphlib::NodeEpochType::Backward: ret = "bwd_"; break; - case graphlib::NodeEpochType::Optimizer: ret = "opt_"; break; - } - ret = ret + std::to_string(subgraph_index) + "_"; - if (arch_name == "wormhole" or arch_name == "wormhole_b0") { - return ret + std::to_string(epoch_number) + "_" + "temporal_epoch_" + std::to_string(temporal_epoch_id); - } else { - return ret + std::to_string(epoch_number); - } -} - -std::vector BudaGraph::get_matching_epoch(graphlib::NodeEpochType type) const -{ - std::vector ret; - for (std::size_t i=0; i < epoch_types.size(); i++) - if (epoch_types[i] == type) - ret.push_back(i); - return ret; -} - -std::ostream &operator<<(std::ostream &os, BudaGraph const &g) { - - const std::string indent = " "; - for (std::size_t epoch = 0; epoch < g.ops.size(); epoch++) { - if (g.arch_name == "grayskull" and g.ops[epoch].size() == 0) - { - continue; - } - os << " " << get_subgraph_name(g.epoch_types[epoch], epoch, g.arch_name, g.epoch_to_temporal_epoch_id[epoch], g.epoch_to_subgraph_index[epoch]) << ":" << std::endl; - os << indent << "target_device: " << g.epoch_target_devices[epoch] << std::endl; - - int input_count = (g.epoch_types[epoch] == graphlib::NodeEpochType::Optimizer) ? 
1 : g.microbatch_size; - os << indent << "input_count: " << input_count << std::endl; - for (const BudaOp &op : g.ops[epoch]) { - if (op.debug_info) - os << std::endl << op.debug_info; - os << indent << op << std::endl; - } - os << std::endl; - } - - return os; -} - -} - - diff --git a/pybuda/csrc/lower_to_buda/graph.hpp b/pybuda/csrc/lower_to_buda/graph.hpp deleted file mode 100644 index 2218c357a..000000000 --- a/pybuda/csrc/lower_to_buda/graph.hpp +++ /dev/null @@ -1,33 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "lower_to_buda/device.hpp" -#include "lower_to_buda/op.hpp" - -namespace tt { - -struct BudaGraph { - - std::string name; - std::string arch_name; - std::vector epoch_types; - std::vector epoch_target_devices; - std::vector> ops; // arrays of ops per epoch - std::vector epoch_to_temporal_epoch_id; - std::vector epoch_to_subgraph_index; - std::uint32_t microbatch_size; - - BudaGraph(const std::string &name, const std::string& arch_name, std::uint32_t microbatch) - : name(name), arch_name(arch_name), microbatch_size(microbatch) {} - std::vector get_matching_epoch(graphlib::NodeEpochType type) const; - -}; - -std::string get_subgraph_name(graphlib::NodeEpochType epoch_type, int epoch_number, const std::string& arch_name, std::uint32_t temporal_epoch_id, std::uint32_t subgraph_index); -std::ostream &operator<<(std::ostream &os, BudaGraph const &g); - -} // namespace tt - - diff --git a/pybuda/csrc/lower_to_buda/netlist.cpp b/pybuda/csrc/lower_to_buda/netlist.cpp deleted file mode 100644 index c503f55d2..000000000 --- a/pybuda/csrc/lower_to_buda/netlist.cpp +++ /dev/null @@ -1,1572 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "lower_to_buda/netlist.hpp" - -#include - -#include "buda_passes.hpp" -#include "graph_lib/graph.hpp" -#include "graph_lib/node.hpp" -#include "graph_lib/node_types.hpp" -#include "graph_lib/utils.hpp" -#include "lower_to_buda/debug.hpp" -#include "lower_to_buda/graph.hpp" -#include "lower_to_buda/program.hpp" -#include "lower_to_buda/queue.hpp" -#include "passes/fuse_ops.hpp" -#include "passes/forked_dram_inputs.hpp" -#include "placer/utils.hpp" -#include "utils/assert.hpp" -#include "utils/logger.hpp" - -namespace tt -{ - -using Graph = graphlib::Graph; -using Node = graphlib::Node; -using NodeType = graphlib::NodeType; -using Edge = graphlib::Edge; - -void dump_group_of_queues( - std::stringstream &ss, const std::string &type, const std::vector &qs, std::size_t longest_name) -{ - if (qs.size() > 0) - { - ss << std::endl - << " " - << "# " << type << std::endl; - for (const BudaQueue &q : qs) - { - ss << " " << q.as_string(longest_name) << std::endl; - } - } -} - -std::string BudaNetlist::dump_to_yaml() const -{ - std::stringstream ss; - - ss << comments; - if (comments) - ss << std::endl; // Add an extra newline for readability - - ss << debug_info; - - ss << "devices:" << std::endl; - ss << " arch: " << arch_string << std::endl << std::endl; - - ss << "queues:" << std::endl; - - std::size_t longest_name = 0; - for (const BudaQueue &q : queues) - { - if (q.name.length() > longest_name) - longest_name = q.name.length(); - } - - // Group by type, for nicer display - std::unordered_map> grouped; - - for (const BudaQueue &q : queues) - { - grouped[q.type].push_back(q); - } - - // Fixed order of common types - std::vector types = {"input", "output", "parameter", "constant", "epoch_to_epoch", "accumulator"}; 
- - for (std::string type : types) - { - dump_group_of_queues(ss, type, grouped[type], longest_name); - grouped.erase(type); - } - - // Print other groups that might've showed up that we don't know about - for (auto const &[type, qs] : grouped) - { - dump_group_of_queues(ss, type, qs, longest_name); - } - - ss << std::endl; - ss << "graphs:" << std::endl; - for (const BudaGraph &g : graphs) - { - ss << g; - } - - ss << std::endl; - ss << "programs:" << std::endl; - for (const program::Program &p : programs) - { - ss << " - " << p; - } - - ss << std::endl; - - if (fused_ops.size() > 0) - { - ss << "fused_ops:" << std::endl; - for (const BudaFusedOp &op : fused_ops) ss << " " << op; - } - - return ss.str(); -} - -std::string get_buda_queue_type(graphlib::QueueNode *node) -{ - if (node->node_type() == graphlib::NodeType::kInput) - { - graphlib::InputNode *input_node = node->as(); - return input_node->input_type_string(); - } - - return node->queue_type_string(); -} - - -static graphlib::Node *get_input_queue_producer(Graph *graph, graphlib::InputNode *node) -{ - auto is_partial_datacopy_edge = [](Edge e) { return (e.edge_type == graphlib::EdgeType::kPartialDataCopy); }; - std::vector partial_datacopy_edges = graph->operand_edges(node, is_partial_datacopy_edge); - auto producers = graph->data_operands(node); - - if (not producers.empty() and not partial_datacopy_edges.empty()) { - throw std::runtime_error("Input queue " + node->name() + " has both producer and partial datacopy edge!"); - } - else if (not producers.empty()) - { - TT_ASSERT(producers.size() == 1); - return producers[0]; - } - else if (not partial_datacopy_edges.empty()) - { - std::vector producer_edges; - for (auto edge : partial_datacopy_edges) { - auto output_node = graph->node_by_id(edge.producer_node_id); - TT_ASSERT(graph->operand_edges(output_node).size() == 1, "Output node should only have 1 producer"); - producer_edges.push_back( - graph->operand_edges(output_node).front() - ); - } - - // Assert all partial datacopy producer edges have the same ublock order - TT_ASSERT( - std::all_of( - producer_edges.begin(), - producer_edges.end(), - [graph, producer_edges](Edge e) - { return graph->get_edge_attributes(e)->get_ublock_order() == graph->get_edge_attributes(producer_edges[0])->get_ublock_order(); } - ) - ); - - graphlib::OutputNode *output = - graph->node_by_id(partial_datacopy_edges[0].producer_node_id)->as(); - auto output_producer = graph->data_operands(output); - TT_ASSERT(output_producer.size() == 1); - TT_ASSERT(output_producer[0]->node_type() == graphlib::NodeType::kBudaOp); - return output_producer[0]; - } - - return nullptr; -} - -// Common code for all queues -BudaQueue create_queue( - Graph *graph, - graphlib::QueueNode *node, - placer::QueuePlacement const &placement, - balancer::BlockShapeMap const &block_shape_map) -{ - std::string type = get_buda_queue_type(node); - std::string memory_access = node->memory_access_type_string(); - - BudaQueue q(placement.name, type, memory_access, placement.chip_id, node->shape().get_tile_dim()); - - q.input_name = placement.input_name; - q.entries = node->get_num_entries(); - q.microbatch = graph->get_microbatch(); - q.data_format = node->output_df(); - q.alias = node->get_alias(); - q.dims = { - .grid_r = (int)placement.grid_shape.rows, - .grid_c = (int)placement.grid_shape.columns, - }; - - if (placement.on_host) - { - q.loc = BudaQueueLocation::HOST; - for (placer::QueueHostBufferPlacement const &p : placement.host_buffers) - { - q.host_loc.push_back({p.channel, 
p.address}); - } - } - else - { - q.loc = BudaQueueLocation::DRAM; - for (placer::QueueBufferPlacement const &p : placement.dram_buffers) - { - q.dram_loc.push_back({p.dram_channel, p.dram_address}); - } - } - - if ((node->node_type() == NodeType::kOutput and q.loc == BudaQueueLocation::HOST) or - node->node_type() != NodeType::kQueue) - { - q.blocks = block_shape_map.at(node->name()).as_buda_blocks(); - } - else - { - // e2e queue inserted after balancer, inherit from producer - Node *producer = graph->data_operands(node)[0]; - q.blocks = block_shape_map.at(producer->name()).as_buda_blocks(); - } - - - switch (node->node_type()) - { - case NodeType::kInput: - { - q.layout = node->as()->get_layout(); - - if (auto producer = get_input_queue_producer(graph, node->as()); producer) - { - q.ublock_order = get_output_ublock_order(graph, producer); - } - else - { - auto consumers = graph->user_data_edges(node); - bool all_users_transpose = std::all_of( - consumers.begin(), - consumers.end(), - [graph](graphlib::Edge e) { return graph->get_edge_attributes(e)->has_tm("transpose"); }); - auto user_ublock_order = graph->get_edge_attributes(consumers.front())->get_ublock_order(); - bool all_users_same_order = std::all_of( - consumers.begin(), - consumers.end(), - [graph, user_ublock_order](graphlib::Edge e) - { return user_ublock_order == graph->get_edge_attributes(e)->get_ublock_order(); }); - - auto q_ublock_order = all_users_same_order ? user_ublock_order : graphlib::UBlockOrder::R; - q.ublock_order = all_users_transpose ? flip_ublock_order(q_ublock_order) : q_ublock_order; - } - break; - } - case NodeType::kOutput: // fallthrough - case NodeType::kQueue: // fallthrough - { - auto producer = graph->data_operands(node); - TT_ASSERT(producer.size() == 1); - q.ublock_order = get_output_ublock_order(graph, producer[0]); - break; - } - default: - { - TT_ASSERT(false, "Unhandled queue node type", node->node_type()); - break; - } - } - - return q; -} - -void validate_op_grid_size(BudaOp const &op) -{ - if (op.type == "ethernet_datacopy") - return; - - TT_ASSERT(op.grid.grid_size_c > 0 && op.grid.grid_size_r > 0, "Op {} has a 0 in grid size", op.name); -} - -static BudaOp create_op( - Graph *graph, - graphlib::BudaOpNode *node, - std::vector const &operands, - std::vector &input_df, - BudaOpAttrs const &buda_attrs, - placer::OpPlacement const &placement, - std::unordered_map const &name_to_queue_placement, - balancer::OpModel const &op_model, - balancer::BlockShape const &block_shape, - std::string const &arch_name, - std::vector const &input_dram_io_buf_size_tiles, - const std::vector &forked_dram_edges, - bool ignore_tms = false - ) -{ - BudaOp op; - - op.name = node->name(); - op.type = node->op_type().op; - op.gradient_op = node->is_gradient_op(); - op.input_data_formats = input_df; - op.output_data_format = node->output_df(); - op.intermediate_data_format = node->intermediate_df(); - op.accumulate_data_format = node->accumulate_df(); - op.fidelity = node->math_fidelity(); - op.attrs = buda_attrs; - op.tile_dim = node->shape().get_tile_dim(); - - if (!forked_dram_edges.empty()) - { - for (auto edge : forked_dram_edges) - { - Node *producer_node = nullptr; - Node *consumer_node = nullptr; - producer_node = graph->node_by_id(edge.producer_node_id); - consumer_node = graph->node_by_id(edge.consumer_node_id); - // forked_dram_inputs holds following pair info {QueueNode name,BudaOpNode name} - op.forked_dram_inputs.push_back({producer_node->name(), consumer_node->name()}); - } - } - - // For fused op 
override op.attr with global attributes. - // Specific op attributes will be added later for each fused sub op. - if (node->is_fused_op()) - { - op.attrs = node->get_fused_op()->get_operation_attr(); - } - - op.untilize_output = false; - for (Edge e : graph->user_data_edges(node)) - { - Node *output = graph->node_by_id(e.consumer_node_id); - if (output->node_type() == graphlib::NodeType::kOutput) - { - try - { - placer::QueuePlacement qp = name_to_queue_placement.at(output->name()); - if (qp.on_host) - { - op.untilize_output = true; // ops that are writing to host need to untilize their output - } - } - catch (std::out_of_range &e) - { - throw std::runtime_error( - "Queue missing in placement results for " + output->name() + ", something went wrong."); - } - } - } - - TT_ASSERT( - !node->op_type().buda_attrs.empty() or node->op_type().attr.size() == 0 or node->is_fused_op(), - "All ops with attributes should've been lowered to something else by this point.", - OStreamJoin("node:", node->name()), - OStreamJoin("type:", node->op_type().op)); - - try - { - placer::Coord s = placement.placed_cores.start; - placer::Coord e = placement.placed_cores.end; - placer::Coord op_grid_size = {.row = e.row - s.row, .col = e.col - s.col}; - op.grid = (placement.grid_transpose) ? BudaOpGrid{s.row, s.col, op_grid_size.col, op_grid_size.row} : BudaOpGrid{s.row, s.col, op_grid_size.row, op_grid_size.col}; - op.grid_transpose = placement.grid_transpose; - - validate_op_grid_size(op); - - auto operand_edges = graph->operand_data_edges(node); - std::vector input_buf_min_size_tiles; - bool input_buf_overrides = false; - for (std::size_t i = 0; i < operand_edges.size(); i++) - { - const Edge &operand = operand_edges[i]; - auto edge_attrs = graph->get_edge_attributes(operand); - std::vector tms = edge_attrs->get_tms(); - if (tms.size() > 0 and not ignore_tms) - { - // Tile broadcasts are fused into the fused ops themselves - std::vector filtered_tms; - std::copy_if(begin(tms), end(tms), std::back_inserter(filtered_tms), - [](auto op_type) { return op_type.op != "tile_broadcast"; }); - if (filtered_tms.size() < tms.size()) - TT_ASSERT(node->op_type().op == "fused_op", - "Only fused ops should have tile broadcast TMs: {}", node->name()); - op.tms.insert(std::make_pair(operand.consumer_input_port_id, filtered_tms)); - } - - // We could just get the number from the input buffer model, but we don't want to pollute - // the netlist with "default" values - we only want to list something when there's an override - if (op_model.input_buffers.at(i).size_tiles_override) - { - input_buf_overrides = true; - input_buf_min_size_tiles.push_back(op_model.input_buffers.at(i).l1_size_tiles); - } - else - { - input_buf_min_size_tiles.push_back(0); - } - - for (std::size_t i=0; i < op_model.input_buffers.size(); i++) - if (op_model.input_buffers[i].minimize_input_buffer) - { - op.attrs["min_buffer_input"] = (int)i; - break; - } - } - - if (input_buf_overrides) - op.input_buf_min_size_tiles = input_buf_min_size_tiles; - - op.ublock_order = get_output_ublock_order(graph, node); - op.blocks = block_shape.as_buda_blocks(); - // TODO: ensure that there's actually room? 
it's probably not the best place to do this - op.buf_size_mb = op_model.get_output_buffer_factor(); - - // Overlay blob - op.overlay_size = op_model.overlay_size; - } - catch (std::out_of_range &e) - { - throw std::runtime_error("Op missing in placement results for " + node->name() + ", something went wrong."); - } - - TT_ASSERT( - not op.untilize_output or op.ublock_order == graphlib::UBlockOrder::R, - "Untilizer requires row-major ublock ordering"); - - op.inputs = operands; - op.input_dram_io_buf_size_tiles = input_dram_io_buf_size_tiles; - - std::stringstream ss; - to_debug_info(ss, node->name(), op_model, arch_name, input_dram_io_buf_size_tiles); - op.debug_info = Comment(ss); - - return op; -} - -static std::tuple, std::vector, BudaOpAttrs, placer::OpPlacement> -create_sparse_buffer_op( - Graph *graph, - graphlib::BudaOpNode *node, - std::vector operands, - std::vector const &input_df, - BudaOpAttrs const &buda_attrs, - placer::OpPlacement const &placement, - std::unordered_map const &name_to_queue_placement, - balancer::OpModel const &op_model, - balancer::BlockShape const &block_shape, - std::string const &arch_name, - std::vector const &input_dram_io_buf_size_tiles) -{ - TT_ASSERT(operands.size() == 3); - TT_ASSERT(input_df.size() == 3); - std::vector buffer_operands = {operands[1], operands[2]}; - std::vector buffer_input_df = {input_df[1], input_df[2]}; - // Fixup op attributes - BudaOpAttrs sparse_mm_buda_attrs = buda_attrs; - BudaOpAttrs buffer_buda_attrs = buda_attrs; - buffer_buda_attrs.erase("num_sparse_tiles"); - buffer_buda_attrs.erase("identity"); - buffer_buda_attrs.erase("act_buffered"); - if (buda_attrs.find("kernel_broadcast") != buda_attrs.end()) - { - TT_ASSERT(std::get>(buda_attrs.at("kernel_broadcast"))[0] == 1); - buffer_buda_attrs["kernel_broadcast"] = std::vector({0}); - sparse_mm_buda_attrs.erase("kernel_broadcast"); - } - // Fixup block shape - balancer::BlockShape buffer_block_shape = block_shape; - buffer_block_shape.mblock_m = 1; - buffer_block_shape.ublock.rt = op_model.input_buffers[1].block_shape.ublock.rt; - std::vector sparse_mm_input_df = input_df; - sparse_mm_input_df[1] = node->output_df(); - // Fixup placement - placer::OpPlacement buffer_placement = placement; - placer::OpPlacement sparse_mm_placement = placement; - TT_ASSERT((buffer_placement.placed_cores.size_c() % 2) == 0); - std::uint32_t split_grid_c = buffer_placement.placed_cores.size_c() / 2; - buffer_placement.placed_cores.end.col = buffer_placement.placed_cores.start.col + split_grid_c; - sparse_mm_placement.placed_cores.start.col = buffer_placement.placed_cores.end.col; - std::vector forked_dram_edges; - BudaOp op = create_op( - graph, - node, - buffer_operands, - buffer_input_df, - buffer_buda_attrs, - buffer_placement, - name_to_queue_placement, - op_model, - buffer_block_shape, - arch_name, - input_dram_io_buf_size_tiles, - forked_dram_edges, - false); - - op.name = node->name() + "_buffer"; - op.type = "buffer"; - operands[1] = op.name; - // Remap index 1 tms to index 0 - if (op.tms.find(1) != op.tms.end()) - { - op.tms[0] = op.tms[1]; - op.tms.erase(1); - } - - return std::make_tuple(op, operands, sparse_mm_input_df, sparse_mm_buda_attrs, sparse_mm_placement); -} - -BudaFusedOp create_fused_op(graphlib::BudaOpNode *op, const balancer::OpModel &op_model) -{ - TT_ASSERT(op->is_fused_op()); - - std::vector> schedules; - std::uint32_t input_count = 0; - std::vector intermed_df; - std::unordered_map intermed_ublock_shape; - balancer::UBlockShape dest_ublock_shape; - - // TODO: a lot 
of these calculations don't belong here, but in some kind of post-placer pass - // This should be a straight data transfer from op attributes to netlist attributes. - for (auto sch : op->get_fused_op()->get_schedules()) - { - std::vector schedule; - for (FusedSubOp f : sch.ops) - { - std::vector inputs; - std::vector input_block_shapes; - std::unordered_map> tms; // per operand - std::uint32_t sub_op_input_count = 0; - for (auto i : f.inputs) - { - tms.insert(std::make_pair(sub_op_input_count, std::vector{})); - if (i.type == FusedSubOpInput::InputType::INPUT) - { - inputs.push_back("input" + std::to_string(i.index)); - if (i.index + 1 > input_count) - input_count = i.index + 1; - input_block_shapes.push_back(op_model.input_buffers[i.index].block_shape.ublock); - } - else if (i.type == FusedSubOpInput::InputType::DEST) - { - inputs.push_back("dest"); - input_block_shapes.push_back(dest_ublock_shape); - } - else - { - inputs.push_back("intermed" + std::to_string(i.index)); - if (i.broadcast.second > 0) - { - graphlib::OpType brcst("broadcast",std::vector{(int)i.broadcast.first, (int)i.broadcast.second}); - tms[sub_op_input_count].push_back(brcst); - } - input_block_shapes.push_back(intermed_ublock_shape.at(i.index)); - } - if (i.tile_broadcast.first) - { - tms[sub_op_input_count].push_back( - graphlib::OpType("tile_broadcast", {2}, {})); - } - else if (i.tile_broadcast.second) - { - tms[sub_op_input_count].push_back(graphlib::OpType("tile_broadcast", {3}, {})); - } - sub_op_input_count++; - } - std::string output = "intermed" + std::to_string(f.output_buffer); - balancer::UBlockShape u_shape = op_model.fused_op_ublock_shape.at(f.name); - - if (f.output_type == FusedSubOp::OutputType::OUTPUT) - { - output = "output"; // final output - } - else if (f.output_type == FusedSubOp::OutputType::DEST) - { - output = "dest"; // dest reuse - dest_ublock_shape = u_shape; - } - else - { - if (intermed_df.size() < (std::uint32_t)f.output_buffer + 1) - intermed_df.resize(f.output_buffer + 1); - intermed_df[f.output_buffer] = f.output_df; - intermed_ublock_shape.insert(std::make_pair(f.output_buffer, u_shape)); - } - // std::pair ublock_shape = {blocks.ublock_rt, blocks.ublock_ct}; - balancer::Parallelization par = op_model.parallelization(); - std::pair ublock_shape = {u_shape.rt, u_shape.ct}; - std::pair block_shape = - f.get_mblock_for_ublock(ublock_shape, std::make_pair(par.r, par.c)); - - if (f.op_type.op == "matmul") - { - balancer::UBlockShape input0_ublock = input_block_shapes.at(0); - balancer::UBlockShape input1_ublock = input_block_shapes.at(1); - std::uint32_t m_k = f.op_shape.inputs.at(0).ct / std::max(input0_ublock.ct, input1_ublock.rt); - if (m_k == 0) - m_k = 1; - std::uint32_t u_kt = f.op_shape.inputs.at(0).ct / m_k; - f.op_type.buda_attrs["m_k"] = (int)m_k; - f.op_type.buda_attrs["u_kt"] = (int)u_kt; - } - // TODO: Should we also somehow merge attr to buda_attr? 
- auto sub_op = BudaFusedSubOp{ - f.name, - f.op_type.op, - inputs, - f.get_sub_op_buda_attr(), - tms, - output, - f.popped_buffers, - f.popped_last_buffers, - block_shape, - ublock_shape}; - - schedule.push_back(sub_op); - } - schedules.push_back(schedule); - } - - return BudaFusedOp{op->get_fused_op()->id(), input_count, intermed_df, schedules}; -} - -BudaNaryTM create_nary_tm(graphlib::BudaNaryTMNode *node, std::vector const &operands) -{ - BudaNaryTM tm; - tm.name = node->name(); - tm.type = node->op_type().op; - tm.inputs = operands; - return tm; -} - -// Given a linearized list of global epoch-ids, get a 2d-list capturing the -// temporal-special relation -std::vector> get_temporal_to_concurrent_spatial_epochs( - const placer::PlacerSolution &placer_solution, const std::vector &epochs) -{ - std::vector> temporal_to_spatial_epochs(placer_solution.num_temporal_epochs()); - for (std::uint32_t global_epoch_id : epochs) - { - uint32_t temporal_epoch_id = placer_solution.temporal_epoch_id(global_epoch_id); - TT_ASSERT(temporal_epoch_id < temporal_to_spatial_epochs.size()); - temporal_to_spatial_epochs[temporal_epoch_id].push_back(global_epoch_id); - } - return temporal_to_spatial_epochs; -} - -std::pair get_epoch_allocate_deallocate(graphlib::Node *q, const placer::PlacerSolution &placer_solution) -{ - try - { - const auto &qp = placer_solution.name_to_queue_placement.at(q->name()); - return std::make_pair(qp.epoch_allocate, qp.epoch_deallocate); - } - catch (std::out_of_range &e) - { - throw std::runtime_error("Queue missing in placement results for " + q->name() + ", something went wrong."); - } -} - -std::vector create_programs( - Graph *graph, placer::PlacerSolution &placer_solution, BudaGraph &buda_graph, const std::string &arch_string) -{ - std::vector programs; - - auto bwd_epochs = buda_graph.get_matching_epoch(graphlib::NodeEpochType::Backward); - bool have_bwd_epochs = bwd_epochs.size() > 0; - bool have_opt_epochs = buda_graph.get_matching_epoch(graphlib::NodeEpochType::Optimizer).size() > 0; - - std::uint32_t last_bwd_epoch = 0; - if (have_bwd_epochs) - { - last_bwd_epoch = *std::max_element(bwd_epochs.begin(), bwd_epochs.end()); - } - - bool firmware_looping_enabled = env_as("NUM_EXEC_LOOP_ITERATIONS", 0) > 1; - bool disable_dynamic_dram = env_as("PYBUDA_DISABLE_DYNAMIC_DRAM"); - - for (unsigned int subgraph_index = 0; subgraph_index < graph->num_subgraphs(); subgraph_index++) - { - for (graphlib::NodeEpochType epoch_type : - {graphlib::NodeEpochType::Forward, graphlib::NodeEpochType::Backward, graphlib::NodeEpochType::Optimizer}) - { - // Get fwd epochs - std::vector epochs = buda_graph.get_matching_epoch(epoch_type); - - if (!have_bwd_epochs && (epoch_type != graphlib::NodeEpochType::Forward)) - continue; - - // For each epoch, find the input and e2e nodes that feed it - std::vector> input_queues; - for (std::uint32_t epoch : epochs) - { - input_queues.push_back(graph->nodes( - [&graph, &placer_solution, epoch](Node *node) - { - if ((node->node_type() != graphlib::NodeType::kInput) && - (node->node_type() != graphlib::NodeType::kQueue) && - (node->node_type() != graphlib::NodeType::kOutput)) - return false; - - std::vector edges; - bool check_producer = false; - if ((node->node_type() == graphlib::NodeType::kInput) || - (node->node_type() == graphlib::NodeType::kQueue)) - edges = graph->user_data_edges(node); - else if (node->node_type() == graphlib::NodeType::kOutput) - { - edges = graph->operand_data_edges(node); - check_producer = true; - } - // If there's any edge 
matching, we should record the connection between queue and epoch - for (Edge edge : edges) - { - Node *neighbour = graph->node_by_id(check_producer ? edge.producer_node_id : edge.consumer_node_id); - try - { - if ( - // Our epoch - (placer_solution.name_to_op_placement.at(neighbour->name()).epoch_id() == epoch) && - - ( - // Input - ((node->node_type() == graphlib::NodeType::kInput) && - (node->as()->is_activation() || - node->as()->is_loss() || - node->as()->is_target())) - - // Or e2e - || ((node->node_type() == graphlib::NodeType::kQueue) && - node->as()->is_epoch_to_epoch()) - - // Or output with loopback edge - || ((node->node_type() == graphlib::NodeType::kOutput) && - not graph->user_edges(node, [](Edge e) { return e.edge_type == graphlib::EdgeType::kPartialDataCopy; }).empty()) - - // Or buffering queue - || ((node->node_type() == graphlib::NodeType::kQueue) && - node->as()->is_buffering()))) - return true; - } - catch (std::out_of_range &e) - { - throw std::runtime_error( - "Op missing in placement results for " + neighbour->name() + ", something went wrong."); - } - } - - return false; - })); - } - - // For each epoch, find the parameter and constant queues that feed it - std::vector> parameter_queues; - for (std::uint32_t epoch : epochs) - { - parameter_queues.push_back(graph->nodes( - [&graph, &placer_solution, epoch](Node *node) - { - if (node->node_type() != graphlib::NodeType::kInput) - return false; - - // If there's any user matching, we should record the connection between queue and epoch - for (Edge user_edge : graph->user_data_edges(node)) - { - Node *user = graph->node_by_id(user_edge.consumer_node_id); - try - { - if ( - // Our epoch - (placer_solution.name_to_op_placement.at(user->name()).epoch_id() == epoch) && - ((node->as()->is_parameter()) || - (node->as()->is_constant()))) - return true; - } - catch (std::out_of_range &e) - { - throw std::runtime_error( - "Op missing in placement results for " + user->name() + ", something went wrong."); - } - } - return false; - })); - } - - // For each epoch, find the gradient queues - std::vector> gradient_queues; - for (std::uint32_t epoch : epochs) - { - gradient_queues.push_back(graph->nodes( - [&graph, &placer_solution, epoch, have_opt_epochs](Node *node) - { - if ((node->node_type() != graphlib::NodeType::kQueue) || - (!node->as()->is_grad_accumulator())) - return false; - - TT_ASSERT( - graph->operand_data_edges(node).size() == 1, - "Grad accumulator " + node->name() + " should have exactly one producer"); - Node *producer = graph->node_by_id(graph->operand_data_edges(node)[0].producer_node_id); - - Node *consumer = nullptr; - if (have_opt_epochs) - { - TT_ASSERT( - graph->user_data_edges(node).size() == 1, - "Grad accumulator " + node->name() + - " should have exactly one consumer when optimizer is used"); - consumer = graph->node_by_id(graph->user_data_edges(node)[0].consumer_node_id); - } - - try - { - return - // Bwd - ((placer_solution.name_to_op_placement.at(producer->name()).epoch_id() == epoch) && - producer->as()->is_gradient_op()) || - - // Optimizer - ((consumer != nullptr) && - (placer_solution.name_to_op_placement.at(consumer->name()).epoch_id() == epoch)); - } - catch (std::out_of_range &e) - { - throw std::runtime_error( - "Op missing in placement results for " + producer->name() + ", something went wrong."); - } - })); - } - - // - // Generate queue settings for each epoch execute - // - std::vector> queue_settings; - std::vector>> - increment_list; // list of ptrs to increment after execute, with wrap 
values - - // Variable map for queues - // We'll reuse variable for queues in the same epoch, of same type, and same size - int qvar_index = 0; - std::unordered_map< - std::uint32_t, // epoch - std::unordered_map< - std::string, // type - std::unordered_map< - std::uint32_t, // size - std::unordered_map< - bool, // static - std::pair>>>> - qvars_map; - auto qvars = [&qvars_map, &qvar_index]( - graphlib::Node *node, int epoch, program::Variable::ShadowType shadow_type, bool is_static) - { - std::string type = ""; - std::uint32_t size = node->as()->get_num_entries(); - - if (node->node_type() == graphlib::NodeType::kQueue) - type = "e2e"; - else if (node->node_type() == graphlib::NodeType::kInput) - type = node->as()->input_type_string(); - - if (shadow_type == program::Variable::ShadowType::CROSS_EPOCH) - type += "_ce_ng"; - else if (shadow_type == program::Variable::ShadowType::CROSS_PROGRAM) - type += "_cp_ng"; - - if ((qvars_map[epoch].find(type) == qvars_map[epoch].end()) || - (qvars_map[epoch][type].find(size) == qvars_map[epoch][type].end()) || - (qvars_map[epoch][type][size].find(is_static) == qvars_map[epoch][type][size].end())) - { - // Node doesn't have variables, let's create some - qvars_map[epoch][type][size].insert(std::make_pair( - is_static, - std::make_pair( - std::make_shared("lptr_q" + std::to_string(qvar_index), is_static), - std::make_shared( - "gptr_q" + std::to_string(qvar_index), is_static, 0, shadow_type)))); - qvar_index++; - } - return qvars_map[epoch].at(type).at(size).at(is_static); - }; - - // Generate settings - std::vector queue_variables; - bool has_cache_buffers = false; - bool has_cache_read_buffer = false; - std::unordered_map epoch_to_epoch_index; - for (std::uint32_t epoch_index = 0; epoch_index < epochs.size(); epoch_index++) - { - std::uint32_t epoch = epochs[epoch_index]; - epoch_to_epoch_index[epoch] = epoch_index; - queue_settings.push_back({}); - increment_list.push_back({}); - if (placer_solution.epoch_id_to_subgraph_index[epoch] != subgraph_index) - continue; - - // All input should be read locally. In the last epoch that reads the input, it should be read - // globally, too. 
- for (graphlib::Node *q : input_queues[epoch_index]) - { - if (q->as()->is_buffering()) - { - // If dynamic dram is disabled - if (disable_dynamic_dram) - { - // Need to increment static queue rd/wtr ptrs as queue is persistant - uint32_t temporal_epoch_id = placer_solution.temporal_epoch_id(epoch); - const auto &[lptr, gptr] = qvars(q, temporal_epoch_id, program::Variable::ShadowType::NONE, true); - - auto qs = program::QueueSettings( - q->name(), - program::QueueAttributes{ - .read_ptr_global_ = gptr, - .read_ptr_local_ = lptr, - .epoch_allocate = -1, - .epoch_deallocate = -1}); - - std::uint32_t wrap_value = q->as()->get_num_entries(); - increment_list[epoch_index].insert(std::make_tuple(lptr->name(), lptr, wrap_value, true)); - increment_list[epoch_index].insert(std::make_tuple(gptr->name(), gptr, wrap_value, true)); - queue_settings[epoch_index].push_back(qs); - } - else - { - // We do not increment any of the rd/wtr ptrs for buffering queues because - // queue will be produced and consumed within the same epoch execution - auto qs = program::QueueSettings( - q->name(), - program::QueueAttributes{ - .read_ptr_global_ = nullptr, - .read_ptr_local_ = nullptr, - .epoch_allocate = (int)epoch, - .epoch_deallocate = (int)epoch}); - // since buffering queue is produced and consumed within the same epoch, we alway enable global autoincrement - qs.global_rdptr_autoinc = true; - queue_settings[epoch_index].push_back(qs); - } - continue; - } - - uint32_t temporal_epoch_id = placer_solution.temporal_epoch_id(epoch); - bool read_global; - if (q->as()->is_output()) - { - read_global = (temporal_epoch_id == get_first_epoch_producer(graph, q, placer_solution)); - } - else - { - read_global = (temporal_epoch_id == get_last_epoch_use(graph, q, placer_solution)); - - } - auto [epoch_allocate, epoch_deallocate] = get_epoch_allocate_deallocate(q, placer_solution); - - // WORKAROUND: If we have a backward, we need a "shadow global read pointer" to track the - // real one when fwd-bwd-fwd-bwd are called, otherwise setting it to 0 will "reset" it for bwd. - // This needs to be removed eventually, as it only works around this particular call pattern. - bool needs_shadow_global_read_pointer = !read_global; - program::Variable::ShadowType shadow_pointer_type = program::Variable::ShadowType::NONE; - if (needs_shadow_global_read_pointer) - { - if (epoch_type == graphlib::NodeEpochType::Forward) { - shadow_pointer_type = any_consumers_cross_epoch(graph, q) ? program::Variable::ShadowType::CROSS_PROGRAM : program::Variable::ShadowType::CROSS_EPOCH; - } else { - shadow_pointer_type = program::Variable::ShadowType::CROSS_EPOCH; // backward & opt are always the last consumer program - } - } - bool is_write_only = placer_solution.name_to_queue_placement[q->name()].write_only; - if (is_write_only) - { - int write_stride = placer_solution.name_to_queue_placement[q->name()].write_stride; - auto write_ptr = std::make_shared("v_cache_write_index", true); - auto qs = program::QueueSettings( - q->name(), program::RamAttributes{.read_ptr_ = nullptr, .write_ptr_ = write_stride != 1 ? 
write_ptr : nullptr}); - if (write_stride != 1) - { - qs.global_wrptr_autoinc = - placer_solution.name_to_queue_placement[q->name()] - .write_stride; // get t size of reader to support write pointer striding - } - qs.prologue = false; - has_cache_buffers = true; - queue_settings[epoch_index].push_back(qs); - } - else - { - const auto &[lptr, gptr] = - qvars(q, temporal_epoch_id, shadow_pointer_type, epoch_allocate == -1); - - auto qs = program::QueueSettings( - q->name(), - program::QueueAttributes{ - // see workaround above - //.read_ptr_global_= read_global ? gptr : nullptr, - .read_ptr_global_ = gptr, - .read_ptr_local_ = lptr, - .epoch_allocate = epoch_allocate, - .epoch_deallocate = epoch_deallocate, - }); - - bool rd_ptr_autoinc_enabled = (read_global && firmware_looping_enabled); - if (rd_ptr_autoinc_enabled) { - qs.global_rdptr_autoinc = true; - } - queue_settings[epoch_index].push_back(qs); - std::uint32_t wrap_value = q->as()->get_num_entries(); - increment_list[epoch_index].insert(std::make_tuple(lptr->name(), lptr, wrap_value, true)); - // see workaround above - // if (read_global) { - increment_list[epoch_index].insert(std::make_tuple(gptr->name(), gptr, wrap_value, true)); - //} - } - } - - if ((epoch_type == graphlib::NodeEpochType::Forward) || (epoch_type == graphlib::NodeEpochType::Backward)) - { - // In forward and backward, all parameter and constant queues should be read in during prologue - for (graphlib::Node *q : parameter_queues[epoch_index]) - { - // TODO(jchu): change RamAttributes (r/w ptrs) for repeated-structure integration - if (q->as()->is_parameter()) - { - if (placer_solution.name_to_queue_placement[q->name()].read_only) - { - auto read_ptr = std::make_shared("v_cache_read", true); - auto qs = program::QueueSettings( - q->name(), program::RamAttributes{.read_ptr_ = read_ptr, .write_ptr_ = nullptr}); - - qs.read_only = true; - qs.global_rdptr_autoinc = 1; // RAM needs global_rdptr_autoinc - queue_settings[epoch_index].push_back(qs); - if (not has_cache_read_buffer) - { - queue_variables.push_back(read_ptr); - has_cache_read_buffer = true; - } - } - else - { - auto qs = program::QueueSettings( - q->name(), program::RamAttributes{.read_ptr_ = nullptr, .write_ptr_ = nullptr}); - qs.prologue = q->as()->is_prologue(); - queue_settings[epoch_index].push_back(qs); - } - } - else - { - auto qs = program::QueueSettings( - q->name(), - program::QueueAttributes{.read_ptr_global_ = nullptr, .read_ptr_local_ = nullptr}); - - qs.prologue = q->as()->is_prologue(); - if (!qs.prologue) - qs.rd_ptr_autoinc = false; - queue_settings[epoch_index].push_back(qs); - } - } - } - - if (epoch_type == graphlib::NodeEpochType::Optimizer) - { - // In optimizer, constants are read from dram - for (graphlib::Node *q : parameter_queues[epoch_index]) - { - if (q->as()->is_constant()) - { - auto qs = program::QueueSettings( - q->name(), - program::QueueAttributes{.read_ptr_global_ = nullptr, .read_ptr_local_ = nullptr}); - - qs.prologue = false; - queue_settings[epoch_index].push_back(qs); - } - } - } - - if (epoch_type == graphlib::NodeEpochType::Backward) - { - // In backward, we need to epilogue copy them back to dram - for (graphlib::Node *q : gradient_queues[epoch_index]) - { - auto qs = program::QueueSettings( - q->name(), program::RamAttributes{.read_ptr_ = nullptr, .write_ptr_ = nullptr}); - qs.prologue = true; - qs.epilogue = true; - qs.zero = "$v_zero_grad"; - queue_settings[epoch_index].push_back(qs); - } - } - - if (epoch_type == graphlib::NodeEpochType::Optimizer) - { - // In 
optimizer, we read & write parameter queues - for (graphlib::Node *q : parameter_queues[epoch_index]) - { - if (q->as()->is_parameter()) - { - auto qs = program::QueueSettings( - q->name(), program::RamAttributes{.read_ptr_ = nullptr, .write_ptr_ = nullptr}); - queue_settings[epoch_index].push_back(qs); - } - else - { - auto qs = program::QueueSettings( - q->name(), - program::QueueAttributes{.read_ptr_global_ = nullptr, .read_ptr_local_ = nullptr}); - } - } - - // Optimizer reads gradient accumulators - for (graphlib::Node *q : gradient_queues[epoch_index]) - { - auto qs = program::QueueSettings( - q->name(), program::RamAttributes{.read_ptr_ = nullptr, .write_ptr_ = nullptr}); - queue_settings[epoch_index].push_back(qs); - } - } - } - - std::string program_name = (epoch_type == graphlib::NodeEpochType::Forward) ? "run_fwd_" - : (epoch_type == graphlib::NodeEpochType::Backward) ? "run_bwd_" - : (epoch_type == graphlib::NodeEpochType::Optimizer) ? "run_opt_" - : "error"; - program_name += std::to_string(subgraph_index); - - for (const auto &[epoch, qvars] : qvars_map) - { - for (const auto &[type, qvars2] : qvars) - { - for (const auto &[size, qvars3] : qvars2) - { - for (const auto &kv : qvars3) - { - queue_variables.push_back(std::get<0>(kv.second)); // lptr - queue_variables.push_back(std::get<1>(kv.second)); // gptr - } - } - } - } - - // Figure out allocate/deallocate commands - std::unordered_map> alloc_queue_cmds; - std::unordered_map> dealloc_queue_cmds; - std::unordered_map> to_alloc; - std::unordered_map> to_dealloc; - std::unordered_set visited_queue_alloc_dealloc; - for (std::uint32_t epoch_index = 0; epoch_index < epochs.size(); epoch_index++) - { - std::uint32_t epoch = epochs[epoch_index]; - if (placer_solution.epoch_id_to_subgraph_index[epoch] != subgraph_index) - continue; - for (const program::QueueSettings &qs : queue_settings[epoch_index]) - { - try - { - // queue should have single alloc/dealloc invocation: {earliest alloc, latest dealloc} - if (visited_queue_alloc_dealloc.find(qs.name()) == visited_queue_alloc_dealloc.end()) - { - int epoch_allocate = qs.epoch_allocate(); - if (epoch_allocate >= 0) - to_alloc[epoch_allocate].push_back(qs); - - int epoch_deallocate = qs.epoch_deallocate(); - if (epoch_deallocate >= 0) - to_dealloc[epoch_deallocate].push_back(qs); - visited_queue_alloc_dealloc.insert(qs.name()); - } - } - catch (std::out_of_range &e) - { - throw std::runtime_error("Invalid allocate/deallocate epoch for " + qs.name()); - } - } - } - for (auto &[epoch_index, qs] : to_alloc) - alloc_queue_cmds.emplace(std::make_pair(epoch_index, std::make_shared(qs))); - for (auto &[epoch_index, qs] : to_dealloc) - dealloc_queue_cmds.emplace(std::make_pair(epoch_index, std::make_shared(qs))); - - bool has_zero_grad = epoch_type == graphlib::NodeEpochType::Backward; - bool is_optimizer_loop = epoch_type == graphlib::NodeEpochType::Optimizer; - std::vector> temporal_to_spatial_epochs = - get_temporal_to_concurrent_spatial_epochs(placer_solution, epochs); - - programs.push_back(program::Program::loop_template( - program_name, - queue_variables, - graph->get_microbatch(), - has_zero_grad, - is_optimizer_loop, - has_cache_buffers, - [&queue_settings, - &temporal_to_spatial_epochs, - epoch_type, - &increment_list, - has_zero_grad, - &epoch_to_epoch_index, - &arch_string, - &last_bwd_epoch, - &alloc_queue_cmds, - &dealloc_queue_cmds, - &buda_graph, - is_optimizer_loop, - &firmware_looping_enabled, - subgraph_index, - placer_solution](program::Program &p) - { - // Loop body - for 
(std::uint32_t temporal_epoch_id = 0; temporal_epoch_id < temporal_to_spatial_epochs.size(); - temporal_epoch_id++) - { - // Add queue allocate command for the temporal epoch - if (alloc_queue_cmds.count(temporal_epoch_id) > 0) - p.add(alloc_queue_cmds.at(temporal_epoch_id)); - - std::vector> queued_dealloc_cmds; - - bool empty_temporal_epoch = true; - for (std::uint32_t spatial_epoch_index = 0; - spatial_epoch_index < temporal_to_spatial_epochs[temporal_epoch_id].size(); - spatial_epoch_index++) - { - std::uint32_t epoch = temporal_to_spatial_epochs[temporal_epoch_id][spatial_epoch_index]; - empty_temporal_epoch &= buda_graph.ops[epoch].empty(); - } - - for (std::uint32_t spatial_epoch_index = 0; - spatial_epoch_index < temporal_to_spatial_epochs[temporal_epoch_id].size(); - spatial_epoch_index++) - { - if (empty_temporal_epoch) - { - continue; - } - std::uint32_t epoch = temporal_to_spatial_epochs[temporal_epoch_id][spatial_epoch_index]; - if (placer_solution.epoch_id_to_subgraph_index.at(epoch) != subgraph_index) - continue; - std::uint32_t epoch_index = epoch_to_epoch_index.at(epoch); - - const std::string subgraph_name = - get_subgraph_name(epoch_type, epoch, arch_string, temporal_epoch_id, subgraph_index); - - // Generate execute command - if (empty_temporal_epoch and buda_graph.ops[epoch].size() == 0) - { - continue; - } - p.add(std::make_shared(subgraph_name, queue_settings[epoch_index])); - } - - // Write out all dealloc commands - if (dealloc_queue_cmds.count(temporal_epoch_id) > 0) - p.add(dealloc_queue_cmds.at(temporal_epoch_id)); - - std::unordered_set q_ptr_variables_incremented_this_temporal_epoch = {}; - - for (std::uint32_t spatial_epoch_index = 0; - spatial_epoch_index < temporal_to_spatial_epochs[temporal_epoch_id].size(); - spatial_epoch_index++) - { - std::uint32_t epoch = temporal_to_spatial_epochs[temporal_epoch_id][spatial_epoch_index]; - if (placer_solution.epoch_id_to_subgraph_index.at(epoch) != subgraph_index) - continue; - std::uint32_t epoch_index = epoch_to_epoch_index.at(epoch); - - for (auto [var_name, var, wrap_value, double_wrap] : increment_list[epoch_index]) - { - if (firmware_looping_enabled) { - continue; - } - if (var->needs_shadow_global_read_pointer()) - { - var = var->get_shadow_global_read_pointer(); - } - if (q_ptr_variables_incremented_this_temporal_epoch.find(var->name()) == q_ptr_variables_incremented_this_temporal_epoch.end()) { - int wrap = - double_wrap ? wrap_value * 2 : wrap_value; // hack for backend to set wrap to wrap x2 to match hardware wrap - p.instruction_incwrap( - var, is_optimizer_loop or not double_wrap ? 
p.get_var("c_one") : p.get_var("c_microbatch_size"), wrap); - q_ptr_variables_incremented_this_temporal_epoch.insert(var->name()); - } - } - - // Clear zero grad for subsequent iterations of the loop - if (has_zero_grad && (epoch == last_bwd_epoch)) - { - program::VariableP v_zero_grad = p.get_var("v_zero_grad"); - p.set_variable_value(v_zero_grad, 0); - } - } - } - })); - } - } - - return programs; -} - -static std::vector get_input_dram_io_buf_size_tiles( - Graph const *graph, - DeviceConfig const &device_config, - placer::PlacerSolution const &placer_solution, - Node const *node, - balancer::OpModel const &op_model) -{ - placer::OpPlacement const& op_placement = placer_solution.name_to_op_placement.at(node->name()); - auto dram_io_queue_node = [&placer_solution, &op_placement](graphlib::Node *operand) - { - auto *queue = dynamic_cast(operand); - if (not queue) - return false; - placer::QueuePlacement const &queue_placement = placer_solution.name_to_queue_placement.at(queue->name()); - - // Currently router (part of net2pipe that does chip2chip routing) does not support input_dram_io_buf_size_tiles - // and its presence in the netlist actually causes issues. So we need to disable emitting this attribute for - // remote dram reads. For more context: - // tenstorrent/budabackend#1979#note_263659 - bool remote_read = queue_placement.chip_id != op_placement.chip_id; - return not remote_read and (queue->is_input() or queue->is_epoch_to_epoch() or queue->is_buffering()); - }; - - std::size_t num_dram_readers = 0; - auto operands = graph->data_operands(node); - std::vector input_dram_io_buf_size_tiles(operands.size(), 0); - int input_idx = 0; - for (Node *operand : operands) - { - bool is_prologue = bool(op_model.parameter_buffers[input_idx]); - num_dram_readers += int(dram_io_queue_node(operand) and not is_prologue); - ++input_idx; - } - - if (num_dram_readers == 0 or env_as("PYBUDA_DISABLE_EXPLICIT_DRAM_IO")) - return input_dram_io_buf_size_tiles; - - // DRAM IO buffer sizing - TT_ASSERT(op_model.get_l1_memory_usage() <= device_config.get_l1_usable_size()); - - // We can reclaim the default pipegen carve out space for DRAM io get_l1_dram_io_backend_reserved_size - std::size_t l1_per_input_dram_prefetch_buffer_size = - device_config.get_l1_dram_io_backend_reserved_size() / num_dram_readers; - - input_idx = 0; - for (Node *operand : operands) - { - bool is_prologue = bool(op_model.parameter_buffers[input_idx]); - if (dram_io_queue_node(operand) and not is_prologue) - { - std::size_t input_buffer_bytes = op_model.input_buffers[input_idx].single_buffered_size_bytes(); - std::size_t multiplier = l1_per_input_dram_prefetch_buffer_size / input_buffer_bytes; - - if (multiplier > 0) - { - // If we can fit > 0 additional multiplier of the input buffer, - // then we try to fit some multiple of the input buffer - std::size_t input_buffer_tiles = op_model.input_buffers[input_idx].single_buffered_size_tiles(); - input_dram_io_buf_size_tiles[input_idx] = input_buffer_tiles * multiplier; - } - else - { - // If we can't fit any multiplier input buffers, then we just - // allocate as many tiles as we can into this region - std::size_t input_buffer_tiles = - l1_per_input_dram_prefetch_buffer_size / - balancer::tile_size_bytes(op_model.input_buffers[input_idx].data_format); - input_dram_io_buf_size_tiles[input_idx] = input_buffer_tiles; - } - } - ++input_idx; - } - - return input_dram_io_buf_size_tiles; -} - -// Create Buda queues, program, and graphs -BudaNetlist lower_to_buda_netlist( - Graph *graph, - 
std::string &graph_name, - placer::PlacerSolution &placer_solution, - std::shared_ptr balancer_solution, - const std::vector &chip_ids, - const DeviceConfig &device_config, - bool enable_forked_dram_inputs) -{ - BudaNetlist net; - const std::string &arch_string = device_config.arch_name; - net.graphs.push_back(BudaGraph(graph_name, arch_string, graph->get_microbatch())); - BudaGraph &buda_graph = net.graphs.back(); - std::unordered_map node_to_op; - std::unordered_set fused_ops_register; - - std::uint32_t epoch_count = balancer_solution->placer_solution.epoch_id_to_epoch_info.size(); - buda_graph.epoch_types.resize(epoch_count); - buda_graph.ops.resize(epoch_count); - for (auto &[epoch_id, epoch_info] : balancer_solution->placer_solution.epoch_id_to_epoch_info) - { - buda_graph.epoch_types[epoch_id] = epoch_info.epoch_type; - } - - // Returns a mapping of keys that are input edges that can reuse the DRAM reads of the mapped values - std::unordered_map forked_dram_inputs = - tt::passes::get_forked_dram_inputs(enable_forked_dram_inputs, graph, &placer_solution.name_to_op_placement, &balancer_solution->op_models); - - for (Node *node : graphlib::topological_sort(*graph)) - { - try - { - if (node->node_type() == NodeType::kInput or node->node_type() == NodeType::kOutput or - node->node_type() == NodeType::kQueue) - { - BudaQueue q = create_queue( - graph, - node->as(), - placer_solution.name_to_queue_placement.at(node->name()), - balancer_solution->block_shapes); - net.queues.push_back(q); - node_to_op.insert(std::make_pair(node->id(), q.name)); - } - else if (node->node_type() == NodeType::kBudaOp) - { - std::vector operands; - std::vector input_df; - for (Node *in_node : graph->data_operands(node)) - { - operands.push_back(node_to_op.at(in_node->id())); - input_df.push_back(graph->node_by_id(in_node->id())->output_df()); - } - - std::vector forked_dram_edges; - - for (auto operand : graph->operand_data_edges(node)) - { - if (forked_dram_inputs.find(operand) != forked_dram_inputs.end()) - { - forked_dram_edges.push_back(forked_dram_inputs[operand]); - } - } - - balancer::OpModel const &op_model = balancer_solution->op_models.at(node->name()); - balancer::BlockShape const &block_shape = balancer_solution->block_shapes.at(node->name()); - placer::OpPlacement placement = placer_solution.name_to_op_placement.at(node->name()); - BudaOpAttrs buda_attrs = node->as()->op_type().buda_attrs; - bool ignore_tms = false; - std::vector input_dram_io_buf_size_tiles = - get_input_dram_io_buf_size_tiles(graph, device_config, placer_solution, node, op_model); - - if (op_model.has_sparse_buffer()) - { - BudaOp sparse_buffer_op; - std::tie(sparse_buffer_op, operands, input_df, buda_attrs, placement) = create_sparse_buffer_op( - graph, - node->as(), - operands, - input_df, - buda_attrs, - placement, - placer_solution.name_to_queue_placement, - op_model, - block_shape, - arch_string, - input_dram_io_buf_size_tiles); - - // Sparse buffer handled the incoming tms - ignore_tms = true; - buda_graph.ops[placement.epoch_id()].push_back(sparse_buffer_op); - } - - BudaOp op = create_op( - graph, - node->as(), - operands, - input_df, - buda_attrs, - placement, - placer_solution.name_to_queue_placement, - op_model, - block_shape, - arch_string, - input_dram_io_buf_size_tiles, - forked_dram_edges, - ignore_tms); - - if (node->as()->is_fused_op()) - { - BudaFusedOp fused_op = create_fused_op( - node->as(), op_model); - bool reused = false; - for (const BudaFusedOp &prev : net.fused_ops) - { - if (prev.equivalent(fused_op)) - 
{ - // We can reuse an old op - op.attrs["fused_op_id"] = (int)prev.id; - reused = true; - break; - } - } - if (!reused) - { - net.fused_ops.push_back(fused_op); - TT_ASSERT(fused_ops_register.count(fused_op.id) == 0, "Duplicate fused op id found!"); - fused_ops_register.insert(fused_op.id); - } - - if (env_as("PYBUDA_EXP_APPROX")) // TODO: config - op.attrs["approximate_mode"] = true; - } - - buda_graph.ops[placement.epoch_id()].push_back(op); - - node_to_op.insert(std::make_pair(node->id(), op.name)); - } - else if (node->node_type() == NodeType::kBudaNaryTM) - { - std::vector operands; - for (Node *in_node : graph->data_operands(node)) operands.push_back(node_to_op.at(in_node->id())); - BudaNaryTM tm = create_nary_tm(node->as(), operands); - node_to_op.insert(std::make_pair(node->id(), tm)); - } - } - catch (std::out_of_range &e) - { - throw std::runtime_error( - "Op or queue missing in placement results for " + node->name() + ", something went wrong."); - } - } - - for (size_t epoch_id = 0; epoch_id < buda_graph.epoch_types.size(); ++epoch_id) - { - int chip_id = placer_solution.epoch_id_to_chip.at(epoch_id); - buda_graph.epoch_target_devices.push_back(BudaDevice(chip_id)); - buda_graph.epoch_to_temporal_epoch_id.push_back(placer_solution.temporal_epoch_id(epoch_id)); - buda_graph.epoch_to_subgraph_index.push_back(placer_solution.epoch_id_to_subgraph_index[epoch_id]); - } - - net.programs = create_programs(graph, placer_solution, buda_graph, arch_string); - net.chip_ids = chip_ids; - net.arch_string = arch_string; - - std::stringstream ss; - to_debug_info(ss, device_config); - net.debug_info = Comment(ss); - - return net; -} - -BudaNetlist merge_netlists(std::vector subgraphs) -{ - BudaNetlist net; - net.arch_string = subgraphs[0].arch_string; - net.chip_ids = subgraphs[0].chip_ids; - for (auto subgraph : subgraphs) - { - for (auto fused_op : subgraph.fused_ops) - { - net.fused_ops.push_back(fused_op); - } - for (auto program : subgraph.programs) - { - net.programs.push_back(program); - } - for (auto queue : subgraph.queues) - { - net.queues.push_back(queue); - } - for (auto graph : subgraph.graphs) - { - net.graphs.push_back(graph); - } - } - return net; -} -} // namespace tt diff --git a/pybuda/csrc/lower_to_buda/netlist.hpp b/pybuda/csrc/lower_to_buda/netlist.hpp deleted file mode 100644 index 1f69f2d5d..000000000 --- a/pybuda/csrc/lower_to_buda/netlist.hpp +++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include - -#include "balancer/balancer.hpp" -#include "lower_to_buda/comment.hpp" -#include "lower_to_buda/fused_op.hpp" -#include "lower_to_buda/graph.hpp" -#include "lower_to_buda/program.hpp" -#include "lower_to_buda/queue.hpp" -#include "placer/placer.hpp" - -namespace tt { - -struct BudaNetlistConfig { -}; - -struct BudaNetlist { - Comment comments; - Comment debug_info; - std::vector programs; - std::vector queues; - std::vector graphs; - std::vector fused_ops; - - std::vector chip_ids; - std::string arch_string; - - std::string dump_to_yaml() const; - inline void append_comment(std::string const &comment) - { - if (comments) - comments.str += "\n"; - comments.str += comment; - } -}; - -BudaNetlist merge_netlists(std::vector subgraphs); -// Create Buda queues, program, and graphs -BudaNetlist lower_to_buda_netlist( - graphlib::Graph *graph, - std::string &graph_name, - placer::PlacerSolution &placer_solution, - std::shared_ptr balancer_solution, - const std::vector 
&chip_ids, - const DeviceConfig &device_config, - bool disable_forked_dram_inputs); - -BudaFusedOp create_fused_op(graphlib::BudaOpNode *op, const balancer::OpModel &op_model); - -} // namespace tt diff --git a/pybuda/csrc/lower_to_buda/op.cpp b/pybuda/csrc/lower_to_buda/op.cpp deleted file mode 100644 index 8a92f51ba..000000000 --- a/pybuda/csrc/lower_to_buda/op.cpp +++ /dev/null @@ -1,352 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 - -#include - -#include "lower_to_buda/op.hpp" -#include "lower_to_buda/common.hpp" - -namespace tt { - -static bool op_uses_grid_location(BudaOp const &op) { - return op.type != "ethernet_datacopy"; -} - -std::ostream &operator<<(std::ostream &os, BudaOp const &op) { - os << op.name << ": {"; - - os << "type: " << op.type << ", "; - if (op_uses_grid_location(op)) - { - os << op.grid << ", "; - } else { - os << "grid_size: [" << op.grid.grid_size_r << ", " << op.grid.grid_size_c << "],"; - } - os << "inputs: "<< op.inputs; - - os < 0) - { - os << ", overlay_size: " << op.overlay_size; - } - - os << "," - << std::endl << " " - << op.blocks; - - // Next, write out tile dim - os << ", tile_dim: " << op.tile_dim; - os << ", buf_size_mb: " << op.buf_size_mb; - - if (op.input_buf_min_size_tiles.size() > 0) - { - bool first = true; - os << ", input_buf_min_size_tiles: ["; - for (std::uint32_t tiles : op.input_buf_min_size_tiles) - { - if (!first) os << ", "; - first = false; - os << tiles; - } - os << "]"; - } - - if (not op.input_dram_io_buf_size_tiles.empty()) - { - bool first = true; - os << ", input_dram_io_buf_size_tiles: ["; - for (std::size_t size_tiles : op.input_dram_io_buf_size_tiles) - { - if (!first) - os << ", "; - first = false; - os << size_tiles; - } - os << "]"; - } - - switch (op.ublock_order) { - case graphlib::UBlockOrder::R: os << ", ublock_order: r"; break; - case graphlib::UBlockOrder::C: os << ", ublock_order: c"; break; - } - - os << ", " - << "in_df: " << op.input_data_formats << ", " - << "out_df: " << op.output_data_format << ", " - << "intermed_df: " << op.intermediate_data_format << ", " - << "acc_df: " << op.accumulate_data_format << ", " - << "math_fidelity: " << op.fidelity; - - if (op.tms.size() > 0) { - - os << "," - << std::endl << " "; - - // Next, we write out the TMs - bool first = true; - for (auto t : op.tms) { - - if (!first) os << ", "; - first = false; - - std::vector unpaddings; - - for (auto it = t.second.begin(); it != t.second.end(); ++it) { - - if ((*it).op == "buda_unpad") { - unpaddings.push_back(*it); - continue; - } - - } - - // First, we write out the padding unpads - if (unpaddings.size() == 1) { - - // Create unpadding pad TM in the netlist - os << "input_" << t.first << "_unpad: " - // Write out number of tiles for R dimension - << "{rt: " << unpaddings[0].buda_attrs["rt"] - // Write out number of tiles for C dimension - << ", ct: " << unpaddings[0].buda_attrs["ct"] << "}"; - - if (t.second.size() - unpaddings.size() > 0) - os << ", "; - - } - - if (t.second.size() - unpaddings.size() > 0) { - - os << "input_" << t.first << "_tms: ["; - - bool first_tm = true; - for (auto &tm : t.second) { - - // Unpadding operations/atrributes are in rank of TMs, - // they shouldn't be listed inside TMs' attributes - if (tm.op == "buda_unpad") - continue; - - // If the operation was pad or unpad, this flag should be reset after that check - // otherwise we can have hanging comma in the netlist, in the list of tms - if (!first_tm) os << ", "; - first_tm = false; - - 
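// ---------------------------------------------------------------------------
// Editor's illustrative note (not part of the original source): the loop
// around this point serializes per-input TMs into the netlist op entry. For a
// hypothetical op whose input 1 carries a z-broadcast of 4 followed by a
// transpose, the branches below would emit roughly
//
//     input_1_tms: [broadcast: {z: 4}, transpose]
//
// while a lone buda_unpad TM on input 0 is pulled out above and printed as
//
//     input_0_unpad: {rt: 2, ct: 3}
//
// (the operand index and tile counts here are invented for the example).
// ---------------------------------------------------------------------------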
// Unpadding operations/atrributes are in rank of TMs, - // they shouldn't be listed inside TMs' attributes - if (tm.op == "buda_pad") { - os << "pad: [" << tm.buda_attrs["rt"] << ", " << tm.buda_attrs["ct"] << ", " << tm.buda_attrs["pad_value"] << "]"; - continue; - } - - if (tm.op == "broadcast") { - // User-friendly dims - os << "broadcast: {"; - assert(tm.attr.size() <= 3); - switch(std::get(tm.attr[0])) { - case 0: throw std::runtime_error("Broadcast of W not supported"); - case 1: os << "z"; - os << ": " << std::get(tm.attr[1]) << "}"; - break; - case 2: os << "r"; - os << ": " << std::get(tm.attr[1]) << "}"; - break; - case 3: os << "c"; - os << ": " << std::get(tm.attr[1]) << "}"; - break; - } - continue; - } - - if (tm.op == "transpose") { - os << "transpose"; - continue; - } - - if (tm.op == "select") { - // User-friendly dims - os << "select: {"; - assert(tm.attr.size() == 4); - os << "range: [" << tm.buda_attrs["index"] << ", " << tm.buda_attrs["length"] - << "], stride: " << tm.buda_attrs["stride"] << "}"; - continue; - } - - if (tm.op == "gather") { - // User-friendly dims - os << "gather: {"; - assert(tm.attr.size() == 5); - os << "range: [" << tm.buda_attrs["index"] << ", " << tm.buda_attrs["length"] - << "], stride: " << tm.buda_attrs["stride"] << ", size: " << tm.buda_attrs["size"] << "}"; - continue; - } - - if (tm.op == "hslice" or tm.op == "vslice" or tm.op == "hstack" or tm.op == "vstack") - { - TT_ASSERT(tm.attr.size() == 1); - os << tm.op << ": " << std::get(tm.attr[0]); - continue; - } - - os << tm.op; - if (tm.attr.size() > 0) { - os << ": {"; - bool first_param = true; - for (graphlib::OpType::Attr param : tm.attr) { - if (!first_param) os << ", "; - first_param = false; - // os << param; - os << std::get(param); - } - os << "}"; - } - } - - os << "]"; - - } - - } - - } - - if (!op.attrs.empty()) { - os << "," << std::endl << " attributes: {"; - bool first = true; - for (auto [key, value] : op.attrs) { - if (!first) { - os << ", "; - } - first = false; - os << key << ": " << value; - } - os << "}"; - } - - os << "}"; - - - return os; - -} - -std::ostream &operator<<(std::ostream &os, std::vector> const &forked_dram_inputs) -{ - if (!forked_dram_inputs.empty()) - { - bool first = true; - os << ", forked_dram_inputs: ["; - for (auto &forked_input : forked_dram_inputs) - { - if (!first) - os << ", "; - first = false; - os << forked_input.first << ": " << forked_input.second; - } - os << "]"; - } - return os; -} - -std::ostream &operator<<(std::ostream &os, BudaNaryTM const &tm) -{ - os << tm.type << ": " << tm.inputs; - return os; -} - -std::ostream &operator<<(std::ostream &os, BudaOperand const &operand) -{ - if (const BudaName *n = std::get_if(&operand)) - { - os << *n; - } - else if (const BudaNaryTM *c = std::get_if(&operand)) - { - os << *c; - } - else - { - TT_ASSERT(false, "Unhandled variant type for BudaOperand"); - } - return os; -} - -std::ostream &operator<<(std::ostream &os, std::vector const &operands) -{ - bool first = true; - os << "["; - for (const BudaOperand &operand : operands) - { - if (!first) - os << ", "; - os << operand; - first = false; - } - return os << "]"; -} - -std::ostream &operator<<(std::ostream &os, BudaOpGrid const &g) { - os << "grid_loc: [" << g.grid_loc_r << ", " << g.grid_loc_c << "], "; - os << "grid_size: [" << g.grid_size_r << ", " << g.grid_size_c << "]"; - return os; - -} - -std::ostream &operator<<(std::ostream &os, std::vector const &dfs) -{ - if (dfs.size() == 0) { - return os << "[]"; - } - - os << "[" << dfs[0]; - 
for (std::size_t i=1; i < dfs.size(); i++) - os << ", " << dfs[i]; - return os << "]"; -} - -std::ostream &operator<<(std::ostream &os, TileDim const tile_dim) { - - switch(tile_dim) - { - case TileDim::Dim32x32: - os << "[32, 32]"; - return os; - case TileDim::Dim16x32: - os << "[16, 32]"; - return os; - case TileDim::Dim32x16: - os << "[32, 16]"; - return os; - case TileDim::Dim8x32: - os << "[8, 32]"; - return os; - case TileDim::Dim4x32: - os << "[4, 32]"; - return os; - case TileDim::Dim2x32: - os << "[2, 32]"; - return os; - case TileDim::Dim1x32: - os << "[1, 32]"; - return os; - default: - TT_ASSERT(false, "Invalid tile dim"); - } - return os; -} - -} - - - diff --git a/pybuda/csrc/lower_to_buda/op.hpp b/pybuda/csrc/lower_to_buda/op.hpp deleted file mode 100644 index 162f1dfb0..000000000 --- a/pybuda/csrc/lower_to_buda/op.hpp +++ /dev/null @@ -1,71 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include -#include - -#include "graph_lib/node_types.hpp" -#include "lower_to_buda/comment.hpp" -#include "lower_to_buda/common.hpp" -#include "lower_to_buda/queue.hpp" - -namespace tt { - -struct BudaOpGrid { - std::uint32_t grid_loc_r, grid_loc_c; - std::uint32_t grid_size_r, grid_size_c; -}; - -struct BudaNaryTM; - -using BudaOperand = std::variant; - -struct BudaOp { - std::string name; - std::string type; - BudaOpGrid grid; - std::vector inputs; - std::vector> forked_dram_inputs; - std::vector input_data_formats; - std::vector input_buf_min_size_tiles; - std::vector input_dram_io_buf_size_tiles; - DataFormat output_data_format; - DataFormat intermediate_data_format; - DataFormat accumulate_data_format; - MathFidelity fidelity; - bool untilize_output; - BudaBlocks blocks; - graphlib::UBlockOrder ublock_order = graphlib::UBlockOrder::R; - int buf_size_mb; - int overlay_size = 0; // Op-level override for overlay blob size in Bytes, default is 65536 (64 kB) - std::unordered_map> tms; // per operand - BudaOpAttrs attrs; - bool gradient_op; - bool grid_transpose; - Comment debug_info; - TileDim tile_dim; - -}; - -struct BudaNaryTM -{ - std::string name; - std::string type; - std::vector inputs; -}; - -std::ostream &operator<<(std::ostream &os, BudaOp const &op); -std::ostream &operator<<(std::ostream &os, BudaNaryTM const &tm); -std::ostream &operator<<(std::ostream &os, BudaOperand const &operand); -std::ostream &operator<<(std::ostream &os, std::vector const &operands); -std::ostream &operator<<(std::ostream &os, BudaOpGrid const &g); -std::ostream &operator<<(std::ostream &os, std::vector const &dfs); -std::ostream &operator<<(std::ostream &os, TileDim const tile_dim); -std::ostream &operator<<(std::ostream &os, std::vector> const &forked_dram_inputs); -} // namespace tt - - - diff --git a/pybuda/csrc/lower_to_buda/program.cpp b/pybuda/csrc/lower_to_buda/program.cpp deleted file mode 100644 index b4bb38a83..000000000 --- a/pybuda/csrc/lower_to_buda/program.cpp +++ /dev/null @@ -1,417 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include - -#include "lower_to_buda/program.hpp" -#include "utils/assert.hpp" - -namespace tt::program { - -ParameterP Program::add_parameter(std::string name) -{ - ParameterP var = std::make_shared(name); - parameters_.insert(std::make_pair(name, var)); - return var; -} - - -VariableP Program::add_variable(std::string name, bool static_var, int initial_value) -{ - VariableP var = std::make_shared(name, static_var, 
initial_value); - return add_variable(var); -} - -VariableP Program::add_variable(VariableP var) -{ - // add already created variable - if (var->is_static()) - static_variables_.insert(std::make_pair(var->name(), var)); - else - variables_.insert(std::make_pair(var->name(), var)); - return var; -} - -void Program::set_variable_value(VariableP var, std::variant value) -{ - lines_.push_back(std::make_shared( - VarInstruction::SET, var, - std::vector>{value}, - std::unordered_map{})); -} - -void Program::instruction_incwrap(VariableP var, VariableP increment, int wrap) -{ - lines_.push_back(std::make_shared( - VarInstruction::INCWRAP, var, - std::vector>{increment, wrap}, - std::unordered_map{})); -} - -void Program::instruction_add(VariableP var, VariableP increment) -{ - lines_.push_back(std::make_shared( - VarInstruction::ADD, var, - std::vector>{var, increment}, - std::unordered_map{})); -} - -void Program::instruction_inc(VariableP var, VariableP increment) -{ - lines_.push_back(std::make_shared( - VarInstruction::INC, var, - std::vector>{var, increment}, - std::unordered_map{})); -} - -VariableP Program::get_var(const std::string &name) const -{ - try { - return variables_.at(name); - } catch (std::out_of_range &e) { - throw std::runtime_error("Variable " + name + " doesn't exist."); - } -} - -Program Program::loop_template( - std::string program_name, - std::vector queue_variables, - std::uint64_t microbatch, - bool has_zero_grad, - bool is_optimizer_loop, - bool has_cache_buffers, - std::function gen_execute) -{ - if (is_optimizer_loop) - return Program::opt_template(program_name, queue_variables, microbatch, gen_execute); - - // - // Generate a local and global rd ptr for each queue - Program p(program_name); - - // Create constant variables to hold 1, and microbatch - VariableP c_zero = p.add_variable("c_zero", false, 0); - VariableP c_one = p.add_variable("c_one", false, 1); - VariableP c_microbatch_size = p.add_variable("c_microbatch_size", false, microbatch); - - - if (has_zero_grad) { - ParameterP p_zero_grad = p.add_parameter("p_zero_grad"); - VariableP v_zero_grad = p.add_variable("v_zero_grad", false, 0); - p.lines_.push_back(std::make_shared(VarInstruction::Instruction::SET, v_zero_grad, - std::vector>{p_zero_grad})); - } - - for (VariableP var : queue_variables) { - p.add_variable(var); - if (var->is_cross_program_shadow()) { - VariableP shadow = var->get_shadow_global_read_pointer(); - p.add_variable(shadow); - p.lines_.push_back(std::make_shared(VarInstruction::Instruction::SET, var, - std::vector>{shadow})); - } - } - - VariableP v_cache_write_index; - ParameterP p_inner_increment; - VariableP v_inner_increment; - ParameterP p_outer_increment; - VariableP v_outer_increment; - if (has_cache_buffers) { - ParameterP p_cache_write_index = p.add_parameter("p_cache_write_index"); - ParameterP p_inner_loop_count = p.add_parameter("p_inner_loop_count"); - p_inner_increment = p.add_parameter("p_inner_increment"); - ParameterP p_outer_loop_count = p.add_parameter("p_outer_loop_count"); - p_outer_increment = p.add_parameter("p_outer_increment"); - v_cache_write_index = p.add_variable("v_cache_write_index", false, 0); - p.lines_.push_back(std::make_shared(VarInstruction::Instruction::SET, v_cache_write_index, - std::vector>{p_cache_write_index})); - - p.lines_.push_back(std::make_shared(p_outer_loop_count->to_string())); - p.lines_.push_back(std::make_shared(p_inner_loop_count->to_string())); - } - else { - ParameterP p_loop_count = p.add_parameter("p_loop_count"); - 
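// ---------------------------------------------------------------------------
// Editor's illustrative note (not part of the original source): a program
// assembled by this loop_template() is serialized by Program::to_yaml()
// further down. For a hypothetical forward program with a single queue
// pointer variable, the output looks roughly like
//
//     run_fwd_0:
//       - param: [$p_loop_count]
//       - var: {$c_zero: 0, $c_one: 1, $c_microbatch_size: 128}
//       - loop: $p_loop_count
//       -   execute: {graph_name: fwd_0_temporal_epoch_0, queue_settings: {...}}
//       -   varinst: [$v_q0_rd_ptr_local, incwrap, $c_microbatch_size, 256]
//       - endloop
//
// (program, graph, and variable names plus the microbatch/wrap values are
// invented for the example; the exact formatting comes from the to_string()
// methods defined below.)
// ---------------------------------------------------------------------------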
p.lines_.push_back(std::make_shared(p_loop_count->to_string())); - } - - for (VariableP var : queue_variables) { - p.add_variable(var); - if (var->is_cross_epoch_shadow()) { - VariableP shadow = var->get_shadow_global_read_pointer(); - p.add_variable(shadow); - p.lines_.push_back(std::make_shared(VarInstruction::Instruction::SET, var, - std::vector>{shadow})); - } - } - - gen_execute(p); - if (has_cache_buffers) { - p.lines_.push_back(std::make_shared(VarInstruction::Instruction::INC, v_cache_write_index, - std::vector>{p_inner_increment})); - p.lines_.push_back(std::make_shared()); - p.lines_.push_back(std::make_shared(VarInstruction::Instruction::INC, v_cache_write_index, - std::vector>{p_outer_increment})); - p.lines_.push_back(std::make_shared()); - } - else { - p.lines_.push_back(std::make_shared()); - } - - return p; - -} - -// Generate a non-looping program, generally used for the optimizer. The provided -// function should generate the 'execute' instructions. -Program Program::opt_template( - std::string program_name, - std::vector queue_variables, - std::uint64_t microbatch, - std::function gen_execute) -{ - // Generate a local and global rd ptr for each queue - Program p(program_name); - - // for (VariableP var : queue_variables) - // p.variables_.insert(std::make_pair(var->name(), var)); - - for (VariableP var : queue_variables) { - p.add_variable(var); - if (var->needs_shadow_global_read_pointer()) { - VariableP shadow = var->get_shadow_global_read_pointer(); - p.add_variable(shadow); - p.lines_.push_back(std::make_shared(VarInstruction::Instruction::SET, var, - std::vector>{shadow})); - } - } - - // Create constant variables to hold 1, and microbatch - VariableP c_zero = p.add_variable("c_zero", false, 0); - VariableP c_one = p.add_variable("c_one", false, 1); - VariableP c_microbatch_size = p.add_variable("c_microbatch_size", false, microbatch); - - gen_execute(p); - - return p; -} - -std::string VarDeclaration::to_string(const std::string &indent) const -{ - const std::string name = static_vars_ ? 
"staticvar" : "var"; - if (variables_.size() == 0) - return indent + name + ": []"; - - std::string ret = ""; - ret += variables_[0]->to_string() + ": " + std::to_string(variables_[0]->initial_value_); - for (std::size_t i=1; i < variables_.size(); i++) - ret += ", " + variables_[i]->to_string() + ": " + std::to_string(variables_[i]->initial_value_); - - return indent + name + ": {" + ret + "}"; -} - -std::string ParamDeclaration::to_string(const std::string &indent) const -{ - std::string ret = ""; - ret += params_[0]->to_string(); - for (std::size_t i=1; i < params_.size(); i++) - ret += ", " + params_[i]->to_string(); - - return indent + "param" + ": [" + ret + "]"; -} - -std::string VarInstruction::opcode_string() const -{ - switch (opcode_) { - case VarInstruction::SET: return "set"; - case VarInstruction::ADD: return "add"; - case VarInstruction::INCWRAP: return "incwrap"; - case VarInstruction::INC: return "inc"; - } - TT_ASSERT(false, "Unknown upcode"); - return ""; -} - -std::string VarInstruction::to_string(const std::string &indent) const -{ - std::string ret = indent + "varinst: [" + output_->to_string() + ", " + opcode_string(); - - for (std::variant input : inputs_) { - if (std::holds_alternative(input)) { - ret += ", " + std::get(input)->to_string(); - } else if (std::holds_alternative(input)) { - ret += ", " + std::get(input)->to_string(); - } else { - ret += ", " + std::to_string(std::get(input)); - } - } - - - if (!attributes_.empty()) { - for (const auto &[key, value] : attributes_) - { - if (key == "value") { - ret += ", " + value; - } else { - TT_ASSERT(false, "Unknown attribute", key, value); - } - } - } - ret += "]"; - return ret; -} - -std::string QueueSettings::to_string() const -{ - std::string ret = name_ + ": {"; - std::vector attrs; - attrs.push_back("prologue: " + std::string(prologue ? "true" : "false")); - attrs.push_back("epilogue: " + std::string(epilogue ? "true" : "false")); - attrs.push_back("zero: " + zero); - - if (global_rdptr_autoinc) { - attrs.push_back("global_rdptr_autoinc: " + std::to_string((int)global_rdptr_autoinc)); - } - - if (!rd_ptr_autoinc) { - attrs.push_back("rd_ptr_autoinc: " + std::to_string((int)rd_ptr_autoinc)); - } - - if (read_only) { - attrs.push_back("read_only: true"); - } - if (this->queue_type_ == QueueSettings::Type::Queue) { - const QueueAttributes& queue_attrs = this->queue_attributes(); - - attrs.push_back("rd_ptr_local: " + (queue_attrs.read_ptr_local_ ? queue_attrs.read_ptr_local_->to_string() : std::string("$c_zero"))); - attrs.push_back("rd_ptr_global: " + (queue_attrs.read_ptr_global_ ? queue_attrs.read_ptr_global_->to_string() : std::string("$c_zero"))); - } else if (this->queue_type_ == QueueSettings::Type::RAM) { - const RamAttributes& ram_attrs = this->ram_attributes(); - - attrs.push_back("rd_ptr_global: " + (ram_attrs.read_ptr_ ? ram_attrs.read_ptr_->to_string() : std::string("$c_zero"))); - attrs.push_back("wr_ptr_global: " + (ram_attrs.write_ptr_ ? 
ram_attrs.write_ptr_->to_string() : std::string("$c_zero"))); - } - - if (global_wrptr_autoinc) - { - attrs.push_back("global_wrptr_autoinc: " + std::to_string(global_wrptr_autoinc)); - } - - if (attrs.size() == 0) { - return ret + "}"; - } - - ret += attrs[0]; - for (std::size_t i=1; i < attrs.size(); i++) - ret += ", " + attrs[i]; - ret += "}"; - return ret; -} - -int QueueSettings::epoch_allocate() const { - if (this->queue_type_ == Type::RAM) - return -1; - - return queue_attributes().epoch_allocate; -} - -int QueueSettings::epoch_deallocate() const { - if (this->queue_type_ == Type::RAM) - return -1; - - return queue_attributes().epoch_deallocate; -} - - -const RamAttributes& QueueSettings::ram_attributes() const { - TT_ASSERT(this->queue_type_ == Type::RAM); - return std::get(this->attributes_); -} - -const QueueAttributes& QueueSettings::queue_attributes() const { - TT_ASSERT(this->queue_type_ == Type::Queue); - return std::get(this->attributes_); -} - -std::string Execute::to_string(const std::string &indent) const -{ - std::string ret = indent + "execute: {graph_name: " + graph_name_; - if (queue_settings_.size() > 0) { - ret += ", queue_settings: {\n"; - - bool first = true; - for (const QueueSettings &q : queue_settings_) { - if (!first) ret += ",\n"; - first = false; - ret += indent + " " + q.to_string(); - } - ret += "} }"; - } - else { - ret += "}"; - } - - return ret; -} - -std::string qs_vec_to_str(const std::vector &qs) -{ - std::string ret = ""; - bool first = true; - for (const QueueSettings &q : qs) { - if (!first) ret += ", "; - first = false; - ret += q.name(); - } - return ret; -} - -std::string AllocateQueue::to_string(const std::string &indent) const { - return indent + "allocate_queue: [" + qs_vec_to_str(queue_settings_) + "]"; -} - -std::string DeallocateQueue::to_string(const std::string &indent) const { - return indent + "deallocate_queue: [" + qs_vec_to_str(queue_settings_) + "]"; -} - - -std::string Program::to_yaml() const -{ - std::stringstream ss; - - ss << name_ << ":" << std::endl; - - std::vector params; - for (const auto &[name, p] : parameters_) params.push_back(p); - if (params.size() > 0) - ss << " - " << ParamDeclaration{params}.to_string("") << std::endl; - - std::vector vars; - for (const auto &[name, var] : variables_) vars.push_back(var); - if (vars.size() > 0) - ss << " - " << VarDeclaration{vars, false}.to_string("") << std::endl; - - std::vector staticvars; - for (const auto &[name, var] : static_variables_) staticvars.push_back(var); - if (staticvars.size() > 0) - ss << " - " << VarDeclaration{staticvars, true}.to_string("") << std::endl; - - int indent = 0; - for (const LineP &line : lines_) - { - indent += line->indent_pre_change(); - std::string str_indent(indent*2, ' '); - ss << " - " << line->to_string(str_indent) << std::endl; - indent += line->indent_post_change(); - } - ss << std::endl; - - return ss.str(); -} - -std::ostream &operator<<(std::ostream &os, Program const &p) -{ - return os << p.to_yaml(); -} - -} // namespace tt::program diff --git a/pybuda/csrc/lower_to_buda/program.hpp b/pybuda/csrc/lower_to_buda/program.hpp deleted file mode 100644 index 2c6dcaaaf..000000000 --- a/pybuda/csrc/lower_to_buda/program.hpp +++ /dev/null @@ -1,252 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once -#include -#include -#include -#include -#include -#include - -#include "utils/assert.hpp" - -namespace tt { - -namespace program { - -struct Variable { - - enum ShadowType { 
- CROSS_EPOCH, - CROSS_PROGRAM, - NONE - }; - - std::string name_; - int initial_value_; - bool is_static_; - ShadowType shadow_type_ = ShadowType::NONE; - std::shared_ptr shadow = nullptr; - - Variable(std::string name, bool is_static, int initial_value = 0, ShadowType shadow_type = NONE) - : name_(name), initial_value_(initial_value), is_static_(is_static), shadow_type_(shadow_type) - { - if (needs_shadow_global_read_pointer()) { - shadow = std::make_shared(name_ + "_shadow", is_static_, 0, ShadowType::NONE); - } - } - bool is_static() const { return is_static_; } - std::string name() const { return name_; } - std::string to_string() const { return "$" + name_; } - - // temporary workaround for backend - bool needs_shadow_global_read_pointer() const { return shadow_type_ != ShadowType::NONE; } - bool is_cross_epoch_shadow() const { return shadow_type_ == ShadowType::CROSS_EPOCH; } - bool is_cross_program_shadow() const { return shadow_type_ == ShadowType::CROSS_PROGRAM; } - std::shared_ptr get_shadow_global_read_pointer() const { - TT_ASSERT(needs_shadow_global_read_pointer()); - return shadow; - } -}; - -struct Parameter { - std::string name_; - - Parameter(std::string name) : name_(name) {} - std::string name() const { return name_; } - std::string to_string() const { return "$" + name_; } -}; - -using VariableP = std::shared_ptr; -using VariableMap = std::unordered_map; -using ParameterP = std::shared_ptr; -using ParameterMap = std::unordered_map; - -struct Line { - virtual ~Line() {} - virtual std::string to_string(const std::string &indent) const = 0; - virtual int indent_post_change() const { return 0; } - virtual int indent_pre_change() const { return 0; } -}; - -using LineP = std::shared_ptr; - -struct VarDeclaration : public Line { - std::vector variables_; - bool static_vars_; - VarDeclaration(std::vector &variables, bool static_vars) : variables_(variables), static_vars_(static_vars) {} - virtual std::string to_string(const std::string &indent) const override; -}; - -struct ParamDeclaration : public Line { - std::vector params_; - ParamDeclaration(std::vector ¶ms) : params_(params) {} - virtual std::string to_string(const std::string &indent) const override; -}; - -struct VarInstruction : public Line { - - enum Instruction { - SET, - ADD, - INCWRAP, - INC - }; - - Instruction opcode_; - std::vector> inputs_; - VariableP output_; - std::unordered_map attributes_; - - VarInstruction(Instruction opcode, VariableP output, std::vector> inputs = {}, - std::unordered_map attributes = {}) : - opcode_(opcode), inputs_(inputs), output_(output), attributes_(attributes) {} - virtual std::string to_string(const std::string &indent) const override; - std::string opcode_string() const; -}; - -struct QueueAttributes { - VariableP read_ptr_global_; - VariableP read_ptr_local_; - - // For dynamic queues, this will allocate/deallocate on specified epoch - int epoch_allocate = -1; - int epoch_deallocate = -1; -}; - -struct RamAttributes { - VariableP read_ptr_; - VariableP write_ptr_; -}; - -struct QueueSettings { -public: - std::string name_; - bool prologue = false; - bool epilogue = false; - bool global_rdptr_autoinc = false; - bool rd_ptr_autoinc = true; - int global_wrptr_autoinc = 0; - std::string zero = "False"; - bool read_only = false; - -private: - enum Type { - Queue, - RAM, - }; - - // Usage: access through helpers. 
Don't expose usage of std::variant outside - Type queue_type_; - std::variant attributes_; - -public: - QueueSettings(std::string name, QueueAttributes queue_attributes) - : name_(name), queue_type_(Type::Queue), attributes_(queue_attributes) {} - QueueSettings(std::string name, RamAttributes ram_attributes) - : name_(name), queue_type_(Type::RAM), attributes_(ram_attributes) {} - - const RamAttributes& ram_attributes() const; - const QueueAttributes& queue_attributes() const; - - int epoch_allocate() const; - int epoch_deallocate() const; - - std::string to_string() const; - std::string name() const { return name_; } -}; - -struct Execute : public Line { - std::string graph_name_; - std::vector queue_settings_; - - Execute(std::string graph_name, std::vector queue_setting) : - graph_name_(graph_name), queue_settings_(queue_setting) {} - - virtual std::string to_string(const std::string &indent) const override; -}; - -struct AllocateQueue : public Line { - std::vector queue_settings_; - - AllocateQueue(std::vector queue_setting) : queue_settings_(queue_setting) {} - - virtual std::string to_string(const std::string &indent) const override; -}; - -struct DeallocateQueue : public Line { - std::vector queue_settings_; - - DeallocateQueue(std::vector queue_setting) : queue_settings_(queue_setting) {} - - virtual std::string to_string(const std::string &indent) const override; -}; - - -struct Loop : public Line { - - std::string variable_; - - Loop(std::string v) : variable_(v) {} - virtual int indent_post_change() const override { return 1; } - virtual std::string to_string(const std::string &indent) const override { return indent + "loop: " + variable_; } -}; - -struct EndLoop : public Line { - - virtual int indent_pre_change() const override { return -1; } - virtual std::string to_string(const std::string &indent) const override { return indent + "endloop"; } -}; - - -class Program { - - std::string name_; - std::vector lines_; - VariableMap variables_; - VariableMap static_variables_; - ParameterMap parameters_; - - VariableP add_variable(std::string name, bool static_var, int initial_value = 0); - VariableP add_variable(VariableP var); // add already created variable - ParameterP add_parameter(std::string name); - -public: - Program(std::string name) : name_(name) {} - std::string to_yaml() const; - - // Generate a standard looping program, given a number of looping variables for inputs. The provided - // function should generate the 'execute' instructions in the core of the loop - static Program loop_template( - std::string program_name, - std::vector queue_variables, - std::uint64_t microbatch, - bool has_zero_grad, - bool is_optimizer_loop, - bool has_cache_buffers, - std::function gen_execute); - - // Generate a non-looping program, generally used for the optimizer. The provided - // function should generate the 'execute' instructions. 
- static Program opt_template( - std::string program_name, - std::vector queue_variables, - std::uint64_t microbatch, - std::function gen_execute); - - // Get vars, add lines - VariableP get_var(const std::string &name) const; - void add(LineP line) { lines_.push_back(line); } - - // Variable instructions - void set_variable_value(VariableP var, std::variant value); - void instruction_incwrap(VariableP var, VariableP increment, int wrap); - void instruction_inc(VariableP var, VariableP increment); - void instruction_add(VariableP var, VariableP increment); -}; - -std::ostream &operator<<(std::ostream &os, Program const &p); - -} // namespace program -} // namespace tt diff --git a/pybuda/csrc/lower_to_buda/queue.cpp b/pybuda/csrc/lower_to_buda/queue.cpp deleted file mode 100644 index 079d13333..000000000 --- a/pybuda/csrc/lower_to_buda/queue.cpp +++ /dev/null @@ -1,88 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include - -#include "lower_to_buda/queue.hpp" -#include "lower_to_buda/op.hpp" - -namespace tt { - -std::string BudaQueue::as_string(int padded_name_length) const -{ - std::stringstream ss; - ss << name << ": "; - for (int i = name.length(); i < padded_name_length + 1; i++) ss << " "; - - ss << "{"; - ss << "input: " << input_name << ", "; - if (alias != "") ss << "alias: " << alias << ", "; - ss << "type: " << (memory_access == "RAM" ? "ram" : "queue") << ", "; - ss << "entries: " << entries << ", "; - ss << "grid_size: [" << dims.grid_r << ", " << dims.grid_c << "], "; - ss << blocks << ", "; - switch (ublock_order) - { - case graphlib::UBlockOrder::R: ss << "ublock_order: r, "; break; - case graphlib::UBlockOrder::C: ss << "ublock_order: c, "; break; - } - - ss << "tile_dim: " << tile_dim_ << ", "; - ss << "df: " << data_format << ", "; - - if (layout != BudaQueueLayout::Tilized) - ss << "layout: " << layout << ", "; - - ss << "target_device: " << target_device << ", "; - ss << "loc: " << loc; - - if (loc == BudaQueueLocation::DRAM) { - TT_ASSERT(dram_loc.size() > 0); - ss << ", dram: [" << dram_loc[0]; - for (std::size_t i=1; i < dram_loc.size(); i++) - ss << ", " << dram_loc[i]; - ss << "]"; - } else if (loc == BudaQueueLocation::HOST) { - TT_ASSERT(host_loc.size() > 0); - ss << ", host: [" << host_loc[0]; - for (std::size_t i = 1; i < host_loc.size(); i++) ss << ", " << host_loc[i]; - ss << "]"; - } - - ss << "}"; - - return ss.str(); -} - -std::ostream &operator<<(std::ostream &os, BudaQueue const &q) -{ - return os << q.as_string(); -} - -std::ostream &operator<<(std::ostream &os, BudaQueueDramLoc const &l) -{ - return os << "[" << l.dram_channel << ", 0x" << std::hex << l.dram_address << std::dec << "]"; -} - -std::ostream &operator<<(std::ostream &os, BudaQueueHostLoc const &l) -{ - return os << "[" << l.host_channel << ", 0x" << std::hex << l.host_address << std::dec << "]"; -} - -std::ostream &operator<<(std::ostream &os, BudaQueueLocation const &l) -{ - return os << ((l == BudaQueueLocation::DRAM) ? 
"dram" : "host"); -} - -std::ostream &operator<<(std::ostream &os, BudaQueueLayout const &l) -{ - switch (l) - { - case BudaQueueLayout::Tilized: os << "tilized"; break; - case BudaQueueLayout::Flat: os << "flat"; break; - } - - return os; -} - -} // namespace tt diff --git a/pybuda/csrc/lower_to_buda/queue.hpp b/pybuda/csrc/lower_to_buda/queue.hpp deleted file mode 100644 index 4292ff08c..000000000 --- a/pybuda/csrc/lower_to_buda/queue.hpp +++ /dev/null @@ -1,66 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include - -#include "graph_lib/node_types.hpp" -#include "lower_to_buda/common.hpp" -#include "lower_to_buda/device.hpp" -#include "lower_to_buda/op.hpp" - -namespace tt { - -struct BudaQueueDimensions { - int grid_r, grid_c; -}; - -enum BudaQueueLocation { DRAM, HOST }; - -struct BudaQueueDramLoc { - std::uint32_t dram_channel; - std::uint32_t dram_address; -}; - -struct BudaQueueHostLoc { - std::uint32_t host_channel; - std::uint32_t host_address; -}; - -struct BudaQueue { - - std::string name; - std::string input_name; - std::string type; - std::string memory_access; - std::string alias; - int entries; - int microbatch; - BudaQueueDimensions dims; - DataFormat data_format; - BudaDevice target_device; - BudaQueueLocation loc; - std::vector dram_loc; - std::vector host_loc; - BudaBlocks blocks; - graphlib::UBlockOrder ublock_order = graphlib::UBlockOrder::R; - BudaQueueLayout layout = BudaQueueLayout::Tilized; - TileDim tile_dim_; - - BudaQueue(const std::string &name, const std::string &type, const std::string& memory_access, int device, TileDim tile_dim) - : name(name), type(type), memory_access(memory_access), target_device(BudaDevice(device)), tile_dim_(tile_dim) {} - - std::string as_string(int padded_name_length = 0) const; - -}; - -std::ostream &operator<<(std::ostream &os, BudaQueue const &m); -std::ostream &operator<<(std::ostream &os, BudaQueueLocation const &l); -std::ostream &operator<<(std::ostream &os, BudaQueueDramLoc const &l); -std::ostream &operator<<(std::ostream &os, BudaQueueHostLoc const &l); -std::ostream &operator<<(std::ostream &os, BudaQueueLayout const &l); - -} // namespace tt - diff --git a/pybuda/csrc/module.mk b/pybuda/csrc/module.mk index e2ddfc916..4739f2d51 100644 --- a/pybuda/csrc/module.mk +++ b/pybuda/csrc/module.mk @@ -2,54 +2,42 @@ PYBUDA_CSRC_INCLUDES = \ -Ipybuda/csrc \ -Ithird_party/json \ - -Ithird_party/budabackend \ - -Ithird_party/budabackend/netlist \ -I/usr/include/$(PYTHON_VERSION) \ - -isystem $(PYTHON_ENV)/lib/$(PYTHON_VERSION)/site-packages/torch/include \ - -isystem $(PYTHON_ENV)/lib/$(PYTHON_VERSION)/site-packages/torch/include/torch/csrc/api/include - -PYBUDA_CSRC_WARNINGS ?= -Wall -Wextra -Wno-pragmas + -isystem $(PYTHON_ENV_ROOT)/lib/$(PYTHON_VERSION)/site-packages/torch/include \ + -isystem $(PYTHON_ENV_ROOT)/lib/$(PYTHON_VERSION)/site-packages/torch/include/torch/csrc/api/include \ + -I/opt/ttmlir-toolchain/include \ + -Ithird_party/tt-mlir/build/include \ + -Ithird_party/tt-mlir/runtime/include \ + -Ithird_party/tt-mlir/include + +PYBUDA_CSRC_WARNINGS ?= -Wall -Wextra -Wno-pragmas -Wno-unused-parameter PYBUDA_CSRC_CFLAGS ?= $(CFLAGS_NO_WARN) $(PYBUDA_CSRC_WARNINGS) -DUTILS_LOGGER_PYTHON_OSTREAM_REDIRECT=1 -BOOST_LIB_DIR = /usr/lib/x86_64-linux-gnu # use system installed boost -TORCH_LIB_DIR = $(PYTHON_ENV)/lib/$(PYTHON_VERSION)/site-packages/torch/lib +TORCH_LIB_DIR = $(PYTHON_ENV_ROOT)/lib/$(PYTHON_VERSION)/site-packages/torch/lib 
PYBUDA_CSRC_LIB = $(LIBDIR)/libpybuda_csrc.so +TTMLIR_TOOLCHAIN_DIR = /opt/ttmlir-toolchain +RUNTIME_LIB_DIR = third_party/tt-mlir/third_party/tt-metal/src/tt-metal-build/lib +MLIR_LIB_DIR = -L$(TTMLIR_TOOLCHAIN_DIR)/lib -Lthird_party/tt-mlir/build/lib -Lthird_party/tt-mlir/build/runtime/lib +MLIR_LIBS = -Wl,-rpath,$(TTMLIR_TOOLCHAIN_DIR)/lib -lLLVM -lMLIR +TT_MLIR_LIBS = -lMLIRTTDialect -lMLIRTTIRDialect -lMLIRTTNNDialect -lMLIRTTIRTransforms -lMLIRTTNNTransforms -lMLIRTTKernelDialect -lMLIRTTMetalDialect -lMLIRTTIRAnalysis +RUNTIME_LIBS = -lTTRuntime -lTTRuntimeTTNN -L$(RUNTIME_LIB_DIR) -Wl,-rpath,\$$ORIGIN/../../$(RUNTIME_LIB_DIR) -l:_ttnn.so -ltt_metal -ldevice -ltt_eager include pybuda/csrc/graph_lib/module.mk include pybuda/csrc/shared_utils/module.mk -include pybuda/csrc/scheduler/module.mk -include pybuda/csrc/placer/module.mk include pybuda/csrc/autograd/module.mk -include pybuda/csrc/balancer/module.mk include pybuda/csrc/reportify/module.mk include pybuda/csrc/backend_api/module.mk -include pybuda/csrc/pattern_matcher/module.mk -include pybuda/csrc/perf_model/module.mk include pybuda/csrc/tt_torch_device/module.mk +include pybuda/csrc/runtime/module.mk -ifndef BUDABACKEND_LIBDIR -$(error BUDABACKEND_LIBDIR not set) -endif - -PYBUDA_CSRC_LDFLAGS = -Wl,-z,origin -Wl,-rpath,\$$ORIGIN/../python_env/lib/$(PYTHON_VERSION)/site-packages/torch/lib -Wl,-rpath,\$$ORIGIN/../budabackend/build/lib -Wl,-rpath,\$$ORIGIN/../../$(BUDABACKEND_LIBDIR) -lstdc++fs -lboost_serialization -ltorch -ltorch_cpu -lc10 -ltorch_python +PYBUDA_CSRC_LDFLAGS = -Wl,-rpath,\$$ORIGIN/../python_env/lib/$(PYTHON_VERSION)/site-packages/torch/lib -ltorch -ltorch_cpu -lc10 -ltorch_python $(PYTHON_LDFLAGS) -l$(PYTHON_VERSION) $(MLIR_LIB_DIR) $(MLIR_LIBS) $(TT_MLIR_LIBS) $(RUNTIME_LIBS) -lm -lz -lcurses -lxml2 -lflatbuffers PYBUDA_CSRC_SRCS = \ pybuda/csrc/pybuda_bindings.cpp \ pybuda/csrc/buda_passes.cpp \ $(wildcard pybuda/csrc/passes/*.cpp) \ - pybuda/csrc/lower_to_buda/netlist.cpp \ - pybuda/csrc/lower_to_buda/queue.cpp \ - pybuda/csrc/lower_to_buda/graph.cpp \ - pybuda/csrc/lower_to_buda/op.cpp \ - pybuda/csrc/lower_to_buda/device.cpp \ - pybuda/csrc/lower_to_buda/debug.cpp \ - pybuda/csrc/lower_to_buda/program.cpp \ - pybuda/csrc/lower_to_buda/fused_op.cpp \ pybuda/csrc/lower_to_buda/common.cpp -include pybuda/csrc/passes/tests/module.mk -include pybuda/csrc/balancer/tests/module.mk - PYBUDA_CSRC_OBJS = $(addprefix $(OBJDIR)/, $(PYBUDA_CSRC_SRCS:.cpp=.o)) PYBUDA_CSRC_DEPS = $(addprefix $(OBJDIR)/, $(PYBUDA_CSRC_SRCS:.cpp=.d)) @@ -57,18 +45,18 @@ PYBUDA_THIRD_PARTY_DEPS = $(SUBMODULESDIR)/third_party/pybind11.checkout -include $(PYBUDA_CSRC_DEPS) -$(PYBUDA_CSRC_LIB): $(PYBUDA_CSRC_OBJS) $(PYBUDA_CSRC_GRAPH_LIB) $(PYBUDA_CSRC_AUTOGRAD) $(PYBUDA_CSRC_PATTERN_MATCHER_LIB) $(PYBUDA_CSRC_BALANCER_LIB) $(PYBUDA_CSRC_PLACER_LIB) $(PYBUDA_CSRC_SCHEDULER_LIB) $(PYBUDA_CSRC_REPORTIFY) $(PYBUDA_CSRC_SHARED_UTILS_LIB) $(PYBUDA_CSRC_BACKENDAPI_LIB) $(PYBUDA_CSRC_TILE_MAPS_LIB) $(PYBUDA_CSRC_PERF_MODEL_LIB) $(PYBUDA_THIRD_PARTY_DEPS) $(PYBUDA_THIRD_PARTY_DEPS) $(PYBUDA_CSRC_TT_TORCH_DEVICE_LIB) +$(PYBUDA_CSRC_LIB): $(PYBUDA_CSRC_OBJS) $(PYBUDA_CSRC_GRAPH_LIB) $(PYBUDA_CSRC_AUTOGRAD) $(PYBUDA_CSRC_PATTERN_MATCHER_LIB) $(PYBUDA_CSRC_BALANCER_LIB) $(PYBUDA_CSRC_PLACER_LIB) $(PYBUDA_CSRC_SCHEDULER_LIB) $(PYBUDA_CSRC_REPORTIFY) $(PYBUDA_CSRC_BACKENDAPI_LIB) $(PYBUDA_CSRC_SHARED_UTILS_LIB) $(PYBUDA_CSRC_PERF_MODEL_LIB) $(PYBUDA_CSRC_TT_TORCH_DEVICE_LIB) $(PYBUDA_CSRC_RUNTIME_LIB) @mkdir -p $(LIBDIR) - $(CXX) $(PYBUDA_CSRC_CFLAGS) 
$(CXXFLAGS) $(SHARED_LIB_FLAGS) -L$(BOOST_LIB_DIR) -L$(TORCH_LIB_DIR) -o $@ $^ $(LDFLAGS) $(PYBUDA_CSRC_LDFLAGS) + $(CXX) $(PYBUDA_CSRC_CFLAGS) $(CXXFLAGS) $(SHARED_LIB_FLAGS) -L$(TORCH_LIB_DIR) -o $@ $^ $(LDFLAGS) $(PYBUDA_CSRC_LDFLAGS) -$(PYTHON_ENV)/lib/$(PYTHON_VERSION)/site-packages/pybuda/_C.so: $(PYBUDA_CSRC_LIB) - @mkdir -p $(PYTHON_ENV)/lib/$(PYTHON_VERSION)/site-packages/pybuda +$(PYTHON_ENV_ROOT)/lib/$(PYTHON_VERSION)/site-packages/pybuda/_C.so: $(PYBUDA_CSRC_LIB) + @mkdir -p $(PYTHON_ENV_ROOT)/lib/$(PYTHON_VERSION)/site-packages/pybuda cp $^ $@ touch -r $^ $@ - ln -sf ../../$(PYTHON_ENV)/lib/$(PYTHON_VERSION)/site-packages/pybuda/_C.so pybuda/pybuda/_C.so + ln -sf ../../$(PYTHON_ENV_ROOT)/lib/$(PYTHON_VERSION)/site-packages/pybuda/_C.so pybuda/pybuda/_C.so -$(OBJDIR)/pybuda/csrc/%.o: pybuda/csrc/%.cpp +$(OBJDIR)/pybuda/csrc/%.o: pybuda/csrc/%.cpp $(PYTHON_ENV) $(PYBUDA_THIRD_PARTY_DEPS) @mkdir -p $(@D) $(CXX) $(PYBUDA_CSRC_CFLAGS) $(CXXFLAGS) $(SHARED_LIB_FLAGS) $(PYBUDA_CSRC_INCLUDES) -c -o $@ $< -pybuda/csrc: $(PYBUDA_CSRC_LIB) third_party/budabackend/src/net2pipe ; +pybuda/csrc: $(PYBUDA_CSRC_LIB) ; diff --git a/pybuda/csrc/passes/amp.cpp b/pybuda/csrc/passes/amp.cpp index c821fa5d3..e21aa04d9 100644 --- a/pybuda/csrc/passes/amp.cpp +++ b/pybuda/csrc/passes/amp.cpp @@ -3,23 +3,21 @@ // SPDX-License-Identifier: Apache-2.0 #include "passes/amp.hpp" -#include +#include #include -#include -#include +#include #include +#include - - +#include "backend_api/device_config.hpp" #include "graph_lib/node_types.hpp" #include "graph_lib/utils.hpp" -#include "utils/logger.hpp" -#include "reportify/paths.hpp" -#include "reportify/to_json.hpp" #include "lower_to_buda/common.hpp" #include "passes/dataformat.hpp" - +#include "reportify/paths.hpp" +#include "reportify/to_json.hpp" #include "third_party/json/json.hpp" +#include "utils/logger.hpp" using Graph = tt::graphlib::Graph; using Node = tt::graphlib::Node; @@ -62,30 +60,24 @@ namespace tt::passes using impl::conjunction; using impl::get_queried_nodes; -class RegexMatcher { -private: - std::unordered_map regex_cache; - -public: - bool has_matching_string(const std::string& regex_string, const std::string& candidate_string) { - // Immediately return true if regex_string is empty - if (regex_string.empty()) { - return true; - } - std::smatch base_match; - // Check if the regex is already compiled in the cache - auto it = regex_cache.find(regex_string); - if (it == regex_cache.end()) { - // Compile the regex and store it in the cache - std::regex compiled_regex(regex_string); - regex_cache[regex_string] = compiled_regex; - return std::regex_match(candidate_string, base_match, compiled_regex); - } else { - // Use the compiled regex from the cache - return std::regex_match(candidate_string, base_match, it->second); - } +bool RegexMatcher::has_matching_string(const std::string& regex_string, const std::string& candidate_string) { + // Immediately return true if regex_string is empty + if (regex_string.empty()) { + return true; } -}; + std::smatch base_match; + // Check if the regex is already compiled in the cache + auto it = regex_cache.find(regex_string); + if (it == regex_cache.end()) { + // Compile the regex and store it in the cache + std::regex compiled_regex(regex_string); + regex_cache[regex_string] = compiled_regex; + return std::regex_match(candidate_string, base_match, compiled_regex); + } else { + // Use the compiled regex from the cache + return std::regex_match(candidate_string, base_match, it->second); + } +} template @@ -340,7 
+332,7 @@ void dump_mixed_precision_json_to_file(graphlib::Graph *graph, std::optionalname()); - std::experimental::filesystem::create_directories(output_dir); + std::filesystem::create_directories(output_dir); filename = output_dir + "/amp_settings.json"; } else @@ -376,56 +368,55 @@ std::optional original_op_type(const graphlib::Node* node) return {}; } +bool is_matched_op(const AMPNodeProperties &amp_properties, RegexMatcher &regex_matcher, const Node* node) { + bool is_match = true; + if (amp_properties.name_regex_match.has_value()) + { + is_match &= regex_matcher.has_matching_string(amp_properties.name_regex_match.value(), node->name()); + } + if (amp_properties.epoch_type.has_value()) + { + is_match &= amp_properties.epoch_type.value() == node->get_epoch_type(); + } + if (amp_properties.is_gradient_op.has_value()) + { + const graphlib::OpNode* op = dynamic_cast<const graphlib::OpNode*>(node); + if (op != nullptr) + { + is_match &= amp_properties.is_gradient_op.value() == op->is_gradient_op(); + } + } + if (amp_properties.op_type.has_value()) + { + const graphlib::OpNode* op_node = dynamic_cast<const graphlib::OpNode*>(node); + if (op_node != nullptr) + { + is_match &= ( + amp_properties.op_type.value() == op_node->op_name() or + amp_properties.op_type.value() == original_op_type(node) + ); + } + else if (auto input_node = dynamic_cast<const graphlib::InputNode*>(node); input_node != nullptr) + { + is_match &= amp_properties.op_type.value() == graphlib::to_string(input_node->input_type()); + } + else + { + is_match &= false; + } + } + return is_match; +}; void apply_configuration(const Graph* graph, const std::vector& user_properties) { RegexMatcher regex_matcher; for (const auto& amp_properties : user_properties) { - // TODO-> turn this into a bind and move outside loop - auto is_matched_op = [&amp_properties, &regex_matcher](const Node* node) -> bool { - - bool is_match = true; - if (amp_properties.name_regex_match.has_value()) - { - is_match &= regex_matcher.has_matching_string(amp_properties.name_regex_match.value(), node->name()); - } - if (amp_properties.epoch_type.has_value()) - { - is_match &= amp_properties.epoch_type.value() == node->get_epoch_type(); - } - if (amp_properties.is_gradient_op.has_value()) - { - const graphlib::OpNode* op = dynamic_cast(node); - if (op != nullptr) - { - is_match &= amp_properties.is_gradient_op.value() == op->is_gradient_op(); - } - } - if (amp_properties.op_type.has_value()) - { - const graphlib::OpNode* op_node = dynamic_cast(node); - if (op_node != nullptr) - { - is_match &= ( - amp_properties.op_type.value() == op_node->op_name() or - amp_properties.op_type.value() == original_op_type(node) - ); - } - else if (auto input_node = dynamic_cast(node); input_node != nullptr) - { - is_match &= amp_properties.op_type.value() == graphlib::to_string(input_node->input_type()); - } - else - { - is_match &= false; - } - } - return is_match; - }; + auto is_matched_op_ = std::bind(is_matched_op, amp_properties, regex_matcher, std::placeholders::_1); apply_optimization( - graph, get_queried_nodes(graph, is_matched_op), amp_properties + graph, get_queried_nodes(graph, is_matched_op_), amp_properties ); } }
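// --- Editor's illustrative sketch (not part of the patch) ---------------------
// The hunk above replaces a per-iteration lambda with a free predicate bound to
// its fixed arguments via std::bind. The standalone sketch below shows the same
// shape -- a cached-regex matcher plus a bound predicate -- using hypothetical
// names (RegexCache, Props, is_selected, select_names); it is not PyBuda code.
#include <algorithm>
#include <functional>
#include <iterator>
#include <optional>
#include <regex>
#include <string>
#include <unordered_map>
#include <vector>

struct RegexCache
{
    std::unordered_map<std::string, std::regex> cache;

    bool matches(const std::string& pattern, const std::string& candidate)
    {
        if (pattern.empty())
            return true;  // empty pattern matches everything, mirroring has_matching_string
        auto it = cache.find(pattern);
        if (it == cache.end())
            it = cache.emplace(pattern, std::regex(pattern)).first;  // compile once, reuse afterwards
        return std::regex_match(candidate, it->second);
    }
};

struct Props
{
    std::optional<std::string> name_regex;  // stand-in for a name_regex_match-style field
};

// Free predicate: every optional criterion that is set must hold.
bool is_selected(const Props& props, RegexCache& cache, const std::string& name)
{
    bool ok = true;
    if (props.name_regex.has_value())
        ok &= cache.matches(props.name_regex.value(), name);
    return ok;
}

std::vector<std::string> select_names(const std::vector<std::string>& names, const Props& props)
{
    RegexCache cache;
    // Bind the fixed arguments once; std::ref keeps a single shared cache rather than copying it.
    auto pred = std::bind(is_selected, props, std::ref(cache), std::placeholders::_1);
    std::vector<std::string> out;
    std::copy_if(names.begin(), names.end(), std::back_inserter(out), pred);
    return out;
}
// ------------------------------------------------------------------------------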
@@ -489,19 +480,11 @@ void apply_mixed_b_optimization(const Graph *graph) {2, {DataFormat::Bfp8_b, false}} } }; - AMPNodePropertiesInternal fused_config = { - .op_type = "fused_op", - .output_df = DataFormat::Float16_b, - .intermediate_df = DataFormat::Float16_b, - .accumulate_df = DataFormat::Float16_b, - .input_df = DataFormat::Float16_b - }; std::vector default_opt_configuration = { softmax_config.create(), layernorm_config.create(), matmul_config.create(), - fused_config.create(), }; apply_configuration(graph, default_opt_configuration); @@ -593,8 +576,8 @@ void apply_mixed_a_optimization(const Graph *graph) enum class MixedPrecisionSetting { None = 0, - Mixed_A_Formats = 1, - Mixed_B_Formats = 2, + Mixed_B_Formats = 1, + Mixed_A_Formats = 2, }; using OptToFunctionMapping = std::unordered_map>; const OptToFunctionMapping opt_dispatch_table = { @@ -622,7 +605,7 @@ void const_tag_propagation(Graph *graph) static bool is_valid_opt_level(const int opt_level) { - return opt_level >= static_cast(MixedPrecisionSetting::None) and opt_level <= static_cast(MixedPrecisionSetting::Mixed_B_Formats); + return opt_level >= static_cast<int>(MixedPrecisionSetting::None) and opt_level <= static_cast<int>(MixedPrecisionSetting::Mixed_A_Formats); } MixedPrecisionSetting get_mixed_precision_settings( diff --git a/pybuda/csrc/passes/amp.hpp b/pybuda/csrc/passes/amp.hpp index f6c78c8cf..4e7597bfe 100644 --- a/pybuda/csrc/passes/amp.hpp +++ b/pybuda/csrc/passes/amp.hpp @@ -7,7 +7,9 @@ #include #include #include +#include +#include "graph_lib/graph.hpp" #include "lower_to_buda/common.hpp" #include "graph_lib/defines.hpp" #include "third_party/json/json.hpp" @@ -77,6 +79,16 @@ struct AMPNodeProperties } }; +class RegexMatcher { +private: + std::unordered_map<std::string, std::regex> regex_cache; + +public: + bool has_matching_string(const std::string& regex_string, const std::string& candidate_string); +}; + +bool is_matched_op(const AMPNodeProperties &amp_properties, RegexMatcher &regex_matcher, const graphlib::Node* node); + void to_json(nlohmann::json& j, const AMPNodeProperties& p); void from_json(const nlohmann::json& j, AMPNodeProperties& p); diff --git a/pybuda/csrc/passes/commute_utils.cpp b/pybuda/csrc/passes/commute_utils.cpp index a08065bd1..db80e8a1e 100644 --- a/pybuda/csrc/passes/commute_utils.cpp +++ b/pybuda/csrc/passes/commute_utils.cpp @@ -817,9 +817,10 @@ bool is_elementwise(graphlib::OpNode *op) bool is_quantization_ops(graphlib::OpNode *op) { - return op->op_name() == "buda_quantize" or op->op_name() == "dequantize" or op->op_name() == "buda_requantize"; + return op->op_name() == "buda_quantize" or op->op_name() == "buda_dequantize" or op->op_name() == "buda_requantize"; } + bool can_commute_past_op( graphlib::OpNode *op, graphlib::OpNode *initial_op, @@ -845,6 +846,7 @@ bool can_commute_past_op( bool can_commute = can_commute_through_select(graph, op, initial_op, producer, commute_shape, clone_shape, commute_up); return can_commute; } + return (is_elementwise(op) and op->op_name() != "interleave") or is_quantization_ops(op); }
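// --- Editor's illustrative sketch (not part of the patch) ---------------------
// The constant_folding.cpp hunk that follows keeps the original constant alive
// if any consumer other than the folded multiply still reads it, and only calls
// remove_node once the user-edge count drops to zero. The toy graph below uses
// hypothetical names (TinyGraph, has_users) to show that guard in isolation.
#include <algorithm>
#include <cassert>
#include <string>
#include <unordered_map>
#include <vector>

struct TinyGraph
{
    // producer name -> consumer names (a stand-in for graph->user_edges(node))
    std::unordered_map<std::string, std::vector<std::string>> users;

    void add_edge(const std::string& from, const std::string& to) { users[from].push_back(to); }

    void remove_edge(const std::string& from, const std::string& to)
    {
        auto& consumers = users[from];
        consumers.erase(std::remove(consumers.begin(), consumers.end(), to), consumers.end());
    }

    bool has_users(const std::string& node) const
    {
        auto it = users.find(node);
        return it != users.end() && !it->second.empty();
    }
};

int main()
{
    TinyGraph graph;
    graph.add_edge("constant", "multiply");         // edge being folded into the matmul RHS
    graph.add_edge("constant", "other_consumer");   // an unrelated consumer of the same constant

    graph.remove_edge("constant", "multiply");      // bypass the folded multiply
    // Erase the constant only when nothing else reads it; here it must survive.
    assert(graph.has_users("constant"));
    return 0;
}
// ------------------------------------------------------------------------------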
diff --git a/pybuda/csrc/passes/constant_folding.cpp b/pybuda/csrc/passes/constant_folding.cpp index 1610045bc..74a030f3c 100644 --- a/pybuda/csrc/passes/constant_folding.cpp +++ b/pybuda/csrc/passes/constant_folding.cpp @@ -280,7 +280,7 @@ static bool try_fold_constant_multiply_into_matmul_rhs( multiply_clone->set_shape(matmul_rhs->shape()); auto *constant_clone = graph->add_node( - constant->clone(constant->name() + "_" + matmul_rhs->name()), + constant->clone(constant->name() + "_" + multiply->name()), graph->get_subgraph_id_for_node(matmul->id())); // Connect matmul rhs to multiply LHS @@ -295,9 +295,12 @@ static bool try_fold_constant_multiply_into_matmul_rhs( graphlib::try_consteval_op(graph, multiply_clone); } - // Remove multiply from the graph - graph->remove_node(constant); + // Remove multiply from the graph, but check whether the constant has other consumers before removing it graphlib::bypass_node(graph, multiply, true); + if (graph->user_edges(constant).size() == 0) + { + graph->remove_node(constant); + } return true; } diff --git a/pybuda/csrc/passes/dataformat.cpp b/pybuda/csrc/passes/dataformat.cpp index 10c0575d8..507891039 100644 --- a/pybuda/csrc/passes/dataformat.cpp +++ b/pybuda/csrc/passes/dataformat.cpp @@ -13,6 +13,7 @@ namespace tt::passes { +using namespace graphlib; static std::vector get_non_constants(const std::vector& nodes) { @@ -58,6 +59,22 @@ static bool are_data_formats_same_exponent_widths(const std::vector { return is_b_data_format(data_format) == is_b_data_format(non_integer_data_formats.at(0)); }); } +static bool are_data_formats_all_integer(const std::vector<DataFormat> &data_formats) +{ + return std::all_of( + data_formats.begin(), + data_formats.end(), + [](DataFormat data_format) { return is_integer_data_format(data_format); }); +} + +static bool are_data_formats_all_float(const std::vector<DataFormat> &data_formats) +{ + return std::all_of( + data_formats.begin(), + data_formats.end(), + [](DataFormat data_format) { return !is_integer_data_format(data_format); }); +} + static bool are_data_formats_same(const std::vector &data_formats) { if (data_formats.empty()) @@ -498,14 +515,32 @@ void fix_data_formats(graphlib::Graph *graph, bool fp32_acc_supported) op->accumulate_df()); op->set_accumulate_df(DataFormat::Int32); } - if (op->output_df() != DataFormat::Int8 and op->op_name() != "dequantization") + if (op->output_df() != DataFormat::Int8 + and op->op_name() != "dequantization" + and op->buda_attrs().find("has_dequant") == op->buda_attrs().end()) { - log_warning( - "Op {} is configured for Int8, but output_df != Int8. " - "Setting output_df from {} to Int8.", - op->name(), - op->output_df()); - op->set_output_df(DataFormat::Int8); + if (op->buda_attrs().find("has_requant") != op->buda_attrs().end()) { + log_warning( + "Op {} is configured for Int8, but output_df != Int8. " + "Setting output_df from {} to Int8.", + op->name(), + op->output_df()); + op->set_output_df(DataFormat::Int8); + } else if (op->is_matmul()) { + log_warning( + "Op {} is configured for Int8, but output_df != Int8. " + "Setting output_df from {} to Int32.", + op->name(), + op->output_df()); + op->set_output_df(DataFormat::Int32); + } else { + log_warning( + "Op {} is configured for Int8, but output_df != Int8. " + "Setting output_df from {} to Int8.", + op->name(), + op->output_df()); + op->set_output_df(DataFormat::Int8); + } } } else if (is_configured_for_int32(graph, node)) @@ -722,16 +757,6 @@ void validate_data_formats(const graphlib::Graph *graph, const DeviceConfig& dev op->intermediate_df(), op->output_df()); } - if (op->is_fused_op()) - { - all_data_formats.push_back(op->intermediate_df()); - - TT_LOG_ASSERT(are_data_formats_same_exponent_widths(all_data_formats), - "For fused ops, we expect all data formats to be of the same type.
(a or b type)\ - Data formats for {}: {}", - op->name(), - all_data_formats); - } if (op->is_sparse_matmul()) { std::vector data_operands = graph->data_operands(op); @@ -785,6 +810,18 @@ void validate_data_formats(const graphlib::Graph *graph, const DeviceConfig& dev "op: {}, accumulate_df: {}: If op is configured for Int8/Int32, accumulate data format must be Int32.", op->name(), op->accumulate_df()); + TT_LOG_ASSERT( + device_config.is_wormhole_b0() && !device_config.is_blackhole(), + "op: {}, arch: {}: Int8/Int32 is only supported on Wormhole B0.", + op->name(), + device_config.arch_name); + } + if (graphlib::is_eltwise_binary(op) or op->is_splice()) { + TT_LOG_ASSERT( + are_data_formats_all_float(all_data_formats) or are_data_formats_all_integer(all_data_formats), + "All input data formats should either be all float or all integer. Data formats for {}: {}", + op->name(), + all_data_formats); } if (device_config.is_grayskull() and is_exponent_width_reconfigured(op->accumulate_df(), op->output_df())) { diff --git a/pybuda/csrc/passes/decomposing_context.cpp b/pybuda/csrc/passes/decomposing_context.cpp index 51f9d9589..59546df06 100644 --- a/pybuda/csrc/passes/decomposing_context.cpp +++ b/pybuda/csrc/passes/decomposing_context.cpp @@ -3,23 +3,16 @@ // SPDX-License-Identifier: Apache-2.0 #include "decomposing_context.hpp" + #include "buda_passes.hpp" -#include "utils/assert.hpp" -#include "utils/logger.hpp" #include "graph_lib/node_types.hpp" #include "graph_lib/utils.hpp" -#include "placer/dram.hpp" -#include "placer/utils.hpp" #include "reportify/reportify.hpp" - +#include "utils/assert.hpp" +#include "utils/logger.hpp" namespace tt { -using NodeType = graphlib::NodeType; -using Edge = graphlib::Edge; -using EdgeType = graphlib::EdgeType; - - // TODO: move tags to a vector of enums NodeContext DecomposingContext::op( graphlib::OpType const &op_type, @@ -69,9 +62,10 @@ NodeContext DecomposingContext::op( graphlib::Node *current_node = this->graph->node_by_id(operands[i].id); if (new_node->get_epoch_type() != current_node->get_epoch_type()) { - - TT_ASSERT((current_node->get_epoch_type() == NodeEpochType::Forward and new_node->get_epoch_type() == NodeEpochType::Backward) or - (new_node->get_epoch_type() == NodeEpochType::Optimizer)); + TT_ASSERT( + (current_node->get_epoch_type() == graphlib::NodeEpochType::Forward and + new_node->get_epoch_type() == graphlib::NodeEpochType::Backward) or + (new_node->get_epoch_type() == graphlib::NodeEpochType::Optimizer)); graphlib::Edge edge(current_node->id(), 0, new_node->id(), 0, graphlib::EdgeType::kAutogradFwdToBwd); graph->add_edge(edge); @@ -82,7 +76,7 @@ NodeContext DecomposingContext::op( graph->add_edge(edge); if (copy_tms) { - for(Edge op_edge : graph->operand_data_edges(this->node_)) { + for(graphlib::Edge op_edge : graph->operand_data_edges(this->node_)) { if (op_edge.producer_node_id == operands[i].id) { graph->get_edge_attributes(edge)->set_tms(graph->get_edge_attributes(op_edge)->get_tms()); } @@ -111,35 +105,36 @@ void DecomposingContext::fuse(NodeContext operand, graphlib::PortId producer_out // Map operand control edges - for (Edge in_edge : graph->operand_edges(node_)) { - if (in_edge.edge_type == EdgeType::kData) + for (graphlib::Edge in_edge : graph->operand_edges(node_)) { + if (in_edge.edge_type == graphlib::EdgeType::kData) continue; - if (in_edge.edge_type == EdgeType::kAutogradFwdToGradient) + if (in_edge.edge_type == graphlib::EdgeType::kAutogradFwdToGradient) continue; TT_ASSERT( - in_edge.edge_type != EdgeType::kControl or 
in_edge.edge_type != EdgeType::kDataLoopback or - in_edge.edge_type != EdgeType::kControlLoop); + in_edge.edge_type != graphlib::EdgeType::kControl or + in_edge.edge_type != graphlib::EdgeType::kDataLoopback or + in_edge.edge_type != graphlib::EdgeType::kControlLoop); for (graphlib::PyOpNode *inserted_node : inserted_nodes) { - Edge new_in_edge( + graphlib::Edge new_in_edge( in_edge.producer_node_id, in_edge.producer_output_port_id, inserted_node->id(), 0, in_edge.edge_type); this->graph->add_edge(new_in_edge); } } - for (Edge in_edge : graph->operand_edges(node_)) { - if (in_edge.edge_type != EdgeType::kAutogradFwdToGradient) + for (graphlib::Edge in_edge : graph->operand_edges(node_)) { + if (in_edge.edge_type != graphlib::EdgeType::kAutogradFwdToGradient) continue; - Edge new_in_edge(in_edge.producer_node_id, in_edge.producer_output_port_id, operand.id, 0, in_edge.edge_type); + graphlib::Edge new_in_edge(in_edge.producer_node_id, in_edge.producer_output_port_id, operand.id, 0, in_edge.edge_type); this->graph->add_edge(new_in_edge); break; } - for (Edge out_edge : graph->user_edges(node_)) { - Edge new_out_edge( + for (graphlib::Edge out_edge : graph->user_edges(node_)) { + graphlib::Edge new_out_edge( operand.id, producer_out_port, out_edge.consumer_node_id, @@ -189,7 +184,7 @@ std::vector> decompose_pybuda_grap py::function pybuda_decompose = eval_module.attr(dispatcher_name)(type); std::vector inputs; - for(Edge op_edge : graph->operand_data_edges(node)) { + for(graphlib::Edge op_edge : graph->operand_data_edges(node)) { inputs.push_back(NodeContext(graph->node_by_id(op_edge.producer_node_id), op_edge.producer_output_port_id)); inputs.back().shape = py_node->shape_of_operand(graph, graph->node_by_id(op_edge.producer_node_id)); inputs.back().unbroadcast_shape = py_node->shape_of_operand(graph, graph->node_by_id(op_edge.producer_node_id), true); diff --git a/pybuda/csrc/passes/decomposing_context.hpp b/pybuda/csrc/passes/decomposing_context.hpp index e7ae0832b..fa033ce2d 100644 --- a/pybuda/csrc/passes/decomposing_context.hpp +++ b/pybuda/csrc/passes/decomposing_context.hpp @@ -3,12 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once -#include "balancer/balancer.hpp" #include "graph_lib/node.hpp" #include "graph_lib/node_types.hpp" #include "graph_lib/utils.hpp" -#include "placer/dram.hpp" -#include "placer/placer.hpp" namespace tt { diff --git a/pybuda/csrc/passes/eth_stream_reduction.cpp b/pybuda/csrc/passes/eth_stream_reduction.cpp deleted file mode 100644 index 2b771c107..000000000 --- a/pybuda/csrc/passes/eth_stream_reduction.cpp +++ /dev/null @@ -1,1141 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "passes/eth_stream_reduction.hpp" - -#include "balancer/balancer.hpp" - -#include "backend_api/device_config.hpp" -#include "balancer/balancer_cache_collection.hpp" -#include "balancer/legalizer/legalizer.hpp" -#include "graph_lib/defines.hpp" -#include "graph_lib/graph.hpp" -#include "graph_lib/node.hpp" -#include "graph_lib/node_types.hpp" -#include "graph_lib/utils.hpp" - -#include "placer/placer.hpp" -#include "post_placer_buda_passes.hpp" -#include "t_stream.hpp" - -#include "graph_lib/defines.hpp" -#include "third_party/budabackend/device/tt_cluster_descriptor.h" - -#include "lower_to_buda/common.hpp" -#include "utils/logger.hpp" - -#include -#include -#include -#include - -namespace tt { -using chip_boundary_id_t = std::pair; -using producer_consumer_pair_t = std::tuple; -}; - -namespace std { -template 
<> -struct hash { - std::size_t operator()(tt::chip_boundary_id_t const &o) const { - std::size_t seed = 0; - seed = std::hash()(o.first) ^ std::hash()(o.second) << 1; - return seed; - } -}; - -template <> -struct hash { - std::size_t operator()(tt::producer_consumer_pair_t const &o) const { - std::size_t seed = 0; - seed = std::hash()(std::get<0>(o)) ^ (std::hash()(std::get<1>(o)) << 1) ^ (std::hash()(std::get<2>(o)) << 2); - return seed; - } -}; - -}; // namespace std - - -namespace tt { - -struct chip_to_chip_data_edge_t -{ - uint32_t producer_chip; - uint32_t consumer_chip; - graphlib::PortId operand_index; - int streams_needed_per_hop; - int streams_needed_total; // in case of multiple hops this may be different from above -}; - -// producer-consumer_pair, data_edge, chip_to_insert_serializing op on -using data_edge_serialization_spec_t = std::tuple; - -struct temporal_epoch_chip_to_chip_data_edges_t -{ - std::unordered_map> chip_boundary_producer_consumer_pairs; - std::unordered_map chip_to_chip_data_edges; - std::unordered_map chip_boundary_needed_streams; -}; - - -static tt_xy_pair get_op_or_queue_placed_grid_size(placer::PlacerSolution const& placer_solution, const graphlib::Node &op_info) { - bool has_op_placement = placer_solution.name_to_op_placement.find(op_info.name()) != placer_solution.name_to_op_placement.end(); - if (has_op_placement) { - placer::OpPlacement const& placement = placer_solution.name_to_op_placement.at(op_info.name()); - return tt_xy_pair(placement.placed_cores.size_c(), placement.placed_cores.size_r()); - } else { - placer::QueuePlacement const& placement = placer_solution.name_to_queue_placement.at(op_info.name()); - return tt_xy_pair(placement.grid_shape.columns, placement.grid_shape.rows); - } -} - -static int get_op_num_input_streams(const graphlib::Node &op_info, placer::PlacerSolution const& placer_solution, graphlib::PortId operand_index) -{ - tt_xy_pair placed_grid_size = get_op_or_queue_placed_grid_size(placer_solution, op_info); - - if (op_info.get_type() == "BudaOp::matmul") - { - return operand_index == 0 ? 
placed_grid_size.y : placed_grid_size.x; - } - else if (op_info.get_type() == "fused_op") - { - TT_ASSERT(false, "Don't know how to support yet"); - return placed_grid_size.y * placed_grid_size.x; - } - else - { - return placed_grid_size.y * placed_grid_size.x; - } -} - - -static uint32_t get_producer_consumer_pair_temporal_epoch(placer::PlacerSolution const& placer_solution, std::string const& producer, std::string const& consumer) { - bool producer_has_op_placement = placer_solution.name_to_op_placement.find(producer) != placer_solution.name_to_op_placement.end(); - if (producer_has_op_placement) { - return placer_solution.temporal_epoch_id(placer_solution.name_to_op_placement.at(producer).global_epoch_id); - } else { - return placer_solution.temporal_epoch_id(placer_solution.name_to_op_placement.at(consumer).global_epoch_id); - } -} - -static std::unordered_map collect_chip_to_chip_data_edges_per_temporal_epoch( - graphlib::Graph *graph, - placer::PlacerSolution &placer_solution) -{ - auto chip_to_chip_data_edges_per_temporal_epoch = std::unordered_map{}; - for (auto const& [node_id, edges] : graph->operands_map()) - { - graphlib::Node* consumer = graph->node_by_id(node_id); - - auto consumer_chip = placer_solution.chip_id(consumer->name()); - for (graphlib::Edge const& edge : edges) - { - if (edge.edge_type != graphlib::EdgeType::kData and edge.edge_type != graphlib::EdgeType::kDataLoopback) - { - continue; - } - TT_ASSERT(node_id == edge.consumer_node_id); - - graphlib::Node* producer = graph->node_by_id(edge.producer_node_id); - - uint32_t producer_chip = placer_solution.chip_id(producer->name()); - bool is_chip_to_chip_edge = consumer_chip != producer_chip; - - if (is_chip_to_chip_edge) - { - int temporal_epoch = (consumer->node_type() == graphlib::NodeType::kBudaOp) ? placer_solution.temporal_epoch_id(consumer->name()) : placer_solution.temporal_epoch_id(producer->name()); - graphlib::Node* consumer_node = graph->node_by_id(edge.consumer_node_id); - auto &chip_to_chip_edges = chip_to_chip_data_edges_per_temporal_epoch[temporal_epoch]; - int streams_needed_per_hop = get_op_num_input_streams(*consumer_node, placer_solution, edge.consumer_input_port_id); - int num_hops = std::abs(static_cast(producer_chip) - static_cast(consumer_chip)); - int streams_needed_total = num_hops * streams_needed_per_hop; - auto const& chip_boundary = chip_boundary_id_t{std::min(producer_chip, consumer_chip), std::max(producer_chip, consumer_chip)}; - auto const& producer_consumer_pair = producer_consumer_pair_t{producer->name(), consumer->name(), edge.consumer_input_port_id}; - - log_debug("\tChip-to-chip edge between {} (chip {}) and {} (chip {}). 
{} streams needed per hop", producer->name(), producer_chip, consumer->name(), consumer_chip, streams_needed_per_hop); - for (auto c = chip_boundary.first; c != chip_boundary.second; c++) - { - auto const& one_hop_chip_boundary = chip_boundary_id_t{c, c+1}; - chip_to_chip_edges.chip_boundary_producer_consumer_pairs[one_hop_chip_boundary].insert(producer_consumer_pair); - chip_to_chip_edges.chip_boundary_needed_streams[one_hop_chip_boundary] += streams_needed_per_hop; - log_debug("\t\t chip {} -> chip {}: {} required streams added, {} needed in total", one_hop_chip_boundary.first, one_hop_chip_boundary.second, streams_needed_per_hop, chip_to_chip_edges.chip_boundary_needed_streams.at(one_hop_chip_boundary)); - } - chip_to_chip_edges.chip_to_chip_data_edges[producer_consumer_pair] = chip_to_chip_data_edge_t{ - .producer_chip=producer_chip, - .consumer_chip=consumer_chip, - .operand_index=edge.consumer_input_port_id, - .streams_needed_per_hop=streams_needed_per_hop, - .streams_needed_total=streams_needed_total - }; - TT_ASSERT(edge.producer_node_id == producer->id()); - } - } - } - - return chip_to_chip_data_edges_per_temporal_epoch; -} - -static std::unordered_map> collect_available_cores_per_temporal_epoch_per_chip( - placer::PlacerSolution const& placer_solution, - DeviceConfig const& device_config - ) -{ - std::unordered_map> temporal_epoch_chip_id_to_global_epoch_id_map; - for (uint32_t e = 0; e < placer_solution.num_epochs; e++) { - temporal_epoch_chip_id_to_global_epoch_id_map[placer_solution.temporal_epoch_id(e)][placer_solution.epoch_id_to_chip.at(e)] = e; - } - - auto available_cores = std::unordered_map>{}; - int num_worker_cores = device_config.grid_size.r * device_config.grid_size.c; - for (uint32_t i = 0; i < placer_solution.num_epochs; i++) - { - int temporal_epoch = placer_solution.temporal_epoch_id(i); - for (auto chip_id : device_config.chip_ids) { - available_cores[temporal_epoch][chip_id] = num_worker_cores; - } - } - - for (auto const& [epoch_id, op_placements] : placer_solution.epoch_id_to_op_placement) - { - for (placer::OpPlacement const& placement : op_placements) - { - TT_ASSERT(static_cast(placement.global_epoch_id) == static_cast(epoch_id)); - std::uint32_t temporal_epoch = placer_solution.temporal_epoch_id(epoch_id); - int chip_id = placer_solution.epoch_id_to_chip.at(placement.global_epoch_id); - TT_ASSERT(static_cast(temporal_epoch) == static_cast(placer_solution.temporal_epoch_id(placement.name))); - available_cores.at(temporal_epoch).at(chip_id) -= (placement.placed_cores.size_r() * placement.placed_cores.size_c()); - TT_ASSERT(available_cores.at(temporal_epoch).at(chip_id) >= 0, "More tensix cores used than are available"); - } - } - - return available_cores; -} - -void try_serialize( - std::vector::iterator& edge_iter, - std::unordered_map const& chip_to_chip_data_edges, - std::vector& edges_to_serialize, - std::unordered_map>& - chip_boundary_producer_consumer_pairs, - placer::PlacerSolution::EpochId target_epoch_id) -{ - auto const& data_edge = chip_to_chip_data_edges.at(*edge_iter); - edges_to_serialize.push_back({*edge_iter, data_edge, target_epoch_id}); - - // remove the edge from all chip-to-chip-boundaries from producer to consumer - auto start = std::min(data_edge.producer_chip, data_edge.consumer_chip); - auto end = std::max(data_edge.producer_chip, data_edge.consumer_chip); - TT_ASSERT(end > start); - for (auto c = start; c != end; c++) - { - auto const& chip_boundary = chip_boundary_id_t{c, c + 1}; - 
chip_boundary_producer_consumer_pairs.at(chip_boundary).erase(*edge_iter); - } -} - -void try_serialize_with_tensix_datacopy( - std::vector::iterator& edge_iter, - int& stream_overage, - int& chip_available_cores, - std::unordered_map const& chip_to_chip_data_edges, - std::vector& edges_to_serialize, - std::unordered_map>& - chip_boundary_producer_consumer_pairs, - placer::PlacerSolution::EpochId target_epoch_id, - std::unordered_map& chip_boundary_needed_streams) -{ - auto const& data_edge = chip_to_chip_data_edges.at(*edge_iter); - int streams_saved = data_edge.streams_needed_per_hop - 1; // we still need a stream after serialization - auto start = std::min(data_edge.producer_chip, data_edge.consumer_chip); - auto end = std::max(data_edge.producer_chip, data_edge.consumer_chip); - TT_ASSERT(end > start); - for (auto c = start; c != end; c++) - { - auto const& chip_boundary = chip_boundary_id_t{c, c + 1}; - chip_boundary_needed_streams.at(chip_boundary) -= streams_saved; - } - - try_serialize( - edge_iter, chip_to_chip_data_edges, edges_to_serialize, chip_boundary_producer_consumer_pairs, target_epoch_id); - - stream_overage -= streams_saved; - log_debug( - "\tSaved {} streams. New overage: {}. For producer {} -> consumer {} @ port {} ", - streams_saved, - stream_overage, - std::get<0>(*edge_iter), - std::get<1>(*edge_iter), - std::get<2>(*edge_iter)); - edge_iter++; - chip_available_cores--; -}; - -template -static void serialize_edges_while_above_threshold( - graphlib::Graph* graph, - std::vector::iterator& edge_iter, - std::vector::iterator edge_iter_end, - int& stream_overage, - int& producer_chip_available_cores, - int& consumer_chip_available_cores, - std::unordered_map const& chip_id_to_epoch_map, - placer::PlacerSolution const& placer_solution, - std::unordered_map const& chip_to_chip_data_edges, - std::vector& edges_to_serialize, - std::unordered_map>& - chip_boundary_producer_consumer_pairs, - std::unordered_map& chip_boundary_needed_streams) -{ - while (edge_iter != edge_iter_end && (!SERIALIZE_WITH_TENSIX_DATACOPY || (stream_overage > 0 && (producer_chip_available_cores > 0 || consumer_chip_available_cores > 0)))) - { - auto [producer_name, consumer_name, operand_index] = *edge_iter; - bool producer_is_q = placer_solution.name_to_op_placement.find(producer_name) == placer_solution.name_to_op_placement.end(); - bool consumer_is_q = placer_solution.name_to_op_placement.find(consumer_name) == placer_solution.name_to_op_placement.end(); - TT_ASSERT(!consumer_is_q || !producer_is_q); - bool is_q_to_op = producer_is_q; - bool producer_is_input = graph->get_node_by_name(producer_name)->node_type() == graphlib::NodeType::kInput; - - if (producer_is_input && SERIALIZE_WITH_TENSIX_DATACOPY) { - // We can't currently support if producer is input because we need to be able to inherit characteristics from the producer (op) - currently we - // can only inherit from ops. 
- edge_iter++; - continue; - } - - uint32_t producer_chip = placer_solution.chip_id(producer_name); - uint32_t consumer_chip = placer_solution.chip_id(consumer_name); - placer::PlacerSolution::EpochId producer_epoch_id = chip_id_to_epoch_map.at(producer_chip); - placer::PlacerSolution::EpochId consumer_epoch_id = chip_id_to_epoch_map.at(consumer_chip); - log_trace(tt::LogPlacer,"\tProducer {} is on chip {} in epoch {}, consumer {} is on chip {} in epoch {} ", producer_name, producer_chip, producer_epoch_id, consumer_name, consumer_chip, consumer_epoch_id); - // cleanup: separate concerns and handle incrementing separately from serializing - if (SERIALIZE_WITH_TENSIX_DATACOPY) - { - if (is_q_to_op) - { - // only use the producer chip for q to -op since we may need to - if (producer_chip_available_cores > 0) - { - try_serialize_with_tensix_datacopy( - edge_iter, - stream_overage, - producer_chip_available_cores, - chip_to_chip_data_edges, - edges_to_serialize, - chip_boundary_producer_consumer_pairs, - producer_epoch_id, - chip_boundary_needed_streams); - } - else - { - edge_iter++; - } - } - else - { - if (producer_chip_available_cores > 0) - { - try_serialize_with_tensix_datacopy( - edge_iter, - stream_overage, - producer_chip_available_cores, - chip_to_chip_data_edges, - edges_to_serialize, - chip_boundary_producer_consumer_pairs, - producer_epoch_id, - chip_boundary_needed_streams); - } - else if (consumer_chip_available_cores > 0) - { - try_serialize_with_tensix_datacopy( - edge_iter, - stream_overage, - consumer_chip_available_cores, - chip_to_chip_data_edges, - edges_to_serialize, - chip_boundary_producer_consumer_pairs, - consumer_epoch_id, - chip_boundary_needed_streams); - } - } - } - else - { - log_trace(tt::LogPlacer, "\tSerializing edges while above threshold producer_epoch_id={} consumer_epoch_id={}", producer_epoch_id, consumer_epoch_id); - try_serialize( - edge_iter, - chip_to_chip_data_edges, - edges_to_serialize, - chip_boundary_producer_consumer_pairs, - producer_epoch_id); // the ethernet datacopy is "placed" on the same epoch as consumer - // and the "dest_device" attribute denotes the consumer epoch chip - ++edge_iter; - } - } -} - -// get_directly_connected_ethernet_channels_between_chips -template -static std::vector choose_chip_to_chip_data_edges_to_serialize( - graphlib::Graph *graph, - std::unordered_map& chip_to_chip_data_edges_per_temporal_epoch, - placer::PlacerSolution &placer_solution, - balancer::BalancerSolution &balancer_solution, - DeviceConfig const& device_config) -{ - std::unordered_map> temporal_epoch_chip_id_to_global_epoch_id_map; - for (std::uint32_t e = 0; e < placer_solution.num_epochs; e++) { - const auto &epoch_info = placer_solution.epoch_id_to_epoch_info.at(e); - log_trace(tt::LogPlacer, "epoch {} has epoch_info(.global_epoch_id={}, .temporal_epoch_id={}, .spatial_epoch_id={}). 
epoch_id_to_chip -> {}", e, epoch_info.global_epoch_id, epoch_info.temporal_epoch_id, epoch_info.spatial_epoch_id, placer_solution.epoch_id_to_chip.at(e)); - temporal_epoch_chip_id_to_global_epoch_id_map[placer_solution.temporal_epoch_id(e)][placer_solution.epoch_id_to_chip.at(e)] = epoch_info.global_epoch_id; - } - - // Temporary flag to pick correct number of eth links depending on setup, until general implementation comes - bool eth_links_between_chips_nebula = (bool)env_as("PYBUDA_ETH_LINKS_NEBULA", 0); - - // auto cluster_desc_uniq = tt_ClusterDescriptor::create_from_yaml(device_config.cluster_config_yaml); - - auto edges_to_serialize = std::vector{}; - constexpr int ETH_STREAMS_PER_LINK = 8; - auto available_cores_per_temporal_epoch = collect_available_cores_per_temporal_epoch_per_chip(placer_solution, device_config); - - for (auto& [temporal_epoch_id, temporal_epoch_chip_to_chip_data_edges_specs] : chip_to_chip_data_edges_per_temporal_epoch) - { - auto& chip_to_chip_data_edges = temporal_epoch_chip_to_chip_data_edges_specs.chip_to_chip_data_edges; - auto& chip_boundary_producer_consumer_pairs = temporal_epoch_chip_to_chip_data_edges_specs.chip_boundary_producer_consumer_pairs; - - for (auto const& [chip_boundary, required_streams] : temporal_epoch_chip_to_chip_data_edges_specs.chip_boundary_needed_streams) - { - // Old code that only worked with topologies where adjacent chip IDs were always connected to each other - // this isn't generally true. For now we hardcode to 4 links for galaxy setups and have a flag for nebula setups, but we should generalize - // -> first to get the # links between any connected chips and assume that's the number for any other - // pair of connected chips. This generally true (assuming all links train on boot) - // Then we need to update this pass to always serialize chip to chip if ethernet datacopy is enabled - //auto const& links_between_chips = cluster_desc_uniq->get_directly_connected_ethernet_channels_between_chips(chip_boundary.first, chip_boundary.second); - int eth_links_between_chips = eth_links_between_chips_nebula ? 
2 : 4; //links_between_chips.size(); - TT_ASSERT(eth_links_between_chips >= 0, "Entries should only be produced for adjacent chips"); - int available_streams = eth_links_between_chips * ETH_STREAMS_PER_LINK; - // For ethernet datacopy serialization, we serialize all chip to chip edges - // For tensix datacopy we only can conditionally serialize to save cores - if (SERIALIZE_WITH_TENSIX_DATACOPY && required_streams <= available_streams) - { - continue; - } - auto producer_consumer_pairs_sorted = std::vector(chip_boundary_producer_consumer_pairs.at(chip_boundary).begin(), chip_boundary_producer_consumer_pairs.at(chip_boundary).end()); - TT_ASSERT(producer_consumer_pairs_sorted.size() > 0); - std::sort( - producer_consumer_pairs_sorted.begin(), - producer_consumer_pairs_sorted.end(), - [&chip_to_chip_data_edges] (auto const& pair_a, auto const& pair_b) - { - return chip_to_chip_data_edges.at(pair_a).streams_needed_per_hop > chip_to_chip_data_edges.at(pair_b).streams_needed_per_hop; - } - ); - TT_ASSERT(chip_to_chip_data_edges.at(producer_consumer_pairs_sorted.front()).streams_needed_per_hop >= chip_to_chip_data_edges.at(producer_consumer_pairs_sorted.back()).streams_needed_per_hop) ; - - // Choose which candidates to serialize - int stream_overage = required_streams - available_streams; - TT_ASSERT(!SERIALIZE_WITH_TENSIX_DATACOPY || stream_overage > 0); - - // There are a ton of different ways to choose which producer-consumer data edge to serialize but for - // now we'll just default to choosing the one(s) that result in serializing the fewest edges get us to - // below the threshold. In practice this will mean choosing the largest ops first - // Other options include: - // - first choosing those with the largest overall stream usage (at the cost of perf) - // or ... choosing the largest stream usage on the current edge - // or ... iteratively serializing the smallest op grids until we meet the threshold - - auto const& producer_name = std::get<0>(producer_consumer_pairs_sorted.at(0)); - auto const& consumer_name = std::get<1>(producer_consumer_pairs_sorted.at(0)); - uint32_t producer_chip = placer_solution.chip_id(producer_name); - uint32_t consumer_chip = placer_solution.chip_id(consumer_name); - - auto current_data_edge_iter = producer_consumer_pairs_sorted.begin(); - int temporal_epoch = get_producer_consumer_pair_temporal_epoch(placer_solution, producer_name, consumer_name); - if (SERIALIZE_WITH_TENSIX_DATACOPY) - { - log_debug( - "Temporal epoch {} requires {} eth streams between chips {} and {} but {} are available by " - "default. 
{} empty cores on producer chip {}, {} empty cores on consumer chip {}", - static_cast(temporal_epoch_id), - required_streams, - chip_boundary.first, - chip_boundary.second, - available_streams, - available_cores_per_temporal_epoch.at(temporal_epoch).at(producer_chip), - producer_chip, - available_cores_per_temporal_epoch.at(temporal_epoch).at(consumer_chip), - consumer_chip); - } - log_trace(tt::LogPlacer, "Serializing data edges between chips {} and {}", chip_boundary.first, chip_boundary.second); - log_trace(tt::LogPlacer, "\tProducer op {}, Consumer op: {}", producer_name, consumer_name); - serialize_edges_while_above_threshold ( - graph, - current_data_edge_iter, - producer_consumer_pairs_sorted.end(), - stream_overage, - available_cores_per_temporal_epoch.at(temporal_epoch).at(producer_chip), - available_cores_per_temporal_epoch.at(temporal_epoch).at(consumer_chip), - temporal_epoch_chip_id_to_global_epoch_id_map.at(temporal_epoch), - placer_solution, - chip_to_chip_data_edges, - edges_to_serialize, - chip_boundary_producer_consumer_pairs, - temporal_epoch_chip_to_chip_data_edges_specs.chip_boundary_needed_streams); - } - } - - // Log the edges to serialize - log_trace(tt::LogPlacer, "Chip to chip edges to serialize"); - for (auto const& [producer_consumer_pair, chip_to_chip_data_edge, epoch_id] : edges_to_serialize) - { - std::stringstream ss; - auto const& producer_name = std::get<0>(producer_consumer_pair); - auto const& consumer_name = std::get<1>(producer_consumer_pair); - ss << producer_name << " -> " << consumer_name << " "; - ss << "\n"; - - Node *producer_op = graph->get_node_by_name(producer_name); - ss << " producer_shape=(" << producer_op->shape().w() << "," << producer_op->shape().z() << "," << producer_op->shape().rt() << "," << producer_op->shape().ct() << "), "; - if (producer_op->node_type() != graphlib::NodeType::kQueue) { - - tt_xy_pair const& producer_grid_size = get_op_or_queue_placed_grid_size(placer_solution, *producer_op); - ss << " producer_grid_size=(" << producer_grid_size.y << "," << producer_grid_size.x << "), "; - - auto const& block_shape = balancer_solution.op_models.at(producer_name).output_buffers.at(0).block_shape; - ss << " t=" << block_shape.t << ", "; - ss << " mblock=(" << block_shape.mblock_m << "," << block_shape.mblock_n << "), "; - ss << " ublock=(" << block_shape.ublock.rt << "," << block_shape.ublock.ct << ")"; - } else { - ss << " producer is a queue - use producer shape info as reference."; - } - - ss << "\n"; - - Node *consumer_op = graph->get_node_by_name(consumer_name); - ss << " consumer_shape=(" << consumer_op->shape().w() << "," << consumer_op->shape().z() << "," << consumer_op->shape().rt() << "," << consumer_op->shape().ct() << "), "; - if (consumer_op->node_type() != graphlib::NodeType::kQueue) { - - tt_xy_pair const& consumer_grid_size = get_op_or_queue_placed_grid_size(placer_solution, *consumer_op); - ss << " producer_grid_size=(" << consumer_grid_size.y << "," << consumer_grid_size.x << "), "; - - auto const& block_shape = balancer_solution.op_models.at(consumer_name).output_buffers.at(0).block_shape; - ss << " t=" << block_shape.t << ", "; - ss << " mblock=(" << block_shape.mblock_m << "," << block_shape.mblock_n << "), "; - ss << " ublock=(" << block_shape.ublock.rt << "," << block_shape.ublock.ct << ")"; - } else { - ss << ". 
consumer is a queue - use producer shape info as reference."; - } - - log_trace(tt::LogPlacer, "{}", ss.str()); - } - - return edges_to_serialize; -} - -std::tuple insert_serialized_dram_queue_between_ops( - graphlib::Graph* graph, - std::string const& producer_name, - std::string const& consumer_name, - graphlib::PortId consumer_input_port_id, - int num_entries) -{ - std::stringstream name_ss; - name_ss << producer_name << "_to_" << consumer_name << "_" << consumer_input_port_id << "_serialized_dram_queue"; - - auto producer_node = graph->get_node_by_name(producer_name); - std::unique_ptr queue_node_uniq = graphlib::create_node(name_ss.str(), 1); - log_debug("\tCreating dram buffering queue node {} between {} and {}", name_ss.str(), producer_name, consumer_name); - queue_node_uniq->set_node_type(graphlib::NodeType::kQueue); - queue_node_uniq->set_output_df(producer_node->output_df()); - auto queue_node = graph->add_node(std::move(queue_node_uniq), graph->get_subgraph_id_for_node(producer_node->id())); - queue_node->set_shape(producer_node->shape()); - queue_node->as()->set_num_entries(num_entries); - queue_node->set_epoch_type(producer_node->get_epoch_type()); // take epoch type from producer - - // Check port id, i.e. operand index if there is multiple edges between producer and consumer nodes - std::uint32_t edge_index = 0; - bool edge_found = false; - std::vector producer_outgoing_edges = graph->user_data_edges(producer_node); - for (std::uint32_t i = 0; i < producer_outgoing_edges.size(); i++) { - graphlib::NodeId producer_outgoing_node_id = producer_outgoing_edges[i].consumer_node_id; - graphlib::Node* producer_outgoing_node = graph->node_by_id(producer_outgoing_node_id); - if (producer_outgoing_node->name() == consumer_name && - producer_outgoing_edges[i].consumer_input_port_id == consumer_input_port_id) { - edge_index = i; - edge_found = true; - } - } - - TT_ASSERT(edge_found, "Edge with given consumer port id for given consumer node doesn't exist. 
"); - - Edge consumer_input_edge = producer_outgoing_edges[edge_index]; - - Edge queue_input_edge = Edge( - consumer_input_edge.producer_node_id, - consumer_input_edge.producer_output_port_id, - queue_node->id(), - 0, - consumer_input_edge.edge_type); - - Edge queue_output_edge = Edge( - queue_node->id(), - 0, - consumer_input_edge.consumer_node_id, - consumer_input_edge.consumer_input_port_id, - consumer_input_edge.edge_type); - graph->add_edge(queue_input_edge); - graph->add_edge(queue_output_edge); - graph->copy_edge_attributes(consumer_input_edge, queue_output_edge); - graph->get_edge_attributes(queue_output_edge)->set_ublock_order(graph->get_edge_attributes(consumer_input_edge)->get_ublock_order()); - graph->remove_edge(consumer_input_edge); - - TT_ASSERT(graph->operand_data_edges(queue_node).size() == 1); - - return {queue_input_edge, queue_node, queue_output_edge}; -} - -static std::tuple insert_datacopy_node( - graphlib::Graph *graph, - Node *producer_op_node, - std::string const& consumer_name, - graphlib::PortId operand_index -) -{ - std::string const& producer_op_name = producer_op_node->name(); - - std::stringstream name_ss; - name_ss << producer_op_name << "_serialized_to_" << consumer_name << "_" << operand_index; - TT_ASSERT(producer_op_node != nullptr); - - auto consumer_node = graph->get_node_by_name(consumer_name); - auto datacopy_node_uniq = producer_op_node->clone(name_ss.str()); - datacopy_node_uniq->set_node_type(graphlib::NodeType::kBudaOp); - datacopy_node_uniq->set_output_df(producer_op_node->output_df()); - datacopy_node_uniq->as()->change_op_type("nop"); - auto datacopy_node = graph->add_node(std::move(datacopy_node_uniq), graph->get_subgraph_id_for_node(producer_op_node->id())); - TT_ASSERT(graph->operand_data_edges(datacopy_node).size() == 0, "Expected no operands yet"); - Edge consumer_input_edge = graph->operand_data_edges(consumer_node).at(operand_index); - - TT_ASSERT(consumer_input_edge.consumer_input_port_id == operand_index); - - // Insert the datacopy on the edge (`insert_node_on_edge` does some extra things we don't want) - Edge datacopy_input_edge = Edge( - consumer_input_edge.producer_node_id, - consumer_input_edge.producer_output_port_id, - datacopy_node->id(), - 0, - consumer_input_edge.edge_type); - - Edge datacopy_output_edge_before_dram = Edge( - datacopy_node->id(), - 0, - consumer_input_edge.consumer_node_id, - consumer_input_edge.consumer_input_port_id, - consumer_input_edge.edge_type); - graph->add_edge(datacopy_input_edge); - graph->add_edge(datacopy_output_edge_before_dram); - graph->copy_edge_attributes(consumer_input_edge, datacopy_output_edge_before_dram); // fails on lookup of datacopy_output_edge - graph->get_edge_attributes(datacopy_output_edge_before_dram)->set_ublock_order(graph->get_edge_attributes(consumer_input_edge)->get_ublock_order()); - auto datacopy_input_block_order = graph->get_edge_attributes(consumer_input_edge)->get_ublock_order(); - graph->get_edge_attributes(datacopy_input_edge)->set_ublock_order(datacopy_input_block_order); - graph->remove_edge(consumer_input_edge); - - auto const& datacopy_input_edges = graph->operand_edges(datacopy_node); - TT_ASSERT(datacopy_input_edges.size() == 1, "Expected datacopy to only have 1 operand but it has " + std::to_string(datacopy_input_edges.size())); - - return {datacopy_input_edge, datacopy_node, datacopy_output_edge_before_dram}; -} - -static std::tuple insert_ethernet_datacopy_node( - graphlib::Graph* graph, placer::PlacerSolution const& placer_solution, Node* 
producer_op_node, std::string const& consumer_name, graphlib::PortId operand_index) -{ - std::string const& producer_op_name = producer_op_node->name(); - - std::stringstream name_ss; - name_ss << producer_op_name << "_eth_datacopy_to_" << consumer_name << "_" << operand_index; - TT_ASSERT(producer_op_node != nullptr); - - auto consumer_node = graph->get_node_by_name(consumer_name); - uint32_t consumer_chip = placer_solution.chip_id(consumer_name); - // Want to clone the consumer because the consumer will have the applied TM - // shapes since we want to push all TMs upwards to the ethernet datacopy - // operand edge - // This is the baseline scenario. We can get more sophisticated later if we want and move TMs - // around - if (producer_op_node->node_type() == tt::graphlib::NodeType::kQueue) { - producer_op_node = graph->operands(producer_op_node).at(0); - } - graphlib::Node *datacopy_node = nullptr; - if (producer_op_node->node_type() == tt::graphlib::NodeType::kQueue || producer_op_node->node_type() == tt::graphlib::NodeType::kInput) { - datacopy_node = graph->add_node(consumer_node->clone(name_ss.str()), graph->get_subgraph_id_for_node(consumer_node->id())); - } else { - auto datacopy_node_uniq = producer_op_node->clone(name_ss.str()); - datacopy_node_uniq->as()->change_op_type("ethernet_datacopy"); - datacopy_node = graph->add_node(std::move(datacopy_node_uniq), graph->get_subgraph_id_for_node(producer_op_node->id())); - } - - datacopy_node->set_node_type(graphlib::NodeType::kBudaOp); - datacopy_node->set_output_df(producer_op_node->output_df()); - - auto ethernet_datacopy_op_attrs = BudaOpAttrs(); - ethernet_datacopy_op_attrs["dest_device"] = static_cast(consumer_chip); - datacopy_node->as()->change_op_type( - graphlib::OpType("ethernet_datacopy", {}, ethernet_datacopy_op_attrs)); - - TT_ASSERT(graph->operand_data_edges(datacopy_node).size() == 0, "Expected no operands yet"); - Edge consumer_input_edge = graph->operand_data_edges(consumer_node).at(operand_index); - TT_ASSERT(consumer_input_edge.consumer_input_port_id == operand_index); - - // Insert the datacopy on the edge (`insert_node_on_edge` does some extra things we don't want) - Edge datacopy_input_edge = Edge( - consumer_input_edge.producer_node_id, - consumer_input_edge.producer_output_port_id, - datacopy_node->id(), - 0, - consumer_input_edge.edge_type); - - Edge datacopy_output_edge = Edge( - datacopy_node->id(), - 0, - consumer_input_edge.consumer_node_id, - consumer_input_edge.consumer_input_port_id, - consumer_input_edge.edge_type); - graph->add_edge(datacopy_input_edge); - graph->add_edge(datacopy_output_edge); - graph->copy_edge_attributes(consumer_input_edge, datacopy_output_edge); // fails on lookup of datacopy_output_edge - auto datacopy_input_block_order = graph->get_edge_attributes(consumer_input_edge)->get_ublock_order(); - graph->get_edge_attributes(datacopy_output_edge) - ->set_ublock_order(datacopy_input_block_order); - graph->get_edge_attributes(datacopy_input_edge)->set_ublock_order(datacopy_input_block_order); - graph->remove_edge(consumer_input_edge); - - auto const& datacopy_input_edges = graph->operand_edges(datacopy_node); - TT_ASSERT( - datacopy_input_edges.size() == 1, - "Expected datacopy to only have 1 operand but it has " + std::to_string(datacopy_input_edges.size())); - - return {datacopy_input_edge, datacopy_node, datacopy_output_edge}; -} - -template -static void add_datacopy_placement_entry( - graphlib::Graph *graph, - placer::PlacerSolution &placer_solution, - placer::PlacerSolution::EpochId 
epoch_id, - std::vector& epoch_available_cores, - Node const* datacopy_node) -{ - - placer::Coord placed_cores_start = placer::Coord{.row = 0, .col = 0}; - placer::Coord placed_cores_end = placer::Coord{.row = 1, .col = 1}; - - if constexpr (SERIALIZE_WITH_TENSIX_DATACOPY) - { - TT_ASSERT(epoch_available_cores.size() > 0); - auto const& available_core = epoch_available_cores.front(); - placed_cores_start = placer::Coord{ - .row = static_cast(available_core.y), .col = static_cast(available_core.x)}; - placed_cores_end = placer::Coord{.row = placed_cores_start.row + 1, .col = placed_cores_start.col + 1}; - - epoch_available_cores.erase(epoch_available_cores.begin()); - } - auto operands = graph->operands(datacopy_node); - TT_ASSERT(operands.size() == 1); - auto operand_node = operands.at(0); - auto users = graph->users(datacopy_node); - TT_ASSERT(users.size() == 1); - auto user_node = users.at(0); - - uint32_t target_chip = placer_solution.chip_id(operand_node->name()); - - // Ethernet datacopy always gets placed on producer epoch - bool operand_node_is_queue_or_input = operand_node->node_type() == graphlib::NodeType::kQueue || operand_node->node_type() == graphlib::NodeType::kInput; - std::string const& operand_name = operand_node->name(); - auto placement_id = !operand_node_is_queue_or_input - ? placer_solution.name_to_op_placement.at(operand_name).epoch_id() - : placer_solution.name_to_op_placement.at(user_node->name()).epoch_id(); - - // - // Add the placement entry: - // - TT_ASSERT(epoch_id >= 0, "Invalid value for conversion"); - // Ethernet datacopy isn't placed on a tensix core so it doesn't have a valid XY. Otherwise we need to check for - // valid XY - TT_ASSERT(!SERIALIZE_WITH_TENSIX_DATACOPY || placed_cores_start.row < UINT_MAX, "Invalid value for conversion"); - TT_ASSERT(!SERIALIZE_WITH_TENSIX_DATACOPY || placed_cores_start.col < UINT_MAX, "Invalid value for conversion"); - auto const& placement = placer::OpPlacement{ - .id = placement_id, - .name = datacopy_node->name(), - .chip_id = target_chip, - .global_epoch_id = static_cast(epoch_id), - .grid_transpose = false, - .placed_cores = placer::CoordRange{ - .start = placed_cores_start, - .end = placed_cores_end, - }}; - placer_solution.name_to_op_placement.insert({datacopy_node->name(), placement}); - placer_solution.epoch_id_to_op_placement.at(epoch_id).push_back(placement); - log_trace(tt::LogPlacer, "\tAdded ethernet datacopy placement for {} at global epoch {}, chip {}", datacopy_node->name(), epoch_id, target_chip); -} - -static balancer::TStreamFactor set_datacopy_tstream_factor( - graphlib::Graph* graph, - balancer::BalancerSolution& balancer_solution, - Node* producer_op_node, - Node* /*consumer_op_node*/, - Edge datacopy_input_edge, - Node* datacopy_node) -{ - // - // Set tstreaming values for datacopy: - // - - bool use_consumer_as_reference = false; - if (producer_op_node->node_type() == tt::graphlib::kQueue) { - producer_op_node = graph->operands(producer_op_node).at(0); - } - int tstream_r_factor = datacopy_node->shape().rt(); - int tstream_c_factor = datacopy_node->shape().ct(); - TT_ASSERT(tstream_r_factor > 0 && tstream_c_factor); - log_trace(tt::LogPlacer, "producer grid size: {}r,{}c", balancer_solution.op_models.at(producer_op_node->name()).grid_shape.r, balancer_solution.op_models.at(producer_op_node->name()).grid_shape.c); - log_trace(tt::LogPlacer, "Setting tstream factor for datacopy {} to {}r,{}c. 
Datacopy shape = (rt={},ct={}), reference_block is {}.", - datacopy_node->name(), tstream_r_factor, tstream_c_factor, - datacopy_node->shape().rt(), datacopy_node->shape().ct(), - (use_consumer_as_reference ? "consumer": "producer")); - - auto input_edge_attr = graph->get_edge_attributes(datacopy_input_edge); - auto datacopy_ublock_direction = input_edge_attr->get_ublock_order(); - auto datacopy_tstream_direction = (datacopy_ublock_direction == graphlib::UBlockOrder::R) ? balancer::TStreamDir::R : balancer::TStreamDir::C; - auto datacopy_tstream_factor = balancer::TStreamFactor(datacopy_tstream_direction, tstream_r_factor, tstream_c_factor); - - return datacopy_tstream_factor; -} - -static void add_datacopy_balancer_entry( - graphlib::Graph *graph, - // placer::PlacerSolution& placer_solution, - balancer::BalancerSolution& balancer_solution, - DeviceConfig const& device_config, - Node *producer_op_node, - Edge datacopy_input_edge, - Node *datacopy_node, - balancer::TStreamFactor const& datacopy_tstream_factor) -{ - // First create the op model - if (producer_op_node->node_type() == tt::graphlib::kQueue) { - producer_op_node = graph->operands(producer_op_node).at(0); - } - auto datacopy_input_block_order = graph->get_edge_attributes(datacopy_input_edge)->get_ublock_order(); - std::size_t dst_size_tiles = balancer::calculate_dst_size_tiles( - device_config.get_dst_size(), producer_op_node->output_df(), producer_op_node->shape().get_tile_volume()); - std::string customErrorMessage; - - // We don't care about caches here, so we create a dummy cache - // - auto dummy_cache = std::make_shared(); - - auto [datacopy_op_model, failure_reason] = balancer::legalizer::calculate_op_model( - graph, - dummy_cache, - datacopy_node->as(), - balancer::GridShape(1, 1), - datacopy_tstream_factor, - datacopy_input_block_order, - false, /*force_dram_parameters,*/ - dst_size_tiles, - device_config.get_l1_size(), /*std::size_t l1_usable_size*/ - 0 /*std::size_t dram_channel_capacity*/, - customErrorMessage); - if (failure_reason != tt::balancer::OpModelFailureReason::NoFailure) { - graph->dump("eth_serialization_failure_" + datacopy_node->name()); - tt::log_error("Calculate op model failed for ethernet datacopy op {} with reason {}", datacopy_node->name(), failure_reason); - } - if (datacopy_op_model.output_buffers.size() == 0) { - graph->dump("eth_serialization_failure_" + datacopy_node->name()); - tt::log_error("Calculate op model failed for ethernet datacopy op {}. No output buffers created", datacopy_node->name()); - } - - balancer_solution.block_shapes.insert({datacopy_node->name(), datacopy_op_model.block_shape()}); - balancer_solution.op_models.insert({datacopy_node->name(), datacopy_op_model}); -} - -template -static std::tuple insert_datacopy_between_ops( - graphlib::Graph *graph, - placer::PlacerSolution &placer_solution, - balancer::BalancerSolution& balancer_solution, - DeviceConfig const& device_config, - std::string const& producer_op_or_queue_name, - std::string const& consumer_name, - graphlib::PortId operand_index, - placer::PlacerSolution::EpochId epoch_id, - std::vector& epoch_available_cores) -{ - // this must be an op since we inherit attributes from it. It may not be the topologically connected operand if there is a queue in between - auto producer_op_node = graph->get_node_by_name(producer_op_or_queue_name); - - auto [datacopy_input_edge, datacopy_node, datacopy_output_edge_before_dram] = - SERIALIZE_WITH_TENSIX_DATACOPY - ? 
insert_datacopy_node(graph, producer_op_node, consumer_name, operand_index) - : insert_ethernet_datacopy_node(graph, placer_solution, producer_op_node, consumer_name, operand_index); - - add_datacopy_placement_entry(graph, placer_solution, epoch_id, epoch_available_cores, datacopy_node); - - auto datacopy_tstream_factor = set_datacopy_tstream_factor(graph, balancer_solution, producer_op_node, graph->get_node_by_name(consumer_name)/*producer_op_node*/, datacopy_input_edge, datacopy_node); - - bool consumer_is_op = placer_solution.name_to_op_placement.find(consumer_name) != placer_solution.name_to_op_placement.end(); - Edge final_output_edge = datacopy_output_edge_before_dram; - if (consumer_is_op) { - TT_ASSERT(graph->operand_data_edges(datacopy_node).size() == 1); - auto [datacopy_output_edge, queue_node, queue_output_edge] = insert_serialized_dram_queue_between_ops( - graph, - datacopy_node->name(), - consumer_name, - operand_index); - final_output_edge = queue_output_edge; - } else { - // Delete the placement entry since we are serializing the buffer and need to redo it's placement/allocation - // Might not need to do this for eth datacopy - placer_solution.name_to_queue_placement.erase(consumer_name); - } - - add_datacopy_balancer_entry( - graph, - balancer_solution, - device_config, - producer_op_node, - datacopy_input_edge, - datacopy_node, - datacopy_tstream_factor); - - TT_ASSERT(graph->operand_data_edges(datacopy_node).size() == 1); - TT_ASSERT(graph->user_data_edges(datacopy_node).size() == 1); - - return {datacopy_input_edge, datacopy_node, final_output_edge, datacopy_tstream_factor}; -} - -static std::unordered_map> collect_available_cores_per_epoch( - placer::PlacerSolution const& placer_solution, - DeviceConfig const& device_config -) { - auto available_cores = std::unordered_map>{}; - auto occupied_cores = std::unordered_map>{}; - int num_epochs = placer_solution.epoch_id_to_op_placement.size(); - for (int i = 0; i < num_epochs; i++) - { - available_cores[i] = {}; - occupied_cores[i] = {}; - for (auto const& placement : placer_solution.epoch_id_to_op_placement.at(i)) - { - int start_row = placement.placed_cores.start.row; - int end_row = placement.placed_cores.end.row; - int start_col = placement.placed_cores.start.col; - int end_col = placement.placed_cores.end.col; - for (int r = start_row; r < end_row; r++) - { - for (int c = start_col; c < end_col; c++) - { - occupied_cores[i].insert(tt_xy_pair(c,r)); - } - } - } - } - - int chip_grid_r = device_config.grid_size.r; - int chip_grid_c = device_config.grid_size.c; - for (int i = 0; i < num_epochs; i++) - { - auto const& epoch_used_cores = occupied_cores.at(i); - auto& epoch_available_cores = available_cores[i]; - for (int r = 0; r < chip_grid_r; r++) - { - for (int c = 0; c < chip_grid_c; c++) - { - tt_xy_pair const& core = tt_xy_pair(c,r); - bool core_unoccupied = epoch_used_cores.find(core) == epoch_used_cores.end(); - if (core_unoccupied) - { - epoch_available_cores.push_back(core); - } - } - } - } - - return available_cores; -} - -static void update_tstreaming_factors( - graphlib::Graph *graph, - placer::PlacerSolution &placer_solution, - balancer::BalancerSolution& balancer_solution, - Edge datacopy_operand_edge, - Edge consumer_new_operand_edge, - balancer::TStreamFactor const& datacopy_tstream_factor) -{ - auto get_producer_tstreaming_factor = [&](graphlib::Graph *graph, balancer::BalancerSolution const& balancer_solution, Edge const& datacopy_operand_edge) -> balancer::TStreamFactor { - auto op_producer_node = 
graph->node_by_id(datacopy_operand_edge.producer_node_id); - if (op_producer_node->node_type() == graphlib::NodeType::kInput) { - return balancer::TStreamFactor{balancer::TStreamDir::R, 1, 1}; - } - if (op_producer_node->node_type() == graphlib::NodeType::kQueue) { - op_producer_node = graph->data_operands(op_producer_node).at(0); - } - balancer::OpModel const& producer_op_model = balancer_solution.op_models.at(op_producer_node->name()); - return producer_op_model.t_stream_factor; - }; - - auto producer_t_stream_factor = get_producer_tstreaming_factor(graph, balancer_solution, datacopy_operand_edge); - auto datacopy_input_edge_attr = graph->get_edge_attributes(datacopy_operand_edge); - log_debug("update_tstreaming_factors for serializing datacopy op {}. Producer t-stream factor: {}. Consumer t-stream factor: {}", - graph->node_by_id(datacopy_operand_edge.consumer_node_id)->name(), producer_t_stream_factor, datacopy_tstream_factor); - insert_t_stream_tms_for_eltwise(datacopy_input_edge_attr->get_tms(), datacopy_tstream_factor, producer_t_stream_factor); - - auto const& consumer_name = graph->node_by_id(consumer_new_operand_edge.consumer_node_id)->name(); - bool consumer_is_op = placer_solution.name_to_op_placement.find(consumer_name) != placer_solution.name_to_op_placement.end(); - if (consumer_is_op) - { - Node* consumer_node = graph->get_node_by_name(consumer_name); - auto consumer_input_edge_attr = graph->get_edge_attributes(consumer_new_operand_edge); - balancer::OpModel const& consumer_op_model = balancer_solution.op_models.at(consumer_node->name()); - auto consumer_t_stream_factor = consumer_op_model.t_stream_factor; - - log_debug("update_tstreaming_factors for consumer {} (from inserted ethernet serializing op {}). Producer t-stream factor: {}. Consumer t-stream factor: {}", - consumer_node->name(), graph->node_by_id(datacopy_operand_edge.consumer_node_id)->name(), datacopy_tstream_factor, consumer_t_stream_factor); - // We don't need to specify the consumer t-stream factor because it has already been calculated. If we pass the consumer factor in - // this function will generate the net/merged t factor required to get from producer->consumer and won't take into account already - // existing t stream tms introduced by the t stream pass originally - insert_t_stream_tms_for_eltwise(consumer_input_edge_attr->get_tms(), {}/*consumer_t_stream_factor*/, datacopy_tstream_factor); - } - else - { - for (auto e2e_user_edge : graph->user_data_edges(graph->get_node_by_name(consumer_name))) - { - Node* consumer_node = graph->node_by_id(e2e_user_edge.consumer_node_id); - balancer::OpModel const& consumer_op_model = balancer_solution.op_models.at(consumer_node->name()); - auto consumer_t_stream_factor = consumer_op_model.t_stream_factor; - auto consumer_input_edge_attr = graph->get_edge_attributes(e2e_user_edge); - log_debug("update_tstreaming_factors for e2e consumer {} (from inserted ethernet serializing op {}). Producer t-stream factor: {}. 
Consumer t-stream factor: {}", - consumer_node->name(), graph->node_by_id(datacopy_operand_edge.consumer_node_id)->name(), datacopy_tstream_factor, consumer_t_stream_factor); - insert_t_stream_tms_for_eltwise(consumer_input_edge_attr->get_tms(), {}/*consumer_t_stream_factor*/, datacopy_tstream_factor); - } - } -} - -template -static void serialize_chosen_chip_to_chip_data_edges( - graphlib::Graph *graph, - placer::PlacerSolution &placer_solution, - balancer::BalancerSolution& balancer_solution, - DeviceConfig const& device_config, - std::vector const& edges_to_serialize) -{ - std::unordered_map> epoch_available_cores = {}; - if constexpr (SERIALIZE_WITH_TENSIX_DATACOPY) - { - epoch_available_cores = collect_available_cores_per_epoch(placer_solution, device_config); - } - - std::vector> serialized_edge_specs; - for (auto const &[producer_consumer_pair, data_edge, target_epoch_id] : edges_to_serialize) - { - auto const &producer_name = std::get<0>(producer_consumer_pair); - auto const &consumer_name = std::get<1>(producer_consumer_pair); - - // Insert the tensor serializing datacopy/nop op node - TT_ASSERT( - !SERIALIZE_WITH_TENSIX_DATACOPY || - epoch_available_cores.find(target_epoch_id) != epoch_available_cores.end()); - auto const& serialized_edge_spec = insert_datacopy_between_ops( - graph, - placer_solution, - balancer_solution, - device_config, - producer_name, - consumer_name, - data_edge.operand_index, - target_epoch_id, - epoch_available_cores[target_epoch_id]); - serialized_edge_specs.push_back(serialized_edge_spec); - } - - for (auto [datacopy_operand_edge, datacopy_node, consumer_new_operand_edge, datacopy_tstream_factor] : serialized_edge_specs) - { - update_tstreaming_factors( - graph, - placer_solution, - balancer_solution, - datacopy_operand_edge, - consumer_new_operand_edge, - datacopy_tstream_factor); - } -} - -void deallocate_dynamic_buffers(graphlib::Graph *graph, placer::DramPlacerConfig const& config, placer::PlacerSolution &placer_solution) -{ - log_debug("Eth stream reduction, deallocating dynamic buffers so they can be reallocated with serialized eth buffers"); - auto is_cross_epoch_type = [](const Node *q) -> bool { - if (q->node_type() != graphlib::NodeType::kQueue) return false; - if (q->as()->queue_type() != graphlib::QueueNodeType::EpochToEpoch) return false; - return q->as()->is_cross_epoch_type(); - }; - - bool disable_dynamic_dram = config.disable_dynamic_dram; - auto is_static_queue = [disable_dynamic_dram, is_cross_epoch_type](const Node *node, bool is_input) { - return disable_dynamic_dram || is_input || is_cross_epoch_type(node) || node->as()->is_grad_accumulator(); - }; - - for (auto &[name, placement] : placer_solution.name_to_queue_placement) - { - auto node = graph->get_node_by_name(placement.name); - bool output_on_host = config.output_queues_on_host && - (node->node_type() == graphlib::NodeType::kOutput) && - node->as()->untilize(); - bool is_dynamic_queue = !output_on_host && !is_static_queue(node, node->as()->is_input()); - - if (is_dynamic_queue) - { - log_debug("Deallocating dynamic buffer {}", node->name()); - placement.dram_buffers.clear(); - placement.epoch_allocate = -1; - placement.epoch_deallocate = -1; - TT_ASSERT(placer_solution.name_to_queue_placement.at(node->name()).dram_buffers.size() == 0); - } - - } -} - -void reduce_ethernet_stream_usage( - PostPlacerConfig& config, - graphlib::Graph* graph, - balancer::BalancerSolution& balancer_solution, - placer::PlacerSolution& placer_solution, - DeviceConfig const& device_config) -{ - bool 
tensix_datacopy_eth_link_serialization_enabled = env_as("PYBUDA_ENABLE_ETH_SERIALIZATION"); - auto chip_to_chip_data_edges = collect_chip_to_chip_data_edges_per_temporal_epoch(graph, placer_solution); - - if (tensix_datacopy_eth_link_serialization_enabled) - { - auto const& edges_to_serialize = choose_chip_to_chip_data_edges_to_serialize( - graph, chip_to_chip_data_edges, placer_solution, balancer_solution, device_config); - serialize_chosen_chip_to_chip_data_edges( - graph, placer_solution, balancer_solution, device_config, edges_to_serialize); - // Deallocate here so we can reallocate them alongside the serialized buffers. Otherwise when we try to allocate - // the serialized buffers we won't know the lifetimes of the previous dynamic buffers which will likely cause us - // to allocate in overlapping memory regions - deallocate_dynamic_buffers(graph, config.dram_placer_config, placer_solution); - } - else - { - auto const& edges_to_serialize = choose_chip_to_chip_data_edges_to_serialize( - graph, chip_to_chip_data_edges, placer_solution, balancer_solution, device_config); - serialize_chosen_chip_to_chip_data_edges( - graph, placer_solution, balancer_solution, device_config, edges_to_serialize); - // Deallocate here so we can reallocate them alongside the serialized buffers. Otherwise when we try to allocate - // the serialized buffers we won't know the lifetimes of the previous dynamic buffers which will likely cause us - // to allocate in overlapping memory regions - deallocate_dynamic_buffers(graph, config.dram_placer_config, placer_solution); - } -} - -}; // namespace tt diff --git a/pybuda/csrc/passes/eth_stream_reduction.hpp b/pybuda/csrc/passes/eth_stream_reduction.hpp deleted file mode 100644 index 4b64d6e18..000000000 --- a/pybuda/csrc/passes/eth_stream_reduction.hpp +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include - -#include "graph_lib/defines.hpp" - -namespace tt::graphlib -{ -class Graph; -struct Edge; -class Node; - -} // namespace tt::graphlib - -namespace tt -{ -std::tuple insert_serialized_dram_queue_between_ops( - graphlib::Graph* graph, - std::string const& producer_name, - std::string const& consumer_name, - graphlib::PortId consumer_input_port_id, - int num_entries = 2); -} diff --git a/pybuda/csrc/passes/fork_join.cpp b/pybuda/csrc/passes/fork_join.cpp deleted file mode 100644 index 9c0ca44e8..000000000 --- a/pybuda/csrc/passes/fork_join.cpp +++ /dev/null @@ -1,2193 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "passes/fork_join.hpp" - -#include -#include -#include -#include - -#include "balancer/balancer.hpp" -#include "buda_passes.hpp" -#include "graph_lib/edge.hpp" -#include "graph_lib/graph.hpp" -#include "graph_lib/node.hpp" -#include "graph_lib/node_types.hpp" -#include "graph_lib/utils.hpp" -#include "passes/eth_stream_reduction.hpp" -#include "passes/fuse_ops.hpp" -#include "post_placer_buda_passes.hpp" -#include "reportify/reportify.hpp" -#include "utils/logger.hpp" -#include "utils/ordered_associative_containers/ordered_map.hpp" -namespace tt -{ - -using Node = graphlib::Node; -using Graph = graphlib::Graph; -using NodeId = graphlib::NodeId; -// Trace fork BFS until join -using NodeMap = std::unordered_map>; - -struct CurrentState -{ - Node *fork; - std::vector current_nodes; - NodeMap parents; - std::unordered_map node_depth; -}; - -using ForkJoin = std::pair, std::vector>; 
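// For example (A, B, C, D are placeholder names standing in for graphlib nodes, used only to
// illustrate the data structure): a diamond where op A feeds both B and C, and B and C both
// feed D, is recorded as a single ForkJoin whose two paths are
//
//     {A, B, D} and {A, C, D}
//
// i.e. each path starts at the fork (A) and ends at the join (D), with the fork and the join
// shared between the two vectors. record_fork_join() below assembles the paths in exactly this
// orientation after reversing the traced parent chains.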
- -std::vector remove_duplicates_and_outputs(std::vector nodes, graphlib::NodeEpochType epoch_type) -{ - std::vector ret; - for (std::size_t i = 0; i < nodes.size(); i++) - { - bool ok = (nodes[i]->get_epoch_type() == epoch_type) && (nodes[i]->node_type() != graphlib::NodeType::kOutput); - if (ok) - { - for (std::size_t j = i + 1; j < nodes.size(); j++) - if (nodes[i] == nodes[j]) - { - ok = false; - break; - } - } - if (ok) - ret.push_back(nodes[i]); - } - return ret; -} - -void record_fork_join( - Node *fork, - Node *join_point, - Node *parent0, - Node *parent1, - const NodeMap &parents, - std::vector &fork_joins) -{ - std::vector path0 = {join_point, parent0}; - std::vector path1 = {join_point, parent1}; - - try - { - while (path0.back() != fork) - { - path0.push_back(parents.at(path0.back()).first); - } - while (path1.back() != fork) - { - path1.push_back(parents.at(path1.back()).first); - } - } - catch (std::out_of_range &e) - { - TT_THROW("Missing parent when traversing back to fork point."); - } - - // We'll find sub-forks, which should be thrown out - they will be found later from that fork spot - std::set path0_set = {path0.begin() + 1, path0.end() - 1}; - for (Node *node : path1) - if (path0_set.count(node) > 0) - return; - - std::reverse(path0.begin(), path0.end()); - std::reverse(path1.begin(), path1.end()); - - fork_joins.push_back(std::make_pair(path0, path1)); -} - -void print_fork_join(const ForkJoin &fj) -{ - std::cout << "Fork / Join found" << std::endl; - std::cout << "Fork at: " << fj.first[0]->name() << std::endl; - std::cout << " Path0: " << std::endl; - for (Node *node : fj.first) std::cout << " - " << node->name() << std::endl; - std::cout << " Path1: " << std::endl; - for (Node *node : fj.second) std::cout << " - " << node->name() << std::endl; -} - -void trace_fork(const Graph *graph, Node *fork, std::vector &fork_joins, graphlib::NodeEpochType epoch_type) -{ - // The strategy is to traverse all forks and count the size of consumed inputs to make fwd progress. The difference - // when joined is how much we need to buffer to make sure that long side can make full fwd progress and max speed. - - // This needs to be after balancer, so we know input/output blocks and bws per op. - - // Best reference I could find: - // https://cs.stackexchange.com/questions/57221/efficient-algorithms-for-identifying-the-diamond-forkjoin-vertices-and-the-diam - - // We can probably do something simpler, since forks in graphs are relatively rare. On each fork, do a step-by-step - // BFS search on both forks until one "dies" (i.e. reaches an output), or we find a common point. Keep track of the - // path along the way. This is quite inefficient for large graphs with lots of complex fork/joins, but those should - // be extremely rare in AI models. 
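    // Walking through the simplest case, a diamond A -> {B, C} -> D: two per-branch states are
    // seeded with current_nodes {B} and {C} at depth 1. After one BFS step both branches reach D;
    // the branch that arrives second finds D already present in the other branch's parents map,
    // so record_fork_join() walks both parent chains back to the fork A and records the path
    // pair {A, B, D} / {A, C, D}.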
- - std::vector data_users = remove_duplicates_and_outputs(graph->data_users(fork), epoch_type); - if (data_users.size() == 1) - return; // this is not the fork you're looking for - - // Initial state - NodeMap parents = NodeMap(); - std::unordered_map node_depth; - // std::cout << "trace fork: " << fork->name() << std::endl; - - std::vector states(data_users.size()); - for (std::size_t i = 0; i < data_users.size(); i++) - { - Node *user = data_users[i]; - // std::cout << "Initializing child: " << user->name() << " at depth 1" << std::endl; - states[i].fork = fork; - states[i].current_nodes = {user}; - states[i].parents[user] = std::make_pair(fork, 1); - states[i].node_depth[user] = 1; - } - - std::string indent = " "; - bool done = false; - std::unordered_map> joined; - while (!done) - { - done = true; - for (std::size_t i = 0; i < states.size(); i++) - { - if (states[i].current_nodes.size() == 0) - continue; - - CurrentState &state = states[i]; - - std::vector next_children; - for (Node *node : state.current_nodes) - { - if (node->node_type() == graphlib::kQueue && !(node->as()->is_buffering())) - continue; // all queues except for buffering break fork-joins - - std::vector children = remove_duplicates_and_outputs(graph->data_users(node), epoch_type); - - for (Node *child : children) - { - if (child->node_type() == graphlib::kQueue && !(child->as()->is_buffering())) - continue; // all queues except for buffering break fork-joins - - std::uint32_t depth = state.node_depth[node] + 1; - if (depth > state.node_depth[child]) - state.node_depth[child] = depth; - - // std::cout << indent << " - branch: " << i << " -> child (depth " << depth << "): " << - // child->name() << std::endl; - - if ((state.parents.count(child) == 0) || (depth > state.parents[child].second)) - { - // std::cout << indent << " updating parent of " << child->name() << " to " << node->name() - // << " due to depth " << depth << std::endl; - state.parents[child] = std::make_pair(node, depth); - next_children.push_back(child); - } - - // if any of the children have been visited already in another branch, then we've got a join - for (std::size_t j = 0; j < states.size(); j++) - { - if (i == j) // same branch - continue; - - if (joined[i].count(j) > 0) // branches already joined, everything after this will also join - continue; - - if (states[j].parents.count(child) > 0) - { - // std::cout << indent << " found fork with branch " << j << std::endl; - NodeMap common_parents; - common_parents.insert(state.parents.begin(), state.parents.end()); - common_parents.insert(states[j].parents.begin(), states[j].parents.end()); - record_fork_join( - state.fork, child, states[j].parents.at(child).first, node, common_parents, fork_joins); - joined[i].insert(j); - joined[j].insert(i); - } - } - } - } - state.current_nodes = next_children; - indent += " "; - - if (state.current_nodes.size() > 0) - done = false; - } - } -} - -std::vector find_fork_joins(Graph *graph) -{ - std::vector fork_joins; - for (Node *node : graphlib::topological_sort(*graph)) - { - // fork from input can be ignored, as queues can buffer the source at any rate - if ((node->node_type() == graphlib::kInput) || (node->node_type() == graphlib::kQueue)) - continue; - - if (graph->data_users(node).size() > 1) - { - trace_fork(graph, node, fork_joins, node->get_epoch_type()); - } - } - return fork_joins; -} - -struct pair_hash -{ - template - std::size_t operator()(const std::pair &pair) const - { - return std::hash()(pair.first) ^ std::hash()(pair.second); - } -}; - -// 
Recover src node if it's missing, from dest node and input_id -Node *recover_missing_src(graphlib::Graph *graph, Node *dest, std::uint32_t input_id) -{ - for (Edge e : graph->operand_data_edges(dest)) - { - if (e.consumer_input_port_id == input_id) - return graph->node_by_id(e.producer_node_id); - } - TT_THROW("Unable to find input with given input_id"); - return nullptr; -} - -// Return some dest from the given src, if original dest_name node is now missing -Node *recover_missing_dest(graphlib::Graph *graph, Node *src, std::uint32_t fork_id) -{ - auto edges = graph->user_data_edges(src); - if (fork_id >= edges.size()) - { - fork_id = 0; // fall-back, since graph has changed enough that this isn't even a fork any more - } - return graph->node_by_id(edges[fork_id].consumer_node_id); -} - -void merge_tagged_nops_with_same_src(graphlib::Graph *graph, bool daisy_chain) -{ - // populate a map of src_op -> inserted_buffer_nops(mergeable == True) - // Make this a map (ordered) to preserve deterministic order of transforms - std::map> src_op_to_mergeable_nops; - for (Node *node : graph->nodes()) - { - if (graphlib::OpNode *op = dynamic_cast(node); - op and op->op_name() == "nop" and op->has_tag("mergeable") and std::get(op->tag_value("mergeable"))) - { - Node *src_op = graph->data_operands(node).at(0); - src_op_to_mergeable_nops[src_op->name()].push_back(node); - } - } - - // check all mergeable nops have the same tms on their edges - for (const auto &[src_op, nops] : src_op_to_mergeable_nops) - { - if (nops.size() > 1) - { - Edge edge_between_first_nop_and_src_op = graph->operand_data_edges(nops[0]).at(0); - auto &tms = graph->get_edge_attributes(edge_between_first_nop_and_src_op)->get_tms(); - for (std::size_t i = 1; i < nops.size(); i++) - { - Edge edge_between_nop_and_src_op = graph->operand_data_edges(nops[i]).at(0); - auto &tms2 = graph->get_edge_attributes(edge_between_nop_and_src_op)->get_tms(); - if (tms != tms2) - { - log_error( - "User tried to add a buffering nop from src_op: {}, with (hoist_tms = True)" - "between two nodes with different tms. 
Try setting hoist_tms = False", - src_op); - } - } - } - } - - // check all mergeable nops have the same tms on their edges - for (const auto &[src_op, nops] : src_op_to_mergeable_nops) - { - if (nops.size() > 1) - { - if (daisy_chain) - { - Node *src_node = graph->get_node_by_name(src_op); - Node *current_nop = nops[0]; - - for (std::size_t i = 1; i < nops.size(); i++) - { - auto edge_to_reattach = graph->get_edges(src_node, nops[i]).at(0); - auto edge_attributes = graph->get_edge_attributes(edge_to_reattach); - - auto new_edge = edge_to_reattach; - new_edge.producer_node_id = current_nop->id(); - - graph->remove_edge(edge_to_reattach); - graph->add_edge(new_edge, edge_attributes); - log_trace(LogGraphCompiler, "Trying to connect a new edge between producer={} and consumer={}", current_nop->name(), nops[i]->name()); - current_nop = nops[i]; - } - - for (std::size_t i = 0; i < nops.size(); i++) { - // Make all merged daisy-chain nops unmergeable to allow for insertion of later daisy-chains - Node *current_nop = nops[i]; - graphlib::OpNode *current_op = dynamic_cast(current_nop); - current_op->tag("mergeable", false); - } - } - else - { - Node *first_nop = nops[0]; - for (std::size_t i = 1; i < nops.size(); i++) - { - graphlib::replace_node(graph, nops[i], first_nop, true /* skip operands*/); - } - } - } - } -} - -// if path contains fork node and join node of fork-join, then, it contains that fork-join -// this is used when determining if one fork-join is ancestor of another (one wrapping around another) -bool is_fork_join_on_path(const std::vector &path, const ForkJoin *fj) -{ - bool contains_fork = false; - bool contains_join = false; - for (const Node *node : path) - { - if (node->id() == fj->first[0]->id()) - { - contains_fork = true; - } - - if (node->id() == fj->first.back()->id()) - { - contains_join = true; - } - } - return contains_fork && contains_join; -} - -// returns true if first fork-join is ancestor of second fork-join -bool is_ancestor(const ForkJoin *fj_1, const ForkJoin *fj_2) -{ - // if path is only 2 nodes then it can't contain fork-join without having same fork and join as descendant. - if ((fj_1->first.size() > 2 && is_fork_join_on_path(fj_1->first, fj_2)) || - (fj_1->second.size() > 2 && is_fork_join_on_path(fj_1->second, fj_2))) - { - return true; - } - else - { - return false; - } -} - -// Graph of fork-joins. Each node is one fork-join, and each edge tells which fork-join is contained inside another -// edge goes from child to parent fork-join. -FJGraph::FJGraph(graphlib::Graph *graph) -{ - fork_joins = find_fork_joins(graph); - if (fork_joins.size() == 0) - { - return; - } - // initializing adjacency vector with empty sets. Empty set in vector on index i means that i-th node doesn't - // have any starting from it. later in this constructor we populate adjacency vector. - this->adjacency_vector.resize(fork_joins.size()); - - for (std::size_t i = 0; i < fork_joins.size(); i++) - { - this->fj_ids.push_back(i); - } - - // create edges that connects fj to its ancestor fork-joins. - // ancestor fork-join is the one that contains current fork-join one of its paths. we check this by asking if both - // fork and join of current fj is contained on either of two paths on possible ancestor fork-join. special case is - // when two fork-joins share both fork and join node. Then, no one is ancestor so we don't have connection between - // nodes. 
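    // For example, if one fork-join's long path runs A -> B -> E -> C -> ... -> join and a
    // smaller diamond forks at B and rejoins at C, then both B (its fork) and C (its join) lie
    // on the larger fork-join's path, is_ancestor() reports the larger one as the ancestor, and
    // an edge is added from the inner (descendant) fork-join to the outer (ancestor) one.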
- for (std::size_t i = 0; i < fork_joins.size() - 1; i++) - { - for (std::size_t j = i + 1; j < fork_joins.size(); j++) - { - const ForkJoin *fj_1 = &fork_joins[i]; - const ForkJoin *fj_2 = &fork_joins[j]; - if (fj_1->first[0] == fj_2->first[0] && fj_1->first.back() == fj_2->first.back()) - { - // if fork and join from two fork-joins are the same, then this is the special case where we can't say which - // fork-join is ancestor and which descendant. - continue; - } - if (is_ancestor(fj_1, fj_2)) - { - // fj_1 is ancestor to fj_2 so we need edge from fj_2 to fj_1 - this->add_edge(j, i); - } - else - { - if (is_ancestor(fj_2, fj_1)) - { - // fj_2 is ancestor to fj_1 so we need edge from fj_1 to fj_2 - this->add_edge(i, j); - } - } - } - } - - this->topological_sort(); - this->create_parents_map(); -} - -// adds edge from src to dest -void FJGraph::add_edge(std::uint32_t src, std::uint32_t dest) { adjacency_vector[src].insert(dest); } - -// Topologically sorts fork-join graph. In the end we get an array sorted from the innermost fork-join to the outermost. -void FJGraph::topological_sort() -{ - // initialize cnt of visited nodes to 0 - std::uint32_t cnt_visited_nodes = 0; - std::queue nodes_to_visit; - - // initialize the vector of number of incoming edges to zeros. - // for each node in the graph we want to know how many incoming edges it has. - // a node that has 0 incoming edges should be the first one in topological order (it does not depend on any other node). - std::vector num_incomming_edges(fj_ids.size(), 0); - for (std::uint32_t src_fj_id : fj_ids) - { - for (auto dest_fj_id : adjacency_vector[src_fj_id]) - { - num_incomming_edges[dest_fj_id]++; - } - } - - // add all vertices that have num_incomming_edges 0 to the queue nodes_to_visit - - for (std::uint32_t fj_id : fj_ids) - { - if (num_incomming_edges[fj_id] == 0) - { - // add fj_id into the queue nodes_to_visit - nodes_to_visit.push(fj_id); - } - } - - while (nodes_to_visit.size() != 0) - { - // take one element from the beginning and pop it - std::uint32_t current_fj_id = nodes_to_visit.front(); - nodes_to_visit.pop(); - cnt_visited_nodes++; - // add that current fj to topologically sorted vector. - topo_sort_fjs.push_back(&fork_joins[current_fj_id]); - topo_sort_fj_indices.push_back(current_fj_id); - - // decrease num_incomming_edges for all neighbouring nodes of current_fj_id - for (auto dest_fj_id : adjacency_vector[current_fj_id]) - { - TT_ASSERT(num_incomming_edges[dest_fj_id] > 0, " It is expected that num_incomming_edges is greater than zero, but it is not"); - num_incomming_edges[dest_fj_id]--; - // if num_incomming_edges of dest_fj_id is reduced to 0, emplace dest_fj_id to nodes_to_visit - if (num_incomming_edges[dest_fj_id] == 0) - { - nodes_to_visit.push(dest_fj_id); - } - } - } - TT_ASSERT(cnt_visited_nodes == fj_ids.size(), "Number of visited nodes is not equal to number of nodes -> topological sort is not possible for the given graph."); -} - -// Fork-join FJ_1 is the parent of FJ_2 if FJ_1 is the innermost fj that contains FJ_2. We need a map that tells us who the parent is for each fork-join. -// This method creates a map of node -> sorted_fj_ind, to track the first ForkJoin index that contains a node, not including the innermost fork -// join that contains the node. This can be called the parent fork-join. We need this structure to handle skipping already -// buffered fork-joins effectively. The outermost fork-join in the graph won't have a parent fork-join. 
-void FJGraph::create_parents_map() -{ - for (std::size_t i = 0; i < topo_sort_fj_indices.size(); i++) - { - std::size_t fj_child_id = topo_sort_fj_indices[i]; - - // By default, fork join is a parent to itself. - parent_fj_map[topo_sort_fjs[i]] = topo_sort_fjs[i]; - - // for current fork-join parent will be on the right in the array of topo_sort_fj_indices - // and will also have edge from fj_id to parent_id in adjacency_vector - for (std::size_t j = i + 1; j < topo_sort_fj_indices.size(); j++) - { - std::size_t fj_parent_id = topo_sort_fj_indices[j]; - // j is parent to i if adjacent matrix contains edge from fj_child_id to fj_parent_id - if (adjacency_vector[fj_child_id].find(fj_parent_id) != adjacency_vector[fj_child_id].end()) - { - parent_fj_map[topo_sort_fjs[i]] = topo_sort_fjs[j]; - break; - } - } - } -} - -void FJGraph::add_elem_to_buffered_fjs( - NodeId fork_id, FJBufferingInfo fj_buff_info) -{ - if (buffered_fjs.count(fork_id)) - { - buffered_fjs[fork_id].push_back(fj_buff_info); - } - else - { - // if there is no key fork_id in the map yet. - buffered_fjs[fork_id] = - std::vector{fj_buff_info}; - } -} - -void FJGraph::erase_elem_from_buffered_fjs(NodeId fork_id, std::size_t idx) -{ - buffered_fjs[fork_id].erase(buffered_fjs[fork_id].begin() + idx); -} - -/* -Checks if nodes with names this->src this->dest exist. If they do, returns pointers on them. -If one of them doesn't exist in the graph we try to find which node is currently connected to the specified port -on existing node. Therefore, if one of the src, dest exists, we can infer the other one's replacement from PortIds -and still return them. -On the other hand if both src and dest don't exist anymore we return false -*/ -std::pair InsertionInstruction::is_instruction_still_valid(graphlib::Graph *graph) -{ - // In some cases, an op will not exist any more -- the known case is when a balancer exception has caused - // modifications to the graph to be made that might not be needed once NOPs are inserted. - Node *src, *dest; - src = graph->get_node_by_name(this->src, false); - dest = graph->get_node_by_name(this->dest, false); - if (src == nullptr) // graph doesn't have node with name this->src - { - if (this->user_defined) - { - log_error("User constructed Nop Instruction constructed with invalid src-nop: {}", this->src); - } - - if (dest == nullptr) - { - log_debug( - LogGraphCompiler, - "Both {} and {} can't be found, re-lowered graph is different. Skipping nop insertion for the pair", - this->src, - this->dest); - return std::make_pair(src, dest); - } - TT_ASSERT(this->input_id.has_value(), "Nop Instruction missing input_id attribute populated."); - src = recover_missing_src(graph, dest, this->input_id.value()); - } - - else if (dest == nullptr) - { - if (this->user_defined) - { - log_error("User constructed Nop Instruction constructed with invalid dest-nop: {}", this->dest); - } - TT_ASSERT(this->input_id.has_value(), "Nop Instruction missing fork_id attribute populated."); - dest = recover_missing_dest(graph, src, this->fork_id.value()); - } - - this->src = src->name(); - this->dest = dest->name(); - return std::make_pair(src, dest); -} - -/* -Inserts nops between src and dest nodes that are specified in this. 
-*/ -void NopInsertionInstruction::insert(graphlib::Graph *graph) -{ - // some reasonable max after which we'll likely change epochs enough to not overdo it - std::uint32_t max_nops = (std::uint32_t)env_as("PYBUDA_MAX_FORK_NOPS", 2); - - // If this is an user-defined insert instruction (override), don't limit nop count. - if (this->user_defined) - { - max_nops = this->nop_count; - } - - Node *src, *dest; - std::tie(src, dest) = this->is_instruction_still_valid(graph); - // if instruction isn't valid anymore (src or dest is nullptr after calling is_instruction_still_valid) we skip - // adding nop - if (src == nullptr || dest == nullptr) - { - return; - } - - // when multiple buffering nops are needed, this string becomes too long and breaks yaml spec when dumped to netlist - // so if dest name contains buffer_N_src, increment index, and remove buffer_N_src from dest name - auto op_name = [](Node *src, Node *dest, graphlib::Graph* graph) - { - std::uint32_t buffer_index = 0; - auto dest_name = dest->name(); - if (dest->name().find("buffer_") != std::string::npos and dest->name().find(src->name()) != std::string::npos) - { - buffer_index = std::stoi(dest_name.substr(dest_name.find("buffer_") + 7, dest_name.find(src->name()) - dest_name.find("buffer_") - 7)); - std::string remove = "buffer_" + std::to_string(buffer_index) + "_" + src->name() + "_"; - dest_name.erase(dest_name.find(remove), remove.length()); - } - std::string op_name; - do - { - op_name = "buffer_" + std::to_string(buffer_index++) + "_" + src->name() + "_" + dest_name; - } while (graph->has_node_with_name(op_name)); - return op_name; - }; - - if (src->node_type() == graphlib::NodeType::kQueue || dest->node_type() == graphlib::NodeType::kQueue) - { - return; // don't need nop if src or dest are queues - } - Node *original_dest = dest; - // insert min(nop_count,max_nops) nops between src and dest - for (std::size_t nop_index = 0; nop_index < std::min(this->nop_count, max_nops); nop_index++) - { - graphlib::BudaOpNode *buffer_nop = nullptr; - - auto edges = graph->get_edges(src, dest); - for (graphlib::Edge e : edges) - { - if (e.edge_type != graphlib::EdgeType::kData) - { - continue; - } - - if (buffer_nop == nullptr) - { - // create new nop BudaOpNode - buffer_nop = graph->add_node( - graphlib::create_node(op_name(src, original_dest, graph), "nop"), - graph->get_subgraph_id_for_node(src->id())); - buffer_nop->set_shape(src->shape()); - buffer_nop->set_buffering_op(true); - buffer_nop->tag("mergeable", this->mergeable); - } - - // insert new node on edge - auto [edge0, edge1] = graphlib::insert_node_on_edge(graph, e, buffer_nop, false /* inherit_consumer_attrs */); - log_trace( - LogGraphCompiler, - "Inserted buffer nop node {} between {} and {}", - buffer_nop->name(), - src->name(), - dest->name()); - - // Move TMs to edge1 - auto &tms = graph->get_edge_attributes(edge0)->get_tms(); - if (not this->hoist_tms) - { - // not hoisting tms, move them to edge1 - graph->get_edge_attributes(edge1)->set_tms(tms); - graph->get_edge_attributes(edge0)->set_tms(std::vector{}); - } - dest = buffer_nop; - } - } - // sometimes we want to connect one src to pultiple consumers but adding one nop between them. - // This is done buy adding separate nops between every pair of src - dest and then calling this - // method merge_tagged_nops_with_same_src to merge tagged nops with same source. 
- // Tag is "mergeable" and if it is true than nop can be merged - if (this->request_merge) - { - merge_tagged_nops_with_same_src(graph, this->daisy_chain); - } -} - -/* -Inserts buffering queue node between src and dest nodes that are specified in this. -*/ -void QueueInsertionInstruction::insert(graphlib::Graph *graph) -{ - Node *src, *dest; - std::tie(src, dest) = this->is_instruction_still_valid(graph); - // if instruction isn't valid anymore (src or dest is nullptr after calling is_instruction_still_valid) we skip - // adding nop - if (src == nullptr || dest == nullptr) - { - return; - } - if (src->node_type() == tt::graphlib::kQueue && src->as()->is_buffering()) - { - // if src is BufferingQueueNode, then just ensure that we have enough buffering by choosing maximum - // number of entries - if (src->as()->get_num_entries() < this->num_entries) - src->as()->set_num_entries(this->num_entries); - return; - } - if (dest->node_type() == tt::graphlib::kQueue && dest->as()->is_buffering()) - { - // if dest is BufferingQueueNode, then just ensure that we have enough buffering by choosing maximum - // number of entries - if (dest->as()->get_num_entries() < this->num_entries) - dest->as()->set_num_entries(this->num_entries); - return; - } - if (src->node_type() != graphlib::NodeType::kBudaOp || dest->node_type() != graphlib::NodeType::kBudaOp) - { - // can put BufferingQueueNode between nodes only if they both are kBudaOp - return; - } - - // there has to be an edge between src and dest in order to add queue between them. - bool has_edge_between_src_dest = false; - for (Edge e : graph->user_data_edges(src)) - { - if (e.consumer_node_id == dest->id()) - { - has_edge_between_src_dest = true; - } - } - - if (has_edge_between_src_dest == false) - return; - - // if there is less than 2 user data edges from src than certainly there is no fork anymore. - if (graph->user_data_edges(src).size() < 2) - return; - - // currently we skip adding queue between recompute nodes because fork join paths can reconnect after adding the - // queue - if (is_recompute(graph, src) && is_recompute(graph, dest)) - return; - - auto [edge0, queue_node, edge1] = insert_serialized_dram_queue_between_ops( - graph, src->name(), dest->name(), this->input_id.value(), this->num_entries); - - log_trace( - LogGraphCompiler, - "Inserted buffer queue node {} between {} and {}", - queue_node->name(), - src->name(), - dest->name()); -} - -// Helper method to retrieve OpModel from either the inline or post-placer OpModelMap. -// -balancer::OpModel &get_op_model(balancer::OpModelMap *op_models_post_placer, balancer::OpModels *op_models, Node *node) -{ - return op_models_post_placer == nullptr ? op_models->at(node) : op_models_post_placer->at(node->name()); -} - -// Calculates next output multiplier from input_multiplier. Tracks how tensor is expanded/contracted from input to -// output of an op. -// next output multiplier increases when tensor volume decreases form input to output. 
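// For example, with input_multiplier = 1.0, an input slice of rt * ct = 4 * 4 = 16 tiles and an
// output slice of 2 * 2 = 4 tiles (per t), the returned multiplier is 1.0 * 16 / 4 = 4.0: from
// this op onward, one buffered tile corresponds to four tiles of data at the fork's granularity,
// so tile counts further down the path are scaled by 4 when they are accumulated into the total
// buffering.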
-float get_output_multiplier( - Node *node, float input_multiplier, const balancer::OpModel &op_model, tt::graphlib::PortId input_port_id) -{ - tt::balancer::TensorShape in_shape = op_model.op_shape.inputs[input_port_id]; - tt::balancer::TensorShape out_shape = op_model.op_shape.outputs[0]; - // output shape is multiplied with t streaming so its rt and ct represent whole tensor - // rather than slices, so we have to manually divide it - float out_shape_volume = out_shape.ct * out_shape.rt / (float)(op_model.t_stream_factor.t()); - float input_shape_volume = in_shape.ct * in_shape.rt; - // only for sparse matmul in_shape.rt and in_shape.ct calculate in t streaming. That means that - // in_shape.ct * in_shape.rt represent volume of the whole tensor rather than only one slice (like - // in the rest of the ops). we are interested in size of the slice that is calculated in one entry, - // so in this case we have to divide with t stream factor - if (node->as()->is_sparse_matmul()) - { - input_shape_volume /= (float)(op_model.t_stream_factor.t()); - } - return input_multiplier * input_shape_volume / out_shape_volume ; -} - -// Calculates stack factor between node and consumer node based on op_models, more specifically op shape z dimensions. -float get_stack_factor( - Graph *graph, - balancer::OpModelMap *op_models_post_placer, - balancer::OpModels *op_models, - Node *node, - Node *consumer_node) -{ - // stack factor is calculated as ratio of output shape z dimension of producer, and input shape z dimension of - // consumer. - float stack_factor = 1; - // if consumer node is not nullptr and node type is BudaOpNode - if (consumer_node != nullptr && consumer_node->node_type() == graphlib::NodeType::kBudaOp) - { - const balancer::OpModel &op_model = get_op_model(op_models_post_placer, op_models, node); - const balancer::OpModel &consumer_op_model = get_op_model(op_models_post_placer, op_models, consumer_node); - std::function edge_filter = [consumer_node](Edge edge) - { return edge.consumer_node_id == consumer_node->id(); }; - - std::vector user_edges = graph->user_data_edges(node, edge_filter); - // If we have multiple edges (user_edges.size() > 1) from node to consumer_node, we find the one with the - // smallest z dim on input op shape, because that will produce highest stack_factor - we want to buffer the - // worst case scenario. 
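        // As a concrete example: if the producer op model's output z is 8 and the smallest input
        // z among the edges into consumer_node is 2 (e.g. a stack TM folds four t-slices into
        // one), then stack_factor = 8 / 2 = 4, meaning the consumer needs four producer slices
        // before it can make forward progress, and the required buffering computed by the caller
        // is scaled up accordingly.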
- TT_ASSERT(user_edges.size() >= 1, "Expected to have edge between node and consumer_node"); - int min_consumer_z_dim = INT_MAX; - int consumer_z_dim = INT_MAX; - for (Edge e : user_edges) - { - consumer_z_dim = consumer_op_model.op_shape.inputs[e.consumer_input_port_id].z; - if (consumer_z_dim < min_consumer_z_dim) - { - min_consumer_z_dim = consumer_z_dim; - } - } - - TT_ASSERT(min_consumer_z_dim > 0, "input z dim of consumer op model can't be less than 1"); - stack_factor = op_model.get_out_shape().z / (float)(min_consumer_z_dim); - } - return stack_factor; -} - -// compares two fork-joins node by node -bool is_same_fj(const ForkJoin& fj1, const ForkJoin& fj2) -{ - if (fj1.first.size() != fj2.first.size() || fj1.second.size() != fj2.second.size()) - { - return false; - } - - // compare first path - for (std::size_t i = 0; i < fj1.first.size(); i++) - { - if (fj1.first[i]->id() != fj2.first[i]->id()) - { - return false; - } - } - - // compare second path - for (std::size_t i = 0; i < fj1.second.size(); i++) - { - if (fj1.second[i]->id() != fj2.second[i]->id()) - { - return false; - } - } - - return true; -} - -// gets current node (fork) and tries to find if there is fork-join starting at that node (fork) and finishing at join -// that belongs to current fork-join (fj) Also, fork-join which we are trying to find has to be already buffered -// (contained in map fj_graph.buffered_fjs). -FJBufferingInfo FJGraph::find_sub_fork_join_from_node( - const ForkJoin &fj, const std::vector &path, Node *fork) -{ - if (buffered_fjs.count(fork->id())) - { - // there is some already buffered fork-join that shares the fork with current FJ which one of the paths is path - for (auto fj_buff_info : buffered_fjs.at(fork->id())) - { - const ForkJoin *buff_fj = fj_buff_info.fj; - // is parent of buffered_fj the same as fj - if (is_same_fj(*parent_fj_map.at(buff_fj), fj)) - { - // we found the fj that is contained in current fork-join, and previously buffered. - // we have to check if this buffered fork-join (buff_fj) belongs to right path of fj - if (is_fork_join_on_path(path, buff_fj)) - { - // we will use its required and available buffering. - return fj_buff_info; - break; - } - } - } - } - return FJBufferingInfo(nullptr, 0, 0, nullptr); -} - - -void FJGraph::update_buffered_fj_map(const ForkJoin& fj, FJBufferingInfo fj_buff_info) -{ - // ForkJoinId fj_key = std::make_pair(fj.second[0]->id(),fj.second.back()->id()); - NodeId fork_id = fj.second[0]->id(); - if (buffered_fjs.count(fork_id) == 0) - { - // if there is no already buffered fork-joins with same fork, just add current fj to buffered_fjs - this->add_elem_to_buffered_fjs(fork_id, fj_buff_info); - } - else - { - // there are buffered fork-joins with same fork as current fj. We want to delete buffered fork-joins that have same fork - // as current fj if that fork-join is their parent. - std::vector already_buff_fj_info = - buffered_fjs.at(fork_id); - std::vector indices_to_delete; - for (std::size_t i = 0; i < already_buff_fj_info.size(); i++) - { - FJBufferingInfo value = already_buff_fj_info[i]; - const ForkJoin *buff_fj = value.fj; - - // check if buff_fj has parent fork-join in map parent_fj_map - if (parent_fj_map.count(buff_fj)) - { - const ForkJoin *parent_fj = parent_fj_map.at(buff_fj); - if (is_same_fj(*parent_fj, fj)) - { - indices_to_delete.push_back(i); - continue; - } - } - else - { - TT_ASSERT( - parent_fj_map.count(buff_fj), - "It is expected that each join in graph has parebt-fork-join. 
If a fork-join is not contained in " - "other fj, then that fork-join is parent fork-join to itself."); - } - } - - this->add_elem_to_buffered_fjs(fork_id, fj_buff_info); - // delete all fork-joins that share fork with current fj and are contained in current fj, because we only need - // info on most outer fj that is buffered. It is important that indices_to_delete is sorted in descending order. - // Then, each index that we delete won't influence index values of next elements we need to delete. - sort(indices_to_delete.begin(), indices_to_delete.end(), [](int a, int b) { return a > b; }); - for (std::size_t idx : indices_to_delete) - { - this->erase_elem_from_buffered_fjs(fork_id, idx); - } - } -} - -// if available is set, then we're looking for available buffering in the path... if not, then -// we're looking for required buffering -std::tuple get_buffering( - Graph *graph, - balancer::OpModelMap *op_models_post_placer, - balancer::OpModels *op_models, - const std::vector &path, - const ForkJoin &fj, - bool available, - FJGraph& fj_graph) -{ - float current_output_multiplier = 1.0; // keep track of expansions and reductions of fork outputs - Node *prev_node = path[0]; - int sum_buff_queue_num_entries = 0; - - bool dump_debug_info = false; - std::stringstream debug_info; - - if (env_as("PYBUDA_FORK_JOIN_DEBUG_INFO")) - { - std::string fork_name = env_as("PYBUDA_FORK_JOIN_DEBUG_FORK_NAME"); - std::string join_name = env_as("PYBUDA_FORK_JOIN_DEBUG_JOIN_NAME"); - - // If user has provided fork/join node names, apply them to filter unneeded debug info. - dump_debug_info = (fork_name.empty() || fork_name == path[0]->name()); - dump_debug_info &= (join_name.empty() || join_name == path.back()->name()); - } - - std::uint32_t total_buffering = 0; - bool has_queue_on_path = false; - - auto [join, req, avail, sub_fj] = fj_graph.find_sub_fork_join_from_node(fj, path, path[0]); - - if (join != nullptr) - { - // we found sub fork-join from node path[0] - total_buffering += available ? avail : req; - } - - for (std::uint32_t path_index = 1; path_index < path.size(); path_index++) - { - Node *node = path[path_index]; - // if current node is join that means that we already included its input buffering in total_buffering - // calculation independent of wether it is available or required buffering. - bool curr_node_is_join = (join == node && node != nullptr); - Node *consumer_node = nullptr; - - std::stringstream node_debug_info; - - if (path_index < path.size() - 1) - { - consumer_node = path[path_index + 1]; - - if (consumer_node->node_type() != graphlib::NodeType::kBudaOp) - { - // currently if consumer op is not Buda Op we leave consummer_node to nullptr - // that causes next_stack_factor to be 1, which is what we want. 
- consumer_node = nullptr; - } - } - - if (node->node_type() == graphlib::kQueue) - { - TT_ASSERT( - node->as()->is_buffering(), - "Buffering queues are the only type of queues that are tolerated in fork-joins"); - has_queue_on_path = true; - sum_buff_queue_num_entries += node->as()->get_num_entries(); - continue; - } - - const balancer::OpModel &op_model = get_op_model(op_models_post_placer, op_models, node); - - if (dump_debug_info) - { - node_debug_info << "node name: " << node->name() << std::endl; - node_debug_info << "op type: " << node->get_type() << std::endl; - } - - std::uint32_t input_buffering = 0; // number of tiles - float next_output_multiplier = 1.0; - bool is_join = (path_index == path.size() - 1); - // if node is buffering op, it should not influence required buffering of the path, since buffering op - // is just used for storing the data - if (node->as()->is_buffering_op() && !available) - { - prev_node = node; - continue; - } - - for (Edge e : graph->user_data_edges(prev_node)) - { - if (e.consumer_node_id != node->id()) - continue; - - auto tms = graph->get_edge_attributes(e)->get_tms(); - - float input_multiplier = current_output_multiplier; - int broadcast_factor = 0; - for (auto tm : tms) - { - // std::cout << "tm: " << tm.op << std::endl; - // if ((tm.op == "vstack") || (tm.op == "hstack")) - // input_multiplier *= (float)std::get(tm.attr[0]); - // else if ((tm.op == "vslice") || (tm.op == "hslice")) - // input_multiplier /= (float)std::get(tm.attr[0]); - if (tm.op == "broadcast") - { - broadcast_factor = std::get(tm.attr[1]); - input_multiplier /= (float)broadcast_factor; - } - } - - tt::balancer::TensorShape in_shape = op_model.op_shape.inputs[e.consumer_input_port_id]; - std::uint32_t in_tiles = 0; - - if (available) - { - // It's simply input and output buffers together, unless it's the join op, for which only input buffer - // counts - in_tiles = op_model.input_buffers.at(e.consumer_input_port_id).l1_size_tiles; - in_tiles *= op_model.grid_shape.volume(); - } - else - { - // Figure out how much is required for full fwd progress - if (node->as()->op_type().op == "fused_op") - { - // TODO: we can't tell how much of the input fused op needs to produce output, without analyzing and - // tracing the fused op For now, we'll estimate by just using the input size. - - // Fused op can produce output from just one slice (in_shape.ct * in_shape.rt). - // However that is not enough if the next op requires more slices to start calculating - // (has some stack on next edge) - // next_stack_factor handles this. - float next_stack_factor = - get_stack_factor(graph, op_models_post_placer, op_models, node, consumer_node); - // int next_stack_factor = op_model.output_buffers[0].buffer_factor; - // If stack_factor is < 1 that means that we have slice on next edge, which doesn't influence - // current op required buffering estimation. - next_stack_factor = std::max((float)(1), next_stack_factor); - - // Fused op can produce output from just one slice (in_shape.ct * in_shape.rt). - // However that is not enough if the previous op produces output with smaller z dimension. That - // means even if current op can move forward only with one slice, it will have to wait the previous - // op to form complete output to get one slice. Therefore required buffering doesn't depend only on - // slice size but on slice factor on previous edge. We have to take into consideration - // prev_slice_factor on previous edge. 
- const balancer::OpModel &previous_op_model = - get_op_model(op_models_post_placer, op_models, prev_node); - float prev_slice_factor = 1; - if (op_model.op_shape.inputs[e.consumer_input_port_id].volume_in_tiles() == previous_op_model.op_shape.outputs[0].volume_in_tiles()) - { - // if volumes are the same, then we can compare z dimensions to infer if it was slicing between prev_node and node. - // if volume has changed from output of prev_node to input of node, we had some broadcast - prev_slice_factor = op_model.op_shape.inputs[e.consumer_input_port_id].z / - (float)(previous_op_model.get_out_shape().z); - } - - // If prev_slice_factor is < 1 that means that we have stack onprevious edge, which doesn't - // influence current op required buffering estimation. - prev_slice_factor = std::max((float)(1), prev_slice_factor); - in_tiles = ceil(in_shape.ct * in_shape.rt * std::max(prev_slice_factor, next_stack_factor) * 2); - } - else if (is_join) - { - // We just need to fill the input buffer to make progress, not actually produce a full output - in_tiles = op_model.input_buffers.at(e.consumer_input_port_id).block_shape.volume_no_t() * 2; // 2 because of double buffering - in_tiles *= op_model.grid_shape.volume(); - } - else if (node->as()->op_type().op == "matmul") - { - // Matmul can produce output from just one slice (in_shape.ct * in_shape.rt). - // However that is not enough if the next op requires more slices to start calculating - // (has some stack on next edge) - // next_stack_factor handles this. - float next_stack_factor = - get_stack_factor(graph, op_models_post_placer, op_models, node, consumer_node); - // int next_stack_factor = op_model.output_buffers[0].buffer_factor; - // If stack_factor is < 1 that means that we have slice on next edge, which doesn't influence - // current op required buffering estimation. - next_stack_factor = std::max((float)(1), next_stack_factor); - - // Matmul can produce output from just one slice (in_shape.ct * in_shape.rt). - // However that is not enough if the previous op produces output with smaller z dimension. That - // means even if current op can move forward only with one slice, it will have to wait the previous - // op to form complete output to get one slice. Therefore required buffering doesn't depend only on - // slice size but on slice factor on previous edge. We have to take into consideration - // prev_slice_factor on previous edge. - const balancer::OpModel &previous_op_model = - get_op_model(op_models_post_placer, op_models, prev_node); - float prev_slice_factor = 1; - if (op_model.op_shape.inputs[e.consumer_input_port_id].volume_in_tiles() == previous_op_model.op_shape.outputs[0].volume_in_tiles()) - { - // if volumes are the same, then we can compare z dimensions to infer if it was slicing between prev_node and node. - // if volume has changed from output of prev_node to input of node, we had some broadcast - prev_slice_factor = op_model.op_shape.inputs[e.consumer_input_port_id].z / - (float)(previous_op_model.get_out_shape().z); - } - - // If prev_slice_factor is < 1 that means that we have stack onprevious edge, which doesn't - // influence current op required buffering estimation. - prev_slice_factor = std::max((float)(1), prev_slice_factor); - in_tiles = ceil(in_shape.ct * in_shape.rt * std::max(prev_slice_factor, next_stack_factor) * 2); - // 2 is beacuse of double buffering. - // only for sparse matmul in_shape.rt and in_shape.ct calculate in t streaming. 
That means that - // in_shape.ct * in_shape.rt represent volume of the whole tensor rather than only one slice (like - // in the rest of the ops). we are interested in size of the slice that is calculated in one entry, - // so in this case we have to divide with t stream factor - if (node->as()->is_sparse_matmul()) - { - in_tiles /= op_model.t_stream_factor.t(); - } - } - else - { - in_tiles = op_model.output_buffers.at(0).l1_size_tiles; - in_tiles *= op_model.grid_shape.volume(); - } - } - - std::uint32_t in_req = in_tiles * input_multiplier; - - if (in_req > input_buffering) - { - input_buffering = in_req; // largest edge wins if there's more than one edge - next_output_multiplier = - get_output_multiplier(node, input_multiplier, op_model, e.consumer_input_port_id); - } - - if (dump_debug_info) - { - node_debug_info << '\t' << "input port id: " << e.consumer_input_port_id << std::endl; - if (broadcast_factor) - node_debug_info << '\t' << "input edge has broadcast of factor: " << broadcast_factor << std::endl; - node_debug_info << '\t' << "op grid shape: " << op_model.grid_shape << std::endl; - node_debug_info << '\t' << "input shape: " << in_shape << std::endl; - node_debug_info << '\t' << "input buffer block shape: " << op_model.input_buffers.at(e.consumer_input_port_id).block_shape << std::endl; - node_debug_info << '\t' << "l1 size tiles: " << op_model.input_buffers.at(e.consumer_input_port_id).l1_size_tiles << std::endl; - node_debug_info << '\t' << "in tiles: " << in_tiles << std::endl; - node_debug_info << '\t' << "input multiplier: " << input_multiplier << std::endl; - node_debug_info << '\t' << "total in: " << in_req << std::endl; - node_debug_info << '\t' << "next output multiplier: " << get_output_multiplier(node, input_multiplier, op_model, e.consumer_input_port_id) << std::endl; - - node_debug_info << std::endl; - } - } - - current_output_multiplier = next_output_multiplier; - std::uint32_t output_buffering = - (path_index < path.size() - 1) - ? op_model.output_buffers.at(0).l1_size_tiles * op_model.grid_shape.volume() * current_output_multiplier - : 0; // output buffer on the last one doesn't count - - if (join == nullptr) - { - if (available) - { - total_buffering += input_buffering + output_buffering; - } - else - { - // Somehwat arbitrary - a better algorithm is needed here - // std::string op_type = node->as()->op_type().op; - // if ((op_type == "matmul") || (op_type == "sparse_matmul") || (op_type == "exp")) - total_buffering += input_buffering; // input buffering is the requirement - } - } - if (join == nullptr || join == node) - { - FJBufferingInfo fj_buff_info = fj_graph.find_sub_fork_join_from_node(fj, path, node); - join = fj_buff_info.join; - req = fj_buff_info.req; - avail = fj_buff_info.avail; - if (join != nullptr) - { - // we found sub fork-join from node node - // we uptade total_buffering with avail or req depending on bool available. - // We add scalling factor current_output_multiplier that takes into consideration tensor expansion and - // contraction from the begining of fork-join for which we calculate total buffering. - total_buffering += available ? avail * current_output_multiplier : req * current_output_multiplier; - } - if (curr_node_is_join && available) - { - // we are at the end of fj. 
add output buffer of join node to available buffering - total_buffering += output_buffering; - } - } - - if (dump_debug_info) - { - if (available) - { - node_debug_info << "output_buffering: " << output_buffering << std::endl; - node_debug_info << "total_buffering (for node): " << input_buffering + output_buffering << std::endl; - } - else - { - node_debug_info << "total_buffering (for node): " << input_buffering << std::endl; - } - - node_debug_info << "--------------------------------------------------->" << std::endl; - debug_info << node_debug_info.str(); - } - - prev_node = node; - } - - if (dump_debug_info) - { - debug_info << "Total " << (available ? "available" : "required") << " buffering: " << total_buffering << std::endl; - log_debug(LogGraphCompiler, "Calculating {} buffering between nodes {} and {}\n\n{}", available ? "available" : "required", path[0]->name(), path.back()->name(), debug_info.str()); - } - - return std::make_tuple(total_buffering, has_queue_on_path, sum_buff_queue_num_entries); -} - -std::tuple get_available_buffering( - Graph *graph, - balancer::OpModelMap *op_models_post_placer, - balancer::OpModels *op_models, - const std::vector &path, - const ForkJoin &fj, - FJGraph& fj_graph) -{ - return get_buffering(graph, op_models_post_placer, op_models, path, fj, true, fj_graph); -} - -std::tuple get_buffering_requirement( - Graph *graph, - balancer::OpModelMap *op_models_post_placer, - balancer::OpModels *op_models, - const std::vector &path, - const ForkJoin &fj, - FJGraph& fj_graph) -{ - return get_buffering(graph, op_models_post_placer, op_models, path, fj, false, fj_graph); -} - -/* -Calculates how much dram memory buffering queue nodes consume in bytes. If this number exceeds some threshold -we can stop producing queues and continue only using nops for fork-join buffering -*/ -int buffering_queues_mem_consumption( - const tt::ordered_map, InsInstructionUniqueIdHash> - &instructions) -{ - int buff_queue_memory_consumption = 0; - for (auto instruction : instructions) - { - if (instruction.second->instr_type == InsructionType::QueueInstruction) - { - QueueInsertionInstruction *que_instr = static_cast(instruction.second.get()); - buff_queue_memory_consumption += que_instr->queue_size; - } - } - return buff_queue_memory_consumption; -} -// Inserts new queue instruction to map of instructions. -void insert_queue_ins_to_instructions( - tt::ordered_map, InsInstructionUniqueIdHash> - &instructions, - InsInstructionUniqueId key, - std::shared_ptr new_ins) -{ - TT_ASSERT( - new_ins.get()->instr_type == InsructionType::QueueInstruction, - "Instruction has to be of type InsructionType::QueueInstruction"); - if (instructions.count(key) > 0) - { - if (instructions[key].get()->instr_type == InsructionType::NopInstruction) - { - // if instructions contains element with key equal to key and current instruction is NopInstruction, - // we replace it with queue instruction. This is because if we add queue on the path, we don't need nops on that path. - instructions[key] = new_ins; - } - else if (instructions[key].get()->instr_type == InsructionType::QueueInstruction) - { - // if instructions contains element with key equal to key and current instruction is QueueInstruction, - // we update num entries of the queue to the maximum of the num_entries of two instructions. 
- QueueInsertionInstruction *instr = static_cast(instructions[key].get()); - QueueInsertionInstruction *new_queue_ins = static_cast(new_ins.get()); - if (instr->num_entries < new_queue_ins->num_entries) - { - instr->set_num_entries(new_queue_ins->num_entries); - } - } - else - { - log_error("Unsupported instruction type"); - } - } - else - { - instructions[key] = new_ins; - } -} - -uint32_t expand_output_buffer(balancer::OpModel& op_model, float scale_usable_l1_size, uint32_t usable_l1_size) -{ - if (scale_usable_l1_size * usable_l1_size <= op_model.get_l1_memory_usage()) - { - return 0; - } - - uint32_t added_tiles = 0; - balancer::BufferModel& output_buffer = op_model.output_buffers[0]; - const uint32_t tile_size_bytes = balancer::tile_size_bytes(output_buffer.data_format); - const uint32_t available_space_tiles = (scale_usable_l1_size * usable_l1_size - op_model.get_l1_memory_usage()) / tile_size_bytes; - const uint32_t tiles_per_mb = op_model.block_shape().volume_no_t(); - const uint32_t t_dim = op_model.block_shape().t; - const uint32_t initial_mb = output_buffer.buffer_factor; - - // We want to expand output buffer to fit at most t macro blocks. - const uint32_t mb_limit = std::min((uint32_t)(available_space_tiles + output_buffer.l1_size_tiles) / tiles_per_mb / 2, t_dim); - - if (mb_limit <= 1) - { - // No space to extend the output buffer, since we can't fit more than 1 macro block (double buffered). - return 0; - } - - const auto factors = balancer::FactorizedInt(t_dim); - - // Backend constraint is that the size of the output buffer in macro blocks must be divisible by t (or vice versa). - // Since we will buffer at most t macro blocks (whole output), - // take nearest factor of t less than or equal to the actual limit. - const uint32_t size_in_mb = factors.get_nearest_factor_le(mb_limit); - - output_buffer.buffer_factor = size_in_mb * 2; - output_buffer.l1_size_tiles = tiles_per_mb * output_buffer.buffer_factor; - - added_tiles = (output_buffer.buffer_factor - initial_mb) * tiles_per_mb; - - return added_tiles; -} - - -// This function is attempting to add buffering along a given path in a graph, with the goal of minimizing the number of -// nops that need to be inserted. It does this by iterating over the nodes in the path and attempting to add as much -// buffering as possible at each node, using available memory space and respecting certain environment variables and -// input parameters. If it is not possible to add enough buffering, the function will add nop insertion instructions to -// a provided vector. -void add_buffering_on_path( - const Graph *graph, - const std::vector path, - std::uint32_t long_path_required, - std::uint32_t short_path_available, - tt::ordered_map, InsInstructionUniqueIdHash> - &instructions, - const tt::ordered_map, InsInstructionUniqueIdHash> - &previous_ins_instructions, - balancer::OpModelMap *op_models_post_placer, - balancer::OpModels *op_models, - const std::uint32_t usable_l1_size, - const int fork_join_tiles_treshold, - std::function buffering_factor, - const ForkJoin &fj, - FJGraph& fj_graph) -{ - // Go along the path and try to add buffering as much as it fits - std::uint32_t to_add = long_path_required - short_path_available; - // growth of available buffering is calculated after increasing input buffers and adding nops on path. - // this value reflects how many more tiles on input path will be able to buffer. - std::uint32_t additional_available_buff = 0; - - // If enabled we will add buffering queues instead of NOPs to buffer fork-joins. 
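// A minimal sketch of the output-buffer expansion math above, with invented names
// (nearest_divisor_le stands in for balancer::FactorizedInt::get_nearest_factor_le); it is
// not the actual expand_output_buffer. The limit is the smaller of half the macro blocks
// that fit in L1 and t, rounded down to a divisor of t (backend constraint), then doubled
// for double buffering.
#include <algorithm>
#include <cstdint>

static std::uint32_t nearest_divisor_le(std::uint32_t t, std::uint32_t limit)
{
    for (std::uint32_t d = std::min(t, limit); d >= 1; --d)
        if (t % d == 0)
            return d;
    return 1;
}

std::uint32_t expanded_buffer_factor_sketch(
    std::uint32_t free_l1_tiles,      // tiles of L1 still available on the core
    std::uint32_t current_buf_tiles,  // tiles already held by the output buffer
    std::uint32_t tiles_per_mb,       // tiles in one macro block
    std::uint32_t t_dim)              // t dimension of the output block shape
{
    std::uint32_t mb_limit = std::min((free_l1_tiles + current_buf_tiles) / tiles_per_mb / 2, t_dim);
    if (mb_limit <= 1)
        return 0;  // can't fit more than one double-buffered macro block, don't expand
    return nearest_divisor_le(t_dim, mb_limit) * 2;  // new buffer_factor, double buffered
}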
- const bool add_buffer_queues = env_as("PYBUDA_FORK_JOIN_BUF_QUEUES", 0); - - // currently, we maximize input buffers by default. - const bool maximize_buffers = env_as("PYBUDA_MAX_FORK_JOIN_BUF", 1); - - // If enabled we will expand fork node output buffer. - const bool expand_fork_output_buffer = env_as("PYBUDA_FORK_JOIN_EXPAND_FORK_OUTPUT_BUF", 1); - - // If enabled we will expand output buffers (instead of input buffers) of the nodes on the path. - const bool expand_output_buffers = env_as("PYBUDA_FORK_JOIN_EXPAND_OUTPUT_BUFFERS", 0); - - // If enabled we skip expanding buffers for regular ops, to force adding NOPs. - const bool skip_expanding_buffers = env_as("PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"); - - const int max_queue_mem = - 1024 * 1024 * 1024; // 1GB, this is ad hoc limit for maximum memory buffering queues can consume on one chip. - const float scale_usable_l1_size = 0.95; - - Node *prev_node = nullptr; - // current_output_multiplier keeps track of expansions and reductions of fork outputs - float current_output_multiplier = 1.0; - Node *join = nullptr; - - for (Node *node : path) - { - bool curr_node_is_join = (join == node && node != nullptr); - - // join != nullptr is if node is inside of an inner fork-join. join is then pointing to the join of the inner - // fork-join so we know on which node this inner fork-join is finishing: node == join) - bool outside_fj = (join == nullptr && !curr_node_is_join); - if (join == nullptr || join == node) - { - FJBufferingInfo fj_buff_info = fj_graph.find_sub_fork_join_from_node(fj, path, node); - join = fj_buff_info.join; - } - - if (prev_node == nullptr) - { - prev_node = node; - - balancer::OpModel& op_model = get_op_model(op_models_post_placer, op_models, node); - if (expand_fork_output_buffer && to_add > (uint32_t)op_model.block_shape().volume_no_t()) - { - // In cases when we need to buffer more than macro block size of tiles, we may end up - // in a situation where just expanding input buffers on the short path won't be - // sufficient - due to the fact that backend cannot generate pipes/streams to utilize - // 100% of the allocated input buffers. - // - // To workaround this limitation, we need to additionally expand output buffer of the fork node. - uint32_t added_tiles = expand_output_buffer(op_model, scale_usable_l1_size, usable_l1_size); - - if (added_tiles > 0) - { - log_debug(LogGraphCompiler, "Expanded fork node ({}) output buffers to a total of {} macro blocks.", node->name(), op_model.output_buffers[0].buffer_factor); - } - } - - continue; - } - - // If PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS is enabled we don't want to expand buffers for regular ops. - // This will cause the algorithm to resort to adding nops/buffers to buffer the path. - // NOTE: Here we don't skip buffering ops (nops), because after we add nops, this code will be executed again to expand their input buffers. - if (skip_expanding_buffers && !node->as()->is_buffering_op()) - { - prev_node = node; - continue; - } - - balancer::OpModel &op_model = get_op_model(op_models_post_placer, op_models, node); - uint32_t output_buffer_tiles_added = 0; - - // If enabled, expand output buffer - except for join node since that doesn't help with fork-join buffering. 
- if (expand_output_buffers && path.back() != node) - { - output_buffer_tiles_added = expand_output_buffer(op_model, scale_usable_l1_size, usable_l1_size); - } - - float next_output_multiplier = 1.0; - for (Edge e : graph->get_edges(prev_node, node)) - { - if (e.edge_type == graphlib::EdgeType::kData) - { - // it turns out that sometimes pipegen requires more l1 space than what we model. - // that is why we won't use full usable l1 space in nops, but scale it with scale_usable_l1_size - // Calculate available size in tiles - std::uint32_t available_space = 0; - if (scale_usable_l1_size * usable_l1_size > op_model.get_l1_memory_usage()) - { - available_space = - (scale_usable_l1_size * usable_l1_size - op_model.get_l1_memory_usage()) / - balancer::tile_size_bytes(op_model.input_buffers.at(e.consumer_input_port_id).data_format); - } - else - { - // if available_space is 0 then we skip to the next op in path. - continue; - } - std::uint32_t grid_size = op_model.grid_shape.volume(); - available_space *= grid_size; - - std::uint32_t add_amount = 0; - std::uint32_t effective_add_amount = 0; - auto tms = graph->get_edge_attributes(e)->get_tms(); - float input_multiplier = current_output_multiplier; - for (auto tm : tms) - { - // broadcast changes the volume of the complete tensor. Now if we have some number of tiles free in - // L1 to buffer the data, these tiles will effectively buffer less if the size of the tensor - // increased. It is not the same if you can buffer 100 tiles when one output is 10 tiles as when - // that output passes though the broadcast (with expansion 10) and effectively becomes 10 times - // bigger. Then, you can only buffer one tensor with 100 tiles. So we can't blindly add up free L1 - // space throughout the buffering path without weighting with respect to tensor volume change. This - // is because we want to calculate how much tiles can we buffer with respect to the begining of the - // path. - if (tm.op == "broadcast") - input_multiplier /= (float)std::get(tm.attr[1]); - } - std::uint32_t effective_available_space = available_space * input_multiplier; - next_output_multiplier = get_output_multiplier(node, input_multiplier, op_model, e.consumer_input_port_id); - - // Only expand input buffers if expanding output buffers is disabled. - if (!expand_output_buffers) - { - if (maximize_buffers) - { - // Take up all available space - effective_add_amount = effective_available_space / grid_size; - add_amount = available_space / grid_size; - } - else - { - effective_add_amount = ceil((float)std::min(to_add, effective_available_space) / (float)grid_size); - add_amount = effective_add_amount / input_multiplier; - } - add_amount -= - add_amount % (op_model.input_buffers.at(e.consumer_input_port_id).block_shape.volume_no_t()); - - } - - effective_add_amount = add_amount * input_multiplier + output_buffer_tiles_added * next_output_multiplier; - if (add_amount > 0 && outside_fj) - { - // we only want to increase input l1 buffers to nodes that are not part of already buffered inner - // fork-joins thus outside_fj tells if current node is outside of already buffered sub fjs. - op_model.input_buffers.at(e.consumer_input_port_id).l1_size_tiles += add_amount; - op_model.input_buffers.at(e.consumer_input_port_id).size_tiles_override = true; - } - - // add_amount is the additional amount of l1 space (in tiles) that we allocated for fork join buffering. - // Due to contraction and expansion of tensor troughout the path, add_amount has to be scaled with - // input_multiplier factor. 
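// A small numeric sketch of the space calculation above; all values are made up and the
// program is only illustrative. Free L1 space is converted to tiles, multiplied by grid
// size, and then deflated by the broadcast factor, since the broadcast enlarges the tensor
// and the same free tiles buffer proportionally less of the fork's output.
#include <cstdint>
#include <cstdio>

int main()
{
    const float scale_usable_l1_size = 0.95f;           // safety margin vs. real pipegen usage
    const std::uint32_t usable_l1_size = 1024 * 1024;   // bytes per core (hypothetical)
    const std::uint32_t l1_usage = 600 * 1024;          // bytes already used by the op model
    const std::uint32_t tile_size = 2048;               // bytes per tile for this data format
    const std::uint32_t grid_volume = 4;                // cores the op runs on
    const float broadcast_factor = 10.0f;               // broadcast TM on the incoming edge

    std::uint32_t available_tiles =
        (std::uint32_t)((scale_usable_l1_size * usable_l1_size - l1_usage) / tile_size) * grid_volume;
    float input_multiplier = 1.0f / broadcast_factor;
    std::uint32_t effective_tiles = (std::uint32_t)(available_tiles * input_multiplier);

    std::printf("raw available: %u tiles, effective after broadcast: %u tiles\n",
                available_tiles, effective_tiles);
    return 0;
}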
input multiplier because we add this add_amount of tiles to input buffers of - // the op. If we added it to output buffers we would use next_output_multiplier for scalling. Note that - // we still use up add_amount space in l1, but effectively for buffering it decreases to_add for - // add_amount * input_multiplier tiles. - - // for one core. we have to multiply it with grid size. - // Prevent underflow. - if (outside_fj) - { - if (effective_add_amount * grid_size > to_add) - { - additional_available_buff += to_add; - to_add = 0; - } - else - { - additional_available_buff += effective_add_amount * grid_size; - to_add -= effective_add_amount * grid_size; - } - } - log_trace( - LogGraphCompiler, - "Available l1 size for {}: {}, grid_size={}, add_amount={}, remaining to_add={}", - node->name(), - available_space, - grid_size, - add_amount, - to_add); - } - - if ((to_add == 0) && !maximize_buffers) - break; - } - current_output_multiplier = next_output_multiplier; - if ((to_add == 0) && !maximize_buffers) - break; - - prev_node = node; - } - - // Optionally always add a nop on short path that is a direct connection, as a workaround for the 2K tile limit - bool always_add = ((path.size() == 2) && env_as("PYBUDA_NOP_ON_DIRECT_SHORT_PATH")); - - if (always_add || (to_add > 0)) - { - log_debug( - LogGraphCompiler, "Fork join long path requires additional buffering of shorter path {} tiles", to_add); - // insert NOPs or queue if number of tiles exceeds threshold - balancer::OpModel &op_model = get_op_model(op_models_post_placer, op_models, path[0]); - - std::uint32_t tile_size = balancer::tile_size_bytes(op_model.output_buffers.at(0).data_format); - - int buff_mem_consumption = buffering_queues_mem_consumption(instructions) + - buffering_queues_mem_consumption(previous_ins_instructions); - Node *src = path[0]; - std::vector dests; - // currently if src is recompute, we skip adding queue - // because of possible graph change (reconnecting consumers from recompute node) - // that results in hang. - bool src_is_recompute = is_recompute(graph, src); - - // if there is sub fork-join from fork of current fj (fj) on path, we have to add nop effectivaly before sub - // fork-join. we do that by adding instructions for mergeable nops on both paths of sub fork-join. Nops with - // mergeable tag will be merged in one nop if they have same source (in method merge_tagged_nops_with_same_src) - auto [join, req, avail, sub_fj] = fj_graph.find_sub_fork_join_from_node(fj, path, src); - if (join != nullptr) - { - dests.push_back(sub_fj->first[1]); - dests.push_back(sub_fj->second[1]); - } - else - { - dests.push_back(path[1]); - } - bool merge_nops = dests.size() > 1; - - if (add_buffer_queues && (int)to_add > (int)fork_join_tiles_treshold && buff_mem_consumption < max_queue_mem && - !src_is_recompute) - { - // number of tiles to add (to_add) is greater than threshold config.fork_join_tiles_treshold => use queues - // instead of nops - auto edges = graph->user_data_edges(src); - for (std::uint32_t fork_id = 0; fork_id < edges.size(); fork_id++) - { - for (Node *dest : dests) - { - graphlib::Edge e = edges[fork_id]; - if (e.consumer_node_id == dest->id()) - { - // number if entries in queue is 2 * microbatch_size at maximum. We take the minimum of that - // upper limit and estimation we get from padding requirement to_add (which is number of tiles - // we have to padd on path) When one path of fork join is much longer than other this to_add - // becomes large. That would increase number of tensors we have to buffer. 
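// A minimal sketch of the queue-vs-NOP decision discussed above; parameter names are
// invented and this is not the actual condition in the pass. Queues are chosen only when
// the feature is enabled, the tile deficit exceeds the configured threshold, queue DRAM
// consumption is still under the per-chip cap, and the fork source is not recompute.
#include <cstdint>

bool use_buffering_queue_sketch(
    bool add_buffer_queues,        // PYBUDA_FORK_JOIN_BUF_QUEUES enabled
    std::int64_t to_add_tiles,     // remaining tile deficit on the short path
    std::int64_t tiles_threshold,  // fork-join tiles threshold from config
    std::int64_t queue_mem_bytes,  // queue DRAM already committed on this chip
    std::int64_t max_queue_mem,    // per-chip cap, e.g. 1 GB
    bool src_is_recompute)         // recompute sources are skipped (graph-change hazard)
{
    return add_buffer_queues && to_add_tiles > tiles_threshold &&
           queue_mem_bytes < max_queue_mem && !src_is_recompute;
}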
Luckily for us, - // maximum number of tensors that can be inside one queue ,that is inside one epoch, is - // microbatch_size. - int num_entries = std::min( - 2 * graph->get_microbatch(), - (int)ceil((float)to_add / (float)op_model.op_shape.outputs.at(0).volume_in_tiles())); - int queue_size = (int)(ceil(to_add * tile_size)); // in bytes - - // if dests have more than one element that means that I want to add queue with source src but - // more than 1 destination. Even though I make 2 instructions, later there won't be 2 queues but - // one that feeds to 2 consumers if dests.size() is 2 for example. - std::shared_ptr ins = std::make_shared( - src->name() /* src */, - dest->name() /* dest */, - false /* hoist_tms */, - num_entries, - queue_size, - e.consumer_input_port_id /* input_id */, - fork_id /* fork_id */); - InsInstructionUniqueId key = ins->unique_id(); - insert_queue_ins_to_instructions(instructions, key, ins); - } - } - } - } - else - { - // Some heuristic to guess how many nops we need - float nop_buffering = usable_l1_size / (float)tile_size; - // std::cout << "Expect " << nop_buffering << " tiles per nop, need to add " << to_add << std::endl; - /* - add at most a third of nops needed, since disturbance of placement - will shift epochs and we might not need them any more - We don't add all necesarry nops in one step. On the contrary, we add fraction of needed nops in each pass of - pre-placer post-placer loop in compile.py. This is because adding nops can cause current fork-join to span - across two epochs, thus elliminating further need for adding new nops (because e2e queues act as buffers). - */ - float nop_base_buffer = to_add / nop_buffering; - - // In case buffering requirements are abysmal skip buffering. - // - if (nop_base_buffer < 0.1) - return; - - // If we are using inline buffering within epoch we can add all NOPs at once. - // Legacy post placer path is adding one third at a time due to op shifts accross epochs. - // - int buffering_step = op_models_post_placer != nullptr ? 3 : 1; - int buffering_scale = buffering_factor(op_model) * buffering_step; - std::uint32_t nop_count = (uint32_t)std::ceil(to_add / (nop_buffering * buffering_scale)); - - // Check if we are trying to add unreasonable amount of NOPs. - // Currently, unreasonable is defined as "more that can fit on grayskull (10x12 grid)". 
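// A small numeric sketch of the sizing formulas above; every value is made up and the
// program is illustrative only. A queue needs at most 2 * microbatch entries (one epoch's
// worth of tensors), or fewer when the deficit is small; a NOP is assumed to buffer roughly
// usable_l1_size / tile_size tiles, and the NOP count divides the deficit by that, scaled
// by the buffering factor and step.
#include <algorithm>
#include <cmath>
#include <cstdio>

int main()
{
    const int to_add = 4096;                 // tiles still missing on the short path
    const int output_volume_tiles = 256;     // tiles in one output tensor of the fork
    const int microbatch = 64;
    const int tile_size = 2048;              // bytes
    const int usable_l1_size = 1024 * 1024;  // bytes
    const int buffering_scale = 1 * 3;       // buffering_factor * buffering_step (legacy path)

    int num_entries = std::min(2 * microbatch,
                               (int)std::ceil((float)to_add / (float)output_volume_tiles));
    int queue_size = (int)std::ceil((float)to_add * tile_size);   // bytes

    float nop_buffering = usable_l1_size / (float)tile_size;      // tiles one NOP can hold
    int nop_count = (int)std::ceil(to_add / (nop_buffering * buffering_scale));

    std::printf("queue: %d entries, %d bytes; or %d NOPs\n", num_entries, queue_size, nop_count);
    return 0;
}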
- if (nop_count > 120) - { - log_warning(LogGraphCompiler, "Trying to add large number of NOPs for buffering."); - } - - log_trace( - LogGraphCompiler, - "Ask for {} nops from {}, to_add: {}, tile_size: {}", - nop_count, - src->name(), - to_add, - tile_size); - - auto edges = graph->user_data_edges(src); - for (std::uint32_t fork_id = 0; fork_id < edges.size(); fork_id++) - { - for (Node *dest : dests) - { - graphlib::Edge e = edges[fork_id]; - if (e.consumer_node_id == dest->id()) - { - InsInstructionUniqueId key = InsInstructionUniqueId( - src->name(), dest->name(), e.consumer_input_port_id, fork_id, merge_nops); - if (instructions.count(key) > 0) - { - if (NopInsertionInstruction *nop_instr = - dynamic_cast(instructions[key].get())) - { - if (nop_instr->nop_count < nop_count) - { - nop_instr->set_nop_count(nop_count); - // we already added - additional_available_buff += - (nop_count - nop_instr->nop_count) * nop_buffering * buffering_factor(op_model); - } - } - } - else - { - // instruction doesn't exist in map of instructions - additional_available_buff += nop_count * nop_buffering * buffering_factor(op_model); - std::shared_ptr ins = std::make_shared( - src->name() /* src */, - dest->name() /* dest */, - false /* hoist_tms */, - nop_count /* nop_count */, - e.consumer_input_port_id /* input_id */, - fork_id /* fork_id */, - merge_nops); - instructions[key] = ins; - } - } - } - } - - fj_graph.add_nop_buffered_fj(&fj); - } - } - - FJBufferingInfo fj_buff_info = - FJBufferingInfo(fj.second.back(), long_path_required, short_path_available + additional_available_buff, &fj); - fj_graph.update_buffered_fj_map(fj, fj_buff_info); -} - -/* -Returns std::tuple -First variable in output is true if instructions map is true subset of previous_instructions map. This means that for -each key in instructions there is the key in previous_instructions and values match. Then, second and third variables in -output are 0. -*/ -std::tuple is_subset_of_instructions( - const tt::ordered_map, InsInstructionUniqueIdHash> - &instructions, - const tt::ordered_map, InsInstructionUniqueIdHash> - &previous_instructions) -{ - bool instr_not_updated = true; - int num_new_nops = 0; - int num_new_queues = 0; - for (auto elem : instructions) - { - InsInstructionUniqueId key = elem.first; - InsertionInstruction *instr = elem.second.get(); - if (previous_instructions.count(key) == 0) - { - // if new instructions contain insertion instruction key and previous_instructions don't - if (instr->instr_type == InsructionType::QueueInstruction) - { - num_new_queues++; - } - else if (instr->instr_type == InsructionType::NopInstruction) - { - num_new_nops += static_cast(instr)->nop_count; - } - else - { - log_error("Unsupported instruction type"); - } - instr_not_updated = false; - } - else - { - // if previous_instructions contain instruction key and that instruction is NopInstruction - // we stil have to check if nop count is unchanged. 
If nop count is changed then we still return false - InsertionInstruction *prev_instr = previous_instructions.at(key).get(); - if (prev_instr->instr_type == InsructionType::NopInstruction) - { - if (instr->instr_type == InsructionType::NopInstruction) - { - int prev_nop_count = static_cast(prev_instr)->nop_count; - int curr_nop_count = static_cast(instr)->nop_count; - if (prev_nop_count != curr_nop_count) - { - instr_not_updated = false; - num_new_nops += (curr_nop_count - prev_nop_count); - } - } - else if (instr->instr_type == InsructionType::QueueInstruction) - { - num_new_queues++; - } - else - { - log_error("Unsupported instruction type"); - } - } - } - } - return std::tuple(instr_not_updated, num_new_nops, num_new_queues); -} - -/* -Makes new tt::ordered map containing instructions and previous_instructions. If both maps contain the same nop -instruction key we want to add nop_counts from previous_instructions to new nop_count. This hapens all the time because -we don't add all necesarry nops in one step. On the contrary, we add fraction of needed nops in each pass of pre-placer -post-placer loop in compile.py . This is because adding nops can cause fork-join to span across two epochs, thus -elliminating further need for adding new nops (because e2e queues act as buffers). -*/ -tt::ordered_map, InsInstructionUniqueIdHash> -append_prev_instr( - tt::ordered_map, InsInstructionUniqueIdHash> - &instructions, - const tt::ordered_map, InsInstructionUniqueIdHash> - previous_instructions) -{ - tt::ordered_map, InsInstructionUniqueIdHash> - combined_instructions = previous_instructions; - for (auto elem : instructions) - { - // Iterate through current instructions - InsInstructionUniqueId key = elem.first; - std::shared_ptr instr = elem.second; - if (combined_instructions.count(key) == 0) - { - // if combined instructions doesn't contain current key - combined_instructions[key] = instr; - } - else - { - // if combined instructions contains current key then we ask if value is instruction of type NopInstruction - // actually there should not be the case where two Queue instructions share the same key in instructions and - // previous_instructions maps because if queue instruction is in previous_instructions map then that - // fork-join is resolved and new instructions won't have that queue. however for future it is better to - // check. - if (instr->instr_type == InsructionType::NopInstruction && - combined_instructions.at(key)->instr_type == InsructionType::NopInstruction) - { - // if we already have instructions for nop insertion on that place, we just update nop_count - NopInsertionInstruction *nop_instr = static_cast(instr.get()); - NopInsertionInstruction *instr_to_modify = - static_cast(combined_instructions[key].get()); - instr_to_modify->set_nop_count(instr_to_modify->nop_count + nop_instr->nop_count); - } - } - } - return combined_instructions; -} - -// Creates QueueInsertionInstruction to add queue between first and second node on path "path_to_buffer", and appends -// that instruction to map of current instructions ,"instructions". 
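// A minimal sketch of the merge behaviour of append_prev_instr for NOP instructions, using
// a plain string key as a simplified stand-in for InsInstructionUniqueId; this is not the
// actual implementation. When both maps hold a NOP instruction for the same key, the counts
// are added, because each compile pass only requests a fraction of the NOPs it estimates.
#include <string>
#include <unordered_map>

using NopCounts = std::unordered_map<std::string, int>;

NopCounts append_prev_nops_sketch(const NopCounts &current, const NopCounts &previous)
{
    NopCounts combined = previous;
    for (const auto &[key, count] : current)
    {
        auto it = combined.find(key);
        if (it == combined.end())
            combined[key] = count;   // brand new instruction
        else
            it->second += count;     // same edge already has NOPs: accumulate
    }
    return combined;
}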
Num entries for the queue is "buf_queue_num_entries" -void add_queue_instr_based_on_queues_on_other_path( - Graph *graph, - std::vector path_to_buffer, - int buf_queue_num_entries, - balancer::OpModelMap *op_models_post_placer, - balancer::OpModels *op_models, - tt::ordered_map, InsInstructionUniqueIdHash> - &instructions) -{ - Node *src = path_to_buffer[0]; - Node *dest = path_to_buffer[1]; - - auto edges = graph->user_data_edges(src); - for (std::uint32_t fork_id = 0; fork_id < edges.size(); fork_id++) - { - graphlib::Edge e = edges[fork_id]; - if (e.consumer_node_id == dest->id()) - { - balancer::OpModel &op_model = get_op_model(op_models_post_placer, op_models, path_to_buffer[0]); - std::uint32_t tile_size = balancer::tile_size_bytes(op_model.output_buffers.at(0).data_format); - std::uint32_t queue_size = (std::uint32_t)(ceil( - buf_queue_num_entries * (float)op_model.op_shape.outputs.at(0).volume_in_tiles() * tile_size)); - - std::shared_ptr queue_ins = std::make_shared( - src->name() /* src */, - dest->name() /* dest */, - false /* hoist_tms */, - buf_queue_num_entries, - queue_size, - e.consumer_input_port_id /* input_id */, - fork_id /* fork_id */); - InsInstructionUniqueId key = queue_ins->unique_id(); - insert_queue_ins_to_instructions(instructions, key, queue_ins); - } - } -} - -// Returns a map of pointers to insertion instructions needed for buffering the graph. -tt::ordered_map, InsInstructionUniqueIdHash> -generate_graph_buffering( - Graph *graph, - FJGraph &fj_graph, - balancer::OpModelMap *op_models_post_placer, - balancer::OpModels *op_models, - const std::uint32_t usable_l1_size, - const tt::ordered_map, InsInstructionUniqueIdHash> - previous_ins_instructions, - const int fork_join_tiles_treshold, - std::function buffering_factor) -{ - tt::ordered_map, InsInstructionUniqueIdHash> - instructions; - - std::vector sorted_fork_joins = fj_graph.get_topo_sorted_fjs(); - - bool dump_debug_info = (env_as("PYBUDA_FORK_JOIN_DEBUG_INFO")) ? true : false; - std::stringstream node_debug_info; - - for (std::size_t j = 0; j < sorted_fork_joins.size(); j++) - { - const ForkJoin &fj = *sorted_fork_joins[j]; - if (dump_debug_info) - { - // we log fork-join that is buffered to track order of buffering in graph - node_debug_info << "buffering fork-join: fork node name: " << fj.first[0]->name() - << " join node name: " << fj.first.back()->name() << std::endl; - } - // std::cout << "== FORK JOIN ==" << std::endl; - // print_fork_join(fj); - - // Figure out if buffering is needed. - auto [path0_req, path0_has_buff_queue, path0_buf_queue_num_entries] = get_buffering_requirement( - graph, op_models_post_placer, op_models, fj.first, fj, fj_graph); - auto [path1_req, path1_has_buff_queue, path1_buf_queue_num_entries] = get_buffering_requirement( - graph, op_models_post_placer, op_models, fj.second, fj, fj_graph); - - log_trace(LogGraphCompiler, "path0_req = {}, path1_req = {}", path0_req, path1_req); - - if (path0_has_buff_queue != path1_has_buff_queue ) - { - // one of the paths has buffering queue, and other doesn't. We will add queue to the one that doesn't have - // queue. after that, we don't need buffering of that fork-join, because all buffering queues have - // num_entries equal to microbatch size this guaranties that both paths will be able to buffer all tensors - // that pass through them in one epoch. - // These queue instructions don't conform to maximum queue memory - // consumption threshold (max_queue_mem). 
This threshold is introduced for buffering queues that replace - // nops in buffering fork joins that would require many nops. If all queue instructions exceed - // max_queue_mem, we will still add buffering queues on fork-joins where we have buffering queue in one - // path, but we will stop buffering regular fork-joins with buffering queues, and transfer to nops regardles - // of the path difference - if(path0_has_buff_queue) - { - // path0 has buffering queue and path1 doesn't. we want to add buffering queue to path1 to balance out fork-join - - // create instruction for buffering queue between first and second node of path1 (fj.second), and add it to - // existing instructions. - add_queue_instr_based_on_queues_on_other_path(graph, fj.second, path0_buf_queue_num_entries, op_models_post_placer, op_models, instructions); - } - else - { - // path1 has buffering queue and path0 doesn't. we want to add buffering queue to path0 to balance out fork-join - // create instruction for buffering queue between first and second node of path0 (fj.first), and add it to - // existing instructions. - add_queue_instr_based_on_queues_on_other_path(graph, fj.first, path1_buf_queue_num_entries, op_models_post_placer, op_models, instructions); - } - } - - // if path0_req < path1_req, then path 0 is faster path, and path 1 is slower. - if (path0_req < path1_req && !path0_has_buff_queue && !path1_has_buff_queue) - { - // Path 0 is the short path - std::uint32_t available_bufferings = std::get<0>(get_available_buffering( - graph, op_models_post_placer, op_models, fj.first, fj, fj_graph)); - log_trace(LogGraphCompiler, "path0 available: {}", available_bufferings); - if (path1_req > available_bufferings) - { - add_buffering_on_path( - graph, - fj.first, - path1_req, - available_bufferings, - instructions, - previous_ins_instructions, - op_models_post_placer, - op_models, - usable_l1_size, - fork_join_tiles_treshold, - buffering_factor, - fj, - fj_graph); - } - } - // if path1_req < path0_req, then path 1 is faster path, and path 0 is slower. - else if (path1_req < path0_req && !path1_has_buff_queue && !path0_has_buff_queue) - { - // Path 1 is the short path - std::uint32_t available_bufferings = std::get<0>(get_available_buffering( - graph, op_models_post_placer, op_models, fj.second, fj, fj_graph)); - log_trace(LogGraphCompiler, "path1 available: {}", available_bufferings); - if (path0_req > available_bufferings) - { - add_buffering_on_path( - graph, - fj.second, - path0_req, - available_bufferings, - instructions, - previous_ins_instructions, - op_models_post_placer, - op_models, - usable_l1_size, - fork_join_tiles_treshold, - buffering_factor, - fj, - fj_graph); - } - } - } - - log_debug(LogGraphCompiler, "Buffering sequence of fork-joins: \n{}", node_debug_info.str()); - // log new instructions. 
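// A minimal sketch of the per-fork-join decision made in generate_graph_buffering; the
// callbacks and names are invented stand-ins for the real calls, not the actual code. If
// exactly one path already goes through a buffering queue, the other path gets a queue too;
// otherwise the path with the smaller requirement is the short path and is buffered only if
// the long path's requirement exceeds what the short path can already absorb.
#include <algorithm>
#include <cstdint>
#include <functional>

void balance_fork_join_sketch(
    std::uint32_t path0_req, bool path0_has_queue,
    std::uint32_t path1_req, bool path1_has_queue,
    const std::function<std::uint32_t(int)> &available_on,   // available buffering of a path
    const std::function<void(int, std::uint32_t, std::uint32_t)> &buffer_path,
    const std::function<void(int)> &add_queue)               // mirror a queue onto a path
{
    if (path0_has_queue != path1_has_queue)
    {
        add_queue(path0_has_queue ? 1 : 0);  // balance by adding a queue to the queue-less path
        return;
    }
    if (path0_req == path1_req || path0_has_queue)
        return;  // already balanced, or both paths drain through queues

    int short_path = (path0_req < path1_req) ? 0 : 1;
    std::uint32_t required = std::max(path0_req, path1_req);
    std::uint32_t available = available_on(short_path);
    if (required > available)
        buffer_path(short_path, required, available);
}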
New instructions can be instructions that are completely new, and those that were updated - // for example, we increased nop_count in nop instruction, or num_entries in queue instruction - if (instructions.size() != 0) - { - log_trace(LogGraphCompiler, " new or updated instructions: "); - for (const auto &i : instructions) - { - log_trace(LogGraphCompiler, " - src: {}, dest: {}", i.second->src, i.second->dest); - } - } - // order of the arguments matters since tt::ordered_map keeps track of the adding order - // therefore we have to add previos_ins_instructions before we add instructions - return append_prev_instr(instructions, previous_ins_instructions); -} - -// Generate buffering instructions for fork-join buffering. -// op_models_post_placer is passed in if we are in the post-placer phase - legacy path. -// op_models is passed in balancing phase for "inline" buffering - new path. -// -FJBufferingResult insert_fork_join_buffering( - graphlib::Graph *graph, - balancer::OpModelMap *op_models_post_placer, - balancer::OpModels *op_models, - const std::uint32_t usable_l1_size, - const tt::ordered_map, InsInstructionUniqueIdHash> - &previous_ins_instructions, - const int fork_join_tiles_treshold, - std::function buffering_factor) -{ - // We can't change the graph, only adjust buffer sizes. - // - TT_ASSERT( - (op_models_post_placer or op_models) and !(op_models_post_placer and op_models), - "op_models_post_placer or op_models must be passed in but not both!"); - - // Disable if env variable set - if (env_as("PYBUDA_DISABLE_FORK_JOIN_BUF")) - return {}; - - // - // Find fork-joins - // - FJGraph fj_graph = FJGraph(graph); - if (fj_graph.get_fjs().size() == 0) - return {}; // nothing to do - - // - // Find buffering locations due to mismatched paths, and adjust buffers - // - tt::ordered_map, InsInstructionUniqueIdHash> - instructions = generate_graph_buffering( - graph, - fj_graph, - op_models_post_placer, - op_models, - usable_l1_size, - previous_ins_instructions, - fork_join_tiles_treshold, - buffering_factor); - - // if instructions is not subset of previous instructions, then we have some new instructions (nop or queue) - - auto new_instr_tuple = is_subset_of_instructions(instructions, previous_ins_instructions); - if (!std::get<0>(new_instr_tuple)) - { - log_debug( - LogGraphCompiler, - "Added more buffering instructions. 
Additional Nops added: {}; Additional Queues added: {} ", - std::get<1>(new_instr_tuple), - std::get<2>(new_instr_tuple)); - } - - if (env_as("PYBUDA_DISABLE_FORK_JOIN_NOPS")) - instructions.clear(); - - FJBufferingResult res; - res.instructions = instructions; - - for (auto& fj: fj_graph.get_nop_buffered_fjs()) - { - res.nop_buffered_fjs.push_back(*fj); - } - - return res; -} - -void upsize_dram_input(graphlib::Graph *graph, balancer::OpModelMap &op_models, const std::uint32_t usable_l1_size) -{ - for (Node *node : graph->nodes()) - { - if (node->node_type() != graphlib::kBudaOp) - continue; - - auto edges = graph->operand_data_edges(node); - std::vector queue_ops; - for (std::size_t i = 0; i < edges.size(); i++) - { - /*Node *op = graph->node_by_id(edges[i].producer_node_id); - if (op->node_type() == graphlib::kQueue) - { - queue_ops.push_back(i); - } - else if (op->node_type() == graphlib::kInput) - { - if (!op->as()->is_prologue()) - queue_ops.push_back(i); - }*/ - queue_ops.push_back(i); - } - - if (queue_ops.size() > 0) - { - balancer::OpModel &op_model = op_models.at(node->name()); - std::uint32_t available_space = usable_l1_size - op_model.get_l1_memory_usage(); - log_trace( - LogGraphCompiler, - "Upsize dram for {}: usable: {}, usage: {}, available: {}", - node->name(), - usable_l1_size, - op_model.get_l1_memory_usage(), - available_space); - std::uint32_t to_add = 1.0 * available_space / (float)queue_ops.size(); - - for (std::size_t i : queue_ops) - { - // Calculate available size in tiles - std::uint32_t to_add_tiles = - 1.0 * to_add / (float)balancer::tile_size_bytes(op_model.input_buffers.at(i).data_format); - to_add_tiles -= to_add_tiles % (op_model.input_buffers.at(i).block_shape.volume_no_t()); - log_trace( - LogGraphCompiler, - " - for operand {}, to_add: {}, current: {}", - i, - to_add_tiles, - op_model.input_buffers.at(i).l1_size_tiles); - op_model.input_buffers.at(i).l1_size_tiles += to_add_tiles; - op_model.input_buffers.at(i).size_tiles_override = true; - } - } - } -} - -} // namespace tt diff --git a/pybuda/csrc/passes/fork_join.hpp b/pybuda/csrc/passes/fork_join.hpp deleted file mode 100644 index b95ddda31..000000000 --- a/pybuda/csrc/passes/fork_join.hpp +++ /dev/null @@ -1,294 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include - -#include "balancer/balancer.hpp" -#include "utils/ordered_associative_containers/ordered_map.hpp" -namespace tt -{ - -using NodeId = tt::graphlib::NodeId; -using PortId = tt::graphlib::PortId; -namespace graphlib -{ -class Graph; -class Node; -} // namespace graphlib - -// Instruct pre-placer to insert a NOP between src/dest nodes -// Further information on iteration attempt, etc. 
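// A small numeric sketch of the DRAM-input upsizing above; the values and struct are
// invented for illustration and this is not the actual upsize_dram_input. The op's spare L1
// is split evenly across its operands, converted from bytes to tiles, and rounded down to a
// multiple of the input block volume so only whole blocks are added.
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    const std::uint32_t usable_l1_size = 1024 * 1024;  // bytes
    const std::uint32_t l1_usage = 700 * 1024;          // bytes already used by the op
    struct Operand { std::uint32_t tile_size; std::uint32_t block_volume_no_t; };
    std::vector<Operand> operands = {{2048, 8}, {1024, 16}};  // hypothetical inputs

    std::uint32_t per_operand_bytes = (std::uint32_t)((usable_l1_size - l1_usage) / operands.size());
    for (const auto &op : operands)
    {
        std::uint32_t add_tiles = per_operand_bytes / op.tile_size;
        add_tiles -= add_tiles % op.block_volume_no_t;  // keep whole blocks only
        std::printf("add %u tiles to this input buffer\n", add_tiles);
    }
    return 0;
}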
can be added in the future to augment this -enum InsructionType -{ - NopInstruction, - QueueInstruction -}; - -using InsInstructionUniqueId = std::tuple; - -struct InsInstructionUniqueIdHash : public std::unary_function -{ - std::size_t operator()(const InsInstructionUniqueId &instr) const - { - std::size_t seed = 0; - tt::hash_combine(seed, static_cast(std::hash{}(std::get<0>(instr)))); - tt::hash_combine(seed, static_cast(std::hash{}(std::get<1>(instr)))); - tt::hash_combine(seed, static_cast(std::get<2>(instr))); - tt::hash_combine(seed, static_cast(std::get<3>(instr))); - tt::hash_combine(seed, static_cast(std::get<4>(instr))); - return seed; - } -}; - -using ForkJoinId = std::pair; -using ForkJoin = std::pair, std::vector>; - -// information on buffered fork-join -struct FJBufferingInfo -{ - Node* join; /* join node ptr */ - std::uint32_t req; /* required buffering */ - std::uint32_t avail; /* available buffering */ - const ForkJoin* fj; /* pointer to buffered fork-join */ - - FJBufferingInfo( - Node* join, - std::uint32_t req, - std::uint32_t avail, - const ForkJoin* fj) : - join(join), req(req), avail(avail), fj(fj) - { - } -}; - -struct ForkJoinIdHash : public std::unary_function -{ - std::size_t operator()(const ForkJoinId &fj_id) const - { - std::size_t seed = 0; - tt::hash_combine(seed, static_cast(std::hash{}(fj_id.first))); - tt::hash_combine(seed, static_cast(std::hash{}(fj_id.second))); - return seed; - } -}; - -struct InsertionInstruction -{ - /* - This is base class for insertion instructions. From this we inherit NopInsertionInstruction and - QueueInsertionInstruction - */ - public: - std::string src, dest; - bool hoist_tms; // whether to hoist tms to the input to the new nop - std::optional input_id; // input id into dest; if nullopt, use input_id from original edge - std::optional fork_id; // index of output from src; if nullopt, use fork_id from original edge - bool user_defined; // whether these requested NOPs were user-defined - InsructionType instr_type; - InsertionInstruction() = default; - InsertionInstruction( - const std::string &src, - const std::string &dest, - bool hoist_tms, - std::optional input_id = std::nullopt, - std::optional fork_id = std::nullopt, - bool user_defined = false) : - src(src), dest(dest), hoist_tms(hoist_tms), input_id(input_id), fork_id(fork_id), user_defined(user_defined) - { - } - - virtual ~InsertionInstruction() {} - - virtual InsInstructionUniqueId unique_id() const = 0; - - std::pair is_instruction_still_valid(graphlib::Graph *graph); - - virtual void insert(graphlib::Graph *graph) = 0; -}; - -struct PyInsertionInstruction : public InsertionInstruction -{ - public: - /* Inherit the constructors */ - using InsertionInstruction::InsertionInstruction; - - /* Trampoline (need one for each virtual function) */ - void insert(graphlib::Graph *graph) override - { - PYBIND11_OVERRIDE_PURE( - void, /* Return type */ - InsertionInstruction, /* Parent class */ - insert, /* Name of function in C++ (must match Python name) */ - graph /* Argument(s) */ - ); - } - - /* Trampoline (need one for each virtual function) */ - InsInstructionUniqueId unique_id() const override - { - PYBIND11_OVERRIDE_PURE( - InsInstructionUniqueId, /* Return type */ - InsertionInstruction, /* Parent class */ - unique_id, /* Name of function in C++ (must match Python name) */ - ); - } -}; - -struct NopInsertionInstruction : public InsertionInstruction -{ - public: - std::uint32_t nop_count; // number of nops to insert - bool mergeable; // whether to merge user-defined 
NOPs with the same src - bool daisy_chain; // change the behaviour for merging nops with src->multiple consumers - bool request_merge; // enable to invoke the API call to perform the daisy-chain/merge - - NopInsertionInstruction() : InsertionInstruction() {} - NopInsertionInstruction( - const std::string &src, - const std::string &dest, - bool hoist_tms, - std::uint32_t nop_count, - std::optional input_id = std::nullopt, - std::optional fork_id = std::nullopt, - bool user_defined = false, - bool mergeable = false, - bool daisy_chain = false, - bool request_merge = false) : - InsertionInstruction(src, dest, hoist_tms, input_id, fork_id, user_defined), - nop_count(nop_count), - mergeable(mergeable), - daisy_chain(daisy_chain), - request_merge(request_merge) - { - this->instr_type = InsructionType::NopInstruction; - } - - InsInstructionUniqueId unique_id() const override - { - return std::make_tuple( - this->src, this->dest, this->input_id.value_or(-1), this->fork_id.value_or(-1), this->mergeable); - } - - void insert(graphlib::Graph *graph) override; - void set_nop_count(int nop_count) { this->nop_count = nop_count; }; -}; - -struct QueueInsertionInstruction : public InsertionInstruction -{ - public: - int num_entries; - int queue_size; // in bytes - QueueInsertionInstruction( - const std::string &src, - const std::string &dest, - bool hoist_tms, - int num_entries, - std::uint32_t queue_size, - std::optional input_id = std::nullopt, - std::optional fork_id = std::nullopt, - bool user_defined = false) : - InsertionInstruction(src, dest, hoist_tms, input_id, fork_id, user_defined), - num_entries(num_entries), - queue_size(queue_size) - { - this->instr_type = InsructionType::QueueInstruction; - } - - InsInstructionUniqueId unique_id() const override - { - // last parameter in unique id is mergeable, and it is false for QueueInsertionInstruction, since we use it only - // in NopInsertionInstruction. We need uniform paterns for unique id so we fix mergeable to false for queues. - return std::make_tuple(this->src, this->dest, this->input_id.value_or(-1), this->fork_id.value_or(-1), false); - } - - void insert(graphlib::Graph *graph) override; - void set_num_entries(int num_entries) { this->num_entries = num_entries; }; -}; - -struct FJBufferingResult -{ - // Instructions generated for fork-join buffering. - tt::ordered_map, InsInstructionUniqueIdHash> instructions; - // All fork-joins which were buffered with nops. 
- std::vector nop_buffered_fjs; -}; - -// Insert buffers to match short/long forks -FJBufferingResult insert_fork_join_buffering( - graphlib::Graph *graph, - balancer::OpModelMap *op_models_post_placer, - balancer::OpModels *op_models, - const std::uint32_t usable_l1_size, - const tt::ordered_map, InsInstructionUniqueIdHash> - &previous_ins_instructions, - const int fork_join_tiles_treshold, - std::function buffering_factor = [](const tt::balancer::OpModel &) { return 1; }); - -void upsize_dram_input(graphlib::Graph *graph, balancer::OpModelMap &op_models, const std::uint32_t usable_l1_size); - -// Checking if two maps of instructions are equal -std::tuple is_subset_of_instructions( - const tt::ordered_map, InsInstructionUniqueIdHash> - &instructions, - const tt::ordered_map, InsInstructionUniqueIdHash> - &previous_instructions); - -class FJGraph -{ - std::vector - fj_ids; // fj_id is actually index of the in vector of fork-joins that is given to the constructor - std::vector fork_joins; - std::vector topo_sort_fjs; - std::vector topo_sort_fj_indices; - std::vector> adjacency_vector; // indices in this vector pertain to indices in vector of - // fork-joins that ptr_to_fjs points to. - - // buffered_fjs map contains information about fork-joins that are already buffered. Key to map is fork node id, and - // value is tuple of: join node id, required buffering, available buffering, and pointer to buffered fork-join - // itself. - std::unordered_map> - buffered_fjs; - std::unordered_map parent_fj_map; - std::vector nop_buffered_fjs; - - public: - - FJGraph(graphlib::Graph *graph); - - void add_edge(std::uint32_t src, std::uint32_t dest); - - void topological_sort(); - - void create_parents_map(); - - FJBufferingInfo find_sub_fork_join_from_node(const ForkJoin &fj, const std::vector &path, Node *fork); - - void update_buffered_fj_map(const ForkJoin& fj, FJBufferingInfo fj_buff_info); - - // getters - std::unordered_map &get_parent_fj_map() { return parent_fj_map; } - - const std::unordered_map> - &get_buffered_fjs() - { - return buffered_fjs; - } - - std::vector get_topo_sorted_fjs() { return topo_sort_fjs; } - std::vector &get_fjs() { return fork_joins; } - std::vector& get_nop_buffered_fjs() { return nop_buffered_fjs; } - - // setters - - // add buffered fork_join info to map of buffered fork-joins - void add_elem_to_buffered_fjs( - NodeId fork_id, FJBufferingInfo buff_fj_info); - // erase element with the key fork_id and index idx from the map - void erase_elem_from_buffered_fjs(NodeId fork_id, std::size_t idx); - - void add_nop_buffered_fj(const ForkJoin* fj) { nop_buffered_fjs.push_back(fj); } -}; - -} // namespace tt diff --git a/pybuda/csrc/passes/forked_dram_inputs.cpp b/pybuda/csrc/passes/forked_dram_inputs.cpp deleted file mode 100644 index ca024e2bb..000000000 --- a/pybuda/csrc/passes/forked_dram_inputs.cpp +++ /dev/null @@ -1,107 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "forked_dram_inputs.hpp" - -namespace std -{ -template <> -struct hash> -{ - std::size_t operator()(const std::pair &block_shape) const - { - std::size_t seed = 0; - tt::hash_combine(seed, static_cast(block_shape.first)); - tt::hash_combine(seed, hash{}(block_shape.second)); - return seed; - } -}; -} // namespace std - -namespace tt::passes -{ -std::unordered_map get_forked_dram_inputs( - bool enable_forked_dram_inputs, - Graph *graph, - unordered_map *name_to_op_placement, - balancer::OpModelMap *op_model) -{ - if (!enable_forked_dram_inputs) 
- return {}; - - std::unordered_map forked_dram_input_edges; - std::vector nodes = graphlib::topological_sort(*graph); - for (Node *node : nodes) - { - // Only applies to nodes that are inputs or queues - if (node->node_type() != graphlib::NodeType::kInput && node->node_type() != graphlib::NodeType::kQueue){ - continue; - } - // If it's an input, only apply to inputs where prologue=false - auto input = dynamic_cast(node); - if (input && input->is_prologue()) - continue; - - std::vector consumer_edges = graph->user_data_edges(node); - - std::unordered_map> per_epoch_edge_map; - std::unordered_map, std::vector> per_block_shape_edge_map; - - // Group consumer edges based on epoch_id - for (auto &edge : consumer_edges) - { - Node *consumer = graph->node_by_id(edge.consumer_node_id); - auto buda_op = consumer->as(); - // If the op is using Sparse MM or Tilize optimization, disallow forked_dram optimization - if (buda_op->is_tilize() || buda_op->is_sparse_matmul() || buda_op->is_splice()) - continue; - auto consumer_epoch_id = name_to_op_placement->at(consumer->as()->name()).epoch_id(); - per_epoch_edge_map[consumer_epoch_id].push_back(edge); - } - - // Group consumer edges based identical block_shapes per epoch - for (const auto &[epoch_id, edges] : per_epoch_edge_map) - { - for (auto edge : edges) - { - Node *consumer = graph->node_by_id(edge.consumer_node_id); - balancer::BlockShape consumer_block_shape = op_model->at(consumer->as()->name()) - .input_buffers[edge.consumer_input_port_id] - .block_shape; - // Check this once again - per_block_shape_edge_map[std::make_pair(epoch_id, consumer_block_shape)].push_back(edge); - } - } - // Find edges that can reuse DRAM read from other edge - for (auto &[epoch_id_block_shape, edges] : per_block_shape_edge_map) - { - if (edges.size() > 1) - { - for (uint idx = 1; idx < edges.size(); idx++) - { - uint32_t epoch_id = epoch_id_block_shape.first; - - auto is_reachable_epoch = [&epoch_id, &name_to_op_placement](graphlib::Node *n) - { - if (dynamic_cast(n) == nullptr) - { - return false; - } - auto node_epoch_id = name_to_op_placement->at(n->as()->name()).epoch_id(); - return (node_epoch_id == epoch_id); - }; - - // check if any data dependency exists between two nodes - if (check_producer_consumer(graph, graph->node_by_id(edges[idx].consumer_node_id), graph->node_by_id(edges[0].consumer_node_id), is_reachable_epoch) || - check_producer_consumer(graph, graph->node_by_id(edges[0].consumer_node_id), graph->node_by_id(edges[idx].consumer_node_id), is_reachable_epoch)) - { - continue; - } - forked_dram_input_edges.insert({edges[idx], edges[0]}); - } - } - } - } - return forked_dram_input_edges; -} -} // namespace tt::passes \ No newline at end of file diff --git a/pybuda/csrc/passes/forked_dram_inputs.hpp b/pybuda/csrc/passes/forked_dram_inputs.hpp deleted file mode 100644 index a2f107232..000000000 --- a/pybuda/csrc/passes/forked_dram_inputs.hpp +++ /dev/null @@ -1,20 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "balancer/types.hpp" -#include "balancer/balancer_utils.hpp" -#include "graph_lib/node_types.hpp" -#include "passes_utils.hpp" -#include "placer/placer.hpp" -#include "balancer/balancer.hpp" - -namespace tt::passes -{ -std::unordered_map get_forked_dram_inputs( - bool enable_forked_dram_inputs, - Graph* graph, - std::unordered_map *name_to_op_placement, - balancer::OpModelMap* op_model); -}; // namespace tt::passess \ No newline at end of file diff --git 
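// A minimal sketch of the grouping used by the forked-DRAM-input pass above, with simplified
// invented types (ConsumerEdgeSketch, block_shape_key stand in for graphlib edges and
// balancer::BlockShape); not the actual implementation. Consumer edges are grouped by
// (epoch, input block shape); within a group, every edge after the first can reuse the DRAM
// read of the first edge, and the real pass additionally rejects pairs with a data dependency.
#include <cstdint>
#include <map>
#include <utility>
#include <vector>

struct ConsumerEdgeSketch
{
    int edge_id;
    std::uint32_t epoch_id;
    std::uint32_t block_shape_key;
};

// Returns pairs of (reusing edge, edge whose DRAM read is reused).
std::vector<std::pair<int, int>> forked_dram_reuse_sketch(const std::vector<ConsumerEdgeSketch> &edges)
{
    std::map<std::pair<std::uint32_t, std::uint32_t>, std::vector<int>> groups;
    for (const auto &e : edges)
        groups[{e.epoch_id, e.block_shape_key}].push_back(e.edge_id);

    std::vector<std::pair<int, int>> reuse;
    for (const auto &[key, ids] : groups)
        for (std::size_t i = 1; i < ids.size(); ++i)
            reuse.push_back({ids[i], ids[0]});
    return reuse;
}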
a/pybuda/csrc/passes/fuse_ops.cpp b/pybuda/csrc/passes/fuse_ops.cpp deleted file mode 100644 index a25c6221e..000000000 --- a/pybuda/csrc/passes/fuse_ops.cpp +++ /dev/null @@ -1,1796 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "passes/fuse_ops.hpp" - -#include -#include -#include - -#include "balancer/balancer_utils.hpp" -#include "graph_lib/graph.hpp" -#include "graph_lib/node.hpp" -#include "graph_lib/node_types.hpp" -#include "graph_lib/query.hpp" -#include "graph_lib/shape.hpp" -#include "graph_lib/utils.hpp" -#include "utils/logger.hpp" - -namespace tt -{ - -using Graph = graphlib::Graph; -using Node = graphlib::Node; -using Edge = graphlib::Edge; -using EdgeType = graphlib::EdgeType; -using EdgeAttributes = graphlib::EdgeAttributes; -using PortId = graphlib::PortId; - -// Sub-topo order of ops in schedule whole result will be broadcast to another schedule -struct SubTopo -{ - bool broadcast_r, broadcast_c; - std::vector ops; - std::unordered_set outputs; -}; - -// Represents a group of ops to be fused together. Provides algorithms to generate, modify, and legalize the groups. -class FusionGroup -{ - private: - std::uint32_t id; - std::unordered_map nodes; - graphlib::NodeEpochType epoch_type; - bool has_matmul; - bool has_reduce; - bool has_broadcast_c; - std::uint32_t reduce_dim = 0; - - std::vector topo_order; - - // Remove the cases where one part of the fork is inside the fused op, and the other is not, "wrapping" around other - // ops - void remove_fork_wraps(Graph *graph, std::unordered_map &fused_nodes); - - // Look through the output cone of the node, and see if it converges on something in the fused op. - BudaOpNode *converge_back_on_fused_op(Graph *graph, Node *node, std::unordered_set &visisted) const; - - // Pick first output in topo sort, and remove all others - void pick_first_output( - Graph *graph, std::unordered_map &fused_nodes, const std::vector &topo); - - // Remove op, and everything below it - void remove_op_and_below( - Graph *graph, BudaOpNode *op, std::unordered_map &fused_nodes); - - // Create schedules needed to execute the op - void create_schedules( - Graph *graph, - BudaOpNode *output_op, - std::vector &schedules, - std::vector &input_edges, - InputMapping &input_mapping, - bool reuse_dest_on_srcA_only); - - // Get input cone of ops for the give node, and allowed ops. - // If stop_on_base_op is set, we won't include ops which are basis for the schedules (matmul/reduce ops). - std::unordered_set get_input_cone( - const Graph *graph, - BudaOpNode *node, - const std::unordered_set &allowed_ops, - bool stop_on_base_op = false) const; - - // Reuse dest if possible. Returns true if dest is reused. 
- bool reuse_dest_if_possible( - BudaOpNode *op, - std::vector &inputs, - std::uint32_t prev_output_allocated_buffer, - bool reuse_dest_on_srcA_only, - FusedSchedule &sch); - - public: - FusionGroup() : id(next_fuse_id++), has_matmul(false), has_reduce(false) {} - - FusionGroupP clone(); - - void add_op(BudaOpNode *op) - { - if (empty()) - epoch_type = op->get_epoch_type(); - else - TT_ASSERT(op->get_epoch_type() == epoch_type); - nodes.insert(std::make_pair(op->name(), op)); - } - bool has_op(Node *op) const { return nodes.count(op->name()) > 0; } - bool has_op(const std::string name) const { return nodes.count(name) > 0; } - bool empty() const { return nodes.empty(); } - bool single_op() const { return nodes.size() == 1; } - std::size_t count() const { return nodes.size(); } - std::uint32_t get_id() const { return id; } - bool has_matmul_op() const { return has_matmul; } - bool has_broadcast_c_tm() const { return has_broadcast_c; } - bool has_reduce_op() const { return has_reduce; } - std::uint32_t get_reduce_dim() const { return reduce_dim; } - - void set_reduce_op(uint32_t reduce_dim) - { - TT_ASSERT(has_reduce == false); - this->has_reduce = true; - this->reduce_dim = reduce_dim; - } - - const std::vector &get_topo_order() const - { - TT_ASSERT(topo_order.size() > 0, "Call legalize() to legalize op and generator topo order"); - return topo_order; - } - - void remove_op(BudaOpNode *op, std::unordered_map &fused_nodes) - { - TT_ASSERT(has_op(op)); - nodes.erase(op->name()); - fused_nodes.erase(op->id()); - } - - // Generate topological order of fused ops, using graph topological order as reference - void generate_topo_order(const std::vector &graph_topo); - - void clear(std::unordered_map &fused_nodes) - { - for (auto node : nodes) - { - fused_nodes[node.second->id()] = nullptr; - } - nodes.clear(); - } - - graphlib::NodeEpochType get_epoch_type() const - { - TT_ASSERT(!empty()); - return epoch_type; - } - - std::vector get_nodes_as_vector() const - { - std::vector vec; - vec.reserve(nodes.size()); - for (auto const& [name, node] : nodes) vec.push_back(node); - return vec; - } - - int get_dram_input_count(graphlib::Graph const *graph, std::vector additional_nodes = {}) const - { - return get_input_count( - graph, - additional_nodes, - [](auto const *node) { return dynamic_cast(node) != nullptr; }); - } - - int get_input_count( - graphlib::Graph const *graph, - std::vector additional_nodes = {}, - std::function filter = [](auto const *) { return true; }) const - { - int input_count = 0; - std::vector set = get_nodes_as_vector(); - set.insert(set.end(), additional_nodes.begin(), additional_nodes.end()); - - for (auto const* node : set) - { - for (auto const* operand : graph->data_operands(node)) - { - if (not filter(operand)) - continue; - bool in_fused_set = std::find(set.begin(), set.end(), operand) != set.end(); - input_count += int(not in_fused_set); - } - } - return input_count; - } - - int get_output_count(graphlib::Graph const *graph, std::vector additional_nodes = {}) const - { - int output_count = 0; - std::vector set = get_nodes_as_vector(); - set.insert(set.end(), additional_nodes.begin(), additional_nodes.end()); - - for (auto const* node : set) - { - for (auto const* user : graph->data_users(node)) - { - bool in_fused_set = std::find(set.begin(), set.end(), user) != set.end(); - output_count += int(not in_fused_set); - } - } - return output_count; - } - - int get_connection_count( - graphlib::Graph const *graph, std::vector additional_nodes = {}) const - { - return 
get_input_count(graph, additional_nodes) + get_output_count(graph, additional_nodes); - } - - bool legalize( - Graph *graph, std::unordered_map &fused_nodes, const std::vector &topo) - { - remove_fork_wraps(graph, fused_nodes); - pick_first_output(graph, fused_nodes, topo); - if (count() >= 2) - { - return true; - } - clear(fused_nodes); - return false; // nothing left - } - - // Remove fused ops from the graph, replace with a new fused op. Return pointer to new op. - void fuse(Graph *graph, FusionGroupP self, bool reuse_dest_on_srcA_only); - - void print() const - { - log_trace(LogFuser, "Fused op id={}", id); - for (auto &[name, op] : nodes) - { - log_trace(LogFuser, " {}: {}", name, op->op_type().op); - } - } - - static uint32_t next_fuse_id; -}; - -uint32_t FusionGroup::next_fuse_id = 0; - -// TODO: There is already is_matmul and is_depthwise in BudaOpNode class... Remove one implementation... -// TODO: Add const strings instead of comparing with specifically typed string every time... -bool is_matmul(BudaOpNode *op) { return op->is_matmul(); } -bool is_reduce_max(BudaOpNode *op) { return (op->op_type().op == "reduce"); } -bool is_splice(BudaOpNode *op) { return (op->op_type().op == "splice"); } -bool is_buffer(BudaOpNode *op) { return (op->op_type().op == "buffer"); } - -bool is_tile_broadcast(BudaOpNode *op) -{ - return op->as()->has_tag("tile_broadcast_r") || - op->as()->has_tag("tile_broadcast_c"); -} - -// Return false if not a reduce, and reduce_dim set if it is -bool find_reduce_dim(BudaOpNode *op, std::uint32_t &reduce_dim) -{ - if (is_reduce_max(op)) - { - reduce_dim = std::get(op->op_type().attr[0]); - return true; - } - - if (!is_matmul(op)) - return false; - - if (op->as()->has_tag("reduce_r")) - { - reduce_dim = 2; - return true; - } - - if (op->as()->has_tag("reduce_c")) - { - reduce_dim = 3; - return true; - } - - return false; -} - -bool is_reduce(BudaOpNode *op) -{ - // Checking already allowed ops to determine if it's a reduce, i.e. we need to break the schedule here - if (!is_matmul(op) && !is_reduce_max(op)) - return false; - - std::uint32_t reduce_dim; - return find_reduce_dim(op, reduce_dim); -} - -// Return dim/amount pair indicating how src shape should be broadcast to dst. Only one dim can be broadcast. -std::pair get_broadcast(const Graph *graph, BudaOpNode *src, BudaOpNode *dst) -{ - const auto &tms = graph->get_edge_attributes(graph->get_edges(src, dst)[0])->get_tms(); - - std::pair brcst = {0, 0}; - - for (auto tm : tms) - { - if (tm.op == "tile_broadcast") - continue; - - if (tm.op != "broadcast") - TT_ASSERT("Unsupported TM inside fused op! {} between {} and {}", tm.op, src->name(), dst->name()); - - TT_ASSERT(brcst.second == 0, "More than one broadcast between {} and {} in fused op", src->name(), dst->name()); - - int dim = std::get(tm.attr[0]); - int factor = std::get(tm.attr[1]); - - if (factor == 1) - factor = 0; // broadcast to 1 is not really a broadcast - - brcst = {dim, factor}; - - TT_ASSERT( - dim == 2 || dim == 3, - "Invalid broadcast dim inside fused op: {}, between {} and {}", - dim, - src->name(), - dst->name()); - } - - return brcst; -} - -// Return dim/amount pair indicating how src shape should be broadcast to dst. Only one dim can be broadcast. 
-std::pair get_tile_broadcast(const Graph *graph, Node *src, BudaOpNode *dst) -{ - const auto &tms = graph->get_edge_attributes(graph->get_edges(src, dst)[0])->get_tms(); - - for (auto tm : tms) - { - if (tm.op != "tile_broadcast") - continue; - - int dim = std::get(tm.attr[0]); - - TT_ASSERT( - dim == 2 || dim == 3, - "Invalid tile broadcast dim inside fused op: {}, between {} and {}", - dim, - src->name(), - dst->name()); - - if (dim == 2) - return {true, false}; - - return {false, true}; - } - - return {false, false}; -} - -bool is_allowed_matmul(Graph *graph, BudaOpNode *op) -{ - // Allowed matmuls are matmuls where one of the operands is a single tile.. these are reduce and tile broadcast ops - if (!is_matmul(op)) - return false; - - // Fusing depthwise matmuls hasn't been tested - if (op->is_depthwise_matmul()) - return false; - - bool allow_matmul = not env_as("PYBUDA_NO_FUSE_MATMUL"); - - if (!allow_matmul) - return false; - - auto operands = graph->data_operands(op); - if (operands.size() > 2) - return false; // fused matmul with bias - - bool allow_reduce = env_as("PYBUDA_FUSE_REDUCE"); - bool allow_broadcast = not env_as("PYBUDA_FUSE_NO_TILE_BROADCAST"); - - if (op->as()->has_tag("reduce_r") || op->as()->has_tag("reduce_c")) - return allow_reduce; - - if (is_tile_broadcast(op)) - return allow_broadcast; - - return false; -} - -std::unordered_set FusionGroup::get_input_cone( - const Graph *graph, - BudaOpNode *node, - const std::unordered_set &allowed_ops, - bool stop_on_base_op) const -{ - std::unordered_set input_cone; - input_cone.insert(node); - for (Node *operand : graph->data_operands(node)) - { - if (operand->node_type() == graphlib::kBudaOp) - { - if (allowed_ops.count(operand->as()) == 0) - continue; - - if (stop_on_base_op) - { - std::uint32_t new_reduce_dim; - bool is_reduce_op = find_reduce_dim(operand->as(), new_reduce_dim); - - // Don't go further in case of encountering the base op (matmul/reduce). - if (is_reduce_op || is_matmul(operand->as())) - continue; - } - - auto sub_cone = get_input_cone(graph, operand->as(), allowed_ops, stop_on_base_op); - input_cone.insert(sub_cone.begin(), sub_cone.end()); - } - } - return input_cone; -} - -struct Buffer -{ - std::uint32_t id; - std::uint32_t allocated_schedule_index; - std::vector outstanding_users; -}; - -class BufferAllocator -{ - std::uint32_t count; - std::unordered_map> buffers; - - // Once allocated for a particular data type, that buffer must always have the same one - std::unordered_map data_formats; - - public: - BufferAllocator(std::uint32_t count) : count(count) {} - std::shared_ptr allocate( - std::vector users, - std::uint32_t schedule, - bool local, - const std::unordered_set &blacklisted_buffers, - DataFormat df) - { - for (std::uint32_t i = 0; i < count; i++) - { - if (buffers.count(i) > 0) - continue; // already allocated - - if (blacklisted_buffers.count(i) > 0) - continue; // not allowed - - auto it = data_formats.find(i); - if ((it == data_formats.end()) || (it->second == df)) - { - std::vector local_users(users.size(), 0); - local_users[schedule] = users[schedule]; - - buffers[i] = std::make_shared(Buffer{i, schedule, local ? 
local_users : users}); - data_formats[i] = df; - return buffers[i]; - } - } - TT_THROW("Ran out of intermediate buffers."); - return 0; // avoid warning - } - - // return true if this buffer is now done with, and another flag to indicate if this was a cross-schedule deallocate - std::pair deallocate(std::uint32_t id, std::uint32_t schedule) - { - auto it = buffers.find(id); - TT_LOG_ASSERT(it != buffers.end(), "Deallocating {}, which has already been deallocated.", id); - - auto &outstanding_users = it->second->outstanding_users; - TT_ASSERT(outstanding_users[schedule] > 0); - outstanding_users[schedule]--; - - if (outstanding_users[schedule] == 0) - { - // Check if there are later uses - for (std::size_t i = schedule + 1; i < outstanding_users.size(); i++) - if (outstanding_users[i] > 0) - return std::make_pair(false, false); - - auto allocated_schedule_index = it->second->allocated_schedule_index; - buffers.erase(it); - - return std::make_pair(true, allocated_schedule_index != schedule); - } - - return std::make_pair(false, false); - } -}; - -// Reuse dest if possible. Returns true if dest is reused. -bool FusionGroup::reuse_dest_if_possible( - BudaOpNode *op, - std::vector &inputs, - std::uint32_t prev_output_allocated_buffer, - bool reuse_dest_on_srcA_only, - FusedSchedule &sch) -{ - // Reusing dest not allowed for matmul. - if (is_matmul(op)) - { - return false; - } - - std::optional reused_input_index = std::nullopt; - for (std::size_t index = 0; index < inputs.size(); index++) - { - auto i = inputs[index]; - if ((i.type == FusedSubOpInput::INTERMED) && (i.index == prev_output_allocated_buffer)) - { - // Dest can be reused only on 1 input and can't be reused if any input has broadcast. - if ((reused_input_index.has_value()) || (i.broadcast.second != 0)) - return false; - - if (i.has_tile_broadcast() && index > 0) - return false; - - reused_input_index = index; - } - } - - // Dest value not used on any of inputs. - if (!reused_input_index.has_value()) - return false; - - // Check if dest resuse is only allowed on input 0. - if (reuse_dest_on_srcA_only && (reused_input_index.value() != 0)) - return false; - - log_debug(LogFuser, "Reusing dest from previous fused op for {}", op->name()); - - // Modify the previous op. - TT_ASSERT(sch.ops.back().output_type == FusedSubOp::OutputType::INTERMED); - sch.ops.back().output_type = FusedSubOp::OutputType::DEST; - - // Modify reused input. - inputs[reused_input_index.value()].type = FusedSubOpInput::InputType::DEST; - - return true; -} - -// Create schedules needed to execute the op -void FusionGroup::create_schedules( - Graph *graph, - BudaOpNode *output_op, - std::vector &schedules, - std::vector &input_edges, - InputMapping &input_mapping, - bool reuse_dest_on_srcA_only) -{ - // Each matmul/reduce operation needs to have its own schedule and be the last operation of that schedule. - // Schedule algorithm: - // - Find all matmul/reduce ops which will form the basis for the schedules. - // - Track all dependencies between the base ops (matmul/reduce). - // - Order the schedules so that the dependencies between the base ops are satisfied. - // - Fill in the schedules with rest of the ops (each time starting from base op). - - // Set of nodes in the op, for input cone searching - std::unordered_set node_set; - for (auto &[name, node] : nodes) node_set.insert(node); - - // We need to recalculate has_reduce, since we could've pruned the reduce ops in legalize(). 
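As an aside on the BufferAllocator above: the core idea is a fixed pool of intermediate slots, each pinned to the first DataFormat it carries and freed once its last reader has consumed it. A minimal standalone sketch of that idea follows, assuming tt::DataFormat plus <vector>, <optional> and <stdexcept>; the name SimpleBufferPool is hypothetical, and the real allocator additionally tracks per-schedule user counts, cross-schedule deallocation and blacklisted slots.

struct SimpleBufferPool
{
    struct Slot
    {
        bool in_use = false;
        int readers = 0;
        std::optional<DataFormat> df;  // pinned to the first format the slot ever held
    };
    std::vector<Slot> slots;

    explicit SimpleBufferPool(std::size_t count) : slots(count) {}

    // Return the first free slot whose pinned format (if any) matches df.
    int allocate(int readers, DataFormat df)
    {
        for (std::size_t i = 0; i < slots.size(); i++)
        {
            Slot &s = slots[i];
            if (s.in_use || (s.df.has_value() && *s.df != df))
                continue;
            s.in_use = true;
            s.readers = readers;
            s.df = df;
            return static_cast<int>(i);
        }
        throw std::runtime_error("Ran out of intermediate buffers.");
    }

    // Called once per consumer read; returns true when the slot is freed.
    bool release(int id)
    {
        Slot &s = slots[id];
        if (--s.readers == 0)
        {
            s.in_use = false;
            return true;
        }
        return false;
    }
};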
- this->has_reduce = false; - this->reduce_dim = 0; - - std::unordered_map> schedule_dependencies; - - for (auto &[name, node] : nodes) - { - std::uint32_t reduce_dim; - bool is_reduce_op = find_reduce_dim(node, reduce_dim); - bool is_matmul_op = is_matmul(node); - - // We need to make schedules only for reduce/matmul. - if (!is_reduce_op && !is_matmul_op) - continue; - - this->has_matmul |= is_matmul_op; - if (is_reduce_op) - { - this->has_reduce = true; - this->reduce_dim = reduce_dim; - } - - auto input_cone = get_input_cone(graph, node, node_set); - schedule_dependencies.insert(std::make_pair(node, std::unordered_set{})); - for (BudaOpNode* c: input_cone) - { - if (c == node) - continue; - - if (is_reduce(c) || is_matmul(c)) - schedule_dependencies[node].insert(c); - } - } - - std::vector schedule_output_nodes; // list of outputs for each schedule - auto scheduled = [&schedule_output_nodes](BudaOpNode *op) -> bool - { - return std::find(schedule_output_nodes.begin(), schedule_output_nodes.end(), op) != - schedule_output_nodes.end(); - }; - - // Now that we have schedule dependencies, figure out the required order for the schedules. - while (schedule_dependencies.size() > schedule_output_nodes.size()) - { - bool progress = false; - - for (auto& [node, dependecies] : schedule_dependencies) - { - if (scheduled(node)) - // We have already scheduled this one - continue... - continue; - - bool ok = true; - for (auto d : dependecies) - { - if (!scheduled(d)) - { - // Dependency not satisfied so we cannot schedule this one. - ok = false; - break; - } - } - - if (!ok) - continue; - - progress = true; - schedule_output_nodes.push_back(node); - break; - } - - TT_LOG_ASSERT(progress, "Deadlock trying to find reduce without dependencies"); - } - - if (schedule_output_nodes.size() == 0 || schedule_output_nodes.back() != output_op) - { - // Output of the fused op still doesn't have its schedule, so we need to schedule it. - schedule_output_nodes.push_back(output_op); - } - - // Make sure we have enough schedules to cover all matmul/reduce ops. 
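The ordering loop above is a plain dependency-respecting selection: repeatedly emit any base (matmul/reduce) op whose prerequisites are already scheduled, and assert if a full pass makes no progress. A toy restatement of that loop on string names, assuming <unordered_map>, <unordered_set>, <algorithm> and <cassert>:

std::unordered_map<std::string, std::unordered_set<std::string>> deps = {
    {"A", {}}, {"B", {"A"}}, {"C", {"A", "B"}}};
std::vector<std::string> order;

auto scheduled = [&](const std::string &op)
{ return std::find(order.begin(), order.end(), op) != order.end(); };

while (order.size() < deps.size())
{
    bool progress = false;
    for (const auto &[op, op_deps] : deps)
    {
        if (scheduled(op))
            continue;
        if (std::all_of(op_deps.begin(), op_deps.end(), scheduled))
        {
            order.push_back(op);
            progress = true;
            break;
        }
    }
    assert(progress && "cyclic dependency between base ops");
}
// order == {"A", "B", "C"}; a dependency cycle would trip the assert, matching the
// "Deadlock trying to find reduce without dependencies" check above.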
- TT_ASSERT(schedule_output_nodes.size() >= schedule_dependencies.size()); - - // Generate topo sort for each schedule - std::vector> all_topos; - for (BudaOpNode *schedule_output_node : schedule_output_nodes) - { - // Get ops to schedule, while stopping on base ops (matmul/reduce), which must've already been scheduled - auto input_cone = get_input_cone(graph, schedule_output_node, node_set, true /* stop_on_base_op */); - - // Topo sort - std::vector input_cone_topo; - for (BudaOpNode *node : get_topo_order()) - if (input_cone.count(node) > 0) - input_cone_topo.push_back(node); - - all_topos.push_back(input_cone_topo); - } - - // To efficiently allocate intermed buffers, we need to find the number of readers for each op - std::unordered_map> readers; - for (auto node : node_set) - { - readers[node] = std::vector(all_topos.size(), 0); - } - - for (std::size_t schedule_index = 0; schedule_index < all_topos.size(); schedule_index++) - { - auto input_cone_topo = all_topos[schedule_index]; - for (BudaOpNode *op : input_cone_topo) - { - auto operands = graph->data_operands(op); - for (Node *operand : operands) - { - if ((operand->node_type() == graphlib::kBudaOp) && node_set.count(operand->as()) > 0) - { - TT_LOG_ASSERT(readers.count(operand->as()) > 0, operand->name()); - readers[operand->as()][schedule_index] += 1; - } - } - } - } - - BufferAllocator buf_allocator(8); - std::unordered_map> buffers; // evaluated ops and their output buffer number - std::uint32_t input_id = 0; - has_broadcast_c = false; - - for (std::size_t schedule_index = 0; schedule_index < all_topos.size(); schedule_index++) - { - auto input_cone_topo = all_topos[schedule_index]; - - // Create ops in schedule - FusedSchedule sch; - std::unordered_set blacklisted_buffers; // buffers that can't be used any more in this schedule - std::optional prev_output_allocated_buffer = std::nullopt; - for (BudaOpNode *op : input_cone_topo) - { - std::vector inputs; - std::vector popped_buffers; - std::vector popped_last_buffers; - auto operands = graph->data_operands(op); - auto operand_edges = graph->operand_data_edges(op); - - // Op output DF is not set yet at this point... And, as of now, back-end doesn't really support - // changing intermediate data formats, so we're going to hard-code this to a constant. 
- // DataFormat df = op->output_df(); - DataFormat df = DataFormat::Float16_b; - std::shared_ptr output_buffer; - - auto allocate_buffer = [&buffers, - &output_buffer, - &readers, - &buf_allocator, - schedule_index, - df, - &blacklisted_buffers](auto op) - { - TT_LOG_ASSERT(readers.count(op) > 0, op->name()); - bool local = !is_reduce(op) && !is_matmul(op); - output_buffer = buf_allocator.allocate(readers[op], schedule_index, local, blacklisted_buffers, df); - buffers[op] = output_buffer; - }; - - if (is_reduce(op) && (op != output_op)) - { - // Allocate buffers before deallocating inputs if this is reduce, since we need to accumulate into a new - // buffer - allocate_buffer(op); - } - - for (std::uint32_t i = 0; i < operands.size(); i++) - { - Node *operand = operands.at(i); - if ((operand->node_type() != graphlib::kBudaOp) || !has_op(operand->as())) - { - std::uint32_t input_index = input_id; - input_mapping[op].insert(std::make_pair(i, input_id++)); - input_edges.push_back(operand_edges.at(i)); - inputs.push_back(FusedSubOpInput{ - FusedSubOpInput::InputType::INPUT, - input_index, - {0, 0}, - get_tile_broadcast(graph, operand, op)}); - } - else - { - auto it = buffers.find(operand->as()); - TT_LOG_ASSERT(it != buffers.end(), "Can't find source buffer for {}", operand->name()); - std::pair broadcast = - get_broadcast(graph, operand->as(), op); - - // Check if broadcast C exists and mark it in fused group. - if (broadcast.first == 3) - { - has_broadcast_c = true; - } - - inputs.push_back(FusedSubOpInput{ - FusedSubOpInput::InputType::INTERMED, - it->second->id, - broadcast, - get_tile_broadcast(graph, operand, op)}); - auto [pop, pop_last] = buf_allocator.deallocate(it->second->id, schedule_index); - if (pop_last) - { - popped_last_buffers.push_back(it->second->id); - blacklisted_buffers.insert(it->second->id); // don't use this buffer in this schedule - } - else if (pop) - popped_buffers.push_back(it->second->id); - } - } - - if (op == output_op) - { - output_buffer = nullptr; - } - else if (!is_reduce(op)) - { - allocate_buffer(op); - } - - std::unordered_map attrs; - // TODO: this needs to be set by legalizer, and not here... - if (op->op_type().op == "matmul") - { - attrs["u_kt"] = operands[0]->shape().ct(); // not this simple.. TODO - auto op0_tms = graph->get_edge_attributes(graph->get_edges(operands[0], op)[0])->get_tms(); - for (auto tm : op0_tms) - if ((tm.op == "broadcast") && (std::get(tm.attr[0]) == 3)) - attrs["u_kt"] *= std::get(tm.attr[1]); - - if (is_reduce(op)) - { - attrs["m_k"] = 1; // this is really a placeholder that we can override after ublocks are asigned - } - else - { - // tile broadcast - attrs["m_k"] = 1; - } - } - - // See if it we can reuse previous dest - if (prev_output_allocated_buffer.has_value()) - { - auto it = std::find(popped_buffers.begin(), popped_buffers.end(), prev_output_allocated_buffer.value()); - if (it != popped_buffers.end()) - { - if (reuse_dest_if_possible( - op, inputs, prev_output_allocated_buffer.value(), reuse_dest_on_srcA_only, sch)) - { - // If dest is reused cleanup buffer tracking. - popped_buffers.erase(it); - } - } - } - - std::uint32_t output_buffer_id = (output_buffer != nullptr) ? output_buffer->id : 0; - - // If the operator has relu activation, don't reuse it (for better performance). 
- bool dont_reuse = op->buda_attrs().find("relu_en") != op->buda_attrs().end(); - - if (output_buffer == nullptr || dont_reuse) - prev_output_allocated_buffer.reset(); - else - prev_output_allocated_buffer = output_buffer->id; // save for dest reuse - - sch.ops.push_back(FusedSubOp{ - op->name(), - op->op_type(), - tt::balancer::get_op_shape(graph, op), - inputs, - (output_buffer == nullptr) ? FusedSubOp::OutputType::OUTPUT : FusedSubOp::OutputType::INTERMED, - output_buffer_id, - df, - attrs, - popped_buffers, - popped_last_buffers}); - } - schedules.push_back(sch); - } -} - -// Only R/C broadcasts allowed, and only one of them -bool are_allowed_tms( - const std::vector &tms, - bool disable_broadcast = false, - bool is_matmul = false, - PortId input_id = 0) -{ - bool broadcast_seen = false; - - for (auto tm : tms) - { - if ((tm.op != "broadcast") && (tm.op != "tile_broadcast")) - return false; - - if (tm.op == "broadcast" and disable_broadcast) - return false; - - if (tm.op == "broadcast") - { - if (broadcast_seen) - return false; // can't have more than one - broadcast_seen = true; - - // Only broadcast C on input 0 is allowed for matmul. - int dim = std::get(tm.attr[0]); - if (is_matmul && dim != 3 && input_id != 0) - return false; - } - - int dim = std::get(tm.attr[0]); - if ((dim != 2) && (dim != 3)) - return false; - } - return true; -} - -// Checks if tile broadcast can be removed by going through all current BE limitations. -// In some cases updates graph structure by swapping data operands to overcome BE limitations. -bool is_tile_broadcast_replaceable( - graphlib::Graph *graph, - std::unordered_map &fused_nodes, - graphlib::Edge &edge, - Node *src_node, - std::vector src_edges) -{ - Node *node = graph->node_by_id(edge.consumer_node_id); - if (node->node_type() != graphlib::kBudaOp) - { - return false; - } - - // Tile broadcast merge is enabled only for ops that are fused. - BudaOpNode *user_node = node->as(); - auto fused_op = fused_nodes.find(user_node->id()); - if (fused_op == fused_nodes.end() || fused_op->second == nullptr) - { - return false; - } - - // Tile broadcast merge is enabled only for unary and binary ops. - if (graph->data_operands(user_node).size() > 2) - { - return false; - } - - // Tile broadcast is not supported for any op lowered to matmul. - if (is_matmul(user_node)) - { - return false; - } - - // If tile broadcast is enabled only for port 1 and it is on port 0, swap opearands if possible. - if (edge.consumer_input_port_id == 0) - { - if (!graphlib::can_swap_operands(graph, user_node)) - { - return false; - } - - graphlib::swap_operands(graph, user_node); - } - - // Edge can handle only one tile broadcast operation. - // If this is second don't allow merge. - std::shared_ptr attr = graph->get_edge_attributes(edge); - for (auto tm : attr->get_tms()) - { - if ("tile_broadcast" == tm.op) - { - return false; - } - } - - // If src node is also fused we need to check if edge can handle all TMs from source and output egde. - fused_op = fused_nodes.find(src_node->id()); - if (fused_op != fused_nodes.end() && fused_op->second != nullptr) - { - // Add source edge TMs to already initialized user edge attributes. 
- for (auto edge : src_edges) attr->append_tms(graph->get_edge_attributes(edge)->get_tms()); - - if (!are_allowed_tms(attr->get_tms())) - return false; - } - - return true; -} - -// Replace tile broadcasts with edge attributes, and remove constant inputs -void replace_tile_broadcasts( - Graph *graph, - Node *node, - std::unordered_map &fused_nodes, - std::unordered_set &to_delete_nodes) -{ - if (!(node->node_type() == graphlib::kBudaOp)) - { - return; - } - - std::uint32_t broadcast_dim; - std::uint32_t src_operand, brcst_operand; - if (node->as()->has_tag("tile_broadcast_r")) - { - broadcast_dim = 2; - src_operand = 1; - brcst_operand = 0; - } - else if (node->as()->has_tag("tile_broadcast_c")) - { - broadcast_dim = 3; - src_operand = 0; - brcst_operand = 1; - } - else - return; // not a tile broadcast - - BudaOpNode *op = node->as(); - Node *src_node = graph->data_operands(op)[src_operand]; - Node *brcst_node = graph->data_operands(op)[brcst_operand]; - auto user_edges = graph->user_edges(op); - bool all_ok = true; - for (Edge user_edge : user_edges) - { - std::vector src_edges = graph->get_edges(src_node, op); - - if (!is_tile_broadcast_replaceable(graph, fused_nodes, user_edge, src_node, src_edges)) - { - all_ok = false; - break; - } - - // Tile broadcast is possible only on operand b and it could happen that we swapped inputs and that user_edge - // value is not valid anymore instead of refreshing it using hardcoded port 1. - // Merge tile broadcast to input TM. - Edge new_edge = Edge(src_node->id(), 0, user_edge.consumer_node_id, 1, user_edge.edge_type); - - // Copy TMs from tile_brodacst output edge to new edge. - graph->add_edge(new_edge); - graph->copy_edge_attributes(user_edge, new_edge); - graph->remove_edge(user_edge); - - // Add tile_broadcast TM to edge TMs. - graph->get_edge_attributes(new_edge)->prepend_tm(graphlib::OpType("tile_broadcast", {(int)broadcast_dim})); - - // Copy TMs from tile_brodacst operand edge to new edge. - for (Edge src_edge : src_edges) - { - for (auto tm : graph->get_edge_attributes(src_edge)->get_tms()) - graph->get_edge_attributes(new_edge)->prepend_tm(tm); - } - } - - if (all_ok) - { - // Clean up leftover operand data edges. - // TODO: shouldn't this be removed on graph level when node is removed? - for (auto op_edge : graph->operand_data_edges(op)) - { - graph->remove_edge(op_edge); - } - - // If op is marked for fusing remove it from fusing structures first. - if ((fused_nodes.count(op->id()) > 0) && (fused_nodes[op->id()] != nullptr)) - { - fused_nodes[op->id()]->remove_op(op, fused_nodes); - } - - // Mark node for deletion - to_delete_nodes.insert(op); - - bool remove_brcst = true; - for (auto user : graph->users(brcst_node)) - if (to_delete_nodes.count(user) == 0) - remove_brcst = false; - - // If there are no more users remove broadcast input node too. - if (remove_brcst) - { - to_delete_nodes.insert(brcst_node); - } - } -} - -FusionGroupP FusionGroup::clone() -{ - FusionGroupP fusion_group_clone = std::make_shared(*this); - - // Assign new unique id to the clone. - fusion_group_clone->id = FusionGroup::next_fuse_id++; - return fusion_group_clone; -} - -// Remove fused ops from the graph, replace with a new fused op. 
-void FusionGroup::fuse(Graph *graph, FusionGroupP self, bool reuse_dest_on_srcA_only) -{ - std::vector input_edges; - InputMapping input_mapping; - BudaOpNode *output_op = nullptr; - std::vector output_edges; - std::unordered_map input_ids; - std::unordered_map input_reuse; - bool is_out = false; - for (BudaOpNode *op : get_topo_order()) - { - std::vector user_edges = graph->user_data_edges(op); - for (Edge edge : user_edges) - { - if ((edge.edge_type == graphlib::EdgeType::kDataLoopback) || - ((edge.edge_type == graphlib::EdgeType::kData) && !has_op(graph->node_by_id(edge.consumer_node_id)))) - { - is_out = true; - output_edges = user_edges; - } - - if (is_out) - { - TT_ASSERT(output_op == nullptr, "Can't have more than one output"); - output_op = op; - break; - } - } - } - - // TT_ASSERT(input_edges.size() <= 8, "Too many inputs into fused op - 8 is max"); - - TT_ASSERT(output_op != nullptr); - std::vector schedules; - create_schedules(graph, output_op, schedules, input_edges, input_mapping, reuse_dest_on_srcA_only); - - std::unordered_set ops_to_remove; - for (BudaOpNode *op : get_topo_order()) ops_to_remove.insert(op->id()); - - // Record non-data edges going into and out of ops that are going to be removed - std::unordered_set incoming_non_data_edges, outgoing_non_data_edges; - for (BudaOpNode *op : get_topo_order()) - { - auto edges = graph->operand_edges(op); - for (Edge edge : edges) - { - if ((edge.edge_type != graphlib::EdgeType::kData) && (ops_to_remove.count(edge.producer_node_id) == 0)) - incoming_non_data_edges.insert(edge); - } - - edges = graph->user_edges(op); - for (Edge edge : edges) - { - if (((edge.edge_type != graphlib::EdgeType::kData) && - (edge.edge_type != graphlib::EdgeType::kDataLoopback)) && - (ops_to_remove.count(edge.consumer_node_id) == 0)) - { - outgoing_non_data_edges.insert(edge); - } - } - } - - std::vector attrs; - if (has_reduce or has_broadcast_c) - { - attrs.push_back((int)reduce_dim); - attrs.push_back(has_broadcast_c); - } - BudaOpNode *new_op = graph->add_node( - graphlib::create_node( - "_fused_op_" + std::to_string(id), graphlib::OpType("fused_op", attrs)), - graph->get_subgraph_id_for_node(output_op->id())); - new_op->set_shape(output_op->shape()); - - if (output_op->as()->has_tag("original_op_type")) - { - new_op->tag("original_op_type", output_op->as()->tag_value("original_op_type")); - } - - graph->copy_node_attributes(output_op, new_op); - - for (BudaOpNode *op : get_topo_order()) - { - graph->remove_node(op); - } - - for (std::uint32_t i = 0; i < input_edges.size(); i++) - { - Edge old_edge = input_edges.at(i); - Edge new_input_edge = - Edge(old_edge.producer_node_id, old_edge.producer_output_port_id, new_op->id(), i, EdgeType::kData); - graph->add_edge(new_input_edge); - graph->copy_edge_attributes(old_edge, new_input_edge); - } - - for (Edge output_edge : output_edges) - { - Edge new_output_edge = Edge( - new_op->id(), 0, output_edge.consumer_node_id, output_edge.consumer_input_port_id, output_edge.edge_type); - graph->add_edge(new_output_edge); - graph->copy_edge_attributes(output_edge, new_output_edge); - } - - for (Edge incoming_edge : incoming_non_data_edges) - { - Edge new_incoming_edge = Edge(incoming_edge.producer_node_id, 0, new_op->id(), 0, incoming_edge.edge_type); - graph->add_edge(new_incoming_edge); - } - - for (Edge outgoing_edge : outgoing_non_data_edges) - { - Edge new_outgoing_edge = Edge(new_op->id(), 0, outgoing_edge.consumer_node_id, 0, outgoing_edge.edge_type); - graph->add_edge(new_outgoing_edge); - } - - 
new_op->set_fused_op(std::make_shared(self, new_op, input_mapping, output_op, schedules)); -} - -// Look through the output cone of the node, and see if it converges on something in the fused op. -BudaOpNode *FusionGroup::converge_back_on_fused_op(Graph *graph, Node *node, std::unordered_set &visited) const -{ - // Depth-first search for something that's in this fused op - if (visited.count(node) > 0) - return nullptr; - - visited.insert(node); - // std::cout << " -- checking " << node->name() << std::endl; - for (Node *user : graph->data_users(node)) - { - if (user->node_type() != graphlib::kBudaOp) - continue; - - if (has_op(user->name())) - { - return user->as(); - } - - if (user->get_epoch_type() != node->get_epoch_type()) - continue; - - BudaOpNode *converged = converge_back_on_fused_op(graph, user, visited); - if (converged != nullptr) - return converged; - } - - return nullptr; -} - -// Remove op, and everything below it -void FusionGroup::remove_op_and_below( - Graph *graph, BudaOpNode *op, std::unordered_map &fused_nodes) -{ - remove_op(op, fused_nodes); - - for (Node *user : graph->data_users(op)) - { - if (user->node_type() != graphlib::kBudaOp) - continue; - - if (has_op(user->name())) - remove_op_and_below(graph, user->as(), fused_nodes); - } -} - -// Remove the cases where one part of the fork is inside the fused op, and the other is not, "wrapping" around -// other ops -void FusionGroup::remove_fork_wraps(Graph *graph, std::unordered_map &fused_nodes) -{ - std::unordered_set - cleared_nodes; // nodes we have determined are definitely ok and can't be causing a problem - while (nodes.size() > 1) - { - bool changed = false; - for (auto &[name, op] : nodes) - { - if (cleared_nodes.count(op) > 0) - continue; - - // Look for outputs that are not in the fused op - for (Node *user : graph->data_users(op)) - { - if (has_op(user)) - continue; - - std::unordered_set visited; - BudaOpNode *converge = converge_back_on_fused_op(graph, user, visited); - if (converge == nullptr) - continue; // ok, doesn't converge - - // Need to remove converging op, and anything that uses it - remove_op_and_below(graph, converge, fused_nodes); - changed = true; - } - - if (!changed) - cleared_nodes.insert(op); - else - break; // need to start the search again because the fused op has changed - } - - // check if we're done changing - if (!changed) - break; - } -} - -// Sort nodes in topo order, based on the global topo order -std::vector extract_topo_order( - const std::unordered_set &nodes, const std::vector &graph_topo) -{ - std::vector topo_order; - for (Node *node : graph_topo) - if (nodes.count(node->name()) > 0) - topo_order.push_back(node->as()); - TT_ASSERT(topo_order.size() == nodes.size()); - return topo_order; -} - -// Generate topological order of fused ops, using graph topological order as reference -void FusionGroup::generate_topo_order(const std::vector &graph_topo) -{ - TT_ASSERT(topo_order.size() == 0, "Topological order has already been created."); - std::unordered_set node_set; - for (auto &[name, node] : nodes) node_set.insert(name); - topo_order = extract_topo_order(node_set, graph_topo); -} - -// Pick first output in topo sort, and remove all others -void FusionGroup::pick_first_output( - Graph *graph, std::unordered_map &fused_nodes, const std::vector &topo) -{ - if (empty() || single_op()) - return; - - Node *first_output = nullptr; - for (Node *node : topo) - { - if (has_op(node)) - { - // data loopback edges must be outputs of fused ops, we can't fuse back to fwd - if 
(graph->user_edges(node, [](Edge edge) { return edge.edge_type == graphlib::EdgeType::kDataLoopback; }) - .size() > 0) - { - first_output = node; - } - - if (first_output == nullptr) - { - for (Node *user : graph->data_users(node)) - { - if (!has_op(user->name())) - { - // Found first output - first_output = node; - break; - } - } - } - } - if (first_output != nullptr) - break; - } - TT_ASSERT(first_output != nullptr, "There must be an output somewhere"); - - // Only nodes in this output's input cone are allowed. - std::unordered_set node_set; - for (auto &[name, node] : nodes) node_set.insert(node); - - auto input_cone = get_input_cone(graph, first_output->as(), node_set); - - std::unordered_set to_remove; - for (auto &[name, node] : nodes) - if (input_cone.count(node) == 0) - to_remove.insert(node); - - for (BudaOpNode *rm : to_remove) remove_op(rm, fused_nodes); - - // We might've created new outputs by removing nodes, so let's do it again - if (to_remove.size() > 0) - pick_first_output(graph, fused_nodes, topo); -} - -bool op_tagged_with_fuse_disable(const BudaOpNode *node) -{ - std::vector ops_tagged_with_fuse_disable = env_as_vector("PYBUDA_DISABLE_FUSE_TAGS"); - if (ops_tagged_with_fuse_disable.empty()) - { - return false; - } - - if (node->as()->has_tag("original_op_type")) - { - std::string original_op_type = - std::get(node->as()->tag_value("original_op_type")); - - // If the original op type is in the list of ops to disable fusing, then disable fusing - for (const std::string &op_type : ops_tagged_with_fuse_disable) - { - if (original_op_type == op_type or node->op_name() == op_type) - { - log_debug("Fusion disabled on node: {} because it matches with: {}", node->name(), op_type); - return true; - } - } - } - - return false; -} - -// Handle all prechecks if fusing should be attempted for given node. -// Returns false if node failed prechecks, true othervise. -bool should_fuse_node( - FusionGroupP fused_op, - BudaOpNode *node, - Graph *graph, - std::unordered_map &fused_nodes, - const std::vector> &op_names_to_chip_break, - const std::vector> &op_names_to_epoch_break) -{ - if (node->tag_value_or("dont_fuse", false)) - return false; - - // If node is already fused or fusion was already attempted for this node. - if (fused_nodes.count(node->id()) > 0) - return false; - - // Note: get_dram_input_count isn't perfect because op fusion happens before - // balancing so we miss out on potential e2e queue inputs. This only protects - // graph inputs from being fused past the limit - if (fused_op->get_input_count(graph, {node}) > FusedOp::kMaxNumInputs or - fused_op->get_dram_input_count(graph, {node}) > FusedOp::kMaxNumDRAMInputs or - fused_op->get_connection_count(graph, {node}) >= FusedOp::kMaxNumConnections) - return false; - - // Filter out operations that by defintion shouldn't be fused. - // Matmul (except for special cases of reduce and tile_broadcast) and buffer op - if ((is_matmul(node) && !is_allowed_matmul(graph, node)) || is_buffer(node)) - return false; - - // These operations are not supported for fusing on backend. - if (is_reduce_max(node) || is_splice(node) || node->is_embedding()) - return false; - - // If it is accumulation op don't fuse it. - if (node->is_gradient_op()) - return false; - - // If user has tagged the op_type/original_op_type with a tag that should disable fusing, then disable fusing - if (op_tagged_with_fuse_disable(node)) - return false; - - // Don't fuse operations that are explicitly marked for chip or epoch break. 
- /* - TODO: More optimal approach would be to fuse this op and don't fuse users of this op in this fusing op run. - That approach would increase complexity of this change due to explicit dependencies - on chip/epoch break op name down the stack which would be changed if op is fused. - Based on this reasoning leaving this as a follow up. - */ - if (is_str_in_strings(node->name(), op_names_to_chip_break) || - is_str_in_strings(node->name(), op_names_to_epoch_break)) - return false; - - // Can't fuse ops that are in differenct epoch types. - if (!fused_op->empty() && (fused_op->get_epoch_type() != node->get_epoch_type())) - return false; - - uint32_t reduce_dim = 0; - if (find_reduce_dim(node, reduce_dim)) - { - // We cannot allow reduce ops along different dimensions in one fused op. - if (fused_op->has_reduce_op() && reduce_dim != fused_op->get_reduce_dim()) - return false; - } - - bool disable_broadcast = env_as("PYBUDA_FUSE_DISABLE_BROADCAST"); - - // Block fusing of unsupported TMs. - for (Edge operand_edge : graph->operand_data_edges(node)) - { - Node *operand = graph->node_by_id(operand_edge.producer_node_id); - - // If this is matmul it is not alowed to fuse if input 1 is result of the same fused op. - if (is_matmul(node) && !is_tile_broadcast(node) && (operand_edge.consumer_input_port_id == 1) && fused_op->has_op(operand)) - { - return false; - } - - // If producer op is not fused it means that current egde is not fused, hence no need to check op fusing tm - // limits. - if (fused_op->has_op(operand)) - { - if (!are_allowed_tms( - graph->get_edge_attributes(operand_edge)->get_tms(), - disable_broadcast, - is_matmul(node), - operand_edge.producer_node_id)) - return false; - } - } - - return true; -} - -void expand_search( - FusionGroupP fused_op, - Graph *graph, - BudaOpNode *current_node, - std::unordered_map &fused_nodes, - const std::vector> &op_names_to_chip_break, - const std::vector> &op_names_to_epoch_break) -{ - // - // search below and above for more ops to fuse - // - - if (!should_fuse_node(fused_op, current_node, graph, fused_nodes, op_names_to_chip_break, op_names_to_epoch_break)) - return; - - fused_op->add_op(current_node); - fused_nodes.insert(std::make_pair(current_node->id(), fused_op)); - - // If this is the first reduce in this fused op remember its dimension. - uint32_t reduce_dim = 0; - if (find_reduce_dim(current_node, reduce_dim) && !fused_op->has_reduce_op()) - fused_op->set_reduce_op(reduce_dim); - - bool disable_broadcast = env_as("PYBUDA_FUSE_DISABLE_BROADCAST"); - - for (Edge operand_edge : graph->operand_data_edges(current_node)) - { - // Not supported to use fused result as input 1 of matmul. - if (is_matmul(current_node) && !is_tile_broadcast(current_node) && (operand_edge.consumer_input_port_id == 1)) - continue; - - auto tms = graph->get_edge_attributes(operand_edge)->get_tms(); - - // Producer op can be fused only if bellow op can consume all tms as part of fused op. - if (!are_allowed_tms(tms, disable_broadcast, is_matmul(current_node), operand_edge.producer_node_id)) - continue; - - Node *operand = graph->node_by_id(operand_edge.producer_node_id); - if (operand->node_type() == graphlib::kBudaOp) - expand_search( - fused_op, - graph, - operand->as(), - fused_nodes, - op_names_to_chip_break, - op_names_to_epoch_break); - } - - // Don't go beyond exp for now, because it's expensive and we don't want to do it multiple times - // which will happen in softmax... 
TODO make this more generic - if (current_node->op_type().op == "exp") - return; - - if (env_as("PYBUDA_FUSE_STOP_ON_RECIPROCAL") and current_node->op_type().op == "reciprocal") - return; - - for (Edge user_edge : graph->user_data_edges(current_node)) - { - // No need to check user tm limitation since expand_search will do all the checks. - Node *user = graph->node_by_id(user_edge.consumer_node_id); - if (user->node_type() == graphlib::kBudaOp) - expand_search( - fused_op, graph, user->as(), fused_nodes, op_names_to_chip_break, op_names_to_epoch_break); - } -} - -static void tag_ops_dont_fuse( - graphlib::Graph *graph, - const std::vector &op_names_dont_fuse, - const std::vector &op_names_manual_fuse) -{ - for (auto const& op_name : op_names_dont_fuse) - { - if (not graph->has_node_with_name(op_name)) - { - log_warning(LogFuser, "Node name specified in op_names_dont_fuse doesn't exist in graph {}", op_name); - continue; - } - - graph->get_node_by_name(op_name)->as()->tag("dont_fuse"); - } - - if (not op_names_manual_fuse.empty()) - { - auto regex_predicate = graphlib::query::Predicate::anyOf( - op_names_manual_fuse.begin(), op_names_manual_fuse.end(), graphlib::query::view_node_name); - auto predicate = graphlib::query::predicate_op_node_type() & regex_predicate.negate(); - for (Node *node : filter_nodes(graph, predicate)) - { - node->as()->tag("dont_fuse"); - } - } -} - -// We skip fusing op_types that have override for output_df in amp_properties -void skip_fusing_based_on_amp_properties(graphlib::Graph *graph, const std::vector &_properties) -{ - std::unordered_set op_types_skip_fusing; - for (const auto& amp_property : amp_properties) - { - if (amp_property.op_type.has_value() && amp_property.output_df.has_value()) - { - op_types_skip_fusing.insert(amp_property.op_type.value()); - } - } - - for (Node *node : graphlib::topological_sort(*graph)) - { - if(node->node_type() == graphlib::NodeType::kBudaOp) - { - std::string op_type = node->as()->op_type().op; - if (op_types_skip_fusing.find(op_type) != op_types_skip_fusing.end()) - { - // current node has op_type which has output_df override in amp_properties - // therefore, we mark that node with dont_fuse to skip fusing. - node->as()->tag("dont_fuse"); - } - } - } -} - -void fuse_ops( - graphlib::Graph *graph, - const DeviceConfig &device_config, - const std::vector> &op_names_to_chip_break, - const std::vector> &op_names_to_epoch_break, - const std::vector &op_names_dont_fuse, - const std::vector &op_names_manual_fuse, - const std::vector &_properties) -{ - // Map of node IDs and fused groups that contain that node. - // If fuse group is nullptr, that means that fusing was already attempted and failed for this node. - std::unordered_map fused_nodes; - - // Reinit next_fuse_id, since we could be retrying graph compilation. 
- FusionGroup::next_fuse_id = 0; - - log_debug(LogFuser, "Fusing ops..."); - - skip_fusing_based_on_amp_properties(graph, amp_properties); - tag_ops_dont_fuse(graph, op_names_dont_fuse, op_names_manual_fuse); - - std::vector fused_ops; - FusionGroupP fused_op = std::make_shared(); - auto topo = graphlib::topological_sort(*graph); - for (Node *node : topo) - { - if (node->node_type() == graphlib::kBudaOp) - { - BudaOpNode *op = node->as(); - if (!should_fuse_node(fused_op, op, graph, fused_nodes, op_names_to_chip_break, op_names_to_epoch_break)) - continue; - - log_trace(LogFuser, "Expand search from {}", node->name()); - expand_search( - fused_op, graph, node->as(), fused_nodes, op_names_to_chip_break, op_names_to_epoch_break); - log_trace(LogFuser, "Legalize fused op from {}, with {} fused ops", node->name(), fused_op->count()); - if (fused_op->legalize(graph, fused_nodes, topo)) - { - fused_ops.push_back(fused_op); - fused_op->print(); - fused_op = std::make_shared(); - } - } - } - - // Remove all tile broadcasts that can be replaced by input TM. - std::unordered_set to_delete_nodes; - for (Node *node : topo) - { - replace_tile_broadcasts(graph, node, fused_nodes, to_delete_nodes); - } - - // for (FusionGroupP f : fused_ops) f->print(); - std::uint32_t initial_count = graph->nodes().size(); - std::uint32_t fused_away = 0; - for (FusionGroupP f : fused_ops) fused_away += f->count() - 1; - - log_debug(LogFuser, "Initial op count: {}, fused away: {}", initial_count, fused_away); - - // Generate fused ops topo order - for (FusionGroupP f : fused_ops) - { - if (!(f->empty())) - { - f->generate_topo_order(topo); - } - } - - // Remove nodes marked for deletion by tile replace algorithm. - for (auto op : to_delete_nodes) graph->remove_node(op); - - // Make fusing graph changes - bool reuse_dest_on_srcA_only = device_config.is_grayskull(); - for (FusionGroupP f : fused_ops) - { - if (!(f->empty())) - { - f->fuse(graph, f, reuse_dest_on_srcA_only); - } - } - - // Clean up - remove any inputs that are no longer used - for (Node *node : graph->nodes()) - { - if ((node->node_type() == graphlib::kInput) && graph->user_edges(node).size() == 0) - graph->remove_node(node); - } -} - -std::shared_ptr FusedOp::clone(BudaOpNode *parent_buda_node) -{ - return std::make_shared(this->group->clone(), parent_buda_node, this->inputs, this->output_op, this->schedules); -} - -std::uint32_t FusedOp::id() const { return group->get_id(); } - -std::uint32_t FusedOp::get_input_count() const -{ - std::uint32_t input_count = 0; - for (auto sch : schedules) - { - for (FusedSubOp op : sch.ops) - { - for (auto i : op.inputs) - { - if ((i.type == FusedSubOpInput::InputType::INPUT) && (i.index + 1 > input_count)) - input_count = i.index + 1; - } - } - } - return input_count; -} - -// Return attributes that will be defined on fused op level. -BudaOpAttrs FusedOp::get_operation_attr() -{ - BudaOpAttrs attrs; - attrs["fused_op_id"] = (int)id(); - - if (node->buda_attrs().count("kernel_broadcast") and not env_as("PYBUDA_DISABLE_FUSED_KERNEL_BROADCAST")) - { - attrs["kernel_broadcast"] = node->buda_attrs().at("kernel_broadcast"); - } - - // Currently BE limitation is that approximate_mode can be specified only as fused op attribute. - // Logic for merging is that if any sub op requires precise result, do not allow approximate mode on entire fuse op. 
- bool exists = false; - graphlib::OpType::Attr value = true; - for (auto sch : schedules) - { - for (FusedSubOp op : sch.ops) - { - auto attr = op.op_type.buda_attrs.find("approximate_mode"); - if (attr != op.op_type.buda_attrs.end()) - { - exists = true; - value = attr->second; - continue; - } - - // Assert if op has attribute that is not supported on sub op level. - for (auto attr : op.op_type.buda_attrs) - { - if ((SubOpAttr[op.op_type.op].count(attr.first) > 0) || (SubOpAttr["*"].count(attr.first) > 0)) - { - TT_ASSERT( - "Operation: {}, contains attribute: {}, that is lost in fusing.", op.op_type.op, attr.first); - } - } - } - } - - if (exists) - { - attrs["approximate_mode"] = value; - } - - return attrs; -} - -std::pair FusedSubOp::get_mblock_for_ublock( - const std::pair ublock, const std::pair grid) const -{ - const std::uint32_t rt = ublock.first; - const std::uint32_t ct = ublock.second; - const std::uint32_t grid_r = grid.first; - const std::uint32_t grid_c = grid.second; - const balancer::TensorShape &ts = op_shape.outputs.at(0); - TT_ASSERT(ts.rt % grid_r == 0, "For sub-op {}, rt {} is not divisible by grid_r {}", name, ts.rt, grid_r); - TT_ASSERT(ts.ct % grid_c == 0, "For sub-op {}, ct {} is not divisible by grid_c {}", name, ts.ct, grid_c); - TT_ASSERT(ts.rt % (grid_r * rt) == 0, "For sub-op {}, rt {} is not divisible by ublock dim {}", name, ts.rt, rt); - TT_ASSERT(ts.ct % (grid_c * ct) == 0, "For sub-op {}, ct {} is not divisible by ublock dim {}", name, ts.ct, ct); - std::uint32_t m, n; - m = ts.rt / (rt * grid_r); - n = ts.ct / (ct * grid_c); - return std::make_pair(m, n); -} - -// Out of all buda attr return those that should be added to netlist on sub op level. -BudaOpAttrs FusedSubOp::get_sub_op_buda_attr() const -{ - BudaOpAttrs new_attr; - for (auto attr : op_type.buda_attrs) - { - if ((SubOpAttr[op_type.op].count(attr.first) > 0) || (SubOpAttr["*"].count(attr.first) > 0)) - { - new_attr.insert(attr); - } - } - - return new_attr; -} - -FusedOp::FusedOp( - FusionGroupP group, - BudaOpNode *node, - InputMapping inputs, - BudaOpNode *output_op, - std::vector schedules) : - group(group), node(node), inputs(inputs), output_op(output_op), schedules(schedules) -{ - has_matmul_ = group->has_matmul_op(); - has_reduce_ = group->has_reduce_op(); - has_broadcast_c_ = group->has_broadcast_c_tm(); - reduce_dim_ = group->get_reduce_dim(); -} - -} // namespace tt diff --git a/pybuda/csrc/passes/fuse_ops.hpp b/pybuda/csrc/passes/fuse_ops.hpp deleted file mode 100644 index 22b105aec..000000000 --- a/pybuda/csrc/passes/fuse_ops.hpp +++ /dev/null @@ -1,154 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "backend_api/device_config.hpp" -#include "balancer/types.hpp" -#include "graph_lib/node_types.hpp" -#include "passes_utils.hpp" -#include "passes/amp.hpp" -namespace tt -{ - -struct DeviceConfig; - -// Attributes that can be specified on sub op level. -// TODO: Instead of hardcoding these values here get them from backend api. 
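A small illustration of the attribute filtering that get_sub_op_buda_attr performs against the SubOpAttr whitelist below: an attribute survives on the sub-op level only if it is allowed for that specific op type or under the "*" wildcard. The helper and names here are stand-ins for illustration, not part of the original code:

using AttrWhitelist = std::unordered_map<std::string, std::unordered_set<std::string>>;

std::vector<std::string> keep_attrs(
    const AttrWhitelist &allowed, const std::string &op, const std::vector<std::string> &attrs)
{
    std::vector<std::string> kept;
    auto allowed_for = [&](const std::string &key, const std::string &attr)
    {
        auto it = allowed.find(key);
        return it != allowed.end() && it->second.count(attr) > 0;
    };
    for (const auto &a : attrs)
        if (allowed_for(op, a) || allowed_for("*", a))
            kept.push_back(a);
    return kept;
}
// e.g. with allowed = {{"*", {"relu_en"}}, {"lrelu", {"slope"}}}:
//   keep_attrs(allowed, "lrelu", {"slope", "relu_en", "approximate_mode"}) -> {"slope", "relu_en"}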
-inline static unordered_map> SubOpAttr = { - {"*", std::unordered_set{"m_k", "u_kt", "vector", "relu_en", "relu_threshold", "relu_mode"}}, - {"reduce", std::unordered_set{"dim", "type"}}, - {"dropout", std::unordered_set{"p", "seed"}}, - {"lrelu", std::unordered_set{"slope"}}, - {"power", std::unordered_set{"exp"}}, -}; - -// Main entry -void fuse_ops( - graphlib::Graph *graph, - const DeviceConfig &device_config, - const std::vector> &op_names_to_chip_break, - const std::vector> &op_names_to_epoch_break, - const std::vector &op_names_dont_fuse, - const std::vector &op_names_manual_fuse, - const std::vector &_properties); - -// Op input type / id -struct FusedSubOpInput -{ - enum InputType - { - INPUT, - INTERMED, - DEST - } type; - - std::uint32_t index; // either input index, or buffer index - std::pair broadcast = {0, 0}; - std::pair tile_broadcast = {false, false}; - - bool has_broadcast() const { return broadcast.second != 0; } - bool has_tile_broadcast() const { return tile_broadcast.first || tile_broadcast.second; } -}; - -// Op within fused op -struct FusedSubOp -{ - std::string name; - graphlib::OpType op_type; - balancer::OpShape op_shape; - - // Inputs are either other ops or intermediate buffers - std::vector inputs; - - enum OutputType - { - OUTPUT, - DEST, - INTERMED - } output_type; - std::uint32_t output_buffer; // only valid for INTERMED type - DataFormat output_df; - - std::unordered_map attrs; - std::vector popped_buffers; - std::vector popped_last_buffers; - - std::pair get_mblock_for_ublock( - const std::pair ublock, const std::pair grid) const; - - BudaOpAttrs get_sub_op_buda_attr() const; -}; - -// Single sequential scheduled of fused ops to run through -struct FusedSchedule -{ - std::vector ops; -}; - -class FusionGroup; -using FusionGroupP = std::shared_ptr; -class FusedOp; -using FusedOpP = std::shared_ptr; -using InputMapping = std::unordered_map>; -using BudaOpNode = graphlib::BudaOpNode; - -// Represet information about the new op in the graph, made from the ops in fusion group -class FusedOp -{ - public: - static constexpr int kMaxNumDRAMInputs = 8; - static constexpr int kMaxNumInputs = 16; - static constexpr int kMaxNumConnections = kMaxNumInputs + 1; // +1 for at least 1 output connection - - private: - FusionGroupP group; -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-private-field" - BudaOpNode *node; // fused node - - // Ordered list of inputs - pair of nodes which inputs are connected to, and their operand index - InputMapping inputs; - - // Output op within fused nodes - BudaOpNode *output_op; -#pragma GCC diagnostic pop - - // Schedules to execute the op - std::vector schedules; - - bool has_matmul_ = false; - bool has_reduce_; - bool has_broadcast_c_ = false; - // Reduce dim - only one for all reduces allowed - std::uint32_t reduce_dim_; - - public: - // Construct a new fused op, fusing itself into the graph - FusedOp( - FusionGroupP group, - BudaOpNode *node, - InputMapping inputs, - BudaOpNode *output_op, - std::vector schedules); - - std::shared_ptr clone(BudaOpNode *node); - - const std::vector &get_schedules() const { return schedules; } - std::vector &get_schedules() { return schedules; } - std::uint32_t id() const; - std::uint32_t get_input_count() const; - BudaOpAttrs get_operation_attr(); - - bool has_matmul_op() const { return has_matmul_; } - bool has_reduce_op() const { return has_reduce_; } - bool has_broadcast_c() const {return has_broadcast_c_; } - - std::uint32_t get_reduce_dim() const - { - 
TT_ASSERT(has_reduce_op()); - return reduce_dim_; - } -}; - -} // namespace tt diff --git a/pybuda/csrc/passes/fuse_redundant_tm_sequence.cpp b/pybuda/csrc/passes/fuse_redundant_tm_sequence.cpp index baa3112a7..83de8006b 100644 --- a/pybuda/csrc/passes/fuse_redundant_tm_sequence.cpp +++ b/pybuda/csrc/passes/fuse_redundant_tm_sequence.cpp @@ -46,40 +46,126 @@ graphlib::Shape replacement_output_shape(graphlib::Shape input_shape, const TMPa return input_shape; } +std::string pattern_to_string(const TMPattern& pattern) { + std::stringstream ss; + for (uint i = 0; i < pattern.size(); i++) { + ss << pattern[i].op_name; + if (pattern[i].attrs.size() > 0) { + ss << "("; + for (auto attr : pattern[i].attrs) { + ss << attr << ","; + } + ss << ")"; + } + if (i < pattern.size() - 1) + ss << "-->"; + } + return ss.str(); +} -void replace_pattern_with_new_pattern( +bool replace_pattern_with_new_pattern( tt::graphlib::Graph* graph, const TMPattern& current_pattern, const TMPattern& replace_pattern, graphlib::Node *sequence_producer, - graphlib::Node * node) { + graphlib::Node * terminal_node) { - //Bypass all nodes until the end of the current pattern + log_debug(LogTMFusion, "Trying to replace pattern from {} to {}.", pattern_to_string(current_pattern), pattern_to_string(replace_pattern)); + + bool multiple_user = false; + std::vector users; + graphlib::Node * fuse_node = nullptr; + + // Check whether the matched pattern has multiple user or not + // if there are multiple user at the end of the pattern matched node and + // multiple user are same op and same shape + // then the matched pattern can be fused by using replace pattern + // and other user nodes are connected to the fused op. auto current_node = graph->users(sequence_producer)[0]; + while (current_node != terminal_node) { + users = graph->users(current_node); + if (users.size() > 1) { + bool user_is_terminal_node = std::find(users.begin(), users.end(), terminal_node) != users.end(); + + // If there is a fork in the middle of the matched pattern, we cannot fuse TMs + if (!user_is_terminal_node) { + log_debug(LogTMFusion, "There is a fork in the middle of the matched pattern - cannot fuse tms."); + return false; + } + + OpType op_type = terminal_node->as()->op_type(); + for (auto& user : users) { + + if (user->node_type() != graphlib::NodeType::kPyOp) { + // All users should be PyOps + return false; + } + + if (user->as()->op_type().op != op_type.op) { + // All users should be the same op + log_debug(LogTMFusion, "There is a user at the end of the matched pattern which is different op - cannot fuse tms."); + return false; + } + + if (user->shape() != terminal_node->shape()) { + // All users should have the same shape + log_debug(LogTMFusion, "There is a user at the end of the matched pattern which is same op but different shape - cannot fuse tms."); + return false; + } + } + multiple_user = true; + break; + + } + current_node = users[0]; + } + + // remove the edges of the users if it is same op and same shape + if (multiple_user) { + for (auto& user : users) { + if (user != terminal_node) { + auto edge_to_remove = graph->get_edges(current_node, user)[0]; + graph->remove_edge(edge_to_remove); + } + } + } + // Bypass all nodes until the end of the current pattern + current_node = graph->users(sequence_producer)[0]; // remove old pattern - while (current_node != node) { + while (current_node != terminal_node) { TT_ASSERT(graph->users(current_node).size() == 1); auto next_node = graph->users(current_node)[0]; bypass_node(graph, current_node, true); 
current_node = next_node; } - TT_ASSERT(graph->get_edges(sequence_producer, node).size() == 1); - auto current_edge = graph->get_edges(sequence_producer, node)[0]; + TT_ASSERT(graph->get_edges(sequence_producer, terminal_node).size() == 1); + auto current_edge = graph->get_edges(sequence_producer, terminal_node)[0]; for (uint i = 0; i < replace_pattern.size(); i++) { auto op = replace_pattern[i]; std::string name = sequence_producer->name() + "_fused_tm_op_" + std::to_string(i); auto new_node = graph->add_node( std::make_unique(name, op.as_op_type()), graph->get_subgraph_id_for_node(sequence_producer->id())); + fuse_node = new_node; auto [new_in_edge, new_out_edge] = graphlib::insert_node_on_edge(graph, current_edge, new_node); current_edge = new_out_edge; } // Remove the final node - bypass_node(graph, node, true); + bypass_node(graph, terminal_node, true); + + // connect the edge of the users to the fused op + if (multiple_user) { + for (auto& user : users){ + if (user != terminal_node) + graph->add_edge(fuse_node, user); + } + } + recalculate_shapes(graph); log_info(LogTMFusion, "Found replaceable TM sequence. Fuse from {} tms into {} tms.", current_pattern.size(), replace_pattern.size()); + return true; } @@ -164,12 +250,13 @@ bool fuse_tm_sequences(tt::graphlib::Graph* graph,TMPatternPairs& pattern_map) { bool same_shape = output_shape == op->shape(); if (same_pattern and same_shape) { // Replace current pattern with replace pattern - replace_pattern_with_new_pattern(graph, current_pattern, replace_pattern, sequence_producer, node); + bool is_pattern_replaced = replace_pattern_with_new_pattern(graph, current_pattern, replace_pattern, sequence_producer, node); // Break and reset current_pattern.clear(); sequence_producer = nullptr; - updated = true; - updated_anything = true; + updated = is_pattern_replaced; + if (is_pattern_replaced) + updated_anything = is_pattern_replaced; potential_prefix = true; continue; } diff --git a/pybuda/csrc/passes/lower_to_mlir.cpp b/pybuda/csrc/passes/lower_to_mlir.cpp new file mode 100644 index 000000000..5317ebeaa --- /dev/null +++ b/pybuda/csrc/passes/lower_to_mlir.cpp @@ -0,0 +1,320 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 +#include "lower_to_mlir.hpp" + +// Standard headers +#include +#include + +// PyBuda headers +#include "graph_lib/graph.hpp" +#include "graph_lib/node.hpp" +#include "graph_lib/utils.hpp" +#include "graph_lib/node_types.hpp" +#include "utils/logger.hpp" + +// MLIR headers +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-local-typedef" +#include "mlir/Dialect/Func/IR/FuncOps.h" +#include "mlir/Dialect/Tensor/IR/Tensor.h" +#include "mlir/IR/BuiltinOps.h" +#include "mlir/IR/Builders.h" +#include "mlir/IR/MLIRContext.h" +#include "mlir/IR/OperationSupport.h" +#include "mlir/IR/Verifier.h" +#pragma clang diagnostic pop + +// TTMLIR headers +#include "ttmlir/Dialect/TT/IR/TT.h" +#include "ttmlir/Dialect/TT/IR/TTOpsTypes.h" +#include "ttmlir/Dialect/TTIR/IR/TTIR.h" +#include "ttmlir/Dialect/TTIR/IR/TTIROps.h" +#include "ttmlir/Dialect/TTIR/IR/TTIROps.h" + +namespace +{ +using namespace tt; +/** + * @brief Implementation of TT-MLIR emission from the PyBuda graph. + */ +class MLIRGenerator +{ + public: + /// Construct a new MLIRGenerator object. + MLIRGenerator(mlir::MLIRContext &context) : builder_(&context) {} + + /// Public API: Convert the PyBuda graph into an MLIR module operation for TTIR. 
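Restating the new multi-user precondition in replace_pattern_with_new_pattern above: when the node feeding the terminal op has several users, the matched sequence is fused only if every such user is the same op with the same shape as the terminal node (and, in the real pass, a PyOp), so all of them can be re-attached to the single fused producer. A simplified sketch, with MiniNode as a hypothetical stand-in for the graphlib node interface:

struct MiniNode
{
    std::string op;
    std::vector<int> shape;
};

bool users_allow_fusion(const MiniNode &terminal, const std::vector<MiniNode> &users)
{
    for (const MiniNode &user : users)
        if (user.op != terminal.op || user.shape != terminal.shape)
            return false;  // a mismatched user would still need the unfused TM chain
    return true;
}
// users_allow_fusion({"reshape", {1, 1, 32, 32}},
//                    {{"reshape", {1, 1, 32, 32}}, {"transpose", {1, 1, 32, 32}}}) == false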
+ mlir::ModuleOp emit_mlir(graphlib::Graph *graph) + { + graphModule_ = mlir::ModuleOp::create(get_module_location(graph), "pybuda_graph"); + graphModule_->setAttr(mlir::tt::SystemDescAttr::name, + mlir::tt::SystemDescAttr::getDefault(builder_.getContext())); + + builder_.setInsertionPointToStart(&graphModule_.getBodyRegion().front()); + + emit_mlir_function(graph); + + /// Verify the module after we have finished constructing it, this will check + /// the structural properties of the IR and invoke any specific verifiers we + /// have on the TTIR operations. + if (failed(mlir::verify(graphModule_))) + { + graphModule_.emitError("module verification failed."); + return nullptr; + } + + mlir::OpPrintingFlags printFlags; + printFlags.enableDebugInfo(); + graphModule_.print(llvm::outs(), printFlags); + + return graphModule_; + } + + + private: + /// A "module" matches a PyBuda graph: containing a single function to exectue. + mlir::ModuleOp graphModule_; + /// The builder is a helper class to create IR. The builder + /// is stateful, in particular it keeps an "insertion point": this is where + /// the next operations will be introduced. + mlir::OpBuilder builder_; + // The symbol table maintains a mapping between the names of pybuda nodes and their corresponding values in the current scope. + // Initially, the function arguments (model activations) are added to the symbol table. + // After evaluating each pybuda op node, the declare function adds a new entry to the symbol table for future reference. + std::map> symbolTable_; + + /// Declares a variable in the current (only) scope. + /// The declaration corresponds to exactly one operation node in the PyBuda graph. + void declare(graphlib::Node *node, mlir::Value value) { + if (symbolTable_.find(node->name()) != symbolTable_.end()) + { + throw std::runtime_error("Variable " + node->name() + " already declared in the current scope."); + } + + symbolTable_[node->name()] = {value, node}; + } + + /// Emit a new function in MLIR. + /// A function represents a set of PyBuda operations that are executed to produce output results. + /// This function will generate the MLIR code for each PyBuda operation in the graph and emit the return operation for the function. + mlir::func::FuncOp emit_mlir_function(tt::graphlib::Graph *graph) { + // Assemble the function arguments (inputs) + llvm::SmallVector arguments; + + for (auto *input : graph->nodes_by_type(tt::graphlib::kInput)) + { + arguments.push_back(get_node_type(input)); + } + + // Assemble the function return values (outputs) + llvm::SmallVector returns; + for (auto *output : graph->nodes_by_type(tt::graphlib::kOutput)) + { + returns.push_back(get_node_type(output)); + } + + // Create the function and emit it in the MLIR module. + auto funcType = builder_.getType(mlir::TypeRange(arguments), mlir::TypeRange(returns)); + auto func = builder_.create(graphModule_.getLoc(), "main", funcType); + + // Start the body of the function by creating an entry block. + mlir::Block *entryBlock = func.addEntryBlock(); + + // Declare function arguments in the symbol table + for(auto namedValue: llvm::zip(graph->nodes_by_type(tt::graphlib::kInput), entryBlock->getArguments())) + { + auto node = std::get<0>(namedValue); + auto arg = std::get<1>(namedValue); + declare(node, arg); + } + + // Set the insertion point in the builder to the beginning of the function + // body, it will be used throughout the codegen to create operations in this + // function. 
+ builder_.setInsertionPointToStart(entryBlock); + + // Walk the graph in topological order and generate MLIR for each PyBuda operation + // node in the graph. For each new operation result, declare it in the symbol table. + for (auto *node : graphlib::topological_sort(*graph)) + { + // Skip if the node isn't PyBuda operation + if (node->node_type() != tt::graphlib::NodeType::kPyOp) + { + continue; + } + + log_trace(LogMLIRGenerator, "Emitting MLIR for node {}", node->name()); + + tt::graphlib::OpNode *op_node = dynamic_cast(node); + // Emit MLIR for the PyBuda operation node + mlir::Value opValue = emit_mlir_pybuda_operation(graph, op_node); + + log_trace(LogMLIRGenerator, "Generated MLIR for node {} with value {}", node->name(), covnert_mlir_value_to_string(opValue)); + } + + emit_mlir_return_op(graph); + + return func; + } + + /// Emit an MLIR operation for a PyBuda node. + mlir::Value emit_mlir_pybuda_operation(tt::graphlib::Graph *graph, tt::graphlib::OpNode *op_node) + { + mlir::Value opResult; + if (tt::graphlib::is_eltwise(op_node)) + { + opResult = emit_mlir_pybuda_elementwise_op(graph, op_node); + } + + // This is the first time we are visiting this PyBuda node during the traversal of the graph using topological sort. + // Therefore, we need to declare the result of this operation so that we can refer to it later if needed. + declare(op_node, opResult); + + return opResult; + } + + /// Emit an MLIR operation for a PyBuda elementwise operation. + mlir::Value emit_mlir_pybuda_elementwise_op(tt::graphlib::Graph *graph, tt::graphlib::OpNode *op_node) + { + // Evaluate operation return type + llvm::SmallVector return_type_vector; + return_type_vector.push_back(get_node_type(op_node)); + mlir::TypeRange return_types(return_type_vector); + + // Creating input value range for the operation + // Since we are traversing the PyBuda graph using topological sort, + // all operands must be present in the symbol table. + // We iterate over the operands of the current node and retrieve their corresponding values from the symbol table. + llvm::SmallVector input_vector; + for (auto operand : graph->operands(op_node)) + { + input_vector.push_back(symbolTable_.at(operand->name()).first); + } + + mlir::ValueRange inputs(input_vector); + + // Creating output value range for the operation by creating an empty tensor to hold the output value + llvm::SmallVector output_vector; + output_vector.push_back(emit_mlir_empty_tensor(graph, op_node)); + mlir::ValueRange outputs = mlir::ValueRange(output_vector); + + // Create an array attribute with three elements, each representing an operand constraint of type "AnyDevice" + auto atributes = builder_.getArrayAttr(llvm::SmallVector( + 3, builder_.getAttr( + mlir::tt::OperandConstraint::AnyDevice))); + + if (op_node->op_name() == "add") + { + auto opResult = builder_.create(get_pybuda_operation_location(graph, op_node), return_types, inputs, outputs, atributes); + return opResult.getResult(0); + } + else if (op_node->op_name() == "multiply") + { + auto opResult = builder_.create(get_pybuda_operation_location(graph, op_node), return_types, inputs, outputs, atributes); + return opResult.getResult(0); + } + else { + log_error("Unsupported operation for lowering from PyBuda to TTIR: {}", op_node->op_name()); + throw std::runtime_error("Unsupported operation for lowering from PyBuda to TTIR"); + } + } + + /// Emit an MLIR operation for an empty tensor. 
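The add and multiply branches in emit_mlir_pybuda_elementwise_op above differ only in the TTIR op class they instantiate. A sketch of a helper that would make that symmetry explicit for any TTIR op sharing the same builder signature; this is an illustration, not part of the change:

template <typename TTIROp>
mlir::Value emit_binary_eltwise(
    mlir::OpBuilder &builder, mlir::Location loc, mlir::TypeRange return_types,
    mlir::ValueRange inputs, mlir::ValueRange outputs, mlir::ArrayAttr attributes)
{
    // Same argument order as the builder_.create calls above.
    auto op = builder.create<TTIROp>(loc, return_types, inputs, outputs, attributes);
    return op->getResult(0);
}
// usage inside the dispatch:
//   return emit_binary_eltwise<mlir::tt::ttir::AddOp>(builder_, loc, return_types, inputs, outputs, atributes);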
+ mlir::Value emit_mlir_empty_tensor(tt::graphlib::Graph *graph, tt::graphlib::Node *node) + { + llvm::SmallVector shape_vec; + for(auto dim : node->shape().as_vector()) + { + shape_vec.push_back((int64_t)dim); + } + + return builder_.create(get_pybuda_operation_location(graph, node), shape_vec, get_float_type(node)); + } + + /// Emit the return operation for the function. + void emit_mlir_return_op(tt::graphlib::Graph *graph) + { + // Assemble the function return values (outputs) + llvm::SmallVector returnValues; + for (auto *output : graph->nodes_by_type(tt::graphlib::kOutput)) + { + auto output_operand = graph->operands(output)[0]; + auto outputValue = symbolTable_[output_operand->name()].first; + returnValues.push_back(outputValue); + } + + builder_.create(builder_.getUnknownLoc(), mlir::ValueRange(returnValues)); + } + + /// Get the MLIR float type type for a PyBuda node. + mlir::FloatType get_float_type(graphlib::Node *node) + { + switch (node->output_df()) + { + case tt::DataFormat::Float32: + return builder_.getF32Type(); + case tt::DataFormat::Float16_b: + return builder_.getF16Type(); + case tt::DataFormat::Float16: + return builder_.getF16Type(); + default: + TT_ASSERT(false); + } + + // TODO add all supported types in switch + return builder_.getF32Type(); + } + + /// Get the MLIR type for a PyBuda node. + mlir::Type get_node_type(graphlib::Node *node) + { + std::vector shape_vec; + for (auto dim : node->shape().as_vector()) + { + shape_vec.push_back((int64_t)dim); + } + return mlir::RankedTensorType::get(shape_vec, get_float_type(node)); + } + + /// Get the location for a module. + mlir::Location get_module_location(tt::graphlib::Graph *graph) + { + return mlir::FileLineColLoc::get(builder_.getContext(), graph->name(), graph->id(), 0); + } + + /// Get the simple location for a node in a format "graph_name", (graph_id), (node_id) + mlir::Location get_node_location(tt::graphlib::Graph *graph, tt::graphlib::Node *node) + { + return mlir::FileLineColLoc::get(builder_.getContext(), graph->name(), graph->id(), node->id()); + } + + /// Get the location for a PyBuda operation. The location is a combination of the operation name and the node location. + mlir::Location get_pybuda_operation_location(tt::graphlib::Graph *graph, tt::graphlib::Node *node) + { + return mlir::NameLoc::get(builder_.getStringAttr(node->name()), get_node_location(graph, node)); + } + + /// Convert an MLIR value to a string. + std::string covnert_mlir_value_to_string(mlir::Value &value) + { + std::string string_value; + llvm::raw_string_ostream os(string_value); + + os << value; + + os.flush(); + return string_value; + } +}; +} + +namespace tt::passes +{ + /// Public API for generating MLIR from the PyBuda graph. + mlir::OwningOpRef lower_to_mlir(graphlib::Graph * graph, mlir::MLIRContext& context) + { + return MLIRGenerator(context).emit_mlir(graph); + } +} diff --git a/pybuda/csrc/passes/lower_to_mlir.hpp b/pybuda/csrc/passes/lower_to_mlir.hpp new file mode 100644 index 000000000..c6ec163a3 --- /dev/null +++ b/pybuda/csrc/passes/lower_to_mlir.hpp @@ -0,0 +1,21 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 +#pragma once +namespace tt::graphlib +{ +class Graph; +} + +namespace mlir { + class MLIRContext; + class ModuleOp; + template class OwningOpRef; +} // namespace mlir + +namespace tt::passes +{ + // Public API for generating MLIR from the PyBuda graph. 
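get_float_type() above folds Float16 and Float16_b to the same f16 type. A sketch of the mapping one would typically expect if Float16_b denotes bfloat16 (that is an assumption; if it does not, the original switch is already right):

    #include <iostream>
    #include <string>

    enum class DataFormat { Float32, Float16, Float16_b };

    // Presumed mapping: Float32 -> f32, Float16 -> f16, Float16_b -> bf16
    // (the bf16 case would be builder_.getBF16Type() rather than getF16Type()).
    std::string mlir_float_type_name(DataFormat df)
    {
        switch (df)
        {
            case DataFormat::Float32:   return "f32";
            case DataFormat::Float16:   return "f16";
            case DataFormat::Float16_b: return "bf16";
        }
        return "f32";  // fallback, mirroring the TODO above
    }

    int main()
    {
        std::cout << mlir_float_type_name(DataFormat::Float16_b) << "\n";  // bf16
    }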
+ mlir::OwningOpRef lower_to_mlir(tt::graphlib::Graph * graph, mlir::MLIRContext& context); +} // namespace tt:passes + diff --git a/pybuda/csrc/passes/lowering_context.cpp b/pybuda/csrc/passes/lowering_context.cpp index 5a1bcfe51..7aeda49ea 100644 --- a/pybuda/csrc/passes/lowering_context.cpp +++ b/pybuda/csrc/passes/lowering_context.cpp @@ -2,16 +2,14 @@ // // SPDX-License-Identifier: Apache-2.0 #include "passes/lowering_context.hpp" + #include "buda_passes.hpp" -#include "utils/assert.hpp" -#include "utils/logger.hpp" #include "graph_lib/node_types.hpp" #include "graph_lib/utils.hpp" -#include "placer/dram.hpp" -#include "placer/utils.hpp" -#include "reportify/reportify.hpp" #include "passes/decomposing_context.hpp" - +#include "reportify/reportify.hpp" +#include "utils/assert.hpp" +#include "utils/logger.hpp" namespace tt { @@ -309,19 +307,6 @@ Node *lower_queue(Graph *old_graph, Graph *new_graph, Node *old_node, NodeToNode } } - // - // WA for backend/golden issue which doesn't handle ops that format convert. This is especially exposed - // since we will demote F32 ops to F16b, so this workaround also demotes inputs of F32 to F16b which - // enables our current test suite to pass. - // - // tenstorrent/budabackend#274 - // - /*if (new_node->node_type() == NodeType::kInput and new_node->output_df() == DataFormat::Float32) { - new_node->set_output_df(DataFormat::Float16_b); - log_warning( - LogGraphCompiler, - "Demoting f32 input to f16b tenstorrent/budabackend#274"); - }*/ return new_node; } @@ -392,17 +377,25 @@ void copy_operand_edges_to_new_graph( void lower_edge_tms(Graph *old_graph, Edge &old_edge, std::shared_ptr new_attr) { - // Broadcasts were in the original dimensions, so we need to conver to 4d buda + // Broadcasts were in the original dimensions, so we need to convert to 4d buda std::vector old_tms = old_graph->get_edge_attributes(old_edge)->get_tms(); for (const graphlib::OpType &tm : old_tms) { - int delta = 4 - old_graph->node_by_id(old_edge.producer_node_id)->shape().as_vector().size(); + // Handle delta calculation for producers that are greater then 4D. For 4D shapes + // and below, we need to account for 4 dimensions to match the Buda expectations. + int delta = 0; + int producer_rank = old_graph->node_by_id(old_edge.producer_node_id)->shape().as_vector().size(); + if (producer_rank <= 4) { + delta = 4 - producer_rank; + producer_rank = 4; + } + auto new_tm = graphlib::OpType(tm); // If TM attr is referenced backwards (negative indexing), directly convert to positive axis. 
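lower_to_mlir above hands the module back as an owning reference; the OwningOpRef template argument (presumably ModuleOp) was lost in formatting. A minimal sketch of that ownership plus verification against the upstream MLIR API, assuming an MLIR development install and using none of the tt-mlir dialects:

    #include "mlir/IR/BuiltinOps.h"
    #include "mlir/IR/Location.h"
    #include "mlir/IR/MLIRContext.h"
    #include "mlir/IR/OwningOpRef.h"
    #include "mlir/IR/Verifier.h"
    #include "mlir/Support/LogicalResult.h"

    int main()
    {
        mlir::MLIRContext context;

        // The module is owned by the OwningOpRef and erased when the ref goes out of scope.
        mlir::OwningOpRef<mlir::ModuleOp> module =
            mlir::ModuleOp::create(mlir::UnknownLoc::get(&context), "pybuda_graph");

        // Structural verification, as at the end of emit_mlir() above.
        mlir::ModuleOp m = module.get();
        if (mlir::failed(mlir::verify(m)))
            return 1;

        m->dump();
        return 0;
    }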
if (std::get(new_tm.attr[0]) < 0) { - std::get(new_tm.attr[0]) += 4; + std::get(new_tm.attr[0]) += producer_rank; } else { std::get(new_tm.attr[0]) += delta; } diff --git a/pybuda/csrc/passes/lowering_context.hpp b/pybuda/csrc/passes/lowering_context.hpp index d3d7560fd..18f0d9be7 100644 --- a/pybuda/csrc/passes/lowering_context.hpp +++ b/pybuda/csrc/passes/lowering_context.hpp @@ -3,9 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once -#include "balancer/balancer.hpp" -#include "placer/placer.hpp" -#include "placer/dram.hpp" #include "graph_lib/node.hpp" #include "graph_lib/node_types.hpp" #include "shared_utils/sparse_matmul_utils.hpp" @@ -17,6 +14,7 @@ namespace tt { using Graph = graphlib::Graph; using Node = graphlib::Node; using NodeContext = graphlib::NodeContext; +using Edge = graphlib::Edge; using NodeToNodeMap = std::unordered_map; class LoweringContext { @@ -100,5 +98,4 @@ void copy_operand_edges_to_new_graph( ); void lower_edge_tms(Graph *old_graph, Edge &old_edge, std::shared_ptr new_attr); - } diff --git a/pybuda/csrc/passes/mlir_compiler.cpp b/pybuda/csrc/passes/mlir_compiler.cpp new file mode 100644 index 000000000..bda168640 --- /dev/null +++ b/pybuda/csrc/passes/mlir_compiler.cpp @@ -0,0 +1,67 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 +#include "mlir_compiler.hpp" +#include +#include "lower_to_mlir.hpp" +#include "mlir_passes.hpp" + +// PyBuda headers +#include "graph_lib/graph.hpp" + +// MLIR headers +#include "mlir/IR/BuiltinOps.h" + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wunused-local-typedef" +#include "mlir/InitAllDialects.h" +#pragma clang diagnostic pop + +// TTMLIR headers +#include "tt/runtime/types.h" +#include "ttmlir/Dialect/TT/IR/TT.h" +#include "ttmlir/Dialect/TTIR/IR/TTIR.h" +#include "ttmlir/Dialect/TTNN/IR/TTNN.h" +#include "ttmlir/Target/TTNN/TTNNToFlatbuffer.h" + +#include "tt_torch_device/tt_device.hpp" + +namespace tt::passes +{ + /// Public API for lowering to MLIR, running MLIR passes and generate runtime binary. + runtime::Binary run_mlir_compiler(tt::graphlib::Graph *graph) + { + // Register all the required dialects. + mlir::DialectRegistry registry; + + registry.insert< + mlir::tt::TTDialect, mlir::tt::ttir::TTIRDialect, + mlir::tt::ttnn::TTNNDialect, mlir::arith::ArithDialect, + mlir::func::FuncDialect, mlir::ml_program::MLProgramDialect, + mlir::tensor::TensorDialect>(); + + // Create a context with all registered dialects. + mlir::MLIRContext context(registry); + // Load all available dialects + context.loadAllAvailableDialects(); + + // Generate MLIR from the PyBuda graph. + mlir::OwningOpRef mlir_module = lower_to_mlir(graph, context); + tt::log_info("MLIR module generated successfully."); + + // Run MLIR registered passes. + run_mlir_passes(mlir_module); + tt::log_info("MLIR passes run successfully."); + + // Generate binary from the MLIR module. 
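The lower_edge_tms change above pads producer shapes below 4-D up to the 4-D Buda layout and leaves larger ranks alone. A standalone check of the axis arithmetic for negative and non-negative TM attributes, with illustrative ranks:

    #include <cassert>

    // Mirrors the adjusted-axis computation in lower_edge_tms:
    // ranks <= 4 are padded up to 4-D, larger ranks keep their own rank.
    int adjust_tm_axis(int axis, int producer_rank)
    {
        int delta = 0;
        if (producer_rank <= 4)
        {
            delta = 4 - producer_rank;
            producer_rank = 4;
        }
        return (axis < 0) ? axis + producer_rank  // negative indexing -> positive axis
                          : axis + delta;         // positive axis shifted by the padding
    }

    int main()
    {
        assert(adjust_tm_axis(-1, 3) == 3);  // last dim of a 3-D producer -> dim 3 in 4-D
        assert(adjust_tm_axis( 1, 3) == 2);  // dim 1 of a 3-D producer -> dim 2 in 4-D
        assert(adjust_tm_axis(-1, 5) == 4);  // 5-D producer: no padding, plain conversion
        assert(adjust_tm_axis( 2, 5) == 2);  // 5-D producer: axis unchanged
    }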
+ auto binary = mlir::tt::ttnn::ttnnToFlatbuffer(mlir_module.get()); + tt::log_info("Flatbuffer binary generated successfully."); + + if (binary == nullptr) + { + throw std::runtime_error("Failed to generate flatbuffer binary."); + } + + return binary; + } +} diff --git a/pybuda/csrc/passes/mlir_compiler.hpp b/pybuda/csrc/passes/mlir_compiler.hpp new file mode 100644 index 000000000..eed44b24a --- /dev/null +++ b/pybuda/csrc/passes/mlir_compiler.hpp @@ -0,0 +1,21 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 +#pragma once +#include + +#include "tt/runtime/types.h" + +namespace tt +{ + namespace graphlib + { + class Graph; + } +} + +namespace tt::passes +{ + /// Public API for running MLIR passes and generating binary. + runtime::Binary run_mlir_compiler(tt::graphlib::Graph *graph); +} diff --git a/pybuda/csrc/passes/mlir_passes.cpp b/pybuda/csrc/passes/mlir_passes.cpp new file mode 100644 index 000000000..851ef3703 --- /dev/null +++ b/pybuda/csrc/passes/mlir_passes.cpp @@ -0,0 +1,68 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 +#include "mlir_passes.hpp" + +// Standard headers +#include + +// MLIR headers +#include "mlir/IR/BuiltinOps.h" + +// TTMLIR headers +#include "ttmlir/Dialect/TTIR/Transforms/Passes.h" +#include "ttmlir/Dialect/TTNN/Transforms/Passes.h" +#include "ttmlir/Dialect/TTNN/Pipelines/Passes.h" +#include "utils/logger.hpp" + +namespace tt::passes +{ + /// Public API for running MLIR passes and generating binary. + void run_mlir_passes(mlir::OwningOpRef &mlir_module) + { + static bool _ = []() { + // Register required passes + mlir::tt::ttir::registerPasses(); + mlir::tt::ttnn::registerPasses(); + + // Register pass pipelines + // This will internally register the pipelines in the MLIR pipeline registry. Then, + // the registry can be used to lookup the pipeline by its name and add it to the pass manager. + mlir::tt::ttnn::registerTTNNPipelines(); + + return true; + }(); + (void)_; + + // Create a pass manager. + mlir::PassManager pm(mlir_module.get()->getName()); + + // Get the pipeline info for the wanted pipeline. + const auto pipelineInfo = mlir::PassPipelineInfo::lookup("ttir-to-ttnn-backend-pipeline"); + + // This error handler is necessary when adding the pipeline to the pass manager (via PassPipelineInfo). + // It's supposed to be called when there's an error during parsing of the pipeline options. + // However, I think it's wrongly implemented in the MLIR library, so it doesn't get called. + mlir::function_ref err_handler = [](const mlir::Twine &location) { + log_error(LogMLIRGenerator, "Error during parsing pipeline options: {}", location.str()); + return mlir::failure(); + }; + + // Pipeline options are empty for now. + std::string options{""}; + + auto result = pipelineInfo->addToPipeline(pm, options, err_handler); + if (mlir::failed(result)) + { + throw std::runtime_error("Failed to add the pipeline to the pass manager!"); + } + + // Run the pass manager. 
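The pipeline above is resolved by name through PassPipelineInfo; for comparison, the bare PassManager mechanics look roughly like the sketch below, using stock upstream passes rather than the ttir-to-ttnn-backend-pipeline and assuming an MLIR development install:

    #include "mlir/IR/BuiltinOps.h"
    #include "mlir/IR/MLIRContext.h"
    #include "mlir/Pass/PassManager.h"
    #include "mlir/Support/LogicalResult.h"
    #include "mlir/Transforms/Passes.h"

    // Runs a trivial two-pass pipeline over `module`, mirroring the error handling
    // around pm.run() above but with generic upstream passes.
    bool run_trivial_pipeline(mlir::ModuleOp module, mlir::MLIRContext &context)
    {
        mlir::PassManager pm(&context);
        pm.addPass(mlir::createCanonicalizerPass());
        pm.addPass(mlir::createCSEPass());
        return mlir::succeeded(pm.run(module));
    }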
+ if (mlir::failed(pm.run(mlir_module.get()))) + { + throw std::runtime_error("Failed to run MLIR compiler pass pipeline."); + } + + mlir_module.get().dump(); + } +} diff --git a/pybuda/csrc/passes/mlir_passes.hpp b/pybuda/csrc/passes/mlir_passes.hpp new file mode 100644 index 000000000..4fdc87e82 --- /dev/null +++ b/pybuda/csrc/passes/mlir_passes.hpp @@ -0,0 +1,13 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 +namespace mlir { + class ModuleOp; + template class OwningOpRef; +} // namespace mlir + +namespace tt::passes +{ + /// Public API for running MLIR passes and generating binary. + void run_mlir_passes(mlir::OwningOpRef &mlir_module); +} \ No newline at end of file diff --git a/pybuda/csrc/passes/move_requantize.cpp b/pybuda/csrc/passes/move_requantize.cpp new file mode 100644 index 000000000..23eecee02 --- /dev/null +++ b/pybuda/csrc/passes/move_requantize.cpp @@ -0,0 +1,208 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 +#include + +#include "graph_lib/node_types.hpp" +#include "graph_lib/utils.hpp" +#include "utils/logger.hpp" +#include "python_bindings_common.hpp" +#include "graph_lib/node.hpp" +#include "graph_lib/graph.hpp" +#include "passes/move_requantize.hpp" +#include "utils/logger.hpp" +#include "passes/passes_utils.hpp" +#include "passes/commute_utils.hpp" + +namespace tt::passes +{ +static void set_bcast_dims(graphlib::Graph *graph, std::vector &volumes, graphlib::Edge edge) { + graph->get_edge_attributes(edge)->clear_broadcast_dims(); + + for (std::size_t i = 0; i < volumes.size(); i++) { + int volume = volumes[i]; + if (volume > 1) { + graph->get_edge_attributes(edge)->set_broadcast_dim(i, volume, false); + } + } +} + +static std::vector find_path_to_requant( + graphlib::Graph *graph, + graphlib::OpNode *initial_op) +{ + std::vector path; + + graphlib::Node *iter = initial_op; + auto clone_shape = initial_op->shape(); + + bool found_requant = false; + while (not found_requant) + { + auto op = dynamic_cast(iter); + if (not op) + break; + + if (op->op_name() == "buda_requantize") + { + found_requant = true; + path.push_back(op); + break; + } + + if (graph->data_users(op).size() > 1) + break; + + if (not (is_elementwise(op) or op == initial_op)) + break; + + // Only commute through elementwise ops + path.push_back(op); + iter = graph->data_users(op)[0]; + } + + if (not found_requant) + path.clear(); + + return path; +} + + +void commute_through_requant(graphlib::Graph *graph, std::vector const &path) { + TT_ASSERT(path.size() >= 2); + graphlib::OpNode *first = path.front()->as(); + graphlib::OpNode *last = path.back()->as(); + log_debug(LogGraphCompiler, "Commute and bypass TM through requant: {} -> {}", first->name(), last->name()); + graphlib::OpType golden_transform = first->op_type(); + + graphlib::Shape commute_shape = shape_of_only_operand(graph, first); + graphlib::Shape clone_shape = first->shape(); + + for (std::size_t i = 1; i < path.size(); ++i) + { + + graphlib::Node *producer = path[i - 1]; + graphlib::Node *consumer = path[i]; + auto consumer_df_before = consumer->output_df(); + + TT_ASSERT(graph->user_data_edges(producer).size() == 1); + + // Set the shape to the desired final shape for this whole path + if (graphlib::OpNode *op = dynamic_cast(consumer)) + { + graphlib::OpNode *producer_as_op = dynamic_cast(producer); + if (producer_as_op) { + // Must change commute shape, clone shape, and golden transform if there are broadcasts on the incoming edge 
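find_path_to_requant walks a single-consumer chain of elementwise ops until it reaches buda_requantize and discards the path otherwise. The same walk over a toy node structure, just to make the termination conditions explicit:

    #include <cassert>
    #include <string>
    #include <vector>

    struct ToyNode
    {
        std::string op_name;
        bool elementwise = true;
        std::vector<ToyNode*> users;  // data users
    };

    // Returns the chain from `start` to the first "buda_requantize" reached, or an
    // empty vector if the chain forks, hits a non-elementwise op, or never gets there.
    std::vector<ToyNode*> find_path_to_requant(ToyNode *start)
    {
        std::vector<ToyNode*> path;
        ToyNode *iter = start;
        while (true)
        {
            if (iter->op_name == "buda_requantize") { path.push_back(iter); return path; }
            if (iter->users.size() != 1) break;                // fork or sink
            if (!(iter->elementwise || iter == start)) break;  // only commute through elementwise
            path.push_back(iter);
            iter = iter->users[0];
        }
        return {};  // no requantize found
    }

    int main()
    {
        ToyNode requant{"buda_requantize"};
        ToyNode add{"add", true, {&requant}};
        ToyNode reshape{"reshape", false, {&add}};  // the initial TM op itself is not elementwise
        assert(find_path_to_requant(&reshape).size() == 3);  // reshape -> add -> requant
    }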
+ auto [commute_bcasts, clone_bcasts] = handle_shape_change_through_bcast(graph, first, producer_as_op, op, &commute_shape, &clone_shape).second; + if (golden_transform.op == "reshape") { + for (std::size_t i = 0; i < golden_transform.attr.size(); i++) { + int current_dim = std::get(golden_transform.attr[i]); + golden_transform.attr[i] = clone_bcasts[i]*current_dim; + } + } + + graphlib::Edge between_edge = retrieve_between_edge(graph, producer, consumer); + set_bcast_dims(graph, commute_bcasts, between_edge); + } + + if (is_elementwise(op)) + { + commute_through_eltwise(op, &commute_shape, &golden_transform); + } + else if (is_quantization_ops(op)) { + commute_through_quantization(op, &commute_shape, &golden_transform); + } + else + { + TT_ASSERT(false, "Found non-elementwise and non-quant op in path to requantize"); + } + log_trace(LogGraphCompiler, " Op node: {} -> shape set to {}", consumer->name(), commute_shape); + } + + // Handle nary operands (not on this `path`) + std::vector consumer_operands = graph->operand_data_edges(consumer); + for (graphlib::Edge operand_edge : consumer_operands) + { + if (operand_edge.producer_node_id == producer->id()) + continue; + + convert_implicit_to_explicit_bcasts(graph, operand_edge); + auto name = last->name() + "_operand_commute_clone" + std::to_string(operand_edge.edge_creation_id); + graphlib::Node *clone = graph->add_node(first->clone(name), graph->get_subgraph_id_for_node(operand_edge.producer_node_id)); + graphlib::OpNode *op = dynamic_cast(clone); + log_trace(LogGraphCompiler, " Operand commute clone: {} -> between {} and {} ", name, consumer->name(), graph->node_by_id(operand_edge.producer_node_id)->name()); + + update_reshape_attr(op, commute_shape); + clone->set_shape(commute_shape); + log_trace(LogGraphCompiler, " Operand commute clone shape: {}", commute_shape); + + + auto [in_edge, out_edge] = insert_node_on_edge(graph, operand_edge, clone); + // Set dataformat to match producer on operand edge + clone->set_output_df(graph->node_by_id(in_edge.producer_node_id)->output_df()); + handle_change_rank(graph, clone); + try_commute_bcast_through_clone(graph, op); + if (graphlib::InputNode *input = dynamic_cast(graph->data_operands(clone)[0])) + try_consteval_input_no_operand_forks(graph, input, true); + } + // Maintain df from before commute + consumer->set_output_df(consumer_df_before); + } + + + // Insert the TM after requant op + auto tm_ = bypass_node(graph, first, true /*remove*/); + auto tags = tm_->as()->get_tags(); + + for (auto requant_out_edge : graph->user_data_edges(last)) + { + auto original_tms = graph->get_edge_attributes(requant_out_edge)->get_tms(); + TT_ASSERT(original_tms.size() == 0); + auto name = tm_->name() + "_cloned" + std::to_string(requant_out_edge.edge_creation_id); + + graphlib::Node *curr_node = graph->add_node( + tm_->clone(name), graph->get_subgraph_id_for_node(requant_out_edge.consumer_node_id)); + curr_node->as()->add_tags(tags); + + insert_node_on_edge(graph, requant_out_edge, curr_node); + curr_node->set_output_df(last->output_df()); + + } +} + + +bool move_tm_through_requantize(graphlib::Graph *graph) { + + bool attempt_update = true; + bool updated_anything = false; + while (attempt_update) + { + // Set to false here because we want to stop looping if no update occurs + attempt_update = false; + for (auto *node : graphlib::topological_sort(*graph)) + { + graphlib::OpNode *op = dynamic_cast(node); + if (not op) + continue; + + if (op->as()->has_tag("dont_erase")) + continue; + + if (op->op_name() != 
"reshape" and op->op_name() != "transpose") + continue; + + std::vector path = find_path_to_requant(graph, op); + if (path.empty()) + continue; + + commute_through_requant(graph, path); + attempt_update = true; + updated_anything = true; + recalculate_shapes(graph); + break; + } + } + return updated_anything; +} + +} // namespace tt::passes \ No newline at end of file diff --git a/pybuda/csrc/passes/move_requantize.hpp b/pybuda/csrc/passes/move_requantize.hpp new file mode 100644 index 000000000..0ce449323 --- /dev/null +++ b/pybuda/csrc/passes/move_requantize.hpp @@ -0,0 +1,15 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +namespace tt::graphlib +{ +class Graph; +} + +namespace tt::passes +{ + +bool move_tm_through_requantize(graphlib::Graph *graph); +} diff --git a/pybuda/csrc/passes/padding_pass_placer.cpp b/pybuda/csrc/passes/padding_pass_placer.cpp deleted file mode 100644 index 344ad8845..000000000 --- a/pybuda/csrc/passes/padding_pass_placer.cpp +++ /dev/null @@ -1,1771 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "passes/padding_pass_placer.hpp" - -#include -#include -#include -#include -#include -#include -#include - -#include "balancer/balancer_utils.hpp" -#include "balancer/legalizer/legalizer.hpp" -#include "graph_lib/edge.hpp" -#include "graph_lib/graph.hpp" -#include "graph_lib/node.hpp" -#include "graph_lib/node_types.hpp" -#include "graph_lib/shape.hpp" -#include "graph_lib/utils.hpp" -#include "lower_to_buda/common.hpp" -#include "passes/eth_stream_reduction.hpp" -#include "utils/assert.hpp" -#include "shared_utils/sparse_matmul_utils.hpp" - - -using BudaOpAttrs = tt::BudaOpAttrs; - -using Graph = tt::graphlib::Graph; -using NodeId = tt::graphlib::NodeId; -using Node = tt::graphlib::Node; -using NodeType = tt::graphlib::NodeType; -using TaggedNode = tt::graphlib::TaggedNode; -using Edge = tt::graphlib::Edge; -using EdgeType = tt::graphlib::EdgeType; -using PortId = tt::graphlib::PortId; -using BudaOpNode = tt::graphlib::BudaOpNode; -using OpType = tt::graphlib::OpType; -using Shape = tt::graphlib::Shape; -using ConstantInputNode = tt::graphlib::ConstantInputNode; - -using Padding = tt::padding_placer::Padding; -using PaddingCriterion = tt::padding_placer::PaddingCriterion; -using PaddingOperation = tt::padding_placer::PaddingOperation; -using PaddingDimension = tt::padding_placer::PaddingDimension; - -using SparseBUDA = tt::sparse::SparseBUDA; -using SparseCOO = tt::sparse::SparseCOO; - -using OpModelFailureReason = tt::balancer::OpModelFailureReason; -using BudaOpNodeLegalizerFailureInfo = tt::balancer::BudaOpNodeLegalizerFailureInfo; -using BalancerConfig = tt::balancer::BalancerConfig; -using LegalOpModels = std::unordered_map>; - -namespace tt::padding_placer -{ - -bool pad_pass_placer( - Graph *graph, - const std::unordered_map &nodes_to_pad, - const balancer::BalancerConfig &balancer_config, - std::shared_ptr balancer_cache_collection) -{ - const int PADDING_TRY_MAX = 10; - bool padded = false; - - // We pass operations we want to pad, in other words if paddings map is not empty, - // for each operations in our graph we check if it should be padded or not. - // So, it should exist in the map and its flag should be TRUE, otherwise we skip the node. 
- - for (const auto &node_fail_pair : nodes_to_pad) - { - - Node* node = node_fail_pair.first; - const BudaOpNodeLegalizerFailureInfo failure_info = node_fail_pair.second; - log_debug(LogPadding, "Padding node {} with {}", node->name(), failure_info.toString().c_str()); - - if (node->as()->has_tag("padding")) - continue; - - if (node->as()->has_tag("padding_nop")) - continue; - - std::uint32_t user_access_cnt = failure_info.getOpModelFailureCountByType(OpModelFailureReason::UserAccessPreventsStreaming); - std::uint32_t buffer_alloc_cnt = failure_info.getOpModelFailureCountByType(OpModelFailureReason::InputBufferAllocationFailure); - - int padding_try_it = 0; - bool buffer_alloc_flag = false; - bool padded_loop = false; - - Padding padding; - // Preserve the original shape - padding.orig_shape = node->shape(); - - while (padding_try_it++ < PADDING_TRY_MAX && buffer_alloc_cnt > 0) - { - - padded_loop = pad_node(graph, node, padding); - - if (padded_loop) - { - - std::unordered_map failures = check_node_legality(graph, node, balancer_config, balancer_cache_collection); - if (failures.size() > 0) - { - remove_padding(graph, node, padding); - if (padded_loop) - padded_loop = false; - - if (padding.added_nop) - break; - - buffer_alloc_cnt = failures[node].getOpModelFailureCountByType(OpModelFailureReason::InputBufferAllocationFailure); - } - else - { - buffer_alloc_flag = true; - padded |= padded_loop; - break; - } - - } - - } - - if (!buffer_alloc_flag) - { - - if (user_access_cnt > 0) - { - insert_queue(graph, node); - padded = true; - } - else - { - // Reset padding structure - Padding padding; - // Preserve the original shape - padding.orig_shape = node->shape(); - padded |= pad_node(graph, node, padding); - } - - } - - } - - return padded; -} - -std::unordered_map check_node_legality( - Graph *graph, - Node *node, - const BalancerConfig &balancer_config, - std::shared_ptr balancer_cache_collection) -{ - // We use this functions to check if the particular node has legal op models. - // This function is intended to be used in the padding pass, - // but it can also be a general purpose function. - - std::unordered_set nodes_to_legalize = {node}; - - try - { - LegalOpModels legal_op_models = tt::balancer::legalizer::get_legal_op_models( - graph, balancer_config, balancer_cache_collection, &nodes_to_legalize); - } - catch (const balancer::BalancerError &e) - { - balancer::BalancerError::NoValidGrid const *type = std::get_if(&e.type); - if (type) - { - return type->nodes_without_legal_op_model; - } - } - - return {}; -} - - -void remove_padding(Graph *graph, Node *node, Padding &padding) -{ - if (node->node_type() != NodeType::kBudaOp) - return; - - remove_pad(graph, node, padding); - remove_unpad(graph, node /*, padding */); - - // Reset the shape of the node. - node->set_shape(padding.orig_shape); -} - -void remove_pad(Graph *graph, Node *node, Padding &padding) -{ - if (node->as()->is_sparse_matmul()) - restore_smm(graph, node, padding); - remove_buda_pad(graph, node); -} - -void restore_smm(Graph *graph, Node *node, Padding &padding) -{ - - std::vector incoming_edges = graph->operand_data_edges(node); - Node *incoming_node = nullptr; - - for (Edge incoming_edge : incoming_edges) - { - if (incoming_edge.consumer_input_port_id == 0) - { - NodeId incoming_node_id = incoming_edge.producer_node_id; - incoming_node = graph->node_by_id(incoming_node_id); - } - } - - if (incoming_node == nullptr) - return; - - // Create constant input node using incoming node. 
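pad_pass_placer above retries padding a bounded number of times, keeping an attempt only once the balancer reports a legal op model. A toy skeleton of that retry loop, with the legality check reduced to a predicate and all names being stand-ins:

    #include <cassert>
    #include <functional>

    struct ToyPadding { int pad_rt = 0; };  // tiles of row padding requested so far

    // Grows the requested padding one step per attempt until `is_legal` accepts it or
    // the attempt budget runs out. In the real pass a failed attempt is undone on the
    // graph, but the accumulated pad amount carries over to the next try.
    bool pad_until_legal(ToyPadding &padding,
                         const std::function<bool(const ToyPadding &)> &is_legal,
                         int max_tries = 10)
    {
        for (int attempt = 0; attempt < max_tries; ++attempt)
        {
            ++padding.pad_rt;       // pad_node(): request one more tile of padding
            if (is_legal(padding))  // check_node_legality(): any legal op model now?
                return true;
            // remove_padding(): graph edit rolled back here in the real pass.
        }
        return false;
    }

    int main()
    {
        ToyPadding p;
        // Pretend a legal op model appears once at least 3 tiles of padding are requested.
        assert(pad_until_legal(p, [](const ToyPadding &x) { return x.pad_rt >= 3; }));
        assert(p.pad_rt == 3);
    }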
- ConstantInputNode *pad_node = incoming_node->as(); - - // Get sparse buda object that keeps information about the sparse tensor we want to pad - SparseBUDA sparse_pad_node = pad_node->get_sparse_buda(); - - // Change shape of the sparse buda tensor - std::vector sparse_shape = sparse_pad_node.sparse_shape; - std::uint32_t sparse_shape_size = sparse_shape.size(); - sparse_shape[sparse_shape_size - 2] -= padding.pad_lhs_rt * Shape::BUDA_TILE_DIM; - sparse_shape[sparse_shape_size - 1] -= padding.pad_lhs_ct * Shape::BUDA_TILE_DIM; - - // Change shape of the sparse_zs tensors - std::vector sparse_zs = sparse_pad_node.sparse_zs; - for (SparseCOO& sparse_z : sparse_zs) { - std::vector sparse_z_shape = sparse_z.shape; - std::uint32_t sparse_z_shape_size = sparse_z_shape.size(); - sparse_z_shape[sparse_z_shape_size - 2] -= padding.pad_lhs_rt * Shape::BUDA_TILE_DIM; - sparse_z_shape[sparse_z_shape_size - 1] -= padding.pad_lhs_ct * Shape::BUDA_TILE_DIM; - sparse_z.shape = sparse_z_shape; - } - - // Set the sparse buda tensor to pad node with the new shapes - sparse_pad_node.sparse_shape = sparse_shape; - sparse_pad_node.sparse_zs = sparse_zs; - pad_node->set_sparse_buda(sparse_pad_node); - - if (padding.pad_lhs_rt > 0) { - graphlib::OpNode* op = node->as(); - auto op_attrs = op->op_attrs(); - op_attrs[5] = padding.sparse_r_attr; - op->overwrite_op_attrs(op_attrs); - } -} - -void remove_unpad(Graph *graph, Node *node /*, Padding &padding*/) -{ - remove_buda_unpad(graph, node); -} - -void remove_buda_pad(Graph *graph, Node *node) -{ - std::vector incoming_edges = graph->operand_data_edges(node); - for (Edge incoming_edge : incoming_edges) - { - std::vector tms = graph->get_edge_attributes(incoming_edge)->get_tms(); - // Buda Pad operation is always the last TM on the edge in this phase. - if (tms.size() > 0 && tms.back().op == "buda_pad") - // Remove certain TM. - tms.pop_back(); - // Set the new TMs without buda pad. - graph->get_edge_attributes(incoming_edge)->set_tms(tms); - } -} - - -void remove_buda_unpad(Graph *graph, Node *node) -{ - std::vector outgoing_edges = graph->user_data_edges(node); - for (Edge outgoing_edge : outgoing_edges) - { - - NodeId outgoing_node_id = outgoing_edge.consumer_node_id; - Node* outgoing_node = graph->node_by_id(outgoing_node_id); - - // Remove unpad node, nop, queue. In the future, we can have a few combinations. - // In buda space unpad is an attribute. - // Combination #1, nop, queue, buda_unpad - // Combination #2, only buda_unpad - // Combination #3, nop and buda_unpad - // Combination #4, queue and buda_unpad - if (outgoing_node->node_type() == NodeType::kBudaOp) - { - // If the combination starts we possibly have combinations #1, #2 or #3. - BudaOpNode *buda_op_node = outgoing_node->as(); - // Get type of the operation - std::string op_type = buda_op_node->as()->op_type().op; - if (op_type == "nop" && buda_op_node->as()->has_tag("padding_nop")) - { - // Potential combinations #1 and #3. - std::vector nop_outgoing_edges = graph->user_data_edges(outgoing_node); - NodeId nop_outgoing_node_id = nop_outgoing_edges[0].consumer_node_id; - Node* nop_outgoing_node = graph->node_by_id(nop_outgoing_node_id); - if (nop_outgoing_edges.size() != 1) - break; - std::vector tms = graph->get_edge_attributes(nop_outgoing_edges[0])->get_tms(); - // Buda Unpad operation is always first TM on the edge in this phase - if (tms.size() > 0 && tms[0].op == "buda_unpad") { - // Potential combination #3. - // Remove buda_unpad. 
- tms.erase(tms.begin()); - graph->get_edge_attributes(nop_outgoing_edges[0])->set_tms(tms); - // Remove nop. - bypass_node(graph, outgoing_node, /* remove node */ true); - } - else if (nop_outgoing_node->node_type() == NodeType::kQueue) - { - // Potential combination #1. - std::vector queue_outgoing_edges = graph->user_data_edges(nop_outgoing_node); - if (queue_outgoing_edges.size() != 1) - break; - std::vector tms = graph->get_edge_attributes(queue_outgoing_edges[0])->get_tms(); - // Buda Unpad operation is always first TM on the edge in this phase - if (tms.size() > 0 && tms[0].op == "buda_unpad") - { - // Remove buda_unpad. - tms.erase(tms.begin()); - graph->get_edge_attributes(queue_outgoing_edges[0])->set_tms(tms); - // Remove queue. - bypass_node(graph, nop_outgoing_node, /* remove node */ true); - // Remove nop. - bypass_node(graph, outgoing_node, /* remove node */ true); - } - } - - } - else - { - // Potential combination #2. - std::vector tms = graph->get_edge_attributes(outgoing_edge)->get_tms(); - // Buda Unpad operation is always first TM on the edge in this phase - if (tms.size() > 0 && tms[0].op == "buda_unpad") - { - // Remove buda_unpad. - tms.erase(tms.begin()); - graph->get_edge_attributes(outgoing_edge)->set_tms(tms); - } - } - } - else if (outgoing_node->node_type() == NodeType::kQueue) - { - // We possibly have combination #4. - std::vector queue_outgoing_edges = graph->user_data_edges(outgoing_node); - if (queue_outgoing_edges.size() != 1) - break; - std::vector tms = graph->get_edge_attributes(queue_outgoing_edges[0])->get_tms(); - // Buda Unpad operation is always first TM on the edge in this phase - if (tms.size() > 0 && tms[0].op == "buda_unpad") - { - // Remove buda_unpad. - tms.erase(tms.begin()); - graph->get_edge_attributes(queue_outgoing_edges[0])->set_tms(tms); - // Remove queue. - bypass_node(graph, outgoing_node, /* remove node */ true); - } - } - - } -} - - -bool pad_node( - Graph *graph, - Node *node, - Padding &padding -) -{ - - // Get environment variables that tell us if we should pad matmul and elemnt-wise operations. - bool element_wise_flag = env_as("PYBUDA_PADDING_PASS_ELEMENT_WISE", 1); - bool matmul_flag = env_as("PYBUDA_PADDING_PASS_MATMUL", 1); - bool sparse_matmul_flag = env_as("PYBUDA_PADDING_PASS_SPARSE_MATMUL", 1); - // TODO: Should be enabled or removed. - // bool splice_flag = env_as("PYBUDA_PADDING_PASS_SPLICE"); - - // Padding criterion for each type of operations - PaddingCriterion criterion = PaddingCriterion::BIGGEST_FACTOR_PRIME_10_INCREMENT; - - // If the node is not operation it's not element-wise and matmul, too. - // If it is an operation, it can be for example "multiply", "add", "exp", etc. - if (node->node_type() != NodeType::kBudaOp) - return false; - - BudaOpNode *buda_op_node = node->as(); - // Get type of the operation - std::string op_type = node->as()->op_type().op; - - if (!is_irregular(graph, node, padding, criterion)) - { - padding.pad_lhs_rt++; - // TODO: For now we increment only R dimension. - // padding.pad_lhs_ct++; - // padding.pad_rhs_ct++; - } - - if (graphlib::is_eltwise(buda_op_node)) - { - - if (element_wise_flag && op_type != "splice") - { - compute_pad_eltwise(node, padding, criterion); - return pad_eltwise(graph, node, padding); - } - - /* TODO: Should be enabled. 
- if (splice_flag && op_type == "splice") - return pad_splice(graph, node); - */ - - } // end if, is element-wise - - if (buda_op_node->is_matmul()) - { - // Pad sparse matmul - if (buda_op_node->is_sparse_matmul() && sparse_matmul_flag) - { - compute_pad_smm(graph, node, padding, criterion); - return pad_smm(graph, node, padding); - } - - // Pad matmul - if (buda_op_node->is_matmul() && matmul_flag) - { - compute_pad_matmul(graph, node, padding, criterion); - return pad_matmul(graph, node, padding); - } - - } // end if, matmul - - /* TODO: Should be enabled. - if (buda_op_node->is_fused_op()) - return pad_fused_op(graph, node); - */ - - return false; -} - -void set_padded_node_out_shape(Node* padded_node, Padding &padding) -{ - // Set shape - std::vector shape = padded_node->shape().as_vector(); - std::uint32_t shape_size = shape.size(); - shape[shape_size - 2] += padding.pad_lhs_rt * Shape::BUDA_TILE_DIM; - shape[shape_size - 1] += padding.pad_rhs_ct * Shape::BUDA_TILE_DIM; - padded_node->set_shape(Shape::create_buda(shape)); - padded_node->as()->add_tags({ { "padding", true } }); -} - -bool pad_eltwise( - Graph *graph, - Node *node, - Padding &padding -) -{ - - bool padded = padding.pad_lhs_rt > 0 || padding.pad_lhs_ct > 0; - - // Both dimensions are regular, so we skip padding - if (!padded) - return padded; - - // Now, when we have figured out that shape is not irregular, we get incoming and outgoing edges. - // The idea is to pad incoming nodes and unpad outgoing nodes, we get these using the edges of the given node. - std::vector incoming_edges = graph->operand_data_edges(node); - std::vector outgoing_edges = graph->user_data_edges(node); - - // Insert pad node for each incoming edge - for (Edge incoming_edge : incoming_edges) - { - NodeId incoming_node_id = incoming_edge.producer_node_id; - Node *incoming_node = graph->node_by_id(incoming_node_id); - - if (check_shape_size(incoming_node->shape()) || check_shape_ones(incoming_node->shape())) - update_broadcast_op_with_pad( - graph, - incoming_edge, - padding.pad_lhs_rt * Shape::BUDA_TILE_DIM, - padding.pad_lhs_ct * Shape::BUDA_TILE_DIM - ); - else - insert_pad_buda( - graph, - incoming_edge, - padding.pad_lhs_rt, - padding.pad_lhs_ct, - // Padding value, used only in case - // when we use buda implmentation for padding - 0.0 - ); - - } // end for, incoming edges - - set_padded_node_out_shape(node, padding); - - // Insert unpad for each outgoing edge - for (Edge outgoing_edge : outgoing_edges) - { - - insert_unpad( - graph, - node, - outgoing_edge, - padding, - /* Insert nop and queue. 
*/ - false - ); - - } // end for, outgoing edges - - return padded; -} - -bool pad_matmul( - Graph *graph, - Node *node, - Padding &padding -) -{ - - std::vector incoming_edges = graph->operand_data_edges(node); - std::vector outgoing_edges = graph->user_data_edges(node); - - // Get the operands of the matmul - Edge lhs_edge; - Edge rhs_edge; - Edge bias_edge; - for (Edge incoming_edge : incoming_edges) - { - if (incoming_edge.consumer_input_port_id == 0) - lhs_edge = incoming_edge; - else if (incoming_edge.consumer_input_port_id == 1) - rhs_edge = incoming_edge; - else if (incoming_edges.size() > 2 && incoming_edge.consumer_input_port_id == 2) - bias_edge = incoming_edge; - } - - // All operands have regular shape, nothing to do - bool padded_lhs = padding.pad_lhs_rt > 0 || padding.pad_lhs_ct > 0; - bool padded_rhs = padding.pad_lhs_ct > 0 || padding.pad_rhs_ct > 0; - - if (!padded_lhs && !padded_rhs) - return false; - - // Insert pad for the left operand - if (padded_lhs) - { - insert_pad_buda( - graph, - lhs_edge, - padding.pad_lhs_rt, - padding.pad_lhs_ct, - // Padding value, used only in case - // when we use buda implmentation for padding - 0.0); - } - - // Insert pad for the right operand - if (padded_rhs) - { - insert_pad_buda( - graph, - rhs_edge, - // R dimension for right operand is the same as C dimension for left operand - padding.pad_lhs_ct, - padding.pad_rhs_ct, - // Padding value, used only in case - // when we use buda implmentation for padding - 0.0); - } - - // If matmul has bias with broadcast, align with proper padding. - if ((incoming_edges.size() > 2)) - { - update_broadcast_op_with_pad(graph, bias_edge, padding.pad_lhs_rt, padding.pad_rhs_ct); - } - - set_padded_node_out_shape(node, padding); - - // Insert unpad for each output node, - for (Edge outgoing_edge : outgoing_edges) - { - insert_unpad( - graph, - node, - outgoing_edge, - padding, - /* Insert nop and queue. */ - false - ); - - } // end for, outgoing edges - - return true; -} - -bool pad_smm( - Graph *graph, - Node *node, - Padding &padding -) -{ - bool padded = false; - - std::vector incoming_edges = graph->operand_data_edges(node); - std::vector outgoing_edges = graph->user_data_edges(node); - - bool unpad_flag = false; - - // Insert pad for each incoming edge. - for (Edge incoming_edge : incoming_edges) - { - NodeId incoming_node_id = incoming_edge.producer_node_id; - Node *incoming_node = graph->node_by_id(incoming_node_id); - - // Pad the LHS operand. - if (incoming_edge.consumer_input_port_id == 0) - { - if (padding.pad_lhs_rt > 0 || padding.pad_lhs_ct > 0) - { - - padded = true; - - // If the operation is sparse matmul we do not add buda_pad - // operation, we change the existing constant node - insert_pad_smm( - incoming_node, - /* pad R dimension */ - padding.pad_lhs_rt * Shape::BUDA_TILE_DIM, - /* pad C dimension */ - padding.pad_lhs_ct * Shape::BUDA_TILE_DIM - ); - - // We unpad only if outer dimensions are padded, in the case of LHS operand that's R dimension - if (padding.pad_lhs_rt > 0) { - unpad_flag = true; - graphlib::OpNode* op = node->as(); - auto op_attrs = op->op_attrs(); - padding.sparse_r_attr = std::get(op_attrs[5]); - op_attrs[5] = ((std::get(op_attrs[5]) - 1) / Shape::BUDA_TILE_DIM + 1) * Shape::BUDA_TILE_DIM + (int) (padding.pad_lhs_rt * Shape::BUDA_TILE_DIM); - op->overwrite_op_attrs(op_attrs); - } - - } - - } // end if, LHS operand - - // Pad the RHS operand. 
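One invariant in pad_matmul above is that the inner dimension is padded identically on both operands, which is why pad_lhs_ct doubles as the RHS row padding. A tiny standalone statement of that invariant:

    #include <cassert>
    #include <utility>

    // Padded tile counts for a matmul (R x K) * (K x C): the shared inner dimension K
    // must receive the same padding on both operands.
    struct MatmulPad { int pad_r, pad_k, pad_c; };

    std::pair<std::pair<int,int>, std::pair<int,int>> operand_pads(const MatmulPad &p)
    {
        std::pair<int,int> lhs{p.pad_r, p.pad_k};  // left operand: pad R and K
        std::pair<int,int> rhs{p.pad_k, p.pad_c};  // right operand: pad K and C
        return {lhs, rhs};
    }

    int main()
    {
        auto [lhs, rhs] = operand_pads({1, 2, 3});
        assert(lhs.second == rhs.first);  // shared inner-dimension padding
    }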
- else if (incoming_edge.consumer_input_port_id == 1) - { - - if (padding.pad_lhs_ct > 0 || padding.pad_rhs_ct > 0) - { - - insert_pad_buda( - graph, - incoming_edge, - // R dimension for right operand is the same as C dimension for left operand - padding.pad_lhs_ct, - padding.pad_rhs_ct, - // Padding value, used only in case - // when we use buda implmentation for padding - 0.0 - ); - - padded = true; - if (!unpad_flag && padding.pad_rhs_ct > 0) - unpad_flag = true; - - } - - } // end if, RHS operand - - } - - set_padded_node_out_shape(node, padding); - - if (unpad_flag) - { - - for (Edge outgoing_edge : outgoing_edges) - { - - insert_unpad( - graph, - node, - outgoing_edge, - padding, - /* Insert nop and queue. */ - false - ); - - } // end for, outgoing edges - - } - - return padded; -} - -/* TODO: Should be implemented in the future. -bool pad_splice( - Graph *graph, - Node *node -) -{ - bool padded = false; - - return padded; -e -} -*/ - -/* TODO: Should be implemented in the future. -bool pad_fused_op( - Graph *graph, - Node *node -) -{ - - bool padded = false; - - return padded; - -} -*/ - - -// TODO -// void remove_redundant_pad(Graph *graph) -// { -// // In padding pass we pad particular operations, -// // in such a way that we put padding before and -// // unpadding after the operation. - -// // Sometimes we will have successive unpad and pad, -// // that's uneccessary and expensive, so we want to -// // reduce those situations. Also, we can have for the same -// // operation a few the same paddings. In that case we want to -// // replace all the same paddings with only one. - -// // This pass is divided into two passes, the first one is -// // DFS removing and the second one is BFS removing, - -// // DFS -// // remove_redudant_pad_dfs(graph); - -// // BFS -// // remove_redudant_pad_bfs(graph); -// } - -// TODO Needs to be reimplemented for TMs because padding pass was moved to lowered graph. -// -// void remove_redudant_pad_dfs(Graph *graph) -// { -// // In this "sub-pass", we remove blocks with the same -// // unpadding/padding operations. 
- -// std::vector removing_nodes; - -// std::vector nodes = tt::graphlib::topological_sort(*graph); - -// for (Node *node : nodes) -// { -// if (node->as()->has_tag("padding")) -// { -// std::string op_type = node->as()->op_type().op; -// // Check if it is unpad operator -// if (op_type == "buda_unpad") -// { -// // Check if it has corresponding pad operator -// Edge pad_edge = graph->user_data_edges(node)[0]; -// NodeId pad_node_id = pad_edge.consumer_node_id; -// Node *pad_node = graph->node_by_id(pad_node_id); - -// // Check if the operator has padding tag -// if (!pad_node->as()->has_tag("padding")) -// continue; -// if (pad_node->node_type() != NodeType::kPyOp) -// continue; -// std::string pad_op_type = pad_node->as()->op_type().op; -// if (pad_op_type != "buda_pad") -// continue; - -// // Check if the previous operation can change the result -// Edge previous_edge = graph->operand_data_edges(node)[0]; -// NodeId previous_node_id = previous_edge.producer_node_id; -// Node *previous_node = graph->node_by_id(previous_node_id); -// if (change_result(previous_node)) -// continue; - -// // TODO: In Progress -// // // Check if the next operation can change the result -// // Edge next_edge = graph->user_data_edges(pad_node)[0]; -// // NodeId next_node_id = next_edge.consumer_node_id; -// // Node *next_node = graph->node_by_id(next_node_id); -// // if (change_result(next_node)) -// // continue; - -// // Get padding and unpadding attributes and compare them -// std::vector pad_attr = pad_node->as()->op_type().attr; -// BudaOpAttr pad_attr_rt = pad_attr[0]; -// BudaOpAttr pad_attr_ct = pad_attr[1]; - -// std::vector unpad_attr = node->as()->op_type().attr; -// BudaOpAttr unpad_attr_rt = unpad_attr[0]; -// BudaOpAttr unpad_attr_ct = unpad_attr[1]; - -// // If the padding and unpadding attributes are the same, remove particular nodes -// // and particular edges. -// if (pad_attr_rt == unpad_attr_rt && pad_attr_ct == unpad_attr_ct) -// { -// // This removal will be done in two steps. -// // Preserve nodes for removing, then remove them. - -// // This is done in this way, because we can't remove nodes immediately. -// // We are iterating over already fetched nodes, and if we remove -// // some node, we will have a potential problem to access the same node. - -// removing_nodes.push_back(node); -// removing_nodes.push_back(pad_node); - -// } // end if, remove block - -// } // end if, buda unpad - -// } // end if, padding node - -// } // end for, graph traversal - -// // Remove nodes -// for (Node *node : removing_nodes) -// { -// bypass_node(graph, node, /* remove node */ true); -// } -// } - -// TODO: In Progress -// void remove_redudant_pad_bfs(Graph *graph) -// { -// // In this "sub-pass", we remove the same paddings and -// // the same unpaddings used a few times for the same operation. - -// // std::vector nodes = tt::graphlib::topological_sort(*graph); -// // for (Node *node : nodes) { - -// // } - -// } - -void insert_pad_smm(Node *incoming_node, std::uint32_t pad_r, std::uint32_t pad_c) -{ - // In this case incoming node is our node we want to pad, - // but we can't, because sparse matmul takes for left hand operand - // only ConstantInputNode type, so we need to pad it in the other way. - // That way is to change input node, because these kinds of nodes - // are created only in python and cpp part of compiler has only pointer - // to them. So, we make python function that pads the node and call it here. - - // Check if we need to pad the node. 
- // For sparse matmul, we will have padding for R or C, or R and C dimensions, - // case where both of dimensions are not padded is almost impossible, but we want to discard that possibility. - if (pad_r <= 0 && pad_c <= 0) - return; - - // Create constant input node using incoming node. - ConstantInputNode *pad_node = incoming_node->as(); - - // Get sparse buda object that keeps information about the sparse tensor we want to pad - SparseBUDA sparse_pad_node = pad_node->get_sparse_buda(); - - // Change shape of the sparse buda tensor - std::vector sparse_shape = sparse_pad_node.sparse_shape; - std::uint32_t sparse_shape_size = sparse_shape.size(); - sparse_shape[sparse_shape_size - 2] += pad_r; - sparse_shape[sparse_shape_size - 1] += pad_c; - - // Change shape of the sparse_zs tensors - std::vector sparse_zs = sparse_pad_node.sparse_zs; - for (SparseCOO& sparse_z : sparse_zs) { - std::vector sparse_z_shape = sparse_z.shape; - std::uint32_t sparse_z_shape_size = sparse_z_shape.size(); - sparse_z_shape[sparse_z_shape_size - 2] += pad_r; - sparse_z_shape[sparse_z_shape_size - 1] += pad_c; - sparse_z.shape = sparse_z_shape; - } - - // Set the sparse buda tensor to pad node with the new shapes - sparse_pad_node.sparse_shape = sparse_shape; - sparse_pad_node.sparse_zs = sparse_zs; - pad_node->set_sparse_buda(sparse_pad_node); -} - -void insert_pad_buda(Graph *graph, Edge incoming_edge, std::uint32_t pad_r, std::uint32_t pad_c, float value) -{ - log_trace(LogPadding, "Padding node with pad_r {} pad_c {} value {}.", pad_r, pad_c, value); - std::vector buda_pad_attrs(3, 0); - buda_pad_attrs[0] = (int)pad_r; - buda_pad_attrs[1] = (int)pad_c; - buda_pad_attrs[2] = value; - tt::BudaOpAttrs buda_attrs = tt::BudaOpAttrs{}; - buda_attrs["rt"] = buda_pad_attrs[0]; - buda_attrs["ct"] = buda_pad_attrs[1]; - buda_attrs["pad_value"] = buda_pad_attrs[2]; - - graphlib::OpType tm_op_type = graphlib::OpType("buda_pad", buda_pad_attrs, buda_attrs); - graph->get_edge_attributes(incoming_edge)->append_tm(tm_op_type); -} - -void insert_unpad_buda( - Graph *graph, - Node *node, - Edge edge, - std::uint32_t pad_r, - std::uint32_t pad_c, - std::uint32_t orig_r, - std::uint32_t orig_c -) -{ - log_trace(LogPadding, "Unpadding node with pad_r {} pad_c {}.", pad_r, pad_c); - std::vector shape_vect = node->shape().as_vector(); - std::uint32_t shape_size = shape_vect.size(); - - std::vector buda_unpad_attrs(4, 0); - buda_unpad_attrs[0] = (int)pad_r; - buda_unpad_attrs[1] = (int)pad_c; - if (orig_r == 0) - buda_unpad_attrs[2] = (int)shape_vect[shape_size - 2]; - else - buda_unpad_attrs[2] = (int)orig_r; - if (orig_c == 0) - buda_unpad_attrs[3] = (int)shape_vect[shape_size - 1]; - else - buda_unpad_attrs[3] = (int)orig_c; - - tt::BudaOpAttrs buda_attrs = tt::BudaOpAttrs{}; - buda_attrs["rt"] = buda_unpad_attrs[0]; - buda_attrs["ct"] = buda_unpad_attrs[1]; - buda_attrs["orig_r"] = buda_unpad_attrs[2]; - buda_attrs["orig_c"] = buda_unpad_attrs[3]; - - graphlib::OpType tm_op_type = graphlib::OpType("buda_unpad", buda_unpad_attrs, buda_attrs); - graph->get_edge_attributes(edge)->prepend_tm(tm_op_type); -} - -void insert_unpad( - Graph *graph, - Node *node, - Edge edge, - Padding &padding, - bool insert_nop_queue -) -{ - - std::vector orig_shape = padding.orig_shape.as_vector(); - std::uint32_t orig_shape_size = orig_shape.size(); - std::uint32_t orig_r = orig_shape[orig_shape_size - 2]; - std::uint32_t orig_c = orig_shape[orig_shape_size - 1]; - - NodeId outgoing_node_id = edge.consumer_node_id; - Node* outgoing_node = 
graph->node_by_id(outgoing_node_id); - - if (insert_nop_queue) - { - BudaOpNode *nop_node = create_nop(graph, node, "unpadding"); - nop_node->as()->add_tags({ { "padding_nop", true } }); - insert_node_on_edge(graph, edge, nop_node, true, true, 0, true); - - Edge nop_edge = retrieve_between_edge(graph, nop_node, outgoing_node); - - insert_unpad_buda( - graph, - nop_node, - nop_edge, - // With buda implmentation we pad R in tiles - padding.pad_lhs_rt, - // With buda implmentation we pad C in tiles - padding.pad_rhs_ct, - // Original shape R dimension - orig_r, - // Original shape C dimension - orig_c - ); - - insert_serialized_dram_queue_between_ops( - // graph - graph, - // producer name - nop_node->name(), - // consumer name - outgoing_node->name(), - // operand index is always zero, - // because vstack has only one operand - (PortId) edge.consumer_input_port_id - ); - } - else if (padding.added_nop) - { - insert_unpad_buda( - graph, - node, - edge, - // With buda implmentation we pad R in tiles - padding.pad_lhs_rt, - // With buda implmentation we pad C in tiles - padding.pad_rhs_ct, - // Original shape R dimension - orig_r, - // Original shape C dimension - orig_c - ); - - insert_serialized_dram_queue_between_ops( - // graph - graph, - // producer name - node->name(), - // consumer name - outgoing_node->name(), - // operand index is always zero, - // because vstack has only one operand - (PortId) edge.consumer_input_port_id - ); - } - else - { - BudaOpNode *nop_node = create_nop(graph, node, "unpadding"); - nop_node->as()->add_tags({ { "padding_nop", true } }); - insert_node_on_edge(graph, edge, nop_node, true, true, 0, true); - - Edge nop_edge = retrieve_between_edge(graph, nop_node, outgoing_node); - - insert_unpad_buda( - graph, - nop_node, - nop_edge, - // With buda implmentation we pad R in tiles - padding.pad_lhs_rt, - // With buda implmentation we pad C in tiles - padding.pad_rhs_ct, - // Original shape R dimension - orig_r, - // Original shape C dimension - orig_c - ); - - // Set the NOP indicator. - padding.added_nop = true; - } - -} - -void insert_queue(Graph *graph, Node *node) -{ - std::vector outgoing_edges = graph->user_data_edges(node); - - // Insert unpad for each outgoing node. - for (Edge outgoing_edge : outgoing_edges) - { - NodeId outgoing_node_id = outgoing_edge.consumer_node_id; - Node* outgoing_node = graph->node_by_id(outgoing_node_id); - - if (outgoing_node->node_type() == NodeType::kQueue) - continue; - - insert_serialized_dram_queue_between_ops( - // graph - graph, - // producer name - node->name(), - // consumer name - outgoing_node->name(), - // operand index is always zero, - // because vstack has only one operand - (PortId) outgoing_edge.consumer_input_port_id - ); - - } // end for, outgoing edges -} - -BudaOpNode* create_op( - Graph *graph, - Node *node, - Shape shape, - std::vector attrs, - std::string name, - std::string op_name -) -{ - - // This function creates new operation node based on given - // graph, operation shape, attributes, operation name, - // name of a new node epoch type and output data format. 
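insert_pad_buda and insert_unpad_buda earlier attach their information as TM attributes: {rt, ct, pad_value} for the pad and {rt, ct, orig_r, orig_c} for the unpad. Spelled out with concrete, purely illustrative numbers and the 32-wide tile dimension:

    #include <cassert>
    #include <vector>

    constexpr int kTileDim = 32;  // Shape::BUDA_TILE_DIM

    int main()
    {
        // buda_pad TM attributes: {rt, ct, pad_value} -- e.g. pad 2 row tiles and
        // 1 column tile with zeros.
        std::vector<int> buda_pad_attrs = {2, 1, 0};

        // buda_unpad TM attributes: {rt, ct, orig_r, orig_c} -- strip the same padding
        // and remember the original (unpadded) row/column sizes.
        int orig_r = 500, orig_c = 320;
        std::vector<int> buda_unpad_attrs = {2, 1, orig_r, orig_c};

        // Tile-aligned extent after padding: round up to whole tiles, then add the pad
        // tiles (the same formula used for the sparse-matmul attribute update earlier).
        int padded_r = ((orig_r - 1) / kTileDim + 1 + buda_pad_attrs[0]) * kTileDim;
        int padded_c = ((orig_c - 1) / kTileDim + 1 + buda_pad_attrs[1]) * kTileDim;
        assert(padded_r == 576 && padded_c == 352);  // 18 and 11 tiles
        (void)buda_unpad_attrs;
    }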
- - OpType op_type = OpType(op_name, attrs); - BudaOpNode *op_node = graph->add_node( - tt::graphlib::create_node(name, op_type), - graph->get_subgraph_id_for_node(node->id()) - ); - op_node->set_epoch_type(node->get_epoch_type()); - op_node->set_output_df(node->output_df()); - op_node->set_shape(shape); - op_node->as()->tag("original_op_name", name); - op_node->as()->tag("original_op_type", op_name); - op_node->as()->add_tags({ { "padding", true } }); - - return op_node; - -} - -BudaOpNode* create_nop( - Graph *graph, - Node *node, - std::string padding_type -) -{ - // Nop Attributes - std::vector nop_attrs = {}; - // Nop Names - std::string op_name = "nop"; - std::string name = node->name() + "." + padding_type + "." + op_name + "_" + std::to_string(node->get_padding_id()); - node->increment_padding_id(); - // Nop Shape, the shape is the same as shape of the previous node - BudaOpNode *nop = create_op(graph, node, node->shape(), nop_attrs, name, op_name); - - return nop; -} - -bool is_irregular(Graph *graph, Node *node, Padding &padding, PaddingCriterion criterion) -{ - if (node->node_type() != NodeType::kBudaOp) - return false; - - BudaOpNode *buda_op_node = node->as(); - - if (graphlib::is_eltwise(buda_op_node)) - return is_irregular_element_wise(node, padding, criterion); - - if (buda_op_node->is_matmul()) - { - // Check only sparse matmul. - if (buda_op_node->is_sparse_matmul()) - return is_irregular_smm(graph, node, padding, criterion); - - // Check only matmul. - if (buda_op_node->is_matmul()) - return is_irregular_matmul(graph, node, padding, criterion); - } - - return false; -} - -bool is_irregular_element_wise(Node *node, Padding &padding, PaddingCriterion criterion) -{ - - auto [row_size, column_size] = extract_dimensions_eltwise(node); - - // // Now, when we know that the operation is element-wise, we get its shape. - // // Based on shape we get R dimension, its index and its value. - // std::vector shape = node->shape().as_vector(); - // std::uint32_t shape_size = shape.size(); - // std::uint32_t row_dim = shape_size - 2; - // std::uint32_t row_size = shape[row_dim]; - // std::uint32_t column_dim = shape_size - 1; - // std::uint32_t column_size = shape[column_dim]; - - bool irregular = false; - - if (is_irregular(row_size + padding.pad_lhs_rt * Shape::BUDA_TILE_DIM, criterion)) - irregular |= true; - - if (is_irregular(column_size + padding.pad_lhs_ct * Shape::BUDA_TILE_DIM, criterion)) - irregular |= true; - - return irregular; -} - -bool is_irregular_matmul(Graph *graph, Node *node, Padding &padding, PaddingCriterion criterion) -{ - - auto [lhs_row_size, lhs_col_size, rhs_col_size] = extract_dimensions_matmul(graph, node); - - bool irregular = false; - - // Check if the operation has regular/irregular shape - // Left operand, R dimension - if (is_irregular(lhs_row_size + padding.pad_lhs_rt * Shape::BUDA_TILE_DIM, criterion)) - irregular |= true; - - // Left operand, C dimension - if (is_irregular(lhs_col_size + padding.pad_lhs_ct * Shape::BUDA_TILE_DIM, criterion)) - irregular |= true; - - // Right operand, C dimension - if (is_irregular(rhs_col_size + padding.pad_rhs_ct * Shape::BUDA_TILE_DIM, criterion)) - irregular |= true; - - return irregular; - -} - -bool is_irregular_smm(Graph *graph, Node *node, Padding &padding, PaddingCriterion criterion) -{ - - auto [lhs_row_size, lhs_col_size, rhs_col_size] = extract_dimensions_smm(graph, node); - - bool irregular = false; - - // Check if the operation has regular/irregular shape - // Left operand, R dimension. 
- if (is_irregular(lhs_row_size + padding.pad_lhs_rt * Shape::BUDA_TILE_DIM, criterion)) - irregular |= true; - - // Left operand, C dimension, and right operand R dimension. - if (is_irregular(lhs_col_size + padding.pad_lhs_ct * Shape::BUDA_TILE_DIM, criterion)) - irregular |= true; - - // Right operand, C dimension. - if (is_irregular(rhs_col_size + padding.pad_rhs_ct * Shape::BUDA_TILE_DIM, criterion)) - irregular |= true; - - return irregular; - -} - - -bool is_irregular(std::uint32_t dimension, PaddingCriterion criterion) -{ - if (criterion == PaddingCriterion::PRIME_NUMBER) - return is_prime(dimension); - - if (criterion == PaddingCriterion::POWER_OF_TWO) - return !is_power_of_2(dimension); - - if (criterion == PaddingCriterion::MULTIPLE_12_OF_TILE) - return !is_multiple_12_of_tile(dimension); - - if (criterion == PaddingCriterion::MULTIPLE_10_OF_TILE) - return !is_multiple_10_of_tile(dimension); - - if (criterion == PaddingCriterion::PRIME_TILE) - return !is_tile_prime(dimension); - - if (criterion == PaddingCriterion::BIGGEST_FACTOR_PRIME_10 || - criterion == PaddingCriterion::BIGGEST_FACTOR_PRIME_10_INCREMENT) - return is_biggest_factor_prime(10, dimension); - - return false; -} - -bool is_sparse_irregular(std::uint32_t r_dim, std::uint32_t c_dim, PaddingCriterion criterion) -{ - // This method checks if given sparse matrix multiplication is irregular. - - if (criterion == PaddingCriterion::SPARSE_MATMUL_BASIC) - { - // If C dimension is greater than R dimension, then we skip the padding. - if (c_dim > r_dim) - return false; - - std::uint32_t r_tiles = get_tiles_num(r_dim); - std::uint32_t c_tiles = get_tiles_num(c_dim); - - return is_sparse_irregular_tiles(r_tiles, c_tiles); - } - - return false; -} - -bool is_sparse_irregular_tiles(std::uint32_t r_tiles, std::uint32_t c_tiles) -{ - // This method checks if given sparse matrix multiplication is irregular. - // First we get GCD of R and C tiles, then we multiply it with input tiles. - // Input tiles is constant empirically determined. - // If this multiplication biggee than r_tiles and c_tiles the sparse matmul is regular. - - std::uint32_t input_tiles = 10; - std::uint32_t gcd_num = tt::balancer::gcd(r_tiles, c_tiles); - std::uint32_t factor = input_tiles * gcd_num; - - if (factor > r_tiles && factor > c_tiles) - return false; - - return true; -} - -// Get number of tiles for given dimension. 
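The sparse-matmul regularity test above is purely arithmetic: both dimensions become tile counts and are compared against ten times their GCD. The same check, runnable standalone with illustrative sizes:

    #include <cassert>
    #include <cstdint>
    #include <numeric>

    constexpr std::uint32_t kTileDim = 32;  // Shape::BUDA_TILE_DIM

    // Mirrors get_tiles_num: ceiling division by the tile dimension.
    std::uint32_t tiles(std::uint32_t dim) { return (dim - 1) / kTileDim + 1; }

    // Mirrors is_sparse_irregular_tiles: regular iff 10 * gcd(r, c) exceeds both tile counts.
    bool sparse_irregular(std::uint32_t r_tiles, std::uint32_t c_tiles)
    {
        std::uint32_t factor = 10 * std::gcd(r_tiles, c_tiles);
        return !(factor > r_tiles && factor > c_tiles);
    }

    int main()
    {
        assert(tiles(3072) == 96 && tiles(1024) == 32);
        assert(!sparse_irregular(96, 32));  // gcd 32 -> factor 320 dwarfs both: regular
        assert(tiles(2016) == 63);
        assert(sparse_irregular(96, 63));   // gcd 3 -> factor 30 < 96: irregular
    }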
-std::uint32_t get_tiles_num(std::uint32_t dimension) { return (dimension - 1) / Shape::BUDA_TILE_DIM + 1; } - -bool is_prime(std::uint32_t n) -{ - // Primality test - - if (n == 2 || n == 3) - return true; - - if (n <= 1 || n % 2 == 0 || n % 3 == 0) - return false; - - for (std::uint32_t i = 5; i * i <= n; i += 6) - if (n % i == 0 || n % (i + 2) == 0) - return false; - - return true; -} - -bool is_tile_prime(std::uint32_t n) -{ - std::uint32_t tile_prime = (n - 1) / Shape::BUDA_TILE_DIM + 1; - if (is_prime(tile_prime)) - return true; - return false; -} - -bool is_power_of_2(std::uint32_t n) -{ - TT_ASSERT(n > 0, "Number must be strictly greater than 0."); - float result_log = log2(n); - if (ceil(result_log) == floor(result_log)) - return true; - return false; -} - -bool is_multiple_12_of_tile(std::uint32_t n) -{ - if (((n - 1) / Shape::BUDA_TILE_DIM + 1) % 12 == 0) - return true; - return false; -} - -bool is_multiple_10_of_tile(std::uint32_t n) -{ - if (((n - 1) / Shape::BUDA_TILE_DIM + 1) % 10 == 0) - return true; - return false; -} - -bool is_biggest_factor_prime(std::uint32_t threshold, std::uint32_t dimension) -{ - std::uint32_t tile_size = (dimension - 1) / Shape::BUDA_TILE_DIM + 1; - std::vector factors = prime_factorize(tile_size); - - if (factors.size() > 0 && factors[factors.size() - 1] > threshold) - return true; - return false; -} - -std::vector prime_factorize(std::uint32_t dimension) -{ - std::vector factors; - - while (dimension % 2 == 0) - { - dimension /= 2; - factors.push_back(2); - } - - for (int divisor = 3; divisor < (int)std::sqrt(dimension) + 1; divisor += 2) - { - while (dimension % divisor == 0) - { - dimension /= divisor; - factors.push_back(divisor); - } - } - - if (dimension > 2) - factors.push_back(dimension); - - return factors; -} - - -void compute_pad_eltwise(Node *node, Padding &padding, PaddingCriterion criterion) -{ - - auto [row_size, column_size] = extract_dimensions_eltwise(node); - - std::uint32_t pad_lhs_r = compute_pad(row_size + padding.pad_lhs_rt * Shape::BUDA_TILE_DIM, criterion); - std::uint32_t pad_lhs_c = compute_pad(column_size + padding.pad_lhs_ct * Shape::BUDA_TILE_DIM, criterion); - - padding.pad_lhs_rt += pad_lhs_r / Shape::BUDA_TILE_DIM; - padding.pad_lhs_ct += pad_lhs_c / Shape::BUDA_TILE_DIM; - padding.pad_rhs_ct += pad_lhs_c / Shape::BUDA_TILE_DIM; - -} - -void compute_pad_matmul(Graph *graph, Node *node, Padding &padding, PaddingCriterion criterion) -{ - - auto [lhs_row_size, lhs_col_size, rhs_col_size] = extract_dimensions_matmul(graph, node); - - std::uint32_t pad_lhs_r = compute_pad(lhs_row_size + padding.pad_lhs_rt * Shape::BUDA_TILE_DIM, criterion); - std::uint32_t pad_lhs_c = compute_pad(lhs_col_size + padding.pad_lhs_ct * Shape::BUDA_TILE_DIM, criterion); - std::uint32_t pad_rhs_c = compute_pad(rhs_col_size + padding.pad_rhs_ct * Shape::BUDA_TILE_DIM, criterion); - - padding.pad_lhs_rt += pad_lhs_r / Shape::BUDA_TILE_DIM; - padding.pad_lhs_ct += pad_lhs_c / Shape::BUDA_TILE_DIM; - padding.pad_rhs_ct += pad_rhs_c / Shape::BUDA_TILE_DIM; - -} - -void compute_pad_smm(Graph *graph, Node *node, Padding &padding, PaddingCriterion criterion) -{ - - auto [lhs_row_size, lhs_col_size, rhs_col_size] = extract_dimensions_smm(graph, node); - - std::uint32_t pad_lhs_r = compute_pad(lhs_row_size + padding.pad_lhs_rt * Shape::BUDA_TILE_DIM, criterion); - std::uint32_t pad_lhs_c = compute_pad(lhs_col_size + padding.pad_lhs_ct * Shape::BUDA_TILE_DIM, criterion); - std::uint32_t pad_rhs_c = compute_pad(rhs_col_size + padding.pad_rhs_ct * 
Shape::BUDA_TILE_DIM, criterion); - - padding.pad_lhs_rt += pad_lhs_r / Shape::BUDA_TILE_DIM; - padding.pad_lhs_ct += pad_lhs_c / Shape::BUDA_TILE_DIM; - padding.pad_rhs_ct += pad_rhs_c / Shape::BUDA_TILE_DIM; - -} - -std::uint32_t compute_pad(std::uint32_t dimension, PaddingCriterion criterion) -{ - - if (criterion == PaddingCriterion::POWER_OF_TWO) - return round_power_of_2(dimension) - dimension; - - if (criterion == PaddingCriterion::PRIME_TILE) - return round_tile_prime(dimension) - dimension; - - if (criterion == PaddingCriterion::MULTIPLE_12_OF_TILE) - return round_multiple_12_of_tile(dimension) - dimension; - - if (criterion == PaddingCriterion::MULTIPLE_10_OF_TILE) - return round_multiple_10_of_tile(dimension) - dimension; - - if (criterion == PaddingCriterion::BIGGEST_FACTOR_PRIME_10) - return round_biggest_factor_prime(10, dimension) - dimension; - - if (criterion == PaddingCriterion::BIGGEST_FACTOR_PRIME_10_INCREMENT) - return increment_until_valid(dimension, criterion) - dimension; - - // TODO: It should be assertion ??? - return 0; -} - -std::uint32_t compute_sparse_pad(std::uint32_t r_dim, std::uint32_t c_dim, PaddingCriterion criterion) -{ - if (criterion == PaddingCriterion::SPARSE_MATMUL_BASIC) - return compute_sparse_pad_basic(r_dim, c_dim); - if (criterion == PaddingCriterion::SPARSE_MATMUL_FACTORS) - return compute_sparse_pad_factors(r_dim, c_dim); - - // TODO: It should be assertion ??? - return 0; -} - -std::uint32_t compute_sparse_pad_basic(std::uint32_t r_dim, std::uint32_t c_dim) -{ - std::uint32_t r_tiles = get_tiles_num(r_dim); - std::uint32_t c_tiles = get_tiles_num(c_dim); - - while (is_sparse_irregular_tiles(r_tiles, c_tiles)) c_tiles++; - - return c_tiles * Shape::BUDA_TILE_DIM - c_dim; -} - -std::uint32_t compute_sparse_pad_factors(std::uint32_t r_dim, std::uint32_t c_dim) -{ - std::uint32_t r_tiles = get_tiles_num(r_dim); - std::uint32_t c_tiles = get_tiles_num(c_dim); - std::vector factors_r = prime_factorize(r_tiles); - std::uint32_t new_c_tiles = 1; - for (int factor_it = factors_r.size() - 1; factor_it >= 0; factor_it--) - { - if (new_c_tiles >= c_tiles) - break; - new_c_tiles *= factors_r[factor_it]; - } - return new_c_tiles * Shape::BUDA_TILE_DIM - c_dim; -} - -std::uint32_t round_power_of_2(std::uint32_t n) -{ - n--; - n |= n >> 1; - n |= n >> 2; - n |= n >> 4; - n |= n >> 8; - n |= n >> 16; - n++; - - return n; -} - -std::uint32_t round_tile_prime(std::uint32_t n) { return ((n - 1) / Shape::BUDA_TILE_DIM + 2) * Shape::BUDA_TILE_DIM; } - -std::uint32_t round_multiple_12_of_tile(std::uint32_t n) -{ - return ((n - 1) / Shape::BUDA_TILE_DIM / 12 + 1) * Shape::BUDA_TILE_DIM * 12; -} - -std::uint32_t round_multiple_10_of_tile(std::uint32_t n) -{ - return ((n - 1) / Shape::BUDA_TILE_DIM / 10 + 1) * Shape::BUDA_TILE_DIM * 10; -} - -std::uint32_t round_biggest_factor_prime(std::uint32_t threshold, std::uint32_t dimension) -{ - std::uint32_t result = 1; - std::uint32_t tile_size = (dimension - 1) / Shape::BUDA_TILE_DIM + 1; - std::vector factors = round_biggest_factor_prime_inner(threshold, tile_size); - for (std::uint32_t item : factors) result *= item; - return result * Shape::BUDA_TILE_DIM; -} - -std::vector round_biggest_factor_prime_inner(std::uint32_t threshold, std::uint32_t dimension) -{ - std::uint32_t result = 1; - std::vector factors = prime_factorize(dimension); - if (factors[factors.size() - 1] > threshold) - { - for (int it = factors.size() - 1; it >= 0; it--) - { - if (factors[it] > threshold) - factors[it] += 1; - result *= factors[it]; - } - 
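The padding amounts produced by compute_pad() reduce to simple rounding arithmetic. Below is a minimal standalone sketch of two of the criteria, rounding up to a power of two and to a multiple of N tiles, assuming Shape::BUDA_TILE_DIM is 32; helper names are local to the sketch.

// Sketch of the rounding math behind round_power_of_2() and
// round_multiple_10_of_tile(), with a small worked example.
#include <cstdint>
#include <iostream>

constexpr std::uint32_t kTileDim = 32;  // assumed tile dimension

std::uint32_t round_up_power_of_2(std::uint32_t n)
{
    n--;
    n |= n >> 1; n |= n >> 2; n |= n >> 4; n |= n >> 8; n |= n >> 16;
    return n + 1;
}

// Round up to a multiple of `multiple` tiles (10 or 12 in the pass).
std::uint32_t round_up_tile_multiple(std::uint32_t n, std::uint32_t multiple)
{
    return ((n - 1) / kTileDim / multiple + 1) * kTileDim * multiple;
}

int main()
{
    std::uint32_t dim = 1000;  // needs 32 tiles, i.e. 1024 padded elements
    std::cout << "power-of-two pad:     " << round_up_power_of_2(dim) - dim << " elements\n";        // 24
    std::cout << "10-tile-multiple pad: " << round_up_tile_multiple(dim, 10) - dim << " elements\n"; // 280
    return 0;
}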
return round_biggest_factor_prime_inner(threshold, result); - } - else - return factors; -} - -bool check_shape_dims(Shape shape) -{ - // This function is because of imperfection of padding algorithm. - // Padding doesn't work for operations with shape that have dimensions - // Z, W, etc. bigger than zero. - - // e.g. these are allowed shapes: [32, 192], [1, 1, 48, 256] - // e.g. these are not allowed shapes: [1, 6, 32, 192], [32, 1, 48, 256], [8, 192, 192] - - std::vector shape_vect = shape.as_vector(); - std::uint32_t shape_size = shape_vect.size(); - if (shape_size > 2) - { - for (std::uint32_t it = 0; it < shape_size - 2; it++) - { - if (shape_vect[it] > 1) - return false; - } - } - return true; -} - -std::uint32_t increment_until_valid(std::uint32_t dimension, PaddingCriterion criterion) -{ - std::uint32_t tile_num = get_tiles_num(dimension); - - while (is_irregular(tile_num * Shape::BUDA_TILE_DIM, criterion)) - { - tile_num++; - } - - return tile_num * Shape::BUDA_TILE_DIM; -} - -bool check_shape_size(tt::graphlib::Shape shape) -{ - // Check if shape size smaller than 2 - if (shape.as_vector().size() < 2) - return true; - return false; -} - -bool check_shape_ones(tt::graphlib::Shape shape) -{ - // Check if all dimensions in tesnor are 1, - // if that's true we don't need to pad, - // the tensor will be broadcasted. - bool is_one = true; - - std::vector shape_vect = shape.as_vector(); - std::uint32_t length = shape_vect.size(); - for (std::uint32_t index = 0; index < length - 1; index++) - { - if (shape_vect[index] != 1) - { - is_one = false; - break; - } - } - - return is_one; -} - -void update_broadcast_op_with_pad(Graph *graph, Edge edge, std::uint32_t pad_r, std::uint32_t pad_c) -{ - // If padded node has broadcast value on input, that broadcast needs to be updated too. - - std::vector tms = graph->get_edge_attributes(edge)->get_tms(); - for (OpType tm : tms) - { - std::string tm_op = tm.op; - if (tm_op == "broadcast") - { - std::vector attrs = tm.attr; - // Broadcast parameters are computed and added before pre-lowering pass where we actually add padding pass, - // so, if we only change the shape broadcast on the will not be affected, because of that we change it - // manually. - int broadcast_dim = std::get(attrs[0]); - if (broadcast_dim == -2) - { - if (pad_r > 0) - { - int broadcast_size = std::get(attrs[1]); - graph->get_edge_attributes(edge)->remove_broadcast_dim(-2); - graph->get_edge_attributes(edge)->set_broadcast_dim(-2, broadcast_size + (int)pad_r); - } - } - if (broadcast_dim == -1) - { - if (pad_c > 0) - { - int broadcast_size = std::get(attrs[1]); - graph->get_edge_attributes(edge)->remove_broadcast_dim(-1); - graph->get_edge_attributes(edge)->set_broadcast_dim(-1, broadcast_size + (int)pad_c); - } - } - } - } -} - -bool change_result(Node *operation) -{ - // Check if we have operations that can change zero to something else - std::string op_type = operation->as()->op_type().op; - - if (op_type == "exp") - return true; - if (op_type == "log") - return true; - if (op_type == "cosine") - return true; - if (op_type == "sigmoid") - return true; - if (op_type == "gelu_derivative") - return true; - if (op_type == "reciprocal") - return true; - return false; -} - -std::tuple extract_dimensions_smm(Graph *graph, Node *node) -{ - - // This function extracts all necessary dimensions for sparse matmul padding. 
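The restriction documented in check_shape_dims(), that only the R and C dimensions may exceed 1, can be stated as a small predicate over a plain shape vector. A sketch using std::vector in place of tt::graphlib::Shape:

// Returns true when every dimension in front of R and C equals 1, matching
// the allowed/disallowed examples listed in check_shape_dims().
#include <cstddef>
#include <cstdint>
#include <iostream>
#include <vector>

bool only_rc_exceed_one(const std::vector<std::uint32_t>& shape)
{
    if (shape.size() <= 2)
        return true;
    for (std::size_t i = 0; i + 2 < shape.size(); ++i)
        if (shape[i] > 1)
            return false;
    return true;
}

int main()
{
    std::cout << std::boolalpha
              << only_rc_exceed_one({1, 1, 48, 256}) << "\n"   // true, padding may apply
              << only_rc_exceed_one({1, 6, 32, 192}) << "\n";  // false, Z dimension is 6
    return 0;
}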
- - Node *lhs_node = nullptr; - Node *rhs_node = nullptr; - std::vector incoming_edges = graph->operand_data_edges(node); - for (Edge incoming_edge : incoming_edges) - { - NodeId incoming_node_id = incoming_edge.producer_node_id; - Node *incoming_node = graph->node_by_id(incoming_node_id); - // Get the first operand. Unique sparse tiles. - if (incoming_edge.consumer_input_port_id == 0) - lhs_node = incoming_node; - - // Get the second operand. Activations. - else if (incoming_edge.consumer_input_port_id == 1) - rhs_node = incoming_node; - } - - TT_ASSERT(lhs_node != nullptr && rhs_node != nullptr, "Pointers of sparse matmul operands must be non-null value."); - - ConstantInputNode *lhs_node_const = lhs_node->as(); - SparseBUDA lhs_node_sparse = lhs_node_const->get_sparse_buda(); - std::vector lhs_shape = lhs_node_sparse.sparse_shape; - std::uint32_t lhs_shape_size = lhs_shape.size(); - std::uint32_t lhs_row_dim = lhs_shape_size - 2; - std::uint32_t lhs_col_dim = lhs_shape_size - 1; - std::uint32_t lhs_row_size = lhs_shape[lhs_row_dim]; - std::uint32_t lhs_col_size = lhs_shape[lhs_col_dim]; - - std::vector rhs_shape = rhs_node->shape().as_vector(); - std::uint32_t rhs_shape_size = rhs_shape.size(); - std::uint32_t rhs_col_dim = rhs_shape_size - 1; - std::uint32_t rhs_col_size = rhs_shape[rhs_col_dim]; - - return { - lhs_row_size, - lhs_col_size, - rhs_col_size - }; - -} - -std::tuple extract_dimensions_matmul(Graph *graph, Node *node) -{ - - std::vector incoming_edges = graph->operand_data_edges(node); - - Node *lhs_operand = nullptr; - Node *rhs_operand = nullptr; - for (Edge incoming_edge : incoming_edges) - { - // Left operand has input port id 0 - if (incoming_edge.consumer_input_port_id == 0) - lhs_operand = graph->node_by_id(incoming_edge.producer_node_id); - // Right operand has input port id 1 - else if (incoming_edge.consumer_input_port_id == 1) - rhs_operand = graph->node_by_id(incoming_edge.producer_node_id); - } - - // Get shapes and R/C dimensions of the operands - // Left operand - std::vector lhs_shape = lhs_operand->shape().as_vector(); - std::uint32_t lhs_shape_size = lhs_shape.size(); - std::uint32_t lhs_row_dim = lhs_shape_size - 2; - std::uint32_t lhs_row_size = lhs_shape[lhs_row_dim]; - std::uint32_t lhs_col_dim = lhs_shape_size - 1; - std::uint32_t lhs_col_size = lhs_shape[lhs_col_dim]; - - // Right operand - // In case for right hand we take only C dimension, - // because inner dimensions are equal for both operands in matmul - std::vector rhs_shape = rhs_operand->shape().as_vector(); - std::uint32_t rhs_shape_size = rhs_shape.size(); - std::uint32_t rhs_col_dim = rhs_shape_size - 1; - std::uint32_t rhs_col_size = rhs_shape[rhs_col_dim]; - - return { - lhs_row_size, - lhs_col_size, - rhs_col_size - }; - -} - -std::tuple extract_dimensions_eltwise(Node *node) -{ - - // Now, when we know that the operation is element-wise, we get its shape. - // Based on shape we get R and C dimensions, their index and value. 
- std::vector shape = node->shape().as_vector(); - std::uint32_t shape_size = shape.size(); - std::uint32_t row_dim = shape_size - 2; - std::uint32_t row_size = shape[row_dim]; - std::uint32_t column_dim = shape_size - 1; - std::uint32_t column_size = shape[column_dim]; - - return { row_size, column_size }; - -} - -// void update_padding_matmul() -// { - -// } - -// void update_padding_eltwise() -// { - -// } - - -} // namespace tt::padding_placer diff --git a/pybuda/csrc/passes/padding_pass_placer.hpp b/pybuda/csrc/passes/padding_pass_placer.hpp deleted file mode 100644 index f5c8751ce..000000000 --- a/pybuda/csrc/passes/padding_pass_placer.hpp +++ /dev/null @@ -1,252 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include - -#include "balancer/balancer_cache_collection.hpp" -#include "balancer/balancer_utils.hpp" -#include "balancer/legalizer/legalizer.hpp" -#include "graph_lib/node_types.hpp" - -namespace tt::graphlib -{ -class Graph; -class Node; -class Shape; -} // namespace tt::graphlib - -namespace tt::balancer -{ -struct BalancerConfig; -} - -namespace tt::padding_placer -{ - -struct Padding -{ - // Padding record for preserving the most important information during the padding pass. - // For each node in the graph we can have the particular padding structure. - // So, here we should preserve only things specific for particular node, not something common for all nodes. - - // Original shape of the node. - tt::graphlib::Shape orig_shape; - - // Sparse matmul R dimension attribute. - int sparse_r_attr = 0; - - // Buda. - std::uint32_t pad_lhs_rt = 0; - std::uint32_t pad_lhs_ct = 0; - std::uint32_t pad_rhs_ct = 0; - - // Flag that indicates NOP insertion. - bool added_nop = false; -}; - -// Padding criterion says how we want to compute -// pad number for a given tensor -enum PaddingCriterion -{ - // These are criterions for all ops except sparse matmul - PRIME_NUMBER = 0, - POWER_OF_TWO = 1, - MULTIPLE_OF_TILE = 2, - PRIME_TILE = 3, - MULTIPLE_12_OF_TILE = 4, - MULTIPLE_10_OF_TILE = 5, - BIGGEST_FACTOR_PRIME_10 = 6, - BIGGEST_FACTOR_PRIME_10_INCREMENT = 7, - - // These are criterions specific for sparse matmul ops - SPARSE_MATMUL_BASIC = 20, - SPARSE_MATMUL_FACTORS = 21, -}; - -enum PaddingDimension -{ - R = 0, - C = 1, - Z = 2, - W = 3 -}; - -enum PaddingOperation -{ - ALL = 0, - CONVOLUTION = 1, - POOLING = 2, - ELEMENT_WISE = 3, - MATMUL = 4, - REDUCE = 5, - NN = 6, - TM = 7, -}; - -bool pad_pass_placer( - tt::graphlib::Graph *, - // This parameter represents nodes that should be padded - const std::unordered_map &, - const tt::balancer::BalancerConfig &, - std::shared_ptr balancer_cache_collection); - -std::unordered_map check_node_legality( - tt::graphlib::Graph *, - tt::graphlib::Node *, - const tt::balancer::BalancerConfig &, - std::shared_ptr); - -void remove_padding(tt::graphlib::Graph *, tt::graphlib::Node *, Padding &); - -void restore_smm(tt::graphlib::Graph *, tt::graphlib::Node *, Padding &); - -void remove_pad(tt::graphlib::Graph *, tt::graphlib::Node *, Padding &padding); - -void remove_unpad(tt::graphlib::Graph *, tt::graphlib::Node * /* , Padding &padding */); - -void remove_buda_pad(tt::graphlib::Graph *, tt::graphlib::Node *); - -void remove_buda_unpad(tt::graphlib::Graph *, tt::graphlib::Node *); - -bool pad_node(tt::graphlib::Graph *, tt::graphlib::Node *, Padding &); - -bool pad_eltwise(tt::graphlib::Graph *, tt::graphlib::Node *, Padding &); - -bool 
pad_matmul(tt::graphlib::Graph *, tt::graphlib::Node *, Padding &); - -bool pad_smm(tt::graphlib::Graph *, tt::graphlib::Node *, Padding &); - -// TODO: In Progress. -// bool pad_splice(tt::graphlib::Graph *, tt::graphlib::Node *); - -// TODO: In Progress. -// bool pad_fused_op(tt::graphlib::Graph *, tt::graphlib::Node *); - -// TODO: In Progress -// void remove_redundant_pad(tt::graphlib::Graph *); - -// TODO: In Progress -// void remove_redudant_pad_bfs(tt::graphlib::Graph *); - -// TODO: In Progress -// void remove_redudant_pad_dfs(tt::graphlib::Graph *); - -void insert_pad_smm(tt::graphlib::Node *, std::uint32_t, std::uint32_t); - -void insert_pad_buda(tt::graphlib::Graph *, tt::graphlib::Edge, std::uint32_t, std::uint32_t, float); - -void insert_unpad_buda( - tt::graphlib::Graph *, - tt::graphlib::Node *, - tt::graphlib::Edge, - std::uint32_t, - std::uint32_t, - std::uint32_t orig_r = 0, - std::uint32_t orig_c = 0); - -void set_padded_node_out_shape(Node *, Padding &); - -void insert_unpad(tt::graphlib::Graph *, tt::graphlib::Node *, tt::graphlib::Edge, Padding &, bool); - -void insert_queue(tt::graphlib::Graph *, tt::graphlib::Node *); - -void insert_queue(tt::graphlib::Graph *graph, tt::graphlib::Node *node); - -tt::graphlib::BudaOpNode *create_op( - tt::graphlib::Graph *, - tt::graphlib::Node *, - tt::graphlib::Shape, - std::vector, - std::string, - std::string); - -tt::graphlib::BudaOpNode *create_nop(tt::graphlib::Graph *, tt::graphlib::Node *, std::string); - -bool check_op_type(std::string, tt::padding_placer::PaddingOperation); - -bool is_irregular(tt::graphlib::Graph *, tt::graphlib::Node *, Padding &, PaddingCriterion); - -bool is_irregular_element_wise(tt::graphlib::Node *, Padding &, PaddingCriterion); - -bool is_irregular_matmul(tt::graphlib::Graph *, tt::graphlib::Node *, Padding &, PaddingCriterion); - -bool is_irregular_smm(tt::graphlib::Graph *, tt::graphlib::Node *, Padding &, PaddingCriterion); - -bool is_irregular(std::uint32_t, PaddingCriterion); - -bool is_sparse_irregular(std::uint32_t, std::uint32_t, PaddingCriterion); - -bool is_sparse_irregular_tiles(std::uint32_t, std::uint32_t); - -bool is_prime(std::uint32_t); - -bool is_tile_prime(std::uint32_t); - -bool is_power_of_2(std::uint32_t); - -bool is_multiple_12_of_tile(std::uint32_t); - -bool is_multiple_10_of_tile(std::uint32_t); - -bool is_biggest_factor_prime(std::uint32_t, std::uint32_t); - -std::vector prime_factorize(std::uint32_t); - -std::uint32_t round_power_of_2(std::uint32_t); - -std::uint32_t round_tile_prime(std::uint32_t); - -std::uint32_t round_multiple_12_of_tile(std::uint32_t); - -std::uint32_t round_multiple_10_of_tile(std::uint32_t); - -std::uint32_t round_biggest_factor_prime(std::uint32_t, std::uint32_t); - -std::vector round_biggest_factor_prime_inner(std::uint32_t, std::uint32_t); - -std::uint32_t increment_until_valid(std::uint32_t, PaddingCriterion); - -void compute_pad_eltwise(tt::graphlib::Node *, Padding &, PaddingCriterion); - -void compute_pad_matmul(tt::graphlib::Graph *, tt::graphlib::Node *, Padding &, PaddingCriterion); - -void compute_pad_smm(tt::graphlib::Graph *, tt::graphlib::Node *, Padding &, PaddingCriterion); - -std::uint32_t compute_pad(std::uint32_t, PaddingCriterion); - -std::uint32_t compute_sparse_pad(std::uint32_t, std::uint32_t, PaddingCriterion); - -std::uint32_t compute_sparse_pad_basic(std::uint32_t, std::uint32_t); - -std::uint32_t compute_sparse_pad_factors(std::uint32_t, std::uint32_t); - -bool check_shape_dims(tt::graphlib::Shape); - -bool 
check_shape_size(tt::graphlib::Shape); - -bool check_shape_ones(tt::graphlib::Shape); - -void update_broadcast_op_with_pad(graphlib::Graph *, graphlib::Edge, std::uint32_t, std::uint32_t); - -std::string convert_pad_op(PaddingOperation); - -std::uint32_t get_tiles_num(std::uint32_t); - -bool change_result(tt::graphlib::Node *); - -std::tuple extract_dimensions_smm( - tt::graphlib::Graph *, tt::graphlib::Node *); - -std::tuple extract_dimensions_matmul( - tt::graphlib::Graph *, tt::graphlib::Node *); - -std::tuple extract_dimensions_eltwise(tt::graphlib::Node *); - -// void update_padding_matmul(); - -// void update_padding_eltwise(); - -} // namespace tt::padding_placer diff --git a/pybuda/csrc/passes/passes_utils.cpp b/pybuda/csrc/passes/passes_utils.cpp index e67955dfe..41fb3ea91 100644 --- a/pybuda/csrc/passes/passes_utils.cpp +++ b/pybuda/csrc/passes/passes_utils.cpp @@ -3,7 +3,6 @@ // SPDX-License-Identifier: Apache-2.0 #include "passes_utils.hpp" -#include "balancer/balancer_utils.hpp" #include "graph_lib/node_types.hpp" #include "graph_lib/utils.hpp" #include "utils/logger.hpp" @@ -12,6 +11,8 @@ namespace tt { using NodeType = graphlib::NodeType; +bool divisible_either_direction(int a, int b) { return (a % b == 0) or (b % a == 0); } + void optimize_tms(std::vector &tms) { if (tms.size() < 2) { return; @@ -83,7 +84,7 @@ void optimize_tms(std::vector &tms) { {[](OpType const &a, OpType const &b) { return ((a.op == "hstack" and b.op == "hslice") or (a.op == "vstack" and b.op == "vslice")) and - balancer::divisible_either_direction(std::get(a.attr[0]), std::get(b.attr[0])); + divisible_either_direction(std::get(a.attr[0]), std::get(b.attr[0])); }, [](OpType &a, OpType &b) { @@ -312,43 +313,6 @@ bool check_unsupported_hw_ops(Graph *graph, bool should_throw) if (node->node_type() != NodeType::kBudaOp) continue; - - graphlib::BudaOpNode *op = node->as(); - py::function pybuda_parallelization = eval_module.attr("get_f_pybuda_parallelization")(op->op_type_ptr()); - py::object parallelization = pybuda_parallelization(balancer::get_op_shape(graph, node), 1); - - if (parallelization.is_none()) - { - unsupported_hw_ops = true; - - std::string attrs; - for (const auto &[key, val] : op->buda_attrs()) - { - attrs = attrs + key + ": "; - if (std::holds_alternative(val)) - { - attrs += std::to_string(std::get(val)) + ", "; - } - else if (std::holds_alternative(val)) - { - attrs += std::to_string(std::get(val)) + ", "; - } - else if (std::holds_alternative(val)) - { - attrs += std::to_string(std::get(val)) + ", "; - } - else if (std::holds_alternative(val)) - { - attrs += std::get(val) + ", "; - } - } - if (attrs.length() > 1) - { - attrs.erase(attrs.length() - 2); - } - log_warning("Unsupported HW op: {} {}({})", op->name(), op->op_type().op, attrs); - message += fmt::format("{} {}({})\n", op->name(), op->op_type().op, attrs); - } } if (unsupported_hw_ops and should_throw) diff --git a/pybuda/csrc/passes/placer_buda_passes.cpp b/pybuda/csrc/passes/placer_buda_passes.cpp deleted file mode 100644 index 3b5a76b5d..000000000 --- a/pybuda/csrc/passes/placer_buda_passes.cpp +++ /dev/null @@ -1,508 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "passes/placer_buda_passes.hpp" - -#include "balancer/balancer_cache_collection.hpp" -#include "balancer/policies/policies.hpp" -#include "passes/fracture.hpp" -#include "passes/padding_pass_placer.hpp" -#include "passes/passes_utils.hpp" -#include "passes/pre_placer_buda_passes.hpp" -#include 
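The divisible_either_direction() helper added to passes_utils.cpp backs the rule in optimize_tms() that lets an hstack followed by an hslice (or vstack/vslice) be merged when their factors divide one another. The sketch below only illustrates that divisibility condition and the quotient a merged TM would presumably carry; it is not the edge-attribute rewrite itself.

// Illustration of the merge condition; example factors are arbitrary.
#include <iostream>

bool divisible_either_direction(int a, int b) { return (a % b == 0) or (b % a == 0); }

int main()
{
    int stack = 8, slice = 4;  // hstack(8) followed by hslice(4)
    if (divisible_either_direction(stack, slice))
    {
        if (stack >= slice)
            std::cout << "collapses to hstack(" << stack / slice << ")\n";
        else
            std::cout << "collapses to hslice(" << slice / stack << ")\n";
    }
    return 0;
}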
"placer/lower_to_placer.hpp" -#include "placer/placer.hpp" -#include "reportify/reportify.hpp" -#include "utils/env.hpp" - -namespace tt::passes -{ - -// Insert NOPs on queues that have a TM on their input. -// -static void fix_tms_on_queues(graphlib::Graph* graph) -{ - for (Node* n : graph->nodes_by_type(graphlib::NodeType::kQueue)) - { - std::vector edges = graph->operand_data_edges(n); - TT_ASSERT(edges.size() == 1); - std::vector tms = graph->get_edge_attributes(edges[0])->get_tms(); - - if (tms.size() == 0) - continue; - - graphlib::BudaOpNode* nop = graph->add_node( - graphlib::create_node(n->name() + "_tm_nop", "nop"), - graph->get_subgraph_id_for_node(n->id())); - nop->copy_parent_op_attributes(graph->node_by_id(edges[0].producer_node_id)->as()); - graphlib::insert_node_on_edge(graph, edges[0], nop); - } -} - -static void graph_padding_pass( - graphlib::Graph* graph, - std::unordered_map& nodes_to_pad, - const balancer::BalancerConfig& balancer_config, - std::shared_ptr balancer_cache_collection) -{ - // Pad graph before balancing. - // - if (!env_as("PYBUDA_DISABLE_PADDING_PASS") and !env_as("PYBUDA_PADDING_PASS_DISABLE_BUDA_OP")) - { - bool padded_flag = - padding_placer::pad_pass_placer(graph, nodes_to_pad, balancer_config, balancer_cache_collection); - if (padded_flag) - { - fix_tms_on_queues(graph); - recalculate_shapes(graph); - nodes_to_pad.clear(); - } - } -} - -static void graph_padding_override_pass( - graphlib::Graph* graph, - const py::dict& paddings_dict, - const balancer::BalancerConfig& balancer_config, - std::shared_ptr balancer_cache_collection) -{ - std::unordered_map nodes_to_pad; - - // Convert paddings from pybind11, python py::dict to c++ std::map. - // - std::map paddings = paddings_dict.cast>(); - for (std::pair padding : paddings) - { - std::string name = padding.first; - bool pad = padding.second; - - if (!pad) - continue; - - Node* node = graph->get_node_by_name(name); - balancer::BudaOpNodeLegalizerFailureInfo failure_info; - failure_info.recordOpModelFailure(balancer::OpModelFailureReason::NoFailure); - nodes_to_pad.emplace(node, failure_info); - } - - graph_padding_pass(graph, nodes_to_pad, balancer_config, balancer_cache_collection); -} - -// Insert nop after a node -static std::string insert_nop(graphlib::Graph* graph, const std::string& src_op) -{ - Node* src; - TT_ASSERT(graph->has_node_with_name(src_op)); - src = graph->get_node_by_name(src_op); - - std::uint32_t buffer_index = 0; - - auto op_name = [](Node* src, std::uint32_t buffer_index) - { return "dram_writer_" + std::to_string(buffer_index) + "_" + src->name(); }; - - while (graph->has_node_with_name(op_name(src, buffer_index))) buffer_index++; - - graphlib::BudaOpNode* buffer_nop = nullptr; - std::cout << "Insert NOP after " << src->name() << std::endl; - - for (graphlib::Edge e : graph->user_data_edges(src)) - { - std::cout << " - edge" << std::endl; - if (e.edge_type != graphlib::EdgeType::kData) - continue; - - if (buffer_nop == nullptr) - { - std::cout << " - creating nop" << std::endl; - buffer_nop = graph->add_node( - graphlib::create_node(op_name(src, buffer_index), "nop"), - graph->get_subgraph_id_for_node(src->id())); - buffer_nop->set_shape(src->shape()); - buffer_nop->set_buffering_op(true); - } - - auto [edge0, edge1] = graphlib::insert_node_on_edge(graph, e, buffer_nop); - log_trace( - LogGraphCompiler, - "Inserted dram writer nop node {} between {} and {}", - buffer_nop->name(), - src->name(), - graph->node_by_id(e.consumer_node_id)->name()); - - // Move TMs to edge1 - auto& 
tms = graph->get_edge_attributes(edge0)->get_tms(); - if (true) // not sure - { - // not hoisting tms, move them to edge1 - graph->get_edge_attributes(edge1)->set_tms(tms); - graph->get_edge_attributes(edge0)->set_tms(std::vector{}); - } - } - - TT_ASSERT(buffer_nop != nullptr); - std::cout << " - created nop " << buffer_nop->name() << std::endl; - return buffer_nop->name(); -} - -static void handle_node_exceeds_max_op_forks( - graphlib::Graph* graph, balancer::BalancerError::NodeExceedsMaxOpForks type, int attempt) -{ - auto nodes = type.specific_node() ? std::vector{graph->node_by_id(type.node_id)} : graph->nodes(); - for (graphlib::Node* node : nodes) - { - if (node->node_type() != graphlib::NodeType::kBudaOp) - continue; - - std::vector users = graph->user_data_edges(node); - if ((int)users.size() <= type.max_forks) - continue; - - int num_nops = (int)ceil(users.size() / type.max_forks); - - for (int nop_i = 0; nop_i < num_nops; ++nop_i) - { - graphlib::OpType op_type("nop"); - graphlib::BudaOpNode* nop = graph->add_node( - graphlib::create_node( - node->name() + "_attempt_" + std::to_string(attempt) + "_input_op_fork_nop" + std::to_string(nop_i), - op_type), - graph->get_subgraph_id_for_node(node->id())); - nop->set_shape(node->shape()); - nop->set_output_df(node->output_df()); - nop->set_epoch_type(node->get_epoch_type()); - - graphlib::Edge input_nop_edge(node->id(), 0, nop->id(), 0, graphlib::EdgeType::kData); - graph->add_edge(input_nop_edge); - - for (int edge_i = (nop_i * type.max_forks); - edge_i < std::min(((nop_i + 1) * type.max_forks), (int64_t)users.size()); - ++edge_i) - { - graphlib::Edge edge = users[edge_i]; - auto edge_attrs = graph->get_edge_attributes(edge); - graph->remove_edge(edge); - - graphlib::Edge output_nop_edge( - nop->id(), 0, edge.consumer_node_id, edge.consumer_input_port_id, edge.edge_type); - graph->add_edge(output_nop_edge, edge_attrs); - - // Associate nop with one of the fwd to bwd edges (so it can belong to an op group) - if (nop->get_epoch_type() != graphlib::NodeEpochType::Forward and edge_i % type.max_forks == 0) - { - for (auto bwd_edge : graph->operand_edges(graph->node_by_id(edge.consumer_node_id))) - { - if (bwd_edge.edge_type != graphlib::EdgeType::kAutogradFwdToBwd and - bwd_edge.edge_type != graphlib::EdgeType::kAutogradFwdToGradient and - bwd_edge.edge_type != graphlib::EdgeType::kAutogradFwdToOptimizer and - bwd_edge.edge_type != graphlib::EdgeType::kAutogradFwdToRecompute) - continue; - - graphlib::Edge nop_bwd_edge( - bwd_edge.producer_node_id, - bwd_edge.producer_output_port_id, - nop->id(), - 0, - bwd_edge.edge_type); - graph->add_edge(nop_bwd_edge); - } - } - } - } - } -} - -static void handle_input_exceeds_max_grid_forks( - graphlib::Graph* graph, balancer::BalancerError::InputBroadcastExceedsMaxGridForks) -{ - split_broadcasts(graph); -} - -static void handle_dram_writer_needs_nop( - graphlib::Graph* graph, - balancer::BalancerConfig& balancer_config, - balancer::BalancerError::DRAMWriterNOPNeeded type) -{ - const std::string nop_name = insert_nop(graph, type.src); - if (type.transpose) - balancer_config.op_name_to_placer_overrides.emplace(nop_name, placer::PlacerOpOverride::force_op_transpose()); -} - -static bool handle_matmul_no_valid_grid(graphlib::Graph* graph, graphlib::BudaOpNode* op_node) -{ - TT_ASSERT(op_node != nullptr); - TT_ASSERT(op_node->is_matmul()); - - auto matmul_output_edges = graph->user_data_edges(op_node); - if (matmul_output_edges.size() != 1) - return false; - - auto edge_attr = 
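The fork splitting in handle_node_exceeds_max_op_forks() comes down to partitioning the consumer edges into groups of at most max_forks, with one buffering nop per group. A standalone sketch of the grouping arithmetic, using an explicit integer ceiling and example values:

// Partition num_users edges into ceil(num_users / max_forks) groups.
#include <algorithm>
#include <cstddef>
#include <iostream>

int main()
{
    const std::size_t num_users = 10;
    const std::size_t max_forks = 4;
    const std::size_t num_nops = (num_users + max_forks - 1) / max_forks;  // 3

    for (std::size_t nop_i = 0; nop_i < num_nops; ++nop_i)
    {
        std::size_t begin = nop_i * max_forks;
        std::size_t end = std::min((nop_i + 1) * max_forks, num_users);
        std::cout << "nop " << nop_i << " takes user edges [" << begin << ", " << end << ")\n";
    }
    return 0;
}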
graph->get_edge_attributes(matmul_output_edges[0]); - auto consumer_op = dynamic_cast(graph->node_by_id(matmul_output_edges[0].consumer_node_id)); - - // Handle Matmul -> Eltwise Binary edge where Matmul has no valid grid - if (consumer_op == nullptr or not graphlib::is_eltwise_binary(consumer_op)) - return false; - if (not edge_attr->has_tm("transpose")) - return false; - - // We have a matmul -> eltwise binary edge with transpose TM - return try_insert_nop_on_transpose_edge(graph, matmul_output_edges[0]); -} - -static bool handle_splice_no_valid_grid(graphlib::Graph* graph, graphlib::BudaOpNode* op_node) -{ - TT_ASSERT(op_node != nullptr); - TT_ASSERT(op_node->is_splice()); - int dim = op_node->op_type().get_attr_as("dim"); - if (dim != 2 and dim != 3) - return false; - - auto [orig_dim, input_slices, output_stack] = - op_node->py_attr, int>>("convert_mode_t"); - - auto operand_edges = graph->operand_data_edges(op_node); - TT_ASSERT(operand_edges.size() == input_slices.size()); - for (std::size_t i = 0; i < operand_edges.size(); ++i) - { - auto edge = operand_edges[i]; - auto slice_factor = input_slices[i]; - if (slice_factor > 1) - { - std::vector& tms = graph->get_edge_attributes(edge)->get_tms(); - tms.push_back(graphlib::OpType((orig_dim == 2) ? "vslice" : "hslice", {slice_factor})); - } - } - - if (output_stack > 1) - { - for (auto edge : graph->user_data_edges(op_node)) - { - std::vector& tms = graph->get_edge_attributes(edge)->get_tms(); - tms.insert(tms.begin(), graphlib::OpType((orig_dim == 2) ? "vstack" : "hstack", {output_stack})); - } - } - - return true; -} - -static void handle_no_valid_grid( - graphlib::Graph* graph, - std::unordered_map& nodes_without_legal_op_model) -{ - std::vector fixed; - for (const auto& node_fail_pair : nodes_without_legal_op_model) - { - const auto op_node = dynamic_cast(node_fail_pair.first); - - if (op_node == nullptr) - continue; - - if (op_node->is_matmul() and handle_matmul_no_valid_grid(graph, op_node)) - { - fixed.push_back(op_node); - } - else if (op_node->is_splice() and handle_splice_no_valid_grid(graph, op_node)) - { - fixed.push_back(op_node); - } - } - - for (auto op_node : fixed) nodes_without_legal_op_model.erase(op_node); -} - -void insert_queues( - graphlib::Graph* graph, - std::unordered_map& nodes_without_legal_op_model) -{ - std::vector fixed; - - for (auto const& [node, info] : nodes_without_legal_op_model) - { - auto* op_node = dynamic_cast(node); - TT_ASSERT(op_node); - - if (not info.getOpModelFailureCountByType(balancer::OpModelFailureReason::UserAccessPreventsStreaming)) - continue; - - bool users_already_fixed = false; - for (auto user : graph->user_data_edges(node)) - { - auto* user_op_node = dynamic_cast(graph->node_by_id(user.consumer_node_id)); - if (not user_op_node) - { - auto* user_queue_node = dynamic_cast(graph->node_by_id(user.consumer_node_id)); - if (user_queue_node and user_queue_node->has_tag("no_valid_grids_queue")) - users_already_fixed = true; - continue; - } - - auto name = op_node->name() + "_no_valid_grids_queue"; - if (graph->has_node_with_name(name)) - { - graphlib::QueueNode* queue = graph->get_node_by_name(name)->as(); - auto attr = graph->remove_edge(user); - user.producer_node_id = queue->id(); - graph->add_edge(user, attr); - } - else - { - graphlib::QueueNode* queue = graph->add_node( - graphlib::create_node(name, 2), - graph->get_subgraph_id_for_node(op_node->id())); - queue->tag("no_valid_grids_queue"); - - bool constexpr inherit_consumer_attrs = false; - bool constexpr remove_edge = true; - 
bool constexpr place_tms_on_outgoing = true; - graphlib::insert_node_on_edge( - graph, - user, - queue, - inherit_consumer_attrs, - remove_edge, - user.consumer_input_port_id, - place_tms_on_outgoing); - } - } - - if (not users_already_fixed) - fixed.push_back(op_node); - } - - for (auto op_node : fixed) nodes_without_legal_op_model.erase(op_node); -} - -std::pair, bool> run_placer_buda_passes( - graphlib::Graph* graph, - balancer::BalancerConfig balancer_config, - FractureChipIdAssignments const& fracture_chip_id_assignments, - const py::dict& paddings_dict) -{ - int max_balancer_attempts = 30; - int attempt = 0; - int max_minor_attempts = 200; // we expect a lot of these... really need to not have a limit, but a forward - // progress indicator - keep going if the number of epochs placed is increasing. - int minor_attempt = 0; - - std::shared_ptr balancer_cache_collection = - std::make_shared(); - - // Do padding if there are any overrides specified - // in paddings_dict. - // - if (paddings_dict.size() > 0) - { - graph_padding_override_pass(graph, paddings_dict, balancer_config, balancer_cache_collection); - } - - while (attempt < max_balancer_attempts) - { - check_unsupported_hw_ops(graph, true); - - try - { - // assign chips ids from fracture chip assignments, if any - // iterate over the fracture chip assignments - for (auto const& [node, chip_id] : fracture_chip_id_assignments) - { - // check if the balancer config already has this op name - if (balancer_config.op_name_to_placer_overrides.find(node) != - balancer_config.op_name_to_placer_overrides.end()) - { - // if it does, check if there is a chip id - if (balancer_config.op_name_to_placer_overrides[node].chip_id) - { - continue; - } - else - { - // if there is not, add the fracture chip id to the balancer config - balancer_config.op_name_to_placer_overrides[node].chip_id = chip_id; - } - } - else - { - // if it does not, add it to the balancer config - auto placer_override = placer::PlacerOpOverride(); - placer_override.chip_id = chip_id; - balancer_config.op_name_to_placer_overrides.emplace(node, placer_override); - } - } - - return std::make_pair( - balancer::run_balancer_and_placer(graph, balancer_config, balancer_cache_collection), attempt > 0); - } - catch (balancer::BalancerError const& e) - { - log_debug(LogGraphCompiler, "Handle BalancerError: {}", e.what()); - - attempt++; - if (balancer::BalancerError::NodeExceedsMaxOpForks const* type = - std::get_if(&e.type)) - { - handle_node_exceeds_max_op_forks(graph, *type, attempt); - } - else if ( - balancer::BalancerError::InputBroadcastExceedsMaxGridForks const* type = - std::get_if(&e.type)) - { - handle_input_exceeds_max_grid_forks(graph, *type); - } - else if ( - balancer::BalancerError::DRAMWriterNOPNeeded const* type = - std::get_if(&e.type)) - { - handle_dram_writer_needs_nop(graph, balancer_config, *type); - attempt--; - minor_attempt++; - if (minor_attempt > max_minor_attempts) - break; - } - else if ( - balancer::BalancerError::NoValidGrid const* type = - std::get_if(&e.type)) - { - auto nodes_without_legal_op_model = type->nodes_without_legal_op_model; - - if (not nodes_without_legal_op_model.empty()) - handle_no_valid_grid(graph, nodes_without_legal_op_model); - - if (not nodes_without_legal_op_model.empty()) - graph_padding_pass(graph, nodes_without_legal_op_model, balancer_config, balancer_cache_collection); - - if (not nodes_without_legal_op_model.empty()) - insert_queues(graph, nodes_without_legal_op_model); - - if (not nodes_without_legal_op_model.empty()) - 
throw; - } - else if (balancer::BalancerError::Fatal const* type = std::get_if(&e.type)) - { - log_fatal(LogGraphCompiler, "Fatal balancer error: {}", type->message); - } - else - { - throw; - } - } - - reportify::dump_graph(graph->name(), "balancer_error_handler_attempt" + std::to_string(attempt), graph); - - // We have to rerun the scheduler + some pre_placer graph passes after editing the graph - placer::PlacerConfigUpdate updated_config = schedule_pre_placer_graph( - graph, - balancer_config.device_config, - balancer_config.scheduler_config, - balancer_config.chip_ids, - balancer_config.op_names_to_chip_break, - balancer_config.op_names_to_epoch_break, - fracture_chip_id_assignments, - "_attempt" + std::to_string(attempt) /* nops_remote_devices_postfix */, - balancer_config.use_interactive_placer); - balancer_config.op_to_chip_id_assignment = updated_config.op_to_chip_id_assignment; - } - - log_fatal("Error: We failed to balance/place after {} attempts", max_balancer_attempts); - // unreachable - return std::make_pair(nullptr, false); -} -} // namespace tt::passes diff --git a/pybuda/csrc/passes/placer_buda_passes.hpp b/pybuda/csrc/passes/placer_buda_passes.hpp deleted file mode 100644 index 43e3a9508..000000000 --- a/pybuda/csrc/passes/placer_buda_passes.hpp +++ /dev/null @@ -1,23 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include - -#include "balancer/balancer.hpp" -#include "passes/fracture.hpp" - -namespace tt::passes -{ -void insert_queues( - graphlib::Graph* graph, - std::unordered_map& nodes_without_legal_op_model); - -std::pair, bool> run_placer_buda_passes( - graphlib::Graph* graph, - balancer::BalancerConfig balancer_config, - FractureChipIdAssignments const& fracture_chip_id_assignments, - const py::dict &paddings_dict); -} // namespace tt::passes diff --git a/pybuda/csrc/passes/post_placer_buda_passes.cpp b/pybuda/csrc/passes/post_placer_buda_passes.cpp deleted file mode 100644 index b506c61fa..000000000 --- a/pybuda/csrc/passes/post_placer_buda_passes.cpp +++ /dev/null @@ -1,935 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "post_placer_buda_passes.hpp" - -#include "lower_to_buda/common.hpp" -#include "utils/env.hpp" - -namespace tt -{ - -void set_prologue_queues(Graph *graph, balancer::OpModelMap const &op_model_map) -{ - for (Node *node : graph->nodes()) - { - if (node->node_type() == graphlib::NodeType::kInput) - { - balancer::OpModel const &op_model = op_model_map.at(node->name()); - node->as()->set_prologue(op_model.input_prologue); - log_trace(LogGraphCompiler, "Prologue {}: {}", node->name(), op_model.input_prologue); - } - } -} - -static void set_kernel_broadcast(graphlib::BudaOpNode *op, balancer::OpModel const &op_model) -{ - graphlib::OpType &type = op->op_type(); - int input_idx = 0; - for (auto const &input_buffer : op_model.input_buffers) - { - std::string input_name = "input_" + std::to_string(input_idx++); - if (input_buffer.kernel_broadcast_tiles == 0) - continue; - auto &attr = type.buda_attrs["kernel_broadcast"]; - if (not std::holds_alternative(attr)) - attr = BudaKernelBroadcastInputs{}; - std::get(attr)[input_name] = input_buffer.kernel_broadcast_tiles; - } -} - -static void set_l1_accumulate(graphlib::BudaOpNode *op, bool is_wormhole_b0) -{ - - /* - - This function should provide accumulation in L1 memory. 
- - This are the constraints related to this feature: - - - (1) Intermediate format can be float32, float16_b, int32 (float16_a is excluded due to the hw bug) - Note: We will skip format int32 for now. - - - (2) If intermed and output format are the same then buffer sharing is enabled (interm and output buffer share same physical address range in l1). - We'll get performance gain when m_k > 1. - - - (3) If interm and output format are different then buffers are split and we don't need to double buffer output (buf_size_mb can be set to 1). - We'll get performance gain if m_k>2. When m_k is 2 we need to spill and reload once to repack data in different format and there is only single pass through interm buffers. - For the case m_k=2 we'll introduce overhead and reduce perf if l1 acc is enabled. - - - (4) If m_k=1 l1 acc won't make any difference as we don't spill into l1. - - - (5) For matmul accumulations and accumulate data format Fp32, intermediate data format must be Fp32. - - - (6) If the operation is reduce skip L1 accumulation. - - */ - - // Flag for disabling and debugging L1 accumaulation feature. - if (env_as("PYBUDA_DISABLE_L1_ACCUMULATE")) - return; - - // L1 accumulation is only supported on WH B0 - if (!is_wormhole_b0) - return; - - // Check data formats (1) - bool is_not_float32 = op->intermediate_df() != DataFormat::Float32; - bool is_not_float16_b = op->intermediate_df() != DataFormat::Float16_b; - - bool is_acc_float32 = op->accumulate_df() == DataFormat::Float32; - - if (is_not_float32 && is_not_float16_b) - return; - - // Check matmul and, intermediate and accumulate data format. (5) - if (op->is_matmul() && is_acc_float32 && is_not_float32) - return; - - graphlib::OpType &type = op->op_type(); - - // Check reduce op. (6) - if (type.op == "reduce") - return; - - // If m_k exists as attribute retrieve it, otherwise don't apply L1 accumulation. 
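The constraints above condense into a pure decision function. The sketch below assumes the environment override and wormhole_b0 checks have already passed, and models data formats with a local enum rather than tt::DataFormat:

// Mirrors rules (1)-(6): returns whether l1_acc would be set for an op.
#include <iostream>

enum class Df { Float32, Float16_b, Float16_a, Other };

bool enable_l1_acc(Df intermediate, Df output, Df accumulate,
                   bool is_matmul, bool is_reduce, int m_k, bool relu)
{
    if (intermediate != Df::Float32 && intermediate != Df::Float16_b)
        return false;                  // (1) unsupported intermediate format
    if (is_matmul && accumulate == Df::Float32 && intermediate != Df::Float32)
        return false;                  // (5) fp32 accumulation needs fp32 intermediates
    if (is_reduce)
        return false;                  // (6) reduce is skipped
    if (m_k == 1)
        return false;                  // (4) nothing is spilled to L1
    if (intermediate == output)
        return m_k > 1 && !relu;       // (2) shared intermediate/output buffer
    return m_k > 2 || relu;            // (3) split buffers
}

int main()
{
    std::cout << std::boolalpha
              << enable_l1_acc(Df::Float16_b, Df::Float16_b, Df::Float16_b, true, false, 4, false) << "\n"  // true
              << enable_l1_acc(Df::Float16_b, Df::Float32, Df::Float16_b, true, false, 2, false) << "\n";   // false
    return 0;
}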
- if (type.buda_attrs.find("m_k") == type.buda_attrs.end()) - return; - - int m_k = std::get(type.buda_attrs["m_k"]); - - // (4) - if (m_k == 1) - return; - - bool relu_present = (type.buda_attrs.find("relu_en") != type.buda_attrs.end()); - - // Compare intermediate and output data formats - // (2) - if (op->intermediate_df() == op->output_df()) - { - if (m_k > 1 && !relu_present) - type.buda_attrs["l1_acc"] = true; - } - // (3) - else - { - if (m_k > 2 || relu_present) - type.buda_attrs["l1_acc"] = true; - } - -} - -static std::tuple calculate_sparse_mm_inner_dim( - graphlib::Graph *graph, graphlib::BudaOpNode *op, balancer::OpModelMap const &op_model_map) -{ - balancer::OpModel const &op_model = op_model_map.at(op->name()); - auto operands = graph->operand_data_edges(op); - TT_ASSERT(operands.size() >= 3); - graphlib::Node *input1 = graph->node_by_id(operands.at(1).producer_node_id); - balancer::BlockShape input1_block_shape = op_model.input_buffers[1].block_shape; - auto const &tms = graph->get_edge_attributes(operands.at(1))->get_tms(); - - auto input1_canonical_form = post_tms_shape(input1->shape(), tms); - - int act_t = input1_canonical_form.z(); - int u_kt = input1_block_shape.ublock.rt; - int m_k = op_model.op_shape.inputs[1].rt / u_kt; - return std::make_tuple(act_t, m_k, u_kt); -} - -void post_placer_lower_buda_attrs( - Graph *graph, DeviceConfig const &device_config, balancer::OpModelMap const &op_model_map) -{ - for (Node *node : graph->nodes()) - { - if (node->node_type() == graphlib::NodeType::kBudaOp) - { - graphlib::BudaOpNode *op = node->as(); - graphlib::OpType &type = op->op_type(); - balancer::OpModel const &op_model = op_model_map.at(node->name()); - - if (op->is_dense_matmul()) - { - balancer::BlockShape input0_block_shape = op_model.input_buffers[0].block_shape; - type.buda_attrs["m_k"] = op_model.op_shape.inputs[0].ct / input0_block_shape.ublock.ct; - type.buda_attrs["u_kt"] = input0_block_shape.ublock.ct; - } - else if (op->is_sparse_matmul()) - { - auto [act_t, m_k, u_kt] = calculate_sparse_mm_inner_dim(graph, op, op_model_map); - type.buda_attrs["act_t"] = act_t; - type.buda_attrs["m_k"] = m_k; - type.buda_attrs["u_kt"] = u_kt; - } - else if (op->is_depthwise_matmul()) - { - type.buda_attrs["u_kt"] = 1; // hlk limitation - type.buda_attrs["m_k"] = op_model.op_shape.inputs[1].rt; // inner-dim of in1 in tiles - } - else if (type.op == "reduce") - { - balancer::BlockShape input0_block_shape = op_model.input_buffers[0].block_shape; - if (std::get(type.buda_attrs.at("dim")) == "r") - { - type.buda_attrs["m_k"] = op_model.op_shape.inputs[0].rt / input0_block_shape.ublock.rt; - type.buda_attrs["u_kt"] = input0_block_shape.ublock.rt; - } - else if (std::get(type.buda_attrs.at("dim")) == "c") - { - type.buda_attrs["m_k"] = op_model.op_shape.inputs[0].ct / input0_block_shape.ublock.ct; - type.buda_attrs["u_kt"] = input0_block_shape.ublock.ct; - } - } - - set_kernel_broadcast(op, op_model); - set_l1_accumulate(op, device_config.is_wormhole_b0()); - } - } -} - -graphlib::QueueNode *insert_epoch_to_epoch_queue( - graphlib::Graph *graph, - const std::string &name, - graphlib::Edge edge, - graphlib::UBlockOrder op_ublock_order, - bool cross_epoch_type, - bool cross_chip_type, - graphlib::NodeEpochType user_epoch_type, - graphlib::QueueNode *q = nullptr) -{ - TT_ASSERT(edge.edge_type == graphlib::EdgeType::kData, "Only data edge can be broken up with e2e queues"); - - if (q == nullptr) - { - // Create new queue - q = graph->add_node( - graphlib::create_node(name, 
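For dense matmul, post_placer_lower_buda_attrs() derives m_k and u_kt from the chosen ublock shape: the inner dimension measured in tiles is covered by m_k outer accumulation steps of u_kt tiles each. A tiny worked example with made-up numbers:

// Example derivation of the m_k / u_kt attributes; values are illustrative.
#include <cstdint>
#include <iostream>

int main()
{
    std::uint32_t inner_dim_tiles = 24;  // ct of input 0 in tiles (example)
    std::uint32_t ublock_ct = 4;         // ublock width picked by the balancer (example)

    std::uint32_t u_kt = ublock_ct;
    std::uint32_t m_k = inner_dim_tiles / u_kt;  // 24 / 4 = 6 accumulation steps

    std::cout << "m_k = " << m_k << ", u_kt = " << u_kt << "\n";
    return 0;
}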
cross_epoch_type, cross_chip_type), - graph->get_subgraph_id_for_node(edge.producer_node_id)); - q->set_shape(graph->node_by_id(edge.producer_node_id)->shape()); - q->set_output_df(graph->node_by_id(edge.producer_node_id)->output_df()); - q->set_epoch_type(user_epoch_type); - - Edge node_to_q_edge(edge.producer_node_id, edge.producer_output_port_id, q->id(), 0, graphlib::EdgeType::kData); - graph->add_edge(node_to_q_edge); - graph->get_edge_attributes(node_to_q_edge)->set_ublock_order(op_ublock_order); - } - - // Add edge from queue to consumer - graphlib::Edge q_to_node_edge = - Edge(q->id(), 0, edge.consumer_node_id, edge.consumer_input_port_id, graphlib::EdgeType::kData); - graph->add_edge(q_to_node_edge); - graph->copy_edge_attributes(edge, q_to_node_edge); - graph->remove_edge(edge); - - return q; -} - -graphlib::QueueNode *insert_buffering_queue( - graphlib::Graph *graph, - const std::string &name, - graphlib::Edge edge, - graphlib::UBlockOrder op_ublock_order, - graphlib::NodeEpochType user_epoch_type, - graphlib::QueueNode *q = nullptr) -{ - TT_ASSERT(edge.edge_type == graphlib::EdgeType::kData, "Only data edge can be broken up with this queue!"); - - if (q == nullptr) - { - // Create new queue - q = graph->add_node( - graphlib::create_node(name, graph->get_microbatch()), - graph->get_subgraph_id_for_node(edge.producer_node_id)); - q->set_shape(graph->node_by_id(edge.producer_node_id)->shape()); - q->set_output_df(graph->node_by_id(edge.producer_node_id)->output_df()); - q->set_epoch_type(user_epoch_type); - - Edge node_to_q_edge(edge.producer_node_id, edge.producer_output_port_id, q->id(), 0, graphlib::EdgeType::kData); - graph->add_edge(node_to_q_edge); - graph->get_edge_attributes(node_to_q_edge)->set_ublock_order(op_ublock_order); - } - - // Add edge from queue to consumer - graphlib::Edge q_to_node_edge = - Edge(q->id(), 0, edge.consumer_node_id, edge.consumer_input_port_id, graphlib::EdgeType::kData); - graph->add_edge(q_to_node_edge); - graph->copy_edge_attributes(edge, q_to_node_edge); - graph->remove_edge(edge); - - return q; -} - -graphlib::Node *get_existing_fwd_e2e_queue( - graphlib::Graph *graph, const placer::PlacerSolution &placer_solution, graphlib::Node *recompute_node) -{ - Node *fwd_node = graphlib::get_fwd_from_recompute(graph, recompute_node); - std::uint32_t src_temporal_epoch = placer_solution.temporal_epoch_id(fwd_node->name()); - for (Edge e : graph->user_data_edges(fwd_node)) - { - graphlib::Node *dest_node = graph->node_by_id(e.consumer_node_id); - // If destination node is kQueue then placer_solution.temporal_epoch_id(dest_node->name()) doesn't contain - // dest_node->name() To infer dest_temporal_epoch we use consumer - if (dest_node->node_type() == tt::graphlib::kQueue) - { - dest_node = graph->node_by_id(graph->user_data_edges(dest_node)[0].consumer_node_id); - } - - std::uint32_t dest_temporal_epoch = placer_solution.temporal_epoch_id(dest_node->name()); - if (src_temporal_epoch != dest_temporal_epoch) - { - log_debug( - LogGraphCompiler, - "recompute_node: {} mapped to fwd_node: {} and e2e queue: {}", - recompute_node->name(), - fwd_node->name(), - dest_node->name()); - return fwd_node; - } - } - return nullptr; -} - -static void reconnect_recompute_consumers_to_fwd_queue( - graphlib::Graph *graph, graphlib::Node *recompute_node, graphlib::Node *queue) -{ - for (Edge e : graph->user_data_edges(recompute_node)) - { - graphlib::Edge new_edge = Edge( - queue->id(), - e.producer_output_port_id, - e.consumer_node_id, - e.consumer_input_port_id, - 
graphlib::EdgeType::kData); - graph->add_edge(new_edge); - graph->copy_edge_attributes(e, new_edge); - graph->remove_edge(e); - log_debug( - LogGraphCompiler, - "\t Removing edge connecting: {}->{}", - recompute_node->name(), - graph->node_by_id(e.consumer_node_id)->name()); - } -} - -void replace_recompute_with_checkpoint(graphlib::Graph *graph, const placer::PlacerSolution &placer_solution) -{ - std::vector nodes = graphlib::topological_sort(*graph); - std::deque> nodes_to_delete; - - for (graphlib::Node *node : nodes) - { - if (node->node_type() != graphlib::NodeType::kBudaOp) - { - continue; - } - if (not graphlib::is_recompute(graph, node)) - { - continue; - } - if (graph->num_users(node) == 0) - { - nodes_to_delete.emplace_back(node->name(), false); - } - - if (graphlib::Node *queue = get_existing_fwd_e2e_queue(graph, placer_solution, node); queue != nullptr) - { - reconnect_recompute_consumers_to_fwd_queue(graph, node, queue); - nodes_to_delete.emplace_back(node->name(), false); - continue; - } - - try - { - std::uint32_t src_temporal_epoch = placer_solution.temporal_epoch_id(node->name()); - for (Edge e : graph->user_data_edges(node)) - { - graphlib::Node *dest_node = graph->node_by_id(e.consumer_node_id); - if (dest_node->node_type() != graphlib::NodeType::kBudaOp) - continue; - if (dest_node->node_type() == graphlib::NodeType::kQueue) - continue; - - try - { - std::uint32_t dest_temporal_epoch = placer_solution.temporal_epoch_id(dest_node->name()); - - if (src_temporal_epoch > dest_temporal_epoch) - { - Node *fwd_node = graphlib::get_fwd_from_recompute(graph, node); - - graphlib::Edge new_edge = Edge( - fwd_node->id(), - e.producer_output_port_id, - dest_node->id(), - e.consumer_input_port_id, - graphlib::EdgeType::kData); - - graph->add_edge(new_edge); - log_trace( - LogGraphCompiler, - "Bypassing {}. 
Connecting {} to {}.", - node->name(), - fwd_node->name(), - dest_node->name()); - log_trace(LogGraphCompiler, "src= {}, dst={}.", src_temporal_epoch, dest_temporal_epoch); - - graph->copy_edge_attributes(e, new_edge); - graph->remove_edge(e); - - nodes_to_delete.emplace_back(node->name(), false); - for (auto operand : graph->data_operands(node)) - { - nodes_to_delete.emplace_back(operand->name(), false); - } - } - } - catch (std::out_of_range &e) - { - throw std::runtime_error( - "Placement solution missing for node " + dest_node->name() + - " while inserting epoch_to_epoch queues"); - } - } - } - catch (std::out_of_range &e) - { - throw std::runtime_error( - "Placement solution missing for node " + node->name() + " while inserting epoch_to_epoch queues"); - } - } - while (not nodes_to_delete.empty()) - { - auto [node_to_delete_name, force_delete] = nodes_to_delete.front(); - nodes_to_delete.pop_front(); - - if (graph->has_node_with_name(node_to_delete_name)) - { - Node *node_to_delete = graph->get_node_by_name(node_to_delete_name); - if (force_delete) - { - graph->remove_node(node_to_delete); - } - else - { - if (graph->num_users(node_to_delete->id()) == 0) - { - for (Node *operand : graph->data_operands(node_to_delete)) - { - nodes_to_delete.emplace_back(operand->name(), false); - } - graph->remove_node(node_to_delete); - } - } - } - } -} - -static bool feeds_remote_chips( - const placer::PlacerSolution &placer_solution, graphlib::Graph *graph, graphlib::Node *producer) -{ - std::uint32_t producer_chip_id = placer_solution.chip_id(producer->name()); - for (const Edge &e : graph->user_data_edges(producer)) - { - graphlib::Node *dest_node = graph->node_by_id(e.consumer_node_id); - if (dest_node->node_type() == graphlib::NodeType::kBudaOp) - { - if (producer_chip_id != placer_solution.chip_id(dest_node->name())) - { - return true; - } - } - } - return false; -} - -void validate_subgraph_placement(Graph *graph, placer::PlacerSolution &placer_solution) -{ - for (size_t epoch = 0; epoch < placer_solution.epoch_id_to_epoch_info.size(); epoch++) - { - std::vector ops = placer_solution.epoch_id_to_op_placement[epoch]; - int epoch_subgraph_index = -1; - for (placer::OpPlacement op : ops) - { - if (not graph->has_node_with_name(ops[0].name)) - continue; - - if (epoch_subgraph_index == -1) - epoch_subgraph_index = graph->get_subgraph_id_for_node(graph->get_node_by_name(ops[0].name)->id()); - - TT_ASSERT(graph->get_subgraph_id_for_node(graph->get_node_by_name(ops[0].name)->id()) == (unsigned int)epoch_subgraph_index); - } - // If a subgraph index was found, set it for the epoch - if (epoch_subgraph_index != -1) - placer_solution.epoch_id_to_subgraph_index[epoch] = epoch_subgraph_index; - else - { - // otherwise for empty epochs deafult to 0 - placer_solution.epoch_id_to_subgraph_index[epoch] = 0; - } - } -} - -static bool feeds_multiple_remote_consumers( - const placer::PlacerSolution &placer_solution, graphlib::Graph *graph, graphlib::Node *producer) -{ - std::set consumer_chip_ids; - for (const Edge &e : graph->user_data_edges(producer)) - { - graphlib::Node *dest_node = graph->node_by_id(e.consumer_node_id); - if (dest_node->node_type() == graphlib::NodeType::kBudaOp) - { - consumer_chip_ids.insert(placer_solution.chip_id(dest_node->name())); - } - } - return consumer_chip_ids.size() > 1; -} - -bool any_consumers_cross_epoch(graphlib::Graph *graph, graphlib::Node *producer) -{ - bool cross_epoch_type_across_all_users = false; - for (const Edge &e : graph->user_data_edges(producer)) - { - 
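The fan-out checks feeds_remote_chips() and feeds_multiple_remote_consumers() reduce to comparing and counting consumer chip ids. A standalone sketch with plain integers standing in for the placer lookups:

// A producer "feeds remote chips" when any consumer sits on another chip,
// and "feeds multiple remote consumers" when its consumers span more than
// one chip id.
#include <cstdint>
#include <iostream>
#include <set>
#include <vector>

int main()
{
    std::uint32_t producer_chip = 0;
    std::vector<std::uint32_t> consumer_chips = {0, 1, 1, 2};  // example placement

    bool feeds_remote = false;
    std::set<std::uint32_t> distinct;
    for (std::uint32_t chip : consumer_chips)
    {
        feeds_remote |= (chip != producer_chip);
        distinct.insert(chip);
    }

    std::cout << std::boolalpha << "feeds remote chips: " << feeds_remote
              << ", multiple consumer chips: " << (distinct.size() > 1) << "\n";
    return 0;
}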
graphlib::Node *dest_node = graph->node_by_id(e.consumer_node_id); - cross_epoch_type_across_all_users |= producer->get_epoch_type() != dest_node->get_epoch_type(); - } - return cross_epoch_type_across_all_users; -} - -// Remove buffering queues connecting cross epoch nodes so that E2E queues can be inserted instead. -void remove_buffering_queues_from_cross_epoch_edges( - graphlib::Graph *graph, const placer::PlacerSolution &placer_solution) -{ - for (graphlib::Node *node : graphlib::topological_sort(*graph)) - { - if (node->node_type() != graphlib::NodeType::kQueue) - continue; - - graphlib::QueueNode *queue = static_cast(node); - - if (queue->queue_type() == graphlib::QueueNodeType::Buffering) - { - Node *source_node = graph->data_operands(queue).back(); - - if (source_node->node_type() == graphlib::NodeType::kBudaOp) - { - std::uint32_t src_temporal_epoch = placer_solution.temporal_epoch_id(source_node->name()); - auto user_edges = graph->user_data_edges(queue); - bool no_users_in_src_epoch = true; - for (std::size_t i = 0; i < user_edges.size(); i++) - { - Node *dest_node = graph->node_by_id(user_edges[i].consumer_node_id); - if (dest_node->node_type() == graphlib::NodeType::kBudaOp) - { - std::uint32_t dest_temporal_epoch = placer_solution.temporal_epoch_id(dest_node->name()); - if (src_temporal_epoch != dest_temporal_epoch) - { - // if this is the last user of the queue and none of the previous users were in the same - // epoch as producer, then all users are connected to source by e2e queue and we should - // remove buffering queue. - bool remove_queue = (i == user_edges.size() - 1 && no_users_in_src_epoch); - connect_queue_src_to_queue_user(graph, queue, user_edges[i], remove_queue); - } - else - { - no_users_in_src_epoch = false; - } - } - } - } - } - } -} - -// Insert a queue between every two ops that are not in the same epoch -void insert_epoch_to_epoch_queues( - graphlib::Graph *graph, - const placer::PlacerSolution &placer_solution, - const std::unordered_set &epoch_types, - const balancer::CutEdges &graph_solver_cut_edges) -{ - bool firmware_looping_enabled = env_as("NUM_EXEC_LOOP_ITERATIONS", 0) > 1; - for (graphlib::Node *node : graphlib::topological_sort(*graph)) - { - if (node->node_type() != graphlib::NodeType::kBudaOp) - continue; - if (epoch_types.find(node->get_epoch_type()) == epoch_types.end()) - continue; - - int added_q_count = 0; - - bool minimize_remote_dram_queues = env_as("PYBUDA_MINIMIZE_REMOTE_DRAM_QUEUES"); - - try - { - std::uint32_t src_temporal_epoch = placer_solution.temporal_epoch_id(node->name()); - - // Only create one e2e queue for each destination epoch - std::unordered_map e2e_queues; - graphlib::QueueNode *e2e_q = nullptr; - graphlib::QueueNode *buf_q = nullptr; - - const bool producer_feeds_multiple_remote_consumers = - feeds_multiple_remote_consumers(placer_solution, graph, node); - const bool producer_feeds_cross_epoch_consumer = any_consumers_cross_epoch(graph, node); - const bool producer_feeds_remote_chip = feeds_remote_chips(placer_solution, graph, node); - - for (Edge e : graph->user_data_edges(node)) - { - bool graph_solver_cut_edge = graph_solver_cut_edges.count(e) > 0; - graphlib::Node *dest_node = graph->node_by_id(e.consumer_node_id); - if (dest_node->node_type() != graphlib::NodeType::kBudaOp) - continue; - if (dest_node->node_type() == graphlib::NodeType::kQueue) - continue; - - try - { - std::uint32_t dest_temporal_epoch = placer_solution.temporal_epoch_id(dest_node->name()); - 
TT_ASSERT(placer_solution.epoch_id_to_subgraph_index.at(src_temporal_epoch) == placer_solution.epoch_id_to_subgraph_index.at(dest_temporal_epoch), "e2e queues across subgraphs not allowed"); - if (src_temporal_epoch > dest_temporal_epoch) - { - log_error( - "Error creating e2e queue (likely an issue with pybuda placer):" - "producer op ({}, {}) is placed in a later epoch than the dest op ({}, {}).", - node->name(), - src_temporal_epoch, - dest_node->name(), - dest_temporal_epoch); - } - - bool should_insert_e2e_queue = src_temporal_epoch != dest_temporal_epoch; - bool should_insert_buffering_queue = graph_solver_cut_edge; - - if (node->as()->is_gradient_op() and node->get_epoch_type() != dest_node->get_epoch_type()) - { - connect_gradient_accum_queue(graph, node, e); - } - else if ( - producer_feeds_multiple_remote_consumers and should_insert_e2e_queue and - not minimize_remote_dram_queues) - { - Edge operand = graph->operand_data_edges(node).back(); - graphlib::UBlockOrder op_ublock_order = graph->get_edge_attributes(operand)->get_ublock_order(); - bool cross_epoch_type = node->get_epoch_type() != dest_node->get_epoch_type(); - - graphlib::QueueNode *e2e_q = nullptr; - auto it = e2e_queues.find(dest_temporal_epoch); - if (it != e2e_queues.end()) - e2e_q = it->second; - - e2e_q = insert_epoch_to_epoch_queue( - graph, - "e2e_" + node->name() + "_" + std::to_string(added_q_count++), - e, - op_ublock_order, - cross_epoch_type, - producer_feeds_remote_chip, - dest_node->get_epoch_type(), - e2e_q); - - e2e_queues[dest_temporal_epoch] = e2e_q; - } - else if (should_insert_e2e_queue) - { - Edge operand = graph->operand_data_edges(node).back(); - graphlib::UBlockOrder op_ublock_order = graph->get_edge_attributes(operand)->get_ublock_order(); - - e2e_q = insert_epoch_to_epoch_queue( - graph, - "e2e_" + node->name() + "_" + std::to_string(added_q_count++), - e, - op_ublock_order, - producer_feeds_cross_epoch_consumer, - producer_feeds_remote_chip, - dest_node->get_epoch_type(), - e2e_q); - } - else if (should_insert_buffering_queue) - { - // For graph solver cut edge add bufferring queue instead but only if E2E queue is not already - // added. - // - Edge operand = graph->operand_data_edges(node).back(); - graphlib::UBlockOrder op_ublock_order = graph->get_edge_attributes(operand)->get_ublock_order(); - - auto q_ptr = insert_buffering_queue( - graph, - "buf_" + node->name() + "_" + std::to_string(added_q_count++), - e, - op_ublock_order, - dest_node->get_epoch_type(), - buf_q); - if (!firmware_looping_enabled) - { - // With FW looping, we duplicate the buffering to avoid a deadlock scenario bug. 
This is a - // workaround otherwise, we reuse the buf_q (this path) - buf_q = q_ptr; - } - } - } - catch (std::out_of_range &e) - { - throw std::runtime_error( - "Placement solution missing for node " + dest_node->name() + - " while inserting epoch_to_epoch queues"); - } - } - } - catch (std::out_of_range &e) - { - throw std::runtime_error( - "Placement solution missing for node " + node->name() + " while inserting epoch_to_epoch queues"); - } - } -} - -void connect_gradient_accum_queue(graphlib::Graph *graph, Node *node, const graphlib::Edge &edge) -{ - // fetch the gradient queue, reconnect to optimizer input and delete old edge - for (Node *user : graph->data_users(node)) - { - if (user->node_type() == graphlib::NodeType::kQueue and user->as()->is_grad_accumulator()) - { - graphlib::Edge q_to_node_edge = - Edge(user->id(), 0, edge.consumer_node_id, edge.consumer_input_port_id, graphlib::EdgeType::kData); - graph->add_edge(q_to_node_edge); - graph->copy_edge_attributes(edge, q_to_node_edge); - graph->remove_edge(edge); - } - } -} - -// Set queue entry sizes based on the configuration for different types of queues -void set_queue_sizes(graphlib::Graph *graph, PostPlacerConfig &config, const placer::PlacerSolution &placer_solution) -{ - for (graphlib::Node *node : graph->nodes()) - { - int size = -1; - if (node->node_type() == graphlib::NodeType::kInput) - { - bool constant = (node->as()->is_constant()); // hacky, fix! - bool optimizer_param = (node->as()->is_optimizer_parameter()); - if ((node->as()->is_parameter()) || constant || optimizer_param) - { - // TODO(jchu): This needs to get updated for repeat structures - size = 1; - } - else - { - size = env_as("PYBUDA_OVERRIDE_INPUT_QUEUE_ENTRIES", config.input_queue_multiplier * graph->get_microbatch()); - } - } - else if (node->node_type() == graphlib::NodeType::kOutput) - { - size = config.output_queue_multiplier * graph->get_microbatch(); - } - else if (node->node_type() == graphlib::NodeType::kQueue) - { - graphlib::QueueNode *qnode = node->as(); - if (qnode->is_grad_accumulator()) - { - size = 1; // TODO: grad accumulator should be a ram - } - else if (qnode->is_epoch_to_epoch()) - { - graphlib::EpochToEpochQueueNode *e2e = qnode->as(); - - if (e2e->is_cross_epoch_type()) - { - // TODO: Check for invalid epoch-to-epoch crossing - size = config.microbatch_size * config.microbatch_count; - } - else - { - // Need to cover the delta between epochs within one loop - std::uint32_t last_epoch_use = get_last_epoch_use(graph, qnode, placer_solution); - std::uint32_t first_epoch_producer = get_first_epoch_producer(graph, qnode, placer_solution); - - TT_LOG_ASSERT( - last_epoch_use >= first_epoch_producer, - "e2e queue: {} is going backwards in epochs.", - qnode->name()); - // Need to cover the delta between chips if set in a pipeline - // TODO: for wormhole, we don't need this much buffering if it's from one - // temporal epoch to another! 
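The per-edge branching in insert_epoch_to_epoch_queues boils down to three cases: gradient ops crossing epoch types reconnect to the existing grad-accumulator queue, edges whose endpoints land in different temporal epochs get an e2e queue, and GraphSolver-cut edges that stay within an epoch get a buffering queue. A condensed sketch of just that decision order (epoch ids and flags passed in directly; the remote-chip fork handling and per-destination-epoch e2e caching are omitted):

enum class QueueKind { None, GradAccum, EpochToEpoch, Buffering };

// Sketch of the decision order only.
QueueKind pick_queue_kind(
    bool producer_is_gradient_op,
    bool crosses_epoch_type,
    int src_temporal_epoch,
    int dest_temporal_epoch,
    bool graph_solver_cut_edge)
{
    if (producer_is_gradient_op && crosses_epoch_type)
        return QueueKind::GradAccum;        // reuse the existing grad accumulator queue
    if (src_temporal_epoch != dest_temporal_epoch)
        return QueueKind::EpochToEpoch;     // producer and consumer run in different epochs
    if (graph_solver_cut_edge)
        return QueueKind::Buffering;        // cut edge within an epoch
    return QueueKind::None;
}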
- try - { - int first_epoch_chip_id = placer_solution.epoch_id_to_chip.at(first_epoch_producer); - int last_epoch_chip_id = placer_solution.epoch_id_to_chip.at(last_epoch_use); - int chip_to_chip_delta = -1; - if (config.enable_cross_chip_buffering) - { - // on WH, the chip id may not be consecutive - chip_to_chip_delta = abs(last_epoch_chip_id - first_epoch_chip_id) + 1; - } - else - { - chip_to_chip_delta = 1; - } - - if (e2e->get_epoch_type() == graphlib::NodeEpochType::Optimizer) - { - size = chip_to_chip_delta; // optimizer always works on one element only - } - else - { - size = config.microbatch_size * chip_to_chip_delta; - } - } - catch (std::out_of_range &e) - { - log_fatal("Not all epochs in chip map"); - } - } - } - else if (qnode->is_buffering()) - { - // Skip. The size is either user-configured or set earlier by some pass - } - else - { - // TODO: what else falls in here? - TT_ASSERT(false); - } - } - - if (size > 0) - { - node->as()->set_num_entries(size); - } - } -} - -std::vector get_consumer_epoch_ids( - const graphlib::Graph *graph, const graphlib::Node *node, const placer::PlacerSolution &placer_solution) -{ - std::vector consumer_epoch_ids; - std::vector users = graph->data_users(node); - try - { - for (Node *user : users) - { - consumer_epoch_ids.push_back(placer_solution.temporal_epoch_id(user->name())); - } - return consumer_epoch_ids; - } - catch (std::out_of_range &e) - { - log_fatal("Placement missing for a user of {}", node->name()); - return {}; - } -} - -std::uint32_t get_last_epoch_use( - const graphlib::Graph *graph, const graphlib::Node *node, const placer::PlacerSolution &placer_solution) -{ - std::vector consumer_epoch_ids = get_consumer_epoch_ids(graph, node, placer_solution); - return *std::max_element(consumer_epoch_ids.begin(), consumer_epoch_ids.end()); -} - -// Return first/last epoch in which this node's output is used -std::uint32_t get_first_epoch_producer( - const graphlib::Graph *graph, const graphlib::Node *node, const placer::PlacerSolution &placer_solution) -{ - std::vector operands = graph->operands(node); - try - { - std::uint32_t min_epoch = placer_solution.temporal_epoch_id(operands[0]->name()); - for (std::size_t i = 1; i < operands.size(); i++) - { - std::uint32_t epoch = placer_solution.temporal_epoch_id(operands[i]->name()); - if (epoch < min_epoch) - min_epoch = epoch; - } - return min_epoch; - } - catch (std::out_of_range &e) - { - log_fatal("Placement missing for an operand of {}", node->name()); - return 0; - } -} - -void validate_multichip_queue_placements( - const PostPlacerConfig &config, const graphlib::Graph *graph, const placer::PlacerSolution &placer_solution) -{ - if (config.device_config.arch_name != "grayskull" or config.device_config.chip_ids.size() == 1) - { - return; - } - - bool pass = true; - for (graphlib::Node *node : graphlib::topological_sort(*graph)) - { - if (node->node_type() != graphlib::NodeType::kQueue) - { - continue; - } - graphlib::QueueNode *queue_node = node->as(); - - if (queue_node->is_grad_accumulator()) - { - Node *gradient_op = graph->data_operands(queue_node).at(0); - if (placer_solution.chip_id(gradient_op->name()) != placer_solution.chip_id(queue_node->name())) - { - log_error( - "Error: gradient op ({}, chip_id: {}) but grad_accum queue ({}, chip_id: {})", - gradient_op->name(), - placer_solution.chip_id(gradient_op->name()), - queue_node->name(), - placer_solution.chip_id(queue_node->name())); - pass = false; - } - } - else if (queue_node->is_epoch_to_epoch()) - { - Node *producer_op = 
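For a same-epoch-type e2e queue, the entry count above works out to microbatch_size times a chip-to-chip delta: abs(last_consumer_chip - first_producer_chip) + 1 when cross-chip buffering is enabled, otherwise 1, with Optimizer-epoch queues sized to the delta alone since the optimizer works on a single element. A standalone restatement with a small worked example (function and parameter names are illustrative, not the pass API):

#include <cstdlib>

// Sketch of the sizing rule for epoch-to-epoch queues that stay within one epoch type.
int e2e_queue_entries(
    int microbatch_size, int first_epoch_chip_id, int last_epoch_chip_id,
    bool enable_cross_chip_buffering, bool is_optimizer_epoch)
{
    int chip_to_chip_delta =
        enable_cross_chip_buffering ? std::abs(last_epoch_chip_id - first_epoch_chip_id) + 1 : 1;
    return is_optimizer_epoch ? chip_to_chip_delta : microbatch_size * chip_to_chip_delta;
}

// Example: microbatch 128, producer epoch on chip 0, consumer epoch on chip 3,
// cross-chip buffering enabled -> 128 * (|3 - 0| + 1) = 512 entries.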
graph->data_operands(queue_node).at(0); - Node *consumer_op = graph->data_users(queue_node).at(0); - int producer_chip_id = placer_solution.chip_id(producer_op->name()); - int consumer_chip_id = placer_solution.chip_id(consumer_op->name()); - - if (producer_op->get_epoch_type() == graphlib::NodeEpochType::Forward and - consumer_op->get_epoch_type() == graphlib::NodeEpochType::Forward) - { - bool valid_dataflow = - (consumer_chip_id == producer_chip_id) || (consumer_chip_id == producer_chip_id + 1); - if (not valid_dataflow) - { - log_error( - "Error: producer op ({}, chip_id: {}) but consumer op({}, chip_id: {})", - producer_op->name(), - placer_solution.chip_id(producer_op->name()), - consumer_op->name(), - placer_solution.chip_id(consumer_op->name())); - pass = false; - } - } - else if ( - producer_op->get_epoch_type() == graphlib::NodeEpochType::Backward and - consumer_op->get_epoch_type() == graphlib::NodeEpochType::Backward) - { - bool valid_dataflow = - (consumer_chip_id == producer_chip_id) || (consumer_chip_id == producer_chip_id - 1); - if (not valid_dataflow) - { - log_error( - "Error: producer op ({}, chip_id: {}) but consumer op({}, chip_id: {})", - producer_op->name(), - placer_solution.chip_id(producer_op->name()), - consumer_op->name(), - placer_solution.chip_id(consumer_op->name())); - pass = false; - } - } - } - } - if (not pass) - { - log_fatal("validate_multichip_queue_placements FAILED"); - } -} - -} // namespace tt - - diff --git a/pybuda/csrc/passes/post_placer_buda_passes.hpp b/pybuda/csrc/passes/post_placer_buda_passes.hpp deleted file mode 100644 index 1df05171d..000000000 --- a/pybuda/csrc/passes/post_placer_buda_passes.hpp +++ /dev/null @@ -1,111 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -// post placer buda passes -// these functions are called from run_post_placer_buda_passes - -#pragma once - -#include "balancer/balancer.hpp" -#include "graph_lib/graph.hpp" -#include "graph_lib/node.hpp" -#include "graph_lib/node_types.hpp" -#include "placer/dram.hpp" -#include "placer/dram_allocator.hpp" -#include "placer/host_memory.hpp" -#include "placer/placer.hpp" - -namespace tt { - -using Graph = graphlib::Graph; -using Node = graphlib::Node; -using NodeContext = graphlib::NodeContext; -using NodeToNodeMap = std::unordered_map; - -struct PostPlacerConfig { - PostPlacerConfig( - DeviceConfig const &device_config, - std::uint32_t microbatch_size, - std::uint32_t microbatch_count, - bool enable_t_streaming, - bool input_queues_on_host, - bool output_queues_on_host, - DramQueueMap manual_dram_queue_placement, - std::uint32_t fork_join_tiles_treshold, - std::uint32_t output_queue_multiplier = 2, - std::uint32_t input_queue_multiplier = 2, - bool enable_cross_chip_buffering = true, - placer::DRAMPlacementAlgorithm placement_algorithm = placer::ROUND_ROBIN) : - device_config(device_config), - dram_placer_config(device_config, input_queues_on_host, output_queues_on_host, manual_dram_queue_placement), - host_memory_placer_config(device_config, input_queues_on_host, output_queues_on_host), - output_queue_multiplier(output_queue_multiplier), - input_queue_multiplier(input_queue_multiplier), - microbatch_size(microbatch_size), - microbatch_count(microbatch_count), - enable_t_streaming(enable_t_streaming), - enable_cross_chip_buffering(enable_cross_chip_buffering), - fork_join_tiles_treshold(fork_join_tiles_treshold), - placement_algorithm(placement_algorithm) - { - } - - DeviceConfig const &device_config; - 
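validate_multichip_queue_placements above encodes the Grayskull pipeline assumption: forward data may only flow to the same chip or the next one, and backward data to the same chip or the previous one. The same check as a standalone predicate (a sketch, with the epoch type reduced to a bool):

// Sketch: is an e2e queue's producer/consumer chip pair legal for a linear Grayskull pipeline?
bool valid_e2e_chip_pair(int producer_chip, int consumer_chip, bool is_forward_epoch)
{
    if (is_forward_epoch)
        return consumer_chip == producer_chip || consumer_chip == producer_chip + 1;  // downstream hop
    return consumer_chip == producer_chip || consumer_chip == producer_chip - 1;      // upstream hop (backward)
}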
placer::DramPlacerConfig dram_placer_config; - placer::HostMemoryPlacerConfig host_memory_placer_config; - - std::uint32_t output_queue_multiplier; - std::uint32_t input_queue_multiplier; - std::uint32_t microbatch_size; - std::uint32_t microbatch_count; - bool enable_t_streaming; - bool enable_cross_chip_buffering; - std::uint32_t fork_join_tiles_treshold; - placer::DRAMPlacementAlgorithm placement_algorithm; -}; - -void set_prologue_queues(Graph *graph, balancer::OpModelMap const &op_model_map); - -void post_placer_lower_buda_attrs( - Graph *graph, DeviceConfig const &device_config, balancer::OpModelMap const &op_model_map); - -void replace_recompute_with_checkpoint(graphlib::Graph *graph, const placer::PlacerSolution &placer_solution); - -void validate_subgraph_placement(Graph *graph, placer::PlacerSolution &placer_solution); - -void reduce_ethernet_stream_usage(PostPlacerConfig& config, graphlib::Graph *graph, balancer::BalancerSolution& balancer_solution, placer::PlacerSolution& placer_solution, tt::DeviceConfig const& device_config); - -// Remove buffering queues connecting cross epoch nodes so that E2E queues can be inserted instead. -void remove_buffering_queues_from_cross_epoch_edges( - graphlib::Graph *graph, - const placer::PlacerSolution &placer_solution -); - -// Insert a queue between every two ops that are not in the same epoch -// or if we have an edge that was cut by a GraphSolver. -// -void insert_epoch_to_epoch_queues( - graphlib::Graph *graph, - const placer::PlacerSolution &placer_solution, - const std::unordered_set &epoch_types, - const balancer::CutEdges &graph_solver_cut_edges); - -// Insert queues between ops on different epochs -void insert_epoch_to_epoch_queue( - graphlib::Graph *graph, - const std::string &name, - graphlib::Edge edge, - graphlib::UBlockOrder op_ublock_order, - bool cross_epoch_type); - -void connect_gradient_accum_queue(graphlib::Graph *graph, Node* node, const graphlib::Edge& edge); - -// Set queue entry sizes based on the configuration for different types of queues -void set_queue_sizes(graphlib::Graph *graph, PostPlacerConfig &config, const placer::PlacerSolution &placer_solution); - -std::vector get_consumer_epoch_ids(const graphlib::Graph *graph, const graphlib::Node *node, const placer::PlacerSolution &placer_solution); -std::uint32_t get_last_epoch_use(const graphlib::Graph *graph, const graphlib::Node *node, const placer::PlacerSolution &placer_solution); -std::uint32_t get_first_epoch_producer(const graphlib::Graph *graph, const graphlib::Node *node, const placer::PlacerSolution &placer_solution); -void validate_multichip_queue_placements(const PostPlacerConfig& config, const graphlib::Graph *graph, const placer::PlacerSolution &placer_solution); -bool any_consumers_cross_epoch(graphlib::Graph *graph, graphlib::Node* producer); -} diff --git a/pybuda/csrc/passes/pre_lowering_passes.cpp b/pybuda/csrc/passes/pre_lowering_passes.cpp index ab464be12..a6c81de8d 100644 --- a/pybuda/csrc/passes/pre_lowering_passes.cpp +++ b/pybuda/csrc/passes/pre_lowering_passes.cpp @@ -38,6 +38,23 @@ void convert_broadcast_ops_to_tms(Graph *graph) } } +void place_inter_subgraph_queues(graphlib::Graph *graph) { + for (Node *n : graph->nodes_by_type(NodeType::kOutput)) { + std::vector consumers = graph->data_users(n); + if (consumers.size() == 0) + continue; + std::vector producers = graph->data_operands(n); + TT_ASSERT(producers.size() == 1); + + std::cout << "removing node: " << n->name() << std::endl; + graph->remove_node(n); + for (Node *consumer : 
consumers) { + std::cout << "adding edge from: " << producers[0]->name() << " to: " << consumer->name() << std::endl; + graph->add_edge(producers[0], consumer); + } + } +} + static void insert_tile_broadcasts( Graph *graph, graphlib::Edge edge, std::vector ignore_dims = {}, bool try_consteval = true) { @@ -424,8 +441,12 @@ void duplicate_embedding_table_if_needed(Graph *graph) } else { - non_embedding_users_table = clone_param(param); - params.push_back(non_embedding_users_table); + // possibly the param is already cloned once and #users decrease, check the condition again + if (graph->data_users(param).size() != 1) + { + non_embedding_users_table = clone_param(param); + params.push_back(non_embedding_users_table); + } } } } @@ -523,7 +544,7 @@ static bool has_fusable_upstream_matmul(graphlib::Graph *graph, graphlib::PyOpNo if (op == nullptr) return false; - while (not (op->is_dense_matmul() || op->is_depthwise_matmul())) + while (not (op->is_dense_matmul() || (op->is_depthwise_matmul() and not requant))) // requant can't be fused to depthwise { if (not (commutable_reshape(op))) { if (not (requant and op->is_tm())) // requant can be commuted through TM diff --git a/pybuda/csrc/passes/pre_lowering_passes.hpp b/pybuda/csrc/passes/pre_lowering_passes.hpp index 0d827c2ea..5d7265b5c 100644 --- a/pybuda/csrc/passes/pre_lowering_passes.hpp +++ b/pybuda/csrc/passes/pre_lowering_passes.hpp @@ -35,6 +35,7 @@ bool safe_to_hoist_past(const Graph *graph, const Node *operand); void fuse_bias(Graph *graph); void fuse_gelu(Graph *graph); void fuse_requantize(Graph *graph); +void place_inter_subgraph_queues(graphlib::Graph *graph); void replace_with_broadcasted_const( Graph *graph, diff --git a/pybuda/csrc/passes/pre_placer_buda_passes.cpp b/pybuda/csrc/passes/pre_placer_buda_passes.cpp index ac0568298..a55998065 100644 --- a/pybuda/csrc/passes/pre_placer_buda_passes.cpp +++ b/pybuda/csrc/passes/pre_placer_buda_passes.cpp @@ -4,20 +4,18 @@ #include "passes/pre_placer_buda_passes.hpp" #include "autograd/binding.hpp" -#include "passes/fuse_ops.hpp" #include "passes/lowering_context.hpp" #include "passes/passes_utils.hpp" -#include "placer/lower_to_placer.hpp" #include "utils/logger.hpp" namespace tt { void lower_to_buffering_queues(Graph* graph) { - vector nodes = graphlib::topological_sort(*graph); + std::vector nodes = graphlib::topological_sort(*graph); for (Node* node : nodes) { if (node->node_type() == NodeType::kBudaOp and node->as()->op_name() == "dram_queue") { int entries = std::get(node->as()->op_attrs().at(0)); - auto buffering_queue = graph->add_node( + graphlib::QueueNode *buffering_queue = graph->add_node( graphlib::create_node("lowered_" + node->name(), entries), graph->get_subgraph_id_for_node(node->id())); graphlib::replace_node(graph, node, buffering_queue, false); @@ -65,15 +63,15 @@ static bool compatible_relu_attrs(BudaOpAttrs const &dst_attrs, BudaOpAttrs cons return true; std::string dst_relu_mode = - (dst_attrs.find("relu_mode") != dst_attrs.end()) ? std::get(dst_attrs.at("relu_mode")) : "min"; + (dst_attrs.find("relu_mode") != dst_attrs.end()) ? std::get(dst_attrs.at("relu_mode")) : "min"; std::string src_relu_mode = - (src_attrs.find("relu_mode") != src_attrs.end()) ? std::get(src_attrs.at("relu_mode")) : "min"; + (src_attrs.find("relu_mode") != src_attrs.end()) ? 
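The newly added place_inter_subgraph_queues walks every output node that still has data users, removes it, and reattaches its single producer directly to each consumer. The same bypass pattern in miniature, over a toy adjacency map (these types are invented for illustration and are not graphlib):

#include <map>
#include <set>
#include <string>

// Toy graph: node name -> set of user names. Sketch of "bypass a pass-through node".
using ToyGraph = std::map<std::string, std::set<std::string>>;

void bypass_node(ToyGraph &g, const std::string &node, const std::string &producer)
{
    // reconnect the producer to every user of `node`, then drop `node`
    for (const std::string &user : g[node]) g[producer].insert(user);
    g[producer].erase(node);
    g.erase(node);
}

// Usage: producer -> output_node -> {consumer_a, consumer_b} becomes
// producer -> {consumer_a, consumer_b} after bypass_node(g, "output_node", "producer").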
std::get(src_attrs.at("relu_mode")) : "min"; return dst_relu_mode == src_relu_mode; } static bool can_hoist_relu(graphlib::Graph *graph, Node *nop) { - vector operand_nodes = graph->data_operands(nop); + std::vector operand_nodes = graph->data_operands(nop); TT_ASSERT(operand_nodes.size() == 1); graphlib::BudaOpNode *producer = dynamic_cast(operand_nodes[0]); @@ -85,9 +83,6 @@ static bool can_hoist_relu(graphlib::Graph *graph, Node *nop) if (nop->as()->is_gradient_op() or producer->is_gradient_op()) return false; - if (producer->is_fused_op()) - return false; - bool producer_forks = graph->data_users(producer).size() > 1; if (producer_forks) return false; @@ -108,7 +103,7 @@ static bool can_hoist_relu(graphlib::Graph *graph, Node *nop) static void hoist_relu(graphlib::Graph *graph, Node *nop) { - vector operand_nodes = graph->data_operands(nop); + std::vector operand_nodes = graph->data_operands(nop); TT_ASSERT(operand_nodes.size() == 1); Node *producer = operand_nodes[0]; @@ -118,7 +113,7 @@ static void hoist_relu(graphlib::Graph *graph, Node *nop) if (producer_attrs.find("relu_threshold") != producer_attrs.end()) { std::string relu_mode = (producer_attrs.find("relu_mode") != producer_attrs.end()) - ? std::get(producer_attrs.at("relu_mode")) + ? std::get(producer_attrs.at("relu_mode")) : "min"; TT_ASSERT(relu_mode == "min" or relu_mode == "max"); float producer_threshold = std::get(producer_attrs.at("relu_threshold")); @@ -166,7 +161,7 @@ void remove_nops(graphlib::Graph *graph) }; // copy into separate vector because of iterator invalidation from removing nodes during iteration - vector nop_nodes = graphlib::topological_sort(*graph, is_nop_node); + std::vector nop_nodes = graphlib::topological_sort(*graph, is_nop_node); for (Node *node : nop_nodes) { @@ -197,7 +192,7 @@ void remove_nops(graphlib::Graph *graph) continue; } - vector operand_edges = graph->operand_data_edges(node); + std::vector operand_edges = graph->operand_data_edges(node); TT_ASSERT(operand_edges.size() == 1); const Edge& producer_to_nop_edge = operand_edges[0]; Node* producer = graph->node_by_id(producer_to_nop_edge.producer_node_id); @@ -405,7 +400,7 @@ void fix_untilized_outputs(graphlib::Graph *graph, const DeviceConfig &device_co { // if multichip-wormhole, we add these untilize-nops indiscriminately for now // and will place these ops on an MMIO-capable device. 
- bool is_multichip_wormhole = device_config.is_wormhole() and device_config.chip_ids.size() > 1; + bool is_multichip_wormhole = device_config.is_wormhole_b0() and device_config.chip_ids.size() > 1; std::unordered_map output_nop_indices; // For the cases where one op feeds multiple outputs for (Node *n: graph->nodes_by_type(graphlib::NodeType::kOutput)) @@ -424,7 +419,7 @@ void fix_untilized_outputs(graphlib::Graph *graph, const DeviceConfig &device_co bool is_reduce_z = (node->op_name() == "reduce") and (std::get(node->buda_attrs().at("dim")) == "z"); bool needs_nop = - node->is_matmul() || is_reduce_z || (graph->data_users(node).size() > 1) || (is_multichip_wormhole) || node->is_fused_op(); + node->is_matmul() || is_reduce_z || (graph->data_users(node).size() > 1) || (is_multichip_wormhole); if (!needs_nop) continue; @@ -488,231 +483,73 @@ void insert_tilize_op_on_input(graphlib::Graph *graph){ } } - -placer::PlacerConfigUpdate schedule_pre_placer_graph( - graphlib::Graph *graph, - DeviceConfig const &device_config, - scheduler::SchedulerConfig const &scheduler_config, - std::vector const &chip_ids, - std::vector> const &op_names_to_chip_break, - std::vector> const &op_names_to_epoch_break, - passes::FractureChipIdAssignments const &fracture_chip_id_assignments, - std::string const &nops_remote_devices_postfix, - bool use_interactive_placer) +void fix_host_inputs(graphlib::Graph *graph) { - scheduler::Schedule scheduled_ops = run_scheduler(scheduler_config, graph); - placer::ChipPlacerConfig chip_placer_config = { - .chip_ids = chip_ids, - .arch_name = device_config.arch_name, - .op_to_epoch_type = placer::lowering::get_op_to_epoch_type_mapping(graph, scheduled_ops), - .ops_tagged_for_chip_id_break = - placer::lowering::tag_ops_for_chip_break(device_config.arch_name, op_names_to_chip_break, scheduled_ops, graph, use_interactive_placer), - .ops_tagged_for_epoch_break = placer::lowering::tag_ops_for_epoch_break( - device_config.arch_name, - op_names_to_epoch_break, - op_names_to_chip_break, - scheduled_ops, - graph, - use_interactive_placer), - .fracture_chip_id_assignments = fracture_chip_id_assignments, - .fwd_to_bwd_nodes = placer::lowering::get_fwd_to_bwd_nodes(graph), - .fwd_to_opt_nodes = placer::lowering::get_fwd_to_opt_nodes(graph, scheduled_ops), - .output_ops = placer::lowering::get_output_nodes(graph), - .chips_with_mmio = device_config.chips_with_mmio, - }; - placer::OpToChipIdAssignment op_to_chip_id_assignment = - get_op_to_chip_id_assignment(chip_placer_config, scheduled_ops); - - // update chip-id assignment, epoch breaking to accommodate fractured ops - std::vector> updated_op_names_to_epoch_break = op_names_to_epoch_break; - if (not fracture_chip_id_assignments.empty()) + // Skip training. + // + if (graph->enable_training()) { - std::tie(op_to_chip_id_assignment, updated_op_names_to_epoch_break) = update_config_for_fractured_ops( - chip_placer_config, op_names_to_epoch_break, scheduled_ops, op_to_chip_id_assignment); + return; } - // updated_op_names_to_epoch_break = update_epoch_breaks_for_partial_datacopy(graph, updated_op_names_to_epoch_break); - if (device_config.is_grayskull() and chip_ids.size() > 1) + for (Node *n : graph->nodes_by_type(graphlib::NodeType::kInput)) { - // we assume any forking chip_ids are ordered in the same way that they will be pipelineed. 
- insert_nops_forking_to_remote_devices(graph, chip_ids, op_to_chip_id_assignment, nops_remote_devices_postfix); - } - // Recalculate shapes - recalculate_shapes(graph); - - calculate_ublock_order(graph); - - // Run TM optimizations, we want minimal TMs so balancer can accurately calculate output buffer requirements - optimize_tms(graph); - - validate_buffering_queues(graph); - - return placer::PlacerConfigUpdate( - op_to_chip_id_assignment, op_names_to_chip_break, updated_op_names_to_epoch_break); -} - -static void insert_nop_fork( - graphlib::Graph *graph, - Node *node, - const std::vector &chip_ids, - const std::unordered_map &chip_id_to_pipeline_index, - const std::map &user_remote_device_ids, - placer::OpToChipIdAssignment &op_to_chip_id_assignment, - std::string const &postfix) - -{ - // case 3: - // process with remote-device ids in sorted order - std::vector users = graph->data_users(node); - for (Node* user : users) { - if (user_remote_device_ids.find(user->name()) == user_remote_device_ids.end()) { - log_fatal("{} is not tagged with a chip_id assignment", user->name()); - } - } - - std::sort(users.begin(), users.end(), [&user_remote_device_ids, &chip_id_to_pipeline_index](const Node* a, const Node* b) { - auto chip_a = user_remote_device_ids.at(a->name()); - auto chip_b = user_remote_device_ids.at(b->name()); - return chip_id_to_pipeline_index.at(chip_a) < chip_id_to_pipeline_index.at(chip_b); - }); - - // from receiver nodes, min receive chip id and max-receive chip id - TT_ASSERT(users.size() >= 2); - std::uint32_t first_user_chip_id = user_remote_device_ids.at(users.front()->name()); - std::uint32_t last_user_chip_id = user_remote_device_ids.at(users.back()->name()); - - int start_pipeline_index = chip_id_to_pipeline_index.at(first_user_chip_id); - int end_pipeline_index = chip_id_to_pipeline_index.at(last_user_chip_id); - std::unordered_map pipeline_index_to_source; - - if (node->get_epoch_type() == graphlib::NodeEpochType::Backward and not graphlib::is_recompute(graph, node)) { - pipeline_index_to_source[end_pipeline_index] = node; - - for (int pipeline_index = end_pipeline_index - 1; pipeline_index >= start_pipeline_index; --pipeline_index) - { - graphlib::BudaOpNode *nop = graph->add_node(graphlib::create_node( - node->name() + "_chip_to_chip_nop_" + std::to_string(pipeline_index) + postfix, "nop"), - graph->get_subgraph_id_for_node(node->id())); - Node* previous_source = pipeline_index_to_source.at(pipeline_index+1); - graph->add_edge(previous_source, nop); - graph->copy_node_attributes(previous_source, nop); - - // If the producer was in backward (or optimizer) epoch, and there are fwd->bwd edges going to it, - // the need to go to the new op, too - for (Edge &e : graph->operand_edges(previous_source)) { - // Adjust control & autograd edges - if ( (e.edge_type == EdgeType::kAutogradFwdToBwd) || - (e.edge_type == EdgeType::kAutogradFwdToOptimizer) || - (e.edge_type == EdgeType::kAutogradFwdToRecompute)) - { - graph->add_edge( - graph->node_by_id(e.producer_node_id), - nop, - e.producer_output_port_id, - 0, - e.edge_type); - } - } - - op_to_chip_id_assignment[nop->name()] = chip_ids.at(pipeline_index + 1); - pipeline_index_to_source[pipeline_index] = nop; - } + graphlib::InputNode *input_node = n->as(); - Node* source_node = node; - for (size_t user_index = 0; user_index < users.size(); ++user_index) + if (is_input_host_queue(true, graph, input_node)) { - Node* user = users.at(user_index); - std::uint32_t remote_chip_id = op_to_chip_id_assignment.at(user->name()); - int 
remote_chip_pipeline_index = chip_id_to_pipeline_index.at(remote_chip_id); - if (remote_chip_pipeline_index == end_pipeline_index) {continue;} - - Node* new_source = pipeline_index_to_source.at(remote_chip_pipeline_index); - - for (auto old_edge : graph->get_edges(source_node, user)) { - if (old_edge.edge_type == graphlib::EdgeType::kData) { - graphlib::Edge new_edge= - Edge( - new_source->id(), - old_edge.producer_output_port_id, - user->id(), - old_edge.consumer_input_port_id, - graphlib::EdgeType::kData); - - - graph->add_edge(new_edge); - graph->copy_edge_attributes(old_edge, new_edge); - graph->remove_edge(old_edge); - } - } - } + std::vector user_data_edges = graph->user_data_edges(input_node); - } else { - pipeline_index_to_source[start_pipeline_index] = node; + // Hitting constraint issues with high forking. Skip for now. + // + if (user_data_edges.size() == 1 or user_data_edges.size() > 4) + continue; - for (int pipeline_index = start_pipeline_index + 1; pipeline_index <= end_pipeline_index; ++pipeline_index) - { - graphlib::BudaOpNode *nop = graph->add_node(graphlib::create_node( - node->name() + "_chip_to_chip_nop_" + std::to_string(pipeline_index) + postfix, "nop"), - graph->get_subgraph_id_for_node(node->id())); - Node* previous_source = pipeline_index_to_source.at(pipeline_index-1); - graph->add_edge(previous_source, nop); - graph->copy_node_attributes(previous_source, nop); - - // If the producer was in backward (or optimizer) epoch, and there are fwd->bwd edges going to it, - // the need to go to the new op, too - for (Edge &e : graph->operand_edges(previous_source)) { - // Adjust control & autograd edges - if ( (e.edge_type == EdgeType::kAutogradFwdToBwd) || - (e.edge_type == EdgeType::kAutogradFwdToOptimizer) || - (e.edge_type == EdgeType::kAutogradFwdToRecompute)) + // Check if attrs are same for all the edges coming out of host. 
+ // + for (std::size_t i = 1; i < user_data_edges.size(); i++) + { + if (!(*graph->get_edge_attributes(user_data_edges[0]) == *graph->get_edge_attributes(user_data_edges[i]))) { - graph->add_edge( - graph->node_by_id(e.producer_node_id), - nop, - e.producer_output_port_id, - 0, - e.edge_type); + return; } } - op_to_chip_id_assignment[nop->name()] = chip_ids.at(pipeline_index - 1); - pipeline_index_to_source[pipeline_index] = nop; - } - Node* source_node = node; - for (size_t user_index = 1; user_index < users.size(); ++user_index) - { - - Node* user = users.at(user_index); - std::uint32_t remote_chip_id = op_to_chip_id_assignment.at(user->name()); - int remote_chip_pipeline_index = chip_id_to_pipeline_index.at(remote_chip_id); - if (remote_chip_pipeline_index == start_pipeline_index) {continue;} + graphlib::BudaOpNode *nop = graph->add_node( + graphlib::create_node(input_node->name() + "_input_buffer_nop", "nop"), + graph->get_subgraph_id_for_node(input_node->id())); + nop->set_shape(input_node->shape()); + graph->copy_node_attributes(input_node, nop); + nop->as()->tag("host_input_buffer"); - Node* new_source = pipeline_index_to_source.at(remote_chip_pipeline_index); + graphlib::Edge new_input_edge = Edge( + input_node->id(), + 0 /* producer_output_port_id*/, + nop->id(), + 0 /* consumer_output_port_id */, + graphlib::EdgeType::kData); - for (auto old_edge : graph->get_edges(source_node, user)) { - if (old_edge.edge_type == graphlib::EdgeType::kData) { - graphlib::Edge new_edge= - Edge( - new_source->id(), - old_edge.producer_output_port_id, - user->id(), - old_edge.consumer_input_port_id, - graphlib::EdgeType::kData); + graph->add_edge(new_input_edge); + graph->copy_edge_attributes(user_data_edges[0], new_input_edge); - - graph->add_edge(new_edge); - graph->copy_edge_attributes(old_edge, new_edge); - graph->remove_edge(old_edge); - } + for (const Edge &old_edge : user_data_edges) + { + graphlib::Edge new_edge = Edge( + nop->id(), + old_edge.producer_output_port_id, + old_edge.consumer_node_id, + old_edge.consumer_input_port_id, + graphlib::EdgeType::kData); + + graph->add_edge(new_edge); + graph->remove_edge(old_edge); } } - } } std::vector> update_epoch_breaks_for_partial_datacopy( - graphlib::Graph *graph, - std::vector> const &op_names_to_epoch_break) + graphlib::Graph *graph, std::vector> const &op_names_to_epoch_break) { std::vector> updated_op_names_to_epoch_break = op_names_to_epoch_break; for (auto node : graph->nodes()) @@ -729,161 +566,9 @@ std::vector> update_epoch_breaks_for_partial_datacopy( } return updated_op_names_to_epoch_break; } -std::pair>> update_config_for_fractured_ops( - const placer::ChipPlacerConfig& config, - std::vector> const &op_names_to_epoch_break, - const std::vector& scheduled_ops, - const placer::OpToChipIdAssignment& op_to_chip_id_assignment) -{ - placer::OpToChipIdAssignment updated_op_to_chip_id_assignment = op_to_chip_id_assignment; - std::vector> updated_op_names_to_epoch_break = op_names_to_epoch_break; - - if (config.arch_name == "grayskull") - { - log_fatal("Multichip fracturing is not supported for grayskull architecture"); - } - else - { - // For Wormhole, we need to add the fractured ops to the epoch break list - // so that placer knows to assign each fractured op to its designated chip and - // guarantee it will be the only op - for (uint32_t i = 0; i < scheduled_ops.size(); ++i) - { - if (auto it = config.fracture_chip_id_assignments.find(scheduled_ops[i]); it != config.fracture_chip_id_assignments.end()) - { - log_debug(LogGraphCompiler, 
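fix_host_inputs, added above, only buffers a host input queue behind a nop when the fork is worth it (between two and four users, per the constraint comment) and when every outgoing edge carries identical attributes, so a single nop can serve them all. That gate restated as a hypothetical helper over already-extracted facts (a simplification: the committed pass returns from the whole function when edge attributes differ, rather than skipping one input):

#include <cstddef>

// Sketch of the gating condition only; the real pass then creates the nop and rewires edges.
bool should_buffer_host_input(std::size_t num_user_edges, bool all_edge_attrs_identical)
{
    if (num_user_edges <= 1 || num_user_edges > 4)  // single user, or fork too wide (constraint issues)
        return false;
    return all_edge_attrs_identical;                // one nop must be able to serve every edge
}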
"Assigning fractured op: {} to chip_id: {}:", scheduled_ops[i], it->second); - updated_op_to_chip_id_assignment[scheduled_ops[i]] = it->second; - } - } - } - return {updated_op_to_chip_id_assignment, updated_op_names_to_epoch_break}; -} -void insert_nops_forking_to_remote_devices( - graphlib::Graph *graph, - const std::vector &chip_ids, - placer::OpToChipIdAssignment &op_to_chip_id_assignment, - std::string const &postfix) +void calculate_ublock_order(graphlib::Graph *graph) { - std::unordered_map chip_id_to_pipeline_index; - for (size_t pipeline_index = 0; pipeline_index < chip_ids.size(); ++pipeline_index) { - chip_id_to_pipeline_index[chip_ids[pipeline_index]] = pipeline_index; - } - - - for (graphlib::Node *node : graphlib::topological_sort(*graph)) - { - // There are three cases we need to check for: - // 1) source input node is a constant -> we will default to replication - // 2) producer feeding a single consumer with a chip-hop-jistance > 1 - // 3) op forking to multiple remote devices - if (graph->data_users(node).size() == 0) { continue; } - - std::map user_remote_device_ids; - std::uint32_t min_chip_id_found = UINT_MAX; - std::uint32_t max_chip_id_found = 0; - for (Node *user : graph->data_users(node)) { - if (op_to_chip_id_assignment.find(user->name()) != op_to_chip_id_assignment.end()) { - std::uint32_t user_chip_id = op_to_chip_id_assignment.at(user->name()); - user_remote_device_ids.emplace(user->name(), user_chip_id); - min_chip_id_found = std::min(min_chip_id_found, user_chip_id); - max_chip_id_found = std::max(max_chip_id_found, user_chip_id); - } - } - - // NB: this is to correspond to the fwd/bwd data-flow across the chips for grayskull pipelining - std::uint32_t source_chip_id = op_to_chip_id_assignment.find(node->name()) != op_to_chip_id_assignment.end() - ? op_to_chip_id_assignment.at(node->name()) - : node->get_epoch_type() == graphlib::NodeEpochType::Backward ? 
max_chip_id_found : min_chip_id_found; - std::uint32_t source_pipeline_index = chip_id_to_pipeline_index.at(source_chip_id); - - int num_remote_consumers_with_more_one_hop = 0; - int num_remote_consumers = 0; - bool is_fork = (graph->data_users(node).size() > 1); - - for (Node *user : graph->data_users(node)) { - if (auto it = op_to_chip_id_assignment.find(user->name()); it != op_to_chip_id_assignment.end()) { - std::uint32_t dest_pipeline_index = chip_id_to_pipeline_index.at(it->second); - if (dest_pipeline_index - source_pipeline_index > 1) { - num_remote_consumers_with_more_one_hop += 1; - } - if (source_pipeline_index != dest_pipeline_index) { - num_remote_consumers += 1; - } - } - } - - if (node->node_type() == graphlib::NodeType::kInput and node->as()->is_constant()) - { - // case 1: - std::vector user_data_edges = graph->user_data_edges(node); - - for (size_t user_edge_index = 0; user_edge_index < user_data_edges.size(); ++user_edge_index) { - const Edge& user_edge = user_data_edges[user_edge_index]; - - Node* user = graph->node_by_id(user_edge.consumer_node_id); - if (auto it = op_to_chip_id_assignment.find(user->name()); it != op_to_chip_id_assignment.end() and it->second != source_chip_id) { - - auto cloned_constant_up = node->clone(); - cloned_constant_up->set_name(node->name() + "_" + std::to_string(user_edge_index) + postfix); - Node* cloned_constant = graph->add_node( - std::move(cloned_constant_up), graph->get_subgraph_id_for_node(node->id())); - - graphlib::Edge cloned_constant_to_remote_consumer = - Edge( - cloned_constant->id(), - 0, - user->id(), - user_edge.consumer_input_port_id, - graphlib::EdgeType::kData); - - graph->add_edge(cloned_constant_to_remote_consumer); - graph->copy_edge_attributes(user_edge, cloned_constant_to_remote_consumer); - graph->remove_edge(user_edge); - } - } - } - else if (num_remote_consumers_with_more_one_hop > 0 and not is_fork) { - // case 2: - Node* source_node = node; - Node* user = graph->data_users(node).at(0); - std::uint32_t remote_chip_id = op_to_chip_id_assignment.at(user->name()); - std::uint32_t start_pipeline_index = chip_id_to_pipeline_index.at(source_chip_id) + 1; - std::uint32_t end_pipeline_index = chip_id_to_pipeline_index.at(remote_chip_id); - - for (std::uint32_t pipeline_index = start_pipeline_index; pipeline_index < end_pipeline_index; ++pipeline_index) - { - graphlib::BudaOpNode *nop = graph->add_node(graphlib::create_node( - node->name() + "_chip_to_chip_nop_" + std::to_string(pipeline_index) + postfix, "nop"), - graph->get_subgraph_id_for_node(node->id())); - graph->copy_node_attributes(source_node, nop); - op_to_chip_id_assignment[nop->name()] = chip_ids.at(pipeline_index); - - for (auto user_edge : graph->get_edges(source_node, user)) { - if (user_edge.edge_type == graphlib::EdgeType::kData) { - graphlib::insert_node_on_edge(graph, user_edge, nop); - } - } - source_node = nop; - source_chip_id = chip_ids.at(pipeline_index); - } - - } - else if (is_fork and num_remote_consumers > 0) - { - insert_nop_fork( - graph, - node, - chip_ids, - chip_id_to_pipeline_index, - user_remote_device_ids, - op_to_chip_id_assignment, - postfix); - } - } -} - -void calculate_ublock_order(graphlib::Graph *graph) { auto eval_module = py::module_::import("pybuda.op.eval.buda"); py::function pybuda_input_ublock_order = eval_module.attr("get_f_pybuda_input_ublock_order"); @@ -1053,6 +738,42 @@ void calculate_ublock_order(graphlib::Graph *graph) { } } + // Set linking on subgraphs + for (Node * node : graph->nodes()) + { + auto 
is_subgraph_link_edge = [](Edge e) { + return (e.edge_type == graphlib::EdgeType::kSubgraphLink); + }; + std::vector subgraph_link_edge = graph->operand_edges(node, is_subgraph_link_edge); + + if (subgraph_link_edge.empty()) + continue; + + // Fetch producers' edges because they are the kData edges that have ublock order + auto output_node = graph->node_by_id(subgraph_link_edge[0].producer_node_id); + TT_ASSERT(graph->operand_edges(output_node).size() == 1, "Output node should only have 1 producer"); + auto producer_ublock_order = graph->get_edge_attributes(graph->operand_edges(output_node)[0])->get_ublock_order(); + + // Consumer should have the same ublock order as producer + for (auto e : graph->user_data_edges(node)) { + auto edge_attrs = graph->get_edge_attributes(e); + if (edge_attrs->get_ublock_order() != producer_ublock_order) { + // Insert NOP to transpose ublock order + auto nop = graph->add_node(graphlib::create_node( + node->name() + "_subgraph_link_nop", "nop"), + graph->get_subgraph_id_for_node(node->id())); + auto original_ublock_order = edge_attrs->get_ublock_order(); + + auto [new_edge0, new_edge1] = graphlib::insert_node_on_edge(graph, e, nop); + graph->get_edge_attributes(new_edge0)->set_ublock_order(producer_ublock_order); + graph->get_edge_attributes(new_edge1)->set_ublock_order(original_ublock_order); + graphlib::calculate_and_set_node_shape(graph, nop); + } + } + } + + + #ifdef DEBUG // // Assert that @@ -1428,6 +1149,7 @@ std::unique_ptr lower_to_buda_ops(Graph *graph) auto new_graph = std::make_unique(graphlib::IRLevel::IR_BUDA, graph->name()); new_graph->set_microbatch(graph->get_microbatch()); + new_graph->set_enable_training(graph->enable_training()); // Mapping of old nodes to new ones. Where the old node maps to multiple new ones, // the output node is recorded as "new", because it will be used as operand into @@ -1488,12 +1210,6 @@ static Node* create_recompute_op(graphlib::Graph* graph, Node* fwd_node, std::un recompute_node->set_accumulate_df(fwd_node->as()->accumulate_df()); recompute_node->set_epoch_type(graphlib::NodeEpochType::Backward); - if (fwd_node->as()->is_fused_op()) - { - std::shared_ptr fused_op_clone = fwd_node->as()->get_fused_op()->clone(recompute_node); - recompute_node->set_fused_op(fused_op_clone); - } - std::unordered_map producer_remap; // For sparse matmul we much duplicate the constant inputs, these need to be unique per instance @@ -1656,17 +1372,12 @@ void insert_user_defined_queues( consumer_name, input_port_id); - auto name = "insert_queue" + std::to_string(id++); // adding queue that has num_entries equal to microbatch size. This is enough to guarantee that queue will never // be full. 
microbatch size is the highest num_entires one buffering queue will ever need, since it is inside // one epoch - auto *q = graph->add_node( - graphlib::create_node(name, graph->get_microbatch() * 2), - graph->get_subgraph_id_for_node(producer->id())); - q->as()->tag("inserted_queue"); - q->set_shape(producer->shape()); - q->set_output_df(producer->output_df()); - q->set_epoch_type(consumer->get_epoch_type()); + std::string name = "insert_queue" + std::to_string(id++); + graphlib::QueueNode *queue_node = graphlib::create_buffering_queue(graph, producer, name, graph->get_microbatch()); + queue_node->as()->tag("inserted_queue"); auto ublock_order = graph->get_edge_attributes(*match)->get_ublock_order(); bool inherit_consumer_attrs = true; @@ -1674,7 +1385,7 @@ void insert_user_defined_queues( std::uint32_t consumer_index = 0; bool place_tms_on_outgoing = true; auto [producer_edge, consumer_edge] = insert_node_on_edge( - graph, *match, q, inherit_consumer_attrs, remove_edge, consumer_index, place_tms_on_outgoing); + graph, *match, queue_node, inherit_consumer_attrs, remove_edge, consumer_index, place_tms_on_outgoing); graph->get_edge_attributes(producer_edge)->set_ublock_order(ublock_order); } } diff --git a/pybuda/csrc/passes/pre_placer_buda_passes.hpp b/pybuda/csrc/passes/pre_placer_buda_passes.hpp index e4baca0bd..a703e4e9c 100644 --- a/pybuda/csrc/passes/pre_placer_buda_passes.hpp +++ b/pybuda/csrc/passes/pre_placer_buda_passes.hpp @@ -11,8 +11,6 @@ #include "graph_lib/node_types.hpp" #include "lower_to_buda/common.hpp" #include "passes/fracture.hpp" -#include "placer/chip_id_assignment.hpp" -#include "scheduler/scheduler.hpp" namespace tt { @@ -43,39 +41,16 @@ void sanitize_past_cache_ios(graphlib::Graph *graph); void fix_untilized_outputs(graphlib::Graph *graph, const DeviceConfig &device_config); +void fix_host_inputs(graphlib::Graph *graph); + void replace_buffers_with_nops(graphlib::Graph *graph); void insert_nop_on_matmul_input(graphlib::Graph *graph); void insert_tilize_op_on_input(graphlib::Graph *graph); -placer::PlacerConfigUpdate schedule_pre_placer_graph( - graphlib::Graph *graph, - DeviceConfig const &device_config, - scheduler::SchedulerConfig const &scheduler_config, - std::vector const &chip_ids, - std::vector> const &op_names_to_chip_break, - std::vector> const &op_names_to_epoch_break, - passes::FractureChipIdAssignments const &fracture_chip_id_assignments, - std::string const &nops_remote_devices_postfix = "", - bool use_interactive_placer = true); - -std::pair>> -update_config_for_fractured_ops( - const placer::ChipPlacerConfig& config, - std::vector> const &op_names_to_epoch_break, - const std::vector& scheduled_ops, - const placer::OpToChipIdAssignment& op_to_chip_id_assignment); - std::vector> update_epoch_breaks_for_partial_datacopy( - graphlib::Graph *graph, - std::vector> const &op_names_to_epoch_break); - -void insert_nops_forking_to_remote_devices( - graphlib::Graph *graph, - const std::vector &chip_ids, - placer::OpToChipIdAssignment &op_to_chip_id_assignment, - std::string const &postfix = ""); + graphlib::Graph *graph, std::vector> const &op_names_to_epoch_break); void calculate_ublock_order(graphlib::Graph *graph); diff --git a/pybuda/csrc/passes/reproduce_subgraph.cpp b/pybuda/csrc/passes/reproduce_subgraph.cpp deleted file mode 100644 index ebeb30153..000000000 --- a/pybuda/csrc/passes/reproduce_subgraph.cpp +++ /dev/null @@ -1,220 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include 
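The subgraph-link handling added to calculate_ublock_order a few hunks above keeps producer and consumer agreeing on ublock order across a subgraph boundary, falling back to a nop whose input edge takes the producer's order and whose output edge restores the consumer's original order. The core of that reconciliation, sketched over plain enums rather than graphlib types:

enum class UBlockOrder { R, C };

struct NopFix
{
    bool insert_nop;
    UBlockOrder nop_input_order;   // edge into the nop follows the producer
    UBlockOrder nop_output_order;  // edge out of the nop keeps the consumer's original order
};

// Sketch: decide whether a subgraph-link consumer edge needs an order-converting nop.
NopFix reconcile_ublock_order(UBlockOrder producer_order, UBlockOrder consumer_edge_order)
{
    if (consumer_edge_order == producer_order)
        return {false, producer_order, producer_order};  // already consistent, nothing to insert
    return {true, producer_order, consumer_edge_order};
}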
"passes/reproduce_subgraph.hpp" - -#include - -#include "graph_lib/node_types.hpp" -#include "graph_lib/utils.hpp" -#include "utils/logger.hpp" -#include "python_bindings_common.hpp" - -#include "balancer/types.hpp" -#include "passes/consteval.hpp" - -namespace tt::passes -{ - - -static std::vector find_path_to_node( - graphlib::Graph *graph, - graphlib::OpNode *initial_op, - graphlib::OpNode *final_op, - graphlib::OpNode *from = nullptr) -{ - std::vector path; - - graphlib::OpNode *iter = from ? from : initial_op; - - bool found_op = false; - while (not found_op) - { - graphlib::OpNode *op = dynamic_cast(iter); - TT_ASSERT(op); - path.push_back(op); - - std::vector users = graph->data_users(op); - for (std::size_t i = 1; i < users.size(); ++i) - { - graphlib::OpNode *user = dynamic_cast(users[i]); - auto fork_path = find_path_to_node(graph, initial_op, final_op, user); - if (not fork_path.empty()) - { - path.insert(path.end(), fork_path.begin(), fork_path.end()); - found_op = true; - break; - } - } - - if(op->id() == final_op->id()) - { - found_op = true; - break; - } - - TT_ASSERT(users.size() > 0); - graphlib::OutputNode *output = dynamic_cast(users[0]); - if (output) - break; - - iter = dynamic_cast(users[0]); - if (not iter) - break; - } - - if (not found_op) - path.clear(); - - return path; -} - - - -void reproduce_subgraph( - graphlib::Graph *graph, - std::string input_name, - std::string output_name, - std::unordered_map intermediates, - std::shared_ptr balancer_solution, - placer::PlacerSolution *placer_solution) -{ - std::vector nodes_to_keep; - graphlib::OpNode *input_node; - graphlib::OpNode *output_node; - - graphlib::Node *last_node = nullptr; - TT_ASSERT(input_name.length() > 0, "No input node provided"); - TT_ASSERT(output_name.length() > 0, "No output node provided"); - - TT_ASSERT(graph->get_node_by_name(input_name), "Input node not found"); - TT_ASSERT(graph->get_node_by_name(output_name), "Output node not found"); - input_node = dynamic_cast(graph->get_node_by_name(input_name)); - output_node = dynamic_cast(graph->get_node_by_name(output_name)); - - nodes_to_keep = find_path_to_node(graph, dynamic_cast(input_node), dynamic_cast(output_node)); - TT_ASSERT(nodes_to_keep.size() > 0, "No path found between input and output nodes"); - last_node = nodes_to_keep.back(); - std::vector needed_inputs; - for (auto node : nodes_to_keep) - { - std::vector operand_data_edges = graph->operand_data_edges(node); - int i = 0; - for (auto operand_data_edge : operand_data_edges) - { - auto operand = graph->node_by_id(operand_data_edge.producer_node_id); - if (std::find(nodes_to_keep.begin(), nodes_to_keep.end(), operand) != nodes_to_keep.end()) - continue; - // for first operand that's not an input node, connect it to the graph input, with a runtime transform - // for subsequent nodes, create a constant input node - if (not dynamic_cast(operand)) - { - if (dynamic_cast(operand)) - { - operand = graph->data_operands(operand)[0]; - } - TT_ASSERT(dynamic_cast(operand), "Something went wrong"); - graphlib::Node *input; - if (i == 0) - { - input = graph->ordered_module_inputs()[0]; - graphlib::InputNode *in_node = dynamic_cast(input); - graphlib::RuntimeTensorTransform runtime_tensor_transform {}; - runtime_tensor_transform.set_constant_input_tensor(intermediates[operand->name()]); - in_node->set_runtime_tensor_transform(runtime_tensor_transform); - input->set_shape(operand->shape()); - for (auto edge : graph->user_data_edges(input)) - { - graph->remove_edge(edge); - } - balancer::BlockShape 
block_shape = balancer_solution->op_models[operand->name()].output_buffers[0].block_shape; - block_shape.mblock_m *= balancer_solution->op_models[operand->name()].grid_shape.r; - block_shape.mblock_n *= balancer_solution->op_models[operand->name()].grid_shape.c; - balancer_solution->block_shapes[input->name()] = block_shape; - - placer_solution->name_to_queue_placement.erase(input->name()); - i++; - } - else - { - py::object constant_value = intermediates[operand->name()]; - input = graph->add_node(graphlib::create_node( - operand->name() + "_constant_input_" + std::to_string(i++), - make_shared_py_object(constant_value), - operand->shape()), graph->get_subgraph_id_for_node(node->id())); - - tt::balancer::GridShape grid_shape = tt::balancer::GridShape(1, 1); - tt::balancer::BlockShape block_shape = tt::balancer::BlockShape(operand->shape(), grid_shape.r, grid_shape.c, 1, tt::balancer::UBlockShape(1, 1)); - tt::balancer::BufferModel input_buffer_model = tt::balancer::BufferModel(block_shape, 1, input->output_df()); - - tt::balancer::OpModel input_op_model; - input_op_model.grid_shape = grid_shape; - input_op_model.op_shape.outputs.push_back(operand->shape()); - input_op_model.output_buffers.push_back(input_buffer_model); - input_op_model.data_format = input->output_df(); - input_op_model.input_prologue = false; - - balancer_solution->op_models[node->name()] = input_op_model; - balancer_solution->block_shapes[node->name()] = block_shape; - placer_solution->input_queue_to_grid_shape.insert( - {node->name(), - tt::placer::GridShape((std::uint32_t)grid_shape.r, (std::uint32_t)grid_shape.c)}); - } - graphlib::Edge new_edge(operand_data_edge); - new_edge.producer_node_id = input->id(); - graph->add_edge(new_edge); - graph->copy_edge_attributes(operand_data_edge, new_edge); - graph->remove_edge(operand_data_edge); - needed_inputs.push_back(input); - } - else - { - needed_inputs.push_back(operand); - } - } - } - nodes_to_keep.insert(nodes_to_keep.end(), needed_inputs.begin(), needed_inputs.end()); - for (graphlib::Node *node : graph->ordered_module_inputs()) - nodes_to_keep.push_back(node); - - int i = 0; - for (graphlib::Node *node : graph->ordered_module_outputs()) - { - nodes_to_keep.push_back(node); - if (i++ == 0) - { - placer_solution->name_to_queue_placement.erase(node->name()); - graph->add_edge(last_node, node, graphlib::EdgeType::kData); - graph->set_output_node_redirected(true); - graphlib::OutputNode *out_node = dynamic_cast(node); - out_node->set_runtime_tensor_transform(graphlib::RuntimeTensorTransform()); - out_node->set_untilize(false); - node->set_shape(last_node->shape()); - - balancer::BlockShape block_shape = balancer_solution->op_models[last_node->name()].output_buffers[0].block_shape; - balancer_solution->block_shapes[out_node->name()] = block_shape; - } - } - - for (auto node : nodes_to_keep) - { - placer_solution->name_to_op_placement[node->name()].global_epoch_id = 0; - } - for (unsigned int i = 1; i < placer_solution->num_epochs; i++) - { - placer_solution->epoch_id_to_chip.erase(i); - placer_solution->epoch_id_to_op_placement.erase(i); - placer_solution->epoch_id_to_epoch_info.erase(i); - } - placer_solution->num_epochs = 1; - - for (auto node : graphlib::topological_sort(*graph)) - { - if (std::find(nodes_to_keep.begin(), nodes_to_keep.end(), node) == nodes_to_keep.end()) - { - graph->remove_node(node); - } - } - -} -} // namespace tt::passes diff --git a/pybuda/csrc/passes/reproduce_subgraph.hpp b/pybuda/csrc/passes/reproduce_subgraph.hpp deleted file mode 100644 index 
0a3958999..000000000 --- a/pybuda/csrc/passes/reproduce_subgraph.hpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include -#include -#include - -#include "balancer/balancer.hpp" - -namespace py = pybind11; - -namespace tt::graphlib -{ -class Graph; -} - -namespace tt::passes -{ -// -void reproduce_subgraph( - graphlib::Graph *graph, - std::string input_name, - std::string output_name, - std::unordered_map intermediates, - std::shared_ptr balancer_solution, - placer::PlacerSolution *placer_solution); -} diff --git a/pybuda/csrc/passes/t_stream.cpp b/pybuda/csrc/passes/t_stream.cpp deleted file mode 100644 index 0c4e706d9..000000000 --- a/pybuda/csrc/passes/t_stream.cpp +++ /dev/null @@ -1,509 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "t_stream.hpp" - -#include "autograd/binding.hpp" -#include "graph_lib/node_types.hpp" - -namespace tt -{ -using namespace balancer; - -static char opposite_dir(char dir) { return (dir == 'h') ? 'v' : 'h'; } -static std::vector::iterator append_tm(std::vector& tms, graphlib::OpType const& tm) -{ - return tms.insert(tms.end(), tm); -} -static std::vector::iterator prepend_tm(std::vector& tms, graphlib::OpType const& tm, bool after_transpose = true) -{ - auto iter = tms.begin(); - if (after_transpose) - { - // We must prepend after transpose tms since the calculations below use streaming dir post-transpose - iter = std::find_if(tms.begin(), tms.end(), [](OpType const& op) { return op.op == "transpose"; }); - iter = (iter == tms.end()) ? tms.begin() : iter + 1; - } - return tms.insert(iter, tm); -} - -static void insert_t_stream_stack_slice( - std::vector& tms, - std::string dir, - int consumer_factor, - int producer_factor, - bool after_transpose = true, - bool producer_z_major = false, - int group = 1) -{ - if (consumer_factor == producer_factor) - return; - - TT_ASSERT(dir.size() == 1 and (dir[0] == 'h' or dir[0] == 'v')); - - int factor; - std::string op_name; - if (consumer_factor > producer_factor) - { - TT_ASSERT((consumer_factor % producer_factor) == 0); - factor = consumer_factor / producer_factor; - op_name = "slice"; - } - else - { - TT_ASSERT((producer_factor % consumer_factor) == 0); - factor = producer_factor / consumer_factor; - op_name = "stack"; - } - - std::vector::iterator iter; - graphlib::OpType op_type((dir + op_name), {factor}); - if (consumer_factor > producer_factor or producer_z_major) - { - if (tms.size() > 0 && (*tms.rbegin()).op == "buda_pad" && op_name == "stack") - { - OpType buda_pad = *tms.rbegin(); - tms.pop_back(); - tms.push_back(op_type); - iter = tms.insert(tms.end(), buda_pad); - } - else - iter = append_tm(tms, op_type); - } - else - { - iter = prepend_tm(tms, op_type, after_transpose); - } - - if (group > 1) - { - auto group_dir = opposite_dir(dir[0]); - graphlib::OpType group_op_type((group_dir + std::string("stack")), {group}); - iter = tms.insert(iter, group_op_type); - graphlib::OpType ungroup_op_type((group_dir + std::string("slice")), {group}); - tms.insert(iter + 2, ungroup_op_type); - } -} - -static void insert_t_stream_default_tms( - std::vector& tms, - TStreamFactor consumer_factor, - TStreamFactor producer_factor, - int group = 1, - bool after_transpose = true) -{ - TT_ASSERT(not consumer_factor.none() or not producer_factor.none(), producer_factor, consumer_factor); - - TStreamDir dir = consumer_factor.none() ? 
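In the deleted insert_t_stream_stack_slice above, the choice between slicing and stacking is purely a ratio test: a larger consumer factor slices the producer by consumer/producer, a smaller one stacks by producer/consumer, and equal factors insert nothing. A standalone restatement of that selection (divisibility is asserted, as the pass does):

#include <cassert>
#include <string>
#include <utility>

// Sketch: pick the t-stream TM ("hslice"/"vslice" vs "hstack"/"vstack") and its factor
// for one direction, given consumer and producer streaming factors.
std::pair<std::string, int> pick_stream_tm(char dir, int consumer_factor, int producer_factor)
{
    assert(dir == 'h' || dir == 'v');
    if (consumer_factor == producer_factor)
        return {"", 1};  // nothing to insert
    if (consumer_factor > producer_factor)
    {
        assert(consumer_factor % producer_factor == 0);
        return {std::string(1, dir) + "slice", consumer_factor / producer_factor};
    }
    assert(producer_factor % consumer_factor == 0);
    return {std::string(1, dir) + "stack", producer_factor / consumer_factor};
}

// Example: consumer streams 8 in 'v', producer streams 2 -> {"vslice", 4}.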
producer_factor.dir : consumer_factor.dir; - if (dir.r()) - { - insert_t_stream_stack_slice( - tms, "v", consumer_factor.r, producer_factor.r, after_transpose, producer_factor.dir.z_major(), group); - insert_t_stream_stack_slice(tms, "h", consumer_factor.c, producer_factor.c, after_transpose); - } - else - { - insert_t_stream_stack_slice( - tms, "h", consumer_factor.c, producer_factor.c, after_transpose, producer_factor.dir.z_major(), group); - insert_t_stream_stack_slice(tms, "v", consumer_factor.r, producer_factor.r, after_transpose); - } -} - -void insert_t_stream_tms_for_eltwise( - std::vector& tms, TStreamFactor consumer_factor, TStreamFactor producer_factor) -{ - return insert_t_stream_default_tms(tms, consumer_factor, producer_factor); -} - -static void insert_t_stream_tms_for_matmul( - std::vector& tms, TStreamFactor consumer_factor, TStreamFactor producer_factor, int operand_idx) -{ - if (consumer_factor.none()) - return insert_t_stream_default_tms(tms, consumer_factor, producer_factor); - - if (consumer_factor.dir.r()) - { - if (operand_idx == 0) - { - insert_t_stream_default_tms(tms, consumer_factor, producer_factor); - } - else if (operand_idx == 1) - { - // If matmul R streaming RHS must be fully buffered so unstream the RHS - // This has been proved to be safe to do via constraints - if (producer_factor.is_streaming()) - insert_t_stream_default_tms(tms, TStreamFactor{}, producer_factor); - graphlib::OpType broadcast("broadcast", {3, consumer_factor.r}); - append_tm(tms, broadcast); - insert_t_stream_stack_slice(tms, "h", consumer_factor.r, 1); - } - else - { - // Fused bias - insert_t_stream_default_tms(tms, consumer_factor, producer_factor); - } - } - else - { - if (operand_idx == 0) - { - // If matmul C streaming LHS must be fully buffered so unstream the LHS - // This has been proved to be safe to do via constraints - if (producer_factor.is_streaming()) - insert_t_stream_default_tms(tms, TStreamFactor{}, producer_factor); - graphlib::OpType broadcast("broadcast", {2, consumer_factor.c}); - append_tm(tms, broadcast); - insert_t_stream_stack_slice(tms, "v", consumer_factor.c, 1); - } - else if (operand_idx == 1) - { - insert_t_stream_default_tms(tms, consumer_factor, producer_factor); - } - else - { - // Fused bias - insert_t_stream_default_tms(tms, consumer_factor, producer_factor); - } - } -} - -static void insert_t_stream_tms_for_sparse_matmul( - std::vector& tms, TStreamFactor consumer_factor, TStreamFactor producer_factor, int operand_idx) -{ - if (operand_idx == 1) - { - TT_ASSERT(not consumer_factor.none() or not producer_factor.none()); - if (producer_factor.dir.r()) - { - insert_t_stream_stack_slice(tms, "v", 1, producer_factor.r); - } - - if (producer_factor.dir.c() or consumer_factor.dir.c()) - { - insert_t_stream_stack_slice(tms, "h", consumer_factor.c, producer_factor.c); - } - } -} - -static void insert_t_stream_tms_for_op( - graphlib::OpNode const* op_node, - std::vector& tms, - TStreamFactor consumer_factor, - TStreamFactor producer_factor, - int operand_idx, - int group = 1) -{ - TT_ASSERT(op_node); - if (consumer_factor.none() and producer_factor.none()) - return; - - if (op_node->is_sparse_matmul()) - { - TT_ASSERT(group == 1, "unsupported"); - return insert_t_stream_tms_for_sparse_matmul(tms, consumer_factor, producer_factor, operand_idx); - } - else if (op_node->is_matmul()) - { - TT_ASSERT(group == 1, "unsupported"); - return insert_t_stream_tms_for_matmul(tms, consumer_factor, producer_factor, operand_idx); - } - else - { - return 
insert_t_stream_default_tms(tms, consumer_factor, producer_factor, group); - } -} - -static void assert_t_stream_factors( - graphlib::OpNode const* consumer, - TStreamFactor producer_factor, - TStreamFactor consumer_factor, - int group, - bool consumes_rz_major) -{ - bool r_slicing = (consumer_factor.r > producer_factor.r); - bool c_slicing = (consumer_factor.c > producer_factor.c); - bool r_stacking = (consumer_factor.r < producer_factor.r); - bool c_stacking = (consumer_factor.c < producer_factor.c); - bool slicing = r_slicing or c_slicing; - bool stacking = r_stacking or c_stacking; - bool eq = (consumer_factor.r == producer_factor.r and consumer_factor.c == producer_factor.c); - TT_ASSERT( - producer_factor.compatible_consumer( - consumer_factor, consumer->is_sparse_matmul(), (group > 1) and consumes_rz_major), - consumer->name(), - producer_factor, - consumer_factor, - consumer->is_sparse_matmul(), - group, - consumes_rz_major); - TT_LOG_ASSERT( - (slicing != stacking) or eq, - "Illegal combination of slicing/stacking: {}\n{}\n{}", - consumer->name(), - producer_factor, - consumer_factor); -} - -static void lower_broadcast_z(graphlib::Shape shape, std::vector& tms) -{ - auto match = std::find_if( - tms.begin(), - tms.end(), - [&shape](auto const& tm) { return shape.z() > 1 and tm.op == "broadcast" and std::get(tm.attr[0]) == 1; }); - - if (match == tms.end()) - return; - - // net2pipe doesn't support Z bcast when t>1 so if we're streaming, turn Z bcast into C broadcast and hslice - OpType broadcast = *match; - std::get(broadcast.attr[0]) = 3; - int factor = std::get(broadcast.attr[1]); - graphlib::OpType hslice("hslice", {factor}); - - auto insert_pos = tms.erase(match); - insert_pos = tms.insert(insert_pos, hslice); - insert_pos = tms.insert(insert_pos, broadcast); -} - -static void consteval_t_stream_shape_for_loopback(graphlib::Graph* graph, graphlib::InputNode* loopback_queue, TStreamFactor producer_factor) -{ - if (producer_factor.none()) - return; - - std::vector tms; - insert_t_stream_default_tms(tms, producer_factor, TStreamFactor()); - - graphlib::ConstEvalGraph* consteval_graph = loopback_queue->get_consteval_graph(graph, true, true); - - consteval_graph->pad_output_to_buda_dims("t_stream"); - - graphlib::Shape current_shape = consteval_graph->get_output()->shape(); - for (graphlib::OpType const& op_type : tms) - { - std::vector input_shapes = {current_shape}; - auto [shape, bcast_dims] = ::get_op_shape(op_type, input_shapes, false); - auto tm = graphlib::create_node(op_type.op + "_" + loopback_queue->name(), op_type); - tm->set_shape(shape); - tm->set_epoch_type(loopback_queue->get_epoch_type()); - tm->set_output_df(loopback_queue->output_df()); - consteval_graph->promote_node(std::move(tm)); - current_shape = shape; - } - - // This is for loopback so set needs autograd to true - consteval_graph->set_needs_autograd(true); - consteval_graph->autograd(); -} - -// See insert_t_stream_tms declaration documentation for what `group` means in the context -// of inserting t-stream TMs. This is somewhat of a special case when t-streaming RZ dir -// through a queue. In order to get back to the canonical form, we need to undo the z-major -// tile ordering. We do this by grouping together chunks of some inner "group" factor, -// performing the regular t-streaming TMs, and then undoing the grouping. This is a form of -// Z/R permute of tiles. 
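For intuition, a minimal sketch of the grouped TM sequence described above, with illustrative factors only and assuming the OpType(name, {attrs}) constructor and the std::vector<graphlib::OpType> TM list used throughout this file: undoing a vertical streaming factor of 5 while preserving inner groups of 9 tiles wraps the un-streaming stack in an opposite-direction stack/slice pair, which is exactly the Z/R tile permute being described.

    std::vector<graphlib::OpType> tms;
    graphlib::OpType group_stack("hstack", {9});  // group in the direction opposite to streaming
    graphlib::OpType unstream("vstack", {5});     // the underlying un-streaming TM
    graphlib::OpType ungroup("hslice", {9});      // undo the grouping, restoring canonical tile order
    tms.insert(tms.end(), {group_stack, unstream, ungroup});

This mirrors the `group` example given in the t_stream.hpp documentation further below.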
-static int calculate_group_factor( - bool is_queue, - TStreamFactor consumer_factor, - TStreamFactor producer_factor, - int operand_idx, - std::vector const& tms, - bool is_mm, - bool is_sparse_mm, - bool consumes_rz_major) -{ - TStreamFactor none; - bool directly_compatible = producer_factor.compatible_consumer(consumer_factor, is_sparse_mm, consumes_rz_major); - bool reorder_without_grouping = producer_factor.compatible_consumer(none, false, false) and - none.compatible_consumer(consumer_factor, is_sparse_mm, consumes_rz_major); - bool is_non_primary_matmul_streaming = - is_mm and ((consumer_factor.dir.r() and operand_idx != 0) or (consumer_factor.dir.c() and operand_idx != 1)); - if (not is_queue or directly_compatible or reorder_without_grouping or is_non_primary_matmul_streaming) - return 1; - - int internal_slice_stack_factor = 1; - for (auto const& tm : tms) - { - if (tm.op == "vslice") - { - internal_slice_stack_factor *= std::get(tm.attr[0]); - } - else if (tm.op == "hstack") - { - TT_ASSERT(internal_slice_stack_factor % std::get(tm.attr[0]) == 0); - internal_slice_stack_factor /= std::get(tm.attr[0]); - } - else if (tm.op == "hslice") - { - internal_slice_stack_factor *= std::get(tm.attr[0]); - } - else if (tm.op == "vstack") - { - TT_ASSERT(internal_slice_stack_factor % std::get(tm.attr[0]) == 0); - internal_slice_stack_factor /= std::get(tm.attr[0]); - } - else if (tm.op == "transpose") - { - // nothing to do - } - else - { - TT_LOG_ASSERT(false, "Unhandled tm type for grouping {}", tm.op); - } - } - return internal_slice_stack_factor; -} - -void insert_t_stream_tms( - graphlib::OpNode const* consumer, - std::vector& tms, - TStreamFactor consumer_factor, - TStreamFactor producer_factor, - int operand_idx, - bool through_queue, - int group, - bool consumes_rz_major) -{ - bool has_transpose = - std::find_if(tms.begin(), tms.end(), [](OpType const& op) { return op.op == "transpose"; }) != tms.end(); - producer_factor = has_transpose ? TStreamFactor::Transposed(producer_factor) : producer_factor; - - int producer_group = producer_factor.is_streaming() ? group : 1; - int consumer_group = consumer_factor.is_streaming() ? 
group : 1; - if (through_queue and producer_factor.is_streaming()) - { - // If we come from a queue, first undo all producer streaming TMs into canonical form - insert_t_stream_default_tms(tms, TStreamFactor{}, producer_factor, producer_group); - // Then apply (below) consumer t-stream TMs - producer_factor = TStreamFactor{}; - } - - assert_t_stream_factors(consumer, producer_factor, consumer_factor, 1, consumes_rz_major); - insert_t_stream_tms_for_op(consumer, tms, consumer_factor, producer_factor, operand_idx, consumer_group); -} - -void insert_t_stream_tms(Graph* graph, balancer::OpModelMap const& op_models) -{ - std::unordered_set visted_loopback_queues; - for (auto const& [node_id, edges] : graph->operands_map()) - { - graphlib::OpNode* consumer = dynamic_cast(graph->node_by_id(node_id)); - if (not consumer) - continue; - - OpModel const& consumer_op_model = op_models.at(consumer->name()); - for (Edge const& edge : edges) - { - if (edge.edge_type != graphlib::EdgeType::kData and edge.edge_type != graphlib::EdgeType::kDataLoopback) - { - continue; - } - TT_ASSERT(node_id == edge.consumer_node_id); - auto edge_attrs = graph->get_edge_attributes(edge); - Node* producer = graph->node_by_id(edge.producer_node_id); - graphlib::InputNode* loopback_queue = nullptr; - - std::vector producer_operands = graph->data_operands(producer); - if (producer_operands.empty()) - { - TT_ASSERT(producer->node_type() == graphlib::NodeType::kInput); - insert_t_stream_tms_for_op( - consumer, - edge_attrs->get_tms(), - consumer_op_model.t_stream_factor, - TStreamFactor(), - edge.consumer_input_port_id); - continue; - } - - // If this edge is cut, treat it as virtual edge which will be bypassed with a queue. - // - bool is_queue = producer->node_type() != graphlib::NodeType::kBudaOp; - if (is_queue) - { - TT_ASSERT(producer->node_type() == graphlib::NodeType::kInput or producer->node_type() == graphlib::NodeType::kQueue); - TT_ASSERT(producer_operands.size() == 1); - loopback_queue = dynamic_cast(producer); - producer = producer_operands[0]; - TT_ASSERT(producer->node_type() == graphlib::NodeType::kBudaOp); - } - - OpModel const& producer_op_model = op_models.at(producer->name()); - if (producer_op_model.t_stream_factor.none() and consumer_op_model.t_stream_factor.none()) - continue; - - log_trace( - LogTStream, - "Insert t stream tms: {}[{}]({}) -> {}[{}]({}) {}", - producer->name(), - edge.producer_output_port_id, - producer->get_epoch_type(), - consumer->name(), - edge.consumer_input_port_id, - consumer->get_epoch_type(), - edge_attrs->get_ublock_order()); - log_trace( - LogTStream, - " {} {} {}", - producer_op_model.grid_shape, - producer_op_model.block_shape(), - producer_op_model.t_stream_factor); - log_trace( - LogTStream, - " {} {} {}", - consumer_op_model.grid_shape, - consumer_op_model.block_shape(), - consumer_op_model.t_stream_factor); - - auto& tms = edge_attrs->get_tms(); - int group = calculate_group_factor( - is_queue, - consumer_op_model.t_stream_factor, - producer_op_model.t_stream_factor, - edge.consumer_input_port_id, - tms, - consumer->is_matmul(), - consumer->is_sparse_matmul(), - consumer_op_model.consumes_rz_major); - insert_t_stream_tms( - consumer, - tms, - consumer_op_model.t_stream_factor, - producer_op_model.t_stream_factor, - edge.consumer_input_port_id, - is_queue, - group, - consumer_op_model.consumes_rz_major); - - lower_broadcast_z(producer->shape(), edge_attrs->get_tms()); - - if (loopback_queue and visted_loopback_queues.find(loopback_queue) == visted_loopback_queues.end()) - 
{ - consteval_t_stream_shape_for_loopback(graph, loopback_queue, producer_op_model.t_stream_factor); - visted_loopback_queues.insert(loopback_queue); - } - } - } - - // Calculate undo t streaming for golden reconstruction - for (Node* node : graph->nodes()) - { - if (node->node_type() != graphlib::NodeType::kBudaOp) - continue; - - TStreamFactor t_stream_factor = op_models.at(node->name()).t_stream_factor; - if (t_stream_factor.none()) - continue; - - // Set `after_transpose=false` for golden because it requires the TMs - // to be undone in exactly the opposite order in which they were applied - constexpr bool after_transpose = false; - constexpr int group = 1; - insert_t_stream_default_tms( - node->as()->get_golden_transforms(), - TStreamFactor{}, - t_stream_factor, - group, - after_transpose); - } -} -} // namespace tt diff --git a/pybuda/csrc/passes/t_stream.hpp b/pybuda/csrc/passes/t_stream.hpp deleted file mode 100644 index 53080ce43..000000000 --- a/pybuda/csrc/passes/t_stream.hpp +++ /dev/null @@ -1,82 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "balancer/balancer.hpp" -#include "graph_lib/graph.hpp" -#include "graph_lib/node.hpp" - -namespace tt -{ -// -// insert_t_stream_tms_for_eltwise -// -// Low level convenience function for inserting streaming TMs for the -// eltwise simple case. Only legal if you can guarantee that the -// consumer will accept eltwise tile ordering. -// -void insert_t_stream_tms_for_eltwise( - std::vector& tms, - balancer::TStreamFactor consumer_factor, - balancer::TStreamFactor producer_factor); - -// -// insert_t_stream_tms -// -// consumer: -// The consumer op by which to apply incoming t-stream TMs. -// -// tms: -// A vector of TMs, modified in place. Results in a vector of TMs -// with t-streaming TMs inserted. -// -// consumer_factor/producer_factor: -// Respective consumer/producer t-streaming factors. -// -// operand_idx: -// The edge input port that the provided tms belong to for this -// consumer. -// -// through_queue: -// If true, we have construction `producer -> e2e -> consumer`. -// e2e inherits its producer's streaming amount. This is a special -// case because t-streaming constraints are relaxed when bouncing -// through a queue so extra logic is needed to canonicalize the -// form through the queue. -// -// group: -// surrounds the stream TMs with a group factor, e.g. -// Unstreaming R major you might have: -// -// vstack(5) -// -// If you wanted to preserve t-groupings of 9, than group=9 would -// result: -// -// hstack(9) -// vstack(5) -// hslice(9) -// -// Note, the grouping is achieved by first stacking in the opposite -// direction, applying the underlying streaming TM, and then slicing -// back. -// -void insert_t_stream_tms( - graphlib::OpNode const* consumer, - std::vector& tms, - balancer::TStreamFactor consumer_factor, - balancer::TStreamFactor producer_factor, - int operand_idx, - bool through_queue = false, - int group = 1, - bool consumes_rz_major = false); - -// -// insert_t_stream_tms -// -// Insert t-stream tms for all edges in the graph, given the selected set -// of op_models. 
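A minimal usage sketch for the whole-graph overload declared just below, assuming a balancer solution object that exposes an op_models map as the tests later in this diff do: once balancing has produced an OpModel for every BudaOp, the pass is driven directly from that map.

    // Sketch: apply t-stream TMs across all data edges after balancing.
    insert_t_stream_tms(graph, balancer_solution->op_models);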
-// -void insert_t_stream_tms(Graph* graph, balancer::OpModelMap const& op_models); -} // namespace tt diff --git a/pybuda/csrc/passes/tests/module.mk b/pybuda/csrc/passes/tests/module.mk index 80925104f..80fa8fc5b 100644 --- a/pybuda/csrc/passes/tests/module.mk +++ b/pybuda/csrc/passes/tests/module.mk @@ -1,11 +1,10 @@ PYBUDA_CSRC_PASSES_TESTS = $(TESTDIR)/pybuda/csrc/passes/tests/passes_unit_tests PYBUDA_CSRC_PASSES_TESTS_SRCS = \ - pybuda/csrc/balancer/tests/test_balancer_utils.cpp \ $(wildcard pybuda/csrc/passes/tests/*.cpp) PYBUDA_CSRC_PASSES_TESTS_INCLUDES = -Ipybuda/csrc/graph_lib $(PYBUDA_CSRC_INCLUDES) -PYBUDA_CSRC_PASSES_TESTS_LDFLAGS = -lstdc++fs -lgtest -lgtest_main -lpthread -l$(PYTHON_VERSION) -lm +PYBUDA_CSRC_PASSES_TESTS_LDFLAGS = -lgtest -lgtest_main -lpthread -l$(PYTHON_VERSION) -lm PYBUDA_CSRC_PASSES_TESTS_OBJS = $(addprefix $(OBJDIR)/, $(PYBUDA_CSRC_PASSES_TESTS_SRCS:.cpp=.o)) PYBUDA_CSRC_PASSES_TESTS_DEPS = $(addprefix $(OBJDIR)/, $(PYBUDA_CSRC_PASSES_TESTS_SRCS:.cpp=.d)) diff --git a/pybuda/csrc/passes/tests/test_balacer_error_passes.cpp b/pybuda/csrc/passes/tests/test_balacer_error_passes.cpp deleted file mode 100644 index 4b51c3b50..000000000 --- a/pybuda/csrc/passes/tests/test_balacer_error_passes.cpp +++ /dev/null @@ -1,51 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "passes/placer_buda_passes.hpp" -#include "test/common.hpp" - -namespace tt::test -{ -struct InsertQueues : public BudaGraphTest, public testing::WithParamInterface -{ - protected: - virtual std::vector create_graph() override - { - auto num_forks = GetParam(); - auto act = create_activation(shape(1, 1, 32, 32)); - gelu0 = create_op("gelu", {act}); - for (int i = 0; i < num_forks; ++i) - { - forks.push_back(create_op("gelu", {gelu0})); - forks_as_nodes.push_back(forks.back()); - } - return forks; - } - - OpType* gelu0; - std::vector forks; - std::vector forks_as_nodes; -}; - -TEST_P(InsertQueues, insert_queues) -{ - graphlib::Graph* graph = get_graph(); - - balancer::BudaOpNodeLegalizerFailureInfo info; - info.recordOpModelFailure(balancer::OpModelFailureReason::UserAccessPreventsStreaming); - std::unordered_map nodes_without_legal_op_model = { - {gelu0, info}, - }; - passes::insert_queues(graph, nodes_without_legal_op_model); - auto users = graph->data_users(gelu0); - ASSERT_EQ(users.size(), 1); - auto *queue = dynamic_cast(users.front()); - ASSERT_NE(queue, nullptr); - auto queue_users = graph->data_users(queue); - std::sort(queue_users.begin(), queue_users.end()); - std::sort(forks_as_nodes.begin(), forks_as_nodes.end()); - ASSERT_EQ(queue_users, forks_as_nodes); -} - -INSTANTIATE_TEST_SUITE_P(InsertQueues, InsertQueues, testing::Values(1, 2, 3)); -} // namespace tt::test diff --git a/pybuda/csrc/passes/tests/test_bufferig_queues.cpp b/pybuda/csrc/passes/tests/test_bufferig_queues.cpp deleted file mode 100644 index ac2684491..000000000 --- a/pybuda/csrc/passes/tests/test_bufferig_queues.cpp +++ /dev/null @@ -1,113 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include - -#include "balancer/balancer.hpp" -#include "balancer/legalizer/graph_solver.hpp" -#include "balancer/legalizer/legalizer.hpp" -#include "balancer/policies/policies.hpp" -#include "balancer/tests/test_balancer_utils.hpp" -#include "balancer/types.hpp" -#include "graph_lib/defines.hpp" -#include "graph_lib/utils.hpp" -#include "gtest/gtest.h" -#include "passes/post_placer_buda_passes.hpp" -#include 
"test/common.hpp" - -using namespace tt; -namespace tt::test -{ - -// check if node with node_name has queue (of specified QueueNodeType) as producer on specified consumer_input_port_id. -bool check_if_node_has_que_as_prod( - graphlib::Graph* graph, const std::string& node_name, graphlib::PortId port_id, graphlib::QueueNodeType queue_type) -{ - graphlib::Node* node = graph->get_node_by_name(node_name); - std::vector operand_edges = graph->operand_data_edges(node); - bool has_operand_on_port = false; - for (graphlib::Edge operand_edge : operand_edges) - { - if (operand_edge.consumer_input_port_id == port_id) - { - has_operand_on_port = true; - graphlib::NodeId producer_node_id = operand_edge.producer_node_id; - graphlib::Node* producer_node = graph->node_by_id(producer_node_id); - if (producer_node->node_type() != graphlib::NodeType::kQueue) - { - // According to the test producer_node should be queue type - return false; - } - else - { - graphlib::QueueNode* queue = static_cast(producer_node); - return queue->queue_type() == queue_type ? true : false; - } - } - } - TT_ASSERT(has_operand_on_port, "node should have operand connected on input port_id"); - return false; -} -struct BypassBuffQueueMultipleConsumers : public BudaGraphTest -{ - protected: - virtual std::vector create_graph() override - { - auto act_1 = create_activation(1, 1, 60 * 32, 32 * 32); - auto in0_mm_0 = create_parameter(1, 1, 60 * 32, 60 * 32); - auto in1_mm_1 = create_parameter(1, 1, 32 * 32, 1 * 32); - auto in1_mm_2 = create_parameter(1, 1, 32 * 32, 1 * 32); - auto in1_mm_3 = create_parameter(1, 1, 32 * 32, 1 * 32); - - auto matmul_0 = create_op("matmul", {in0_mm_0, act_1}); - - auto buff_queue = create_buffering_queue(matmul_0, 2 /*num_entries*/); - - auto matmul_1 = create_op("matmul", {buff_queue, in1_mm_1}); - auto matmul_2 = create_op("matmul", {buff_queue, in1_mm_2}); - auto matmul_3 = create_op("matmul", {buff_queue, in1_mm_3}); - - auto add_12 = create_op("add", {matmul_1, matmul_2}); - auto out = create_op("add", {add_12, matmul_3}); - - return {out}; - } -}; - -TEST_F(BypassBuffQueueMultipleConsumers, bypass_buff_queue_with_multiple_consumers) -{ - // Buda graph - graphlib::Graph* graph = get_graph(); - balancer::BalancerConfig balancer_config = - create_balancer_config(Arch::Grayskull, std::nullopt, balancer::PolicyType::Ribbon); - std::shared_ptr cache_collection = create_balancer_cache_collection(); - balancer_config.op_names_to_epoch_break.push_back({"matmul3"}); - - std::optional opt_placer_solution = std::nullopt; - - balancer::LegalOpModels valid_op_models = - balancer::legalizer::get_legal_op_models(graph, balancer_config, cache_collection); - - auto graph_solver = get_graph_solver(balancer_config, cache_collection, graph, valid_op_models); - balancer::legalizer::GraphSolverSolution graph_solver_solution = - balancer::run_policy(graph, balancer_config, graph_solver, opt_placer_solution); - - validate_subgraph_placement(graph, *opt_placer_solution); - - remove_buffering_queues_from_cross_epoch_edges(graph, *opt_placer_solution); - - insert_epoch_to_epoch_queues( - graph, - opt_placer_solution.value(), - {graphlib::NodeEpochType::Forward, graphlib::NodeEpochType::Backward, graphlib::Optimizer}, - graph_solver_solution.cut_edges); - - bool mm_1_check = check_if_node_has_que_as_prod(graph, "matmul1", 0, graphlib::QueueNodeType::Buffering); - EXPECT_TRUE(mm_1_check); - bool mm_2_check = check_if_node_has_que_as_prod(graph, "matmul2", 0, graphlib::QueueNodeType::Buffering); - EXPECT_TRUE(mm_2_check); - bool 
mm_3_check = check_if_node_has_que_as_prod(graph, "matmul3", 0, graphlib::QueueNodeType::EpochToEpoch); - EXPECT_TRUE(mm_3_check); -} - -} // namespace tt::test \ No newline at end of file diff --git a/pybuda/csrc/passes/tests/test_dram_forking.cpp b/pybuda/csrc/passes/tests/test_dram_forking.cpp deleted file mode 100644 index fb3e92f04..000000000 --- a/pybuda/csrc/passes/tests/test_dram_forking.cpp +++ /dev/null @@ -1,379 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "balancer/balancer.hpp" -#include "balancer/balancer_cache_collection.hpp" -#include "balancer/legalizer/graph_solver.hpp" -#include "balancer/legalizer/legalizer.hpp" -#include "balancer/tests/test_balancer_utils.hpp" -#include "passes/forked_dram_inputs.hpp" -#include "placer/best_fit_allocator.hpp" -#include "placer/chip_id_assignment.hpp" -#include "placer/lowering_utils.hpp" -#include "placer/placer.hpp" -#include "test/common.hpp" - -namespace tt::test -{ - -// This Graph has dram forking inputs -struct ForkedDramGraph : public BudaGraphTest -{ - protected: - virtual std::vector create_graph() override - { - std::uint32_t seq_len = 32; - std::uint32_t embed = 32; - - auto in0 = create_input("act0", shape(1, 1, seq_len, embed), tt::graphlib::InputNodeType::Activation); - auto in1 = create_input("act1", shape(1, 1, seq_len, embed), tt::graphlib::InputNodeType::Activation); - - auto add0 = create_op("add", {in0, in1}); - (void)add0; - auto add1 = create_op("add", {in0, in1}); - auto e2e1 = create_buffering_queue(add1, 1); - auto add2 = create_op("add", {e2e1, in1}); - (void)add2; - auto add3 = create_op("add", {in1, e2e1}); - - auto gelu = create_op("gelu", {add3}); - return {gelu}; - } -}; - -// This test evaluates a regular dram forking inputs with scheduled ops on the same epoch , all ops with same -// block shape -TEST_F(ForkedDramGraph, forked_dram_test) -{ - graphlib::Graph *graph = get_graph(); - - // get balancer solution - balancer::BalancerConfig balancer_config = create_balancer_config(); - std::shared_ptr cache_collection = create_balancer_cache_collection(); - // run balancer and placer - auto balancer_solution = run_balancer_and_placer(graph, balancer_config, cache_collection); - // run dram forking pass - auto forked_dram_map = tt::passes::get_forked_dram_inputs( - true, graph, &balancer_solution->placer_solution.name_to_op_placement, &balancer_solution->op_models); - - // Can store main node and forked nodes info(similar to what netlist.cpp will do) - std::unordered_map>> forked_dram_node_map; - for (Node *node : graphlib::topological_sort(*graph)) - { - if (node->node_type() == graphlib::NodeType::kBudaOp) - { - for (auto operand : graph->operand_data_edges(node)) - { - if (forked_dram_map.find(operand) != forked_dram_map.end()) - { - auto edge_temp = forked_dram_map[operand]; - forked_dram_node_map[node->name()].push_back(std::make_pair( - graph->node_by_id(edge_temp.producer_node_id)->name(), - graph->node_by_id(edge_temp.consumer_node_id)->name())); - } - } - } - } - - // Verify if the forked nodes are correct - ASSERT_TRUE(forked_dram_node_map.find("add0") != forked_dram_node_map.end()); - - std::vector> add0_fork = {{"act1", "add3"}}; - ASSERT_TRUE(add0_fork == forked_dram_node_map["add0"]); - - ASSERT_TRUE(forked_dram_node_map.find("add1") != forked_dram_node_map.end()); - - std::vector> add1_fork = {{"act0", "add0"}}; - ASSERT_TRUE(add1_fork == forked_dram_node_map["add1"]); - - ASSERT_TRUE(forked_dram_node_map.find("add2") != 
forked_dram_node_map.end()); - - std::vector> add2_fork = {{"act1", "add3"}}; - ASSERT_TRUE(add2_fork == forked_dram_node_map["add2"]); - - ASSERT_TRUE(forked_dram_node_map.find("add3") != forked_dram_node_map.end()); - - std::vector> add3_fork = {{"buff_queue0", "add2"}}; - ASSERT_TRUE(add3_fork == forked_dram_node_map["add3"]); -} - -// This test evaluates a regular dram forking inputs with scheduled ops on the different epochs , all ops with same -// block shape -TEST_F(ForkedDramGraph, forked_dram_test_epoch_break) -{ - graphlib::Graph *graph = get_graph(); - - // get balancer solution - balancer::BalancerConfig balancer_config = create_balancer_config(); - std::shared_ptr cache_collection = create_balancer_cache_collection(); - // adding a break at add2 node to be scheduled in a separate epoch - balancer_config.op_names_to_epoch_break.push_back({"add2"}); - // run balancer and placer - auto balancer_solution = run_balancer_and_placer(graph, balancer_config, cache_collection); - // run dram forking pass - auto forked_dram_map = tt::passes::get_forked_dram_inputs( - true, graph, &balancer_solution->placer_solution.name_to_op_placement, &balancer_solution->op_models); - - // Can store main node and forked nodes info(similar to what netlist.cpp will do) - std::unordered_map>> forked_dram_node_map; - for (Node *node : graphlib::topological_sort(*graph)) - { - if (node->node_type() == graphlib::NodeType::kBudaOp) - { - for (auto operand : graph->operand_data_edges(node)) - { - if (forked_dram_map.find(operand) != forked_dram_map.end()) - { - auto edge_temp = forked_dram_map[operand]; - forked_dram_node_map[node->name()].push_back(std::make_pair( - graph->node_by_id(edge_temp.producer_node_id)->name(), - graph->node_by_id(edge_temp.consumer_node_id)->name())); - } - } - } - } - - // Verify if the forked nodes are correct, remember epoch break is at node add2 - ASSERT_TRUE(forked_dram_node_map.find("add1") != forked_dram_node_map.end()); - - std::vector> add1_fork = {{"act0", "add0"}}; - ASSERT_TRUE(add1_fork == forked_dram_node_map["add1"]); - - ASSERT_TRUE(forked_dram_node_map.find("add2") != forked_dram_node_map.end()); - - std::vector> add2_fork = {{"act1", "add3"}}; - ASSERT_TRUE(add2_fork == forked_dram_node_map["add2"]); - - ASSERT_TRUE(forked_dram_node_map.find("add3") != forked_dram_node_map.end()); - - std::vector> add3_fork = {{"buff_queue0", "add2"}}; - ASSERT_TRUE(add3_fork == forked_dram_node_map["add3"]); -} - -// This graph has data dependency between two nodes -struct ForkedDramGraphDependency : public BudaGraphTest -{ - protected: - virtual std::vector create_graph() override - { - std::uint32_t seq_len = 32; - std::uint32_t embed = 32; - - auto in0 = create_input("act0", shape(1, 1, seq_len, embed), tt::graphlib::InputNodeType::Activation); - auto in1 = create_input("act1", shape(1, 1, seq_len, embed), tt::graphlib::InputNodeType::Activation); - - auto add0 = create_op("add", {in0, in1}); - auto add1 = create_op("add", {in1, add0}); - - auto e2e1 = create_buffering_queue(add1, 1); - auto add2 = create_op("add", {e2e1, in1}); - (void)add2; - auto add3 = create_op("add", {in1, e2e1}); - - auto gelu = create_op("gelu", {add3}); - return {gelu}; - } -}; - -// This test evaluates dram forking inputs with data dependency between two nodes and scheduled ops the same epoch , all -// ops with same block shape -TEST_F(ForkedDramGraphDependency, forked_dram_test_node_dependency) -{ - graphlib::Graph *graph = get_graph(); - - // get balancer solution - balancer::BalancerConfig 
balancer_config = create_balancer_config(); - std::shared_ptr cache_collection = create_balancer_cache_collection(); - // run balancer and placer - auto balancer_solution = run_balancer_and_placer(graph, balancer_config, cache_collection); - // run dram forking pass - auto forked_dram_map = tt::passes::get_forked_dram_inputs( - true, graph, &balancer_solution->placer_solution.name_to_op_placement, &balancer_solution->op_models); - - // Can store main node and forked nodes info(similar to what netlist.cpp will do) - std::unordered_map>> forked_dram_node_map; - for (Node *node : graphlib::topological_sort(*graph)) - { - if (node->node_type() == graphlib::NodeType::kBudaOp) - { - for (auto operand : graph->operand_data_edges(node)) - { - if (forked_dram_map.find(operand) != forked_dram_map.end()) - { - auto edge_temp = forked_dram_map[operand]; - - forked_dram_node_map[node->name()].push_back(std::make_pair( - graph->node_by_id(edge_temp.producer_node_id)->name(), - graph->node_by_id(edge_temp.consumer_node_id)->name())); - } - } - } - } - - // Verify if the forked nodes are correct, remember add1 reads from add0 (i.e there is a data dependency between - // add1 and add0, add1 and add2, add1 and add3) - - ASSERT_TRUE(forked_dram_node_map.find("add3") != forked_dram_node_map.end()); - - std::vector> add3_fork = {{"buff_queue0", "add2"}}; - ASSERT_TRUE(add3_fork == forked_dram_node_map["add3"]); -} - -// This graph has Prologue node(constants or weights ), same epoch, same block shapes -struct ForkedDramGraphPrologue : public BudaGraphTest -{ - protected: - virtual std::vector create_graph() override - { - std::uint32_t seq_len = 32; - std::uint32_t embed = 32; - std::uint32_t hidden = 32; - - auto in0 = create_input("act0", shape(1, 1, seq_len, embed), tt::graphlib::InputNodeType::Activation); - auto in1 = create_input("act1", shape(1, 1, seq_len, embed), tt::graphlib::InputNodeType::Activation); - - // constant parameters, weights make the node a prologue node - auto Win = create_parameter(shape(1, 1, embed, hidden)); - auto matmul0 = create_op("matmul", {in0, Win}); - (void)matmul0; - auto add0 = create_op("add", {in0, in1}); - - auto e2e1 = create_buffering_queue(add0, 1); - auto add1 = create_op("add", {e2e1, in1}); - (void)add1; - auto add2 = create_op("add", {in1, e2e1}); - - auto gelu = create_op("gelu", {add2}); - return {gelu}; - } -}; - -// This test evaluates dram forking inputs with a prologue node, same epoch, same block shapes, prologue nodes can not -// use dram forking optimization -TEST_F(ForkedDramGraphPrologue, forked_dram_test_prologue) -{ - graphlib::Graph *graph = get_graph(); - - // get balancer solution - balancer::BalancerConfig balancer_config = create_balancer_config(); - std::shared_ptr cache_collection = create_balancer_cache_collection(); - // run balancer and placer - auto balancer_solution = run_balancer_and_placer(graph, balancer_config, cache_collection); - // run dram forking pass - auto forked_dram_map = tt::passes::get_forked_dram_inputs( - true, graph, &balancer_solution->placer_solution.name_to_op_placement, &balancer_solution->op_models); - - // Can store main node and forked nodes info(similar to what netlist.cpp will do) - std::unordered_map>> forked_dram_node_map; - for (Node *node : graphlib::topological_sort(*graph)) - { - if (node->node_type() == graphlib::NodeType::kBudaOp) - { - for (auto operand : graph->operand_data_edges(node)) - { - if (forked_dram_map.find(operand) != forked_dram_map.end()) - { - auto edge_temp = forked_dram_map[operand]; - 
- forked_dram_node_map[node->name()].push_back(std::make_pair( - graph->node_by_id(edge_temp.producer_node_id)->name(), - graph->node_by_id(edge_temp.consumer_node_id)->name())); - } - } - } - } - // Verify if the forked nodes are correct, remember there is a prologue node - ASSERT_TRUE(forked_dram_node_map.find("add0") != forked_dram_node_map.end()); - - std::vector> add0_fork = {{"act0", "matmul0"}}; - ASSERT_TRUE(add0_fork == forked_dram_node_map["add0"]); - - ASSERT_TRUE(forked_dram_node_map.find("add1") != forked_dram_node_map.end()); - - std::vector> add1_fork = {{"act1", "add2"}}; - ASSERT_TRUE(add1_fork == forked_dram_node_map["add1"]); - - ASSERT_TRUE(forked_dram_node_map.find("add2") != forked_dram_node_map.end()); - - std::vector> add2_fork = {{"buff_queue0", "add1"}}; - ASSERT_TRUE(add2_fork == forked_dram_node_map["add2"]); -} - -// This Graph has dram forking inputs -struct ForkedDramGraphGridShape : public BudaGraphTest -{ - protected: - virtual std::vector create_graph() override - { - std::uint32_t seq_len = 32; - std::uint32_t embed = 64; - - auto in0 = create_input("act0", shape(1, 1, seq_len, embed), tt::graphlib::InputNodeType::Activation); - auto in1 = create_input("act1", shape(1, 1, seq_len, embed), tt::graphlib::InputNodeType::Activation); - - auto add0 = create_op("add", {in0, in1}); - auto add1 = create_op("add", {in0, in1}); - - auto e2e1 = create_buffering_queue(add1, 1); - auto add2 = create_op("add", {e2e1, in1}); - auto add3 = create_op("add", {in1, e2e1}); - - auto gelu = create_op("gelu", {add3}); - return {add0, add2, gelu}; - } -}; - -// This test evaluates a regular dram forking inputs with scheduled ops on the same epoch , ops with different block -// shapes -TEST_F(ForkedDramGraphGridShape, forked_dram_test_diff_grid_shapes) -{ - graphlib::Graph *graph = get_graph(); - - // get balancer solution - balancer::BalancerConfig balancer_config = create_balancer_config(); - std::shared_ptr cache_collection = create_balancer_cache_collection(); - - // force ops for different grid shapes (to have different block shapes) - balancer_config.enable_t_streaming = false; - balancer_config.op_overrides["add0"].grid_shape = std::make_pair(1, 1); - balancer_config.op_overrides["add1"].grid_shape = std::make_pair(1, 1); - balancer_config.op_overrides["add2"].grid_shape = std::make_pair(1, 2); - balancer_config.op_overrides["add3"].grid_shape = std::make_pair(1, 1); - // run balancer and placer - auto balancer_solution = run_balancer_and_placer(graph, balancer_config, cache_collection); - // run dram forking pass - auto forked_dram_map = tt::passes::get_forked_dram_inputs( - true, graph, &balancer_solution->placer_solution.name_to_op_placement, &balancer_solution->op_models); - - // Can store main node and forked nodes info(similar to what netlist.cpp will do) - std::unordered_map>> forked_dram_node_map; - for (Node *node : graphlib::topological_sort(*graph)) - { - if (node->node_type() == graphlib::NodeType::kBudaOp) - { - for (auto operand : graph->operand_data_edges(node)) - { - if (forked_dram_map.find(operand) != forked_dram_map.end()) - { - auto edge_temp = forked_dram_map[operand]; - - forked_dram_node_map[node->name()].push_back(std::make_pair( - graph->node_by_id(edge_temp.producer_node_id)->name(), - graph->node_by_id(edge_temp.consumer_node_id)->name())); - } - } - } - } - // Verify if the forked nodes are correct - ASSERT_TRUE(forked_dram_node_map.find("add0") != forked_dram_node_map.end()); - - std::vector> add0_fork = {{"act1", "add3"}}; - 
ASSERT_TRUE(add0_fork == forked_dram_node_map["add0"]); - - ASSERT_TRUE(forked_dram_node_map.find("add1") != forked_dram_node_map.end()); - - std::vector> add1_fork = {{"act0", "add0"}}; - ASSERT_TRUE(add1_fork == forked_dram_node_map["add1"]); -} - -} // namespace tt::test diff --git a/pybuda/csrc/passes/tests/test_fuse_ops.cpp b/pybuda/csrc/passes/tests/test_fuse_ops.cpp deleted file mode 100644 index eeba30353..000000000 --- a/pybuda/csrc/passes/tests/test_fuse_ops.cpp +++ /dev/null @@ -1,420 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "balancer/balancer.hpp" -#include "balancer/legalizer/graph_solver.hpp" -#include "balancer/legalizer/legalizer.hpp" -#include "balancer/tests/test_balancer_utils.hpp" -#include "graph_lib/defines.hpp" -#include "graph_lib/utils.hpp" -#include "gtest/gtest.h" -#include "lower_to_buda/fused_op.hpp" -#include "lower_to_buda/netlist.hpp" -#include "passes/dataformat.hpp" -#include "passes/fuse_ops.hpp" -#include "test/common.hpp" - -using namespace tt; -namespace tt::test -{ - -// Fuse ops with default arguments. -void fuse_ops(graphlib::Graph* graph, DeviceConfig* device_config = nullptr) -{ - const std::vector> op_names_to_chip_break; - const std::vector> op_names_to_epoch_break; - - if (device_config != nullptr) - { - tt::fuse_ops(graph, *device_config, op_names_to_chip_break, op_names_to_epoch_break, {}, {}, {}); - } - else - { - tt::fuse_ops( - graph, tt::test::create_device_config(), op_names_to_chip_break, op_names_to_epoch_break, {}, {}, {}); - } -} - -// Get vector of all fused ops in the graph. -std::vector get_fused_ops(Graph* graph) -{ - std::vector fused_ops; - - for (auto node : graph->nodes()) - { - if (node->node_type() == graphlib::kBudaOp) - { - BudaOpNode* op = node->as(); - - if (op->is_fused_op()) - { - fused_ops.push_back(op); - } - } - } - - return fused_ops; -} - -struct FuseBroadcastCLHSMatmul : public BudaGraphTest -{ - protected: - virtual std::vector create_graph() override - { - auto in0 = create_activation(1, 1, 32, 32); - auto in1 = create_activation(1, 1, 32, 32); - auto in2 = create_activation(1, 1, 32, 64); - - auto add0 = create_op("add", {in0, in1}); - auto add1 = create_op("add", {add0, in2}); - append_tm("broadcast", add1, 0, 3, 2); - - auto out = create_op("matmul", {add1, in1}); - - return {out}; - } -}; - -TEST_F(FuseBroadcastCLHSMatmul, fuse_broadcast_c_as_lhs_matmul) -{ - graphlib::Graph* graph = get_graph(); - fuse_ops(graph); - - // Get fused ops. - std::vector fused_ops = get_fused_ops(graph); - - // We expect only 1 fused op. - ASSERT_EQ(fused_ops.size(), 1); - - graphlib::UBlockOrder u_block_order = get_output_ublock_order(graph, fused_ops[0]); - - // We expect UBlockOrder::R since fused op has brodcast C. 
- ASSERT_EQ(u_block_order, graphlib::UBlockOrder::R); -} - -struct FuseOpsEquivalentTest : public BudaGraphTest -{ - protected: - virtual std::vector create_graph() override - { - auto in0 = create_activation(1, 1, 64, 64); - auto in1 = create_activation(1, 1, 64, 64); - auto in2 = create_activation(1, 1, 64, 64); - - auto add0 = create_op("add", {in0, in1}); - auto add1 = create_op("add", {add0, in2}); - - auto add2 = create_op("add", {in0, in1}); - auto add3 = create_op("add", {add2, in2}); - add3_name = add3->name(); - - auto out = create_op("matmul", {add1, add3}); - - return {out}; - } - - std::string add3_name; -}; - -TEST_F(FuseOpsEquivalentTest, fuse_equivalent_fused_ops_without_attr) -{ - graphlib::Graph* graph = get_graph(); - fuse_ops(graph); - - // Get fused ops. - std::vector fused_ops = get_fused_ops(graph); - - ASSERT_EQ(fused_ops.size(), 2); - - // Get test op_model in order to generate buda fused ops. - balancer::BalancerConfig balancer_config = create_balancer_config(); - std::shared_ptr cache_collection = create_balancer_cache_collection(); - balancer::LegalOpModels valid_op_models = - balancer::legalizer::get_legal_op_models(graph, balancer_config, cache_collection); - std::vector buda_fused_ops; - - for (auto fused_op : fused_ops) - { - buda_fused_ops.push_back(create_fused_op(fused_op, valid_op_models[fused_op][0])); - } - - ASSERT_EQ(buda_fused_ops.size(), 2); - - // Equivalent call should return true for 2 fused ops in this test. - ASSERT_TRUE(buda_fused_ops[0].equivalent(buda_fused_ops[1])); -} - -TEST_F(FuseOpsEquivalentTest, fuse_equivalent_fused_ops_with_attr) -{ - Graph* graph = get_graph(); - - // Add atribute to one op. - // This attribute should make diff between fused ops so that they are not equivalent anymore. - BudaOpAttrs relu_attr; - relu_attr["relu_en"] = true; - auto add3 = graph->get_node_by_name(this->add3_name); - BudaOpNode* add3_op = add3->as(); - add3_op->overwrite_buda_attrs(relu_attr); - - fuse_ops(graph); - - // Get fused ops. - std::vector fused_ops = get_fused_ops(graph); - - ASSERT_EQ(fused_ops.size(), 2); - - // Get test op_model in order to generate buda fused ops. - balancer::BalancerConfig balancer_config = create_balancer_config(); - std::shared_ptr cache_collection = create_balancer_cache_collection(); - balancer::LegalOpModels valid_op_models = - balancer::legalizer::get_legal_op_models(graph, balancer_config, cache_collection); - std::vector buda_fused_ops; - - for (auto fused_op : fused_ops) - { - buda_fused_ops.push_back(create_fused_op(fused_op, valid_op_models[fused_op][0])); - } - - ASSERT_EQ(buda_fused_ops.size(), 2); - - // Equivalent call should return false for 2 fused ops in this test. 
- ASSERT_FALSE(buda_fused_ops[0].equivalent(buda_fused_ops[1])); -} - -struct FuseOpsReuseTest : public BudaGraphTest -{ - protected: - virtual std::vector create_graph() override - { - auto in0 = create_activation(1, 1, 32, 32); - auto in1 = create_activation(1, 1, 32, 32); - auto in2 = create_activation(1, 1, 32, 32); - - auto add0 = create_op("add", {in0, in1}); - auto add1 = create_op("add", {in2, add0}); - - append_tm("tile_broadcast", add1, 1 /* operand id */, 2 /* broadcast dim */); - op_tile_broadcast_name = add0->name(); - - auto add2 = create_op("add", {in0, in1}); - auto add3 = create_op("add", {add2, in2}); - - add2_name = add2->name(); - - auto out = create_op("matmul", {add1, add3}); - - return {out}; - } - - std::string add2_name; - std::string op_tile_broadcast_name; -}; - -TEST_F(FuseOpsReuseTest, fuse_dont_reuse_dest_if_relu) -{ - Graph* graph = get_graph(); - - // Add relu atribute to one op. - // Its outputs should not be reused. - BudaOpAttrs relu_attr; - relu_attr["relu_en"] = true; - auto add2 = graph->get_node_by_name(add2_name); - BudaOpNode* add2_op = add2->as(); - add2_op->overwrite_buda_attrs(relu_attr); - - fuse_ops(graph); - - // Get fused ops. - std::vector fused_ops = get_fused_ops(graph); - - // Find the operator with relu activation (add2) and confirm that its outputs are not reused. - for (auto fused_op : fused_ops) - { - auto fused = fused_op->get_fused_op(); - for (auto schedule : fused->get_schedules()) - { - for (auto& op : schedule.ops) - { - if (op.name == add2_name) - { - EXPECT_TRUE(op.output_type != FusedSubOp::OutputType::DEST); - } - } - } - } -} - -TEST_F(FuseOpsReuseTest, dont_reuse_tile_broadcast) -{ - graphlib::Graph* graph = get_graph(); - - // Use wormhole_b0 config, since on grayskull dest can be reused only on srcA. - // (we want to check if srcB will be reused) - DeviceConfig device_config = tt::test::create_device_config(Arch::Wormhole_b0); - fuse_ops(graph, &device_config); - - // Get fused ops. - std::vector fused_ops = get_fused_ops(graph); - - // Find the operator with tile broadcast and confirm that its outputs are not reused. - for (auto fused_op : fused_ops) - { - auto fused = fused_op->get_fused_op(); - for (auto schedule : fused->get_schedules()) - { - for (auto& op : schedule.ops) - { - if (op.name == op_tile_broadcast_name) - { - EXPECT_TRUE(op.output_type != FusedSubOp::OutputType::DEST); - } - } - } - } -} - -struct FuseOpsDataFormatsTest : public BudaGraphTest -{ - protected: - virtual std::vector create_graph() override - { - auto in0 = create_activation(1, 1, 64, 64); - in0->set_output_df(DataFormat::Float16_b); - auto in1 = create_activation(1, 1, 64, 64); - in0->set_output_df(DataFormat::Float16_b); - auto in2 = create_activation(1, 1, 64, 64); - in0->set_output_df(DataFormat::Float16_b); - - auto add0 = create_op("add", {in0, in1}); - auto add1 = create_op("add", {in0, in2}); - - auto out = create_op("add", {add0, add1}); - - in2_name = in2->name(); - - return {out}; - } - - std::string in2_name; - ; -}; - -TEST_F(FuseOpsDataFormatsTest, fuse_same_data_formats) -{ - graphlib::Graph* graph = get_graph(); - fuse_ops(graph); - - // Since all ops have same data format we expect that validation will not assert. 
- tt::passes::validate_data_formats(graph, tt::test::create_device_config()); -} - -TEST_F(FuseOpsDataFormatsTest, fuse_same_data_format_types) -{ - graphlib::Graph* graph = get_graph(); - - // Change one data format to Bfp8_b - auto in2 = graph->get_node_by_name(this->in2_name); - in2->set_output_df(DataFormat::Bfp8_b); - - fuse_ops(graph); - - // Since all ops have same data format type (b) we expect that validation will not assert. - tt::passes::validate_data_formats(graph, tt::test::create_device_config()); -} - -TEST_F(FuseOpsDataFormatsTest, fuse_data_formats_with_float32) -{ - graphlib::Graph* graph = get_graph(); - - // Change one data format to Float32 - auto in2 = graph->get_node_by_name(this->in2_name); - in2->set_output_df(DataFormat::Float32); - - fuse_ops(graph); - - // Since all ops have same data format type (b) or Float32 we expect that validation will not assert. - tt::passes::validate_data_formats(graph, tt::test::create_device_config()); -} - -TEST_F(FuseOpsDataFormatsTest, fail_fuse_due_to_unaligned_data_formats) -{ - graphlib::Graph* graph = get_graph(); - - // Change one data format to Float16 - auto in2 = graph->get_node_by_name(this->in2_name); - in2->set_output_df(DataFormat::Float16); - - fuse_ops(graph); - - // Since there is op an with different data format type (a) we expect that validation will assert. - ASSERT_ANY_THROW(tt::passes::validate_data_formats(graph, tt::test::create_device_config())); -} - -struct FuseOpsLimits : public BudaGraphTest, public testing::WithParamInterface> -{ - protected: - virtual std::vector create_graph() override - { - std::tie(num_inputs, split, num_forks, dram_inputs) = GetParam(); - auto in0 = create_activation(1, 1, 64, 64); - graphlib::Node* out = in0; - - std::vector outputs; - for (int i = 0; i < num_inputs; ++i) - { - graphlib::Node* in_i = create_activation(1, 1, 64, 64); - if (not dram_inputs) - in_i = create_op("buffer", {in_i}); - out = create_op("add", {out, in_i}); - - for (int f = 0; (i % split == 0) and f < num_forks; ++f) - { - auto in_f = create_activation(1, 1, 64, 64); - outputs.push_back(create_op("add", {out, in_f})); - } - } - - outputs.push_back(out->as()); - - return outputs; - } - - int num_inputs = 0; - int num_forks = 0; - int split = 0; - bool dram_inputs = false; -}; - -TEST_P(FuseOpsLimits, fuse_ops_limits) -{ - graphlib::Graph* graph = get_graph(); - - fuse_ops(graph); - - for (auto* fused_op : get_fused_ops(graph)) - { - auto operands = graph->data_operands(fused_op); - auto users = graph->data_users(fused_op); - std::vector dram_operands; - auto num_dram_operands = std::count_if( - operands.begin(), - operands.end(), - [](graphlib::Node* n) { return dynamic_cast(n) != nullptr; }); - auto num_connections = operands.size() + users.size(); - EXPECT_LE((int)num_connections, tt::FusedOp::kMaxNumConnections); - EXPECT_LE((int)num_dram_operands, tt::FusedOp::kMaxNumDRAMInputs); - } -} - -INSTANTIATE_TEST_SUITE_P( - FuseOpsLimits, - FuseOpsLimits, - testing::Values( - std::make_tuple(32, 1, 0, false), - std::make_tuple(32, 2, 1, false), - std::make_tuple(32, 4, 7, false), - std::make_tuple(32, 7, 7, false), - std::make_tuple(32, 15, 7, false), - std::make_tuple(32, 1, 0, true))); - -} // namespace tt::test diff --git a/pybuda/csrc/passes/tests/test_padding_pass.cpp b/pybuda/csrc/passes/tests/test_padding_pass.cpp deleted file mode 100644 index 74d687c74..000000000 --- a/pybuda/csrc/passes/tests/test_padding_pass.cpp +++ /dev/null @@ -1,25 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// 
-// SPDX-License-Identifier: Apache-2.0 -#include "graph_lib/shape.hpp" -#include "gtest/gtest.h" -#include "passes/padding_pass_placer.hpp" - -using namespace tt; - -struct PaddingFunctionTests : testing::Test -{ -}; - -TEST_F(PaddingFunctionTests, test_biggest_prime_factor_10_increment) -{ - std::uint32_t shape_r_size = 13 * tt::graphlib::Shape::BUDA_TILE_DIM; // 13 tiles - - // Per BIGGEST_FACTOR_PRIME_10_INCREMENT padding function 13 tiles should be irregular shape. - ASSERT_EQ(true, tt::padding_placer::is_irregular(shape_r_size, tt::padding_placer::PaddingCriterion::BIGGEST_FACTOR_PRIME_10_INCREMENT)); - - // Per BIGGEST_FACTOR_PRIME_10_INCREMENT padding function should add one tile of padding. - ASSERT_EQ( - tt::graphlib::Shape::BUDA_TILE_DIM, - tt::padding_placer::compute_pad(shape_r_size, tt::padding_placer::PaddingCriterion::BIGGEST_FACTOR_PRIME_10_INCREMENT)); -} diff --git a/pybuda/csrc/passes/tests/test_split_unsupp_ops.cpp b/pybuda/csrc/passes/tests/test_split_unsupp_ops.cpp index 9e22bfc41..26ad5df17 100644 --- a/pybuda/csrc/passes/tests/test_split_unsupp_ops.cpp +++ b/pybuda/csrc/passes/tests/test_split_unsupp_ops.cpp @@ -56,7 +56,7 @@ struct TestGradEltwiseSubtract : public BudaGraphTest TEST_F(TestGradEltwiseAdd, split_unsup_grad_wh_b0) { - DeviceConfig device_config = create_device_config(Arch::Wormhole_b0); + DeviceConfig device_config = create_device_config(ARCH::WORMHOLE_B0); graphlib::Graph* graph = get_graph(); split_unsupported_gradient_ops(graph, device_config); @@ -68,7 +68,7 @@ TEST_F(TestGradEltwiseAdd, split_unsup_grad_wh_b0) TEST_F(TestGradEltwiseAdd, split_unsup_grad_non_wh_b0) { - DeviceConfig device_config = create_device_config(Arch::Wormhole); + DeviceConfig device_config = create_device_config(ARCH::GRAYSKULL); graphlib::Graph* graph = get_graph(); split_unsupported_gradient_ops(graph, device_config); @@ -80,7 +80,7 @@ TEST_F(TestGradEltwiseAdd, split_unsup_grad_non_wh_b0) TEST_F(TestGradEltwiseSubtract, split_unsup_grad_wh_b0) { - DeviceConfig device_config = create_device_config(Arch::Wormhole_b0); + DeviceConfig device_config = create_device_config(ARCH::WORMHOLE_B0); graphlib::Graph* graph = get_graph(); split_unsupported_gradient_ops(graph, device_config); @@ -92,7 +92,7 @@ TEST_F(TestGradEltwiseSubtract, split_unsup_grad_wh_b0) TEST_F(TestGradEltwiseSubtract, split_unsup_grad_non_wh_b0) { - DeviceConfig device_config = create_device_config(Arch::Wormhole); + DeviceConfig device_config = create_device_config(ARCH::GRAYSKULL); graphlib::Graph* graph = get_graph(); split_unsupported_gradient_ops(graph, device_config); diff --git a/pybuda/csrc/passes/tests/test_transpose_srca.cpp b/pybuda/csrc/passes/tests/test_transpose_srca.cpp index aa85055cc..2b84609d3 100644 --- a/pybuda/csrc/passes/tests/test_transpose_srca.cpp +++ b/pybuda/csrc/passes/tests/test_transpose_srca.cpp @@ -98,7 +98,7 @@ struct TestTransposeSrcANary : public GraphTest TEST_F(TestTransposeSrcAUnary, fix_transpose_wh_b0) { - DeviceConfig device_config = create_device_config(Arch::Wormhole_b0); + DeviceConfig device_config = create_device_config(ARCH::WORMHOLE_B0); graphlib::Graph* graph = get_graph(); fix_transposes(graph, device_config); @@ -110,7 +110,7 @@ TEST_F(TestTransposeSrcAUnary, fix_transpose_wh_b0) TEST_F(TestTransposeSrcAUnary, fix_transpose_non_wh_b0) { - DeviceConfig device_config = create_device_config(Arch::Wormhole); + DeviceConfig device_config = create_device_config(ARCH::GRAYSKULL); graphlib::Graph* graph = get_graph(); fix_transposes(graph, device_config); @@ -122,7 
+122,7 @@ TEST_F(TestTransposeSrcAUnary, fix_transpose_non_wh_b0) TEST_F(TestTransposeSrcABinary, fix_transpose_wh_b0) { - DeviceConfig device_config = create_device_config(Arch::Wormhole_b0); + DeviceConfig device_config = create_device_config(ARCH::WORMHOLE_B0); graphlib::Graph* graph = get_graph(); fix_transposes(graph, device_config); @@ -134,7 +134,7 @@ TEST_F(TestTransposeSrcABinary, fix_transpose_wh_b0) TEST_F(TestTransposeSrcABinary, fix_transpose_non_wh_b0) { - DeviceConfig device_config = create_device_config(Arch::Wormhole); + DeviceConfig device_config = create_device_config(ARCH::GRAYSKULL); graphlib::Graph* graph = get_graph(); fix_transposes(graph, device_config); @@ -146,7 +146,7 @@ TEST_F(TestTransposeSrcABinary, fix_transpose_non_wh_b0) TEST_F(TestTransposeSrcABinaryBoth, fix_transpose_wh_b0) { - DeviceConfig device_config = create_device_config(Arch::Wormhole_b0); + DeviceConfig device_config = create_device_config(ARCH::WORMHOLE_B0); graphlib::Graph* graph = get_graph(); fix_transposes(graph, device_config); @@ -158,7 +158,7 @@ TEST_F(TestTransposeSrcABinaryBoth, fix_transpose_wh_b0) TEST_F(TestTransposeSrcABinaryBoth, fix_transpose_non_wh_b0) { - DeviceConfig device_config = create_device_config(Arch::Wormhole); + DeviceConfig device_config = create_device_config(ARCH::GRAYSKULL); graphlib::Graph* graph = get_graph(); fix_transposes(graph, device_config); @@ -170,7 +170,7 @@ TEST_F(TestTransposeSrcABinaryBoth, fix_transpose_non_wh_b0) TEST_F(TestTransposeSrcANary, fix_transpose_wh_b0) { - DeviceConfig device_config = create_device_config(Arch::Wormhole_b0); + DeviceConfig device_config = create_device_config(ARCH::WORMHOLE_B0); graphlib::Graph* graph = get_graph(); fix_transposes(graph, device_config); @@ -182,7 +182,7 @@ TEST_F(TestTransposeSrcANary, fix_transpose_wh_b0) TEST_F(TestTransposeSrcANary, fix_transpose_non_wh_b0) { - DeviceConfig device_config = create_device_config(Arch::Wormhole); + DeviceConfig device_config = create_device_config(ARCH::GRAYSKULL); graphlib::Graph* graph = get_graph(); fix_transposes(graph, device_config); diff --git a/pybuda/csrc/pattern_matcher/boost_lowering.cpp b/pybuda/csrc/pattern_matcher/boost_lowering.cpp deleted file mode 100644 index 277e0e2e5..000000000 --- a/pybuda/csrc/pattern_matcher/boost_lowering.cpp +++ /dev/null @@ -1,478 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "pattern_matcher/boost_lowering.hpp" -#include "pattern_matcher/pattern_matcher.hpp" - -#include - -#include "utils/logger.hpp" -#include "graph_lib/graph.hpp" -#include "graph_lib/node_types.hpp" -#include "graph_lib/utils.hpp" - - -#include - -// keys := set of nodes corresponding to subgraph pattern -// values := ordered list of matches -// TODO(jchu): verify the matches are presented in order. 
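Spelled out with its template arguments (a presumption based on the comment above: keys are the pattern's node ids, values are the ordered list of matched node ids), the alias on the next line presumably reads:

    using NodeIdToNodeIdMatches = std::unordered_map<NodeId, std::vector<NodeId>>;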
-using NodeIdToNodeIdMatches = std::unordered_map>; -using tt::LogPatternMatcher; - -using namespace tt::graphlib; - -namespace pattern_matcher { - -std::string get_input_op_string(Node* node) { - if (node->as()->is_activation()) { - return "*"; - } - return node->as()->input_type_string(); -} - -std::string get_op_string(Node* node) { - if (node->node_type() == NodeType::kInput) { - return get_input_op_string(node); - } else if (node->node_type() == NodeType::kOutput) { - return "output"; - } - return node->as()->op_name(); -} - -VertexId add_vertex_to_boost_graph(graph_type& graph, Node* node) { - - // log_trace(LogPatternMatcher, "Node: {}, {}, {}", node->id(), get_op_string(node), node->name()); - VertexId vertex_descriptor = add_vertex( - VertexProperty{ - .name=node->name(), - .op_type=get_op_string(node), - .node_id=node->id(), - }, - graph - ); - return vertex_descriptor; -} - -graph_type convert_graph_to_boost_graph(Graph* graph) { - graph_type boost_graph; - std::unordered_map node_to_vertex; - - std::vector deferred_nodes; - - // add the activation inputs first - for (Node* node : topological_sort(*graph)) { - if (node->node_type() == NodeType::kInput and node->as()->is_activation()) { - node_to_vertex[node->id()] = add_vertex_to_boost_graph(boost_graph, node); - } - } - - // copy nodes and record node mapping - for (Node* node : topological_sort(*graph)) { - for (Node* operand : graph->data_operands(node)) { - if (node_to_vertex.find(operand->id()) == node_to_vertex.end()) { - node_to_vertex[operand->id()] = add_vertex_to_boost_graph(boost_graph, operand); - } - } - - if (node->node_type() == NodeType::kPyOp or node->node_type() == NodeType::kBudaOp) { - node_to_vertex[node->id()] = add_vertex_to_boost_graph(boost_graph, node); - } else { - deferred_nodes.push_back(node); - } - } - - for (Node* node : deferred_nodes) { - if (node_to_vertex.find(node->id()) == node_to_vertex.end()) { - node_to_vertex[node->id()] = add_vertex_to_boost_graph(boost_graph, node); - } - } - - // copy nodes and record node mapping - for (const auto& [node_id, edge_set]: graph->operands_map()) { - for (const Edge& edge : edge_set) { - if (edge.edge_type == EdgeType::kData) { - // going to assume all of these are just on single output port - auto [output_port_id, producer_output_edge_index] = graph->output_port_and_index_for_data_user_port(graph->node_by_id(edge.producer_node_id), edge); - // log_trace(LogPatternMatcher, "Edge: {}, {}, {}", edge.producer_node_id, edge.consumer_node_id, producer_output_edge_index); - add_edge( - node_to_vertex[edge.producer_node_id], - node_to_vertex[edge.consumer_node_id], - EdgeProperty{ - .producer_output_edge_index=producer_output_edge_index, - .consumer_input_edge_index=(int)edge.consumer_input_port_id - }, - boost_graph - ); - } - } - } - - for (const auto& [node_id, edge_set]: graph->users_map()) { - for (const Edge& edge : edge_set) { - if (edge.edge_type == EdgeType::kData) { - // going to assume all of these are just on single output port - auto [output_port_id, producer_output_edge_index] = graph->output_port_and_index_for_data_user_port(graph->node_by_id(edge.producer_node_id), edge); - // log_trace(LogPatternMatcher, "Edge: {}, {}, {}", edge.producer_node_id, edge.consumer_node_id, producer_output_edge_index); - add_edge( - node_to_vertex[edge.producer_node_id], - node_to_vertex[edge.consumer_node_id], - EdgeProperty{ - .producer_output_edge_index=producer_output_edge_index, - .consumer_input_edge_index=(int)edge.consumer_input_port_id - }, - boost_graph - 
); - } - } - } - - return boost_graph; -} - -VertexId add_vertex_to_boost_graph(graph_type& graph, json node) { - - uint node_id = node["nid"]; - log_debug(LogPatternMatcher, "Node: {}, {}, {}", node_id, node["op"], node["buda_name"]); - VertexId vertex_descriptor = add_vertex( - VertexProperty{ - .name=node["buda_name"], - .op_type=node["op"], - .node_id=node_id, - }, - graph - ); - return vertex_descriptor; -} - - -graph_type convert_json_graph_to_boost_graph(json json_graph) { - graph_type boost_graph; - std::unordered_map node_to_vertex; - std::vector deferred_node_ids; - - // process input nodes first - auto nodes = json_graph["nodes"]; - for (auto node : nodes) { - if (node.contains("op") and node["op"] == "*") { - node_to_vertex[node["nid"]] = add_vertex_to_boost_graph(boost_graph, node); - } - } - - // we want to defer adding constants until they are needed in the graph - for (auto node : nodes) { - if (node["attrs"].contains("num_inputs")) { - std::string num_inputs_str = node["attrs"]["num_inputs"]; - uint num_inputs = std::stoi(num_inputs_str); - for (uint input_index = 0; input_index < num_inputs; input_index++) { - uint input_nid = node["inputs"][input_index][0]; - if (node_to_vertex.find(input_nid) == node_to_vertex.end()) { - node_to_vertex[input_nid] = add_vertex_to_boost_graph(boost_graph, json_graph["nodes"][input_nid]); - } - } - - node_to_vertex[node["nid"]] = add_vertex_to_boost_graph(boost_graph, node); - } - else { - deferred_node_ids.push_back(node["nid"]); - } - } - for (uint nid : deferred_node_ids) { - if (node_to_vertex.find(nid) == node_to_vertex.end()) { - std::cout << "Shouldn't be the case!!!!!!!" << std::endl; - node_to_vertex[nid] = add_vertex_to_boost_graph(boost_graph, json_graph["nodes"][nid]); - } - } - - // construct users map: - std::unordered_map> users_map; - std::map, int> producer_output_index_map; - std::map, int> consumer_input_index_map; - - for (auto node: nodes) { - if (node["attrs"].contains("num_inputs")) { - std::string num_inputs_str = node["attrs"]["num_inputs"]; - uint num_inputs = std::stoi(num_inputs_str); - for (uint input_index = 0; input_index < num_inputs; input_index++) { - uint input_nid = node["inputs"][input_index][0]; - uint node_id = node["nid"]; - - int producer_output_index = users_map[input_nid].size(); - users_map[input_nid].push_back(node_id); - std::pair key = {input_nid, node_id}; - - producer_output_index_map[key] = producer_output_index; - consumer_input_index_map[key] = input_index; - } - } - } - - for (auto node: nodes) { - if (node["attrs"].contains("num_inputs")) { - std::string num_inputs_str = node["attrs"]["num_inputs"]; - uint num_inputs = std::stoi(num_inputs_str); - for (uint input_index = 0; input_index < num_inputs; input_index++) { - uint input_nid = node["inputs"][input_index][0]; - uint node_id = node["nid"]; - // log_trace(LogPatternMatcher, "Edge: {}, {}, {}", input_nid, node_id, producer_output_edge_index); - - std::pair key = {input_nid, node_id}; - int producer_output_index = producer_output_index_map[key]; - int consumer_input_index = consumer_input_index_map[key]; - - add_edge( - node_to_vertex[input_nid], - node_to_vertex[node_id], - EdgeProperty{.producer_output_edge_index=producer_output_index, .consumer_input_edge_index=consumer_input_index }, - boost_graph - ); - } - } - } - - return boost_graph; -} - - -void print_nodes(json graph, std::string type, const std::vector& node_ids) { - std::stringstream ss; - for (auto node_id : node_ids) { - ss << graph["nodes"][node_id]["buda_name"] << ", 
"; - } - log_debug(LogPatternMatcher, "{}: [{}]", type, ss.str()); -} - -void print_subgraph_pattern_matches(json graph, graph_type& boost_braph, graph_type& subgraph, const SubgraphPatternMatchMappings& subgraph_matches) { - for (NodeId node_id : get_node_ids(subgraph, subgraph_matches)) { - std::stringstream ss; - for (size_t match_idx = 0; match_idx < subgraph_matches.size(); ++match_idx) { - NodeId matched_node_id = subgraph_matches[match_idx].at(node_id); - ss << graph["nodes"][matched_node_id]["buda_name"] << ", "; - } - log_debug(LogPatternMatcher, "{} -> [{}]", graph["nodes"][node_id]["buda_name"], ss.str()); - } - - std::vector input_activation_node_ids = get_input_activation_node_ids(subgraph, subgraph_matches); - std::vector parameter_node_ids = get_parameter_node_ids(subgraph, subgraph_matches); - std::vector constant_node_ids = get_constant_node_ids(subgraph, subgraph_matches); - std::vector output_node_ids = get_output_node_ids(subgraph, subgraph_matches); - - print_nodes(graph, "input", input_activation_node_ids ); - print_nodes(graph, "parameters", parameter_node_ids); - print_nodes(graph, "constant", constant_node_ids); - print_nodes(graph, "output", output_node_ids); - - std::vector unmatched_node_ids = get_unmatched_node_ids(boost_braph, subgraph_matches); - print_nodes(graph, "unmatched", unmatched_node_ids); -} - - -void print_nodes(Graph* graph, std::string type, const std::vector& node_ids) { - std::stringstream ss; - for (auto node_id : node_ids) { - ss << graph->node_by_id(node_id)->name() << ", "; - } - log_debug(LogPatternMatcher, "{}: [{}]", type, ss.str()); -} - -void print_subgraph_pattern_matches(Graph* graph, graph_type& boost_graph, graph_type& subgraph, const SubgraphPatternMatchMappings& subgraph_matches) { - for (NodeId node_id : get_node_ids(subgraph, subgraph_matches)) { - std::stringstream ss; - for (size_t match_idx = 0; match_idx < subgraph_matches.size(); ++match_idx) { - NodeId matched_node_id = subgraph_matches[match_idx].at(node_id); - ss << (graph->node_by_id(matched_node_id))->name() << ", "; - } - log_debug(LogPatternMatcher, "{} -> [{}]", graph->node_by_id(node_id)->name(), ss.str()); - } - - std::vector input_activation_node_ids = get_input_activation_node_ids(subgraph, subgraph_matches); - std::vector parameter_node_ids = get_parameter_node_ids(subgraph, subgraph_matches); - std::vector constant_node_ids = get_constant_node_ids(subgraph, subgraph_matches); - std::vector output_node_ids = get_output_node_ids(subgraph, subgraph_matches); - - log_debug(LogPatternMatcher, "=== Printing Results ==="); - print_nodes(graph, "input", input_activation_node_ids ); - print_nodes(graph, "parameters", parameter_node_ids); - print_nodes(graph, "constant", constant_node_ids); - print_nodes(graph, "output", output_node_ids); - - std::vector unmatched_node_ids = get_unmatched_node_ids(boost_graph, subgraph_matches); - print_nodes(graph, "unmatched", unmatched_node_ids); -} - - -bool can_subgraph_be_looped(Graph *graph, graph_type& subgraph, const SubgraphPatternMatchMappings& subgraph_matches) { - - // Condition that must be satisfied for subgraph to be looped: - // - from the discovered subgraph, we visit its output nodes (i.e. leaf nodes). Each output node - // must belong in the input_activation_node_ids of the next match instance. - // - We are required to do a pairwise check for each discovered subgraph pattern to guarantee - // that we can continue the loop iteration. 
- size_t num_pairwise_checks = subgraph_matches.size() - 1; - - for (size_t match_idx = 0; match_idx < num_pairwise_checks; ++match_idx) { - // not expecting a lot of input ids.. leave it as a vector instead of converting to a set - std::vector current_match_output_node_ids = get_output_node_ids(subgraph, subgraph_matches, match_idx); - std::vector next_match_input_node_ids = get_input_activation_node_ids(subgraph, subgraph_matches, match_idx + 1); - - bool can_continue_looping = true; - for (NodeId node_id : next_match_input_node_ids) { - // all operands to the input nodes of the next match should belong in the output node ids of current match - Node* input_node = graph->node_by_id(node_id); - for (Node* operand : graph->data_operands(input_node)) { - // all operands belong to the output-set of current-match - if (operand->node_type() == NodeType::kInput) { - continue; - } - can_continue_looping &= std::find(current_match_output_node_ids.begin(), current_match_output_node_ids.end(), operand->id()) != - current_match_output_node_ids.end(); - } - } - if (not can_continue_looping) { - log_debug(LogPatternMatcher, "current match_idx: {}, output_nodes:", match_idx); - for (NodeId output_id: current_match_output_node_ids ) { - log_debug(LogPatternMatcher, "\t\t {}", graph->node_by_id(output_id)->name()); - } - - log_debug(LogPatternMatcher, "\t next_match_idx input_ids:" ); - for (NodeId input_id : next_match_input_node_ids) { - log_debug(LogPatternMatcher, "\t\t {}", graph->node_by_id(input_id)->name()); - } - return false; - } - } - - return true; -} - -void loop_over_subgraph(Graph *graph, graph_type& subgraph, const SubgraphPatternMatchMappings& subgraph_matches) { - std::vector input_activation_node_ids = get_input_activation_node_ids(subgraph, subgraph_matches); - std::vector output_node_ids = get_output_node_ids(subgraph, subgraph_matches); - - // For now only handle single output - TT_ASSERT(input_activation_node_ids.size() == 1, "PatternMatcher only supports single output for now."); - TT_ASSERT(output_node_ids.size() == 1, "PatternMatcher only supports single output for now."); - - std::vector final_match_output_node_ids = get_output_node_ids(subgraph, subgraph_matches, subgraph_matches.size() - 1); - TT_ASSERT(final_match_output_node_ids.size() == 1, "PatternMatcher only supports single output for now."); - Node* final_match_output_node = graph->node_by_id(final_match_output_node_ids.at(0)); - - int index = 0; - std::vector outputs_of_final_subgraph_match = graph->data_users(final_match_output_node); - - // connect output node to outputs_of_final_subgraph_match - Node* input_activation_node = graph->node_by_id(input_activation_node_ids.at(index)); - Node* input = graph->data_operands(input_activation_node).at(0); - Node* primary_output = graph->node_by_id(output_node_ids.at(index)); - - - int loop_iterations = subgraph_matches.size(); - std::unordered_map> parameter_to_matched_parameters; - std::unordered_set nodes_processed_in_loop; - - for (int match_idx = 0; match_idx < loop_iterations; ++match_idx) { - for (const auto& [primary_node_id, matched_node_id] : subgraph_matches[match_idx]) { - Node* primary = graph->node_by_id(primary_node_id); - Node* matched = graph->node_by_id(matched_node_id); - - if (primary->node_type() == NodeType::kInput and primary->as()->is_parameter()) { - parameter_to_matched_parameters[primary->name()].push_back(matched->name()); - } - - if (match_idx == 0) { - nodes_processed_in_loop.insert(primary->id()); - - } else { - // skip match_idx = 0 because we 
only want to delete matches [1, n) - if (subgraph_matches[match_idx].find(matched_node_id) == subgraph_matches[match_idx].end()) { - // make sure output of primaries aren't deleted - graph->remove_node(matched_node_id); - } else { - for (auto edge : graph->user_edges(matched)) { - graph->remove_edge(edge); - } - } - } - } - } - - for (const auto& [parameter, mapped_parameters] : parameter_to_matched_parameters) { - std::stringstream ss; - for (const auto& mapped_parameter : mapped_parameters) { - ss << mapped_parameter << ", "; - } - log_info(LogPatternMatcher, "Recording Parameter Mapping: {}->[{}]", parameter, ss.str()); - } - - for (auto output : outputs_of_final_subgraph_match) { - log_info(LogPatternMatcher, "Final subgraph match output users: {}", output->name()); - graph->add_edge(primary_output, output); - } - - Edge control_loop_edge(primary_output->id(), 0, input->id(), 0 /* consumer_input_port_id */, EdgeType::kControlLoop); - std::shared_ptr loop_attributes = std::make_shared( - EdgeType::kControlLoop, - LoopEdgeAttributes::LoopEdgeAttributesInternal{ - .loop_iterations_ = loop_iterations, - .parameter_to_matched_parameters_ = parameter_to_matched_parameters, - .nodes_processed_in_loop_= nodes_processed_in_loop - }); - graph->add_edge(control_loop_edge, loop_attributes); -} - -std::pair lower_pybuda_to_pattern_matcher(Graph* graph, int num_matches_to_search) { - graph_type boost_graph = convert_graph_to_boost_graph(graph); - graph_type subgraph_pattern = discover_largest_subgraph_pattern(boost_graph, num_matches_to_search); - - // save_dotgraph_to_ostream(std::cout, boost_graph); - - bool is_subgraph_pattern_found = boost::num_vertices(subgraph_pattern) > 0; - bool is_subgraph_loopable = false; - - SubgraphPatternMatchMappings subgraph_matches = subgraph_pattern_match(subgraph_pattern, boost_graph); - if (is_subgraph_pattern_found) { - print_subgraph_pattern_matches(graph, boost_graph, subgraph_pattern, subgraph_matches); - - is_subgraph_loopable = can_subgraph_be_looped(graph, subgraph_pattern, subgraph_matches); - if (is_subgraph_loopable) { - loop_over_subgraph(graph, subgraph_pattern, subgraph_matches); - } - } - - - log_info(LogPatternMatcher, "Subgraph pattern is found: {}", (is_subgraph_pattern_found ? "YES" : "NO")); - log_info(LogPatternMatcher, "Subgraph can be looped: {}", (is_subgraph_loopable ? "YES" : "NO")); - - return { - graph, - MatchResult{ - .is_subgraph_pattern_found=is_subgraph_pattern_found, - .is_subgraph_loopable=is_subgraph_loopable, - .subgraph_matches=subgraph_matches - } - }; -} - -MatchResult lower_json_to_pattern_matcher(json graph, int num_matches_to_search) { - graph_type boost_graph = convert_json_graph_to_boost_graph(graph); - graph_type subgraph_pattern = discover_largest_subgraph_pattern(boost_graph, num_matches_to_search); - - //save_dotgraph_to_ostream(std::cout, boost_graph); - //save_dotgraph_to_file("input_graph.txt", boost_graph); - - bool is_subgraph_pattern_found = boost::num_vertices(subgraph_pattern) > 0; - - SubgraphPatternMatchMappings subgraph_matches = subgraph_pattern_match(subgraph_pattern, boost_graph); - print_subgraph_pattern_matches(graph, boost_graph, subgraph_pattern, subgraph_matches); - - log_info(LogPatternMatcher, "Subgraph pattern is found: {}", (is_subgraph_pattern_found ? 
"YES" : "NO")); - - return - MatchResult{ - .is_subgraph_pattern_found=is_subgraph_pattern_found, - .is_subgraph_loopable=false, - .subgraph_matches=subgraph_matches - }; -} - -} // namespace pattern_matcher diff --git a/pybuda/csrc/pattern_matcher/boost_lowering.hpp b/pybuda/csrc/pattern_matcher/boost_lowering.hpp deleted file mode 100644 index c63aa36cd..000000000 --- a/pybuda/csrc/pattern_matcher/boost_lowering.hpp +++ /dev/null @@ -1,32 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include - -#include "third_party/json/json.hpp" -#include "pattern_matcher.hpp" - -using json = nlohmann::json; -// fwd declare -namespace tt::graphlib { - class Graph; -} - -using Graph = tt::graphlib::Graph; - -namespace pattern_matcher { - -struct MatchResult { - bool is_subgraph_pattern_found; - bool is_subgraph_loopable; - SubgraphPatternMatchMappings subgraph_matches; -}; - -// 1. Discover the largest subgraph in the graph containing exactly `num_matches_to_search` -// 2. If we can "roll" the subgraph, we compact the graph with a subgraph + loops. -std::pair lower_pybuda_to_pattern_matcher(Graph* graph, int num_matches_to_search); -MatchResult lower_json_to_pattern_matcher(json json_graph, int num_matches_to_search); - -} // namespace pattern_matcher diff --git a/pybuda/csrc/pattern_matcher/module.mk b/pybuda/csrc/pattern_matcher/module.mk deleted file mode 100644 index 384cb9f8e..000000000 --- a/pybuda/csrc/pattern_matcher/module.mk +++ /dev/null @@ -1,31 +0,0 @@ -# Every variable in subdir must be prefixed with subdir (emulating a namespace) - -PYBUDA_CSRC_PATTERN_MATCHER_LIB = $(LIBDIR)/libpattern_matcher.a -PYBUDA_CSRC_PATTERN_MATCHER_SRCS = \ - pybuda/csrc/pattern_matcher/pattern_matcher.cpp \ - pybuda/csrc/pattern_matcher/boost_lowering.cpp \ - pybuda/csrc/pattern_matcher/python_bindings.cpp - -PYBUDA_CSRC_PATTERN_MATCHER_INCLUDES = $(PYBUDA_CSRC_INCLUDES) - -PYBUDA_CSRC_PATTERN_MATCHER_OBJS = $(addprefix $(OBJDIR)/, $(PYBUDA_CSRC_PATTERN_MATCHER_SRCS:.cpp=.o)) -PYBUDA_CSRC_PATTERN_MATCHER_DEPS = $(addprefix $(OBJDIR)/, $(PYBUDA_CSRC_PATTERN_MATCHER_SRCS:.cpp=.d)) - - - --include $(PYBUDA_CSRC_PATTERN_MATCHER_DEPS) - -PATTERN_MATCHER_CSRC_CFLAGS = $(PYBUDA_CSRC_CFLAGS) - -# Each module has a top level target as the entrypoint which must match the subdir name -pybuda/csrc/pattern_matcher: $(PYBUDA_CSRC_PATTERN_MATCHER_LIB) - -$(PYBUDA_CSRC_PATTERN_MATCHER_LIB): $(PYBUDA_CSRC_GRAPH_LIB) $(PYBUDA_CSRC_PATTERN_MATCHER_OBJS) - @mkdir -p $(LIBDIR) - ar rcs $@ $^ - -$(OBJDIR)/pybuda/csrc/pattern_matcher/%.o: pybuda/csrc/pattern_matcher/%.cpp - @mkdir -p $(@D) - $(CXX) $(PATTERN_MATCHER_CSRC_CFLAGS) $(CXXFLAGS) $(STATIC_LIB_FLAGS) $(PYBUDA_CSRC_PATTERN_MATCHER_INCLUDES) -c -o $@ $< - -include pybuda/csrc/pattern_matcher/tests/module.mk diff --git a/pybuda/csrc/pattern_matcher/pattern_matcher.cpp b/pybuda/csrc/pattern_matcher/pattern_matcher.cpp deleted file mode 100644 index ecaf37d52..000000000 --- a/pybuda/csrc/pattern_matcher/pattern_matcher.cpp +++ /dev/null @@ -1,435 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "pattern_matcher/pattern_matcher.hpp" -#include "utils/logger.hpp" - -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include "fmt/core.h" - -using std::string; - -using tt::LogPatternMatcher; - -namespace pattern_matcher { - -int num_subgraph_pattern_matches(graph_type& 
small_graph, graph_type& large_graph, int max_matches = INT_MAX) { - int total_matches = 0; - - std::unordered_map> unique_matches; - - auto callback = [&](auto bijection, auto) { - for (auto v : boost::make_iterator_range(vertices(small_graph))) { - NodeId matched_id = large_graph[get(bijection, v)].node_id; - if (unique_matches.find(matched_id) == unique_matches.end()) { - unique_matches[small_graph[v].node_id].insert(matched_id); - } else { - return false; - } - } - - total_matches += 1; - if (total_matches > max_matches) { - return false; - } - return true; - }; - - auto edge_predicate = [&](auto edge_a, auto edge_b) { - return small_graph[edge_a].producer_output_edge_index == large_graph[edge_b].producer_output_edge_index - and small_graph[edge_a].consumer_input_edge_index == large_graph[edge_b].consumer_input_edge_index ; - }; - auto vertex_predicate = [&](auto vertex_a, auto vertex_b) { - if (small_graph[vertex_a].op_type == "*") { - return true; - } - - return small_graph[vertex_a].op_type == large_graph[vertex_b].op_type; - }; - boost::vf2_subgraph_iso( - small_graph, - large_graph, - callback, - boost::vertex_order_by_mult(small_graph), - boost::edges_equivalent(edge_predicate).vertices_equivalent(vertex_predicate)); - - return total_matches; -} - - -struct EdgePredicate { - EdgePredicate() = default; - EdgePredicate(graph_type* graph, VertexId valid_start, VertexId valid_end) - : graph_(graph), valid_start_(valid_start), valid_end_(valid_end) {} - - template - bool operator()(const Edge& edge) const { - // Include this edge iff both endpoints belong in the valid range - - auto source_vertex = source(edge, *graph_); - if (source_vertex < valid_start_ or source_vertex > valid_end_) { - return false; - } - - auto target_vertex = target(edge, *graph_); - if (target_vertex < valid_start_ or target_vertex > valid_end_) { - return false; - } - - return true; - } - - graph_type* graph_; - VertexId valid_start_; - VertexId valid_end_; -}; - -struct VertexPredicate { - VertexPredicate() = default; - VertexPredicate(graph_type* graph, VertexId valid_start, VertexId valid_end) - : graph_(graph), valid_start_(valid_start), valid_end_(valid_end) {} - - template - bool operator()(const VertexId& v) const { - if (v < valid_start_) return false; - if (v > valid_end_) return false; - - // Include this vertex iff it has at least one connection to a vertex in - // the allowable set. - for (auto in_edge : make_iterator_range(in_edges(v, *graph_))) { - VertexId source_vertex = source(in_edge, *graph_); - if (source_vertex >= valid_start_ and source_vertex <= valid_end_) { - return true; - } - } - - for (auto out_edge : make_iterator_range(out_edges(v, *graph_))) { - VertexId target_vertex = target(out_edge, *graph_); - if (target_vertex >= valid_start_ and target_vertex <= valid_end_) { - return true; - } - } - - return false; - } - - graph_type* graph_; - VertexId valid_start_; - VertexId valid_end_; -}; - - -graph_type generate_pattern_subgraph(graph_type& graph, VertexId start, VertexId end) -{ - using filtered_graph_type = boost::filtered_graph; - EdgePredicate edge_pred = EdgePredicate(&graph, start, end); - VertexPredicate vert_pred = VertexPredicate(&graph, start, end); - filtered_graph_type filtered_graph = boost::filtered_graph(graph, edge_pred, vert_pred); - - // Bad design of boost::filtered_graph.. 
we can't use it directly in vf2 - // We can incrementally build pattern graph for speedup if this ends up being a bottleneck - graph_type pattern_graph; - boost::copy_graph(filtered_graph, pattern_graph); - return pattern_graph; -} - -int get_max_vertices_to_include(const graph_type& graph, int num_expected_matches) -{ - // heuristic to limit the subgraphs generated. Given N-expected matches, - // we roughly know that the pattern subgraph we should search for is roughly in - // num_nodes() / N plus some preamble nodes. - return (num_vertices(graph) / num_expected_matches) + 1; -} -static int get_num_input_nodes(graph_type& graph) { - int num_input_nodes = 0; - - for (auto v : boost::make_iterator_range(vertices(graph))) { - if (graph[v].op_type == "*") { - num_input_nodes++; - } - } - - return num_input_nodes; -} - -std::vector<std::pair<int, int>> get_subgraph_vertex_start_end_pairs(graph_type& graph, int num_expected_matches) { - int window_size = 5; - int max_vertices_to_include = get_max_vertices_to_include(graph, num_expected_matches); - std::vector<std::pair<int, int>> subgraph_vertex_start_end_pairs; - - int first_non_input_vertex_id = get_num_input_nodes(graph); - for (int start_vertex_id = first_non_input_vertex_id; start_vertex_id < first_non_input_vertex_id + window_size; ++start_vertex_id) { - int end_vertex_id_begin = std::max(start_vertex_id, start_vertex_id + max_vertices_to_include - window_size); - int end_vertex_id_end = start_vertex_id + max_vertices_to_include + window_size; - - // NB: The fact that we go in reverse is important. This is because we want to start subgraph matching - // on the largest set of nodes possible before reducing the range. - // TODO(jchu): may need to replace return type with std::set and guarantee largest subgraphs across different - // start vertex ids are tried first. 
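// Worked example (hypothetical sizes): for a graph with ~100 vertices and num_expected_matches = 4,
// max_vertices_to_include is 100 / 4 + 1 = 26. With window_size = 5, each of the first five
// non-input start vertices is paired with end vertices from start + 30 down to start + 21,
// so the largest candidate ranges are generated (and, after sorting, tried) first.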
- for (int end_vertex_id = end_vertex_id_end - 1; end_vertex_id >= end_vertex_id_begin; --end_vertex_id) { - subgraph_vertex_start_end_pairs.emplace_back(start_vertex_id, end_vertex_id); - } - } - auto largest_difference_cmp = [](const std::pair& a, const std::pair& b) { - auto [a_start, a_end] = a; - auto [b_start, b_end] = b; - if ((a_end - a_start) == (b_end - b_start)) { - return a_start < b_start; - } - - return (a_end - a_start) >= (b_end - b_start); - }; - std::sort(subgraph_vertex_start_end_pairs.begin(), subgraph_vertex_start_end_pairs.end(), largest_difference_cmp); - return subgraph_vertex_start_end_pairs; -} - - -bool contains_exactly_n_subgraph_matches(graph_type& graph, int num_expected_matches) { - for (const auto& [start_vertex_id, end_vertex_id] : get_subgraph_vertex_start_end_pairs(graph, num_expected_matches)) { - graph_type subgraph_pattern = generate_pattern_subgraph(graph, start_vertex_id, end_vertex_id); - int matches = num_subgraph_pattern_matches(subgraph_pattern, graph, num_expected_matches); - if (matches == num_expected_matches) { - return true; - } - } - return false; -} - -void save_dotgraph_to_ostream(std::ostream& stream, const graph_type& graph) -{ - VertexPropertyWriter vertex_writer(graph); - write_graphviz(stream, graph, vertex_writer); -} - -void save_dotgraph_to_file(std::string filename, const graph_type& graph) -{ - std::string dot_graph_filename = "compiled_dot_graph.txt"; - std::ofstream odotfile{filename}; - save_dotgraph_to_ostream(odotfile, graph); - odotfile.close(); -} - -void save_graph_to_file(std::string filename, graph_type& graph) -{ - std::ofstream ofile{filename}; - boost::archive::text_oarchive oa{ofile}; - save(oa, graph); - ofile.close(); -} - -graph_type load_graph_from_file(std::string filename) -{ - std::ifstream ifile{filename}; - boost::archive::text_iarchive ia{ifile}; - - graph_type graph; - load(ia, graph); - - VertexPropertyWriter vertex_writer(graph); - return graph; -} - -std::vector get_node_ids(graph_type& graph, const SubgraphPatternMatchMappings& subgraph_matches, int match_index) -{ - assert(match_index < (int)subgraph_matches.size()); - std::vector node_ids; - - for (auto v : boost::make_iterator_range(vertices(graph))) { - NodeId node_id = (match_index == 0) ? graph[v].node_id : subgraph_matches[match_index].at(graph[v].node_id); - node_ids.push_back(node_id); - } - - return node_ids; -} - -std::vector get_input_activation_node_ids(graph_type& graph, const SubgraphPatternMatchMappings& subgraph_matches, int match_index) -{ - assert(match_index < (int)subgraph_matches.size()); - std::vector input_node_ids; - - const std::unordered_set input_type_strings = { - "accumulator", - "loss", - "parameter", - "constant", - "optimizer_parameter", - }; - - for (auto v : boost::make_iterator_range(vertices(graph))) { - const std::string& op_type = graph[v].op_type; - - bool is_input_node = graph[v].op_type == "*"; - bool is_math_op = input_type_strings.find(op_type) == input_type_strings.end(); - bool are_all_operands_inputs = true; - - auto [in_edges_start, in_edges_end] = boost::in_edges(v, graph); - for (; in_edges_start != in_edges_end; ++in_edges_start) { - VertexId operand = boost::source(*in_edges_start, graph); - are_all_operands_inputs &= input_type_strings.find(graph[operand].op_type) != input_type_strings.end(); - if (not are_all_operands_inputs) { - break; - } - } - if (is_input_node or (is_math_op and are_all_operands_inputs)) { - NodeId node_id = (match_index == 0) ? 
graph[v].node_id : subgraph_matches[match_index].at(graph[v].node_id); - input_node_ids.push_back(node_id); - } - } - - return input_node_ids; -} - -std::vector get_parameter_node_ids(graph_type& graph, const SubgraphPatternMatchMappings& subgraph_matches, int match_index) -{ - assert(match_index < (int)subgraph_matches.size()); - std::vector parameter_node_ids; - - for (auto v : boost::make_iterator_range(vertices(graph))) { - if (boost::in_degree(v, graph) == 0 and graph[v].op_type == "parameter") { - NodeId node_id = (match_index == 0) ? graph[v].node_id : subgraph_matches[match_index].at(graph[v].node_id); - parameter_node_ids.push_back(node_id); - } - } - - return parameter_node_ids; -} - -std::vector get_constant_node_ids(graph_type& graph, const SubgraphPatternMatchMappings& subgraph_matches, int match_index) -{ - assert(match_index < (int)subgraph_matches.size()); - std::vector constant_node_ids; - - for (auto v : boost::make_iterator_range(vertices(graph))) { - if (boost::in_degree(v, graph) == 0 and graph[v].op_type == "constant") { - NodeId node_id = (match_index == 0) ? graph[v].node_id : subgraph_matches[match_index].at(graph[v].node_id); - constant_node_ids.push_back(node_id); - } - } - - return constant_node_ids; -} - -std::vector get_output_node_ids(graph_type& graph, const SubgraphPatternMatchMappings& subgraph_matches, int match_index) -{ - assert(match_index < (int)subgraph_matches.size()); - std::vector output_node_ids; - for (auto v : boost::make_iterator_range(vertices(graph))) { - if (boost::out_degree(v, graph) == 0) { - NodeId node_id = (match_index == 0) ? graph[v].node_id : subgraph_matches[match_index].at(graph[v].node_id); - output_node_ids.push_back(node_id); - } - } - - return output_node_ids; -} - -std::vector get_unmatched_node_ids(graph_type& graph, const SubgraphPatternMatchMappings& subgraph_matches) -{ - std::vector unmatched_node_ids; - - for (auto v : boost::make_iterator_range(vertices(graph))) { - bool matched = false; - for (size_t match_index = 0; match_index < subgraph_matches.size(); match_index++) { - for (auto matches : subgraph_matches[match_index]) { - if ((matches.first == graph[v].node_id) || (matches.second == graph[v].node_id)) { - matched = true; - break; - } - } - if (matched) break; - } - if (!matched) { - unmatched_node_ids.push_back(graph[v].node_id); - } - } - - return unmatched_node_ids; -} - - -graph_type discover_largest_subgraph_pattern(graph_type& graph, int num_expected_matches) -{ - log_debug(LogPatternMatcher, "discover_largest_subgraph_pattern(num_expected_matches={}).", num_expected_matches); - graph_type best_subgraph_pattern; - int max_num_subgraph_vertices = 0; - bool is_subgraph_match_found = false; - - for (const auto& [start_vertex_id, end_vertex_id] : get_subgraph_vertex_start_end_pairs(graph, num_expected_matches)) { - log_trace(LogPatternMatcher, "start_vertex: {}, end_vertex: {}", start_vertex_id, end_vertex_id); - graph_type subgraph_pattern = generate_pattern_subgraph(graph, start_vertex_id, end_vertex_id); - //save_dotgraph_to_file(fmt::format("subgraph_pattern_{}_{}.txt", start_vertex_id, end_vertex_id), subgraph_pattern); - int num_subgraph_vertices = boost::num_vertices(subgraph_pattern); - - if (is_subgraph_match_found and num_subgraph_vertices <= max_num_subgraph_vertices) { - // we have already found a match with a larger subgraph pattern - continue; - } - - int matches = num_subgraph_pattern_matches(subgraph_pattern, graph, num_expected_matches); - if (matches == num_expected_matches) { - 
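// Record this candidate as the best pattern so far; only a strictly larger candidate can replace
// it later, since smaller or equal-sized candidates are skipped by the num_subgraph_vertices
// check above.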
log_debug(LogPatternMatcher, "\tstart_vertex: {}, end_vertex: {}: Found exactly {} matches.", start_vertex_id, end_vertex_id, num_expected_matches); - max_num_subgraph_vertices = num_subgraph_vertices; - is_subgraph_match_found = true; - best_subgraph_pattern = subgraph_pattern; - } - } - return best_subgraph_pattern; -} - -SubgraphPatternMatchMappings subgraph_pattern_match(graph_type& subgraph, graph_type& graph) -{ - int num_subgraph_vertices = boost::num_vertices(subgraph); - log_info(LogPatternMatcher, "SubgraphPatternMatch found subgraph of {} nodes.", num_subgraph_vertices); - - SubgraphPatternMatchMappings subgraph_match_mappings; - - int total_matches = 0; - auto callback = [&](auto bijection, auto) { - SubgraphPatternMatch subgraph_matches; - subgraph_matches.reserve(num_subgraph_vertices); - - for (auto v : boost::make_iterator_range(vertices(subgraph))) { - subgraph_matches[subgraph[v].node_id] = graph[get(bijection, v)].node_id; - } - subgraph_match_mappings.emplace_back(std::move(subgraph_matches)); - total_matches += 1; - return true; - }; - - auto edge_predicate = [&](auto edge_a, auto edge_b) { - return subgraph[edge_a].producer_output_edge_index == graph[edge_b].producer_output_edge_index - and subgraph[edge_a].consumer_input_edge_index == graph[edge_b].consumer_input_edge_index; - }; - auto vertex_predicate = [&](auto vertex_a, auto vertex_b) { - if (subgraph[vertex_a].op_type == "*") { - return true; - } - return subgraph[vertex_a].op_type == graph[vertex_b].op_type; - }; - - boost::vf2_subgraph_iso( - subgraph, - graph, - callback, - boost::vertex_order_by_mult(subgraph), - boost::edges_equivalent(edge_predicate).vertices_equivalent(vertex_predicate)); - - log_info(LogPatternMatcher, "SubgraphPatternMatch Finished: Recorded {} matches.", total_matches); - - return subgraph_match_mappings; -} - -} // namespace pattern_matcher diff --git a/pybuda/csrc/pattern_matcher/pattern_matcher.hpp b/pybuda/csrc/pattern_matcher/pattern_matcher.hpp deleted file mode 100644 index 566a8faab..000000000 --- a/pybuda/csrc/pattern_matcher/pattern_matcher.hpp +++ /dev/null @@ -1,122 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "graph_lib/defines.hpp" - -#include -#include -#include -#include - -#include -#include -#include - -using tt::graphlib::NodeId; - -namespace pattern_matcher { - -/* - ____ _ ____ _ _ - | _ \ __ _| |_ __ _ / ___|| |_ _ __ _ _ ___| |_ _ _ _ __ ___ ___ - | | | |/ _` | __/ _` | \___ \| __| '__| | | |/ __| __| | | | '__/ _ \/ __| - | |_| | (_| | || (_| | ___) | |_| | | |_| | (__| |_| |_| | | | __/\__ \ - |____/ \__,_|\__\__,_| |____/ \__|_| \__,_|\___|\__|\__,_|_| \___||___/ - -*/ - -struct VertexProperty { - std::string name; - std::string op_type; - NodeId node_id; - - template - void serialize(Archive& ar, const unsigned int version) - { - (void)version; - ar & name; - ar & op_type; - ar & node_id; - } -}; - -struct EdgeProperty { - - // tag the producer_output_edge_index so that we don't get permuted user edge mappings - int producer_output_edge_index; - int consumer_input_edge_index = 0; - - template - void serialize(Archive& ar, const unsigned int version) - { - (void)version; - ar & producer_output_edge_index; - } -}; - -template -class VertexPropertyWriter { -public: - VertexPropertyWriter(Name _name) : name(_name) {} - template - void operator()(std::ostream& out, const VertexOrEdge& v) const - { - out << "[label=\"" << name[v].name << "\nid:" << std::to_string(v) << "\"]"; - } 
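// `name` is the graph passed to the constructor; name[v].name supplies the label text written above.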
-private: - Name name; -}; - -typedef boost::adjacency_list< boost::setS, boost::vecS, boost::bidirectionalS, VertexProperty, EdgeProperty> graph_type; -typedef boost::graph_traits::vertex_descriptor VertexId; -typedef boost::graph_traits::edge_descriptor EdgeId; - - -// keys := set of nodes corresponding to subgraph pattern -// values := ordered list of matches -using SubgraphPatternMatch = std::unordered_map; -using SubgraphPatternMatchMappings = std::vector>; - - - -/* - _ ____ ___ - / \ | _ \_ _|___ - / _ \ | |_) | |/ __| - / ___ \| __/| |\__ \ -/_/ \_\_| |___|___/ -*/ - - -// Utility Methods -void save_dotgraph_to_ostream(std::ostream& stream, const graph_type& graph); -void save_dotgraph_to_file(std::string filename, const graph_type& graph); -void save_graph_to_file(std::string filename, graph_type& graph); -graph_type load_graph_from_file(std::string filename); - -// Helper query methods -int num_subgraph_pattern_matches(graph_type& subgraph, graph_type& graph, int num_matches); -bool contains_exactly_n_subgraph_matches(graph_type& graph, int num_matches); - -// -// Main Subgraph Pattern Matcher APIs -// -std::vector get_node_ids(graph_type& graph, const SubgraphPatternMatchMappings& subgraph_matches, int match_index = 0); -std::vector get_input_activation_node_ids(graph_type& graph, const SubgraphPatternMatchMappings& subgraph_matches, int match_index = 0); -std::vector get_parameter_node_ids(graph_type& graph, const SubgraphPatternMatchMappings& subgraph_matches, int match_index = 0); -std::vector get_constant_node_ids(graph_type& graph, const SubgraphPatternMatchMappings& subgraph_matches, int match_index = 0); -std::vector get_output_node_ids(graph_type& graph, const SubgraphPatternMatchMappings& subgraph_matches, int match_index = 0); -std::vector get_unmatched_node_ids(graph_type& graph, const SubgraphPatternMatchMappings& subgraph_matches); - -// Given a graph, return the largest discovered subgraph which yields exactly -// `num_expected_matches` instances in the input `graph`. -// If subgraph was not discovered, return empty graph. -graph_type discover_largest_subgraph_pattern(graph_type& graph, int num_expected_matches); - -// Given a subgraph and a graph, return a BUDA-graph NodeId mapping between -// the subgraph and all instances in the graph. 
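// Shape of the result (hypothetical ids): if the pattern has nodes {a, b, c} and three instances
// are found, the returned vector holds three maps such as
//   { a -> a0, b -> b0, c -> c0 }, { a -> a1, b -> b1, c -> c1 }, { a -> a2, b -> b2, c -> c2 },
// each sending a pattern NodeId to the NodeId it matched in `graph`.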
-SubgraphPatternMatchMappings subgraph_pattern_match(graph_type& subgraph, graph_type& graph); - -} // namespace pattern_matcher diff --git a/pybuda/csrc/pattern_matcher/python_bindings.cpp b/pybuda/csrc/pattern_matcher/python_bindings.cpp deleted file mode 100644 index 88133abc4..000000000 --- a/pybuda/csrc/pattern_matcher/python_bindings.cpp +++ /dev/null @@ -1,25 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "pattern_matcher/python_bindings.hpp" -#include "pattern_matcher/boost_lowering.hpp" - -#include "graph_lib/graph.hpp" -#include "pybind11_json.hpp" - -namespace tt { - -void PatternMatcherModule(py::module &m_pattern_matcher) { - using namespace pattern_matcher; - - py::class_(m_pattern_matcher, "MatchResult") - .def_readwrite("is_subgraph_pattern_found", &MatchResult::is_subgraph_pattern_found) - .def_readwrite("is_subgraph_loopable", &MatchResult::is_subgraph_loopable) - .def_readwrite("subgraph_matches", &MatchResult::subgraph_matches); - - m_pattern_matcher.def("lower_pybuda_to_pattern_matcher", &lower_pybuda_to_pattern_matcher); - m_pattern_matcher.def("lower_json_to_pattern_matcher", &lower_json_to_pattern_matcher); -} - -} // namespace tt - diff --git a/pybuda/csrc/pattern_matcher/python_bindings.hpp b/pybuda/csrc/pattern_matcher/python_bindings.hpp deleted file mode 100644 index 987cdbfbb..000000000 --- a/pybuda/csrc/pattern_matcher/python_bindings.hpp +++ /dev/null @@ -1,16 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include -#include -namespace py = pybind11; - -namespace tt { - -void PatternMatcherModule(py::module &m_pattern_matcher); - -} // namespace tt - diff --git a/pybuda/csrc/pattern_matcher/tests/boost_test_graphs/12encoder_boost_graph.txt b/pybuda/csrc/pattern_matcher/tests/boost_test_graphs/12encoder_boost_graph.txt deleted file mode 100644 index 99590b9fe..000000000 --- a/pybuda/csrc/pattern_matcher/tests/boost_test_graphs/12encoder_boost_graph.txt +++ /dev/null @@ -1 +0,0 @@ -22 serialization::archive 17 0 0 998 1105 0 0 19 input_hidden_states 5 input 1 9 reshape_1 7 reshape 2 7 const_0 5 input 3 13 buda.matmul_3 6 matmul 4 9 reshape_4 7 reshape 5 7 const_1 5 input 6 5 add_6 3 add 7 13 buda.hslice_7 6 hslice 8 9 reshape_8 7 reshape 9 7 const_2 5 input 10 14 buda.matmul_10 6 matmul 11 10 reshape_11 7 reshape 12 7 const_3 5 input 13 6 add_13 3 add 14 14 buda.hslice_14 6 hslice 15 10 reshape_15 7 reshape 16 12 transpose_16 9 transpose 17 18 nn.batch_matmul_17 6 matmul 18 10 reshape_18 7 reshape 19 16 constant_const_4 8 constant 20 11 multiply_20 8 multiply 21 14 softmax_exp_21 3 exp 23 14 softmax_sum_21 10 reduce_sum 25 16 softmax_recip_21 10 reciprocal 24 15 softmax_mult_21 8 multiply 22 10 reshape_22 7 reshape 26 7 const_5 5 input 27 14 buda.matmul_24 6 matmul 28 10 reshape_25 7 reshape 29 7 const_6 5 input 30 6 add_27 3 add 31 14 buda.hslice_28 6 hslice 32 12 transpose_29 9 transpose 33 10 reshape_30 7 reshape 34 12 transpose_31 9 transpose 35 18 nn.batch_matmul_32 6 matmul 36 10 reshape_33 7 reshape 37 14 buda.hstack_34 6 hstack 38 7 const_7 5 input 39 14 buda.matmul_36 6 matmul 40 10 reshape_37 7 reshape 41 7 const_8 5 input 42 6 add_39 3 add 43 6 add_40 3 add 44 7 const_9 5 input 45 8 const_10 5 input 46 36 constant_1_layernorm_var_plus_eps_43 8 constant 56 17 layernorm_mean_43 10 reduce_avg 52 16 layernorm_sub_43 8 subtract 50 15 layernorm_sq_43 8 multiply 57 16 layernorm_var_43 10 reduce_avg 55 25 
layernorm_var_plus_eps_43 3 add 54 17 layernorm_sqrt_43 4 sqrt 53 18 layernorm_recip_43 10 reciprocal 51 19 layernorm_output_43 8 multiply 49 20 layernorm_weights_43 8 multiply 48 17 layernorm_bias_43 3 add 47 10 reshape_44 7 reshape 58 8 const_11 5 input 59 14 buda.matmul_46 6 matmul 60 10 reshape_47 7 reshape 61 8 const_12 5 input 62 6 add_49 3 add 63 7 gelu_50 4 gelu 64 10 reshape_51 7 reshape 65 8 const_13 5 input 66 14 buda.matmul_53 6 matmul 67 10 reshape_54 7 reshape 68 8 const_14 5 input 69 6 add_56 3 add 70 6 add_57 3 add 71 8 const_15 5 input 72 8 const_16 5 input 73 36 constant_1_layernorm_var_plus_eps_60 8 constant 83 17 layernorm_mean_60 10 reduce_avg 79 16 layernorm_sub_60 8 subtract 77 15 layernorm_sq_60 8 multiply 84 16 layernorm_var_60 10 reduce_avg 82 25 layernorm_var_plus_eps_60 3 add 81 17 layernorm_sqrt_60 4 sqrt 80 18 layernorm_recip_60 10 reciprocal 78 19 layernorm_output_60 8 multiply 76 20 layernorm_weights_60 8 multiply 75 17 layernorm_bias_60 3 add 74 10 reshape_61 7 reshape 85 8 const_17 5 input 86 14 buda.matmul_63 6 matmul 87 10 reshape_64 7 reshape 88 8 const_18 5 input 89 6 add_66 3 add 90 14 buda.hslice_67 6 hslice 91 10 reshape_68 7 reshape 92 8 const_19 5 input 93 14 buda.matmul_70 6 matmul 94 10 reshape_71 7 reshape 95 8 const_20 5 input 96 6 add_73 3 add 97 14 buda.hslice_74 6 hslice 98 10 reshape_75 7 reshape 99 12 transpose_76 9 transpose 100 18 nn.batch_matmul_77 6 matmul 101 10 reshape_78 7 reshape 102 17 constant_const_21 8 constant 103 11 multiply_80 8 multiply 104 14 softmax_exp_81 3 exp 106 14 softmax_sum_81 10 reduce_sum 108 16 softmax_recip_81 10 reciprocal 107 15 softmax_mult_81 8 multiply 105 10 reshape_82 7 reshape 109 8 const_22 5 input 110 14 buda.matmul_84 6 matmul 111 10 reshape_85 7 reshape 112 8 const_23 5 input 113 6 add_87 3 add 114 14 buda.hslice_88 6 hslice 115 12 transpose_89 9 transpose 116 10 reshape_90 7 reshape 117 12 transpose_91 9 transpose 118 18 nn.batch_matmul_92 6 matmul 119 10 reshape_93 7 reshape 120 14 buda.hstack_94 6 hstack 121 8 const_24 5 input 122 14 buda.matmul_96 6 matmul 123 10 reshape_97 7 reshape 124 8 const_25 5 input 125 6 add_99 3 add 126 7 add_100 3 add 127 8 const_26 5 input 128 8 const_27 5 input 129 37 constant_1_layernorm_var_plus_eps_103 8 constant 139 18 layernorm_mean_103 10 reduce_avg 135 17 layernorm_sub_103 8 subtract 133 16 layernorm_sq_103 8 multiply 140 17 layernorm_var_103 10 reduce_avg 138 26 layernorm_var_plus_eps_103 3 add 137 18 layernorm_sqrt_103 4 sqrt 136 19 layernorm_recip_103 10 reciprocal 134 20 layernorm_output_103 8 multiply 132 21 layernorm_weights_103 8 multiply 131 18 layernorm_bias_103 3 add 130 11 reshape_104 7 reshape 141 8 const_28 5 input 142 15 buda.matmul_106 6 matmul 143 11 reshape_107 7 reshape 144 8 const_29 5 input 145 7 add_109 3 add 146 8 gelu_110 4 gelu 147 11 reshape_111 7 reshape 148 8 const_30 5 input 149 15 buda.matmul_113 6 matmul 150 11 reshape_114 7 reshape 151 8 const_31 5 input 152 7 add_116 3 add 153 7 add_117 3 add 154 8 const_32 5 input 155 8 const_33 5 input 156 37 constant_1_layernorm_var_plus_eps_120 8 constant 166 18 layernorm_mean_120 10 reduce_avg 162 17 layernorm_sub_120 8 subtract 160 16 layernorm_sq_120 8 multiply 167 17 layernorm_var_120 10 reduce_avg 165 26 layernorm_var_plus_eps_120 3 add 164 18 layernorm_sqrt_120 4 sqrt 163 19 layernorm_recip_120 10 reciprocal 161 20 layernorm_output_120 8 multiply 159 21 layernorm_weights_120 8 multiply 158 18 layernorm_bias_120 3 add 157 11 reshape_121 7 reshape 168 8 const_34 5 input 169 15 
buda.matmul_123 6 matmul 170 11 reshape_124 7 reshape 171 8 const_35 5 input 172 7 add_126 3 add 173 15 buda.hslice_127 6 hslice 174 11 reshape_128 7 reshape 175 8 const_36 5 input 176 15 buda.matmul_130 6 matmul 177 11 reshape_131 7 reshape 178 8 const_37 5 input 179 7 add_133 3 add 180 15 buda.hslice_134 6 hslice 181 11 reshape_135 7 reshape 182 13 transpose_136 9 transpose 183 19 nn.batch_matmul_137 6 matmul 184 11 reshape_138 7 reshape 185 17 constant_const_38 8 constant 186 12 multiply_140 8 multiply 187 15 softmax_exp_141 3 exp 189 15 softmax_sum_141 10 reduce_sum 191 17 softmax_recip_141 10 reciprocal 190 16 softmax_mult_141 8 multiply 188 11 reshape_142 7 reshape 192 8 const_39 5 input 193 15 buda.matmul_144 6 matmul 194 11 reshape_145 7 reshape 195 8 const_40 5 input 196 7 add_147 3 add 197 15 buda.hslice_148 6 hslice 198 13 transpose_149 9 transpose 199 11 reshape_150 7 reshape 200 13 transpose_151 9 transpose 201 19 nn.batch_matmul_152 6 matmul 202 11 reshape_153 7 reshape 203 15 buda.hstack_154 6 hstack 204 8 const_41 5 input 205 15 buda.matmul_156 6 matmul 206 11 reshape_157 7 reshape 207 8 const_42 5 input 208 7 add_159 3 add 209 7 add_160 3 add 210 8 const_43 5 input 211 8 const_44 5 input 212 37 constant_1_layernorm_var_plus_eps_163 8 constant 222 18 layernorm_mean_163 10 reduce_avg 218 17 layernorm_sub_163 8 subtract 216 16 layernorm_sq_163 8 multiply 223 17 layernorm_var_163 10 reduce_avg 221 26 layernorm_var_plus_eps_163 3 add 220 18 layernorm_sqrt_163 4 sqrt 219 19 layernorm_recip_163 10 reciprocal 217 20 layernorm_output_163 8 multiply 215 21 layernorm_weights_163 8 multiply 214 18 layernorm_bias_163 3 add 213 11 reshape_164 7 reshape 224 8 const_45 5 input 225 15 buda.matmul_166 6 matmul 226 11 reshape_167 7 reshape 227 8 const_46 5 input 228 7 add_169 3 add 229 8 gelu_170 4 gelu 230 11 reshape_171 7 reshape 231 8 const_47 5 input 232 15 buda.matmul_173 6 matmul 233 11 reshape_174 7 reshape 234 8 const_48 5 input 235 7 add_176 3 add 236 7 add_177 3 add 237 8 const_49 5 input 238 8 const_50 5 input 239 37 constant_1_layernorm_var_plus_eps_180 8 constant 249 18 layernorm_mean_180 10 reduce_avg 245 17 layernorm_sub_180 8 subtract 243 16 layernorm_sq_180 8 multiply 250 17 layernorm_var_180 10 reduce_avg 248 26 layernorm_var_plus_eps_180 3 add 247 18 layernorm_sqrt_180 4 sqrt 246 19 layernorm_recip_180 10 reciprocal 244 20 layernorm_output_180 8 multiply 242 21 layernorm_weights_180 8 multiply 241 18 layernorm_bias_180 3 add 240 11 reshape_181 7 reshape 251 8 const_51 5 input 252 15 buda.matmul_183 6 matmul 253 11 reshape_184 7 reshape 254 8 const_52 5 input 255 7 add_186 3 add 256 15 buda.hslice_187 6 hslice 257 11 reshape_188 7 reshape 258 8 const_53 5 input 259 15 buda.matmul_190 6 matmul 260 11 reshape_191 7 reshape 261 8 const_54 5 input 262 7 add_193 3 add 263 15 buda.hslice_194 6 hslice 264 11 reshape_195 7 reshape 265 13 transpose_196 9 transpose 266 19 nn.batch_matmul_197 6 matmul 267 11 reshape_198 7 reshape 268 17 constant_const_55 8 constant 269 12 multiply_200 8 multiply 270 15 softmax_exp_201 3 exp 272 15 softmax_sum_201 10 reduce_sum 274 17 softmax_recip_201 10 reciprocal 273 16 softmax_mult_201 8 multiply 271 11 reshape_202 7 reshape 275 8 const_56 5 input 276 15 buda.matmul_204 6 matmul 277 11 reshape_205 7 reshape 278 8 const_57 5 input 279 7 add_207 3 add 280 15 buda.hslice_208 6 hslice 281 13 transpose_209 9 transpose 282 11 reshape_210 7 reshape 283 13 transpose_211 9 transpose 284 19 nn.batch_matmul_212 6 matmul 285 11 reshape_213 7 reshape 286 15 
buda.hstack_214 6 hstack 287 8 const_58 5 input 288 15 buda.matmul_216 6 matmul 289 11 reshape_217 7 reshape 290 8 const_59 5 input 291 7 add_219 3 add 292 7 add_220 3 add 293 8 const_60 5 input 294 8 const_61 5 input 295 37 constant_1_layernorm_var_plus_eps_223 8 constant 305 18 layernorm_mean_223 10 reduce_avg 301 17 layernorm_sub_223 8 subtract 299 16 layernorm_sq_223 8 multiply 306 17 layernorm_var_223 10 reduce_avg 304 26 layernorm_var_plus_eps_223 3 add 303 18 layernorm_sqrt_223 4 sqrt 302 19 layernorm_recip_223 10 reciprocal 300 20 layernorm_output_223 8 multiply 298 21 layernorm_weights_223 8 multiply 297 18 layernorm_bias_223 3 add 296 11 reshape_224 7 reshape 307 8 const_62 5 input 308 15 buda.matmul_226 6 matmul 309 11 reshape_227 7 reshape 310 8 const_63 5 input 311 7 add_229 3 add 312 8 gelu_230 4 gelu 313 11 reshape_231 7 reshape 314 8 const_64 5 input 315 15 buda.matmul_233 6 matmul 316 11 reshape_234 7 reshape 317 8 const_65 5 input 318 7 add_236 3 add 319 7 add_237 3 add 320 8 const_66 5 input 321 8 const_67 5 input 322 37 constant_1_layernorm_var_plus_eps_240 8 constant 332 18 layernorm_mean_240 10 reduce_avg 328 17 layernorm_sub_240 8 subtract 326 16 layernorm_sq_240 8 multiply 333 17 layernorm_var_240 10 reduce_avg 331 26 layernorm_var_plus_eps_240 3 add 330 18 layernorm_sqrt_240 4 sqrt 329 19 layernorm_recip_240 10 reciprocal 327 20 layernorm_output_240 8 multiply 325 21 layernorm_weights_240 8 multiply 324 18 layernorm_bias_240 3 add 323 11 reshape_241 7 reshape 334 8 const_68 5 input 335 15 buda.matmul_243 6 matmul 336 11 reshape_244 7 reshape 337 8 const_69 5 input 338 7 add_246 3 add 339 15 buda.hslice_247 6 hslice 340 11 reshape_248 7 reshape 341 8 const_70 5 input 342 15 buda.matmul_250 6 matmul 343 11 reshape_251 7 reshape 344 8 const_71 5 input 345 7 add_253 3 add 346 15 buda.hslice_254 6 hslice 347 11 reshape_255 7 reshape 348 13 transpose_256 9 transpose 349 19 nn.batch_matmul_257 6 matmul 350 11 reshape_258 7 reshape 351 17 constant_const_72 8 constant 352 12 multiply_260 8 multiply 353 15 softmax_exp_261 3 exp 355 15 softmax_sum_261 10 reduce_sum 357 17 softmax_recip_261 10 reciprocal 356 16 softmax_mult_261 8 multiply 354 11 reshape_262 7 reshape 358 8 const_73 5 input 359 15 buda.matmul_264 6 matmul 360 11 reshape_265 7 reshape 361 8 const_74 5 input 362 7 add_267 3 add 363 15 buda.hslice_268 6 hslice 364 13 transpose_269 9 transpose 365 11 reshape_270 7 reshape 366 13 transpose_271 9 transpose 367 19 nn.batch_matmul_272 6 matmul 368 11 reshape_273 7 reshape 369 15 buda.hstack_274 6 hstack 370 8 const_75 5 input 371 15 buda.matmul_276 6 matmul 372 11 reshape_277 7 reshape 373 8 const_76 5 input 374 7 add_279 3 add 375 7 add_280 3 add 376 8 const_77 5 input 377 8 const_78 5 input 378 37 constant_1_layernorm_var_plus_eps_283 8 constant 388 18 layernorm_mean_283 10 reduce_avg 384 17 layernorm_sub_283 8 subtract 382 16 layernorm_sq_283 8 multiply 389 17 layernorm_var_283 10 reduce_avg 387 26 layernorm_var_plus_eps_283 3 add 386 18 layernorm_sqrt_283 4 sqrt 385 19 layernorm_recip_283 10 reciprocal 383 20 layernorm_output_283 8 multiply 381 21 layernorm_weights_283 8 multiply 380 18 layernorm_bias_283 3 add 379 11 reshape_284 7 reshape 390 8 const_79 5 input 391 15 buda.matmul_286 6 matmul 392 11 reshape_287 7 reshape 393 8 const_80 5 input 394 7 add_289 3 add 395 8 gelu_290 4 gelu 396 11 reshape_291 7 reshape 397 8 const_81 5 input 398 15 buda.matmul_293 6 matmul 399 11 reshape_294 7 reshape 400 8 const_82 5 input 401 7 add_296 3 add 402 7 add_297 3 add 403 8 
const_83 5 input 404 8 const_84 5 input 405 37 constant_1_layernorm_var_plus_eps_300 8 constant 415 18 layernorm_mean_300 10 reduce_avg 411 17 layernorm_sub_300 8 subtract 409 16 layernorm_sq_300 8 multiply 416 17 layernorm_var_300 10 reduce_avg 414 26 layernorm_var_plus_eps_300 3 add 413 18 layernorm_sqrt_300 4 sqrt 412 19 layernorm_recip_300 10 reciprocal 410 20 layernorm_output_300 8 multiply 408 21 layernorm_weights_300 8 multiply 407 18 layernorm_bias_300 3 add 406 11 reshape_301 7 reshape 417 8 const_85 5 input 418 15 buda.matmul_303 6 matmul 419 11 reshape_304 7 reshape 420 8 const_86 5 input 421 7 add_306 3 add 422 15 buda.hslice_307 6 hslice 423 11 reshape_308 7 reshape 424 8 const_87 5 input 425 15 buda.matmul_310 6 matmul 426 11 reshape_311 7 reshape 427 8 const_88 5 input 428 7 add_313 3 add 429 15 buda.hslice_314 6 hslice 430 11 reshape_315 7 reshape 431 13 transpose_316 9 transpose 432 19 nn.batch_matmul_317 6 matmul 433 11 reshape_318 7 reshape 434 17 constant_const_89 8 constant 435 12 multiply_320 8 multiply 436 15 softmax_exp_321 3 exp 438 15 softmax_sum_321 10 reduce_sum 440 17 softmax_recip_321 10 reciprocal 439 16 softmax_mult_321 8 multiply 437 11 reshape_322 7 reshape 441 8 const_90 5 input 442 15 buda.matmul_324 6 matmul 443 11 reshape_325 7 reshape 444 8 const_91 5 input 445 7 add_327 3 add 446 15 buda.hslice_328 6 hslice 447 13 transpose_329 9 transpose 448 11 reshape_330 7 reshape 449 13 transpose_331 9 transpose 450 19 nn.batch_matmul_332 6 matmul 451 11 reshape_333 7 reshape 452 15 buda.hstack_334 6 hstack 453 8 const_92 5 input 454 15 buda.matmul_336 6 matmul 455 11 reshape_337 7 reshape 456 8 const_93 5 input 457 7 add_339 3 add 458 7 add_340 3 add 459 8 const_94 5 input 460 8 const_95 5 input 461 37 constant_1_layernorm_var_plus_eps_343 8 constant 471 18 layernorm_mean_343 10 reduce_avg 467 17 layernorm_sub_343 8 subtract 465 16 layernorm_sq_343 8 multiply 472 17 layernorm_var_343 10 reduce_avg 470 26 layernorm_var_plus_eps_343 3 add 469 18 layernorm_sqrt_343 4 sqrt 468 19 layernorm_recip_343 10 reciprocal 466 20 layernorm_output_343 8 multiply 464 21 layernorm_weights_343 8 multiply 463 18 layernorm_bias_343 3 add 462 11 reshape_344 7 reshape 473 8 const_96 5 input 474 15 buda.matmul_346 6 matmul 475 11 reshape_347 7 reshape 476 8 const_97 5 input 477 7 add_349 3 add 478 8 gelu_350 4 gelu 479 11 reshape_351 7 reshape 480 8 const_98 5 input 481 15 buda.matmul_353 6 matmul 482 11 reshape_354 7 reshape 483 8 const_99 5 input 484 7 add_356 3 add 485 7 add_357 3 add 486 9 const_100 5 input 487 9 const_101 5 input 488 37 constant_1_layernorm_var_plus_eps_360 8 constant 498 18 layernorm_mean_360 10 reduce_avg 494 17 layernorm_sub_360 8 subtract 492 16 layernorm_sq_360 8 multiply 499 17 layernorm_var_360 10 reduce_avg 497 26 layernorm_var_plus_eps_360 3 add 496 18 layernorm_sqrt_360 4 sqrt 495 19 layernorm_recip_360 10 reciprocal 493 20 layernorm_output_360 8 multiply 491 21 layernorm_weights_360 8 multiply 490 18 layernorm_bias_360 3 add 489 11 reshape_361 7 reshape 500 9 const_102 5 input 501 15 buda.matmul_363 6 matmul 502 11 reshape_364 7 reshape 503 9 const_103 5 input 504 7 add_366 3 add 505 15 buda.hslice_367 6 hslice 506 11 reshape_368 7 reshape 507 9 const_104 5 input 508 15 buda.matmul_370 6 matmul 509 11 reshape_371 7 reshape 510 9 const_105 5 input 511 7 add_373 3 add 512 15 buda.hslice_374 6 hslice 513 11 reshape_375 7 reshape 514 13 transpose_376 9 transpose 515 19 nn.batch_matmul_377 6 matmul 516 11 reshape_378 7 reshape 517 18 constant_const_106 8 
constant 518 12 multiply_380 8 multiply 519 15 softmax_exp_381 3 exp 521 15 softmax_sum_381 10 reduce_sum 523 17 softmax_recip_381 10 reciprocal 522 16 softmax_mult_381 8 multiply 520 11 reshape_382 7 reshape 524 9 const_107 5 input 525 15 buda.matmul_384 6 matmul 526 11 reshape_385 7 reshape 527 9 const_108 5 input 528 7 add_387 3 add 529 15 buda.hslice_388 6 hslice 530 13 transpose_389 9 transpose 531 11 reshape_390 7 reshape 532 13 transpose_391 9 transpose 533 19 nn.batch_matmul_392 6 matmul 534 11 reshape_393 7 reshape 535 15 buda.hstack_394 6 hstack 536 9 const_109 5 input 537 15 buda.matmul_396 6 matmul 538 11 reshape_397 7 reshape 539 9 const_110 5 input 540 7 add_399 3 add 541 7 add_400 3 add 542 9 const_111 5 input 543 9 const_112 5 input 544 37 constant_1_layernorm_var_plus_eps_403 8 constant 554 18 layernorm_mean_403 10 reduce_avg 550 17 layernorm_sub_403 8 subtract 548 16 layernorm_sq_403 8 multiply 555 17 layernorm_var_403 10 reduce_avg 553 26 layernorm_var_plus_eps_403 3 add 552 18 layernorm_sqrt_403 4 sqrt 551 19 layernorm_recip_403 10 reciprocal 549 20 layernorm_output_403 8 multiply 547 21 layernorm_weights_403 8 multiply 546 18 layernorm_bias_403 3 add 545 11 reshape_404 7 reshape 556 9 const_113 5 input 557 15 buda.matmul_406 6 matmul 558 11 reshape_407 7 reshape 559 9 const_114 5 input 560 7 add_409 3 add 561 8 gelu_410 4 gelu 562 11 reshape_411 7 reshape 563 9 const_115 5 input 564 15 buda.matmul_413 6 matmul 565 11 reshape_414 7 reshape 566 9 const_116 5 input 567 7 add_416 3 add 568 7 add_417 3 add 569 9 const_117 5 input 570 9 const_118 5 input 571 37 constant_1_layernorm_var_plus_eps_420 8 constant 581 18 layernorm_mean_420 10 reduce_avg 577 17 layernorm_sub_420 8 subtract 575 16 layernorm_sq_420 8 multiply 582 17 layernorm_var_420 10 reduce_avg 580 26 layernorm_var_plus_eps_420 3 add 579 18 layernorm_sqrt_420 4 sqrt 578 19 layernorm_recip_420 10 reciprocal 576 20 layernorm_output_420 8 multiply 574 21 layernorm_weights_420 8 multiply 573 18 layernorm_bias_420 3 add 572 11 reshape_421 7 reshape 583 9 const_119 5 input 584 15 buda.matmul_423 6 matmul 585 11 reshape_424 7 reshape 586 9 const_120 5 input 587 7 add_426 3 add 588 15 buda.hslice_427 6 hslice 589 11 reshape_428 7 reshape 590 9 const_121 5 input 591 15 buda.matmul_430 6 matmul 592 11 reshape_431 7 reshape 593 9 const_122 5 input 594 7 add_433 3 add 595 15 buda.hslice_434 6 hslice 596 11 reshape_435 7 reshape 597 13 transpose_436 9 transpose 598 19 nn.batch_matmul_437 6 matmul 599 11 reshape_438 7 reshape 600 18 constant_const_123 8 constant 601 12 multiply_440 8 multiply 602 15 softmax_exp_441 3 exp 604 15 softmax_sum_441 10 reduce_sum 606 17 softmax_recip_441 10 reciprocal 605 16 softmax_mult_441 8 multiply 603 11 reshape_442 7 reshape 607 9 const_124 5 input 608 15 buda.matmul_444 6 matmul 609 11 reshape_445 7 reshape 610 9 const_125 5 input 611 7 add_447 3 add 612 15 buda.hslice_448 6 hslice 613 13 transpose_449 9 transpose 614 11 reshape_450 7 reshape 615 13 transpose_451 9 transpose 616 19 nn.batch_matmul_452 6 matmul 617 11 reshape_453 7 reshape 618 15 buda.hstack_454 6 hstack 619 9 const_126 5 input 620 15 buda.matmul_456 6 matmul 621 11 reshape_457 7 reshape 622 9 const_127 5 input 623 7 add_459 3 add 624 7 add_460 3 add 625 9 const_128 5 input 626 9 const_129 5 input 627 37 constant_1_layernorm_var_plus_eps_463 8 constant 637 18 layernorm_mean_463 10 reduce_avg 633 17 layernorm_sub_463 8 subtract 631 16 layernorm_sq_463 8 multiply 638 17 layernorm_var_463 10 reduce_avg 636 26 
layernorm_var_plus_eps_463 3 add 635 18 layernorm_sqrt_463 4 sqrt 634 19 layernorm_recip_463 10 reciprocal 632 20 layernorm_output_463 8 multiply 630 21 layernorm_weights_463 8 multiply 629 18 layernorm_bias_463 3 add 628 11 reshape_464 7 reshape 639 9 const_130 5 input 640 15 buda.matmul_466 6 matmul 641 11 reshape_467 7 reshape 642 9 const_131 5 input 643 7 add_469 3 add 644 8 gelu_470 4 gelu 645 11 reshape_471 7 reshape 646 9 const_132 5 input 647 15 buda.matmul_473 6 matmul 648 11 reshape_474 7 reshape 649 9 const_133 5 input 650 7 add_476 3 add 651 7 add_477 3 add 652 9 const_134 5 input 653 9 const_135 5 input 654 37 constant_1_layernorm_var_plus_eps_480 8 constant 664 18 layernorm_mean_480 10 reduce_avg 660 17 layernorm_sub_480 8 subtract 658 16 layernorm_sq_480 8 multiply 665 17 layernorm_var_480 10 reduce_avg 663 26 layernorm_var_plus_eps_480 3 add 662 18 layernorm_sqrt_480 4 sqrt 661 19 layernorm_recip_480 10 reciprocal 659 20 layernorm_output_480 8 multiply 657 21 layernorm_weights_480 8 multiply 656 18 layernorm_bias_480 3 add 655 11 reshape_481 7 reshape 666 9 const_136 5 input 667 15 buda.matmul_483 6 matmul 668 11 reshape_484 7 reshape 669 9 const_137 5 input 670 7 add_486 3 add 671 15 buda.hslice_487 6 hslice 672 11 reshape_488 7 reshape 673 9 const_138 5 input 674 15 buda.matmul_490 6 matmul 675 11 reshape_491 7 reshape 676 9 const_139 5 input 677 7 add_493 3 add 678 15 buda.hslice_494 6 hslice 679 11 reshape_495 7 reshape 680 13 transpose_496 9 transpose 681 19 nn.batch_matmul_497 6 matmul 682 11 reshape_498 7 reshape 683 18 constant_const_140 8 constant 684 12 multiply_500 8 multiply 685 15 softmax_exp_501 3 exp 687 15 softmax_sum_501 10 reduce_sum 689 17 softmax_recip_501 10 reciprocal 688 16 softmax_mult_501 8 multiply 686 11 reshape_502 7 reshape 690 9 const_141 5 input 691 15 buda.matmul_504 6 matmul 692 11 reshape_505 7 reshape 693 9 const_142 5 input 694 7 add_507 3 add 695 15 buda.hslice_508 6 hslice 696 13 transpose_509 9 transpose 697 11 reshape_510 7 reshape 698 13 transpose_511 9 transpose 699 19 nn.batch_matmul_512 6 matmul 700 11 reshape_513 7 reshape 701 15 buda.hstack_514 6 hstack 702 9 const_143 5 input 703 15 buda.matmul_516 6 matmul 704 11 reshape_517 7 reshape 705 9 const_144 5 input 706 7 add_519 3 add 707 7 add_520 3 add 708 9 const_145 5 input 709 9 const_146 5 input 710 37 constant_1_layernorm_var_plus_eps_523 8 constant 720 18 layernorm_mean_523 10 reduce_avg 716 17 layernorm_sub_523 8 subtract 714 16 layernorm_sq_523 8 multiply 721 17 layernorm_var_523 10 reduce_avg 719 26 layernorm_var_plus_eps_523 3 add 718 18 layernorm_sqrt_523 4 sqrt 717 19 layernorm_recip_523 10 reciprocal 715 20 layernorm_output_523 8 multiply 713 21 layernorm_weights_523 8 multiply 712 18 layernorm_bias_523 3 add 711 11 reshape_524 7 reshape 722 9 const_147 5 input 723 15 buda.matmul_526 6 matmul 724 11 reshape_527 7 reshape 725 9 const_148 5 input 726 7 add_529 3 add 727 8 gelu_530 4 gelu 728 11 reshape_531 7 reshape 729 9 const_149 5 input 730 15 buda.matmul_533 6 matmul 731 11 reshape_534 7 reshape 732 9 const_150 5 input 733 7 add_536 3 add 734 7 add_537 3 add 735 9 const_151 5 input 736 9 const_152 5 input 737 37 constant_1_layernorm_var_plus_eps_540 8 constant 747 18 layernorm_mean_540 10 reduce_avg 743 17 layernorm_sub_540 8 subtract 741 16 layernorm_sq_540 8 multiply 748 17 layernorm_var_540 10 reduce_avg 746 26 layernorm_var_plus_eps_540 3 add 745 18 layernorm_sqrt_540 4 sqrt 744 19 layernorm_recip_540 10 reciprocal 742 20 layernorm_output_540 8 multiply 740 21 
layernorm_weights_540 8 multiply 739 18 layernorm_bias_540 3 add 738 11 reshape_541 7 reshape 749 9 const_153 5 input 750 15 buda.matmul_543 6 matmul 751 11 reshape_544 7 reshape 752 9 const_154 5 input 753 7 add_546 3 add 754 15 buda.hslice_547 6 hslice 755 11 reshape_548 7 reshape 756 9 const_155 5 input 757 15 buda.matmul_550 6 matmul 758 11 reshape_551 7 reshape 759 9 const_156 5 input 760 7 add_553 3 add 761 15 buda.hslice_554 6 hslice 762 11 reshape_555 7 reshape 763 13 transpose_556 9 transpose 764 19 nn.batch_matmul_557 6 matmul 765 11 reshape_558 7 reshape 766 18 constant_const_157 8 constant 767 12 multiply_560 8 multiply 768 15 softmax_exp_561 3 exp 770 15 softmax_sum_561 10 reduce_sum 772 17 softmax_recip_561 10 reciprocal 771 16 softmax_mult_561 8 multiply 769 11 reshape_562 7 reshape 773 9 const_158 5 input 774 15 buda.matmul_564 6 matmul 775 11 reshape_565 7 reshape 776 9 const_159 5 input 777 7 add_567 3 add 778 15 buda.hslice_568 6 hslice 779 13 transpose_569 9 transpose 780 11 reshape_570 7 reshape 781 13 transpose_571 9 transpose 782 19 nn.batch_matmul_572 6 matmul 783 11 reshape_573 7 reshape 784 15 buda.hstack_574 6 hstack 785 9 const_160 5 input 786 15 buda.matmul_576 6 matmul 787 11 reshape_577 7 reshape 788 9 const_161 5 input 789 7 add_579 3 add 790 7 add_580 3 add 791 9 const_162 5 input 792 9 const_163 5 input 793 37 constant_1_layernorm_var_plus_eps_583 8 constant 803 18 layernorm_mean_583 10 reduce_avg 799 17 layernorm_sub_583 8 subtract 797 16 layernorm_sq_583 8 multiply 804 17 layernorm_var_583 10 reduce_avg 802 26 layernorm_var_plus_eps_583 3 add 801 18 layernorm_sqrt_583 4 sqrt 800 19 layernorm_recip_583 10 reciprocal 798 20 layernorm_output_583 8 multiply 796 21 layernorm_weights_583 8 multiply 795 18 layernorm_bias_583 3 add 794 11 reshape_584 7 reshape 805 9 const_164 5 input 806 15 buda.matmul_586 6 matmul 807 11 reshape_587 7 reshape 808 9 const_165 5 input 809 7 add_589 3 add 810 8 gelu_590 4 gelu 811 11 reshape_591 7 reshape 812 9 const_166 5 input 813 15 buda.matmul_593 6 matmul 814 11 reshape_594 7 reshape 815 9 const_167 5 input 816 7 add_596 3 add 817 7 add_597 3 add 818 9 const_168 5 input 819 9 const_169 5 input 820 37 constant_1_layernorm_var_plus_eps_600 8 constant 830 18 layernorm_mean_600 10 reduce_avg 826 17 layernorm_sub_600 8 subtract 824 16 layernorm_sq_600 8 multiply 831 17 layernorm_var_600 10 reduce_avg 829 26 layernorm_var_plus_eps_600 3 add 828 18 layernorm_sqrt_600 4 sqrt 827 19 layernorm_recip_600 10 reciprocal 825 20 layernorm_output_600 8 multiply 823 21 layernorm_weights_600 8 multiply 822 18 layernorm_bias_600 3 add 821 11 reshape_601 7 reshape 832 9 const_170 5 input 833 15 buda.matmul_603 6 matmul 834 11 reshape_604 7 reshape 835 9 const_171 5 input 836 7 add_606 3 add 837 15 buda.hslice_607 6 hslice 838 11 reshape_608 7 reshape 839 9 const_172 5 input 840 15 buda.matmul_610 6 matmul 841 11 reshape_611 7 reshape 842 9 const_173 5 input 843 7 add_613 3 add 844 15 buda.hslice_614 6 hslice 845 11 reshape_615 7 reshape 846 13 transpose_616 9 transpose 847 19 nn.batch_matmul_617 6 matmul 848 11 reshape_618 7 reshape 849 18 constant_const_174 8 constant 850 12 multiply_620 8 multiply 851 15 softmax_exp_621 3 exp 853 15 softmax_sum_621 10 reduce_sum 855 17 softmax_recip_621 10 reciprocal 854 16 softmax_mult_621 8 multiply 852 11 reshape_622 7 reshape 856 9 const_175 5 input 857 15 buda.matmul_624 6 matmul 858 11 reshape_625 7 reshape 859 9 const_176 5 input 860 7 add_627 3 add 861 15 buda.hslice_628 6 hslice 862 13 transpose_629 9 
transpose 863 11 reshape_630 7 reshape 864 13 transpose_631 9 transpose 865 19 nn.batch_matmul_632 6 matmul 866 11 reshape_633 7 reshape 867 15 buda.hstack_634 6 hstack 868 9 const_177 5 input 869 15 buda.matmul_636 6 matmul 870 11 reshape_637 7 reshape 871 9 const_178 5 input 872 7 add_639 3 add 873 7 add_640 3 add 874 9 const_179 5 input 875 9 const_180 5 input 876 37 constant_1_layernorm_var_plus_eps_643 8 constant 886 18 layernorm_mean_643 10 reduce_avg 882 17 layernorm_sub_643 8 subtract 880 16 layernorm_sq_643 8 multiply 887 17 layernorm_var_643 10 reduce_avg 885 26 layernorm_var_plus_eps_643 3 add 884 18 layernorm_sqrt_643 4 sqrt 883 19 layernorm_recip_643 10 reciprocal 881 20 layernorm_output_643 8 multiply 879 21 layernorm_weights_643 8 multiply 878 18 layernorm_bias_643 3 add 877 11 reshape_644 7 reshape 888 9 const_181 5 input 889 15 buda.matmul_646 6 matmul 890 11 reshape_647 7 reshape 891 9 const_182 5 input 892 7 add_649 3 add 893 8 gelu_650 4 gelu 894 11 reshape_651 7 reshape 895 9 const_183 5 input 896 15 buda.matmul_653 6 matmul 897 11 reshape_654 7 reshape 898 9 const_184 5 input 899 7 add_656 3 add 900 7 add_657 3 add 901 9 const_185 5 input 902 9 const_186 5 input 903 37 constant_1_layernorm_var_plus_eps_660 8 constant 913 18 layernorm_mean_660 10 reduce_avg 909 17 layernorm_sub_660 8 subtract 907 16 layernorm_sq_660 8 multiply 914 17 layernorm_var_660 10 reduce_avg 912 26 layernorm_var_plus_eps_660 3 add 911 18 layernorm_sqrt_660 4 sqrt 910 19 layernorm_recip_660 10 reciprocal 908 20 layernorm_output_660 8 multiply 906 21 layernorm_weights_660 8 multiply 905 18 layernorm_bias_660 3 add 904 11 reshape_661 7 reshape 915 9 const_187 5 input 916 15 buda.matmul_663 6 matmul 917 11 reshape_664 7 reshape 918 9 const_188 5 input 919 7 add_666 3 add 920 15 buda.hslice_667 6 hslice 921 11 reshape_668 7 reshape 922 9 const_189 5 input 923 15 buda.matmul_670 6 matmul 924 11 reshape_671 7 reshape 925 9 const_190 5 input 926 7 add_673 3 add 927 15 buda.hslice_674 6 hslice 928 11 reshape_675 7 reshape 929 13 transpose_676 9 transpose 930 19 nn.batch_matmul_677 6 matmul 931 11 reshape_678 7 reshape 932 18 constant_const_191 8 constant 933 12 multiply_680 8 multiply 934 15 softmax_exp_681 3 exp 936 15 softmax_sum_681 10 reduce_sum 938 17 softmax_recip_681 10 reciprocal 937 16 softmax_mult_681 8 multiply 935 11 reshape_682 7 reshape 939 9 const_192 5 input 940 15 buda.matmul_684 6 matmul 941 11 reshape_685 7 reshape 942 9 const_193 5 input 943 7 add_687 3 add 944 15 buda.hslice_688 6 hslice 945 13 transpose_689 9 transpose 946 11 reshape_690 7 reshape 947 13 transpose_691 9 transpose 948 19 nn.batch_matmul_692 6 matmul 949 11 reshape_693 7 reshape 950 15 buda.hstack_694 6 hstack 951 9 const_194 5 input 952 15 buda.matmul_696 6 matmul 953 11 reshape_697 7 reshape 954 9 const_195 5 input 955 7 add_699 3 add 956 7 add_700 3 add 957 9 const_196 5 input 958 9 const_197 5 input 959 37 constant_1_layernorm_var_plus_eps_703 8 constant 969 18 layernorm_mean_703 10 reduce_avg 965 17 layernorm_sub_703 8 subtract 963 16 layernorm_sq_703 8 multiply 970 17 layernorm_var_703 10 reduce_avg 968 26 layernorm_var_plus_eps_703 3 add 967 18 layernorm_sqrt_703 4 sqrt 966 19 layernorm_recip_703 10 reciprocal 964 20 layernorm_output_703 8 multiply 962 21 layernorm_weights_703 8 multiply 961 18 layernorm_bias_703 3 add 960 11 reshape_704 7 reshape 971 9 const_198 5 input 972 15 buda.matmul_706 6 matmul 973 11 reshape_707 7 reshape 974 9 const_199 5 input 975 7 add_709 3 add 976 8 gelu_710 4 gelu 977 11 
reshape_711 7 reshape 978 9 const_200 5 input 979 15 buda.matmul_713 6 matmul 980 11 reshape_714 7 reshape 981 9 const_201 5 input 982 7 add_716 3 add 983 7 add_717 3 add 984 9 const_202 5 input 985 9 const_203 5 input 986 37 constant_1_layernorm_var_plus_eps_720 8 constant 996 18 layernorm_mean_720 10 reduce_avg 992 17 layernorm_sub_720 8 subtract 990 16 layernorm_sq_720 8 multiply 997 17 layernorm_var_720 10 reduce_avg 995 26 layernorm_var_plus_eps_720 3 add 994 18 layernorm_sqrt_720 4 sqrt 993 19 layernorm_recip_720 10 reciprocal 991 20 layernorm_output_720 8 multiply 989 21 layernorm_weights_720 8 multiply 988 18 layernorm_bias_720 3 add 987 16 output_layernorm 6 output 998 996 997 0 0 0 988 989 2 989 990 0 986 991 0 990 991 0 991 992 0 983 987 1 992 993 0 987 988 0 983 988 0 993 994 0 988 994 0 984 995 0 994 995 0 985 996 0 995 996 0 969 983 1 982 983 0 981 982 0 980 982 0 979 980 0 978 979 0 977 979 0 976 977 0 975 976 0 974 975 0 973 975 0 972 973 0 971 972 0 970 972 0 969 970 0 961 962 2 962 963 0 959 964 0 963 964 0 964 965 0 956 960 1 965 966 0 960 961 0 956 961 0 966 967 0 961 967 0 957 968 0 967 968 0 958 969 0 968 969 0 913 956 1 955 956 0 954 955 0 953 955 0 952 953 0 951 952 0 950 952 0 949 950 0 948 949 0 947 948 0 938 948 0 946 947 0 945 946 0 944 945 0 943 944 0 942 943 0 941 943 0 940 941 0 939 940 0 914 940 2 937 938 0 934 935 1 935 936 0 933 934 0 936 937 0 934 937 0 932 933 0 931 933 0 930 931 0 929 930 0 921 930 0 928 929 0 927 928 0 926 927 0 925 926 0 924 926 0 923 924 0 922 923 0 914 923 1 920 921 0 919 920 0 918 919 0 917 919 0 916 917 0 915 916 0 914 916 0 913 914 0 905 906 2 906 907 0 903 908 0 907 908 0 908 909 0 900 904 1 909 910 0 904 905 0 900 905 0 910 911 0 905 911 0 901 912 0 911 912 0 902 913 0 912 913 0 886 900 1 899 900 0 898 899 0 897 899 0 896 897 0 895 896 0 894 896 0 893 894 0 892 893 0 891 892 0 890 892 0 889 890 0 888 889 0 887 889 0 886 887 0 878 879 2 879 880 0 876 881 0 880 881 0 881 882 0 873 877 1 882 883 0 877 878 0 873 878 0 883 884 0 878 884 0 874 885 0 884 885 0 875 886 0 885 886 0 830 873 1 872 873 0 871 872 0 870 872 0 869 870 0 868 869 0 867 869 0 866 867 0 865 866 0 864 865 0 855 865 0 863 864 0 862 863 0 861 862 0 860 861 0 859 860 0 858 860 0 857 858 0 856 857 0 831 857 2 854 855 0 851 852 1 852 853 0 850 851 0 853 854 0 851 854 0 849 850 0 848 850 0 847 848 0 846 847 0 838 847 0 845 846 0 844 845 0 843 844 0 842 843 0 841 843 0 840 841 0 839 840 0 831 840 1 837 838 0 836 837 0 835 836 0 834 836 0 833 834 0 832 833 0 831 833 0 830 831 0 822 823 2 823 824 0 820 825 0 824 825 0 825 826 0 817 821 1 826 827 0 821 822 0 817 822 0 827 828 0 822 828 0 818 829 0 828 829 0 819 830 0 829 830 0 803 817 1 816 817 0 815 816 0 814 816 0 813 814 0 812 813 0 811 813 0 810 811 0 809 810 0 808 809 0 807 809 0 806 807 0 805 806 0 804 806 0 803 804 0 795 796 2 796 797 0 793 798 0 797 798 0 798 799 0 790 794 1 799 800 0 794 795 0 790 795 0 800 801 0 795 801 0 791 802 0 801 802 0 792 803 0 802 803 0 747 790 1 789 790 0 788 789 0 787 789 0 786 787 0 785 786 0 784 786 0 783 784 0 782 783 0 781 782 0 772 782 0 780 781 0 779 780 0 778 779 0 777 778 0 776 777 0 775 777 0 774 775 0 773 774 0 748 774 2 771 772 0 768 769 1 769 770 0 767 768 0 770 771 0 768 771 0 766 767 0 765 767 0 764 765 0 763 764 0 755 764 0 762 763 0 761 762 0 760 761 0 759 760 0 758 760 0 757 758 0 756 757 0 748 757 1 754 755 0 753 754 0 752 753 0 751 753 0 750 751 0 749 750 0 748 750 0 747 748 0 739 740 2 740 741 0 737 742 0 741 742 0 742 743 0 734 738 1 743 744 0 738 739 0 734 739 0 744 
745 0 739 745 0 735 746 0 745 746 0 736 747 0 746 747 0 720 734 1 733 734 0 732 733 0 731 733 0 730 731 0 729 730 0 728 730 0 727 728 0 726 727 0 725 726 0 724 726 0 723 724 0 722 723 0 721 723 0 720 721 0 712 713 2 713 714 0 710 715 0 714 715 0 715 716 0 707 711 1 716 717 0 711 712 0 707 712 0 717 718 0 712 718 0 708 719 0 718 719 0 709 720 0 719 720 0 664 707 1 706 707 0 705 706 0 704 706 0 703 704 0 702 703 0 701 703 0 700 701 0 699 700 0 698 699 0 689 699 0 697 698 0 696 697 0 695 696 0 694 695 0 693 694 0 692 694 0 691 692 0 690 691 0 665 691 2 688 689 0 685 686 1 686 687 0 684 685 0 687 688 0 685 688 0 683 684 0 682 684 0 681 682 0 680 681 0 672 681 0 679 680 0 678 679 0 677 678 0 676 677 0 675 677 0 674 675 0 673 674 0 665 674 1 671 672 0 670 671 0 669 670 0 668 670 0 667 668 0 666 667 0 665 667 0 664 665 0 656 657 2 657 658 0 654 659 0 658 659 0 659 660 0 651 655 1 660 661 0 655 656 0 651 656 0 661 662 0 656 662 0 652 663 0 662 663 0 653 664 0 663 664 0 637 651 1 650 651 0 649 650 0 648 650 0 647 648 0 646 647 0 645 647 0 644 645 0 643 644 0 642 643 0 641 643 0 640 641 0 639 640 0 638 640 0 637 638 0 629 630 2 630 631 0 627 632 0 631 632 0 632 633 0 624 628 1 633 634 0 628 629 0 624 629 0 634 635 0 629 635 0 625 636 0 635 636 0 626 637 0 636 637 0 581 624 1 623 624 0 622 623 0 621 623 0 620 621 0 619 620 0 618 620 0 617 618 0 616 617 0 615 616 0 606 616 0 614 615 0 613 614 0 612 613 0 611 612 0 610 611 0 609 611 0 608 609 0 607 608 0 582 608 2 605 606 0 602 603 1 603 604 0 601 602 0 604 605 0 602 605 0 600 601 0 599 601 0 598 599 0 597 598 0 589 598 0 596 597 0 595 596 0 594 595 0 593 594 0 592 594 0 591 592 0 590 591 0 582 591 1 588 589 0 587 588 0 586 587 0 585 587 0 584 585 0 583 584 0 582 584 0 581 582 0 573 574 2 574 575 0 571 576 0 575 576 0 576 577 0 568 572 1 577 578 0 572 573 0 568 573 0 578 579 0 573 579 0 569 580 0 579 580 0 570 581 0 580 581 0 554 568 1 567 568 0 566 567 0 565 567 0 564 565 0 563 564 0 562 564 0 561 562 0 560 561 0 559 560 0 558 560 0 557 558 0 556 557 0 555 557 0 554 555 0 546 547 2 547 548 0 544 549 0 548 549 0 549 550 0 541 545 1 550 551 0 545 546 0 541 546 0 551 552 0 546 552 0 542 553 0 552 553 0 543 554 0 553 554 0 498 541 1 540 541 0 255 256 0 254 255 0 253 255 0 252 253 0 251 252 0 250 252 0 249 250 0 241 242 2 242 243 0 239 244 0 243 244 0 244 245 0 236 240 1 245 246 0 240 241 0 236 241 0 246 247 0 241 247 0 237 248 0 247 248 0 238 249 0 248 249 0 222 236 1 235 236 0 234 235 0 233 235 0 232 233 0 231 232 0 230 232 0 229 230 0 228 229 0 227 228 0 226 228 0 225 226 0 224 225 0 223 225 0 222 223 0 214 215 2 215 216 0 212 217 0 216 217 0 217 218 0 209 213 1 218 219 0 213 214 0 209 214 0 219 220 0 214 220 0 210 221 0 220 221 0 211 222 0 221 222 0 166 209 1 208 209 0 207 208 0 206 208 0 205 206 0 204 205 0 203 205 0 202 203 0 201 202 0 200 201 0 191 201 0 199 200 0 198 199 0 197 198 0 196 197 0 195 196 0 194 196 0 193 194 0 192 193 0 167 193 2 190 191 0 187 188 1 188 189 0 186 187 0 189 190 0 187 190 0 185 186 0 184 186 0 183 184 0 182 183 0 174 183 0 181 182 0 180 181 0 179 180 0 178 179 0 177 179 0 176 177 0 175 176 0 167 176 1 173 174 0 172 173 0 171 172 0 170 172 0 169 170 0 168 169 0 167 169 0 166 167 0 158 159 2 159 160 0 156 161 0 160 161 0 161 162 0 153 157 1 162 163 0 157 158 0 153 158 0 163 164 0 158 164 0 154 165 0 164 165 0 155 166 0 165 166 0 139 153 1 152 153 0 151 152 0 150 152 0 149 150 0 148 149 0 147 149 0 146 147 0 145 146 0 144 145 0 143 145 0 142 143 0 141 142 0 140 142 0 139 140 0 131 132 2 132 133 0 129 134 0 133 134 0 134 135 
0 126 130 1 135 136 0 130 131 0 126 131 0 136 137 0 131 137 0 127 138 0 137 138 0 128 139 0 138 139 0 56 57 0 48 49 2 49 50 0 46 51 0 50 51 0 51 52 0 43 47 1 52 53 0 47 48 0 43 48 0 53 54 0 48 54 0 44 55 0 54 55 0 45 56 0 55 56 0 0 43 1 42 43 0 41 42 0 40 42 0 39 40 0 38 39 0 37 39 0 36 37 0 35 36 0 34 35 0 25 35 0 33 34 0 32 33 0 31 32 0 30 31 0 29 30 0 28 30 0 10 11 0 9 10 0 1 10 1 7 8 0 6 7 0 5 6 0 4 6 0 3 4 0 2 3 0 1 3 0 0 1 0 12 13 0 11 13 0 13 14 0 14 15 0 15 16 0 16 17 0 8 17 0 17 18 0 19 20 0 18 20 0 23 24 0 21 24 0 20 21 0 22 23 0 21 22 1 24 25 0 26 27 0 1 27 2 27 28 0 58 59 0 57 59 0 59 60 0 61 62 0 60 62 0 62 63 0 63 64 0 65 66 0 64 66 0 66 67 0 68 69 0 67 69 0 56 70 1 69 70 0 72 83 0 82 83 0 71 82 0 81 82 0 80 81 0 75 81 0 74 75 0 70 75 0 79 80 0 70 74 1 78 79 0 73 78 0 77 78 0 76 77 0 75 76 2 83 84 0 85 86 0 84 86 0 86 87 0 88 89 0 87 89 0 89 90 0 90 91 0 92 93 0 84 93 1 93 94 0 95 96 0 94 96 0 96 97 0 97 98 0 98 99 0 99 100 0 91 100 0 100 101 0 102 103 0 101 103 0 106 107 0 104 107 0 103 104 0 105 106 0 104 105 1 107 108 0 109 110 0 84 110 2 110 111 0 112 113 0 111 113 0 113 114 0 114 115 0 115 116 0 116 117 0 117 118 0 108 118 0 118 119 0 119 120 0 121 122 0 120 122 0 122 123 0 124 125 0 123 125 0 83 126 1 125 126 0 256 257 0 258 259 0 250 259 1 259 260 0 261 262 0 260 262 0 262 263 0 263 264 0 264 265 0 265 266 0 257 266 0 266 267 0 268 269 0 267 269 0 272 273 0 270 273 0 269 270 0 271 272 0 270 271 1 273 274 0 275 276 0 250 276 2 276 277 0 278 279 0 277 279 0 279 280 0 280 281 0 281 282 0 282 283 0 283 284 0 274 284 0 284 285 0 285 286 0 287 288 0 286 288 0 288 289 0 290 291 0 289 291 0 249 292 1 291 292 0 294 305 0 304 305 0 293 304 0 303 304 0 302 303 0 297 303 0 296 297 0 292 297 0 301 302 0 292 296 1 300 301 0 295 300 0 299 300 0 298 299 0 297 298 2 305 306 0 307 308 0 306 308 0 308 309 0 310 311 0 309 311 0 311 312 0 312 313 0 314 315 0 313 315 0 315 316 0 317 318 0 316 318 0 305 319 1 318 319 0 321 332 0 331 332 0 320 331 0 330 331 0 329 330 0 324 330 0 323 324 0 319 324 0 328 329 0 319 323 1 327 328 0 322 327 0 326 327 0 325 326 0 324 325 2 332 333 0 334 335 0 333 335 0 335 336 0 337 338 0 336 338 0 338 339 0 339 340 0 341 342 0 333 342 1 342 343 0 344 345 0 343 345 0 345 346 0 346 347 0 347 348 0 348 349 0 340 349 0 349 350 0 351 352 0 350 352 0 355 356 0 353 356 0 352 353 0 354 355 0 353 354 1 356 357 0 358 359 0 333 359 2 359 360 0 361 362 0 360 362 0 362 363 0 363 364 0 364 365 0 365 366 0 366 367 0 357 367 0 367 368 0 368 369 0 370 371 0 369 371 0 371 372 0 373 374 0 372 374 0 332 375 1 374 375 0 377 388 0 387 388 0 376 387 0 386 387 0 385 386 0 380 386 0 379 380 0 375 380 0 384 385 0 375 379 1 383 384 0 378 383 0 382 383 0 381 382 0 380 381 2 388 389 0 390 391 0 389 391 0 391 392 0 393 394 0 392 394 0 394 395 0 395 396 0 397 398 0 396 398 0 398 399 0 400 401 0 399 401 0 388 402 1 401 402 0 404 415 0 414 415 0 403 414 0 413 414 0 412 413 0 407 413 0 406 407 0 402 407 0 411 412 0 402 406 1 410 411 0 405 410 0 409 410 0 408 409 0 407 408 2 415 416 0 417 418 0 416 418 0 418 419 0 420 421 0 419 421 0 421 422 0 422 423 0 424 425 0 416 425 1 425 426 0 427 428 0 426 428 0 428 429 0 429 430 0 430 431 0 431 432 0 423 432 0 432 433 0 434 435 0 433 435 0 438 439 0 436 439 0 435 436 0 437 438 0 436 437 1 439 440 0 441 442 0 416 442 2 442 443 0 444 445 0 443 445 0 445 446 0 446 447 0 447 448 0 448 449 0 449 450 0 440 450 0 450 451 0 451 452 0 453 454 0 452 454 0 454 455 0 456 457 0 455 457 0 415 458 1 457 458 0 460 471 0 470 471 0 459 470 0 469 470 0 468 469 0 463 469 0 462 
463 0 458 463 0 467 468 0 458 462 1 466 467 0 461 466 0 465 466 0 464 465 0 463 464 2 471 472 0 473 474 0 472 474 0 474 475 0 476 477 0 475 477 0 477 478 0 478 479 0 480 481 0 479 481 0 481 482 0 483 484 0 482 484 0 471 485 1 484 485 0 487 498 0 497 498 0 486 497 0 496 497 0 495 496 0 490 496 0 489 490 0 485 490 0 494 495 0 485 489 1 493 494 0 488 493 0 492 493 0 491 492 0 490 491 2 498 499 0 500 501 0 499 501 0 501 502 0 503 504 0 502 504 0 504 505 0 505 506 0 507 508 0 499 508 1 508 509 0 510 511 0 509 511 0 511 512 0 512 513 0 513 514 0 514 515 0 506 515 0 515 516 0 517 518 0 516 518 0 521 522 0 519 522 0 518 519 0 520 521 0 519 520 1 522 523 0 524 525 0 499 525 2 525 526 0 527 528 0 526 528 0 528 529 0 529 530 0 530 531 0 531 532 0 532 533 0 523 533 0 533 534 0 534 535 0 536 537 0 535 537 0 537 538 0 539 540 0 538 540 0 0 0 \ No newline at end of file diff --git a/pybuda/csrc/pattern_matcher/tests/boost_test_graphs/2encoder_boost_graph.txt b/pybuda/csrc/pattern_matcher/tests/boost_test_graphs/2encoder_boost_graph.txt deleted file mode 100644 index 881cda641..000000000 --- a/pybuda/csrc/pattern_matcher/tests/boost_test_graphs/2encoder_boost_graph.txt +++ /dev/null @@ -1 +0,0 @@ -22 serialization::archive 17 0 0 168 185 0 0 19 input_hidden_states 5 input 1 9 reshape_1 7 reshape 2 7 const_0 5 input 3 13 buda.matmul_3 6 matmul 4 9 reshape_4 7 reshape 5 7 const_1 5 input 6 5 add_6 3 add 7 13 buda.hslice_7 6 hslice 8 9 reshape_8 7 reshape 9 7 const_2 5 input 10 14 buda.matmul_10 6 matmul 11 10 reshape_11 7 reshape 12 7 const_3 5 input 13 6 add_13 3 add 14 14 buda.hslice_14 6 hslice 15 10 reshape_15 7 reshape 16 12 transpose_16 9 transpose 17 18 nn.batch_matmul_17 6 matmul 18 10 reshape_18 7 reshape 19 16 constant_const_4 8 constant 20 11 multiply_20 8 multiply 21 14 softmax_exp_21 3 exp 23 14 softmax_sum_21 10 reduce_sum 25 16 softmax_recip_21 10 reciprocal 24 15 softmax_mult_21 8 multiply 22 10 reshape_22 7 reshape 26 7 const_5 5 input 27 14 buda.matmul_24 6 matmul 28 10 reshape_25 7 reshape 29 7 const_6 5 input 30 6 add_27 3 add 31 14 buda.hslice_28 6 hslice 32 12 transpose_29 9 transpose 33 10 reshape_30 7 reshape 34 12 transpose_31 9 transpose 35 18 nn.batch_matmul_32 6 matmul 36 10 reshape_33 7 reshape 37 14 buda.hstack_34 6 hstack 38 7 const_7 5 input 39 14 buda.matmul_36 6 matmul 40 10 reshape_37 7 reshape 41 7 const_8 5 input 42 6 add_39 3 add 43 6 add_40 3 add 44 7 const_9 5 input 45 8 const_10 5 input 46 36 constant_1_layernorm_var_plus_eps_43 8 constant 56 17 layernorm_mean_43 10 reduce_avg 52 16 layernorm_sub_43 8 subtract 50 15 layernorm_sq_43 8 multiply 57 16 layernorm_var_43 10 reduce_avg 55 25 layernorm_var_plus_eps_43 3 add 54 17 layernorm_sqrt_43 4 sqrt 53 18 layernorm_recip_43 10 reciprocal 51 19 layernorm_output_43 8 multiply 49 20 layernorm_weights_43 8 multiply 48 17 layernorm_bias_43 3 add 47 10 reshape_44 7 reshape 58 8 const_11 5 input 59 14 buda.matmul_46 6 matmul 60 10 reshape_47 7 reshape 61 8 const_12 5 input 62 6 add_49 3 add 63 7 gelu_50 4 gelu 64 10 reshape_51 7 reshape 65 8 const_13 5 input 66 14 buda.matmul_53 6 matmul 67 10 reshape_54 7 reshape 68 8 const_14 5 input 69 6 add_56 3 add 70 6 add_57 3 add 71 8 const_15 5 input 72 8 const_16 5 input 73 36 constant_1_layernorm_var_plus_eps_60 8 constant 83 17 layernorm_mean_60 10 reduce_avg 79 16 layernorm_sub_60 8 subtract 77 15 layernorm_sq_60 8 multiply 84 16 layernorm_var_60 10 reduce_avg 82 25 layernorm_var_plus_eps_60 3 add 81 17 layernorm_sqrt_60 4 sqrt 80 18 layernorm_recip_60 10 reciprocal 78 19 
layernorm_output_60 8 multiply 76 20 layernorm_weights_60 8 multiply 75 17 layernorm_bias_60 3 add 74 10 reshape_61 7 reshape 85 8 const_17 5 input 86 14 buda.matmul_63 6 matmul 87 10 reshape_64 7 reshape 88 8 const_18 5 input 89 6 add_66 3 add 90 14 buda.hslice_67 6 hslice 91 10 reshape_68 7 reshape 92 8 const_19 5 input 93 14 buda.matmul_70 6 matmul 94 10 reshape_71 7 reshape 95 8 const_20 5 input 96 6 add_73 3 add 97 14 buda.hslice_74 6 hslice 98 10 reshape_75 7 reshape 99 12 transpose_76 9 transpose 100 18 nn.batch_matmul_77 6 matmul 101 10 reshape_78 7 reshape 102 17 constant_const_21 8 constant 103 11 multiply_80 8 multiply 104 14 softmax_exp_81 3 exp 106 14 softmax_sum_81 10 reduce_sum 108 16 softmax_recip_81 10 reciprocal 107 15 softmax_mult_81 8 multiply 105 10 reshape_82 7 reshape 109 8 const_22 5 input 110 14 buda.matmul_84 6 matmul 111 10 reshape_85 7 reshape 112 8 const_23 5 input 113 6 add_87 3 add 114 14 buda.hslice_88 6 hslice 115 12 transpose_89 9 transpose 116 10 reshape_90 7 reshape 117 12 transpose_91 9 transpose 118 18 nn.batch_matmul_92 6 matmul 119 10 reshape_93 7 reshape 120 14 buda.hstack_94 6 hstack 121 8 const_24 5 input 122 14 buda.matmul_96 6 matmul 123 10 reshape_97 7 reshape 124 8 const_25 5 input 125 6 add_99 3 add 126 7 add_100 3 add 127 8 const_26 5 input 128 8 const_27 5 input 129 37 constant_1_layernorm_var_plus_eps_103 8 constant 139 18 layernorm_mean_103 10 reduce_avg 135 17 layernorm_sub_103 8 subtract 133 16 layernorm_sq_103 8 multiply 140 17 layernorm_var_103 10 reduce_avg 138 26 layernorm_var_plus_eps_103 3 add 137 18 layernorm_sqrt_103 4 sqrt 136 19 layernorm_recip_103 10 reciprocal 134 20 layernorm_output_103 8 multiply 132 21 layernorm_weights_103 8 multiply 131 18 layernorm_bias_103 3 add 130 11 reshape_104 7 reshape 141 8 const_28 5 input 142 15 buda.matmul_106 6 matmul 143 11 reshape_107 7 reshape 144 8 const_29 5 input 145 7 add_109 3 add 146 8 gelu_110 4 gelu 147 11 reshape_111 7 reshape 148 8 const_30 5 input 149 15 buda.matmul_113 6 matmul 150 11 reshape_114 7 reshape 151 8 const_31 5 input 152 7 add_116 3 add 153 7 add_117 3 add 154 8 const_32 5 input 155 8 const_33 5 input 156 37 constant_1_layernorm_var_plus_eps_120 8 constant 166 18 layernorm_mean_120 10 reduce_avg 162 17 layernorm_sub_120 8 subtract 160 16 layernorm_sq_120 8 multiply 167 17 layernorm_var_120 10 reduce_avg 165 26 layernorm_var_plus_eps_120 3 add 164 18 layernorm_sqrt_120 4 sqrt 163 19 layernorm_recip_120 10 reciprocal 161 20 layernorm_output_120 8 multiply 159 21 layernorm_weights_120 8 multiply 158 18 layernorm_bias_120 3 add 157 16 output_layernorm 6 output 168 166 167 0 0 0 158 159 2 159 160 0 156 161 0 160 161 0 161 162 0 153 157 1 162 163 0 157 158 0 153 158 0 163 164 0 158 164 0 154 165 0 164 165 0 155 166 0 165 166 0 139 153 1 152 153 0 151 152 0 150 152 0 149 150 0 148 149 0 147 149 0 146 147 0 145 146 0 144 145 0 143 145 0 142 143 0 141 142 0 140 142 0 139 140 0 131 132 2 132 133 0 129 134 0 133 134 0 134 135 0 126 130 1 135 136 0 130 131 0 126 131 0 136 137 0 131 137 0 127 138 0 137 138 0 128 139 0 138 139 0 56 57 0 48 49 2 49 50 0 46 51 0 50 51 0 51 52 0 43 47 1 52 53 0 47 48 0 43 48 0 53 54 0 48 54 0 44 55 0 54 55 0 45 56 0 55 56 0 0 43 1 42 43 0 41 42 0 40 42 0 39 40 0 38 39 0 37 39 0 36 37 0 35 36 0 34 35 0 25 35 0 33 34 0 32 33 0 31 32 0 30 31 0 29 30 0 28 30 0 10 11 0 9 10 0 1 10 1 7 8 0 6 7 0 5 6 0 4 6 0 3 4 0 2 3 0 1 3 0 0 1 0 12 13 0 11 13 0 13 14 0 14 15 0 15 16 0 16 17 0 8 17 0 17 18 0 19 20 0 18 20 0 23 24 0 21 24 0 20 21 0 22 23 0 21 22 1 24 25 
0 26 27 0 1 27 2 27 28 0 58 59 0 57 59 0 59 60 0 61 62 0 60 62 0 62 63 0 63 64 0 65 66 0 64 66 0 66 67 0 68 69 0 67 69 0 56 70 1 69 70 0 72 83 0 82 83 0 71 82 0 81 82 0 80 81 0 75 81 0 74 75 0 70 75 0 79 80 0 70 74 1 78 79 0 73 78 0 77 78 0 76 77 0 75 76 2 83 84 0 85 86 0 84 86 0 86 87 0 88 89 0 87 89 0 89 90 0 90 91 0 92 93 0 84 93 1 93 94 0 95 96 0 94 96 0 96 97 0 97 98 0 98 99 0 99 100 0 91 100 0 100 101 0 102 103 0 101 103 0 106 107 0 104 107 0 103 104 0 105 106 0 104 105 1 107 108 0 109 110 0 84 110 2 110 111 0 112 113 0 111 113 0 113 114 0 114 115 0 115 116 0 116 117 0 117 118 0 108 118 0 118 119 0 119 120 0 121 122 0 120 122 0 122 123 0 124 125 0 123 125 0 83 126 1 125 126 0 0 0 \ No newline at end of file diff --git a/pybuda/csrc/pattern_matcher/tests/gtest_main.cpp b/pybuda/csrc/pattern_matcher/tests/gtest_main.cpp deleted file mode 100644 index a4addef8a..000000000 --- a/pybuda/csrc/pattern_matcher/tests/gtest_main.cpp +++ /dev/null @@ -1,11 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include -#include - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - pybind11::scoped_interpreter guard{}; - return RUN_ALL_TESTS(); -} diff --git a/pybuda/csrc/pattern_matcher/tests/module.mk b/pybuda/csrc/pattern_matcher/tests/module.mk deleted file mode 100644 index d96a1f86f..000000000 --- a/pybuda/csrc/pattern_matcher/tests/module.mk +++ /dev/null @@ -1,22 +0,0 @@ -PYBUDA_CSRC_PATTERN_MATCHER_TESTS = $(TESTDIR)/pybuda/csrc/pattern_matcher/tests/pattern_matcher_unit_tests -PYBUDA_CSRC_PATTERN_MATCHER_TESTS_SRCS = \ - pybuda/csrc/pattern_matcher/tests/unit_tests.cpp \ - pybuda/csrc/pattern_matcher/tests/gtest_main.cpp - -PYBUDA_CSRC_PATTERN_MATCHER_TESTS_INCLUDES = $(PYBUDA_CSRC_PATTERN_MATCHER_INCLUDES) -I./boost_test_graphs -PYBUDA_CSRC_PATTERN_MATCHER_TESTS_LDFLAGS = -lstdc++fs -lgtest -lgtest_main -lpthread -l$(PYTHON_VERSION) -L./third_party/boost/stage/lib -lboost_serialization -lm - -PYBUDA_CSRC_PATTERN_MATCHER_TESTS_OBJS = $(addprefix $(OBJDIR)/, $(PYBUDA_CSRC_PATTERN_MATCHER_TESTS_SRCS:.cpp=.o)) -PYBUDA_CSRC_PATTERN_MATCHER_TESTS_DEPS = $(addprefix $(OBJDIR)/, $(PYBUDA_CSRC_PATTERN_MATCHER_TESTS_SRCS:.cpp=.d)) - --include $(PYBUDA_CSRC_PATTERN_MATCHER_TESTS_DEPS) - -pybuda/csrc/pattern_matcher/tests: $(PYBUDA_CSRC_PATTERN_MATCHER_TESTS) - -$(PYBUDA_CSRC_PATTERN_MATCHER_TESTS): $(PYBUDA_CSRC_PATTERN_MATCHER_TESTS_OBJS) $(PYBUDA_CSRC_PATTERN_MATCHER_LIB) - @mkdir -p $(@D) - $(CXX) $(PATTERN_MATCHER_CSRC_CFLAGS) $(CXXFLAGS) -o $@ $^ $(LDFLAGS) $(PYBUDA_CSRC_PATTERN_MATCHER_TESTS_LDFLAGS) - -$(OBJDIR)/pybuda/csrc/pattern_matcher/tests/%.o: pybuda/csrc/pattern_matcher/tests/%.cpp - @mkdir -p $(@D) - $(CXX) $(PATTERN_MATCHER_CSRC_CFLAGS) $(CXXFLAGS) $(PYBUDA_CSRC_PATTERN_MATCHER_TESTS_INCLUDES) -c -o $@ $< diff --git a/pybuda/csrc/pattern_matcher/tests/unit_tests.cpp b/pybuda/csrc/pattern_matcher/tests/unit_tests.cpp deleted file mode 100644 index 934931077..000000000 --- a/pybuda/csrc/pattern_matcher/tests/unit_tests.cpp +++ /dev/null @@ -1,130 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "gtest/gtest.h" -#include -#include -#include // clang6 requires us to use "experimental", g++ 9.3 is fine with just - -#include "pattern_matcher/pattern_matcher.hpp" - -#include -#include -#include -#include - -#include -#include -#include -#include - -using std::string; -using namespace tt::graphlib; -using namespace pattern_matcher; - - 
-typedef boost::adjacency_list< boost::setS, boost::vecS, boost::bidirectionalS, VertexProperty, EdgeProperty> graph_type; -typedef boost::subgraph subgraph_type; - -typedef boost::graph_traits::vertex_descriptor VertexId; -typedef boost::graph_traits::edge_descriptor EdgeId; - - -graph_type load_mha_matmul_pattern() { - graph_type pattern_graph; - auto input = add_vertex( - VertexProperty{ - .name="input", - .op_type="*", // used as wildcard - .node_id=7, - }, - pattern_graph - ); - - auto reshape= add_vertex( - VertexProperty{ - .name="reshape_1", - .op_type="reshape", - .node_id=0, - }, - pattern_graph - ); - auto mm0= add_vertex( - VertexProperty{ - .name="matmul0", - .op_type="matmul", - .node_id=1, - }, - pattern_graph - ); - auto mm1= add_vertex( - VertexProperty{ - .name="matmul1", - .op_type="matmul", - .node_id=2, - }, - pattern_graph - ); - auto mm2= add_vertex( - VertexProperty{ - .name="matmul2", - .op_type="matmul", - .node_id=3, - }, - pattern_graph - ); - - add_edge(input, reshape, EdgeProperty{.producer_output_edge_index = 0 }, pattern_graph); - add_edge(reshape, mm0, EdgeProperty{.producer_output_edge_index = 0 }, pattern_graph); - add_edge(reshape, mm1, EdgeProperty{.producer_output_edge_index = 1 }, pattern_graph); - add_edge(reshape, mm2, EdgeProperty{.producer_output_edge_index = 2 }, pattern_graph); - - return pattern_graph; -} - - -TEST(PatternMatcher, encoders_2) -{ - auto pattern = load_mha_matmul_pattern(); - std::string graph_file_path = - std::experimental::filesystem::path(__FILE__).parent_path().string() + - "/boost_test_graphs/2encoder_boost_graph.txt"; - auto graph = load_graph_from_file(graph_file_path); - - int total_matches = num_subgraph_pattern_matches(pattern, graph, 2); - EXPECT_TRUE(total_matches == 2); -} - -TEST(PatternMatcher, encoders_2_discovery) -{ - std::string graph_file_path = - std::experimental::filesystem::path(__FILE__).parent_path().string() + - "/boost_test_graphs/2encoder_boost_graph.txt"; - auto large_graph = load_graph_from_file(graph_file_path); - bool pass = contains_exactly_n_subgraph_matches(large_graph, 2); - EXPECT_TRUE(pass); -} - -TEST(PatternMatcher, encoders_12) -{ - auto pattern = load_mha_matmul_pattern(); - std::string graph_file_path = - std::experimental::filesystem::path(__FILE__).parent_path().string() + - "/boost_test_graphs/12encoder_boost_graph.txt"; - auto graph = load_graph_from_file(graph_file_path); - - int total_matches = num_subgraph_pattern_matches(pattern, graph, 12); - EXPECT_TRUE(total_matches == 12); -} - -TEST(PatternMatcher, encoders_12_discovery) -{ - std::string graph_file_path = - std::experimental::filesystem::path(__FILE__).parent_path().string() + - "/boost_test_graphs/12encoder_boost_graph.txt"; - auto large_graph = load_graph_from_file(graph_file_path); - - bool pass = contains_exactly_n_subgraph_matches(large_graph, 12); - EXPECT_TRUE(pass); -} - diff --git a/pybuda/csrc/perf_model/event.cpp b/pybuda/csrc/perf_model/event.cpp deleted file mode 100644 index 399aa1c6b..000000000 --- a/pybuda/csrc/perf_model/event.cpp +++ /dev/null @@ -1,500 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "perf_model/event.hpp" - -#include -#include - -#include "perf_model/simulator.hpp" - -constexpr std::uint32_t PACKER_OPERAND = 16; - -namespace tt::perf_model -{ - -// TODO: move this to somewhere and add better estimates -std::uint32_t get_noc_transfer_time(Buffer *, Buffer *, std::uint32_t) -{ - // noc latency is mostly hidden by pack 
latency... still, need to model this - return 100; /* return 500 + int(count * 32);*/ -} -std::uint32_t get_host_transfer_time(std::uint32_t) -{ - // Need to model this better... - return 100; /*return 1000 + int(count * 320); */ -} -std::uint32_t get_pack_time(std::uint32_t count) { return 100 + count * 16; } - -OpDataEvent::OpDataEvent( - std::uint32_t input_index, - TimeData data, - Buffer *output_buffer, - std::uint32_t current_t, - std::uint32_t current_ublock, - std::uint32_t current_k) : - DataEvent(input_index, data, output_buffer), - current_t(current_t), - current_ublock(current_ublock), - current_k(current_k) -{ - op = output_buffer->get_node(); - total_t = op->get_perf_data()->op_perf_data.op_model.block_shape().t; - total_ublocks = op->get_perf_data()->op_perf_data.op_model.block_shape().mblock_m * - op->get_perf_data()->op_perf_data.op_model.block_shape().mblock_n; - total_k = 1; - - if ( (op->get_op_type() == "matmul") || (op->get_op_type() == "sparse_matmul") ) - { - total_k = op->get_perf_data()->attr.m_k; - total_ublocks = 1; - } - if (op->get_op_type() == "reduce") - { - total_k = op->get_perf_data()->attr.m_k * total_ublocks; - total_ublocks = 1; - } - - // We need to do some fused op input analysis when fusing, and then figure out their ublock consumption rate when setting input ublock shapes, and input buffers. - // Unitl then, we can "cheat", and pretend that fused op consumes and produces full inputs/mblocks, and doesn't stream ublock by ublock. - if (op->get_op_type() == "fused_op") - total_ublocks = 1; - - TT_ASSERT(current_t < total_t); - TT_ASSERT(current_k < total_k); -} - -OutputDataEvent::OutputDataEvent( - std::uint32_t input_index, - TimeData data, - Buffer *output_buffer, - const std::vector> &consumers) : - DataEvent(input_index, data, output_buffer), consumers(consumers) -{ - TT_ASSERT( - consumers.size() > 0, - "Node " + output_buffer->get_node()->get_name() + " has no consumers, but has output buffer"); - - for (std::size_t i = 0; i < consumers.size(); i++) - { - remaining.push_back(consumers.at(i).first->get_broadcast_multiplier() * data.count); // remaining to send - } - - consumed = std::vector>(consumers.size(), std::vector()); -} - -// Process this event. Return pointer to buffer on which we're stalled, if stalled... -// If any new events have been generated, populate them in new_events vector. 
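The process() implementations that follow all honor the contract described in the comment above: report the buffers the event is stalled on, list the buffers it modified, and emit any follow-up events. A minimal, illustrative consumer of that contract might look like the sketch below. The real scheduler lives in perf_model/simulator.cpp, which is not part of this hunk, so the function name, the containers, and the omission of timestamp ordering and event de-duplication are assumptions made purely for illustration.

#include <map>
#include <string>
#include <vector>

// Sketch only: drain events, re-arming stalled ones when a buffer they wait on changes.
// Not the deleted simulator's actual loop; timestamp ordering is deliberately omitted.
void pump_events(std::vector<tt::perf_model::DataEvent *> pending,
                 tt::perf_model::SimStateP &sim_state,
                 tt::perf_model::SimCacheP &cache,
                 std::string const &arch_name)
{
    using tt::perf_model::Buffer;
    using tt::perf_model::DataEvent;
    std::multimap<Buffer *, DataEvent *> stalled;  // buffer -> events waiting on it
    while (!pending.empty())
    {
        DataEvent *ev = pending.back();
        pending.pop_back();
        auto status = ev->process(sim_state, cache, arch_name);
        for (DataEvent *ne : status.new_events) pending.push_back(ne);  // schedule follow-up work
        for (Buffer *b : status.stall_reason) stalled.emplace(b, ev);   // retry when b changes
        for (Buffer *b : status.modified_buffers)                       // wake waiters on b
        {
            auto range = stalled.equal_range(b);
            for (auto it = range.first; it != range.second; ++it) pending.push_back(it->second);
            stalled.erase(range.first, range.second);
        }
    }
}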
-ProcessStatus OutputDataEvent::process(SimStateP &sim_state, SimCacheP &, std::string const&) -{ - ProcessStatus ret; - if (unprocessed) - { - buffer->insert_data(data.count); - unprocessed = false; - } - - auto max_before_broadcast = [&]() - { - std::uint32_t max = 0; - for (std::size_t i = 0; i < consumers.size(); i++) - { - std::uint32_t broadcast_multiplier = consumers[i].first->get_broadcast_multiplier(); - - // round up - std::uint32_t r = ((remaining[i] + broadcast_multiplier - 1) / broadcast_multiplier) * broadcast_multiplier; - r /= broadcast_multiplier; - if (r > max) - max = r; - } - return max; - }; - - std::uint32_t max_remaing_before = max_before_broadcast(); - for (std::size_t i = 0; i < consumers.size(); i++) - { - if (remaining[i] == 0) - continue; // we're done with this consumer - - auto &[target_buffer, operand] = consumers[i]; - - std::uint32_t to_transfer = std::min(target_buffer->available_space(), remaining[i]); - if (to_transfer == 0) - { - ret.stall_reason.push_back(target_buffer); - continue; // no room - } - - target_buffer->reserve_space(to_transfer); - remaining[i] -= to_transfer; - consumed[i].push_back(TimeData{.count = to_transfer, .timestamp = sim_state->timestamp}); - - // Create an input buffer event - std::uint32_t time_increment = get_noc_transfer_time(buffer, target_buffer, to_transfer); - ret.new_events.push_back(new InputDataEvent( - input_index, - TimeData{.count = to_transfer, .timestamp = sim_state->timestamp + time_increment}, - target_buffer)); - - ret.modified_buffers.push_back(buffer); - - if (remaining[i] > 0) - ret.stall_reason.push_back(target_buffer); // we're still stalled since we couldn't send it all - } - - std::uint32_t max_remaing_after = max_before_broadcast(); - if (max_remaing_after < max_remaing_before) - buffer->pop_data(max_remaing_before - max_remaing_after); - - if (max_remaing_before > max_remaing_after) - SIMLOG << " POPPED " << (max_remaing_before - max_remaing_after) << " from output buffer." << std::endl; - - return ret; -} - -std::string OutputDataEvent::to_string() const -{ - std::stringstream ss; - ss << "OutputDataEvent(input=" << input_index << ", org=@" << data.timestamp << ", count=" << data.count - << ", buf=" << buffer->to_string(); - return ss.str(); -} - -// Process this event. Return pointer to buffer on which we're stalled, if stalled... -// If any new events have been generated, populate them in new_events vector. -ProcessStatus InputDataEvent::process(SimStateP &, SimCacheP &, std::string const&) -{ - if (unprocessed) - { - buffer->insert_data(data.count); - unprocessed = false; - } - ProcessStatus ret; - ret.modified_buffers.push_back(buffer); - - return ret; // this event never stalls -} - -// Process this event. Return pointer to buffer on which we're stalled, if stalled... -// If any new events have been generated, populate them in new_events vector. 
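To make the t/ublock/k splitting concrete (illustrative numbers, not taken from the change): a plain matmul whose block shape has t = 2 and whose attr.m_k = 4 gets total_t = 2, total_k = 4, total_ublocks = 1, so each input index is simulated as 2 * 4 * 1 = 8 OpDataEvents, and the OpDataEvent::process() below charges each of them cycle_count_ideal / 8 cycles (for example, an ideal count of 8000 cycles becomes 1000 cycles per event), with output produced only once the inner k counter wraps around.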
-ProcessStatus OpDataEvent::process(SimStateP &sim_state, SimCacheP &cache, std::string const& arch_name) -{ - unprocessed = false; - - // We'll check all input buffers for the op that this input data belongs to, and if everything is available, - // and there's room in output buffer, we'll go ahead and produce data - NodeP node = buffer->get_node(); - TT_ASSERT(node->is_op()); - - std::vector input_buffers = cache->node_input_buffers(node); - Buffer *output_buffer = buffer; - - ProcessStatus ret; - for (Buffer *b : input_buffers) - { - if (!b->above_threshold()) - { - // stalled on this input buffer - ret.stall_reason.push_back(b); - SIMLOG << " @" << sim_state->timestamp << " [op] Stalled " << node->get_name() << " on input buffer" - << b->to_string() << std::endl; - - if (sim_state->trace) - sim_state->trace_op.at(node)->unpack_stall( - input_index, b->get_operand(), sim_state->timestamp, b->get_threshold()); - } - else - { - if (sim_state->trace) - sim_state->trace_op.at(node)->unpack_data_available( - input_index, b->get_operand(), sim_state->timestamp); - } - } - - // check that output has enough room - std::uint32_t output_size = cache->node_output_size_in_tiles(node) / (total_t * total_ublocks); - TT_LOG_ASSERT(output_size > 0, "Node {} has output size of 0", node->get_name()); - - if (output_buffer->available_space() < output_size) - { - // stalled on output buffer having enough space - ret.stall_reason.push_back(output_buffer); - SIMLOG << " @" << sim_state->timestamp << " [op] Stalled " << node->get_name() << " on output buffer" - << std::endl; - - if (sim_state->trace) - sim_state->trace_op.at(node)->pack_stall(input_index, PACKER_OPERAND, sim_state->timestamp, output_size); - } - - if (ret.stall_reason.size() > 0) - return ret; - - // Good to go - SIMLOG << " @" << sim_state->timestamp << " [op] Executed " << node->get_name() << ", in=" << input_index - << ", t=" << current_t << "/" << total_t << ", k=" << current_k << "/" << total_k - << ", ublock=" << current_ublock << "/" << total_ublocks << std::endl; - - std::uint32_t op_time = - node->get_perf_data()->op_perf_data.cycle_count_ideal(arch_name) / (total_t * total_k * total_ublocks); - std::uint32_t end_time = sim_state->timestamp; - - end_time += op_time; - - // Pop input buffers - std::vector pop_on_output, pop_on_end; - for (std::size_t i = 0; i < input_buffers.size(); i++) - { - Buffer *b = input_buffers[i]; - if ((node->get_op_type() == "matmul") && (i == 2)) - { - pop_on_output.push_back(b); // Bias only gets popped on produced output - } - else if ((node->get_op_type() == "sparse_matmul") && ((i==0) || (i==2))) - { - pop_on_end.push_back(b); // tiles/indices only get popped at the end - } - else - { - b->pop_threshold(); - ret.modified_buffers.push_back(b); - } - } - - // If this is not the last k/t, schedule the next one... 
otherwise produce output - std::uint32_t next_t, next_k, next_ublock; - bool produce_output = false; - bool next_op = false; - - bool intermediate = false; - if (current_k + 1 < total_k) - { - next_t = current_t; - next_k = current_k + 1; - next_ublock = current_ublock; - TT_ASSERT(current_ublock == 0, "Matmul doesn't have partial ublock outputs"); - intermediate = true; - } - if (current_ublock + 1 < total_ublocks) - { - next_t = current_t; - next_ublock = current_ublock + 1; - next_k = current_k; - TT_ASSERT(current_k == 0, "Non-matmul doesn't have 'k' counter"); - intermediate = true; - produce_output = true; - } - - if (!intermediate) - { - if (current_t + 1 < total_t) - { - next_t = current_t + 1; - next_k = 0; - next_ublock = 0; - produce_output = true; - } - else - { - next_t = 0; - next_k = 0; - next_ublock = 0; - produce_output = true; - next_op = true; - } - } - - if (produce_output) - { - for (Buffer *b: pop_on_output) - { - b->pop_threshold(); - ret.modified_buffers.push_back(b); - } - - // Produce output - output_buffer->reserve_space(output_size); - ret.modified_buffers.push_back(output_buffer); - - SIMLOG << " Op " << node->get_name() << " produced output size=" << output_size << std::endl; - - std::uint32_t pack_time = get_pack_time(output_size); - ret.new_events.push_back(new OutputDataEvent( - input_index, - TimeData{.count = output_size, .timestamp = end_time + pack_time}, - output_buffer, - cache->node_outputs(node))); - - if (sim_state->trace) - sim_state->trace_op.at(node)->pack_started(input_index, PACKER_OPERAND, end_time); - if (sim_state->trace) - sim_state->trace_op.at(node)->pack_ended(input_index, end_time + pack_time); - } - - if (!next_op || (input_index + 1 < sim_state->total_input_count)) - { - std::uint32_t next_input_index = next_op ? input_index + 1 : input_index; - ret.new_events.push_back(new OpDataEvent( - next_input_index, - TimeData{.count = data.count, .timestamp = end_time}, - output_buffer, - next_t, - next_ublock, - next_k)); - } - - if (next_op) - { - for (Buffer *b: pop_on_end) - { - b->pop_threshold(); - ret.modified_buffers.push_back(b); - } - - } - - return ret; -} - -// Process this event. Return pointer to buffer on which we're stalled, if stalled... -// If any new events have been generated, populate them in new_events vector. -ProcessStatus QueueDataEvent::process(SimStateP &sim_state, SimCacheP &cache, std::string const&) -{ - unprocessed = false; - - // We'll check all input buffers for the op that this input data belongs to, and if everything is available, - // and there's room in output buffer, we'll go ahead and produce data - NodeP node = buffer->get_node(); - TT_ASSERT(!node->is_op()); - - Buffer *input_buffer = buffer; - Buffer *output_buffer = cache->node_output_buffer(node); - - ProcessStatus ret; - if (!input_buffer->above_threshold()) - { - ret.stall_reason.push_back(input_buffer); - return ret; // wait for data - } - - // - // NOTE: Queue doesn't really have an "output buffer". However, the output buffer - // concept and output buffer event that comes with it "just works", so we don't have - // implement special behaviour for the queue. Setting latency of 0 from input to output buffer - // should produce correct results. 
- // - // check that output has enough room - std::uint32_t output_size = data.count; - if (output_buffer->available_space() < output_size) - { - // stalled on output buffer having enough space - ret.stall_reason.push_back(output_buffer); - return ret; // wait for room in output buffer - } - - // Pop input buffer - input_buffer->pop_threshold(); - ret.modified_buffers.push_back(input_buffer); - - // Produce output - output_buffer->reserve_space(output_size); - ret.modified_buffers.push_back(output_buffer); - - ret.new_events.push_back(new OutputDataEvent( - input_index, - TimeData{.count = output_size, .timestamp = sim_state->timestamp}, // 0 latency - output_buffer, - cache->node_outputs(node))); - - if (input_index + 1 < sim_state->total_input_count) - { - ret.new_events.push_back(new QueueDataEvent( - input_index + 1, TimeData{.count = data.count, .timestamp = sim_state->timestamp + 1}, buffer)); - } - - return ret; -} - -std::string InputDataEvent::to_string() const -{ - std::stringstream ss; - ss << "InputDataEvent(input=" << input_index << ", org=@" << data.timestamp << ", count=" << data.count - << ", buf=" << buffer->to_string(); - return ss.str(); -} - -std::string OpDataEvent::to_string() const -{ - std::stringstream ss; - ss << "OpDataEvent(input=" << input_index << ", org=@" << data.timestamp << ", count=" << data.count - << ", op=" << op->get_name(); - return ss.str(); -} - -std::string QueueDataEvent::to_string() const -{ - std::stringstream ss; - ss << "QueueDataEvent(input=" << input_index << ", org=@" << data.timestamp << ", count=" << data.count - << ", queue=" << buffer->get_node()->get_name(); - return ss.str(); -} - -std::string HostWriteDataEvent::to_string() const -{ - std::stringstream ss; - ss << "HostWriteDataEvent(input=" << input_index << ", org=@" << data.timestamp << ", count=" << data.count - << ", input=" << buffer->get_node()->get_name(); - return ss.str(); -} - -std::string HostReadDataEvent::to_string() const -{ - std::stringstream ss; - ss << "HostReadDataEvent(input=" << input_index << ", org=@" << data.timestamp << ", count=" << data.count - << ", output=" << buffer->get_node()->get_name(); - return ss.str(); -} - -// Process this event. Return pointer to buffer on which we're stalled, if stalled... -// If any new events have been generated, populate them in new_events vector. -ProcessStatus HostWriteDataEvent::process(SimStateP &sim_state, SimCacheP &cache, std::string const&) -{ - // Host only transfers if there's room for the whole input - ProcessStatus ret; - if (buffer->available_space() < data.count) - { - ret.stall_reason.push_back(buffer); - return ret; - } - - // Transfer - buffer->reserve_space(data.count); - ret.modified_buffers.push_back(buffer); - unprocessed = false; - - ret.new_events.push_back(new OutputDataEvent( - input_index, - TimeData{.count = data.count, .timestamp = sim_state->timestamp + get_host_transfer_time(data.count)}, - buffer, - cache->node_outputs(buffer->get_node()))); - return ret; -} - -// Process this event. Return pointer to buffer on which we're stalled, if stalled... -// If any new events have been generated, populate them in new_events vector. 
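How a run gets seeded is not visible in this hunk (it sits in perf_model/simulator.cpp), but the host-side events above suggest its shape. The sketch below assumes one HostWriteDataEvent per graph input and one HostReadDataEvent per graph output are created for input_index 0, with later input indices chained by the events themselves as in the process() methods above; the helper name, the choice of buffer, and the microbatch_tiles parameter are invented for illustration only.

#include <cstdint>
#include <vector>
#include "perf_model/event.hpp"
#include "perf_model/graph.hpp"
#include "perf_model/simulator.hpp"  // assumed to define SimCache / Buffer

// Sketch only: seed host-side events for the first microbatch entry.
std::vector<tt::perf_model::DataEvent *> seed_host_events(
    const tt::perf_model::Graph &graph,
    tt::perf_model::SimCacheP &cache,
    std::uint32_t microbatch_tiles)  // invented parameter: tiles moved per input
{
    using namespace tt::perf_model;
    std::vector<DataEvent *> events;
    // Push data toward every graph input at time 0...
    for (NodeP in : graph.get_inputs())
        events.push_back(new HostWriteDataEvent(
            0, TimeData{.count = microbatch_tiles, .timestamp = 0}, cache->node_output_buffer(in)));
    // ...and drain every graph output once its buffer crosses the read threshold.
    for (NodeP out : graph.get_outputs())
        events.push_back(new HostReadDataEvent(
            0, TimeData{.count = microbatch_tiles, .timestamp = 0}, cache->node_output_buffer(out)));
    return events;
}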
-ProcessStatus HostReadDataEvent::process(SimStateP &, SimCacheP &, std::string const&) -{ - ProcessStatus ret; - if (!buffer->above_threshold()) - { - ret.stall_reason.push_back(buffer); - return ret; - } - - // Read data - buffer->pop_data(data.count); - ret.modified_buffers.push_back(buffer); - unprocessed = false; - return ret; -} - -} // namespace tt::perf_model diff --git a/pybuda/csrc/perf_model/event.hpp b/pybuda/csrc/perf_model/event.hpp deleted file mode 100644 index 8c347f9dd..000000000 --- a/pybuda/csrc/perf_model/event.hpp +++ /dev/null @@ -1,214 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once -#include - -#include "perf_model/graph.hpp" - -namespace tt::perf_model -{ - -class Simulator; -class Buffer; -class SimCache; -struct SimState; -using SimCacheP = std::unique_ptr; -using SimStateP = std::unique_ptr; - -// Amount / timestamp pair -struct TimeData -{ - std::uint32_t count; - std::uint32_t timestamp; -}; - -// Return structure from event process() calls -class DataEvent; -struct ProcessStatus -{ - std::vector stall_reason; // Buffers we're stalled on - run this event again when this buffer changes - std::vector modified_buffers; // Buffers mofidied by the processing step - std::vector new_events; // new events generated by the processing step -}; - -// Base class for data events -class DataEvent -{ - protected: - // - // Set at creation - // - std::uint32_t input_index; // input index in a microbatch - TimeData data; // count and time at which this happened - Buffer *buffer; // buffer in which data was produced or pushed to - - // - // Simulation-time - // - bool unprocessed; // set if this event has never been processed - - public: - DataEvent(std::uint32_t input_index, TimeData data, Buffer *buffer) : - input_index(input_index), data(data), buffer(buffer), unprocessed(true) - { - } - - virtual ~DataEvent() {} - - std::uint32_t timestamp() const { return data.timestamp; } - Buffer *get_buffer() const { return buffer; } - bool is_unprocessed() const { return unprocessed; } - std::uint32_t get_input_index() const { return input_index; } - - // Process this event. Return pointer to buffer on which we're stalled, if stalled... - // If any new events have been generated, populate them in new_events vector. - virtual ProcessStatus process(SimStateP &sim_state, SimCacheP &cache, std::string const& arch_name) = 0; - - virtual std::string to_string() const = 0; - - bool operator<(const DataEvent &other) const { return data.timestamp < other.data.timestamp; } -}; - -// Data received by the output buffer. If receiving buffers have room, they will receive data, creating an -// InputDataEvent -class OutputDataEvent : public DataEvent -{ - // - // Set at creation - // - std::vector> consumers; // consuming node + operand pair - - // - // Set during simulation - // - std::vector> consumed; // timestamp/amount pair of consumed data, per consumer - std::vector remaining; // remaining data to send, per consumer - - public: - OutputDataEvent( - std::uint32_t input_index, - TimeData data, - Buffer *output_buffer, - const std::vector> &consumers); - - // Process this event. Return pointer to buffer on which we're stalled, if stalled... - // If any new events have been generated, populate them in new_events vector. - virtual ProcessStatus process(SimStateP &sim_state, SimCacheP &cache, std::string const& arch_name) override; - - virtual std::string to_string() const override; -}; - -// Data received by input buffer. 
If attached node is ready, it'll go on to produce data in the output buffer. -class InputDataEvent : public DataEvent -{ - // - // Set at creation - // - - // - // Set during simulation - // - std::vector consumed; // amount/timestamp consumed by the op - // std::uint32_t remaining_amount; // unconsumed amount - // std::uint32_t completed_timestamp; - - public: - InputDataEvent(std::uint32_t input_index, TimeData data, Buffer *receiver) : DataEvent(input_index, data, receiver) - { - } - - // Process this event. Return pointer to buffer on which we're stalled, if stalled... - // If any new events have been generated, populate them in new_events vector. - virtual ProcessStatus process(SimStateP &sim_state, SimCacheP &cache, std::string const& arch_name) override; - - virtual std::string to_string() const override; -}; - -// Operation execution -class OpDataEvent : public DataEvent -{ - // - // Set at creation - // - NodeP op; - std::uint32_t current_t; // Op only executes one T, and schedules the next one - std::uint32_t total_t; - - std::uint32_t current_ublock; // For ops that produce ublock by ublock (all by matmul) - std::uint32_t total_ublocks; - - // Specific to matmuls, multiple inner loops needed before output is created - std::uint32_t total_k = 1; - std::uint32_t current_k; - - // - // Simulation - // - - public: - OpDataEvent( - std::uint32_t input_index, - TimeData data, - Buffer *output_buffer, - std::uint32_t current_t, - std::uint32_t current_ublock, - std::uint32_t current_k); - - // Process this event. Return pointer to buffer on which we're stalled, if stalled... - // If any new events have been generated, populate them in new_events vector. - virtual ProcessStatus process(SimStateP &sim_state, SimCacheP &cache, std::string const& arch_name) override; - - virtual std::string to_string() const override; -}; - -// Mid-graph queue -class QueueDataEvent : public DataEvent -{ - public: - QueueDataEvent( - std::uint32_t input_index, - TimeData data, - Buffer *input_buffer) : DataEvent(input_index, data, input_buffer) {} - - // Process this event. Return pointer to buffer on which we're stalled, if stalled... - // If any new events have been generated, populate them in new_events vector. - virtual ProcessStatus process(SimStateP &sim_state, SimCacheP &cache, std::string const& arch_name) override; - - virtual std::string to_string() const override; - -}; - -// Host write to input -class HostWriteDataEvent : public DataEvent -{ - public: - HostWriteDataEvent(std::uint32_t input_index, TimeData data, Buffer *device_buffer) : - DataEvent(input_index, data, device_buffer) - { - } - - // Process this event. Return pointer to buffer on which we're stalled, if stalled... - // If any new events have been generated, populate them in new_events vector. - virtual ProcessStatus process(SimStateP &sim_state, SimCacheP &cache, std::string const& arch_name) override; - - virtual std::string to_string() const override; -}; - -// Host read from output -class HostReadDataEvent : public DataEvent -{ - public: - HostReadDataEvent(std::uint32_t input_index, TimeData data, Buffer *device_buffer) : - DataEvent(input_index, data, device_buffer) - { - } - - // Process this event. Return pointer to buffer on which we're stalled, if stalled... - // If any new events have been generated, populate them in new_events vector. 
- virtual ProcessStatus process(SimStateP &sim_state, SimCacheP &cache, std::string const& arch_name) override; - - virtual std::string to_string() const override; -}; - -} // namespace tt::perf_model diff --git a/pybuda/csrc/perf_model/graph.cpp b/pybuda/csrc/perf_model/graph.cpp deleted file mode 100644 index f688ebf66..000000000 --- a/pybuda/csrc/perf_model/graph.cpp +++ /dev/null @@ -1,165 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 - -#include "perf_model/graph.hpp" -#include "balancer/policies/policy_utils.hpp" - -namespace tt::perf_model -{ - -SystemSpec SystemSpec::get_for_device(const DeviceConfig &device_config) -{ - // Placeholder until DeviceConfig has it - if (device_config.arch_name == "grayskull") - { - return SystemSpec{ - .clock_period = 1 / (1.2 * 1000000000), - .noc_bw = 1, // TODO - .dram_bw = {10, 10, 10, 10, 10, 10, 10, 10}, // bytes/s - .grid_size_r = 10, - .grid_size_c = 12, - .arch_name = device_config.arch_name, - }; - } - - // wormhole flavours - return SystemSpec{ - .clock_period = 1 / (1.2 * 1000000000), - .noc_bw = 1, // TODO - .dram_bw = {60, 60, 60, 60, 60, 60}, // bytes/s - .grid_size_r = 10, - .grid_size_c = 8, - .arch_name = device_config.arch_name, - }; -} - -std::uint32_t TensorData::size_in_bytes() const -{ - std::uint32_t size = shape.volume(); - switch (df) - { - case DataFormat::Bfp2: - case DataFormat::Bfp2_b: return size / 4; - - case DataFormat::Bfp4: - case DataFormat::Bfp4_b: return size / 2; - - case DataFormat::Int8: - case DataFormat::Bfp8: - case DataFormat::Bfp8_b: - case DataFormat::Lf8: return size; - - case DataFormat::UInt16: - case DataFormat::Float16: - case DataFormat::Float16_b: return size * 2; - - case DataFormat::Float32: return size * 4; - case DataFormat::Int32: return size * 4; - - default: return size * 4; // anything else? 
- } -} - -std::uint32_t TensorData::size_in_tiles(bool include_z) const -{ - auto out = shape.volume() / (32 * 32); - if (!include_z) - out /= shape.z(); - return out; -} - -Node::Node( - const std::string &name, const std::string &op_type, const std::vector &operands, PerfDataP perf_data) : - name(name), type(NodeType::OP), operands(operands), op_type(op_type), perf_data(perf_data) -{ -} - -Node::Node(const std::string &name, graphlib::QueueNodeType queue_type, NodeP operand, PerfDataP perf_data) : - name(name), type(NodeType::QUEUE), queue_type(queue_type), perf_data(perf_data) -{ - if (operand == nullptr) - operands = {}; - else - operands = {operand}; -} - -std::size_t Node::get_operand_index(const NodeP node, std::uint32_t start) const -{ - for (std::size_t i = start; i < operands.size(); i++) - { - if (operands[i] == node) - return i; - } - TT_THROW("Not an operand"); - return 0; // avoid warning -} - -NodeP Graph::add_node(NodeP node, bool is_input) -{ - nodes.push_back(node); - if (is_input) - inputs.push_back(node); - - for (NodeP operand : node->get_operands()) operand->add_output(node); - - return node; -} - -NodeP Graph::add_op( - const std::string &name, - const std::string &op_type, - const std::vector &operands, - PerfDataP perf_data, - bool is_input) -{ - NodeP node = std::make_shared(name, op_type, operands, perf_data); - return add_node(node, is_input); -} - -NodeP Graph::add_queue( - const std::string &name, graphlib::QueueNodeType queue_type, NodeP operand, PerfDataP perf_data, bool is_input) -{ - NodeP node = std::make_shared(name, queue_type, operand, perf_data); - return add_node(node, is_input); -} - -// Find the longest op in the graph, and its length -std::pair Graph::get_longest_op() const -{ - std::uint32_t max_len = 0; - NodeP max_op = nullptr; - - for (NodeP node : nodes) - { - std::uint32_t cycle_count = node->get_perf_data()->op_perf_calculated_data.cycle_count_actual; - if ((max_op == nullptr) || (cycle_count > max_len)) - { - max_op = node; - max_len = cycle_count; - } - } - - return std::make_pair(max_op, max_len); -} - -// Find outputs -std::vector Graph::get_outputs() const -{ - std::vector ret; - for (NodeP node : get_nodes()) - if (node->get_outputs().size() == 0) - ret.push_back(node); - return ret; -} - -void OpPerfData::_get_bw_limited_execution_cycles(const DeviceConfig &device_config, const graphlib::Graph *graph) -{ - if (_has_bw_limited_execution_cycles) - return; - - _cycle_bw_limited = get_limiter_cycles(op_model, graph, device_config); - _has_bw_limited_execution_cycles = true; -} - -} // namespace tt::perf_model diff --git a/pybuda/csrc/perf_model/graph.hpp b/pybuda/csrc/perf_model/graph.hpp deleted file mode 100644 index c94de4fe2..000000000 --- a/pybuda/csrc/perf_model/graph.hpp +++ /dev/null @@ -1,256 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include -#include - -#include "backend_api/device_config.hpp" -#include "balancer/types.hpp" -#include "graph_lib/defines.hpp" -#include "graph_lib/node_types.hpp" -#include "graph_lib/shape.hpp" - -namespace tt -{ -namespace perf_model -{ - -class Node; -using NodeP = std::shared_ptr; - -struct SystemSpec -{ - float clock_period; - float noc_bw; - std::vector dram_bw; - std::uint32_t grid_size_r, grid_size_c; - std::string arch_name; - - static SystemSpec get_for_device(const DeviceConfig &device_config); -}; - -struct TensorData -{ - graphlib::Shape shape; - std::uint32_t t; - DataFormat df; - - 
std::uint32_t size_in_bytes() const; - std::uint32_t size_in_tiles(bool include_z = true) const; -}; - -struct OpGrid -{ - std::uint32_t loc_r, loc_c; - std::uint32_t size_r, size_c; - std::uint32_t size() const { return size_r * size_c; } -}; - -struct OpPerfData -{ - // Cycle count to produce one output, if input bandwidth is 100% of needed - OpGrid grid; - balancer::OpModel op_model; - - // Epoch number and type - std::uint32_t temporal_epoch; - graphlib::NodeEpochType epoch_type; - - OpPerfData( - OpGrid grid, balancer::OpModel op_model, std::uint32_t temporal_epoch, graphlib::NodeEpochType epoch_type) : - grid(grid), op_model(op_model), temporal_epoch(temporal_epoch), epoch_type(epoch_type) - { - } - - OpPerfData() {} - - private: - bool _has_execution_cycles = false; // cache because calls are expensive - bool _has_bw_limited_execution_cycles = false; - std::uint32_t _cycle_count_ideal; - std::uint32_t _theoretical_cycles; - std::uint32_t _cycle_bw_limited; - - void _get_execution_cycles(std::string const &arch_name) - { - if (_has_execution_cycles) - return; - _cycle_count_ideal = op_model.get_execution_cycles(arch_name); - _theoretical_cycles = op_model.get_execution_cycles(arch_name, true); - _has_execution_cycles = true; - } - - void _get_bw_limited_execution_cycles(const DeviceConfig &device_config, const graphlib::Graph *graph); - - public: - std::uint32_t cycle_count_ideal(std::string const &arch_name) - { - _get_execution_cycles(arch_name); - return _cycle_count_ideal; - } - std::uint32_t theoretical_cycles(std::string const &arch_name) - { - _get_execution_cycles(arch_name); - return _theoretical_cycles; - } - std::uint32_t cycle_count_bw_limited(const DeviceConfig &device_config, const graphlib::Graph *graph) - { - _get_bw_limited_execution_cycles(device_config, graph); - return _cycle_bw_limited; - } -}; - -struct OpPerfCalculatedData -{ - // BWs - ideal/actual - std::vector input_bw_needed, input_bw_got; - float output_bw_perc; // the percentage of required bw we got (for worst case operand), which is also output bw% - float output_bw_ideal, output_bw_produced; - - // Cycle counts, utilization - float utilization; - std::uint32_t cycle_count_actual; -}; - -struct QueuePerfData -{ - // Location - dram, host, etc. 
- std::string location; - std::vector dram_channels; -}; - -struct QueuePerfCalculatedData -{ - float total_read_bw_ideal; // ideal total BW requested by all consumers - float write_bw_ideal; // ideal write BW from the producer - - float total_bw_perc; // the percentage of requested bw that we can get from dram - float total_read_bw_produced; // actual BW that can be given to the op - float write_bw_received; // actual write BW from the producer -}; - -struct Attr -{ - // number of inner-dim blocks (typically only for matmul) - std::uint32_t m_k; - std::uint32_t u_kt; -}; - -struct PerfData -{ - bool is_op; - Attr attr; // general attributes, only applicable to some ops/queues - - // Input/output shapes - std::vector inputs; - std::vector input_broadcast_multiplier; - TensorData output; - - OpPerfData op_perf_data; - OpPerfCalculatedData op_perf_calculated_data; - - QueuePerfData queue_perf_data; - QueuePerfCalculatedData queue_perf_calculated_data; - - PerfData( - std::vector inputs, - std::vector input_broadcast_multiplier, - TensorData output, - const OpPerfData &op_perf_data) : - is_op(true), - inputs(inputs), - input_broadcast_multiplier(input_broadcast_multiplier), - output(output), - op_perf_data(op_perf_data) - { - } - PerfData(std::vector inputs, TensorData output, const QueuePerfData &queue_perf_data) : - is_op(false), inputs(inputs), output(output), queue_perf_data(queue_perf_data) - { - } -}; - -enum NodeType -{ - OP, - QUEUE -}; - -using PerfDataP = std::shared_ptr; -class Node -{ - private: - std::string name; - NodeType type; - - std::vector operands; - std::vector outputs; - - std::string op_type; - graphlib::QueueNodeType queue_type; - - PerfDataP perf_data; - - public: - Node(const std::string &name, const std::string &op_type, const std::vector &operands, PerfDataP perf_data); - Node(const std::string &name, graphlib::QueueNodeType queue_type, NodeP operand, PerfDataP perf_data); - - void add_output(NodeP node) { outputs.push_back(node); } - - const std::vector &get_operands() const { return operands; } - const std::vector &get_outputs() const { return outputs; } - std::string get_name() const { return name; } - - bool is_op() const { return type == NodeType::OP; } - std::string get_op_type() const - { - TT_ASSERT(is_op()); - return op_type; - } - - bool is_queue() const { return type == NodeType::QUEUE; } - graphlib::QueueNodeType get_queue_type() const - { - TT_ASSERT(is_queue()); - return queue_type; - } - - PerfDataP get_perf_data() const { return perf_data; } - - // Find which input is fed by node - std::size_t get_operand_index(const NodeP node, std::uint32_t start = 0) const; -}; - -class Graph -{ - private: - std::vector nodes; - std::vector inputs; - - public: - Graph() {} - NodeP add_op( - const std::string &name, - const std::string &op_type, - const std::vector &operands, - PerfDataP perf_data, - bool is_input); - NodeP add_queue( - const std::string &name, graphlib::QueueNodeType queue_type, NodeP operand, PerfDataP perf_data, bool is_input); - const std::vector &get_nodes() const { return nodes; } - const std::vector &get_inputs() const { return inputs; } - std::vector get_outputs() const; - - private: - // Add a created node - NodeP add_node(NodeP node, bool input); - - // Find the longest op in the graph, and its length - std::pair get_longest_op() const; -}; - -} // namespace perf_model -} // namespace tt diff --git a/pybuda/csrc/perf_model/module.mk b/pybuda/csrc/perf_model/module.mk deleted file mode 100644 index 4a5a9da84..000000000 --- 
a/pybuda/csrc/perf_model/module.mk +++ /dev/null @@ -1,33 +0,0 @@ -# Every variable in subdir must be prefixed with subdir (emulating a namespace) - -PYBUDA_CSRC_PERF_MODEL_LIB = $(LIBDIR)/libperf_model.a -PYBUDA_CSRC_PERF_MODEL_SRCS = \ - pybuda/csrc/perf_model/graph.cpp \ - pybuda/csrc/perf_model/perf_model.cpp \ - pybuda/csrc/perf_model/event.cpp \ - pybuda/csrc/perf_model/trace.cpp \ - pybuda/csrc/perf_model/simulator.cpp - -PYBUDA_CSRC_PERF_MODEL_INCLUDES = $(PYBUDA_CSRC_INCLUDES) - -PYBUDA_CSRC_PERF_MODEL_OBJS = $(addprefix $(OBJDIR)/, $(PYBUDA_CSRC_PERF_MODEL_SRCS:.cpp=.o)) -PYBUDA_CSRC_PERF_MODEL_DEPS = $(addprefix $(OBJDIR)/, $(PYBUDA_CSRC_PERF_MODEL_SRCS:.cpp=.d)) - - - --include $(PYBUDA_CSRC_PERF_MODEL_DEPS) - -PERF_MODEL_CSRC_CFLAGS = $(PYBUDA_CSRC_CFLAGS) - -# Each module has a top level target as the entrypoint which must match the subdir name -pybuda/csrc/perf_model: $(PYBUDA_CSRC_PERF_MODEL_LIB) - -$(PYBUDA_CSRC_PERF_MODEL_LIB): $(PYBUDA_CSRC_PERF_MODEL_OBJS) $(PYBUDA_CSRC_GRAPH_LIB) - @mkdir -p $(LIBDIR) - ar rcs $@ $^ - -$(OBJDIR)/pybuda/csrc/perf_model/%.o: pybuda/csrc/perf_model/%.cpp - @mkdir -p $(@D) - $(CXX) $(PERF_MODEL_CSRC_CFLAGS) $(CXXFLAGS) $(STATIC_LIB_FLAGS) $(PYBUDA_CSRC_PERF_MODEL_INCLUDES) -c -o $@ $< - -include pybuda/csrc/perf_model/tests/module.mk diff --git a/pybuda/csrc/perf_model/perf_model.cpp b/pybuda/csrc/perf_model/perf_model.cpp deleted file mode 100644 index 5bd96b7c5..000000000 --- a/pybuda/csrc/perf_model/perf_model.cpp +++ /dev/null @@ -1,618 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "perf_model/perf_model.hpp" - -#include - -#include "graph_lib/graph.hpp" -#include "graph_lib/node.hpp" -#include "graph_lib/node_types.hpp" -#include "graph_lib/utils.hpp" -#include "perf_model/graph.hpp" -#include "perf_model/simulator.hpp" -#include "placer/placer.hpp" -#include "utils/logger.hpp" - -using tt::LogPerfModel; - -namespace tt::perf_model -{ - -// Loop up operand nodes in the map and conver -std::vector get_node_operands( - graphlib::Graph *g, graphlib::Node *node, const std::unordered_map node_map) -{ - std::vector ret; - for (graphlib::Node *operand : g->data_operands(node)) - { - if ((node_map.count(operand)) == 0) - continue; - ret.push_back(node_map.at(operand)); - } - return ret; -} - -std::vector get_node_inputs(graphlib::Graph *g, graphlib::Node *node) -{ - std::vector inputs; - for (graphlib::Node *operand : g->data_operands(node)) - { - inputs.push_back( - {.shape = operand->shape(), - .t = 1, // TOOD - .df = operand->output_df()}); - } - return inputs; -} - -std::vector get_node_input_broadcast_multiplier(graphlib::Graph *g, graphlib::Node *node) -{ - auto operand_edges = g->operand_data_edges(node); - if (node->node_type() != graphlib::kBudaOp) - return std::vector(operand_edges.size(), 1); - - std::vector m; - for (Edge e : g->operand_data_edges(node)) - { - auto tms = g->get_edge_attributes(e)->get_tms(); - std::uint32_t multiplier = 1; - for (auto tm : tms) - { - if (tm.op == "broadcast") - { - multiplier *= std::get(tm.attr[1]); - } - } - TT_ASSERT(multiplier >= 1); - m.push_back(multiplier); - } - - return m; -} - -OpGrid get_op_grid(const placer::OpPlacement &placement) -{ - const auto &p = placement.placed_cores; - if (placement.grid_transpose) - return OpGrid{.loc_r = p.start.row, .loc_c = p.start.col, .size_r = p.size_c(), .size_c = p.size_r()}; - - return OpGrid{.loc_r = p.start.row, .loc_c = p.start.col, .size_r = p.size_r(), .size_c = p.size_c()}; 
-} - -// Generate static perf data for an op -PerfDataP get_op_perf_data( - graphlib::Graph *g, graphlib::BudaOpNode *op, const std::shared_ptr balancer_solution) -{ - std::vector inputs = get_node_inputs(g, op); - balancer::OpModel op_model = balancer_solution->op_models.at(op->name()); - auto ret = std::make_shared(PerfData{ - inputs, - get_node_input_broadcast_multiplier(g, op), - TensorData{.shape = op->shape(), .t = 1, .df = op->output_df()}, - - OpPerfData( - - get_op_grid(balancer_solution->placer_solution.name_to_op_placement.at(op->name())), - op_model, - balancer_solution->placer_solution.temporal_epoch_id(op->name()), - op->get_epoch_type())}); - - if (op->op_type().op == "matmul") - { - ret->attr.m_k = std::get(op->op_type().buda_attrs["m_k"]); - ret->attr.u_kt = std::get(op->op_type().buda_attrs["u_kt"]); - } - if (op->op_type().op == "reduce") - { - if (std::get(op->op_type().buda_attrs["dim"]) == "z") - { - ret->attr.m_k = std::get(op->op_type().buda_attrs["z"]); - } - } - return ret; -} - -graphlib::QueueNodeType get_queue_type(graphlib::Node *node) { return node->as()->queue_type(); } - -// Generate static perf data for a queue -PerfDataP get_queue_perf_data(graphlib::Graph *g, graphlib::Node *node) -{ - return std::make_shared(PerfData{ - get_node_inputs(g, node), - TensorData{.shape = node->shape(), .t = 1, .df = node->output_df()}, - QueuePerfData{ - .location = "dram", // TODO - .dram_channels = {0}}}); -} - -// Update per-epoch structures -void init_epoch( - std::uint32_t temporal_epoch_id, - std::vector> &temporal_epoch_graphs, - std::vector &epoch_node_map) -{ - while (temporal_epoch_graphs.size() <= temporal_epoch_id) - { - temporal_epoch_graphs.push_back(std::make_unique()); - epoch_node_map.push_back(PerfModel::NodeMap()); - } -} - -void PerfModel::create_op( - graphlib::Graph *g, - graphlib::BudaOpNode *op, - const std::shared_ptr balancer_solution, - NodeMap &node_map, - std::vector &epoch_node_map) -{ - std::uint32_t temporal_epoch_id = balancer_solution->placer_solution.temporal_epoch_id(op->name()); - init_epoch(temporal_epoch_id, temporal_epoch_graphs, epoch_node_map); - - PerfDataP perf_data = get_op_perf_data(g, op, balancer_solution); - - // Add to global graph - auto global_operands = get_node_operands(g, op, node_map); - NodeP new_node = - graph->add_op(op->name(), op->op_type().op, global_operands, perf_data, global_operands.size() == 0); - node_map.insert(std::make_pair(op, new_node)); - - // Add to epoch graph - auto epoch_operands = get_node_operands(g, op, epoch_node_map[temporal_epoch_id]); - std::string op_type = op->is_sparse_matmul() ? 
"sparse_matmul" : op->op_type().op; - NodeP epoch_new_node = temporal_epoch_graphs[temporal_epoch_id]->add_op( - op->name(), - op_type, - epoch_operands, - perf_data, - epoch_operands.size() == 0); // TODO: figure out input in constructor instead of here - epoch_node_map[temporal_epoch_id].insert(std::make_pair(op, epoch_new_node)); -} - -void PerfModel::create_tm( - graphlib::Graph *g, - graphlib::BudaNaryTMNode *tm, - const std::shared_ptr balancer_solution, - NodeMap &node_map, - std::vector &epoch_node_map) -{ - graphlib::BudaOpNode *user = g->data_users(tm).at(0)->as(); - std::uint32_t temporal_epoch_id = balancer_solution->placer_solution.temporal_epoch_id(user->name()); - init_epoch(temporal_epoch_id, temporal_epoch_graphs, epoch_node_map); - - PerfDataP perf_data = get_op_perf_data(g, user, balancer_solution); - - // Add to global graph - auto global_operands = get_node_operands(g, tm, node_map); - NodeP new_node = - graph->add_op(tm->name(), tm->op_type().op, global_operands, perf_data, global_operands.size() == 0); - node_map.insert(std::make_pair(tm, new_node)); - - // Add to epoch graph - auto epoch_operands = get_node_operands(g, tm, epoch_node_map[temporal_epoch_id]); - NodeP epoch_new_node = temporal_epoch_graphs[temporal_epoch_id]->add_op( - tm->name(), - tm->op_type().op, - epoch_operands, - perf_data, - epoch_operands.size() == 0); // TODO: figure out input in constructor instead of here - epoch_node_map[temporal_epoch_id].insert(std::make_pair(tm, epoch_new_node)); -} - -void PerfModel::create_queue( - graphlib::Graph *g, - graphlib::QueueNode *q, - const std::shared_ptr balancer_solution, - NodeMap &node_map, - std::vector &epoch_node_map) -{ - // Queue could be e2e, in which case it's both input and output, on different epoch graphs - - // Check if there's a producer - std::vector operands = get_node_operands(g, q, node_map); - NodeP operand = operands.size() > 0 ? 
operands[0] : nullptr; - - // If there's no producer, it's a global input - bool is_input = operand == nullptr; - - // Add to global graph - PerfDataP perf_data = get_queue_perf_data(g, q); - NodeP new_node = graph->add_queue(q->name(), get_queue_type(q), operand, perf_data, is_input); - node_map.insert(std::make_pair(q, new_node)); - - // Add as output to producer epoch - std::uint32_t producer_epoch_id = 0; - if (operand != nullptr) - { - producer_epoch_id = balancer_solution->placer_solution.temporal_epoch_id(g->data_operands(q)[0]->name()); - init_epoch(producer_epoch_id, temporal_epoch_graphs, epoch_node_map); - - // Get operand from the temporal epoch - std::vector operands = get_node_operands(g, q, epoch_node_map[producer_epoch_id]); - TT_ASSERT(operands.size() > 0); - operand = operands[0]; - - // Make a new copy of perf data, because in each epoch, the queue will be used differently - PerfDataP perf_data = get_queue_perf_data(g, q); - NodeP epoch_new_node = temporal_epoch_graphs[producer_epoch_id]->add_queue( - q->name(), get_queue_type(q), operand, perf_data, false /* epoch input */); - epoch_node_map[producer_epoch_id].insert(std::make_pair(q, epoch_new_node)); - } - - // Add as input to each epoch that reads it, unless it's also the producer_epoch_id - std::unordered_set inserted_epochs; - for (graphlib::Node *user : g->data_users(q)) - { - std::uint32_t temporal_epoch_id = balancer_solution->placer_solution.temporal_epoch_id(user->name()); - if ((operand != nullptr) && (producer_epoch_id == temporal_epoch_id)) - continue; - - if (inserted_epochs.count(temporal_epoch_id) > 0) - continue; // already added to this one - - inserted_epochs.insert(temporal_epoch_id); - - init_epoch(temporal_epoch_id, temporal_epoch_graphs, epoch_node_map); - - // Make a new copy of perf data, because in each epoch, the queue will be used differently - PerfDataP perf_data = get_queue_perf_data(g, q); - NodeP epoch_new_node = temporal_epoch_graphs[temporal_epoch_id]->add_queue( - q->name(), get_queue_type(q), nullptr, perf_data, true /* epoch input */); - epoch_node_map[temporal_epoch_id].insert(std::make_pair(q, epoch_new_node)); - } -} - -void PerfModel::create_graphs(graphlib::Graph *g, const std::shared_ptr balancer_solution) -{ - graph = std::make_unique(); - NodeMap node_map; // map of original graph to perf graph nodes - std::vector epoch_node_map; // map of original graph to perf graph nodes - - std::ofstream op_perf; - bool dump_op_perf = env_as("PYBUDA_OP_PERF"); - if (dump_op_perf) - { - op_perf.open("op_perf.csv"); - op_perf << "name, type, epoch, grid, tiles, cycles, limiter_cycles" << std::endl; - } - - // Convert the graph - for (graphlib::Node *node : graphlib::topological_sort(*g)) - { - if (node->node_type() == graphlib::NodeType::kBudaOp) - { - create_op(g, node->as(), balancer_solution, node_map, epoch_node_map); - NodeP op = node_map.at(node); - if (dump_op_perf) - op_perf << op->get_name() << ", " << op->get_op_type() << ", " - << balancer_solution->placer_solution.temporal_epoch_id(op->get_name()) << "," - << op->get_perf_data()->op_perf_data.grid.size_r << "x" - << op->get_perf_data()->op_perf_data.grid.size_c << ", " - << op->get_perf_data()->output.size_in_tiles() << ", " - << op->get_perf_data()->op_perf_data.cycle_count_ideal(device_config.arch_name) << ", " - << op->get_perf_data()->op_perf_data.cycle_count_bw_limited(device_config, g) << std::endl; - } - else if (node->node_type() == graphlib::NodeType::kBudaNaryTM) - create_tm(g, node->as(), balancer_solution, node_map, 
epoch_node_map); - else - create_queue(g, node->as(), balancer_solution, node_map, epoch_node_map); - } - if (dump_op_perf) - op_perf.close(); -} - -OpPerfCalculatedData get_op_perf_calculated_data(const PerfDataP perf_data, std::string const& arch_name) -{ - OpPerfCalculatedData ret; - for (auto input : perf_data->inputs) - { - ret.input_bw_needed.push_back(1.0 * input.size_in_bytes() / perf_data->op_perf_data.cycle_count_ideal(arch_name)); - } - ret.output_bw_ideal = 1.0 * perf_data->output.size_in_bytes() / perf_data->op_perf_data.cycle_count_ideal(arch_name); - return ret; -} - -void PerfModel::calculate_ideal_bws(const SystemSpec& system_spec) -{ - // For each op, calculate ideal input/output bws - for (NodeP node : graph->get_nodes()) - { - PerfDataP perf_data = node->get_perf_data(); - if (!perf_data->is_op) - continue; - - perf_data->op_perf_calculated_data = get_op_perf_calculated_data(perf_data, system_spec.arch_name); - } - - // For queues, add up input bws for each of its consumers - for (auto &epoch_graph : temporal_epoch_graphs) - { - for (NodeP node : epoch_graph->get_nodes()) - { - PerfDataP perf_data = node->get_perf_data(); - if (perf_data->is_op) - continue; - - float total_bw = 0.0; - for (NodeP user : node->get_outputs()) - { - std::uint32_t operand_index = user->get_operand_index(node); - TT_ASSERT(user->is_op()); - total_bw += user->get_perf_data()->op_perf_calculated_data.input_bw_needed[operand_index]; - } - perf_data->queue_perf_calculated_data.total_read_bw_ideal = total_bw; - - // If it's writen to, record that - for (NodeP operand : node->get_operands()) - { - TT_ASSERT(operand->is_op()); - perf_data->queue_perf_calculated_data.write_bw_ideal = - operand->get_perf_data()->op_perf_calculated_data.output_bw_ideal; - break; // one operand only - } - } - } -} - -void propagate_bws(Graph *graph, const SystemSpec &system) -{ - // Figure out how much dram channel bw is available to each queue - std::vector total_dram_bw_requested(system.dram_bw.size(), 0.0); - for (NodeP node : graph->get_nodes()) - { - PerfDataP perf_data = node->get_perf_data(); - if (perf_data->is_op) - continue; - - std::uint32_t num_channels_used = perf_data->queue_perf_data.dram_channels.size(); - for (std::uint32_t channel : perf_data->queue_perf_data.dram_channels) - { - total_dram_bw_requested[channel] += - perf_data->queue_perf_calculated_data.total_read_bw_ideal / (float)num_channels_used; - - total_dram_bw_requested[channel] += - perf_data->queue_perf_calculated_data.write_bw_ideal / (float)num_channels_used; - } - } - - // Figure out how much of the requested bw can we actually give each queue - std::vector dram_channel_bw_percentage; - for (std::uint32_t channel = 0; channel < system.dram_bw.size(); channel++) - { - dram_channel_bw_percentage.push_back( - (total_dram_bw_requested[channel] <= system.dram_bw[channel]) - ? 
1.0 - : system.dram_bw[channel] / total_dram_bw_requested[channel]); - } - - for (NodeP node : graph->get_nodes()) - { - PerfDataP perf_data = node->get_perf_data(); - OpPerfCalculatedData &od = perf_data->op_perf_calculated_data; - - if (perf_data->is_op) - { - float worst_input_bw_perc = 1.0; - for (std::size_t i = 0; i < node->get_operands().size(); i++) - { - PerfDataP operand_perf_data = node->get_operands()[i]->get_perf_data(); - float operand_bw; - if (operand_perf_data->is_op) - operand_bw = operand_perf_data->op_perf_calculated_data.output_bw_produced; - else - operand_bw = operand_perf_data->queue_perf_calculated_data.total_read_bw_produced; - - od.input_bw_got.push_back(operand_bw); - - float perc = operand_bw / od.input_bw_needed[i]; - if (perc < worst_input_bw_perc) - worst_input_bw_perc = perc; - } - TT_ASSERT(worst_input_bw_perc > 0.0); - od.output_bw_perc = worst_input_bw_perc; - od.output_bw_produced = worst_input_bw_perc * od.output_bw_ideal; - od.cycle_count_actual = perf_data->op_perf_data.cycle_count_ideal(system.arch_name) / worst_input_bw_perc; - od.utilization = worst_input_bw_perc; // TODO - need baseline utilization first - } - else - { - // queue - QueuePerfCalculatedData &qd = perf_data->queue_perf_calculated_data; - - // Figure out total BW available - qd.total_bw_perc = 0.0; - for (std::uint32_t dram_channel : perf_data->queue_perf_data.dram_channels) - qd.total_bw_perc += dram_channel_bw_percentage[dram_channel]; - qd.total_bw_perc /= (float)perf_data->queue_perf_data.dram_channels.size(); - - qd.total_read_bw_produced = qd.total_bw_perc * qd.total_read_bw_ideal; - qd.write_bw_received = qd.total_bw_perc * qd.write_bw_ideal; - } - } -} - -bool is_matmul(NodeP node) -{ - // Matmul type, and not brcst / reduce - return ( - node->is_op() && ( (node->get_op_type() == "matmul") || (node->get_op_type() == "sparse_matmul") ) && (node->get_name().find("_brcst_") == std::string::npos) && - (node->get_name().find("_reduce_") == std::string::npos)); -} - -void PerfModel::calculate_utilization(const SystemSpec &system) -{ - std::ofstream os("utilization.txt"); - std::stringstream epoch_reports; - std::uint32_t core_count = system.grid_size_c * system.grid_size_r; - float overall_utilization = 0.0; - std::vector epoch_utilization; - - std::uint32_t total_matmul_cores = 0, total_other_cores = 0, total_empty_cores = 0, total_cores = 0; - for (std::uint32_t epoch = 0; epoch < temporal_epoch_graphs.size(); epoch++) - { - // Figure out how many cores are using matmuls vs. else vs. unused - // Figure out matmul utilization vs. 
max theoretical - const std::unique_ptr &epoch_graph = temporal_epoch_graphs[epoch]; - - epoch_reports << "Epoch " << epoch << std::endl; - epoch_reports << "===============" << std::endl; - std::stringstream matmul_reports; - - std::uint32_t matmul_cores = 0; - std::uint32_t other_cores = 0; - float matmul_util = 0.0; - - for (NodeP node : epoch_graph->get_nodes()) - { - if (!node->is_op()) - continue; - std::uint32_t cores = node->get_perf_data()->op_perf_data.grid.size(); - - if (is_matmul(node)) - { - std::uint32_t cycles = node->get_perf_data()->op_perf_data.cycle_count_ideal(system.arch_name); - std::uint32_t theoretical_cycles = node->get_perf_data()->op_perf_data.theoretical_cycles(system.arch_name); - float util = (float)theoretical_cycles / cycles; - std::uint32_t util_p = (util * 100.0); - - matmul_reports << node->get_name() << ": cores " << cores << ", cycles " << cycles << ", theoretical " - << theoretical_cycles << ", util " << util_p << "%" << std::endl; - - matmul_util = ((matmul_util * matmul_cores) + (util * cores)) / (matmul_cores + cores); - matmul_cores += cores; - } - else - { - other_cores += cores; - } - } - - std::uint32_t empty_cores = core_count - matmul_cores - other_cores; - float matmul_core_util = (float)matmul_cores / core_count; - std::uint32_t matmul_core_util_p = 100.0 * matmul_core_util; - - epoch_reports << "Matmul cores: " << matmul_cores << " (" << matmul_core_util_p << "%)" << std::endl; - epoch_reports << "Non-Matmul cores: " << other_cores << std::endl; - epoch_reports << "Empty cores: " << empty_cores << std::endl; - epoch_reports << std::endl; - std::uint32_t matmul_util_p = 100.0 * matmul_util; - epoch_reports << "Matmul math utilization: " << matmul_util_p << "%" << std::endl; - float epoch_util = matmul_util * matmul_core_util; - std::uint32_t epoch_util_p = 100.0 * epoch_util; - epoch_reports << "Overall epoch utilization: " << epoch_util_p << "%" << std::endl; - epoch_reports << std::endl << "Matmul report: " << std::endl; - epoch_reports << matmul_reports.str() << std::endl; - - epoch_utilization.push_back(epoch_util); - overall_utilization = ((epoch * overall_utilization) + epoch_util) / (epoch + 1); - - total_matmul_cores += matmul_cores; - total_other_cores += other_cores; - total_empty_cores += empty_cores; - total_cores += core_count; - } - - os << "Overall utilization: " << (std::uint32_t)(100.0 * overall_utilization) << "%" << std::endl; - os << "Total cores: " << total_cores << std::endl; - os << "Total matmul cores: " << total_matmul_cores << " (" - << (std::uint32_t)(100.0 * total_matmul_cores / total_cores) << "%)" << std::endl; - os << "Total non-matmul cores: " << total_other_cores << " (" - << (std::uint32_t)(100.0 * total_other_cores / total_cores) << "%)" << std::endl; - os << "Total empty cores: " << total_empty_cores << " (" - << (std::uint32_t)(100.0 * total_empty_cores / total_cores) << "%)" << std::endl; - os << std::endl; - - for (std::uint32_t epoch = 0; epoch < epoch_utilization.size(); epoch++) { - os << "Epoch " << epoch << " utilization: " << (std::uint32_t)(100.0 * epoch_utilization[epoch]) << "%" - << std::endl; - results["epoch_" + std::to_string(epoch) + "_utilization"] = epoch_utilization[epoch]; - } - os << std::endl; - os << epoch_reports.str(); - - results["overall_utilization"] = overall_utilization; - results["total_matmul_cores"] = total_matmul_cores; - results["total_non_matmul_cores"] = total_other_cores; - results["total_empty_cores"] = total_empty_cores; - - os.close(); -} - -// Convert graph to 
perf model graph, and graphs for each temporal epoch -PerfModel::PerfModel( - graphlib::Graph *g, - const std::string &graph_name, - const DeviceConfig &device_config, - const std::shared_ptr balancer_solution) : - graph_name(graph_name), - device_config(device_config) -{ - // create main and epoch graphs - create_graphs(g, balancer_solution); - SystemSpec system = SystemSpec::get_for_device(device_config); - - // calculate ideal bandwidths for queues and ops - calculate_ideal_bws(system); - - // calculate utilization - if (env_as("PYBUDA_PERF_UTIL")) - calculate_utilization(system); - - // Propagate BWs - for (auto &epoch_graph : temporal_epoch_graphs) propagate_bws(epoch_graph.get(), system); - - if (env_as("PYBUDA_PERF_SIMULATOR")) - { - std::uint32_t original_microbatch = g->get_microbatch(); - if (auto sim_mb = env_as_optional("PYBUDA_PERF_SIMULATOR_MICROBATCH")) - g->set_microbatch(*sim_mb); - - bool sim_log = env_as("PYBUDA_PERF_SIMULATOR_LOG"); - bool sim_trace = env_as("PYBUDA_PERF_SIMULATOR_TRACE"); - - std::uint32_t total_runtime = 0; - for (std::uint32_t epoch = 0; epoch < temporal_epoch_graphs.size(); epoch++) - { - auto &epoch_graph = temporal_epoch_graphs[epoch]; - auto sim = perf_model::Simulator(epoch_graph.get(), g->get_microbatch(), sim_trace, sim_log); - bool sim_ok = sim.run(device_config.arch_name, epoch); - TT_ASSERT(sim_ok); - std::uint32_t epoch_timestamp = sim.get_timestamp(); - log_debug(tt::LogPerfModel, "Epoch {} expected cycles: {}", epoch, epoch_timestamp); - results["expected_epoch_" + std::to_string(epoch) + "_cycles"] = epoch_timestamp; - total_runtime += epoch_timestamp; - } - // TBD device config - float cycles_per_second = 1.2 * 1000000000; - float expected_perf = round(100.0 * g->get_microbatch() * (cycles_per_second / total_runtime)) / 100.0; - log_info( - tt::LogPerfModel, - "Expected perf: {} samples/s (Total cycles {} for {} inputs)", - expected_perf, - total_runtime, - g->get_microbatch()); - - results["total_runtime_cycles"] = total_runtime; - results["expected_perf"] = expected_perf; - - if (env_as("PYBUDA_PERF_STOP_AFTER_SIMULATOR")) - TT_ASSERT(false); // hacky way to stop - - // revert - g->set_microbatch(original_microbatch); - } -} - -std::unordered_map run_performance_model( - graphlib::Graph *g, - const std::string &graph_name, - const DeviceConfig &device_config, - const std::shared_ptr balancer_solution) -{ - log_info(tt::LogPerfModel, "Running performance model..."); - PerfModel model = PerfModel(g, graph_name, device_config, balancer_solution); - return model.get_results(); -} - -} // namespace tt::perf_model - diff --git a/pybuda/csrc/perf_model/perf_model.hpp b/pybuda/csrc/perf_model/perf_model.hpp deleted file mode 100644 index aa8d57d4f..000000000 --- a/pybuda/csrc/perf_model/perf_model.hpp +++ /dev/null @@ -1,77 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include - -#include "balancer/balancer.hpp" -#include "perf_model/graph.hpp" - -namespace tt -{ -namespace graphlib -{ -class Graph; -} -namespace perf_model -{ - -class PerfModel -{ - private: - std::unique_ptr graph; - std::vector> temporal_epoch_graphs; - std::string graph_name; - const DeviceConfig &device_config; - - std::unordered_map results; - - public: - PerfModel( - graphlib::Graph *g, - const std::string &graph_name, - const DeviceConfig &device_config, - const std::shared_ptr balancer_solution); - - using NodeMap = std::unordered_map; - - std::unordered_map get_results() const { return 
results; } - - private: - void create_graphs(graphlib::Graph *g, const std::shared_ptr balancer_solution); - void calculate_ideal_bws(const SystemSpec &system); - - void create_op( - graphlib::Graph *g, - graphlib::BudaOpNode *op, - const std::shared_ptr balancer_solution, - NodeMap &node_map, - std::vector &epoch_node_map); - - void create_tm( - graphlib::Graph *g, - graphlib::BudaNaryTMNode *tm, - const std::shared_ptr balancer_solution, - NodeMap &node_map, - std::vector &epoch_node_map); - - void create_queue( - graphlib::Graph *g, - graphlib::QueueNode *q, - const std::shared_ptr balancer_solution, - NodeMap &node_map, - std::vector &epoch_node_map); - - void calculate_utilization(const SystemSpec &system); -}; - -std::unordered_map run_performance_model( - graphlib::Graph *g, - const std::string &graph_name, - const DeviceConfig &device_config, - const std::shared_ptr balancer_solution); - -} // namespace perf_model -} // namespace tt - diff --git a/pybuda/csrc/perf_model/simulator.cpp b/pybuda/csrc/perf_model/simulator.cpp deleted file mode 100644 index 00610945f..000000000 --- a/pybuda/csrc/perf_model/simulator.cpp +++ /dev/null @@ -1,501 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "perf_model/simulator.hpp" -#include "utils/assert.hpp" - -namespace tt::perf_model -{ -std::uint32_t Buffer::s_id = 0; -std::ofstream Simulator::s_log; -bool Simulator::s_write_log = false; - -Simulator::Simulator(Graph *graph, std::uint32_t input_count, bool trace, bool log) : - graph(graph), input_count(input_count) -{ - sim_state = std::make_unique( - SimState{.timestamp = 0, .total_input_count = input_count, .trace = trace, .trace_op = {}}); - s_write_log = log; -} - -SimCache::~SimCache() -{ - for (auto &[node, input_buffers] : node_input_buffer_map) - for (auto input_buffer : input_buffers) delete input_buffer; - - for (auto &[node, output_buffer] : node_output_buffer_map) delete output_buffer; -} - -std::uint32_t Buffer::available_space() const { return size - occupied - reserved; } -void Buffer::reserve_space(std::uint32_t count) -{ - TT_ASSERT(available_space() >= count); - reserved += count; -} - -void Buffer::insert_data(std::uint32_t count) -{ - TT_ASSERT(reserved >= count, "Trying to insert data without reserving space first"); - reserved -= count; - occupied += count; -} - -void Buffer::pop_data(std::uint32_t count) -{ - TT_ASSERT( - occupied >= count, - "Buffer underflow (have {}, popping {}): {}", occupied, count, to_string()); - occupied -= count; -} - -void Buffer::pop_threshold() { pop_data(threshold); } - -bool Buffer::above_threshold() const { return occupied >= threshold; } - -bool Buffer::empty() const { return (occupied == 0) && (reserved == 0); } - -std::string Buffer::to_string(bool show_contents) const -{ - std::stringstream ss; - ss << "#" << unique_id; - if (input) - ss << " InputBuffer"; - else - ss << " OutputBuffer"; - - ss << "(" << owner->get_name(); - if (input) - ss << ", operand " << operand; - ss << ")"; - - if (show_contents) - ss << " size: " << size << ", occupied: " << occupied << ", reserved: " << reserved - << ", threshold: " << threshold << ", broadcast_x: " << broadcast_multiplier; - return ss.str(); -} - -std::vector> SimCache::node_outputs(NodeP node) -{ - // Get consumer / op index - auto consumers = [&](const NodeP node) - { - std::vector> ret; - std::unordered_map - last_operand_index; // to detect the case where multiple outputs go to same node - for (NodeP user : 
node->get_outputs()) - { - auto operand_index = user->get_operand_index(node, last_operand_index[user]); - last_operand_index[user] = operand_index + 1; // skip this one next time it's looked up - ret.push_back(std::make_pair(node_input_buffer(user, operand_index), operand_index)); - } - return ret; - }; - - auto it = node_output_map.find(node); - if (it == node_output_map.end()) - { - auto ret = consumers(node); - node_output_map.insert(std::make_pair(node, ret)); - return ret; - } - - return it->second; -} - -const std::vector SimCache::node_input_buffers(NodeP node) -{ - auto it = node_input_buffer_map.find(node); - if (it == node_input_buffer_map.end()) - { - std::vector ibs; - for (std::size_t operand_index = 0; operand_index < node->get_operands().size(); operand_index++) - { - if (!node->is_op()) - { - std::uint32_t output_size = node_output_size_in_tiles(node); - ibs.push_back(new Buffer(node, operand_index, output_size * 2, output_size, 1)); - continue; - } - - NodeP operand = node->get_operands()[operand_index]; - std::uint32_t input_size = - node->get_perf_data()->op_perf_data.op_model.input_buffers.at(operand_index).l1_size_tiles; - - auto perf_data = node->get_perf_data()->op_perf_data; - auto input_block_shape = perf_data.op_model.input_buffers.at(operand_index).block_shape; - std::uint32_t threshold = input_block_shape.buffered_rt() * input_block_shape.buffered_ct(); - - std::uint32_t grid_multiplier = 0; - if ( (node->get_op_type() == "matmul") || (node->get_op_type() == "sparse_matmul") ) - { - if (operand_index == 0) - grid_multiplier = perf_data.grid.size_r; // same activations on each column - else if (operand_index == 1) - grid_multiplier = perf_data.grid.size_c; // same params on each row - else if (operand_index == 2) { - if (node->get_op_type() == "matmul") - { - // Bias input shape is just ublock, but matmul even doesn't read this ublock by ublock... - // so let's override here for now - threshold = perf_data.op_model.input_buffers.at(1).block_shape.buffered_ct(); - if (input_size < threshold) input_size = threshold; // not correct, but ok for bias... TODO - grid_multiplier = perf_data.grid.size_c * node->get_perf_data()->output.shape.rt(); // bias - } - else - { - // Special case, we read some variable amount per t... so we'll just read all of it once at the end - input_size = node->get_perf_data()->inputs.at(2).size_in_tiles(); - threshold = input_size; - grid_multiplier = 1; - } - } - else - { - TT_THROW("Invalid operand for matmul"); - } - - } - else - grid_multiplier = perf_data.grid.size(); - - input_size *= grid_multiplier; - threshold *= grid_multiplier; - std::uint32_t broadcast_multiplier = node->get_perf_data()->input_broadcast_multiplier.at(operand_index); - - if (node->get_op_type() == "fused_op") - { - // We need to do some fused op input analysis when fusing, and then figure out their ublock consumption rate when setting input ublock shapes, and input buffers. - // Unitl then, we can "cheat", and pretend that fused op consumes and produces full inputs/mblocks, and doesn't stream ublock by ublock. 
- auto input_tensor_shape = perf_data.op_model.op_shape.inputs[operand_index]; - threshold = input_tensor_shape.rt * input_tensor_shape.ct; - input_size = threshold * 2; - } - - - //input_size *= 1024; // TEST - - ibs.push_back(new Buffer( - node, - operand_index, - input_size, - threshold, - broadcast_multiplier)); - - SIMLOG << "Node " << node->get_name() << ", operand " << operand_index << " input buffer: " - << " input_size: " << input_size << ", threshold: " << threshold << " (" - << input_block_shape.buffered_rt() << ", " << input_block_shape.buffered_ct() - << "), broadcast_x: " << broadcast_multiplier << " grid: " << perf_data.grid.size_r << "x" - << perf_data.grid.size_c << ": "; - SIMLOG << ibs.back()->to_string() << std::endl; - } - - node_input_buffer_map.insert(std::make_pair(node, ibs)); - return ibs; - } - - return it->second; -} - -Buffer *SimCache::node_input_buffer(NodeP node, std::uint32_t operand_index) -{ - auto input_buffers = node_input_buffers(node); - TT_ASSERT(operand_index < input_buffers.size()); - return input_buffers.at(operand_index); -} - -Buffer *SimCache::node_output_buffer(NodeP node) -{ - auto it = node_output_buffer_map.find(node); - if (it == node_output_buffer_map.end()) - { - return create_node_output_buffer(node); - } - - return it->second; -} - -// API to explicitly create an output buffer with given output multiplier -Buffer *SimCache::create_node_output_buffer(NodeP node, std::uint32_t output_mb) -{ - std::uint32_t output_size = node->get_perf_data()->output.size_in_tiles() * output_mb; // default for nodes (like input) that don't have output buffers - if (node->get_perf_data()->op_perf_data.op_model.output_buffers.size() > 0) - { - output_size = node->get_perf_data()->op_perf_data.op_model.output_buffers.at(0).l1_size_tiles; - output_size *= node->get_perf_data()->op_perf_data.op_model.grid_shape.volume(); - } - - auto ret = new Buffer(node, output_size); - node_output_buffer_map.insert(std::make_pair(node, ret)); - return ret; -} - -std::uint32_t SimCache::node_output_size_in_tiles(NodeP node) -{ - auto it = node_output_size_map.find(node); - if (it == node_output_size_map.end()) - { - std::uint32_t output_size = node->get_perf_data()->output.size_in_tiles(); - node_output_size_map.insert(std::make_pair(node, output_size)); - return output_size; - } - return it->second; -} - -void Simulator::unstall_dependencies(Buffer *b) -{ - auto it = stalled_events.find(b); - if (it != stalled_events.end()) - { - // Find the lowest input for which we have a stalled event. Don't unstall any after it, it's not necessary. 
- std::uint32_t lowest_input = UINT32_MAX; - for (DataEvent *e : it->second) - if (e->get_input_index() < lowest_input) - lowest_input = e->get_input_index(); - - std::vector remaining_events; - for (DataEvent *e : it->second) - { - // TODO - if (e->get_input_index() > lowest_input + 1) - { - remaining_events.push_back(e); - continue; - } - - // Erase the event from any other buffer stalls it was on - for (Buffer *other_b : stalled_events_reverse_map.at(e)) - { - if (other_b == b) - continue; - auto &v = stalled_events.at(other_b); - auto it2 = std::find(v.begin(), v.end(), e); - TT_ASSERT(it2 != v.end()); - v.erase(it2); - } - - // Not stalled any more - stalled_events_reverse_map.erase(e); - - add_data_event(e); - SIMLOG << " UNSTALL " << e->to_string() << std::endl; - } - - if (remaining_events.size() > 0) - stalled_events[b] = remaining_events; - else - stalled_events.erase(it); - } -} - -void Simulator::schedule_ops(SimCacheP &cache) -{ - for (NodeP node : graph->get_nodes()) - { - if (node->is_op()) - { - add_data_event(new OpDataEvent( - 0, - TimeData{.count = cache->node_output_size_in_tiles(node), .timestamp = 0}, - cache->node_output_buffer(node), - 0, - 0, - 0)); - } - else if ((node->get_operands().size() > 0) && (node->get_outputs().size() > 0)) - { - // Intra-epoch queue - add_data_event(new QueueDataEvent( - 0, - TimeData{.count = cache->node_output_size_in_tiles(node), .timestamp = 0}, - cache->node_input_buffer(node, 0))); - } - } -} - -bool Simulator::run(std::string const& arch_name, std::uint32_t epoch) -{ - if (Simulator::s_write_log) - s_log.open("simulator.log"); - - SIMLOG << "NODES:" << std::endl; - for (NodeP node : graph->get_nodes()) - { - SIMLOG << " - " << node->get_name() << std::endl; - for (NodeP input : node->get_operands()) SIMLOG << " *-> " << input->get_name() << std::endl; - for (NodeP output : node->get_outputs()) SIMLOG << " <-* " << output->get_name() << std::endl; - } - - // Cache various lookups that we'll be doing a lot - auto cache = std::make_unique(); - - // Populate host read/write events - initialize_io(cache, sim_state); - - // Schedule first set of ops - schedule_ops(cache); - - sim_state->timestamp = 0; - - if (env_as("PYBUDA_PERF_SIMULATOR_TRACE")) - { - for (NodeP node : graph->get_nodes()) - { - if (!node->is_op()) - continue; - - if (sim_state->trace) - sim_state->trace_op.emplace(std::pair( - node, - new TraceOp( - node->get_name(), - node->get_op_type(), - node->get_perf_data()->op_perf_data.grid, - DataFormat::Float16_b, - {DataFormat::Float16_b}))); - } - } - - while (event_queue.size() > 0) - { - DataEvent *event = pop_data_event(); - if (event->timestamp() > sim_state->timestamp) - sim_state->timestamp = event->timestamp(); - - SIMLOG << "@" << sim_state->timestamp << " Processing: " << event->to_string() << std::endl; - - ProcessStatus ps = event->process(sim_state, cache, arch_name); - - for (Buffer *b : ps.modified_buffers) unstall_dependencies(b); - - for (Buffer *stall_buffer : ps.stall_reason) - { - // Record the stall so that we can re-queue this event later - SIMLOG << " STALLED on " << stall_buffer->to_string() << std::endl; - stalled_events[stall_buffer].push_back(event); - stalled_events_reverse_map[event].push_back(stall_buffer); - } - - if (ps.stall_reason.size() == 0) - { - // Done - delete event; - } - - // Schedule new events - for (DataEvent *new_event : ps.new_events) - { - SIMLOG << " SCHEDULE " << new_event->to_string() << std::endl; - add_data_event(new_event); - } - } - - for (auto &[stalled, bufs] : 
stalled_events_reverse_map) - { - SIMLOG << "** STALLED event: " << stalled->to_string() << std::endl; - for (Buffer *b : bufs) - { - SIMLOG << " - on buffer: " << b->to_string() << std::endl; - } - delete stalled; - } - bool ok = stalled_events.size() == 0; - stalled_events.clear(); - stalled_events_reverse_map.clear(); - - for (NodeP node : graph->get_nodes()) - { - for (Buffer *input : cache->node_input_buffers(node)) - if (!input->empty()) { - SIMLOG << "** NON-EMPTY BUFFER: " << input->to_string(true) << std::endl; - ok = false; - } - - Buffer *output = cache->node_output_buffer(node); - if (!output->empty()) { - SIMLOG << "** NON-EMPTY BUFFER: " << output->to_string(true) << std::endl; - ok = false; - } - } - if (Simulator::s_write_log) - s_log.close(); - - if (ok && sim_state->trace) - { - std::ofstream os("perf_postprocess_epoch_" + std::to_string(epoch) + ".json"); - std::vector input_indices = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15}; - for (std::uint32_t i = input_count - 3; i < input_count; i++) - if (i > 2) - input_indices.push_back(i); - // std::vector input_indices; - // for (std::uint32_t input_index = 0; input_index < input_count; input_index++) - // input_indices.push_back(input_index); - - os << sim_state->trace_to_json(input_indices); - - os.close(); - } - return ok; -} - -// Populate input/output events -void Simulator::initialize_io(SimCacheP &cache, SimStateP &sim_state) -{ - for (std::uint32_t input_index = 0; input_index < input_count; input_index++) - { - for (NodeP input : graph->get_inputs()) - { - // Input buffer holds the full microbatch - Buffer *b = cache->create_node_output_buffer(input, sim_state->total_input_count); - std::uint32_t count = cache->node_output_size_in_tiles(input); - add_data_event(new HostWriteDataEvent(input_index, TimeData{.count = count, .timestamp = input_index}, b)); - } - - // TODO: for optimizer outputs, we'll read the weights out before they reach the optimizer ops... 
causing a hang - for (NodeP output : graph->get_outputs()) - { - Buffer *b = cache->node_input_buffer(output, 0); - std::uint32_t count = cache->node_output_size_in_tiles(output); - add_data_event(new HostReadDataEvent(input_index, TimeData{.count = count, .timestamp = input_index}, b)); - } - } -} - -void Simulator::add_data_event(DataEvent *event) { event_queue.push(event); } - -DataEvent *Simulator::pop_data_event() -{ - DataEvent *ret = event_queue.top(); - event_queue.pop(); - return ret; -} - -std::string SimState::trace_to_json(const std::vector &input_indices) const -{ - json j; - for (auto &[node, op] : trace_op) - { - op->add_to_json(j, input_indices); - } - - auto &e = j["per-epoch-events"]; - - e["AICLK"] = 1202; - e["average-input-throughput"] = 230.54; - e["device-id"] = 0; - e["last-input-" + std::to_string(input_indices.back()) + "-execution-time"] = 25000; - - auto &p = e["last-pack"]; - for (auto input_index : input_indices) - { - p["input_" + std::to_string(input_index)] = {{"core-id", "(x=2, y=2)"}, {"end-timestamp", 25000}}; - } - auto &u = e["unpack-first-block-available"]; - for (auto input_index : input_indices) - { - u["input_" + std::to_string(input_index)] = {{"core-id", "(x=2, y=2)"}, {"timestamp", 0}}; - } - - return j.dump(2); -} - -} // namespace tt::perf_model diff --git a/pybuda/csrc/perf_model/simulator.hpp b/pybuda/csrc/perf_model/simulator.hpp deleted file mode 100644 index fe6e4e59c..000000000 --- a/pybuda/csrc/perf_model/simulator.hpp +++ /dev/null @@ -1,177 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once -#include -#include -#include -#include - -#include "perf_model/event.hpp" -#include "perf_model/graph.hpp" -#include "perf_model/trace.hpp" - -#define SIMLOG \ - if (Simulator::s_write_log) \ - Simulator::s_log - -namespace tt::perf_model -{ - -// Input buffer keeps track of received data -class Buffer -{ - // Set at creation - static std::uint32_t s_id; - std::string unique_id; - NodeP owner; - bool input; // input or output - std::uint32_t size; - - // Input buffer only - std::uint32_t operand; - std::uint32_t threshold; // how much is needed by the node to process - std::uint32_t broadcast_multiplier; // incoming data is multiplied through broadcast - - // Simulation time - std::uint32_t reserved = 0; - std::uint32_t occupied = 0; - - public: - // Input buffer - Buffer( - NodeP owner, - std::uint32_t operand, - std::uint32_t size, - std::uint32_t threshold, - std::uint32_t broadcast_multiplier) : - owner(owner), - input(true), - size(size), - operand(operand), - threshold(threshold), - broadcast_multiplier(broadcast_multiplier) - { - TT_LOG_ASSERT(threshold <= size, "Buffer {} threshold ({}) is larger than buffer size ({})", owner->get_name(), threshold, size); - unique_id = "b" + std::to_string(s_id++); - } - - // Output buffer - Buffer(NodeP owner, std::uint32_t size) : owner(owner), input(false), size(size), threshold(0), broadcast_multiplier(0) - { - unique_id = "b" + std::to_string(s_id++); - } - - NodeP get_node() const { return owner; } - std::uint32_t get_operand() const { return operand; } - std::uint32_t get_threshold() const { return threshold; } - std::uint32_t get_broadcast_multiplier() const { return broadcast_multiplier; } - bool is_input() const { return input; } - - std::uint32_t available_space() const; - void reserve_space(std::uint32_t count); - void insert_data(std::uint32_t count); - void pop_data(std::uint32_t count); - void pop_threshold(); // pop data, 
threshold amount - bool above_threshold() const; - bool empty() const; - - std::string to_string(bool show_contents = false) const; - - // Process data in the buffer, and return the amount consumed, if any - std::uint32_t process(); -}; - -// Cache regularly looked up data that requires a bit of calculation -class SimCache -{ - private: - std::unordered_map> node_input_buffer_map; - std::unordered_map node_output_buffer_map; - std::unordered_map node_output_size_map; - using OutputMap = std::unordered_map>>; - OutputMap node_output_map; - - public: - ~SimCache(); - const std::vector node_input_buffers(NodeP node); - Buffer *node_input_buffer(NodeP node, std::uint32_t operand_index); - Buffer *node_output_buffer(NodeP node); - Buffer *create_node_output_buffer(NodeP node, std::uint32_t output_mb = 2); - std::uint32_t node_output_size_in_tiles(NodeP node); - std::vector> node_outputs(NodeP node); -}; - -// Simulator state -struct SimState -{ - std::uint32_t timestamp; - std::uint32_t total_input_count; - bool trace; // set to generate trace for routeagui - std::unordered_map trace_op; - - std::string trace_to_json(const std::vector &input_indices) const; -}; - -using SimCacheP = std::unique_ptr; -using SimStateP = std::unique_ptr; - -struct EventComp -{ - bool operator()(const DataEvent *a, const DataEvent *b) { return *b < *a; } -}; - -using EventQueue = std::priority_queue, EventComp>; - -// Main simulator class -class Simulator -{ - private: - // Graph we're modelling - Graph *graph; - std::uint32_t input_count; - - // Pending, non-stalled, events to be processed - EventQueue event_queue; - - // Current state - SimStateP sim_state; - - // Stalled events, keyed on buffer they are waiting on, as well as a reverse map - std::unordered_map> stalled_events; - std::unordered_map> stalled_events_reverse_map; - - // Input buffers - std::unordered_map> input_buffers; // vector (of operands) per node - - // Populate input/output events - void initialize_io(SimCacheP &cache, SimStateP &sim_state); - - // Schedule first set of ops on all cores - void schedule_ops(SimCacheP &cache); - - // Add data event to the end of the queue - void add_data_event(DataEvent *event); - - // Pop the left-most event - DataEvent *pop_data_event(); - - // Re-schedule events that were stalled on this buffer - void unstall_dependencies(Buffer *b); - - public: - Simulator(Graph *graph, std::uint32_t input_count, bool trace = false, bool log = false); - - // Temporary logging to a file - static bool s_write_log; - static std::ofstream s_log; - - // Run full simulation, return true if completed without a deadlock - // Epoch number is only used to generated logs and traces - bool run(std::string const& arch_name, std::uint32_t epoch = 0); - - // Get final timestamp - std::uint32_t get_timestamp() const { return sim_state->timestamp; } -}; - -} // namespace tt::perf_model diff --git a/pybuda/csrc/perf_model/tests/gtest_main.cpp b/pybuda/csrc/perf_model/tests/gtest_main.cpp deleted file mode 100644 index a4addef8a..000000000 --- a/pybuda/csrc/perf_model/tests/gtest_main.cpp +++ /dev/null @@ -1,11 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include -#include - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - pybind11::scoped_interpreter guard{}; - return RUN_ALL_TESTS(); -} diff --git a/pybuda/csrc/perf_model/tests/module.mk b/pybuda/csrc/perf_model/tests/module.mk deleted file mode 100644 index 094ae79d6..000000000 --- 
a/pybuda/csrc/perf_model/tests/module.mk +++ /dev/null @@ -1,22 +0,0 @@ -PYBUDA_CSRC_PERF_MODEL_TESTS = $(TESTDIR)/pybuda/csrc/perf_model/tests/perf_model_unit_tests -PYBUDA_CSRC_PERF_MODEL_TESTS_SRCS = \ - pybuda/csrc/perf_model/tests/simulator_tests.cpp \ - pybuda/csrc/perf_model/tests/gtest_main.cpp - -PYBUDA_CSRC_PERF_MODEL_TESTS_INCLUDES = $(PYBUDA_CSRC_PERF_MODEL_INCLUDES) -PYBUDA_CSRC_PERF_MODEL_TESTS_LDFLAGS = -lstdc++fs -lgtest -lgtest_main -lpthread -l$(PYTHON_VERSION) -lm - -PYBUDA_CSRC_PERF_MODEL_TESTS_OBJS = $(addprefix $(OBJDIR)/, $(PYBUDA_CSRC_PERF_MODEL_TESTS_SRCS:.cpp=.o)) -PYBUDA_CSRC_PERF_MODEL_TESTS_DEPS = $(addprefix $(OBJDIR)/, $(PYBUDA_CSRC_PERF_MODEL_TESTS_SRCS:.cpp=.d)) - --include $(PYBUDA_CSRC_PERF_MODEL_TESTS_DEPS) - -pybuda/csrc/perf_model/tests: $(PYBUDA_CSRC_PERF_MODEL_TESTS) - -$(PYBUDA_CSRC_PERF_MODEL_TESTS): $(PYBUDA_CSRC_PERF_MODEL_TESTS_OBJS) $(PYBUDA_CSRC_PERF_MODEL_LIB) $(PYBUDA_CSRC_GRAPH_LIB) - @mkdir -p $(@D) - $(CXX) $(PERF_MODEL_CSRC_CFLAGS) $(CXXFLAGS) -o $@ $^ $(LDFLAGS) $(PYBUDA_CSRC_PERF_MODEL_TESTS_LDFLAGS) - -$(OBJDIR)/pybuda/csrc/perf_model/tests/%.o: pybuda/csrc/perf_model/tests/%.cpp - @mkdir -p $(@D) - $(CXX) $(PERF_MODEL_CSRC_CFLAGS) $(CXXFLAGS) $(PYBUDA_CSRC_PERF_MODEL_TESTS_INCLUDES) -c -o $@ $< diff --git a/pybuda/csrc/perf_model/tests/simulator_tests.cpp b/pybuda/csrc/perf_model/tests/simulator_tests.cpp deleted file mode 100644 index 1dc0a9031..000000000 --- a/pybuda/csrc/perf_model/tests/simulator_tests.cpp +++ /dev/null @@ -1,162 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include -#include -#include - -#include "gtest/gtest.h" -#include "perf_model/perf_model.hpp" -#include "perf_model/simulator.hpp" - -using namespace tt::perf_model; - -// Utility for making perf data structure with the correct size in tiles -PerfDataP perf_data(std::uint32_t out_size_in_tiles, std::vector in_size_in_tiles, bool is_op) -{ - auto td = [](std::uint32_t size, std::uint32_t t = 1) { - return TensorData{.shape = tt::graphlib::Shape::create({1, 1, 32, 32 * size}), .t = t}; - }; - - std::vector inputs; - for (auto in : in_size_in_tiles) inputs.push_back(td(in)); - - if (is_op) - { - OpGrid grid1 = OpGrid{.loc_r = 0, .loc_c = 0, .size_r = 1, .size_c = 1}; - tt::balancer::OpModel op_model; - return std::make_shared( - PerfData(inputs, td(out_size_in_tiles), OpPerfData{.grid = grid1, .op_model=op_model})); - } - - return std::make_shared(PerfData(inputs, td(out_size_in_tiles), QueuePerfData{})); -}; - -// Utility functions for quick graph creations -NodeP add_queue( - tt::perf_model::Graph *graph, - const std::string &name, - bool is_input, - NodeP operand = nullptr, - std::uint32_t out_size_in_tiles = 1) -{ - auto q_type = is_input ? 
tt::graphlib::QueueNodeType::Input : tt::graphlib::QueueNodeType::Output; - std::vector in_size_in_tiles = {}; - if (operand != nullptr) - in_size_in_tiles = std::vector(1, out_size_in_tiles); - return graph->add_queue(name, q_type, operand, perf_data(out_size_in_tiles, in_size_in_tiles, false), is_input); -} - -NodeP add_op( - tt::perf_model::Graph *graph, - const std::string &name, - const std::string &type, - std::vector operands, - std::uint32_t out_size_in_tiles = 1, - std::vector in_size_in_tiles = {}) -{ - if (in_size_in_tiles.size() == 0) - in_size_in_tiles = std::vector(operands.size(), 1); - return graph->add_op(name, type, operands, perf_data(out_size_in_tiles, in_size_in_tiles, true), false); -} - -TEST(PerfModel, basic) -{ - tt::perf_model::Graph *graph = new tt::perf_model::Graph(); - - auto in0 = add_queue(graph, "input0", true); - auto in1 = add_queue(graph, "input1", true); - auto op0 = add_op(graph, "op0", "add", {in0, in1}); - add_queue(graph, "output", false, op0); - - Simulator sim(graph, 3); - EXPECT_TRUE(sim.run()); -} - -TEST(PerfModel, fork) -{ - tt::perf_model::Graph *graph = new tt::perf_model::Graph(); - - auto in0 = add_queue(graph, "input0", true); - auto in1 = add_queue(graph, "input1", true); - auto op0 = add_op(graph, "op0", "exp", {in0}); - auto op1a = add_op(graph, "op1a", "add", {op0, in1}); - auto op1b = add_op(graph, "op1b", "sqrt", {op0}); - auto op2 = add_op(graph, "op1b", "mul", {op1a, op1b}); - add_queue(graph, "output", false, op2); - - Simulator sim(graph, 3); - EXPECT_TRUE(sim.run()); -} - -TEST(PerfModel, t_streaming) -{ - tt::perf_model::Graph *graph = new tt::perf_model::Graph(); - - auto in0 = add_queue(graph, "input0", true, nullptr, 4); - auto in1 = add_queue(graph, "input1", true, nullptr, 4); - auto op0 = add_op(graph, "op0", "add", {in0, in1}, 4, {4, 4}); - auto op1 = add_op(graph, "op1t", "exp", {op0}, 1, {1}); - op1->get_perf_data()->output.t = 4; - add_queue(graph, "output", false, op1, 4); - - Simulator sim(graph, 3); - EXPECT_TRUE(sim.run()); -} - -TEST(PerfModel, matmul) -{ - tt::perf_model::Graph *graph = new tt::perf_model::Graph(); - auto in0 = add_queue(graph, "input0", true, nullptr, 4); - auto in1 = add_queue(graph, "input1", true, nullptr, 4); - - auto op0 = add_op(graph, "matmul0", "matmul", {in0, in1}, 1, {4, 4}); - op0->get_perf_data()->attr.m_k = 4; - add_queue(graph, "output", false, op0, 1); - - Simulator sim(graph, 3); - EXPECT_TRUE(sim.run()); -} - -TEST(PerfModel, fork_join_perf) -{ - // Fork 5 to 0 ops, show that perf gets better as we add more buffering - - auto create_graph = [](std::uint32_t buf_count) -> tt::perf_model::Graph * - { - tt::perf_model::Graph *graph = new tt::perf_model::Graph(); - - auto in0 = add_queue(graph, "input0", true); - auto op0 = add_op(graph, "op0", "exp", {in0}); - - auto prev = op0; - for (std::uint32_t b = 0; b < buf_count; b++) - { - auto buf = add_op(graph, "buf" + std::to_string(b), "nop", {prev}); - prev = buf; - } - - auto f_prev = op0; - for (std::uint32_t f = 0; f < 6; f++) - { - auto op = add_op(graph, "op_f" + std::to_string(f), "exp", {f_prev}); - f_prev = op; - } - - auto join = add_op(graph, "op_join", "add", {prev, f_prev}); - add_queue(graph, "output", false, join); - return graph; - }; - - std::vector time; - for (std::uint32_t i = 0; i < 3; i++) - { - Simulator sim(create_graph(i), 32); - EXPECT_TRUE(sim.run()); - time.push_back(sim.get_timestamp()); - } - - // Perf should get better with buffering on the short side - EXPECT_GT(time[0], time[1]); - EXPECT_GT(time[1], 
time[2]); -} diff --git a/pybuda/csrc/perf_model/trace.cpp b/pybuda/csrc/perf_model/trace.cpp deleted file mode 100644 index 1d28fe94a..000000000 --- a/pybuda/csrc/perf_model/trace.cpp +++ /dev/null @@ -1,195 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "perf_model/trace.hpp" - -namespace tt::perf_model -{ -void StallWait::start_stall( - std::uint32_t input_index, std::uint32_t operand, std::uint32_t timestamp, std::uint32_t num_tiles) -{ - auto &w = stalls[input_index][operand]; - if (w.start.size() > w.end.size()) - return; // repeated stall, ignore until unpack data is available - - if (w.start.size() == 0) - { - // first stall - w.input_index = input_index; - w.operand = operand; - w.num_tiles = num_tiles; - } - - w.start.push_back(timestamp); -} - -void StallWait::stop_stall(std::uint32_t input_index, std::uint32_t operand, std::uint32_t timestamp) -{ - auto &w = stalls[input_index][operand]; - if (w.start.size() == w.end.size()) - return; // no stall - - w.end.push_back(timestamp); -} - -void TraceOp::unpack_stall( - std::uint32_t input_index, std::uint32_t operand, std::uint32_t timestamp, std::uint32_t num_tiles) -{ - t0.stall.start_stall(input_index, operand, timestamp, num_tiles); -} - -void TraceOp::unpack_data_available(std::uint32_t input_index, std::uint32_t operand, std::uint32_t timestamp) -{ - auto &uf = t0.unpack_first_instruction[input_index]; - if (uf.value == 0) - uf.value = timestamp; - - t0.stall.stop_stall(input_index, operand, timestamp); -} - -void TraceOp::set_math_data(std::uint32_t input_index, std::uint32_t total_cycles, std::uint32_t useful_cycles) -{ - t1.math_perf_counter[input_index] = MathTrace::MathPerfCounter{ - .input_index = input_index, - .activity = useful_cycles, - .utilization = (float)(1.0 * useful_cycles / total_cycles), - .total_period = total_cycles}; -} - -void TraceOp::pack_stall( - std::uint32_t input_index, std::uint32_t operand, std::uint32_t timestamp, std::uint32_t num_tiles) -{ - t2.stall.start_stall(input_index, operand, timestamp, num_tiles); -} - -void TraceOp::pack_started(std::uint32_t input_index, std::uint32_t operand, std::uint32_t timestamp) -{ - auto &p = t2.packer_outer_loop[input_index]; - if (p.start == 0) - p.start = timestamp; - - t2.stall.stop_stall(input_index, operand, timestamp); -} - -void TraceOp::pack_ended(std::uint32_t input_index, std::uint32_t timestamp) -{ - auto &p = t2.packer_outer_loop[input_index]; - p.end = timestamp; -} - -void TraceOp::add_to_json(json &j, const std::vector &input_indices) const -{ - auto &op = j[std::to_string(grid.loc_c) + "-" + std::to_string(grid.loc_r) + "-" + name]; - op["NCRISC"] = {}; - op["T0"] = t0.to_json(input_indices); - op["T1"] = t1.to_json(input_indices); - op["T2"] = t2.to_json(input_indices); - - op["inputs-common-events"] = { - {"op-type", op_type}, - {"pack-dst-data-format", 6}, - {"unpack-src-data-format-op0", 6}, - {"unpack-src-data-format-op1", 6}}; - - for (std::uint32_t input_index : input_indices) - { - std::uint32_t unpack_start = t0.unpack_first_instruction.at(input_index).value; - std::uint32_t pack_end = t2.packer_outer_loop.at(input_index).end; - std::uint32_t pack_start = t2.packer_outer_loop.at(input_index).start; - op["per-thread-events"]["input-" + std::to_string(input_index)] = { - - {"first-unpack-to-last-pack", pack_end - unpack_start}, - {"first-unpack-to-last-pack-without-wait-tile", "N/A"}, - {"math-runtime", pack_end - unpack_start}, - 
{"math-utilization-first-unpack-to-last-pack", 0.33}, - {"math-utilization-first-unpack-to-last-pack-without-wait-tile", "N/A"}, - {"math-utilization-over-math-thread", 0.1}, - {"pack-end-outer-loop", pack_end}, - {"pack-runtime", pack_end - pack_start}, - {"pack-start-outer-loop", pack_start}, - {"total-unpack-wait-for-tile-after-first-unpack", 0}, - {"total-wait-for-free-tile-after-first-unpack", 0}, - {"unpack-first-block-data-available", t0.unpack_first_instruction.at(input_index).value} - }; - } -} - -json UnpackerTrace::to_json(const std::vector &input_indices) const -{ - json ret; - ret["out-of-memory"] = "false"; - for (auto input_index : input_indices) - { - ret["unpack-first-instruction-outer-loop-" + std::to_string(input_index)]["value"] = - std::vector(1, unpack_first_instruction.at(input_index).value); - - auto name = [](std::uint32_t input_index, std::uint32_t operand, std::uint32_t num_tiles) -> std::string - { - return "wait-for-incoming-tiles-outer-loop-" + std::to_string(input_index) + "-operand-" + - std::to_string(operand) + "-num-tiles-" + std::to_string(num_tiles); - }; - - stall.add_to_json(ret, input_indices, name); - } - - return ret; -} - -json PackerTrace::to_json(const std::vector &input_indices) const -{ - json ret; - ret["out-of-memory"] = "false"; - for (auto input_index : input_indices) - { - auto &p = ret["packer-each-input-outer-loop-" + std::to_string(input_index)]; - auto &d = packer_outer_loop.at(input_index); - p["start"] = std::vector(1, d.start); - p["end"] = std::vector(1, d.end); - p["diff"] = std::vector(1, d.end - d.start); - - auto name = [](std::uint32_t input_index, std::uint32_t operand, std::uint32_t num_tiles) -> std::string - { - return "wait-for-free-tiles-outer-loop-" + std::to_string(input_index) + "-operand-" + - std::to_string(operand) + "-num-tiles-" + std::to_string(num_tiles); - }; - - stall.add_to_json(ret, input_indices, name); - } - - return ret; -} - -json MathTrace::to_json(const std::vector &input_indices) const -{ - json ret; - for (auto input_index : input_indices) - { - auto &m = ret["math-perf-counter-outer-loop-" + std::to_string(input_index)]; - m["math-activity"] = 100; - m["total-period"] = 100; - m["math-utilization"] = 0.99; - } - return ret; -} - -void StallWait::add_to_json( - json &j, - const std::vector &input_indices, - std::function c) const -{ - for (auto input_index : input_indices) - { - if (stalls.size() <= input_index) continue; - for (auto &[operand, s] : stalls.at(input_index)) - { - auto &w = j[c(input_index, operand, s.num_tiles)]; - std::vector diff; - for (std::size_t i = 0; i < s.end.size(); i++) diff.push_back(s.end[i] - s.start[i]); - w["start"] = s.start; - w["end"] = s.end; - w["diff"] = diff; - } - } -} - -}; // namespace tt::perf_model diff --git a/pybuda/csrc/perf_model/trace.hpp b/pybuda/csrc/perf_model/trace.hpp deleted file mode 100644 index a517e1661..000000000 --- a/pybuda/csrc/perf_model/trace.hpp +++ /dev/null @@ -1,131 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once -#include -#include - -#include "json.hpp" -using json = nlohmann::json; - -// -// Structures for keeping track of perf trace data -// - -namespace tt::perf_model -{ - -struct StallWait -{ - struct Stall - { - std::uint32_t input_index; - std::uint32_t operand; - std::vector start, end; - std::uint32_t num_tiles; - }; - std::unordered_map> stalls; - - void start_stall( - std::uint32_t input_index, std::uint32_t operand, std::uint32_t timestamp, 
std::uint32_t num_tiles); - void stop_stall(std::uint32_t input_index, std::uint32_t operand, std::uint32_t timestamp); - - void add_to_json( - json &j, - const std::vector &input_indices, - std::function c) const; -}; - -struct UnpackerTrace -{ - struct UnpackFirstInstruction - { - std::uint32_t input_index; - std::uint32_t value = 0; - }; - std::unordered_map unpack_first_instruction; - - StallWait stall; - - json to_json(const std::vector &input_indices) const; -}; - -struct MathTrace -{ - struct MathPerfCounter - { - std::uint32_t input_index; - std::uint32_t activity; - float utilization; - std::uint32_t total_period; - }; - std::unordered_map math_perf_counter; - - json to_json(const std::vector &input_indices) const; -}; - -struct PackerTrace -{ - struct PackerOuterLoop - { - std::uint32_t input_index; - std::uint32_t start = 0, end = 0; - }; - std::unordered_map packer_outer_loop; - StallWait stall; - - json to_json(const std::vector &input_indices) const; -}; - -struct PerThreadEvents -{ - struct Event - { - std::uint32_t input_index; - std::uint32_t first_unpack_to_last_pack; - }; - std::unordered_map events; -}; - -class TraceOp -{ - std::string name; - std::string op_type; - OpGrid grid; - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wunused-private-field" - DataFormat pack_format; -#pragma GCC diagnostic pop - std::vector unpack_format; - - UnpackerTrace t0; - MathTrace t1; - PackerTrace t2; - PerThreadEvents per_thread; - - public: - TraceOp( - std::string name, - std::string op_type, - OpGrid grid, - DataFormat pack_format, - std::vector unpack_format) : - name(name), op_type(op_type), grid(grid), pack_format(pack_format), unpack_format(unpack_format) - { - } - - void unpack_stall( - std::uint32_t input_index, std::uint32_t operand, std::uint32_t timestamp, std::uint32_t num_tiles); - void unpack_data_available(std::uint32_t input_index, std::uint32_t operand, std::uint32_t timestamp); - - void set_math_data(std::uint32_t input_index, std::uint32_t total_cycles, std::uint32_t useful_cycles); - - void pack_stall(std::uint32_t input_index, std::uint32_t operand, std::uint32_t timestamp, std::uint32_t num_tiles); - void pack_started(std::uint32_t input_index, std::uint32_t operand, std::uint32_t timestamp); - void pack_ended(std::uint32_t input_index, std::uint32_t timestamp); - - void add_to_json(json &j, const std::vector &input_indices) const; -}; - -} // namespace tt::perf_model diff --git a/pybuda/csrc/placer/README.md b/pybuda/csrc/placer/README.md deleted file mode 100644 index b928d966e..000000000 --- a/pybuda/csrc/placer/README.md +++ /dev/null @@ -1,19 +0,0 @@ -# BUDA Placer - -BUDA placer is a tool that is responsible for placing ops onto a grid of cores which may span multiple epochs and multiple devices. 
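As a rough mental model, every scheduled op ends up assigned to a chip id, a temporal epoch on that chip, and a rectangular range of cores. The snippet below is a minimal, self-contained sketch of that idea; the struct names mirror `OpPlacement` / `PlacerSolution` from `placer.hpp`, but the fields shown here are simplified stand-ins rather than the real API.

```
// Illustrative sketch only: simplified stand-ins for OpPlacement / PlacerSolution.
#include <cstdint>
#include <iostream>
#include <map>
#include <string>

struct Coord { std::uint32_t row, col; };
struct CoordRange { Coord start, end; };  // half-open rectangle of cores

struct OpPlacement
{
    std::uint32_t chip_id;    // device the op is placed on
    std::uint32_t epoch_id;   // temporal epoch on that device
    CoordRange placed_cores;  // grid of cores assigned to the op
};

int main()
{
    std::map<std::string, OpPlacement> name_to_op_placement = {
        {"matmul0", {0, 0, {{0, 0}, {4, 4}}}},  // 4x4 core grid, chip 0, epoch 0
        {"exp1",    {0, 0, {{4, 0}, {5, 4}}}},  // 1x4 core grid, same chip and epoch
        {"add2",    {1, 1, {{0, 0}, {2, 2}}}},  // spills over to chip 1, next epoch
    };

    for (const auto &[name, p] : name_to_op_placement)
        std::cout << name << ": chip " << p.chip_id << ", epoch " << p.epoch_id
                  << ", cores (" << p.placed_cores.start.row << "," << p.placed_cores.start.col
                  << ") -> (" << p.placed_cores.end.row << "," << p.placed_cores.end.col << ")\n";
    return 0;
}
```

The real structures additionally track grid transposition and DRAM queue placements; see `placer.hpp` for the full definitions.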
- -## Build Instructions - -From root directory: - -``` -make placer - -make placer/tests -``` - -## Run - -``` -build/test/placer/tests/placer_unit_tests -``` diff --git a/pybuda/csrc/placer/allocator_utils.cpp b/pybuda/csrc/placer/allocator_utils.cpp deleted file mode 100644 index 826602257..000000000 --- a/pybuda/csrc/placer/allocator_utils.cpp +++ /dev/null @@ -1,115 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "backend_api/device_config.hpp" -#include "balancer/types.hpp" -#include "graph_lib/edge.hpp" -#include "graph_lib/graph.hpp" -#include "graph_lib/node.hpp" -#include "graph_lib/node_types.hpp" -#include "placer/host_memory.hpp" -#include "placer/placer.hpp" - -namespace tt -{ -namespace backend -{ -extern int get_io_size_in_bytes( - const DataFormat data_formati, - const bool is_untilized, - const int ublock_ct, - const int ublock_rt, - const int mblock_m, - const int mblock_n, - const int t, - const int entries, - const int tile_height = 32, - const int tile_width = 32); -extern uint32_t get_next_aligned_address(const uint32_t address); -} // namespace backend -namespace placer -{ - -int get_queue_size(const graphlib::QueueNode* node, balancer::BlockShape const& block_shape, bool untilized) -{ - const graphlib::Shape shape = node->shape(); - - std::uint32_t queue_size = tt::backend::get_io_size_in_bytes( - node->output_df(), - untilized, - block_shape.ublock.ct, - block_shape.ublock.rt, - block_shape.mblock_m, - block_shape.mblock_n, - block_shape.t, - node->get_num_entries()); - - return queue_size; -} - -graphlib::Node* get_reference_node(const graphlib::Graph* graph, const graphlib::Node* node) -{ - graphlib::Node* ref_node = nullptr; // Node from which we'll get the placement information - - if (node->node_type() == graphlib::NodeType::kInput and graph->num_users(node)) - { - ref_node = graph->data_users(node).at(0); - } - - if ((node->node_type() == graphlib::NodeType::kQueue) || (node->node_type() == graphlib::NodeType::kOutput)) - { - std::vector operands = graph->data_operands(node); - TT_ASSERT(operands.size() == 1, "There can only be exactly one queue writer, not true for " + node->name()); - ref_node = operands[0]; - } - return ref_node; -} - -bool is_queue_already_placed(const PlacerSolution& placer_solution, const graphlib::Node* node) -{ - bool already_placed = - placer_solution.name_to_queue_placement.find(node->name()) != placer_solution.name_to_queue_placement.end(); - return already_placed; -} - -bool is_queue_already_allocated(const PlacerSolution& placer_solution, const graphlib::Node* node) -{ - bool already_allocated = is_queue_already_placed(placer_solution, node) and - placer_solution.name_to_queue_placement.at(node->name()).dram_buffers.size() != 0; - return already_allocated; -} - -bool is_linked_queue(const graphlib::Graph* graph, const graphlib::Node* node) -{ - return node->node_type() == graphlib::NodeType::kOutput and - not graph - ->user_edges( - node, [](graphlib::Edge e) { return e.edge_type == graphlib::EdgeType::kPartialDataCopy; }) - .empty(); -} - -bool is_input_host_queue(const HostMemoryPlacerConfig& config, const graphlib::Node* node) -{ - bool input_on_host = - config.place_input_queues_on_host() && node->as()->is_input() && - (node->as()->is_activation() or node->as()->is_loss()); - - return input_on_host; -} - -bool is_output_host_queue( - const HostMemoryPlacerConfig& config, const graphlib::Graph* graph, const graphlib::Node* node) -{ - bool output_on_host = 
config.place_output_queues_on_host() && (node->node_type() == graphlib::NodeType::kOutput) && - node->as()->untilize() && not is_linked_queue(graph, node); - return output_on_host; -} - -bool is_host_queue( - const HostMemoryPlacerConfig& host_memory_config, const graphlib::Graph* graph, const graphlib::Node* node) -{ - return is_input_host_queue(host_memory_config, node) or is_output_host_queue(host_memory_config, graph, node); -} - -} // namespace placer -} // namespace tt \ No newline at end of file diff --git a/pybuda/csrc/placer/allocator_utils.hpp b/pybuda/csrc/placer/allocator_utils.hpp deleted file mode 100644 index 993634f65..000000000 --- a/pybuda/csrc/placer/allocator_utils.hpp +++ /dev/null @@ -1,44 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -// fwd declare -namespace tt -{ -namespace graphlib -{ - -class Node; -class Graph; -class QueueNode; -} // namespace graphlib -namespace balancer -{ -struct BlockShape; -struct BalancerSolution; -} // namespace balancer - -} // namespace tt::graphlib - -namespace tt::placer -{ - -struct HostMemoryPlacerConfig; - -int get_queue_size(const graphlib::QueueNode* node, balancer::BlockShape const& block_shape, bool untilized); -graphlib::Node* get_reference_node(const graphlib::Graph* graph, const graphlib::Node* node); -bool is_queue_already_placed(const PlacerSolution& placer_solution, const graphlib::Node* node); -bool is_queue_already_allocated(const PlacerSolution& placer_solution, const graphlib::Node* node); - -bool is_input_host_queue( - const HostMemoryPlacerConfig& config, const graphlib::Graph* graph, const graphlib::Node* node); -bool is_output_host_queue( - const HostMemoryPlacerConfig& config, const graphlib::Graph* graph, const graphlib::Node* node); - -bool is_host_queue( - const HostMemoryPlacerConfig& host_memory_config, const graphlib::Graph* graph, const graphlib::Node* node); - -bool is_linked_queue(const graphlib::Graph* graph, const graphlib::Node* node); - -} // namespace tt::placer diff --git a/pybuda/csrc/placer/best_fit_allocator.cpp b/pybuda/csrc/placer/best_fit_allocator.cpp deleted file mode 100644 index e099a41a9..000000000 --- a/pybuda/csrc/placer/best_fit_allocator.cpp +++ /dev/null @@ -1,105 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "placer/best_fit_allocator.hpp" - -namespace tt::placer { - -BestFitAllocator::BestFitAllocator(std::uint32_t start_addr, std::uint32_t end_addr, Blocks pre_allocated_blocks) : ChannelAllocator() -{ - if (pre_allocated_blocks.free_blocks_start.size() > 0) { - blocks = pre_allocated_blocks; - } else if (start_addr < end_addr) { - // end_addr is the last available address. 
block includes end_addr - add_free_block(Block{start_addr, end_addr - start_addr + 1}); - } -} - -void BestFitAllocator::add_free_block(const Block &block) -{ - blocks.free_blocks_start[block.addr] = block; - blocks.free_blocks_end[block.addr + block.size] = block; -} - -std::uint32_t BestFitAllocator::get_capacity() -{ - std::uint32_t capacity = 0; - for (auto free_block : blocks.free_blocks_start) { - capacity += free_block.second.size; - } - return capacity; -} - -void BestFitAllocator::remove_free_block(const Block &block) -{ - std::uint32_t end = block.addr + block.size; - blocks.free_blocks_start.erase(block.addr); - blocks.free_blocks_end.erase(end); -} - -bool BestFitAllocator::allocate(std::uint32_t size, std::uint32_t &addr) -{ - // Find the free block with the closest >= size - Block closest_block; - std::uint32_t diff = UINT32_MAX; - for (auto it = blocks.free_blocks_start.rbegin(); it != blocks.free_blocks_start.rend(); it++) - { - if (it->second.size >= size) - { - auto my_diff = it->second.size - size; - if (my_diff < diff) { - diff = my_diff; - closest_block = it->second; - if (diff == 0) - break; - } - } - } - - if (diff == UINT32_MAX) - return false; - - addr = closest_block.addr; - // Since we allocate new block from right to left, end of the free block will be the end of our new allocated block - addr = closest_block.addr + closest_block.size - size; - remove_free_block(closest_block); - if (diff == 0) { - blocks.allocated_blocks[addr] = closest_block; - } else { - blocks.allocated_blocks[addr] = Block{addr, size}; - add_free_block(Block{closest_block.addr, diff}); - } - - return true; -} - -void BestFitAllocator::deallocate(std::uint32_t addr) -{ - //return; - auto it = blocks.allocated_blocks.find(addr); - - if (it == blocks.allocated_blocks.end()) - TT_THROW("Trying to deallocate addr that hasn't been allocated"); - - // Find previous and next block to merge with, if any - Block freed_block = it->second; - auto next = blocks.free_blocks_start.find(addr + it->second.size); - if (next != blocks.free_blocks_start.end()) - { - freed_block.size += next->second.size; - remove_free_block(next->second); - } - - auto prev = blocks.free_blocks_end.find(addr); - if (prev != blocks.free_blocks_end.end()) { - freed_block.addr = prev->second.addr; - freed_block.size += prev->second.size; - remove_free_block(prev->second); - } - - add_free_block(freed_block); - blocks.allocated_blocks.erase(it); -} - - -} diff --git a/pybuda/csrc/placer/best_fit_allocator.hpp b/pybuda/csrc/placer/best_fit_allocator.hpp deleted file mode 100644 index eea02ff61..000000000 --- a/pybuda/csrc/placer/best_fit_allocator.hpp +++ /dev/null @@ -1,23 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "placer/dram_allocator.hpp" -#include "placer/dram.hpp" - -namespace tt::placer { - -class BestFitAllocator : public ChannelAllocator -{ - Blocks blocks; - void add_free_block(const Block &block); - void remove_free_block(const Block &block); -public: - virtual Blocks get_blocks() override { return blocks; } - BestFitAllocator(std::uint32_t start_addr, std::uint32_t end_addr, Blocks pre_allocated_blocks = Blocks()); - virtual bool allocate(std::uint32_t size, std::uint32_t &addr) override; // return true if allocated, and update addr - virtual void deallocate(std::uint32_t addr) override; - virtual std::uint32_t get_capacity() override; -}; -} diff --git a/pybuda/csrc/placer/chip_id_assignment.cpp 
b/pybuda/csrc/placer/chip_id_assignment.cpp deleted file mode 100644 index 612f61489..000000000 --- a/pybuda/csrc/placer/chip_id_assignment.cpp +++ /dev/null @@ -1,137 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "placer/chip_id_assignment.hpp" - -#include "utils/assert.hpp" -#include "utils/logger.hpp" - -namespace tt::placer { - -static int get_num_fwd_ops(const ChipPlacerConfig& config, const vector& scheduled_ops) { - int num_fwd_ops = 0; - for (const string& op_name : scheduled_ops) - { - NodeEpochType epoch_type = config.op_to_epoch_type.at(op_name); - if (epoch_type == NodeEpochType::Forward) - { - num_fwd_ops += 1; - } - } - return num_fwd_ops; -} - -unordered_map get_grayskull_fwd_op_to_chip_id_placement( - const ChipPlacerConfig& config, - const vector& scheduled_ops) -{ - unordered_map chip_id_to_num_assigned_ops; - unordered_map fwd_op_to_chip_id_placement; - uint32_t current_chip_index = 0; - uint32_t current_chip_id = config.chip_ids[current_chip_index]; - - bool use_user_defined_scheme = not config.ops_tagged_for_chip_id_break.empty() or config.chip_ids.size() == 1; - if (use_user_defined_scheme) - { - for (const string& op_name : scheduled_ops) - { - if (chip_id_to_num_assigned_ops.find(current_chip_id) != chip_id_to_num_assigned_ops.end()) - { - if (config.ops_tagged_for_chip_id_break.find(op_name) != config.ops_tagged_for_chip_id_break.end()) - { - current_chip_index++; - } - } - - NodeEpochType epoch_type = config.op_to_epoch_type.at(op_name); - if (epoch_type == NodeEpochType::Forward) - { - current_chip_id = config.chip_ids[current_chip_index]; - fwd_op_to_chip_id_placement[op_name] = current_chip_id; - chip_id_to_num_assigned_ops[current_chip_id] += 1; - } - } - } - else - { - log_info("Placer: Running Grayskull multichip auto-placement"); - int ops_per_device = ceil((float)get_num_fwd_ops(config, scheduled_ops) / config.chip_ids.size()); - int fwd_op_idx = 0; - - for (const string& op_name : scheduled_ops) - { - if (config.op_to_epoch_type.at(op_name) == NodeEpochType::Forward) - { - current_chip_id = config.chip_ids[current_chip_index]; - fwd_op_to_chip_id_placement[op_name] = current_chip_id; - chip_id_to_num_assigned_ops[current_chip_id] += 1; - fwd_op_idx += 1; - - // round-robin available chips - if (fwd_op_idx % ops_per_device == 0) - { - current_chip_index = (current_chip_index + 1) % config.chip_ids.size(); - } - } - } - } - - return fwd_op_to_chip_id_placement; -} - -unordered_map get_op_to_chip_id_assignment( - const ChipPlacerConfig& config, - const vector& scheduled_ops) -{ - if (config.arch_name != "grayskull") - { - // return empty map because we don't have a chip id assignment scheme for this arch - // chip-id assignments will be generated after creating spatial epochs - return {}; - } - unordered_map op_to_chip_id_assignment = get_grayskull_fwd_op_to_chip_id_placement(config, scheduled_ops); - - // chip-id assignment for BWD nodes - for (int i = scheduled_ops.size() - 1; i >= 0; --i) - { - const string& fwd_node_name = scheduled_ops.at(i); - NodeEpochType epoch_type = config.op_to_epoch_type.at(fwd_node_name); - - if (epoch_type == NodeEpochType::Forward) - { - uint32_t fwd_chip_id = op_to_chip_id_assignment.at(fwd_node_name); - bool has_bwd_nodes = config.fwd_to_bwd_nodes.find(fwd_node_name) != config.fwd_to_bwd_nodes.end(); - - if (has_bwd_nodes) { - for (const auto& bwd_node_name : config.fwd_to_bwd_nodes.at(fwd_node_name)) - { - op_to_chip_id_assignment[bwd_node_name] = 
fwd_chip_id; - } - } - } - } - - // chip-id assignment for OPT nodes - for (const string& fwd_node_name : scheduled_ops) - { - NodeEpochType epoch_type = config.op_to_epoch_type.at(fwd_node_name); - if (epoch_type == NodeEpochType::Forward) - { - bool has_opt_nodes = config.fwd_to_opt_nodes.find(fwd_node_name) != config.fwd_to_opt_nodes.end(); - if (has_opt_nodes) { - uint32_t fwd_chip_id = op_to_chip_id_assignment.at(fwd_node_name); - for (const auto& [operand_index, opt_node_names] : config.fwd_to_opt_nodes.at(fwd_node_name)) - { - for (const string& opt_node_name : opt_node_names) - { - op_to_chip_id_assignment[opt_node_name] = fwd_chip_id; - - } - } - } - } - } - return op_to_chip_id_assignment; -} - -} // namespace tt::placer diff --git a/pybuda/csrc/placer/chip_id_assignment.hpp b/pybuda/csrc/placer/chip_id_assignment.hpp deleted file mode 100644 index 351c7a548..000000000 --- a/pybuda/csrc/placer/chip_id_assignment.hpp +++ /dev/null @@ -1,54 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "placer/placer.hpp" - -namespace tt::placer { - -struct ChipPlacerConfig -{ - // Arch config - std::vector chip_ids; - string arch_name; - - // Capture any user or op-specific config for placement - // like chip-breaks or epoch-breaks - unordered_map op_to_epoch_type; - - // captures any user-configuration for chip-breaking - unordered_set ops_tagged_for_chip_id_break; - unordered_set ops_tagged_for_epoch_break; - unordered_map fracture_chip_id_assignments; - - unordered_map> fwd_to_bwd_nodes; - unordered_map>> fwd_to_opt_nodes; - unordered_set output_ops = {}; - vector chips_with_mmio; // for wormhole -}; - -using OpToChipIdAssignment = unordered_map; - -unordered_map get_op_to_chip_id_assignment( - const ChipPlacerConfig& config, - const vector& scheduled_ops); - -enum class ChipPlacementPolicy -{ - MMIO_LAST = 0, // use chip id order as given by the user, use mmio chips last - SNAKE = 1, // sort chip ids in a snake pattern -}; - -inline ChipPlacementPolicy chip_placement_policy_from_string(std::string const& s) -{ - if (s == "MMIO_LAST") { - return ChipPlacementPolicy::MMIO_LAST; - } else if (s == "SNAKE") { - return ChipPlacementPolicy::SNAKE; - } - TT_ASSERT(false); - return ChipPlacementPolicy::MMIO_LAST; -} - -} // namespace tt::placer diff --git a/pybuda/csrc/placer/dram.cpp b/pybuda/csrc/placer/dram.cpp deleted file mode 100644 index 0f7c95893..000000000 --- a/pybuda/csrc/placer/dram.cpp +++ /dev/null @@ -1,597 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "placer/dram.hpp" - -#include -#include -#include - -#include "buda_passes.hpp" -#include "graph_lib/graph.hpp" -#include "graph_lib/node_types.hpp" -#include "placer/allocator_utils.hpp" -#include "placer/dram_allocator.hpp" -#include "utils/logger.hpp" - -namespace tt -{ -// from backend -namespace placer -{ -using Graph = graphlib::Graph; -using Node = graphlib::Node; - -static void log_epoch_to_epoch_queue_info(Graph *graph, const PlacerSolution &placer_solution) -{ - log_debug(LogPlacer, "*** Logging Epoch-To-Epoch Queues ***"); - std::uint32_t total_e2e_queue_size = 0; - std::map epoch_id_to_total_e2e_input_buffer_size; - std::map epoch_type_to_total_e2e_input_buffer_size; - - for (Node *node : graph->nodes()) - { - if (node->node_type() == graphlib::NodeType::kQueue and - node->as()->queue_type() == graphlib::QueueNodeType::EpochToEpoch) - { - std::uint32_t producer_epoch_id = 
get_first_epoch_producer(graph, node, placer_solution); - graphlib::NodeEpochType producer_epoch_type = placer_solution.epoch_type(producer_epoch_id); - const auto &queue_placement = placer_solution.name_to_queue_placement.at(node->name()); - - std::uint32_t total_buffer_size = std::accumulate( - queue_placement.dram_buffers.begin(), - queue_placement.dram_buffers.end(), - 0, - [](std::uint32_t accumulator, const QueueBufferPlacement &buffer_placement) - { return accumulator + buffer_placement.buffer_size; }); - - total_e2e_queue_size += total_buffer_size; - for (std::uint32_t consumer_epoch_id : get_consumer_epoch_ids(graph, node, placer_solution)) - { - graphlib::NodeEpochType consumer_epoch_type = placer_solution.epoch_type(consumer_epoch_id); - log_trace( - LogPlacer, - "{}->{}, {}->{}, e2e-queue: {}, size: {}", - graphlib::node_epoch_type_to_string(producer_epoch_type), - graphlib::node_epoch_type_to_string(consumer_epoch_type), - producer_epoch_id, - consumer_epoch_id, - node->name(), - total_buffer_size); - epoch_id_to_total_e2e_input_buffer_size[consumer_epoch_id] += total_buffer_size; - epoch_type_to_total_e2e_input_buffer_size[consumer_epoch_type] += total_buffer_size; - } - } - } - for (const auto &[epoch_id, total_input_buffer_size] : epoch_id_to_total_e2e_input_buffer_size) - { - log_debug( - LogPlacer, - "EpochId: {}, EpochType: {}: total input e2e buffer size: {} KB", - epoch_id, - graphlib::node_epoch_type_to_string(placer_solution.epoch_type(epoch_id)), - ((float)total_input_buffer_size / 1024)); - } - for (const auto &[epoch_type, total_input_buffer_size] : epoch_type_to_total_e2e_input_buffer_size) - { - log_debug( - LogPlacer, - "EpochType={}: total input e2e buffer size: {} KB", - graphlib::node_epoch_type_to_string(epoch_type), - ((float)total_input_buffer_size / 1024)); - } - log_debug(LogPlacer, "Total e2e transfer: {} KB", ((float)total_e2e_queue_size / 1024)); -} - -// TODO - get this from backend -Coord logical_to_physical_coord( - const Coord &logical_coord, const DeviceConfig &config, const std::vector &harvested_rows) -{ - if (config.is_grayskull()) - return logical_coord; // TODO - - auto col = (logical_coord.col <= 3) ? logical_coord.col + 1 : logical_coord.col + 2; - auto row = (logical_coord.row <= 4) ? logical_coord.row + 1 : logical_coord.row + 2; - - for (auto harvested : harvested_rows) - { - if (row >= harvested) - row++; - } - - return Coord{.row = row, .col = col}; -} - -// Figure out which cores are reading from which dram buffer (or writing to) -// dram_buffer is relative coordinate within the buffer grid -std::vector get_reader_cores( - const Node *node, const OpPlacement &placement, std::uint32_t operand, Coord dram_buffer, GridShape queue_grid) -{ - if (node->node_type() == graphlib::NodeType::kBudaOp) - { - const graphlib::OpNode *op = node->as(); - - const bool t = placement.grid_transpose; - - // Figure out the scale ratio of the op vs. queue grid, and scale start/end linearly - std::uint32_t op_size_c = t ? placement.placed_cores.size_r() : placement.placed_cores.size_c(); - std::uint32_t op_size_r = t ? 
placement.placed_cores.size_c() : placement.placed_cores.size_r(); - float queue_relative_size_c = (float)op_size_c / (float)queue_grid.columns; - float queue_relative_size_r = (float)op_size_r / (float)queue_grid.rows; - - std::uint32_t op_start_c = (float)dram_buffer.col * queue_relative_size_c; - std::uint32_t op_start_r = (float)dram_buffer.row * queue_relative_size_r; - - float op_end_c_fl = (float)(dram_buffer.col + 1) * queue_relative_size_c; - float op_end_r_fl = (float)(dram_buffer.row + 1) * queue_relative_size_r; - - // If we moved a bit into the next core, include it, otherwise don't - std::uint32_t op_end_c = ((int)op_end_c_fl == op_end_c_fl) ? op_end_c_fl - 1 : op_end_c_fl; - std::uint32_t op_end_r = ((int)op_end_r_fl == op_end_r_fl) ? op_end_r_fl - 1 : op_end_r_fl; - - if (op->is_matmul()) - { - std::vector cores; - switch (operand) - { - case 0: // Activations are only read by the first column. - for (std::uint32_t row = op_start_r; row <= op_end_r; row++) - { - Coord c = - placement.grid_transpose - ? Coord{.row = placement.placed_cores.start.row, .col = placement.placed_cores.start.col + row} - : Coord{ - .row = placement.placed_cores.start.row + row, - .col = placement.placed_cores.start.col}; - cores.push_back(c); - } - break; - - case 1: - case 2: // Weights are read by the last row and broadcast up - for (std::uint32_t col = op_start_c; col <= op_end_c; col++) - { - Coord c = - placement.grid_transpose - ? Coord{.row = placement.placed_cores.start.row + col, .col = placement.placed_cores.end.col - 1} - : Coord{ - .row = placement.placed_cores.end.row - 1, - .col = placement.placed_cores.start.col + col}; - cores.push_back(c); - } - break; - } - return cores; - } - else - { - // Even distribution for other ops - std::vector cores; - for (std::uint32_t row = op_start_r; row <= op_end_r; row++) - { - for (std::uint32_t col = op_start_c; col <= op_end_c; col++) - { - Coord c = - placement.grid_transpose - ? 
Coord{.row = placement.placed_cores.start.row + col, .col = placement.placed_cores.start.col + row} - : Coord{ - .row = placement.placed_cores.start.row + row, - .col = placement.placed_cores.start.col + col}; - cores.push_back(c); - } - } - return cores; - } - } - - // Not an op - return {placement.placed_cores.start}; -} - -// Writing is always 1-1 for now -Coord get_writer_core(const OpPlacement &placement, Coord dram_buffer) -{ - return Coord{ - .row = placement.placed_cores.start.row + dram_buffer.row, - .col = placement.placed_cores.start.col + dram_buffer.col}; -} - -// Generate consumer locations for a queue -std::unordered_map>>> -get_consumer_locations( - const PlacerSolution &placer_solution, - const Graph *graph, - const Node *node, - GridShape queue_grid, - const DramPlacerConfig &config, - const std::vector &harvested_rows) -{ - std::unordered_map>>> - consumer_loc; - - for (std::uint32_t row = 0; row < queue_grid.rows; row++) - { - for (std::uint32_t col = 0; col < queue_grid.columns; col++) - { - for (Edge user_edge : graph->user_data_edges(node)) - { - const Node *user = graph->node_by_id(user_edge.consumer_node_id); - auto it = placer_solution.name_to_op_placement.find(user->name()); - TT_LOG_ASSERT( - it != placer_solution.name_to_op_placement.end(), - "Consumer {} of queue {} not placed", - user->name(), - node->name()); - std::vector readers = get_reader_cores( - user, it->second, user_edge.consumer_input_port_id, Coord{.row = row, .col = col}, queue_grid); - for (Coord reader : readers) - { - consumer_loc[row][col].push_back( - {logical_to_physical_coord(reader, config.device_config, harvested_rows), - it->second.epoch_id()}); - } - } - } - } - - return consumer_loc; -} - -// Generate producer locations for a queue -std::unordered_map>> -get_producer_locations( - const PlacerSolution &placer_solution, - const Graph *graph, - const Node *node, - GridShape queue_grid, - const DramPlacerConfig &config, - const std::vector &harvested_rows) -{ - std::unordered_map>> producer_loc; - - if (node->as()->is_input()) - { - if (node->as()->input_type() == graphlib::InputNodeType::Activation) - { - // Producer is PCIe - for (std::uint32_t row = 0; row < queue_grid.rows; row++) - { - for (std::uint32_t col = 0; col < queue_grid.columns; col++) - { - // TODO - get PCIe from backend - producer_loc[row][col] = {Coord{.row = 3, .col = 0}, 0}; - } - } - } - // Return empty for other inputs, as they are filled in previous epochs or constants - return producer_loc; - } - - auto operands = graph->data_operands(node); - TT_ASSERT(operands.size() == 1); - auto it = placer_solution.name_to_op_placement.find(operands[0]->name()); - TT_LOG_ASSERT( - it != placer_solution.name_to_op_placement.end(), - "Producer {} of queue {} not placed", - operands[0]->name(), - node->name()); - for (std::uint32_t row = 0; row < queue_grid.rows; row++) - { - for (std::uint32_t col = 0; col < queue_grid.columns; col++) - { - Coord writer = get_writer_core(it->second, Coord{.row = row, .col = col}); - producer_loc[row][col] = { - logical_to_physical_coord(writer, config.device_config, harvested_rows), it->second.epoch_id()}; - } - } - - return producer_loc; -} - -// -// The DRAM queues are split into buffers, one for each of the cores that is reading from a queue. These buffers can be -// freely allocated to any DRAM channel. -// -// The placer's job is to pick optimal dram channels for these buffers to maximize bandwidth and minimize latency, while -// ensuring that all dram queues fit in dram. 
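// A purely illustrative sizing example (the numbers are made up, not taken from any
// real netlist): a queue placed on a 2x4 grid is split into 2*4 = 8 buffers. If the
// block shape and number of entries work out to roughly 1 MB per buffer (computed via
// get_queue_size() / tt::backend::get_io_size_in_bytes() and rounded up with
// get_next_aligned_address()), about 8 MB of DRAM has to be found, and each of the 8
// buffers may be assigned to a different DRAM channel by the channel picker.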
-// -// If total queue size is simply too big, placer will fail to allocate successfully and this function will return false. -// -// This function assumes that ops are already placed and placer_solution is populated with the current solution. -void place_dram_queues( - Graph *graph, - PlacerSolution &placer_solution, - balancer::BalancerSolution &balancer_solution, - const HostMemoryPlacerConfig &host_memory_placer_config, - const DramPlacerConfig &config, - std::vector &chip_dram_allocators) -{ - std::map linked_queues; - std::unordered_map> scheduled_queue_placements; - std::uint32_t max_chip_id = 0; - std::uint32_t mmio_chip_index = 0; - - // Get harvested rows once, since it's an expensive query - std::unordered_map> harvested_rows; // per chip_id - auto get_harvested_rows = [&config, &harvested_rows](std::uint32_t chip_id) - { - if (harvested_rows.count(chip_id) == 0) - { - harvested_rows[chip_id] = - config.device_config.get_harvested_rows(config.device_config.get_harvested_cfg()[chip_id]); - } - return harvested_rows.at(chip_id); - }; - - for (Node *node : graphlib::topological_sort(*graph)) - { - Node *ref_node = get_reference_node(graph, node); // Node from which we'll get the placement information - if (ref_node == nullptr) - continue; - - bool already_placed = is_queue_already_placed(placer_solution, node); - bool already_allocated = is_queue_already_allocated(placer_solution, node); - - if (already_allocated) - { - log_trace(LogPlacer, "Skipping queue {} since it is already allocated.", node->name()); - continue; - } - if (is_host_queue(host_memory_placer_config, graph, node)) - { - continue; - } - - OpPlacement placement; - CoordRange queue_coord_range; - GridShape queue_grid; - balancer::BlockShape block_shape; - - try - { - placement = placer_solution.name_to_op_placement.at(ref_node->name()); - queue_coord_range = placement.placed_cores; - if (ref_node->get_type() == "BudaOp::ethernet_datacopy") - { - auto const &grid_shape = balancer_solution.op_models.at(ref_node->name()).grid_shape; - queue_coord_range = CoordRange{ - .start = Coord{.row = 0, .col = 0}, - .end = Coord{.row = (uint32_t)grid_shape.r, .col = (uint32_t)grid_shape.c}}; - } - block_shape = balancer_solution.block_shapes.at( - (node->node_type() == graphlib::NodeType::kQueue) ? ref_node->name() : node->name()); - if (node->node_type() == graphlib::NodeType::kQueue and - balancer_solution.op_models.at(ref_node->name()).has_sparse_buffer()) - { - TT_ASSERT((queue_coord_range.size_c() % 2) == 0); - queue_coord_range.end.col = queue_coord_range.start.col + (queue_coord_range.size_c() / 2); - } - } - catch (std::out_of_range &e) - { - throw std::runtime_error( - "Placement for node " + ref_node->name() + " from queue " + node->name() + " is missing."); - } - bool linked_queue = - node->node_type() == graphlib::NodeType::kOutput and - not graph->user_edges(node, [](Edge e) { return e.edge_type == graphlib::EdgeType::kPartialDataCopy; }) - .empty(); - - if (node->node_type() == graphlib::NodeType::kInput) - { - queue_grid = placer_solution.input_queue_to_grid_shape.at(node->name()); - // Adjust the range to queue grid - queue_coord_range.end.row = queue_coord_range.start.row + queue_grid.rows; - queue_coord_range.end.col = queue_coord_range.start.col + queue_grid.columns; - } - else - { - bool grid_transpose = placement.grid_transpose; - queue_grid = GridShape( - (grid_transpose) ? queue_coord_range.size_c() : queue_coord_range.size_r(), - (grid_transpose) ? 
queue_coord_range.size_r() : queue_coord_range.size_c()); - } - // Loopback queue (i.e. queue that optmizer writes to) should not have 'HOST' as their input even - // though the host will be initializing them. - std::string input_name = ref_node->name(); - if (node->node_type() == graphlib::NodeType::kInput) - { - std::vector loopback_edges = graph->operand_edges( - node, [](graphlib::Edge e) { return e.edge_type == graphlib::EdgeType::kDataLoopback; }); - if (loopback_edges.size() > 0) - { - input_name = graph->node_by_id(loopback_edges[0].producer_node_id)->name(); - } - else - { - input_name = "HOST"; - } - } - - // Output and intermediate queues depend on the producer grid. There can only be one writer to the queue. - // If the output queue is to be placed on host, then no allocation is needed - if (linked_queue) - { - node->as()->set_memory_access_type(graphlib::MemoryAccessType::RAM); - auto partial_loopback_edges = - graph->user_edges(node, [](Edge e) { return e.edge_type == graphlib::EdgeType::kPartialDataCopy; }); - TT_ASSERT(partial_loopback_edges.size() == 1); - auto linked_node = graph->node_by_id(partial_loopback_edges[0].consumer_node_id); - log_debug(tt::LogPlacer, "Linked node {}", linked_node->name()); - auto writer_block_shape = balancer_solution.block_shapes.at(linked_node->name()); - auto reader_block_shape = balancer_solution.block_shapes.at(node->name()); - TT_ASSERT(writer_block_shape.t % reader_block_shape.t == 0); - int multiplier = writer_block_shape.t / reader_block_shape.t; - node->as()->set_num_entries(multiplier * graph->get_microbatch()); - node->as()->set_alias(linked_node->name()); - linked_node->as()->set_memory_access_type(graphlib::MemoryAccessType::RAM); - linked_node->as()->set_num_entries(graph->get_microbatch()); - linked_queues.emplace(node->name(), linked_node->name()); - placer_solution.name_to_queue_placement.insert(std::make_pair( - node->name(), - QueuePlacement{ - .name = node->name(), - .input_name = ref_node->name(), - .grid_shape = queue_grid, - .on_host = false, - .chip_id = placement.chip_id, - .dram_buffers = {}, - .host_buffers = {}, - .write_stride = multiplier, - })); - log_debug(tt::LogPlacer, "Adding linked queue {}", node->name()); - } - else - { - // Currently assume that all consumers to the queue belong to the same chip - int producer_chip_id = placement.chip_id; - int consumer_chip_id = -1; - for (Node *user : graph->data_users(node)) - { - if (placer_solution.name_to_op_placement.find(user->name()) != - placer_solution.name_to_op_placement.end()) - { - int current_consumer_chip_id = placer_solution.name_to_op_placement.at(user->name()).chip_id; - if (consumer_chip_id != -1 and consumer_chip_id != current_consumer_chip_id) - { - if (config.device_config.is_grayskull()) - { - throw std::runtime_error( - "Placement for queue " + node->name() + " contains multiple remote consumers."); - } - else - { - // Wormhole allows this, but we need to turn off the prologue bit - if (node->node_type() == graphlib::NodeType::kInput) - { - node->as()->set_prologue(false); - } - } - } - consumer_chip_id = current_consumer_chip_id; - } - } - - // the target device for the queue placement is set based on the consumer. For both GS and WH, - // we want the consumer to read its input data from an input queue on the same chip. - uint32_t chip_id = consumer_chip_id >= 0 ? 
consumer_chip_id : producer_chip_id; - - // If this is an e2e epoch, figure out which epoch is writing, and the last epoch that is reading, after - // which we can deallocate the queue - std::uint32_t producer_epoch = 0; - std::uint32_t last_consumer_epoch = UINT_MAX; - if (node->as()->queue_type() == graphlib::QueueNodeType::EpochToEpoch) - { - producer_epoch = get_first_epoch_producer(graph, node, placer_solution); - last_consumer_epoch = get_last_epoch_use(graph, node, placer_solution); - } - - // relevant only for grayskull p2p access - bool in_p2p_region_soft = false; - bool in_p2p_region_hard = false; - bool is_input = false; - bool is_prologue = false; - if (node->as()->is_input()) - { - auto input_node = node->as(); - if ((input_node->is_activation() || input_node->is_loss())) - in_p2p_region_soft = ((queue_grid.rows == 1) && (queue_grid.columns == 1)); // try, if possible - is_input = true; - is_prologue = input_node->is_prologue(); - } - if (config.device_config.is_grayskull() and node->as()->is_epoch_to_epoch()) - { - if (producer_chip_id != consumer_chip_id) - in_p2p_region_hard = true; // won't work if we don't fit - } - if (config.device_config.is_wormhole() and in_p2p_region_soft) - { - // Try to put the activations into mmio-capable p2p region to enable the fast-tilize region - // simply round-robin across available mmio capable chips if the consumer chip-id is not mmio capable. - auto mmio_chip_id_it = std::find( - std::begin(config.device_config.chips_with_mmio), - std::end(config.device_config.chips_with_mmio), - chip_id); - if (mmio_chip_id_it == config.device_config.chips_with_mmio.end()) - { - mmio_chip_index = (mmio_chip_index % config.device_config.chips_with_mmio.size()); - chip_id = config.device_config.chips_with_mmio.at(mmio_chip_index++); - } - } - - // Override all dram queue placements with user specified assignments - if (config.manual_dram_queue_placement.find(node->name()) != config.manual_dram_queue_placement.end()) - { - const auto &dram_placement = config.manual_dram_queue_placement.at(node->name()); - if (dram_placement.chip_id.has_value()) - { - log_debug( - tt::LogPlacer, - "Manually placing dram queue {} to chip_id: {}", - node->name(), - dram_placement.chip_id.value()); - chip_id = dram_placement.chip_id.value(); - } - } - - // Save placement information, the actual placement will have in second pass-through - if (chip_id > max_chip_id) - max_chip_id = chip_id; - log_debug(tt::LogPlacer, "\tScheduling queue {} for placement", node->name()); - - scheduled_queue_placements[chip_id].push_back(std::make_pair( - already_placed - ? 
placer_solution.name_to_queue_placement.at(node->name()) - : QueuePlacement{.name = node->name(), .input_name = input_name, .grid_shape = queue_grid, .on_host = false, .chip_id = chip_id, .dram_buffers = {}, .host_buffers = {}, .epoch_allocate = -1, .epoch_deallocate = -1}, - QueueDRAMPlacementParameters{ - .config = &config, - .node = node, - .grid_shape = queue_grid, - .consumer_loc = get_consumer_locations( - placer_solution, graph, node, queue_grid, config, get_harvested_rows(chip_id)), - .producer_loc = get_producer_locations( - placer_solution, graph, node, queue_grid, config, get_harvested_rows(chip_id)), - .block_shape = block_shape, - .producer_epoch = producer_epoch, - .last_consumer_epoch = last_consumer_epoch, - .in_p2p_region_soft = in_p2p_region_soft, - .in_p2p_region_hard = in_p2p_region_hard, - .is_input = is_input, - .is_prologue = is_prologue})); - } - } - - for (std::uint32_t chip_id = 0; chip_id <= max_chip_id; chip_id++) - { - if (scheduled_queue_placements.count(chip_id) == 0) - continue; - log_info(tt::LogPlacer, "Running DRAM allocator for device {}", chip_id); - chip_dram_allocators.at(chip_id).allocate_queues( - scheduled_queue_placements.at(chip_id), config.disable_dynamic_dram); - for (auto &[queue_placement, parameters] : scheduled_queue_placements[chip_id]) - { - log_debug(tt::LogPlacer, "\tAllocating/placing queue {} on chip {}", queue_placement.name, chip_id); - if (placer_solution.name_to_queue_placement.find(parameters.node->name()) == - placer_solution.name_to_queue_placement.end()) - { - placer_solution.name_to_queue_placement.emplace(parameters.node->name(), queue_placement); - } - else - { - placer_solution.name_to_queue_placement[parameters.node->name()] = queue_placement; - } - } - } - for (auto &[key, val] : linked_queues) - { - placer_solution.name_to_queue_placement[key].dram_buffers = - placer_solution.name_to_queue_placement[val].dram_buffers; - placer_solution.name_to_queue_placement[key].write_only = true; - placer_solution.name_to_queue_placement[val].read_only = true; - placer_solution.name_to_queue_placement[key].chip_id = placer_solution.name_to_queue_placement[val].chip_id; - } - log_epoch_to_epoch_queue_info(graph, placer_solution); -} - -} // namespace placer -} // namespace tt diff --git a/pybuda/csrc/placer/dram.hpp b/pybuda/csrc/placer/dram.hpp deleted file mode 100644 index 6248be6c2..000000000 --- a/pybuda/csrc/placer/dram.hpp +++ /dev/null @@ -1,182 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once -#include - -#include "backend_api/device_config.hpp" -#include "balancer/balancer.hpp" -#include "placer/placer.hpp" - -namespace tt -{ -struct DramQueueConfigOverride -{ - std::optional chip_id; - std::optional channel; - DramQueueConfigOverride( - std::optional chip_id = std::nullopt, std::optional channel = std::nullopt) : - chip_id(chip_id), channel(channel) - { - } -}; -using DramQueueMap = std::unordered_map; - -namespace graphlib -{ -class Graph; -} - -namespace placer -{ -class DramAllocator; -struct HostMemoryPlacerConfig; - -struct DramConfig -{ - uint32_t channel; - uint32_t sub_channel; - uint32_t channel_size; - Coord location; - uint32_t initial_dram_offset; - - static std::vector get_config(DeviceConfig const &device_config) - { - std::vector ret; - std::uint32_t num_channels = device_config.get_dram_num_channels(); - std::uint32_t num_subchannels = device_config.get_dram_num_subchannels(); - for (std::uint32_t channel = 0; channel < num_channels; 
channel++) - { - for (std::uint32_t sub_channel = 0; sub_channel < num_subchannels; sub_channel++) - { - ret.push_back(DramConfig{ - channel, - sub_channel, - device_config.get_dram_channel_capacity(), - get_location(device_config, channel, sub_channel), - ((device_config.is_grayskull()) ? device_config.get_dram_backend_reserved_max() : device_config.get_dram_backend_reserved(channel)) + - 0x100 /* tenstorrent/budabackend#461 */, - }); - } - } - return ret; - } - - static Coord get_location(const DeviceConfig &device_config, std::uint32_t channel, std::uint32_t subchannel = 0) - { - // NB: using NOC coordinates here. The coordinates aren't really used by FE. - // it's also inconsistent because grayskull() config is just wrong. - // TODO(jchu): this should all be cleaned up in favour of queries to the BE - // that provide this info explicitly. - const std::vector grayskull_locs = {{0, 0}, {5, 0}, {0, 3}, {5, 3}, {0, 6}, {5, 6}, {0, 9}, {5, 9}}; - if (device_config.is_grayskull()) - { - TT_ASSERT(channel < grayskull_locs.size()); - return grayskull_locs.at(channel); - } - - if (device_config.is_wormhole()) - { - // TT_ASSERT(channel < wormhole_locs.size()); - auto c = device_config.get_dram_core_coord(channel, subchannel); - return {(std::uint32_t)c.y, (std::uint32_t)c.x}; - } - - TT_THROW("Unknown arch: " + device_config.arch_name); - return {0, 0}; - } -}; - -struct DramPlacerConfig -{ - const DeviceConfig &device_config; - - // vector of channels, assume same for each device - std::vector dram_config; - - // Allocate input queues in dram or on host - bool input_queues_on_host; - - // Allocate output queues in dram or on host - bool output_queues_on_host; - - // Disable dynamic dram support - bool disable_dynamic_dram; - - std::uint32_t p2p_offset; - std::uint32_t p2p_size; - std::uint32_t host_mmio_range_offset; - std::uint32_t host_mmio_range_size; - - // Manual dram queue placement - DramQueueMap manual_dram_queue_placement; - - DramPlacerConfig( - DeviceConfig const &device_config, - bool input_queues_on_host, - bool output_queues_on_host, - const DramQueueMap &manual_dram_queue_placement) : - device_config(device_config), manual_dram_queue_placement(manual_dram_queue_placement) - { - // host_mmio is part of the dram memory allocated for host->device acess. it is defined by offset and size. - host_mmio_range_offset = device_config.get_host_mmio_range_offset(); - host_mmio_range_size = device_config.get_host_mmio_range_size(); - // p2p is part of the dram memory allocated for device->device acess. it is defined by offset and size. - p2p_size = device_config.get_p2p_size(); - p2p_offset = device_config.get_p2p_offset(); // same for all channels - TT_ASSERT(p2p_offset + p2p_size == 0x40000000); - dram_config = DramConfig::get_config(device_config); - this->input_queues_on_host = input_queues_on_host; - this->output_queues_on_host = output_queues_on_host; - disable_dynamic_dram = env_as("PYBUDA_DISABLE_DYNAMIC_DRAM"); - } -}; - -struct QueueDRAMPlacementParameters -{ - const DramPlacerConfig *config; - - const Node *node; - - // Producer op grid, which will drive the grid of the queue - GridShape grid_shape; - - // For each grid buffer, list locations of the consumers (i.e. readers) and the producer (i.e. writer). 
The pair is - // coordinate + epoch number - using ConsumerMap = std:: - unordered_map>>>; - using ProducerMap = - std::unordered_map>>; - ConsumerMap consumer_loc; - ProducerMap producer_loc; - - balancer::BlockShape block_shape; - std::uint32_t producer_epoch; - std::uint32_t last_consumer_epoch; - bool in_p2p_region_soft; - bool in_p2p_region_hard; - bool is_input; - bool is_prologue; -}; - -using DRAMScheduleData = std::pair; - -int get_queue_size(const graphlib::QueueNode *node, balancer::BlockShape const &block_shape, bool untilized); - -// Figure out which cores are reading from which dram buffer (or writing to) -// dram_buffer is relative coordinate within the buffer grid -std::vector get_reader_cores( - const Node *node, const OpPlacement &placement, std::uint32_t operand, Coord dram_buffer, GridShape queue_grid); - -// Place and allocate DRAM queues -void place_dram_queues( - graphlib::Graph *graph, - PlacerSolution &placer_solution, - balancer::BalancerSolution &balancer_solution, - const HostMemoryPlacerConfig &host_memory_placer_config, - const DramPlacerConfig &config, - std::vector &chip_dram_allocators); - -} // namespace placer -} // namespace tt - diff --git a/pybuda/csrc/placer/dram_allocator.cpp b/pybuda/csrc/placer/dram_allocator.cpp deleted file mode 100644 index 50d159559..000000000 --- a/pybuda/csrc/placer/dram_allocator.cpp +++ /dev/null @@ -1,744 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 - -#include "placer/dram_allocator.hpp" - -#include - -#include "best_fit_allocator.hpp" -#include "placer/dram.hpp" -#include "reportify/paths.hpp" -#include "third_party/budabackend/common/param_lib.hpp" - -// from backend -namespace tt::backend -{ -extern uint32_t get_next_aligned_address(const uint32_t address); -} - -namespace tt::placer -{ -class RoundRobinPicker : public ChannelPicker -{ - const bool skip_ch0 = env_as("PYBUDA_DISABLE_DRAM0"); - std::uint32_t next_channel = skip_ch0 ? 1 : 0; - - public: - virtual std::uint32_t pick_channel( - const QueueDRAMPlacementParameters & /* parameters */, - Coord /*c*/, - const std::vector> &channel_allocators) override - { - std::uint32_t selected_channel = next_channel; - TT_ASSERT(selected_channel < channel_allocators.size()); - - next_channel++; - if (next_channel >= channel_allocators.size()) - next_channel = skip_ch0 ? 1 : 0; - - return selected_channel; - } -}; - -class RoundRobinFlipFlopPicker : public ChannelPicker -{ - const bool skip_ch0 = env_as("PYBUDA_DISABLE_DRAM0"); - struct group - { - std::uint32_t next_channel; - std::uint32_t min, max; - }; - std::vector groups; - - public: - virtual std::uint32_t pick_channel( - const QueueDRAMPlacementParameters ¶meters, - Coord /*c*/, - const std::vector> &channel_allocators) override - { - // Round robin between two groups of channels, based on producer epoch mod 2 - // If 'is_input' is set, then full set of channels is used - std::uint32_t group = parameters.is_input ? 2 : parameters.producer_epoch % 2; - if (groups.size() == 0) - { - for (std::uint32_t i = 0; i < 3; i++) - { - RoundRobinFlipFlopPicker::group g; - std::uint32_t mid = channel_allocators.size() / 2; - g.min = ((i == 0) || (i == 2)) ? (skip_ch0 ? 1 : 0) : mid; - g.max = (i == 0) ? 
mid : channel_allocators.size(); - g.next_channel = g.min; - groups.push_back(g); - } - } - - // Pick channel from the appropriate group - std::uint32_t selected_channel = groups[group].next_channel; - TT_ASSERT(selected_channel < channel_allocators.size()); - - // Increment next channel - groups[group].next_channel++; - if (groups[group].next_channel >= groups[group].max) - groups[group].next_channel = groups[group].min; - - return selected_channel; - } -}; - -class GreatestCapacityPicker : public ChannelPicker -{ - public: - virtual std::uint32_t pick_channel( - const QueueDRAMPlacementParameters & /*parameters*/, - Coord /*c*/, - const std::vector> &channel_allocators) override - { - std::uint32_t largest_capacity = 0; - std::uint32_t selected_channel = 0; - for (std::uint32_t i = 0; i < channel_allocators.size(); i++) - { - auto capacity = channel_allocators[i]->get_capacity(); - if (capacity > largest_capacity) - { - largest_capacity = capacity; - selected_channel = i; - } - } - TT_ASSERT(selected_channel < channel_allocators.size()); - - return selected_channel; - } -}; - -class ClosestPicker : public ChannelPicker -{ - const Node *current_node = nullptr; - std::unordered_map> - solution; // coord to channel choice for the node - std::unordered_map> - unused_channels_in_epoch; // per epoch, keep track of channels that are not used, in order to avoid splitting - // DRAM bw - // Prologue has a separate pool of channels because it doesn't run at the same time as the epoch - std::unordered_map> - unused_channels_in_epoch_prologue; // per epoch, keep track of channels that are not used in prologue - // Check if channel is unused, and modify it if an alias is available - static bool is_channel_unused( - std::uint32_t &channel, bool is_grayskull, const std::unordered_set &unused_channels); - - public: - virtual std::uint32_t pick_channel( - const QueueDRAMPlacementParameters ¶meters, - Coord /*c*/, - const std::vector> &channel_allocators) override; -}; - -DramAllocator::DramAllocator( - const DramPlacerConfig &dram_config, - const std::string &graph_name, - std::uint32_t chip_id, - std::vector &allocated_blocks, - DRAMPlacementAlgorithm placement_algorithm, - AllocationAlgorithm allocator_algorithm) : - dram_config(dram_config), graph_name(graph_name), chip_id(chip_id) -{ - dram_logger = std::make_unique(); - - switch (placement_algorithm) - { - case ROUND_ROBIN: channel_picker = std::make_unique(); break; - case ROUND_ROBIN_FLIP_FLOP: channel_picker = std::make_unique(); break; - case GREATEST_CAPACITY: channel_picker = std::make_unique(); break; - case CLOSEST: channel_picker = std::make_unique(); break; - default: TT_THROW("Unknown placement algorithm"); - } - - std::unordered_set allocated_channel_blocks; - if (allocated_blocks.size() == 0) - { - for (std::size_t i = 0; i < dram_config.dram_config.size(); i++) - { - // Only once per channel, skip sub-channels - if (allocated_channel_blocks.count(dram_config.dram_config[i].channel)) - continue; - - allocated_channel_blocks.insert(dram_config.dram_config[i].channel); - - allocated_blocks.push_back(Blocks()); - if (dram_config.device_config.is_wormhole()) - { - allocated_blocks.push_back(Blocks()); - } - } - allocated_blocks.push_back(Blocks()); - } - - std::unordered_set allocated_channels; - switch (allocator_algorithm) - { - case BEST_FIT: - std::uint32_t p2p_offset; - std::uint32_t p2p_size; - - if (chip_id == 0) - { - // if chip is first in a set of chips (chip_id == 0), then, it communicates with host, and we use - // host_mmio 
offset and range. this memory is allocated for communication with host. - p2p_offset = dram_config.host_mmio_range_offset; - p2p_size = dram_config.host_mmio_range_size; - } - else - { - // if chip is not first in a set of chips (chip_id != 0), then, it communicates with chip, and we use - // p2p offset and range. this memory is allocated for communication with other chip. - p2p_offset = dram_config.p2p_offset; - p2p_size = dram_config.p2p_size; - } - - // first channel contains memory allocated for communication with host or other device, so first channel - // allocator has smaller space than other chanels. It ends where p2p region starts, so, its last available - // address is p2p_offset - 1. - channel_allocators.push_back(std::make_unique( - dram_config.dram_config[0].initial_dram_offset, p2p_offset - 1, allocated_blocks[0])); - if (dram_config.device_config.is_wormhole()) - { - channel_allocators.push_back(std::make_unique( - std::max(p2p_offset + p2p_size, dram_config.dram_config[0].initial_dram_offset), - dram_config.dram_config[0].channel_size - 1, - allocated_blocks[0])); - } - allocated_channels.insert(0); // 0 is done - - for (std::size_t i = 0; i < dram_config.dram_config.size(); i++) - { - // Don't create extra allocators for subchannels - if (allocated_channels.count(dram_config.dram_config[i].channel) > 0) - continue; - - allocated_channels.insert(dram_config.dram_config[i].channel); - - // channels from second to last don not contain memory space for communication with other host/device. - // They use all space from initial_dram_offset to channel_size - 1. in wormhole, however, we split that - // memory in two parts, since wormhole has separate bandwidths for two halves of dram. - if (dram_config.device_config.is_wormhole()) - { - // in wormhole we split one channel to two 1GB channels because they have separate bandwidths. 
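// Note: this creates two allocators per physical Wormhole channel, at virtual indices
// 2*channel (lower 1GB half) and 2*channel + 1 (upper half); allocate_queues() later
// divides the selected virtual channel index by two so queue placements report the
// real DRAM channel to the backend.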
- channel_allocators.push_back(std::make_unique( - dram_config.dram_config[i].initial_dram_offset, - dram_config.dram_config[i].channel_size / 2 - 1, - allocated_blocks[2 * dram_config.dram_config[i].channel])); - channel_allocators.push_back(std::make_unique( - std::max(dram_config.dram_config[i].initial_dram_offset, dram_config.dram_config[i].channel_size / 2), - dram_config.dram_config[i].channel_size - 1, - allocated_blocks[(2 * dram_config.dram_config[i].channel) + 1])); - } - else - { - channel_allocators.push_back(std::make_unique( - dram_config.dram_config[i].initial_dram_offset, - dram_config.dram_config[i].channel_size - 1, - allocated_blocks[dram_config.dram_config[i].channel])); - } - } - p2p_allocator = - std::make_unique(p2p_offset, p2p_offset + p2p_size - 1, allocated_blocks.back()); - break; - - default: TT_THROW("Unknown placement algorithm"); - } -} - -std::vector DramAllocator::get_blocks() -{ - std::vector blocks; - for (const auto &allocator : channel_allocators) blocks.push_back(allocator->get_blocks()); - blocks.push_back(p2p_allocator->get_blocks()); - return blocks; -} -const std::unique_ptr &DramAllocator::get_allocator( - std::uint32_t channel_index, bool in_p2p_region) const -{ - if (in_p2p_region) - return p2p_allocator; - - TT_ASSERT(channel_index < channel_allocators.size()); - return channel_allocators.at(channel_index); -} - -void DramAllocator::allocate_queues( - std::vector &scheduled_queue_placements, bool disable_dynamic_dram) -{ - // Sort by producer epoch, then by consumer epoch, so that queues that are deallocated at the same time are more - // likely to be allocated next to each other - sort( - scheduled_queue_placements.begin(), - scheduled_queue_placements.end(), - [](const DRAMScheduleData &i1, const DRAMScheduleData &i2) - { - if (i1.second.producer_epoch == i2.second.producer_epoch) - return i1.second.last_consumer_epoch < i2.second.last_consumer_epoch; - return i1.second.producer_epoch < i2.second.producer_epoch; - }); - - auto is_cross_epoch_type = [](const Node *q) -> bool - { - if (q->node_type() != graphlib::NodeType::kQueue) - return false; - if (q->as()->queue_type() != graphlib::QueueNodeType::EpochToEpoch) - return false; - return q->as()->is_cross_epoch_type(); - }; - - auto is_cross_chip_type = [](const Node *q) -> bool - { - if (q->node_type() != graphlib::NodeType::kQueue) - return false; - if (q->as()->queue_type() != graphlib::QueueNodeType::EpochToEpoch) - return false; - return q->as()->is_cross_chip_type(); - }; - - auto is_static_queue = - [disable_dynamic_dram, is_cross_epoch_type, is_cross_chip_type](const Node *node, bool is_input) - { - return disable_dynamic_dram || is_input || is_cross_epoch_type(node) || is_cross_chip_type(node) || - node->as()->is_grad_accumulator(); - }; - - // Allocate all static queues first - std::vector dynamic_queues; - for (std::size_t i = 0; i < scheduled_queue_placements.size(); i++) - { - auto &[queue_placement, parameters] = scheduled_queue_placements[i]; - if (is_static_queue(parameters.node, parameters.is_input)) - { - queue_placement.dram_buffers = allocate_buffers(parameters); - log_debug("\tqueue {}: {} buffers allocated", queue_placement.name, queue_placement.dram_buffers.size()); - } - else - { - dynamic_queues.push_back(i); - } - } - - std::unordered_set deallocated; - std::uint32_t current_epoch = 0; - for (std::size_t i = 0; i < dynamic_queues.size(); i++) - { - auto &[queue_placement, parameters] = scheduled_queue_placements[dynamic_queues[i]]; - - // Allocate new queues - 
queue_placement.dram_buffers = allocate_buffers(parameters); - log_debug("\tqueue {}: {} buffers allocated", queue_placement.name, queue_placement.dram_buffers.size()); - queue_placement.epoch_allocate = parameters.producer_epoch; - - // Deallocate queues that were used in the previous epoch, if we moved epochs, or if this is the last queue to - // place - bool last_queue = (i == dynamic_queues.size() - 1); - if ((parameters.producer_epoch > current_epoch) || last_queue) - { - for (std::size_t index : dynamic_queues) - { - auto &[prev_queue_placement, prev_parameters] = scheduled_queue_placements[index]; - - if ((prev_parameters.producer_epoch > current_epoch) && !last_queue) - break; // only looking into the past - - if ((prev_parameters.last_consumer_epoch <= current_epoch) || last_queue) - { - // Deallocate, if it hasn't been deallocated - if (deallocated.count(prev_parameters.node) > 0) - continue; - - for (auto &buffer : prev_queue_placement.dram_buffers) - { - // TODO: if p2p_soft, we might not be allocated there, in which case we need to deallocate - // elsewhere - get_allocator( - buffer.dram_channel, - prev_parameters.in_p2p_region_soft | prev_parameters.in_p2p_region_hard) - ->deallocate(buffer.dram_address); - dram_logger->log_deallocate( - buffer.dram_channel, buffer.dram_address, prev_parameters.last_consumer_epoch); - prev_queue_placement.epoch_deallocate = prev_parameters.last_consumer_epoch; - } - - deallocated.insert(prev_parameters.node); - } - } - current_epoch = parameters.producer_epoch; - } - } - - // pass through scheduled queue placements and divide channel by two to get real dram channel index. - // we do this here because we need virtual channel indices in above lines, for allocator manipulation - if (dram_config.device_config.is_wormhole()) - { - for (std::size_t i = 0; i < scheduled_queue_placements.size(); i++) - { - auto &[queue_placement, parameters] = scheduled_queue_placements[i]; - for (std::size_t j = 0; j < queue_placement.dram_buffers.size(); j++) - { - int channel = queue_placement.dram_buffers[j].dram_channel; - queue_placement.dram_buffers[j].dram_channel = channel / 2; - } - } - } - - dram_logger->dump_to_reportify( - reportify::get_default_reportify_path(graph_name) + reportify::get_memory_report_relative_directory(), - graph_name); -} - -static bool is_prologue_queue(const Node *node) -{ - return node->as()->is_input() and node->as()->is_prologue(); -} - -static bool is_output_queue(const Node *node) { return node->as()->is_output(); } - -std::vector DramAllocator::allocate_buffers(const QueueDRAMPlacementParameters ¶meters) -{ - std::vector buffer_placement; - - std::uint32_t queue_size = - get_queue_size(parameters.node->as(), parameters.block_shape, false); - TT_ASSERT(queue_size > 0, "Queue size must be more than 0"); - - // Adjust for alignment - queue_size = tt::backend::get_next_aligned_address(queue_size); - - const std::uint32_t num_channels = channel_allocators.size(); - - // Patch on DRAM channel selection to get around wormhole_a0 issue - bool assign_to_same_channel = this->dram_config.device_config.is_wormhole() and - not this->dram_config.device_config.is_wormhole_b0() and - not is_prologue_queue(parameters.node) and not is_output_queue(parameters.node); - - for (std::uint32_t row = 0; row < parameters.grid_shape.rows; row++) - { - for (std::uint32_t col = 0; col < parameters.grid_shape.columns; col++) - { - bool force_channel_selection = assign_to_same_channel and not buffer_placement.empty(); - std::uint32_t addr; - std::uint32_t 
channel; - - if (auto queue_override_it = this->dram_config.manual_dram_queue_placement.find(parameters.node->name()); - queue_override_it != this->dram_config.manual_dram_queue_placement.end() and - queue_override_it->second.channel.has_value()) - { - auto channel_override = queue_override_it->second.channel.value(); - log_debug( - tt::LogPlacer, - "Manually placing dram queue {} to channel: {}", - parameters.node->name(), - channel_override); - channel = channel_override; - } - else if (force_channel_selection) - { - channel = buffer_placement.front().dram_channel; - } - else - { - channel = channel_picker->pick_channel(parameters, Coord{row, col}, channel_allocators); - } - - bool allocated = false; - bool try_p2p_region = - parameters.in_p2p_region_soft | - parameters.in_p2p_region_hard; // try first, fall back to normal channels if not possible (if ok) - for (std::size_t attempt = 0; attempt < num_channels + (try_p2p_region ? 1 : 0); attempt++) - { - if (get_allocator(channel, try_p2p_region)->allocate(queue_size, addr)) - { - allocated = true; - dram_logger->log_allocate(parameters.node, channel, addr, queue_size, parameters.producer_epoch); - if (try_p2p_region) - channel = 0; - break; - } - if (!try_p2p_region and not force_channel_selection) - { - channel++; - if (channel >= num_channels) - channel = 0; - } - else - { - if (parameters.in_p2p_region_hard) - { - log_fatal( - tt::LogPlacer, - "Failed to allocate queue {} of size {} ({} MB) in p2p dram on chip {}", - parameters.node->name(), - queue_size, - int(queue_size * 1.0 / (1024 * 1024)), - chip_id); - } - } - - try_p2p_region = false; - } - - if (!allocated) - { - log_fatal( - tt::LogPlacer, - "Failed to allocate queue {} of size {} ({} MB) in dram, as there's no room left on chip {}", - parameters.node->name(), - queue_size, - int(queue_size * 1.0 / (1024 * 1024)), - chip_id); - } - - int real_channel = channel; // not virtual - if (dram_config.device_config.is_wormhole()) - { - real_channel = channel / 2; - } - - buffer_placement.push_back(QueueBufferPlacement{ - .dram_channel = channel, // this is still virtual channel, meaning that it has to be divided by 2 to - // get actual dram channel. - .dram_address = addr, - .dram_channel_location = dram_config.dram_config[real_channel].location, - .buffer_size = queue_size, - }); - } - } - - return buffer_placement; -} - -std::uint32_t noc_distance(const Coord &start, const Coord &end, const tt::DeviceGrid &grid_size, std::uint32_t noc) -{ - // NOC0 goes right and down, NOC1 goes left and up. - auto grid_r = grid_size.r + 2; // physical grid size - auto grid_c = grid_size.c + 2; - - if (noc == 0) - { - // Check for wrap - auto x_dist = (start.col <= end.col) ? end.col - start.col : grid_c - start.col + end.col; - auto y_dist = (start.row <= end.row) ? end.row - start.row : grid_r - start.row + end.row; - return x_dist + y_dist; - } - else - { - // Check for wrap - auto x_dist = (start.col >= end.col) ? start.col - end.col : grid_c - end.col + start.col; - auto y_dist = (start.row >= end.row) ? 
start.row - end.row : grid_r - end.row + start.row; - return x_dist + y_dist; - } -} - -// Check if channel is unused, and modify it if an alias is available -bool ClosestPicker::is_channel_unused( - std::uint32_t &channel, bool is_grayskull, const std::unordered_set &unused_channels) -{ - if (unused_channels.count(channel) == 0) - { - // Check the other one for wormhole - if (!is_grayskull && (channel % 2 == 0) && unused_channels.count(channel + 1) > 0) - { - channel += 1; - return true; - } - return false; - } - return true; -} - -// Pick the closest DRAM channel, to minimize the distance on either NOC0 or NOC1 -// For each channel, we will calculate the average distance, on each noc, to all consumers, and the producer. If the -// channel has more than one location, all will be calculated, and lowest picked. Finally, the channel with the lowest -// average distance is picked. -std::uint32_t ClosestPicker::pick_channel( - const QueueDRAMPlacementParameters ¶meters, - Coord c, - const std::vector> &channel_allocators) -{ - const auto &config = *parameters.config; - const auto *node = parameters.node; - - if (node == current_node) - { - // We already have a solution, just return it. - TT_ASSERT(solution.count(c.row) > 0); - TT_ASSERT(solution.at(c.row).count(c.col) > 0); - log_trace( - tt::LogPlacer, "Picking channel {} for queue {} at {}", solution.at(c.row).at(c.col), node->name(), c); - return solution.at(c.row).at(c.col); - } - - // We don't have a solution yet, so we need to calculate it. - bool is_grayskull = parameters.config->device_config.is_grayskull(); - - std::unordered_set unused_channels; - auto reset_channels = [&config, is_grayskull](std::unordered_set &channels) - { - channels.clear(); - for (std::size_t i = 0; i < config.dram_config.size(); i++) - { - if (is_grayskull) - { - channels.insert(config.dram_config[i].channel); - } - else - { - channels.insert(config.dram_config[i].channel * 2); - channels.insert(config.dram_config[i].channel * 2 + 1); - } - } - }; - reset_channels(unused_channels); - - // - // Each buffer has to be allocated a single channel. However, each epoch can have a different subchannel, i.e. a - // differe core to read from, or to write to. So, we need to go through all the epochs and pick the channel that - // minimizes the distance to all consumers, from whatever the best core is in each epoch. - // - // There's currently no way to specify this in the netlist, one we pick the channel, but we will eventually have it. - // In the meantime, we rely on backend to pick the closest core whenever possible. 
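// Illustrative sketch of the selection idea described above, not the actual
// implementation: for each candidate DRAM location, take the shorter of the
// NOC0/NOC1 wrap-around distances to every reader, sum them up, and keep the
// channel with the smallest total. Grid size and coordinates are made up.
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <limits>
#include <vector>

struct XY { std::uint32_t row, col; };

// NOC0 routes right and down, NOC1 routes left and up; both wrap around the grid.
static std::uint32_t wrap_dist(const XY &a, const XY &b, std::uint32_t grid_r, std::uint32_t grid_c, int noc)
{
    if (noc == 0)
        return ((b.col >= a.col) ? b.col - a.col : grid_c - a.col + b.col) +
               ((b.row >= a.row) ? b.row - a.row : grid_r - a.row + b.row);
    return ((a.col >= b.col) ? a.col - b.col : grid_c - b.col + a.col) +
           ((a.row >= b.row) ? a.row - b.row : grid_r - b.row + a.row);
}

int main()
{
    const std::uint32_t grid_r = 12, grid_c = 10;                      // hypothetical physical grid
    std::vector<XY> dram_locations = {{0, 1}, {6, 1}, {0, 8}, {6, 8}}; // hypothetical channel locations
    std::vector<XY> readers = {{2, 3}, {2, 4}, {3, 3}};                // hypothetical consumer cores

    std::size_t best_channel = 0;
    std::uint32_t best_total = std::numeric_limits<std::uint32_t>::max();
    for (std::size_t ch = 0; ch < dram_locations.size(); ch++)
    {
        std::uint32_t total = 0;
        for (const XY &r : readers)
            total += std::min(wrap_dist(dram_locations[ch], r, grid_r, grid_c, 0),
                              wrap_dist(dram_locations[ch], r, grid_r, grid_c, 1));
        if (total < best_total) { best_total = total; best_channel = ch; }
    }
    std::printf("closest channel: %zu (total distance %u)\n", best_channel, best_total);
    return 0;
}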
- // - - for (std::uint32_t row = 0; row < parameters.grid_shape.rows; row++) - { - for (std::uint32_t col = 0; col < parameters.grid_shape.columns; col++) - { - // Get producers and consumers for this buffer in the queue - TT_LOG_ASSERT( - parameters.consumer_loc.count(row) > 0, - "Missing consumer location for queue {} for row {}", - node->name(), - row); - TT_LOG_ASSERT( - parameters.consumer_loc.at(row).count(col) > 0, - "No consumers for queue {} at row={}, col={}", - node->name(), - row, - col); - std::vector> consumers = parameters.consumer_loc.at(row).at(col); - - // Per channel, per epoch, record best distance and subchannel that has it - std:: - unordered_map>> - best_distance; - - // Record used epochs so we can check for unused channels per epoch - std::unordered_set used_epochs; - for (auto &consumer : consumers) - { - used_epochs.insert(consumer.second); - } - - // Producer is optional - some queues are filled from the host before the epoch runs (i.e. constants/parameters) - if ((parameters.producer_loc.count(row) > 0) && (parameters.producer_loc.at(row).count(col) > 0)) - { - std::pair producer = parameters.producer_loc.at(row).at(col); - used_epochs.insert(producer.second); - } - - // Go through all dram locations, and find the distance to all consumers and the producer. If it's shorter - // than current best_distance, update best_distance and best_channel. - for (auto &dram_c : config.dram_config) - { - // Only consider channels that haven't been picked. Grayskull has 1-1 mapping, but Wormhole actually has - // 2 channels for each one in dram config. - auto real_channel = is_grayskull ? dram_c.channel : dram_c.channel * 2; - if (!is_channel_unused(real_channel, is_grayskull, unused_channels)) - { - continue; - } - - // Also check per-epoch unused channels, and skip the channel if it's been used - auto &epoch_map = parameters.is_prologue ? unused_channels_in_epoch_prologue : unused_channels_in_epoch; - for (auto &epoch : used_epochs) - { - if (epoch_map.count(epoch) == 0) - { - epoch_map[epoch] = std::unordered_set(); - reset_channels(epoch_map[epoch]); - } - if (!is_channel_unused(real_channel, is_grayskull, epoch_map[epoch])) - { - continue; - } - } - - std::unordered_map distance; // per epoch - - for (auto &consumer : consumers) - { - auto noc0 = noc_distance(dram_c.location, consumer.first, config.device_config.grid_size, 0); - auto noc1 = noc_distance(dram_c.location, consumer.first, config.device_config.grid_size, 1); - auto d = (noc0 < noc1) ? noc0 : noc1; - distance[consumer.second] += d; - } - - if ((parameters.producer_loc.count(row) > 0) && (parameters.producer_loc.at(row).count(col) > 0)) - { - std::pair producer = parameters.producer_loc.at(row).at(col); - auto noc0 = noc_distance(producer.first, dram_c.location, config.device_config.grid_size, 0); - auto noc1 = noc_distance(producer.first, dram_c.location, config.device_config.grid_size, 1); - auto d = (noc0 < noc1) ? 
noc0 : noc1; - distance[producer.second] += d; - - for (auto it : distance) - { - if (best_distance.count(real_channel) == 0 || best_distance.at(real_channel).count(it.first) == 0 || - it.second < best_distance.at(real_channel).at(it.first).second) - { - best_distance[real_channel][it.first] = std::make_pair(real_channel, it.second); - } - } - } - } - - // For each channel, sum up the epoch distances, and find the best channel - std::uint32_t total_best_distance = std::numeric_limits::max(); - std::uint32_t total_best_channel = 0; - - for (auto &[channel, epoch_distances] : best_distance) - { - std::uint32_t total_distance = 0; - for (auto &[epoch, distance] : epoch_distances) - { - total_distance += distance.second; - } - if (total_distance < total_best_distance) - { - total_best_distance = total_distance; - total_best_channel = channel; - } - } - - solution[row][col] = total_best_channel; - unused_channels.erase(total_best_channel); - - if (unused_channels.empty()) - { - // Go back to picking from all channels - reset_channels(unused_channels); - } - - // Clear per-epoch unused channels - auto &epoch_map = parameters.is_prologue ? unused_channels_in_epoch_prologue : unused_channels_in_epoch; - for (auto &epoch : used_epochs) - { - const bool no_epoch = env_as("PYBUDA_CLOSEST_NO_EPOCH", false); - if (!no_epoch) - epoch_map[epoch].erase(total_best_channel); - if (epoch_map[epoch].empty()) - { - reset_channels(epoch_map[epoch]); - } - } - } - } - current_node = node; - return pick_channel(parameters, c, channel_allocators); -} - -} // namespace tt::placer diff --git a/pybuda/csrc/placer/dram_allocator.hpp b/pybuda/csrc/placer/dram_allocator.hpp deleted file mode 100644 index 5b5c50f76..000000000 --- a/pybuda/csrc/placer/dram_allocator.hpp +++ /dev/null @@ -1,90 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -// -// Allocate queues in DRAM. Keep track of allocated space, try to distribute allocations in a good way. 
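// Simplified best-fit sketch mirroring the ChannelAllocator interface declared
// below; the real implementation is not part of this diff, so details such as
// coalescing of adjacent free blocks and address alignment are omitted here.
#include <cstdint>
#include <map>

class SimpleBestFitAllocator
{
    std::map<std::uint32_t, std::uint32_t> free_blocks;       // start addr -> size
    std::map<std::uint32_t, std::uint32_t> allocated_blocks;  // start addr -> size

   public:
    SimpleBestFitAllocator(std::uint32_t start, std::uint32_t end) { free_blocks[start] = end - start + 1; }

    // Pick the smallest free block that still fits, and split off the remainder.
    bool allocate(std::uint32_t size, std::uint32_t &addr)
    {
        auto best = free_blocks.end();
        for (auto it = free_blocks.begin(); it != free_blocks.end(); ++it)
            if (it->second >= size && (best == free_blocks.end() || it->second < best->second))
                best = it;
        if (best == free_blocks.end())
            return false;

        addr = best->first;
        std::uint32_t remaining = best->second - size;
        free_blocks.erase(best);
        if (remaining > 0)
            free_blocks[addr + size] = remaining;
        allocated_blocks[addr] = size;
        return true;
    }

    void deallocate(std::uint32_t addr)
    {
        auto it = allocated_blocks.find(addr);
        if (it == allocated_blocks.end())
            return;
        free_blocks[addr] = it->second;  // no coalescing in this sketch
        allocated_blocks.erase(it);
    }

    std::uint32_t get_capacity() const
    {
        std::uint32_t total = 0;
        for (const auto &it : free_blocks) total += it.second;
        return total;
    }
};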
-// -#pragma once - -#include "placer/dram.hpp" -#include "placer/dram_logger.hpp" - -namespace tt -{ -namespace placer -{ -struct Block -{ - std::uint32_t addr, size; -}; -struct Blocks -{ - std::map free_blocks_start; // keyed on start addr - std::unordered_map free_blocks_end; // keyed on start+size - std::unordered_map allocated_blocks; // keyed on start -}; -// Allocate buffers within one channel -class ChannelAllocator -{ - public: - ChannelAllocator() {} - virtual ~ChannelAllocator() = default; - virtual bool allocate(std::uint32_t size, std::uint32_t &addr) = 0; // return true if allocated, and update addr - virtual void deallocate(std::uint32_t addr) = 0; - virtual std::uint32_t get_capacity() = 0; - virtual Blocks get_blocks() = 0; -}; - -class ChannelPicker -{ - public: - virtual ~ChannelPicker() = default; - virtual std::uint32_t pick_channel( - const QueueDRAMPlacementParameters ¶meters, - Coord /*c*/, - const std::vector> &channel_allocators) = 0; -}; - -enum DRAMPlacementAlgorithm -{ - ROUND_ROBIN = 1, - ROUND_ROBIN_FLIP_FLOP = 2, - GREATEST_CAPACITY = 3, - CLOSEST = 4 -}; - -enum AllocationAlgorithm -{ - BEST_FIT = 1 -}; - -// Allocate queues across all channels -class DramAllocator -{ - private: - const DramPlacerConfig &dram_config; - const std::string graph_name; - std::uint32_t chip_id; - std::unique_ptr dram_logger; - - std::vector> channel_allocators; - std::unique_ptr p2p_allocator; - std::unique_ptr channel_picker; - - std::vector allocate_buffers(const QueueDRAMPlacementParameters ¶meters); - const std::unique_ptr &get_allocator(std::uint32_t channel_index, bool in_p2p_region) const; - - public: - DramAllocator( - const DramPlacerConfig &dram_config, - const std::string &graph_name, - std::uint32_t chip_id, - std::vector &allocated_blocks, - DRAMPlacementAlgorithm placement_algorithm = ROUND_ROBIN, - AllocationAlgorithm allocator_algorithm = BEST_FIT); - void allocate_queues(std::vector &scheduled_queue_placements, bool disable_dynamic_dram); - std::vector get_blocks(); -}; - -} // namespace placer -} // namespace tt diff --git a/pybuda/csrc/placer/dram_logger.cpp b/pybuda/csrc/placer/dram_logger.cpp deleted file mode 100644 index 4f228de7b..000000000 --- a/pybuda/csrc/placer/dram_logger.cpp +++ /dev/null @@ -1,98 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "placer/dram_logger.hpp" -#include "graph_lib/node.hpp" - -#include -#include -#include "utils/logger.hpp" - -namespace tt::placer { - -void DramLogger::log_allocate( - const graphlib::Node *node, - std::uint32_t dram_channel, - std::uint32_t addr, - std::uint32_t size, - std::uint32_t allocate_epoch) -{ - allocations.push_back(Allocation{ - .name = node->name(), - .dram_channel = dram_channel, - .addr = addr, - .size = size, - .allocate_epoch = allocate_epoch, - .deallocate_epoch = 0}); - log_trace(tt::LogPlacer, "Placing {} to channel: {} at addr: 0x{:x}", node->name(), dram_channel, addr); -} - -void DramLogger::log_deallocate(std::uint32_t dram_channel, std::uint32_t addr, std::uint32_t deallocate_epoch) -{ - for (Allocation &alloc : allocations) - { - if ((alloc.deallocate_epoch == 0) && (alloc.dram_channel == dram_channel) && (alloc.addr == addr)) { - alloc.deallocate_epoch = deallocate_epoch; - return; - } - } - // commenting out for now for wormhole, need to fix - //TT_THROW("Logging a deallocation that can't be found in allocation list."); -} - -void DramLogger::dump_to_reportify(const std::string &output_dir, const std::string 
&test_name) const -{ - if (env_as("PYBUDA_DISABLE_REPORTIFY_DUMP")) - return; - std::experimental::filesystem::create_directories(output_dir); - std::string filename = output_dir + "/memory_dram_dynamic_analysis.json"; - std::ofstream out(filename); - TT_ASSERT(out.is_open(), "Can't open " + filename + " for writing."); - - - std::uint32_t max_dram_ch = 0; - std::unordered_map> alloc_table; - for (const Allocation &alloc : allocations) { - if (alloc.dram_channel > max_dram_ch) - max_dram_ch = alloc.dram_channel; - alloc_table[alloc.dram_channel].push_back(alloc); - } - - out << "{" << std::endl - << " \"test_name\": \"" << test_name << "\", " - << " \"dram\": 1," << std::endl; - - for (std::uint32_t dram_channel = 0; dram_channel <= max_dram_ch; dram_channel++) - { - - out << " \"Dynamic DRAM Map for Bank " << dram_channel << "\": [" << std::endl; - bool first = true; - for (const Allocation &alloc : alloc_table[dram_channel]) - { - if (!first) { - out << ", " << std::endl; - } - first = false; - - out << "{" - << "\"queue_name\": \"" << alloc.name << "\", " - << "\"base\": " << alloc.addr << ", " - << "\"size\": " << alloc.size << ", " - << "\"allocation_cycle\": " << alloc.allocate_epoch; - - if (alloc.deallocate_epoch > 0) - out << ", \"deallocation_cycle\": " << alloc.deallocate_epoch; - - out << "}"; - } - out << std::endl << "]"; - if (dram_channel < max_dram_ch) - out << ", " << std::endl; - - } - out << "}" << std::endl; - out.close(); -} - -} - diff --git a/pybuda/csrc/placer/dram_logger.hpp b/pybuda/csrc/placer/dram_logger.hpp deleted file mode 100644 index ce1c2294d..000000000 --- a/pybuda/csrc/placer/dram_logger.hpp +++ /dev/null @@ -1,47 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -// -// Log DRAM allocations / deallocations to analyze later -// -#pragma once - -#include -#include -#include - -namespace tt::graphlib { - class Node; -} - -namespace tt::placer { - -class DramLogger { - struct Allocation { - std::string name; - std::uint32_t dram_channel; - std::uint32_t addr; - std::uint32_t size; - std::uint32_t allocate_epoch; - std::uint32_t deallocate_epoch = 0; - }; - - std::vector allocations; - -public: - void log_allocate( - const graphlib::Node *node, - std::uint32_t dram_channel, - std::uint32_t addr, - std::uint32_t size, - std::uint32_t allocate_epoch); - - void log_deallocate( - std::uint32_t dram_channel, - std::uint32_t addr, - std::uint32_t deallocate_epoch); - - void dump_to_reportify(const std::string &output_dir, const std::string &test_name) const; -}; - -} diff --git a/pybuda/csrc/placer/epoch_placer.cpp b/pybuda/csrc/placer/epoch_placer.cpp deleted file mode 100644 index 8209242ec..000000000 --- a/pybuda/csrc/placer/epoch_placer.cpp +++ /dev/null @@ -1,410 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "placer/epoch_placer.hpp" - -#include "balancer/balancer.hpp" -#include "balancer/legalizer/legalizer.hpp" -#include "balancer/policies/policy_nlp.hpp" -#include "graph_lib/defines.hpp" -#include "placer/evaluator.hpp" -#include "placer/grid_placer.hpp" -#include "placer/post_epoch_passes.hpp" -#include "placer/pre_epoch_passes.hpp" -#include "scheduler/interactive_scheduler.hpp" - -using namespace tt::balancer; - -namespace tt::placer -{ - -static std::tuple run_balancer( - Graph* graph, - BalancerConfig const& config, - std::shared_ptr cache_collection, - const LegalOpModels& valid_op_models, - std::uint32_t target_cycles) -{ - 
auto graph_solver = get_graph_solver(config, cache_collection, graph, valid_op_models); - legalizer::GraphSolverSolution graph_solver_solution = run_policy_nlp(graph, config, graph_solver, target_cycles); - update_ops_on_selected_op_models(graph, graph_solver_solution.selected_op_models); - return legalizer::resolve_block_shapes(graph, config, graph_solver_solution); -} - -void insert_input_queues(PlacerSolution& placer_solution, const Graph* graph, const OpModelMap& op_models) -{ - // Add input queues to the placer solution - for (auto [node_name, op_model] : op_models) - { - Node* node = graph->get_node_by_name(node_name); - switch (node->node_type()) - { - case graphlib::NodeType::kInput: - { - placer_solution.input_queue_to_grid_shape.insert( - {node_name, - tt::placer::GridShape( - (std::uint32_t)op_model.grid_shape.r, (std::uint32_t)op_model.grid_shape.c)}); - break; - } - default: break; - } - } -} - -PlacerSolution place_epoch( - std::uint32_t epoch_index, - graphlib::NodeEpochType epoch_type, - balancer::BalancerConfig const& config, - OpModelMap const& selected_op_models, - scheduler::InteractiveScheduler& ischeduler) -{ - std::optional starting_coordinate = std::nullopt; - - unordered_map name_to_op_placement; - map epoch_id_to_chip; - unordered_map> epoch_id_to_op_placement; - EpochIdToDeviceGrid epoch_id_to_device_grid; - - epoch_id_to_device_grid.rows = config.device_config.grid_size.r; // TODO: get harvested rows - epoch_id_to_device_grid.columns = config.device_config.grid_size.c; - epoch_id_to_device_grid.initialize_device_grid(epoch_index); - - std::uint32_t current_row = 0; - while (true) - { - std::vector candidate_ops = ischeduler.get_ops(); - if (candidate_ops.size() == 0) - break; - - std::unordered_map op_to_grid_shape; - - // Try, in priority order, to place on the same row, then same epoch - std::optional selected_op = std::nullopt; - DeviceGridPlacement device_grid_placement; - for (std::size_t i = 0; i < candidate_ops.size(); i++) - { - std::string name = candidate_ops.at(i); - auto op_model = selected_op_models.at(name); - op_to_grid_shape.insert( - {name, GridShape((std::uint32_t)op_model.grid_shape.r, (std::uint32_t)op_model.grid_shape.c)}); - - std::optional p = place_one_op( - name, - op_to_grid_shape, - epoch_id_to_device_grid.get_device_grid(epoch_index), - config.op_name_to_placer_overrides, - config.enable_auto_transposing_placement, - starting_coordinate); - - if (!p.has_value()) // doesn't fit - continue; - - bool placed_on_same_row = (device_grid_placement.placed_cores.start.row == current_row); - if (!selected_op.has_value() || placed_on_same_row) - { - device_grid_placement = p.value(); - selected_op = i; - } - - if (placed_on_same_row) - break; // same row, we can't do any better - } - - if (!selected_op.has_value()) - break; // nothing fits in this epoch - - std::string name = candidate_ops.at(selected_op.value()); - ischeduler.accept_op(candidate_ops.at(selected_op.value())); // placed - current_row = device_grid_placement.placed_cores.start.row; - - OpPlacement op_placement = OpPlacement{ - .id = 0, - .name = name, - .chip_id = 0, - .global_epoch_id = epoch_index, - .grid_transpose = device_grid_placement.grid_transpose, - .placed_cores = device_grid_placement.placed_cores}; - name_to_op_placement[op_placement.name] = op_placement; - epoch_id_to_op_placement[epoch_index].push_back(op_placement); - - epoch_id_to_device_grid.fill_device_grid_with_placement( - epoch_index, device_grid_placement.placed_cores.start, op_to_grid_shape.at(name)); - - 
log_debug( - tt::LogPlacer, - "\tPlacing {} with grid_shape ({}, {}) onto:", - op_placement.name, - op_to_grid_shape.at(op_placement.name).rows, - op_to_grid_shape.at(op_placement.name).columns); - log_debug( - tt::LogPlacer, - "\t\t chip_id={}, epoch_id={}, inclusive_start: {}, exclusive_end={}", - op_placement.chip_id, - op_placement.epoch_id(), - op_placement.placed_cores.start, - op_placement.placed_cores.end); - } - - unordered_map epoch_id_to_epoch_info; - - epoch_id_to_epoch_info[epoch_index] = EpochInfo{ - .global_epoch_id = epoch_index, - .temporal_epoch_id = epoch_index, - .spatial_epoch_id = 0, - .epoch_type = epoch_type, - }; - - std::uint32_t num_epochs = (name_to_op_placement.size() == 0) ? 0 : 1; - PlacerSolution placer_solution = PlacerSolution{ - .name_to_op_placement = std::move(name_to_op_placement), - .input_queue_to_grid_shape = {}, - .name_to_queue_placement = {}, - .epoch_id_to_chip = std::move(epoch_id_to_chip), - .epoch_id_to_subgraph_index = {}, - .epoch_id_to_op_placement = std::move(epoch_id_to_op_placement), - .epoch_id_to_device_grid = std::move(epoch_id_to_device_grid), - .epoch_id_to_epoch_info = std::move(epoch_id_to_epoch_info), - .num_epochs = num_epochs, - }; - - return placer_solution; -} - -std::shared_ptr run_epoch_placer( - Graph** graph, - balancer::BalancerConfig const& config, - std::shared_ptr cache_collection) -{ - PlacerHistory history; - - Graph* current_graph = *graph; - - PlacerSolution placer_solution; - scheduler::InteractiveScheduler ischeduler = - scheduler::InteractiveScheduler(config.scheduler_config, current_graph, graphlib::NodeEpochType::Forward); - bool first_epoch = true; - - // Final balancer solution, merged over epochs - OpModelMap selected_op_models; - BlockShapeMap selected_block_shape_map; - OutputHostTMMap selected_output_host_tms; - - // Global graph legal op models, generated by the legalizer - LegalOpModels global_valid_op_models = legalizer::get_legal_op_models(current_graph, config, cache_collection); - auto graph_solver = get_graph_solver(config, cache_collection, current_graph, global_valid_op_models); - - // Figure out initial global target cycle count - std::uint32_t global_target_cycles; - if (auto manual_target = env_as_optional("PYBUDA_NLP_MANUAL_TARGET")) - { - global_target_cycles = *manual_target; - log_info(LogBalancer, "Manual override of target cycles to {}", global_target_cycles); - } - else - { - global_target_cycles = calculate_target_cycles(current_graph, graph_solver, config.device_config.arch_name); - } - - auto [op_models, block_shape_map, output_host_tms, cut_edges] = - run_balancer(current_graph, config, cache_collection, global_valid_op_models, global_target_cycles); - - graphlib::NodeEpochType current_epoch_type = - graphlib::NodeEpochType::Forward; // TODO: we should do bwd first, figure out recompute, etc. 
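// Illustrative numbers only: with a hypothetical global target of 100000 cycles,
// the sweep in the loop below evaluates 10 evenly spaced targets between 0.25x
// and 4x of it and keeps the best-scoring placement among them.
#include <cstdint>
#include <cstdio>
#include <vector>

int main()
{
    const std::uint32_t global_target_cycles = 100000;  // hypothetical
    const std::uint32_t tries = 10;
    const std::uint32_t low = 0.25 * global_target_cycles;
    const std::uint32_t high = 4 * global_target_cycles;

    std::vector<std::uint32_t> eval_target_cycles;
    for (std::uint32_t i = 0; i < tries; i++)
        eval_target_cycles.push_back(low + 1.0 * i * (high - low) / (tries - 1));

    for (std::uint32_t t : eval_target_cycles) std::printf("%u ", t);
    std::printf("\n");  // 25000 66666 108333 150000 191666 233333 275000 316666 358333 400000
    return 0;
}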
- while (true) - { - // First attempt at placement uses global targets and global valid shapes - bool global_target_run = true; - bool epoch_complete = false; - - std::uint32_t eval_attempt = 0; - EpochPlacement best_placement; // currently best placement - - std::vector eval_target_cycles; - std::uint32_t target_cycles_to_try = 10; // todo, config - std::uint32_t target_cycles_low = 0.25 * global_target_cycles; - std::uint32_t target_cycles_high = 4 * global_target_cycles; - for (std::uint32_t i = 0; i < target_cycles_to_try; i++) - { - eval_target_cycles.push_back( - target_cycles_low + 1.0 * i * (target_cycles_high - target_cycles_low) / (target_cycles_to_try - 1)); - } - - while (!epoch_complete) // Run the loop until we're satisfied, or enough attempts were done - { - // Run pre-epoch passes, potentially modifying the graph - std::unique_ptr modified_graph = run_pre_epoch_passes(current_graph, config, history); - - // Run legalizer/solver - if (modified_graph) - { - current_graph = modified_graph.get(); - global_valid_op_models = legalizer::get_legal_op_models(current_graph, config, cache_collection); - std::tie(op_models, block_shape_map, output_host_tms, cut_edges) = - run_balancer(current_graph, config, cache_collection, global_valid_op_models, global_target_cycles); - } - - if (!global_target_run) - { - std::tie(op_models, block_shape_map, output_host_tms, cut_edges) = run_balancer( - current_graph, - config, - cache_collection, - global_valid_op_models, - eval_target_cycles.at(eval_attempt)); - } - - // Run op-by-op placer, and place full epoch - auto checkpoint = ischeduler.save_checkpoint(); - auto epoch_placer_solution = place_epoch( // TODO on epoch type - history.current_epoch(), - current_epoch_type, - config, - op_models, - ischeduler); - - // Run post-epoch passes - PlacerAttemptSummary summary = run_post_epoch_passes(placer_solution, epoch_placer_solution, history); - - if (summary.fail) - { - log_trace( - LogPlacer, - "Placer attempt {} on epoch {} failed.", - history.current_attempt(), - history.current_epoch()); - if (history.current_attempt() > 5) - TT_THROW("Epoch placer failed to place an epoch more than 5 times. 
Aborting."); - } - else - { - log_trace( - LogPlacer, - "Placer attempt {} on epoch {} passed", - history.current_attempt(), - history.current_epoch()); - - auto balancer_solution = std::make_shared( - epoch_placer_solution, op_models, block_shape_map, output_host_tms, cut_edges); - EpochPlacement new_placement( - balancer_solution, - ischeduler.save_checkpoint(), - std::move(modified_graph), - config.device_config.arch_name); - - if (global_target_run) - { - log_debug( - LogPlacer, - "Placer initial eval attempt for epoch {} has score of {}, for target cycles {}", - history.current_epoch(), - new_placement.score(), - global_target_cycles); - } - else - { - log_debug( - LogPlacer, - "Placer eval attempt {} of {} for epoch {} has score of {}, for target_cycles {}", - eval_attempt + 1, - eval_target_cycles.size(), - history.current_epoch(), - new_placement.score(), - eval_target_cycles.at(eval_attempt)); - } - - if (global_target_run || (new_placement.is_better_than(best_placement))) - best_placement = std::move(new_placement); - - if (!global_target_run) - eval_attempt++; - global_target_run = false; - history.reset_attempts(); - - epoch_complete = (eval_attempt >= eval_target_cycles.size()); - } - - ischeduler.restore_checkpoint(checkpoint); - } - - TT_ASSERT(best_placement.valid()); - - // Epoch done, record solution - const auto& best_solution = best_placement.solution(); - for (auto it : best_solution->placer_solution.name_to_op_placement) - { - const std::string& name = it.first; - selected_op_models.insert(std::make_pair(name, best_solution->op_models.at(name))); - selected_block_shape_map.insert(std::make_pair(name, best_solution->block_shapes.at(name))); - - if (best_solution->output_host_tms.count(name) > 0) - selected_output_host_tms.insert(std::make_pair(name, best_solution->output_host_tms.at(name))); - - for (graphlib::Node* input : current_graph->data_operands(current_graph->get_node_by_name(name))) - { - if (input->node_type() == graphlib::kInput) - { - selected_op_models.insert( - std::make_pair(input->name(), best_solution->op_models.at(input->name()))); - selected_block_shape_map.insert( - std::make_pair(input->name(), best_solution->block_shapes.at(input->name()))); - } - } - for (graphlib::Node* output : current_graph->data_users(current_graph->get_node_by_name(name))) - { - if (output->node_type() == graphlib::kOutput) - { - selected_block_shape_map.insert( - std::make_pair(output->name(), best_solution->block_shapes.at(output->name()))); - } - } - } - - if (first_epoch) - placer_solution = best_solution->placer_solution; - else - placer_solution.merge(best_solution->placer_solution); // destroys the original - - // placeholder - placer_solution.epoch_id_to_chip[history.current_epoch()] = 0; - - history.next_epoch(); - first_epoch = false; - - Graph* modified_graph = best_placement.release_graph(); - if (modified_graph) - { - *graph = modified_graph; - current_graph = modified_graph; - } - - ischeduler.restore_checkpoint(best_placement.scheduler_checkpoint()); - - while (ischeduler.done()) - { - // TODO: simple progression for now - if (current_epoch_type == graphlib::NodeEpochType::Forward) - { - current_epoch_type = graphlib::NodeEpochType::Backward; - ischeduler.set_epoch_type(current_epoch_type); - } - else if (current_epoch_type == graphlib::NodeEpochType::Backward) - { - current_epoch_type = graphlib::NodeEpochType::Optimizer; - ischeduler.set_epoch_type(current_epoch_type); - } - else - { - // Add input queues to the placer solution - 
insert_input_queues(placer_solution, current_graph, selected_op_models); - - // Assign chips (todo) - - return std::make_shared( - placer_solution, selected_op_models, selected_block_shape_map, selected_output_host_tms, cut_edges); - } - } - } -} - -} // namespace tt::placer diff --git a/pybuda/csrc/placer/epoch_placer.hpp b/pybuda/csrc/placer/epoch_placer.hpp deleted file mode 100644 index 0499b01f1..000000000 --- a/pybuda/csrc/placer/epoch_placer.hpp +++ /dev/null @@ -1,27 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "balancer/balancer.hpp" -#include "balancer/balancer_cache_collection.hpp" - -namespace tt -{ -namespace graphlib -{ -class Graph; -} - -namespace placer -{ - -// Return balancer solution for the full graps. -// It could modify the graph, and update the pointer -std::shared_ptr run_epoch_placer( - Graph** graph, - balancer::BalancerConfig const& config, - std::shared_ptr cache_collection); - -} // namespace placer -} // namespace tt diff --git a/pybuda/csrc/placer/evaluator.cpp b/pybuda/csrc/placer/evaluator.cpp deleted file mode 100644 index da66a8f8a..000000000 --- a/pybuda/csrc/placer/evaluator.cpp +++ /dev/null @@ -1,93 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 - -#include "placer/evaluator.hpp" - -namespace tt::placer -{ - -EpochPlacement::EpochPlacement( - std::shared_ptr solution, - scheduler::InteractiveScheduler::Checkpoint scheduler_checkpoint, - std::unique_ptr graph, - std::string const& arch_name) : - solution_(solution), scheduler_checkpoint_(scheduler_checkpoint), graph_(std::move(graph)) -{ - score_ = calculate_score(arch_name); -} - -bool EpochPlacement::is_better_than(const EpochPlacement &other) const -{ - if (solution_ == nullptr) - return other.solution_ == nullptr; - - return score_ > other.score_; -} - -float f1_score(float a, float b) { return 2 * a * b / (a + b); } - -float EpochPlacement::calculate_score(std::string const& arch_name) -{ - // Figure out how well we've balanced the ops - if (solution_ == nullptr) - return 0.0; - - // get_execution_cycles() is slow, so we'll only call it once - // TODO: we need some kind of a global cache, since we're still going to call calculate_scores - // many times - std::unordered_map execution_cycles; - std::unordered_map theoretical_cycles; - - const auto &placements = solution_->placer_solution.name_to_op_placement; - for (auto it : placements) - { - const std::string &name = it.first; - const auto &op_model = solution_->op_models.at(name); - execution_cycles.insert(std::make_pair(name, op_model.get_execution_cycles(arch_name))); - if (op_model.op_type() == "matmul") - theoretical_cycles.insert(std::make_pair(name, op_model.get_execution_cycles(arch_name, true))); - } - - std::uint32_t slowest_core = 0; - for (auto it : placements) - { - const std::uint32_t cycles = execution_cycles.at(it.first); - if (cycles > slowest_core) - slowest_core = cycles; - } - - // One goal is to have matmuls run as efficiently as possible. - // Another goal is to keep as many cores as busy as we can. 
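// Illustrative numbers only: the epoch score computed below combines the two
// goals with a harmonic mean (the f1_score defined above). With a hypothetical
// overall core utilization of 0.8 and matmul utilization of 0.5, the epoch
// scores 2*0.8*0.5/(0.8+0.5) ~= 0.615, so a placement that is strong on one
// goal but weak on the other is penalized relative to a balanced one.
#include <cstdio>

static float f1(float a, float b) { return 2 * a * b / (a + b); }

int main()
{
    std::printf("balanced:  %.3f\n", f1(0.65f, 0.65f));  // ~0.650
    std::printf("lopsided:  %.3f\n", f1(0.80f, 0.50f));  // ~0.615
    return 0;
}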
- - float matmul_utilization = 0.0; // indicator of how close to theoretical matmuls are - float balancer_utilization = 0.0; // indicator of how busy all cores are - std::uint32_t matmul_core_count = 0; - for (auto it : placements) - { - const std::string &name = it.first; - const auto &op_model = solution_->op_models.at(name); - balancer_utilization += (1.0 * execution_cycles.at(name) / slowest_core) * op_model.grid_shape.volume(); - if (op_model.op_type() == "matmul") - { - matmul_utilization += (1.0 * theoretical_cycles.at(name) / slowest_core) * op_model.grid_shape.volume(); - matmul_core_count += op_model.grid_shape.volume(); - } - } - - std::uint32_t core_count = solution_->placer_solution.epoch_id_to_device_grid.rows * - solution_->placer_solution.epoch_id_to_device_grid.columns; - - balancer_utilization /= core_count; - if (matmul_core_count > 0) - { - matmul_utilization /= matmul_core_count; - } - - if (matmul_core_count > 0) - return f1_score(balancer_utilization, matmul_utilization); - - return balancer_utilization; -} - -} // namespace tt::placer diff --git a/pybuda/csrc/placer/evaluator.hpp b/pybuda/csrc/placer/evaluator.hpp deleted file mode 100644 index bc3ebeec5..000000000 --- a/pybuda/csrc/placer/evaluator.hpp +++ /dev/null @@ -1,52 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include - -#include "balancer/balancer.hpp" -#include "scheduler/interactive_scheduler.hpp" - -namespace tt -{ - -namespace placer -{ -// Copy of a valid placement solution, along with meta-data that allows selection between them, and -// restorating of the state. -class EpochPlacement -{ - private: - std::shared_ptr solution_; - float score_; - - // state - scheduler::InteractiveScheduler::Checkpoint scheduler_checkpoint_; - std::unique_ptr graph_; - - float calculate_score(std::string const& arch_name); - - public: - EpochPlacement() : solution_(nullptr), score_(0.0), graph_(nullptr) {} - EpochPlacement( - std::shared_ptr solution, - scheduler::InteractiveScheduler::Checkpoint scheduler_checkpoint, - std::unique_ptr graph, - std::string const& arch_name); - - bool valid() const { return solution_ != nullptr; } - float score() const { return score_; } - const scheduler::InteractiveScheduler::Checkpoint &scheduler_checkpoint() const { return scheduler_checkpoint_; } - std::shared_ptr solution() const { return solution_; } - Graph *release_graph() - { - if (graph_ == nullptr) - return nullptr; - return graph_.release(); - } - bool is_better_than(const EpochPlacement &other) const; -}; - -} // namespace placer -} // namespace tt diff --git a/pybuda/csrc/placer/exceptions.hpp b/pybuda/csrc/placer/exceptions.hpp deleted file mode 100644 index 340e0b13f..000000000 --- a/pybuda/csrc/placer/exceptions.hpp +++ /dev/null @@ -1,35 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include -#include -#include -#include - -namespace tt::placer -{ -struct FailToPlaceOnCurrentEpoch : public std::exception -{ - std::string message; - FailToPlaceOnCurrentEpoch(std::string const& message) : message(message) {} - virtual char const* what() const noexcept override { return message.c_str(); } -}; - -struct FailToSatisfyPlacementConstraint : public std::exception -{ - std::string message; - FailToSatisfyPlacementConstraint(std::string const& message) : message(message) {} - virtual char const* what() const noexcept override { return message.c_str(); } -}; - 
-struct FailToSatisfyConflictingConstraint : public std::exception -{ - std::string message; - FailToSatisfyConflictingConstraint(std::string const& message) : message(message) {} - virtual char const* what() const noexcept override { return message.c_str(); } -}; - -} // namespace tt::placer diff --git a/pybuda/csrc/placer/grid_placer.cpp b/pybuda/csrc/placer/grid_placer.cpp deleted file mode 100644 index d9c0065ba..000000000 --- a/pybuda/csrc/placer/grid_placer.cpp +++ /dev/null @@ -1,817 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "placer/grid_placer.hpp" - -#include "placer/utils.hpp" -#include "placer/lowering_utils.hpp" -#include "placer/exceptions.hpp" - -#include "graph_lib/defines.hpp" -#include "utils/logger.hpp" -#include "utils/assert.hpp" - -#include -#include -#include -#include -#include - -using tt::LogPlacer; - -namespace tt { -namespace placer { - -namespace device_grid -{ -DeviceGrid create_empty_device_grid(uint32_t rows, uint32_t columns) -{ - return vector>(rows, vector(columns, 0)); -} -DeviceGrid superposition(const DeviceGrid& a, const DeviceGrid& b) -{ - const uint32_t a_rows = a.size(); - const uint32_t a_cols = a.at(0).size(); - const uint32_t b_rows = a.size(); - const uint32_t b_cols = a.at(0).size(); - TT_ASSERT(a_rows == b_rows); - TT_ASSERT(a_cols == b_cols); - - DeviceGrid new_device_grid = create_empty_device_grid(a_rows, a_cols); - for (uint32_t i = 0; i < a_rows; ++i) { - for (uint32_t j = 0; j < a_cols; ++j) { - if (a[i][j]) - { - new_device_grid[i][j] = a[i][j]; - } - else if (b[i][j]) - { - new_device_grid[i][j] = b[i][j]; - } - else - { - new_device_grid[i][j] = 0; - } - } - } - return new_device_grid; -} - -bool can_place_on_device_grid(const DeviceGrid& device_grid, const Coord& start, const GridShape& shape) -{ - const uint32_t available_rows_on_grid = device_grid.size(); - const uint32_t available_columns_on_grid = device_grid.at(0).size(); - - if (start.row + shape.rows > available_rows_on_grid) - { - return false; - } - else if (start.col + shape.columns > available_columns_on_grid) - { - return false; - } - - for (uint32_t i = start.row; i < start.row + shape.rows; ++i) { - for (uint32_t j = start.col; j < start.col + shape.columns; ++j) { - if (device_grid.at(i).at(j) != 0) { - return false; - } - } - } - return true; -} - -bool contains_empty_device_grid(const DeviceGrid& device_grid) -{ - if (device_grid.empty()) - { - return true; - } - if (device_grid[0].empty()) - { - return true; - } - - for (uint32_t i = 0; i < device_grid.size(); ++i) { - for (uint32_t j = 0; j < device_grid.at(0).size(); ++j) { - if (device_grid[i][j] != 0) { - return false; - } - } - } - return true; -} - -void fill_device_grid_with_placement(DeviceGrid& device_grid, const Coord& op_start, const GridShape& op_grid_shape) -{ - for (uint32_t i = op_start.row; i < op_start.row + op_grid_shape.rows; ++i) { - for (uint32_t j = op_start.col; j < op_start.col + op_grid_shape.columns; ++j) { - device_grid.at(i).at(j) = 1; - } - } -} - -void print_device_grid(const DeviceGrid& device_grid) -{ - for (uint32_t i = 0; i < device_grid.size(); ++i) { - for (uint32_t j = 0; j < device_grid.at(0).size(); ++j) { - std::cout << " " << device_grid.at(i).at(j); - } - std::cout << std::endl; - } -} -std::optional get_first_free_location(const DeviceGrid& device_grid, const GridShape& op_grid_shape) -{ - const uint32_t rows = device_grid.size(); - const uint32_t columns = device_grid.at(0).size(); - for (uint32_t i 
= 0; i < rows; ++i) - { - for (uint32_t j = 0; j < columns; ++j) - { - if (i + op_grid_shape.rows > rows or j + op_grid_shape.columns > columns) - { - continue; - } - - bool is_valid = true; - for (uint32_t op_i = i; op_i < i + op_grid_shape.rows; ++ op_i) - { - for (uint32_t op_j = j; op_j < j + op_grid_shape.columns; ++ op_j) - { - if (device_grid[op_i][op_j] != 0) - { - is_valid = false; - break; - } - } - } - if (is_valid) - { - return Coord{.row = i, .col = j}; - } - } - } - return std::nullopt; -} - -std::optional get_next_grid_coordinate(const DeviceGrid& device_grid, const GridShape& op_grid_shape) -{ - return get_first_free_location(device_grid, op_grid_shape); -} -} // namespace device_grid - -enum class PlacerState { - PLACE_WITH_RELATIVE_OFFSETS, - INCREMENT_EPOCH_AND_PLACE_WITH_RELATIVE_OFFSETS, - ALLOW_EPOCH_INCREMENTS, -}; - -struct DeviceGridConfig { - bool enable_relative_offsets; - bool increment_epoch; - bool allow_increment; -}; - -uint32_t get_active_placement_row_height(const DeviceGrid& device_grid, const std::optional& candidate) -{ - if (not candidate.has_value()) - { - return 0; - } - - const uint32_t device_grid_rows = device_grid.size(); - const uint32_t device_grid_columns = device_grid[0].size(); - bool device_R_larger_than_C = (device_grid_rows > device_grid_columns); - uint32_t row_height = 0; - uint32_t i = candidate.value().row; - uint32_t j = candidate.value().col; - - for (int op_j = j-1; i+row_height < device_grid_rows && op_j >= 0; --op_j) - { - if(device_grid[i+row_height][op_j] != 0) - { - for (uint32_t op_i = i+row_height; op_i < device_grid_rows; ++op_i) - { - if (device_grid[op_i][op_j] == 0) - { - break; - } - row_height++; - } - break; - } - - } - - return (row_height == 0 and device_R_larger_than_C) ? (device_grid_rows - i) : row_height; -} - - -DeviceGridConfig get_device_grid_config_from_strategy( - PlacerState grid_placer_strategy, bool induce_epoch_increment, bool allow_increment=false) -{ - bool enable_relative_offsets = env_as("PYBUDA_TRIPLET_PLACEMENT"); - switch(grid_placer_strategy) - { - case PlacerState::PLACE_WITH_RELATIVE_OFFSETS: - { - return DeviceGridConfig{ - .enable_relative_offsets=enable_relative_offsets, - .increment_epoch=induce_epoch_increment, - .allow_increment=allow_increment, - }; - } - case PlacerState::INCREMENT_EPOCH_AND_PLACE_WITH_RELATIVE_OFFSETS: - { - return DeviceGridConfig{ - .enable_relative_offsets=enable_relative_offsets, - .increment_epoch=true, - .allow_increment=false, - }; - } - case PlacerState::ALLOW_EPOCH_INCREMENTS: - { - return DeviceGridConfig{ - .enable_relative_offsets=false, - .increment_epoch=true, - .allow_increment=true, - }; - } - } - TT_ASSERT("Failed to configure device_grid_config."); - return DeviceGridConfig{}; -} - -bool should_try_auto_transpose_op( - const DeviceGrid& device_grid, - const GridShape& op_grid_shape, - const bool enable_auto_transposing, - const bool manually_transpose_this_op) -{ - const uint32_t device_grid_r = device_grid.size(); - const uint32_t device_grid_c = device_grid[0].size(); - bool is_transposable = op_grid_shape.columns <= device_grid_r and op_grid_shape.rows <= device_grid_c; - TT_LOG_ASSERT((not manually_transpose_this_op) or (manually_transpose_this_op and is_transposable), - "Manually passed op is not transposable, op-grid-shape: {}x{}, device-grid: {}x{}", - op_grid_shape.rows, - op_grid_shape.columns, - device_grid_r, - device_grid_c); - - // check for auto-transpose - bool try_auto_transpose = is_transposable and enable_auto_transposing and 
(op_grid_shape.rows > op_grid_shape.columns); - return try_auto_transpose and not manually_transpose_this_op; -} - -bool apply_auto_transpose( - const bool try_auto_transpose, - const DeviceGrid& device_grid, - const std::optional& coordinate_to_try, - const uint32_t op_grid_R) -{ - return try_auto_transpose and (not coordinate_to_try.has_value() or get_active_placement_row_height(device_grid, coordinate_to_try) < op_grid_R); -} - -std::tuple get_placed_coordinate( - const string& op_name, - const GridShape& op_grid_shape, - EpochIdToDeviceGrid& e, - uint32_t candidate_epoch_id, - std::optional coordinate_to_try, - bool allow_increment, - const bool enable_auto_transposing, - const std::unordered_map &op_to_overrides) -{ - TT_ASSERT(coordinate_to_try.has_value()); - GridShape op_grid_shape_local(op_grid_shape.rows, op_grid_shape.columns); - - e.initialize_device_grid(candidate_epoch_id); - - std::optional op_override = std::nullopt; - if (op_to_overrides.find(op_name) != op_to_overrides.end()) - { - op_override = op_to_overrides.at(op_name); - } - bool manually_transpose_this_op = op_override.has_value() ? op_override.value().transpose_op : false; - - bool try_auto_transpose = should_try_auto_transpose_op( - e.epoch_id_to_device_grid.at(candidate_epoch_id), - op_grid_shape, - enable_auto_transposing, - manually_transpose_this_op); - - if (manually_transpose_this_op) - { - op_grid_shape_local = GridShape(op_grid_shape.columns, op_grid_shape.rows); - } - - // Try conditional after transposing - if (op_override.has_value() and op_override.value().grid_start.has_value()) - { - const auto& user_grid_start = op_override.value().grid_start.value(); - bool can_place_with_user_override = e.can_place_on_device_grid(op_name, candidate_epoch_id, user_grid_start, op_grid_shape_local); - if (can_place_with_user_override) - { - log_debug(LogPlacer, "{} has an op override is now placed at: {}", op_name, op_override.value().grid_start.value()); - coordinate_to_try = op_override.value().grid_start; - } - else if (not e.satisfies_constraints(op_name, user_grid_start, op_grid_shape_local)) - { - const Coord& user_grid_start = op_override.value().grid_start.value(); - - for (const auto& [constraint_name, constraint_grid] : e.op_to_constraints) - { - if (op_name != constraint_name and not device_grid::can_place_on_device_grid(constraint_grid, user_grid_start, op_grid_shape_local)) - { - throw FailToSatisfyConflictingConstraint( - fmt::format("OpPlacement for {} to start at {} it conflicts with the constraint placed at {} : {}.", - op_name, - op_override.value().grid_start.value(), - constraint_name, - op_to_overrides.at(constraint_name).grid_start.value() - ) - ); - } - } - } - else - { - throw FailToSatisfyPlacementConstraint( - fmt::format("User has specified an override of the OpPlacement for {} to start at {} but it is not valid.", - op_name, - op_override.value().grid_start.value() - ) - ); - } - } - - while (not e.can_place_on_device_grid(op_name, candidate_epoch_id, coordinate_to_try.value(), op_grid_shape_local)) - { - const DeviceGrid& device_grid = e.epoch_id_to_device_grid.at(candidate_epoch_id); - coordinate_to_try = e.get_next_grid_coordinate(op_name, candidate_epoch_id, op_grid_shape_local); - if (allow_increment and not coordinate_to_try.has_value()) - { - candidate_epoch_id += 1; - e.initialize_device_grid(candidate_epoch_id); - coordinate_to_try = Coord{.row = 0, .col = 0}; - } - else if (apply_auto_transpose(try_auto_transpose, device_grid, coordinate_to_try, op_grid_shape.rows)) - { - 
std::optional coord_T = e.get_next_grid_coordinate( - op_name, candidate_epoch_id, GridShape(op_grid_shape_local.columns, op_grid_shape_local.rows)); - if (coord_T.has_value() and (not coordinate_to_try.has_value() or coord_T.value() < coordinate_to_try.value() || coord_T.value() == coordinate_to_try.value())) - { - op_grid_shape_local = GridShape(op_grid_shape.columns, op_grid_shape.rows); - coordinate_to_try = Coord{.row = coord_T.value().row, .col = coord_T.value().col}; - } - } - - if (not coordinate_to_try.has_value()) - { - throw FailToPlaceOnCurrentEpoch("ran out of valid placements"); - } - } - TT_ASSERT(coordinate_to_try.has_value()); - bool is_transposed = (op_grid_shape.rows != op_grid_shape_local.rows); - return {coordinate_to_try.value(), candidate_epoch_id, is_transposed}; -} - -std::tuple, Coord> place_on_grid_helper( - const vector& op_names, - const unordered_map& op_to_grid_shape, - const std::unordered_map &op_to_overrides, - const bool enable_auto_transposing, - const DeviceGridConfig config, - const DeviceGrid& device_grid, - std::optional starting_coordinate, - bool return_after_one_epoch = false, - const unordered_map& op_name_to_relative_offset_from_first_op = {}, - const std::unordered_map constraints = {}) -{ - uint32_t current_epoch_id = 0; - if (not starting_coordinate.has_value()) - { - starting_coordinate = Coord{.row=0, .col=0}; - } - - auto e_copy = EpochIdToDeviceGrid(device_grid.size(), device_grid.at(0).size()); - e_copy.initialize_device_grid(current_epoch_id, device_grid); - e_copy.add_constraints(constraints); - - if (config.increment_epoch and not e_copy.contains_empty_grid(current_epoch_id)) - { - current_epoch_id += 1; - starting_coordinate = {.row = 0, .col = 0}; - } - - Coord placed_coordinate = starting_coordinate.value(); - uint32_t candidate_epoch_id = current_epoch_id; - - vector op_placements; - std::optional first_placement_start = std::nullopt; - bool is_op_transpose_enabled = enable_auto_transposing or op_to_overrides.size() > 0; - - for (const auto& op_name : op_names) - { - Coord previous_coordinate = placed_coordinate; - GridShape op_grid_shape = op_to_grid_shape.at(op_name); - - // NB: relative offsets are not applied when op is transposed - bool has_relative_offset = op_name_to_relative_offset_from_first_op.find(op_name) != op_name_to_relative_offset_from_first_op.end(); - if (config.enable_relative_offsets and has_relative_offset and first_placement_start.has_value() and not is_op_transpose_enabled) - { - const CoordOffset& offset = op_name_to_relative_offset_from_first_op.at(op_name); - placed_coordinate = first_placement_start.value() + offset; - } - - bool op_transposed = false; - std::tie(placed_coordinate, candidate_epoch_id, op_transposed) = get_placed_coordinate( - op_name, - op_grid_shape, - e_copy, - candidate_epoch_id, - placed_coordinate, - config.allow_increment, - enable_auto_transposing, - op_to_overrides - ); - - if (return_after_one_epoch and candidate_epoch_id != 0) - { - return {op_placements, previous_coordinate}; - } - - if (op_transposed) - { - op_grid_shape = GridShape(op_grid_shape.columns, op_grid_shape.rows); - } - previous_coordinate = placed_coordinate; - e_copy.fill_device_grid_with_placement(candidate_epoch_id, placed_coordinate, op_grid_shape); - - op_placements.push_back(DeviceGridPlacement{ - .op_name = op_name, - .device_grid_index = candidate_epoch_id, - .placed_cores = CoordRange{.start = placed_coordinate, .end = placed_coordinate + op_grid_shape}, - .grid_transpose = op_transposed, - }); - - if 
(not first_placement_start.has_value()) - { - first_placement_start = placed_coordinate; - } - } - return {op_placements, placed_coordinate}; -} - -std::string to_string(PlacerState state) -{ - switch (state) - { - case PlacerState::PLACE_WITH_RELATIVE_OFFSETS: - { - return "PLACE_WITH_RELATIVE_OFFSETS"; - } - case PlacerState::INCREMENT_EPOCH_AND_PLACE_WITH_RELATIVE_OFFSETS: - { - return "INCREMENT_EPOCH_AND_PLACE_WITH_RELATIVE_OFFSETS"; - } - case PlacerState::ALLOW_EPOCH_INCREMENTS: - { - return "ALLOW_EPOCH_INCREMENTS"; - } - } - TT_ASSERT("PlacerState with undefined string conversion."); - return ""; -} - -std::optional place_one_op( - const string op, - const unordered_map& op_to_grid_shape, - const DeviceGrid& device_grid, - const std::unordered_map &op_to_overrides, - const bool enable_auto_transposing, - std::optional starting_coordinate) -{ - try - { - std::vector ops = {op}; - auto [device_grid_placements, last_placed] = place_on_grid_helper( - ops, - op_to_grid_shape, - op_to_overrides, - enable_auto_transposing, - get_device_grid_config_from_strategy(PlacerState::PLACE_WITH_RELATIVE_OFFSETS, false), - device_grid, - starting_coordinate, - true /* return_after_one_epoch */ - ); - TT_ASSERT(device_grid_placements.size() == 1); - return device_grid_placements.at(0); - } - catch (const FailToPlaceOnCurrentEpoch& e) - { - // can't place on current epoch, return back to user - } - catch (...) - { - TT_ASSERT("place_one_op: caught unhandled exception"); - } - return std::nullopt; -} - -std::tuple, Coord> place_on_grid( - const OpGroupToPlace& op_group_to_place, - const unordered_map& op_to_grid_shape, - const DeviceGrid& device_grid, - const std::unordered_map &op_to_overrides, - const bool enable_auto_transposing, - std::optional starting_coordinate) -{ - std::vector grid_placer_strategies = { - PlacerState::PLACE_WITH_RELATIVE_OFFSETS, - PlacerState::INCREMENT_EPOCH_AND_PLACE_WITH_RELATIVE_OFFSETS, - PlacerState::ALLOW_EPOCH_INCREMENTS, - }; - - for (PlacerState grid_placer_strategy : grid_placer_strategies) - { - try - { - log_trace(LogPlacer, "Placing with strategy: {}", to_string(grid_placer_strategy)); - DeviceGridConfig grid_placer_config = get_device_grid_config_from_strategy(grid_placer_strategy, op_group_to_place.increment_epoch); - return place_on_grid_helper( - op_group_to_place.op_names, - op_to_grid_shape, - op_to_overrides, - enable_auto_transposing, - grid_placer_config, - device_grid, - starting_coordinate, - false /* return_after_one_epoch */, - op_group_to_place.op_name_to_relative_offset_from_first_op - ); - } - catch (const FailToPlaceOnCurrentEpoch& e) - { - // can't place on current epoch, switch to next grid strategy - } - catch (...) - { - TT_ASSERT("place_on_grid: caught unhandled exception"); - } - } - log_fatal("All place_on_grid(..) 
strategies have failed."); - return {}; -} - -vector place_onto_device_grids( - const GridShape& device_grid_shape, - const vector& op_groups_to_place, - const unordered_map& op_to_grid_shape, - const std::unordered_map &op_to_overrides, - const bool enable_auto_transposing) -{ - vector epoch_device_grid_placements; - unordered_map> epoch_id_to_device_grid_placement; - auto epoch_id_to_device_grid = EpochIdToDeviceGrid(device_grid_shape.rows, device_grid_shape.columns); - - uint32_t current_epoch_id = 0; - std::optional current_coordinate = std::nullopt; - - for (const OpGroupToPlace& op_group : op_groups_to_place) - { - vector device_grid_placements; - epoch_id_to_device_grid.initialize_device_grid(current_epoch_id); - std::tie(device_grid_placements, current_coordinate) = place_on_grid( - op_group, - op_to_grid_shape, - epoch_id_to_device_grid.get_device_grid(current_epoch_id), - op_to_overrides, - enable_auto_transposing, - current_coordinate - ); - - for (const DeviceGridPlacement& device_grid_placement : device_grid_placements) - { - - uint32_t device_index = current_epoch_id + device_grid_placement.device_grid_index; - epoch_id_to_device_grid.fill_device_grid_with_placement( - device_index, - device_grid_placement.placed_cores.start, - op_to_grid_shape.at(device_grid_placement.op_name)); - - epoch_id_to_device_grid_placement[device_index].push_back(device_grid_placement); - } - current_epoch_id = epoch_id_to_device_grid.get_current_epoch_id(); - } - - for (uint32_t epoch_id = 0; epoch_id < epoch_id_to_device_grid.epoch_id_to_device_grid.size(); ++epoch_id) - { - epoch_device_grid_placements.emplace_back( - epoch_id_to_device_grid.epoch_id_to_device_grid.at(epoch_id), epoch_id_to_device_grid_placement.at(epoch_id)); - - } - - return epoch_device_grid_placements; -} - -void EpochDevicePlacer::enqueue_workload(const vector& op_groups) -{ - for (const auto& op_group : op_groups) - { - this->remaining_op_groups.push_back(op_group); - } -} - -std::vector EpochDevicePlacer::place_on_grid( - const OpGroupToPlace& op_group_to_place) -{ - auto [device_grid_placements, _] = place_on_grid_helper( - op_group_to_place.op_names, - this->config.op_to_grid_shape, - this->config.op_to_overrides, - this->config.enable_auto_transposing_placement, - get_device_grid_config_from_strategy(PlacerState::PLACE_WITH_RELATIVE_OFFSETS, false), - this->active_device_grid, - std::nullopt, - true /* return_after_one_epoch */, - op_group_to_place.op_name_to_relative_offset_from_first_op, - this->op_to_device_grid_constraint - ); - - for (const auto& op_device_placement : device_grid_placements) - { - auto op_grid_shape = this->config.op_to_grid_shape.at(op_device_placement.op_name); - if (op_device_placement.grid_transpose){ - op_grid_shape = GridShape(op_grid_shape.columns, op_grid_shape.rows); - } - - device_grid::fill_device_grid_with_placement( - this->active_device_grid, - op_device_placement.placed_cores.start, - op_grid_shape); - } - return device_grid_placements; -} -void EpochDevicePlacer::clear_state() -{ - GridShape device_grid_shape(config.get_available_rows_on_device(), config.device_grid.columns); - this->active_device_grid = device_grid::create_empty_device_grid(device_grid_shape.rows, device_grid_shape.columns); - while (not this->placed_op_groups.empty()) { this->placed_op_groups.pop_front(); } - this->active_op_placements.clear(); - this->op_to_device_grid_constraint.clear(); -} - -EpochDeviceGridPlacement EpochDevicePlacer::complete_epoch() -{ - // either we have to place this on a new epoch OR - 
// apply constraint and then replace the previous ops - auto complete_epoch = EpochDeviceGridPlacement(this->active_device_grid, this->active_op_placements); - - // Reset state - this->clear_state(); - - //device_grid::print_device_grid(complete_epoch.device_grid); - return complete_epoch; -} - -std::optional EpochDevicePlacer::get_next_epoch() -{ - std::optional active_epoch_placement = std::nullopt; - GridShape device_grid_shape(config.get_available_rows_on_device(), config.device_grid.columns); - this->active_device_grid = device_grid::create_empty_device_grid(device_grid_shape.rows, device_grid_shape.columns); - - while (not this->remaining_op_groups.empty() or not this->op_groups_to_place_again.empty()) - { - // try to place current op-group, if not possible - - const auto& op_group_to_place = this->op_groups_to_place_again.empty() ? - this->remaining_op_groups.front() : this->op_groups_to_place_again.front(); - - bool contains_single_op_in_group = op_group_to_place.op_names.size() == 1; - TT_ASSERT(op_group_to_place.op_names.size() >= 1); - if (op_group_to_place.op_names.size() > 1) - { - // We don't support constraints applied on an OpGroup with multiple ops - for (const auto& op : op_group_to_place.op_names) - { - TT_ASSERT(this->op_to_device_grid_constraint.find(op) == this->op_to_device_grid_constraint.end()); - } - } - const auto& op = op_group_to_place.op_names.at(0); - - if (op_group_to_place.increment_epoch and not this->active_op_placements.empty()) - { - return this->complete_epoch(); - } - - try - { - log_debug(LogPlacer, "trying to place op_group: {}", op_group_to_place.op_names); - auto op_device_placements = this->place_on_grid(op_group_to_place); - if (contains_single_op_in_group and this->op_to_device_grid_constraint.find(op) != this->op_to_device_grid_constraint.end()) - { - TT_ASSERT(this->op_groups_to_place_again.empty()); - log_trace(LogPlacer, "erasing constraint for: {}", op); - this->op_to_device_grid_constraint.erase(op); - } - - for (const auto& op_device_placement : op_device_placements) - { - this->active_op_placements.push_back(op_device_placement); - } - this->placed_op_groups.push_back(op_group_to_place); - - if (this->op_groups_to_place_again.empty()) - { - this->remaining_op_groups.pop_front(); - } - else - { - this->op_groups_to_place_again.pop_front(); - } - - } - catch (const FailToPlaceOnCurrentEpoch& e) - { - return this->complete_epoch(); - } - catch (const FailToSatisfyPlacementConstraint& e) - { - // Replay epoch placement with the constraint - log_debug(LogPlacer, "failing to place {} because of existing constraints. 
adding constraint", op); - auto constraint_grid = device_grid::create_empty_device_grid(device_grid_shape.rows, device_grid_shape.columns); - TT_ASSERT(config.op_to_overrides.find(op) != config.op_to_overrides.end()); - const auto& op_override = config.op_to_overrides.at(op); - auto op_grid_shape = this->config.op_to_grid_shape.at(op); - if (op_override.transpose_op){ - op_grid_shape = GridShape(op_grid_shape.columns, op_grid_shape.rows); - } - device_grid::fill_device_grid_with_placement(constraint_grid, op_override.grid_start.value(), op_grid_shape); - - for (const auto& active_placement : this->active_op_placements) - { - if (this->config.op_to_overrides.find(active_placement.op_name) != this->config.op_to_overrides.end()) - { - const auto& existing_op_override = config.op_to_overrides.at(active_placement.op_name); - auto existing_op_grid_shape = this->config.op_to_grid_shape.at(active_placement.op_name); - if (existing_op_override.transpose_op){ - existing_op_grid_shape = GridShape(existing_op_grid_shape.columns, existing_op_grid_shape.rows); - } - - if (not device_grid::can_place_on_device_grid(constraint_grid, existing_op_override.grid_start.value(), existing_op_grid_shape)) - { - log_debug(LogPlacer, "Placer: Completing epoch because there's an op in the current epoch that conflicts with constraint. {}", op); - return this->complete_epoch(); - } - } - } - - this->op_to_device_grid_constraint[op] = constraint_grid; - - while (not this->placed_op_groups.empty()) { - const auto& old_op_group = this->placed_op_groups.back(); - this->op_groups_to_place_again.push_front(old_op_group); - this->placed_op_groups.pop_back(); - } - this->active_op_placements.clear(); - this->active_device_grid = device_grid::create_empty_device_grid(device_grid_shape.rows, device_grid_shape.columns); - } - catch (const FailToSatisfyConflictingConstraint& e) - { - log_fatal("Caught FailToSatisfyConflictingConstraint: {}", e.what()); - } - } - if (not this->active_op_placements.empty()) - { - return this->complete_epoch(); - } - return active_epoch_placement; -} - -std::vector EpochDevicePlacer::place(const vector& op_groups) -{ - log_debug(LogPlacer, "ops tagged for chip_break: {}", this->config.ops_tagged_for_chip_id_break); - log_debug(LogPlacer, "ops tagged for epoch_break: {}", this->config.ops_tagged_for_epoch_break); - - log_debug(LogPlacer, "ops tagged for override:"); - for (const auto& [op, override] : this->config.op_to_overrides) - { - log_debug(LogPlacer, - "ops tagged for override: {}, override={}", op, override - ); - } - - std::vector epochs; - this->enqueue_workload(op_groups); - - for (auto epoch_placement = get_next_epoch(); epoch_placement.has_value();) - { - epochs.push_back(epoch_placement.value()); - epoch_placement = get_next_epoch(); - } - return epochs; -} - -} // namespace placer -} // namespace tt diff --git a/pybuda/csrc/placer/grid_placer.hpp b/pybuda/csrc/placer/grid_placer.hpp deleted file mode 100644 index 3fcc4b7d1..000000000 --- a/pybuda/csrc/placer/grid_placer.hpp +++ /dev/null @@ -1,97 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "placer/placer.hpp" -#include -#include - -namespace tt::placer { - -// In the future, refactor this into a class.. 
-// if DeviceGrid[i][j] == 0, then we consider the core at location (i,j) to be free/available -using DeviceGrid = vector>; - -// Functions on Device Grid -namespace device_grid -{ -DeviceGrid create_empty_device_grid(uint32_t rows, uint32_t columns); -DeviceGrid superposition(const DeviceGrid& a, const DeviceGrid& b); -bool can_place_on_device_grid(const DeviceGrid& device_grid, const Coord& start, const GridShape& shape); -bool contains_empty_device_grid(const DeviceGrid& device_grid); -void fill_device_grid_with_placement(DeviceGrid& device_grid, const Coord& op_start, const GridShape& op_grid_shape); -void print_device_grid(const DeviceGrid& device_grid); -std::optional get_next_grid_coordinate(const DeviceGrid& device_grid, const GridShape& op_grid_shape); -} // namespace device_grid - -struct DeviceGridPlacement { - std::string op_name; - uint32_t device_grid_index; - CoordRange placed_cores; - bool grid_transpose; -}; - -struct EpochDeviceGridPlacement -{ - DeviceGrid device_grid; - vector op_placements; - - EpochDeviceGridPlacement(DeviceGrid&& device_grid) : device_grid(std::move(device_grid)) {} - EpochDeviceGridPlacement(const DeviceGrid& device_grid) : device_grid(device_grid) {} - EpochDeviceGridPlacement(const DeviceGrid& device_grid, const vector& op_placements) - : device_grid(device_grid), op_placements(op_placements) {} -}; - -// Notes: -// op_to_grid_shape := op to grid_shapes that we need to place -// device_grid := device_grid containing current view of placed ops -// [[optional]] starting_coordinate := coordinate from where to start placing -// -// Returns the grid coordinate location of the last placed op. -std::tuple, Coord> place_on_grid( - const OpGroupToPlace& op_group_to_place, - const unordered_map& op_to_grid_shape, - const DeviceGrid& device_grid, - const std::unordered_map &op_to_overrides, - const bool enable_auto_transposing_placement, - std::optional starting_coordinate = std::nullopt); - -// Grid-placer API that attempts to place an op in the current epoch. It never -// moves to a new epoch on its own. -std::optional place_one_op( - const string op, - const unordered_map& op_to_grid_shape, - const DeviceGrid& device_grid, - const std::unordered_map &op_to_overrides, - const bool enable_auto_transposing_placement, - std::optional starting_coordinate = std::nullopt); - - -// Constraint-based iterative grid-placer: -// Given {op_groups, constraints on grid-location of ops}, just return -// a fully-populated device-grid epoch. 
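A minimal sketch of how the constraint-based placer declared just below is meant to be driven, assuming the PlacerConfig, OpGroupToPlace and EpochDeviceGridPlacement types from this (now removed) header; place() enqueues the op groups and internally drains get_next_epoch() until nothing remains:

#include <vector>

#include "placer/grid_placer.hpp"
#include "placer/placer.hpp"
#include "utils/logger.hpp"

// Sketch only: run the epoch-by-epoch placer over a workload and inspect the
// fully-populated device-grid epochs it returns.
std::vector<tt::placer::EpochDeviceGridPlacement> run_epoch_placer(
    const tt::placer::PlacerConfig& config,
    const std::vector<tt::placer::OpGroupToPlace>& op_groups)
{
    tt::placer::EpochDevicePlacer placer(config);
    // place() enqueues the workload and completes one device-grid epoch at a
    // time via get_next_epoch() until no op groups are left.
    std::vector<tt::placer::EpochDeviceGridPlacement> epochs = placer.place(op_groups);
    for (const auto& epoch : epochs)
    {
        // Each entry carries the filled device grid plus the per-op core
        // ranges (and transpose flags) placed on that epoch.
        log_debug(tt::LogPlacer, "epoch holds {} op placements", epoch.op_placements.size());
    }
    return epochs;
}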
-class EpochDevicePlacer -{ - const PlacerConfig& config; - std::deque op_groups_to_place_again; - std::deque remaining_op_groups; - - std::deque placed_op_groups; - std::vector active_op_placements; - DeviceGrid active_device_grid; - - std::unordered_map op_to_device_grid_constraint; - - void clear_state(); - EpochDeviceGridPlacement complete_epoch(); - std::vector place_on_grid(const OpGroupToPlace& op_group_to_place); - void enqueue_workload(const vector& op_groups); - - public: - EpochDevicePlacer(const PlacerConfig& config) : config(config) {} - std::optional get_next_epoch(); - std::vector place(const vector& op_groups); -}; - -} // end namespace tt::placer diff --git a/pybuda/csrc/placer/host_memory.cpp b/pybuda/csrc/placer/host_memory.cpp deleted file mode 100644 index 48f1fec8e..000000000 --- a/pybuda/csrc/placer/host_memory.cpp +++ /dev/null @@ -1,201 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "placer/host_memory.hpp" - -#include "backend_api/device_config.hpp" -#include "balancer/balancer.hpp" -#include "balancer/types.hpp" -#include "graph_lib/node_types.hpp" -#include "placer/allocator_utils.hpp" -#include "utils/logger.hpp" - -namespace tt::placer -{ - -HostMemoryPlacerConfig::HostMemoryPlacerConfig( - const DeviceConfig& device_config, bool input_queues_on_host, bool output_queues_on_host) -{ - if (input_queues_on_host) - { - if (device_config.is_grayskull()) - { - log_warning( - LogPlacer, - "Compilation Option with input queue placed on host, but Grayskull does not support fast device reads " - "from host. Placer opting to allocate the queue on device instead."); - input_queues_on_host = false; - } - if ((device_config.is_wormhole() || device_config.is_wormhole_b0()) && device_config.chip_ids.size() > 1) - { - log_warning( - LogPlacer, - "Compilation Option with input queue placed on host, but Wormhole does not support fast device reads " - "from host in multi-chip systems. 
Placer opting to allocate the queue on device instead."); - input_queues_on_host = false; - } - } - - this->input_queues_on_host = input_queues_on_host; - this->output_queues_on_host = output_queues_on_host; - - for (std::uint32_t i = 0; i < device_config.get_host_memory_num_channels(); i++) - { - this->host_memory_regions.emplace_back( - i, device_config.get_host_memory_channel_start_address(), device_config.get_host_memory_channel_size(i)); - } -} - -bool HostMemoryPlacerConfig::place_input_queues_on_host() const { return this->input_queues_on_host; } -bool HostMemoryPlacerConfig::place_output_queues_on_host() const { return this->output_queues_on_host; } - -std::pair get_host_queue_grid( - const HostMemoryPlacerConfig &config, - const PlacerSolution &placer_solution, - const OpPlacement &placement, - const graphlib::Graph *graph, - const graphlib::Node *node, - CoordRange queue_coord_range) -{ - GridShape queue_grid; - if (node->node_type() == graphlib::NodeType::kInput) - { - queue_grid = placer_solution.input_queue_to_grid_shape.at(node->name()); - // Adjust the range to queue grid - queue_coord_range.end.row = queue_coord_range.start.row + queue_grid.rows; - queue_coord_range.end.col = queue_coord_range.start.col + queue_grid.columns; - } - else if (is_output_host_queue(config, graph, node)) - { - queue_grid = GridShape(1, 1); - queue_coord_range.end.row = queue_coord_range.start.row + 1; - queue_coord_range.end.col = queue_coord_range.start.col + 1; - } - else - { - bool grid_transpose = placement.grid_transpose; - queue_grid = GridShape( - (grid_transpose) ? queue_coord_range.size_c() : queue_coord_range.size_r(), - (grid_transpose) ? queue_coord_range.size_r() : queue_coord_range.size_c()); - } - return {queue_coord_range, queue_grid}; -} - -std::string get_host_input_name( - const graphlib::Graph *graph, const graphlib::Node *ref_node, const graphlib::Node *node) -{ - // Loopback queue (i.e. queue that optmizer writes to) should not have 'HOST' as their input even - // though the host will be initializing them. - std::string input_name = ref_node->name(); - if (node->node_type() == graphlib::NodeType::kInput) - { - std::vector loopback_edges = graph->operand_edges( - node, [](graphlib::Edge e) { return e.edge_type == graphlib::EdgeType::kDataLoopback; }); - if (loopback_edges.size() > 0) - { - input_name = graph->node_by_id(loopback_edges[0].producer_node_id)->name(); - } - else - { - input_name = "HOST"; - } - } - return input_name; -} -QueuePlacement get_queue_placement( - const HostMemoryPlacerConfig &config, - HostMemoryAllocator &allocator, - const graphlib::Graph *graph, - const graphlib::Node *node, - const graphlib::Node *ref_node, - const PlacerSolution &placer_solution, - const balancer::BalancerSolution &balancer_solution) -{ - GridShape queue_grid; - OpPlacement placement; - CoordRange queue_coord_range; - balancer::BlockShape block_shape; - - try - { - placement = placer_solution.name_to_op_placement.at(ref_node->name()); - queue_coord_range = placement.placed_cores; - if (ref_node->get_type() == "BudaOp::ethernet_datacopy") - { - auto const &grid_shape = balancer_solution.op_models.at(ref_node->name()).grid_shape; - queue_coord_range = CoordRange{ - .start = Coord{.row = 0, .col = 0}, - .end = Coord{.row = (uint32_t)grid_shape.r, .col = (uint32_t)grid_shape.c}}; - } - block_shape = balancer_solution.block_shapes.at( - (node->node_type() == graphlib::NodeType::kQueue) ? 
ref_node->name() : node->name()); - if (node->node_type() == graphlib::NodeType::kQueue and - balancer_solution.op_models.at(ref_node->name()).has_sparse_buffer()) - { - TT_ASSERT((queue_coord_range.size_c() % 2) == 0); - queue_coord_range.end.col = queue_coord_range.start.col + (queue_coord_range.size_c() / 2); - } - } - catch (std::out_of_range &e) - { - throw std::runtime_error( - "Placement for node " + ref_node->name() + " from queue " + node->name() + " is missing."); - } - - bool output_host_queue = is_output_host_queue(config, graph, node); // only output tensors to host are untilized - bool untilize = output_host_queue; - if (output_host_queue) - { - block_shape = balancer_solution.block_shapes.at(node->name()); - } - std::tie(queue_coord_range, queue_grid) = - get_host_queue_grid(config, placer_solution, placement, graph, node, queue_coord_range); - std::uint32_t queue_size = get_queue_size(node->as(), block_shape, untilize); - - return QueuePlacement{ - .name = node->name(), - .input_name = get_host_input_name(graph, ref_node, node), - .grid_shape = queue_grid, - .on_host = true, - .chip_id = placement.chip_id, - .dram_buffers = {}, - .host_buffers = allocator.allocate_queue(node, queue_coord_range, queue_size)}; -} - -void place_host_queues( - const HostMemoryPlacerConfig &host_memory_config, - HostMemoryAllocator &host_memory_allocator, - const graphlib::Graph *graph, - PlacerSolution &placer_solution, - balancer::BalancerSolution &balancer_solution) -{ - for (Node *node : graphlib::topological_sort(*graph)) - { - if (Node *ref_node = get_reference_node(graph, node); - ref_node != nullptr and is_host_queue(host_memory_config, graph, node)) - { - // Output and intermediate queues depend on the producer grid. There can only be one writer to the queue. 
- // If the output queue is to be placed on host, then no allocation is needed - bool already_placed = is_queue_already_placed(placer_solution, node); - bool already_allocated = is_queue_already_allocated(placer_solution, node); - if (already_allocated || already_placed) - { - log_trace(LogPlacer, "Skipping queue {} since it is already allocated.", node->name()); - continue; - } - - placer_solution.name_to_queue_placement.insert(std::make_pair( - node->name(), - get_queue_placement( - host_memory_config, - host_memory_allocator, - graph, - node, - ref_node, - placer_solution, - balancer_solution))); - } - } -} - -} // namespace tt::placer diff --git a/pybuda/csrc/placer/host_memory.hpp b/pybuda/csrc/placer/host_memory.hpp deleted file mode 100644 index b7454a1a2..000000000 --- a/pybuda/csrc/placer/host_memory.hpp +++ /dev/null @@ -1,62 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include - -#include "placer/host_memory_allocator.hpp" - -namespace tt -{ -struct DeviceConfig; -namespace graphlib -{ -class Graph; -class Node; -} // namespace graphlib -namespace balancer -{ -struct BalancerSolution; -} - -namespace placer -{ - -class HostChannelMemoryRegion -{ - std::uint32_t host_channel_id; - std::uint32_t host_channel_start_addr; - std::uint32_t host_channel_size; - - public: - HostChannelMemoryRegion( - std::uint32_t host_channel_id, std::uint32_t host_channel_start_addr, std::uint32_t host_channel_size) : - host_channel_id(host_channel_id), - host_channel_start_addr(host_channel_start_addr), - host_channel_size(host_channel_size) - { - } - std::uint32_t get_host_channel_id() const { return host_channel_id; } - std::uint32_t get_host_channel_start_addr() const { return host_channel_start_addr; } - std::uint32_t get_host_channel_size() const { return host_channel_size; } -}; - -// HostMemory is system memory that is memory mapped onto the device. -// Host memory is divided into host channels, which are contiguous regions of memory. -struct HostMemoryPlacerConfig -{ - std::vector host_memory_regions; - bool input_queues_on_host; - bool output_queues_on_host; - - public: - HostMemoryPlacerConfig(const DeviceConfig& device_config, bool input_queues_on_host, bool output_queues_on_host); - bool place_input_queues_on_host() const; - bool place_output_queues_on_host() const; - std::size_t get_num_host_channels() const { return host_memory_regions.size(); } -}; - -} // namespace placer -} // namespace tt diff --git a/pybuda/csrc/placer/host_memory_allocator.cpp b/pybuda/csrc/placer/host_memory_allocator.cpp deleted file mode 100644 index 1958d0154..000000000 --- a/pybuda/csrc/placer/host_memory_allocator.cpp +++ /dev/null @@ -1,98 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "placer/host_memory_allocator.hpp" - -#include "balancer/balancer.hpp" -#include "graph_lib/node.hpp" -#include "placer/allocator_utils.hpp" -#include "placer/host_memory.hpp" -#include "third_party/budabackend/common/param_lib.hpp" - -namespace tt::placer -{ - -// NB: To ensure device->host writes are 64B aligned(PCIE controller w/ 512-bit interface), we need to allocate -// addresses that are odd multiples of 32 bytes because we need to include the 32 byte tile header. -// See BBE#2175 for more details. 
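Purely as an illustration of the alignment rule restated in the comment above (and implemented by align_host_address just below): round the address up to a 32-byte boundary, then bump even multiples of 32 to the next odd multiple, so the payload that follows the 32-byte tile header starts on a 64-byte boundary.

#include <cassert>
#include <cstdint>

// Standalone restatement of the host-address alignment rule (sketch only).
static std::uint32_t align_to_odd_multiple_of_32(std::uint32_t address)
{
    constexpr std::uint32_t alignment = 32;
    address = (address + alignment - 1) & ~(alignment - 1);  // round up to 32B
    if ((address / alignment) % 2 == 0)                      // even multiple of 32?
        address += alignment;                                // bump to the odd one
    return address;
}

int main()
{
    assert(align_to_odd_multiple_of_32(0) == 32);    // 1 * 32
    assert(align_to_odd_multiple_of_32(64) == 96);   // 3 * 32
    assert(align_to_odd_multiple_of_32(100) == 160); // 5 * 32
    // Skipping the 32-byte tile header lands the payload on a 64-byte boundary.
    assert((align_to_odd_multiple_of_32(100) + 32) % 64 == 0);
    return 0;
}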
-inline static std::uint32_t align_host_address(std::uint32_t address) -{ - constexpr std::uint32_t alignment = 32; - // Add alignment to the address to ensure we go to the next multiple if not already at one - address += (alignment - 1); - - // Align to the next even multiple of `alignment` - address &= ~uintptr_t(alignment - 1); - - // Check if the result is an odd multiple; if not, add another `alignment` - if ((address / alignment) % 2 == 0) - { - address += alignment; - } - - return address; -} - -std::uint32_t HostMemoryAllocator::get_current_allocation_address() const -{ - return align_host_address(this->current_allocation_address); -} - -void HostMemoryAllocator::increment_allocation_address(const std::uint32_t size) -{ - this->current_allocation_address = align_host_address(this->get_current_allocation_address() + size); -} - -std::pair HostMemoryAllocator::allocate_memory(const graphlib::Node* node, std::uint32_t queue_size) -{ - std::uint32_t allocated_channel = this->get_current_allocation_channel(); - std::uint32_t allocated_address = this->get_current_allocation_address(); - - if (allocated_address + queue_size > this->config.host_memory_regions.at(allocated_channel).get_host_channel_size()) - { - // Fallback to existing allocation scheme: allocate on next channel until we run out of channels - if (allocated_channel >= this->config.host_memory_regions.size() - 1) - { - log_fatal(tt::LogPlacer, "Host queue {} of address {} + size {} = {} exceeds maximum allocatable address {} on host channel {}", - node->name(), allocated_address, queue_size, this->current_allocation_address, this->config.host_memory_regions.at(allocated_channel).get_host_channel_size(), allocated_channel); - } - allocated_channel++; - this->current_allocation_channel = allocated_channel; - this->current_allocation_address = - this->config.host_memory_regions.at(allocated_channel).get_host_channel_start_addr(); - return allocate_memory(node, queue_size); - } - this->increment_allocation_address(queue_size); - - return {allocated_channel, allocated_address}; -} - -std::vector HostMemoryAllocator::allocate_queue( - const graphlib::Node *node, CoordRange const &queue_grid, std::uint32_t queue_size) -{ - std::vector buffer_placement; - for (std::uint32_t row = queue_grid.start.row; row < queue_grid.end.row; row++) - { - for (std::uint32_t col = queue_grid.start.col; col < queue_grid.end.col; col++) - { - auto [allocated_channel, allocated_address] = this->allocate_memory(node, queue_size); - - buffer_placement.push_back(QueueHostBufferPlacement{ - .channel = allocated_channel, - .address = allocated_address, - .buffer_size = queue_size, - }); - log_debug( - tt::LogPlacer, - "Placing host queue {} of size {}, channel {} address {}", - node->name(), - queue_size, - allocated_channel, - allocated_address); - } - } - - return buffer_placement; -} - -} // namespace tt::placer \ No newline at end of file diff --git a/pybuda/csrc/placer/host_memory_allocator.hpp b/pybuda/csrc/placer/host_memory_allocator.hpp deleted file mode 100644 index 403c56f36..000000000 --- a/pybuda/csrc/placer/host_memory_allocator.hpp +++ /dev/null @@ -1,56 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 - -#pragma once - -#include - -#include "placer/host_memory.hpp" -#include "placer/placer.hpp" - -namespace tt -{ -struct DeviceConfig; - -namespace graphlib -{ -class Graph; -class Node; -} // namespace graphlib -namespace balancer -{ -struct BalancerSolution; -} - -namespace placer -{ - 
-struct HostMemoryPlacerConfig; -class HostMemoryAllocator -{ - const HostMemoryPlacerConfig &config; - std::uint32_t current_allocation_channel; - std::uint32_t current_allocation_address; - - public: - HostMemoryAllocator(const HostMemoryPlacerConfig &config, std::uint32_t current_allocation_address) - : config(config), current_allocation_channel(0), current_allocation_address(current_allocation_address) {} - - std::uint32_t get_current_allocation_channel() const { return current_allocation_channel; } - std::uint32_t get_current_allocation_address() const; - void increment_allocation_address(const std::uint32_t size); - - std::pair allocate_memory(const graphlib::Node* node, std::uint32_t queue_size); - std::vector allocate_queue( - const graphlib::Node *node, CoordRange const &queue_grid, std::uint32_t queue_size); -}; -void place_host_queues( - const HostMemoryPlacerConfig &host_memory_config, - HostMemoryAllocator &host_memory_allocator, - const graphlib::Graph *graph, - PlacerSolution &placer_solution, - balancer::BalancerSolution &balancer_solution); - -} // namespace placer -} // namespace tt diff --git a/pybuda/csrc/placer/interactive_placer.cpp b/pybuda/csrc/placer/interactive_placer.cpp deleted file mode 100644 index f9c66ac8d..000000000 --- a/pybuda/csrc/placer/interactive_placer.cpp +++ /dev/null @@ -1,847 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 - -#include "placer/interactive_placer.hpp" - -#include - -#include "placer/grid_placer.hpp" -#include "placer/lower_to_placer.hpp" -#include "placer/lowering_utils.hpp" -#include "placer/placer.hpp" -#include "utils/assert.hpp" - -namespace tt::placer -{ - -InteractivePlacer::InteractivePlacer(const graphlib::Graph *graph, const balancer::BalancerConfig &config) : - valid(true), config(config) -{ - epoch_id_to_device_grid.rows = config.device_config.grid_size.r; // TODO: get harvested rows - epoch_id_to_device_grid.columns = config.device_config.grid_size.c; - chips_with_mmio = std::unordered_set( - std::begin(config.device_config.chips_with_mmio), - std::end(config.device_config.chips_with_mmio)); - - log_debug(tt::LogPlacer, "config.device_config.arch_name:{}", config.device_config.arch_name); - log_debug(tt::LogPlacer, "config.chip_ids:{}", config.chip_ids); - log_debug(tt::LogPlacer, "config.chips_with_mmio:{}", config.device_config.chips_with_mmio); - - TT_LOG_ASSERT( - config.chip_ids.size() == 1 || config.device_config.arch_name == "wormhole" || - config.device_config.arch_name == "wormhole_b0", - "Interactive placer for multi-chip - unsupported architecture: {}", - config.device_config.arch_name); - - if (env_as("PYBUDA_WORMHOLE_PIPELINED_PLACER") == false && - (config.device_config.arch_name == "wormhole" || config.device_config.arch_name == "wormhole_b0")) - { - // iterate over chip_ids in round-robin for WH placements, non-mmio chips first - sorted_chip_ids = placer::lowering::apply_chip_placement_policy(config.device_config, config.chip_placement_policy, config.chip_ids); - } - else - { - // single-chip assignment for non-WH or for pipelined placement - sorted_chip_ids.push_back(0); - } - - log_debug(tt::LogPlacer, "sorted_chip_ids: {}", sorted_chip_ids); - - if (graph) - { - output_ops = placer::lowering::get_output_nodes(graph); - } - - current_epoch_index = 0; - current_epoch_type = NodeEpochType::Forward; - current_temporal_epoch_id = 0; - current_spatial_epoch_id = 0; - init_epoch(); -} - -// insert empty graphs if there are unused chip_ids in the 
current temporal epoch -// arguments are spatial_epoch_id and temporal_epoch_id of the first empty graph to be inserted -void InteractivePlacer::insert_empty_graphs(std::uint32_t spatial_epoch_id, std::uint32_t temporal_epoch_id) -{ - while (remaining_chip_ids_in_temporal_epoch.size()) - { - ChipId chip_id = remaining_chip_ids_in_temporal_epoch.front(); - remaining_chip_ids_in_temporal_epoch.pop_front(); - log_debug( - tt::LogPlacer, - "empty graph - current_epoch_index:{} temporal_epoch_id:{} spatial_epoch_id:{} chip_id:{}", - current_epoch_index, - temporal_epoch_id, - spatial_epoch_id, - chip_id); - TT_ASSERT(spatial_epoch_id < sorted_chip_ids.size()); - - epoch_id_to_chip[current_epoch_index] = chip_id; - epoch_id_to_epoch_info[current_epoch_index] = EpochInfo{ - .global_epoch_id = current_epoch_index, - .temporal_epoch_id = temporal_epoch_id, - .spatial_epoch_id = spatial_epoch_id, - .epoch_type = current_epoch_type}; - epoch_id_to_subgraph_index[current_epoch_index] = 0; - epoch_id_to_op_placement[current_epoch_index].clear(); - current_epoch_index++; - spatial_epoch_id++; - } -} - -// compute current_chip_id for the next spatial epoch with global spatial id=current_epoch_index -// start_temporal_epoch: true if the next epoch is the first epoch in the temporal epoch -// new_temporal_epoch: true if this is the first time we are processing this temporal epoch, i.e. it is not a rewind -void InteractivePlacer::next_chip_id(bool start_temporal_epoch, bool new_temporal_epoch, std::optional> requested_chip_ids) -{ - // repopulate chip ids for the temporal epoch - if (start_temporal_epoch) - { - // if we finished a temporal epoch and starting a new one with next_epoch, - // remaining_chip_ids_in_temporal_epoch should be empty - // but in case next_epoch is forced with an epoch_break or we rewind epochs, - // remaining_chip_ids_in_temporal_epoch will not be empty and we should clear it - // if remaining chip_ids are not empty (i.e. 
we are not rewinding) insert empty graphs for them - if (remaining_chip_ids_in_temporal_epoch.size() && new_temporal_epoch) - { - // current_temporal_epoch_id is already incremented for the new temporal epoch - insert_empty_graphs( - epoch_id_to_epoch_info.at(current_epoch_index - 1).spatial_epoch_id + - 1, // last spatial_epoch_id in temporal epoch - current_temporal_epoch_id - - 1 // current_temporal_epoch_id is already incremented for the new temporal epoch - ); - } - remaining_chip_ids_in_temporal_epoch.clear(); - - if (env_as("PYBUDA_PLACER_SNAKE") && (current_temporal_epoch_id % 2) == 1) - { - // every odd temporal epoch, iterate chip ids in reverse order - // so that we start with the chip id we ended the previous temporal epoch - std::copy( - sorted_chip_ids.rbegin(), - sorted_chip_ids.rend(), - std::inserter(remaining_chip_ids_in_temporal_epoch, remaining_chip_ids_in_temporal_epoch.begin())); - } - else - { - std::copy( - sorted_chip_ids.begin(), - sorted_chip_ids.end(), - std::inserter(remaining_chip_ids_in_temporal_epoch, remaining_chip_ids_in_temporal_epoch.begin())); - } - } - - // check if any of the requested chip ids is in remaining_chip_ids_in_temporal_epoch, then use it - // otherwise get the next chip id from remaining_chip_ids_in_temporal_epoch - ChipId requested_chip_id = INVALID_CHIP_ID; - if(requested_chip_ids.has_value()) { - for(auto& chip_id: requested_chip_ids.value()) { - if(std::find( - remaining_chip_ids_in_temporal_epoch.begin(), - remaining_chip_ids_in_temporal_epoch.end(), - chip_id) != remaining_chip_ids_in_temporal_epoch.end()) { - requested_chip_id = chip_id; - break; - } - } - } - - if(requested_chip_id != INVALID_CHIP_ID) { - current_chip_id = requested_chip_id; - remaining_chip_ids_in_temporal_epoch.erase( - std::remove( - remaining_chip_ids_in_temporal_epoch.begin(), - remaining_chip_ids_in_temporal_epoch.end(), - requested_chip_id), - remaining_chip_ids_in_temporal_epoch.end()); - } - else { - // get a chip id to use - // the requirement for picking a chip_id is not to repeat chip_ids in a temporal epoch - // i.e. we need to end the temporal epoch once all chip ids are used - // a simple algorihm used here is to pop from a deque. - // once all all last chip ids are used, a new temporal epoch will start. - current_chip_id = remaining_chip_ids_in_temporal_epoch.front(); - remaining_chip_ids_in_temporal_epoch.pop_front(); - } - - is_current_chip_id_mmio = chips_with_mmio.count(current_chip_id); -} - -// returns true if the op can be placed on current_chip_id -bool InteractivePlacer::can_place_op_onto_chip(const std::string &op_name, bool chip_break, std::vector& requested_chip_ids) -{ - bool output_op = config.output_queues_on_host && output_ops.find(op_name) != output_ops.end(); - if (output_op) - { - log_debug(tt::LogPlacer, "epoch {} contains output_op: {}", current_epoch_index, op_name); - } - if (chip_break) - { - log_debug(tt::LogPlacer, "epoch {} contains chip_break_op: {}", current_epoch_index, op_name); - } - - // place output ops only on mmio chips - bool skip_due_to_output_op = output_op && is_current_chip_id_mmio == false; - // skip a spatial epoch if a chip break is requested - bool skip_due_to_chip_break = chip_break && placed_ops_in_current_epoch.size() && - visited_ops_in_current_epoch.find(op_name) == visited_ops_in_current_epoch.end(); - ChipId override_chip_id = - config.op_name_to_placer_overrides.find(op_name) != config.op_name_to_placer_overrides.end() && - config.op_name_to_placer_overrides.at(op_name).chip_id.has_value() - ? 
config.op_name_to_placer_overrides.at(op_name).chip_id.value() - : INVALID_CHIP_ID; - // skip if op has a chip id override which is not current_chip_id - bool skip_due_to_chip_id_override = override_chip_id != INVALID_CHIP_ID && override_chip_id != current_chip_id; - TT_ASSERT( - override_chip_id == INVALID_CHIP_ID || skip_due_to_output_op == false || chips_with_mmio.count(override_chip_id), - "Op has override chip id but must be placed onto mmio chip"); - // only output ops are placed on mmio chips on Nebula+Galaxy systems - bool skip_if_not_output = env_as("PYBUDA_NEBULA_GALAXY_PLACER") && output_op == false && - is_current_chip_id_mmio == true && sorted_chip_ids.size() != 1; - - // if we are using pipelined placer, we insert an implicit epoch break on the output op - // so that the output op should be the sole op on the epoch so it can be placed on the mmio chip - bool skip_due_to_epoch_break_for_output_op = - env_as("PYBUDA_NEBULA_GALAXY_PLACER") && env_as("PYBUDA_WORMHOLE_PIPELINED_PLACER") && output_op && - placed_ops_in_current_epoch.size() && - visited_ops_in_current_epoch.find(op_name) == visited_ops_in_current_epoch.end(); - - // request a chip id from the chip_id assignment for the next epoch - if(skip_due_to_chip_id_override) { - requested_chip_ids.push_back(override_chip_id); - } - else if(skip_due_to_output_op) { - // request an mmio chip id for next attempt - std::transform( - config.device_config.chips_with_mmio.begin(), - config.device_config.chips_with_mmio.end(), - std::inserter(requested_chip_ids, requested_chip_ids.begin()), - [](int chip_id){ return (ChipId)chip_id; } - ); - } - - return skip_due_to_output_op == false && skip_due_to_chip_break == false && skip_due_to_chip_id_override == false && - skip_if_not_output == false && skip_due_to_epoch_break_for_output_op == false; -} - -// initialize a spatial epoch with the epoch_index (i.e. global epoch id) -void InteractivePlacer::init_epoch(bool start_temporal_epoch, bool new_temporal_epoch, std::optional> requested_chip_ids) -{ - next_chip_id(start_temporal_epoch, new_temporal_epoch, requested_chip_ids); - - epoch_id_to_epoch_info[current_epoch_index] = EpochInfo{ - .global_epoch_id = current_epoch_index, - .temporal_epoch_id = current_temporal_epoch_id, - .spatial_epoch_id = current_spatial_epoch_id, - .epoch_type = current_epoch_type}; - - if(env_as("PYBUDA_NEBULA_GALAXY_PLACER") && is_current_chip_id_mmio) - { - // On Nebula+Galaxy systems, Nebula chip is mmio and it is harvested - epoch_id_to_device_grid.initialize_device_grid( - current_epoch_index, - config.device_config.get_harvested_nebula_galaxy_grid().r, - config.device_config.get_harvested_nebula_galaxy_grid().c - ); - } - else - { - epoch_id_to_device_grid.initialize_device_grid( - current_epoch_index, - config.device_config.grid_size.r, - config.device_config.grid_size.c - ); - } - - epoch_id_to_op_placement[current_epoch_index].clear(); - epoch_id_to_chip[current_epoch_index] = current_chip_id; - epoch_id_to_subgraph_index[current_epoch_index] = 0; - if (start_temporal_epoch) - { - placed_ops_in_current_epoch.clear(); - visited_ops_in_current_epoch.clear(); - } - log_debug( - tt::LogPlacer, - "init_epoch - current_epoch_index:{} current_chip_id:{} current_temporal_epoch_id:{} " - "current_spatial_epoch_id:{} remaining_chip_ids_in_temporal_epoch.size:{}", - current_epoch_index, - current_chip_id, - current_temporal_epoch_id, - current_spatial_epoch_id, - remaining_chip_ids_in_temporal_epoch.size()); -} - -// Place single op on current epoch. 
Returns nullopt if it doesn't fit. -std::optional InteractivePlacer::place_op( - const std::string &op_name, const balancer::GridShape &shape, bool enable_transpose, bool chip_break) -{ - return place_op( - op_name, placer::GridShape({(std::uint32_t)shape.r, (std::uint32_t)shape.c}), enable_transpose, chip_break); -} - -std::optional InteractivePlacer::place_op( - const std::string &op_name, const placer::GridShape &shape, bool enable_transpose, bool chip_break) -{ - TT_ASSERT(valid); - std::unordered_map to_place; - to_place[op_name] = shape; - - log_debug( - tt::LogPlacer, - "Interactive placer start for op {}, grid ({}, {})", op_name, shape.rows, shape.columns); - - std::optional placement = place_one_op( - op_name, - config.enable_auto_transposing_placement && enable_transpose, - chip_break, - to_place); - - // cannot place the op on this temporal epoch - if(!placement.has_value()) - { - return std::nullopt; - } - - // Placed, update structures - placed_ops_in_current_epoch.push_back(op_name); - - auto device_grid_placement = placement.value(); - OpPlacement op_placement = OpPlacement{ - .id = 0, - .name = op_name, - .chip_id = current_chip_id, - .global_epoch_id = current_epoch_index, - .grid_transpose = device_grid_placement.grid_transpose, - .placed_cores = device_grid_placement.placed_cores}; - name_to_op_placement[op_placement.name] = op_placement; - epoch_id_to_op_placement[current_epoch_index].push_back(op_placement); - - placer::GridShape op_shape = shape; - if (op_placement.grid_transpose) - { - op_shape = placer::GridShape(shape.columns, shape.rows); - } - - epoch_id_to_device_grid.fill_device_grid_with_placement( - current_epoch_index, device_grid_placement.placed_cores.start, op_shape); - - log_debug( - tt::LogPlacer, - "Interactive placer: op {}, grid ({}, {}) onto chip_id={}, epoch_id={}, inclusive_start: {}, exclusive_end={}", - op_placement.name, - op_shape.rows, - op_shape.columns, - op_placement.chip_id, - op_placement.epoch_id(), - op_placement.placed_cores.start, - op_placement.placed_cores.end); - - return op_placement.placed_cores; -} - -std::optional InteractivePlacer::place_one_op( - const std::string &op_name, bool enable_transpose, bool chip_break, const std::unordered_map& to_place) -{ - std::optional placement; - - // keep trying epochs/chip_ids for the op - // until we either successfully place the op - // or reach the end of the temporal epoch (i.e. fail) - while (!placement.has_value()) - { - std::vector requested_chip_ids; - if (can_place_op_onto_chip(op_name, chip_break, requested_chip_ids)) - { - placement = placer::place_one_op( - op_name, - to_place, - epoch_id_to_device_grid.get_device_grid(current_epoch_index), - config.op_name_to_placer_overrides, - enable_transpose); - } - else - { - log_debug(tt::LogPlacer, "skipping place_op in epoch {}", current_epoch_index); - } - if (!placement.has_value()) - { - // if no chip ids left in the temporal epoch, we cannot place the op - if (remaining_chip_ids_in_temporal_epoch.size() == 0) - { - return std::nullopt; - } - - // for whatever reason, we did not place this op on this spatial epoch, no need to consider it for chip - // break again - visited_ops_in_current_epoch.insert(op_name); - - // initialize the next spatial epoch within the current temporal epoch (with a new chip id) and try again - current_spatial_epoch_id++; - current_epoch_index++; - - // corner case: - // 1. if we were not able to place any ops on the current_chip_id - // 2. a chip id was requested for this op - // 3. 
we will definitely place this on the requested chip id - // put the current_chip_id back in the pool to be used for the remaining ops - // #3 prevents deadlock: if requested_chip_id is not available, - // we will keep inserting current_chip_id to the pool and pop it in a loop - if(placed_ops_in_current_epoch.size() == 0) { - for(auto& requested_chip_id: requested_chip_ids) { - if(std::find(remaining_chip_ids_in_temporal_epoch.begin(), - remaining_chip_ids_in_temporal_epoch.end(), requested_chip_id) != remaining_chip_ids_in_temporal_epoch.end()) { - remaining_chip_ids_in_temporal_epoch.push_front(current_chip_id); - current_spatial_epoch_id--; - current_epoch_index--; - } - } - } - - init_epoch(false /* start_temporal_epoch */, false /* new_temporal_epoch */, requested_chip_ids); - } - - // in Nebula+Galaxy systems, only output_ops will be placed onto mmio chips (Nebula) - // these ops should not use rows 8&9 due to harvesting - // TODO: this should be driven based on the config read from backend - // Also Nebula is not necessarily the mmio chip when we have more than one nebula chips - TT_ASSERT( - env_as("PYBUDA_NEBULA_GALAXY_PLACER") == false || config.output_queues_on_host == false || - output_ops.find(op_name) == output_ops.end() || placement.has_value() == false || - placement.value().placed_cores.end.row <= 8); - } - - return placement; -} - -// Bind and atomically place two ops as if they were one op. -// Row dimension must match. -// -std::optional InteractivePlacer::place_two_ops_rowwise( - const std::string &op_name_1, - const balancer::GridShape &shape_1, - const std::string &op_name_2, - const balancer::GridShape &shape_2, - bool enable_transpose, - bool chip_break) -{ - return place_two_ops_rowwise( - op_name_1, - placer::GridShape((std::uint32_t)shape_1.r, (std::uint32_t)shape_1.c), - op_name_2, - placer::GridShape((std::uint32_t)shape_2.r, (std::uint32_t)shape_2.c), - enable_transpose, - chip_break); -} - -std::optional InteractivePlacer::place_two_ops_rowwise( - const std::string &op_name_1, - const placer::GridShape &shape_1, - const std::string &op_name_2, - const placer::GridShape &shape_2, - bool enable_transpose, - bool chip_break) -{ - TT_ASSERT(valid); - TT_ASSERT(shape_1.rows == shape_2.rows); - std::unordered_map to_place; - to_place[op_name_1] = - placer::GridShape({(std::uint32_t)shape_1.rows, (std::uint32_t)shape_1.columns + shape_2.columns}); - TT_ASSERT(can_fit_on_single_epoch(to_place[op_name_1].rows, to_place[op_name_1].columns, enable_transpose)); - - std::optional placement = place_one_op( - op_name_1, - config.enable_auto_transposing_placement && enable_transpose, - chip_break, - to_place); - - // cannot place the op on this temporal epoch - if(!placement.has_value()) - { - return std::nullopt; - } - - // Placed, update structures. Since we placed two OPs as a single block now we need to unbind them. - // Calculate grid bounds of both ops and uptate the structures accordingly. - // First OP needs to have grid end updated, second OP needs to have grid start updated. - // - placed_ops_in_current_epoch.push_back(op_name_1); - - auto device_grid_placement = placement.value(); - auto device_grid_placement_1 = device_grid_placement; - - // Handle transpose. 
- // - placer::GridShape op_shape_1 = shape_1; - if (device_grid_placement.grid_transpose) - { - op_shape_1 = placer::GridShape(shape_1.columns, shape_1.rows); - } - - device_grid_placement_1.placed_cores.end.row = device_grid_placement.placed_cores.start.row + op_shape_1.rows; - device_grid_placement_1.placed_cores.end.col = device_grid_placement.placed_cores.start.col + op_shape_1.columns; - - OpPlacement op_placement_1 = OpPlacement{ - .id = 0, - .name = op_name_1, - .chip_id = current_chip_id, - .global_epoch_id = current_epoch_index, - .grid_transpose = device_grid_placement_1.grid_transpose, - .placed_cores = device_grid_placement_1.placed_cores}; - name_to_op_placement[op_placement_1.name] = op_placement_1; - epoch_id_to_op_placement[current_epoch_index].push_back(op_placement_1); - - epoch_id_to_device_grid.fill_device_grid_with_placement( - current_epoch_index, device_grid_placement_1.placed_cores.start, op_shape_1); - - log_debug( - tt::LogPlacer, - "Interactive placer: op {}, grid ({}, {}) onto chip_id={}, epoch_id={}, inclusive_start: {}, exclusive_end={}", - op_placement_1.name, - op_shape_1.rows, - op_shape_1.columns, - op_placement_1.chip_id, - op_placement_1.epoch_id(), - op_placement_1.placed_cores.start, - op_placement_1.placed_cores.end); - - // Handle transpose. - // - placer::GridShape op_shape_2 = shape_2; - if (device_grid_placement.grid_transpose) - { - op_shape_2 = placer::GridShape(shape_2.columns, shape_2.rows); - } - - auto device_grid_placement_2 = device_grid_placement; - device_grid_placement_2.placed_cores.start.row = device_grid_placement.placed_cores.end.row - op_shape_2.rows; - device_grid_placement_2.placed_cores.start.col = device_grid_placement.placed_cores.end.col - op_shape_2.columns; - placed_ops_in_current_epoch.push_back(op_name_2); - - OpPlacement op_placement_2 = OpPlacement{ - .id = 0, - .name = op_name_2, - .chip_id = current_chip_id, - .global_epoch_id = current_epoch_index, - .grid_transpose = device_grid_placement_2.grid_transpose, - .placed_cores = device_grid_placement_2.placed_cores}; - name_to_op_placement[op_placement_2.name] = op_placement_2; - epoch_id_to_op_placement[current_epoch_index].push_back(op_placement_2); - - epoch_id_to_device_grid.fill_device_grid_with_placement( - current_epoch_index, device_grid_placement_2.placed_cores.start, op_shape_2); - - log_debug( - tt::LogPlacer, - "Interactive placer: op {}, grid ({}, {}) onto chip_id={}, epoch_id={}, inclusive_start: {}, exclusive_end={}", - op_placement_2.name, - op_shape_2.rows, - op_shape_2.columns, - op_placement_2.chip_id, - op_placement_2.epoch_id(), - op_placement_2.placed_cores.start, - op_placement_2.placed_cores.end); - - return device_grid_placement.placed_cores; -} - -// Create and switch to new epoch. Returns next epoch id. -std::uint32_t InteractivePlacer::next_epoch(graphlib::NodeEpochType epoch_type) -{ - TT_ASSERT(valid); - log_debug(tt::LogPlacer, "InteractivePlacer::next_epoch"); - current_epoch_index++; - current_temporal_epoch_id++; - current_spatial_epoch_id = 0; - current_epoch_type = epoch_type; - init_epoch(); - return current_epoch_index; -} - -// Clear current epoch and start over. Returns the list of ops that were undone, in placed order. 
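A rough usage sketch for the place_op / next_epoch pair defined above, assuming an already constructed InteractivePlacer; the op name and grid shape here are placeholders rather than values taken from this diff:

#include <string>

#include "placer/interactive_placer.hpp"

// Sketch only: try the current epoch first, then fall back to a fresh epoch.
void place_or_open_new_epoch(tt::placer::InteractivePlacer& placer,
                             const std::string& op_name,
                             const tt::placer::GridShape& shape)
{
    // place_op returns std::nullopt when the op does not fit anywhere in the
    // remaining grid of the current epoch.
    auto placed = placer.place_op(op_name, shape, true /* enable_transpose */);
    if (!placed.has_value())
    {
        // Open a new epoch of the same type and retry; an op that fits on an
        // empty device grid is expected to succeed here.
        placer.next_epoch(tt::graphlib::NodeEpochType::Forward);
        placed = placer.place_op(op_name, shape, true /* enable_transpose */);
    }
}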
-std::vector> InteractivePlacer::rewind_epoch_logged() -{ - std::vector> ret; - - log_debug(tt::LogPlacer, "InteractivePlacer::rewind_epoch"); - - for (const std::string &name : placed_ops_in_current_epoch) - { - const OpPlacement &p = name_to_op_placement.at(name); - log_trace(LogPlacer, "Unplacing: {}", name); - - ret.push_back(std::make_pair(name, p)); - name_to_op_placement.erase(name); - } - - // rewind back to the first spatial epoch in the temporal epoch - current_epoch_index -= current_spatial_epoch_id; - current_spatial_epoch_id = 0; - - init_epoch(true /* start_temporal_epoch */, false /* new_temporal_epoch */); // clear the epoch - return ret; -} - -// Clear current epoch and start over. Non-logged fast version. -// -void InteractivePlacer::rewind_epoch() -{ - for (const std::string &name : placed_ops_in_current_epoch) - { - name_to_op_placement.erase(name); - } - - // rewind back to the first spatial epoch in the temporal epoch - current_epoch_index -= current_spatial_epoch_id; - current_spatial_epoch_id = 0; - - init_epoch(true /* start_temporal_epoch */, false /* new_temporal_epoch */); // clear the epoch -} - -// Rewind current epoch to given op - i.e. place everything up to it, but not it. Returns the name -// and shape of the last placed op. -std::pair InteractivePlacer::rewind_to(const std::string &op_name) -{ - std::pair last; - last.first = ""; - - log_trace(LogPlacer, "Rewind to: {}", op_name); - auto rew = rewind_epoch_logged(); - - for (const auto &p : rew) - { - if (p.first == op_name) - return last; - - log_trace(LogPlacer, "Replacing: {}", p.first); - - std::unordered_map::iterator existing_override; - std::unordered_map::iterator rewind_override; - std::optional user_override = std::nullopt; - existing_override = get_op_overrides().find(p.first); - if (existing_override != get_op_overrides().end()) - { - // Save the user override, if any, so we can restore it after rewinding the op. - // - user_override = existing_override->second; - get_op_overrides().erase(existing_override); - } - - bool rewind_override_set; - std::tie(rewind_override, rewind_override_set) = get_op_overrides().emplace( - p.first, - tt::placer::PlacerOpOverride(p.second.placed_cores.start, p.second.grid_transpose, p.second.chip_id)); - TT_ASSERT(rewind_override_set); - - CoordRange untransposed_shape = p.second.placed_cores; - if (p.second.grid_transpose) - { - untransposed_shape.transpose(); - } - - auto pl = place_op(p.first, GridShape({untransposed_shape.size_r(), untransposed_shape.size_c()})); - - get_op_overrides().erase(rewind_override); - if (user_override.has_value()) - { - get_op_overrides().emplace(p.first, user_override.value()); - } - - // Re-placing in same order, on the same epoch, same grid start and size -> it should always fit. - // - TT_LOG_ASSERT(pl.has_value(), "Failed to re-place {} after rewinding.", p.first); - last = p; - } - - TT_THROW("Rewinding to op that doesn't exist"); - return last; -} - -// assign consecutive epochs to the same chip until -// number of epochs per chip is reached. -// e.g. 
-// ep0 means epoch0 -// emp means empty graph -// chip1 chip2 chip3 -// temp epoch 0: ep0 emp emp -// temp epoch 1: ep1 emp emp -// temp epoch 2: ep2 emp emp -// temp epoch 3: emp ep3 emp -// temp epoch 4: emp ep4 emp -// temp epoch 5: emp ep5 emp -// temp epoch 6: emp emp ep6 -// temp epoch 7: emp emp ep7 -// temp epoch 8: emp emp ep8 -void InteractivePlacer::assign_chip_ids_for_pipelined_placement( - std::uint32_t num_epochs, std::optional> const &chip_break_ops) -{ - TT_ASSERT(config.device_config.arch_name == "wormhole" || config.device_config.arch_name == "wormhole_b0"); - TT_ASSERT(chip_break_ops.has_value()); - - log_debug(tt::LogPlacer, "Interactive placer pipelined chip id assignment for {} epochs", num_epochs); - - placed_ops_in_current_epoch.clear(); - visited_ops_in_current_epoch.clear(); - - // iterate over chip_ids in round-robin for WH placements, non-mmio chips first - sorted_chip_ids = placer::lowering::apply_chip_placement_policy(config.device_config, config.chip_placement_policy, config.chip_ids); - - std::uint32_t num_epochs_per_chip = std::ceil(float(num_epochs) / sorted_chip_ids.size()); - - // expecting no chip id placement before - TT_ASSERT(remaining_chip_ids_in_temporal_epoch.size() == 0); - std::copy( - sorted_chip_ids.begin(), - sorted_chip_ids.end(), - std::inserter(remaining_chip_ids_in_temporal_epoch, remaining_chip_ids_in_temporal_epoch.begin())); - - current_temporal_epoch_id = 0; - current_spatial_epoch_id = 0; // spacial_epoch_id within the temporal epoch - std::uint32_t current_chip_id_index = 0; // round-robin over the chip ids - current_chip_id = sorted_chip_ids.at(current_chip_id_index % sorted_chip_ids.size()); - is_current_chip_id_mmio = chips_with_mmio.count(current_chip_id); - - std::uint32_t num_epochs_placed_on_chip = 0; - - // iterate over all the epochs already placed and assign chip ids to them - for (std::uint32_t epoch_index = 0; epoch_index < num_epochs; epoch_index++) - { - bool can_place_epoch_onto_chip = false; - - // keep looking for valid chip_id - while (!can_place_epoch_onto_chip) - { - can_place_epoch_onto_chip = true; - for (auto &placement : epoch_id_to_op_placement[epoch_index]) - { - // unused in this function - // for pipelined assignment, the order of chip ids is fixed - std::vector requested_chip_ids; - can_place_epoch_onto_chip = - can_place_epoch_onto_chip && - can_place_op_onto_chip( - placement.name, chip_break_ops.value().find(placement.name) != chip_break_ops.value().end(), requested_chip_ids); - if (!can_place_epoch_onto_chip) - break; - } - - log_debug( - tt::LogPlacer, - "epoch_index:{} current_epoch_index:{} current_chip_id:{} temporal_epoch_id:{} spatial_epoch_id:{} " - "can_place_epoch_onto_chip:{} num_epochs_per_chip:{}", - epoch_index, - current_epoch_index, - current_chip_id, - current_temporal_epoch_id, - current_spatial_epoch_id, - can_place_epoch_onto_chip, - num_epochs_per_chip); - - if (can_place_epoch_onto_chip) - { - epoch_id_to_chip[epoch_index] = current_chip_id; - epoch_id_to_epoch_info[epoch_index].temporal_epoch_id = current_temporal_epoch_id; - epoch_id_to_epoch_info[epoch_index].spatial_epoch_id = current_spatial_epoch_id; - epoch_id_to_subgraph_index[epoch_index] = 0; - for (auto &op_placement : epoch_id_to_op_placement[epoch_index]) - { - op_placement.chip_id = current_chip_id; - name_to_op_placement[op_placement.name].chip_id = current_chip_id; - } - - // so we do not insert chip breaks for the first op on the chip - for (auto &placement : epoch_id_to_op_placement[epoch_index]) - { - 
placed_ops_in_current_epoch.push_back(placement.name); - } - - num_epochs_placed_on_chip++; - - // remove current_chip_id from remaining_chip_ids_in_temporal_epoch - // TODO: unnecessarily slow code but we do this once per temporal epoch for now - remaining_chip_ids_in_temporal_epoch.erase( - std::remove( - remaining_chip_ids_in_temporal_epoch.begin(), - remaining_chip_ids_in_temporal_epoch.end(), - current_chip_id), - remaining_chip_ids_in_temporal_epoch.end()); - - // end temporal epoch by assigning empty graphs to all the other chips - insert_empty_graphs(current_spatial_epoch_id + 1, current_temporal_epoch_id); - current_temporal_epoch_id++; - current_spatial_epoch_id = 0; - - // re-populate chip ids for the next temporal epoch - TT_ASSERT(remaining_chip_ids_in_temporal_epoch.empty()); - std::copy( - sorted_chip_ids.begin(), - sorted_chip_ids.end(), - std::inserter(remaining_chip_ids_in_temporal_epoch, remaining_chip_ids_in_temporal_epoch.begin())); - } - - // so we insert exactly one chip break on the op - for (auto &placement : epoch_id_to_op_placement[epoch_index]) - { - visited_ops_in_current_epoch.insert(placement.name); - } - - // moving to next chip after successfull placement - if (can_place_epoch_onto_chip && num_epochs_placed_on_chip == num_epochs_per_chip) - { - placed_ops_in_current_epoch.clear(); - } - - // advance to next chip id if we could not place the op or we reached the epoch per chip limit - if (can_place_epoch_onto_chip == false || num_epochs_placed_on_chip == num_epochs_per_chip) - { - current_chip_id_index++; - current_chip_id = sorted_chip_ids.at(current_chip_id_index % sorted_chip_ids.size()); - num_epochs_placed_on_chip = 0; - is_current_chip_id_mmio = chips_with_mmio.count(current_chip_id); - } - } - } -} - -PlacerSolution InteractivePlacer::commit(std::optional> const &chip_break_ops) -{ - if (epoch_id_to_op_placement.at(current_epoch_index).size() > 0) - { - current_epoch_index++; - } - std::uint32_t num_epochs = current_epoch_index; - - if (env_as("PYBUDA_WORMHOLE_PIPELINED_PLACER")) - { - // assign chip ids after all epochs are created because we need to know - // how many epochs we have to balance between all the chips - assign_chip_ids_for_pipelined_placement(num_epochs, chip_break_ops); - } - else - { - // if doing round-robin/eager chip id assignment, - // and if the last temporal epoch has unused chip ids, insert empty graphs for them - insert_empty_graphs(current_spatial_epoch_id + 1, current_temporal_epoch_id); - } - - log_debug(LogPlacer, "InteractivePlacer::commit"); - - PlacerSolution placer_solution = PlacerSolution{ - .name_to_op_placement = std::move(name_to_op_placement), - .input_queue_to_grid_shape = {}, - .name_to_queue_placement = {}, - .epoch_id_to_chip = std::move(epoch_id_to_chip), - .epoch_id_to_subgraph_index = {}, - .epoch_id_to_op_placement = std::move(epoch_id_to_op_placement), - .epoch_id_to_device_grid = std::move(epoch_id_to_device_grid), - .epoch_id_to_epoch_info = std::move(epoch_id_to_epoch_info), - .num_epochs = num_epochs}; - - valid = false; - return placer_solution; -} - -} // namespace tt::placer diff --git a/pybuda/csrc/placer/interactive_placer.hpp b/pybuda/csrc/placer/interactive_placer.hpp deleted file mode 100644 index 4932b6212..000000000 --- a/pybuda/csrc/placer/interactive_placer.hpp +++ /dev/null @@ -1,139 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include -#include - -#include "balancer/balancer.hpp" -#include 
"placer/grid_placer.hpp" - -// Interactive placer provides APIs for placing individual ops, reverting epochs back, checkpointing, etc. - -namespace tt -{ -namespace placer -{ - -class InteractivePlacer -{ - private: - std::uint32_t current_epoch_index; // global id of the current spatial epoch - NodeEpochType current_epoch_type; - bool valid; - std::vector sorted_chip_ids; - std::deque remaining_chip_ids_in_temporal_epoch; - ChipId current_chip_id; - std::uint32_t current_temporal_epoch_id; - std::uint32_t current_spatial_epoch_id; - std::unordered_set chips_with_mmio; // for quick lookups - bool is_current_chip_id_mmio; - - ChipId INVALID_CHIP_ID = (ChipId)-1; - - balancer::BalancerConfig config; - unordered_map name_to_op_placement; - map epoch_id_to_chip; - map epoch_id_to_subgraph_index; - unordered_map> epoch_id_to_op_placement; - EpochIdToDeviceGrid epoch_id_to_device_grid; - unordered_map epoch_id_to_epoch_info; - std::vector placed_ops_in_current_epoch; // ordered list - std::set visited_ops_in_current_epoch; - std::unordered_set output_ops; - - // returns true if the op can be placed on current_chip_id - bool can_place_op_onto_chip(const std::string &op_name, bool chip_break, std::vector& requested_chip_ids); - - // utility function for picking a chip id for the epoch - void next_chip_id(bool start_temporal_epoch, bool new_temporal_epoch, std::optional> requested_chip_ids); - - // pipelined placement chip id assignment - void assign_chip_ids_for_pipelined_placement( - std::uint32_t num_epochs, std::optional> const &chip_break_ops); - - // Set up new epoch - void init_epoch(bool start_temporal_epoch = true, bool new_temporal_epoch = true, std::optional> requested_chip_ids = std::nullopt); - - std::optional place_one_op( - const std::string &op_name, - bool enable_transpose, - bool chip_break, - const std::unordered_map& to_place); - - public: - InteractivePlacer(const graphlib::Graph *graph, const balancer::BalancerConfig &config); - - // Place single op on current epoch. Returns nullopt if it doesn't fit. - std::optional place_op( - const std::string &op_name, - const placer::GridShape &shape, - bool enable_transpose = false, - bool chip_break = false); - std::optional place_op( - const std::string &op_name, - const balancer::GridShape &shape, - bool enable_transpose = false, - bool chip_break = false); - - std::optional place_two_ops_rowwise( - const std::string &op_name_1, - const balancer::GridShape &shape_1, - const std::string &op_name_2, - const balancer::GridShape &shape_2, - bool enable_transpose = false, - bool chip_break = false); - std::optional place_two_ops_rowwise( - const std::string &op_name_1, - const placer::GridShape &shape_1, - const std::string &op_name_2, - const placer::GridShape &shape_2, - bool enable_transpose = false, - bool chip_break = false); - - // Create and switch to new epoch. Returns next epoch id. - std::uint32_t next_epoch(graphlib::NodeEpochType epoch_type); - - // Clear current epoch and start over. Returns the list of ops that were undone, in placed order. - std::vector> rewind_epoch_logged(); - - // Clear current epoch and start over. Non-logged fast version. - // - void rewind_epoch(); - - // Rewind current epoch to given op - i.e. place everything up to it, but not it. - // Returns placement information about last placed op. 
- // - std::pair rewind_to(const std::string &op_name); - - std::uint32_t get_current_epoch_index() const { return current_epoch_index; } - bool current_epoch_empty() const { return placed_ops_in_current_epoch.empty(); } - int current_epoch_size() const { return placed_ops_in_current_epoch.size(); } - std::vector const ¤t_epoch_ops() const { return placed_ops_in_current_epoch; } - - bool op_placed(const std::string &op_name) const { return name_to_op_placement.count(op_name) > 0; } - - void insert_empty_graphs(std::uint32_t spatial_epoch_id, std::uint32_t temporal_epoch_id); - - PlacerSolution commit( - std::optional> const &chip_break_ops = - std::nullopt); // Commit and generate final placer solution. Puts object into invalid state. - std::unordered_map &get_op_overrides() - { - return config.op_name_to_placer_overrides; - } - - bool can_fit_on_single_epoch(uint32_t rows, uint32_t columns, bool allow_transpose = false) const - { - return (rows <= epoch_id_to_device_grid.rows and columns <= epoch_id_to_device_grid.columns) or - (allow_transpose and config.enable_auto_transposing_placement and - rows <= epoch_id_to_device_grid.columns and columns <= epoch_id_to_device_grid.rows and rows > columns); - } - - const unordered_map& get_current_name_to_op_placement() const { return name_to_op_placement; } -}; - -} // namespace placer -} // namespace tt diff --git a/pybuda/csrc/placer/lower_to_placer.cpp b/pybuda/csrc/placer/lower_to_placer.cpp deleted file mode 100644 index 9e84ec1fd..000000000 --- a/pybuda/csrc/placer/lower_to_placer.cpp +++ /dev/null @@ -1,406 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "placer/lower_to_placer.hpp" - -#include - -#include - -#include "autograd/autograd.hpp" -#include "graph_lib/graph.hpp" -#include "graph_lib/node.hpp" -#include "graph_lib/utils.hpp" -#include "scheduler/utils.hpp" -#include "utils/assert.hpp" -#include "utils/logger.hpp" - -using std::unordered_set; -using std::tuple; - -// Aliases -using tt_node = tt::graphlib::Node; -using tt_graph = tt::graphlib::Graph; - -namespace tt { -using namespace graphlib; -namespace placer { -namespace lowering { - -bool are_dependencies_resolved(const tt_graph *graph, const string& node_name, const unordered_set& visited, NodeEpochType epoch_type) -{ - tt_node* node = graph->get_node_by_name(node_name); - for (tt_node* operand : graph->data_operands(node)) { - if ((int)operand->get_epoch_type() < (int)epoch_type) - { - continue; - } - if (operand->node_type() == NodeType::kInput) - { - continue; - } - if (visited.find(operand->name()) == visited.end()) - { - return false; - } - } - - return true; -} - -vector> get_opt_op_group_to_place( - const Graph* graph, Node* fwd_node, const unordered_map& op_to_schedule_index) -{ - vector> ret; - - for (tt_node* operand : graph->data_operands(fwd_node)) - { - if (operand->node_type() != NodeType::kInput) { continue; } - - for (const auto& [operand_index, input_opt_nodes] : graph->get_opt_nodes(operand)) - { - for (tt_node* opt_node : input_opt_nodes) - { - if (op_to_schedule_index.find(opt_node->name()) != op_to_schedule_index.end()) { - ret.emplace_back(op_to_schedule_index.at(opt_node->name()), opt_node->name(), operand_index); - } - } - } - } - - for (const auto& [operand_index, opt_nodes] : graph->get_opt_nodes(fwd_node)) - { - for (tt_node* opt_node : opt_nodes) - { - if (op_to_schedule_index.find(opt_node->name()) != op_to_schedule_index.end()) { - 
ret.emplace_back(op_to_schedule_index.at(opt_node->name()), opt_node->name(), operand_index); - } - } - } - return ret; -} - - -unordered_map> get_fwd_to_bwd_nodes(graphlib::Graph const* graph) -{ - unordered_map> fwd_to_bwd_nodes; - for (Node* node : tt::graphlib::topological_sort(*graph)) - { - if (node->get_epoch_type() == NodeEpochType::Forward and node->node_type() == NodeType::kBudaOp) - { - Node* fwd_node = node; - const string& fwd_node_name = fwd_node->name(); - fwd_to_bwd_nodes[fwd_node_name] = {}; - // Compute any recompute if exists - for (const Node* fwd_input_node : scheduler::get_schedule_predecessors(graph, fwd_node)) - { - for (Edge fwd_input_recompute_edge : graph->user_edges( - fwd_input_node, - [](Edge e) { return e.edge_type == graphlib::EdgeType::kAutogradFwdToRecompute; })) - { - Node* fwd_input_recompute_node = graph->node_by_id(fwd_input_recompute_edge.consumer_node_id); - if (fwd_input_recompute_node->node_type() == NodeType::kBudaOp) - { - fwd_to_bwd_nodes[fwd_node_name].push_back(fwd_input_recompute_node->name()); - } - } - } - - for (Edge bwd_edge : graph->user_edges( - fwd_node, [](Edge e) { return e.edge_type == graphlib::EdgeType::kAutogradFwdToBwd; })) - { - Node* bwd_node = graph->node_by_id(bwd_edge.consumer_node_id); - if (bwd_node->node_type() == NodeType::kBudaOp) - { - fwd_to_bwd_nodes[fwd_node_name].push_back(bwd_node->name()); - } - } - for (Node* fwd_input_node : graph->data_operands(fwd_node)) - { - if (fwd_input_node->node_type() == NodeType::kInput) - { - for (Edge bwd_edge : graph->user_edges( - fwd_input_node, - [](Edge e) { return e.edge_type == graphlib::EdgeType::kAutogradFwdToBwd; })) - { - Node* bwd_node = graph->node_by_id(bwd_edge.consumer_node_id); - if (bwd_node->node_type() == NodeType::kBudaOp) - { - fwd_to_bwd_nodes[fwd_node_name].push_back(bwd_node->name()); - } - } - } - } - } - } - return fwd_to_bwd_nodes; -} - -// TODO(jchu): commonize with above -unordered_map>> get_fwd_to_opt_nodes( - graphlib::Graph const *graph, - const vector& scheduled_ops) -{ - unordered_map>> fwd_to_opt_nodes; - - if (not graph->contains_opt_nodes()) - { - return {}; - } - - - unordered_map op_to_schedule_index; - - for (int i = 0; i < (int)scheduled_ops.size(); ++i) { - op_to_schedule_index[scheduled_ops[i]] = i; - } - - unordered_set visited_ops; - vector unprocessed; - for (const string& fwd_node_name : scheduled_ops) - { - tt_node* fwd_node = graph->get_node_by_name(fwd_node_name); - NodeEpochType epoch_type = fwd_node->get_epoch_type(); - - if (epoch_type == NodeEpochType::Forward) - { - vector> opt_ops_to_place = get_opt_op_group_to_place(graph, fwd_node, op_to_schedule_index); - std::sort(opt_ops_to_place.begin(), opt_ops_to_place.end()); - - for (const auto& [_, opt_node_name, operand_index] : opt_ops_to_place) - { - if (visited_ops.find(opt_node_name) == visited_ops.end()) - { - if (are_dependencies_resolved(graph, opt_node_name, visited_ops, NodeEpochType::Optimizer)) { - fwd_to_opt_nodes[fwd_node_name][operand_index].push_back(opt_node_name); - visited_ops.insert(opt_node_name); - } else { - unprocessed.push_back(opt_node_name); - } - } - } - } - } - for (const string& name : unprocessed) { - if (visited_ops.find(name) == visited_ops.end()) { - log_fatal("{} was not included to be placed.", name); - } - } - - - return fwd_to_opt_nodes; -} - -unordered_map get_op_to_epoch_type_mapping( - tt_graph const* graph, const vector& scheduled_ops) { - unordered_map op_to_epoch_type; - for (const string& op_name : scheduled_ops) - { - tt_node* node = 
graph->get_node_by_name(op_name); - op_to_epoch_type[op_name] = node->get_epoch_type(); - } - return op_to_epoch_type; -} - -unordered_map get_op_to_grad_op_mapping( - tt_graph const* graph, const vector& scheduled_ops) { - unordered_map op_to_grad_op; - for (const string& op_name : scheduled_ops) - { - tt_node* node = graph->get_node_by_name(op_name); - if (node->node_type() == NodeType::kBudaOp) { - op_to_grad_op[op_name] = node->as()->is_gradient_op(); - } else { - op_to_grad_op[op_name] = false; - } - } - return op_to_grad_op; -} - - -unordered_map -get_op_to_recompute_mapping(graphlib::Graph const* graph, const vector& scheduled_ops) -{ - unordered_map op_to_recompute; - for (const string& op_name : scheduled_ops) - { - tt_node* node = graph->get_node_by_name(op_name); - op_to_recompute[op_name] = graphlib::is_recompute(graph, node); - } - return op_to_recompute; -} - -unordered_set get_output_nodes(const graphlib::Graph *graph) -{ - unordered_set output_ops; - for (Node *n: graph->nodes_by_type(graphlib::NodeType::kOutput)) - { - auto partial_datacopy_edges = graph->user_edges(n, [](Edge e) { return e.edge_type == graphlib::EdgeType::kPartialDataCopy; }); - if (not partial_datacopy_edges.empty()) - continue; - - std::vector edges = graph->operand_data_edges(n); - TT_ASSERT(edges.size() == 1); - Node *source = graph->node_by_id(edges[0].producer_node_id); - TT_ASSERT(source->node_type() == NodeType::kBudaOp); - output_ops.insert(source->name()); - } - - return output_ops; -} - -vector generate_placer_schedule(tt_graph const* graph, PlacementScheduleOrder) { - vector scheduled_nodes; - for (tt_node* node : tt::graphlib::topological_sort(*graph)) - { - if (node->node_type() != NodeType::kInput and node->node_type() != NodeType::kOutput and node->node_type() != NodeType::kQueue) - { - scheduled_nodes.push_back(node->name()); - } - } - return scheduled_nodes; -} - -static unordered_set tag_ops_for_epoch_or_chip_break( - const vector>& op_names_to_epoch_or_chip_break, - const vector& scheduled_ops, - graphlib::Graph const* graph, - bool /*is_epoch_break*/) -{ - check_user_defined_op_names_exist_in_schedule(op_names_to_epoch_or_chip_break, scheduled_ops); - - unordered_set ops_tagged_for_epoch_break; - for (const auto& op_names : op_names_to_epoch_or_chip_break) - { - if (op_names.size() == 1) - { - ops_tagged_for_epoch_break.insert(op_names[0]); - } - else - { - // select the op that comes earliest in the schedule. 
not really expecting a ton of these iterations - // we'll just loop over scheduled_ops to simplify - unordered_set op_names_set(op_names.begin(), op_names.end()); - for (const string& scheduled_op : scheduled_ops) - { - if (op_names_set.find(scheduled_op) != op_names_set.end()) - { - ops_tagged_for_epoch_break.insert(scheduled_op); - break; - } - - } - } - } - - // Add epoch breaks between subgraphs - unsigned int prev_subgraph_id = graph->get_subgraph_id_for_node(graph->get_node_by_name(scheduled_ops[0])->id()); - for (auto op : scheduled_ops) - { - unsigned int subgraph_id = graph->get_subgraph_id_for_node(graph->get_node_by_name(op)->id()); - TT_ASSERT(subgraph_id >= prev_subgraph_id); - if (subgraph_id != prev_subgraph_id) - { - ops_tagged_for_epoch_break.insert(op); - log_debug(LogPlacer, "Epoch break due to subgraph at: {}",op); - prev_subgraph_id = subgraph_id; - } - } - return ops_tagged_for_epoch_break; -} - -unordered_set tag_ops_for_epoch_break( - const string& arch_name, - const vector>& op_names_to_epoch_break, - const vector>& op_names_to_chip_break, - const vector& scheduled_ops, - graphlib::Graph const* graph, - bool use_interactive_placer) -{ - if (env_as("PYBUDA_SINGLE_OP_EPOCHS")) - { - unordered_set ops_tagged_for_epoch_break; - for (const auto& op_name : scheduled_ops) - { - ops_tagged_for_epoch_break.insert(op_name); - } - return ops_tagged_for_epoch_break; - } - if ((use_interactive_placer == false || env_as("PYBUDA_WORMHOLE_PIPELINED_PLACER")) && arch_name.find("wormhole") != std::string::npos) - { - vector> updated_op_names_to_epoch_break = op_names_to_epoch_break; - updated_op_names_to_epoch_break.insert( - updated_op_names_to_epoch_break.end(), - op_names_to_chip_break.begin(), op_names_to_chip_break.end()); - - if (env_as("PYBUDA_NEBULA_GALAXY_PLACER")) - { - for (const auto& output_op : get_output_nodes(graph)) - { - updated_op_names_to_epoch_break.insert(updated_op_names_to_epoch_break.end(), {output_op}); - } - } - return tag_ops_for_epoch_or_chip_break(updated_op_names_to_epoch_break, scheduled_ops, graph, true); - } - return tag_ops_for_epoch_or_chip_break(op_names_to_epoch_break, scheduled_ops, graph, true); -} - -unordered_set tag_ops_for_chip_break( - const string& arch_name, - const vector>& op_names_to_chip_break, - const vector& scheduled_ops, - graphlib::Graph const* graph, - bool use_interactive_placer) -{ - if (use_interactive_placer == false && arch_name.find("wormhole") != std::string::npos) - { - return {}; - } - return tag_ops_for_epoch_or_chip_break(op_names_to_chip_break, scheduled_ops, graph, false); -} - -// only used by legacy placer, with interactive_placer epoch_break will act as a temporal_epoch break -unordered_set tag_ops_for_temporal_epoch_break( - graphlib::Graph const* graph, - const vector& scheduled_op_names, - const std::unordered_map& op_name_to_placer_overrides) -{ - unordered_set ops_tagged_for_temporal_epoch_break; - unordered_map op_to_schedule_index; - unordered_set visited_fracture_ids; - - for (std::uint32_t i = 0; i < scheduled_op_names.size(); ++i) - { - op_to_schedule_index[scheduled_op_names[i]] = i; - graphlib::Node* node = graph->get_node_by_name(scheduled_op_names[i]); - if (node->as()->has_tag("fracture_group_id")) - { - graphlib::TagValue tag_value = node->as()->tag_value("fracture_group_id"); - std::uint32_t fracture_group_id = std::get(tag_value); - if (visited_fracture_ids.find(fracture_group_id) == visited_fracture_ids.end()) - { - ops_tagged_for_temporal_epoch_break.insert(scheduled_op_names[i]); - 
visited_fracture_ids.insert(fracture_group_id); - } - } - } - for (const auto& op_name_to_placer_override : op_name_to_placer_overrides) - { - const auto& [op_name, placer_op_override] = op_name_to_placer_override; - if (placer_op_override.temporal_epoch_break) - { - ops_tagged_for_temporal_epoch_break.insert(op_name); - } - } - - if (not ops_tagged_for_temporal_epoch_break.empty()) - { - log_debug(LogPlacer, "ops_tagged_for_temporal_epoch_break: {}", ops_tagged_for_temporal_epoch_break); - } - return ops_tagged_for_temporal_epoch_break; -} - - -} // namespace lowering -} // namespace placer -} // namespace tt diff --git a/pybuda/csrc/placer/lower_to_placer.hpp b/pybuda/csrc/placer/lower_to_placer.hpp deleted file mode 100644 index 97eb0e1d0..000000000 --- a/pybuda/csrc/placer/lower_to_placer.hpp +++ /dev/null @@ -1,69 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "placer/placer.hpp" -#include "placer/lowering_utils.hpp" - -#include - -using std::uint32_t; -using std::string; -using std::vector; -using std::unordered_map; -using std::unordered_set; - -// Aliases -using NodeEpochType = tt::graphlib::NodeEpochType; - -namespace tt { - -// Forward Declares -namespace graphlib -{ - class Graph; -} - - -namespace placer { -namespace lowering { - -unordered_map> get_fwd_to_bwd_nodes(graphlib::Graph const* graph); -unordered_map>> get_fwd_to_opt_nodes( - graphlib::Graph const* graph, const vector& scheduled_ops); - -unordered_map -get_op_to_epoch_type_mapping(graphlib::Graph const* graph, const vector& scheduled_ops); -unordered_map -get_op_to_grad_op_mapping(graphlib::Graph const* graph, const vector& scheduled_ops); -unordered_map -get_op_to_recompute_mapping(graphlib::Graph const* graph, const vector& scheduled_ops); -unordered_set get_output_nodes(const graphlib::Graph *graph); - -// Returns an ordered list of node names -vector generate_placer_schedule(graphlib::Graph const* graph, PlacementScheduleOrder schedule_type); - -unordered_set tag_ops_for_epoch_break( - const string& arch_name, - const vector>& op_names_to_epoch_break, - const vector>& op_names_to_chip_break, - const vector& scheduled_ops, - graphlib::Graph const* graph, - bool use_interactive_placer); - -unordered_set tag_ops_for_chip_break( - const string& arch_name, - const vector>& op_names_to_chip_break, - const vector& scheduled_ops, - graphlib::Graph const* graph, - bool use_interactive_placer); - -unordered_set tag_ops_for_temporal_epoch_break( - graphlib::Graph const* graph, - const vector& scheduled_op_names, - const std::unordered_map& op_name_to_placer_overrides); - -} // end namespace lowering -} // end namespace placer -} // end namespace tt diff --git a/pybuda/csrc/placer/lowering_utils.cpp b/pybuda/csrc/placer/lowering_utils.cpp deleted file mode 100644 index 9768baed6..000000000 --- a/pybuda/csrc/placer/lowering_utils.cpp +++ /dev/null @@ -1,404 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "placer/lowering_utils.hpp" - -#include - -#include -#include - -#include "placer/chip_id_assignment.hpp" -#include "scheduler/scheduler.hpp" -#include "utils/assert.hpp" -#include "utils/logger.hpp" -#include "utils/ordered_associative_containers/ordered_map.hpp" - -using tt::LogPlacer; -using std::unordered_set; - -namespace tt { -namespace placer { -namespace lowering { - -void validate_placer_config(const PlacerConfig& placer_config) -{ - bool is_config_valid = true; - 
std::unordered_set bwd_nodes_set; - - for (auto& [fwd, bwd_nodes] : placer_config.fwd_to_bwd_nodes) - { - for (auto bwd_node : bwd_nodes) - { - bwd_nodes_set.insert(bwd_node); - } - } - for (auto& [fwd, index_to_opt_nodes_map] : placer_config.fwd_to_opt_nodes) { - for (auto& [index, opt_nodes] : index_to_opt_nodes_map) { - for (auto opt_node : opt_nodes) { - if (bwd_nodes_set.find(opt_node) != bwd_nodes_set.end()) { - is_config_valid = false; - log_error("Invalid PlacerConfig: Found {} having both fwd->bwd AND fwd->opt edges", opt_node); - } - } - } - } - if (not is_config_valid) { - log_fatal("Invalid PlacerConfig. Cannot run placer module"); - } -} - -std::map get_galaxy_snake_chip_order(const DeviceConfig& config) -{ - TT_ASSERT(config.galaxy_shelves.size() == 1, "SNAKE chip config is only supported for single-galaxy systems"); - // x-y galaxy chip coordinates for snake pattern - std::vector> galaxy_snake_chip_order_in_logical_coordinates = - { - {3, 4}, {3, 3}, {3, 2}, {3, 1}, {3, 0}, {2, 0}, {1, 0}, {0, 0}, - {0, 1}, {1, 1}, {2, 1}, {2, 2}, {1, 2}, {0, 2}, {0, 3}, {1, 3}, - {2, 3}, {2, 4}, {1, 4}, {0, 4}, {0, 5}, {1, 5}, {2, 5}, {2, 6}, - {1, 6}, {0, 6}, {0, 7}, {1, 7}, {2, 7}, {3, 7}, {3, 6}, {3, 5}, - }; - std::map chip_id_indices; - std::uint32_t index = 0; - for(auto& xy: galaxy_snake_chip_order_in_logical_coordinates) - { - int x = xy.first; - int y = xy.second; - chip_id_indices[config.chip_coord_to_chip_id.at(EthCoord(x, y, 0, config.galaxy_shelves.front()))] = index++; - } - return chip_id_indices; -} - -vector apply_chip_placement_policy( - const DeviceConfig& config, - ChipPlacementPolicy chip_placement_policy, - const vector& chip_ids) -{ - std::vector sorted_chip_ids; - - // use given chip ids, sort non_mmio + mmio - if(chip_placement_policy == ChipPlacementPolicy::MMIO_LAST) - { - for (ChipId chip_id : chip_ids) - { - if (std::find(config.chips_with_mmio.begin(), config.chips_with_mmio.end(), chip_id) == config.chips_with_mmio.end()) - { - sorted_chip_ids.push_back(chip_id); - } - } - for (std::uint32_t chip_id : chip_ids) - { - if (std::find(config.chips_with_mmio.begin(), config.chips_with_mmio.end(), chip_id) != config.chips_with_mmio.end()) - { - sorted_chip_ids.push_back(chip_id); - } - } - return sorted_chip_ids; - } - - // get chip id order based on the ChipPlacementPolicy - std::map galaxy_chip_id_indices = - chip_placement_policy == ChipPlacementPolicy::SNAKE ? 
get_galaxy_snake_chip_order(config) : - // add new policies here - std::map(); - TT_ASSERT(galaxy_chip_id_indices.size()); - - // split all available chip_ids into galaxy_chip_ids and non_galaxy_chip_ids - std::vector galaxy_chip_ids; - std::vector non_galaxy_chip_ids; - for(auto& chip_id: chip_ids) - { - if(galaxy_chip_id_indices.find(chip_id) != galaxy_chip_id_indices.end()) - { - galaxy_chip_ids.push_back(chip_id); - } - else { - non_galaxy_chip_ids.push_back(chip_id); - } - } - - // order galaxy_chip_ids based on their order in the ChipPlacementPolicy - std::sort(galaxy_chip_ids.begin(), galaxy_chip_ids.end(), [&galaxy_chip_id_indices](ChipId chip_a, ChipId chip_b) - { - return galaxy_chip_id_indices.at(chip_a) < galaxy_chip_id_indices.at(chip_b); - }); - - sorted_chip_ids = galaxy_chip_ids; - sorted_chip_ids.insert(sorted_chip_ids.end(), non_galaxy_chip_ids.begin(), non_galaxy_chip_ids.end()); - - return sorted_chip_ids; -} - -unordered_map get_op_to_grid_shape( - const vector& scheduled_ops, uint32_t default_rows, uint32_t default_columns) -{ - unordered_map op_to_grid_shape; - for (const string& node_name : scheduled_ops) - { - op_to_grid_shape[node_name] = GridShape(default_rows, default_columns); - } - return op_to_grid_shape; -} - -void check_user_defined_op_names_exist_in_schedule( - const PlacerConfig& config, - const vector& scheduled_ops) -{ - if (config.ops_tagged_for_chip_id_break.empty() and config.ops_tagged_for_epoch_break.empty()) - { - return; - } - - unordered_set scheduled_ops_set; - for (const string& name : scheduled_ops) - { - scheduled_ops_set.insert(name); - } - - // Check all user-defined op_to_chip_id breaks exists - for (const string& name_tagged_for_chip_id_break : config.ops_tagged_for_chip_id_break) - { - bool is_op_found_schedule = - scheduled_ops_set.find(name_tagged_for_chip_id_break) != scheduled_ops_set.end(); - - TT_ASSERT( - is_op_found_schedule, - "User provided an op tagged for chip break not in the schedule. (may have been consteval)", - name_tagged_for_chip_id_break); - } - - // Check all user-defined epoch breaks exists - for (const string& name_tagged_for_epoch_break : config.ops_tagged_for_epoch_break) - { - bool is_op_found_schedule = - scheduled_ops_set.find(name_tagged_for_epoch_break) != scheduled_ops_set.end(); - - TT_ASSERT( - is_op_found_schedule, - "User provided an op tagged for epoch break not in the schedule. (may have been consteval)", - name_tagged_for_epoch_break); - } -} - - -void check_user_defined_op_names_exist_in_schedule( - const vector>& op_names_to_chip_or_epoch_break, - const vector& scheduled_ops) -{ - if (op_names_to_chip_or_epoch_break.empty()) - { - return; - } - - unordered_set scheduled_ops_set; - for (const string& name : scheduled_ops) - { - scheduled_ops_set.insert(name); - } - - // Check all user-defined op_to_chip_id breaks exists - for (const vector& op_names_for_epoch_or_chip_break : op_names_to_chip_or_epoch_break) - { - for (const string& op_name : op_names_for_epoch_or_chip_break) - { - bool is_op_found_schedule = - scheduled_ops_set.find(op_name) != scheduled_ops_set.end(); - - TT_ASSERT(is_op_found_schedule, "User provided an op tagged for epoch/chip break not in the schedule: {}", op_name); - } - } - -} - -vector generate_simple_placer_workload( - const PlacerConfig& config, - const vector& scheduled_ops) -{ - check_user_defined_op_names_exist_in_schedule(config, scheduled_ops); - - // For now, no actual groupings.. 
each group will just contain a single op - uint32_t current_chip_index = 0; - vector placer_op_group_workload; - for (const string& op_name : scheduled_ops) - { - bool increment_epoch = false; - NodeEpochType epoch_type = config.op_to_epoch_type.at(op_name); - - if (not placer_op_group_workload.empty()) - { - // Don't trigger increment_epoch/chip when placing first OpGroup - if (config.ops_tagged_for_epoch_break.find(op_name) != config.ops_tagged_for_epoch_break.end()) - { - increment_epoch = true; - } - if (config.ops_tagged_for_chip_id_break.find(op_name) != config.ops_tagged_for_chip_id_break.end()) - { - current_chip_index = (current_chip_index + 1) % config.chip_ids.size(); - } - } - - placer_op_group_workload.push_back( - OpGroupToPlace{ - .op_group_id = OpGroupToPlace::get_next_op_group_id(), - .op_names = {op_name}, - .op_name_to_relative_offset_from_first_op = {}, - .chip_id = config.chip_ids.at(current_chip_index), - .increment_epoch = increment_epoch, - .epoch_type=epoch_type, - } - ); - } - return placer_op_group_workload; -} - -vector generate_forward_placer_workload( - const PlacerConfig& config, - const vector& scheduled_ops) -{ - vector op_groups; - - for (const string& op_name : scheduled_ops) - { - bool increment_epoch = false; - - if (not op_groups.empty()) - { - // Don't trigger increment_epoch/chip when placing first OpGroup - if (config.ops_tagged_for_epoch_break.find(op_name) != config.ops_tagged_for_epoch_break.end()) - { - increment_epoch = true; - } - } - - NodeEpochType epoch_type = config.op_to_epoch_type.at(op_name); - if (epoch_type == NodeEpochType::Forward) - { - uint32_t assigned_chip_id = config.get_chip_id(op_name); - op_groups.push_back( - OpGroupToPlace{ - .op_group_id = OpGroupToPlace::get_next_op_group_id(), - .op_names={op_name}, - .op_name_to_relative_offset_from_first_op = {}, - .chip_id = assigned_chip_id, - .increment_epoch = increment_epoch, - .epoch_type=NodeEpochType::Forward, - } - ); - } - } - return op_groups; -} - -vector generate_backward_placer_workload( - const PlacerConfig& config, const vector& scheduled_ops) -{ - vector op_groups; - for (auto it = scheduled_ops.begin(); it != scheduled_ops.end(); ++it) - { - auto bwd_op = *it; - if (config.op_to_epoch_type.at(bwd_op) == NodeEpochType::Backward) - { - bool is_grad_op = - config.op_to_grad_op.find(bwd_op) != config.op_to_grad_op.end() and config.op_to_grad_op.at(bwd_op); - bool is_recompute_op = config.op_to_recompute_op.find(bwd_op) != config.op_to_recompute_op.end() and - config.op_to_recompute_op.at(bwd_op); - std::string op_type = (is_grad_op ? "grad_op" : (is_recompute_op ? 
"recompute_op" : "bwd_op")); - - log_debug(tt::LogPlacer, "\tbwd_node: {} is type: {}", bwd_op, op_type); - op_groups.push_back(OpGroupToPlace{ - .op_group_id = OpGroupToPlace::get_next_op_group_id(), - .op_names = {bwd_op}, - .op_name_to_relative_offset_from_first_op = {}, - .chip_id = config.get_chip_id(bwd_op), - .increment_epoch = - config.ops_tagged_for_epoch_break.find(bwd_op) != config.ops_tagged_for_epoch_break.end(), - .epoch_type = NodeEpochType::Backward}); - } - } - return op_groups; -} - -vector generate_optimizer_placer_workload( - const PlacerConfig& config, - const vector& scheduled_ops) -{ - vector op_groups; - for (const string& node_name : scheduled_ops) - { - NodeEpochType epoch_type = config.op_to_epoch_type.at(node_name); - if (epoch_type == NodeEpochType::Optimizer) - { - uint32_t chip_id = config.get_chip_id(node_name); - bool increment_epoch = false; - if (config.ops_tagged_for_epoch_break.find(node_name) != - config.ops_tagged_for_epoch_break.end()) - { - increment_epoch = true; - } - - op_groups.push_back( - OpGroupToPlace{ - .op_group_id = OpGroupToPlace::get_next_op_group_id(), - .op_names = {node_name}, - .op_name_to_relative_offset_from_first_op = {}, - .chip_id = chip_id, - .increment_epoch = increment_epoch, - .epoch_type = NodeEpochType::Optimizer, - }); - } - } - return op_groups; -} - -ChipIdToPlacerWorkload generate_placer_workload( - const PlacerConfig& config, - const vector& scheduled_ops) -{ - check_user_defined_op_names_exist_in_schedule(config, scheduled_ops); - if (config.device_config.arch_name == "grayskull") - { - TT_ASSERT(not config.op_to_chip_id_assignment.empty(), "op to chip_id assignment not populated"); - } - map> placer_workload; - - for (auto&& op_group : generate_forward_placer_workload(config, scheduled_ops)) - { - placer_workload[op_group.chip_id].emplace_back(op_group); - } - for (auto&& op_group : generate_backward_placer_workload(config, scheduled_ops)) - { - placer_workload[op_group.chip_id].emplace_back(op_group); - } - for (auto&& op_group : generate_optimizer_placer_workload(config, scheduled_ops)) - { - placer_workload[op_group.chip_id].emplace_back(op_group); - } - return placer_workload; -} - -vector generate_wormhole_placer_workload( - const PlacerConfig& config, - const vector& scheduled_ops) -{ - check_user_defined_op_names_exist_in_schedule(config, scheduled_ops); - vector placer_workload; - - for (auto&& op_group : generate_forward_placer_workload(config, scheduled_ops)) - { - placer_workload.emplace_back(op_group); - } - for (auto&& op_group : generate_backward_placer_workload(config, scheduled_ops)) - { - placer_workload.emplace_back(op_group); - } - for (auto&& op_group : generate_optimizer_placer_workload(config, scheduled_ops)) - { - placer_workload.emplace_back(op_group); - } - return placer_workload; -} - -} // namespace lowering -} // namespace placer -} // namespace tt diff --git a/pybuda/csrc/placer/lowering_utils.hpp b/pybuda/csrc/placer/lowering_utils.hpp deleted file mode 100644 index d2b7aff05..000000000 --- a/pybuda/csrc/placer/lowering_utils.hpp +++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "placer/placer.hpp" - -#include - -using std::uint32_t; -using std::string; -using std::vector; -using std::unordered_map; - -// Aliases -using NodeEpochType = tt::graphlib::NodeEpochType; - -namespace tt { -namespace placer { -namespace lowering { - -void validate_placer_config(const PlacerConfig& 
placer_config); - -using ChipId = uint32_t; - -vector apply_chip_placement_policy(const DeviceConfig& config, ChipPlacementPolicy chip_placement_policy, const vector& chip_ids); - -unordered_map get_op_to_grid_shape( - const vector& scheduled_ops, - uint32_t default_rows = 1, - uint32_t default_columns = 1); - -// Each returned OpGroupToPlace in the list only contains a single op in its grouping -vector generate_simple_placer_workload( - const PlacerConfig& config, - const vector& scheduled_ops); - -ChipIdToPlacerWorkload generate_placer_workload( - const PlacerConfig& config, const vector& scheduled_ops); - - -vector generate_wormhole_placer_workload( - const PlacerConfig& config, - const vector& scheduled_ops); - - -void check_user_defined_op_names_exist_in_schedule(const PlacerConfig& config, const vector& scheduled_ops); - -void check_user_defined_op_names_exist_in_schedule( - const vector>& op_names_to_chip_or_epoch_break, - const vector& scheduled_ops); - -} // end namespace lowering -} // end namespace placer -} // end namespace tt diff --git a/pybuda/csrc/placer/module.mk b/pybuda/csrc/placer/module.mk deleted file mode 100644 index bbac18d12..000000000 --- a/pybuda/csrc/placer/module.mk +++ /dev/null @@ -1,45 +0,0 @@ -# Every variable in subdir must be prefixed with subdir (emulating a namespace) - -PYBUDA_CSRC_PLACER_LIB = $(LIBDIR)/libplacer.a -PYBUDA_CSRC_PLACER_SRCS = \ - pybuda/csrc/placer/allocator_utils.cpp \ - pybuda/csrc/placer/best_fit_allocator.cpp \ - pybuda/csrc/placer/chip_id_assignment.cpp \ - pybuda/csrc/placer/dram.cpp \ - pybuda/csrc/placer/dram_logger.cpp \ - pybuda/csrc/placer/dram_allocator.cpp \ - pybuda/csrc/placer/epoch_placer.cpp \ - pybuda/csrc/placer/evaluator.cpp \ - pybuda/csrc/placer/grid_placer.cpp \ - pybuda/csrc/placer/host_memory.cpp \ - pybuda/csrc/placer/host_memory_allocator.cpp \ - pybuda/csrc/placer/interactive_placer.cpp \ - pybuda/csrc/placer/lowering_utils.cpp \ - pybuda/csrc/placer/lower_to_placer.cpp \ - pybuda/csrc/placer/placer.cpp \ - pybuda/csrc/placer/pre_epoch_passes.cpp \ - pybuda/csrc/placer/post_epoch_passes.cpp \ - pybuda/csrc/placer/python_bindings.cpp \ - pybuda/csrc/placer/utils.cpp - -PYBUDA_CSRC_PLACER_INCLUDES = $(PYBUDA_CSRC_INCLUDES) - -PYBUDA_CSRC_PLACER_OBJS = $(addprefix $(OBJDIR)/, $(PYBUDA_CSRC_PLACER_SRCS:.cpp=.o)) -PYBUDA_CSRC_PLACER_DEPS = $(addprefix $(OBJDIR)/, $(PYBUDA_CSRC_PLACER_SRCS:.cpp=.d)) - --include $(PYBUDA_CSRC_PLACER_DEPS) - -PLACER_CSRC_CFLAGS = $(PYBUDA_CSRC_CFLAGS) - -# Each module has a top level target as the entrypoint which must match the subdir name -pybuda/csrc/placer: $(PYBUDA_CSRC_PLACER_LIB) - -$(PYBUDA_CSRC_PLACER_LIB): $(PYBUDA_CSRC_PLACER_OBJS) $(PYBUDA_CSRC_GRAPH_LIB) $(PYBUDA_CSRC_SCHEDULER_LIB) $(PYBUDA_CSRC_GRAPH_LIB) - @mkdir -p $(LIBDIR) - ar rcs $@ $^ - -$(OBJDIR)/pybuda/csrc/placer/%.o: pybuda/csrc/placer/%.cpp - @mkdir -p $(@D) - $(CXX) $(PLACER_CSRC_CFLAGS) $(CXXFLAGS) $(STATIC_LIB_FLAGS) $(PYBUDA_CSRC_PLACER_INCLUDES) -c -o $@ $< - -include pybuda/csrc/placer/tests/module.mk diff --git a/pybuda/csrc/placer/placer.cpp b/pybuda/csrc/placer/placer.cpp deleted file mode 100644 index 931333dc2..000000000 --- a/pybuda/csrc/placer/placer.cpp +++ /dev/null @@ -1,1438 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "placer/placer.hpp" -#include "placer/utils.hpp" -#include "placer/lowering_utils.hpp" -#include "placer/dram.hpp" -#include "placer/grid_placer.hpp" -#include "third_party/json/json.hpp" - -#include 
"graph_lib/defines.hpp" -#include "utils/logger.hpp" -#include "utils/assert.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include - -using tt::LogPlacer; -using std::max; -using std::ofstream; -using std::setw; -using std::runtime_error; - -// Aliases -using NodeEpochType = tt::graphlib::NodeEpochType; -uint32_t tt::placer::OpGroupToPlace::current_op_group_id = 0; - -namespace tt { -namespace placer { - -uint32_t PlacerConfig::get_available_rows_on_device() const -{ - return device_grid.rows - harvested_rows.size(); -} - -uint32_t PlacerConfig::get_chip_id(const string& op_name) const -{ - if (device_config.arch_name != "grayskull") - { - return 0; - } - return op_to_chip_id_assignment.at(op_name); -} -std::optional PlacerConfig::get_chip_id_override(const string& op_name) const -{ - // check if op_name is in op_to_overrides. If so, return the chip_id from there - if (auto it = op_to_overrides.find(op_name); it != op_to_overrides.end()) - { - const auto& op_override = it->second; - if (op_override.chip_id.has_value()) - { - return op_override.chip_id.value(); - } - } - return {}; -} - -Coord Coord::operator+(const GridShape& rhs) const -{ - return {.row=this->row + rhs.rows, - .col=this->col + rhs.columns}; -} - -Coord Coord::operator+(const Coord& rhs) const -{ - return {.row=this->row + rhs.row, - .col=this->col + rhs.col}; -} - -Coord Coord::operator+(const CoordOffset& rhs) const -{ - return {.row=this->row + rhs.row_offset, - .col=this->col + rhs.column_offset}; -} - -bool Coord::operator< (const Coord &rhs) const -{ - if (this->row == rhs.row) - { - return this->col < rhs.col; - } - return this->row < rhs.row; -} - -bool Coord::operator==(const Coord &rhs) const -{ - return (row == rhs.row) && (col == rhs.col); -} - -bool Coord::operator!=(const Coord &rhs) const -{ - return !(*this == rhs); -} - -json Coord::to_json() const -{ - json output_json; - output_json["row"] = this->row; - output_json["col"] = this->col; - return output_json; -} - -std::array Coord::as_array() const -{ - return {this-> row, this->col}; -} - -json CoordRange::to_json() const -{ - json output_json; - output_json["start"] = this->start.to_json(); - output_json["end"] = this->end.to_json(); - return output_json; -} - -bool CoordRange::operator==(const CoordRange &rhs) const -{ - return this->start == rhs.start and this->end == rhs.end; -} -bool CoordRange::operator!=(const CoordRange &rhs) const -{ - return !(*this == rhs); -} - - -uint32_t OpGroupToPlace::get_next_op_group_id() -{ - return current_op_group_id++; -} - -json OpPlacement::to_json() const -{ - json output_json; - output_json["name"] = this->name; - output_json["chip_id"] = this->chip_id; - output_json["epoch_id"] = this->epoch_id(); - - vector placed_cores_json; - placed_cores_json.push_back(this->placed_cores.to_json()); - - output_json["placed_cores"] = placed_cores_json; - return output_json; -} - -bool OpPlacement::operator==(const OpPlacement& rhs) const -{ - // exclude global_id from capture - return ( - this->name == rhs.name and - this->chip_id == rhs.chip_id and - this->placed_cores == rhs.placed_cores - ); -} -bool OpPlacement::operator!=(const OpPlacement& rhs) const -{ - return !(*this == rhs); -} - -json QueueBufferPlacement::to_json() const -{ - json output_json; - output_json["dram_channel"] = this->dram_channel; - output_json["dram_address"] = this->dram_address; - output_json["dram_channel_location"] = this->dram_channel_location.to_json(); - output_json["buffer_size"] = this->buffer_size; - 
return output_json; -} - -json QueueHostBufferPlacement::to_json() const -{ - json output_json; - output_json["channel"] = this->channel; - output_json["address"] = this->address; - output_json["buffer_size"] = this->buffer_size; - return output_json; -} - -json QueuePlacement::to_json() const -{ - json output_json; - output_json["name"] = this->name; - output_json["on_host"] = this->on_host; - output_json["chip_id"] = this->chip_id; - - vector buffers_json; - for (const QueueBufferPlacement& qb : this->dram_buffers) - { - buffers_json.push_back(qb.to_json()); - } - - output_json["buffers"] = buffers_json; - return output_json; -} - -json PlacerSolution::to_json() const -{ - json output_json; - - // serialize `name_to_op_placement` - for (const auto& [name, op_placement] : this->name_to_op_placement) - { - output_json["name_to_op_placement"][name] = op_placement.to_json(); - } - - // serialize `name_to_queue_placement` - for (const auto& [name, q_placement] : this->name_to_queue_placement) - { - output_json["name_to_queue_placement"][name] = q_placement.to_json(); - } - - // serialize `epoch_id_to_chip` - for (const auto& [epoch_id, chip] : this->epoch_id_to_chip) - { - output_json["epoch_id_to_chip"][epoch_id] = chip; - } - - output_json["num_epochs"] = this->num_epochs; - - return output_json; -} - -bool PlacerSolution::is_placed(const std::string& op) const -{ - return this->name_to_op_placement.find(op) != this->name_to_op_placement.end(); -} - -uint32_t PlacerSolution::chip_id(const std::string& op) const { - if (this->name_to_op_placement.find(op) != this->name_to_op_placement.end()) { - return this->name_to_op_placement.at(op).chip_id; - - } else if (this->name_to_queue_placement.find(op) != this->name_to_queue_placement.end()) { - return this->name_to_queue_placement.at(op).chip_id; - } - TT_LOG_ASSERT(false, "Error: PlacerSolution::chip_id() invoked with unassigned op/queue: {}", op); - return 0; -} - -uint32_t PlacerSolution::epoch_id(const std::string& op) const { - int global_epoch_id = this->name_to_op_placement.at(op).epoch_id(); - return global_epoch_id; -} - -uint32_t PlacerSolution::temporal_epoch_id(const std::string& op) const { - int global_epoch_id = this->name_to_op_placement.at(op).epoch_id(); - return this->epoch_id_to_epoch_info.at(global_epoch_id).temporal_epoch_id; -} - -uint32_t PlacerSolution::temporal_epoch_id(uint32_t global_epoch_id) const { - return this->epoch_id_to_epoch_info.at(global_epoch_id).temporal_epoch_id; -} - -uint32_t PlacerSolution::num_temporal_epochs() const { - uint32_t max_epoch_id_found = 0; - for (const auto& [epoch_id, epoch_info] : this->epoch_id_to_epoch_info) { - max_epoch_id_found = std::max(max_epoch_id_found, epoch_info.temporal_epoch_id); - } - return max_epoch_id_found + 1; -} -NodeEpochType PlacerSolution::epoch_type(uint32_t global_epoch_id) const -{ - return this->epoch_id_to_epoch_info.at(global_epoch_id).epoch_type; -} - -const EpochInfo& PlacerSolution::epoch_info(uint32_t global_epoch_id) const -{ - return this->epoch_id_to_epoch_info.at(global_epoch_id); -} - -uint32_t PlacerSolution::num_temporal_epochs(NodeEpochType type) const -{ - std::set temporal_epoch_ids; - for (const auto& [epoch_id, epoch_info] : this->epoch_id_to_epoch_info) - { - if (epoch_info.epoch_type == type) - { - temporal_epoch_ids.insert(epoch_info.temporal_epoch_id); - } - } - return temporal_epoch_ids.size(); -} - -// Merge another placer solution into this one. Destroys the original! 
-// Assumes that the 'other' contains new stand-alone epochs, this will likely not work -// for partial epoch merging. -void PlacerSolution::merge(PlacerSolution &other) -{ - TT_ASSERT(is_pipelined == other.is_pipelined, "Incompatible placer solutions for merging (pipelined)."); - TT_ASSERT(epoch_id_to_device_grid.rows == other.epoch_id_to_device_grid.rows, - "Incompatible placer solutions for merging (grid rows)."); - TT_ASSERT(epoch_id_to_device_grid.columns == other.epoch_id_to_device_grid.columns, - "Incompatible placer solutions for merging (grid columns)."); - - name_to_op_placement.merge(other.name_to_op_placement); - input_queue_to_grid_shape.merge(other.input_queue_to_grid_shape); - name_to_queue_placement.merge(other.name_to_queue_placement); - epoch_id_to_chip.merge(other.epoch_id_to_chip); - epoch_id_to_op_placement.merge(other.epoch_id_to_op_placement); - epoch_id_to_epoch_info.merge(other.epoch_id_to_epoch_info); - num_epochs += other.num_epochs; - epoch_id_to_device_grid.epoch_id_to_device_grid.merge(other.epoch_id_to_device_grid.epoch_id_to_device_grid); -} - -void EpochIdToDeviceGrid::initialize_device_grid(uint32_t candidate_epoch_id, bool clear_existing) -{ - if (clear_existing || (this->epoch_id_to_device_grid.find(candidate_epoch_id) == this->epoch_id_to_device_grid.end())) - { - this->epoch_id_to_device_grid[candidate_epoch_id] = device_grid::create_empty_device_grid(this->rows, this->columns); - } -} - -void EpochIdToDeviceGrid::initialize_device_grid(uint32_t candidate_epoch_id, uint32_t rows, uint32_t columns) -{ - this->epoch_id_to_device_grid[candidate_epoch_id] = device_grid::create_empty_device_grid(rows, columns); -} - -void EpochIdToDeviceGrid::initialize_device_grid(uint32_t candidate_epoch_id, const DeviceGrid& device_grid) -{ - if (this->epoch_id_to_device_grid.find(candidate_epoch_id) == this->epoch_id_to_device_grid.end()) - { - this->epoch_id_to_device_grid[candidate_epoch_id] = device_grid; - } -} -std::optional EpochIdToDeviceGrid::get_next_grid_coordinate(const std::string& op_name, uint32_t epoch_id, const GridShape& op_grid_shape) const -{ - DeviceGrid device_grid = this->epoch_id_to_device_grid.at(epoch_id); - for (const auto& [constraint_name, constraint_grid] : this->op_to_constraints) - { - if (op_name != constraint_name) - { - device_grid = device_grid::superposition(device_grid, constraint_grid); - } - } - - return device_grid::get_next_grid_coordinate(device_grid, op_grid_shape); -} - -bool EpochIdToDeviceGrid::satisfies_constraints(const std::string& op_name, const Coord& start, const GridShape& shape) const -{ - bool satisfies_constraints = true; - for (const auto& [constraint_name, constraint_grid] : this->op_to_constraints) - { - if (op_name != constraint_name) - { - satisfies_constraints &= device_grid::can_place_on_device_grid(constraint_grid, start, shape); - } - } - return satisfies_constraints; -} - -bool EpochIdToDeviceGrid::can_place_on_device_grid( - const std::string& op_name, - int epoch_id, - const Coord& start, - const GridShape& shape) -{ - bool satisfies_constraints = this->satisfies_constraints(op_name, start, shape); - this->initialize_device_grid(epoch_id); - const DeviceGrid& device_grid = this->epoch_id_to_device_grid.at(epoch_id); - return satisfies_constraints and device_grid::can_place_on_device_grid(device_grid, start, shape); -} - -void fill_in_device_grid( - const PlacerSolution& placer_solution, - vector>& device_grid_for_epoch, - const unordered_map &id_to_string, - const OpPlacement& op_placement, - uint32_t 
id) -{ - const auto& cores = op_placement.placed_cores; - for (uint32_t i = cores.start.row; i < cores.end.row; ++i) { - for (uint32_t j = cores.start.col; j < cores.end.col; ++j) { - if (device_grid_for_epoch.at(i).at(j) != 0) { - uint32_t offending_id = device_grid_for_epoch[i][j]; - const string& offending_op = id_to_string.at(offending_id); - auto oop = placer_solution.name_to_op_placement.at(offending_op); - - - log_fatal("On chip {}, epoch {}, we are placing {} onto [{},{}]->[{},{}] but it overlaps with another op: {}, i:{}, j:{}, ->[{},{}]->[{},{}] ", - op_placement.chip_id, op_placement.epoch_id(), op_placement.name, - cores.start.row, cores.start.col, cores.end.row, cores.end.col, - offending_op, - i,j, - oop.placed_cores.start.row,oop.placed_cores.start.col, oop.placed_cores.end.row, oop.placed_cores.end.col); - } - device_grid_for_epoch[i][j] = id; - } - - } -} - -void fill_device_grid_with_placement( - DeviceGrid& device_grid_for_epoch, - const Coord& op_start, - const GridShape& op_grid_shape) -{ - for (uint32_t i = op_start.row; i < op_start.row + op_grid_shape.rows; ++i) { - for (uint32_t j = op_start.col; j < op_start.col + op_grid_shape.columns; ++j) { - device_grid_for_epoch.at(i).at(j) = 1; - } - } -} - - -void EpochIdToDeviceGrid::fill_device_grid_with_placement( - int epoch_id, - const Coord& op_start, - const GridShape& op_grid_shape) -{ - initialize_device_grid(epoch_id); - device_grid::fill_device_grid_with_placement(this->epoch_id_to_device_grid.at(epoch_id), op_start, op_grid_shape); -} - -/* static */ GridShape GridShape::from_array(std::array array) { return GridShape(array[0], array[1]); }; - -bool contains_harvested_row(uint32_t row_start, uint32_t num_rows_for_op, const vector& harvested_rows) -{ - for (uint32_t row : harvested_rows) { - bool contains_harvested_row = row >= row_start and row < row_start + num_rows_for_op; - if (contains_harvested_row) { - return false; - } - } - return true; -} - -bool EpochIdToDeviceGrid::contains_empty_grid(uint32_t epoch_id) { - initialize_device_grid(epoch_id); - if (this->epoch_id_to_device_grid.find(epoch_id) == this->epoch_id_to_device_grid.end()) { - return true; - } - const auto& device_grid_for_epoch = this->epoch_id_to_device_grid.at(epoch_id); - return device_grid::contains_empty_device_grid(device_grid_for_epoch); -} - -uint32_t EpochIdToDeviceGrid::get_current_epoch_id() const -{ - int current_epoch = 0; - for (const auto & [epoch_id, device_grid] : this->epoch_id_to_device_grid) - { - current_epoch = std::max(current_epoch, epoch_id); - } - return (uint32_t)current_epoch; -} - -const DeviceGrid& EpochIdToDeviceGrid::get_device_grid(uint32_t epoch_id) const -{ - TT_ASSERT(this->epoch_id_to_device_grid.find(epoch_id) != this->epoch_id_to_device_grid.end()); - return this->epoch_id_to_device_grid.at(epoch_id); -} -void EpochIdToDeviceGrid::add_constraints(const std::unordered_map& constraints) -{ - this->op_to_constraints = constraints; -} - -void generate_placement_constraints(const PlacerConfig& config, vector& placer_op_group_workload) -{ - // Encode the constraint that we need an epoch break between new epoch transitions - // e.g. 
FWD->{RECOMPUTE/BWD}->OPT - NodeEpochType prev_epoch_type = NodeEpochType::Forward; - for (OpGroupToPlace& op_group_to_place : placer_op_group_workload) - { - for (const string& op_name : op_group_to_place.op_names) - { - if (is_forward_to_backward_epoch_transition(prev_epoch_type, config.op_to_epoch_type.at(op_name)) or - is_backward_to_optimizer_epoch_transition(prev_epoch_type, config.op_to_epoch_type.at(op_name))) - { - op_group_to_place.increment_epoch = true; - prev_epoch_type = config.op_to_epoch_type.at(op_name); - } - } - } -} - -void validate_placer_solution(const PlacerConfig& config, const PlacerSolution& placer_solution) -{ - for (const auto& [name, op_placement] : placer_solution.name_to_op_placement) { - const CoordRange& coord = op_placement.placed_cores; - if (coord.end.row > config.device_grid.rows or coord.end.col > config.device_grid.columns) { - log_fatal("{} placed cores is out of bounds: [{},{}]->[{},{}], device_grid: [{},{}]", - name, - coord.start.row, coord.start.col, coord.end.row, coord.end.col, - config.device_grid.rows, config.device_grid.columns); - } - if (env_as("PYBUDA_NEBULA_GALAXY_PLACER")) - { - if (config.output_queues_on_host and config.output_ops.find(name) != config.output_ops.end()) - { - // TODO: get this from device config when mixed harvesting changes are fully consumed - const uint32_t MAX_NUM_ROWS_UNHARVESTED = 8; - // when PYBUDA_NEBULA_GALAXY_PLACER is enabled, nops are forced onto the Nebula chip which may be two - // row harvested for now assert that the output nop is not spilling onto the last two rows - if (coord.end.row > MAX_NUM_ROWS_UNHARVESTED) - { - log_fatal( - "{} forced onto mmio chip may be out of bounds: [{},{}]->[{},{}], Nebula assumed harvested " - "grid: [{},{}]", - name, - coord.start.row, - coord.start.col, - coord.end.row, - coord.end.col, - MAX_NUM_ROWS_UNHARVESTED, - config.device_grid.columns); - } - } - } - } - - // within an epoch, there should not be any overlap in terms of op placement - const int device_available_rows = config.get_available_rows_on_device(); - - for (const auto& [epoch, op_placements] : placer_solution.epoch_id_to_op_placement) { - // simple way to do the check by coloring in device grid - switch just to do simple boundary checks later - vector> device_grid_for_epoch(device_available_rows, vector(config.device_grid.columns)); - - uint32_t start_id = 1; - unordered_map id_to_string; - - for (const auto& op_placement : op_placements) { - id_to_string[start_id] = op_placement.name; - fill_in_device_grid(placer_solution, device_grid_for_epoch, id_to_string, op_placement, start_id); - start_id += 1; - } - } -} - -PlacerSolution place_onto_chip( - const PlacerConfig& config, - PlacerWorkload& placer_op_group_workload, - uint32_t epoch_start_id, - std::optional epoch_type) -{ - validate_chip_mapping(config, placer_op_group_workload); - validate_placer_inputs(config, placer_op_group_workload); - - if (epoch_type) - { - if (epoch_type.value() == NodeEpochType::Forward) { - log_debug(tt::LogPlacer, "Placing FWD ops..."); - } else if (epoch_type.value() == NodeEpochType::Backward) { - log_debug(tt::LogPlacer, "Placing BWD ops..."); - } else if (epoch_type.value() == NodeEpochType::Optimizer) { - log_debug(tt::LogPlacer, "Placing OPT ops..."); - } - } - else - { - generate_placement_constraints(config, placer_op_group_workload); - } - - - unordered_map name_to_op_placement; - map epoch_id_to_chip; - unordered_map> epoch_id_to_op_placement; - GridShape device_grid_shape(config.get_available_rows_on_device(), 
config.device_grid.columns); - - auto epoch_id_to_device_grid = EpochIdToDeviceGrid(device_grid_shape.rows, device_grid_shape.columns); - unordered_map epoch_id_to_epoch_info; - - uint32_t max_epoch_id_assigned = epoch_start_id; - bool placed = false; - - uint32_t current_epoch_id = epoch_start_id; - vector device_grid_placements; - - std::vector filtered_op_groups; - std::unordered_set visited_op_group_ids; - for (const OpGroupToPlace& op_group : placer_op_group_workload) - { - if ((not epoch_type.has_value()) or (epoch_type and op_group.epoch_type == epoch_type.value())) - { - filtered_op_groups.push_back(op_group); - } - } - - auto placer = EpochDevicePlacer(config); - std::vector placed_epochs = placer.place(filtered_op_groups); - std::map name_to_op_group_id; - std::map name_to_chip_id; - for (const OpGroupToPlace& op_group : placer_op_group_workload) - { - for (const auto& name : op_group.op_names) - { - name_to_op_group_id[name] = op_group.op_group_id; - name_to_chip_id[name] = op_group.chip_id; - } - } - - - for (uint32_t epoch_id = 0; epoch_id < placed_epochs.size(); ++epoch_id) - { - auto& placed_epoch = placed_epochs.at(epoch_id); - epoch_id_to_device_grid.initialize_device_grid(current_epoch_id + epoch_id); - vector op_placements; - for (const auto& device_grid_placement : placed_epoch.op_placements) - { - op_placements.push_back(OpPlacement{ - .id = name_to_op_group_id.at(device_grid_placement.op_name), - .name = device_grid_placement.op_name, - .chip_id = name_to_chip_id.at(device_grid_placement.op_name), // TODO(JCHU): HACK - .global_epoch_id = current_epoch_id + epoch_id, - .grid_transpose = device_grid_placement.grid_transpose, - .placed_cores = device_grid_placement.placed_cores - }); - - GridShape op_grid_shape = config.op_to_grid_shape.at(device_grid_placement.op_name); - if(device_grid_placement.grid_transpose){ - op_grid_shape = GridShape(op_grid_shape.columns, op_grid_shape.rows); - } - epoch_id_to_device_grid.fill_device_grid_with_placement( - current_epoch_id + epoch_id, - device_grid_placement.placed_cores.start, - op_grid_shape); - } - - if (not op_placements.empty()) - { - placed = true; - } - - for (const OpPlacement& op_placement : op_placements) - { - const string& name = op_placement.name; - name_to_op_placement[name] = op_placement; - epoch_id_to_chip[op_placement.epoch_id()] = op_placement.chip_id; - epoch_id_to_op_placement[op_placement.epoch_id()].push_back(op_placement); - max_epoch_id_assigned = std::max(max_epoch_id_assigned, op_placement.epoch_id()); - - log_debug(tt::LogPlacer, "\tPlacing {} with grid_shape ({}, {}) onto:", - op_placement.name, config.op_to_grid_shape.at(name).rows, config.op_to_grid_shape.at(name).columns); - - log_debug(tt::LogPlacer, "\t\t chip_id={}, epoch_id={}, inclusive_start: {}, exclusive_end={}", - op_placement.chip_id, - op_placement.epoch_id(), - op_placement.placed_cores.start, - op_placement.placed_cores.end - ); - } - } - PlacerSolution placer_solution = PlacerSolution{ - .name_to_op_placement = std::move(name_to_op_placement), - .input_queue_to_grid_shape = config.input_queue_to_grid_shape, - .name_to_queue_placement = {}, - .epoch_id_to_chip = std::move(epoch_id_to_chip), - .epoch_id_to_subgraph_index = {}, - .epoch_id_to_op_placement = std::move(epoch_id_to_op_placement), - .epoch_id_to_device_grid = std::move(epoch_id_to_device_grid), - .epoch_id_to_epoch_info = std::move(epoch_id_to_epoch_info), - .num_epochs = placed ? 
(max_epoch_id_assigned - epoch_start_id) + 1 : 0, - }; - - validate_placer_solution(config, placer_solution); - - return placer_solution; -} - - -static std::vector get_chip_id_order( - const ChipIdToPlacerWorkload& chip_to_placer_op_group_workload, - NodeEpochType epoch_type) -{ - vector chip_id_order; - for (auto& [chip_id, placer_workload] : chip_to_placer_op_group_workload) { - chip_id_order.push_back(chip_id); - } - - if (epoch_type == NodeEpochType::Backward) { - std::reverse(std::begin(chip_id_order), std::end(chip_id_order)); - } - return chip_id_order; -} - -static PlacerSolution grayskull_placer(const PlacerConfig& config, const std::vector& scheduled_ops) -{ - ChipIdToPlacerWorkload chip_to_placer_op_group_workload = lowering::generate_placer_workload(config, scheduled_ops); - - uint32_t current_epoch_id = 0; - unordered_map name_to_op_placement; - map epoch_id_to_chip; - unordered_map> epoch_id_to_op_placement; - auto e = EpochIdToDeviceGrid(config.get_available_rows_on_device(), config.device_grid.columns); - unordered_map epoch_id_to_epoch_info; - - // For each chip, assign the op_group_workload to the chip and place it. - for (auto epoch_type : {NodeEpochType::Forward, NodeEpochType::Backward, NodeEpochType::Optimizer}) { - for (ChipId chip_id : get_chip_id_order(chip_to_placer_op_group_workload, epoch_type)) { - auto& placer_workload = chip_to_placer_op_group_workload.at(chip_id); - log_debug(tt::LogPlacer, "############################"); - log_debug(tt::LogPlacer, "Placing OPs onto chip_id: {}", chip_id); - log_debug(tt::LogPlacer, "############################"); - - auto chip_solution = place_onto_chip(config, placer_workload, current_epoch_id, epoch_type); - - name_to_op_placement.insert( - chip_solution.name_to_op_placement.begin(), - chip_solution.name_to_op_placement.end()); - - epoch_id_to_op_placement.insert( - chip_solution.epoch_id_to_op_placement.begin(), - chip_solution.epoch_id_to_op_placement.end()); - - e.epoch_id_to_device_grid.insert( - chip_solution.epoch_id_to_device_grid.epoch_id_to_device_grid.begin(), - chip_solution.epoch_id_to_device_grid.epoch_id_to_device_grid.end()); - - - for (auto epoch_id = current_epoch_id; epoch_id < current_epoch_id + chip_solution.num_epochs; ++epoch_id) - { - epoch_id_to_chip[epoch_id] = chip_id; - epoch_id_to_epoch_info[epoch_id] = EpochInfo{ - .global_epoch_id = epoch_id, - .temporal_epoch_id = epoch_id, - .spatial_epoch_id = 0, - .epoch_type = epoch_type, - }; - } - current_epoch_id += chip_solution.num_epochs; - } - } - - PlacerSolution placer_solution = { - .name_to_op_placement = name_to_op_placement, - .input_queue_to_grid_shape = config.input_queue_to_grid_shape, - .name_to_queue_placement = {}, - .epoch_id_to_chip = epoch_id_to_chip, - .epoch_id_to_subgraph_index = {}, - .epoch_id_to_op_placement = std::move(epoch_id_to_op_placement), - .epoch_id_to_device_grid = std::move(e), - .epoch_id_to_epoch_info = std::move(epoch_id_to_epoch_info), - .num_epochs = current_epoch_id, - .is_pipelined = true, - }; - - for (uint32_t i = 0; i < placer_solution.num_epochs; ++i) { - if (placer_solution.epoch_id_to_op_placement.find(i) == placer_solution.epoch_id_to_op_placement.end()) { - log_fatal(tt::LogPlacer, "Placer: Error found blank/missing epoch_id: {}", i); - } - } - - - return placer_solution; -} - -vector> get_placer_workload_grouped_by_chip_id(ChipIdToPlacerWorkload& chip_to_placer_op_group_workload, NodeEpochType epoch_type) { - vector placer_workload; - - for (auto& [chip_id, op_groups] : 
chip_to_placer_op_group_workload) - { - for (const auto& op_group : op_groups) { - if (op_group.epoch_type == epoch_type) - { - placer_workload.push_back(op_group); - } - } - } - if (placer_workload.empty()) - { - return {}; - } - std::sort(placer_workload.begin(), placer_workload.end(), - [](const OpGroupToPlace& a, const OpGroupToPlace& b) { return a.op_group_id < b.op_group_id; }); - - uint32_t current_chip_id = placer_workload.at(0).chip_id; - uint32_t previous_chip_id = current_chip_id; - vector> placer_workload_grouped_by_chip_id; - - for (const OpGroupToPlace& op_group : placer_workload) { - current_chip_id = op_group.chip_id; - if (placer_workload_grouped_by_chip_id.empty() or previous_chip_id != current_chip_id) { - placer_workload_grouped_by_chip_id.push_back({}); - } - - placer_workload_grouped_by_chip_id.back().push_back(op_group); - previous_chip_id = op_group.chip_id; - } - if (epoch_type == NodeEpochType::Backward) - { - std::reverse(placer_workload_grouped_by_chip_id.begin(), placer_workload_grouped_by_chip_id.end()); - } - - return placer_workload_grouped_by_chip_id; -} - -static bool can_place_epoch_onto_chip(const PlacerConfig& config, const PlacerSolution& chip_solution, uint32_t epoch_id, uint32_t proposed_chip_id) -{ - TT_ASSERT(not config.device_config.chips_with_mmio.empty(), "Expecting at least one chip with MMIO capability."); - for (const auto &placement : chip_solution.epoch_id_to_op_placement.at(epoch_id)) - { - if (config.output_queues_on_host and config.output_ops.find(placement.name) != config.output_ops.end() and - std::find(config.device_config.chips_with_mmio.begin(), config.device_config.chips_with_mmio.end(), proposed_chip_id) == config.device_config.chips_with_mmio.end()) - { - log_debug(tt::LogPlacer, "output op {} not on MMIO chip", placement.name); - return false; - } - if (config.op_to_chip_id_assignment.find(placement.name) != config.op_to_chip_id_assignment.end() and - config.op_to_chip_id_assignment.at(placement.name) != proposed_chip_id) - { - log_debug(tt::LogPlacer, "op {} assigned a chip id {} that is not the proposed chip id {}", placement.name, config.op_to_chip_id_assignment.at(placement.name), proposed_chip_id); - return false; - } - if (auto maybe_chip_id_override = config.get_chip_id_override(placement.name); maybe_chip_id_override) - { - if (maybe_chip_id_override.value() != proposed_chip_id) - { - log_debug(tt::LogPlacer, "op {} has an override chip id {} that is not the proposed chip id {}", placement.name, maybe_chip_id_override.value(), proposed_chip_id); - return false; - } - } - // TODO: generalize to chips to avoid in case there are multichip chips on shelf (e.g. 
Nebula x2) - if (env_as("PYBUDA_NEBULA_GALAXY_PLACER")) - { - if (std::find(config.device_config.chips_with_mmio.begin(), config.device_config.chips_with_mmio.end(), proposed_chip_id) != - config.device_config.chips_with_mmio.end()) - { - if (!config.output_queues_on_host) - { - return false; - } - else if ( - config.output_queues_on_host and config.output_ops.find(placement.name) == config.output_ops.end()) - { - return false; - } - } - } - } - return true; -} - -static bool validate_epoch_placement(const PlacerConfig& config, const PlacerSolution& chip_solution, uint32_t epoch_id) -{ - TT_ASSERT(not config.device_config.chips_with_mmio.empty(), "Expecting at least one chip with MMIO capability."); - std::unordered_set user_assigned_chip_ids; - std::unordered_map op_to_constraint; - for (const auto& placement : chip_solution.epoch_id_to_op_placement.at(epoch_id)) - { - if (auto maybe_chip_id_override = config.get_chip_id_override(placement.name); maybe_chip_id_override) - { - user_assigned_chip_ids.insert(maybe_chip_id_override.value()); - op_to_constraint[placement.name] = maybe_chip_id_override.value(); - } - if (config.op_to_chip_id_assignment.find(placement.name) != config.op_to_chip_id_assignment.end()) - { - user_assigned_chip_ids.insert(config.op_to_chip_id_assignment.at(placement.name)); - op_to_constraint[placement.name] = config.op_to_chip_id_assignment.at(placement.name); - } - } - if (user_assigned_chip_ids.size() > 1) - { - log_fatal("Placer: Error, epoch {} has ops assigned to multiple chips: {}", epoch_id, op_to_constraint); - return false; - } - else if (user_assigned_chip_ids.size() == 1) - { - uint32_t user_assigned_chip_id = *user_assigned_chip_ids.begin(); - - // check that that is no conflict with constraints configured and the output op - for (const auto& placement : chip_solution.epoch_id_to_op_placement.at(epoch_id)) - { - if (config.output_queues_on_host and config.output_ops.find(placement.name) != config.output_ops.end() and - std::find(config.device_config.chips_with_mmio.begin(), config.device_config.chips_with_mmio.end(), user_assigned_chip_id) == - config.device_config.chips_with_mmio.end()) - { - log_fatal( - "Placer: User has defined constraints on the ops: {} but there is an output op: {} on the same " - "epoch that must be assigned to an MMIO capable chip", - op_to_constraint, - placement.name); - return false; - } - } - } - else - { - // no user assigned constraints so we are done - return true; - } - - return true; -} - -static std::tuple -advance_epoch( - const std::vector& chip_ids, - bool placing_forward, - bool is_fwd_chip_direction, - std::uint32_t current_chip_index, - std::uint32_t current_temporal_epoch_id, - std::uint32_t current_spatial_epoch_id) -{ - std::uint32_t next_chip_index = current_chip_index; - std::uint32_t next_temporal_epoch_id = current_temporal_epoch_id; - std::uint32_t next_spatial_epoch_id = current_spatial_epoch_id + 1; - - // Snake the chip assignments so it's likely the first spatial epoch of a new temporal epoch - // reads activations from its own DRAM - if (env_as("PYBUDA_PLACER_SNAKE")) - { - if (placing_forward) { - if (next_chip_index == (chip_ids.size() - 1)) { - placing_forward = false; - next_spatial_epoch_id = 0; - next_temporal_epoch_id++; - } - else { - next_chip_index++; - } - } else { - if (next_chip_index == 0) { - placing_forward = true; - next_spatial_epoch_id = 0; - next_temporal_epoch_id++; - } - else { - next_chip_index--; - } - } - } - else { - bool wrap; - - if (is_fwd_chip_direction) { - 
next_chip_index++; - wrap = (next_chip_index >= chip_ids.size()); - } else { - wrap = (next_chip_index == 0); - if (!wrap) next_chip_index--; - } - if (wrap) { - next_chip_index = is_fwd_chip_direction ? 0 : chip_ids.size() - 1; - next_spatial_epoch_id = 0; - next_temporal_epoch_id++; - } - } - return {placing_forward, next_chip_index, next_temporal_epoch_id, next_spatial_epoch_id}; -} - -static PlacerSolution wormhole_placer(const PlacerConfig& config, const std::vector& scheduled_ops) -{ - log_debug(LogPlacer, "schedule {}", scheduled_ops); - - unordered_map name_to_op_placement; - map epoch_id_to_chip; - unordered_map> epoch_id_to_op_placement; - auto e = EpochIdToDeviceGrid(config.get_available_rows_on_device(), config.device_grid.columns); - unordered_map epoch_id_to_type; - unordered_map epoch_id_to_epoch_info; - - std::vector placer_workload = lowering::generate_wormhole_placer_workload(config, scheduled_ops); - - uint32_t current_epoch_id = 0; - uint32_t current_temporal_epoch_id = 0; - - log_debug(LogPlacer, "WH Fracturing Constraints: {}", config.op_to_chip_id_assignment); - - for (const auto& [op, override] : config.op_to_overrides) - { - if (override.chip_id.has_value()) - { - log_debug(LogPlacer, "WH Override: {}, {}", op, override.chip_id.value()); - } - } - - std::vector chip_ids = lowering::apply_chip_placement_policy(config.device_config, config.chip_placement_policy, config.chip_ids); - - for (auto epoch_type : {NodeEpochType::Forward, NodeEpochType::Backward, NodeEpochType::Optimizer}) - { - uint32_t starting_epoch_id = current_epoch_id; - auto chip_solution = place_onto_chip(config, placer_workload, current_epoch_id, epoch_type); - name_to_op_placement.insert( - chip_solution.name_to_op_placement.begin(), - chip_solution.name_to_op_placement.end()); - - current_epoch_id += chip_solution.num_epochs; - - // Everything's placed on one chip, but we need to split across available chips - bool is_fwd_chip_direction = epoch_type == NodeEpochType::Forward or epoch_type == NodeEpochType::Optimizer; - std::uint32_t current_chip_index = is_fwd_chip_direction ? 
0 : chip_ids.size() - 1; - std::uint32_t current_spatial_epoch_id = 0; - bool placing_forward = true; - bool enable_pipelined_placement = env_as("PYBUDA_WORMHOLE_PIPELINED_PLACER"); - - std::uint32_t num_epochs_placed_on_chip = 0; - for (std::uint32_t epoch = starting_epoch_id; epoch < current_epoch_id; epoch++) - { - validate_epoch_placement(config, chip_solution, epoch); - - bool valid_chip_assignment = true; - // With snaking chip assignment we need one more attempt to account for transition between directions - for (std::size_t attempt = 0; attempt < (2*chip_ids.size()+1); ++attempt) - { - std::uint32_t current_chip_id = chip_ids[current_chip_index]; - valid_chip_assignment = can_place_epoch_onto_chip(config, chip_solution, epoch, current_chip_id); - - if (valid_chip_assignment) - { - for (auto &placement : chip_solution.epoch_id_to_op_placement.at(epoch)) - { - placement.chip_id = current_chip_id; - name_to_op_placement[placement.name].chip_id = current_chip_id; - } - - epoch_id_to_op_placement.insert( - chip_solution.epoch_id_to_op_placement.begin(), - chip_solution.epoch_id_to_op_placement.end()); - - e.epoch_id_to_device_grid.insert( - chip_solution.epoch_id_to_device_grid.epoch_id_to_device_grid.begin(), - chip_solution.epoch_id_to_device_grid.epoch_id_to_device_grid.end()); - - epoch_id_to_chip[epoch] = current_chip_id; - epoch_id_to_type[epoch] = epoch_type; - - epoch_id_to_epoch_info[epoch] = EpochInfo{ - .global_epoch_id = (uint32_t)epoch, - .temporal_epoch_id = (uint32_t)current_temporal_epoch_id, - .spatial_epoch_id = (uint32_t)(current_spatial_epoch_id % chip_ids.size()), - .epoch_type = epoch_type - }; - num_epochs_placed_on_chip++; - } - - if (enable_pipelined_placement) - { - std::uint32_t num_epochs_per_chip = std::ceil(float(chip_solution.num_epochs) / chip_ids.size()); - if (not valid_chip_assignment or (num_epochs_placed_on_chip >= num_epochs_per_chip)) - { - std::tie(placing_forward, current_chip_index, current_temporal_epoch_id, current_spatial_epoch_id) = - advance_epoch(chip_ids, placing_forward, is_fwd_chip_direction, current_chip_index, current_temporal_epoch_id, current_spatial_epoch_id); - num_epochs_placed_on_chip = 0; - } - } - else - { - std::tie(placing_forward, current_chip_index, current_temporal_epoch_id, current_spatial_epoch_id) = - advance_epoch(chip_ids, placing_forward, is_fwd_chip_direction, current_chip_index, current_temporal_epoch_id, current_spatial_epoch_id); - - } - - if (valid_chip_assignment) - { - break; - } - } - TT_LOG_ASSERT(valid_chip_assignment, "Invalid chip assignment for temporal epoch {} {}, spatial epoch {}, chip {}", epoch_type, current_temporal_epoch_id, current_spatial_epoch_id, current_chip_index); - } - log_debug(tt::LogPlacer, "Placing {} epochs onto chip_id: {}", chip_solution.num_epochs, 0); - } - - - int current_temporal_epoch = 0; - map chip_id_to_spatial_epoch_index; - for (uint32_t i = 0; i < config.chip_ids.size(); ++i) { - chip_id_to_spatial_epoch_index[config.chip_ids[i]] = i; - } - - map chip_to_current_temporal_epoch; - map> temporal_epoch_id_to_spatial_epochs; - map temporal_epoch_id_to_epoch_type; - - log_debug(tt::LogPlacer, "## Wormhole Placement Summary ##"); - NodeEpochType prev_epoch_type = NodeEpochType::Forward; - for (const auto& [epoch_id, chip_id] : epoch_id_to_chip) { - if (chip_to_current_temporal_epoch.find(chip_id) != chip_to_current_temporal_epoch.end()) { - int last_recorded_temporal_epoch = chip_to_current_temporal_epoch[chip_id]; - current_temporal_epoch = std::max(current_temporal_epoch, 
last_recorded_temporal_epoch + 1); - } - bool is_new_temporal_epoch_requested = false; - for (const auto &placement : epoch_id_to_op_placement.at(epoch_id)) - { - if (config.ops_tagged_for_temporal_epoch_break.find(placement.name) != config.ops_tagged_for_temporal_epoch_break.end()) - { - is_new_temporal_epoch_requested = true; - } - } - if (epoch_id_to_type[epoch_id] != prev_epoch_type) - { - is_new_temporal_epoch_requested = true; - } - prev_epoch_type = epoch_id_to_type[epoch_id]; - - if (is_new_temporal_epoch_requested and temporal_epoch_id_to_spatial_epochs[current_temporal_epoch].size() > 0) - { - // Since there are already op-placements on this current temporal epoch, - // and a new temporal epoch is requested, we'll just increment - current_temporal_epoch += 1; - } - - int current_spatial_epoch = chip_id_to_spatial_epoch_index.at(chip_id); - epoch_id_to_epoch_info[epoch_id] = EpochInfo{ - .global_epoch_id = (uint32_t)epoch_id, - .temporal_epoch_id = (uint32_t)current_temporal_epoch, - .spatial_epoch_id = (uint32_t)current_spatial_epoch, - .epoch_type = epoch_id_to_type.at(epoch_id) - }; - log_debug(tt::LogPlacer, "Epoch: {}, Chip: {}, Temporal Epoch: {}, Spatial Epoch: {}", - epoch_id, chip_id, current_temporal_epoch, current_spatial_epoch); - - chip_to_current_temporal_epoch[chip_id] = current_temporal_epoch; - temporal_epoch_id_to_spatial_epochs[current_temporal_epoch].insert(current_spatial_epoch); - temporal_epoch_id_to_epoch_type[current_temporal_epoch] = epoch_id_to_type[epoch_id]; - } - - for (std::uint32_t temporal_epoch_id = 0; temporal_epoch_id < temporal_epoch_id_to_epoch_type.size(); ++temporal_epoch_id) - { - const auto& spatial_epochs = temporal_epoch_id_to_spatial_epochs[temporal_epoch_id]; - for (uint32_t spatial_epoch_index = 0; spatial_epoch_index < config.chip_ids.size(); ++spatial_epoch_index) - { - if (spatial_epochs.find(spatial_epoch_index) == spatial_epochs.end()) - { - // need to insert empty epochs - // NB: assume temporal epoch should share the same epoch-type - int global_epoch_id = current_epoch_id++; - - epoch_id_to_chip[global_epoch_id] = config.chip_ids.at(spatial_epoch_index); - epoch_id_to_op_placement[global_epoch_id] = {}; - e.epoch_id_to_device_grid[global_epoch_id] = {}; - epoch_id_to_epoch_info[global_epoch_id] = EpochInfo{ - .global_epoch_id = (uint32_t)global_epoch_id, - .temporal_epoch_id = (uint32_t)temporal_epoch_id, - .spatial_epoch_id = (uint32_t)spatial_epoch_index, - .epoch_type = temporal_epoch_id_to_epoch_type.at(temporal_epoch_id) - }; - - log_debug(tt::LogPlacer, "Inserting Empty Epoch: {}, Chip: {}, Temporal Epoch: {}, Spatial Epoch: {}", - global_epoch_id, config.chip_ids.at(spatial_epoch_index), temporal_epoch_id, spatial_epoch_index); - } - } - } - - // if user has defined manual configuration for `place_on_new_chip`, the multichip - // wormhole placement is configured to be pipelined - bool is_pipelined = not config.ops_tagged_for_chip_id_break.empty(); - - PlacerSolution placer_solution = { - .name_to_op_placement = name_to_op_placement, - .input_queue_to_grid_shape = config.input_queue_to_grid_shape, - .name_to_queue_placement = {}, - .epoch_id_to_chip = epoch_id_to_chip, - .epoch_id_to_subgraph_index = {}, - .epoch_id_to_op_placement = std::move(epoch_id_to_op_placement), - .epoch_id_to_device_grid = std::move(e), - .epoch_id_to_epoch_info = std::move(epoch_id_to_epoch_info), - .num_epochs = current_epoch_id, - .is_pipelined = is_pipelined, - }; - - return placer_solution; -} - - -PlacerSolution galaxy_placer(const 
PlacerConfig &config, const std::vector &scheduled_ops) -{ - - // Group ops into fwd, bwd, grad, opt - std::unordered_map> op_megagroup; - - bool split_grad = not env_as("PYBUDA_GALAXY_PLACER_COMBINE_GRAD"); - bool split_recompute = not env_as("PYBUDA_GALAXY_PLACER_COMBINE_RECOMPUTE"); - - for (auto op_name : scheduled_ops) - { - NodeEpochType epoch_type = config.op_to_epoch_type.at(op_name); - bool is_gradient_op = config.op_to_grad_op.at(op_name); - //bool is_recompute_op = config.op_to_recompute_op.at(op_name); - // TODO - bool is_recompute_op = false; - - if (epoch_type == NodeEpochType::Forward) - op_megagroup["fwd"].push_back(op_name); - else if (epoch_type == NodeEpochType::Optimizer) - op_megagroup["opt"].push_back(op_name); - else { - // bwd - if (split_recompute && is_recompute_op) - op_megagroup["rcmp"].push_back(op_name); - else if (split_grad && is_gradient_op) - op_megagroup["grad"].push_back(op_name); - else - op_megagroup["bwd"].push_back(op_name); - } - } - - std::unordered_map name_to_op_placement; - std::map epoch_id_to_chip; - std::unordered_map> epoch_id_to_op_placement; - auto e = EpochIdToDeviceGrid(config.get_available_rows_on_device(), config.device_grid.columns); - std::unordered_map epoch_id_to_epoch_info; - - std::uint32_t current_epoch_id = 0; - std::uint32_t current_temporal_epoch_id = 0; - - for (auto &type : std::vector{"fwd", "rcmp", "bwd", "grad", "opt"}) - { - if (op_megagroup[type].size() == 0) - continue; - - bool chip_direction = (type == "fwd") || (type == "rcmp"); // incrementing chip IDs - - std::uint32_t current_spatial_epoch_id = 0; - std::vector placer_workload = lowering::generate_simple_placer_workload(config, op_megagroup[type]); - std::uint32_t starting_epoch_id = current_epoch_id; - auto chip_solution = place_onto_chip(config, placer_workload, current_epoch_id); - current_epoch_id += chip_solution.num_epochs; - - // Everything's placed on one chip, but we need to split across available chips - std::uint32_t current_chip_index = chip_direction ? 0 : config.chip_ids.size() - 1; - for (std::uint32_t epoch = starting_epoch_id; epoch < current_epoch_id; epoch++) - { - std::uint32_t current_chip_id = config.chip_ids[current_chip_index]; - for (auto &placement : chip_solution.epoch_id_to_op_placement.at(epoch)) - placement.chip_id = current_chip_id; - - name_to_op_placement.insert( - chip_solution.name_to_op_placement.begin(), - chip_solution.name_to_op_placement.end()); - - for (auto &placement : chip_solution.epoch_id_to_op_placement.at(epoch)) - name_to_op_placement[placement.name].chip_id = current_chip_id; - - epoch_id_to_op_placement.insert( - chip_solution.epoch_id_to_op_placement.begin(), - chip_solution.epoch_id_to_op_placement.end()); - - e.epoch_id_to_device_grid.insert( - chip_solution.epoch_id_to_device_grid.epoch_id_to_device_grid.begin(), - chip_solution.epoch_id_to_device_grid.epoch_id_to_device_grid.end()); - - epoch_id_to_chip[epoch] = current_chip_id; - NodeEpochType epoch_type = (type == "fwd") ? NodeEpochType::Forward : - (type == "opt") ? 
NodeEpochType::Optimizer : - NodeEpochType::Backward; - - epoch_id_to_epoch_info[epoch] = EpochInfo{ - .global_epoch_id = (uint32_t)epoch, - .temporal_epoch_id = (uint32_t)current_temporal_epoch_id, - .spatial_epoch_id = (uint32_t)(current_spatial_epoch_id % config.chip_ids.size()), - .epoch_type = epoch_type - }; - - if (epoch < current_epoch_id - 1) { - current_spatial_epoch_id++; - - bool wrap; - - if (chip_direction) { - current_chip_index++; - wrap = (current_chip_index >= config.chip_ids.size()); - } else { - wrap = (current_chip_index == 0); - if (!wrap) current_chip_index--; - } - if (wrap) { - current_chip_index = chip_direction ? 0 : config.chip_ids.size() - 1; - current_spatial_epoch_id = 0; - current_temporal_epoch_id ++; - } - } - } - - current_temporal_epoch_id++; - } - - PlacerSolution placer_solution = { - .name_to_op_placement = name_to_op_placement, - .input_queue_to_grid_shape = config.input_queue_to_grid_shape, - .name_to_queue_placement = {}, - .epoch_id_to_chip = epoch_id_to_chip, - .epoch_id_to_subgraph_index = {}, - .epoch_id_to_op_placement = std::move(epoch_id_to_op_placement), - .epoch_id_to_device_grid = std::move(e), - .epoch_id_to_epoch_info = std::move(epoch_id_to_epoch_info), - .num_epochs = current_epoch_id, - .is_pipelined = false, - }; - - return placer_solution; -} - - -PlacerSolution placer(const PlacerConfig& config, const vector& scheduled_ops) -{ - lowering::validate_placer_config(config); - - // TODO: expose as config... for now, quick testing through env variable - if (env_as("PYBUDA_GALAXY_PLACER")) - { - return galaxy_placer(config, scheduled_ops); - } - - if (config.device_config.arch_name == "grayskull") { - return grayskull_placer(config, scheduled_ops); - } else { - TT_ASSERT((config.device_config.arch_name == "wormhole" || config.device_config.arch_name == "wormhole_b0"), "Placer Failed: Unknown device arch name."); - return wormhole_placer(config, scheduled_ops); - } -} - -void place_on_new_epoch(PlacerConfig& config, const string& op_name) -{ - config.ops_tagged_for_epoch_break.insert(op_name); -} - -void place_on_new_chip(PlacerConfig& config, const string& op_name) -{ - config.ops_tagged_for_chip_id_break.insert(op_name); -} - - -void dump_placer_solution_json_to_file(const PlacerSolution& solution) -{ - json placer_solution_json = solution.to_json(); - - const string DEFAULT_FILEPATH = "placement.json"; - ofstream o(DEFAULT_FILEPATH); - o << setw(4) << placer_solution_json; - o.close(); - -} - -std::ostream& operator<<(std::ostream& os, const Coord& coord) -{ - os << "Coord{"; - os << ".row= " << coord.row << ", "; - os << ".col= " << coord.col << ", "; - os << "}"; - return os; -} - -std::ostream& operator<<(std::ostream& os, const CoordRange& coord_range) -{ - os << "CoordRange{"; - os << ".start= " << coord_range.start << ", "; - os << ".end= " << coord_range.end << ", "; - os << "}"; - return os; -} - -std::ostream& operator<<(std::ostream& os, const PlacerOpOverride& override) -{ - os << "PlacerOpOverride{"; - os << ".transpose= " << override.transpose_op << ", "; - if (override.grid_start.has_value()) - { - os << ".grid_start= " << override.grid_start.value() << ", "; - } - else - { - os << ".grid_start= " << "None" << ", "; - } - if (override.chip_id.has_value()) - { - os << ".chip_id= " << override.chip_id.value() << ", "; - } - else - { - os << ".chip_id= " << "None" << ", "; - } - os << ".temporal_epoch_break= " << (override.temporal_epoch_break ? 
"true" : "false") << ", "; - - os << "}"; - return os; -} - -std::unordered_map match_op_names_to_placer_overrides( - graphlib::Graph* graph, - std::vector, placer::PlacerOpOverride>> const& - predicates_to_overrides) -{ - std::unordered_map op_names_to_placer_overrides; - auto is_op_node = graphlib::query::predicate_op_node_type(); - for (auto const& [string_or_predicate, override] : predicates_to_overrides) - { - if (std::string const* s = std::get_if(&string_or_predicate)) - { - op_names_to_placer_overrides[*s] = override; - } - else if (graphlib::query::NodePredicate const* p = std::get_if(&string_or_predicate)) - { - for (graphlib::Node* node : graphlib::query::filter_nodes(graph, *p & is_op_node)) - { - if (op_names_to_placer_overrides.find(node->name()) != op_names_to_placer_overrides.end()) - log_fatal("Overlapping placer override predicates for node: {}", node->name()); - op_names_to_placer_overrides[node->name()] = override; - } - } - } - return op_names_to_placer_overrides; -} - -std::vector> match_op_names_to_breaks( - graphlib::Graph* graph, const PredicatesToBreaks& predicates_to_breaks) -{ - std::vector> op_names_to_breaks; - op_names_to_breaks.reserve(predicates_to_breaks.size()); - auto is_op_node = graphlib::query::predicate_op_node_type(); - for (auto const& outer : predicates_to_breaks) - { - if (auto* p = std::get_if(&outer)) - { - for (graphlib::Node* node : graphlib::query::filter_nodes(graph, *p & is_op_node)) - { - op_names_to_breaks.push_back({node->name()}); - } - } - else if ( - auto* sublist = std::get_if>>(&outer)) - { - op_names_to_breaks.emplace_back(); - auto& back = op_names_to_breaks.back(); - back.reserve(sublist->size()); - for (auto const& elem : *sublist) - { - if (std::string const* s = std::get_if(&elem)) - { - back.push_back(*s); - } - else if (graphlib::query::NodePredicate const* p = std::get_if(&elem)) - { - for (graphlib::Node* node : graphlib::query::filter_nodes(graph, *p & is_op_node)) - { - back.push_back(node->name()); - } - } - } - } - } - return op_names_to_breaks; -} - -} // namespace placer -} // namespace tt diff --git a/pybuda/csrc/placer/placer.hpp b/pybuda/csrc/placer/placer.hpp deleted file mode 100644 index fe6d3f545..000000000 --- a/pybuda/csrc/placer/placer.hpp +++ /dev/null @@ -1,452 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -#include "backend_api/device_config.hpp" -#include "graph_lib/defines.hpp" -#include "graph_lib/query.hpp" -#include "third_party/json/json_fwd.hpp" - -using NodeEpochType = tt::graphlib::NodeEpochType; - -using std::map; -using std::set; -using std::string; -using std::uint32_t; -using std::unordered_map; -using std::unordered_set; -using std::vector; -using json = nlohmann::json; - -namespace tt { -namespace placer { - - -/* - ____ _ ____ _ _ - | _ \ __ _| |_ __ _ / ___|| |_ _ __ _ _ ___| |_ _ _ _ __ ___ ___ - | | | |/ _` | __/ _` | \___ \| __| '__| | | |/ __| __| | | | '__/ _ \/ __| - | |_| | (_| | || (_| | ___) | |_| | | |_| | (__| |_| |_| | | | __/\__ \ - |____/ \__,_|\__\__,_| |____/ \__|_| \__,_|\___|\__|\__,_|_| \___||___/ - -*/ - -struct CoordOffset -{ - uint32_t row_offset; - uint32_t column_offset; -}; - -struct GridShape -{ - uint32_t rows = 0; - uint32_t columns = 0; - - GridShape() = default; - GridShape(uint32_t rows, uint32_t columns) : rows(rows), columns(columns) {}; - std::uint32_t volume() const { return rows * columns; } - GridShape 
transposed() const { return GridShape(columns, rows); } - - static GridShape from_array(std::array array); -}; - -struct Coord -{ - uint32_t row = 0; - uint32_t col = 0; - Coord operator+(const GridShape &rhs) const; - Coord operator+(const CoordOffset &rhs) const; - Coord operator+(const Coord &rhs) const; - bool operator< (const Coord &rhs) const; - bool operator== (const Coord &rhs) const; - bool operator!= (const Coord &rhs) const; - - json to_json() const; - std::array as_array() const; -}; - -struct CoordRange -{ - // Contiguous range of core coordinates from top-left(start) to bottom-right(end) - Coord start; // inclusive - Coord end; // exclusive - - uint32_t size_r() const { return end.row - start.row; } - uint32_t size_c() const { return end.col - start.col; } - bool operator==(const CoordRange &rhs) const; - bool operator!=(const CoordRange &rhs) const; - json to_json() const; - - void transpose() - { - uint32_t r = size_r(); - end.row = start.row + size_c(); - end.col = start.col + r; - } -}; - -struct PlacerOpOverride -{ - std::optional grid_start = std::nullopt; - bool transpose_op = false; - std::optional chip_id = std::nullopt; - bool temporal_epoch_break = false; - - PlacerOpOverride() = default; - PlacerOpOverride( - std::optional> start, - bool transpose_op, - std::optional chip_id, - bool temporal_epoch_break = false) : - transpose_op(transpose_op), chip_id(chip_id), temporal_epoch_break(temporal_epoch_break) - { - if (start.has_value()) - { - std::array start_array = start.value(); - this->grid_start = Coord{.row = start_array[0], .col = start_array[1]}; - } - } - PlacerOpOverride( - std::optional start, - bool transpose_op, - std::optional chip_id, - bool temporal_epoch_break = false) : - grid_start(start), transpose_op(transpose_op), chip_id(chip_id), temporal_epoch_break(temporal_epoch_break) - { - } - - static PlacerOpOverride force_op_transpose() - { - std::optional start = std::nullopt; - return PlacerOpOverride(start, true, std::nullopt, false); - } - - static PlacerOpOverride override_chip_id(int chip_id) - { - std::optional start = std::nullopt; - return PlacerOpOverride(start, false, chip_id, false); - } - - bool operator==(const PlacerOpOverride& rhs) const - { - return (grid_start == rhs.grid_start) && (transpose_op == rhs.transpose_op) && (chip_id == rhs.chip_id) && - (temporal_epoch_break == rhs.temporal_epoch_break); - } -}; - -enum PlacementStrategy -{ - // PlacementStrategy controls how we place a sequence of ops. - - // Extend and implement for different placement strategies - LeftToRight = 0, // Place left-to-right on each new row -}; -enum PlacementScheduleOrder -{ - // PlacementSchedule controls the sequence order of the ops we place. - // By default, we place based on topological ordering of the nodes - Topological, -}; - -enum class ChipPlacementPolicy; - -struct PlacerConfig -{ - // Arch config - std::vector chip_ids; - tt::placer::ChipPlacementPolicy chip_placement_policy; - const DeviceConfig& device_config; - GridShape device_grid; - bool contains_recompute = false; - bool output_queues_on_host = true; - - // a list of row_indices (range defined by logical coordinates), defining the harvested rows - // in other words, placer should skip placing ops on these rows. 
- vector harvested_rows = {}; - - // Placer config toggling strategies/behaviors of different automatic placements - PlacementStrategy strategy = PlacementStrategy::LeftToRight; - - unordered_map op_to_grid_shape; - unordered_map input_queue_to_grid_shape; - - // Capture any user or op-specific config for placement - // like chip-breaks or epoch-breaks - unordered_map op_to_epoch_type; - unordered_map op_to_grad_op; // set for gradient accumulation ops - unordered_map op_to_recompute_op; - - // captures any user-configuration for chip-breaking - unordered_set ops_tagged_for_chip_id_break; - unordered_set ops_tagged_for_epoch_break; - unordered_set ops_tagged_for_temporal_epoch_break; // WH and legacy-placer specific - - unordered_map> fwd_to_bwd_nodes; - unordered_map>> fwd_to_opt_nodes; - unordered_set output_ops = {}; - unordered_map op_to_chip_id_assignment; - unordered_map op_to_overrides; - - bool enable_auto_transposing_placement = false; - - // methods - uint32_t get_available_rows_on_device() const; - uint32_t get_chip_id(const string& op_name) const; - std::optional get_chip_id_override(const string& op_name) const; -}; - -struct PlacerConfigUpdate -{ - unordered_map op_to_chip_id_assignment; - vector> op_names_to_chip_break; - vector> op_names_to_epoch_break; - - PlacerConfigUpdate( - const unordered_map& op_to_chip_id_assignment, - const vector>& op_names_to_chip_break, - const vector>& op_names_to_epoch_break) : - op_to_chip_id_assignment(op_to_chip_id_assignment), - op_names_to_chip_break(op_names_to_chip_break), - op_names_to_epoch_break(op_names_to_epoch_break) - { - } -}; - - -// The struct capturing the decision made by the placer for how to place an op. -// This struct defines the atomic unit of work for the Placer. -// -// This captures one or more ops to be placed TOGETHER in the same epoch/chip. -// This simplifies things so placer only needs to worry about placing one OpGroupToPlace at a time, -// instead of doing look-aheads to make sure we're still conforming to constraints -// -// Consider the following cases: -// 1. tilize/untilize unaries needing to be placed with its producer op -// 2. any user-defined groupings for the op (user: "I want to place ops {A, B, C} in the same epoch") -// 3. triplet placement -struct OpGroupToPlace -{ - static uint32_t current_op_group_id; // assigned based on placement order - - uint32_t op_group_id; // assigned based on placement order - vector op_names; - unordered_map op_name_to_relative_offset_from_first_op; - uint32_t chip_id = 0; - bool increment_epoch = false; - - NodeEpochType epoch_type = NodeEpochType::Forward; - static uint32_t get_next_op_group_id(); -}; - -struct EpochInfo -{ - uint32_t global_epoch_id; // globally unique across time/space/chip - uint32_t temporal_epoch_id; // epoch timestep where multiple spatially arranged chips may be executing concurrently - uint32_t spatial_epoch_id; // within a temporal_epoch_id, the linearized id defining the spatial index. for grayskull, this is always zero. - - NodeEpochType epoch_type; -}; - -inline bool operator<(const EpochInfo& lhs, const EpochInfo& rhs) -{ - //return lhs.global_epoch_id < rhs.global_epoch_id; - if (lhs.temporal_epoch_id == rhs.temporal_epoch_id) { - return lhs.spatial_epoch_id < rhs.spatial_epoch_id; - } - return lhs.temporal_epoch_id < rhs.temporal_epoch_id; -} - -// The struct capturing the decision made by the placer for how to place an op. 
-struct OpPlacement -{ - uint32_t id = 0; - string name; - uint32_t chip_id; - uint32_t global_epoch_id; // globally unique across time/space/chip - bool grid_transpose; - - // Future: For initial implementation, no fracturing support. `placed_cores` will only - // have a single element in the vector. - CoordRange placed_cores; - - // methods - uint32_t epoch_id() const { return global_epoch_id; } - bool operator==(const OpPlacement& rhs) const; - bool operator!=(const OpPlacement& rhs) const; - json to_json() const; -}; - -// Placement information for a single buffer in DRAM queue, placed on one dram channel -struct QueueBufferPlacement -{ - uint32_t dram_channel; - uint32_t dram_address; - - // Not strictly needed to set placement, but convenient to have here - Coord dram_channel_location; - uint32_t buffer_size; - - // methods - json to_json() const; -}; - -struct QueueHostBufferPlacement -{ - uint32_t channel; - uint32_t address; - uint32_t buffer_size; - - // methods - json to_json() const; -}; - -// Placement information for a DRAM queue, split over some number of channels -struct QueuePlacement -{ - string name; - string input_name; - GridShape grid_shape; - bool on_host; - uint32_t chip_id; - std::vector dram_buffers; - std::vector host_buffers; - bool read_only = false; - bool write_only = false; - int write_stride = -1; - - // If dynamic, this indicates when queue will be allocated/deallocated - int epoch_allocate = -1; - int epoch_deallocate = -1; - - // methods - json to_json() const; -}; - -// The final returned struct out of the Placer module will have fully populated attributes -using DeviceGrid = vector>; - -struct EpochIdToDeviceGrid -{ - uint32_t rows = 0; - uint32_t columns = 0; - unordered_map epoch_id_to_device_grid; - unordered_map op_to_constraints; - - EpochIdToDeviceGrid() : rows(0), columns(0) {} - EpochIdToDeviceGrid(uint32_t rows, uint32_t columns) : rows(rows), columns(columns) {} - EpochIdToDeviceGrid(const std::pair& grid_pair) : rows(grid_pair.first), columns(grid_pair.second) {} - - void initialize_device_grid(uint32_t epoch_id, bool clear_existing = false); - void initialize_device_grid(uint32_t candidate_epoch_id, uint32_t rows, uint32_t columns); - void initialize_device_grid(uint32_t epoch_id, const DeviceGrid& device_grid); - bool contains_empty_grid(uint32_t epoch_id) ; - bool satisfies_constraints(const std::string& op_name, const Coord& start, const GridShape& shape) const; - bool can_place_on_device_grid(const std::string& op_name, int epoch_id, const Coord& start, const GridShape& shape); - void fill_device_grid_with_placement(int epoch_id, const Coord& op_start, const GridShape& op_grid_shape); - uint32_t get_current_epoch_id() const; - const DeviceGrid& get_device_grid(uint32_t epoch_id) const; - std::optional get_next_grid_coordinate(const std::string& op_name, uint32_t epoch_id, const GridShape& op_grid_shape) const; - - void add_constraints(const std::unordered_map& constraints); -}; -struct PlacerSolution -{ - using EpochId = int; - unordered_map name_to_op_placement; - unordered_map input_queue_to_grid_shape; - unordered_map name_to_queue_placement; - map epoch_id_to_chip; - map epoch_id_to_subgraph_index; - unordered_map> epoch_id_to_op_placement; - EpochIdToDeviceGrid epoch_id_to_device_grid; - unordered_map epoch_id_to_epoch_info; - uint32_t num_epochs = 0; - bool is_pipelined = true; - bool fork_join_buffered = false; - - // methods - json to_json() const; - - uint32_t chip_id(const std::string& op_name) const; - - // Globally unique 
across chips - uint32_t epoch_id(const std::string& op_name) const; - - const EpochInfo& epoch_info(uint32_t global_epoch_id) const; - - // These methods are really only relevant for wormhole. - // For grayskull, temporal_epoch_id == epoch_id - uint32_t temporal_epoch_id(const std::string& op_name) const; - uint32_t temporal_epoch_id(uint32_t global_epoch_id) const; - uint32_t num_temporal_epochs() const; - uint32_t num_temporal_epochs(NodeEpochType type) const; - NodeEpochType epoch_type(uint32_t global_epoch_id) const; - - void merge(PlacerSolution &other); - bool is_placed(const std::string& op_name) const; - -}; - -/* - ____ _ _ ____ ___ - | _ \| | __ _ ___ ___ _ __ / \ | _ \_ _|___ - | |_) | |/ _` |/ __/ _ \ '__| / _ \ | |_) | |/ __| - | __/| | (_| | (_| __/ | / ___ \| __/| |\__ \ - |_| |_|\__,_|\___\___|_| /_/ \_\_| |___|___/ -*/ - -// Placer Manipulation APIs: Convenience methods to update PlacerConfig with user-constraints -void place_on_new_epoch(PlacerConfig& config, const string& op_name); -void place_on_new_chip(PlacerConfig& config, const string& op_name); -void dump_placer_solution_json_to_file(const PlacerSolution& solution); - - -// *Main Entrypoints* from placer lowering -// -// Intentionally not introducing gstate or tt_graph into these APIs for the placer-module -// to decouple from all that state. -// Placer just receives a schedule and config and is responsible for generating op placements -// -// Given a list of scheduled ops, and the PlacerConfig::PlacementStrategy, iterate through -// the list placing each op one at a time. - -using ChipId = uint32_t; -using PlacerWorkload = vector; -using ChipIdToPlacerWorkload = map; - -PlacerSolution place_onto_chip( - const PlacerConfig& config, - PlacerWorkload& placer_op_group_workload, - uint32_t epoch_start_id = 0, - std::optional epoch_type = std::nullopt); - -PlacerSolution placer(const PlacerConfig& config, const vector& scheduled_ops); - -std::ostream& operator<<(std::ostream& os, const Coord& coord); -std::ostream& operator<<(std::ostream& os, const CoordRange& coord_range); -std::ostream& operator<<(std::ostream& os, const PlacerOpOverride& override); - -// Expand predicates into a map of all matched node names -std::unordered_map match_op_names_to_placer_overrides( - graphlib::Graph* graph, - std::vector, placer::PlacerOpOverride>> const& - predicates_to_overrides); - -using PredicatesToBreaks = std::vector>, - graphlib::query::NodePredicate>>; - -// Expand predicates into a list of all matched node names -std::vector> match_op_names_to_breaks( - graphlib::Graph* graph, const PredicatesToBreaks& predicates_to_breaks); -} // end namespace placer -} // end namespace tt diff --git a/pybuda/csrc/placer/post_epoch_passes.cpp b/pybuda/csrc/placer/post_epoch_passes.cpp deleted file mode 100644 index 7d850b6c1..000000000 --- a/pybuda/csrc/placer/post_epoch_passes.cpp +++ /dev/null @@ -1,18 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "placer/post_epoch_passes.hpp" - -namespace tt::placer -{ - -PlacerAttemptSummary run_post_epoch_passes( - PlacerSolution &, PlacerSolution &epoch_placer_solution, PlacerHistory &history) -{ - PlacerAttemptSummary sum = history.next_attempt(); - sum.fail = (epoch_placer_solution.num_epochs == 0); - - return sum; -} - -} // namespace tt::placer diff --git a/pybuda/csrc/placer/post_epoch_passes.hpp b/pybuda/csrc/placer/post_epoch_passes.hpp deleted file mode 100644 index b60faff98..000000000 --- 
a/pybuda/csrc/placer/post_epoch_passes.hpp +++ /dev/null @@ -1,18 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "placer/pre_epoch_passes.hpp" - -namespace tt -{ -namespace placer -{ - -PlacerAttemptSummary run_post_epoch_passes( - PlacerSolution &placer_solution, PlacerSolution &epoch_placer_solution, PlacerHistory &history); - -} -} // namespace tt - diff --git a/pybuda/csrc/placer/pre_epoch_passes.cpp b/pybuda/csrc/placer/pre_epoch_passes.cpp deleted file mode 100644 index ecb9388cc..000000000 --- a/pybuda/csrc/placer/pre_epoch_passes.cpp +++ /dev/null @@ -1,18 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "placer/pre_epoch_passes.hpp" - -#include "graph_lib/graph.hpp" - -namespace tt::placer -{ - -// Return modified graph if modifications are made -std::unique_ptr run_pre_epoch_passes(graphlib::Graph *, const balancer::BalancerConfig &, PlacerHistory &) -{ - // TODO - return nullptr; -} - -} // namespace tt::placer diff --git a/pybuda/csrc/placer/pre_epoch_passes.hpp b/pybuda/csrc/placer/pre_epoch_passes.hpp deleted file mode 100644 index b45c7f860..000000000 --- a/pybuda/csrc/placer/pre_epoch_passes.hpp +++ /dev/null @@ -1,87 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include - -#include "balancer/balancer.hpp" - -namespace tt -{ -namespace graphlib -{ -class Graph; -class Node; -} // namespace graphlib - -namespace placer -{ - -// Request for post-epoch pass to modify the graph of placement instructions -class PreEpochRequest -{ - public: - virtual void ModifyGraph(graphlib::Graph *graph) = 0; -}; - -class InsertNopRequest : public PreEpochRequest -{ - private: - graphlib::Node *src, *dest; // insert nop between these two - public: - InsertNopRequest(graphlib::Node *src, graphlib::Node *dest) : src(src), dest(dest) {} - virtual void ModifyGraph(graphlib::Graph *graph) override; -}; - -// Summary of configuration, and results of a placer attempt -struct PlacerAttemptSummary -{ - std::uint32_t epoch_index; - std::uint32_t attempt_index; - bool fail; - - // Requests made of pre-epoch passes - std::vector> pre_epoch_requests; -}; - -class PlacerHistory -{ - std::uint32_t current_epoch_index; - std::uint32_t next_attempt_index; - std::vector> attempts; - - PlacerAttemptSummary create_new_attempt_summary() - { - auto pas = PlacerAttemptSummary{current_epoch_index, next_attempt_index, false, {}}; - attempts[current_epoch_index].push_back(pas); - return pas; - } - - public: - PlacerHistory() : current_epoch_index(0), next_attempt_index(0), attempts(1) {} - - PlacerAttemptSummary next_attempt() - { - auto pas = create_new_attempt_summary(); - next_attempt_index++; - return pas; - } - void next_epoch() - { - current_epoch_index++; - next_attempt_index = 0; - attempts.push_back({}); - } - - std::uint32_t current_epoch() const { return current_epoch_index; } - std::uint32_t current_attempt() const { return next_attempt_index; } - void reset_attempts() { next_attempt_index = 0; } -}; - -std::unique_ptr run_pre_epoch_passes( - graphlib::Graph *graph, const balancer::BalancerConfig &config, PlacerHistory &history); - -} // namespace placer -} // namespace tt diff --git a/pybuda/csrc/placer/python_bindings.cpp b/pybuda/csrc/placer/python_bindings.cpp deleted file mode 100644 index 6db5b2eae..000000000 --- a/pybuda/csrc/placer/python_bindings.cpp +++ 
/dev/null @@ -1,170 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "placer/python_bindings.hpp" - -#include - -#include "graph_lib/graph.hpp" -#include "placer/placer.hpp" -#include "placer/dram_allocator.hpp" -#include "placer/lower_to_placer.hpp" - - -namespace tt { -inline std::optional> coord_as_array(std::optional const& p) -{ - if (not p) - return std::nullopt; - return std::array{p->row, p->col}; -} - -inline std::optional array_as_coord(std::optional> const& p) -{ - if (not p) - return std::nullopt; - return placer::Coord{.row = (*p)[0], .col = (*p)[1]}; -} - -void PlacerModule(py::module &m_placer) { - py::class_(m_placer, "Coord") - .def_readonly("row", &placer::Coord::row) - .def_readonly("col", &placer::Coord::col) - .def(py::pickle( - [](const placer::Coord& p) { // __getstate__ - return py::make_tuple( - p.row, - p.col - ); - }, - [](py::tuple t) { // __setstate__ - if (t.size() != 2) - { - throw std::runtime_error("placer::Coord: Invalid state!"); - } - - placer::Coord coord = { - .row = t[0].cast(), - .col = t[1].cast() - }; - - return coord; - })); - - - py::class_(m_placer, "CoordRange") - .def("size_r", &placer::CoordRange::size_r) - .def("size_c", &placer::CoordRange::size_c) - .def_readonly("start", &placer::CoordRange::start) - .def_readonly("end", &placer::CoordRange::end); - - py::class_(m_placer, "PlacerConfigUpdate") - .def_readonly("op_to_chip_id_assignment", &placer::PlacerConfigUpdate::op_to_chip_id_assignment) - .def_readonly("op_names_to_chip_break", &placer::PlacerConfigUpdate::op_names_to_chip_break) - .def_readonly("op_names_to_epoch_break", &placer::PlacerConfigUpdate::op_names_to_epoch_break); - - using OpOverrideTypes = std::variant, std::optional>>; - py::class_(m_placer, "OpOverride") - .def(py::init>, bool, std::optional, bool>()) - .def_readonly("grid_start", &placer::PlacerOpOverride::grid_start) - .def_readonly("transpose_op", &placer::PlacerOpOverride::transpose_op) - .def_readonly("temporal_epoch_break", &placer::PlacerOpOverride::temporal_epoch_break) - .def(py::pickle( - [](const placer::PlacerOpOverride&p) { // __getstate__ - return py::make_tuple( - p.grid_start, - p.transpose_op, - p.chip_id, - p.temporal_epoch_break - ); - }, - [](py::tuple t) { // __setstate__ - if (t.size() != 4) - { - throw std::runtime_error("placer::PlacerOpOverride: Invalid state!"); - } - - placer::PlacerOpOverride p = placer::PlacerOpOverride( - t[0].cast>(), - t[1].cast(), - t[2].cast>(), - t[3].cast() - ); - - return p; - })) - .def( - "to_json", - [](placer::PlacerOpOverride const& op_override) { - std::unordered_map d; - d["grid_start"] = coord_as_array(op_override.grid_start); - d["transpose_op"] = op_override.transpose_op; - d["chip_id"] = op_override.chip_id; - d["temporal_epoch_break"] = op_override.temporal_epoch_break; - return d; - }) - .def("from_json", [](std::unordered_map const& d) { - placer::PlacerOpOverride op_override; - if (auto match = d.find("grid_start"); - match != d.end() && std::holds_alternative>>(match->second)) - op_override.grid_start = - array_as_coord(std::get>>(match->second)); - if (auto match = d.find("transpose_op"); match != d.end()) - op_override.transpose_op = std::get(match->second); - if (auto match = d.find("chip_id"); - match != d.end() && std::holds_alternative>(match->second)) - op_override.chip_id = std::get>(match->second); - if (auto match = d.find("temporal_epoch_break"); - match != d.end() && std::holds_alternative(match->second)) - 
op_override.temporal_epoch_break = std::get(match->second); - return op_override; - }); - - py::class_(m_placer, "OpPlacement") - .def_readonly("grid_transpose", &placer::OpPlacement::grid_transpose) - .def_readonly("placed_cores", &placer::OpPlacement::placed_cores) - .def_readonly("chip_id", &placer::OpPlacement::chip_id) - .def_property_readonly("epoch_id", &placer::OpPlacement::epoch_id); - - py::class_(m_placer, "PlacerSolution") - .def("chip_id", &placer::PlacerSolution::chip_id) - .def("epoch_id", &placer::PlacerSolution::epoch_id) - .def("temporal_epoch", py::overload_cast(&placer::PlacerSolution::temporal_epoch_id, py::const_)) - .def("temporal_epoch", py::overload_cast(&placer::PlacerSolution::temporal_epoch_id, py::const_)) - .def_readonly("epoch_id_to_chip", &placer::PlacerSolution::epoch_id_to_chip) - .def_readonly("is_pipelined", &placer::PlacerSolution::is_pipelined) - .def_readonly("name_to_op_placement", &placer::PlacerSolution::name_to_op_placement); - - py::enum_(m_placer, "DRAMPlacementAlgorithm") - .value("ROUND_ROBIN", tt::placer::DRAMPlacementAlgorithm::ROUND_ROBIN) - .value("ROUND_ROBIN_FLIP_FLOP", tt::placer::DRAMPlacementAlgorithm::ROUND_ROBIN_FLIP_FLOP) - .value("GREATEST_CAPACITY", tt::placer::DRAMPlacementAlgorithm::GREATEST_CAPACITY) - .value("CLOSEST", tt::placer::DRAMPlacementAlgorithm::CLOSEST) - .export_values() - .def("to_json", [](const tt::placer::DRAMPlacementAlgorithm algorithm){ - switch (algorithm) - { - case tt::placer::DRAMPlacementAlgorithm::ROUND_ROBIN: return "ROUND_ROBIN"; - case tt::placer::DRAMPlacementAlgorithm::ROUND_ROBIN_FLIP_FLOP: return "ROUND_ROBIN_FLIP_FLOP"; - case tt::placer::DRAMPlacementAlgorithm::GREATEST_CAPACITY: return "GREATEST_CAPACITY"; - case tt::placer::DRAMPlacementAlgorithm::CLOSEST: return "CLOSEST"; - default: break; - } - throw std::runtime_error("DRAMPlacementAlgorithm::to_json with unrecognized case!"); - }) - .def("from_json", [](std::string const &encoded){ - static std::unordered_map decode = { - {"ROUND_ROBIN", tt::placer::DRAMPlacementAlgorithm::ROUND_ROBIN}, - {"ROUND_ROBIN_FLIP_FLOP", tt::placer::DRAMPlacementAlgorithm::ROUND_ROBIN_FLIP_FLOP}, - {"GREATEST_CAPACITY", tt::placer::DRAMPlacementAlgorithm::GREATEST_CAPACITY}, - {"CLOSEST", tt::placer::DRAMPlacementAlgorithm::CLOSEST}, - }; - return decode.at(encoded); - - }); - - m_placer.def("match_op_names_to_placer_overrides", &placer::match_op_names_to_placer_overrides); -} - -} // namespace tt - diff --git a/pybuda/csrc/placer/python_bindings.hpp b/pybuda/csrc/placer/python_bindings.hpp deleted file mode 100644 index f52a9315f..000000000 --- a/pybuda/csrc/placer/python_bindings.hpp +++ /dev/null @@ -1,14 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include -#include -#include -namespace py = pybind11; - -namespace tt { - -void PlacerModule(py::module &m_placer); - -} // namespace tt - diff --git a/pybuda/csrc/placer/tests/dram.cpp b/pybuda/csrc/placer/tests/dram.cpp deleted file mode 100644 index 0975610dd..000000000 --- a/pybuda/csrc/placer/tests/dram.cpp +++ /dev/null @@ -1,473 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "placer/dram.hpp" - -#include "balancer/types.hpp" -#include "graph_lib/defines.hpp" -#include "graph_lib/node_types.hpp" -#include "gtest/gtest.h" -#include "placer/best_fit_allocator.hpp" -#include "placer/chip_id_assignment.hpp" -#include "placer/dram_allocator.hpp" -#include 
"placer/lowering_utils.hpp" -#include "placer/placer.hpp" -#include "test/common.hpp" - -using namespace tt::placer; -using std::runtime_error; -using std::string; -using std::unordered_map; -using tt::graphlib::NodeEpochType; -using tt::test::Arch; - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wmissing-field-initializers" - -// -// Tests for DRAM allocators -// -namespace test -{ -extern unordered_map map_ops_to_forward_epoch(const vector &scheduled_ops); - -// Test parameters for each test -struct TestConfig -{ - DRAMPlacementAlgorithm algo = DRAMPlacementAlgorithm::ROUND_ROBIN; - bool input_queues_on_host = true; - bool output_queues_on_host = true; - tt::DramQueueMap manual_dram_queue_placemenet = {}; -}; - -class DRAMPlacerTest : public testing::TestWithParam -{ - // List of queues to be placed - std::vector> queue_placement_params; - - // DRAM allocator, DUT - std::unique_ptr allocator; - - public: - // Configs - tt::DeviceConfig device_config = tt::test::create_device_config(); - std::unique_ptr dram_config; - - // Main graph - std::unique_ptr graph; - - // Use user-friendly test parameter names - struct PrintToStringParamName - { - template - std::string operator()(const testing::TestParamInfo &info) const - { - auto arch = static_cast(info.param); - return arch2str(arch); - } - }; - - // Alias to make code more readable - Arch get_arch() { return GetParam(); } - - // Common overrides - void SetUp(DRAMPlacementAlgorithm algo) - { - TestConfig test_cfg; - test_cfg.algo = algo; - SetUp(test_cfg); - } - - void SetUp(TestConfig test_cfg) - { - device_config = tt::test::create_device_config(get_arch()); - std::vector scheduled_ops; - PlacerConfig placer_config = { - .chip_ids = std::vector{0}, - .device_config = device_config, - .device_grid = {(std::uint32_t)device_config.grid_size.r, (std::uint32_t)device_config.grid_size.c}, - .strategy = PlacementStrategy::LeftToRight, - .op_to_grid_shape = lowering::get_op_to_grid_shape(scheduled_ops), - .op_to_epoch_type = test::map_ops_to_forward_epoch(scheduled_ops), - .ops_tagged_for_chip_id_break = {}, - .ops_tagged_for_epoch_break = {}, - .fwd_to_bwd_nodes = {}, - .fwd_to_opt_nodes = {}, - }; - - std::vector allocated_blocks; - dram_config = std::make_unique( - device_config, - test_cfg.input_queues_on_host, - test_cfg.output_queues_on_host, - test_cfg.manual_dram_queue_placemenet); - allocator = - std::make_unique(*dram_config, "unit_test_graph", 0, allocated_blocks, test_cfg.algo); - - graph = std::make_unique(tt::graphlib::IRLevel::IR_BUDA); - } - - std::pair add_e2e_queue( - std::uint32_t grid_r, - std::uint32_t grid_c, - std::uint32_t producer_epoch = 0, - std::uint32_t last_consumer_epoch = 0, - QueueDRAMPlacementParameters::ConsumerMap consumer_loc = {}, - QueueDRAMPlacementParameters::ProducerMap producer_loc = {}) - { - std::uint32_t node_number = 0; - std::string node_name = "queue_" + std::to_string(node_number); - while (graph->has_node_with_name(node_name)) - { - node_number++; - node_name = "queue_" + std::to_string(node_number); - } - - auto *node = - graph->add_node(tt::graphlib::create_node(node_name, true, false), 0); - - CoordRange queue_coord_range = {0, 0, grid_r, grid_c}; - - bool is_input = false; - bool in_p2p_region_soft = false; - bool in_p2p_region_hard = false; - tt::balancer::BlockShape block_shape = {1, 1, 1, tt::balancer::UBlockShape{1, 1}}; - - std::string input_name = "foo"; - queue_placement_params.push_back( - {QueuePlacement{ - .name = node->name(), - .input_name = input_name, - 
.grid_shape = {queue_coord_range.size_r(), queue_coord_range.size_c()}, - .on_host = false, - .chip_id = 0, - .dram_buffers = {}, - .host_buffers = {}, - .epoch_allocate = -1, - .epoch_deallocate = -1}, - QueueDRAMPlacementParameters{ - .config = dram_config.get(), - .node = node, - .grid_shape = {queue_coord_range.size_r(), queue_coord_range.size_c()}, - .consumer_loc = consumer_loc, - .producer_loc = producer_loc, - .block_shape = block_shape, - .producer_epoch = producer_epoch, - .last_consumer_epoch = last_consumer_epoch, - .in_p2p_region_soft = in_p2p_region_soft, - .in_p2p_region_hard = in_p2p_region_hard, - .is_input = is_input, - }}); - - return {node, queue_placement_params.back().second}; - } - - std::unordered_map> run_allocator() - { - allocator->allocate_queues(queue_placement_params, dram_config->disable_dynamic_dram); - std::unordered_map> ret; - for (auto &[queue_placement, queue_dram_placement_params] : queue_placement_params) - { - ret[queue_dram_placement_params.node] = queue_placement.dram_buffers; - } - return ret; - } -}; // namespace test - -TEST_P(DRAMPlacerTest, RoundRobin) -{ - SetUp(DRAMPlacementAlgorithm::ROUND_ROBIN); - auto q1 = add_e2e_queue(5, 4); - auto results = run_allocator(); - - // Check results - std::uint32_t expected_channel = 0; - std::uint32_t expected_subchannel = 0; - for (auto b : results.at(q1.first)) - { - EXPECT_EQ(b.dram_channel, expected_channel); - if (get_arch() == Arch::Wormhole_b0) - { - // Each channel is "two in one" - if (expected_subchannel == 0) - { - EXPECT_LT(b.dram_address, dram_config->dram_config[0].channel_size / 2); - expected_subchannel = 1; - } - else - { - EXPECT_GE(b.dram_address, dram_config->dram_config[0].channel_size / 2); - expected_subchannel = 0; - expected_channel++; - } - } - else - { - expected_channel++; - } - if (expected_channel >= device_config.get_dram_num_channels()) - { - expected_channel = 0; - } - } -} - -TEST_P(DRAMPlacerTest, RoundRobinFlipFlop) -{ - SetUp(DRAMPlacementAlgorithm::ROUND_ROBIN_FLIP_FLOP); - auto q1 = add_e2e_queue(2, 4); // group 0 - auto q2 = add_e2e_queue(2, 4); // group 1 - q2.second.producer_epoch = 1; - auto q3 = add_e2e_queue(2, 4); // group 0 - auto results = run_allocator(); - - // Check results - std::vector expected_channel = {0, device_config.get_dram_num_channels() / 2}; - std::vector expected_subchannel = {0, 0}; - auto check_group = [&](int group, const std::vector &results) - { - for (auto b : results) - { - EXPECT_EQ(b.dram_channel, expected_channel[group]); - if (get_arch() == Arch::Wormhole_b0) - { - // Each channel is "two in one" - if (expected_subchannel[group] == 0) - { - EXPECT_LT(b.dram_address, dram_config->dram_config[0].channel_size / 2); - expected_subchannel[group] = 1; - } - else - { - EXPECT_GE(b.dram_address, dram_config->dram_config[0].channel_size / 2); - expected_subchannel[group] = 0; - expected_channel[group]++; - } - } - else - { - expected_channel[group]++; - } - if (group == 0) - { - if (expected_channel[group] >= device_config.get_dram_num_channels() / 2) - { - expected_channel[group] = 0; - } - } - else - { - if (expected_channel[group] >= device_config.get_dram_num_channels()) - { - expected_channel[group] = device_config.get_dram_num_channels() / 2; - } - } - } - }; - - check_group(0, results.at(q1.first)); - check_group(1, results.at(q2.first)); - check_group(0, results.at(q3.first)); -} - -/* - wormhole dram channels - 0 1 2 3 4 5 6 7 8 9 - +-+-+-+-+-+-+-+-+-+-+ - 0 |0| | | | |2| | | | | - +-+-+-+-+-+-+-+-+-+-+ - 1 |0| | | | |2| | | | | - 
+-+-+-+-+-+-+-+-+-+-+ - 2 | | | | | |3| | | | | - +-+-+-+-+-+-+-+-+-+-+ - 3 | | | | | |4| | | | | - +-+-+-+-+-+-+-+-+-+-+ - 4 | | | | | |4| | | | | - +-+-+-+-+-+-+-+-+-+-+ - 5 |1| | | | |5| | | | | - +-+-+-+-+-+-+-+-+-+-+ - 6 |1| | | | |5| | | | | - +-+-+-+-+-+-+-+-+-+-+ - 7 |1| | | | |5| | | | | - +-+-+-+-+-+-+-+-+-+-+ - 8 | | | | | |4| | | | | - +-+-+-+-+-+-+-+-+-+-+ - 9 | | | | | |3| | | | | - +-+-+-+-+-+-+-+-+-+-+ - 10 | | | | | |3| | | | | - +-+-+-+-+-+-+-+-+-+-+ - 11 |0| | | | |2| | | | | - +-+-+-+-+-+-+-+-+-+-+ -*/ - -TEST_P(DRAMPlacerTest, Closest) -{ - if (get_arch() != Arch::Wormhole_b0) - { - GTEST_SKIP(); // Focus on WH for now - } - SetUp(DRAMPlacementAlgorithm::CLOSEST); - QueueDRAMPlacementParameters::ConsumerMap consumer_loc = {}; - QueueDRAMPlacementParameters::ProducerMap producer_loc = {}; - - producer_loc[0][0] = {Coord{1, 1}, 0}; - consumer_loc[0][0].push_back({Coord{2, 2}, 1}); - auto q1 = add_e2e_queue(1, 1, 0, 1, consumer_loc, producer_loc); - auto q2 = add_e2e_queue( - 1, 1, 0, 1, consumer_loc, producer_loc); // same locations, same epoch, should pick the other subchannel - - producer_loc.clear(); - consumer_loc.clear(); - producer_loc[0][0] = {Coord{2, 9}, 0}; - producer_loc[0][1] = {Coord{8, 6}, 0}; - consumer_loc[0][0].push_back({Coord{8, 2}, 1}); - consumer_loc[0][0].push_back({Coord{8, 3}, 1}); - consumer_loc[0][1].push_back({Coord{8, 4}, 1}); - auto q3 = add_e2e_queue(1, 2, 0, 1, consumer_loc, producer_loc); - - auto results = run_allocator(); - - EXPECT_EQ(results.at(q1.first)[0].dram_channel, 0); - EXPECT_EQ(results.at(q2.first)[0].dram_channel, 0); - EXPECT_GE(results.at(q2.first)[0].dram_address, dram_config->dram_config[0].channel_size / 2); - EXPECT_EQ(results.at(q3.first)[0].dram_channel, 3); - EXPECT_EQ(results.at(q3.first)[1].dram_channel, 4); -} - -// Test without a producer core -TEST_P(DRAMPlacerTest, Closest_no_producer) -{ - if (get_arch() != Arch::Wormhole_b0) - { - GTEST_SKIP(); // Focus on WH for now - } - SetUp(DRAMPlacementAlgorithm::CLOSEST); - QueueDRAMPlacementParameters::ConsumerMap consumer_loc = {}; - QueueDRAMPlacementParameters::ProducerMap producer_loc = {}; - - consumer_loc[0][0].push_back({Coord{1, 1}, 0}); - consumer_loc[0][1].push_back({Coord{1, 2}, 0}); - consumer_loc[0][2].push_back({Coord{1, 3}, 0}); - auto q1 = add_e2e_queue(1, 3, 0, 0, consumer_loc, producer_loc); - - auto results = run_allocator(); - - EXPECT_EQ(results.at(q1.first)[0].dram_channel, 0); -} - -// -// Tests that check that reader core is calculated correctly for various ops and grids -// - -class ReaderCoreTest : public testing::TestWithParam -{ -public: - // Alias to make code more readable - bool grid_transpose() { return GetParam(); } - - Coord position; - GridShape op_dim, queue_dim; - std::unique_ptr test_op; - OpPlacement placement; - - void SetUp(const std::string &op_type, GridShape op_dim_ = {2, 3}, GridShape queue_dim_ = {2, 3}, Coord position_ = {1, 5}) - { - op_dim = op_dim_; - queue_dim = queue_dim_; - position = position_; - - test_op = tt::graphlib::create_node("test_op", op_type); - CoordRange placed_cores = {position, position + (grid_transpose() ? 
op_dim.transposed() : op_dim)}; - placement = {0, test_op->name(), 0, 0, grid_transpose(), placed_cores}; - } - - std::vector calculate_readers(const Coord &dram_core, std::uint32_t operand = 0) - { - return get_reader_cores(test_op.get(), placement, operand, dram_core, queue_dim); - } - -}; - -// -// Test get_reader_cores function for various ops and placements -// - -TEST_P(ReaderCoreTest, ReaderCores_EltwiseOnetoOne) -{ - // Eltwise, 2x3 reading 2x3 buffer - SetUp("add", {2, 3}, {2, 3}); - - for (std::uint32_t x = 0; x < queue_dim.columns; x++) - { - for (std::uint32_t y = 0; y < queue_dim.rows; y++) - { - Coord dram_core = {y, x}; - std::vector reader_cores = calculate_readers(dram_core); - Coord expected_offset = {grid_transpose() ? x : y, grid_transpose() ? y : x}; - EXPECT_EQ(reader_cores.size(), 1) << "Expect one reader core for each dram core"; - EXPECT_EQ(reader_cores.at(0), position + expected_offset) << "One to one mapping"; - } - } -} - -TEST_P(ReaderCoreTest, ReaderCores_EltwiseSingleBuffer) -{ - // Eltwise, 4x2 reading 1x1 buffer - SetUp("add", {4, 2}, {1, 1}); - - Coord dram_core = {0, 0}; - std::vector reader_cores = calculate_readers(dram_core); - EXPECT_EQ(reader_cores.size(), op_dim.volume()) << "1x1 buffer, expected all cores to be readers"; - std::uint32_t index = 0; - for (std::uint32_t y = 0; y < op_dim.rows; y++) - { - for (std::uint32_t x = 0; x < op_dim.columns; x++) - { - Coord expected_offset = {grid_transpose() ? x : y, grid_transpose() ? y : x}; - EXPECT_EQ(reader_cores.at(index), position + expected_offset) << "1x1 buffer, op x=" << x << ", y=" << y; - index++; - } - } - -} - -TEST_P(ReaderCoreTest, ReaderCores_MatmulOnetoOne) -{ - // Matmul, 2x3 reading 2x3 buffer - SetUp("matmul", {2, 3}, {2, 3}); - - for (std::uint32_t x = 0; x < queue_dim.columns; x++) - { - for (std::uint32_t y = 0; y < queue_dim.rows; y++) - { - Coord dram_core = {y, x}; - - // Activations - first column (x=0) reads only - { - std::vector reader_cores = calculate_readers(dram_core, 0); - Coord expected_offset = {grid_transpose() ? 0 : y, grid_transpose() ? y : 0}; - EXPECT_EQ(reader_cores.size(), 1) << "Expect one reader core for each dram core"; - EXPECT_EQ(reader_cores.at(0), position + expected_offset); - } - - // Weights - last row (y = op_dim.rows - 1) reads - { - std::vector reader_cores = calculate_readers(dram_core, 1); - Coord expected_offset = {grid_transpose() ? x : op_dim.rows - 1, grid_transpose() ? 
op_dim.rows - 1 : x}; - EXPECT_EQ(reader_cores.size(), 1) << "Expect one reader core for each dram core"; - EXPECT_EQ(reader_cores.at(0), position + expected_offset); - } - - } - } -} - -INSTANTIATE_TEST_SUITE_P( - DRAMPlacerTests, - DRAMPlacerTest, - ::testing::Values(Arch::Wormhole_b0, Arch::Grayskull), - DRAMPlacerTest::PrintToStringParamName()); - -INSTANTIATE_TEST_SUITE_P(DRAMPlacerTests, ReaderCoreTest, ::testing::Values(false, true)); - -} // namespace test diff --git a/pybuda/csrc/placer/tests/gtest_main.cpp b/pybuda/csrc/placer/tests/gtest_main.cpp deleted file mode 100644 index a4addef8a..000000000 --- a/pybuda/csrc/placer/tests/gtest_main.cpp +++ /dev/null @@ -1,11 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include -#include - -int main(int argc, char **argv) { - ::testing::InitGoogleTest(&argc, argv); - pybind11::scoped_interpreter guard{}; - return RUN_ALL_TESTS(); -} diff --git a/pybuda/csrc/placer/tests/module.mk b/pybuda/csrc/placer/tests/module.mk deleted file mode 100644 index f3d763da0..000000000 --- a/pybuda/csrc/placer/tests/module.mk +++ /dev/null @@ -1,23 +0,0 @@ -PYBUDA_CSRC_PLACER_TESTS = $(TESTDIR)/pybuda/csrc/placer/tests/placer_unit_tests -PYBUDA_CSRC_PLACER_TESTS_SRCS = \ - pybuda/csrc/placer/tests/unit_tests.cpp \ - pybuda/csrc/placer/tests/dram.cpp \ - pybuda/csrc/placer/tests/gtest_main.cpp - -PYBUDA_CSRC_PLACER_TESTS_INCLUDES = $(PYBUDA_CSRC_PLACER_INCLUDES) -PYBUDA_CSRC_PLACER_TESTS_LDFLAGS = -lstdc++fs -lgtest -lgtest_main -lpthread -l$(PYTHON_VERSION) -lm - -PYBUDA_CSRC_PLACER_TESTS_OBJS = $(addprefix $(OBJDIR)/, $(PYBUDA_CSRC_PLACER_TESTS_SRCS:.cpp=.o)) -PYBUDA_CSRC_PLACER_TESTS_DEPS = $(addprefix $(OBJDIR)/, $(PYBUDA_CSRC_PLACER_TESTS_SRCS:.cpp=.d)) - --include $(PYBUDA_CSRC_PLACER_TESTS_DEPS) - -pybuda/csrc/placer/tests: $(PYBUDA_CSRC_PLACER_TESTS) - -$(PYBUDA_CSRC_PLACER_TESTS): $(PYBUDA_CSRC_PLACER_TESTS_OBJS) $(PYBUDA_CSRC_LIB) - @mkdir -p $(@D) - $(CXX) $(PLACER_CSRC_CFLAGS) $(CXXFLAGS) -o $@ $^ $(LDFLAGS) $(PYBUDA_CSRC_PLACER_TESTS_LDFLAGS) - -$(OBJDIR)/pybuda/csrc/placer/tests/%.o: pybuda/csrc/placer/tests/%.cpp - @mkdir -p $(@D) - $(CXX) $(PLACER_CSRC_CFLAGS) $(CXXFLAGS) $(PYBUDA_CSRC_PLACER_TESTS_INCLUDES) -c -o $@ $< diff --git a/pybuda/csrc/placer/tests/unit_tests.cpp b/pybuda/csrc/placer/tests/unit_tests.cpp deleted file mode 100644 index 3b96ed98f..000000000 --- a/pybuda/csrc/placer/tests/unit_tests.cpp +++ /dev/null @@ -1,588 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "gtest/gtest.h" - -#include "graph_lib/defines.hpp" -#include "placer/placer.hpp" -#include "placer/lowering_utils.hpp" -#include "placer/best_fit_allocator.hpp" -#include "placer/chip_id_assignment.hpp" -#include "test/common.hpp" - -#include "third_party/json/json.hpp" -#include -#include -#include -#include - -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wmissing-field-initializers" - -using namespace tt::placer; -using std::unordered_map; -using std::string; -using std::runtime_error; -using tt::graphlib::NodeEpochType; - -namespace test -{ - -unordered_map map_ops_to_forward_epoch(const vector& scheduled_ops) -{ - unordered_map op_to_epoch_type; - for (const string& op : scheduled_ops) - { - op_to_epoch_type[op] = NodeEpochType::Forward; - } - return op_to_epoch_type; -} - -} // namespace test - -TEST(Placer, single_row) -{ - vector scheduled_ops = { - "matmul0", - "matmul1", - "matmul2", - }; - - tt::DeviceConfig 
device_config = tt::test::create_device_config(); - PlacerConfig placer_config = { - .chip_ids = std::vector{0}, - .device_config = device_config, - .device_grid = {10, 12}, - .strategy = PlacementStrategy::LeftToRight, - .op_to_grid_shape = lowering::get_op_to_grid_shape(scheduled_ops), - .op_to_epoch_type = test::map_ops_to_forward_epoch(scheduled_ops), - .ops_tagged_for_chip_id_break = {}, - .ops_tagged_for_epoch_break = {}, - .fwd_to_bwd_nodes = {}, - .fwd_to_opt_nodes = {}, - }; - vector placer_op_group_workload = lowering::generate_simple_placer_workload(placer_config, scheduled_ops); - PlacerSolution solution = place_onto_chip(placer_config, placer_op_group_workload); - EXPECT_TRUE(solution.num_epochs == 1); -} - -TEST(Placer, multiple_row) -{ - vector scheduled_ops = { - "matmul0", - "matmul1", - "matmul2", - "matmul3", - "matmul4", - "matmul5", - "matmul6", - "matmul7", - "matmul8", - "matmul9", - "matmul10", - "matmul11", - "matmul12", - }; - - tt::DeviceConfig device_config = tt::test::create_device_config(); - PlacerConfig placer_config = { - .chip_ids = std::vector{0}, - .device_config = device_config, - .device_grid = {10, 12}, - .op_to_grid_shape = lowering::get_op_to_grid_shape(scheduled_ops), - .op_to_epoch_type = test::map_ops_to_forward_epoch(scheduled_ops), - .ops_tagged_for_chip_id_break = {}, - .ops_tagged_for_epoch_break = {}, - .fwd_to_bwd_nodes = {}, - .fwd_to_opt_nodes = {}, - }; - - vector placer_op_group_workload = lowering::generate_simple_placer_workload(placer_config, scheduled_ops); - PlacerSolution solution = place_onto_chip(placer_config, placer_op_group_workload); - EXPECT_TRUE(solution.num_epochs == 1); -} - -TEST(Placer, multiple_epochs) -{ - vector scheduled_ops = { - "matmul0", - "matmul1", - "matmul2", - "matmul3", - }; - - // Each epoch should hold two ops - unordered_map op_to_grid_shape = { - {"matmul0", {.rows = 10, .columns=6}}, - {"matmul1", {.rows = 10, .columns=6}}, - {"matmul2", {.rows = 10, .columns=6}}, - {"matmul3", {.rows = 10, .columns=6}}, - }; - - tt::DeviceConfig device_config = tt::test::create_device_config(); - PlacerConfig placer_config = { - .chip_ids = std::vector{0}, - .device_config = device_config, - .device_grid = {10, 12}, - .op_to_grid_shape = op_to_grid_shape, - .op_to_epoch_type = test::map_ops_to_forward_epoch(scheduled_ops), - .ops_tagged_for_chip_id_break = {}, - .ops_tagged_for_epoch_break = {}, - .fwd_to_bwd_nodes = {}, - .fwd_to_opt_nodes = {}, - }; - vector placer_op_group_workload = lowering::generate_simple_placer_workload(placer_config, scheduled_ops); - PlacerSolution solution = place_onto_chip(placer_config, placer_op_group_workload); - - EXPECT_TRUE(solution.num_epochs == 2); -} - -TEST(Placer, test_fwd_bwd_epoch_splitting) -{ - vector scheduled_ops = { - "matmul0", - "matmul0_bwd", - }; - - // Each epoch should hold two ops - unordered_map op_to_grid_shape = { - {"matmul0", {.rows = 10, .columns=6}}, - {"matmul0_bwd", {.rows = 10, .columns=6}}, - }; - unordered_map op_to_epoch_type = { - {"matmul0", NodeEpochType::Forward}, - {"matmul0_bwd", NodeEpochType::Backward}, - }; - - tt::DeviceConfig device_config = tt::test::create_device_config(); - PlacerConfig placer_config = { - .chip_ids = std::vector{0}, - .device_config = device_config, - .device_grid = {10, 12}, - .op_to_grid_shape = op_to_grid_shape, - .op_to_epoch_type = op_to_epoch_type, - .ops_tagged_for_chip_id_break = {}, - .ops_tagged_for_epoch_break = {}, - .fwd_to_bwd_nodes = {}, - .fwd_to_opt_nodes = {}, - }; - vector placer_op_group_workload = 
lowering::generate_simple_placer_workload(placer_config, scheduled_ops); - PlacerSolution solution = place_onto_chip(placer_config, placer_op_group_workload); - - EXPECT_TRUE(solution.num_epochs == 2); -} - -TEST(Placer, test_multichip_fwd) -{ - vector scheduled_ops = { - "matmul0", - "matmul1", - }; - - // Each epoch should hold two ops - unordered_map op_to_grid_shape = { - {"matmul0", {.rows = 10, .columns=6}}, - {"matmul1", {.rows = 10, .columns=6}}, - }; - unordered_map op_to_epoch_type = { - {"matmul0", NodeEpochType::Forward}, - {"matmul1", NodeEpochType::Forward}, - }; - - ChipPlacerConfig chip_placer_config = { - .chip_ids = std::vector{0, 1}, - .arch_name = "grayskull", - .op_to_epoch_type = test::map_ops_to_forward_epoch(scheduled_ops), - .ops_tagged_for_chip_id_break = {"matmul1"}, - .ops_tagged_for_epoch_break = {}, - .fwd_to_bwd_nodes = {}, - .fwd_to_opt_nodes = {}, - }; - OpToChipIdAssignment op_to_chip_id_assignment = get_op_to_chip_id_assignment(chip_placer_config, scheduled_ops); - - tt::DeviceConfig device_config = tt::test::create_device_config(); - PlacerConfig placer_config = { - .chip_ids = std::vector{0, 1}, - .device_config = device_config, - .device_grid = {10, 12}, - .op_to_grid_shape = op_to_grid_shape, - .op_to_epoch_type = op_to_epoch_type, - .ops_tagged_for_chip_id_break = {"matmul1"}, - .ops_tagged_for_epoch_break = {}, - .fwd_to_bwd_nodes = {}, - .fwd_to_opt_nodes = {}, - .op_to_chip_id_assignment = op_to_chip_id_assignment, - }; - - - PlacerSolution solution = placer(placer_config, scheduled_ops); - - EXPECT_EQ(solution.num_epochs, 2); -} - - -TEST(Placer, test_multichip_fwd_and_bwd) -{ - vector scheduled_ops = { - "matmul0", - "matmul1", - "matmul0_bwd", - "matmul1_bwd", - }; - - // Each epoch should hold two ops - unordered_map op_to_grid_shape = { - {"matmul0", {.rows = 10, .columns = 6}}, - {"matmul0_bwd", {.rows = 10, .columns = 6}}, - {"matmul1", {.rows = 10, .columns = 6}}, - {"matmul1_bwd", {.rows = 10, .columns = 6}}, - }; - unordered_map op_to_epoch_type = { - {"matmul0", NodeEpochType::Forward}, - {"matmul0_bwd", NodeEpochType::Backward}, - {"matmul1", NodeEpochType::Forward}, - {"matmul1_bwd", NodeEpochType::Backward}, - }; - ChipPlacerConfig chip_placer_config = { - .chip_ids = std::vector{0, 1}, - .arch_name = "grayskull", - .op_to_epoch_type = test::map_ops_to_forward_epoch(scheduled_ops), - .ops_tagged_for_chip_id_break = {"matmul1"}, - .ops_tagged_for_epoch_break = {}, - .fwd_to_bwd_nodes = {{"matmul0", {"matmul0_bwd"}}, {"matmul1", {"matmul1_bwd"}}}, - .fwd_to_opt_nodes = {}, - }; - OpToChipIdAssignment op_to_chip_id_assignment = get_op_to_chip_id_assignment(chip_placer_config, scheduled_ops); - - tt::DeviceConfig device_config = tt::test::create_device_config(); - PlacerConfig placer_config = { - .chip_ids = std::vector{0, 1}, - .device_config = device_config, - .device_grid = {10, 12}, - .op_to_grid_shape = op_to_grid_shape, - .op_to_epoch_type = op_to_epoch_type, - .ops_tagged_for_chip_id_break = {"matmul1"}, - .ops_tagged_for_epoch_break = {}, - .fwd_to_bwd_nodes = {{"matmul0", {"matmul0_bwd"}}, {"matmul1", {"matmul1_bwd"}}}, - .fwd_to_opt_nodes = {}, - .op_to_chip_id_assignment = op_to_chip_id_assignment, - }; - PlacerSolution solution = placer(placer_config, scheduled_ops); - - EXPECT_EQ(solution.num_epochs, 4); -} - - -TEST(Placer, triplet_placement) -{ - setenv("PYBUDA_TRIPLET_PLACEMENT", "1", 0); - GridShape matmul0_grid = {.rows = 10, .columns = 6}; - GridShape matmul0_bwd0_grid = {.rows = 2, .columns = 2}; - GridShape 
matmul0_bwd1_grid = {.rows = 2, .columns = 2}; - GridShape matmul0_bwd2_grid = {.rows = 2, .columns = 2}; - - unordered_map op_to_grid_shape = { - {"matmul0", matmul0_grid}, - {"matmul0_bwd0", matmul0_bwd0_grid}, - {"matmul0_bwd1", matmul0_bwd1_grid}, - {"matmul0_bwd2", matmul0_bwd2_grid}, - }; - - unordered_map op_to_epoch_type ={ - {"matmul0", NodeEpochType::Forward}, - {"matmul0_bwd0", NodeEpochType::Backward}, - {"matmul0_bwd1", NodeEpochType::Backward}, - {"matmul0_bwd2", NodeEpochType::Backward}, - }; - - // We can also annotate properties on op-groupings like: - // force-epoch-break, force-chip-break, force-new-row - // and future support for partial placements - OpGroupToPlace op_group0 = { - .op_names = {"matmul0"}, - .op_name_to_relative_offset_from_first_op = {}, - }; - - OpGroupToPlace op_group1 = { - .op_names = {"matmul0_bwd0", "matmul0_bwd1", "matmul0_bwd2"}, - .op_name_to_relative_offset_from_first_op = { - {"matmul0_bwd1", {.row_offset = 2, .column_offset = 0}}, - {"matmul0_bwd2", {.row_offset = 4, .column_offset = 0}}, - } - }; - - vector placer_op_group_workload = { - op_group0, - op_group1, - }; - - tt::DeviceConfig device_config = tt::test::create_device_config(); - PlacerConfig placer_config = { - .chip_ids = std::vector{0}, - .device_config = device_config, - .device_grid = {10, 12}, - .op_to_grid_shape = op_to_grid_shape, - .op_to_epoch_type = op_to_epoch_type, - .ops_tagged_for_chip_id_break = {}, - .ops_tagged_for_epoch_break = {}, - .fwd_to_bwd_nodes = {}, - .fwd_to_opt_nodes = {}, - }; - - PlacerSolution solution = place_onto_chip(placer_config, placer_op_group_workload); - - // Final Checks - const OpPlacement& matmul_bwd1_placement = solution.name_to_op_placement.at("matmul0_bwd1"); - const CoordRange& matmul_bwd1_coords = matmul_bwd1_placement.placed_cores; - - EXPECT_EQ(solution.num_epochs, 2); - EXPECT_EQ(matmul_bwd1_placement.epoch_id(), 1); - EXPECT_EQ(matmul_bwd1_coords.start.row, 2); // expect bwd1 to be right below bwd0 - EXPECT_EQ(matmul_bwd1_coords.start.col, 0); -} - - -TEST(Placer, test_epoch_breaks) -{ - vector scheduled_ops = { - "matmul0", - "matmul1", - "matmul2", - "matmul3", - }; - - // Each epoch should hold two ops - unordered_map op_to_grid_shape = { - {"matmul0", {.rows = 2, .columns = 2}}, - {"matmul1", {.rows = 2, .columns = 2}}, - {"matmul2", {.rows = 2, .columns = 2}}, - {"matmul3", {.rows = 2, .columns = 2}}, - }; - unordered_map op_to_epoch_type = { - {"matmul0", NodeEpochType::Forward}, - {"matmul1", NodeEpochType::Forward}, - {"matmul2", NodeEpochType::Forward}, - {"matmul3", NodeEpochType::Forward}, - }; - ChipPlacerConfig chip_placer_config = { - .chip_ids = std::vector{0}, - .arch_name = "grayskull", - .op_to_epoch_type = test::map_ops_to_forward_epoch(scheduled_ops), - .ops_tagged_for_chip_id_break = {}, - .ops_tagged_for_epoch_break = { - "matmul1", - "matmul2" - }, - .fwd_to_bwd_nodes = {}, - .fwd_to_opt_nodes = {}, - }; - OpToChipIdAssignment op_to_chip_id_assignment = get_op_to_chip_id_assignment(chip_placer_config, scheduled_ops); - - tt::DeviceConfig device_config = tt::test::create_device_config(); - PlacerConfig placer_config = { - .chip_ids = std::vector{0, 1}, - .device_config = device_config, - .device_grid = {10, 12}, - .op_to_grid_shape = op_to_grid_shape, - .op_to_epoch_type = op_to_epoch_type, - .ops_tagged_for_chip_id_break = {}, - .ops_tagged_for_epoch_break = { - "matmul1", - "matmul2" - }, - .fwd_to_bwd_nodes = {}, - .fwd_to_opt_nodes = {}, - .op_to_chip_id_assignment = op_to_chip_id_assignment, - }; - - 
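// Note: with "matmul1" and "matmul2" tagged for epoch breaks, the schedule
// {matmul0, matmul1, matmul2, matmul3} is expected to split into
// {matmul0} | {matmul1} | {matmul2, matmul3} -- each tagged op opens a new epoch --
// which is why three epochs are asserted below. A minimal sketch of that counting
// rule, assuming only the break tags matter (count_break_epochs is a hypothetical
// helper for illustration; the real placer also spills to new epochs on grid capacity):
//
//   #include <cstddef>
//   #include <set>
//   #include <string>
//   #include <vector>
//
//   int count_break_epochs(const std::vector<std::string>& ops, const std::set<std::string>& breaks)
//   {
//       int epochs = ops.empty() ? 0 : 1;
//       for (std::size_t i = 1; i < ops.size(); ++i)
//           if (breaks.count(ops[i]) > 0)
//               ++epochs;  // a tagged op starts a fresh epoch
//       return epochs;
//   }
//   // count_break_epochs({"matmul0","matmul1","matmul2","matmul3"}, {"matmul1","matmul2"}) == 3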
PlacerSolution solution = placer(placer_config, scheduled_ops); - //dump_placer_solution_json_to_file(solution); - - EXPECT_EQ(solution.num_epochs, 3); -} - - -TEST(Placer, test_row_harvesting) -{ - vector scheduled_ops = { - "matmul0", - "matmul1", - "matmul2", - }; - - uint32_t default_row_cores = 5; - uint32_t default_column_cores = 6; - - tt::DeviceConfig device_config = tt::test::create_device_config(); - PlacerConfig placer_config = { - .chip_ids = std::vector{0}, - .device_config = device_config, - .device_grid = {10, 12}, - .harvested_rows = {5, 6, 7, 8, 9}, - .strategy = PlacementStrategy::LeftToRight, - .op_to_grid_shape = lowering::get_op_to_grid_shape(scheduled_ops, default_row_cores, default_column_cores), - .op_to_epoch_type = test::map_ops_to_forward_epoch(scheduled_ops), - .ops_tagged_for_chip_id_break = {}, - .ops_tagged_for_epoch_break = {}, - .fwd_to_bwd_nodes = {}, - .fwd_to_opt_nodes = {}, - }; - vector placer_op_group_workload = lowering::generate_simple_placer_workload(placer_config, scheduled_ops); - PlacerSolution solution = place_onto_chip(placer_config, placer_op_group_workload); - - //std::cout << std::setw(4) << solution.to_json() << std::endl; - // technically all three ops can be placed on the same epoch, but because the bottom half of the chip is harvested, - // this needs to spill into two epochs - EXPECT_TRUE(solution.num_epochs == 2); -} - -TEST(Placer, test_manual_transpose_ops) -{ - GridShape matmul0_grid = {.rows = 10, .columns = 6}; - - unordered_map op_to_grid_shape = { - {"matmul0", matmul0_grid}, - }; - - unordered_map op_to_epoch_type ={ - {"matmul0", NodeEpochType::Forward}, - }; - - OpGroupToPlace op_group0 = { - .op_names = {"matmul0"}, - .op_name_to_relative_offset_from_first_op = {}, - }; - - vector placer_op_group_workload = { - op_group0, - }; - std::optional start = std::nullopt; - tt::DeviceConfig device_config = tt::test::create_device_config(); - PlacerConfig placer_config = { - .chip_ids = std::vector{0}, - .device_config = device_config, - .device_grid = {10, 12}, - .op_to_grid_shape = op_to_grid_shape, - .op_to_epoch_type = op_to_epoch_type, - .ops_tagged_for_chip_id_break = {}, - .ops_tagged_for_epoch_break = {}, - .fwd_to_bwd_nodes = {}, - .fwd_to_opt_nodes = {}, - .op_to_overrides = { - { - "matmul0", PlacerOpOverride(start, true /* transpose_op */, std::nullopt /* chip_id */) - } - }, - }; - - PlacerSolution solution = place_onto_chip(placer_config, placer_op_group_workload); - - // Final Checks - const OpPlacement& matmul0_placement = solution.name_to_op_placement.at("matmul0"); - const CoordRange& matmul0_coords = matmul0_placement.placed_cores; - const uint32_t matmul0_coords_r = matmul0_coords.size_r(); - const uint32_t matmul0_coords_c = matmul0_coords.size_c(); - const bool matmul0_grid_transpose = matmul0_placement.grid_transpose; - - EXPECT_EQ(matmul0_coords_r, 6); // expect matmul0 to be transposed to (6x10) - EXPECT_EQ(matmul0_coords_c, 10); - EXPECT_EQ(matmul0_grid_transpose, true); -} - -TEST(Placer, test_auto_transpose_ops) -{ - GridShape matmul0_grid = {.rows = 2, .columns = 6}; - GridShape matmul1_grid = {.rows = 3, .columns = 2}; - - unordered_map op_to_grid_shape = { - {"matmul0", matmul0_grid}, - {"matmul1", matmul1_grid}, - }; - - unordered_map op_to_epoch_type ={ - {"matmul0", NodeEpochType::Forward}, - {"matmul1", NodeEpochType::Forward}, - }; - - OpGroupToPlace op_group0 = { - .op_names = {"matmul0", "matmul1"}, - .op_name_to_relative_offset_from_first_op = {}, - }; - - vector placer_op_group_workload = { - 
op_group0, - }; - - tt::DeviceConfig device_config = tt::test::create_device_config(); - PlacerConfig placer_config = { - .chip_ids = std::vector{0}, - .device_config = device_config, - .device_grid = {10, 12}, - .op_to_grid_shape = op_to_grid_shape, - .op_to_epoch_type = op_to_epoch_type, - .ops_tagged_for_chip_id_break = {}, - .ops_tagged_for_epoch_break = {}, - .fwd_to_bwd_nodes = {}, - .fwd_to_opt_nodes = {}, - .enable_auto_transposing_placement = true, - }; - - PlacerSolution solution = place_onto_chip(placer_config, placer_op_group_workload); - - // Final Checks - const OpPlacement& matmul1_placement = solution.name_to_op_placement.at("matmul1"); - const CoordRange& matmul1_coords = matmul1_placement.placed_cores; - const uint32_t matmul1_coords_r = matmul1_coords.size_r(); - const uint32_t matmul1_coords_c = matmul1_coords.size_c(); - const bool matmul1_grid_transpose = matmul1_placement.grid_transpose; - - EXPECT_EQ(matmul1_coords_r, 2); // expect matmul1 to be transposed to (2x3) based on the row size - EXPECT_EQ(matmul1_coords_c, 3); - EXPECT_EQ(matmul1_grid_transpose, true); -} - - - -/* Turn off until deallocate is back on -TEST(Placer, best_fit_allocator) -{ - std::uint32_t start_addr = 0x100; - std::uint32_t end_addr = 0x8100; - std::uint32_t size = end_addr - start_addr; - auto bfa = BestFitAllocator(start_addr, end_addr); - - // Allocate everything, and deallocate - std::uint32_t addr; - EXPECT_TRUE(bfa.allocate(size, addr)); - EXPECT_EQ(addr, start_addr); - bfa.deallocate(addr); - - // Allocate two half-pieces - std::uint32_t addr1, addr2; - EXPECT_TRUE(bfa.allocate(size/2, addr1)); - EXPECT_TRUE(bfa.allocate(size/2, addr2)); - EXPECT_EQ(addr1, start_addr); - EXPECT_EQ(addr2, start_addr + size/2); - EXPECT_FALSE(bfa.allocate(0x10, addr)); - - // Deallocate out of order - bfa.deallocate(addr2); - bfa.deallocate(addr1); - - // Allocate three pieces of 0x100, check that they are merged back with the whole area - std::uint32_t addr3; - EXPECT_TRUE(bfa.allocate(0x100, addr1)); - EXPECT_TRUE(bfa.allocate(0x100, addr2)); - EXPECT_TRUE(bfa.allocate(0x100, addr3)); - EXPECT_FALSE(bfa.allocate(size, addr)); // no room - - // Deallocate out of order - bfa.deallocate(addr3); - bfa.deallocate(addr1); - bfa.deallocate(addr2); - EXPECT_TRUE(bfa.allocate(size, addr)); // should have room now - bfa.deallocate(addr); -} -*/ - -#pragma GCC diagnostic pop diff --git a/pybuda/csrc/placer/utils.cpp b/pybuda/csrc/placer/utils.cpp deleted file mode 100644 index b18c85715..000000000 --- a/pybuda/csrc/placer/utils.cpp +++ /dev/null @@ -1,89 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "placer/placer.hpp" - -#include - -#include "graph_lib/defines.hpp" -#include "utils/logger.hpp" -#include "utils/assert.hpp" - -using NodeEpochType = tt::graphlib::NodeEpochType; -using std::ostream; -using std::to_string; - -namespace tt { -namespace placer { - -bool is_forward_to_backward_epoch_transition(NodeEpochType prev_epoch_type, NodeEpochType current_epoch_type) -{ - return (prev_epoch_type == NodeEpochType::Forward) - and (current_epoch_type == NodeEpochType::Backward); -} - -bool is_backward_to_optimizer_epoch_transition(NodeEpochType prev_epoch_type, NodeEpochType current_epoch_type) -{ - return prev_epoch_type == NodeEpochType::Backward and current_epoch_type == NodeEpochType::Optimizer; -} - - -void validate_placer_inputs(const PlacerConfig& config, vector& placer_op_group_workload) -{ - std::unordered_set visited_ops; - - for (const 
OpGroupToPlace& op_group : placer_op_group_workload) - { - for (std::size_t current_op_index = 0; current_op_index < op_group.op_names.size(); ++current_op_index) - { - const string& current_op_name = op_group.op_names.at(current_op_index); - - if (visited_ops.find(current_op_name) != visited_ops.end()) { - log_fatal("{} belongs to more than one op_group_workload", current_op_name); - } - - // verify all outputs are on MMIO capable devices for wormhole - if (config.device_config.arch_name.find("wormhole") != std::string::npos and config.output_ops.find(current_op_name) != config.output_ops.end()) { - // TODO(jchu): update this assert with MMIO chip ids - TT_ASSERT(std::find(config.device_config.chips_with_mmio.begin(), config.device_config.chips_with_mmio.end(), op_group.chip_id) != config.device_config.chips_with_mmio.end(), - "Placer: For wormhole multichip, we expect all output ops to be placed on MMIO devices."); - } - - // Validate that the op_grid sizes are able to fit within the device grid_shape - try { - const GridShape& op_grid_shape = config.op_to_grid_shape.at(current_op_name); - TT_ASSERT(op_grid_shape.rows <= config.get_available_rows_on_device()); - if(op_grid_shape.columns > config.device_grid.columns) { - throw std::runtime_error("Error: op:" + current_op_name + " grid_shape.columns: " + to_string(op_grid_shape.columns) + - " but the device grid_shape.columns is: " + to_string(config.device_grid.columns)); - } - - if (current_op_index > 0) - { - // Validate that all ops belonging to an op-group belong to the same epochType - const string& prev_op_name = op_group.op_names.at(current_op_index-1); - TT_ASSERT(config.op_to_epoch_type.at(prev_op_name) == config.op_to_epoch_type.at(current_op_name)); - } - visited_ops.insert(current_op_name); - - } catch (std::out_of_range &e) { - log_fatal("op_to_grid_shape missing for {}", current_op_name); - } - } - } -} - -void validate_chip_mapping(const PlacerConfig& config, vector& placer_workload) -{ - for (const OpGroupToPlace& op_group : placer_workload) - { - TT_ASSERT(std::find(config.chip_ids.begin(), config.chip_ids.end(), op_group.chip_id) != config.chip_ids.end(), - "Placing an op group on chip that's not in the list of available devices: " + std::to_string(op_group.chip_id)); - } -} - - - -} // namespace placer -} // namespace tt - diff --git a/pybuda/csrc/placer/utils.hpp b/pybuda/csrc/placer/utils.hpp deleted file mode 100644 index 7b9476496..000000000 --- a/pybuda/csrc/placer/utils.hpp +++ /dev/null @@ -1,31 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "placer/placer.hpp" - -#include -#include -#include -#include - -// Aliases -using NodeEpochType = tt::graphlib::NodeEpochType; - -using std::uint32_t; -using std::string; -using std::vector; - -namespace tt { -namespace placer { - -bool is_backward_to_optimizer_epoch_transition(NodeEpochType prev_epoch_type, NodeEpochType current_epoch_type); -bool is_forward_to_backward_epoch_transition(NodeEpochType prev_epoch_type, NodeEpochType current_epoch_type); - -void validate_placer_inputs(const PlacerConfig& config, vector& placer_op_group_workload); -void validate_chip_mapping(const PlacerConfig& config, vector& placer_workload); - - -} // end namespace placer -} // end namespace tt diff --git a/pybuda/csrc/pybuda_bindings.cpp b/pybuda/csrc/pybuda_bindings.cpp index e719e9389..1e5718e59 100644 --- a/pybuda/csrc/pybuda_bindings.cpp +++ b/pybuda/csrc/pybuda_bindings.cpp @@ -2,7 +2,11 @@ // // 
SPDX-License-Identifier: Apache-2.0 +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments" #include +#pragma clang diagnostic pop + #include #include "third_party/json/pybind11_json.hpp" @@ -10,35 +14,34 @@ namespace py = pybind11; #include "autograd/python_bindings.hpp" -#include "backend_api/backend_api.hpp" -#include "balancer/python_bindings.hpp" +#include "backend_api/device_config.hpp" #include "buda_passes.hpp" #include "graph_lib/graph.hpp" #include "graph_lib/python_bindings.hpp" #include "lower_to_buda/common.hpp" -#include "lower_to_buda/netlist.hpp" #include "passes/amp.hpp" #include "passes/consteval.hpp" -#include "passes/fork_join.hpp" #include "passes/fracture.hpp" -#include "passes/passes_utils.hpp" -#include "passes/placer_buda_passes.hpp" -#include "passes/python_bindings.hpp" #include "passes/link_past_cache_ios.hpp" #include "passes/move_index_to_mm_weights.hpp" -#include "pattern_matcher/python_bindings.hpp" -#include "placer/python_bindings.hpp" +#include "passes/passes_utils.hpp" +#include "passes/python_bindings.hpp" +#include "passes/mlir_compiler.hpp" #include "python_bindings_common.hpp" #include "reportify/reportify.hpp" -#include "scheduler/python_bindings.hpp" +#include "runtime/python_bindings.hpp" #include "shared_utils/sparse_matmul_utils.hpp" -#include "utils/ordered_associative_containers/ordered_map.hpp" #include "tt_torch_device/python_bindings.hpp" +#include "utils/ordered_associative_containers/ordered_map.hpp" +#include "utils/signal_handlers.hpp" namespace tt { PYBIND11_MODULE(_C, m) { + // Register signal handlers when loading pybuda module. + static SignalHandlers signal_handlers; + m.attr("__name__") = "pybuda._C"; m.doc() = "python bindings to pybuda framwork"; @@ -46,6 +49,15 @@ PYBIND11_MODULE(_C, m) { m.attr("k_dim") = py::int_(passes::k_dim); + py::enum_(m, "Arch") + .value("JAWBRIDGE", tt::ARCH::JAWBRIDGE) + .value("GRAYSKULL", tt::ARCH::GRAYSKULL) + .value("WORMHOLE", tt::ARCH::WORMHOLE) + .value("WORMHOLE_B0", tt::ARCH::WORMHOLE_B0) + .value("BLACKHOLE", tt::ARCH::BLACKHOLE) + .value("Invalid", tt::ARCH::Invalid) + .export_values(); + py::enum_(m, "DataFormat") .value("Float32", tt::DataFormat::Float32) .value("Float16", tt::DataFormat::Float16) @@ -101,262 +113,14 @@ PYBIND11_MODULE(_C, m) { py::module_ m_autograd = m.def_submodule("autograd", "Submodule defining autograd_engine."); AutogradModule(m_autograd); - py::module_ m_scheduler = m.def_submodule("scheduler", "Submodule defining scheduling of ops on device."); - SchedulerModule(m_scheduler); - - py::module_ m_placer = m.def_submodule("placer", "Submodule defining placer functions for placing ops onto epoch/chips"); - PlacerModule(m_placer); - - py::module_ m_balancer = m.def_submodule("balancer", "Submodule balancing ops onto device"); - BalancerModule(m_balancer); - - py::module_ m_pattern_matcher = m.def_submodule("pattern_matcher", "Submodule for discovering repeated subgraph structures"); - PatternMatcherModule(m_pattern_matcher); - - py::module_ m_backend = m.def_submodule("backend_api", "API to Buda Backend"); - tt::backend_api::BackendModule(m_backend); - py::module_ m_passes = m.def_submodule("passes", "API to Buda Passes"); PassesModule(m_passes); py::module_ m_torch_device = m.def_submodule("torch_device", "TT Torch Device"); TorchDeviceModule(m_torch_device); - py::class_(m, "BudaNetlistConfig") - .def(py::init<>()); - - py::class_(m, "BudaNetlist") - .def(py::init<>()) - .def("dump_to_yaml", &BudaNetlist::dump_to_yaml) - 
.def("append_comment", &BudaNetlist::append_comment); - - py::class_(m, "DramQueueConfigOverride") - .def(py::init, std::optional>()) - .def(py::pickle( - [](const DramQueueConfigOverride &p) { // __getstate__ - return py::make_tuple(p.chip_id, p.channel); - }, - [](py::tuple t) { // __setstate__ - if (t.size() != 2) - throw std::runtime_error("DramQueueConfigOverride: Invalid state!"); - - DramQueueConfigOverride p( - t[0].cast>(), t[1].cast>()); - return p; - })) - .def( - "to_json", - [](const DramQueueConfigOverride &p) - { - std::unordered_map> d; - d["chip_id"] = p.chip_id; - d["channel"] = p.channel; - return d; - }) - .def( - "from_json", - [](std::unordered_map> const &d) - { - DramQueueConfigOverride queue_override; - if (auto match = d.find("chip_id"); match != d.end()) - queue_override.chip_id = match->second; - if (auto match = d.find("channel"); match != d.end()) - queue_override.channel = match->second; - return queue_override; - }), - py::arg("chip_id"), py::arg("channel"); - - py::class_(m, "PostPlacerConfig") - .def( - py::init< - DeviceConfig const &, - std::uint32_t, - std::uint32_t, - bool, - bool, - bool, - DramQueueMap, - std::uint32_t, - std::uint32_t, - std::uint32_t, - bool, - placer::DRAMPlacementAlgorithm>(), - py::arg("device_config"), - py::arg("microbatch_size"), - py::arg("microbatch_count"), - py::arg("enable_t_streaming"), - py::arg("input_queues_on_host"), - py::arg("output_queues_on_host"), - py::arg("manual_dram_queue_placement"), - py::arg("fork_join_tiles_treshold"), - py::arg("output_queue_multiplier"), - py::arg("input_queue_multiplier"), - py::arg("enable_cross_chip_buffering"), - py::arg("placement_algorithm")); - - py::class_>(m, "InsertionInstruction") - .def( - py::init< - std::string, - std::string, - bool, - std::optional, - std::optional, - bool>(), - py::arg("src"), - py::arg("dest"), - py::arg("hoist_tms"), - py::arg("input_id") = std::nullopt, - py::arg("fork_id") = std::nullopt, - py::arg("user_defined") = false) - .def("unique_id", &InsertionInstruction::unique_id) - .def("insert", &InsertionInstruction::insert); - - using NopInsertionFields = std::variant>; - py::class_>( - m, "NopInsertionInstruction") - .def( - py::init< - std::string, - std::string, - bool, - std::uint32_t, - std::optional, - std::optional, - bool, - bool, - bool, - bool>(), - py::arg("src"), - py::arg("dest"), - py::arg("hoist_tms"), - py::arg("nop_count") = 1, - py::arg("input_id") = std::nullopt, - py::arg("fork_id") = std::nullopt, - py::arg("user_defined") = false, - py::arg("mergeable") = false, - py::arg("daisy_chain") = false, - py::arg("request_merge") = false) - .def(py::pickle( - [](const NopInsertionInstruction &p) { // __getstate__ - return py::make_tuple( - p.src, p.dest, p.hoist_tms, p.nop_count, p.input_id, p.fork_id, p.user_defined, p.mergeable, p.daisy_chain, p.request_merge); - }, - [](py::tuple t) { // __setstate__ - if (t.size() != 10) - throw std::runtime_error("Invalid state!"); - - NopInsertionInstruction p( - t[0].cast(), - t[1].cast(), - t[2].cast(), - t[3].cast(), - t[4].cast>(), - t[5].cast>(), - t[6].cast(), - t[7].cast(), - t[8].cast(), - t[9].cast()); - return p; - })) - .def( - "to_json", - [](const NopInsertionInstruction &p) - { - std::unordered_map d; - d["src"] = p.src; - d["dest"] = p.dest; - d["hoist_tms"] = p.hoist_tms; - d["nop_count"] = p.nop_count; - d["input_id"] = p.input_id; - d["fork_id"] = p.fork_id; - d["user_defined"] = p.user_defined; - d["mergeable"] = p.mergeable; - d["daisy_chain"] = p.daisy_chain; - 
d["request_merge"] = p.request_merge; - return d; - }) - .def( - "from_json", - [](std::unordered_map const &d) - { - NopInsertionInstruction nii; - if (auto match = d.find("src"); match != d.end()) - nii.src = std::get(match->second); - if (auto match = d.find("dest"); match != d.end()) - nii.dest = std::get(match->second); - if (auto match = d.find("hoist_tms"); match != d.end()) - nii.hoist_tms = std::get(match->second); - if (auto match = d.find("nop_count"); match != d.end()) - nii.nop_count = std::get(match->second); - if (auto match = d.find("input_id"); - match != d.end() && std::holds_alternative>(match->second)) - nii.input_id = std::get>(match->second); - if (auto match = d.find("fork_id"); - match != d.end() && std::holds_alternative>(match->second)) - nii.fork_id = std::get>(match->second); - if (auto match = d.find("user_defined"); match != d.end()) - nii.user_defined = std::get(match->second); - if (auto match = d.find("mergeable"); match != d.end()) - nii.mergeable = std::get(match->second); - if (auto match = d.find("daisy_chain"); match != d.end()) - nii.daisy_chain = std::get(match->second); - if (auto match = d.find("request_merge"); match != d.end()) - nii.request_merge = std::get(match->second); - return nii; - }) - .def("unique_id", &NopInsertionInstruction::unique_id); - - py::class_>( - m, "QueueInsertionInstruction") - .def( - py::init< - std::string, - std::string, - bool, - int, - std::uint32_t, - std::optional, - std::optional, - bool>(), - py::arg("src"), - py::arg("dest"), - py::arg("hoist_tms"), - py::arg("num_entries"), - py::arg("queue_size"), - py::arg("input_id") = std::nullopt, - py::arg("fork_id") = std::nullopt, - py::arg("user_defined") = false) - .def(py::pickle( - [](const QueueInsertionInstruction &p) { // __getstate__ - return py::make_tuple( - p.src, p.dest, p.hoist_tms, p.num_entries, p.queue_size, p.input_id, p.fork_id, p.user_defined); - }, - [](py::tuple t) { // __setstate__ - if (t.size() != 8) - throw std::runtime_error("Invalid state!"); - - QueueInsertionInstruction p( - t[0].cast(), - t[1].cast(), - t[2].cast(), - t[3].cast(), - t[4].cast(), - t[5].cast>(), - t[6].cast>(), - t[7].cast()); - return p; - })) - .def("unique_id", &QueueInsertionInstruction::unique_id); - - py::class_(m, "Blocks").def(py::init<>()); - py::class_(m, "Block").def(py::init<>()); - - py::class_(m, "PostPlacerResults") - .def_readonly("perf_model_results", &PostPlacerResults::perf_model_results) - .def_readonly("ins_instructions", &PostPlacerResults::ins_instructions) - .def_readonly("allocated_blocks", &PostPlacerResults::allocated_blocks) - .def_readonly("current_host_address", &PostPlacerResults::current_host_address); + py::module m_runtime = m.def_submodule("runtime", "Submodule defining runtime functions"); + RuntimeModule(m_runtime); py::enum_(m, "MathFidelity") .value("LoFi", tt::MathFidelity::LoFi) @@ -400,11 +164,8 @@ PYBIND11_MODULE(_C, m) { "run_pre_placer_buda_passes", &run_pre_placer_buda_passes, py::arg("graph"), - py::arg("scheduler_config"), py::arg("device_config"), py::arg("chip_ids") = std::vector{0}, - py::arg("op_names_to_chip_break") = placer::PredicatesToBreaks(), - py::arg("op_names_to_epoch_break") = placer::PredicatesToBreaks(), py::arg("op_names_dont_fuse") = std::vector{}, py::arg("op_names_manual_fuse") = std::vector{}, py::arg("fracture_chip_id_assignments") = passes::FractureChipIdAssignments{}, @@ -417,79 +178,36 @@ PYBIND11_MODULE(_C, m) { py::arg("amp_level") = 0, py::arg("enable_recompute") = 0, py::arg("output_queues_on_host") 
= true, - py::arg("ins_instructions") = tt:: - ordered_map, InsInstructionUniqueIdHash>{}, + py::arg("input_queues_on_host") = true, py::arg("insert_queues") = std::vector>{}, py::arg("amp_properties") = std::vector{}, py::arg("op_intermediates_to_save") = std::vector{}, py::arg("use_interactive_placer") = true, py::arg("enable_device_tilize") = false); - m.def( - "is_subset_of_instructions", - &is_subset_of_instructions, - py::arg("ins_instructions") = tt:: - ordered_map, InsInstructionUniqueIdHash>{}, - py::arg("previous_instructions") = tt:: - ordered_map, InsInstructionUniqueIdHash>{}); - m.def("run_post_placer_buda_passes", &run_post_placer_buda_passes); - m.def("run_pre_netlist_generation_buda_passes", &run_pre_netlist_generation_buda_passes); - m.def("run_placer_buda_passes", &passes::run_placer_buda_passes); m.def("run_pre_lowering_passes", &run_pre_lowering_passes); - m.def("lower_to_buda_netlist", &lower_to_buda_netlist, - py::arg("graph"), - py::arg("graph_name"), - py::arg("placer_solution"), - py::arg("balancer_solution"), - py::arg("chip_ids"), - py::arg("device_config"), - py::arg("enable_forked_dram_inputs")=false); - m.def("merge_netlists", &merge_netlists); + m.def("run_mlir_compiler", &passes::run_mlir_compiler); - m.def("dump_graph", []( - const tt::graphlib::Graph *graph, - std::string test_name, - std::string graph_name, - const tt::placer::PlacerSolution *placer_solution, - std::shared_ptr balancer_solution) - { - tt::reportify::dump_graph(test_name, graph_name, graph, placer_solution, balancer_solution); - }, + m.def( + "dump_graph", + [](const tt::graphlib::Graph *graph, std::string test_name, std::string graph_name) + { tt::reportify::dump_graph(test_name, graph_name, graph); }, py::arg("graph"), py::arg("test_name"), - py::arg("graph_name"), - py::arg("placer_solution") = nullptr, - py::arg("balancer_solution") = nullptr - ); - m.def("dump_epoch_type_graphs", []( - const tt::graphlib::Graph *graph, - std::string test_name, - std::string graph_name, - const tt::placer::PlacerSolution *placer_solution, - std::shared_ptr balancer_solution) - { - tt::reportify::dump_epoch_type_graphs(test_name, graph_name, graph, placer_solution, balancer_solution); - }, + py::arg("graph_name")); + m.def( + "dump_epoch_type_graphs", + [](const tt::graphlib::Graph *graph, std::string test_name, std::string graph_name) + { tt::reportify::dump_epoch_type_graphs(test_name, graph_name, graph); }, py::arg("graph"), py::arg("test_name"), - py::arg("graph_name"), - py::arg("placer_solution") = nullptr, - py::arg("balancer_solution") = nullptr - ); - m.def("dump_epoch_id_graphs", []( - const tt::graphlib::Graph *graph, - std::string test_name, - std::string graph_name, - const tt::placer::PlacerSolution *placer_solution, - std::shared_ptr balancer_solution) - { - tt::reportify::dump_epoch_id_graphs(test_name, graph_name, graph, placer_solution, balancer_solution); - }, + py::arg("graph_name")); + m.def( + "dump_epoch_id_graphs", + [](const tt::graphlib::Graph *graph, std::string test_name, std::string graph_name) + { tt::reportify::dump_epoch_id_graphs(test_name, graph_name, graph); }, py::arg("graph"), py::arg("test_name"), - py::arg("graph_name"), - py::arg("placer_solution"), - py::arg("balancer_solution") = nullptr - ); + py::arg("graph_name")); py::enum_(m, "NodeEpochType") .value("Forward", tt::graphlib::NodeEpochType::Forward) @@ -519,6 +237,7 @@ PYBIND11_MODULE(_C, m) { .def_readonly("zdim", &sparse::SparseBUDA::zdim) .def_readonly("bcast_factor", &sparse::SparseBUDA::bcast_factor) 
.def("get_sparse_tile_ptr_bits", &sparse::SparseBUDA::get_sparse_tile_ptr_bits) + .def("get_sparse_ublock_idx_bits", &sparse::SparseBUDA::get_sparse_ublock_idx_bits) .def("get_sparse_tiles_and_encodings", [](tt::sparse::SparseBUDA &self, int grid_r) { return self.get_sparse_tiles_and_encodings(grid_r); }); @@ -595,7 +314,7 @@ PYBIND11_MODULE(_C, m) { t[6].cast>(), t[7].cast>(), t[8].cast>(), - t[9].cast>>>()); + t[9].cast>>>()); return p; })); } diff --git a/pybuda/csrc/reportify/CMakeLists.txt b/pybuda/csrc/reportify/CMakeLists.txt new file mode 100644 index 000000000..759d6a76a --- /dev/null +++ b/pybuda/csrc/reportify/CMakeLists.txt @@ -0,0 +1,8 @@ +add_library(reportify + STATIC + reportify.cpp + paths.cpp + to_json.cpp) + +target_compile_options(reportify PRIVATE ${STATIC_LIB_FLAGS} ${PYBUDA_CSRC_CFLAGS}) + diff --git a/pybuda/csrc/reportify/module.mk b/pybuda/csrc/reportify/module.mk index 562eb1696..d481f9488 100644 --- a/pybuda/csrc/reportify/module.mk +++ b/pybuda/csrc/reportify/module.mk @@ -13,7 +13,7 @@ PYBUDA_CSRC_REPORTIFY_INCLUDES = $(PYBUDA_CSRC_INCLUDES) pybuda/csrc/reportify: $(PYBUDA_CSRC_REPORTIFY) -$(PYBUDA_CSRC_REPORTIFY): $(PYBUDA_CSRC_REPORTIFY_OBJS) $(PYBUDA_CSRC_GRAPH_LIB) +$(PYBUDA_CSRC_REPORTIFY): $(PYBUDA_CSRC_REPORTIFY_OBJS) @mkdir -p $(LIBDIR) ar rcs $@ $^ diff --git a/pybuda/csrc/reportify/paths.cpp b/pybuda/csrc/reportify/paths.cpp index 88ae944d9..1c492809f 100644 --- a/pybuda/csrc/reportify/paths.cpp +++ b/pybuda/csrc/reportify/paths.cpp @@ -3,7 +3,7 @@ // SPDX-License-Identifier: Apache-2.0 #include "reportify/paths.hpp" -#include +#include #include #include @@ -100,7 +100,7 @@ bool initalize_reportify_directory(const std::string& reportify_dir, std::string dir = reportify_dir + "/" + get_variant(test_name); std::string summary_filename = dir + "/" + "summary.yaml"; - std::experimental::filesystem::create_directories(dir); + std::filesystem::create_directories(dir); std::ofstream ofs(summary_filename); diff --git a/pybuda/csrc/reportify/reportify.cpp b/pybuda/csrc/reportify/reportify.cpp index 6fa88ca66..54ba92618 100644 --- a/pybuda/csrc/reportify/reportify.cpp +++ b/pybuda/csrc/reportify/reportify.cpp @@ -3,20 +3,17 @@ // SPDX-License-Identifier: Apache-2.0 #include "reportify/reportify.hpp" -#include +#include #include #include #include #include #include -#include "balancer/balancer.hpp" #include "graph_lib/graph.hpp" #include "graph_lib/node.hpp" #include "graph_lib/node_types.hpp" #include "json.hpp" -#include "passes/fuse_ops.hpp" -#include "placer/placer.hpp" #include "reportify/to_json.hpp" #include "utils/logger.hpp" @@ -63,11 +60,7 @@ std::vector tt_nodes_to_name_strings(const std::vector balancer_solution) +json node_to_json(const graphlib::Node* node, const graphlib::Graph* graph) { json ret_json; ret_json["pybuda"] = 1; // marker to reportify to use new colouring scheme @@ -106,7 +99,8 @@ json node_to_json( incoming_edge.edge_type != graphlib::EdgeType::kDataLoopback and incoming_edge.edge_type != graphlib::EdgeType::kControlLoop and incoming_edge.edge_type != graphlib::EdgeType::kControl and - incoming_edge.edge_type != graphlib::EdgeType::kPartialDataCopy) + incoming_edge.edge_type != graphlib::EdgeType::kPartialDataCopy and + incoming_edge.edge_type != graphlib::EdgeType::kSubgraphLink) { continue; // don't display others for now } @@ -136,40 +130,6 @@ json node_to_json( ret_json["cache"]["shape"] = node->shape().as_vector(); ret_json["epoch"] = 0; - if (placer_solution != nullptr) - { - try - { - if (node->node_type() == 
graphlib::NodeType::kBudaOp) - { - placer::OpPlacement placement = placer_solution->name_to_op_placement.at(node->name()); - ret_json["grid_start"] = {placement.placed_cores.start.row, placement.placed_cores.start.col}; - ret_json["grid_end"] = {placement.placed_cores.end.row, placement.placed_cores.end.col}; - ret_json["epoch"] = placer_solution->temporal_epoch_id(node->name()); - ret_json["chip_id"] = placer_solution->chip_id(node->name()); - } - else if (node->node_type() == graphlib::NodeType::kInput) - { - ret_json["epoch"] = placer_solution->temporal_epoch_id(graph->data_users(node)[0]->name()); - ret_json["chip_id"] = placer_solution->chip_id(graph->data_users(node)[0]->name()); - } - else if (node->node_type() == graphlib::NodeType::kOutput) - { - ret_json["epoch"] = placer_solution->temporal_epoch_id(graph->data_operands(node)[0]->name()); - ret_json["chip_id"] = placer_solution->chip_id(graph->data_operands(node)[0]->name()); - } - } - catch (std::out_of_range& e) - { - log_warning(tt::LogReportify, "Node {} has no placement, skipping.", node->name()); - } - } - - if (balancer_solution and balancer_solution->op_models.find(node->name()) != balancer_solution->op_models.end()) - { - balancer::OpModel const& op_model = balancer_solution->op_models.at(node->name()); - ret_json["op_model"] = op_model; - } ret_json["epoch_type"] = graphlib::node_epoch_type_to_string(node->get_epoch_type()); ret_json["output_nodes"] = output_nodes; @@ -257,28 +217,6 @@ json node_to_json( ss << opnode->math_fidelity(); ret_json["fidelity"] = ss.str(); } - - if (opnode->is_fused_op()) - { - std::vector> schedules; - - auto fused_op = opnode->get_fused_op(); - for (const auto& schedule : fused_op->get_schedules()) - { - std::vector sch; - for (const auto& op : schedule.ops) - { - auto sh = op.op_shape.outputs.at(0); - std::string shape = std::to_string(sh.w) + "," + std::to_string(sh.z) + "," + - std::to_string(sh.rt) + "," + std::to_string(sh.ct); - sch.push_back( - op.name + ": " + op.op_type.op + " (" + shape + "), out: " + std::to_string(op.output_buffer)); - } - schedules.push_back(sch); - } - - ret_json["schedules"] = schedules; - } } else if (node->node_type() == graphlib::NodeType::kBudaNaryTM) { @@ -295,15 +233,6 @@ json node_to_json( ret_json["is_cross_epoch_type"] = node->as()->is_epoch_to_epoch() and node->as()->is_cross_epoch_type(); ret_json["memory_access"] = node->as()->memory_access_type_string(); - - if (balancer_solution) - { - auto operands = graph->data_operands(node); - TT_ASSERT(operands.size() == 1); - TT_ASSERT(operands[0]->node_type() == graphlib::NodeType::kBudaOp); - balancer::OpModel const& op_model = balancer_solution->op_models.at(operands[0]->name()); - ret_json["op_model"] = {{"t_stream_factor", op_model.t_stream_factor}}; - } } std::stringstream ss; ss << node->output_df(); @@ -346,8 +275,6 @@ void write_json_to_file(const std::string& path, json json_file, int width = 4) JsonNamePairs create_jsons_for_graph( const std::string& graph_prefix, const graphlib::Graph* graph, - const placer::PlacerSolution* placer_solution, - std::shared_ptr balancer_solution, std::function node_filter = [](graphlib::Node*) { return true; }); void dump_graph( @@ -355,14 +282,12 @@ void dump_graph( const std::string& test_name, const std::string& graph_prefix, const graphlib::Graph* graph, - const placer::PlacerSolution* placer_solution, - std::shared_ptr balancer_solution, const std::string& report_path) { if (env_as("PYBUDA_DISABLE_REPORTIFY_DUMP")) return; - JsonNamePairs json_pairs = 
create_jsons_for_graph(graph_prefix, graph, placer_solution, balancer_solution); + JsonNamePairs json_pairs = create_jsons_for_graph(graph_prefix, graph); initalize_reportify_directory(path, test_name); @@ -371,7 +296,7 @@ void dump_graph( log_debug(tt::LogReportify, "Writing graph to {}", subgraph_path); - std::experimental::filesystem::create_directories(subgraph_path); + std::filesystem::create_directories(subgraph_path); json root_json = json_pairs.back().first; std::string root_json_name = json_pairs.back().second; @@ -382,22 +307,20 @@ void dump_graph( void dump_consteval_graph(const std::string& test_name, const std::string& graph_prefix, const graphlib::Graph* graph) { - return dump_graph(test_name, canonical_dirname(graph_prefix), graph, nullptr, nullptr, "/buda_reports/Consteval/"); + return dump_graph(test_name, canonical_dirname(graph_prefix), graph, "/buda_reports/Consteval/"); } void dump_epoch_type_graphs( const std::string& test_name, const std::string& graph_prefix, const graphlib::Graph* graph, - const placer::PlacerSolution* placer_solution, - std::shared_ptr balancer_solution, const std::string& directory_path) { if (env_as("PYBUDA_DISABLE_REPORTIFY_DUMP")) return; - - std::function epoch_type_filter = - [](graphlib::Node* node, NodeEpochType epoch_type, const graphlib::Graph* graph) + + std::function epoch_type_filter = + [](graphlib::Node* node, graphlib::NodeEpochType epoch_type, const graphlib::Graph* graph) { if (node->node_type() == graphlib::NodeType::kInput or node->node_type() == graphlib::NodeType::kQueue) { @@ -432,12 +355,13 @@ void dump_epoch_type_graphs( log_debug(tt::LogReportify, "Writing graph to {}", subgraph_path); - std::experimental::filesystem::create_directories(subgraph_path); + std::filesystem::create_directories(subgraph_path); - for (NodeEpochType epoch_type : {NodeEpochType::Forward, NodeEpochType::Backward, NodeEpochType::Optimizer}) + for (graphlib::NodeEpochType epoch_type : + {graphlib::NodeEpochType::Forward, graphlib::NodeEpochType::Backward, graphlib::NodeEpochType::Optimizer}) { - if ((epoch_type == NodeEpochType::Backward and not graph->contains_bwd_nodes()) or - (epoch_type == NodeEpochType::Optimizer and not graph->contains_opt_nodes())) + if ((epoch_type == graphlib::NodeEpochType::Backward and not graph->contains_bwd_nodes()) or + (epoch_type == graphlib::NodeEpochType::Optimizer and not graph->contains_opt_nodes())) { continue; } @@ -446,8 +370,6 @@ void dump_epoch_type_graphs( JsonNamePairs new_json_pairs = create_jsons_for_graph( graph_prefix + graphlib::node_epoch_type_to_string(epoch_type), graph, - placer_solution, - balancer_solution, node_epoch_type_filter); for (const auto& [json, json_name] : new_json_pairs) @@ -462,91 +384,13 @@ void dump_epoch_id_graphs( const std::string& test_name, const std::string& graph_prefix, const graphlib::Graph* graph, - const placer::PlacerSolution* placer_solution, - std::shared_ptr balancer_solution, const std::string& directory_path) { - if (env_as("PYBUDA_DISABLE_REPORTIFY_DUMP")) - return; - - if (placer_solution == nullptr) - { - log_warning( - tt::LogReportify, "dump_epoch_id_graphs(..) 
invoked without placer_solution argument, no dumps written"); - return; - } - - std::function - epoch_id_filter = [](graphlib::Node* node, - uint32_t epoch_id, - const graphlib::Graph* graph, - const placer::PlacerSolution* placer_solution) - { - if (node->node_type() == graphlib::NodeType::kBudaOp) - { - return placer_solution->temporal_epoch_id(node->name()) == epoch_id; - } - - for (graphlib::Node* user : graph->data_users(node)) - { - if (placer_solution->name_to_op_placement.find(user->name()) != placer_solution->name_to_op_placement.end()) - { - if (placer_solution->temporal_epoch_id(user->name()) == epoch_id) - { - return true; - } - } - } - for (graphlib::Node* operand : graph->data_operands(node)) - { - if (placer_solution->name_to_op_placement.find(operand->name()) != - placer_solution->name_to_op_placement.end()) - { - if (placer_solution->temporal_epoch_id(operand->name()) == epoch_id) - { - return true; - } - } - } - return false; - }; - - initalize_reportify_directory(directory_path, test_name); - - std::string report_path = get_epoch_id_report_relative_directory(); - std::string sage_report_path = build_report_path(directory_path, test_name, report_path); - std::string subgraph_path = sage_report_path + graph_prefix + "_graphs/"; - - log_debug(tt::LogReportify, "Writing graph to {}", subgraph_path); - - std::experimental::filesystem::create_directories(subgraph_path); - - for (uint32_t epoch_id = 0; epoch_id < placer_solution->num_epochs; ++epoch_id) - { - auto node_epoch_id_filter = std::bind(epoch_id_filter, std::placeholders::_1, epoch_id, graph, placer_solution); - JsonNamePairs new_json_pairs = create_jsons_for_graph( - graph_prefix + "_epoch_id_" + std::to_string(epoch_id), - graph, - placer_solution, - balancer_solution, - node_epoch_id_filter); - - for (const auto& [json, json_name] : new_json_pairs) - { - std::string root_json_path = sage_report_path + graph_prefix + json_name; - write_json_to_file(root_json_path, json); - } - } + return; } json create_json_for_graph( const graphlib::Graph* graph, - const placer::PlacerSolution* placer_solution, - std::shared_ptr balancer_solution, std::function node_filter) { json this_json; @@ -555,7 +399,7 @@ json create_json_for_graph( { if (node_filter(node)) { - this_json["nodes"][node->name()] = node_to_json(node, graph, placer_solution, balancer_solution); + this_json["nodes"][node->name()] = node_to_json(node, graph); this_json["graph"] = std::unordered_map(); this_json["topological_sorted_nodes"].push_back(node->name()); } @@ -566,13 +410,11 @@ json create_json_for_graph( JsonNamePairs create_jsons_for_graph( const std::string& graph_prefix, const graphlib::Graph* graph, - const placer::PlacerSolution* placer_solution, - std::shared_ptr balancer_solution, std::function node_filter) { JsonNamePairs this_json_name_pairs; - json this_json = create_json_for_graph(graph, placer_solution, balancer_solution, node_filter); + json this_json = create_json_for_graph(graph, node_filter); std::string this_name = graph_prefix + ".buda"; JsonNamePair this_json_name_pair = std::make_pair(this_json, this_name); this_json_name_pairs.push_back(this_json_name_pair); @@ -584,42 +426,10 @@ void dump_graph( const std::string& test_name, const std::string& graph_prefix, const graphlib::Graph* graph, - const placer::PlacerSolution* placer_solution, - std::shared_ptr balancer_solution, const std::string& report_path) { std::string default_dir = get_default_reportify_path(""); - dump_graph(default_dir, test_name, graph_prefix, graph, 
placer_solution, balancer_solution, report_path); -} - -void dump_constraints( - const std::string& test_name, const balancer::legalizer::GraphSolver* graph_solver, const std::string& report_path) -{ - if (env_as("PYBUDA_DISABLE_REPORTIFY_DUMP")) - return; - - if (env_as("PYBUDA_COLLECT_CONSTRAINT_INFO")) - { - std::string default_dir = get_default_reportify_path(""); - - std::string constraints_report_path = build_report_path(default_dir, test_name, report_path); - log_debug(tt::LogReportify, "Writing graph to {}", constraints_report_path); - initalize_reportify_directory(default_dir, test_name); - std::experimental::filesystem::create_directories(constraints_report_path); - - json constraints_json = graph_solver->get_constraint_info(); - std::string json_path = constraints_report_path + "constraints.json"; - write_json_to_file(json_path, constraints_json, 0); - - int page_idx = 0; - for (auto const& page : graph_solver->get_constraint_info().pages) - { - std::string json_path = constraints_report_path + "constraints.page_" + std::to_string(page_idx) + ".json"; - write_json_to_file(json_path, page, 0); - ++page_idx; - } - } + dump_graph(default_dir, test_name, graph_prefix, graph, report_path); } - } // namespace reportify } // namespace tt diff --git a/pybuda/csrc/reportify/reportify.hpp b/pybuda/csrc/reportify/reportify.hpp index e3e14877d..e7cbcacf9 100644 --- a/pybuda/csrc/reportify/reportify.hpp +++ b/pybuda/csrc/reportify/reportify.hpp @@ -15,21 +15,8 @@ namespace graphlib { class Node; } -namespace placer { -struct PlacerSolution; -} - -namespace balancer { -struct BalancerSolution; -struct OpModel; -namespace legalizer +namespace reportify { -class GraphSolver; -} -void to_json(nlohmann::json& j, OpModel const& op_model); -} - -namespace reportify { using json = nlohmann::json; void dump_graph( @@ -37,23 +24,17 @@ void dump_graph( const std::string& test_name, const std::string& graph_prefix, const graphlib::Graph* graph, - const placer::PlacerSolution* placer_solution = nullptr, - std::shared_ptr balancer_solution = nullptr, const std::string& report_path = get_pass_reports_relative_directory()); // Default path json create_json_for_graph( const graphlib::Graph *graph, - const placer::PlacerSolution *placer_solution = nullptr, - std::shared_ptr balancer_solution = nullptr, std::function node_filter = [](graphlib::Node*) { return true; }); void dump_graph( const std::string& test_name, const std::string& graph_prefix, const graphlib::Graph* graph, - const placer::PlacerSolution* placer_solution = nullptr, - std::shared_ptr balancer_solution = nullptr, const std::string& report_path = get_pass_reports_relative_directory()); void dump_consteval_graph(const std::string& test_name, const std::string& graph_prefix, const graphlib::Graph* graph); @@ -62,23 +43,13 @@ void dump_epoch_type_graphs( const std::string& test_name, const std::string& graph_prefix, const graphlib::Graph *graph, - const placer::PlacerSolution *placer_solution, - std::shared_ptr balancer_solution = nullptr, const std::string& directory_path = get_default_reportify_path("")); void dump_epoch_id_graphs( const std::string& test_name, const std::string& graph_prefix, const graphlib::Graph *graph, - const placer::PlacerSolution *placer_solution, - std::shared_ptr balancer_solution = nullptr, const std::string& directory_path = get_default_reportify_path("")); - -void dump_constraints( - const std::string& test_name, - const balancer::legalizer::GraphSolver* graph_solver, - const std::string& report_path = 
get_constraint_reports_relative_directory()); - } // namespace reportify } // tt diff --git a/pybuda/csrc/reportify/to_json.cpp b/pybuda/csrc/reportify/to_json.cpp index 2e443f252..810966d7d 100644 --- a/pybuda/csrc/reportify/to_json.cpp +++ b/pybuda/csrc/reportify/to_json.cpp @@ -34,89 +34,4 @@ void to_json(json& j, EdgeAttributes const& attrs) j["tms"] = attrs.get_tms(); } } // namespace graphlib - -namespace balancer -{ -std::string to_string(TStreamDir const& dir) -{ - switch (dir.v) - { - case TStreamDir::R: return "R"; - case TStreamDir::C: return "C"; - case TStreamDir::RZ: return "RZ"; - case TStreamDir::CZ: return "CZ"; - default: return "Unknown"; - } -} - -void to_json(json& j, TStreamFactor const& tsr) -{ - j["dir"] = tsr.none() ? "None" : to_string(tsr.dir); - j["factor"] = {tsr.r, tsr.c}; -} - -void to_json(json& j, BlockShape const& block_shape) -{ - j["t"] = block_shape.t; - j["tblock_m"] = block_shape.tblock_m; - j["tblock_n"] = block_shape.tblock_n; - j["mblock_m"] = block_shape.mblock_m; - j["mblock_n"] = block_shape.mblock_n; - j["ublock_rt"] = block_shape.ublock.rt; - j["ublock_ct"] = block_shape.ublock.ct; -} - -void to_json(json& j, BufferModel const& buffer_model) -{ - j["block_shape"] = buffer_model.block_shape; - j["buffer_factor"] = buffer_model.buffer_factor; - j["l1_size_tiles"] = buffer_model.l1_size_tiles; - std::stringstream ss; - ss << buffer_model.data_format; - j["data_format"] = ss.str(); - j["kernel_broadcast_tiles"] = buffer_model.kernel_broadcast_tiles; -} - -void to_json(json& j, TensorShape const& shape) { j = {shape.w, shape.z, shape.rt, shape.ct}; } - -void to_json(json& j, OpModel const& op_model) -{ - j["op_model_id"] = op_model.id.id; - if (op_model.buda_op_node) - j["op_type"] = op_model.op_type(); - j["grid_shape"] = op_model.grid_shape; - j["t_stream_factor"] = op_model.t_stream_factor; - j["fracture_factor"] = op_model.fracture_factor; - j["inputs"] = op_model.input_buffers; - j["outputs"] = op_model.output_buffers; - j["input_shapes"] = op_model.op_shape.inputs; - j["execution_cycles"] = op_model.cached_execution_cycles; -} - -void to_json(json& j, GridShape const& grid_shape) { j = {grid_shape.r, grid_shape.c}; } -namespace legalizer -{ -void to_json(json& j, GraphSolver::ConstraintInfo::Page const& info) -{ - j["node_id_order"] = info.node_id_order; - j["op_models"] = info.id_to_op_models; - j["node_op_models"] = info.node_id_to_op_model_ids; - j["edge_path_sets"] = info.edge_to_path_sets; - j["failure_reasons"] = info.failure_reason_ids; -} - -void to_json(json& j, GraphSolver::ConstraintInfo const& info) -{ - j["graph_name"] = info.graph_name; - j["paged"] = true; - j["num_pages"] = info.pages.size(); - j["page_size"] = GraphSolver::ConstraintInfo::kPageSize; - j["op_model_selection"] = info.op_model_selection; - j["node_names"] = info.node_id_to_name; - j["node_pages"] = info.node_name_to_page; - j["error_codes"] = ConstraintFailureReasonDesc; -} - -} // namespace legalizer -} // namespace balancer } // namespace tt diff --git a/pybuda/csrc/reportify/to_json.hpp b/pybuda/csrc/reportify/to_json.hpp index c98d6e1d9..5f1bdd49f 100644 --- a/pybuda/csrc/reportify/to_json.hpp +++ b/pybuda/csrc/reportify/to_json.hpp @@ -3,13 +3,9 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once +#include "graph_lib/node_types.hpp" #include "json_fwd.hpp" - #include "lower_to_buda/common.hpp" -#include "graph_lib/node_types.hpp" -#include "placer/placer.hpp" -#include "balancer/types.hpp" -#include "balancer/legalizer/graph_solver.hpp" namespace std { 
@@ -31,20 +27,4 @@ void to_json(json& j, UBlockOrder const& ublock_order); void to_json(json& j, OpType const& op_type); void to_json(json& j, EdgeAttributes const& attrs); } // namespace graphlib - -namespace balancer -{ -std::string to_string(TStreamDir const& dir); -void to_json(json& j, TStreamFactor const& tsr); -void to_json(json& j, BlockShape const& block_shape); -void to_json(json& j, BufferModel const& buffer_model); -void to_json(json& j, TensorShape const& shape); -void to_json(json& j, OpModel const& op_model); -void to_json(json& j, GridShape const& grid_shape); -namespace legalizer -{ -void to_json(json& j, GraphSolver::ConstraintInfo::Page const& constraint_info_page); -void to_json(json& j, GraphSolver::ConstraintInfo const& constraint_info); -} // namespace legalizer -} // namespace balancer } // namespace tt diff --git a/pybuda/csrc/runtime/CMakeLists.txt b/pybuda/csrc/runtime/CMakeLists.txt new file mode 100644 index 000000000..ea9c4fe32 --- /dev/null +++ b/pybuda/csrc/runtime/CMakeLists.txt @@ -0,0 +1,4 @@ +add_library(runtime STATIC runtime.cpp tt_device.cpp python_bindings.cpp) +add_dependencies(runtime build_tt_mlir) + +target_compile_options(runtime PRIVATE ${STATIC_LIB_FLAGS} ${PYBUDA_CSRC_CFLAGS}) diff --git a/pybuda/csrc/runtime/python_bindings.cpp b/pybuda/csrc/runtime/python_bindings.cpp new file mode 100644 index 000000000..b19c73e57 --- /dev/null +++ b/pybuda/csrc/runtime/python_bindings.cpp @@ -0,0 +1,19 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "runtime/python_bindings.hpp" +#include "runtime/runtime.hpp" +#include "tt/runtime/types.h" + +namespace tt { + +void RuntimeModule(py::module &m_runtime) +{ + py::class_(m_runtime, "Binary") + .def("get_program_inputs", &runtime::Binary::getProgramInputs) + .def("get_program_outputs", &runtime::Binary::getProgramOutputs); + m_runtime.def("run_binary", tt::run_binary); +} + +} // namespace tt diff --git a/pybuda/csrc/runtime/python_bindings.hpp b/pybuda/csrc/runtime/python_bindings.hpp new file mode 100644 index 000000000..ef0fa90e5 --- /dev/null +++ b/pybuda/csrc/runtime/python_bindings.hpp @@ -0,0 +1,19 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#pragma once + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments" +#include "pybind11/pybind11.h" +#include +#include +#pragma clang diagnostic pop +namespace py = pybind11; + +namespace tt { + +void RuntimeModule(py::module &m_runtime); + +} // namespace tt diff --git a/pybuda/csrc/runtime/runtime.cpp b/pybuda/csrc/runtime/runtime.cpp new file mode 100644 index 000000000..f17fff4b7 --- /dev/null +++ b/pybuda/csrc/runtime/runtime.cpp @@ -0,0 +1,142 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "runtime.hpp" +#include + +#include "tt_device.hpp" +#include "utils/logger.hpp" +#include "tt/runtime/runtime.h" + +namespace tt { + +static target::DataType torch_scalar_type_to_dt(torch::ScalarType st) +{ + switch (st) + { + case torch::ScalarType::Byte: return target::DataType::UInt8; + case torch::ScalarType::Char: return target::DataType::UInt8; + case torch::ScalarType::Short: return target::DataType::UInt16; + case torch::ScalarType::Int: return target::DataType::UInt32; + case torch::ScalarType::Long: return target::DataType::UInt32; + case torch::ScalarType::Half: return target::DataType::Float16; + case 
torch::ScalarType::Float: return target::DataType::Float32; + // case torch::ScalarType::Double: + // case torch::ScalarType::ComplexHalf: + // case torch::ScalarType::ComplexFloat: + // case torch::ScalarType::ComplexDouble: + // case torch::ScalarType::Bool: + case torch::ScalarType::BFloat16: return target::DataType::BFloat16; + default: break; + } + + log_fatal(LogTTDevice, "Unhandled dtype {}", st); +} + +static torch::ScalarType dt_to_torch_scalar_type(target::DataType df) +{ + switch (df) + { + case target::DataType::UInt8: return torch::ScalarType::Byte; + case target::DataType::UInt16: return torch::ScalarType::Short; + case target::DataType::UInt32: return torch::ScalarType::Int; + case target::DataType::Float16: return torch::ScalarType::Half; + case target::DataType::Float32: return torch::ScalarType::Float; + case target::DataType::BFloat16: return torch::ScalarType::BFloat16; + default: break; + } + + log_fatal(LogTTDevice, "Unhandled dtype {}", df); +} + +template +std::vector as_vec_int64(std::vector const& vec) +{ + std::vector result; + result.reserve(vec.size()); + for (auto const& v : vec) + { + result.push_back(v); + } + return result; +} + +static runtime::Tensor create_tensor(const torch::Tensor& tensor) +{ + auto data = std::shared_ptr( + tensor.data_ptr(), + [tensor](void*) { (void)tensor; } // Capture tensor by value to increase ref count and keep it alive + ); + + auto shape = std::vector(tensor.sizes().begin(), tensor.sizes().end()); + auto stride = std::vector(tensor.strides().begin(), tensor.strides().end()); + + return runtime::createTensor( + data, + shape, + stride, + tensor.element_size(), + torch_scalar_type_to_dt(tensor.scalar_type())); +} + +runtime::Binary load_binary_from_file(std::string const& filename) +{ + runtime::Binary binary = tt::runtime::Binary::loadFromPath(filename.c_str()).handle; + return binary; +} + +std::vector run_binary_from_file(std::string const& filename, int program_idx, std::vector const& inputs) +{ + auto binary = load_binary_from_file(filename); + + return run_binary(binary, program_idx, inputs); +} + +std::vector run_binary(runtime::Binary &binary, int program_idx, std::vector const& inputs) +{ + auto& system = TTSystem::get_system(); + + for (auto &device : system.devices) + { + if (!device->is_open()) + { + device->open_device(); + } + } + + // For now, we only support a single device. 
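The create_tensor helper above relies on a small ownership trick: the std::shared_ptr handed to runtime::createTensor points at the torch storage, but its deleter captures the torch::Tensor by value, so the tensor (and therefore the data) stays alive for as long as the runtime holds the pointer, without any copy. A minimal standalone sketch of the same keep-alive pattern, with a toy Blob type standing in for torch::Tensor (names here are illustrative, not the tt-mlir API):

// Sketch only: keep ref-counted storage alive by capturing its owner in a
// no-op deleter. `Blob` stands in for torch::Tensor / runtime::Tensor.
#include <cstdio>
#include <memory>
#include <vector>

struct Blob
{
    std::shared_ptr<std::vector<float>> storage =
        std::make_shared<std::vector<float>>(4, 1.0f);
    void* data() { return storage->data(); }
};

std::shared_ptr<void> borrow(Blob blob)
{
    // The lambda copies `blob`, so the underlying storage refcount stays above
    // zero for the lifetime of the returned pointer; the deleter frees nothing.
    return std::shared_ptr<void>(blob.data(), [blob](void*) { (void)blob; });
}

int main()
{
    std::shared_ptr<void> view;
    {
        Blob b;
        view = borrow(b);
    }  // `b` is gone, but its storage is still reachable through `view`.
    std::printf("first element: %f\n", static_cast<float*>(view.get())[0]);
    return 0;
}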
+ auto& tt_device = system.devices[0]; + if (!tt_device->is_open()) + { + log_fatal(LogTTDevice, "Failed to open device"); + } + + auto& device = *tt_device->rt_device; + + std::vector rt_inputs; + for (auto const& input : inputs) + { + rt_inputs.emplace_back(create_tensor(input)); + } + + std::vector outputs; + std::vector rt_outputs; + std::vector output_descs = binary.getProgramOutputs(program_idx); + outputs.reserve(output_descs.size()); + for (auto const& desc : output_descs) + { + std::vector shape = as_vec_int64(desc.shape); + std::vector stride = as_vec_int64(desc.stride); + + torch::Tensor output = at::empty_strided(shape, stride, dt_to_torch_scalar_type(desc.dataType)); + outputs.emplace_back(std::move(output)); + rt_outputs.emplace_back(create_tensor(outputs.back())); + } + + runtime::Event _ = runtime::submit(device, binary, program_idx, rt_inputs, rt_outputs); + + return outputs; +} + +} // namespace tt diff --git a/pybuda/csrc/runtime/runtime.hpp b/pybuda/csrc/runtime/runtime.hpp new file mode 100644 index 000000000..42953cd36 --- /dev/null +++ b/pybuda/csrc/runtime/runtime.hpp @@ -0,0 +1,19 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include +#include +#include "tt/runtime/types.h" + +namespace tt { + +// Entry point for invoking tt-mlir runtime and running the binary on the device. +std::vector run_binary(runtime::Binary& binary, int program_idx, std::vector const& inputs); + +// Helper function to run the binary from the file - might be useful for testing/debugging. +std::vector run_binary_from_file(std::string const& filename, int program_idx, std::vector const& inputs); + +} // namespace tt + diff --git a/pybuda/csrc/runtime/tt_device.cpp b/pybuda/csrc/runtime/tt_device.cpp new file mode 100644 index 000000000..f6a47d919 --- /dev/null +++ b/pybuda/csrc/runtime/tt_device.cpp @@ -0,0 +1,64 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include + +#include "tt_device.hpp" +#include "utils/assert.hpp" +#include "utils/logger.hpp" + +#include "tt/runtime/runtime.h" + +namespace tt { + +TTSystem detect_available_devices() { + auto [system_desc, chip_ids] = runtime::getCurrentSystemDesc(); + + std::vector> devices; + int logical_device_index = 0; + ARCH arch = ARCH::Invalid; + for (std::uint32_t chip_desc_index : *system_desc->chip_desc_indices()) + { + target::ChipDesc const* chip_desc = system_desc->chip_descs()->Get(chip_desc_index); + target::ChipCapability chip_capabilities = system_desc->chip_capabilities()->Get(logical_device_index); + + bool mmio = bool(chip_capabilities & target::ChipCapability::HostMMIO); + if (not mmio) + { + continue; + } + + switch(chip_desc->arch()) + { + case target::Arch::Grayskull: arch = ARCH::GRAYSKULL; break; + case target::Arch::Wormhole_b0: arch = ARCH::WORMHOLE_B0; break; + case target::Arch::Blackhole: arch = ARCH::BLACKHOLE; break; + default: log_fatal(LogTTDevice, "Unknown chip type {}", chip_desc->arch()); + } + + auto device = std::make_shared(std::nullopt, system_desc, arch, mmio, logical_device_index); + devices.push_back(device); + ++logical_device_index; + } + + return TTSystem{system_desc, chip_ids, devices}; +} + +TTSystem& TTSystem::get_system() { + static TTSystem system = detect_available_devices(); + return system; +} + +void TTDevice::open_device() { + TT_ASSERT(!is_open()); + rt_device = runtime::openDevice({index}); +} + +void TTDevice::close_device() { + TT_ASSERT(is_open()); + 
runtime::closeDevice(rt_device.value()); + rt_device.reset(); +} + +} // namespace tt diff --git a/pybuda/csrc/runtime/tt_device.hpp b/pybuda/csrc/runtime/tt_device.hpp new file mode 100644 index 000000000..338453022 --- /dev/null +++ b/pybuda/csrc/runtime/tt_device.hpp @@ -0,0 +1,83 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include + +#include "pybuda/csrc/backend_api/arch_type.hpp" +#include "tt/runtime/types.h" + +namespace tt +{ + +struct TTDevice +{ + std::optional rt_device; + ARCH arch; + bool mmio; + int index; + + // TODO: These don't seem to belong here + std::map> input_runtime_transforms; + std::map>> input_tile_bcast_dims; + std::map> output_runtime_transforms; + std::unordered_map> subgraph_to_tensor_uid_on_device; + + TTDevice( + std::optional rt_device, + runtime::SystemDesc system_desc, + ARCH arch, + bool mmio, + int index) : + rt_device(rt_device), + arch(arch), + mmio(mmio), + index(index) + { + } + + TTDevice(const TTDevice&) = delete; + TTDevice& operator=(const TTDevice&) = delete; + + bool is_open() const + { + return rt_device.has_value(); + } + + void open_device(); + void close_device(); +}; + +struct TTSystem +{ + runtime::SystemDesc system_desc; + std::vector chip_ids; + std::vector> devices; + + TTSystem(const TTSystem&) = delete; + TTSystem& operator=(const TTSystem&) = delete; + + ~TTSystem() + { + close_devices(); + } + + void close_devices() + { + for (auto& device : devices) + { + if (device->is_open()) + { + device->close_device(); + } + } + } + + static TTSystem& get_system(); +}; + +TTSystem detect_available_devices(); + +} // namespace tt + diff --git a/pybuda/csrc/scheduler/interactive_scheduler.cpp b/pybuda/csrc/scheduler/interactive_scheduler.cpp deleted file mode 100644 index f39be2d4a..000000000 --- a/pybuda/csrc/scheduler/interactive_scheduler.cpp +++ /dev/null @@ -1,127 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "scheduler/interactive_scheduler.hpp" - -#include "graph_lib/graph.hpp" -#include "graph_lib/node_types.hpp" -#include "scheduler/longest_path.hpp" -#include "utils/logger.hpp" - -namespace tt::scheduler -{ - -bool InteractiveScheduler::op_is_ready(const graphlib::BudaOpNode *op) const -{ - if (op->get_epoch_type() != current_epoch_type) - return false; - - for (graphlib::Node *node : graph->operands(op)) - { - if (node->node_type() == graphlib::kBudaOp) - { - if (scheduled_ops.count(node->as()) == 0) - return false; - } - } - - return true; -} - -void InteractiveScheduler::try_schedule_users(const graphlib::Node *node) -{ - for (graphlib::Node *node : graph->users(node)) - if (node->node_type() == graphlib::kBudaOp) { - auto it = std::find(ready_ops.begin(), ready_ops.end(), node->name()); - if ((it == ready_ops.end()) && op_is_ready(node->as())) - ready_ops.push_back(node->name()); - } -} - -InteractiveScheduler::InteractiveScheduler( - const SchedulerConfig &config, const graphlib::Graph *graph, graphlib::NodeEpochType initial_epoch_type) : - graph(graph), current_epoch_type(initial_epoch_type) -{ - // Generate preferred schedule. Scheduler will prioritize the next op in the scheduler when offering - // choices. 
- switch (config.policy) - { - case SchedulerPolicy::Topological: preferred_schedule = run_topological_scheduler(graph); break; - case SchedulerPolicy::ModuleInputsBFS: preferred_schedule = run_module_by_module_scheduler(config, graph); break; - case SchedulerPolicy::LongestPath: preferred_schedule = run_longest_path_scheduler(graph); break; - default: log_fatal("providing unknown scheduler policy."); - } - - if (preferred_schedule.size() == 0) - return; - - // Create initial set of op candidates - ready_ops.push_back(preferred_schedule.at(0)); - - for (graphlib::Node *node : graph->nodes()) - { - if (node->node_type() == graphlib::NodeType::kInput) - { - try_schedule_users(node); - } - } -} - -std::vector InteractiveScheduler::get_ops() const { return ready_ops; } - -void InteractiveScheduler::accept_op(const std::string &op) -{ - // Mark op as scheduled, and update the 'ready_ops' list - scheduled_ops.insert(graph->get_node_by_name(op)); - - auto it = std::find(ready_ops.begin(), ready_ops.end(), op); - TT_ASSERT(it != ready_ops.end()); - ready_ops.erase(it); - - try_schedule_users(graph->get_node_by_name(op)); - - it = std::find(preferred_schedule.begin(), preferred_schedule.end(), op); - if (it != preferred_schedule.end()) - preferred_schedule.erase(it); - - if (preferred_schedule.size() > 0) - { - it = std::find(ready_ops.begin(), ready_ops.end(), preferred_schedule.at(0)); - if (it != ready_ops.begin() && it != ready_ops.end()) // found it, and not at the top spot - move it - { - ready_ops.erase(it); - ready_ops.insert(ready_ops.begin(), preferred_schedule.at(0)); - } - } -} - -bool InteractiveScheduler::done() const { return ready_ops.size() == 0; } - -void InteractiveScheduler::set_epoch_type(graphlib::NodeEpochType epoch_type) -{ - TT_ASSERT(done(), "Epoch type shouldn't be changed on the fly. At least not with current implementation"); - current_epoch_type = epoch_type; - - for (const std::string &op : preferred_schedule) - { - if (op_is_ready(graph->get_node_by_name(op)->as())) - { - ready_ops.push_back(op); - } - } -} - -InteractiveScheduler::Checkpoint InteractiveScheduler::save_checkpoint() const -{ - return Checkpoint{preferred_schedule, ready_ops, scheduled_ops, current_epoch_type}; -} - -void InteractiveScheduler::restore_checkpoint(const InteractiveScheduler::Checkpoint &checkpoint) -{ - preferred_schedule = checkpoint.preferred_schedule; - ready_ops = checkpoint.ready_ops; - scheduled_ops = checkpoint.scheduled_ops; - current_epoch_type = checkpoint.current_epoch_type; -} - -} // namespace tt::scheduler diff --git a/pybuda/csrc/scheduler/interactive_scheduler.hpp b/pybuda/csrc/scheduler/interactive_scheduler.hpp deleted file mode 100644 index c94c2bc1b..000000000 --- a/pybuda/csrc/scheduler/interactive_scheduler.hpp +++ /dev/null @@ -1,59 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include -#include - -#include "scheduler/scheduler.hpp" -#include "scheduler/utils.hpp" -#include "graph_lib/defines.hpp" - -namespace tt -{ -namespace graphlib -{ -class Graph; -class BudaOpNode; -} -namespace scheduler -{ -// Interactive scheduler returns a list of ops that could be scheduled next, ordered by preference -// of the selected scheduling algorithm. 
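The header comment above captures the contract of the removed interactive scheduler: a placer repeatedly asks for the ops that are currently ready, commits to one, and reports that choice back until nothing is left. A rough illustration of that driver loop, using a stub scheduler with the same get_ops/accept_op/done surface (made-up op names, not the deleted class):

// Sketch only: a stand-in scheduler exposing the same surface as the removed
// InteractiveScheduler, driven to completion by a trivial "placer" loop.
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

class StubScheduler
{
   public:
    explicit StubScheduler(std::vector<std::string> order) : order_(std::move(order)) {}

    std::vector<std::string> get_ops() const
    {
        // Offer at most one candidate; the real scheduler offers every ready op.
        if (next_ < order_.size())
            return {order_[next_]};
        return {};
    }
    void accept_op(const std::string&) { ++next_; }
    bool done() const { return next_ == order_.size(); }

   private:
    std::vector<std::string> order_;
    std::size_t next_ = 0;
};

int main()
{
    StubScheduler sched({"matmul_0", "add_1", "softmax_2"});
    while (!sched.done())
    {
        // A real placer would pick whichever candidate fits best; take the first.
        std::string op = sched.get_ops().front();
        std::cout << "placing " << op << "\n";
        sched.accept_op(op);
    }
    return 0;
}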
-class InteractiveScheduler -{ - private: - const graphlib::Graph *graph; - graphlib::NodeEpochType current_epoch_type; - - Schedule preferred_schedule; - std::vector ready_ops; // ops ready to be executed - std::unordered_set scheduled_ops; // ops that have been accepted - - bool op_is_ready(const graphlib::BudaOpNode *op) const; - void try_schedule_users(const graphlib::Node *node); - - public: - InteractiveScheduler(const SchedulerConfig &config, const graphlib::Graph *graph, graphlib::NodeEpochType initial_epoch_type); - - std::vector get_ops() const; - void accept_op(const std::string &op); - bool done() const; - - void set_epoch_type(graphlib::NodeEpochType epoch_type); - - struct Checkpoint { - Schedule preferred_schedule; - std::vector ready_ops; // ops ready to be executed - std::unordered_set scheduled_ops; // ops that have been accepted - graphlib::NodeEpochType current_epoch_type; - }; - - Checkpoint save_checkpoint() const; - void restore_checkpoint(const Checkpoint &checkpoint); -}; - -} // namespace scheduler -} // namespace tt diff --git a/pybuda/csrc/scheduler/longest_path.cpp b/pybuda/csrc/scheduler/longest_path.cpp deleted file mode 100644 index 83822b82b..000000000 --- a/pybuda/csrc/scheduler/longest_path.cpp +++ /dev/null @@ -1,174 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "scheduler/longest_path.hpp" -#include "graph_lib/graph.hpp" -#include "graph_lib/node.hpp" -#include "graph_lib/utils.hpp" - -#include "utils/logger.hpp" - -using tt::graphlib::Graph; -using tt::graphlib::Node; -using tt::LogScheduler; - -// -// Give longest path throgh the graph the priority - schedule along it whenever possible, along with any additional outputs -// that are created along that path. The goal is to have outputs "sit" for the shortest amount of time. -// -// To achieve the goal: -// - Schedule the next op in the longest path when possible. -// - When an op on longest path can't be scheduled, schedule its operands as quickly as possible. -// - Don't go deep down paths when other ops have their outputs sitting around. -// -// We'll do this by having two priority groups: -// -// - Priority 1 - only schedule ops from this group. The next in the longest path gets priority. -// - Priority 2 - when priority 1 group is empty, move priority 2 to priority 1 -// -// Priority 1 starts with the first op of the longest path, priority 2 starts empty. -// Any time a new op is scheduled, its users are added to priority 2, if they aren't in priority 1 or have been scheduled already. -// If no op in P1 can be scheduled, then loop: -// - all operands of P1 ops are added to P1, unless they have been scheduled already. If they are in P2, they are removed from there. -// ..until something from P1 can be scheduled. -namespace tt::scheduler { - -Schedule run_longest_path_scheduler(const Graph* graph) -{ - - auto less = [](Node *a, Node *b) { return a->id() < b->id(); }; - using SortedNodeSet = std::set>; // using a sorted set to preserve determinism - - std::vector longest_path = graphlib::get_longest_path(graph); - SortedNodeSet P1(less); - SortedNodeSet P2(less); - P1.insert(longest_path[1]); // skipping the first node since it's an input.. 
it's already "scheduled" - std::uint32_t current_longest_path_index = 1; - - // Set of ops currently scheduled for quick lookup - SortedNodeSet scheduled(less); - SortedNodeSet input_nodes(less); - for (Node *node : graph->nodes()) - if (node->node_type() == graphlib::NodeType::kInput) { - input_nodes.insert(node); - scheduled.insert(node); // all inputs are "scheduled", so that ops that only depend on inputs can see their operands ready - } - - // Actual schedule - std::vector schedule; - - // Check if op can be scheduled - i.e. all of its operands have been scheduled - auto can_be_scheduled = [&scheduled, &graph](Node *node) -> bool { - std::vector operands = graph->data_operands(node); - TT_ASSERT(operands.size() > 0, "Input " + node->name() + " should've already been scheduled"); // all inputs should already be scheduled - return std::all_of(operands.begin(), operands.end(), [&scheduled](Node *operand) { return scheduled.count(operand) > 0; }); - }; - - // Schedule op - add to schedule, remove from P1, add outputs to P2 - auto schedule_op = [&scheduled, &P1, &P2, &can_be_scheduled, &schedule, &graph](Node *node) { - TT_ASSERT(can_be_scheduled(node)); - TT_ASSERT(P1.count(node) > 0 && P2.count(node) == 0 && scheduled.count(node) == 0); - schedule.push_back(node); - scheduled.insert(node); - P1.erase(node); - - for (Node *user : graph->data_users(node)) - { - if (P1.count(user) == 0 && scheduled.count(user) == 0) - P2.insert(user); - } - - }; - - - // Keep scheduling until nothing's left - while (!P1.empty() || !P2.empty()) - { - if (P1.empty()) { - P1 = P2; - P2.clear(); - } - - // First schedule longest path, if possible - Node *next_on_longest_path = longest_path[current_longest_path_index]; - while (P1.count(next_on_longest_path) > 0 && can_be_scheduled(next_on_longest_path)) { - schedule_op(next_on_longest_path); - if (current_longest_path_index < longest_path.size() - 1) - current_longest_path_index++; - next_on_longest_path = longest_path[current_longest_path_index]; - } - - // Now schedule everything else in P1, if possible - SortedNodeSet P1_copy(less); - P1_copy = P1; // make a copy since we'll be removing items - for (Node *node: P1_copy) - { - if (can_be_scheduled(node)) - schedule_op(node); - } - - // If there's anything left in P1, it means it couldn't be scheduled, so we'll try to schedule its operands - P1_copy = P1; - for (Node *node: P1_copy) - { - for (Node *operand: graph->data_operands(node)) - { - if (P1.count(operand) == 0 && scheduled.count(operand) == 0) { - if (P2.count(operand) > 0) - P2.erase(operand); // upgrade to P1 - P1.insert(operand); - } - } - } - - if (P1.empty() && P2.empty()) { - // Check if any of the inputs have their users not scheduled - add them to P1 to get them queued up. 
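The policy deleted above leans on graphlib::get_longest_path(graph) and then grows the schedule outward from that path with the two priority sets described in the file-level comment. As a rough, standalone illustration of just the longest-path piece, here is the usual topological-order DP over a DAG (plain integer node ids; this is not the graphlib implementation):

// Sketch only: longest path in a DAG via DP over a topological order.
// adj[u] lists the successors of u; nodes 0..n-1 are assumed to already be
// numbered in topological order.
#include <algorithm>
#include <cstdio>
#include <vector>

std::vector<int> longest_path(const std::vector<std::vector<int>>& adj)
{
    const int n = static_cast<int>(adj.size());
    std::vector<int> best(n, 0);   // best[v] = max #edges on a path ending at v
    std::vector<int> prev(n, -1);  // predecessor of v on that path
    for (int u = 0; u < n; ++u)
        for (int v : adj[u])
            if (best[u] + 1 > best[v])
            {
                best[v] = best[u] + 1;
                prev[v] = u;
            }

    int end = static_cast<int>(std::max_element(best.begin(), best.end()) - best.begin());
    std::vector<int> path;
    for (int v = end; v != -1; v = prev[v]) path.push_back(v);
    std::reverse(path.begin(), path.end());
    return path;
}

int main()
{
    // 0 -> {1, 2}, 1 -> 3, 2 -> 3: both maximal paths have two edges.
    std::vector<std::vector<int>> adj = {{1, 2}, {3}, {3}, {}};
    for (int v : longest_path(adj)) std::printf("%d ", v);
    std::printf("\n");  // prints "0 1 3"
    return 0;
}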
- for (Node *input_node : input_nodes) - for (Node *user: graph->data_users(input_node)) - if (scheduled.count(user) == 0) - P1.insert(user); - } - } - - // Debug - //std::cout << "Schedule:" << std::endl; - //for (Node *node: schedule) - // std::cout << " - " << node->name() << std::endl; - - // Verify - auto visible_nodes = graphlib::visible_nodes(*graph); - if (scheduled.size() != visible_nodes.size()) - { - for (Node *node : visible_nodes) - if (scheduled.count(node) == 0) - { - log_error(tt::LogScheduler, "{} hasn't been scheduled.", node->name()); - } - TT_THROW("Some nodes haven't been scheduled"); - } - - std::unordered_set visited; - for (Node *node: schedule) { - for (Node *operand: graph->data_operands(node)) - TT_ASSERT(visited.count(operand) > 0 || input_nodes.count(operand) > 0, - "Operand " + operand->name() + " of node " + node->name() + " hasn't been scheduled before the node."); - visited.insert(node); - } - - // Remove all unscheduleable nodes - std::vector final_schedule; - for (Node *node : schedule) - { - if (can_schedule_node(node)) - { - final_schedule.push_back(node); - } - } - - Schedule ret; - for (Node *node : final_schedule) ret.push_back(node->name()); - return ret; -} - -} - diff --git a/pybuda/csrc/scheduler/longest_path.hpp b/pybuda/csrc/scheduler/longest_path.hpp deleted file mode 100644 index 8d41c2b07..000000000 --- a/pybuda/csrc/scheduler/longest_path.hpp +++ /dev/null @@ -1,15 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include "scheduler/scheduler.hpp" -#include "scheduler/utils.hpp" - -namespace tt::scheduler -{ - -// Schedule the longest path through the graph, and then fill in the missing parts -Schedule run_longest_path_scheduler(const graphlib::Graph* graph); - -} // namespace tt::scheduler diff --git a/pybuda/csrc/scheduler/module.mk b/pybuda/csrc/scheduler/module.mk deleted file mode 100644 index 2a4b93b95..000000000 --- a/pybuda/csrc/scheduler/module.mk +++ /dev/null @@ -1,32 +0,0 @@ -# Every variable in subdir must be prefixed with subdir (emulating a namespace) - -PYBUDA_CSRC_SCHEDULER_LIB = $(LIBDIR)/libscheduler.a -PYBUDA_CSRC_SCHEDULER_SRCS = \ - pybuda/csrc/scheduler/scheduler.cpp \ - pybuda/csrc/scheduler/longest_path.cpp \ - pybuda/csrc/scheduler/utils.cpp \ - pybuda/csrc/scheduler/interactive_scheduler.cpp \ - pybuda/csrc/scheduler/python_bindings.cpp - -PYBUDA_CSRC_SCHEDULER_INCLUDES = $(PYBUDA_CSRC_INCLUDES) - -PYBUDA_CSRC_SCHEDULER_OBJS = $(addprefix $(OBJDIR)/, $(PYBUDA_CSRC_SCHEDULER_SRCS:.cpp=.o)) -PYBUDA_CSRC_SCHEDULER_DEPS = $(addprefix $(OBJDIR)/, $(PYBUDA_CSRC_SCHEDULER_SRCS:.cpp=.d)) - - - --include $(PYBUDA_CSRC_SCHEDULER_DEPS) - -SCHEDULER_CSRC_CFLAGS = $(PYBUDA_CSRC_CFLAGS) - -# Each module has a top level target as the entrypoint which must match the subdir name -pybuda/csrc/scheduler: $(PYBUDA_CSRC_SCHEDULER_LIB) - -$(PYBUDA_CSRC_SCHEDULER_LIB): $(PYBUDA_CSRC_SCHEDULER_OBJS) $(PYBUDA_CSRC_GRAPH_LIB) - @mkdir -p $(LIBDIR) - ar rcs $@ $^ - -$(OBJDIR)/pybuda/csrc/scheduler/%.o: pybuda/csrc/scheduler/%.cpp - @mkdir -p $(@D) - $(CXX) $(SCHEDULER_CSRC_CFLAGS) $(CXXFLAGS) $(STATIC_LIB_FLAGS) $(PYBUDA_CSRC_SCHEDULER_INCLUDES) -c -o $@ $< - diff --git a/pybuda/csrc/scheduler/python_bindings.cpp b/pybuda/csrc/scheduler/python_bindings.cpp deleted file mode 100644 index a0a430198..000000000 --- a/pybuda/csrc/scheduler/python_bindings.cpp +++ /dev/null @@ -1,30 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// 
SPDX-License-Identifier: Apache-2.0 -#include "scheduler/python_bindings.hpp" -#include "scheduler/scheduler.hpp" -#include "graph_lib/graph.hpp" - -#include - -using namespace tt::scheduler; - -void SchedulerModule(py::module &m_scheduler) { - py::enum_(m_scheduler, "SchedulerPolicy") - .value("Topological", SchedulerPolicy::Topological) - .value("ModuleInputsBFS", SchedulerPolicy::ModuleInputsBFS) - .export_values(); - - py::class_(m_scheduler, "SchedulerConfig") - .def(py::init>>(), py::arg("scheduler_policy"), py::arg("scheduler_constraints")) - .def_readwrite("policy", &SchedulerConfig::policy) - .def_readwrite("scheduler_constraints", &SchedulerConfig::scheduler_constraints); - - m_scheduler.def( - "policy_from_string", &policy_from_string, "Returns schedule policy from string", py::arg("schedule_policy_str")); - - m_scheduler.def( - "run_scheduler", &run_scheduler, py::arg("scheduler_config"), py::arg("graph") - ); -} - diff --git a/pybuda/csrc/scheduler/python_bindings.hpp b/pybuda/csrc/scheduler/python_bindings.hpp deleted file mode 100644 index 3a02604f1..000000000 --- a/pybuda/csrc/scheduler/python_bindings.hpp +++ /dev/null @@ -1,10 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include -#include -#include -namespace py = pybind11; - -void SchedulerModule(py::module &m_scheduler); - diff --git a/pybuda/csrc/scheduler/scheduler.cpp b/pybuda/csrc/scheduler/scheduler.cpp deleted file mode 100644 index 1318415bc..000000000 --- a/pybuda/csrc/scheduler/scheduler.cpp +++ /dev/null @@ -1,861 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#include "scheduler/scheduler.hpp" - -#include -#include -#include -#include - -#include "graph_lib/defines.hpp" -#include "graph_lib/graph.hpp" -#include "graph_lib/node.hpp" -#include "graph_lib/node_types.hpp" -#include "graph_lib/utils.hpp" -#include "placer/lower_to_placer.hpp" -#include "scheduler/longest_path.hpp" -#include "utils/logger.hpp" - -using tt::LogScheduler; -using tt::graphlib::Node; -using tt::graphlib::NodeId; -using tt::graphlib::Edge; -using tt::graphlib::EdgeType; - -namespace tt::scheduler { - -Schedule run_topological_scheduler(const graphlib::Graph* graph) -{ - Schedule scheduled_nodes; - for (graphlib::Node* node : graphlib::topological_sort(*graph)) - { - if (can_schedule_node(node)) - { - scheduled_nodes.push_back(node->name()); - } - } - return scheduled_nodes; -} - -static bool requires_visit(const std::unordered_set& visited, NodeId node_id) { - return visited.find(node_id) == visited.end(); -} - -void assert_schedule_dependencies_met(const graphlib::Graph* graph, const Schedule& schedule) -{ - std::unordered_map node_to_schedule_index; - node_to_schedule_index.reserve(schedule.size()); - for (std::uint32_t i = 0; i < schedule.size(); ++i) - { - node_to_schedule_index[schedule[i]] = i; - } - - for (const std::string& op : schedule) - { - Node* node = graph->get_node_by_name(op); - for (const Node* predecessor_node : get_schedule_predecessors(graph, node)) - { - if (node_to_schedule_index.find(predecessor_node->name()) != node_to_schedule_index.end()) - { - TT_LOG_ASSERT( - node_to_schedule_index[predecessor_node->name()] < node_to_schedule_index[op], - "Scheduler: dependency not met for node: {}", - op); - } - } - } -} - -void assert_valid_schedule(const graphlib::Graph* graph, const Schedule& schedule) { - // basic check: verify all nodes have been placed using topo as baseline - Schedule 
topo_schedule = run_topological_scheduler(graph); - - std::unordered_set scheduled_set; - for (auto op : schedule) { - scheduled_set.insert(op); - } - for (auto op : topo_schedule) { - if (scheduled_set.find(op) == scheduled_set.end()) { - log_warning("{} not found in the scheduled_set.", op); - } - } - - if (schedule.size() != topo_schedule.size()) - { - std::vector diff; - auto copy = schedule; - std::sort(copy.begin(), copy.end()); - std::sort(topo_schedule.begin(), topo_schedule.end()); - std::set_symmetric_difference( - copy.begin(), copy.end(), topo_schedule.begin(), topo_schedule.end(), std::back_inserter(diff)); - log_error("Scheduler: not all nodes have been placed using current policy."); - for (auto name : diff) log_error(" {} {}", name, graph->get_node_by_name(name)->get_type()); - log_fatal("Scheduler: not all nodes have been placed using current policy."); - } - assert_schedule_dependencies_met(graph, schedule); -} - -static void push_to_node_queue( - const graphlib::Graph* graph, - std::queue& node_queue, - const std::vector& nodes) -{ - for (const std::string& name : nodes) { - log_debug(LogScheduler, "Running BFS from module input: {}", name); - node_queue.push(graph->get_node_by_name(name)); - } -} - -using NodeGroup = std::unordered_set; -using NodeGroupVector = std::vector; -NodeGroup add_group(const graphlib::Graph* graph, const std::string& op) -{ - NodeGroup group; - Node* node = graph->get_node_by_name(op); - for (const Edge& operand_edge : graph->operand_data_edges(node)) - { - if (operand_edge.edge_type == EdgeType::kDataLoopback or - operand_edge.edge_type == EdgeType::kControlLoop) - { - continue; - } - Node* predecessor_node = graph->node_by_id(operand_edge.producer_node_id); - group.insert(predecessor_node->name()); - } - log_debug(LogScheduler, "Creating group:"); - for (const auto& n : group) - { - log_debug(LogScheduler, "\t op: {}", n); - } - - return group; -} - -NodeGroupVector create_groups(const graphlib::Graph* graph, const std::vector& ops) -{ - NodeGroupVector groups; - for (const auto& op : ops) - { - groups.emplace_back(add_group(graph, op)); - } - return groups; -} - -std::vector discover_ops_for_grouped_inputs(const graphlib::Graph* graph) -{ - constexpr int HEURISTIC_FOR_NUM_INPUTS = 6; - const std::unordered_set target_op_types = {"concatenate", "splice"}; - - std::vector ops_for_grouped_inputs; - for (graphlib::Node* node : graphlib::topological_sort(*graph)) - { - if (node->node_type() == graphlib::NodeType::kPyOp or node->node_type() == graphlib::NodeType::kBudaOp) - { - const std::string& op_type = node->as()->op_name(); - if (target_op_types.find(op_type) != target_op_types.end()) - { - int data_inputs = 0; - for (const Edge& operand_edge : graph->operand_data_edges(node)) - { - if (operand_edge.edge_type == EdgeType::kDataLoopback or - operand_edge.edge_type == EdgeType::kControlLoop) - { - continue; - } - data_inputs += 1; - } - - if (data_inputs >= HEURISTIC_FOR_NUM_INPUTS) - { - ops_for_grouped_inputs.push_back(node->name()); - } - } - } - } - for (const auto& op : ops_for_grouped_inputs) - { - log_debug(LogScheduler, "Discovered: {}", op); - } - return ops_for_grouped_inputs; -} - -static void add_schedule_dependencies( - std::unordered_map& schedule_dependencies, - const std::vector& partial_ordering) -{ - for (size_t i = 1; i < partial_ordering.size(); i++) - { - schedule_dependencies[partial_ordering[i]] = partial_ordering[i - 1]; - } -} - -// schedule partial data-copies as early as possible. 
soft-constraint -static std::vector> get_schedule_constraints_for_partial_data_copies( - const graphlib::Graph* graph) -{ - std::vector> partial_orderings; - for (Node* partial_datacopy_output : graph->ordered_partial_datacopy_outputs()) - { - // find the op that performs the read from the write-back input/parameter - auto loopback_edge = graph->user_edges(partial_datacopy_output).at(0); - auto loopback_input = graph->node_by_id(loopback_edge.consumer_node_id); - - std::vector write_back_path; - for (auto node : graphlib::subgraph(graph, loopback_input, partial_datacopy_output)) - { - write_back_path.push_back(node->name()); - } - - if (not write_back_path.empty()) - { - std::reverse(write_back_path.begin(), write_back_path.end()); // graphlib::subgraph in rev order - partial_orderings.push_back(write_back_path); - - auto writer = graph->get_node_by_name(write_back_path.front()); - for (Node* user : graph->data_users(writer)) - { - if (std::find(write_back_path.begin(), write_back_path.end(), user->name()) == write_back_path.end()) - { - partial_orderings.push_back({write_back_path.back(), user->name()}); - } - } - } - } - log_trace(LogScheduler, "\t Partial ordering inserted from data-copy-writeback", partial_orderings); - return partial_orderings; -} - -static std::vector> get_constraints_from_control_edges(const graphlib::Graph* graph) -{ - std::vector> partial_orderings; - for (auto edge : graph->edges(graphlib::EdgeType::kControl)) - { - Node* producer = graph->node_by_id(edge.producer_node_id); - Node* consumer = graph->node_by_id(edge.consumer_node_id); - if (can_schedule_node(producer) and can_schedule_node(consumer)) - { - partial_orderings.push_back({producer->name(), consumer->name()}); - } - } - log_trace(LogScheduler, "\t Partial ordering inserted from control edges: {}", partial_orderings); - return partial_orderings; -} - -static std::unordered_map get_schedule_dependencies( - const SchedulerConfig& config, const graphlib::Graph* graph) -{ - std::unordered_map schedule_dependencies; - for (const auto& partial_ordering : config.scheduler_constraints) - { - add_schedule_dependencies(schedule_dependencies, partial_ordering); - } - for (const auto& partial_ordering : get_schedule_constraints_for_partial_data_copies(graph)) - { - add_schedule_dependencies(schedule_dependencies, partial_ordering); - } - for (const auto& partial_ordering : get_constraints_from_control_edges(graph)) - { - add_schedule_dependencies(schedule_dependencies, partial_ordering); - } - return schedule_dependencies; -} - -static std::vector get_operand_node_ids( - const std::unordered_map& schedule_dependencies, - const graphlib::Graph* graph, - const graphlib::Node* node) -{ - std::vector operand_node_ids; - for (const Edge& operand_edge : graph->operand_data_edges(node)) { - if (operand_edge.edge_type == EdgeType::kDataLoopback or - operand_edge.edge_type == EdgeType::kControlLoop) - { - continue; - } - operand_node_ids.push_back(operand_edge.producer_node_id); - } - if (schedule_dependencies.find(node->name()) != schedule_dependencies.end()) - { - const std::string& dependency = schedule_dependencies.at(node->name()); - Node* dependency_node = graph->get_node_by_name(dependency); - operand_node_ids.push_back(dependency_node->id()); - } - return operand_node_ids; -} - -// For a given parrent_node, fetch paired node if it exists so that they can be scheduled together. -// Note that valid paired node should have only parrent_node and inputs as operands. 
-// -const graphlib::Node* get_paired_op_if_exists(const graphlib::Graph* graph, const graphlib::Node* parrent_node) -{ - const graphlib::Node* paired_node = nullptr; - if (parrent_node->node_type() == graphlib::NodeType::kBudaOp) - { - const graphlib::BudaOpNode* op_node = static_cast(parrent_node); - - // Sparse-dense pair case. - // - if (op_node->is_sparse_matmul()) - { - std::vector users = graph->data_users(op_node); - if (users.size() == 1 && users[0]->node_type() == graphlib::kBudaOp) - { - const graphlib::BudaOpNode* user_op_node = static_cast(users[0]); - if (user_op_node->should_pair_with_sparse(op_node, graph)) - { - paired_node = user_op_node; - } - } - } - } - -#ifdef DEBUG - // Check that paired node has only parrent_node and inputs as operands. - // - if (paired_node != nullptr) - { - for (const Node* operand_node : graph->data_operands(paired_node)) - { - TT_ASSERT(operand_node == parrent_node || operand_node->node_type() == graphlib::NodeType::kInput); - } - } -#endif - - TT_ASSERT(!paired_node or can_schedule_node(paired_node)); - - return paired_node; -} - -Schedule run_scheduler( - const SchedulerConfig& config, - const graphlib::Graph* graph, - std::queue& node_queue, - std::unordered_set& visited, - const NodeGroupVector& groups) -{ - (void)groups; - std::unordered_map schedule_dependencies = get_schedule_dependencies(config, graph); - Schedule scheduled_nodes; - - - // declare a function to handle fracture groups - std::function VisitNode; - std::function FracVisit = [&](Node* node) -> bool{ - - // this fracture group has to be scheduled contiguously in DFS fashion - // get the fracture group id of the node - auto fracture_group_id = node->as()->tag_value("fracture_group_id"); - - // collect the nodes in this fracture group - // also collect the tops and bottoms - std::unordered_set fracture_group_nodes; - std::vector fracture_group_tops; - std::vector fracture_group_bottoms; - for (auto& node : graph->nodes()) { - if (node->as()->has_tag("fracture_group_id")) { - if (node->as()->tag_value("fracture_group_id") != fracture_group_id) { - continue; - } - fracture_group_nodes.insert(node->id()); - } - if (node->as()->has_tag("fracture_top")) { - fracture_group_tops.push_back(node->id()); - } - if (node->as()->has_tag("fracture_bottom")) { - fracture_group_bottoms.push_back(node->id()); - } - } - - // ensure that the fracture group is allowed to be scheduled by the compiler - // fracture nodes can participate in schedule dependencies only if all the fracture group nodes are tops or bottoms - bool fracture_group_allowed = true; - // iterate over the schedule dependencies - for (auto& [op, dep] : schedule_dependencies) { - // if neither op nor dep are in the fracture group, then skip - if (fracture_group_nodes.find(graph->get_node_by_name(op)->id()) == fracture_group_nodes.end() and - fracture_group_nodes.find(graph->get_node_by_name(dep)->id()) == fracture_group_nodes.end()) { - continue; - } - - // make sure that both op and dep are not in the fracture group - if (fracture_group_nodes.find(graph->get_node_by_name(op)->id()) != fracture_group_nodes.end() or - fracture_group_nodes.find(graph->get_node_by_name(dep)->id()) != fracture_group_nodes.end()) { - fracture_group_allowed = false; - break; - } - - // if op is a fracture group top, or dep is a fracture group bottom, then compiler cannot schedule this - if (graph->get_node_by_name(op)->as()->has_tag("fracture_top") or - graph->get_node_by_name(dep)->as()->has_tag("fracture_bottom")) { - fracture_group_allowed = false; 
- break; - } - } - - // if the fracture group is allowed to be scheduled by the compiler - if (not fracture_group_allowed) return false; - - // create a DFS stack with the fracture group top - std::stack frac_node_stack; - - // define a function to run a DFS scheduler on fracture group nodes - std::function DFSFrac = [&]() { - // return if stack is empty - if (frac_node_stack.empty()) { - return; - } - - // pop a node from the stack - Node* stk = frac_node_stack.top(); - frac_node_stack.pop(); - - // add this node to the schedule - if (can_schedule_node(stk)){ - scheduled_nodes.push_back(stk->name()); - } else return; // not a schedulable node - - if (requires_visit(visited, stk->id())) { - // mark this node as visited - visited.insert(stk->id()); - } - - // if the node is a fracture group bottom, then return - if (stk->as()->has_tag("fracture_bottom")) { - return; - } - - // iterate over the users of this node and stack them if - // all the operands of the users have been visited - bool all_operands_visited = true; - for (const Edge& user_edge : graph->user_data_edges(stk)) { - NodeId successor_id = user_edge.consumer_node_id; - Node* successor_node = graph->node_by_id(successor_id); - - // if the successor node is not in the fracture group, then skip it - if (fracture_group_nodes.find(successor_node->id()) == fracture_group_nodes.end()) { - continue; - } - - // check if all of the operands of the successor have been visited - auto predecessors = graph->operands(successor_node); - // iterate over predecessors - for (auto& pred : predecessors) { - // if the predecessor cannot be scheduled, then skip it - if (not can_schedule_node(pred)) { - continue; - } - - // if the predecessor is already scheduled, then skip it - if (std::find(scheduled_nodes.begin(), scheduled_nodes.end(), pred->name()) != scheduled_nodes.end()) { - continue; - } - all_operands_visited = false; - break; - } - if (not all_operands_visited) { - continue; - } - // push the successor node to the stack and visit - frac_node_stack.push(successor_node); - DFSFrac(); - } - }; - - // now loop over the fracture group tops and visit them - for (auto& top : fracture_group_tops) { - // make sure all operands of the top have been visited - // visiting them inside this loop, rather than outside - // to keep these operands close to their users - // fork nodes delivering input parameters fall in this category - auto predecessors = graph->operands(graph->node_by_id(top)); - // iterate over predecessors - for (auto pred : predecessors) { - // if the predecessor cannot be scheduled, then skip it - if (not can_schedule_node(pred)) { - continue; - } - - // if the predecessor is already scheduled, then skip it - if (std::find(scheduled_nodes.begin(), scheduled_nodes.end(), pred->name()) != scheduled_nodes.end()) { - continue; - } - - // otherwise, visit the predecessor - VisitNode(pred); - } - frac_node_stack.push(graph->node_by_id(top)); - DFSFrac(); - } - - // enqueue the successors of fracture group bottoms - for (auto& bottom : fracture_group_bottoms) { - for (const Edge& user_edge : graph->user_data_edges(graph->node_by_id(bottom))) { - NodeId successor_id = user_edge.consumer_node_id; - Node* successor_node = graph->node_by_id(successor_id); - node_queue.push(successor_node); - } - } - return true; - }; - - VisitNode = [&](Node* node) { - if (not requires_visit(visited, node->id())) { - return; - } - visited.insert(node->id()); - for (NodeId predecessor_id : get_operand_node_ids(schedule_dependencies, graph, node)) { - Node* 
predecessor_node = graph->node_by_id(predecessor_id); - VisitNode(predecessor_node); - } - - // if the node is a fracture group top, then call the fracture group scheduler - if (node->as()->has_tag("fracture_top")) { - // check if this has already been scheduled, because there are multiple tops in a fracture region, and only the first top needs to be called - if (std::find(scheduled_nodes.begin(), scheduled_nodes.end(), node->name()) != scheduled_nodes.end()) { - return; - } - auto scheduled = FracVisit(node); - if (scheduled) return; - } - - // Get paired op if it exists so that we can schedule it right after the current op. - // - const Node* paired_node = get_paired_op_if_exists(graph, node); - - if (can_schedule_node(node) and - (std::find(scheduled_nodes.begin(), scheduled_nodes.end(), node->name()) == scheduled_nodes.end())) - { - scheduled_nodes.push_back(node->name()); - - // Schedule paired op right after the current op. - // - if (paired_node != nullptr and - (std::find(scheduled_nodes.begin(), scheduled_nodes.end(), paired_node->name()) == - scheduled_nodes.end())) - { - scheduled_nodes.push_back(paired_node->name()); - } - } - - for (const Edge& user_edge : graph->user_data_edges(node)) { - NodeId successor_id = user_edge.consumer_node_id; - node_queue.push(graph->node_by_id(successor_id)); - } - }; - - while (not node_queue.empty()) - { - Node* node = node_queue.front(); - VisitNode(node); - node_queue.pop(); - } - return scheduled_nodes; -} - -std::unordered_set get_visited_with_recompute_nodes_marked( - const graphlib::Graph* graph, - const std::unordered_set& visited -) -{ - std::unordered_set visited_with_recompute_marked = visited; - - for (graphlib::Node* node : graphlib::topological_sort(*graph)) - { - if (graphlib::is_recompute(graph, node)) - { - visited_with_recompute_marked.insert(node->id()); - } - } - - return visited_with_recompute_marked; -} - -std::vector get_valid_schedule(const graphlib::Graph* graph, const vector& schedule) -{ - std::unordered_map node_to_schedule_index = get_op_to_schedule_index(schedule); - tt::ordered_map node_to_indegree; - for (const std::string& op : schedule) - { - std::uint32_t indegree = 0; - Node* node = graph->get_node_by_name(op); - for (const Node* predecessor_node : get_schedule_predecessors(graph, node)) - { - if (node_to_schedule_index.find(predecessor_node->name()) != node_to_schedule_index.end()) - { - indegree += 1; - } - } - node_to_indegree[op] = indegree; - } - - std::deque ops_to_process; - for (const std::string& op : schedule) - { - if (node_to_indegree[op] == 0) - { - ops_to_process.push_back(op); - } - } - - std::vector valid_schedule; - valid_schedule.reserve(schedule.size()); - while (not ops_to_process.empty()) - { - std::string op = ops_to_process.front(); - ops_to_process.pop_front(); - valid_schedule.push_back(op); - Node* node = graph->get_node_by_name(op); - - for (const Node* successor_node : get_schedule_successors(graph, node)) - { - if (node_to_schedule_index.find(successor_node->name()) != node_to_schedule_index.end()) - { - node_to_indegree[successor_node->name()] -= 1; - if (node_to_indegree[successor_node->name()] == 0) - { - ops_to_process.push_back(successor_node->name()); - } - } - } - } - TT_LOG_ASSERT( - valid_schedule.size() == schedule.size(), "Valid schedule size does not match original schedule size"); - - return valid_schedule; -} - -struct BackwardOpInfo -{ - std::string name; - int schedule_index; - - BackwardOpInfo(const std::string& name, int schedule_index) : name(name), 
schedule_index(schedule_index) {} - bool operator<(const BackwardOpInfo& rhs) { return this->schedule_index < rhs.schedule_index; } -}; - -unordered_map> get_ordered_fwd_to_bwd_ops( - const graphlib::Graph* graph, const vector& original_schedule) -{ - auto fwd_to_bwd_nodes = ::tt::placer::lowering::get_fwd_to_bwd_nodes(graph); - std::unordered_map> fwd_to_bwd_ops_to_place; - std::unordered_set visited_ops; - std::unordered_map op_to_schedule_index = get_op_to_schedule_index(original_schedule); - - for (int i = original_schedule.size() - 1; i >= 0; --i) - { - const auto& fwd_node_name = original_schedule[i]; - if (fwd_to_bwd_nodes.find(fwd_node_name) != fwd_to_bwd_nodes.end()) - { - fwd_to_bwd_ops_to_place[fwd_node_name] = {}; - vector bwd_node_placement_order; - for (const string& bwd_node_name : fwd_to_bwd_nodes.at(fwd_node_name)) - { - if (visited_ops.find(bwd_node_name) == visited_ops.end()) - { - bwd_node_placement_order.emplace_back(bwd_node_name, op_to_schedule_index.at(bwd_node_name)); - visited_ops.insert(bwd_node_name); - } - } - std::sort(bwd_node_placement_order.begin(), bwd_node_placement_order.end()); - for (const auto& bwd_op_info : bwd_node_placement_order) - { - fwd_to_bwd_ops_to_place[fwd_node_name].push_back(bwd_op_info.name); - } - } - } - return fwd_to_bwd_ops_to_place; -} - -// Instead of naively placing the backward ops via module-first/topological ordering, we know a better schedule would -// place the backward ops in the reverse order of the forward ops to back-propagate the gradients and weight updates. -// - While we do have a "fwd->bwd" mapping that provides information on which backward ops are associated with a -// forward, it's not reliable to directly use this mapping to place the backward ops because it's non-trivial to -// maintain this mapping as the graph is mutated by transformations. Instead, we'll treat this mapping as a hint to -// guide the scheduling of the backward graph and defer the responsibility of guarding against data-dependency -// violations to the `get_valid_schedule` function. -// - The reason it's a better schedule and produces less e2e queues is because the mapping serves as a higher-level -// organization/grouping of the nodes we know should cluster and be associated together when scheduling. 
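// Compact sketch of the strategy described in the comment above: walk the
// forward schedule in reverse, emit each forward op's backward group, and let
// the validity pass repair the order only where a data dependency is actually
// violated. The map is illustrative; the real mapping comes from
// placer::lowering::get_fwd_to_bwd_nodes.
#include <string>
#include <unordered_map>
#include <vector>

inline std::vector<std::string> reverse_fwd_bwd_hint(
    const std::vector<std::string>& fwd_schedule,
    const std::unordered_map<std::string, std::vector<std::string>>& fwd_to_bwd)
{
    std::vector<std::string> hint;
    for (auto it = fwd_schedule.rbegin(); it != fwd_schedule.rend(); ++it)
    {
        auto entry = fwd_to_bwd.find(*it);
        if (entry == fwd_to_bwd.end())
            continue;
        hint.insert(hint.end(), entry->second.begin(), entry->second.end());
    }
    return hint;
}
// Toy trace (made-up op names): fwd = {"mm0", "mm1"} with
//   "mm1" -> {"bw_mm1_grad", "bw_mm1_wgrad"}, "mm0" -> {"bw_mm0_grad"}
// yields the hint {"bw_mm1_grad", "bw_mm1_wgrad", "bw_mm0_grad"}.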
-std::vector optimize_bwd_schedule( - const graphlib::Graph* graph, - const std::vector& original_schedule, - const std::vector& fwd_schedule) -{ - std::vector optimized_bwd_schedule; - - auto fwd_to_bwd_ops = get_ordered_fwd_to_bwd_ops(graph, original_schedule); - for (auto it = fwd_schedule.rbegin(); it != fwd_schedule.rend(); ++it) - { - const string& fwd_op = *it; - for (const string& bwd_op_name : fwd_to_bwd_ops.at(fwd_op)) - { - optimized_bwd_schedule.push_back(bwd_op_name); - } - } - return get_valid_schedule(graph, optimized_bwd_schedule); -} - -std::vector optimize_schedule(const graphlib::Graph* graph, const std::vector& scheduled_ops) -{ - auto fwd_schedule = get_filtered_schedule(graph, scheduled_ops, NodeEpochType::Forward); - auto bwd_schedule = optimize_bwd_schedule(graph, scheduled_ops, fwd_schedule); - auto opt_schedule = get_filtered_schedule(graph, scheduled_ops, NodeEpochType::Optimizer); - - std::vector optimized_schedule; - optimized_schedule.reserve(scheduled_ops.size()); - optimized_schedule.insert(std::end(optimized_schedule), std::begin(fwd_schedule), std::end(fwd_schedule)); - optimized_schedule.insert(std::end(optimized_schedule), std::begin(bwd_schedule), std::end(bwd_schedule)); - optimized_schedule.insert(std::end(optimized_schedule), std::begin(opt_schedule), std::end(opt_schedule)); - - return optimized_schedule; -} - -std::vector move_output_ops_to_end(const graphlib::Graph* graph, const std::vector& scheduled_ops) -{ - std::vector new_schedule; - std::vector output_ops; - for(auto& op_name: scheduled_ops) - { - Node* op_node = graph->get_node_by_name(op_name); - auto consumers = graph->users(op_node); - bool feeds_graph_output_queue = std::all_of(consumers.begin(), consumers.end(), [](Node* n) { return n->node_type() == graphlib::NodeType::kOutput; }); - if(feeds_graph_output_queue) - { - output_ops.push_back(op_name); - } - else - { - new_schedule.push_back(op_name); - } - } - for(auto& output_op_name: output_ops) - { - new_schedule.push_back(output_op_name); - } - return new_schedule; -} - -Schedule run_module_by_module_scheduler(const SchedulerConfig& config, const graphlib::Graph* graph) -{ - Schedule scheduled_nodes; - - std::unordered_set visited; - std::queue node_queue; - - NodeGroupVector groups = create_groups(graph, discover_ops_for_grouped_inputs(graph)); - - push_to_node_queue(graph, node_queue, graph->get_ordered_input_names()); - Schedule fwd_schedule = run_scheduler(config, graph, node_queue, visited, groups); - scheduled_nodes.insert(std::end(scheduled_nodes), std::begin(fwd_schedule), std::end(fwd_schedule)); - - push_to_node_queue(graph, node_queue, graph->get_ordered_output_gradient_names()); - auto visited_with_recompute_marked = get_visited_with_recompute_nodes_marked(graph, visited); - Schedule temp = run_scheduler(config, graph, node_queue, visited_with_recompute_marked, groups); - - push_to_node_queue(graph, node_queue, graph->get_ordered_output_gradient_names()); - push_to_node_queue(graph, node_queue, temp); - Schedule temp2 = run_scheduler(config, graph, node_queue, visited, groups); - scheduled_nodes.insert(std::end(scheduled_nodes), std::begin(temp2), std::end(temp2)); - - - // sort the schedule based on fwd/bwd/opt - std::stable_sort(std::begin(scheduled_nodes), std::end(scheduled_nodes), - [&graph](const std::string& a, const std::string& b) { - Node* node_a = graph->get_node_by_name(a); - Node* node_b = graph->get_node_by_name(b); - - return (int)node_a->get_epoch_type() < (int)node_b->get_epoch_type(); - }); - - Schedule 
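// The output-queue reordering in move_output_ops_to_end above can equally be
// written as a stable partition: ops whose every consumer is a graph output
// queue move to the back while all other relative order is preserved. Sketch
// on plain strings, with the "feeds only output queues" test abstracted into
// a predicate (hypothetical helper, not part of the scheduler).
#include <algorithm>
#include <functional>
#include <string>
#include <vector>

inline std::vector<std::string> outputs_last(
    std::vector<std::string> schedule,
    const std::function<bool(const std::string&)>& feeds_only_output_queues)
{
    std::stable_partition(
        schedule.begin(),
        schedule.end(),
        [&](const std::string& op) { return not feeds_only_output_queues(op); });
    return schedule;
}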
optimized_schedule = optimize_schedule(graph, scheduled_nodes); - assert_valid_schedule(graph, optimized_schedule); - return optimized_schedule; -} - -SchedulerPolicy policy_from_string(const std::string& policy_str) -{ - if (policy_str == "Topological") { - return SchedulerPolicy::Topological; - } else if (policy_str == "ModuleInputsBFS") { - return SchedulerPolicy::ModuleInputsBFS; - } else if (policy_str == "LongestPath") { - return SchedulerPolicy::LongestPath; - } - - log_error(LogScheduler, "Failed to parse scheduler policy from string: {}", policy_str); - log_error(LogBalancer, "Falling back to SchedulerPolicy::ModuleInputsBFS"); - return SchedulerPolicy::ModuleInputsBFS; -} - -std::ostream& operator<<(std::ostream& stream, SchedulerPolicy scheduler_policy) { - switch (scheduler_policy) { - case SchedulerPolicy::Topological: stream << "SchedulerPolicy::Topological"; break; - case SchedulerPolicy::ModuleInputsBFS: stream << "SchedulerPolicy::ModuleInputsBFS"; break; - case SchedulerPolicy::LongestPath: stream << "SchedulerPolicy::LongestPath"; break; - default: stream << "SchedulerPolicy::Unknown"; break; - } - return stream; -} - -// In future, we can use the balancer to help guide decisions about how to schedule ops. -// For now, just implementing naive baseline. -// -// If we need the schedule in multiple parts of the compile, we can either -// 1. cache the schedule and fetch it off of an obj. like the graph (less intrusive) -// 2. explicitly embed a new ScheduleEdgeType to impose scheduling dependecies -// directly on the graph (more intrusive) -Schedule run_scheduler(const SchedulerConfig& config, const graphlib::Graph* graph) -{ - log_debug(LogScheduler, "Running Scheduler with Policy: {}", config.policy); - Schedule schedule; - - if (not config.scheduler_constraints.empty()) - { - log_debug(LogScheduler, "Running Scheduler with constraints: {}", config.scheduler_constraints); - } - - if (config.policy == SchedulerPolicy::Topological) - { - schedule = run_topological_scheduler(graph); - } - else if (config.policy == SchedulerPolicy::ModuleInputsBFS) - { - schedule = run_module_by_module_scheduler(config, graph); - } - else if (config.policy == SchedulerPolicy::LongestPath) - { - schedule = run_longest_path_scheduler(graph); - } - else - { - log_fatal("providing unknown scheduler policy."); - } - - if(env_as("PYBUDA_NEBULA_GALAXY_PLACER")) - { - schedule = move_output_ops_to_end(graph, schedule); - } - - // Remove all already processed nodes. 
- // - if (config.ignored_nodes) - { - Schedule final_schedule; - final_schedule.reserve(schedule.size() - config.ignored_nodes->size()); - for (const std::string& node_name : schedule) - { - if (config.ignored_nodes->count(graph->get_node_by_name(node_name)) == 0) - { - final_schedule.push_back(node_name); - } - } - - TT_ASSERT(final_schedule.size() == schedule.size() - config.ignored_nodes->size()); - schedule.swap(final_schedule); - } - - log_schedule(schedule); - return schedule; -} - -} // end namespace tt::scheduler diff --git a/pybuda/csrc/scheduler/scheduler.hpp b/pybuda/csrc/scheduler/scheduler.hpp deleted file mode 100644 index b73208e78..000000000 --- a/pybuda/csrc/scheduler/scheduler.hpp +++ /dev/null @@ -1,70 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -#pragma once - -#include -#include -#include -#include "scheduler/utils.hpp" - -namespace tt { - -// Forward Declares -namespace graphlib -{ - class Graph; -} - -namespace scheduler { - -/* - ____ _ ____ _ _ - | _ \ __ _| |_ __ _ / ___|| |_ _ __ _ _ ___| |_ _ _ _ __ ___ ___ - | | | |/ _` | __/ _` | \___ \| __| '__| | | |/ __| __| | | | '__/ _ \/ __| - | |_| | (_| | || (_| | ___) | |_| | | |_| | (__| |_| |_| | | | __/\__ \ - |____/ \__,_|\__\__,_| |____/ \__|_| \__,_|\___|\__|\__,_|_| \___||___/ - -*/ - -enum SchedulerPolicy -{ - Topological, - ModuleInputsBFS, - LongestPath, -}; - -struct SchedulerConfig -{ - SchedulerPolicy policy; - std::vector> scheduler_constraints; - const std::unordered_set* ignored_nodes = nullptr; - - SchedulerConfig( - SchedulerPolicy policy = SchedulerPolicy::Topological, - const std::vector>& scheduler_constraints = {}) : - policy(policy), scheduler_constraints(scheduler_constraints) - { - } -}; - -/* - _ ____ ___ - / \ | _ \_ _|___ - / _ \ | |_) | |/ __| - / ___ \| __/| |\__ \ - /_/ \_\_| |___|___/ -*/ - -SchedulerPolicy policy_from_string(const std::string& policy_str); - -// Returns an ordered list of node names -Schedule run_scheduler(const SchedulerConfig& config, const graphlib::Graph* graph); - - -// Individual scheduler implementations -Schedule run_topological_scheduler(const graphlib::Graph* graph); -Schedule run_module_by_module_scheduler(const SchedulerConfig& config, const graphlib::Graph* graph); - -} // end namespace scheduler -} // end namespace tt diff --git a/pybuda/csrc/scheduler/utils.cpp b/pybuda/csrc/scheduler/utils.cpp deleted file mode 100644 index be97267dc..000000000 --- a/pybuda/csrc/scheduler/utils.cpp +++ /dev/null @@ -1,154 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 - -#include "scheduler/utils.hpp" - -#include "graph_lib/graph.hpp" -#include "graph_lib/node.hpp" -#include "graph_lib/node_types.hpp" -#include "utils/logger.hpp" - -using tt::LogScheduler; - -namespace tt::scheduler -{ - -void log_schedule(const Schedule& schedule) -{ - for (std::uint32_t i = 0; i < schedule.size(); ++i) - { - log_debug(LogScheduler, "schedule index: {}, op: {}", i, schedule[i]); - } -} - -bool can_schedule_node(const graphlib::Node* node) -{ - return node->node_type() != graphlib::NodeType::kInput and node->node_type() != graphlib::NodeType::kOutput and - node->node_type() != graphlib::NodeType::kQueue and node->node_type() != graphlib::NodeType::kBudaNaryTM; -} - -// Returns operands of the node, skipping through queue nodes. 
-// -const std::vector get_schedule_predecessors( - const graphlib::Graph* graph, const graphlib::Node* node) -{ - std::vector predecessors; - for (const graphlib::Node* operand_node : graph->data_operands(node)) - { - // Skip through queue nodes. - // - if (operand_node->node_type() == graphlib::NodeType::kQueue) - { - const graphlib::QueueNode* queue_node = operand_node->as(); - if (queue_node->queue_type() == graphlib::QueueNodeType::Buffering or - queue_node->queue_type() == graphlib::QueueNodeType::EpochToEpoch) - { - predecessors.push_back(graph->data_operands(queue_node)[0]); - } - } - else - { - predecessors.push_back(operand_node); - } - } - - return predecessors; -} - -// Returns users of the node, skipping(expanding) through queue nodes. -// -const std::vector get_schedule_successors( - const graphlib::Graph* graph, const graphlib::Node* node) -{ - std::vector successors; - for (const graphlib::Node* user_node : graph->data_users(node)) - { - // Skip through queue nodes. - // - if (user_node->node_type() == graphlib::NodeType::kQueue) - { - const graphlib::QueueNode* queue_node = user_node->as(); - if (queue_node->queue_type() == graphlib::QueueNodeType::Buffering or - queue_node->queue_type() == graphlib::QueueNodeType::EpochToEpoch) - { - std::vector data_users = graph->data_users(queue_node); - successors.insert(successors.end(), data_users.begin(), data_users.end()); - } - } - else - { - successors.push_back(user_node); - } - } - - return successors; -} - -std::unordered_map get_op_to_schedule_index(const Schedule& scheduled_ops) -{ - std::unordered_map op_to_schedule_index; - op_to_schedule_index.reserve(scheduled_ops.size()); - for (int i = 0; i < (int)scheduled_ops.size(); ++i) - { - const std::string& node_name = scheduled_ops.at(i); - op_to_schedule_index[node_name] = i; - } - return op_to_schedule_index; -} - -Schedule get_filtered_schedule(const graphlib::Graph* graph, const Schedule& schedule, graphlib::NodeEpochType type) -{ - Schedule filtered_schedule; - filtered_schedule.reserve(schedule.size()); - for (unsigned int subgraph_index = 0; subgraph_index < graph->num_subgraphs(); subgraph_index++) - { - for (const auto& node_name : schedule) - { - graphlib::Node* node = graph->get_node_by_name(node_name); - if (node->get_epoch_type() == type) - { - if (graph->get_subgraph_id_for_node(node->id()) != subgraph_index) - { - continue; - } - filtered_schedule.push_back(node_name); - } - } - } - return filtered_schedule; -} - -bool are_schedule_dependencies_met(const graphlib::Graph* graph, const std::vector& schedule) -{ - std::unordered_map node_to_schedule_index; - node_to_schedule_index.reserve(schedule.size()); - for (std::uint32_t i = 0; i < schedule.size(); ++i) - { - node_to_schedule_index[schedule[i]] = i; - } - - for (const std::string& op : schedule) - { - graphlib::Node* node = graph->get_node_by_name(op); - for (const graphlib::Edge& operand_edge : graph->operand_data_edges(node)) - { - graphlib::Node* predecessor_node = graph->node_by_id(operand_edge.producer_node_id); - if (node_to_schedule_index.find(predecessor_node->name()) != node_to_schedule_index.end()) - { - if (node_to_schedule_index[predecessor_node->name()] > node_to_schedule_index[op]) - { - log_warning( - LogPlacer, - "Scheduler: dependency not met for node: {}: {} should come before", - op, - predecessor_node->name()); - return false; - } - } - } - } - return true; -} - -} // namespace tt::scheduler diff --git a/pybuda/csrc/scheduler/utils.hpp b/pybuda/csrc/scheduler/utils.hpp deleted file mode 
100644 index c07fda45f..000000000 --- a/pybuda/csrc/scheduler/utils.hpp +++ /dev/null @@ -1,42 +0,0 @@ -// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC -// -// SPDX-License-Identifier: Apache-2.0 -// -// Common utilities for schedulers -// -#pragma once -#include -#include -#include -#include - -#include "graph_lib/defines.hpp" - -namespace tt -{ - -namespace graphlib -{ -class Graph; -class Node; -} - -namespace scheduler -{ - -using Schedule = std::vector; - -void log_schedule(const Schedule& schedule); -bool can_schedule_node(const graphlib::Node* node); -const std::vector get_schedule_predecessors( - const graphlib::Graph* graph, const graphlib::Node* node); -const std::vector get_schedule_successors( - const graphlib::Graph* graph, const graphlib::Node* node); - -std::unordered_map get_op_to_schedule_index(const Schedule& scheduled_ops); -Schedule get_filtered_schedule(const graphlib::Graph* graph, const Schedule& schedule, graphlib::NodeEpochType type); -bool are_schedule_dependencies_met(const graphlib::Graph* graph, const Schedule& schedule); - -} // namespace scheduler - -} // namespace tt diff --git a/pybuda/csrc/shared_utils/CMakeLists.txt b/pybuda/csrc/shared_utils/CMakeLists.txt new file mode 100644 index 000000000..ffa249df8 --- /dev/null +++ b/pybuda/csrc/shared_utils/CMakeLists.txt @@ -0,0 +1,8 @@ +add_library(shared_utils + STATIC + placement_printer.cpp + pretty_table.cpp + sparse_matmul_utils.cpp + string_extension.cpp) + +target_compile_options(shared_utils PRIVATE ${STATIC_LIB_FLAGS} ${PYBUDA_CSRC_CFLAGS}) diff --git a/pybuda/csrc/shared_utils/module.mk b/pybuda/csrc/shared_utils/module.mk index 36357e0ca..6c021df47 100644 --- a/pybuda/csrc/shared_utils/module.mk +++ b/pybuda/csrc/shared_utils/module.mk @@ -4,7 +4,8 @@ PYBUDA_CSRC_SHARED_UTILS_LIB = $(LIBDIR)/libsharedutils.a PYBUDA_CSRC_SHARED_UTILS_SRCS += \ pybuda/csrc/shared_utils/placement_printer.cpp \ pybuda/csrc/shared_utils/pretty_table.cpp \ - pybuda/csrc/shared_utils/sparse_matmul_utils.cpp + pybuda/csrc/shared_utils/sparse_matmul_utils.cpp \ + pybuda/csrc/shared_utils/string_extension.cpp PYBUDA_CSRC_SHARED_UTILS_INCLUDES = $(PYBUDA_CSRC_INCLUDES) diff --git a/pybuda/csrc/shared_utils/sparse_matmul_utils.cpp b/pybuda/csrc/shared_utils/sparse_matmul_utils.cpp index 1db8a2ce7..8e91ebe32 100644 --- a/pybuda/csrc/shared_utils/sparse_matmul_utils.cpp +++ b/pybuda/csrc/shared_utils/sparse_matmul_utils.cpp @@ -12,7 +12,6 @@ #include #include -#include "balancer/types.hpp" #include "python_bindings_common.hpp" #include "utils/assert.hpp" @@ -73,13 +72,12 @@ std::ostream& operator<<(std::ostream& out, SparseBUDA::Layout layout) case SparseBUDA::Layout::Default: out << "SparseBUDA::Layout::Default"; break; case SparseBUDA::Layout::ZMajor: out << "SparseBUDA::Layout::ZMajor"; break; case SparseBUDA::Layout::ZMajorDataflow: out << "SparseBUDA::Layout::ZMajorDataflow"; break; - case SparseBUDA::Layout::BufferOp: out << "SparseBUDA::Layout::BufferOp"; break; default: out << "SparseBUDA::Layout::Unknown"; break; } return out; } -static int get_u_rt_encoding_bits(int u_rt) +int get_u_rt_encoding_bits(int u_rt) { // u_rt_bits can be 0 int u_rt_bits = 32 - __builtin_clz(u_rt); @@ -87,7 +85,7 @@ static int get_u_rt_encoding_bits(int u_rt) return u_rt_bits; } -static int get_u_kt_encoding_bits(int u_kt) +int get_u_kt_encoding_bits(int u_kt) { int u_kt_bits = 32 - __builtin_clz(u_kt); u_kt_bits -= ((u_kt & (u_kt - 1)) == 0); // power of two check @@ -246,7 +244,7 @@ class StripAllocator ++num_strips; } - void repeat(int n, 
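// The two encoding-bit helpers above compute ceil(log2(n)): the number of bits
// needed to hold an index in [0, n). "32 - __builtin_clz(n)" is the position
// of the highest set bit, and the "(n & (n - 1)) == 0" term subtracts one
// exactly when n is a power of two. A portable equivalent with a few spot
// checks (illustrative only):
#include <cassert>

inline int bits_for_index(int n)  // assumes n >= 1
{
    int bits = 0;
    while ((1 << bits) < n) ++bits;
    return bits;
}

inline void bits_for_index_examples()
{
    assert(bits_for_index(1) == 0);  // a single value needs no index bits
    assert(bits_for_index(5) == 3);  // indices 0..4 fit in 3 bits
    assert(bits_for_index(8) == 3);  // indices 0..7 fit in 3 bits
}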
int num_strips, int sparse_tile_ptr_bits) + void repeat(int n, int num_strips, int sparse_ublock_idx_bits) { if (n <= 1 or not prev_strip_ptr) return; @@ -281,7 +279,7 @@ class StripAllocator std::uint8_t* src = base_ptr(); std::uint8_t* dst = base_ptr() + orig_size * i; memcpy(dst, src, orig_size); - patch_strip_indices(dst, orig_size, num_strips * i, sparse_tile_ptr_bits); + patch_strip_indices(dst, orig_size, num_strips * i, sparse_ublock_idx_bits); } // Update the prev_strip_ptr to point at the new end @@ -317,7 +315,7 @@ class StripAllocator } static void patch_strip_indices( - std::uint8_t* base, std::size_t size, std::size_t strip_offset, int sparse_tile_ptr_bits) + std::uint8_t* base, std::size_t size, std::size_t strip_offset, int sparse_ublock_idx_bits) { using IndexType = std::remove_extent_t; @@ -325,7 +323,7 @@ class StripAllocator constexpr int kTileElems = TILE_DIM * TILE_DIM; constexpr int kTileSizeBytes = kTileElems * kElemSize; int num_tiles = static_cast((size + kTileSizeBytes - 1) / kTileSizeBytes); - int ublock_tile_index_bytes = 16 - sparse_tile_ptr_bits; + int ublock_tile_index_bytes = 16 - sparse_ublock_idx_bits; for (int tile_id = 0; tile_id < num_tiles; ++tile_id) { @@ -347,7 +345,7 @@ class StripAllocator for (int ublock_i = 0; ublock_i < info->f.nz_ublocks; ++ublock_i) { IndexType encoded = info->f.index_array[i++]; - IndexType nz_tiles_in_ublock = encoded >> sparse_tile_ptr_bits; + IndexType nz_tiles_in_ublock = encoded >> sparse_ublock_idx_bits; nz_tiles_in_ublock = (nz_tiles_in_ublock == 0u) ? (1u << ublock_tile_index_bytes) : nz_tiles_in_ublock; i += nz_tiles_in_ublock; @@ -392,43 +390,85 @@ static std::pair, int> encode_strips( // Calculate bits needed for ublock (u_rt + u_kt separately encoded) int u_rt_bits = get_u_rt_encoding_bits(u_rt); int u_kt_bits = get_u_kt_encoding_bits(u_kt); - TT_ASSERT(u_kt_bits + u_rt_bits <= 16 - sparse_ublock_idx_bits); + int ublock_tile_index_bits = 16 - sparse_tile_ptr_bits; + TT_ASSERT(u_rt_bits + u_kt_bits <= ublock_tile_index_bits); + int nz_tiles_in_ublock_bits = 16 - sparse_ublock_idx_bits; using IndexType = std::remove_extent_t; - auto encode_ublock_header = [sparse_ublock_idx_bits]( + // Encodes ublock header + // 16b total + // Example: + // - sparse_ublock_idx_bits = 7 + // - nz_tiles_in_ublock_bits = 16 - sparse_ublock_idx_bits = 9 + // If we look at the 16 bits for the above example, it would look like this: + // MSB [nnnnnnnnn|sssssss] LSB + // ^^^^^^^^^ ^^^^^^^ + // | | + // nz_tiles_in_ublock_bits <-| | + // sparse_ublock_idx_bits <----------| + // + auto encode_ublock_header = [sparse_ublock_idx_bits, nz_tiles_in_ublock_bits]( IndexType current_ublock_index, IndexType nz_tiles_in_ublock) -> IndexType { - int ublock_tile_index_bits = 16 - sparse_ublock_idx_bits; + // Check if bits exceeded for current_ublock_index + // if (current_ublock_index >= (1 << sparse_ublock_idx_bits)) + { throw std::runtime_error( - fmt::format("Row tiles {} exceed {} bit encoding", current_ublock_index, sparse_ublock_idx_bits)); - if (nz_tiles_in_ublock > (1 << ublock_tile_index_bits)) + fmt::format("UBlock index {} exceeds {} bit encoding", current_ublock_index, sparse_ublock_idx_bits)); + } + + // Check if bits exceeded for nz_tile_in_ublock + // Note: if nz_tiles_in_ublock is (1 << nz_tiles_in_ublock_bits), this is legal, we encode it with 0 + // + if (nz_tiles_in_ublock > (1 << nz_tiles_in_ublock_bits)) + { throw std::runtime_error(fmt::format( - "Num tiles in ublock {} exceed {} bit encoding", current_ublock_index, 
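// Decode counterpart for the 16-bit ublock header packed by
// encode_ublock_header above (MSB [nz_tiles_in_ublock | sparse_ublock_idx] LSB).
// Illustrative sketch only; nothing in the encoder calls it.
#include <cstdint>
#include <utility>

inline std::pair<int, int> decode_ublock_header(std::uint16_t encoded, int sparse_ublock_idx_bits)
{
    int nz_tiles_in_ublock_bits = 16 - sparse_ublock_idx_bits;
    int ublock_index = encoded & ((1u << sparse_ublock_idx_bits) - 1u);
    int nz_tiles_in_ublock = encoded >> sparse_ublock_idx_bits;
    // 0 is the escape value meaning the full range (1 << nz_tiles_in_ublock_bits)
    if (nz_tiles_in_ublock == 0)
        nz_tiles_in_ublock = 1 << nz_tiles_in_ublock_bits;
    return {ublock_index, nz_tiles_in_ublock};
}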
sparse_ublock_idx_bits)); + "UBlock index {} exceeds {} bit encoding", current_ublock_index, sparse_ublock_idx_bits)); + } - // 0 means (1 << ublock_tile_index_bits) - nz_tiles_in_ublock = (nz_tiles_in_ublock == (1 << ublock_tile_index_bits)) ? 0 : nz_tiles_in_ublock; + // Use 0 to represent (1 << nz_tiles_in_ublock_bits) + nz_tiles_in_ublock = (nz_tiles_in_ublock == (1 << nz_tiles_in_ublock_bits)) ? 0 : nz_tiles_in_ublock; IndexType encoded = 0; encoded |= nz_tiles_in_ublock << sparse_ublock_idx_bits; - encoded |= current_ublock_index & ((1u << sparse_ublock_idx_bits) - 1u); + encoded |= current_ublock_index; return encoded; }; - auto encode_index_pair = [sparse_tile_ptr_bits, u_kt_bits]( - IndexType in0, IndexType in1_rt, IndexType in1_ct) -> IndexType + // Encodes indices of in0 + // 16b total + // Example: + // - sparse_tile_ptr_bits = 5 + // - ublock_tile_index_bits = 16 - sparse_tile_ptr_bits = 11 + // - u_rt_bits = 3 + // - u_kt_bits = 6 + // If we look at the 16 bits for the above example, it would look like this: + // MSB [sssss|xx|rrr|kkkkkk] LSB + // ^^^^^ ^^ ^^^ ^^^^^^ + // | | | | + // sparse_tile_ptr_bits <-| | | | + // unused bits <-----| | | + // u_rt_bits <---------| | + // u_kt_bits <--------------| + // + // Note: ublock_tile_index_bits is a union of (u_rt_bits, u_kt_bits, unused bits) + // + auto encode_index_pair = [sparse_tile_ptr_bits, ublock_tile_index_bits, u_kt_bits]( + IndexType in0, IndexType in0_rt, IndexType in0_ct) -> IndexType { - int in1_ptr_bits = 16 - sparse_tile_ptr_bits; + // Check that sparse tile ptr index (in0) fits in the number of bits we have (sparse_tile_ptr_bits) + // if (in0 >= (1u << sparse_tile_ptr_bits)) + { throw std::runtime_error(fmt::format("in0 exceeds {} bit sparse encoding", sparse_tile_ptr_bits)); - if (((in1_rt << u_kt_bits) | in1_ct) >= (1 << in1_ptr_bits)) - throw std::runtime_error(fmt::format("in1 exceeds {} bit sparse encoding", in1_ptr_bits)); + } IndexType encoded = 0; - encoded |= in0 << in1_ptr_bits; - encoded |= in1_rt << u_kt_bits; - encoded |= in1_ct; + encoded |= in0 << ublock_tile_index_bits; + encoded |= in0_rt << u_kt_bits; + encoded |= in0_ct; return encoded; }; @@ -507,16 +547,16 @@ static std::pair, int> encode_strips( allocator.push_strip(strip_info_struct(curr_strip_index, 0, true), {}); } - allocator.repeat(t_factor_c, m_k * dimz, sparse_tile_ptr_bits); + allocator.repeat(t_factor_c, m_k * dimz, sparse_ublock_idx_bits); return allocator.finish_buda_strips(); } static void print_info_indices( - std::vector const& buda_indices, int sparse_tile_ptr_bits, int sparse_ublock_idx_bits) + std::vector const& buda_indices, int sparse_ublock_idx_bits) { using IndexType = std::remove_extent_t; - int ublock_tile_index_bytes = 16 - sparse_tile_ptr_bits; + int ublock_tile_index_bytes = 16 - sparse_ublock_idx_bits; std::uint8_t const* base_ptr = reinterpret_cast(buda_indices.data()); TT_ASSERT((int)buda_indices.size() % (TILE_DIM * TILE_DIM) == 0); for (int tile_id = 0; tile_id < (int)(buda_indices.size() / (TILE_DIM * TILE_DIM)); ++tile_id) @@ -586,6 +626,14 @@ SparseBUDA::SparseBUDA( TT_ASSERT(sparse_ct < SparseBUDA::kMaxSparseIndexValue, "Sparse matrix too wide"); } +enum EncodingBitErrors +{ + MaxSparseTilesExceeded = -1, + MaxUBlocksRExceeded = -2 +}; + +// Returns negative value if failed +// int SparseBUDA::get_sparse_tile_ptr_bits(int grid_r, int t_factor_r, int u_rt) const { // TODO: num_sparse_tiles should be calculated per core, and max should be used as the result of this fn @@ -595,21 +643,23 @@ int 
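// Decode counterpart for the in0 index packing in encode_index_pair above
// (MSB [sparse_tile_ptr | unused | u_rt | u_kt] LSB). Illustrative sketch only.
#include <cstdint>

struct DecodedIn0Index
{
    int tile_ptr;  // sparse tile pointer
    int rt;        // row offset within the ublock
    int ct;        // inner-dim offset within the ublock
};

inline DecodedIn0Index decode_index_pair(
    std::uint16_t encoded, int sparse_tile_ptr_bits, int u_rt_bits, int u_kt_bits)
{
    int ublock_tile_index_bits = 16 - sparse_tile_ptr_bits;
    DecodedIn0Index decoded;
    decoded.tile_ptr = encoded >> ublock_tile_index_bits;
    decoded.rt = (encoded >> u_kt_bits) & ((1 << u_rt_bits) - 1);
    decoded.ct = encoded & ((1 << u_kt_bits) - 1);
    return decoded;
}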
SparseBUDA::get_sparse_tile_ptr_bits(int grid_r, int t_factor_r, int u_rt) c TT_ASSERT(num_sparse_tiles > 0); if (num_sparse_tiles > SparseBUDA::kMaxSparseTiles) { - throw std::runtime_error(fmt::format("Num sparse tiles {} exceeds max {}", num_sparse_tiles, kMaxSparseTiles)); + return MaxSparseTilesExceeded; } // TODO: This can be divided by fracture factor std::uint32_t max_ublocks_r = this->sparse_shape[0] / (TILE_DIM * grid_r * t_factor_r * u_rt); if (max_ublocks_r > SparseBUDA::kMaxUblocksR) { - throw std::runtime_error(fmt::format("Num row tiles {} exceeds max {}", max_ublocks_r + 1, kMaxUblocksR)); + return MaxUBlocksRExceeded; } - std::uint32_t max_num = std::max(num_sparse_tiles, max_ublocks_r); + std::uint32_t max_num = num_sparse_tiles; int num_lz = 32 - __builtin_clz(max_num); return num_lz; } +// Returns negative value if failed +// int SparseBUDA::get_sparse_ublock_idx_bits(int grid_r, int t_factor_r, int u_rt) const { // TODO: num_sparse_tiles should be calculated per core, and max should be used as the result of this fn @@ -619,14 +669,14 @@ int SparseBUDA::get_sparse_ublock_idx_bits(int grid_r, int t_factor_r, int u_rt) TT_ASSERT(num_sparse_tiles > 0); if (num_sparse_tiles > SparseBUDA::kMaxSparseTiles) { - throw std::runtime_error(fmt::format("Num sparse tiles {} exceeds max {}", num_sparse_tiles, kMaxSparseTiles)); + return MaxSparseTilesExceeded; } // TODO: This can be divided by fracture factor std::uint32_t max_ublocks_r = this->sparse_shape[0] / (TILE_DIM * grid_r * t_factor_r * u_rt); if (max_ublocks_r > SparseBUDA::kMaxUblocksR) { - throw std::runtime_error(fmt::format("Num row tiles {} exceeds max {}", max_ublocks_r + 1, kMaxUblocksR)); + return MaxUBlocksRExceeded; } std::uint32_t max_num = std::max(num_sparse_tiles, max_ublocks_r); @@ -651,12 +701,10 @@ int SparseBUDA::get_max_u_kt(int grid_r, int t_factor_r, int u_rt, int sparse_ti return (1 << ublock_bits); } -SparseBUDA::Layout SparseBUDA::create_layout(bool buffer_op, bool z_major, int fracture_factor) +SparseBUDA::Layout SparseBUDA::create_layout(bool z_major, int fracture_factor) { Layout layout = Layout::Default; - if (buffer_op) - layout = Layout::BufferOp; - else if (z_major and (fracture_factor == 1) and not env_as("PYBUDA_SPARSE_DISABLE_LAYOUT_DATAFLOW")) + if (z_major and (fracture_factor == 1) and not env_as("PYBUDA_SPARSE_DISABLE_LAYOUT_DATAFLOW")) layout = Layout::ZMajorDataflow; else if (z_major) layout = Layout::ZMajor; @@ -676,13 +724,14 @@ static std::vector vslice_layout( (layout == SparseBUDA::Layout::ZMajorDataflow) ? 
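// With the change above, get_sparse_tile_ptr_bits and get_sparse_ublock_idx_bits
// report failure through the negative EncodingBitErrors values instead of
// throwing, so callers are expected to branch on the sign. Hypothetical caller
// sketch (the include path and the helper name are assumptions):
#include "shared_utils/sparse_matmul_utils.hpp"

inline bool encoding_bits_fit(const tt::sparse::SparseBUDA& sparse, int grid_r, int t_factor_r, int u_rt)
{
    int tile_ptr_bits = sparse.get_sparse_tile_ptr_bits(grid_r, t_factor_r, u_rt);
    int ublock_idx_bits = sparse.get_sparse_ublock_idx_bits(grid_r, t_factor_r, u_rt);
    // -1 => too many sparse tiles, -2 => too many ublocks along R
    return tile_ptr_bits > 0 and ublock_idx_bits > 0;
}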
(sparse.rt() / grid_r / t_factor_r / bcast_factor) : 1; std::vector vsliced = sparse.vslice(grid_r * t_factor_r * bcast_factor * dflow_factor); std::vector slices; + slices.reserve(grid_r * t_factor_r); for (int t = 0; t < t_factor_r; t++) { std::vector b_slices; - b_slices.reserve(bcast_factor * grid_r); + b_slices.reserve(grid_r * dflow_factor * bcast_factor); - if (layout == SparseBUDA::Layout::BufferOp or layout == SparseBUDA::Layout::ZMajorDataflow) + if (layout == SparseBUDA::Layout::ZMajorDataflow) { for (int r = 0; r < grid_r; r++) { @@ -882,21 +931,15 @@ SparseBUDA::get_sparse_tiles_and_encodings( Layout layout, std::string const& visualize_sparse_path) const { - // TODO: Legalizer should pass max_u_kt, and then this fn *should return* the max possible values given encoding - // constraints - int sparse_tile_ptr_bits = get_sparse_tile_ptr_bits(grid_r, t_factor_r, u_rt); int sparse_ublock_idx_bits = get_sparse_ublock_idx_bits(grid_r, t_factor_r, u_rt); + TT_ASSERT(sparse_tile_ptr_bits > 0 and sparse_ublock_idx_bits > 0); + // Calculate bits needed for ublock (u_rt + u_kt separately encoded) int u_rt_bits = get_u_rt_encoding_bits(u_rt); int u_kt_bits = get_u_kt_encoding_bits(u_kt); - TT_ASSERT( - sparse_tile_ptr_bits + u_rt_bits + u_kt_bits <= 16, - "Can't encode sparse matrix with these parameters", - sparse_tile_ptr_bits, - u_rt_bits, - u_kt_bits); + TT_ASSERT(sparse_tile_ptr_bits + u_rt_bits + u_kt_bits <= 16); std::function sp_indices_cmp_fn = [u_rt, u_kt](SparseIndex const& a, SparseIndex const& b) { return comp_zcr_ublocked(a, b, u_rt, u_kt); }; @@ -905,8 +948,6 @@ SparseBUDA::get_sparse_tiles_and_encodings( // Fracture factor is like having multiple grid_r's in flight int virtual_grid_r = grid_r * fracture_factor; - TT_ASSERT(!((layout == Layout::BufferOp) and fracture_factor > 1)); - SparseTiles sparse_tiles; EncodingTiles buda_indices; std::vector num_strips_per_row; @@ -956,7 +997,7 @@ SparseBUDA::get_sparse_tiles_and_encodings( if (env_as("PYBUDA_SPARSE_PRINT_INDICES")) { fmt::print("Grid_r[{}] {} {}\n", g_r, layout, t_factor_r); - print_info_indices(buda_indices.back(), sparse_tile_ptr_bits, sparse_ublock_idx_bits); + print_info_indices(buda_indices.back(), sparse_ublock_idx_bits); } } @@ -1001,7 +1042,7 @@ SparseBUDA::get_sparse_tiles_and_encodings( return std::make_tuple<>(sparse_tiles, buda_indices, sparse_shape, encodings_shape, num_strips_per_row); } -int SparseBUDA::get_encoding_tiles_per_core_general(int grid_r, int t_factor_r, int u_rt, int u_kt) const +int SparseBUDA::get_encoding_tiles_per_core_estimate(int grid_r, int t_factor_r, int u_rt, int u_kt) const { // strip index (with last_* bits) 4b // number of ublocks 2b @@ -1149,7 +1190,7 @@ int SparseBUDA::get_encoding_tiles_per_core_general(int grid_r, int t_factor_r, return (max_space + tile_bytes - 1) / tile_bytes; } -int SparseBUDA::get_sparse_tiles_per_core_general(int grid_r, int t_factor_r) const +int SparseBUDA::get_sparse_tiles_per_core_estimate(int grid_r, int t_factor_r) const { TT_ASSERT(this->sparse_shape[0] / TILE_DIM >= grid_r * t_factor_r); TT_ASSERT(this->sparse_shape[0] / TILE_DIM % (grid_r * t_factor_r) == 0); diff --git a/pybuda/csrc/shared_utils/sparse_matmul_utils.hpp b/pybuda/csrc/shared_utils/sparse_matmul_utils.hpp index 9a31fe2fd..444980a26 100644 --- a/pybuda/csrc/shared_utils/sparse_matmul_utils.hpp +++ b/pybuda/csrc/shared_utils/sparse_matmul_utils.hpp @@ -11,12 +11,17 @@ #include "utils/assert.hpp" #include "utils/logger.hpp" +#ifdef __clang__ +#pragma clang diagnostic ignored 
"-Wgnu-anonymous-struct" +#pragma clang diagnostic ignored "-Wc99-extensions" +#endif + namespace tt::sparse { constexpr uint32_t TILE_DIM = 32; -// third_party/budabackend/ops/mm_bare_structs.hpp + union strip_info_struct { struct @@ -148,6 +153,8 @@ struct SparseCOO while (idx++ < (rt_dim * ct_dim)) fmt::print(".{}", ((idx % ct_dim) == 0) ? "\n" : " "); } + // Vertically slice a SparseCOO tensor + // std::vector vslice(int num_slices) const { TT_ASSERT(shape[0] % num_slices == 0); @@ -161,6 +168,21 @@ struct SparseCOO std::vector ret(num_slices, SparseCOO({this->shape[0] / num_slices, this->shape[1]})); + // Calculate total count of indices upfront, in order to reserve vector space once + // + std::vector cache(num_slices, 0); + for (size_t idx = 0; idx < this->rows.size(); idx++) + { + int slice_idx = this->rows[idx] / slice_height; + cache[slice_idx]++; + } + for (size_t idx = 0; idx < cache.size(); idx++) + { + ret[idx].rows.reserve(cache[idx]); + ret[idx].cols.reserve(cache[idx]); + ret[idx].vals.reserve(cache[idx]); + } + for (size_t idx = 0; idx < this->rows.size(); idx++) { int slice_idx = this->rows[idx] / slice_height; @@ -182,21 +204,56 @@ struct SparseCOO shape[0] *= static_cast(std::distance(begin, end)); SparseCOO ret(shape); + // Calculate total count of indices upfront, in order to reserve vector space once + // + std::uint64_t total_count_indices = 0; + for (auto iter = begin; iter != end; ++iter) + { + total_count_indices += iter->vals.size(); + } + + // Early out if empty + // + if (total_count_indices == 0) + { + return ret; + } + + // Cols and Vals get reserved, while Rows get resized - we manually manage the Rows indices in the loop below + // + ret.rows.resize(total_count_indices); + ret.cols.reserve(total_count_indices); + ret.vals.reserve(total_count_indices); + std::int64_t row_offset = 0; + std::int64_t indices_previously_added = 0; for (auto iter = begin; iter != end; ++iter) { SparseCOO const& coo = *iter; TT_ASSERT(coo.shape == begin->shape); - std::int64_t prev_size = static_cast(ret.rows.size()); - ret.rows.resize(ret.rows.size() + coo.rows.size()); + + // Nothing to update if `coo` is empty + // + if (coo.vals.empty()) + { + row_offset += coo.shape[0]; + continue; + } + for (std::int64_t i = 0; i < static_cast(coo.rows.size()); ++i) - ret.rows[prev_size + i] = coo.rows[i] + row_offset; + { + ret.rows[indices_previously_added++] = coo.rows[i] + row_offset; + } ret.cols.insert(ret.cols.end(), coo.cols.begin(), coo.cols.end()); ret.vals.insert(ret.vals.end(), coo.vals.begin(), coo.vals.end()); ret.col_bounds.extend(coo.col_bounds); row_offset += coo.shape[0]; } + TT_ASSERT(ret.rows.size() == total_count_indices); + TT_ASSERT(ret.cols.size() == total_count_indices); + TT_ASSERT(ret.vals.size() == total_count_indices); + return ret; } @@ -219,25 +276,38 @@ struct SparseCOO private: SortOrder sorted_order = SortOrder::UNSORTED; + struct RowColVal + { + std::int64_t row; + std::int64_t col; + float val; + }; + + // Sorts the COO matrix in either row-major or column-major order + // void sort_(bool row_major) { - // TODO: This fn can probably optimized to work in-place with permutations - std::vector> zipped; + // If this method ever becomes memory hungry (or slow due to memory allocations), an alternative approach would + // be something like this: https://stackoverflow.com/a/17074810/4030496 + + // Zip rows, cols, and vals together + // + std::vector zipped; + zipped.reserve(rows.size()); for (size_t idx = 0; idx < rows.size(); idx++) { - 
zipped.push_back(std::make_tuple(rows[idx], cols[idx], vals[idx])); + zipped.push_back(RowColVal{rows[idx], cols[idx], vals[idx]}); } + // Sort either row-major or column-major + // if (row_major) { std::sort( zipped.begin(), zipped.end(), [](const auto& lhs, const auto& rhs) - { - return std::get<0>(lhs) == std::get<0>(rhs) ? std::get<1>(lhs) < std::get<1>(rhs) - : std::get<0>(lhs) < std::get<0>(rhs); - }); + { return lhs.row == rhs.row ? lhs.col < rhs.col : lhs.row < rhs.row; }); } else { @@ -245,17 +315,16 @@ struct SparseCOO zipped.begin(), zipped.end(), [](const auto& lhs, const auto& rhs) - { - return std::get<1>(lhs) == std::get<1>(rhs) ? std::get<0>(lhs) < std::get<0>(rhs) - : std::get<1>(lhs) < std::get<1>(rhs); - }); + { return lhs.col == rhs.col ? lhs.row < rhs.row : lhs.col < rhs.col; }); } + // Update original rows, cols, and vals + // for (size_t idx = 0; idx < rows.size(); idx++) { - rows[idx] = std::get<0>(zipped[idx]); - cols[idx] = std::get<1>(zipped[idx]); - vals[idx] = std::get<2>(zipped[idx]); + rows[idx] = zipped[idx].row; + cols[idx] = zipped[idx].col; + vals[idx] = zipped[idx].val; col_bounds.extend(cols[idx]); } } @@ -283,13 +352,147 @@ struct SparseBUDA public: enum class Layout { - Default, - ZMajor, - ZMajorDataflow, - BufferOp, + Default, // Default layout + ZMajor, // Z-major layout, e.g. RZ streaming - go thru all Zs first, for a given R slice + ZMajorDataflow, // Z-major layout, special cased for sparse->dense dataflow (same as ZMajor, but slice + // vertically down to single tile) }; - static Layout create_layout(bool buffer_op, bool z_major, int fracture_factor); + // A little more on layouts... + // + // Layout dictates what the sparse tensor of the sparse matmul will look like. Sparse tensors get divided into + // chunks, to accomodate for parallelization across cores and t-streaming. Depending on what the chunks look like, + // and which core receives which chunk, the performance profile of the sparse matmul can change - additionally, + // whether the parallelization is legal can also change. + // + // Sparse matmul ops are used for various scenarios, but most often they're there as building blocks of + // convolutions. Sparse tensors of such ops have a specific pattern that looks something like this (e.g. for a 2x2 + // convolution): + // [ + // 1 0 0 0 + // 0 1 0 0 + // 0 0 1 0 + // 0 0 0 1 + // 1 0 0 0 + // 0 1 0 0 + // 0 0 1 0 + // 0 0 0 1 + // 1 0 0 0 + // 0 1 0 0 + // 0 0 1 0 + // 0 0 0 1 + // 1 0 0 0 + // 0 1 0 0 + // 0 0 1 0 + // 0 0 0 1 + // ] + // So they will look like a set of diagonal matrices, each one representing a single kernel point of the + // convolution. + // + // Currently, there are 3 variants of the layout: Default, ZMajor, and ZMajorDataflow. + // + // Default: + // In this layout, the sparse tensor is in its original shape. + // + // ZMajor: + // In this layout, the goal is to eliminate serialization in execution between cores. It is easier to explain the + // transformation first, and then show why it eliminates serialization. + // + // The transformation is as follows: + // - Let's say we have 2x1 cores for this sparse op and t=2. The top core will handle the first two kernel + // points while the second core will handle the last two kernel points - this is by design of the layout. In t=0 + // the top core will get the top parts of the sparse tensor's first two kernel points, which is two 2x4x4 + // pieces. 
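// The in-place alternative the comment above points at: sort a permutation of
// indices with the same comparator and then apply it, rather than materializing
// zipped row/col/val structs. Sketch of the row-major case (illustrative only):
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <numeric>
#include <vector>

inline std::vector<std::size_t> row_major_permutation(
    const std::vector<std::int64_t>& rows, const std::vector<std::int64_t>& cols)
{
    std::vector<std::size_t> perm(rows.size());
    std::iota(perm.begin(), perm.end(), 0);
    std::sort(
        perm.begin(),
        perm.end(),
        [&](std::size_t a, std::size_t b)
        { return rows[a] == rows[b] ? cols[a] < cols[b] : rows[a] < rows[b]; });
    return perm;  // applying perm to rows, cols and vals gives the sorted order
}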
The bottom core will get the top parts of the sparse tensor's last two kernel points, which is also + // two 2x4x4 pieces. So for t=0, the first core's tensor will look like this: + // [ + // 1 0 0 0 <- first row of the first kernel point + // 0 1 0 0 <- second row of the first kernel point + // 1 0 0 0 <- first row of the second kernel point + // 0 1 0 0 <- second row of the second kernel point + // ] + // - And the second core, for t=0, it's tensor will look like this: + // [ + // 1 0 0 0 <- first row of the third kernel point + // 0 1 0 0 <- second row of the third kernel point + // 1 0 0 0 <- first row of the fourth kernel point + // 0 1 0 0 <- second row of the fourth kernel point + // ] + // - If we were to continue this for t=1, the first core's tensor would look like this: + // [ + // 0 0 1 0 <- third row of the first kernel point + // 0 0 0 1 <- fourth row of the first kernel point + // 0 0 1 0 <- third row of the second kernel point + // 0 0 0 1 <- fourth row of the second kernel point + // ] + // - And the second core, for t=1, it's tensor will look like this: + // [ + // 0 0 1 0 <- third row of the third kernel point + // 0 0 0 1 <- fourth row of the third kernel point + // 0 0 1 0 <- third row of the fourth kernel point + // 0 0 0 1 <- fourth row of the fourth kernel point + // ] + // + // From the perspective of the sparse matmul, where these sparse tensors are left operands, the ZMajor layout + // makes it so that all cores, in a given point in time, are working on the same inner dimension of the matmul. If + // we think about how the right operand is buffered (horizontal strips), and keep in mind that all cores receive + // the full right operand tensor, this layout makes it so that all cores are working on the same strip of the + // right operand tensor, which removes any serialization in execution. + // + // Naming comes from the fact that the sparse tensor is first vertically sliced by a factor of kernel points, + // which is in this case a vslice(4). Then, a core will work on a set of Zs for a given R slice, so in a way, if + // we fix the height that a core is reading (in this case 2-tile high), we go through the tensor in Z-major order, + // hence the name ZMajor for the layout. + // + // ZMajorDataflow: + // This layout is a special case of ZMajor, where the sparse tensor is vertically sliced all the way down to a + // single tile. The goal of this layout is to improve the dataflow between the sparse and dense matmuls - if we + // place a sparse&dense pair of ops next to each other, with the same height of cores, as we usually do, this + // layout will enable dataflow thru NOC to use direct pipes, which means that each sparse core will send its + // output directly and only to its corresponding core of dense matmul, which is right next to the sparse core. + // Hence the "Dataflow" in the name. + // + // Using the sparse tensor from the Default layout, the transformation to ZMajorDataflow is as follows: + // - Similar to ZMajor, we can first imagine that the sparse tensor is vertically sliced by a factor of kernel + // points, which is in this case a vslice(4), so the tensor goes from 16x4 to a 4x4x4. Then, imagine that the + // tensor is read completely in Z-major order, tile by tile. Let's say we have 2x1 cores for this sparse op and + // t=2. 
First core, for t=0, will get these tiles: + // [ + // 1 0 0 0 <- first tile of the first kernel point + // 1 0 0 0 <- first tile of the second kernel point + // 1 0 0 0 <- first tile of the third kernel point + // 1 0 0 0 <- first tile of the fourth kernel point + // ] + // - And the second core, for t=0, it's tensor will look like this: + // [ + // 0 1 0 0 <- second tile of the first kernel point + // 0 1 0 0 <- second tile of the second kernel point + // 0 1 0 0 <- second tile of the third kernel point + // 0 1 0 0 <- second tile of the fourth kernel point + // ] + // - If we were to continue this for t=1, the first core's tensor would look like this: + // [ + // 0 0 1 0 <- third tile of the first kernel point + // 0 0 1 0 <- third tile of the second kernel point + // 0 0 1 0 <- third tile of the third kernel point + // 0 0 1 0 <- third tile of the fourth kernel point + // ] + // - And the second core, for t=1, it's tensor will look like this: + // [ + // 0 0 0 1 <- fourth tile of the first kernel point + // 0 0 0 1 <- fourth tile of the second kernel point + // 0 0 0 1 <- fourth tile of the third kernel point + // 0 0 0 1 <- fourth tile of the fourth kernel point + // ] + // + // The main difference between ZMajor and ZMajorDataflow is that ZMajorDataflow is vertically sliced all the way + // down to a single tile, which enables the dataflow between the sparse and dense matmuls to use direct pipes. + // However, by doing this transformation, there is a side-effect on the output, the rows of the output are all + // mixed up, i.e. they're not in the order that the dense matmul "expects" them to be in. To correct this, a set + // of TM ops is applied to the output of the sparse matmul, which will reorder the rows back to the correct order. + // This set of TM ops might look complicated, but when worked out, it just makes it so that each sparse matmul + // core sends data only to its corresponding dense matmul core, which is right next to it. 
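// Condensing the two walkthroughs above into index arithmetic, using the same
// toy shapes (4 kernel points, 4 rows per point, grid_r = 2, t_factor_r = 2).
// Under ZMajor a core owns a contiguous block of kernel points and steps
// through their rows across t; under ZMajorDataflow every core touches all
// kernel points and owns exactly one row per t. Hypothetical helpers, meant
// only to restate the prose; the real slicing is done by vslice_layout.
struct ZMajorChunk
{
    int first_kernel_point;  // first z owned by this core
    int num_kernel_points;   // how many z's this core owns
    int first_row;           // first row of each owned z for this t
    int num_rows;            // rows per owned z for this t
};

inline ZMajorChunk zmajor_chunk(
    int core_r, int t, int grid_r, int t_factor_r, int kernel_points, int rows_per_point)
{
    ZMajorChunk chunk;
    chunk.num_kernel_points = kernel_points / grid_r;
    chunk.first_kernel_point = core_r * chunk.num_kernel_points;
    chunk.num_rows = rows_per_point / t_factor_r;
    chunk.first_row = t * chunk.num_rows;
    return chunk;
}

// ZMajorDataflow: the single row owned by (core_r, t) across all kernel points.
inline int zmajor_dataflow_row(int core_r, int t, int grid_r) { return t * grid_r + core_r; }
// Matches the walkthrough: (core 0, t 0) -> row 0, (core 1, t 0) -> row 1,
// (core 0, t 1) -> row 2, (core 1, t 1) -> row 3.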
+ + static Layout create_layout(bool z_major, int fracture_factor); std::vector sparse_zs; std::vector sparse_indices; @@ -309,8 +512,8 @@ struct SparseBUDA int bcast_factor, int fracture_factor); - int get_sparse_tiles_per_core_general(int grid_r, int t_factor_r) const; - int get_encoding_tiles_per_core_general(int grid_r, int t_factor_r, int u_rt, int u_kt) const; + int get_sparse_tiles_per_core_estimate(int grid_r, int t_factor_r) const; + int get_encoding_tiles_per_core_estimate(int grid_r, int t_factor_r, int u_rt, int u_kt) const; int get_sparse_tile_ptr_bits(int grid_r, int t_factor_r, int u_rt = 1) const; int get_sparse_ublock_idx_bits(int grid_r, int t_factor_r, int u_rt = 1) const; int get_max_u_kt(int grid_r, int t_factor_r, int u_rt, int sparse_tile_ptr_bits = 0) const; @@ -348,6 +551,9 @@ struct SparseBUDA SparseBUDA compress_sparse_tensor_and_strip_info( std::vector const& sparse_zs, int bcast_factor, int fracture_factor); +int get_u_rt_encoding_bits(int u_rt); +int get_u_kt_encoding_bits(int u_kt); + std::ostream& operator<<(std::ostream& out, const SparseCOO::SortOrder& sort_order); std::ostream& operator<<(std::ostream& out, SparseBUDA::Layout layout); diff --git a/pybuda/csrc/shared_utils/string_extension.cpp b/pybuda/csrc/shared_utils/string_extension.cpp new file mode 100644 index 000000000..c696efb0f --- /dev/null +++ b/pybuda/csrc/shared_utils/string_extension.cpp @@ -0,0 +1,21 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 + +#include "string_extension.hpp" +#include +namespace tt::utils { + // Convert a string to lower case + std::string to_lower_string(const std::string& str) { + std::string lower_str = str; + std::transform(lower_str.begin(), lower_str.end(), lower_str.begin(), [](unsigned char c) { return std::tolower(c); }); + return lower_str; + } + + // Convert a string to upper case + std::string to_upper_string(const std::string &str) { + std::string upper_str = str; + std::transform(upper_str.begin(), upper_str.end(), upper_str.begin(), [](unsigned char c) { return std::toupper(c); }); + return upper_str; + } +} \ No newline at end of file diff --git a/pybuda/csrc/shared_utils/string_extension.hpp b/pybuda/csrc/shared_utils/string_extension.hpp new file mode 100644 index 000000000..326450745 --- /dev/null +++ b/pybuda/csrc/shared_utils/string_extension.hpp @@ -0,0 +1,14 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include + +namespace tt::utils{ + // Convert a string to lower case + std::string to_lower_string(const std::string& str); + + // Convert a string to upper case + std::string to_upper_string(const std::string &str); +} \ No newline at end of file diff --git a/pybuda/csrc/test/common.hpp b/pybuda/csrc/test/common.hpp index 5dd49ffbd..6ce95b8ac 100644 --- a/pybuda/csrc/test/common.hpp +++ b/pybuda/csrc/test/common.hpp @@ -280,37 +280,18 @@ class GraphTest : public ::testing::Test using PybudaGraphTest = GraphTest; using BudaGraphTest = GraphTest; -enum class Arch -{ - Grayskull, - Wormhole, - Wormhole_b0, -}; - -inline std::string arch2str(Arch arch) -{ - switch (arch) - { - case Arch::Grayskull: return "Grayskull"; - case Arch::Wormhole_b0: return "Wormhole_b0"; - case Arch::Wormhole: return "Wormhole"; - } - return "Unknown"; -} - inline DeviceConfig create_device_config( - Arch arch = Arch::Grayskull, + tt::ARCH arch = tt::ARCH::GRAYSKULL, std::optional< std::vector > device_chip_ids = std::nullopt, std::string 
cluster_config_yaml = "", std::string runtime_params_yaml = ""); DeviceConfig create_device_config( - Arch arch, + tt::ARCH arch, std::optional< std::vector > device_chip_ids, std::string cluster_config_yaml, std::string runtime_params_yaml) { - std::string home = env_as("BUDA_HOME", "third_party/budabackend"); std::vector chip_ids = {0}; if(device_chip_ids.has_value()) { chip_ids = device_chip_ids.value(); @@ -318,7 +299,7 @@ DeviceConfig create_device_config( switch (arch) { - case Arch::Grayskull: + case tt::ARCH::GRAYSKULL: return DeviceConfig( "grayskull" /*arch_name*/, home + "/device/grayskull_120_arch.yaml" /*device_yaml*/, @@ -327,19 +308,19 @@ DeviceConfig create_device_config( "golden" /*backend_type*/, false /*store_backend_db_to_yaml*/, chip_ids); - case Arch::Wormhole: + case tt::ARCH::WORMHOLE_B0: return DeviceConfig( - "wormhole" /*arch_name*/, - home + "/device/wormhole_80_arch.yaml" /*device_yaml*/, + "wormhole_b0" /*arch_name*/, + home + "/device/wormhole_b0_80_arch.yaml" /*device_yaml*/, cluster_config_yaml /*cluster_config_yaml*/, runtime_params_yaml /*runtime_params_yaml*/, "golden" /*backend_type*/, false /*store_backend_db_to_yaml*/, chip_ids); - case Arch::Wormhole_b0: + case tt::ARCH::BLACKHOLE: return DeviceConfig( - "wormhole_b0" /*arch_name*/, - home + "/device/wormhole_b0_80_arch.yaml" /*device_yaml*/, + "blackhole" /*arch_name*/, + home + "/device/blackhole_80_arch.yaml" /*device_yaml*/, cluster_config_yaml /*cluster_config_yaml*/, runtime_params_yaml /*runtime_params_yaml*/, "golden" /*backend_type*/, diff --git a/pybuda/csrc/tt_torch_device/CMakeLists.txt b/pybuda/csrc/tt_torch_device/CMakeLists.txt new file mode 100644 index 000000000..9272b56b3 --- /dev/null +++ b/pybuda/csrc/tt_torch_device/CMakeLists.txt @@ -0,0 +1,8 @@ +add_library(tt_torch_device + STATIC + tt_device.cpp + torch_device_impl.cpp + python_bindings.cpp) + +target_compile_options(tt_torch_device PRIVATE ${STATIC_LIB_FLAGS} ${PYBUDA_CSRC_CFLAGS}) +add_dependencies(tt_torch_device build_tt_mlir) diff --git a/pybuda/csrc/tt_torch_device/module.mk b/pybuda/csrc/tt_torch_device/module.mk index 4768cabbf..05caabad8 100644 --- a/pybuda/csrc/tt_torch_device/module.mk +++ b/pybuda/csrc/tt_torch_device/module.mk @@ -13,10 +13,10 @@ PYBUDA_CSRC_TT_TORCH_DEVICE_DEPS = $(addprefix $(OBJDIR)/, $(PYBUDA_CSRC_TT_TORC pybuda/csrc/tt_torch_device: $(PYBUDA_CSRC_TT_TORCH_DEVICE_LIB); -$(PYBUDA_CSRC_TT_TORCH_DEVICE_LIB): $(PYBUDA_CSRC_TT_TORCH_DEVICE_OBJS) $(PYBUDA_CSRC_BACKENDAPI_LIB) +$(PYBUDA_CSRC_TT_TORCH_DEVICE_LIB): $(PYBUDA_CSRC_TT_TORCH_DEVICE_OBJS) @mkdir -p $(LIBDIR) ar rcs $@ $^ -$(OBJDIR)/pybuda/csrc/tt_torch_device/%.o: pybuda/csrc/tt_torch_device/%.cpp python_env +$(OBJDIR)/pybuda/csrc/tt_torch_device/%.o: pybuda/csrc/tt_torch_device/%.cpp $(PYTHON_ENV) @mkdir -p $(@D) $(CXX) $(PYBUDA_CSRC_CFLAGS) $(CXXFLAGS) $(SHARED_LIB_FLAGS) $(PYBUDA_CSRC_TT_TORCH_DEVICE_INCLUDES) -c -o $@ $< diff --git a/pybuda/csrc/tt_torch_device/python_bindings.cpp b/pybuda/csrc/tt_torch_device/python_bindings.cpp index 3cc8c41cd..bf86a31f7 100644 --- a/pybuda/csrc/tt_torch_device/python_bindings.cpp +++ b/pybuda/csrc/tt_torch_device/python_bindings.cpp @@ -4,13 +4,14 @@ #include "tt_torch_device/python_bindings.hpp" #include "tt_torch_device/tt_device.hpp" #include "pybuda/csrc/python_bindings_common.hpp" +#include "pybuda/csrc/backend_api/arch_type.hpp" namespace tt { void TorchDeviceModule(py::module &m_torch_device) { - m_torch_device.def("get_default_device", []() { return tt::get_default_tt_device(); }); + 
m_torch_device.def("get_default_device", &tt::get_default_tt_device, py::return_value_policy::reference); m_torch_device.def("get_available_devices", []() { return tt::get_available_tt_devices(); }); py::class_(m_torch_device, "PyBudaTensorDesc") @@ -29,59 +30,27 @@ void TorchDeviceModule(py::module &m_torch_device) .def_readonly("ptr", &tt::PyBudaTensorDesc::ptr) .def_readonly("constant", &tt::PyBudaTensorDesc::constant); - py::class_(m_torch_device, "Program") - .def( - py::init const&>(), - py::arg("name"), - py::arg("params")); - - py::class_(m_torch_device, "CompileRequest") - .def( - py::init< - std::string const&, - std::string const&, - tt::tt_backend_config const&, - std::vector const&, - std::vector const&, - std::vector const&, - std::vector const&, - std::vector const&, - std::vector const&>(), - py::arg("netlist_path"), - py::arg("output_dir"), - py::arg("backend_config"), - py::arg("inputs"), - py::arg("input_runtime_transforms"), - py::arg("constants"), - py::arg("parameters"), - py::arg("outputs"), - py::arg("output_runtime_transforms")); - py::class_>(m_torch_device, "Workload") - .def_readonly("backend", &tt::Workload::backend) .def_readonly("inputs", &tt::Workload::inputs) .def_readonly("constants", &tt::Workload::constants) .def_readonly("parameters", &tt::Workload::parameters) .def_readonly("outputs", &tt::Workload::outputs); - py::class_(m_torch_device, "TTDevice") - .def_readonly("type", &tt::TTDevice::type) - .def_readonly("arch", &tt::TTDevice::arch) + py::class_tt_device (m_torch_device, "TTDevice"); + tt_device.def_readonly("arch", &tt::TTDevice::arch) .def_readonly("mmio", &tt::TTDevice::mmio) - .def_readonly("index", &tt::TTDevice::index) - .def_readonly("soc_desc_yaml", &tt::TTDevice::soc_desc_yaml) + .def_readonly("input_runtime_transforms", &tt::TTDevice::input_runtime_transforms) + .def_readonly("input_tile_bcast_dims", &tt::TTDevice::input_tile_bcast_dims) + .def_readonly("output_runtime_transforms", &tt::TTDevice::output_runtime_transforms) .def_property_readonly("cluster_yaml", &tt::get_device_cluster_yaml) .def("torch_device", &tt::torch_device) .def("str", &tt::to_string) .def("__str__", &tt::to_string) - .def("compile", &tt::compile) .def("dispatch", &tt::dispatch); - m_torch_device.def("push_tensor", tt::push_tensor); m_torch_device.def("is_created_on_device", tt::is_created_on_device); m_torch_device.def("original_shape", tt::original_shape); + m_torch_device.def("unique_id", tt::unique_id); } - - } diff --git a/pybuda/csrc/tt_torch_device/python_bindings.hpp b/pybuda/csrc/tt_torch_device/python_bindings.hpp index 7d2143292..bbc973fce 100644 --- a/pybuda/csrc/tt_torch_device/python_bindings.hpp +++ b/pybuda/csrc/tt_torch_device/python_bindings.hpp @@ -3,7 +3,12 @@ // SPDX-License-Identifier: Apache-2.0 #pragma once + +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wgnu-zero-variadic-macro-arguments" #include +#pragma clang diagnostic pop + #include #include namespace py = pybind11; diff --git a/pybuda/csrc/tt_torch_device/torch_device_impl.cpp b/pybuda/csrc/tt_torch_device/torch_device_impl.cpp index 0e9725108..f926097a3 100644 --- a/pybuda/csrc/tt_torch_device/torch_device_impl.cpp +++ b/pybuda/csrc/tt_torch_device/torch_device_impl.cpp @@ -3,18 +3,21 @@ // SPDX-License-Identifier: Apache-2.0 #define STRIP_ERROR_MESSAGES #include -#include #include #include +#include #include #include -#include "pybuda/csrc/tt_torch_device/tt_device.hpp" +#include "pybuda/csrc/lower_to_buda/common.hpp" #include 
"pybuda/csrc/tt_torch_device/python_bindings.hpp" +#include "pybuda/csrc/tt_torch_device/tt_device.hpp" #include "utils/assert.hpp" #include "utils/logger.hpp" +#include "tt/runtime/runtime.h" + namespace tt { // There are dummy enums defined in pytorch, like PrivateUse1 that can be used @@ -23,10 +26,10 @@ namespace tt constexpr inline c10::DispatchKey DispatchKeyTT = c10::DispatchKey::PrivateUse1; // TorchDevice interposes potentially many underlying HWDevices -class TorchDeviceImpl : public c10::impl::DeviceGuardImplInterface +class TorchDeviceImpl final : public c10::impl::DeviceGuardImplInterface { public: - TorchDeviceImpl(std::vector const& tt_devices) : tt_devices(tt_devices) {} + TorchDeviceImpl(const TTSystem& system) : tt_devices(system.devices) {} // Torch overrides virtual c10::DeviceType type() const override { return TT; } @@ -63,24 +66,27 @@ class TorchDeviceImpl : public c10::impl::DeviceGuardImplInterface // TT specific static TorchDeviceImpl& get() { - static TorchDeviceImpl tt_device_impl(query_available_tt_devices()); + static TorchDeviceImpl tt_device_impl(TTSystem::get_system()); + return tt_device_impl; } std::int64_t get_index() { return current_device.index(); } - TTDevice getTTDevice() const + int get_next_unique_id() { return next_id++; } + + std::shared_ptr getTTDevice() const { TT_ASSERT(current_device.index() < (int)tt_devices.size()); return tt_devices[current_device.index()]; } - TTDevice getDefaultTTDevice() const + const std::shared_ptr& getDefaultTTDevice() const { TT_ASSERT(not tt_devices.empty()); return tt_devices.front(); } - std::vector getTTDevices() const { return tt_devices; } + std::vector> getTTDevices() const { return tt_devices; } std::map registered_output_transforms; std::vector ordered_input_trasforms; @@ -89,18 +95,19 @@ class TorchDeviceImpl : public c10::impl::DeviceGuardImplInterface private: mutable c10::Device current_device = c10::Device(TT, 0); mutable c10::Stream current_stream = c10::Stream(c10::Stream::UNSAFE, c10::Device(TT, 0), 0); - std::vector tt_devices; + std::vector> tt_devices; + int next_id = 0; }; // register backend c10::impl::DeviceGuardImplRegistrar tt_device_impl_reg(TT, &TorchDeviceImpl::get()); -TTDevice get_default_tt_device() { return TorchDeviceImpl::get().getDefaultTTDevice(); } -std::vector get_available_tt_devices() { return TorchDeviceImpl::get().getTTDevices(); } +const std::shared_ptr& get_default_tt_device() { return TorchDeviceImpl::get().getDefaultTTDevice();} +std::vector> get_available_tt_devices() { return TorchDeviceImpl::get().getTTDevices(); } struct Mallocator final : public c10::Allocator { - virtual c10::DataPtr allocate(size_t n) const + virtual c10::DataPtr allocate(size_t n) const override { void* ptr = std::calloc(n, 1); return c10::DataPtr(ptr, nullptr, std::free, c10::Device(TT, 0)); @@ -122,23 +129,6 @@ void fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) at::native::cpu_fallback(op, stack); } -static torch::ScalarType df_to_torch_scalar_type(DataFormat df) -{ - switch (df) - { - case DataFormat::Int8: return torch::ScalarType::Byte; - case DataFormat::UInt16: return torch::ScalarType::Short; - case DataFormat::RawUInt32: return torch::ScalarType::Int; - case DataFormat::Int32: return torch::ScalarType::Int; - case DataFormat::Float16: return torch::ScalarType::Half; - case DataFormat::Float32: return torch::ScalarType::Float; - case DataFormat::Float16_b: return torch::ScalarType::BFloat16; - default: break; - } - - log_fatal(LogTTDevice, "Unhandled dtype {}", df); 
-} - static std::pair, std::size_t> calculate_stride_size( torch::IntArrayRef size, torch::IntArrayRef stride, std::size_t scalar_size) { @@ -198,53 +188,6 @@ std::string get_runtime_transform(torch::Tensor const& tensor, bool input) } } -torch::Tensor from_pytorch_tensor_desc( - tt_PytorchTensorDesc const& desc, std::vector const& shape, FreePytorchTensorDescFn* free_fn) -{ - std::int64_t elemsize = static_cast(desc.itemsize); - std::vector strides = { - static_cast(desc.strides[0]) / elemsize, - static_cast(desc.strides[1]) / elemsize, - static_cast(desc.strides[2]) / elemsize, - static_cast(desc.strides[3]) / elemsize, - }; - std::vector aligned_shape; - size_t dim = 0; - for (auto s : shape) - { - if (shape.size() <= 2 or dim < shape.size() - 2) - aligned_shape.push_back(s); - else - aligned_shape.push_back(align_up_tile(s)); - dim++; - } - TT_ASSERT(shape.size() <= strides.size()); - - while (strides.size() > shape.size()) strides.erase(strides.begin()); - - torch::ScalarType type = df_to_torch_scalar_type(desc.format); - std::int64_t size_bytes = strides.front() * elemsize; - - tt_PytorchTensorDesc* ctx = new tt_PytorchTensorDesc(desc); - c10::Storage storage( - c10::Storage::use_byte_size_t(), - size_bytes, - at::DataPtr(const_cast(desc.ptr), static_cast(ctx), free_fn, at::Device(TT, TorchDeviceImpl::get().get_index()))); - - c10::DispatchKeySet dispatch_keyset = c10::DispatchKeySet{DispatchKeyTT}; - c10::intrusive_ptr impl = c10::make_intrusive( - std::move(storage), dispatch_keyset, caffe2::TypeMeta::fromScalarType(type)); - - impl->set_sizes_and_strides(torch::IntArrayRef(aligned_shape), torch::IntArrayRef(strides)); - - c10::intrusive_ptr backend_meta{std::unique_ptr(new TTMetaData())}; - TTMetaData *tt_meta = dynamic_cast(backend_meta.get()); - tt_meta->runtime_transformed = false; - tt_meta->created_on_device = true; - impl->set_backend_meta(backend_meta); - - return torch::Tensor::wrap_tensor_impl(impl);; -} torch::Device torch_device_at_index(std::int64_t index) { return torch::Device(TT, index); } @@ -327,6 +270,45 @@ torch::Tensor empty( return empty_strided(size, stride, dtype, layout, device, pin_memory); } +// torch::Tensor to( +// const torch::Tensor& self, +// c10::optional dtype, +// c10::optional layout, +// c10::optional device, +// c10::optional pin_memory, +// bool non_blocking, +// bool copy, +// c10::optional optional_memory_format +// ) +// { +// (void)dtype; +// (void)layout; +// (void)device; +// (void)pin_memory; +// (void)non_blocking; +// (void)copy; +// (void)optional_memory_format; +// return self; +// } +// torch::Tensor _to_copy( +// const torch::Tensor& self, +// c10::optional dtype, +// c10::optional layout, +// c10::optional device, +// c10::optional pin_memory, +// bool non_blocking, +// c10::optional optional_memory_format) +// { +// (void)dtype; +// (void)layout; +// (void)device; +// (void)pin_memory; +// (void)non_blocking; +// (void)optional_memory_format; +// //TODO: Implement me +// return self; +// } + torch::Tensor _copy_from(const torch::Tensor& self, const torch::Tensor& dest, bool non_blocking) { PyGILState_STATE gstate=PyGILState_Ensure(); @@ -393,11 +375,15 @@ torch::Tensor _copy_from(const torch::Tensor& self, const torch::Tensor& dest, b // barrier self // barrier dest // dest = self - - log_fatal( - "Unsupported (for now) _copy_from TTDevice[{}] to TTDevice[{}]", - self.device().index(), - dest.device().index()); + + //log_fatal( + // "Unsupported (for now) _copy_from TTDevice[{}] to TTDevice[{}]", + // self.device().index(), + // 
dest.device().index()); + auto self_num_items = self.numel(); + auto dest_num_items = dest.numel(); + TT_ASSERT(self_num_items == dest_num_items, self_num_items, dest_num_items); + std::memcpy(dest_tensor_data, self_tensor_data, self_nbytes); } else { @@ -520,6 +506,7 @@ torch::Tensor view(const torch::Tensor &self, const c10::IntArrayRef size) self.strides(), size); + PyGILState_Release(gstate); at::DimVector inferred_size = at::infer_size_dv(size, self.numel()); c10::optional stride = at::detail::computeStride(self.sizes(), self.strides(), @@ -536,7 +523,6 @@ torch::Tensor view(const torch::Tensor &self, const c10::IntArrayRef size) else TT_ASSERT(false, "Unhandled"); - PyGILState_Release(gstate); return ret; } } // namespace tt @@ -547,13 +533,15 @@ TORCH_LIBRARY_IMPL(aten, PrivateUse1, m) if (ops_registered) return; ops_registered = true; - m.impl("aten::empty.memory_format", &tt::empty); - m.impl("aten::empty_strided", &tt::empty_strided); - m.impl("aten::_copy_from", &tt::_copy_from); - m.impl("aten::_copy_from_and_resize", &tt::_copy_from_and_resize); - m.impl("aten::_reshape_alias", &tt::_reshape_alias); - m.impl("aten::as_strided", &tt::as_strided); - m.impl("aten::index.Tensor_out", &tt::index_outf); + // m.impl("aten::empty.memory_format", &tt::empty); + // m.impl("aten::empty_strided", &tt::empty_strided); + // m.impl("aten::_copy_from", &tt::_copy_from); + // m.impl("aten::_to_copy", &tt::_to_copy); + // m.impl("aten::to", &tt::to); + // m.impl("aten::_copy_from_and_resize", &tt::_copy_from_and_resize); + // m.impl("aten::_reshape_alias", &tt::_reshape_alias); + // m.impl("aten::as_strided", &tt::as_strided); + // m.impl("aten::index.Tensor_out", &tt::index_outf); // m.impl("aten::view", &tt::view); } @@ -565,3 +553,4 @@ TORCH_LIBRARY_IMPL(_, PrivateUse1, m) fallback_registered = true; m.fallback(torch::CppFunction::makeFromBoxedFunction<&tt::fallback>()); } + diff --git a/pybuda/csrc/tt_torch_device/tt_device.cpp b/pybuda/csrc/tt_torch_device/tt_device.cpp index c21279a7d..72a87c20e 100644 --- a/pybuda/csrc/tt_torch_device/tt_device.cpp +++ b/pybuda/csrc/tt_torch_device/tt_device.cpp @@ -9,9 +9,9 @@ #include #include -#include "pybuda/csrc/balancer/output_host_tm_types.hpp" -#include "third_party/budabackend/netlist/tt_backend.hpp" -#include "third_party/budabackend/netlist/tt_backend_api.hpp" +#include "pybuda/csrc/lower_to_buda/common.hpp" +#include "tt/runtime/runtime.h" +#include "tt/runtime/types.h" #include "utils/assert.hpp" #include "utils/env.hpp" #include "utils/logger.hpp" @@ -37,260 +37,216 @@ struct CommandQueue std::vector commands; }; -std::shared_ptr compile(TTDevice& device, CompileRequest const& compile_request) -{ - TT_ASSERT(device.arch == compile_request.backend_config.arch); - - std::shared_ptr workload = std::make_shared( - tt_backend::create(compile_request.netlist_path, compile_request.backend_config), - compile_request.output_dir, - compile_request.inputs, - compile_request.constants, - compile_request.parameters, - compile_request.outputs); - - register__ordered_input_runtime_transforms(compile_request.input_runtime_transforms); - device.input_runtime_transforms = compile_request.input_runtime_transforms; - device.output_runtime_transforms = compile_request.output_runtime_transforms; - - tt::tt_compile_result result; - if (workload->backend->initialize(&result) != DEVICE_STATUS_CODE::Success) - log_fatal(LogTTDevice, "Backend compile failed: {}", tt::get_string(result)); - - workload->initialized = true; - - return workload; -} - -static DataFormat 
torch_scalar_type_to_df(torch::ScalarType st) +static target::DataType torch_scalar_type_to_dt(torch::ScalarType st) { switch (st) { - case torch::ScalarType::Byte: return DataFormat::Int8; - case torch::ScalarType::Char: return DataFormat::Int8; - case torch::ScalarType::Short: return DataFormat::UInt16; - case torch::ScalarType::Int: return DataFormat::RawUInt32; - case torch::ScalarType::Long: return DataFormat::RawUInt32; - case torch::ScalarType::Half: return DataFormat::Float16; - case torch::ScalarType::Float: return DataFormat::Float32; + case torch::ScalarType::Byte: return target::DataType::UInt8; + case torch::ScalarType::Char: return target::DataType::UInt8; + case torch::ScalarType::Short: return target::DataType::UInt16; + case torch::ScalarType::Int: return target::DataType::UInt32; + case torch::ScalarType::Long: return target::DataType::UInt32; + case torch::ScalarType::Half: return target::DataType::Float16; + case torch::ScalarType::Float: return target::DataType::Float32; // case torch::ScalarType::Double: // case torch::ScalarType::ComplexHalf: // case torch::ScalarType::ComplexFloat: // case torch::ScalarType::ComplexDouble: // case torch::ScalarType::Bool: - case torch::ScalarType::BFloat16: return DataFormat::Float16_b; + case torch::ScalarType::BFloat16: return target::DataType::BFloat16; default: break; } log_fatal(LogTTDevice, "Unhandled dtype {}", st); } -static void free_tt_PytorchTensorDesc(void* ctx) +static torch::ScalarType dt_to_torch_scalar_type(target::DataType df) { - tt_PytorchTensorDesc* desc = static_cast(ctx); - backend::free_tensor(*desc); - delete desc; -} - -static tt_PytorchTensorDesc to_pytorch_tensor_desc(torch::Tensor const& tensor) -{ - // TT_ASSERT(tensor.is_contiguous()); - TT_ASSERT(tensor.dim() <= 4); - TT_ASSERT(tensor.strides().size() == tensor.sizes().size()); - - std::int64_t dim = (std::int64_t)tensor.sizes().size(); - TT_ASSERT(dim > 0); - - std::size_t scalar_size = tensor.element_size(); - std::array shape = {1, 1, kTileDim, kTileDim}; - std::array strides = {0, 0, 0, 0}; - - int i = PY_TENSOR_DIMS - dim; - for (auto s : tensor.sizes()) + switch (df) { - shape[i] = i >= 2 ? 
align_up_tile(s) : s; - ++i; + case target::DataType::UInt8: return torch::ScalarType::Byte; + case target::DataType::UInt16: return torch::ScalarType::Short; + case target::DataType::UInt32: return torch::ScalarType::Int; + case target::DataType::Float16: return torch::ScalarType::Half; + case target::DataType::Float32: return torch::ScalarType::Float; + case target::DataType::BFloat16: return torch::ScalarType::BFloat16; + default: break; } - i = PY_TENSOR_DIMS - 1; - for (int j = dim - 1; j >= 0; --j, --i) - { - strides[i] = tensor.strides()[j] * scalar_size; - } + log_fatal(LogTTDevice, "Unhandled dtype {}", df); +} - // Special case where dim == 1 - if (i >= 2) - { - strides[i] = kTileDim * scalar_size; - --i; +void pad_to_buda_shape(torch::Tensor & tensor) +{ + auto tt_device = tensor.device(); + auto cpu_tensor = tensor.to(torch::kCPU); + if (cpu_tensor.sizes().size() > 4) { + throw std::runtime_error("Tensor has more than 4 dimensions"); + } else if (cpu_tensor.sizes().size() < 4) { + auto tensor_impl = cpu_tensor.unsafeGetTensorImpl(); + std::vector new_shape; + for (size_t i = 0; i < cpu_tensor.sizes().size(); i++) { + new_shape.push_back(cpu_tensor.sizes()[i]); + } + while (new_shape.size() < 4) { + new_shape.insert(new_shape.begin(), 1); + } + tensor_impl->Reshape(new_shape); } - - int last = i + 1; - while (i >= 0) - { - strides[i] = strides[last]; - --i; + auto new_shape = cpu_tensor.sizes(); + namespace F = torch::nn::functional; + cpu_tensor = torch::nn::functional::pad( + cpu_tensor, + F::PadFuncOptions( + {0, align_up_tile(new_shape[3]) - new_shape[3], + 0, align_up_tile(new_shape[2]) - new_shape[2]} + ).mode(torch::kConstant)); + + cpu_tensor.unsafeGetTensorImpl()->set_size(2, align_up_tile(new_shape[2])); + cpu_tensor.unsafeGetTensorImpl()->set_size(3, align_up_tile(new_shape[3])); + + int64_t curr_stride = 1; + + for (int i = 3; i >= 0; i--) { + cpu_tensor.unsafeGetTensorImpl()->set_stride(i, curr_stride); + curr_stride *= cpu_tensor.sizes()[i]; } - return tt_PytorchTensorDesc( - tensor.data_ptr(), tensor.element_size(), torch_scalar_type_to_df(tensor.scalar_type()), shape, strides, 4); + tensor = cpu_tensor.to(tt_device); } -void push_tensor( - tt_backend& backend, - PyBudaTensorDesc const& desc, - torch::Tensor const& tensor, - std::string const& info) +std::vector fromIntArrayRef(torch::IntArrayRef arr) { - log_debug( - LogTTDevice, - "Pushing tensor({})[{}][{}] to device[{}]", + std::vector vec; + for (auto i : arr) + vec.push_back(i); + return vec; +} + +runtime::Tensor create_tensor(const torch::Tensor& tensor) +{ + auto data = std::shared_ptr( tensor.data_ptr(), - desc.name, - tensor.scalar_type(), - tensor.device()); - - (void)info; - // if (tensor.device().type() != TT) - // log_fatal( - // LogTTDevice, - // "Tensor is not resident on submitted device (forgot to call tensor.to(\"tt\")?) 
{}: device[{}] {}", - // desc.name, - // tensor.device(), - // info); - - tt_dram_io_desc queue_desc = backend.get_queue_descriptor(desc.name); - backend::translate_addresses(queue_desc); - tt_PytorchTensorDesc tensor_desc = to_pytorch_tensor_desc(tensor); - constexpr int kDefaultTimeoutSec = 10; - constexpr bool push_one = false; - auto status = backend::push_input(queue_desc, tensor_desc, push_one, kDefaultTimeoutSec, desc.ptr); - if (status != DEVICE_STATUS_CODE::Success) - log_fatal(LogTTDevice, "Failed to push tensor: {} {}", desc.name, status); + [tensor](void*) { (void)tensor; } // Capture tensor by value to increase ref count and keep it alive + ); + return runtime::createTensor( + data, + fromIntArrayRef(tensor.sizes()), + fromIntArrayRef(tensor.strides()), + tensor.element_size(), + torch_scalar_type_to_dt(tensor.scalar_type())); } -static torch::Tensor pop_tensor(tt_backend& backend, PyBudaTensorDesc const& desc, tt::balancer::OutputHostTM const& output_host_tm) +template +std::vector asInt64Vec(std::vector const& v) { - log_debug(LogTTDevice, "Popping tensor[{}]", desc.name); - - tt_PytorchTensorDesc tensor_desc; - tt_dram_io_desc queue_desc = backend.get_queue_descriptor(desc.name); - backend::translate_addresses(queue_desc); - queue_desc.hstack_factor = output_host_tm.hstack_factor; - queue_desc.vstack_factor = output_host_tm.vstack_factor; - queue_desc.stack_row_major = output_host_tm.row_major; - - constexpr bool pop_one = false; - int timeout_in_seconds = 600; - - auto status = backend::get_output(queue_desc, tensor_desc, pop_one, timeout_in_seconds, desc.ptr); - if (status != DEVICE_STATUS_CODE::Success) - log_fatal(LogTTDevice, "Failed to get_output: {} {}", desc.name, status); - - // TODO: cannot call on RAM - status = backend::pop_output(queue_desc, pop_one, timeout_in_seconds); - if (status != DEVICE_STATUS_CODE::Success) - log_fatal(LogTTDevice, "Failed to pop_output: {} {}", desc.name, status); - - torch::Tensor ret = from_pytorch_tensor_desc(tensor_desc, desc.shape, free_tt_PytorchTensorDesc); - - return ret; + std::vector result; + result.reserve(v.size()); + for (auto const& i : v) + result.push_back(i); + return result; } std::vector dispatch( - TTDevice const& device, + TTDevice& device, std::shared_ptr workload, - std::vector const& programs, - std::vector const& inputs, - tt::balancer::OutputHostTMMap const& output_host_tms) + int program_idx, + std::vector& inputs, + bool const& is_compile) { - bool expected = false; - if (device.context->initialized.compare_exchange_strong( - expected, true, std::memory_order_relaxed, std::memory_order_relaxed)) - { - backend::initialize_child_process(workload->output_dir); - } - int input_idx = 0; - // if input hasn't been transformed (first time running) we need to transform it now - TTMetaData *input_meta; - std::vector copied_inputs = get_copied_inputs(); - // TT_ASSERT(copied_inputs.size() == inputs.size()); - for (auto const& desc : workload->inputs) + std::vector rt_inputs; + rt_inputs.reserve(workload->inputs.at(program_idx).size()); + for ([[ maybe_unused ]] auto const& desc : workload->inputs.at(program_idx)) { - torch::Tensor const& input = inputs.at(input_idx); + torch::Tensor & input = inputs.at(input_idx); auto impl = input.unsafeGetTensorImpl(); - input_meta = dynamic_cast(impl->get_backend_meta()); + TTMetaData *input_meta = dynamic_cast(impl->get_backend_meta()); + TT_ASSERT (input_meta != nullptr); - if (!input_meta->runtime_transformed) + if (!input_meta->runtime_transformed and 
!input_meta->created_on_device) { - std::string runtime_transform = device.input_runtime_transforms.at(input_idx); - torch::Tensor transformed_input = eval_runtime_transform(input.to(torch::kCPU), runtime_transform, workload->backend->get_queue_descriptor(desc.name)); + std::string runtime_transform = device.input_runtime_transforms.at(program_idx).at(input_idx); + std::vector tile_bcast_dims = device.input_tile_bcast_dims.at(program_idx).at(input_idx); + auto transformed_input = eval_runtime_transform(input.to(torch::kCPU), runtime_transform, tile_bcast_dims); input_meta->runtime_transformed = true; - push_tensor(*workload->backend, desc, transformed_input, fmt::format("input[{}]", input_idx)); + rt_inputs.emplace_back(create_tensor(transformed_input)); } else { - push_tensor(*workload->backend, desc, input, fmt::format("input[{}]", input_idx)); + rt_inputs.emplace_back(create_tensor(input)); } - // TT_ASSERT(copied_inputs.at(input_idx) == input.const_data_ptr(), "Incorrect input pointer, input tensors need to be copied to device in the same order as they'll be consumed"); ++input_idx; } - for (Program const& program : programs) - { - auto status = workload->backend->run_program(program.name, program.parameters); - if (status != DEVICE_STATUS_CODE::Success) - log_fatal(LogTTDevice, "Failed to run_program: {} {}", program.name, status); - } - + runtime::Binary binary(workload->flatbuffer); std::vector outputs; - outputs.reserve(workload->outputs.size()); - for (size_t i = 0; i < workload->outputs.size(); ++i) + std::vector rt_outputs; + std::vector output_descs = binary.getProgramOutputs(program_idx); + outputs.reserve(output_descs.size()); + for (auto const& desc : output_descs) { - PyBudaTensorDesc const& desc = workload->outputs.at(i); - tt::balancer::OutputHostTM output_host_tm = tt::balancer::OutputHostTM(); - if (output_host_tms.count(desc.name)) - output_host_tm = output_host_tms.at(desc.name); - - torch::Tensor output = pop_tensor(*workload->backend, desc, output_host_tm); - std::string runtime_transform = device.output_runtime_transforms.at(i); - register_output_runtime_transform(output, runtime_transform); - outputs.emplace_back(output); + std::vector shape = asInt64Vec(desc.shape); + std::vector stride = asInt64Vec(desc.stride); + outputs.emplace_back(empty_strided(shape, stride, dt_to_torch_scalar_type(desc.dataType))); + rt_outputs.emplace_back(create_tensor(outputs.back())); } - return outputs; -} -std::vector query_available_tt_devices() -{ - static std::shared_ptr context = std::make_shared(); - std::vector d; - if (true)//(env_as("PYBUDA_DEVMODE")) + if (!device.rt_device.has_value()) { - constexpr bool mmio = true; - ARCH arch = env_as("GOLDEN_WORMHOLE_B0") ? 
ARCH::WORMHOLE_B0 : ARCH::GRAYSKULL; - auto desc = backend::get_custom_device_desc(arch, mmio); - d.emplace_back(DEVICE::Golden, arch, desc.soc_desc_yaml, desc.mmio, 0, context); + device.open_device(); } - else + + runtime::Event event = runtime::submit(device.rt_device.value(), binary, program_idx, rt_inputs, rt_outputs); + (void)event; + + // Clear old tensor uids and update with new ones + if (device.subgraph_to_tensor_uid_on_device.count(program_idx) != 0) + device.subgraph_to_tensor_uid_on_device[program_idx].clear(); + + int output_idx = 0; + const auto& subgraph_outputs = workload->outputs.at(program_idx); + for (auto const& output : outputs) { - auto available_devices = backend::get_device_descs_for_available_devices(); - int index = 0; - for (auto desc : available_devices) + PyBudaTensorDesc const& desc = subgraph_outputs.at(output_idx ); + + std::string runtime_transform = device.output_runtime_transforms.at(program_idx).at(output_idx ); + // auto impl = output.unsafeGetTensorImpl(); + // auto output_tensor_uid = dynamic_cast(impl->get_backend_meta())->unique_output_id; + + // if (queue_desc.io_type == IO_TYPE::RandomAccess) { + // register_output_runtime_transform(output, runtime_transform); + // device.subgraph_to_tensor_uid_on_device[program_idx].push_back(output_tensor_uid); + // outputs.emplace_back(output); + // } else { - d.emplace_back(DEVICE::Silicon, desc.arch, desc.soc_desc_yaml, desc.mmio, index++, context); + PyGILState_STATE gstate=PyGILState_Ensure(); + auto tt_device_ = output.device(); + // Move tensor to CPU because torch::narrow is only supported on CPU for now + torch::Tensor cpu_output = output.to( + torch::kCPU, output.scalar_type(), false, true); + register_output_runtime_transform(output, runtime_transform); + + for (size_t i = 0; i < cpu_output.sizes().size(); i++) + { + if (cpu_output.sizes()[i] != desc.shape[i]) { + log_trace(LogTorchDevice, "narrowing dim[{}] start[{}] length[{}]", i, 0, desc.shape[i]); + cpu_output = torch::narrow(cpu_output, i, 0, desc.shape[i]); + } + } + // Move tensor back to TT device + // (TODO: this is a workaround, we should be able to do this without calling contiguous, which makes a copy) + torch::Tensor tt_output_ = cpu_output.contiguous().to( + tt_device_, cpu_output.scalar_type(), false, false/* copy */); + PyGILState_Release(gstate); + outputs.emplace_back(tt_output_); } + ++output_idx; } - - if (d.empty()) - log_fatal(LogTTDevice, "No available devices detected (To run with golden device, set PYBUDA_DEVMODE=1)"); - - log_debug(LogTTDevice, "Available devices:"); - for (int i = 0; i < (int)d.size(); ++i) log_debug(LogTTDevice, " [{}] {} {}", i, d[i].type, d[i].arch); - return d; + return outputs; } -std::string get_device_cluster_yaml(TTDevice const&) { return backend::get_device_cluster_yaml(); } +std::string get_device_cluster_yaml(TTDevice const&) { return "";} //TODO } std::string to_string(TTDevice const& d) { @@ -299,22 +255,21 @@ std::string to_string(TTDevice const& d) torch::Device torch_device(TTDevice const& d) { return torch_device_at_index(d.index); } -TTContext::~TTContext() -{ - if (initialized.load(std::memory_order_relaxed)) - backend::finish_child_process(); -} +TTContext::~TTContext() {;} -torch::Tensor eval_runtime_transform(const torch::Tensor& tensor, std::string transform, tt_dram_io_desc q) +torch::Tensor eval_runtime_transform( + const torch::Tensor& tensor, + std::string transform, + std::vector &tile_bcast_dims) { py::object py_tensor = py::reinterpret_steal(THPVariable_Wrap(tensor)); 
PyGILState_STATE gstate=PyGILState_Ensure(); auto module = py::module_::import("pybuda.tensor"); py::function eval_transform = module.attr("eval_runtime_transform"); - py::object py_result = eval_transform(transform, py_tensor, q); + py::tuple py_result = eval_transform(transform, py_tensor, tile_bcast_dims); PyGILState_Release(gstate); - torch::Tensor torch_tensor = THPVariable_Unpack(static_cast(py_result.ptr())); + torch::Tensor torch_tensor = THPVariable_Unpack(static_cast(py_result[0].ptr())); return torch_tensor; } @@ -325,7 +280,7 @@ torch::Tensor narrow_to_pytorch(const torch::Tensor& tensor, std::string transfo PyGILState_STATE gstate=PyGILState_Ensure(); auto module = py::module_::import("pybuda.tensor"); - py::function eval_transform = module.attr("eval_runtime_transform"); + py::function eval_transform = module.attr("eval_runtime_transform"); //TODO: update py::object py_result = eval_transform(transform, py_tensor); PyGILState_Release(gstate); torch::Tensor torch_tensor = THPVariable_Unpack(static_cast(py_result.ptr())); @@ -339,6 +294,7 @@ bool is_created_on_device(const torch::Tensor& tensor) TT_ASSERT(meta != nullptr); return meta->created_on_device; } + std::vector original_shape(const torch::Tensor& tensor) { auto impl = tensor.unsafeGetTensorImpl(); @@ -351,4 +307,14 @@ std::vector original_shape(const torch::Tensor& tensor) return shape; } +int unique_id(const torch::Tensor& tensor) +{ + auto impl = tensor.unsafeGetTensorImpl(); + TTMetaData* meta = dynamic_cast(impl->get_backend_meta()); + if (meta != nullptr) + return meta->unique_output_id; + + return -1; +} + } // namespace tt diff --git a/pybuda/csrc/tt_torch_device/tt_device.hpp b/pybuda/csrc/tt_torch_device/tt_device.hpp index 03f32104f..e638b908b 100644 --- a/pybuda/csrc/tt_torch_device/tt_device.hpp +++ b/pybuda/csrc/tt_torch_device/tt_device.hpp @@ -12,13 +12,12 @@ #include #include +#include #include -#include "third_party/budabackend/netlist/tt_backend_api_types.hpp" -#include "third_party/budabackend/netlist/tt_backend.hpp" -#include "third_party/budabackend/netlist/tt_backend_api.hpp" - -#include "balancer/output_host_tm_types.hpp" +#include "pybuda/csrc/backend_api/arch_type.hpp" +#include "runtime/tt_device.hpp" +#include "tt/runtime/types.h" #include "utils/assert.hpp" #include "utils/env.hpp" #include "utils/logger.hpp" @@ -33,6 +32,7 @@ struct TTMetaData : public c10::BackendMeta { torch::IntArrayRef original_shape; bool runtime_transformed = false; bool created_on_device = false; + int unique_output_id = -1; }; struct PyBudaTensorDesc @@ -53,87 +53,31 @@ struct PyBudaTensorDesc }; -struct Program -{ - std::string name; - std::map parameters; - - Program(std::string const& name, std::map const& parameters) : - name(name), parameters(parameters) - { - } -}; - -struct CompileRequest -{ - std::string netlist_path; - std::string output_dir; - tt::tt_backend_config backend_config; - std::vector inputs; - std::vector input_runtime_transforms; - std::vector constants; - std::vector parameters; - std::vector outputs; - std::vector output_runtime_transforms; - - CompileRequest( - std::string const& netlist_path, - std::string output_dir, - tt::tt_backend_config const& backend_config, - std::vector const& inputs, - std::vector const& input_runtime_transforms, - std::vector const& constants, - std::vector const& parameters, - std::vector const& outputs, - std::vector const& output_runtime_transforms) : - netlist_path(netlist_path), - output_dir(output_dir), - backend_config(backend_config), - inputs(inputs), - 
input_runtime_transforms(input_runtime_transforms), - constants(constants), - parameters(parameters), - outputs(outputs), - output_runtime_transforms(output_runtime_transforms) - { - } -}; +using Program = int; struct Workload { - std::shared_ptr backend; - std::string output_dir; - std::vector inputs; + std::shared_ptr flatbuffer; + std::map> inputs; std::vector constants; std::vector parameters; - std::vector outputs; + std::map> outputs; bool initialized = false; + std::unordered_map subgraph_link_tensor_populated; Workload( - std::shared_ptr backend, - std::string output_dir, - std::vector const& inputs, + std::shared_ptr flatbuffer, + std::map> const& inputs, // a vector per program std::vector const& constants, std::vector const& parameters, - std::vector const& outputs) : - backend(backend), - output_dir(output_dir), + std::map> const& outputs) : + flatbuffer(flatbuffer), inputs(inputs), constants(constants), parameters(parameters), outputs(outputs) { } - - ~Workload() - { - if (initialized) - { - TT_ASSERT(bool(backend)); - backend->finish(); - } - - } }; struct TTContext @@ -145,62 +89,46 @@ struct TTContext using Fence = std::uint64_t; using ResourceID = std::uint64_t; -// 1to1 mapping of physical devices plugged into this machine and TTDevice -struct TTDevice -{ - DEVICE type; - ARCH arch; - std::string soc_desc_yaml; - bool mmio; - int index; - std::shared_ptr context; - std::vector input_runtime_transforms; - std::vector output_runtime_transforms; - - TTDevice( - DEVICE type, ARCH arch, std::string soc_desc_yaml, bool mmio, int index, std::shared_ptr context) : - type(type), arch(arch), soc_desc_yaml(soc_desc_yaml), mmio(mmio), index(index), context(context) - { - } -}; - using FreePytorchTensorDescFn = void(void*); -torch::Tensor from_pytorch_tensor_desc( - tt_PytorchTensorDesc const& desc, std::vector const& shape, FreePytorchTensorDescFn* free_fn); void register_output_runtime_transform(torch::Tensor const& tensor, std::string transform); void register__ordered_input_runtime_transforms(std::vector input_transforms); std::string get_runtime_transform(torch::Tensor const& tensor); std::vector query_available_tt_devices(); -TTDevice get_default_tt_device(); -std::vector get_available_tt_devices(); +const std::shared_ptr& get_default_tt_device(); +std::vector> get_available_tt_devices(); std::string device_type_name(c10::DeviceType type, bool lower_case = false); torch::Device torch_device_at_index(std::int64_t index); +torch::Tensor empty_strided( + torch::IntArrayRef size, + torch::IntArrayRef stride, + c10::optional dtype, + c10::optional layout = c10::nullopt, + c10::optional device = c10::nullopt, + c10::optional pin_memory = c10::nullopt); -std::vector get_copied_inputs(); -std::shared_ptr compile(TTDevice& device, CompileRequest const& compile_request); -void push_tensor( - tt_backend& backend, - PyBudaTensorDesc const& desc, - torch::Tensor const& tensor, - std::string const& info = ""); std::vector dispatch( - TTDevice const& device, + TTDevice& device, std::shared_ptr workload, - std::vector const& programs, - std::vector const& inputs, - tt::balancer::OutputHostTMMap const& output_host_tms); + int program_idx, + std::vector& inputs, + bool const& is_compile); std::string get_device_cluster_yaml(TTDevice const&); std::string to_string(TTDevice const& d); torch::Device torch_device(TTDevice const& d); -torch::Tensor eval_runtime_transform(const torch::Tensor& tensor, std::string transform, tt_dram_io_desc q); +torch::Tensor eval_runtime_transform(const torch::Tensor& 
tensor, std::string transform, std::vector &tile_bcast_dims); bool is_created_on_device(const torch::Tensor& tensor); +int unique_id(const torch::Tensor& tensor); +torch::Tensor narrow_to_pytorch(const torch::Tensor& tensor, torch::IntArrayRef original_shape); std::vector original_shape(const torch::Tensor& tensor); +std::shared_ptr load_binary_from_file(std::string const& filename); + template inline T align_up_tile(T d) { d -= 1; return static_cast(d - (d % kTileDim) + kTileDim); } + } // namespace tt diff --git a/pybuda/module.mk b/pybuda/module.mk index a2ac49220..647dd59ff 100644 --- a/pybuda/module.mk +++ b/pybuda/module.mk @@ -1,8 +1,8 @@ include pybuda/csrc/module.mk -$(PYTHON_ENV)/lib/$(PYTHON_VERSION)/site-packages/pybuda.egg-link: python_env $(PYTHON_ENV)/lib/$(PYTHON_VERSION)/site-packages/pybuda/_C.so - bash -c "source $(PYTHON_ENV)/bin/activate; cd pybuda; pip install -e ." - touch -r $(PYTHON_ENV)/lib/$(PYTHON_VERSION)/site-packages/pybuda/_C.so $@ +$(PYTHON_ENV_ROOT)/lib/$(PYTHON_VERSION)/site-packages/pybuda.egg-link: $(PYTHON_ENV) $(PYTHON_ENV_ROOT)/lib/$(PYTHON_VERSION)/site-packages/pybuda/_C.so + bash -c "source $(PYTHON_ENV_ROOT)/bin/activate; cd pybuda; pip install -e ." + touch -r $(PYTHON_ENV_ROOT)/lib/$(PYTHON_VERSION)/site-packages/pybuda/_C.so $@ -pybuda: pybuda/csrc $(PYTHON_ENV)/lib/$(PYTHON_VERSION)/site-packages/pybuda.egg-link ; +pybuda: pybuda/csrc $(PYTHON_ENV_ROOT)/lib/$(PYTHON_VERSION)/site-packages/pybuda.egg-link ; diff --git a/pybuda/pybuda/CMakeLists.txt b/pybuda/pybuda/CMakeLists.txt new file mode 100644 index 000000000..33bba1fcf --- /dev/null +++ b/pybuda/pybuda/CMakeLists.txt @@ -0,0 +1,5 @@ +add_custom_target(install_pybuda ALL + COMMAND cd "${CMAKE_SOURCE_DIR}/pybuda" && pip install -e . + COMMENT "Installing pybuda module") + +add_dependencies(install_pybuda run_after_pybuda_csrc) diff --git a/pybuda/pybuda/_C/__init__.pyi b/pybuda/pybuda/_C/__init__.pyi index 2c1734b8d..cbddcae24 100644 --- a/pybuda/pybuda/_C/__init__.pyi +++ b/pybuda/pybuda/_C/__init__.pyi @@ -1,6 +1,7 @@ -from . import autograd as autograd, backend_api as backend_api, balancer as balancer, graph as graph, pattern_matcher as pattern_matcher, scheduler as scheduler, torch_device as torch_device -from typing import ClassVar, Dict, List, Optional, Tuple, Union +from . import autograd as autograd, graph as graph, torch_device as torch_device +from typing import ClassVar +BLACKHOLE: Arch Backward: NodeEpochType Bfp2: DataFormat Bfp2_b: DataFormat @@ -12,12 +13,14 @@ Float16: DataFormat Float16_b: DataFormat Float32: DataFormat Forward: NodeEpochType +GRAYSKULL: Arch HiFi2: MathFidelity HiFi3: MathFidelity HiFi4: MathFidelity Int32: DataFormat Int8: DataFormat Invalid: MathFidelity +JAWBRIDGE: Arch Lf8: DataFormat LoFi: MathFidelity Optimizer: NodeEpochType @@ -26,46 +29,54 @@ RawUInt32: DataFormat RawUInt8: DataFormat UInt16: DataFormat VERSION: int +WORMHOLE: Arch +WORMHOLE_B0: Arch k_dim: int class AMPNodeProperties: - def __init__(self, op_type: Optional[str] = ..., epoch_type: Optional[NodeEpochType] = ..., output_df: Optional[DataFormat] = ..., intermediate_df: Optional[DataFormat] = ..., accumulate_df: Optional[DataFormat] = ..., math_fidelity: Optional[MathFidelity] = ..., name_regex_match: Optional[str] = ..., input_df: Optional[Union[Dict[int, Tuple[DataFormat, bool]], DataFormat]] = ..., is_gradient_op: Optional[bool] = ..., input_parameter_indices_to_optimize: Optional[List[Tuple[int, int]]] = ...) -> None: ... 
+ def __init__(self, op_type: str | None = ..., epoch_type: NodeEpochType | None = ..., output_df: DataFormat | None = ..., intermediate_df: DataFormat | None = ..., accumulate_df: DataFormat | None = ..., math_fidelity: MathFidelity | None = ..., name_regex_match: str | None = ..., input_df: dict[int, tuple[DataFormat, bool]] | DataFormat | None | None = ..., is_gradient_op: bool | None = ..., input_parameter_indices_to_optimize: list[tuple[int, int]] | None = ...) -> None: ... def from_json(self) -> AMPNodeProperties: ... def to_json(self) -> json: ... @property - def accumulate_df(self) -> Optional[DataFormat]: ... + def accumulate_df(self) -> DataFormat | None: ... @property - def epoch_type(self) -> Optional[NodeEpochType]: ... + def epoch_type(self) -> NodeEpochType | None: ... @property - def input_df(self) -> Optional[Union[Dict[int, Tuple[DataFormat, bool]], DataFormat]]: ... + def input_df(self) -> dict[int, tuple[DataFormat, bool]] | DataFormat | None | None: ... @property - def input_parameter_indices_to_optimize(self) -> Optional[List[Tuple[int, int]]]: ... + def input_parameter_indices_to_optimize(self) -> list[tuple[int, int]] | None: ... @property - def intermediate_df(self) -> Optional[DataFormat]: ... + def intermediate_df(self) -> DataFormat | None: ... @property - def is_gradient_op(self) -> Optional[bool]: ... + def is_gradient_op(self) -> bool | None: ... @property - def math_fidelity(self) -> Optional[MathFidelity]: ... + def math_fidelity(self) -> MathFidelity | None: ... @property - def name_regex_match(self) -> Optional[str]: ... + def name_regex_match(self) -> str | None: ... @property - def op_type(self) -> Optional[str]: ... + def op_type(self) -> str | None: ... @property - def output_df(self) -> Optional[DataFormat]: ... + def output_df(self) -> DataFormat | None: ... -class Block: - def __init__(self) -> None: ... - -class Blocks: - def __init__(self) -> None: ... - -class BudaNetlist: - def __init__(self) -> None: ... - def append_comment(self, arg0: str) -> None: ... - def dump_to_yaml(self) -> str: ... - -class BudaNetlistConfig: - def __init__(self) -> None: ... +class Arch: + __members__: ClassVar[dict] = ... # read-only + BLACKHOLE: ClassVar[Arch] = ... + GRAYSKULL: ClassVar[Arch] = ... + Invalid: ClassVar[Arch] = ... + JAWBRIDGE: ClassVar[Arch] = ... + WORMHOLE: ClassVar[Arch] = ... + WORMHOLE_B0: ClassVar[Arch] = ... + __entries: ClassVar[dict] = ... + def __init__(self, value: int) -> None: ... + def __eq__(self, other: object) -> bool: ... + def __hash__(self) -> int: ... + def __index__(self) -> int: ... + def __int__(self) -> int: ... + def __ne__(self, other: object) -> bool: ... + @property + def name(self) -> str: ... + @property + def value(self) -> int: ... class DataFormat: __members__: ClassVar[dict] = ... # read-only @@ -100,16 +111,6 @@ class DataFormat: @property def value(self) -> int: ... -class DramQueueConfigOverride: - def __init__(self, arg0: Optional[int], arg1: Optional[int]) -> None: ... - def from_json(self) -> DramQueueConfigOverride: ... - def to_json(self) -> Dict[str, Optional[int]]: ... - -class InsertionInstruction: - def __init__(self, src: str, dest: str, hoist_tms: bool, input_id: Optional[int] = ..., fork_id: Optional[int] = ..., user_defined: bool = ...) -> None: ... - def insert(self, arg0: graph.Graph) -> None: ... - def unique_id(self) -> Tuple[str, str, int, int, bool]: ... - class MathFidelity: __members__: ClassVar[dict] = ... # read-only HiFi2: ClassVar[MathFidelity] = ... 
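Note (not part of the patch): the hunk above adds an `Arch` enum to the `pybuda._C` stubs. A minimal usage sketch, assuming the extension exports the enum exactly as the stub declares it and with standard pybind11 enum semantics; this snippet is illustrative only and does not appear in the diff.

    # hypothetical illustration of the stubbed Arch enum
    from pybuda._C import Arch

    arch = Arch.WORMHOLE_B0
    print(arch.name, arch.value)        # symbolic name and its integer value
    assert Arch(int(arch)) == arch      # pybind11 enums round-trip through int
    assert arch != Arch.GRAYSKULL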
@@ -148,72 +149,44 @@ class NodeEpochType: @property def value(self) -> int: ... -class NopInsertionInstruction(InsertionInstruction): - def __init__(self, src: str, dest: str, hoist_tms: bool, nop_count: int = ..., input_id: Optional[int] = ..., fork_id: Optional[int] = ..., user_defined: bool = ..., mergeable: bool = ..., daisy_chain: bool = ..., request_merge: bool = ...) -> None: ... - def from_json(self) -> NopInsertionInstruction: ... - def to_json(self) -> Dict[str, Union[str, bool, int, Optional[int]]]: ... - def unique_id(self) -> Tuple[str, str, int, int, bool]: ... - -class PostPlacerConfig: - def __init__(self, device_config: backend_api.DeviceConfig, microbatch_size: int, microbatch_count: int, enable_t_streaming: bool, input_queues_on_host: bool, output_queues_on_host: bool, manual_dram_queue_placement: Dict[str, DramQueueConfigOverride], fork_join_tiles_treshold: int, output_queue_multiplier: int, input_queue_multiplier: int, enable_cross_chip_buffering: bool, placement_algorithm: placer.DRAMPlacementAlgorithm) -> None: ... - -class PostPlacerResults: - def __init__(self, *args, **kwargs) -> None: ... - @property - def allocated_blocks(self) -> List[List[Blocks]]: ... - @property - def current_host_address(self) -> int: ... - @property - def ins_instructions(self) -> Dict[Tuple[str, str, int, int, bool], InsertionInstruction]: ... - @property - def perf_model_results(self) -> Dict[str, float]: ... - -class QueueInsertionInstruction(InsertionInstruction): - def __init__(self, src: str, dest: str, hoist_tms: bool, num_entries: int, queue_size: int, input_id: Optional[int] = ..., fork_id: Optional[int] = ..., user_defined: bool = ...) -> None: ... - def unique_id(self) -> Tuple[str, str, int, int, bool]: ... - class SparseBUDA: def __init__(self, *args, **kwargs) -> None: ... def get_sparse_tile_ptr_bits(self, arg0: int, arg1: int, arg2: int) -> int: ... - def get_sparse_tiles_and_encodings(self, arg0: int) -> Tuple[List[List[float]], List[List[int]], List[int], List[int], List[int]]: ... + def get_sparse_tiles_and_encodings(self, arg0: int) -> tuple[list[list[float]], list[list[int]], list[int], list[int], list[int]]: ... + def get_sparse_ublock_idx_bits(self, arg0: int, arg1: int, arg2: int) -> int: ... @property def bcast_factor(self) -> int: ... @property def sparse_indices(self): ... @property - def sparse_shape(self) -> List[int]: ... + def sparse_shape(self) -> list[int]: ... @property def zdim(self) -> int: ... class SparseCOO: - def __init__(self, rows: List[int], cols: List[int], vals: List[float], shape: List[int]) -> None: ... + def __init__(self, rows: list[int], cols: list[int], vals: list[float], shape: list[int]) -> None: ... @property - def cols(self) -> List[int]: ... + def cols(self) -> list[int]: ... @property - def rows(self) -> List[int]: ... + def rows(self) -> list[int]: ... @property - def shape(self) -> List[int]: ... + def shape(self) -> list[int]: ... @property - def vals(self) -> List[float]: ... + def vals(self) -> list[float]: ... class UnsupportedHWOpsError(Exception): ... -def compress_sparse_tensor_and_strip_info(arg0: List[SparseCOO], arg1: int, arg2: int) -> SparseBUDA: ... -def dump_epoch_id_graphs(graph: graph.Graph, test_name: str, graph_name: str, placer_solution: placer.PlacerSolution, balancer_solution: balancer.BalancerSolution = ...) -> None: ... -def dump_epoch_type_graphs(graph: graph.Graph, test_name: str, graph_name: str, placer_solution: placer.PlacerSolution = ..., balancer_solution: balancer.BalancerSolution = ...) -> None: ... 
-def dump_graph(graph: graph.Graph, test_name: str, graph_name: str, placer_solution: placer.PlacerSolution = ..., balancer_solution: balancer.BalancerSolution = ...) -> None: ... -def is_subset_of_instructions(ins_instructions: Dict[Tuple[str, str, int, int, bool], InsertionInstruction] = ..., previous_instructions: Dict[Tuple[str, str, int, int, bool], InsertionInstruction] = ...) -> Tuple[bool, int, int]: ... -def link_past_cache_ios(arg0: graph.Graph) -> Dict[str, int]: ... -def lower_to_buda_netlist(graph: graph.Graph, graph_name: str, placer_solution: placer.PlacerSolution, balancer_solution: balancer.BalancerSolution, chip_ids: List[int], device_config: backend_api.DeviceConfig, enable_forked_dram_inputs: bool = ...) -> BudaNetlist: ... -def merge_netlists(arg0: List[BudaNetlist]) -> BudaNetlist: ... +def compress_sparse_tensor_and_strip_info(arg0: list[SparseCOO], arg1: int, arg2: int) -> SparseBUDA: ... +def dump_epoch_id_graphs(graph: graph.Graph, test_name: str, graph_name: str) -> None: ... +def dump_epoch_type_graphs(graph: graph.Graph, test_name: str, graph_name: str) -> None: ... +def dump_graph(graph: graph.Graph, test_name: str, graph_name: str) -> None: ... +def link_past_cache_ios(arg0: graph.Graph) -> dict[str, int]: ... def move_index_to_mm_weights(arg0: graph.Graph) -> None: ... def run_consteval_graph_pass(arg0: graph.Graph) -> None: ... -def run_optimization_graph_passes(arg0: graph.Graph, arg1: backend_api.DeviceConfig) -> None: ... -def run_placer_buda_passes(arg0: graph.Graph, arg1: balancer.BalancerConfig, arg2: Dict[str, int], arg3: dict) -> Tuple[balancer.BalancerSolution, bool]: ... -def run_post_autograd_graph_passes(arg0: graph.Graph, arg1: object) -> List[Tuple[int, int]]: ... -def run_post_initial_graph_passes(arg0: graph.Graph, arg1: object, arg2: List[Tuple[List[Tuple[str, List[int], List[int]]], Dict[str, List[int]]]]) -> Tuple[List[Tuple[int, int]], Dict[str, int]]: ... -def run_post_optimize_decompose_graph_passes(arg0: graph.Graph, arg1: object) -> List[Tuple[int, int]]: ... -def run_post_placer_buda_passes(arg0: graph.Graph, arg1: str, arg2: backend_api.DeviceConfig, arg3: placer.PlacerSolution, arg4: PostPlacerConfig, arg5: balancer.BalancerSolution, arg6: Dict[Tuple[str, str, int, int, bool], InsertionInstruction], arg7: List[List[Blocks]], arg8: int) -> PostPlacerResults: ... -def run_pre_lowering_passes(arg0: graph.Graph) -> None: ... -def run_pre_netlist_generation_buda_passes(arg0: graph.Graph, arg1: str, arg2: backend_api.DeviceConfig, arg3: Dict[str, object], arg4: placer.PlacerSolution, arg5: PostPlacerConfig, arg6: balancer.BalancerSolution, arg7: List[List[Blocks]], arg8: int) -> None: ... 
-def run_pre_placer_buda_passes(graph: graph.Graph, scheduler_config: scheduler.SchedulerConfig, device_config: backend_api.DeviceConfig, chip_ids: List[int] = ..., op_names_to_chip_break: List[List[str]] = ..., op_names_to_epoch_break: List[List[str]] = ..., op_names_dont_fuse: List[str] = ..., op_names_manual_fuse: List[str] = ..., fracture_chip_id_assignments: Dict[str, int] = ..., default_df_override: Optional[DataFormat] = ..., df_overrides: Dict[str, DataFormat] = ..., default_accumulate_df: Optional[DataFormat] = ..., enable_broadcast_splitting: bool = ..., fp32_fallback: DataFormat = ..., default_math_fidelity: MathFidelity = ..., enable_auto_fusing: bool = ..., amp_level: int = ..., enable_recompute: bool = ..., output_queues_on_host: bool = ..., ins_instructions: Dict[Tuple[str, str, int, int, bool], InsertionInstruction] = ..., insert_queues: List[Tuple[str, str, int]] = ..., amp_properties=..., op_intermediates_to_save: List[str] = ..., use_interactive_placer: bool = ..., enable_device_tilize: bool = ...) -> Tuple[graph.Graph, placer.PlacerConfigUpdate]: ... +def run_mlir_compiler(arg0: graph.Graph) -> runtime.Binary: ... +def run_optimization_graph_passes(arg0: graph.Graph) -> None: ... +def run_post_autograd_graph_passes(arg0: graph.Graph, arg1: object) -> list[tuple[int, int]]: ... +def run_post_initial_graph_passes(arg0: graph.Graph, arg1: object, arg2: list[tuple[list[tuple[str, list[int], list[int]]], dict[str, list[int]]]]) -> tuple[list[tuple[int, int]], dict[str, int]]: ... +def run_post_optimize_decompose_graph_passes(arg0: graph.Graph, arg1: object) -> list[tuple[int, int]]: ... +def run_pre_lowering_passes(arg0: graph.Graph) -> graph.Graph: ... +def run_pre_placer_buda_passes(graph: graph.Graph, device_config, chip_ids: list[int] = ..., op_names_dont_fuse: list[str] = ..., op_names_manual_fuse: list[str] = ..., fracture_chip_id_assignments: dict[str, int] = ..., default_df_override: DataFormat | None = ..., default_accumulate_df: DataFormat | None = ..., enable_broadcast_splitting: bool = ..., fp32_fallback: DataFormat = ..., default_math_fidelity: MathFidelity = ..., enable_auto_fusing: bool = ..., amp_level: int = ..., enable_recompute: bool = ..., output_queues_on_host: bool = ..., input_queues_on_host: bool = ..., insert_queues: list[tuple[str, str, int]] = ..., amp_properties=..., op_intermediates_to_save: list[str] = ..., use_interactive_placer: bool = ..., enable_device_tilize: bool = ...) -> graph.Graph: ... diff --git a/pybuda/pybuda/_C/autograd.pyi b/pybuda/pybuda/_C/autograd.pyi index ee8a7f069..f53083955 100644 --- a/pybuda/pybuda/_C/autograd.pyi +++ b/pybuda/pybuda/_C/autograd.pyi @@ -1,5 +1,5 @@ import pybuda._C.graph -from typing import List, Union, overload +from typing import overload class AutogradConfig: def __init__(self, recompute: bool = ..., optimizer: object = ...) -> None: ... @@ -10,13 +10,13 @@ class AutogradContext: def constant(self, arg0: int) -> pybuda._C.graph.NodeContext: ... @overload def constant(self, arg0: float) -> pybuda._C.graph.NodeContext: ... - def create_optimizer_op(self, type: str, operands: List[pybuda._C.graph.NodeContext], attributes=...) -> pybuda._C.graph.NodeContext: ... - def get_operands(self, arg0: pybuda._C.graph.NodeContext) -> List[pybuda._C.graph.NodeContext]: ... + def create_optimizer_op(self, type: str, operands: list[pybuda._C.graph.NodeContext], attributes=...) -> pybuda._C.graph.NodeContext: ... + def get_operands(self, arg0: pybuda._C.graph.NodeContext) -> list[pybuda._C.graph.NodeContext]: ... 
def get_pytorch_tensor(self, arg0: pybuda._C.graph.NodeContext) -> object: ... - def get_shape(self, arg0: pybuda._C.graph.NodeContext) -> List[int]: ... + def get_shape(self, arg0: pybuda._C.graph.NodeContext) -> list[int]: ... def input(self, *args, **kwargs): ... def loopback(self, arg0: pybuda._C.graph.NodeContext, arg1: pybuda._C.graph.NodeContext) -> None: ... - def op(self, type: Union[str, object], operands: List[pybuda._C.graph.NodeContext], attributes=...) -> pybuda._C.graph.NodeContext: ... + def op(self, type: str | object, operands: list[pybuda._C.graph.NodeContext], attributes=...) -> pybuda._C.graph.NodeContext: ... def tensor(self, arg0: object) -> pybuda._C.graph.NodeContext: ... class AutogradEngine: diff --git a/pybuda/pybuda/_C/backend_api.pyi b/pybuda/pybuda/_C/backend_api.pyi index f55592ece..37deb4d2e 100644 --- a/pybuda/pybuda/_C/backend_api.pyi +++ b/pybuda/pybuda/_C/backend_api.pyi @@ -36,16 +36,33 @@ class BackendCompileFailure: @property def value(self) -> int: ... -class BackendCompileResult: - device_id: int - extra_size_bytes: int +class BackendBaseCompileResult: + success: bool + failure_type: BackendCompileFailure failure_message: str failure_target: str - failure_type: BackendCompileFailure + +class BackendCompileResultPerEpoch(BackendBaseCompileResult): + device_id: int + temporal_epoch_id: int logical_core_x: int logical_core_y: int - success: bool - temporal_epoch_id: int + maximum_size_bytes: int + allocated_size_bytes: int + extra_size_bytes: int + def __init__(self) -> None: ... + +class BackendFwCompileResult(BackendBaseCompileResult): + def __init__(self) -> None: ... + +class BackendOverlayCompileResult(BackendBaseCompileResult): + failed_compile_results_per_epoch: List[BackendCompileResultPerEpoch] + blob_usage_per_epoch_per_core: dict[int, dict[str, int]] + def __init__(self) -> None: ... + +class BackendCompileResult(BackendBaseCompileResult): + fw_compile_result: BackendBaseCompileResult + overlay_compile_result: BackendOverlayCompileResult def __init__(self) -> None: ... class BackendConfig: @@ -58,8 +75,8 @@ class BackendDevice: __members__: ClassVar[dict] = ... # read-only Grayskull: ClassVar[BackendDevice] = ... Invalid: ClassVar[BackendDevice] = ... - Wormhole: ClassVar[BackendDevice] = ... Wormhole_B0: ClassVar[BackendDevice] = ... + Blackhole: ClassVar[BackendDevice] = ... __entries: ClassVar[dict] = ... def __init__(self, value: int) -> None: ... def from_json(self) -> BackendDevice: ... @@ -107,10 +124,12 @@ class BackendStatusCode: class BackendType: __members__: ClassVar[dict] = ... # read-only - Golden: ClassVar[BackendType] = ... Model: ClassVar[BackendType] = ... - NoBackend: ClassVar[BackendType] = ... + Versim: ClassVar[BackendType] = ... Silicon: ClassVar[BackendType] = ... + Golden: ClassVar[BackendType] = ... + Emulation: ClassVar[BackendType] = ... + NoBackend: ClassVar[BackendType] = ... __entries: ClassVar[dict] = ... def __init__(self, value: int) -> None: ... def from_json(self) -> BackendType: ... @@ -132,11 +151,11 @@ class DeviceConfig: def __init__(self, arg0: str, arg1: str, arg2: str, arg3: str, arg4: str, arg5: bool, arg6: List[int]) -> None: ... @overload def __init__(self, arg0: str, arg1: str, arg2: str, arg3: str, arg4: str, arg5: bool, arg6: List[Tuple[int, int, int, int]]) -> None: ... - def get_ethernet_connections(self) -> Dict[int, Dict[int, Tuple[int, int]]]: ... - def get_harvested_cfg(self) -> List[int]: ... def get_dram_backend_reserved_max(self) -> int: ... 
- def get_host_memory_channel_start_address(self) -> int: ... + def get_ethernet_connections(self) -> Dict[int, Dict[int, Tuple[int, int]]]: ... + def get_harvested_cfg(self) -> Dict[int, int]: ... def get_host_memory_channel_size(self, arg0: int) -> int: ... + def get_host_memory_channel_start_address(self) -> int: ... def get_host_memory_num_channels(self) -> int: ... @property def arch(self) -> BackendDevice: ... @@ -253,10 +272,15 @@ class OpModelDesc: ublock_ct: int ublock_kt: int ublock_rt: int + version: int def __init__(self) -> None: ... class PytorchTensorDesc: + dim: int format: pybuda._C.DataFormat + itemsize: int + shape: List[int[4]] + strides: List[int[4]] @overload def __init__(self) -> None: ... @overload @@ -264,29 +288,36 @@ class PytorchTensorDesc: @overload def __init__(self, arg0: capsule, arg1: int, arg2: pybuda._C.DataFormat, arg3: int, arg4: List[int[4]], arg5: List[int[4]]) -> None: ... def print(self) -> None: ... - @property - def dim(self) -> int: ... - @property - def itemsize(self) -> int: ... - @property - def shape(self) -> List[int[4]]: ... - @property - def strides(self) -> List[int[4]]: ... class StrideDescriptor: stride: int xy_offsets: List[Tuple[int, int]] def __init__(self) -> None: ... -def clear_backend_param_cache(out_dir: str = ...) -> None: ... +class TilizedTensorDesc: + buf_size_bytes: int + format: pybuda._C.DataFormat + num_buffers: int + def __init__(self) -> None: ... + def print(self) -> None: ... + +@overload +def binarize_tensor(arg0: PytorchTensorDesc, arg1: str) -> None: ... +@overload +def binarize_tensor(arg0: TilizedTensorDesc, arg1: str) -> None: ... +def clear_backend_param_cache() -> None: ... +@overload +def debinarize_tensor(arg0: PytorchTensorDesc, arg1: str) -> None: ... +@overload +def debinarize_tensor(arg0: TilizedTensorDesc, arg1: str) -> None: ... def detect_available_silicon_devices(only_detect_mmio: bool = ...) -> List[BackendDevice]: ... def finish_child_process() -> BackendStatusCode: ... @overload def free_tensor(arg0: PytorchTensorDesc) -> BackendStatusCode: ... @overload -def free_tensor(arg0) -> BackendStatusCode: ... +def free_tensor(arg0: TilizedTensorDesc) -> BackendStatusCode: ... def get_custom_device_desc(arch: BackendDevice = ..., mmio: bool = ..., harvesting_mask: int = ..., grid_dim: Tuple[int, int] = ..., out_dir: str = ...) -> BackendDeviceDesc: ... -def get_device_cluster_yaml() -> str: ... +def get_device_cluster_yaml(out_dir: str) -> str: ... def get_device_descs_for_available_devices(out_dir: str = ...) -> List[BackendDeviceDesc]: ... def get_golden_config() -> BackendConfig: ... def get_io_size_in_bytes(data_formati: pybuda._C.DataFormat, is_untilizesd: bool, ublock_ct: int, ublock_rt: int, mblock_m: int, mblock_n: int, t: int, entries: int, tile_height: int = ..., tile_width: int = ...) -> int: ... @@ -295,10 +326,13 @@ def get_op_model_execution_cycles(arg0: OpModelDesc) -> int: ... def get_op_model_param(arg0: OpModelDesc, arg1: str) -> int: ... def get_output(arg0: DramIODesc, arg1: PytorchTensorDesc, arg2: bool, arg3: int, arg4: int) -> BackendStatusCode: ... def initialize_child_process(arg0: str) -> BackendStatusCode: ... +def load_cached_sys_param(arg0: str) -> Dict[str, str]: ... def pop_output(arg0: DramIODesc, arg1: bool, arg2: int) -> BackendStatusCode: ... @overload def push_input(arg0: DramIODesc, arg1: PytorchTensorDesc, arg2: bool, arg3: int, arg4: int) -> BackendStatusCode: ... @overload -def push_input(arg0: DramIODesc, arg1, arg2: int, arg3: int) -> BackendStatusCode: ... 
+def push_input(arg0: DramIODesc, arg1: TilizedTensorDesc, arg2: int, arg3: int) -> BackendStatusCode: ... def release_backend_ptr(arg0: BackendApi) -> None: ... -def translate_addresses(arg0: DramIODesc) -> BackendStatusCode: ... \ No newline at end of file +def tilize_tensor(arg0: DramIODesc, arg1: PytorchTensorDesc) -> TilizedTensorDesc: ... +def translate_addresses(arg0: DramIODesc) -> BackendStatusCode: ... +def get_format_from_string(arg0: str) -> pybuda._C.DataFormat: ... diff --git a/pybuda/pybuda/_C/balancer.pyi b/pybuda/pybuda/_C/balancer.pyi index 087c6d3f1..29651c4cf 100644 --- a/pybuda/pybuda/_C/balancer.pyi +++ b/pybuda/pybuda/_C/balancer.pyi @@ -42,6 +42,7 @@ class BalancerConfig: random_policy_seed: int scheduler_config: pybuda._C.scheduler.SchedulerConfig skip_l1_usage_validation: bool + target_cycles_offset: int use_interactive_placer: bool def __init__(self, device_config, scheduler_config: pybuda._C.scheduler.SchedulerConfig, policy_type: PolicyType = ..., random_policy_seed: int = ..., chip_ids: List[int] = ..., chip_placement_policy: ChipPlacementPolicy = ..., default_dram_parameters: bool = ..., skip_l1_usage_validation: bool = ..., enable_t_streaming: bool = ..., manual_t_streaming: bool = ..., input_queues_on_host: bool = ..., output_queues_on_host: bool = ..., op_overrides: Dict[str, OpOverride] = ..., op_names_to_epoch_break: List[List[str]] = ..., op_names_to_chip_break: List[List[str]] = ..., op_names_to_chip_id_assignment: Dict[str, int] = ..., op_name_to_placer_overrides: Dict[str, pybuda._C.placer.OpOverride] = ..., enable_auto_transposing_placement: bool = ..., graph_solver_self_cut_type: GraphSolverSelfCutType = ..., use_interactive_placer: bool = ..., enable_enumerate_u_kt: bool = ..., enable_single_buffer_fallback: bool = ...) -> None: ... @@ -117,16 +118,27 @@ class FactorizedInt: def max_factor(self) -> int: ... class FusedSubOpModel: - type: str - mblock_m: int - mblock_n: int - ublock_rt: int - ublock_ct: int - mblock_k: int - ublock_kt: int - reduce_dim: str - has_dest_input: bool - has_dest_output: bool + def __init__(self, *args, **kwargs) -> None: ... + @property + def has_dest_input(self) -> bool: ... + @property + def has_dest_output(self) -> bool: ... + @property + def mblock_k(self) -> int: ... + @property + def mblock_m(self) -> int: ... + @property + def mblock_n(self) -> int: ... + @property + def reduce_dim(self) -> str: ... + @property + def type(self) -> str: ... + @property + def ublock_ct(self) -> int: ... + @property + def ublock_kt(self) -> int: ... + @property + def ublock_rt(self) -> int: ... class GraphSolverSelfCutType: __members__: ClassVar[dict] = ... # read-only diff --git a/pybuda/pybuda/_C/graph.pyi b/pybuda/pybuda/_C/graph.pyi index 68b5de2b4..757c7c79d 100644 --- a/pybuda/pybuda/_C/graph.pyi +++ b/pybuda/pybuda/_C/graph.pyi @@ -1,5 +1,5 @@ import pybuda._C -from typing import ClassVar, Dict, Iterator, List, Optional, Tuple, overload +from typing import ClassVar, Iterator, overload C: UBlockOrder Concatenate: RuntimeTensorTransformType @@ -21,45 +21,44 @@ class Graph: def __init__(self, arg0: str) -> None: ... def clone(self) -> Graph: ... def enable_training(self) -> bool: ... - def get_constant_input_runtime_tensor_transform_constants(self) -> List[Tuple[str, object]]: ... - def get_constant_names(self) -> List[str]: ... + def get_constant_input_runtime_tensor_transform_constants(self) -> list[tuple[str, object]]: ... + def get_constant_names(self) -> list[str]: ... def get_constant_nodes(self, *args, **kwargs): ... 
- def get_fused_ops(self) -> List[Tuple[int, List[List[str]]]]: ... def get_input_runtime_tensor_transforms(self, *args, **kwargs): ... def get_microbatch(self) -> int: ... def get_name(self) -> str: ... def get_node_id(self, arg0: str) -> int: ... def get_node_name(self, arg0: int) -> str: ... - def get_ordered_constant_tile_dims(self) -> List[List[int]]: ... - def get_ordered_input_gradient_names(self) -> List[str]: ... - def get_ordered_input_names(self) -> List[str]: ... - def get_ordered_input_requires_grad(self) -> List[bool]: ... - def get_ordered_input_shapes(self) -> List[List[int]]: ... - def get_ordered_input_subgraph_indices(self) -> List[int]: ... - def get_ordered_input_tile_dims(self) -> List[List[int]]: ... - def get_ordered_intermediate_names(self) -> List[str]: ... - def get_ordered_intermediate_shapes(self) -> List[List[int]]: ... - def get_ordered_output_gradient_names(self) -> List[str]: ... - def get_ordered_output_names(self) -> List[str]: ... - def get_ordered_output_requires_grad(self) -> List[bool]: ... - def get_ordered_output_shapes(self) -> List[List[int]]: ... - def get_ordered_output_subgraph_indices(self) -> List[int]: ... - def get_ordered_parameter_tile_dims(self) -> List[List[int]]: ... - def get_ordered_target_names(self) -> List[str]: ... - def get_ordered_target_shapes(self) -> List[List[int]]: ... - def get_ordered_target_subgraph_indices(self) -> List[int]: ... + def get_ordered_constant_tile_dims(self) -> list[list[int]]: ... + def get_ordered_input_gradient_names(self) -> list[str]: ... + def get_ordered_input_names(self) -> list[str]: ... + def get_ordered_input_requires_grad(self) -> list[bool]: ... + def get_ordered_input_shapes(self) -> list[list[int]]: ... + def get_ordered_input_subgraph_indices(self) -> list[int]: ... + def get_ordered_input_tile_dims(self) -> list[list[int]]: ... + def get_ordered_intermediate_names(self) -> list[str]: ... + def get_ordered_intermediate_shapes(self) -> list[list[int]]: ... + def get_ordered_output_gradient_names(self) -> list[str]: ... + def get_ordered_output_names(self) -> list[str]: ... + def get_ordered_output_requires_grad(self) -> list[bool]: ... + def get_ordered_output_shapes(self) -> list[list[int]]: ... + def get_ordered_output_subgraph_indices(self) -> list[int]: ... + def get_ordered_parameter_tile_dims(self) -> list[list[int]]: ... + def get_ordered_target_names(self) -> list[str]: ... + def get_ordered_target_shapes(self) -> list[list[int]]: ... + def get_ordered_target_subgraph_indices(self) -> list[int]: ... def get_output_runtime_tensor_transforms(self, *args, **kwargs): ... def get_parameter_nodes(self, *args, **kwargs): ... def get_subgraph_id_for_node(self, arg0: int) -> int: ... - def get_tile_broadcast_dims_for_bw_input(self, arg0: int) -> List[int]: ... - def get_tile_broadcast_dims_for_input(self, arg0: int) -> List[int]: ... - def get_tile_broadcast_dims_for_target(self, arg0: int) -> List[int]: ... + def get_tile_broadcast_dims_for_bw_input(self, arg0: int) -> list[int]: ... + def get_tile_broadcast_dims_for_input(self, arg0: int) -> list[int]: ... + def get_tile_broadcast_dims_for_target(self, arg0: int) -> list[int]: ... def has_node_with_id(self, arg0: int) -> bool: ... - def nodes(self) -> List[str]: ... + def nodes(self) -> list[str]: ... def output_node_redirected(self) -> bool: ... - def register_module_inputs(self, arg0: List[int]) -> None: ... - def register_module_outputs(self, arg0: List[int], arg1: List[bool]) -> None: ... 
- def register_module_targets(self, arg0: List[int]) -> None: ... + def register_module_inputs(self, module_inputs: list[int], append: bool = ...) -> None: ... + def register_module_outputs(self, module_outputs: list[int], requires_grad: list[bool], append: bool = ...) -> None: ... + def register_module_targets(self, arg0: list[int]) -> None: ... def set_enable_training(self, arg0: bool) -> None: ... def set_microbatch(self, arg0: int) -> None: ... @@ -195,15 +194,15 @@ class Shape: BUDA: ClassVar[Shape.Type] = ... FREE: ClassVar[Shape.Type] = ... def __init__(self, *args, **kwargs) -> None: ... - def as_list(self) -> List[int]: ... - @classmethod - def create(cls, values: List[int]) -> Shape: ... - @classmethod - def create_buda(cls, arg0: List[int], arg1: int, arg2: int) -> Shape: ... - @classmethod - def create_with_type_from_other(cls, other: Shape, values: List[int]) -> Shape: ... - @classmethod - def from_json(cls, arg0: json) -> Shape: ... + def as_list(self) -> list[int]: ... + @staticmethod + def create(values: list[int]) -> Shape: ... + @staticmethod + def create_buda(arg0: list[int], arg1: int, arg2: int) -> Shape: ... + @staticmethod + def create_with_type_from_other(other: Shape, values: list[int]) -> Shape: ... + @staticmethod + def from_json(arg0: json) -> Shape: ... def get_tile_dim(self, *args, **kwargs): ... def get_tile_height(self) -> int: ... def get_tile_width(self) -> int: ... @@ -246,21 +245,22 @@ class UBlockOrder: def value(self) -> int: ... def add_partial_datacopy_edge(arg0: Graph, arg1: int, arg2: int, arg3: int, arg4: int) -> None: ... -def create_activation_input(arg0: Graph, arg1: str, arg2: List[int], arg3: bool, arg4: pybuda._C.DataFormat, arg5: int) -> int: ... +def add_subgraph_io_link_edge(arg0: Graph, arg1: int, arg2: int, arg3: int, arg4: int) -> None: ... +def create_activation_input(arg0: Graph, arg1: str, arg2: list[int], arg3: bool, arg4: pybuda._C.DataFormat, arg5: int) -> int: ... @overload def create_constant_input(arg0: Graph, arg1: str, arg2: float, arg3: pybuda._C.DataFormat, arg4: int) -> int: ... @overload -def create_constant_input(arg0: Graph, arg1: str, arg2: object, arg3: List[int], arg4: pybuda._C.DataFormat, arg5: int) -> int: ... +def create_constant_input(arg0: Graph, arg1: str, arg2: object, arg3: list[int], arg4: pybuda._C.DataFormat, arg5: int) -> int: ... def create_control_edge(arg0: Graph, arg1: int, arg2: int, arg3: int, arg4: int) -> None: ... -def create_data_edge(arg0: Graph, arg1: int, arg2: int, arg3: int, arg4: int, arg5: List[tuple]) -> None: ... -def create_op_node(arg0: Graph, arg1: str, arg2: OpType, arg3: List[int], arg4: pybuda._C.DataFormat, arg5: int) -> int: ... -def create_output(arg0: Graph, arg1: str, arg2: List[int], arg3: pybuda._C.DataFormat, arg4: bool, arg5: int) -> int: ... -def create_parameter_input(arg0: Graph, arg1: str, arg2: List[int], arg3: bool, arg4: pybuda._C.DataFormat, arg5: int) -> int: ... -def create_target_input(arg0: Graph, arg1: str, arg2: List[int], arg3: bool, arg4: pybuda._C.DataFormat, arg5: int) -> int: ... -def eval(graph: Graph, inputs: List[object], parameters: Dict[str, object], tt_device: object, relative_atol: float, pcc: float, intermediate_golden_tensors: Dict[int, object] = ..., losses: List[object] = ..., targets: List[object] = ..., balancer_solution=..., dump_tensors_path: str = ..., allow_modified_shapes: bool = ...) -> Tuple[List[object], Dict[str, object], List[object], Dict[str, object]]: ... 
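Since the Shape constructors above are now exposed as static methods, existing call sites are unchanged; the following is a minimal sketch based only on the stub signatures shown above. The dimension values, and the assumption that create_buda takes the dims followed by tile height and width, are illustrative rather than taken from this diff:

    from pybuda._C.graph import Shape

    # Build shapes from plain Python lists, per the updated list[int] annotations.
    free_shape = Shape.create([1, 1, 32, 32])
    buda_shape = Shape.create_buda([1, 1, 32, 32], 32, 32)  # assumed: dims, tile height, tile width
    assert free_shape.as_list() == [1, 1, 32, 32]
    print(buda_shape.get_tile_height(), buda_shape.get_tile_width())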
+def create_data_edge(arg0: Graph, arg1: int, arg2: int, arg3: int, arg4: int, arg5: list[tuple]) -> None: ... +def create_op_node(arg0: Graph, arg1: str, arg2: OpType, arg3: list[int], arg4: pybuda._C.DataFormat, arg5: int, arg6: dict[str, bool | int | str]) -> int: ... +def create_output(arg0: Graph, arg1: str, arg2: list[int], arg3: pybuda._C.DataFormat, arg4: bool, arg5: int) -> int: ... +def create_parameter_input(arg0: Graph, arg1: str, arg2: list[int], arg3: bool, arg4: pybuda._C.DataFormat, arg5: int) -> int: ... +def create_target_input(arg0: Graph, arg1: str, arg2: list[int], arg3: bool, arg4: pybuda._C.DataFormat, arg5: int) -> int: ... +def eval(graph: Graph, inputs: list[object], parameters: dict[str, object], tt_device: object, relative_atol: float, pcc: float, intermediate_golden_tensors: dict[int, object] = ..., losses: list[object] = ..., targets: list[object] = ..., dump_tensors_path: str = ..., allow_modified_shapes: bool = ...) -> tuple[list[object], dict[str, object], list[object], dict[str, object]]: ... def get_constant_input_value(arg0: Node, arg1: bool) -> object: ... -def get_intermediate_tensors(graph: Graph, inputs: List[object], parameters: Dict[str, object], tt_device: object, relative_atol: float, pcc: float, intermediate_golden_tensors: Dict[int, object] = ..., losses: List[object] = ..., targets: List[object] = ..., balancer_solution=..., dump_tensors_path: str = ..., allow_modified_shapes: bool = ...) -> Dict[str, object]: ... -def get_optimizer_param_info(arg0: Graph, arg1: str) -> List[Tuple[InputNode, str]]: ... -def get_shape_for_node(arg0: Graph, arg1: str) -> List[int]: ... -def record_consteval_operations(arg0: Graph) -> Dict[str, Optional[json]]: ... +def get_intermediate_tensors(graph: Graph, inputs: list[object], parameters: dict[str, object], tt_device: object, relative_atol: float, pcc: float, intermediate_golden_tensors: dict[int, object] = ..., losses: list[object] = ..., targets: list[object] = ..., dump_tensors_path: str = ..., allow_modified_shapes: bool = ...) -> dict[str, object]: ... +def get_optimizer_param_info(arg0: Graph, arg1: str) -> list[tuple[InputNode, str]]: ... +def get_shape_for_node(arg0: Graph, arg1: str) -> list[int]: ... +def record_consteval_operations(arg0: Graph) -> dict[str, json | None]: ... def remove_node(arg0: Graph, arg1: int) -> None: ... diff --git a/pybuda/pybuda/_C/torch_device.pyi b/pybuda/pybuda/_C/torch_device.pyi index b7410b979..b1e9ac4b7 100644 --- a/pybuda/pybuda/_C/torch_device.pyi +++ b/pybuda/pybuda/_C/torch_device.pyi @@ -1,57 +1,48 @@ -import pybuda._C.backend_api -import pybuda._C.balancer +import pybuda._C import torch -from typing import Dict, List, Optional - -class CompileRequest: - def __init__(self, netlist_path: str, output_dir: str, backend_config: pybuda._C.backend_api.BackendConfig, inputs: List[PyBudaTensorDesc], constants: List[PyBudaTensorDesc], parameters: List[PyBudaTensorDesc], outputs: List[PyBudaTensorDesc]) -> None: ... - -class Program: - def __init__(self, name: str, params: Dict[str, str]) -> None: ... class PyBudaTensorDesc: - def __init__(self, name: str, shape: List[int], ptr: int = ..., constant: Optional[torch.Tensor] = ...) -> None: ... + def __init__(self, name: str, shape: list[int], ptr: int = ..., constant: torch.Tensor | None = ...) -> None: ... @property - def constant(self) -> Optional[torch.Tensor]: ... + def constant(self) -> torch.Tensor | None: ... @property def name(self) -> str: ... @property def ptr(self) -> int: ... 
@property - def shape(self) -> List[int]: ... + def shape(self) -> list[int]: ... class TTDevice: def __init__(self, *args, **kwargs) -> None: ... - def compile(self, arg0: CompileRequest) -> Workload: ... - def dispatch(self, arg0: Workload, arg1: List[Program], arg2: List[torch.Tensor], arg3: Dict[str, pybuda._C.balancer.OutputHostTM]) -> List[torch.Tensor]: ... + def dispatch(self, arg0: Workload, arg1: int, arg2: list[torch.Tensor], arg3: bool) -> list[torch.Tensor]: ... def str(self) -> str: ... def torch_device(self) -> torch.device: ... @property - def arch(self) -> pybuda._C.backend_api.BackendDevice: ... + def arch(self) -> pybuda._C.Arch: ... @property def cluster_yaml(self) -> str: ... @property - def index(self) -> int: ... + def input_runtime_transforms(self) -> dict[int, list[str]]: ... @property - def mmio(self) -> bool: ... + def input_tile_bcast_dims(self) -> dict[int, list[list[int]]]: ... @property - def soc_desc_yaml(self) -> str: ... + def mmio(self) -> bool: ... @property - def type(self) -> pybuda._C.backend_api.BackendType: ... + def output_runtime_transforms(self) -> dict[int, list[str]]: ... class Workload: def __init__(self, *args, **kwargs) -> None: ... @property - def backend(self) -> pybuda._C.backend_api.BackendApi: ... - @property - def constants(self) -> List[PyBudaTensorDesc]: ... + def constants(self) -> list[PyBudaTensorDesc]: ... @property - def inputs(self) -> List[PyBudaTensorDesc]: ... + def inputs(self) -> dict[int, list[PyBudaTensorDesc]]: ... @property - def outputs(self) -> List[PyBudaTensorDesc]: ... + def outputs(self) -> dict[int, list[PyBudaTensorDesc]]: ... @property - def parameters(self) -> List[PyBudaTensorDesc]: ... + def parameters(self) -> list[PyBudaTensorDesc]: ... def get_available_devices(*args, **kwargs): ... def get_default_device(*args, **kwargs): ... -def push_tensor(arg0: pybuda._C.backend_api.BackendApi, arg1: PyBudaTensorDesc, arg2: torch.Tensor, arg3: str) -> None: ... +def is_created_on_device(arg0: torch.Tensor) -> bool: ... +def original_shape(arg0: torch.Tensor) -> list[int]: ... +def unique_id(arg0: torch.Tensor) -> int: ... diff --git a/pybuda/pybuda/__init__.py b/pybuda/pybuda/__init__.py index 74c28d6aa..adfefd150 100644 --- a/pybuda/pybuda/__init__.py +++ b/pybuda/pybuda/__init__.py @@ -9,21 +9,13 @@ def set_home_paths(): import pathlib from loguru import logger pybuda_path = pathlib.Path(__file__).parent.parent.resolve() - if os.path.exists(str(pybuda_path) + "/budabackend"): + # deployment path - base_path = str(pybuda_path) - out_path = "." - else: - # DEV path - pybuda_path = pybuda_path.parent.resolve() - assert os.path.exists(str(pybuda_path) + "/third_party/budabackend"), "Can't find budabackend" - base_path = str(pybuda_path) + "/third_party" - out_path = str(base_path) + "/third_party/budabackend/tt_build" + base_path = str(pybuda_path) + out_path = "." 
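The environment checks just below only fill in PYBUDA_HOME, TVM_HOME and BUDA_OUT when they are absent, so values exported before importing pybuda take precedence; a hypothetical override (the paths are invented for illustration):

    import os

    # Must run before importing pybuda, since set_home_paths() only fills in unset variables.
    os.environ["PYBUDA_HOME"] = "/opt/pybuda"
    os.environ["BUDA_OUT"] = "/tmp/tt_build"

    import pybuda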
if "PYBUDA_HOME" not in os.environ: os.environ["PYBUDA_HOME"] = str(pybuda_path) - if "BUDA_HOME" not in os.environ: - os.environ["BUDA_HOME"] = str(base_path) + "/budabackend/" if "TVM_HOME" not in os.environ: os.environ["TVM_HOME"] = str(base_path) + "/tvm" if "BUDA_OUT" not in os.environ: @@ -39,27 +31,30 @@ def set_home_paths(): os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2" os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0" -from .module import Module, PyTorchModule, PyBudaModule, TFModule, TFGraphDefModule, OnnxModule, MXNetModule, JaxModule, TFLiteModule -from .tti import TTDeviceImage -from .device import Device -from .cpudevice import CPUDevice -from .gpudevice import GPUDevice -from .ttdevice import TTDevice -from .ttcluster import TTCluster -from .run import run_inference, run_training, shutdown, initialize_pipeline, run_forward, run_backward, run_optimizer, run_schedulers, run_generate, run_generative_inference, get_parameter_checkpoint, get_parameter_gradients, update_device_parameters, error_raised, get_loss_queue, sync, get_intermediates_queue -from .compile import pybuda_compile -from .torch_compile import compile_torch#, get_default_device, get_available_devices, torch_device +from .module import Module, PyTorchModule, PyBudaModule, TFGraphDefModule, OnnxModule, JaxModule, TFLiteModule +from .compile import pybuda_compile_torch, compile_main as compile +from .torch_compile import compile_torch from .compiled_graph_state import CompiledGraphState -from .config import CompilerConfig, CompileDepth, set_configuration_options, set_epoch_break, set_chip_break, override_op_size, PerfTraceLevel, insert_buffering_nop, override_dram_queue_placement, configure_mixed_precision +from .config import CompilerConfig, CompileDepth, set_configuration_options, set_epoch_break, set_chip_break, override_op_size, PerfTraceLevel, insert_buffering_nop, insert_nop, _internal_insert_fj_buffering_nop, override_dram_queue_placement, configure_mixed_precision from .verify import VerifyConfig from .pybudaglobal import pybuda_reset, set_device_pipeline, is_silicon, get_tenstorrent_device from .parameter import Parameter from .tensor import Tensor, SomeTensor, TensorShape from .optimizers import SGD, Adam, AdamW, LAMB, LARS -from ._C.backend_api import BackendType, BackendDevice from ._C import DataFormat, MathFidelity from ._C import k_dim -from .run.api import detect_available_devices import pybuda.op as op import pybuda.transformers + +# Torch backend registration +# TODO: move this in a separate file / module. 
+from torch._dynamo.backends.registry import _BACKENDS +from torch._dynamo import register_backend + +# register backend with torch: +# - enables backend to be shown when calling torch._dynamo.list_backends() +# - enables torch.compile(model, backend="tt"), where "tt" is the backend name registered below +if "tt" in _BACKENDS: + del _BACKENDS["tt"] +register_backend(compile_torch, "tt") diff --git a/pybuda/pybuda/backend.py b/pybuda/pybuda/backend.py deleted file mode 100644 index af6267df1..000000000 --- a/pybuda/pybuda/backend.py +++ /dev/null @@ -1,484 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# Backend API wrapper - -import threading -import queue -import time -from typing import Optional, List, Tuple, Dict, Union - -import torch -import torch.multiprocessing as mp -from multiprocessing.synchronize import Event as EventClass -from multiprocessing.synchronize import Barrier as BarrierClass - -from loguru import logger - -import pybuda -from .pybudaglobal import TILE_DIM -from pybuda._C import DataFormat -from pybuda._C.backend_api import BackendType, BackendDevice, BackendApi, BackendConfig, DramIODesc, PytorchTensorDesc, TilizedTensorDesc, BackendStatusCode, BackendCompileResult, clear_backend_param_cache, release_backend_ptr, push_input, pop_output, get_output, translate_addresses, free_tensor, DeviceMode, debinarize_tensor -from pybuda._C.graph import Graph, get_constant_input_value, get_optimizer_param_info, RuntimeTensorTransform, RuntimeTensorTransformType -from pybuda._C.balancer import OutputHostTM -from .tensor import Tensor, consteval_input, pytorch_tensor_to_tensor_desc, pad_pytorch_tensor_to_buda, tensor_desc_to_pytorch_tensor, get_device_constant_and_parameters, const_eval_tensor -from .utils import detach_tensors -from .config import PerfTraceLevel -class BackendAPI: - - def __init__(self, - type: BackendType, - device_type: BackendDevice, - device: "TTDevice", - netlist: str, - compiled_graph_state: "CompiledGraphState", - feeder_thread: bool, - shutdown_event: Optional[EventClass], - final_barrier: Optional[BarrierClass], - performance_trace: PerfTraceLevel, - device_mode: DeviceMode = DeviceMode.CompileAndRun, - golden_ignore_df_precision: bool = True, - opt_level: int = 0, - output_dir: str = "tt_build/test_out", - device_descriptor_path: str = "", - cluster_descriptor_path: str = "", - runtime_args: str = ""): - - self.type = type - self.device = device - self.device_type = device_type - self.netlist = netlist - self.compiled_graph_state = compiled_graph_state - self.shutdown_event = shutdown_event - self.final_barrier = final_barrier - self.feeder_thread = None - self.feeder_thread_queue = None - self.cache_zerod = False - self.output_dir = output_dir - - # If set, we'll wait for idle after every program - # It shouldn't be needed, but ok for debug - self.explicit_barrier_between_programs = False - - bcfg = BackendConfig( - self.type, - self.device_type, - device_mode, - opt_level, - output_dir, - device_descriptor_path, - cluster_descriptor_path, - ) - bcfg.set_golden_ignore_df_precision(golden_ignore_df_precision) - bcfg.set_performance_trace_args(performance_trace.get_backend_cfg_string()) - bcfg.set_runtime_args(runtime_args) - - self.be_api = BackendApi(self.netlist, bcfg) - self.compile_result = BackendCompileResult() - if self.be_api.initialize(self.compile_result) != BackendStatusCode.Success: - logger.info(f"Backend compile {self.compile_result.success}, target: {self.compile_result.failure_target}, error type: 
{self.compile_result.failure_type}, error: {self.compile_result.failure_message}\n" - f"target chip id: {self.compile_result.device_id}, target core(x,y): {self.compile_result.logical_core_x} {self.compile_result.logical_core_y}, temporal epoch id: {self.compile_result.temporal_epoch_id}\n" - f"requires extra size bytes: {self.compile_result.extra_size_bytes}\n") - self.shutdown() - raise RuntimeError(f"Backend compile failed: {self.compile_result.failure_type}") - - # Create and start a feeder thread, if requested - if feeder_thread: - self.feeder_thread_queue = queue.Queue() - self.feeder_thread = threading.Thread(target=self.feeder_thread_main, args=(self.feeder_thread_queue,)) - self.feeder_thread.start() - - def shutdown(self): - """ - Shutdown the device - """ - if self.feeder_thread_queue: - self.feeder_thread_queue.put("quit") - self.feeder_thread.join() - self.feeder_thread= None - self.feeder_thread_queue = None - - if self.be_api: - clear_backend_param_cache() - self.be_api.finish() - release_backend_ptr(self.be_api) - self.be_api = None - - def sync(self): - """ - Wait until device is idle, and queued up commands have completed - """ - if self.feeder_thread_queue: - while not self.feeder_thread_queue.empty(): - time.sleep(0.01) - if self.shutdown_event and self.shutdown_event.is_set(): - return - assert self.be_api - assert self.be_api.wait_for_idle() == BackendStatusCode.Success, "Failed while waiting for device to go idle" - - def feeder_thread_main(self, cmdqueue: queue.Queue): - """ - Run in a loop, reading commands and executing them, until quit has been received, or an exception occured - """ - logger.info("Feeder thread on {} starting", self) - while True: - while True: - try: - cmd = cmdqueue.get(timeout=0.1) - break - except queue.Empty as _: - if self.shutdown_event is not None and self.shutdown_event.is_set(): - logger.debug("Ending feeder thread on {} due to shutdown event", self) - if self.final_barrier is not None: - self.final_barrier.abort() - return # got a signal to shutdown and end the process - continue - - if cmd == "token": - continue - else: - params = [] - if isinstance(cmd, tuple): - params = cmd[1:] - cmd = cmd[0] - - logger.debug("Run feeder thread cmd: {}", cmd) - if cmd == "fwd": - self._run_forward(loop_count=params[0]) - elif cmd == "gen": - self._run_generate(write_index=params[0], inner_loop_count=params[1], inner_increment=params[2], outer_loop_count=params[3], outer_increment=params[4]) - elif cmd == "bwd": - assert len(params) == 1 - self._run_backward(zero_grad=False, loop_count=params[0]) - elif cmd == "bwd_zero_grad": - assert len(params) == 1 - self._run_backward(zero_grad=True, loop_count=params[0]) - elif cmd == "opt": - self._run_optimizer() - elif cmd == "sch": - self._step_schedulers() - elif cmd == "quit": - break - else: - raise RuntimeError(f"Invalid feeder thread command: {cmd}") - - def schedule_run_forward(self, loop_count: int): - if self.feeder_thread_queue: - self._schedule_feeder_cmd(("fwd", loop_count)) - else: - self._run_forward(loop_count) - - def schedule_run_generate(self, write_index: int, inner_loop_count: int, inner_increment: int, outer_loop_count: int, outer_increment: int): - if self.feeder_thread_queue: - self._schedule_feeder_cmd(("gen", write_index, inner_loop_count, inner_increment, outer_loop_count, outer_increment)) - else: - self._run_generate(write_index, inner_loop_count, inner_increment, outer_loop_count, outer_increment) - - def schedule_run_backward(self, loop_count: int, zero_grad: bool): - if 
self.feeder_thread_queue: - name = "bwd_zero_grad" if zero_grad else "bwd" - self._schedule_feeder_cmd((name, loop_count)) - else: - self._run_backward(zero_grad=zero_grad, loop_count=loop_count) - - def schedule_run_optimizer(self): - if self.feeder_thread_queue: - self._schedule_feeder_cmd("opt") - else: - self._run_optimizer() - - def schedule_run_schedulers(self, device): - if self.feeder_thread_queue: - self._schedule_feeder_cmd("sch") - else: - self._step_schedulers() - - def _schedule_feeder_cmd(self, item): - assert self.feeder_thread_queue - self.feeder_thread_queue.put(item) - self.feeder_thread_queue.put("token") - - def _run_forward(self, loop_count: int): - assert self.be_api - params = { - "$p_loop_count": str(loop_count) - } - assert self.be_api.run_program("run_fwd_" + f"{self.device.get_active_subgraph()}", params) == BackendStatusCode.Success, "Failed while running fwd program" - if self.explicit_barrier_between_programs: - assert self.be_api.wait_for_idle() == BackendStatusCode.Success, "Failed while waiting for idle" - - def _run_generate(self, write_index, inner_loop_count, inner_increment, outer_loop_count, outer_increment): - assert self.be_api - if not self.cache_zerod: - zero_cache = True - self.cache_zerod = True - else: - zero_cache = False - params = { - "$p_cache_write_index": str(write_index), - "$p_inner_loop_count": str(inner_loop_count), - "$p_inner_increment": str(inner_increment), - "$p_outer_loop_count": str(outer_loop_count), - "$p_outer_increment": str(outer_increment), - } - assert self.be_api.run_program("run_fwd_" + f"{self.device.get_active_subgraph()}", params) == BackendStatusCode.Success, "Failed while running fwd program" - if self.explicit_barrier_between_programs: - assert self.be_api.wait_for_idle() == BackendStatusCode.Success, "Failed while waiting for idle" - - def _run_backward(self, zero_grad: bool, loop_count: int): - assert self.be_api - params = { - "$p_zero_grad" : "True" if zero_grad else "False", - "$p_loop_count": str(loop_count) - } - logger.info("run_backward: zero_grad={}", zero_grad) - assert self.be_api.run_program("run_bwd_" + f"{self.device.get_active_subgraph()}", params) == BackendStatusCode.Success, "Failed while running bwd program" - if self.explicit_barrier_between_programs: - assert self.be_api.wait_for_idle() == BackendStatusCode.Success, "Failed while waiting for idle" - - def _run_optimizer(self): - assert self.be_api - assert self.be_api.run_program("run_opt_" + f"{self.device.get_active_subgraph()}", {}) == BackendStatusCode.Success, "Failed while running opt program" - if self.explicit_barrier_between_programs: - assert self.be_api.wait_for_idle() == BackendStatusCode.Success, "Failed while waiting for idle" - - def _step_schedulers(self): - if hasattr(self.device, "scheduler") and self.device.scheduler is not None: - self.device.scheduler.step() - self.push_optimizer_parameters(translate=True, only_scheduler_params=True) - else: - self.device._step_schedulers() - - @classmethod - def _capture_tensor(cls, desc: PytorchTensorDesc, q: DramIODesc): - if not hasattr(cls, "should_capture_tensors"): - cls.should_capture_tensors = pybuda.ci.capture_tensors() - - if not cls.should_capture_tensors: - return - - base = pybuda.ci.get_netlist_dir() - assert isinstance(desc, PytorchTensorDesc) - tensor = Tensor.create_from_tensor_descriptor(desc).value() - path = f"{base}/{q.name}" - assert tensor.shape[0] == q.input_count - for entry in range(q.input_count): - pybuda.op.eval.common.dump_tensor(tensor[entry], path, 
entry=entry) - - def get_output_queue_descriptor(self, output_name) -> DramIODesc: - desc = self.be_api.get_queue_descriptor(output_name) - assert translate_addresses(desc) == BackendStatusCode.Success, f"Failed to translate addresses: {desc.name}" - if output_name in self.compiled_graph_state.output_host_tms: - tm = self.compiled_graph_state.output_host_tms[output_name] - desc.hstack_factor = tm.hstack_factor - desc.vstack_factor = tm.vstack_factor - desc.stack_row_major = tm.row_major - return desc - - def get_ordered_output_queues(self) -> List[DramIODesc]: - assert self.be_api - ordered_output_queues: List[DramIODesc] = [] - ordered_outputs = self.compiled_graph_state.ordered_output_names - for output_name in ordered_outputs: - ordered_output_queues.append(self.get_output_queue_descriptor(output_name)) - return ordered_output_queues - - def get_ordered_bw_output_queues(self) -> List[DramIODesc]: - """ - For each input, find the queue that holds its gradients (if requires_grad was set), and return its descriptor - """ - ordered_bw_output_queues: List[DramIODesc] = [] - ordered_input_gradients = self.compiled_graph_state.ordered_input_gradient_names - for output_name in ordered_input_gradients: - ordered_bw_output_queues.append(self.get_output_queue_descriptor(output_name)) - return ordered_bw_output_queues - - def get_intermediate_activation_queues(self) -> List[DramIODesc]: - assert self.be_api - return [self.get_output_queue_descriptor(output_queue_name) for op_name, output_queue_name in self.compiled_graph_state.ordered_intermediate_activation_names] - - @classmethod - def read_queues( - cls, - queues: List[DramIODesc], - original_shapes: List[Tuple[int, ...]], - runtime_tensor_transforms: Optional[List[RuntimeTensorTransform]], - requires_grad: List[bool], - single_output: bool, - rd_ptr: int = -1, - shutdown_event: Optional[EventClass] = None, - clone: bool = False, - has_microbatch_dim: bool = True - ) -> List[Tensor]: - ret = [] - tensors = [] - out_descs = [] - if runtime_tensor_transforms is None: - runtime_tensor_transforms = [None] * len(queues) - for i, outq in enumerate(queues): - logger.debug("Reading output queue {}", outq.name) - out_desc = PytorchTensorDesc() - timeout = 10 # TODO: add control - resp = BackendStatusCode.RuntimeError - for _ in range(timeout): - resp = get_output(outq, out_desc, single_output, 1, rd_ptr) - if resp != BackendStatusCode.TimeoutError: - break - - if shutdown_event and shutdown_event.is_set(): - break - - if resp == BackendStatusCode.TimeoutError: - shutdown_event.set() - raise RuntimeError("Timeout while reading " + outq.name) - - assert resp == BackendStatusCode.Success, "Error while reading output" - cls._capture_tensor(out_desc, outq) - tensors.append(Tensor.create_from_tensor_descriptor(out_desc)) - out_descs.append(out_desc) - - concat_transforms = [transform for transform in runtime_tensor_transforms if transform is not None and transform.type == RuntimeTensorTransformType.Concatenate] - if len(concat_transforms) > 0: - def get_index(transform): - return transform.concat_index - for i in range(len(concat_transforms) // 2): - group = [transform for transform in concat_transforms if transform.concat_group == i] - group.sort(key=get_index) - tensors_to_concat = [tensors[runtime_tensor_transforms.index(transform)] for transform in group] - - inserted = False - out_descs_to_remove = [] - for i in range(len(tensors)): - if (tensors[i]) in tensors_to_concat: - if not inserted: - tensor = 
Tensor.create_from_tensor_descriptor(pytorch_tensor_to_tensor_desc(torch.cat([t.value() for t in tensors_to_concat], dim=group[0].concat_dim))) - tensors[i] = tensor - inserted = True - elif clone: - out_descs_to_remove.append(out_descs[i]) - free_tensor(out_descs[i]) - for out_desc in out_descs_to_remove: - out_descs.remove(out_desc) - tensors = [tensor for tensor in tensors if tensor not in tensors_to_concat] - - - assert len(original_shapes) == len(tensors) - assert len(requires_grad) == len(tensors) - for i, tensor in enumerate(tensors): - tensor = tensors[i].narrow_to_original_shape(tuple(original_shapes[i]), runtime_tensor_transforms[i].reinterpreted_shape.as_list() if runtime_tensor_transforms[i] is not None else None, \ - has_microbatch_dim=has_microbatch_dim, unpadded_shape=runtime_tensor_transforms[i].unpadded_shape.as_list() if runtime_tensor_transforms[i] is not None else None) - - if requires_grad[i]: - tensor = tensor.detach() - tensor.set_requires_grad(True) - - if clone: - tensor = tensor.clone() - free_tensor(out_descs[i]) - ret.append(tensor) - logger.debug("Done reading queues") - return ret - - @classmethod - def pop_queues(cls, queues: List[DramIODesc], single_output: bool): - for outq in queues: - logger.debug("Popping from queue {}", outq.name) - assert pop_output(outq, single_output, 1) == BackendStatusCode.Success, "Error while popping output" - - def _get_ordered_queues(self, names: List[str]) -> List[DramIODesc]: - ordered_queues = [] - for i, name in enumerate(names): - desc = self.be_api.get_queue_descriptor(name) - assert translate_addresses(desc) == BackendStatusCode.Success, f"Failed to translate addresses: {desc.name}" - ordered_queues.append(desc) - return ordered_queues - - def get_ordered_input_queues(self) -> List[DramIODesc]: - return self._get_ordered_queues(self.compiled_graph_state.ordered_input_names) - - def get_ordered_bw_input_queues(self) -> List[DramIODesc]: - return self._get_ordered_queues(self.compiled_graph_state.ordered_output_gradient_names) - - def get_ordered_target_queues(self) -> List[DramIODesc]: - return self._get_ordered_queues(self.compiled_graph_state.ordered_target_names) - - @classmethod - def push_input(cls, queue_desc: DramIODesc, tensor_desc: Union[PytorchTensorDesc, TilizedTensorDesc], single_input: bool = True, timeout_secs: int = 1, ram_address: int = 0): - if isinstance(tensor_desc, TilizedTensorDesc): - assert push_input(queue_desc, tensor_desc, timeout_secs, ram_address) == BackendStatusCode.Success, "Error while pushing tilized inputs" - else: - assert push_input(queue_desc, tensor_desc, single_input, timeout_secs, ram_address) == BackendStatusCode.Success, "Error while pushing inputs" - - @classmethod - def push_to_queues(cls, ordered_input_queues: List[DramIODesc], tensors: List[PytorchTensorDesc], single_input: bool): - assert len(tensors) == len(ordered_input_queues), "Incorrect number of tensors provided on input" - for i, inq in enumerate(ordered_input_queues): - logger.debug("Pushing to queue {}", inq.name) - logger.trace(tensors[i].shape) - logger.trace(tensor_desc_to_pytorch_tensor(tensors[i])) - cls._capture_tensor(tensors[i], inq) - BackendAPI.push_input(inq, tensors[i], single_input, 1, -1) == BackendStatusCode.Success, "Error while pushing inputs" - - def update_device_paramaters(self, parameter_values: Dict[str, torch.Tensor]): - """ - Push new parameter values to the device - """ - device_constants_and_parameters = get_device_constant_and_parameters(self.device, 
updated_parameter_values=parameter_values) - for parameter_name in self.compiled_graph_state.ordered_parameter_node_names: - pq = self.be_api.get_queue_descriptor(parameter_name) - assert translate_addresses(pq) == BackendStatusCode.Success, f"Failed to translate addresses: {pq.name}" - logger.debug("Pushing to parameter {}", pq.name) - value = const_eval_tensor(device_constants_and_parameters, self.compiled_graph_state.consteval_trace, self.compiled_graph_state.parameter_to_tile_dims, parameter_name) - value = detach_tensors([value], fix_non_contiguos=True)[0] - BackendAPI.push_input(pq, pytorch_tensor_to_tensor_desc(value), True, 1, 0) == BackendStatusCode.Success - - def push_constants_and_parameters(self, translate: bool = False): - # Push constants - assert self.be_api - - for constant_name in self.compiled_graph_state.ordered_constant_node_names: - inq = self.be_api.get_queue_descriptor(constant_name) - if translate: - assert translate_addresses(inq) == BackendStatusCode.Success, f"Failed to translate addresses: {inq.name}" - value = self.compiled_graph_state.get_constant_tensor(constant_name) - logger.debug("Pushing to constant {}", inq.name) - df = None - if inq.data_format == DataFormat.RawUInt32: - df = DataFormat.RawUInt32 - BackendAPI.push_input(inq, pytorch_tensor_to_tensor_desc(value, df), True, 1, -1) == BackendStatusCode.Success - - # Push parameters - for parameter_name in self.compiled_graph_state.ordered_parameter_node_names: - pq = self.be_api.get_queue_descriptor(parameter_name) - if translate: - assert translate_addresses(pq) == BackendStatusCode.Success, f"Failed to translate addresses: {pq.name}" - logger.debug("Pushing to parameter {}", pq.name) - value = self.compiled_graph_state.get_parameter_tensor(parameter_name) - BackendAPI.push_input(pq, pytorch_tensor_to_tensor_desc(value), True, 1, 0) == BackendStatusCode.Success - - def push_optimizer_parameters(self, translate: bool = False, only_scheduler_params: bool = False): - assert self.be_api - if only_scheduler_params: - params_to_push = self.device.get_scheduler_params(is_buda=True) - else: - params_to_push = self.device.get_optimizer_params(is_buda=True) - - for param_name, opt_params in params_to_push.items(): - for input_name, param_key in self.compiled_graph_state.optimizer_param_info[param_name]: - if param_key not in opt_params: - # If only_scheduler_params, opt_params contains subset of param keys - continue - tensor = opt_params[param_key] - assert tensor is not None, f"Optimizer parameter tensor missing for {param_name} / {param_key}" - - opq = self.be_api.get_queue_descriptor(input_name) - if translate: - assert translate_addresses(opq) == BackendStatusCode.Success, f"Failed to translate addresses: {opq.name}" - - value = const_eval_tensor({input_name: tensor.value()}, self.compiled_graph_state.consteval_trace, self.compiled_graph_state.parameter_to_tile_dims, input_name) - value = detach_tensors([value], fix_non_contiguos=True)[0] - logger.debug("Pushing to optimizer parameter {}", opq.name) - BackendAPI.push_input(opq, pytorch_tensor_to_tensor_desc(value), True, 1, 0) == BackendStatusCode.Success - - diff --git a/pybuda/pybuda/capture_fx_graph.py b/pybuda/pybuda/capture_fx_graph.py deleted file mode 100644 index 79fa95b96..000000000 --- a/pybuda/pybuda/capture_fx_graph.py +++ /dev/null @@ -1,441 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 - -from loguru import logger - -import torch - -from pybuda.config import _get_global_compiler_config 
-import pybuda -from pybuda.tensor import to_buda_tensors, to_pt_tensors -from pybuda.tvm_utils import flatten_inputs - -from pybuda._C.graph import Graph, create_op_node, create_data_edge, create_parameter_input, create_activation_input, create_output, create_constant_input, create_target_input, add_partial_datacopy_edge, RuntimeTensorTransform, RuntimeTensorTransformType, Shape, OpType - -from pybuda.tensor import pytorch_dtype_to_buda_dataformat -import os -import sys -import math - -from typing import List -from pybuda.tvm_to_python import Operation -from pybuda.python_codegen import PyTorchWriter, PyBudaWriter, PythonWriter - -class PyBudaNode: - def __init__(self, op: OpType, args: List[torch.fx.node.Node]): - self.op = op - self.args = args - self.shape = None - self.dtype = None - self.wrap_tuple = None - -def process_dummy_no_attr(node, pybuda_op_name): - return PyBudaNode(OpType(pybuda_op_name, []), node.args) - -def process_dummy_attr_in_args(node, pybuda_op_name): - attrs = node.args[1] if len(node.args) == 2 else node.args[1:] - if not isinstance(attrs, (list, tuple)): - attrs = [attrs, ] - return PyBudaNode(OpType(pybuda_op_name, attrs), [node.args[0], ]) - -def process_expand(node, pybuda_op_name): - return PyBudaNode(OpType(pybuda_op_name, []), [node.args[0], ]) - -def process_flatten(node, pybuda_op_name): - return PyBudaNode(OpType(pybuda_op_name, [-1, ]), [node.args[0], ]) - -def process_gelu(node, pybuda_op_name): - return PyBudaNode(OpType(pybuda_op_name, ["none", ]), node.args) - -def process_getitem(node, pybuda_op_name): - breakpoint() - num_dims = sum([(isinstance(dim, slice) and (dim.start is not None or dim.stop is not None)) or (not isinstance(dim, slice) and dim is not None) for dim in node.args[1]]) - if num_dims == 0: - return PyBudaNode(OpType("nop", []), [node.args[0], ]) - assert num_dims <= 1, "TODO: Support multi axis getitem" - for dim, slice_index in enumerate(node.args[1]): - if isinstance(slice_index, slice) and slice_index.start is None and slice_index.stop is None: - continue - if isinstance(slice_index, int): - start = slice_index - stop = None - stride = 1 - else: - start = slice_index.start - stop = slice_index.stop - if slice_index.step is not None: - stride = slice_index.step - else: - stride = 1 - - if stop is None: - stop = start + 1 - if stop < 0: - stop += node.args[0].meta['tensor_meta'].shape[dim] - - return PyBudaNode(OpType(pybuda_op_name, [dim, start, stop, stride]), [node.args[0], ]) - -def process_transpose(node, pybuda_op_name): - torch_op_name = node.target.__name__ - if torch_op_name == "permute": - dim0 = None - dim1 = None - for i, arg in enumerate(node.args[1]): - if arg != i: - if dim0 is None: - dim0 = i - elif dim1 is None: - dim1 = i - else: - assert False, "Multi axis permute needs to be added to pybuda" - - elif torch_op_name == "transpose": - dim0 = node.args[1] - dim1 = node.args[2] - - dims = len(node.args[0].meta['tensor_meta'].shape) - if dim0 > 0: - dim0 -= dims - if dim1 > 0: - dim1 -= dims - if dim0 > dim1: - dim0, dim1 = dim1, dim0 - - named_attrs = {"dim0": dim0, "dim1": dim1, "z_dim_slice": -1} - - return PyBudaNode(OpType(pybuda_op_name, named_attrs=named_attrs), [node.args[0], ]) - -def process_softmax(node, pybuda_op_name): - if len(node.args) == 1: - assert "dim" in node.kwargs, "dim must be specified" - dim = node.kwargs["dim"] - else: - dim = node.args[1] - - if dim >= 0: - dim -= len(node.args[0].meta['tensor_meta'].shape) - stable = 1 - attrs = [dim, stable] - return PyBudaNode(OpType(pybuda_op_name, 
attrs), [node.args[0], ]) - -def process_matmul(node, pybuda_op_name): - assert len(node.args) == 2 or len(node.args) == 3 - if len(node.args) == 3: - # Torch addmm inputs are bias, LHS, RHS - args = [node.args[1], node.args[2], node.args[0]] - else: - args = node.args - - return PyBudaNode(OpType(pybuda_op_name, []), args) - -def process_embedding(node, pybuda_op_name): - assert len(node.args) == 2 or len(node.args) == 3 - - #TODO Handle padding index (arg 2) - args = [node.args[0], node.args[1]] - return PyBudaNode(OpType(pybuda_op_name, []), args) - -def process_layernorm(node, pybuda_op_name): - assert len(node.args) == 5 - dim = -1 - epsilon = node.args[4] - attrs = [dim, epsilon] - - args = [node.args[0], node.args[2], node.args[3]] - pybuda_node = PyBudaNode(OpType(pybuda_op_name, attrs), args) - pybuda_node.shape = node.meta['tensor_meta'][0].shape - pybuda_node.dtype = pytorch_dtype_to_buda_dataformat(node.meta['tensor_meta'][0].dtype) - pybuda_node.wrap_tuple = True - return pybuda_node - -def process_select(node, pybuda_op_name): - assert len(node.args) == 3 - - dim = node.args[1] - if dim >= 0: - dim -= len(node.args[0].meta['tensor_meta'].shape) - index = node.args[2] - attrs = [dim, index, index+1, 1] - args = [node.args[0], ] - return PyBudaNode(OpType(pybuda_op_name, attrs), args) - -def process_slice(node, pybuda_op_name): - assert len(node.args) == 4 - - dim = node.args[1] - start = node.args[2] - end = node.args[3] - if dim >= 0: - dim -= len(node.args[0].meta['tensor_meta'].shape) - if start == 0 and end == sys.maxsize: - pybuda_node = PyBudaNode(OpType("nop", []), [node.args[0], ]) - else: - stride = 1 - attrs = [dim, start, end, stride] - args = [node.args[0], ] - pybuda_node = PyBudaNode(OpType(pybuda_op_name, attrs), args) - return pybuda_node - -def process_usqueeze(node, pybuda_op_name): - assert len(node.args) == 2 - dim = node.args[1] - input_ndim = len(node.meta['tensor_meta'].shape) - - if dim >= 0: - dim -= input_ndim - - attrs = [dim, input_ndim] - return PyBudaNode(OpType(pybuda_op_name, attrs), [node.args[0], ]) - -def process_reshape(node, pybuda_op_name): - attrs = node.args[1].copy() if len(node.args) == 2 else node.args[1:].copy() - if not isinstance(attrs, (list, tuple)): - attrs = [attrs, ] - - input_volume = 1 - for dim in node.args[0].meta['tensor_meta'].shape: - input_volume *= dim - - blank_index = None - reshape_volume = 1 - for i, dim in enumerate(attrs): - if dim == -1: - assert blank_index is None, "Only one dimension can be -1" - blank_index = i - else: - reshape_volume *= dim - - if blank_index is not None: - attrs[blank_index] = input_volume//reshape_volume - - input_volume = node.args[0].meta['tensor_meta'].shape[0] - return PyBudaNode(OpType(pybuda_op_name, attrs), [node.args[0], ]) - -def process_power(node, pybuda_op_name): - if isinstance(node.args[1], int) or isinstance(node.args[1], float) and math.isclose(node.args[1] / int(node.args[1]), 1.0): - attrs = [int(node.args[1]), ] - pybuda_node = PyBudaNode(OpType("pow", attrs), [node.args[0], ]) - else: - pybuda_node = PyBudaNode(OpType("power", []), node.args) - return pybuda_node - -def process_cat(node, pybuda_op_name): - dim = node.args[1] - if dim >= 0: - dim -= len(node.meta['tensor_meta'].shape) - pybuda_node = PyBudaNode(OpType(pybuda_op_name, [dim, ]), node.args[0]) - return pybuda_node - - -dynamo_to_pybuda_function = { - "_softmax" : (process_softmax, "softmax"), - "add" : (process_dummy_no_attr, "add"), - "addmm" : (process_matmul, "matmul"), - "bmm" : (process_matmul, 
"matmul"), - "cat" : (process_cat, "concatenate"), - "clone" : (process_dummy_no_attr, "nop"), - "contiguous" : (process_dummy_no_attr, "nop"), - "div" : (process_matmul, "divide"), - "embedding" : (process_embedding, "embedding"), - "expand" : (process_expand, "nop"), - "flatten" : (process_flatten, "reshape"), - "gelu" : (process_gelu, "gelu"), - "getitem" : (process_getitem, "index"), - "iadd" : (process_dummy_no_attr, "add"), - "matmul" : (process_dummy_no_attr, "matmul"), - "mm" : (process_matmul, "matmul"), - "mul" : (process_dummy_no_attr, "multiply"), - "native_layer_norm" : (process_layernorm, "layernorm"), - "permute" : (process_transpose, "transpose"), - "select" : (process_select, "index"), - "slice" : (process_slice, "index"), - "softmax" : (process_softmax, "softmax"), - "sub" : (process_dummy_no_attr, "subtract"), - "tanh" : (process_dummy_no_attr, "tanh"), - "to" : (process_dummy_no_attr, "nop"), #TODO - "_to_copy" : (process_dummy_no_attr, "nop"), #TODO - "transpose" : (process_transpose, "transpose"), - "truediv" : (process_dummy_no_attr, "divide"), - "unsqueeze" : (process_usqueeze, "unsqueeze"), - "view" : (process_reshape, "reshape"), - "where" : (process_dummy_no_attr, "where"), - "pow" : (process_power, ""), -} - -torch_constant_ops = { - "ones" : torch.ones, - "zeros" : torch.zeros, - "arange" : torch.arange, - "full" : torch.full, -} - -# graph = None -node_to_id = {} -param_to_id = {} -const_to_id = {} -id_to_intermed = {} - -def get_pybuda_node(torch_op_name, node): - if torch_op_name in dynamo_to_pybuda_function: - return dynamo_to_pybuda_function[torch_op_name][0](node, dynamo_to_pybuda_function[torch_op_name][1]) - else: - print(f"Unsupported op {torch_op_name}") - breakpoint() - assert False, f"Unsupported op {torch_op_name}" - -# Check to see if subgraph is already on device -def is_on_device(subgraph_idx: int): - pass - -# Remove all nodes associated with subgraph -def remove_subgraph(subgraph_idx: int): - pass - -def add_op(graph, node, name, pybuda_node, subgraph_idx): - global node_to_id - shape = node.meta['tensor_meta'].shape if pybuda_node.shape is None else pybuda_node.shape - dtype = pytorch_dtype_to_buda_dataformat(node.meta['tensor_meta'].dtype) if pybuda_node.dtype is None else pybuda_node.dtype - - add_constants_if_necessary(graph, pybuda_node.args, subgraph_idx) - nid = create_op_node( - graph, - f"{name}_{subgraph_idx}", - pybuda_node.op, - [int(dim) for dim in shape], - pytorch_dtype_to_buda_dataformat(dtype), - subgraph_idx, - {}) - - for i, input_node in enumerate(pybuda_node.args): - create_data_edge(graph, node_to_id[input_node], 0, nid, i, []) - - eval_args = [id_to_intermed[node_to_id[arg]] if isinstance(arg, torch.fx.node.Node) else arg for arg in node.args] - for idx, arg in enumerate(eval_args): - if isinstance(arg, (list, tuple)): - eval_args[idx] = [id_to_intermed[node_to_id[a]] if isinstance(a, torch.fx.node.Node) else a for a in arg] - kwargs = {k:v for k, v in node.kwargs.items() if k != "device"} - id_to_intermed[nid] = node.target(*eval_args, **kwargs) - if (pybuda_node.wrap_tuple): - nid = (nid,) - return nid - -def add_input(graph, node, subgraph_idx, module_inputs): - nid = create_activation_input( - graph, - f"{node.name}_{subgraph_idx}", - [int(dim) for dim in node.meta['tensor_meta'].shape], - node.meta["tensor_meta"].requires_grad, - pytorch_dtype_to_buda_dataformat(node.meta["tensor_meta"].dtype), - subgraph_idx) - module_inputs.append(nid) - return nid - - -def add_constant(graph, name, tensor, subgraph_idx): - if 
tensor in const_to_id: - return const_to_id[tensor] - nid = create_constant_input( - graph, - f"{name}_{subgraph_idx}", - tensor, - [int(dim) for dim in tensor.shape], - pytorch_dtype_to_buda_dataformat(tensor.dtype), - subgraph_idx) - const_to_id[tensor] = nid - return nid - -def add_param(graph, name, torch_param, subgraph_idx): - if name in param_to_id: - return param_to_id[name] - nid = create_parameter_input( - graph, - name, - [int(dim) for dim in torch_param.shape], - torch_param.requires_grad, - pytorch_dtype_to_buda_dataformat(torch_param.dtype), - subgraph_idx) - param_to_id[name] = nid - return nid - -def add_outputs(graph, node, subgraph_idx, output_nids, output_requires_grad, output_tensors): - global node_to_id - for index, meta in enumerate(node.meta['tensor_meta']): - arg = node.args[0][index] - nid = create_output( - graph, - node.name + "_" + arg.name + "_" + str(subgraph_idx), - [int(dim) for dim in meta.shape], - pytorch_dtype_to_buda_dataformat(meta.dtype), - False, #TODO Loss output - subgraph_idx) - create_data_edge(graph, node_to_id[arg], 0, nid, index, []) - output_nids.append(nid) - output_requires_grad.append(meta.requires_grad) - output_tensors.append(id_to_intermed[node_to_id[arg]]) - -def add_constants_if_necessary(graph, ops, subgraph_idx): - global node_to_id - for op in ops: - if isinstance(op, (float, int)): - if op in node_to_id: - continue - tensor = torch.ones([1]) * op - node_to_id[op] = add_constant(graph, f"{op}", tensor, subgraph_idx) - id_to_intermed[node_to_id[op]] = tensor - -def append_to_graph(graph, module, aten_module, rand_atan_inputs, activations, subgraph_idx): - torch.fx.passes.shape_prop.ShapeProp(aten_module).propagate(*rand_atan_inputs) - # aten_module.graph.print_tabular() - - module_inputs = [] - output_nids = [] - output_requires_grad = [] - output_tensors = [] - - def process_function(node): - global node_to_id - op_name = node.target.__name__ - if op_name in torch_constant_ops: - kwargs = {k:v for k, v in node.kwargs.items() if k != "device"} - tensor = torch_constant_ops[op_name](*node.args, **kwargs) - if len(tensor.shape) == 0: - tensor = tensor.unsqueeze(0) - node_to_id[node] = add_constant(graph, node.name, tensor.float(), subgraph_idx) - id_to_intermed[node_to_id[node]] = tensor - elif op_name == "getitem": - assert isinstance(node_to_id[node.args[0]], (list, tuple)) - node_to_id[node] = node_to_id[node.args[0]][node.args[1]] - id_to_intermed[node_to_id[node]] = id_to_intermed[node_to_id[node]][node.args[1]] - else: - pybuda_node = get_pybuda_node(op_name, node) - node_to_id[node] = add_op(graph, node, node.name, pybuda_node, subgraph_idx) - - params = list(module.named_parameters(remove_duplicate=False)) + list(module.named_buffers(remove_duplicate=False)) - assert len(params) == len(torch._guards.TracingContext.get().params_flat) - - for index, node in enumerate(aten_module.graph.nodes): - if index < len(params): - # params are located first in the args list - assert node.op == "placeholder" - assert node.meta['val'].size() == rand_atan_inputs[index].shape - node_to_id[node] = add_param(graph, params[index][0], params[index][1].data, subgraph_idx) - id_to_intermed[node_to_id[node]] = params[index][1].data - continue - if node.op == "placeholder": - node_to_id[node] = add_input(graph, node, subgraph_idx, module_inputs) - id_to_intermed[node_to_id[node]] = activations[index - len(params)] - elif node.op == "get_attr": - assert False #TODO - node_to_id[node] = add_param(graph, node.target, module.state_dict()[node.target], 
subgraph_idx) - elif node.op == "call_function": - process_function(node) - elif node.op == "output": - add_outputs(graph, node, subgraph_idx, output_nids, output_requires_grad, output_tensors) - else: - assert False, f"Unsupported op {node.op}" - - - graph.register_module_inputs(module_inputs) - graph.register_module_outputs(output_nids, output_requires_grad) - return graph, id_to_intermed, output_tensors diff --git a/pybuda/pybuda/ci.py b/pybuda/pybuda/ci.py index fe1eef16d..f41087fa6 100644 --- a/pybuda/pybuda/ci.py +++ b/pybuda/pybuda/ci.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import os +import pwd import tempfile import filelock import shutil @@ -69,8 +70,8 @@ def create_symlink(target_path: str, symlink_path: str, *, remove_existing: bool # Path objects for target and symlink target, symlink = Path(target_path), Path(symlink_path) - # Create a lock file in a standard temporary directory - lock_file_path = os.path.join(tempfile.gettempdir(), f"{symlink.name}.lock") + # Create a lock file in a standard temporary directory under user's name + lock_file_path = os.path.join(tempfile.gettempdir(), pwd.getpwuid(os.getuid()).pw_name, f"{symlink.name}.lock") lock = filelock.FileLock(lock_file_path) with lock: diff --git a/pybuda/pybuda/compile.py b/pybuda/pybuda/compile.py index 0deb46639..5d806f8b2 100644 --- a/pybuda/pybuda/compile.py +++ b/pybuda/pybuda/compile.py @@ -3,14 +3,20 @@ # SPDX-License-Identifier: Apache-2.0 import os from typing import Optional, List, Dict, Any, Tuple, Union +from dataclasses import dataclass, field import torch +import tensorflow as tf from loguru import logger -from .ttdevice import TTDevice -from .tensor import Tensor, pad_pytorch_tensor_to_buda +import pybuda +from pybuda.compiled_graph_state import CompiledGraphState, CompiledModel, CompileResults +from pybuda.config import ( + CompilerConfig, + CompileDepth, + _get_global_compiler_config, +) from pybuda._C import ( - BudaNetlist, link_past_cache_ios, move_index_to_mm_weights, run_post_initial_graph_passes, @@ -18,43 +24,20 @@ run_post_optimize_decompose_graph_passes, run_consteval_graph_pass, run_post_autograd_graph_passes, - run_pre_placer_buda_passes, - run_post_placer_buda_passes, - run_pre_netlist_generation_buda_passes, - run_placer_buda_passes, run_pre_lowering_passes, - lower_to_buda_netlist, - merge_netlists, dump_graph, - dump_epoch_type_graphs, - dump_epoch_id_graphs, - is_subset_of_instructions, - PostPlacerConfig, - UnsupportedHWOpsError, - NopInsertionInstruction, ) -import pybuda -from .parameter import Parameter -from pybuda._C.backend_api import BackendType, BackendDevice import pybuda._C.autograd as pyautograd -import pybuda._C.balancer as pybalancer -import pybuda._C.pattern_matcher as pypattern_matcher -import pybuda._C.scheduler as pyscheduler -from pybuda._C.placer import match_op_names_to_placer_overrides +import pybuda._C.graph as pygraph from pybuda._C.graph import Graph +import pybuda.ci as ci +from pybuda.module import PyBudaModule, wrap_module +from pybuda.parameter import Parameter +from pybuda.pybudaglobal import state_changed, clear_state_changed import pybuda.query as query -from .verify import VerifyConfig, do_verify, verify_golden, verify_net2pipe, _generate_random_losses, _run_pytorch_backward, get_intermediate_tensors -import pybuda._C.graph as pygraph -from .config import ( - CompilerConfig, - CompileDepth, - _get_global_compiler_config, -) -from .pybudaglobal import state_changed, clear_state_changed -from pybuda import PyBudaModule -from .tensor 
import Tensor, to_pt_tensors, to_buda_tensors -from . import ci, utils -from pybuda.tools.net2reportify import net2placement +from pybuda.tensor import Tensor, to_pt_tensors +from pybuda.verify import VerifyConfig, do_verify, _generate_random_losses, _run_pytorch_backward + LAST_SUCCESSFUL_STAGE = None def init_log_last_successful_compile_stage(): @@ -109,25 +92,41 @@ def generate_override_config(graph, balancer_solution, placer_solution, nop_inst with open(path, "w") as fd: yaml.dump(overrides, fd, indent=2) - -class CompileResults: - """ - Wrapper for result from the graph compiler. Contains initial and final graphs, output tensors, - and, optionally golden results for final output and intermediates, if desired. - """ - outputs: List[Tensor] - golden_outputs: List[torch.Tensor] - golden_intermediates: Dict[str, torch.Tensor] - initial_graph: Graph - lowered_graph: Graph - netlist_filename: str - perf_model_results: Dict[str, float] - - pass_specific_output_kwargs: Dict[str, Any] = {} +@dataclass +class CompileContext: + modules: List[PyBudaModule] + graph_name: str + compiler_cfg: CompilerConfig + verify_cfg: VerifyConfig + microbatch_size: int + microbatch_count: int + inputs: Optional[Tuple[Union[Tensor, List[Any], Dict[str, Any]],...]] = None + graph: Optional[Graph] = None + losses: Optional[List[Tensor]] = None + output_kwargs: Dict[str, Any] = field(default_factory=dict) + targets: List[Tensor] = field(default_factory=list) + initial_graph: Optional[Graph] = None + final_graph: Optional[Graph] = None + stage: CompileDepth = CompileDepth.INIT_COMPILE + initial_graph_copy: Optional[Graph] = None + outputs: Tuple[Tensor, ...] = field(default_factory=tuple) + intermediate_tensors: Dict[int, Tensor] = field(default_factory=dict) + parameter_dict: Dict[str, torch.Tensor] = field(default_factory=dict) + input_grads: List[torch.Tensor] = field(default_factory=list) + netlist_filename: Optional[str] = None + perf_model_results: Optional[Dict[str, float]] = None + use_interactive_placer: bool = False + fracture_chip_id_assignments: Dict[str, int] = field(default_factory=dict) + buda_targets: List[Tensor] = field(default_factory=list) + buda_losses: List[Tensor] = field(default_factory=list) + placer_retry_count: int = 0 + backend_output_directory: str = "" + in_recompile: bool = False + recompile_count: int = 0 + target_cycles_offset: int = 0 def calculate_grads( outputs: Tuple[Tensor, ...], - device: "TTDevice", intermediate_golden_tensors: Dict, is_buda: bool, losses=None): @@ -146,21 +145,184 @@ def calculate_grads( # Check if we need to run, or if gradients have been calculated already if o.value().grad is None and o.requires_grad: run_backward = True - break - + break + if not losses or run_backward: if losses is None and device.loss_module is None: losses = _generate_random_losses(outputs, is_buda) - + if run_backward: _run_pytorch_backward(outputs, device, losses) return losses +def compile_main( + module: torch.nn.Module | tf.keras.Model | PyBudaModule, + sample_inputs: Optional[Tuple[Union[Tensor, List[Any], Dict[str, Any]],...]] = None, + module_name: Optional[str] = None, +): + """ + Main entry point for compiling modules from different frameworks for Tenstorrent devices. + """ + + assert isinstance(module, torch.nn.Module) or isinstance(module, tf.keras.Model) or isinstance(module, PyBudaModule), "Only PyTorch, TensorFlow, and PyBuda modules are supported." 
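For reference, an illustrative call site for this entry point, which __init__.py re-exports as pybuda.compile; the Linear module, input shape and module name are invented for the sketch, and the call assumes a configured Tenstorrent environment:

    import torch
    import pybuda

    module = torch.nn.Linear(32, 32)
    compiled_model = pybuda.compile(module, sample_inputs=[torch.rand(1, 32)], module_name="linear32")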
+ + compiler_cfg = _get_global_compiler_config() + compiler_cfg.apply_env_config_overrides() + + if module_name is None: + module_name = module.__class__.__name__ + + assert module_name is not None + + logger.info("Compiling module {}", module_name) + + if (sample_inputs is None): + logger.error("No sample inputs provided for module {}", module_name) + assert False + + assert sample_inputs is not None + + wrapped_module = wrap_module(module, module_name) + + compile_context: CompileContext = CompileContext( + modules=[wrapped_module], + graph_name=module_name, + compiler_cfg=compiler_cfg, + verify_cfg=VerifyConfig.disabled(), + microbatch_size=1, + microbatch_count=1, + inputs=sample_inputs, + ) + + return pybuda_compile_from_context(compile_context) + + +def pybuda_compile_from_context(context: CompileContext) -> CompiledModel: + """ + Run front-end compile passes and generate a Buda netlist, with a given compile context. + + Parameters + ---------- + context: CompileContext + Contains all needed info to run compile passes. + + Returns + ------- + CompileResults + + """ + + # Map stages to functions which execute them. + stage_to_func = { + CompileDepth.INIT_COMPILE: init_compile, + CompileDepth.GENERATE_INITIAL_GRAPH: generate_initial_graph, + CompileDepth.POST_INITIAL_GRAPH_PASS: run_post_initial_graph_pass, + CompileDepth.CONSTEVAL_GRAPH: run_consteval_pass, + CompileDepth.POST_PATTERN_MATCHER: run_post_pattern_matcher, + CompileDepth.OPTIMIZED_GRAPH: run_optimization_pass, + CompileDepth.AUTOGRAD: run_autograd_pass, + CompileDepth.POST_AUTOGRAD_PASS: run_post_autograd_pass, + CompileDepth.PRE_LOWERING_PASS: run_pre_lowering_pass, + CompileDepth.RUN_MLIR_COMPILER: run_mlir_compiler, + CompileDepth.FINISH_COMPILE: finish_compile, + } + + while context.stage != CompileDepth.FULL: + logger.info("Running compile stage {}", context.stage.name.lower()) + + current_stage = context.stage + verify_cfg = context.verify_cfg + compiler_cfg = context.compiler_cfg + + # Execute the current stage. + next_stage = stage_to_func[current_stage](context) + + # Check if we need to stop compilation or perform verifications in the current stage. 
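The check below feeds compiler_cfg.compile_depth into check_for_compilation_early_stop, so stopping the staged pipeline at an intermediate stage is purely a configuration choice; a hedged sketch of how a caller might request that (stage name taken from the stage_to_func table above):

    from pybuda.config import CompileDepth, _get_global_compiler_config

    # Ask the pipeline to stop once the initial graph has been generated.
    compiler_cfg = _get_global_compiler_config()
    compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH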
+ should_early_stop_compilation = check_for_compilation_early_stop(compiler_cfg.compile_depth, current_stage) + + can_verify = current_stage != CompileDepth.INIT_COMPILE and current_stage != CompileDepth.PRE_LOWERING_PASS and current_stage != CompileDepth.POST_PATTERN_MATCHER + should_verify = current_stage == CompileDepth.POST_AUTOGRAD_PASS and verify_cfg.verify_post_autograd_passes + + if (verify_cfg.verify_all or (verify_cfg.verify_last and should_early_stop_compilation) or should_verify) and can_verify: + in_training = context.compiler_cfg.enable_training and current_stage.value >= CompileDepth.AUTOGRAD.value + assert False, "verification not working yet" + do_verify(current_stage.name.lower(), in_training, context.graph, context.inputs, context.parameter_dict, context.input_grads, context.outputs, dev, context.intermediate_tensors, verify_cfg, False, losses=context.losses, targets=context.targets) + + if should_early_stop_compilation: + logger.info("Early stopping compilation at stage {}", current_stage.name.lower()) + return generate_compile_results(context.verify_cfg, context.initial_graph_copy, context.outputs, context.intermediate_tensors, context.final_graph, pass_specific_output_kwargs=context.output_kwargs) + + context.stage = next_stage + + compile_results = generate_compile_results( + verify_cfg, + context.initial_graph_copy, context.outputs, + context.intermediate_tensors, + final_graph=context.final_graph, + pass_specific_output_kwargs = context.output_kwargs + ) + + compiled_graph_state = CompiledGraphState.from_compiled_graph(context.modules[0], compile_results) + + compiled_module = CompiledModel( + compiled_graph_state, + context.output_kwargs["binary"] + ) + + logger.info("Compilation completed.") + + return compiled_module + + +def pybuda_compile_torch( + module_name: str, + module: torch.fx.GraphModule, + graph: Graph, + *inputs: Union[Tensor, List[Any], Dict[str, Any]] + ): + """ + Entry point for pybuda compile for torch 2.0 api. 
+ + Parameters + --------- + module_name: str + Name of the module + + module: torch.fx.GraphModule + Torch FX Module to compile + + graph: Graph + Initial graph to compile (unlike other paths, the torch 2.0 path should already have an initial graph at this point) + + inputs: + Sample inputs for the module + + Returns + ------- + CompileResults + """ + + inputs = list(inputs) + + compiler_cfg = _get_global_compiler_config() + compiler_cfg.apply_env_config_overrides() + + compile_context: CompileContext = CompileContext( + modules=[module], + graph_name=module_name, + inputs=inputs, + compiler_cfg=compiler_cfg, + verify_cfg=VerifyConfig.disabled(), + microbatch_size=1, + microbatch_count=1, + graph=graph, + ) + + return pybuda_compile_from_context(compile_context) def pybuda_compile( - dev: TTDevice, graph_name: str, *inputs: Union[Tensor, List[Any], Dict[str, Any]], targets: List[Tensor] = [], @@ -199,626 +361,781 @@ def pybuda_compile( Returns ------- CompileResults - + """ + inputs = list(inputs) if verify_cfg is None: verify_cfg = VerifyConfig.disabled() # no verification config provided, disable by default - + if compiler_cfg is None: compiler_cfg = _get_global_compiler_config() - force_full = bool(int(os.environ.get("PYBUDA_FORCE_FULL_COMPILE_DEPTH", "0"))) - if force_full: - compiler_cfg.compile_depth = CompileDepth.FULL + compiler_cfg.apply_env_config_overrides() + + compile_context: CompileContext = CompileContext( + graph_name=graph_name, + inputs=inputs, + compiler_cfg=compiler_cfg, + verify_cfg=verify_cfg, + microbatch_size=microbatch_size, + microbatch_count=microbatch_count, + targets=targets, + losses=losses, + ) + + return pybuda_compile_from_context(compile_context) + +def check_for_compilation_early_stop(desired_stage, current_stage): + """ + Determines should current compilation process stop or not based on desired + and current phase of execution. + + Parameters + ---------- + desired_stage: CompileDepth + Desired phase for compiler early stopping. + + current_stage: CompileDepth + Current phase for compiler early stopping. + + Returns + ------- + Boolean + """ + # update global compile stage variable + global LAST_SUCCESSFUL_STAGE + LAST_SUCCESSFUL_STAGE = str(CompileDepth(current_stage.value).name) + + if not CompileDepth.has_value(desired_stage.value): + raise Exception("Invalid compilation depth flag: {}".format(desired_stage.name)) + + if desired_stage.value == current_stage.value: + logger.info("Compilation early stopping after {}".format(current_stage.name)) + + return True + + return False + +def placer_breaks_eval(value): + if type(value) is query.NodePredicateBuilder: + return value.eval() + elif type(value) is list: + return [placer_breaks_eval(v) for v in value] + else: + assert type(value) is str + return value + +def placer_op_overrides_eval(value): + assert type(value) is tuple + if type(value[0]) is query.NodePredicateBuilder: + return (value[0].eval(), value[1]) + else: + return value + - if len(targets) > 0: - assert dev.loss_module is not None, "Target provided for compilation, but this device has no loss module" +def generate_compile_results( + verify_cfg = None, + initial_graph = None, + outputs = None, + intermediate_tensors = None, + final_graph = None, + *, + pass_specific_output_kwargs = None, +): + """ + Wrapper for generating result from the graph compiler. Contains initial and final graphs, output tensors, + and, optionally golden results for final output and intermediates, if desired. 
+ + Parameters + ---------- + verify_cfg: VerifyConfig + Value verification config + + initial_graph: Graph + Initial graph, immediately after conversion from the input framework + + outputs: Tuple[Tensor, ...] + Output tensors + + intermediate_tensors: Dict[str, Tensor] + Intermediated tensors - if dev.loss_module is not None: - assert len(targets) > 0, f"Device {dev} has a loss module, but no targets were provided for compilation" + final_graph: Graph + Buda graph - backend_output_directory = compiler_cfg.backend_output_dir - ci.initialize_output_build_directory(backend_output_directory) + netlist_filename: str + Netlist file name - device_cfg = dev.get_device_config(compiler_cfg=compiler_cfg) - logger.info("Device grid size: r = {}, c = {}", device_cfg.grid_size.r, device_cfg.grid_size.c) + Returns + ------- + CompileResults + """ + ret = CompileResults() - # Set global cluster descriptor file path if not provided by user (it was obtained from backend when getting device config) - if compiler_cfg.backend_cluster_descriptor_path == "": - compiler_cfg.backend_cluster_descriptor_path = device_cfg.cluster_config_yaml + ret.initial_graph = initial_graph + ret.outputs = outputs + if verify_cfg and verify_cfg.intermediates: + ret.golden_intermediates = { + initial_graph.get_node_name(node_id): tensor + for node_id, tensor in intermediate_tensors.items() if initial_graph.has_node_with_id(node_id) + } + ret.final_graph = final_graph - if compiler_cfg.backend_device_descriptor_path == "": - compiler_cfg.backend_device_descriptor_path = device_cfg.device_yaml + if outputs: + ret.golden_outputs = [out.value() if out.has_value() else None for out in outputs] - assert len(device_cfg.chip_ids) > 0, "Trying to compile onto zero chips." - logger.info("Using chips: {}", device_cfg.chip_ids) + if pass_specific_output_kwargs: + ret.pass_specific_output_kwargs = pass_specific_output_kwargs + + return ret + +def init_compile(context: CompileContext) -> CompileDepth: + + compiler_cfg = context.compiler_cfg + graph_name = context.graph_name + + force_full = bool(int(os.environ.get("PYBUDA_FORCE_FULL_COMPILE_DEPTH", "0"))) + if force_full: + compiler_cfg.compile_depth = CompileDepth.FULL + + context.backend_output_directory = compiler_cfg.backend_output_dir + ci.initialize_output_build_directory(context.backend_output_directory) # compiler_cfg is fully formed if "PYBUDA_LOAD_CONFIG" in os.environ: compiler_cfg = load_compiler_cfg(compiler_cfg) elif "PYBUDA_DUMP_CONFIG" in os.environ: - dump_compiler_cfg(backend_output_directory, compiler_cfg, graph_name) + dump_compiler_cfg(context.backend_output_directory, compiler_cfg, graph_name) init_log_last_successful_compile_stage() - should_early_stop_compilation = check_for_compilation_early_stop(compiler_cfg.compile_depth, CompileDepth.START_COMPILE) - if should_early_stop_compilation: - return generate_compile_results( - verify_cfg, - ) + return CompileDepth.GENERATE_INITIAL_GRAPH + +def generate_initial_graph(context: CompileContext) -> CompileDepth: + """ + Generates initial graph from the input framework. 
+ + Parameters + ---------- + context: CompileContext + Compile context - logger.info("Generating initial graph") - should_early_stop_compilation = check_for_compilation_early_stop(compiler_cfg.compile_depth, CompileDepth.GENERATE_INITIAL_GRAPH) + Returns + ------- + CompileDepth - next compile stage + """ - if compiler_cfg.compile_tvm_to_python and dev.graph is None: - module_inputs = inputs - for index, module in enumerate(dev.modules): + modules_ = [] + if context.compiler_cfg.compile_tvm_to_python and context.graph is None: + module_inputs = context.inputs + for index, module in enumerate(context.modules): if not isinstance(module, PyBudaModule): from .tvm_to_python import generate_pybuda_module prev_state = state_changed() - modules, dev_types, module_inputs = generate_pybuda_module(module, to_pt_tensors(module_inputs), compiler_cfg, module.name, verify_cfg,) + if module_inputs is None: + logger.error("No inputs provided for module {}", module.name) + assert False + modules, dev_types, module_inputs = generate_pybuda_module(module, to_pt_tensors(module_inputs), context.compiler_cfg, module.name, context.verify_cfg,) assert len(modules) == 1, "Attemping to load split model onto single devices" - dev.modules[index] = modules[0] + modules_.append(modules[0]) if index == 0: - inputs = module_inputs + context.inputs = module_inputs if not(prev_state): clear_state_changed() - if index < len(dev.modules) - 1 and not compiler_cfg.compile_subgraphs: - if module is dev.loss_module: - if len(module_inputs) == 1: - module_inputs = dev.modules[index].forward(module_inputs[0], targets[0]) - else: - module_inputs = dev.modules[index].forward(tuple(module_inputs), tuple(targets)) - else: - module_inputs = dev.modules[index].forward(*module_inputs) - if isinstance(module_inputs, Tensor): module_inputs = (module_inputs,) # Force a tuple - if dev.graph is None: - graph, outputs, intermediate_tensors, inputs, _ = dev.generate_graph(*inputs, return_intermediate=verify_cfg.intermediates, graph_name=graph_name, compiler_cfg=compiler_cfg, target_tensors=targets, verify_cfg=verify_cfg) - else: - graph = dev.graph - intermediate_tensors = dev.intermediate_tensors - outputs = dev.output_tensors - - graph.set_microbatch(microbatch_size) - dump_graph(graph, graph_name, "initial_graph") - validate_override_names(graph, compiler_cfg) - if compiler_cfg.enable_link_past_cache_ios: + if context.graph is None: + context.graph, context.outputs, context.intermediate_tensors, context.inputs, _ = generate_graph(modules_, *context.inputs, return_intermediate=context.verify_cfg.intermediates, graph_name=context.graph_name, compiler_cfg=context.compiler_cfg, target_tensors=context.targets) + + context.graph.set_microbatch(context.microbatch_size) + dump_graph(context.graph, context.graph_name, "initial_graph") + if context.compiler_cfg.enable_link_past_cache_ios: # move index ops to weights if applicable - move_index_to_mm_weights(graph) + move_index_to_mm_weights(context.graph) - # link past cache ios will change the number on inputs / outputs, so it is called bfore we clone the initial graph - new_params = link_past_cache_ios(graph) + # link past cache ios will change the number on inputs / outputs, so it is called before we clone the initial graph + new_params = link_past_cache_ios(context.graph) inputs_to_remove = [] for k, v in new_params.items(): - module.add_parameter(k, Parameter(inputs[v].value(), requires_grad=False, name=k)) - inputs_to_remove.append(inputs[v]) + context.dev.modules[-1].add_parameter(k, 
Parameter(context.inputs[v].value(), requires_grad=False, name=k)) + inputs_to_remove.append(context.inputs[v]) for i in inputs_to_remove: - inputs.remove(i) + context.inputs.remove(i) - initial_graph_copy = graph.clone() # save the original graph for verification and analysis - input_grads = [] + context.initial_graph_copy = context.graph.clone() # save the original graph for verification and analysis + context.input_grads = [] - pass_specific_output_kwargs = {} - parameter_dict = {p.get_name() : p.value(is_buda=False) for p in dev.get_parameters()} - - if verify_cfg.verify_all or (verify_cfg.verify_last and should_early_stop_compilation): - do_verify("initial_graph", False, graph, inputs, parameter_dict, input_grads, outputs, dev, intermediate_tensors, verify_cfg, False, targets=targets) + context.parameter_dict = {} + for module in context.modules: + if isinstance(module, pybuda.module.Module): + for p in module.get_parameters(): + context.parameter_dict[p.get_name()] = p.value(is_buda=False) + elif isinstance(module, torch.fx.GraphModule): + for name, value in module.named_parameters(): + context.parameter_dict[name] = value + + return CompileDepth.POST_INITIAL_GRAPH_PASS + +def run_post_initial_graph_pass(context: CompileContext) -> CompileDepth: + """ + Runs post initial graph passes. - if should_early_stop_compilation: - return generate_compile_results( - verify_cfg, - initial_graph_copy, outputs, - intermediate_tensors, - ) + Parameters + ---------- + context: CompileContext + Compile context - logger.info("Running post initial graph pass") - should_early_stop_compilation = check_for_compilation_early_stop(compiler_cfg.compile_depth, CompileDepth.POST_INITIAL_GRAPH_PASS) + Returns + ------- + CompileDepth - next compile stage + """ + compiler_cfg = context.compiler_cfg + graph_name = context.graph_name + graph, intermediate_tensors = context.graph, context.intermediate_tensors - inserted_node_id_mapping, fracture_chip_id_assignments = run_post_initial_graph_passes(graph, compiler_cfg, compiler_cfg.fracture_groups) + if compiler_cfg.enable_consteval: + run_consteval_graph_pass(graph) + inserted_node_id_mapping, context.fracture_chip_id_assignments = run_post_initial_graph_passes(graph, compiler_cfg, compiler_cfg.fracture_groups) for inserted_node_id, original_node_id in inserted_node_id_mapping: - # If we have multi-level of decomposition, some node id might not in the original - # intermediate tensor dict. + # If we have multi-level of decomposition, some node id might not in the original + # intermediate tensor dict. 
if original_node_id in intermediate_tensors: intermediate_tensors[inserted_node_id] = intermediate_tensors[original_node_id] dump_graph(graph, graph_name, "decomposed_graph") - if verify_cfg.verify_all or (verify_cfg.verify_last and should_early_stop_compilation): - do_verify("decomposed_graph", False, graph, inputs, parameter_dict, input_grads, outputs, dev, intermediate_tensors, verify_cfg, False, targets=targets) + next_stage = CompileDepth.OPTIMIZED_GRAPH if compiler_cfg.enable_consteval: - run_consteval_graph_pass(graph) - dump_graph(graph, graph_name, "consteval_graph") - if verify_cfg.verify_all or (verify_cfg.verify_last and should_early_stop_compilation): - do_verify("consteval_graph", False, graph, inputs, parameter_dict, input_grads, outputs, dev, intermediate_tensors, verify_cfg, False, targets=targets) + next_stage = CompileDepth.CONSTEVAL_GRAPH + elif compiler_cfg.match_subgraph_patterns: + next_stage = CompileDepth.POST_PATTERN_MATCHER + + return next_stage + +def run_consteval_pass(context: CompileContext) -> CompileDepth: + """ + Runs consteval pass. + + Parameters + ---------- + context: CompileContext + Compile context + Returns + ------- + CompileDepth - next compile stage + """ + compiler_cfg = context.compiler_cfg + graph = context.graph + graph_name = context.graph_name + + run_consteval_graph_pass(graph) + dump_graph(graph, graph_name, "consteval_graph") + + next_stage = CompileDepth.OPTIMIZED_GRAPH if compiler_cfg.match_subgraph_patterns: - graph, match_result = pypattern_matcher.lower_pybuda_to_pattern_matcher(graph, compiler_cfg.match_subgraph_patterns) - pass_specific_output_kwargs["match_result"] = match_result + next_stage = CompileDepth.POST_PATTERN_MATCHER - if match_result.is_subgraph_loopable: - dump_graph(graph, graph_name, "looped_graph") + return next_stage - if check_for_compilation_early_stop(compiler_cfg.compile_depth, CompileDepth.POST_PATTERN_MATCHER): - return generate_compile_results( - pass_specific_output_kwargs = pass_specific_output_kwargs - ) +def run_post_pattern_matcher(context: CompileContext) -> CompileDepth: + """ + Runs post pattern matcher passes. 
- run_optimization_graph_passes(graph, device_cfg) - dump_graph(graph, graph_name, "optimized_graph") - inserted_node_id_mapping = run_post_optimize_decompose_graph_passes(graph, compiler_cfg) - dump_graph(graph, graph_name, "decomposed_optimized_graph") - for inserted_node_id, original_node_id in inserted_node_id_mapping: - if original_node_id in intermediate_tensors: - intermediate_tensors[inserted_node_id] = intermediate_tensors[original_node_id] + Parameters + ---------- + context: CompileContext + Compile context - if verify_cfg.verify_all or (verify_cfg.verify_last and should_early_stop_compilation): - do_verify("optimized_graph", False, graph, inputs, parameter_dict, input_grads, outputs, dev, intermediate_tensors, verify_cfg, False, targets=targets) + Returns + ------- + CompileDepth - next compile stage + """ + compiler_cfg = context.compiler_cfg + graph = context.graph + graph_name = context.graph_name - # Workaround for TVM and lack of parameters at the time optimizer is created - if dev.optimizer: - if dev.optimizer.device_params: - dev.optimizer.set_parameters_to_optimize(dev.modules[0].get_parameters()) - dev.optimizer.set_optimizer_parameters() + graph, match_result = pypattern_matcher.lower_pybuda_to_pattern_matcher(graph, compiler_cfg.match_subgraph_patterns) + context.output_kwargs["match_result"] = match_result - if compiler_cfg.enable_training: + if match_result.is_subgraph_loopable: + dump_graph(graph, graph_name, "looped_graph") - autograd_config = pyautograd.AutogradConfig(recompute=compiler_cfg.enable_recompute, optimizer=dev.optimizer) - autograd_engine = pyautograd.AutogradEngine(graph, autograd_config) - - graph = autograd_engine.run() - dump_graph(graph, graph_name, "post_autograd") - - if verify_cfg.verify_all or (verify_cfg.verify_last and should_early_stop_compilation): - losses = do_verify("post_autograd", compiler_cfg.enable_training, graph, inputs, parameter_dict, input_grads, outputs, dev, intermediate_tensors, verify_cfg, False, losses, targets=targets) - elif compiler_cfg.enable_training: - losses = calculate_grads(outputs, dev, intermediate_tensors, False, losses) + return CompileDepth.OPTIMIZED_GRAPH - # Record calculated input grads from the previous do_verify call and save so that we don't keep re-calculating and - # accumulating on each verification call - input_grads = [i.value().grad for i in inputs if i.value().requires_grad and i.value().grad is not None] +def run_optimization_pass(context: CompileContext) -> CompileDepth: + """ + Runs optimization passes. 
- # run_post_autograd_graph_passes(graph) - # dump_graph(graph, graph_name, "post_autograd_passes") - - # do_verify("post_autograd_passes", compiler_cfg.enable_training, graph, inputs, parameter_dict, input_grads, outputs, dev, intermediate_tensors, verify_cfg, False, losses) + Parameters + ---------- + context: CompileContext + Compile context - logger.info("Running post autograd graph pass") - inserted_node_id_mapping = run_post_autograd_graph_passes(graph, compiler_cfg) + Returns + ------- + CompileDepth - next compile stage + """ + compiler_cfg = context.compiler_cfg + graph_name = context.graph_name + graph, intermediate_tensors = context.graph, context.intermediate_tensors + + run_optimization_graph_passes(graph) + dump_graph(graph, graph_name, "optimized_graph") + + inserted_node_id_mapping = run_post_optimize_decompose_graph_passes(graph, compiler_cfg) + dump_graph(graph, graph_name, "decomposed_optimized_graph") for inserted_node_id, original_node_id in inserted_node_id_mapping: if original_node_id in intermediate_tensors: intermediate_tensors[inserted_node_id] = intermediate_tensors[original_node_id] - dump_graph(graph, graph_name, "post_autograd_passes") + next_stage = CompileDepth.POST_AUTOGRAD_PASS + if context.compiler_cfg.enable_training: + next_stage = CompileDepth.AUTOGRAD - if verify_cfg.verify_all or verify_cfg.verify_post_autograd_passes or (verify_cfg.verify_last and should_early_stop_compilation): - do_verify("post_autograd_passes", compiler_cfg.enable_training, graph, inputs, parameter_dict, input_grads, outputs, dev, intermediate_tensors, verify_cfg, False, losses, targets=targets) - elif compiler_cfg.enable_training: - calculate_grads(outputs, dev, intermediate_tensors, False, losses) - - input_grads = [i.value().grad for i in inputs if i.value().requires_grad and i.value().grad is not None] + return next_stage - if should_early_stop_compilation: - return generate_compile_results( - verify_cfg, - initial_graph_copy, outputs, - intermediate_tensors, - ) +def run_autograd_pass(context: CompileContext) -> CompileDepth: + """ + Runs autograd pass. 
- run_pre_lowering_passes(graph) - dump_graph(graph, graph_name, "pre_lowering") + Parameters + ---------- + context: CompileContext + Compile context - should_early_stop_compilation = check_for_compilation_early_stop(compiler_cfg.compile_depth, CompileDepth.PRE_LOWERING_PASS) - if should_early_stop_compilation: - return generate_compile_results( - verify_cfg, - initial_graph_copy, outputs, - intermediate_tensors, - ) - - logger.info("Lowering to Buda") - for parameter in dev.get_parameters(): - parameter._set_fp32_fallback(dev.fp32_fallback) - - scheduler_config = pyscheduler.SchedulerConfig( - scheduler_policy=pyscheduler.policy_from_string(os.environ.get("PYBUDA_SCHEDULER_POLICY", compiler_cfg.scheduler_policy)), - scheduler_constraints=compiler_cfg.scheduler_constraints, - ) + Returns + ------- + CompileDepth - next compile stage + """ + compiler_cfg = context.compiler_cfg + dev = context.dev + graph_name = context.graph_name + graph, intermediate_tensors, outputs = context.graph, context.intermediate_tensors, context.outputs - should_early_stop_compilation = check_for_compilation_early_stop(compiler_cfg.compile_depth, CompileDepth.BUDA_GRAPH_PRE_PLACER) + autograd_config = pyautograd.AutogradConfig(recompute=compiler_cfg.enable_recompute, optimizer=dev.optimizer) + autograd_engine = pyautograd.AutogradEngine(graph, autograd_config) - post_placer_results = None - placer_done = False - placer_loop_count = 0 + graph = autograd_engine.run() + dump_graph(graph, graph_name, "post_autograd") - policy_type = pybalancer.policy_from_string(os.environ.get("PYBUDA_BALANCER_POLICY_TYPE", compiler_cfg.balancer_policy)) - use_interactive_placer = ( - compiler_cfg.use_interactive_placer and - not (bool(int(os.environ.get("PYBUDA_DISABLE_INTERACTIVE_PLACER", "0")))) and - pybalancer.can_use_interactive_placer(policy_type) - ) - while not placer_done: - instructions = {} if post_placer_results is None else post_placer_results.ins_instructions - temp_dict = {}; temp_dict.update(compiler_cfg.buffering_nops_to_insert); temp_dict.update(instructions) - lowered_graph, placer_config_update = run_pre_placer_buda_passes( - graph, - scheduler_config, - device_cfg, - device_cfg.chip_ids, - list(map(placer_breaks_eval, compiler_cfg.op_names_to_chip_break)), - list(map(placer_breaks_eval, compiler_cfg.op_names_to_epoch_break)), - compiler_cfg.op_names_dont_fuse, - compiler_cfg.op_names_manual_fuse, - fracture_chip_id_assignments, - compiler_cfg.default_df_override, - compiler_cfg.default_accumulate_df, - compiler_cfg.enable_broadcast_splitting or bool(int(os.environ.get("PYBUDA_ENABLE_BROADCAST_SPLITTING", "0"))), - dev.fp32_fallback, - compiler_cfg.default_math_fidelity, - compiler_cfg.enable_auto_fusing, - compiler_cfg.amp_level or int(os.environ.get("PYBUDA_AMP_LEVEL", "0")), - compiler_cfg.enable_recompute, - (bool(int(os.environ.get("PYBUDA_ENABLE_OUTPUT_QUEUES_ON_HOST", "1"))) and compiler_cfg.output_queues_on_host), - temp_dict, - compiler_cfg.insert_queues, - compiler_cfg.amp_properties, - compiler_cfg.op_intermediates_to_save, - use_interactive_placer, - compiler_cfg.enable_device_tilize) - dump_graph(lowered_graph, graph_name, "pre_placer") - - # Convert to buda tensors - i.e. 
4d / tile-snapped dims - def to_buda_shapes(tensors): - if tensors is None or not tensors: - return tensors - - if isinstance(tensors[0], torch.Tensor): - return [pad_pytorch_tensor_to_buda(t, lowered_graph.get_tile_broadcast_dims_for_bw_input(i)) for i, t in enumerate(tensors)] - - return [t.to_buda_shape(tile_broadcast_dims=lowered_graph.get_tile_broadcast_dims_for_target(i)) for i, t in enumerate(tensors)] - - #buda_intermediate_tensors = {} - #for k, v in intermediate_tensors.items(): - # buda_intermediate_tensors[k] = pad_pytorch_tensor_to_buda(v) - buda_losses = to_buda_shapes(losses) - buda_targets = to_buda_shapes(targets) - - if verify_cfg.verify_all or (verify_cfg.verify_last and should_early_stop_compilation): - do_verify("pre_placer", compiler_cfg.enable_training, lowered_graph, inputs, parameter_dict, input_grads, outputs, dev, intermediate_tensors, verify_cfg, True, buda_losses, targets=buda_targets) - elif compiler_cfg.enable_training: - calculate_grads(outputs, dev, intermediate_tensors, True, losses) + context.losses = calculate_grads(outputs, dev, intermediate_tensors, False, context.losses) - if should_early_stop_compilation: - return generate_compile_results( - verify_cfg, - initial_graph_copy, outputs, - intermediate_tensors, - lowered_graph, - ) - - op_name_to_placer_overrides = match_op_names_to_placer_overrides(lowered_graph, list(map(placer_op_overrides_eval, compiler_cfg.placer_op_overrides))) - balancer_config = pybalancer.BalancerConfig( - device_config=device_cfg, - scheduler_config=scheduler_config, - policy_type=policy_type, - random_policy_seed=int(os.environ.get("PYBUDA_BALANCER_RANDOM_POLICY_SEED", 0)), - chip_ids=device_cfg.chip_ids, - chip_placement_policy=pybalancer.chip_placement_policy_from_string(compiler_cfg.chip_placement_policy), - enable_t_streaming = (bool(int(os.environ.get("PYBUDA_ENABLE_T_STREAMING", "0"))) or compiler_cfg.enable_t_streaming), - manual_t_streaming = compiler_cfg.manual_t_streaming, - skip_l1_usage_validation=bool(int(os.environ.get("PYBUDA_SKIP_L1_USAGE_VALIDATION", "0"))), - input_queues_on_host=compiler_cfg.input_queues_on_host, - output_queues_on_host=compiler_cfg.output_queues_on_host, - default_dram_parameters=(not verify_cfg.enabled and (microbatch_size == 1)) if compiler_cfg.default_dram_parameters is None else compiler_cfg.default_dram_parameters, - op_names_to_epoch_break=placer_config_update.op_names_to_epoch_break, - op_names_to_chip_break=placer_config_update.op_names_to_chip_break, - op_overrides=compiler_cfg.balancer_op_overrides, - op_names_to_chip_id_assignment=placer_config_update.op_to_chip_id_assignment, - op_name_to_placer_overrides=op_name_to_placer_overrides, - enable_auto_transposing_placement = compiler_cfg.enable_auto_transposing_placement, - graph_solver_self_cut_type = pybalancer.graph_solver_self_cut_type_from_string(os.environ.get("PYBUDA_GRAPHSOLVER_SELF_CUT_TYPE", compiler_cfg.graph_solver_self_cut_type)), - use_interactive_placer = use_interactive_placer, - enable_enumerate_u_kt = compiler_cfg.enable_enumerate_u_kt, - enable_single_buffer_fallback = compiler_cfg.enable_single_buffer_fallback, - ) - should_early_stop_compilation = check_for_compilation_early_stop(compiler_cfg.compile_depth, CompileDepth.BALANCER_PASS) - try: - balancer_solution, had_balancer_attempts = run_placer_buda_passes(lowered_graph, balancer_config, fracture_chip_id_assignments, compiler_cfg.paddings) - except UnsupportedHWOpsError as e: - logger.warning("Found unsupported HW ops, stopping compilation early:\n{}", e) - 
assert not bool(int(os.environ.get("PYBUDA_ASSERT_UNSUPPORTED_HW_OP", "0"))) - return generate_compile_results( - verify_cfg, - initial_graph_copy, outputs, - intermediate_tensors, - lowered_graph, - ) - - placer_solution = balancer_solution.placer_solution - pass_specific_output_kwargs["placer_solution"] = placer_solution - pass_specific_output_kwargs["output_host_tms"] = balancer_solution.output_host_tms - - if had_balancer_attempts: - dump_graph(lowered_graph, graph_name, "post_balancer_error") - if verify_cfg.verify_all or (verify_cfg.verify_last and should_early_stop_compilation): - pass - #do_verify("post_balancer_error", compiler_cfg.enable_training, lowered_graph, inputs, parameter_dict, input_grads, outputs, dev, intermediate_tensors, verify_cfg, True, buda_losses, targets=buda_targets) - elif compiler_cfg.enable_training: - calculate_grads(outputs, dev, intermediate_tensors, True, losses) - - # Golden back-end requires input and output queues to be large enough to store all inputs/outputs - io_queue_multiplier = microbatch_count if dev.devtype == BackendType.Golden else 2 # double-buffer on silicon - input_queue_multiplier = io_queue_multiplier - output_queue_multiplier = io_queue_multiplier - - # For training, queues used across fwd & bwd must be large enough to store all microbatches - if compiler_cfg.enable_training and microbatch_count > 1: - input_queue_multiplier = max(input_queue_multiplier, microbatch_count) - # output_queue_multiplier = max(output_queue_multiplier, microbatch_count) - - cross_chip_buffering = dev.arch == BackendDevice.Grayskull or bool( - int(os.environ.get("PYBUDA_WORMHOLE_PIPELINED_PLACER", "0")) - ) - - post_placer_config = PostPlacerConfig( - device_config=device_cfg, - input_queue_multiplier=input_queue_multiplier, - output_queue_multiplier=output_queue_multiplier, - microbatch_size=microbatch_size, - microbatch_count=microbatch_count, - enable_t_streaming=balancer_config.enable_t_streaming, - input_queues_on_host=balancer_config.input_queues_on_host, - output_queues_on_host=balancer_config.output_queues_on_host, - manual_dram_queue_placement=compiler_cfg.manual_dram_queue_placement, - fork_join_tiles_treshold=balancer_config.fork_join_tiles_treshold, - enable_cross_chip_buffering=cross_chip_buffering, - placement_algorithm=compiler_cfg.dram_placement_algorithm) - - allocated_blocks = dev.allocated_blocks - current_host_address = dev.current_host_address - post_placer_results = run_post_placer_buda_passes(lowered_graph, graph_name, device_cfg, placer_solution, post_placer_config, balancer_solution, instructions, allocated_blocks, current_host_address) - dump_graph(lowered_graph, graph_name, "post_placer", placer_solution, balancer_solution) - - # placer_done = len(post_placer_results.ins_instructions) == len(instructions) # no new instructions - placer_done, _, _ = is_subset_of_instructions(post_placer_results.ins_instructions, instructions) - - if not placer_done: - placer_loop_count += 1 - logger.debug(f"Previous instructions: {len(instructions)}, new instructions: {len(post_placer_results.ins_instructions)}") - logger.info(f"Placer failed, retrying loop count {placer_loop_count}") - assert placer_loop_count < 20, " 20 loops of placer failed - aborting compile" - - if bool(int(os.environ.get("PYBUDA_REPRODUCE_SUBGRAPH", "0"))): - intermediates = get_intermediate_tensors(lowered_graph, inputs, parameter_dict, dev, True) - assert len(outputs) == 1, "Only single output supported for cut_graph" - golden_output = 
intermediates[os.environ.get("PYBUDA_REPRODUCE_SUBGRAPH_OUTPUT")] - verify_cfg.override_module_outptus = [golden_output] - else: - intermediates = {} - run_pre_netlist_generation_buda_passes(lowered_graph, graph_name, device_cfg, intermediates, placer_solution, post_placer_config, balancer_solution, post_placer_results.allocated_blocks, post_placer_results.current_host_address) - dump_graph(lowered_graph, graph_name, "pre_netlist") - - verify_cfg.dump_tensors_path = ci.get_netlist_dir() if ci.capture_tensors() else "" - if verify_cfg.verify_all or verify_cfg.verify_post_placer or (verify_cfg.verify_last and should_early_stop_compilation) or verify_cfg.dump_tensors_path: - do_verify("post_placer", compiler_cfg.enable_training, lowered_graph, inputs, parameter_dict, input_grads, outputs, dev, intermediate_tensors, verify_cfg, True, buda_losses, balancer_solution=balancer_solution, targets=buda_targets) - elif compiler_cfg.enable_training: - calculate_grads(outputs, dev, intermediate_tensors, True, losses) - - if should_early_stop_compilation: - return generate_compile_results( - verify_cfg, - initial_graph_copy, outputs, - intermediate_tensors, - lowered_graph, - pass_specific_output_kwargs = pass_specific_output_kwargs - ) - - pass_specific_output_kwargs["consteval_trace"] = pygraph.record_consteval_operations(lowered_graph) - - logger.info("Generating Netlist") - net : BudaNetlist = lower_to_buda_netlist(lowered_graph, graph_name, placer_solution, balancer_solution, device_cfg.chip_ids, device_cfg, compiler_cfg.enable_forked_dram_inputs) - dev.compiled_netlists.append(net) - - dump_epoch_type_graphs(lowered_graph, graph_name, "post_placer", placer_solution, balancer_solution) - dump_epoch_id_graphs(lowered_graph, graph_name, "post_placer", placer_solution, balancer_solution) - - netlist_filename = ci.write_netlist_and_buda_envs_config(net, graph_name, backend_output_directory) - - netlist_override = os.environ.get("PYBUDA_NETLIST_OVERRIDE", None) - if netlist_override is not None: - logger.info("PYBUDA_NETLIST_OVERRIDE={}", netlist_override) - netlist_filename = netlist_override - - postfix = os.environ.get("PYBUDA_REPORTIFY_POSTFIX", "") - if len(postfix) > 0: - postfix = "." 
+ postfix - net2placement(graph_name + postfix, netlist_filename, device_yaml=device_cfg.device_yaml) - if "PYBUDA_GENERATE_OVERRIDE_CONFIG" in os.environ: - generate_override_config(lowered_graph, balancer_solution, placer_solution, post_placer_results.nop_instructions, graph_name) - - if verify_cfg.run_net2pipe or bool(int(os.environ.get("PYBUDA_VERIFY_NET2PIPE", "0"))): - verify_net2pipe(netlist_filename, device_cfg.device_yaml, device_cfg.cluster_config_yaml) - - should_early_stop_compilation = check_for_compilation_early_stop(compiler_cfg.compile_depth, CompileDepth.GENERATE_NETLIST) - if should_early_stop_compilation: - return generate_compile_results( - verify_cfg, - initial_graph_copy, outputs, - intermediate_tensors, - lowered_graph, - netlist_filename, - post_placer_results.perf_model_results, - ) + # Record calculated input grads from the previous do_verify call and save so that we don't keep re-calculating and + # accumulating on each verification call + context.input_grads = [i.value().grad for i in context.inputs if i.value().requires_grad and i.value().grad is not None] - compile_results = generate_compile_results( - verify_cfg, - initial_graph_copy, outputs, - intermediate_tensors, - lowered_graph, - netlist_filename, - post_placer_results.perf_model_results, - pass_specific_output_kwargs = pass_specific_output_kwargs - ) + return CompileDepth.POST_AUTOGRAD_PASS - # Verify on backend golden - if verify_cfg.run_golden: - verify_golden(netlist_filename, compiler_cfg.enable_training, compile_results, dev, inputs, outputs, verify_cfg) - - should_early_stop_compilation = check_for_compilation_early_stop(compiler_cfg.compile_depth, CompileDepth.BACKEND_GOLDEN_VERIFY) - if should_early_stop_compilation: - return compile_results +def run_post_autograd_pass(context: CompileContext) -> CompileDepth: + """ + Runs post autograd passes. - return compile_results + Parameters + ---------- + context: CompileContext + Compile context + + Returns + ------- + CompileDepth - next compile stage + """ + compiler_cfg = context.compiler_cfg + graph_name = context.graph_name + graph, intermediate_tensors, losses, outputs = context.graph, context.intermediate_tensors, context.losses, context.outputs + + inserted_node_id_mapping = run_post_autograd_graph_passes(graph, compiler_cfg) + for inserted_node_id, original_node_id in inserted_node_id_mapping: + if original_node_id in intermediate_tensors: + intermediate_tensors[inserted_node_id] = intermediate_tensors[original_node_id] + dump_graph(graph, graph_name, "post_autograd_passes") + # TODO: training is dependent on TTDevice.py which is removed + if compiler_cfg.enable_training: + calculate_grads(outputs, dev, intermediate_tensors, False, losses) -def check_for_compilation_early_stop(desired_depth, current_depth): + return CompileDepth.PRE_LOWERING_PASS + +def run_pre_lowering_pass(context: CompileContext) -> CompileDepth: """ - Determines should current compilation process stop or not based on desired - and current phase of execution. + Runs pre lowering passes. Parameters ---------- - desired_depth: CompileDepth - Desired phase for compiler early stopping. - - current_depth: CompileDepth - Current phase for compiler early stopping. 
+ context: CompileContext + Compile context Returns ------- - Boolean + CompileDepth - next compile stage """ - # update global compile stage variable - global LAST_SUCCESSFUL_STAGE - LAST_SUCCESSFUL_STAGE = str(CompileDepth(current_depth.value-1).name) + compiler_cfg = context.compiler_cfg + graph_name = context.graph_name + graph = context.graph - if not CompileDepth.has_value(desired_depth.value): - raise Exception("Invalid compilation depth flag: {}".format(desired_depth.name)) + graph = run_pre_lowering_passes(graph) + dump_graph(graph, graph_name, "pre_lowering") - if desired_depth == current_depth: - logger.info("Compilation early stopping after {}".format(current_depth.name)) + context.final_graph = graph + return CompileDepth.RUN_MLIR_COMPILER - return True +def run_mlir_compiler(context: CompileContext) -> CompileDepth: + graph = context.graph - return False + binary = pybuda._C.run_mlir_compiler(graph) + context.output_kwargs["binary"] = binary -def placer_breaks_eval(value): - if type(value) is query.NodePredicateBuilder: - return value.eval() - elif type(value) is list: - return [placer_breaks_eval(v) for v in value] - else: - assert type(value) is str - return value + return CompileDepth.FINISH_COMPILE -def placer_op_overrides_eval(value): - assert type(value) is tuple - if type(value[0]) is query.NodePredicateBuilder: - return (value[0].eval(), value[1]) - else: - return value -def validate_override_names(graph, compiler_cfg): +def finish_compile(context: CompileContext) -> CompileDepth: """ - Checks whether name in per_op overrides uses depracated naming scheme and warns user. + Runs backend golden verify. Parameters ---------- - graph: Graph - PyBuda Graph - - compiler_cfg: CompilerCfg - Compiler configuration options + context: CompileContext + Compile context Returns ------- - None + CompileDepth - next compile stage """ - from pybuda.op.common import depracated_name_dict - - keys = list(compiler_cfg.balancer_op_overrides.keys()) - keys.extend([key[0] for key in compiler_cfg.op_names_to_epoch_break if type(key) is list and type(key[0]) is str]) - keys.extend([key[0] for key in compiler_cfg.op_names_to_chip_break if type(key) is list and type(key[0]) is str]) - for key in keys: - for depracated_name in depracated_name_dict.keys(): - if key.startswith(depracated_name): - new_name = key.replace(depracated_name, depracated_name_dict[depracated_name]) - if key in compiler_cfg.balancer_op_overrides: - compiler_cfg.balancer_op_overrides[new_name] = compiler_cfg.balancer_op_overrides.pop(key) - elif [key] in compiler_cfg.op_names_to_epoch_break: - compiler_cfg.op_names_to_epoch_break.remove([key]) - compiler_cfg.op_names_to_epoch_break.append([new_name]) - elif [key] in compiler_cfg.op_names_to_chip_break: - compiler_cfg.op_names_to_chip_break.remove([key]) - compiler_cfg.op_names_to_chip_break.append([new_name]) - - logger.warning("Using depracated node name: {}, being replaced by: {}. Please update your test files. 
", key, new_name) + verify_cfg = context.verify_cfg + context.output_kwargs["consteval_trace"] = pygraph.record_consteval_operations(context.final_graph) + return CompileDepth.FULL -def generate_compile_results( - verify_cfg = None, - initial_graph = None, - outputs = None, - intermediate_tensors = None, - lowered_graph = None, - netlist_filename = None, - perf_model_results = None, - *, - pass_specific_output_kwargs = None, -): +def generate_graph( + modules, + *inputs: Tensor, + target_tensors: List[Tensor] = [], + return_intermediate: bool = False, + graph_name: str = "default_graph", + compiler_cfg: Optional[CompilerConfig] = None, + trace_only: bool = False) -> Tuple[Graph, Tuple[Tensor, ...], Dict[str, Tensor], Tuple[Tensor, ...], Optional[Tensor]]: """ - Wrapper for generating result from the graph compiler. Contains initial and final graphs, output tensors, - and, optionally golden results for final output and intermediates, if desired. + Generate a buda graph from the passed modules, and return the graph and output tensors. + If input tensors have a value set, the output tensor will also have the calculated output value + set. Parameters ---------- - verify_cfg: VerifyConfig - Value verification config + inputs: Tuple[Tensor, ....] + Input tensors - initial_graph: Graph - Initial graph, immediately after conversion from the input framework + target_tensors: List[Tensor] + Target inputs. Optional, if trace_only is set. Otherwise, value must be provided. - outputs: Tuple[Tensor, ...] - Output tensors + return_intermediate: bool + Optional. If set, a dictionary of node IDs -> tensors will be return with intermediate values, for data mismatch debug. - intermediate_tensors: Dict[str, Tensor] - Intermediated tensors - - lowered_graph: Graph - Buda graph - - netlist_filename: str - Netlist file name + trace_only: bool + If set, the graph is made for a quick trace only and shouldn't have side-effects Returns ------- - CompileResults + Graph, Tuple[Tensor, ...], Dict[str, Tensor], Tuple[Tensor, ...], Optional[Tensor] + Buda graph, outputs, optional intermediates, original inputs, target tensor """ - ret = CompileResults() - ret.initial_graph = initial_graph - ret.outputs = outputs - if verify_cfg and verify_cfg.intermediates: - ret.golden_intermediates = { - initial_graph.get_node_name(node_id): tensor - for node_id, tensor in intermediate_tensors.items() if initial_graph.has_node_with_id(node_id) - } - ret.lowered_graph = lowered_graph - ret.netlist_filename = netlist_filename - ret.perf_model_results = perf_model_results + ''' + TODO: This function was copied over from ttdevice.py with some modifications. 
Probably needs to be refactored (possibly moved to cpp) + ''' - if outputs: - ret.golden_outputs = [out.value() if out.has_value() else None for out in outputs] + from .pybudaglobal import start_tracing, stop_tracing + from pybuda.tvm_utils import flatten_inputs + from collections import deque + import inspect - if pass_specific_output_kwargs: - ret.pass_specific_output_kwargs = pass_specific_output_kwargs + from pybuda._C.graph import create_output, create_parameter_input, create_data_edge, create_activation_input, create_constant_input, create_op_node, create_target_input + + output_to_module_name_prefix = {} + output_to_subgraph_index = {} + + # Create the graph + graph = Graph(graph_name) + graph.set_microbatch(1) + + if compiler_cfg is None: + compiler_cfg = _get_global_compiler_config() + + graph.set_enable_training(compiler_cfg.enable_training) + + # Trace through the modules + all_subgraph_outputs = [] + outputs = inputs + for idx, module in enumerate(modules): + + assert isinstance(module, PyBudaModule), "This function only supports PyBudaModule instances" + + if compiler_cfg.compile_subgraphs: + outputs = inputs[idx] + + start_tracing() + outputs = module.forward(*outputs) + stop_tracing() + if isinstance(outputs, Tensor): + outputs = (outputs,) # Force a tuple + + for output in outputs: + output_to_module_name_prefix[output] = module.get_name() + if compiler_cfg.compile_subgraphs: + assert output not in output_to_subgraph_index, "Output tensor {} is produced by multiple modules".format(output) + + output_to_subgraph_index[output] = module.subgraph_idx + + all_subgraph_outputs += outputs + + if trace_only: + return graph, all_subgraph_outputs, {}, inputs, target_tensors + + visited_tensors = {} + pending_tensors = deque() + intermediate = {} + module_input_tensor_to_node: Dict[str, Tensor] = {} + module_output_tensor_to_node: Dict[str, Tensor] = {} + module_target_tensor_to_node: Dict[str, Tensor] = {} + module_loopback_tensor_to_node: Dict[str, Tensor] = {} + passthroughs: Set = set() + + input_node_names = [] + input_names_known = True + if isinstance(inputs[0], Tensor): + inputs = (inputs,) + for index, (module, submodule_input) in enumerate(zip(modules, inputs)): + submodule_input_node_names = list(inspect.signature(super(PyBudaModule, module).__getattribute__("forward")).parameters.keys()) + if len(modules) > 1: + submodule_input_node_names = [f"{input_name}_{index}" for input_name in submodule_input_node_names] + input_node_names += submodule_input_node_names + if len(submodule_input_node_names) != len(submodule_input): + input_names_known = False + inputs, _, _ = flatten_inputs(inputs) + + for out in all_subgraph_outputs: + is_loss_output = False # self.loss_module is not None + if out.src_op is None: + + # No source op. 
It could be a pass-through, so let's compare to inputs + found = False + for input in inputs: + if input == out: + # Found a passthrough + outq = create_output(graph, + output_to_module_name_prefix.get(out, "") + f".output_passthrough_{len(passthroughs)}", + out.shape.get_pytorch_shape(), + out.data_format, + is_loss_output, + output_to_subgraph_index.get(out, 0)) + passthroughs.add(input) + found = True + break + + if not found: + raise RuntimeError("Untraced output tensor encountered") + + else: + outq = create_output(graph, + output_to_module_name_prefix.get(out, "") + ".output_" + out.src_op.name, + out.shape.get_pytorch_shape(), + out.data_format, + is_loss_output, + output_to_subgraph_index.get(out, 0)) + module_output_tensor_to_node[out] = outq + pending_tensors.append( (out, outq, 0, [], output_to_subgraph_index.get(out, 0)) ) + + recorded_parameters = {} + + while pending_tensors: + + tensor, output, port_index, operand_broadcast, subgraph_idx = pending_tensors.popleft() + + if tensor in visited_tensors: + # Already created the note - let's add the edge and move on + create_data_edge(graph, visited_tensors[tensor], 0, output, port_index, operand_broadcast) + continue + + if isinstance(tensor, int): + # integer constant. Don't add to visited tensors. + assert False # not supported any more + + if isinstance(tensor, Parameter): + # parameter tensor + if tensor.get_name() is not None: + name = tensor.get_name() + else: + name = "parameter_" + graph.get_node_name(output) + + if name in recorded_parameters: + # Multiple subgraphs might use the same parameter. If it is used in the same subgraph, + # we should have already found it in the visited_tensors dictionary. Putting an assert here + # to catch fallouts. + assert graph.get_subgraph_id_for_node(recorded_parameters[name]) != subgraph_idx, \ + "Trying to add parameter with name: {} that is used in the same subgraph".format(name) + create_data_edge(graph, recorded_parameters[name], 0, output, port_index, operand_broadcast) + continue + + inq = create_parameter_input( + graph, + name, + tensor.shape.get_pytorch_shape(), + tensor.requires_grad, + tensor.data_format, + subgraph_idx) + create_data_edge(graph, inq, 0, output, port_index, operand_broadcast) + visited_tensors[tensor] = inq + recorded_parameters[name] = inq + continue + + if tensor.src_op is None: + input_name = input_node_names[inputs.index(tensor)] if input_names_known and tensor in inputs else "input_" + str(port_index) + "_" + graph.get_node_name(output) + if tensor in passthroughs: + # passthrough input->output, add a nop + inq = create_activation_input( + graph, + input_name, + tensor.shape.get_pytorch_shape(), + tensor.requires_grad, + tensor.data_format, + subgraph_idx) + + nop = create_op_node(graph, f"_passthrough_nop_{output}", + OpType("nop"), tensor.shape.get_pytorch_shape(), tensor.data_format, subgraph_idx, {}) + + create_data_edge(graph, inq, 0, nop, 0, operand_broadcast) + create_data_edge(graph, nop, 0, output, 0, operand_broadcast) + visited_tensors[tensor] = inq + module_input_tensor_to_node[tensor] = inq + continue + + elif tensor in target_tensors: + # Target input + inq = create_target_input( + graph, + input_name, + tensor.shape.get_pytorch_shape(), + tensor.requires_grad, + tensor.data_format, + subgraph_idx) + create_data_edge(graph, inq, 0, output, port_index, operand_broadcast) + visited_tensors[tensor] = inq + module_target_tensor_to_node[tensor] = inq + continue + + elif tensor.is_constant(): + # Target input + inq = create_constant_input( + 
graph, + input_name, + tensor.value(), + tensor.shape.get_pytorch_shape(), + tensor.data_format, + subgraph_idx) + create_data_edge(graph, inq, 0, output, port_index, operand_broadcast) + visited_tensors[tensor] = inq + module_target_tensor_to_node[tensor] = inq + continue + + else: + # input tensor + input_creator = create_activation_input if input_name not in compiler_cfg.loopback_outputs else create_parameter_input + + if input_name in compiler_cfg.loopback_outputs: + module.add_parameter(input_name, Parameter(tensor.value(), requires_grad=True, name=input_name)) + + inq = input_creator( + graph, + input_name, + tensor.shape.get_pytorch_shape(), + tensor.requires_grad, + tensor.data_format, + subgraph_idx) + create_data_edge(graph, inq, 0, output, port_index, operand_broadcast) + visited_tensors[tensor] = inq + if input_name not in compiler_cfg.loopback_outputs: + module_input_tensor_to_node[tensor] = inq + elif input_name in compiler_cfg.loopback_outputs: + module_loopback_tensor_to_node[tensor] = inq + recorded_parameters[input_name] = inq + continue + + elif tensor.src_op.op_type == "constant": + constant_value = tensor.src_op.attrs[0] + constant = create_constant_input( + graph, + "constant_" + str(port_index) + "_" + graph.get_node_name(output), + constant_value, + tensor.data_format, + subgraph_idx) + + create_data_edge(graph, constant, 0, output, port_index, operand_broadcast) + visited_tensors[tensor] = constant + continue + + ''' + print("ttdevice.py, create_op_node") + print(f"graph type: {type(graph)}") + print(f"src_op name: {tensor.src_op.name}") + print(f"src_op op_type: {tensor.src_op.op_type}") + print(f"src_op attrs: {tensor.src_op.attrs}") + print(f"shape: {tensor.shape.get_pytorch_shape()}") + print(f"data format: {tensor.data_format}") + ''' + + tags = {} + if tensor.src_layer is not None: + tags["layer"] = tensor.src_layer + op = create_op_node(graph, tensor.src_op.name, tensor.src_op.cpp_op_type, tensor.shape.get_pytorch_shape(), tensor.data_format, subgraph_idx, tags) + + visited_tensors[tensor] = op + if return_intermediate and tensor.has_value(): + intermediate[op] = tensor.value() + + create_data_edge(graph, op, 0, output, port_index, operand_broadcast) + + for i, t in enumerate(tensor.src_op.operands): + pending_tensors.append( (t, op, i, tensor.src_op.operand_broadcast, subgraph_idx) ) + + # Register input/output order of the module to the graph now that the nodes are created + module_inputs = [module_input_tensor_to_node[input_tensor] for input_tensor in inputs if input_tensor in module_input_tensor_to_node] + module_outputs = [module_output_tensor_to_node[output_tensor] for output_tensor in all_subgraph_outputs if output_tensor in module_output_tensor_to_node] + module_targets = [module_target_tensor_to_node[target_tensor] for target_tensor in target_tensors] + out_requires_grad = [output_tensor.requires_grad for output_tensor in all_subgraph_outputs if output_tensor in module_output_tensor_to_node] + + # Remove unused inputs from list of module inputs + inputs = [input_tensor for input_tensor in inputs if input_tensor in module_input_tensor_to_node or input_tensor in module_output_tensor_to_node] + + # Remove loopback inputs from list of module inputs + inputs = [input_tensor for input_tensor in inputs if input_tensor not in module_loopback_tensor_to_node] + + if len(compiler_cfg.loopback_outputs): + output_to_remove = [] + out_requires_grad_to_remove = [] + for input_name, output_indices in compiler_cfg.loopback_outputs.items(): + if 
isinstance(output_indices, int): + output_indices = [output_indices] + for output_index in output_indices: + input_id = graph.get_node_id(input_name) + output_id = module_outputs[output_index] + add_partial_datacopy_edge(graph, output_id, 0, input_id, 0) + output_to_remove.append(module_outputs[output_index]) + out_requires_grad_to_remove.append(out_requires_grad[output_index]) + [module_outputs.remove(value) for value in output_to_remove] + [out_requires_grad.remove(value) for value in out_requires_grad_to_remove] + + graph.register_module_inputs(module_inputs) + graph.register_module_targets(module_targets) + graph.register_module_outputs(module_outputs, out_requires_grad) + + if return_intermediate: + return graph, outputs, intermediate, inputs, target_tensors + + return graph, outputs, {}, inputs, target_tensors - return ret diff --git a/pybuda/pybuda/compiled_graph_state.py b/pybuda/pybuda/compiled_graph_state.py index 3056a1258..18ef7f76c 100644 --- a/pybuda/pybuda/compiled_graph_state.py +++ b/pybuda/pybuda/compiled_graph_state.py @@ -3,27 +3,21 @@ # SPDX-License-Identifier: Apache-2.0 from typing import Dict, List, Any, Tuple, Optional -from dataclasses import dataclass, field -from enum import Enum -import inspect -import os -import json - -import importlib -from pybuda.ttdevice import TTDevice +from loguru import logger -from pybuda.compile import CompileResults +from dataclasses import dataclass, field +from dataclasses_json import dataclass_json, config from pybuda._C import DataFormat -from pybuda._C.graph import Graph, get_constant_input_value, get_optimizer_param_info, RuntimeTensorTransform, RuntimeTensorTransformType, Shape -from pybuda._C.balancer import OutputHostTM +from pybuda._C.graph import Graph, RuntimeTensorTransform +from pybuda._C.runtime import run_binary, Binary +from pybuda.utils import list_as_json +from pybuda.tensor import Tensor, get_post_const_eval_tensors +from pybuda.module import Module -import dataclasses -from dataclasses_json import dataclass_json, config -from pybuda.utils import as_json, dict_as_json, list_as_json, detach_tensors -from pybuda.tensor import get_device_constant_and_parameters, get_post_const_eval_tensors import torch + def no_encoding(obj): return obj # perform json-encoding later def no_decoding(obj): @@ -33,6 +27,19 @@ def optional_no_encoding(obj): def optional_no_decoding(obj): return None if obj is None else obj +class CompileResults: + """ + Wrapper for result from the graph compiler. Contains initial and final graphs, output tensors, + and, optionally golden results for final output and intermediates, if desired. 
+ """ + outputs: List[Tensor] + golden_outputs: List[torch.Tensor] + golden_intermediates: Dict[str, torch.Tensor] + initial_graph: Graph + final_graph: Graph + + pass_specific_output_kwargs: Dict[str, Any] = {} + @dataclass_json @dataclass() class CompiledGraphState: @@ -74,8 +81,6 @@ class CompiledGraphState: ordered_intermediate_shapes: List[List[int]] ordered_output_data_formats: List[DataFormat] = field(metadata=list_as_json(DataFormat)) - netlist_filename: str - output_host_tms: Dict[str, OutputHostTM] = field(metadata=dict_as_json(OutputHostTM)) consteval_trace: Dict[str, Dict[str, Any]] post_const_eval_constants: Dict[str, torch.Tensor] = field( metadata=config( # For serialization of CompiledGraphState cls @@ -100,9 +105,11 @@ class CompiledGraphState: default=None ) + has_cache_buffers: bool = False + @staticmethod - def from_compiled_graph(device: "TTDevice", compile_results: CompileResults) -> "CompiledGraphState": - graph = compile_results.lowered_graph + def from_compiled_graph(module: Module, compile_results: CompileResults) -> "CompiledGraphState": + graph = compile_results.final_graph ordered_input_names = graph.get_ordered_input_names() ordered_output_names = graph.get_ordered_output_names() ordered_input_gradient_names = graph.get_ordered_input_gradient_names() @@ -158,26 +165,35 @@ def from_compiled_graph(device: "TTDevice", compile_results: CompileResults) -> for name, tensor in graph.get_constant_input_runtime_tensor_transform_constants(): constant_to_tensor[name] = tensor + # TODO: will be needed for training optimizer_param_info = {} - for param_name, opt_params in device.get_optimizer_params(is_buda=True).items(): - optimizer_param_info[param_name] = [] - for input_node, param_key in get_optimizer_param_info(graph, param_name): - optimizer_param_info[param_name].append((input_node.name, param_key)) consteval_trace = compile_results.pass_specific_output_kwargs["consteval_trace"] - device_inputs = get_device_constant_and_parameters( - device, constant_to_tensor=constant_to_tensor - ) + has_cache_buffers = False + + if isinstance(module, Module): + for p in module.get_parameters(): + value = p.value(is_buda=False) + if value == None: + raise ValueError(f"Parameter {p.get_name()} has no value") + constant_to_tensor[p.get_name()] = p.value(is_buda=False) + elif isinstance(module, torch.fx.GraphModule): + for name, value in module.named_parameters(): + constant_to_tensor[name] = value + + post_const_eval_constants = {} post_const_eval_constants: Dict[str, torch.Tensor] = get_post_const_eval_tensors( graph, - device_inputs, + constant_to_tensor, consteval_trace, constant_to_tile_dims, ordered_constant_node_names ) + + post_const_eval_parameters = {} post_const_eval_parameters: Dict[str, torch.Tensor] = get_post_const_eval_tensors( graph, - device_inputs, + constant_to_tensor, consteval_trace, parameter_to_tile_dims, ordered_parameter_node_names @@ -206,8 +222,6 @@ def from_compiled_graph(device: "TTDevice", compile_results: CompileResults) -> ordered_target_shapes=ordered_target_shapes, ordered_intermediate_shapes=ordered_intermediate_shapes, ordered_output_data_formats=ordered_output_data_formats, - netlist_filename=compile_results.netlist_filename, - output_host_tms=compile_results.pass_specific_output_kwargs["output_host_tms"], consteval_trace=consteval_trace, optimizer_param_info=optimizer_param_info, ordered_input_subgraph_indices=ordered_input_subgraph_indices, @@ -218,6 +232,7 @@ def from_compiled_graph(device: "TTDevice", compile_results: CompileResults) -> 
constant_to_tile_dims=constant_to_tile_dims, post_const_eval_constants=post_const_eval_constants, post_const_eval_parameters=post_const_eval_parameters, + has_cache_buffers=has_cache_buffers, ) def get_tensor(self, name_to_tensor, name): @@ -237,4 +252,54 @@ def get_constant_tensor(self, name): return self.get_tensor(self.post_const_eval_constants, name) def get_parameter_tensor(self, name): - return self.get_tensor(self.post_const_eval_parameters, name) \ No newline at end of file + return self.get_tensor(self.post_const_eval_parameters, name) + + def get_ordered_input_names_for_subgraph(self, subgraph_idx): + return [name for i, name in enumerate(self.ordered_input_names) if self.ordered_input_subgraph_indices[i] == subgraph_idx] + + def get_ordered_input_shapes_for_subgraph(self, subgraph_idx): + return [shape for i, shape in enumerate(self.ordered_input_shapes) if self.ordered_input_subgraph_indices[i] == subgraph_idx] + + def get_ordered_input_runtime_transforms_for_subgraph(self, subgraph_idx): + return [transform for i, transform in enumerate(self.ordered_input_runtime_tensor_transforms) if self.ordered_input_subgraph_indices[i] == subgraph_idx] + + def get_ordered_input_tile_broadcast_dims_for_subgraph(self, subgraph_idx): + return [tile_dims for i, tile_dims in enumerate(self.ordered_input_tile_broadcast_dims) if self.ordered_input_subgraph_indices[i] == subgraph_idx] + + def get_ordered_output_names_for_subgraph(self, subgraph_idx): + return [name for i, name in enumerate(self.ordered_output_names) if self.ordered_output_subgraph_indices[i] == subgraph_idx] + + def get_ordered_output_shapes_for_subgraph(self, subgraph_idx): + return [shape for i, shape in enumerate(self.ordered_output_shapes) if self.ordered_output_subgraph_indices[i] == subgraph_idx] + + def get_ordered_output_runtime_transforms_for_subgraph(self, subgraph_idx): + return [transform for i, transform in enumerate(self.ordered_output_runtime_tensor_transforms) if self.ordered_output_subgraph_indices[i] == subgraph_idx] + +class CompiledModel: + """ + Callable object for running inference on the compiled model. + """ + compiled_graph_state: CompiledGraphState + binary: Binary + + def __init__(self, compiled_graph_state: CompiledGraphState, binary: Binary): + self.compiled_graph_state = compiled_graph_state + self.binary = binary + + def __call__(self, *inputs: torch.Tensor) -> List[torch.Tensor]: + """ + Run inference on the compiled model. + + Parameters + ---------- + inputs: Tuple[Tensor, ...] 
+ Input tensors + + Returns + ------- + List[Tensor] + Output tensors + """ + logger.info(f"Running model {self.compiled_graph_state.graph_name} on device...") + return run_binary(self.binary, 0, [*inputs]) + diff --git a/pybuda/pybuda/config.py b/pybuda/pybuda/config.py index e06daf4c0..8ac9077f7 100644 --- a/pybuda/pybuda/config.py +++ b/pybuda/pybuda/config.py @@ -8,30 +8,27 @@ from typing import Tuple, Dict, List, Optional, Union, Set from collections.abc import Iterable from dataclasses import dataclass, field -from pybuda._C import DataFormat, MathFidelity, NopInsertionInstruction, AMPNodeProperties, DramQueueConfigOverride -import pybuda._C.balancer as pybalancer -import pybuda._C.placer as pyplacer -from pybuda._C.backend_api import DeviceMode +from pybuda._C import DataFormat, MathFidelity, AMPNodeProperties import pybuda.query as query from dataclasses_json import dataclass_json, config -from pybuda.utils import as_json, dict_as_json, list_as_json, optional_as_json, resolve_output_build_directory +from pybuda.utils import as_json, dict_as_json, list_as_json, optional_as_json, resolve_output_build_directory, resolve_device_descriptor_path from loguru import logger - - class CompileDepth(Enum): - FULL = 0 # Full compilation - START_COMPILE = 1 # Start compilation - GENERATE_INITIAL_GRAPH = 2 # Finish compilation after generating an initial graph - POST_INITIAL_GRAPH_PASS = 3 # Finish compilation after post initial graph pass - PRE_LOWERING_PASS = 4 # Finish compilation after pre lowering pass - BUDA_GRAPH_PRE_PLACER = 5 # Finish compilation after lowering to Buda pre-placer pass - BALANCER_PASS = 6 # Finish compilation after lowering to Buda balancer pass - GENERATE_NETLIST = 7 # Finish compilation after generating a netlist - POST_PATTERN_MATCHER = 8 # Finish compilation after running subgraph pattern_matcher - BACKEND_GOLDEN_VERIFY = 9 # Finish compilation after backend golder verification during compile stage + INIT_COMPILE = 0 + GENERATE_INITIAL_GRAPH = 1 + POST_INITIAL_GRAPH_PASS = 2 + CONSTEVAL_GRAPH = 3 + POST_PATTERN_MATCHER = 4 + OPTIMIZED_GRAPH = 5 + AUTOGRAD = 6 + POST_AUTOGRAD_PASS = 7 + PRE_LOWERING_PASS = 8 + RUN_MLIR_COMPILER = 9 + FINISH_COMPILE = 10 + FULL = 11 @classmethod def has_value(cls, value): @@ -74,45 +71,6 @@ def get_backend_cfg_string(self) -> str: raise RuntimeError("Unsupported level") -class PlacerOpOverridesAsJson: - @classmethod - def to_json(cls, value): - assert type(value) is tuple - if type(value[0]) is str: - lhs = value[0] - else: - lhs = value[0].to_json() - return [lhs, value[1].to_json()] - - @classmethod - def from_json(cls, value): - assert type(value) is list - if type(value[0]) is str: - lhs = value[0] - else: - lhs = value[0].to_json() - return [lhs, pyplacer.OpOverride.from_json(value[1])] - -class PlacerBreaksAsJson: - @classmethod - def to_json(cls, value): - if type(value) is query.NodePredicateBuilder: - return value.to_json() - elif type(value) is list: - return [PlacerBreaksAsJson.to_json(v) for v in value] - else: - assert type(value) is str - return value - - @classmethod - def from_json(cls, value): - if type(value) is dict: - return query.NodePredicateBuilder.from_json(value) - elif type(value) is list: - return [PlacerBreaksAsJson.from_json(v) for v in value] - else: - assert type(value) is str - return value class TTIDumpFormat(Enum): @@ -159,16 +117,18 @@ class CompilerConfig: use_interactive_placer: bool = True # use interactive placer if chosen policy supports it enable_enumerate_u_kt: bool = True # Enable searching all 
possible matmul u_kts enable_link_past_cache_ios: bool = False # Enable auto detection and linking of past key-value pairs + enable_pt2_fx_graph_link: bool = False # Enable linking of past key-value pairs in the graph compile_depth: int = field(default=CompileDepth.FULL, metadata=as_json(CompileDepth)) # Defines compilation depth. Used to limit scope of some unit tests enable_tvm_cpu_fallback: bool = True # Create cpu device for unsupported pybuda ops - cpu_fallback_ops: Set[str] = field(default_factory=lambda: set(["embedding"])) # Which ops should we fall back on - enable_tm_cpu_fallback: bool = False # Extend CPU fallback for TM ops - tm_cpu_fallback_max_depth: int = 10 # Max search depth for extended CPU fallback + cpu_fallback_ops: Set[str] = field(default_factory=lambda: set(["embedding"])) # Types of ops to fall back to CPU for + enable_tm_cpu_fallback: bool = False # Extend CPU fallback for TM ops + tm_cpu_fallback_max_depth: int = 10 # Max search depth for extended CPU fallback enable_tvm_dropout: bool = False # (Temporary): Remove when buda supports dropout enable_tvm_unsupported_ops: bool = False# Create "unsupported" pybuda ops in python file, allowing user to modify later + enable_op_level_comparision: bool = False # Should we need to compare every op with framework output at each compilation stage. enable_tvm_constant_prop: bool = False # Should we constant prop in tvm convert_framework_params_to_tvm: bool = True # Convert framework params to relay params enable_xla_jax_convert: bool = False # Convert JAX model to TF through XLA @@ -183,7 +143,7 @@ class CompilerConfig: enable_conv_prestride: bool = True # Enables a transform for conv that directly reads input, such that it goes from stride > 1 to stride = 1 # This usually translates to lower DRAM BW and less math as the input better populates tiles - # More can be found here: tenstorrent/budabackend#957 + max_pool_add_sub_surround: bool = False # Add add op before, and subtract op after max_pool during the decomposition. The reason for # adding it is to tangle with negative values for max_pool, as current decomposition uses sparse # matmul which is padded with 0. Therefore, 0 will be maximum value when max_pool is run - which @@ -196,13 +156,9 @@ class CompilerConfig: enable_forked_dram_inputs = False # If true, enables forked_dram_inputs optimization chip_placement_policy: str = "MMIO_LAST" # how to order the given chip ids for placement - op_names_to_epoch_break: List[Union[query.NodePredicateBuilder, List[Union[str, query.NodePredicateBuilder]]]] = field(default_factory=list, metadata=list_as_json(PlacerBreaksAsJson)) # Each op in the list will be placed on a new epoch - op_names_to_chip_break: List[Union[query.NodePredicateBuilder, List[Union[str, query.NodePredicateBuilder]]]] = field(default_factory=list, metadata=list_as_json(PlacerBreaksAsJson)) # Each op in the list will be placed on a new chip op_names_dont_fuse: List[str] = field(default_factory=lambda: list()) # A list of ops to disable being fused op_names_manual_fuse: List[str] = field(default_factory=lambda: list()) # A list of ops to allow being fused, non specified ops will no longer participate in fusion - balancer_op_overrides: Dict[str, pybalancer.OpOverride] = field(default_factory=lambda: dict(), metadata=dict_as_json(pybalancer.OpOverride)) # User override of op balancer attributes (i.e. grid sizes) default_dram_parameters: Optional[bool] = None # If set to true/false, place parameters in dram by default i.e. 
prologue=False/True, if it's None we refer to microbatch-size to set prologue config - placer_op_overrides: List[Tuple[Union[str, query.NodePredicateBuilder], pyplacer.OpOverride]] = field(default_factory=list, metadata=list_as_json(PlacerOpOverridesAsJson)) default_df_override: Optional[DataFormat] = field(default=None, metadata=optional_as_json(DataFormat)) # Default override for all node data formats, None means automatically inferred default_accumulate_df: Optional[DataFormat] = field(default=None, metadata=optional_as_json(DataFormat)) # Accumulation format, for chips that support it enable_broadcast_splitting: bool = False # if true, large broadcasts will be split into multiple edges with nops between them @@ -214,13 +170,13 @@ class CompilerConfig: # 2: Matmuls inputs/outputs are set to BFP8; Fused ops, Softmax, LayerNorm ops are set to FP16; GELU is BFP8; # # Have in mind that in each AMP level, non-mentioned op types are left with default data format (usually set by user; i.e. FP32). + harvesting_mask: int = 0 # List of harvested rows (same across all chips) enable_auto_transposing_placement: bool = ("PYBUDA_ENABLE_AUTO_TRANSPOSE" in os.environ) # compiler automatically detects ops to transpose on placement when the flag is set fracture_groups: List[Tuple[List[Tuple[str, int, int]], List[str], List[int]]] = field(default_factory=lambda: list()) # see insert_fracture_group conv_multi_op_fracture_factor_override: Dict[str, int] = field(default_factory=lambda: dict()) # override multi op fracture factor for conv enable_stable_softmax: bool = True enable_single_buffer_fallback: bool = False - device_mode: DeviceMode = field(default=DeviceMode.CompileAndRun, metadata=as_json(DeviceMode)) backend_opt_level: int = 4 # backend optimization level backend_output_dir: str = field(default_factory=lambda: resolve_output_build_directory()) # backend compile and perf trace output directory backend_device_descriptor_path: str = "" @@ -230,19 +186,16 @@ class CompilerConfig: store_backend_db_to_yaml: bool = False # whether to dump the backend DB to a yaml file or not input_queues_on_host: bool = True # whether to place input queues on device output_queues_on_host: bool = True # whether to place output queues on device - manual_dram_queue_placement: Dict[str, DramQueueConfigOverride] = field(default_factory=lambda: dict(), metadata=dict_as_json(DramQueueConfigOverride)) # manual dram queue placements to target specific chip/dram chan - buffering_nops_to_insert: Dict[Tuple[str,str,int,int],NopInsertionInstruction] = field(default_factory=lambda: dict(), metadata=dict_as_json(NopInsertionInstruction)) insert_queues: List[Tuple[str, str, int]] = field(default_factory=lambda: list(), metadata=list_as_json(tuple)) # Insert queues between (producer_op_name, consumer_op_name, input_port_id) amp_properties: List[AMPNodeProperties] = field(default_factory=lambda: list(), metadata=list_as_json(AMPNodeProperties)) scheduler_constraints: List[List[str]] = field(default_factory=lambda: list()) paddings: Dict[str, bool] = field(default_factory=lambda: dict()) op_intermediates_to_save: List[str] = field(default_factory=lambda: list()) # list of tagged ops that will spill its output to queue tti_dump_format: TTIDumpFormat = field(default=TTIDumpFormat.DEFAULT, metadata=as_json(TTIDumpFormat)) - dram_placement_algorithm: pyplacer.DRAMPlacementAlgorithm = field(default=pyplacer.DRAMPlacementAlgorithm.ROUND_ROBIN, metadata=as_json(pyplacer.DRAMPlacementAlgorithm)) # TODO: add reportify dir - def __post_init__(self): 
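A minimal usage sketch for the CompiledModel / CompiledGraphState flow added in the compiled_graph_state.py hunk above. The pybuda.compile entry point, the module, and the input shape are illustrative assumptions and not part of this diff; only the fact that CompiledModel.__call__ forwards to run_binary comes from the change itself.

import torch
import pybuda  # assumed to expose a compile() helper that returns a CompiledModel

class TinyModel(torch.nn.Module):  # hypothetical user module
    def __init__(self):
        super().__init__()
        self.linear = torch.nn.Linear(32, 32)

    def forward(self, x):
        return self.linear(x)

sample_input = torch.rand(1, 32)
compiled = pybuda.compile(TinyModel(), sample_inputs=[sample_input])  # assumed entry point
outputs = compiled(sample_input)  # CompiledModel.__call__ -> run_binary(self.binary, 0, [inputs])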
+ def apply_env_config_overrides(self): if "PYBUDA_OVERRIDE_NUM_CHIPS" in os.environ: self.chip_ids = list(range(int(os.environ.get('PYBUDA_OVERRIDE_NUM_CHIPS')))) @@ -259,7 +212,7 @@ def __post_init__(self): if "PYBUDA_COMPILE_DEPTH" in os.environ: self.compile_depth = { "full": CompileDepth.FULL, - "start_compile": CompileDepth.START_COMPILE, + "init_compile": CompileDepth.INIT_COMPILE, "generate_initial_graph": CompileDepth.GENERATE_INITIAL_GRAPH, "post_initial_graph_pass": CompileDepth.POST_INITIAL_GRAPH_PASS, "pre_lowering_pass": CompileDepth.PRE_LOWERING_PASS, @@ -314,8 +267,11 @@ def __post_init__(self): if "PYBUDA_SCHEDULER_POLICY" in os.environ: self.scheduler_policy = os.environ["PYBUDA_SCHEDULER_POLICY"] - if "PYBUDA_OVERRIDE_DEVICE_YAML" in os.environ: - self.backend_device_descriptor_path = os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] + if "PYBUDA_OVERRIDE_DEVICE_YAML" in os.environ and os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] != "": + self.backend_device_descriptor_path = resolve_device_descriptor_path(os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"]) + + def __post_init__(self): + self.apply_env_config_overrides() def enable_amp_light(self, level: int = 1): if level == 0: @@ -389,34 +345,20 @@ def manual_fuse(self, op_names: Union[str, List[str]]) -> None: assert isinstance(op_names[0], str) self.op_names_manual_fuse.extend(op_names) - def balancer_op_override(self, op_name: str, attribute: str, value): - op_override = self.balancer_op_overrides.get(op_name, pybalancer.OpOverride()) - if isinstance(value, dict): - current_value = getattr(op_override, attribute) or dict() - value = {**current_value, **value} - setattr(op_override, attribute, value) - self.balancer_op_overrides[op_name] = op_override - def save_intermediates(self) -> bool: return len(self.op_intermediates_to_save) > 0 -def set_harvested_rows_simulate(row_indices_allchips: List[List[int]]): - config_allchips = "" - num_chips = len(row_indices_allchips) - for chip, row_indices in enumerate(row_indices_allchips): - harvested_rows_mask = 0 - for r in row_indices: - harvested_rows_mask += (1 << r) - config_allchips += str(harvested_rows_mask) - if chip != num_chips-1: - config_allchips += "," - os.environ["TT_BACKEND_HARVESTED_ROWS"] = config_allchips +def get_harvesting_mask(row_indices: List[int]): + harvested_rows_mask = 0 + for r in row_indices: + harvested_rows_mask += (1 << r) + return harvested_rows_mask # Backend runtime yaml path for supported B0 boards supported_backend_configurations = { - "wh_nebula_x1" : "tti/runtime_param_yamls/nebula_x1_syslevel.yaml", - "wh_nebula_x2" : "tti/runtime_param_yamls/nebula_x2_syslevel.yaml", + "wh_n150" : "tti/runtime_param_yamls/wh_n150_syslevel.yaml", + "wh_n300" : "tti/runtime_param_yamls/wh_n300_syslevel.yaml", "galaxy" : "tti/runtime_param_yamls/galaxy_syslevel.yaml", "gs_e150" : "tti/runtime_param_yamls/gs_e150_syslevel.yaml", "gs_e300" : "tti/runtime_param_yamls/gs_e300_syslevel.yaml", @@ -459,7 +401,6 @@ def set_configuration_options( op_intermediates_to_save: Optional[List[str]] = None, enable_enumerate_u_kt: Optional[bool] = None, enable_device_tilize: Optional[bool] = None, - dram_placement_algorithm: Optional[pyplacer.DRAMPlacementAlgorithm] = None, chip_placement_policy: Optional[str] = None, enable_forked_dram_inputs: Optional[bool] = None, device_config: Optional[str] = None): @@ -589,7 +530,7 @@ def set_configuration_options( if backend_output_dir is not None: g_compiler_config.backend_output_dir = backend_output_dir if backend_device_descriptor_path is not None: 
- g_compiler_config.backend_device_descriptor_path = backend_device_descriptor_path + g_compiler_config.backend_device_descriptor_path = resolve_device_descriptor_path(backend_device_descriptor_path) if backend_cluster_descriptor_path is not None: g_compiler_config.backend_cluster_descriptor_path = backend_cluster_descriptor_path if backend_runtime_params_path is not None: @@ -605,7 +546,7 @@ def set_configuration_options( if amp_level is not None: g_compiler_config.amp_level = amp_level if harvested_rows is not None: - set_harvested_rows_simulate(harvested_rows) + g_compiler_config.harvesting_mask = get_harvesting_mask(harvested_rows) if store_backend_db_to_yaml is not None: g_compiler_config.store_backend_db_to_yaml = store_backend_db_to_yaml if input_queues_on_host is not None: @@ -624,15 +565,8 @@ def set_configuration_options( g_compiler_config.enable_device_tilize = enable_device_tilize if chip_placement_policy is not None: g_compiler_config.chip_placement_policy = chip_placement_policy - if dram_placement_algorithm is not None: - g_compiler_config.dram_placement_algorithm = dram_placement_algorithm if enable_forked_dram_inputs is not None: g_compiler_config.enable_forked_dram_inputs = enable_forked_dram_inputs - if device_config is not None: - if device_config in supported_backend_configurations and pkg_resources.resource_exists("pybuda", supported_backend_configurations[device_config]): - g_compiler_config.backend_runtime_params_path = pkg_resources.resource_filename("pybuda", supported_backend_configurations[device_config]) - else: - raise RuntimeError(f"Unsupported backend device configuration: {device_config}") def set_epoch_break(op_names: Union[str, query.NodePredicateBuilder, List[Union[str, query.NodePredicateBuilder]]]): """ @@ -827,6 +761,26 @@ def override_multi_op_fracture_factor(op_name: str, multi_op_fracture_factor: in global g_compiler_config g_compiler_config.conv_multi_op_fracture_factor_override[op_name] = multi_op_fracture_factor +def add_cpu_fallback_ops(op_types: Union[str, List[str]]): + """ + Add one or more op types to the CPU fallback list. These operations will be executed on the host. + """ + global g_compiler_config + if isinstance(op_types, str): + g_compiler_config.cpu_fallback_ops.add(op_types) + else: + g_compiler_config.cpu_fallback_ops.update(op_types) + +def remove_cpu_fallback_ops(op_types: Union[str, List[str]]): + """ + Remove one or more op types from the CPU fallback list.
+ """ + global g_compiler_config + if isinstance(op_types, str): + g_compiler_config.cpu_fallback_ops.discard(op_types) + else: + for op_type in op_types: + g_compiler_config.cpu_fallback_ops.discard(op_type) def insert_fracture_group(nodes: List[Union[str, Tuple[str, Union[int, List[int]], Union[int, List[int]]]]], chip_ids: Union[List[int], Dict[str, List[int]]] = []): """ @@ -874,60 +828,116 @@ def insert_fracture_group(nodes: List[Union[str, Tuple[str, Union[int, List[int] g_compiler_config.fracture_groups.append((nodes, chip_ids)) -def override_op_placement( - op_name: Union[str, query.NodePredicateBuilder], +def __insert_nop_impl( + src_op: str, + dest_ops: Union[str, List[str]], *, - start: Tuple[int, int] = None, - transpose_op = False, - chip_id: Optional[int] = None, - spatial_epoch_break: bool = False, - temporal_epoch_break: bool = False, + hoist_tms: bool = True, + nop_count: int = 1, + daisy_chain: bool = False, + is_fj_buffering = False, ): + + assert isinstance(src_op, str) + if isinstance(dest_ops, str): + dest_ops = [dest_ops] + assert isinstance(hoist_tms, bool) + + global g_compiler_config + merge_nops = bool(len(dest_ops) > 1) + buff_ind = 0 + for dest_idx, dest_op in enumerate(dest_ops): + buff_ind += 1 + request_merge = (dest_idx == len(dest_ops) -1) + nop_instr = NopInsertionInstruction( + src=src_op, + dest=dest_op, + hoist_tms=hoist_tms, + nop_count=nop_count, + input_id=None, + fork_id=None, + user_defined=True, + mergeable=merge_nops, + daisy_chain=daisy_chain, + request_merge=request_merge, + is_fj_buffering=is_fj_buffering + ) + g_compiler_config.buffering_nops_to_insert[nop_instr.unique_id()] = nop_instr + + +def insert_nop(src_op: str, dest_ops: Union[str, List[str]], *, hoist_tms: bool = True, nop_count: int = 1, daisy_chain: bool = False): """ - Override op_placement to provide to the placer. Node that successive calls with the same node name or overlapping predicate match will throw an error. + Instruct pybuda compiler to insert a NOP instruction on the edge identified by the named src/dest pair. Parameters ---------- - op_name: str - op name + src_op: str + Name of the src op - start: Tuple[int, int] - Override the start grid-location for an op + dest_op: str + Name of the dest op - transpose_op: bool - whether to manually transpose this op + hoist_tms: bool + Configure whether the TMs on the original edge should be transfered to + (src -> NOP edge) or to the (NOP -> dest edge). - chip_id: int - pin this op to a specific chip id. Only Wormhole is supported for now. + daisy_chain: bool + Sets the merge-strategy for NOPs to `daisy_chain` when there are multiple dest-ops. + By default, the merge-strategy will create a single buffer-nop forking to `dest_ops`. + When `daisy_chain` is enabled, we will create a daisy-chain of nop operations to dest_ops. + + """ - spatial_epoch_break: bool - Create a new spatial epoch and place `op_name` as the first op in the new epoch. + __insert_nop_impl( + src_op=src_op, + dest_ops=dest_ops, + hoist_tms=hoist_tms, + nop_count=nop_count, + daisy_chain=daisy_chain, + is_fj_buffering=False, + ) - temporal_epoch_break: bool - Create a new spatial epoch and place `op_name` as the first op in the new epoch. - This new spatial epoch will be placed on a new temporal epoch. 
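A short sketch of the new config.py helpers shown above (add_cpu_fallback_ops, remove_cpu_fallback_ops, get_harvesting_mask). The extra op-type strings are placeholders; "embedding" is the existing default entry in cpu_fallback_ops.

from pybuda.config import add_cpu_fallback_ops, remove_cpu_fallback_ops, get_harvesting_mask

add_cpu_fallback_ops(["adv_index", "cumsum"])  # placeholder op types to run on the host
remove_cpu_fallback_ops("embedding")           # drop the default fallback entry

# get_harvesting_mask packs harvested row indices into a single bit mask:
# rows [1, 3] -> (1 << 1) + (1 << 3) = 0b1010 = 10
assert get_harvesting_mask([1, 3]) == 10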
+def _internal_insert_fj_buffering_nop(src_op: str, dest_ops: Union[str, List[str]], *, hoist_tms: bool = True, nop_count: int = 1, daisy_chain: bool = False): """ - assert isinstance(op_name, str) or isinstance(op_name, query.NodePredicateBuilder), f"parameter `op_name` should be a string or NodePredicateBuilder. User provided `op_name`: {op_name}" - assert isinstance(transpose_op, bool), f"parameter `tranpose_op` should be a bool. User provided `transpose_op`: {transpose_op}" - if start is not None: - assert isinstance(start, Iterable), f"parameter `start` should be an iterable. User provided `start`: {start}" - assert len(start) == 2, f"parameter `start` should have two elements. User provided `start` with {len(start)} elements" + Instruct pybuda compiler to insert a fork-join buffering NOP instruction on the edge identified by the named src/dest pair. + Note: Adding a fork-join buffering NOP instructions may lead to exceptions! - global g_compiler_config - g_compiler_config.placer_op_overrides.append((op_name, pyplacer.OpOverride(start, transpose_op, chip_id, temporal_epoch_break))) + Parameters + ---------- + src_op: str + Name of the src op + + dest_op: str + Name of the dest op + + hoist_tms: bool + Configure whether the TMs on the original edge should be transfered to + (src -> NOP edge) or to the (NOP -> dest edge). + + daisy_chain: bool + Sets the merge-strategy for NOPs to `daisy_chain` when there are multiple dest-ops. + By default, the merge-strategy will create a single buffer-nop forking to `dest_ops`. + When `daisy_chain` is enabled, we will create a daisy-chain of nop operations to dest_ops. + + """ + __insert_nop_impl( + src_op=src_op, + dest_ops=dest_ops, + hoist_tms=hoist_tms, + nop_count=nop_count, + daisy_chain=daisy_chain, + is_fj_buffering=True, + ) - if temporal_epoch_break: - set_epoch_break(op_name) - if spatial_epoch_break: - if g_compiler_config.use_interactive_placer == False or "PYBUDA_DISABLE_INTERACTIVE_PLACER" in os.environ: - set_epoch_break(op_name) - else: - set_chip_break(op_name) def insert_buffering_nop(src_op: str, dest_ops: Union[str, List[str]], *, hoist_tms: bool = True, nop_count: int = 1, daisy_chain: bool = False): """ - Instruct pybuda compiler to insert a NOP instruction on the edge identified by the named src/dest pair. + "DEPRECATION WARNING! Please use `insert_nop` instead of `insert_buffering_nop`. To add a buffering nop, use the \ + internal API `_internal_insert_fj_buffering_nop`." + + Instruct pybuda compiler to insert a buffering NOP instruction on the edge identified by the named src/dest pair. + Note: Adding buffering NOP instructions may lead to exceptions! Parameters ---------- @@ -947,31 +957,17 @@ def insert_buffering_nop(src_op: str, dest_ops: Union[str, List[str]], *, hoist_ When `daisy_chain` is enabled, we will create a daisy-chain of nop operations to dest_ops. 
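A usage sketch for the renamed NOP-insertion API introduced just above; the op names are placeholders, and the sketch assumes the NopInsertionInstruction plumbing referenced by __insert_nop_impl remains available at runtime. insert_buffering_nop is kept as a deprecated wrapper (its rewritten body follows below) that warns and delegates to _internal_insert_fj_buffering_nop.

from pybuda.config import insert_nop, insert_buffering_nop

# Single NOP on the edge matmul_1 -> add_2; hoist_tms=True moves the edge's TMs onto the (src -> NOP) edge.
insert_nop("matmul_1", "add_2", hoist_tms=True)

# Multiple consumers: by default one buffer NOP forks to all dest_ops; daisy_chain=True chains them instead.
insert_nop("matmul_1", ["add_2", "mul_3"], daisy_chain=True)

# Deprecated spelling: logs a warning and calls _internal_insert_fj_buffering_nop.
insert_buffering_nop("matmul_1", "add_2")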
""" - assert isinstance(src_op, str) - if isinstance(dest_ops, str): - dest_ops = [dest_ops] - assert isinstance(hoist_tms, bool) - global g_compiler_config - merge_nops = bool(len(dest_ops) > 1) - buff_ind = 0 - for dest_idx, dest_op in enumerate(dest_ops): - buff_ind += 1 - request_merge = (dest_idx == len(dest_ops) -1) - nop_instr = NopInsertionInstruction( - src=src_op, - dest=dest_op, - hoist_tms=hoist_tms, - nop_count=nop_count, - input_id=None, - fork_id=None, - user_defined=True, - mergeable=merge_nops, - daisy_chain=daisy_chain, - request_merge=request_merge, - ) - g_compiler_config.buffering_nops_to_insert[nop_instr.unique_id()] = nop_instr + logger.warning("DEPRECATION WARNING! Please use `insert_nop` instead of `insert_buffering_nop`. To add fork-join \ + buffering nop, use the internal API `_internal_insert_fj_buffering_nop`.") + _internal_insert_fj_buffering_nop( + src_op=src_op, + dest_ops=dest_ops, + hoist_tms=hoist_tms, + nop_count=nop_count, + daisy_chain=daisy_chain + ) def add_schedule_constraint(partial_ordering: List[str]): @@ -1084,8 +1080,7 @@ def configure_mixed_precision( assert math_fidelity is None or isinstance(math_fidelity, MathFidelity) assert name_regex is None or isinstance(name_regex, str) assert op_type or epoch_type or name_regex or is_gradient_op - if input_df is None: - input_df = {} + if isinstance(input_df, dict): for operand_index, config in input_df.items(): assert len(config) == 2, f"For operand index {operand_index}, invalid config" diff --git a/pybuda/pybuda/cpudevice.py b/pybuda/pybuda/cpudevice.py deleted file mode 100644 index 44f9d32d1..000000000 --- a/pybuda/pybuda/cpudevice.py +++ /dev/null @@ -1,1126 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import collections -from typing import Union, Tuple, List, Callable, Optional, Dict -import queue - -import tensorflow as tf -import torch -import torch.multiprocessing as mp - -from multiprocessing.synchronize import Event as EventClass -from multiprocessing.synchronize import Barrier as BarrierClass -from loguru import logger - -from .device import Device -from .module import Module, PyTorchModule, TFModule -from .tensor import SomeTensor, buda_dataformat_to_pytorch_dtype, Tensor, to_pt_tensors, to_tf_variables, to_buda_tensors -from .verify import VerifyConfig -from .compile import CompilerConfig -from .pybudaglobal import lazy_trace_data -from .device_connector import DeviceConnector, TransferType, DirectPusherDeviceConnector -from .utils import detach_tensors - -from pybuda.tvm_utils import map_tf_dtype_to_pt, map_pt_dtype_to_tf - -from torch import nn - -class mySequential(nn.Sequential): - def forward(self, *input): - for module in self._modules.values(): - if torch.is_tensor(input): - input = (input,) - input = module(*input) - return input - -class CPUDevice(Device): - """ - CPUDevice represents a CPU processor. It will spawn a process and run local operations on the assigned processor. - """ - - # Recorded as class variables since they don't need to be sent over to target process - optimizer_f: Dict[Device, Callable] = {} - scheduler_f: Dict[Device, Callable] = {} - - def __init__(self, - name: str, - optimizer_f: Callable = None, - scheduler_f: Callable = None, - mp_context = None, - retain_backward_graph = False, - module: Union[PyTorchModule, List[PyTorchModule]] = None, - input_dtypes: List[torch.dtype] = None, - ): - """ - Create a CPU device with a given name. Optionally override Python multi-procesing context. 
- - Parameters - ---------- - name: str - Device name - - optimizer_f: Callable, optional - Function that takes in a module and returns an optimizer. Required for training. - - scheduler_f: Callable, optional - Function that takes in an optimizer, and returns one or more schedulers to step on each epoch. - If None, no scheduler will be used during training. - - mp_context: mp.context, optional - If provided, mp_context will be used to create multi-processing queues, instead of the default one - - module: Union[PyTorchModule, List[PyTorchModule]], optional - Optionally place given module(s) one the device has been created - - """ - super().__init__(name, mp_context) - self.sequential_module: Optional[torch.nn.Module] = None - - # record as class variables to avoid pickling and sending to target process - CPUDevice.optimizer_f[self] = optimizer_f - CPUDevice.scheduler_f[self] = scheduler_f - - self.optimizer: torch.optim.Optimizer = None - self.schedulers: List[torch.optim.lr_scheduler._LRScheduler] = [] - - self.retain_backward_graph = retain_backward_graph - self.devtype = None - self.device = "cpu" - self.framework = None - self.tf_grads = None - self.tf_gradient_tape = None - self.cpueval_tf_grads = None - self.cpueval_tf_gradient_tape = None - self.input_dtypes = input_dtypes - - if module is not None: - if not isinstance(module, list): - module = [module] - for m in module: - self.place_module(m) - - self._saved_fwd_data = None - - def __repr__(self): - return f"CPUDevice '{self.name}'" - - def _initialize(self, - training: bool, - sequential: bool, - final_barrier: Optional[BarrierClass] = None, - shutdown_event: Optional[EventClass] = None, - scale_loss: float = 1.0, - checkpoint_interval: int = 0, - perf_trace: bool = False): - """ - Initialize the CPU device. - - Parameters - ---------- - training: bool - If true, create optimizer and schedulers for trainig, linking them to the modules on the device - - sequential: bool - Set sequential/concurrent mode for this device - - final_barrier: mp.Event, optional - If provided, forward will wait for the wait event before completing, allowing processes and queues to - be alive until everything has completed. - - shutdown_event: mp.Event, optional - If provided, forward will trigger the event in case of an exception, letting other processes know to - shut down. This should always be set in concurrent mode. - - scale_loss: float, optional - If this device is calculating loss, multiply the value with scale_loss after calculating it - - checkpoint_interval: int, optional - The weights will be checkpointed into checkpoint queues on host every `checkpoint_interval` optimizer - steps, if set to non-zero. Zero by default. - - perf_trace: bool, optional - Ignored by CPU device - """ - - Device._initialize(self, sequential, final_barrier, shutdown_event) - - if not training: - return # nothing to do here right now - - self._scale_loss = scale_loss - - if CPUDevice.optimizer_f[self] is None: - logger.warning("Warning: no optimizer function provided for {}. 
No optimization will be done.", self) - else: - module = self._get_sequential().module - params = module.parameters() if self.framework == "pytorch" else module.weights - if len(list(params)) > 0: - self.optimizer = CPUDevice.optimizer_f[self](module) - if (self.optimizer is None or - (isinstance(module, torch.nn.Module) ^ isinstance(self.optimizer, torch.optim.Optimizer)) or - (isinstance(module, (tf.keras.Model, tf.keras.layers.Layer)) ^ isinstance(self.optimizer, tf.keras.optimizers.legacy.SGD)) - ): - raise RuntimeError(f"Optimizer function for {self} didn't return a PyTorch optimizer") - else: - self.optimizer = None - - - if self.optimizer is not None and CPUDevice.scheduler_f[self] is not None: - schedulers = CPUDevice.scheduler_f[self](self.optimizer) - if schedulers is not None: - if isinstance(schedulers, (tuple, list)): - self.schedulers = list(schedulers) - else: - self.schedulers = [schedulers] - - # TODO: Any reason we ever want multiple schedulers for a particular device? - # Maybe this should be refactored to just be one scheduler - assert len(self.schedulers) == 1, "Only one scheduler per device is currently supported" - for s in self.schedulers: - if not isinstance(s, torch.optim.lr_scheduler._LRScheduler): - raise RuntimeError(f"Schedule function for {self} returned a non-scheduler") - - def forward_pt(self, loop_count: int): - """ - Run forward pass on each module on this device, in order - - Parameters - ---------- - loop_count: int - Number of micro-batches to run - """ - - logger.debug("Starting forward on {}", self) - assert self._compiled, f"Module not compiled yet on {self}" - - if not self._training: - self._modules_eval() # Set the module(s) to eval mode - - try: - for _ in range(loop_count): - inputs = self.forward_input_dc.read() - - logger.trace("Forward inputs on {}:", self) - lazy_trace_data(inputs) - - # Convert to pytorch tensors, if needed - inputs = to_pt_tensors(inputs) - torch_inputs = tuple(t.value() if isinstance(t, Tensor) else t for t in inputs) - torch_inputs = tuple(t.to(self.device) for t in torch_inputs) - for t in torch_inputs: - if t.requires_grad: - t.retain_grad() - - if self.input_dtypes: - assert len(self.input_dtypes) == len(torch_inputs), f"CPUDevice input_dtypes specified, but differs in size from number of actual inputs. 
Types specified: {len(self.input_dtypes)}, num inputs: {len(torch_inputs)}" - torch_inputs = tuple(t.type(typ) for t, typ in zip(torch_inputs, self.input_dtypes)) - torch_inputs = detach_tensors(torch_inputs) - - elif any(t.dtype in (torch.float16, torch.bfloat16) for t in torch_inputs): - torch_inputs = tuple(t.type(torch.float32) for t in torch_inputs) - torch_inputs = detach_tensors(torch_inputs) - - if self.loss_module is not None and len(self.modules) == 1: - outputs = torch_inputs - else: - self._get_sequential().compilation = False - outputs: Tuple[SomeTensor] = self._modules_forward(*torch_inputs) - - if self.loss_module is None: - # Push data on to the output or next device - outputs = tuple(o.to('cpu') for o in outputs) - logger.trace("Forward outputs on {}:", self) - #lazy_trace_data(outputs) - - detached_outputs = tuple(Tensor.create_from_torch(o).detach() for o in outputs) - self.forward_dc.push(detached_outputs) - - else: - - # Calculate loss - targets = self.target_input_dc.read() - targets = tuple(t.to(self.device) for t in targets) - outputs = tuple(t.to(self.device) for t in outputs) - - if len(outputs) == 1: - outputs = outputs[0] - if len(targets) == 1: - targets = targets[0] - - lout = self.loss_module.forward(outputs, targets) - lout = self._scale_loss * lout - lout = [lout] - - logger.info("Loss: {}", lout[0].item()) - - outputs = lout - if self.forward_dc: - self.forward_dc.push(tuple(l.item() for l in lout)) - - if self._training: - if self._saved_fwd_data is None: - self._saved_fwd_data = queue.Queue() # local, no need for mp - self._saved_fwd_data.put((torch_inputs, outputs)) - - self.forward_input_dc.pop() - - logger.debug("Ending forward on {}", self) - - except Exception as e: - - # Let other processes know to stop - if self.shutdown_event is not None: - self.shutdown_event.set() - logger.debug("Ending forward due to exception on {}: {}", self, e) - raise - - def forward_tf(self, loop_count: int): - """ - Run forward pass on each module on this device, in order - - Parameters - ---------- - loop_count: int - Number of micro-batches to run - """ - logger.debug("Starting forward on {}", self) - assert self._compiled, f"Module not compiled yet on {self}" - - try: - for _ in range(loop_count): - inputs = self.forward_input_dc.read() - - logger.trace("Forward inputs on {}:", self) - lazy_trace_data(inputs) - inputs = to_tf_variables(inputs) - outputs = inputs - module = self._get_sequential() - if self._training: - with tf.GradientTape(persistent=True) as tape: - [tape.watch(output) for output in outputs if output.trainable] - outputs = module.call(*outputs) - else: - outputs = module.call(*outputs) - - if not isinstance(outputs, (list, tuple)): - outputs = (outputs, ) - detached_outputs = to_buda_tensors(outputs) - logger.trace("Forward outputs on {}:", self) - lazy_trace_data(detached_outputs) - - - if self.loss_module is None: - # Push data on to the output or next device - self.forward_dc.push(detached_outputs) - - else: - assert False, "TODO" - - if self._training: - if self._saved_fwd_data is None: - self._saved_fwd_data = queue.Queue() # local, no need for mp - self.tf_gradient_tape = tape - self._saved_fwd_data.put((inputs, outputs,)) - - self.forward_input_dc.pop() - - logger.debug("Ending forward on {}", self) - - except Exception as e: - - # Let other processes know to stop - if self.shutdown_event is not None: - self.shutdown_event.set() - logger.debug("Ending forward due to exception on {}: {}", self, e) - raise - - - def forward(self, loop_count: 
int): - """ - Run forward pass on each module on this device, in order - - Parameters - ---------- - loop_count: int - Number of micro-batches to run - """ - #TODO Implement support for multiple subgraphs on cpu device - if self.framework == "pytorch": - forward_fn = self.forward_pt - elif self.framework == "tensorflow": - forward_fn = self.forward_tf - else: - raise RuntimeError(f"Unsupported framework: {self.framework}") - - return forward_fn( - loop_count, - ) - - def backward(self, loop_count: int, zero_grad: bool): - """ - Run backward pass on each module on this device, in reverse order - - Parameters - ---------- - loop_count: int - Each mini-batch is broken into micro-batches. This is necessary to fill a multi-device pipeline, - and should be roughly 4-6x the number of devices in the pipeline for ideal performance. - - zero_grad: bool - Set to true to have optimizer zero out gradients before the run - """ - - if self.framework == "pytorch": - self.backward_pt(loop_count, zero_grad) - elif self.framework == "tensorflow": - self.backward_tf(loop_count, zero_grad) - else: - raise RuntimeError(f"Unsupported framework: {self.framework}") - - def backward_tf(self, loop_count: int, zero_grad: bool): - logger.debug("Starting backward on {}", self) - self._modules_train() # Set the module(s) to train mode - - try: - module = self._get_sequential() - # if zero_grad and self.optimizer: - # self.optimizer.zero_grad() # zero out the gradients - - for _ in range(loop_count): - fwd_inputs, fwd_outputs = self._saved_fwd_data.get() - - if self.loss_module: - incoming_grad = fwd_outputs - else: - bw_inputs = self.backward_input_dc.read() - # Convert to pytorch tensors, if needed - incoming_grad = to_tf_variables(bw_inputs) - - self.backward_input_dc.pop() - - self.tf_grads = self.tf_gradient_tape.gradient( - fwd_outputs, - module.module.trainable_variables, - output_gradients=incoming_grad) - # what if it has multiple outputs? 
- bw_inputs = self.tf_gradient_tape.gradient( - fwd_outputs, - [fwd_input for fwd_input in fwd_inputs if fwd_input.trainable], - output_gradients=incoming_grad) - - # Automatic device selection - bw_inputs_on_cpu = tuple(Tensor.create_from_torch(torch.Tensor(bw_inp.numpy())) for bw_inp in bw_inputs if bw_inp is not None) - - logger.trace("Pushing bw inputs {} into {}", bw_inputs_on_cpu, type(self.backward_dc)) - self.backward_dc.push(bw_inputs_on_cpu) - - logger.trace("GRADIENT_CHECK: gradient out from {}: ", self) - lazy_trace_data(bw_inputs_on_cpu) - - logger.debug("Ending backward on {}", self) - - except Exception as e: - - # Let other processes know to stop - if self.shutdown_event is not None: - self.shutdown_event.set() - logger.debug("Ending backward due to exception on {}", self) - raise - - - def backward_pt(self, loop_count: int, zero_grad: bool): - logger.debug("Starting backward on {}", self) - self._modules_train() # Set the module(s) to train mode - - try: - if zero_grad and self.optimizer: - self.optimizer.zero_grad() # zero out the gradients - - for _ in range(loop_count): - - fwd_inputs, fwd_outputs = self._saved_fwd_data.get() - - fwd_outputs = tuple(fwd_output.to(self.device) for fwd_output in fwd_outputs) - - if self.loss_module: - - for l in fwd_outputs: - l.backward() - - else: - # Get inputs from the next stage, and run backward - bw_inputs = self.backward_input_dc.read() - - # Convert to pytorch tensors, if needed - bw_inputs = tuple(t.value() if isinstance(t, Tensor) else t for t in bw_inputs) - bw_inputs = tuple(t.to(self.device) for t in bw_inputs) - - logger.trace("GRADIENT_CHECK: bw_inputs into {}: ", self) - lazy_trace_data(bw_inputs) - - req_grad_outs = [out for out in fwd_outputs if out.requires_grad] - for i, rec_out in enumerate(req_grad_outs): - if rec_out.requires_grad: - rec_out.backward(bw_inputs[i], retain_graph=self.retain_backward_graph or (i < len(req_grad_outs) - 1)) - - self.backward_input_dc.pop() - - - bw_inputs = tuple(fwd_input.grad for fwd_input in fwd_inputs if fwd_input.requires_grad or (fwd_input.grad_fn is not None)) - bw_inputs_on_cpu = tuple(t.to("cpu") for t in bw_inputs) - logger.trace("Pushing bw inputs {} into {}", bw_inputs_on_cpu, type(self.backward_dc)) - self.backward_dc.push(bw_inputs_on_cpu) - - logger.trace("GRADIENT_CHECK: gradient out from {}: ", self) - lazy_trace_data(bw_inputs_on_cpu) - - logger.debug("Ending backward on {}", self) - - except Exception as e: - - # Let other processes know to stop - if self.shutdown_event is not None: - self.shutdown_event.set() - logger.debug("Ending backward due to exception on {}", self) - raise - - def generate(self, loop_count: int, write_index: int): - """ - Run forward pass on each module on this device, in order - - Parameters - ---------- - loop_count: int - Number of micro-batches to run - - """ - - logger.debug("Starting generate on {}", self) - assert self._compiled, f"Module not compiled yet on {self}" - return self.forward(loop_count=loop_count) - - def compile_for_pt(self, - inputs: Tuple[Tensor, ...], - compiler_cfg: CompilerConfig, - targets: List[Tensor] = [], - microbatch_size: int = 0, - microbatch_count: int = 1, - verify_cfg: Optional[VerifyConfig] = None, - ) -> Tuple[Tensor, ...]: - """ - For a CPU device, there is currently no compilation. This function propagates input shapes through the model - to return output shapes and formats. - - Parameters - ---------- - inputs: Tuple[Tensor, ...] - Tuple of input tensors. 
They must have shape and format set, but do not need to hold data unless - auto-verification is set. - - compiler_cfg: CompilerConfig - Compiler configuration - - targets: List[Tensor], optional - Optional list of target tensors, if this device has a loss module - - microbatch_size: int, optional - The size of microbatch. Must be non-zero for training mode. - - microbatch_count: int - Only relevant for training and TT devices. - - verify_cfg: Optional[VerifyConfig] - Optional auto-verification of compile process - - Returns - ------- - Tuple[Tensor, ...] - Output tensors - - """ - assert not self._compiled, "Trying to compile a design that's already been compiled" - - training = compiler_cfg.enable_training - Device.compile_for(self, training, microbatch_size, microbatch_count) - - if len(targets) > 0: # has loss module, only output is loss - self._compiled = True - return tuple([Tensor.create_from_torch(torch.tensor(1.0)).detach() for _ in targets]) - - # Create inputs of the right shape and format, if needed - torch_inputs = [] - for t in inputs: - if t.has_value(): - torch_inputs.append(t.value()) - else: - torch_inputs.append(torch.zeros(*t.shape.get_pytorch_shape(), dtype=buda_dataformat_to_pytorch_dtype(t.data_format))) - - torch_inputs = tuple(x.to(self.device) for x in torch_inputs) - self._get_sequential().compilation = True - outputs = self._modules_forward(*torch_inputs, targets=targets) - outputs = [o if o.is_floating_point() else o.float() for o in outputs] - outputs = tuple(x.to('cpu') for x in outputs) - - while isinstance(outputs[0], tuple): - outputs = outputs[0] - - outputs = tuple(Tensor.create_from_torch(o).detach() for o in outputs) - self._compiled = True - return outputs - - def compile_for_tf(self, - inputs: Tuple[Tensor, ...], - compiler_cfg: CompilerConfig, - targets: List[Tensor] = [], - microbatch_size: int = 0, - verify_cfg: Optional[VerifyConfig] = None, - ) -> Tuple[Tensor, ...]: - """ - For a CPU device, there is currently no compilation. This function propagates input shapes through the model - to return output shapes and formats. - - Parameters - ---------- - inputs: Tuple[Tensor, ...] - Tuple of input tensors. They must have shape and format set, but do not need to hold data unless - auto-verification is set. - - compiler_cfg: CompilerConfig - Compiler configuration - - targets: List[Tensor], optional - Optional list of target tensors, if this device has a loss module - - microbatch_size: int, optional - The size of microbatch. Must be non-zero for training mode. - - verify_cfg: Optional[VerifyConfig] - Optional auto-verification of compile process - - Returns - ------- - Tuple[Tensor, ...] 
- Output tensors - - """ - assert not self._compiled, "Trying to compile a design that's already been compiled" - - training = compiler_cfg.enable_training - Device.compile_for(self, training, microbatch_size) - - # Create inputs of the right shape and format, if needed - tf_inputs = to_tf_variables(inputs) - - outputs = tf_inputs - self._get_sequential().compilation = True - outputs = self._modules_forward(*tf_inputs, targets=targets) - - if not isinstance(outputs, (list, tuple)): - outputs = (outputs, ) - - outputs = to_buda_tensors(outputs) - self._compiled = True - return outputs - - def compile_for(self, - inputs: Tuple[Tensor, ...], - compiler_cfg: CompilerConfig, - targets: List[Tensor] = [], - microbatch_size: int = 0, - microbatch_count: int = 1, - verify_cfg: Optional[VerifyConfig] = None, - ) -> Tuple[Tensor, ...]: - """ - For a CPU device, there is currently no compilation. This function propagates input shapes through the model - to return output shapes and formats. - - Parameters - ---------- - inputs: Tuple[Tensor, ...] - Tuple of input tensors. They must have shape and format set, but do not need to hold data unless - auto-verification is set. - - compiler_cfg: CompilerConfig - Compiler configuration - - targets: List[Tensor], optional - Optional list of target tensors, if this device has a loss module - - microbatch_size: int, optional - The size of microbatch. Must be non-zero for training mode. - - microbatch_count: int - Only relevant for training and TT devices. - - verify_cfg: Optional[VerifyConfig] - Optional auto-verification of compile process - - Returns - ------- - Tuple[Tensor, ...] - Output tensors - - """ - assert not self._compiled, "Trying to compile a design that's already been compiled" - - if self.framework == "pytorch": - compile_for_fn = self.compile_for_pt - elif self.framework == "tensorflow": - compile_for_fn = self.compile_for_tf - else: - raise RuntimeError(f"Unsupported framework: {self.framework}") - - return compile_for_fn( - inputs, - compiler_cfg, - targets, - microbatch_size, - verify_cfg - ) - - def _get_sequential(self) -> Union[PyTorchModule, TFModule]: - """ - Combine modules into one sequential module, if needed. Otherwise return one module. 
- """ - contains_loss_module = 1 if self.loss_module else 0 - num_network_modules = len(self.modules) - contains_loss_module - if self.sequential_module is None and num_network_modules > 1: - od = collections.OrderedDict() - for i, m in enumerate(self.modules): - if m != self.loss_module: - od[m.name] = m.module - - if self.framework == "pytorch": - self.sequential_module = PyTorchModule(self.name + "_sequential", mySequential(od)) - elif self.framework == "tensorflow": - model = tf.keras.Sequential() - for module in od.values(): - model.add(module) - self.sequential_module = TFModule(self.name + "_sequential", model) - else: - raise RuntimeError(f"Unsupported framework: {self.framework}") - - module = self.sequential_module if self.sequential_module is not None else self.modules[0] - return module - - def _modules_forward(self, *args, targets=None) -> Tuple[torch.Tensor, ...]: - """ - Run forward on all modules on device and return outputs - """ - if len(self.modules) == 0: - raise RuntimeError("Trying to run device with no modules") - - module = self._get_sequential() - - if self.framework == "pytorch": - outputs: Tuple[SomeTensor] = module.forward(*args) - elif self.framework == "tensorflow": - tf_inputs = to_tf_variables(args) - outputs: Tuple[SomeTensor] = module.call(*tf_inputs) - else: - raise RuntimeError(f"Unsupported framework: {self.framework}") - if not isinstance(outputs, tuple): - outputs = (outputs, ) - - return outputs - - def _modules_eval(self): - """ - Set the module(s) to eval mode - """ - self._get_sequential().module.eval() - - def _modules_train(self): - """ - Set the module(s) to train mode - """ - if self.loss_module: - self._get_sequential().module.train() - - def update_device_parameters_pt(self, parameters: Dict[str, torch.Tensor]): - self.sync() # wait until queued up commands have completed - module: PyTorchModule = self._get_sequential() - state_dict = module.module.state_dict() - for p in parameters: - if p not in state_dict: - continue - state_dict[p] = parameters[p] - module.module.load_state_dict(state_dict) - - def update_device_parameters_tf(self, parameters: Dict[str, tf.Tensor]): - self.sync() # wait until queued up commands have completed - module: TFModule = self._get_sequential() - # module.module.trainable_variables = parameters - for param in module.module.trainable_variables: - name = param.name - param.assign(tf.convert_to_tensor(parameters[name].detach().numpy())) - - def update_device_parameters(self, parameters: Dict[str, torch.Tensor]): - if self.framework == "pytorch": - update_device_parameters_fn = self.update_device_parameters_pt - elif self.framework == "tensorflow": - update_device_parameters_fn = self.update_device_parameters_tf - - update_device_parameters_fn(parameters) - - def cpueval_forward_pt(self, inputs: List[torch.Tensor], parameters: Dict[str, torch.Tensor], save_for_backward: bool, targets: List[torch.Tensor] = []) -> List[torch.Tensor]: - """ - Evaluate forward pass for verification - - Parameters - ---------- - inputs: List[torch.Tensor] - One input into the model (for each ordered input node) - - parameters: Dict[str, torch.Tensor] - Map of model parameters - - save_for_backward: bool - If set, input and output tensors will be saved so we can run the backward pass later. 
- - targets: List[torch.Tensor], optional - If we're running training, and there's a loss module on this device, provide target - - Returns - ------- - List[Tensor] - Forward graph output - """ - if len(self.modules) == 0: - raise RuntimeError("Trying to run device with no modules") - - module: PyTorchModule = self._get_sequential() - module.module = module.module.cpu() - if not save_for_backward: - module.module.eval() # disable dropout - - # Override parameters values - if len(parameters) > 0: - self.update_device_parameters_pt(parameters) - - # Copy back so that we can extract grad - for name, p in module.module.named_parameters(): - parameters[name] = p - - if save_for_backward: - self._saved_fw_inputs = inputs - - module.compilation = False - outputs: Tuple[SomeTensor] = module.forward(*inputs) - if self.loss_module: - if len(outputs) == 1: - outputs = outputs[0] - if len(targets) == 1: - targets = targets[0] - lout = self.loss_module.forward(outputs, targets) - lout = self._scale_loss * lout - outputs = lout - - if not isinstance(outputs, tuple): - outputs = (outputs, ) - - if save_for_backward: - self._saved_fw_outputs = outputs - - module.module = module.module.to(self.device) - - return outputs - - def cpueval_forward_tf(self, inputs: List[torch.Tensor], parameters: Dict[str, torch.Tensor], save_for_backward: bool, targets: List[torch.Tensor] = []) -> List[torch.Tensor]: - """ - Evaluate forward pass for verification - - Parameters - ---------- - inputs: List[torch.Tensor] - One input into the model (for each ordered input node) - - parameters: Dict[str, torch.Tensor] - Map of model parameters - - save_for_backward: bool - If set, input and output tensors will be saved so we can run the backward pass later. - - targets: List[torch.Tensor], optional - If we're running training, and there's a loss module on this device, provide target - - Returns - ------- - List[Tensor] - Forward graph output - """ - if len(self.modules) == 0: - raise RuntimeError("Trying to run device with no modules") - - module: TFModule = self._get_sequential() - - # Override parameters values - if len(parameters) > 0: - # assert False, f"TODO" - self.update_device_parameters_tf(parameters) - - inputs = to_tf_variables(inputs) - if save_for_backward: - self._saved_fw_inputs = inputs - - outputs = inputs - if self._training: - with tf.GradientTape(persistent=True) as tape: - [tape.watch(output) for output in outputs if output.trainable] - outputs = module.call(*outputs) - - self.cpueval_tf_gradient_tape = tape - else: - outputs = module.call(*outputs) - - if self.loss_module: - if len(outputs) == 1: - outputs = outputs[0] - if len(targets) == 1: - targets = targets[0] - lout = self.loss_module.forward(outputs, targets) - lout = self._scale_loss * lout - outputs = lout - - if not isinstance(outputs, tuple): - outputs = (outputs, ) - - if save_for_backward: - self._saved_fw_outputs = outputs - - outputs = to_pt_tensors(outputs) - return outputs - - def cpueval_forward(self, inputs: List[torch.Tensor], parameters: Dict[str, torch.Tensor], save_for_backward: bool, targets: List[torch.Tensor] = []) -> List[torch.Tensor]: - """ - Evaluate forward pass for verification - - Parameters - ---------- - inputs: List[torch.Tensor] - One input into the model (for each ordered input node) - - parameters: Dict[str, torch.Tensor] - Map of model parameters - - save_for_backward: bool - If set, input and output tensors will be saved so we can run the backward pass later. 
- - targets: List[torch.Tensor], optional - If we're running training, and there's a loss module on this device, provide target - - Returns - ------- - List[Tensor] - Forward graph output - """ - if self.framework == "pytorch": - cpueval_forward_fn = self.cpueval_forward_pt - elif self.framework == "tensorflow": - cpueval_forward_fn = self.cpueval_forward_tf - else: - raise RuntimeError(f"Unsupported framework: {self.framework}") - - return cpueval_forward_fn( - inputs, - parameters, - save_for_backward, - targets - ) - - def cpueval_backward_tf(self, bw_inputs: List[torch.Tensor], parameters: Dict[str, torch.Tensor]) -> Tuple[List[torch.Tensor], Dict[str, torch.Tensor]]: - - fw_outputs = self._saved_fw_outputs - self._saved_fw_outputs = None - fw_inputs = self._saved_fw_inputs - self._saved_fw_inputs = None - - module = self._get_sequential() - incoming_grad = to_tf_variables(bw_inputs) - if self.loss_module: - grads = self.cpueval_tf_gradient_tape.gradient( - fw_outputs, - module.module.trainable_variables - ) - input_grads = self.cpueval_tf_gradient_tape.gradient( - fw_outputs, - [fw_input for fw_input in fw_inputs if fw_input.trainable], - ) - else: - grads = self.cpueval_tf_gradient_tape.gradient( - fw_outputs, - module.module.trainable_variables, - output_gradients=incoming_grad) - - input_grads = self.cpueval_tf_gradient_tape.gradient( - fw_outputs, - [fw_input for fw_input in fw_inputs if fw_input.trainable], - output_gradients=incoming_grad) - self.cpueval_tf_grads = grads - param_grads = {} - - for grad, param in zip(grads, module.module.trainable_variables): - if isinstance(grad, tf.Tensor): - param_grads[param.name] = torch.Tensor(grad.numpy()) - elif isinstance(grad, tf.IndexedSlices): - param_grads[param.name] = torch.Tensor(tf.convert_to_tensor(grad).numpy()) - else: - assert False, f"Hit unsupported gradient type {grad.__class__}" - - input_grads = [torch.Tensor(t.numpy()) for t in input_grads if t is not None] - - return input_grads, param_grads - - def cpueval_backward(self, bw_inputs: List[torch.Tensor], parameters: Dict[str, torch.Tensor]) -> Tuple[List[torch.Tensor], Dict[str, torch.Tensor]]: - if self.framework == "pytorch": - cpueval_backward_fn = super().cpueval_backward - elif self.framework == "tensorflow": - cpueval_backward_fn = self.cpueval_backward_tf - else: - raise RuntimeError(f"Unsupported framework: {self.framework}") - - - return cpueval_backward_fn(bw_inputs=bw_inputs, parameters=parameters) - - def place_module(self, module: Union[Module, Tuple[Module], List[Module]]): - - if not isinstance(module, (tuple, list)): - module = (module,) - - for m in module: - if isinstance(m, PyTorchModule): - if self.framework is not None and self.framework != "pytorch": - raise RuntimeError("Cannot mix frameworks on a single CPUDevice") - self.framework = "pytorch" - elif isinstance(m, TFModule): - if self.framework is not None and self.framework != "tensorflow": - raise RuntimeError("Cannot mix frameworks on a single CPUDevice") - self.framework = "tensorflow" - else: - raise RuntimeError("Only PyTorch and TensorFlow modules can be placed on CPUDevices at this time.") - - Device.place_module(self, module) - - def _step_optimizer(self): - """ - Step optimizer - """ - if self.optimizer is None: - return - logger.debug("Stepping optimizer on {}", self) - if self.framework == "tensorflow": - assert self.tf_grads is not None - self.optimizer.apply_gradients(zip(self.tf_grads, self._get_sequential().module.trainable_variables)) - elif self.framework == "pytorch": - 
self.optimizer.step() - else: - assert False, f"Only support Pytorch and TF CPU device, got {self.framework}" - - - def _step_schedulers(self): - """ - Step schedulers - """ - for s in self.schedulers: - s.step() - - def pop_parameter_checkpoint(self) -> Dict: - """ - Return a dictionary of current parameter values for the models on this device. - """ - raise RuntimeError("Not supported by cpu device yet") - - def set_debug_gradient_trace_queue(self, q: queue.Queue): - """ - [debug feature] Provide a queue to which incoming and outgoing gradients will be stored, for debug tracing. - """ - self.debug_gradient_trace = q - - def _create_forward_device_connector(self, target_device: Union["TTDevice", "CPUDevice"], sequential: bool, d2d_fwd_queue: Optional[queue.Queue] = None, microbatch = 1): - - logger.debug("Creating forward device connector from {} to {}", self, target_device) - if isinstance(target_device, CPUDevice): - # Queues - self.forward_dc = DeviceConnector(TransferType.MP_QUEUE, TransferType.MP_QUEUE, self.shutdown_event, side_queue=d2d_fwd_queue) - else: - # Tilize to TTDevice - self.forward_dc = DirectPusherDeviceConnector(self.shutdown_event, sequential, side_queue=d2d_fwd_queue, microbatch=microbatch) - - target_device._set_forward_input_dc(self.forward_dc) - - def _create_backward_device_connector(self, target_device: Device, sequential: bool, d2d_bwd_queue: Optional[queue.Queue] = None, microbatch = 1): - - logger.debug("Creating backward device connector from {} to {}", self, target_device) - if isinstance(target_device, CPUDevice): - # Queues - self.backward_dc = DeviceConnector(TransferType.MP_QUEUE, TransferType.MP_QUEUE, self.shutdown_event, side_queue=d2d_bwd_queue) - else: - # TTDevice copies directly to host, no pushing - self.backward_dc = DirectPusherDeviceConnector(self.shutdown_event, sequential, side_queue=d2d_bwd_queue, microbatch=microbatch) - - target_device._set_backward_input_dc(self.backward_dc) - - # Create device connector for the last device, pushing forward - def _create_forward_output_queue_device_connector(self, q: queue.Queue): - logger.debug("Creating forward output queue connector on {}", self) - self.forward_dc = DeviceConnector(TransferType.MP_QUEUE, TransferType.NONE, self.shutdown_event, q) - - # Create device connector for the first device, pushing backward - def _create_backward_output_queue_device_connector(self, q: queue.Queue): - logger.debug("Creating backward output queue connector on {}", self) - self.backward_dc = DeviceConnector(TransferType.MP_QUEUE, TransferType.NONE, self.shutdown_event, q) - - # Create device connector for the first device, reading from a Queue - def _create_input_queue_device_connector(self, q: queue.Queue, sequential: bool): - logger.debug("Creating input queue connector on {}", self) - self.forward_input_dc = DeviceConnector(TransferType.NONE, TransferType.MP_QUEUE, self.shutdown_event, q) - - # Create device connector for the last device, reading from a Queue - def _create_target_queue_device_connector(self, q: queue.Queue, sequential: bool): - logger.debug("Creating target queue connector on {}", self) - self.target_input_dc = DeviceConnector(TransferType.NONE, TransferType.MP_QUEUE, self.shutdown_event, q) - - - def get_pytorch_optimizer(self, parameters: Dict[str, torch.Tensor], lr = None) -> Optional[torch.optim.Optimizer]: - if CPUDevice.optimizer_f[self] is None: - return None - if len(parameters) == 0: - return None - return CPUDevice.optimizer_f[self](list(parameters.values())) - - def 
get_pytorch_scheduler(self): - if CPUDevice.scheduler_f[self] is None: - return None - - return CPUDevice.scheduler_f[self](self.optimizer) - - def get_parameter_checkpoint(self) -> Dict[str, Tensor]: - self.sync() # wait until queued up commands have completed - ret = {} - if self.framework == "pytorch": - for name, p in self._get_sequential().module.named_parameters(): - ret[name] = Tensor.create_from_torch(p.cpu().data) - elif self.framework == "tensorflow": - for param in self._get_sequential().module.trainable_variables: - name = param.name - data = param.numpy() - ret[name] = Tensor.create_from_torch(torch.Tensor(data)) - else: - assert False, f"Only support Pytorch and TF CPU device, got {self.framework}" - return ret - - def get_parameter_gradients(self) -> Dict[str, Tensor]: - self.sync() # wait until queued up commands have completed - return {} # TODO - - def get_device_intermediates(self) -> Dict[str, Tensor]: - logger.warning("Fetching intermediate activations not supported on CPUDevice") - return {} - - def sync(self): - """ - Block until queued up commands have completed and the device is idle. - """ - # TODO - pass diff --git a/pybuda/pybuda/device.py b/pybuda/pybuda/device.py deleted file mode 100644 index d4b839750..000000000 --- a/pybuda/pybuda/device.py +++ /dev/null @@ -1,926 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 - -import atexit -import os -from contextlib import contextmanager -from typing import List, Tuple, Union, Optional, Dict, Any, Iterator -import queue -import threading - -import torch -from multiprocessing.synchronize import Event as EventClass -from multiprocessing.synchronize import Barrier as BarrierClass -from multiprocessing.synchronize import Lock as LockClass -import torch.multiprocessing as mp -from loguru import logger -from collections import OrderedDict, UserDict - -from .module import Module -from .pybudaglobal import register_device, lazy_trace_data, set_state_changed, create_queue -from .tensor import Tensor, buda_dataformat_to_pytorch_dtype, remove_microbatch, to_pt_tensors -from .device_connector import DeviceConnector -from pybuda._C.backend_api import initialize_child_process, finish_child_process -from pybuda._C.graph import RuntimeTensorTransform -from .utils import detach_tensors -from pybuda._C import DataFormat - -class Device: - """ - Device class represents a physical device which can be a Tenstorrent device, or a CPU. In a typical operation, - each device spawns a process on the host CPU which is either used to run commands on the CPU (if device is - a CPU), or feeds commands to the Tenstorrent device. - - Each device will allocate input queues for the first module it will execute. On a CPU, these are usually - some kind of multiprocessing queues with shared memory storage, and Tenstorrent devices have queues in - on-device memory. - - One or more Modules can be placed on the device to be executed. - """ - - def __init__(self, name: str, mp_context = None): - """ - Create a device with a given name. Optionally override Python multi-procesing context. 
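# A pure-torch sketch of the parameter-checkpoint idea in get_parameter_checkpoint above:
# snapshot every parameter as a detached CPU tensor keyed by name (the real method wraps
# these in pybuda Tensor objects; this standalone version only uses torch).
import torch

model = torch.nn.Linear(4, 4)
checkpoint = {name: p.detach().cpu().clone() for name, p in model.named_parameters()}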
- - Parameters - ---------- - name: str - Device name - - mp_context: mp.context, Optional - If provided, mp_context will be used to create multi-processing queues, instead of the default one - """ - - super().__init__() - - self.name: str = name - self.modules: List[Module] = [] - self.loss_module: Module = None # optional loss module when last device in pipeline - register_device(self) - - # - # Input queues - # These input queues are used by the CPU device, and TT model devices. On silicon devices, - # the queues are in device memory, and these ones will not be used - if mp_context is None: - mp_context = mp.get_context('spawn') - self.target_input_queue: queue.Queue = create_queue(mp_context) - self.recompute_input_queue: queue.Queue = create_queue(mp_context) - - # First device needs to buffer inputs if they are pushed in before its been initialized and compiled. - self._input_buffer = create_queue(mp_context) - - # Process control events, set in _initialize if needed - self.shutdown_event : Optional[EventClass] = None - self.final_barrier : Optional[BarrierClass] = None - - # Main process command queue - self.command_queue: queue.Queue = create_queue(mp_context) - self._command_queue_resp: queue.Queue = create_queue(mp_context) - - # Flag indicating that we still need to compile this device - self._compiled = False - self._compile_output : "CompileResults" = [] - - # Device runs in same process as all others, running one at a time - self._sequential = True - - self._first_device = True # to be cleared on non-first device when we start running - - # Save first input on the side to use for initial compilation.. this is because we can't peek - # the first element of an mp queue - self._first_inputs: Optional[Tuple[torch.Tensor, ...]] = None - self._first_targets: Optional[Tuple[torch.Tensor, ...]] = None - - self.forward_dc : DeviceConnector = None # push through here - self.backward_dc : DeviceConnector = None # push backward through here - self.forward_input_dc : DeviceConnector = None # read forward inputs through here - self.backward_input_dc : DeviceConnector = None # read backward inputs through here - self.target_input_dc: DeviceConnector = None - self.intermediates_dc: DeviceConnector = None # read intermediate outputs through here - self.dc_transfer_threads : Dict[str, Tuple[threading.Thread, queue.Queue]] = {} - - # cpueval forward/backward intermediates - self._saved_fw_outputs = None - self._saved_fw_inputs = None - - # If an automatic cpu fallback devices are generated, we may need to forward inputs /outputs from this device - # to the newly created ones - self.cpu_fallback_device_pre = None - self.cpu_fallback_device_post = None - - # For generative inference, we want to keep track of the current token and tile indicies - self._current_token_idx = -1 - self._current_tile_idx = -1 - - # Store io queue information for multiple subgraphs - self._io_queues = {} - - def _initialize(self, - sequential: bool, - final_barrier: Optional[BarrierClass], - shutdown_event: Optional[EventClass]): - """ - Setup steps before the workload starts. - - Parameters - ---------- - sequential: bool - Set sequential/concurrent mode for this device - - final_barrier: mp.Barrier, optional - If provided, device process will wait for all other proceses to cross the Barrier, allowing - processes and queues to be alive until everything has completed. - - shutdown_event: mp.Event, optional - If provided, forward will trigger the event in case of an exception, letting other processes know to - shut down. 
This should always be set in concurrent mode. - - """ - self._sequential = sequential - self.final_barrier = final_barrier - self.shutdown_event = shutdown_event - - def place_module(self, module: Union[Module, Tuple[Module], List[Module]]): - """ - Places a module, or list of modules, on this device for execution. Modules will be run as a sequential pipeline - on this single device. - - Parameters - ---------- - module: Union[Module, Tuple[Module], List[Module]] - A single Module or a list of Modules to be placed on the device - """ - - def add(modules, module, device): - for m in modules: - if m.name == module.name: - raise RuntimeError("Module names should be unique for each module on device") - modules.append(module) - module._set_device(device) - - if isinstance(module, Module): - add(self.modules, module, self) - elif isinstance(module, (tuple, list)): - for m in module: - if not isinstance(m, Module): - raise RuntimeError("Expected a Module in the list, but got " + str(type(m))) - add(self.modules, m, self) - else: - raise RuntimeError("Expected a Module or list of modules") - set_state_changed() - - def remove_modules(self): - self.modules = [] - set_state_changed() - - def place_loss_module(self, module: Module): - """ - Places a module used to calculate loss on this device. This must be the last device in the pipeline. - - Parameters - ---------- - module: Module - A single loss module - """ - - if not isinstance(module, Module): - raise RuntimeError("Expected a Module, but got " + str(type(module))) - - self.place_module(module) - self.loss_module = module - set_state_changed() - - def remove_loss_module(self): - """ - Remove module used to calculate loss from this device - """ - - assert self.loss_module is not None - - self.modules.remove(self.loss_module) - self.loss_module = None - - def push_to_inputs(self, *tensors: Union[Tuple[Union[torch.Tensor, Tensor], ...], Dict[str, Union[torch.Tensor, Tensor]]]): - """ - Push tensor(s) to module inputs, either in order, or by keyword argumet if a dictionary is used. The data will be queued - up on the target device until it is ready to be consumed. - - This call can block if there is no space on the target device's input queues. - - Parameters - ---------- - *tensors: Union[torch.Tensor, Tensor] - Ordered list of inputs to be pushed into the module's input queue. Can be pytorch or pybuda tensor. - - """ - if self.cpu_fallback_device_pre is not None: - logger.info("push_to_inputs redirected from {} to {}", self, self.cpu_fallback_device_pre) - return self.cpu_fallback_device_pre.push_to_inputs(*tensors) - - logger.trace("push_to_inputs on {}", self) - if len(tensors) == 1 and isinstance(tensors[0], (tuple, list, dict, UserDict, OrderedDict)): - # already grouped, break it up - tensors = tensors[0] - - if isinstance(tensors, (dict, UserDict, OrderedDict)): - [self.modules[0].input_names, tensors] = zip(*tensors.items()) - - if self._first_inputs is None: - self._first_inputs = tensors - - if ((self._first_inputs[0].shape)[0] != (tensors[0].shape)[0]): - raise RuntimeError("Batch size mismatch between first input and current input") - - self._input_buffer.put(to_pt_tensors(tensors)) - - def push_to_target_inputs(self, *tensors): - """ - Push tensor(s) to module training target inputs, in order. The data will be queued up on the target - device until it is ready to be consumed. - - This call can block if there is no space on the target device's input queues. 
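# A hedged usage sketch of the placement and push API defined above (place_module,
# place_loss_module, push_to_inputs, push_to_target_inputs). CPUDevice and PyTorchModule
# are the classes from this package; the module and device names are made up, and a
# subsequent pybuda.run_training(...) call is assumed to drive the pipeline.
import torch
import pybuda

dev = pybuda.CPUDevice("cpu0")
dev.place_module(pybuda.PyTorchModule("mlp", torch.nn.Linear(8, 2)))
dev.place_loss_module(pybuda.PyTorchModule("loss", torch.nn.L1Loss()))
dev.push_to_inputs(torch.rand(4, 8))           # ordered inputs, one per input node
dev.push_to_target_inputs(torch.rand(4, 2))    # training targets for the loss module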
- - Parameters - ---------- - tensors - Ordered list of inputs to be pushed into the module's target input queue - """ - if self.cpu_fallback_device_post is not None: - logger.info("push_to_target_inputs redirected from {} to {}", self, self.cpu_fallback_device_post) - return self.cpu_fallback_device_post.push_to_target_inputs(*tensors) - - logger.trace("push_to_target_inputs on {}", self) - if len(tensors) == 1 and isinstance(tensors[0], (tuple, list)): - # already grouped, break it up - tensors = tensors[0] - - if self._first_targets is None: - self._first_targets = tensors - - self.target_input_queue.put(tensors) - - def _get_target_inputs(self): - """ - Get inputs from training target input queue to send to a module for processing. Blocking until data has - been received, or a shutdown event has been received by the process. - - Returns - ------- - Optional[Tuple[SomeTensor]] - A list of input tensors. Will return None if process received a shutdown message. - """ - - return self._read_from_mp_queue(self.target_input_queue) - - def _get_recompute_inputs(self): - """ - Get saved inputs from the previous forward pass. Blocking until data has been received, or a shutdown - event has been received by the process. - - Returns - ------- - Optional[Tuple[SomeTensor]] - A list of input tensors. Will return None if process received a shutdown message. - """ - - return self._read_from_mp_queue(self.recompute_input_queue) - - def push_to_command_queue(self, cmd): - """ - Send command to the running main loop in another process - """ - self.command_queue.put(cmd) - - def get_command_queue_response(self) -> Optional[Dict]: - """ - Read from command queue response. This is blocking. - - Returns - ------- - Optional[Dict] - Command-specific dictionary with response data, or None in case of failures - """ - while True: - try: - resp = self._command_queue_resp.get(timeout=0.1) - break - except queue.Empty as _: - if self.shutdown_event is not None and self.shutdown_event.is_set(): - logger.debug("Ending process on {} due to shutdown event", self) - if self.final_barrier is not None: - self.final_barrier.abort() - return # got a signal to shutdown and end the process - continue - return resp - - - def _init_concurrent_run(self): - """ - Callback before concurrent processes are launched - """ - for dc in [self.forward_dc, self.backward_dc, self.forward_input_dc, self.backward_input_dc, self.target_input_dc, self.intermediates_dc]: - if dc: - dc.initialize() - - def _drain_queue(self, q: queue.Queue): - """ - Drain and discard queue contents - """ - while True: - try: - q.get(timeout=0.1) - continue - except queue.Empty as _: - return - - def get_next_command(self, command_queue: queue.Queue) -> Optional["Command"]: - """ - Read next command to run, from the given command queue. Blocking. 
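# The blocking reads above (get_command_queue_response, _read_from_mp_queue) all follow
# the same poll-with-timeout pattern so a worker can notice a shutdown event instead of
# hanging forever; a minimal standalone sketch of that pattern:
import queue
import threading

def read_or_shutdown(q: queue.Queue, shutdown: threading.Event, timeout: float = 0.1):
    """Block on q.get(), but return None promptly once the shutdown event is set."""
    while True:
        try:
            return q.get(timeout=timeout)
        except queue.Empty:
            if shutdown.is_set():
                return None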
- - Parameters - ---------- - command_queue: queue.Queue - Queue of commands - - Returns - ------- - Command - Next command from the queue, or None if shutdown_even was set - """ - - while True: - try: - cmd = command_queue.get(timeout=1) - logger.trace("{}: Got command from queue: {}", self, cmd) - break - except queue.Empty as _: - if self.shutdown_event is not None and self.shutdown_event.is_set(): - logger.debug("Ending process on {} due to shutdown event", self) - if self.final_barrier is not None: - self.final_barrier.abort() - self._drain_queue(command_queue) - return None # got a signal to shutdown and end the process - continue - except KeyboardInterrupt as _: - logger.info("Keyboard interrupt detected on {}", self) - if self.shutdown_event is not None: - self.shutdown_event.set() - if self.final_barrier is not None: - self.final_barrier.abort() # prevent deadlock on other processes - self._drain_queue(command_queue) - return None - - return cmd - - def push_command_response(self, resp: Dict[str, Any]): - logger.trace("Pushing command response: {}", resp) - self._command_queue_resp.put(resp) - - @contextmanager - def _try_run(self, msg: str) -> Iterator[None]: - """ - Wrapper around arbitrary code that catches exceptions and raises abort flags - """ - try: - yield - except Exception as e: - logger.error("{} error: {}", msg, e) - import traceback - print(traceback.format_exc()) - if self.shutdown_event is not None: - self.shutdown_event.set() - if self.final_barrier is not None: - self.final_barrier.abort() # prevent deadlock on other processes - - # Drain command queue - self._drain_queue(self.command_queue) - - def run_next_command(self, cmd: "Command") -> bool: - """ - In concurrent mode, this is called in a forever loop by the process dedicated to this device. - In sequential mode, the main process will call this until there's no more work to do. 
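# Standalone sketch of the _try_run error-propagation idea above: any exception raised
# inside the block flips a shared shutdown event so sibling workers can exit cleanly
# (the barrier abort and command-queue draining from the real method are omitted here).
import threading
from contextlib import contextmanager

@contextmanager
def try_run(msg: str, shutdown: threading.Event):
    try:
        yield
    except Exception as exc:        # swallowed, as in _try_run: the event signals failure
        print(f"{msg} error: {exc}")
        shutdown.set()

ev = threading.Event()
with try_run("Forward", ev):
    raise RuntimeError("boom")      # ev is now set; the caller can drain queues and stop
assert ev.is_set()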
- - Parameters - ---------- - command_queue: queue.Queue - Command queue to read commands from - - Returns - ------- - bool - True if quit command was seen - - """ - from .run.commands import CommandType - - if cmd.command_type == CommandType.QUIT: - logger.debug("Received SHUTDOWN command on {}", self) - self._shutdown_threads() - if self.final_barrier is not None: - logger.debug("Waiting for barrier on {}", self) - self.final_barrier.wait() - logger.debug("Shutting down on {}", self) - self.shutdown_device() - return True # Done - - if cmd.command_type == CommandType.RUN_FORWARD: - logger.debug("Received RUN_FORWARD command on {} / {}", self, os.getpid()) - with self._try_run("Forward"): - self.forward(loop_count=cmd.params["loop_count"]) - - elif cmd.command_type == CommandType.RUN_BACKWARD: - logger.debug("Received RUN_BACKWARD command on {} / {}", self, os.getpid()) - with self._try_run("Backward"): - self.backward(loop_count=cmd.params["loop_count"], zero_grad=cmd.params["zero_grad"]) - - elif cmd.command_type == CommandType.RUN_GENERATE: - logger.debug("Received RUN_GENERATE command on {} / {}", self, os.getpid()) - with self._try_run("Generate"): - self.generate(loop_count=cmd.params["loop_count"], write_index=cmd.params["write_index"], tokens_per_iter=cmd.params["tokens_per_iter"], token_id=cmd.params["token_id"]) - - elif cmd.command_type == CommandType.RUN_OPTIMIZER: - logger.debug("Received RUN_OPTIMIZER command on {} / {}", self, os.getpid()) - with self._try_run("Optimizer"): - self._step_optimizer() - - elif cmd.command_type == CommandType.RUN_SCHEDULERS: - logger.debug("Received RUN_SCHEDULERS command on {} / {}", self, os.getpid()) - - with self._try_run("Schedulers"): - self._step_schedulers() - - elif cmd.command_type == CommandType.COMPILE: - logger.debug("Received COMPILE command on {} / {}", self, os.getpid()) - logger.trace("Compile command: {}", cmd.params) - try: - ret = self.compile_for( - cmd.params["inputs"], - cmd.params["compiler_cfg"], - cmd.params["targets"], - cmd.params["microbatch_size"], - cmd.params["microbatch_count"], - cmd.params["verify_cfg"]) - - self.push_command_response({"outputs": ret}) - except Exception as e: - import traceback - logger.error("Compile error: {}\n{}", e, traceback.format_exc()) - self.push_command_response(e) - - elif cmd.command_type == CommandType.GET_QUEUES: - assert "queue_type" in cmd.params - ( - queues, - tile_broadcast_dims, - original_shapes, - requires_grad, - runtime_tensor_transforms, - constant_inputs, - tile_dims, - ) = self.get_dram_io_queues(cmd.params["queue_type"]) - self.push_command_response( - { - "queues": queues, - "tile_broadcast_dims": tile_broadcast_dims, - "original_shapes": original_shapes, - "requires_grad": requires_grad, - "runtime_tensor_transforms": runtime_tensor_transforms, - "constant_inputs": constant_inputs, - "tile_dims": tile_dims, - }) - - elif cmd.command_type == CommandType.SET_QUEUES: - logger.trace("Set DRAM IO queues on {}: {}", self, cmd.params['direction']) - self.set_dram_io_queues( - cmd.params["direction"], - cmd.params["queues"], - cmd.params["tile_broadcast_dims"], - cmd.params["original_shapes"], - cmd.params["requires_grad"], - cmd.params["runtime_tensor_transforms"], - cmd.params["constant_inputs"], - cmd.params["tile_dims"]) - - elif cmd.command_type == CommandType.DC_TRANSFER: - logger.trace("DC Transfer on {}: {}", self, cmd.params['direction']) - direction = cmd.params["direction"] - if self._sequential: - self.dc_transfer(direction) - else: - # Push to thread and 
move on - if direction not in self.dc_transfer_threads: - # Start a new thread - dir_q = queue.Queue() - self.dc_transfer_threads[direction] = ( - threading.Thread(target=self.dc_transfer_thread, args=(direction, dir_q)), - dir_q) - self.dc_transfer_threads[direction][0].start() - - self.dc_transfer_threads[direction][1].put(direction) - - elif cmd.command_type == CommandType.CPUEVAL_FORWARD: - logger.trace("CPUEVAL_FORWARD on {}", self) - ret = self.cpueval_forward(cmd.params["inputs"], cmd.params["parameters"], cmd.params["save_for_backward"], cmd.params["targets"]) - ret = detach_tensors(ret) - self.push_command_response({"result": ret}) - - elif cmd.command_type == CommandType.CPUEVAL_BACKWARD: - logger.trace("CPUEVAL_BACKWARD on {}", self) - input_grads, params_grads = self.cpueval_backward( - cmd.params["bw_inputs"], - cmd.params["parameters"]) - self.push_command_response({"input_grads": input_grads, "params_grads": params_grads}) - - elif cmd.command_type == CommandType.CPUEVAL_LOSS: - logger.trace("CPUEVAL_LOSS on {}", self) - ret = self.cpueval_loss(cmd.params["fw_outputs"], cmd.params["targets"], cmd.params["scale_loss"]) - #ret = tuple(t.detach() for t in ret) - self.push_command_response({"loss": ret}) - - - elif cmd.command_type == CommandType.GET_PARAMETER_CHECKPOINT: - logger.trace("GET_PARAMETER_CHECKPOINT on {}", self) - self.push_command_response({"checkpoint": self.get_parameter_checkpoint()}) - - elif cmd.command_type == CommandType.GET_PARAMETER_GRADIENTS: - logger.trace("GET_PARAMETER_GRADIENTS on {}", self) - self.push_command_response({"gradients": self.get_parameter_gradients()}) - - elif cmd.command_type == CommandType.UPDATE_DEVICE_PARAMETERS: - logger.trace("UPDATE_DEVICE_PARAMETERS on {}", self) - self.update_device_parameters(cmd.params["parameters"]) - - elif cmd.command_type == CommandType.SYNC: - logger.trace("SYNC on {}", self) - self.sync() - self.push_command_response({"sync": True}) - - else: - raise RuntimeError("Unknown command received by ", self) - - return False - - def dc_transfer_thread(self, direction: str, direction_queue: queue.Queue): - """ - Keep transfering data in a thread. One per direction. 
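# The per-direction transfer threads above stop on a "quit" sentinel rather than a flag;
# a minimal standalone sketch of that worker pattern with plain queues:
import queue
import threading

def transfer_worker(cmds: queue.Queue):
    while True:
        cmd = cmds.get()
        if cmd == "quit":
            return
        print("transferring", cmd)

cmd_q: queue.Queue = queue.Queue()
worker = threading.Thread(target=transfer_worker, args=(cmd_q,))
worker.start()
cmd_q.put("forward")
cmd_q.put("quit")
worker.join()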
- """ - while True: - try: - cmd = direction_queue.get(timeout=0.1) - logger.trace("DC transfer thread {} got cmd={}", direction, cmd) - if cmd == "quit": - return - - assert cmd == direction - self.dc_transfer(direction) - - except queue.Empty as _: - if self.shutdown_event is not None and self.shutdown_event.is_set(): - logger.debug("Ending dc transfer thread {} on {} due to shutdown event", direction, self) - return - continue - - def dc_transfer(self, direction: str): - """ - Transfer data between devices - """ - if direction == "forward": - if self.forward_dc is not None: - self.forward_dc.transfer(blocking=True) - elif direction == "forward_input": - self.forward_input_dc.transfer(blocking=True) - elif direction == "target": - self.target_input_dc.transfer(blocking=True) - elif direction == "backward": - self.backward_dc.transfer(blocking=True) - elif direction == "intermediates": - if self.intermediates_dc is not None: - self.intermediates_dc.transfer(blocking=True) - else: - raise RuntimeError(f"Invalid direction: {direction}") - - - def set_dram_io_queues(self, direction: str, queues: List["DramIODesc"], tile_broadcast_dims: Optional[List[List[int]]] = None, - original_shapes: Optional[List[Tuple[int, ...]]] = None, requires_grad: Optional[List[bool]] = None, - runtime_tensor_transforms: Optional[List[RuntimeTensorTransform]] = None, constant_inputs: Optional[List[bool]] = None, - tile_dims: Optional[List[List[int]]] = None): - - if direction == "forward_in": - assert original_shapes is not None - assert requires_grad is not None - self._io_queues[direction] = {"queues" : queues, "original_shapes" : original_shapes, "requires_grad" : requires_grad, "runtime_tensor_transforms" : runtime_tensor_transforms} - self.forward_input_dc.set_dram_io_pop_queues(queues, original_shapes, requires_grad, runtime_tensor_transforms) - - elif direction == "forward_in_push": - assert tile_broadcast_dims is not None - self._io_queues[direction] = { - "queues" : queues, "tile_broadcast_dims" : tile_broadcast_dims, "runtime_tensor_transforms" : runtime_tensor_transforms, "constant_inputs" : constant_inputs, "tile_dims" : tile_dims} - self.forward_input_dc.set_dram_io_push_queues(queues, tile_broadcast_dims, runtime_tensor_transforms, constant_inputs, tile_dims) - - elif direction == "forward_out": - assert tile_broadcast_dims is not None - self._io_queues[direction] = {"queues" : queues, "tile_broadcast_dims" : tile_broadcast_dims, "runtime_tensor_transforms" : runtime_tensor_transforms} - self.forward_dc.set_dram_io_push_queues(queues, tile_broadcast_dims, runtime_tensor_transforms) - - elif direction == "forward_out_pop": - assert original_shapes is not None - assert requires_grad is not None - self._io_queues[direction] = {"queues" : queues, "original_shapes" : original_shapes, "requires_grad" : requires_grad, "runtime_tensor_transforms" : runtime_tensor_transforms} - self.forward_dc.set_dram_io_pop_queues(queues, original_shapes, requires_grad, runtime_tensor_transforms) - - elif direction == "backward_in": - assert original_shapes is not None - assert requires_grad is not None - self._io_queues[direction] = {"queues" : queues, "original_shapes" : original_shapes, "requires_grad" : requires_grad, "runtime_tensor_transforms" : runtime_tensor_transforms} - self.backward_input_dc.set_dram_io_pop_queues(queues, original_shapes, requires_grad, runtime_tensor_transforms) - - elif direction == "backward_out": - assert original_shapes is not None - assert requires_grad is not None - 
self._io_queues[direction] = {"queues" : queues, "original_shapes" : original_shapes, "requires_grad" : requires_grad, "runtime_tensor_transforms" : runtime_tensor_transforms} - self.backward_dc.set_dram_io_pop_queues(queues, original_shapes, requires_grad, runtime_tensor_transforms) - - elif direction == "backward_out_push": - assert tile_broadcast_dims is not None - self._io_queues[direction] = {"queues" : queues, "tile_broadcast_dims" : tile_broadcast_dims, "runtime_tensor_transforms" : runtime_tensor_transforms} - self.backward_dc.set_dram_io_push_queues(queues, tile_broadcast_dims, runtime_tensor_transforms) - - elif direction == "target_in_push": - assert tile_broadcast_dims is not None - self._io_queues[direction] = {"queues" : queues, "tile_broadcast_dims" : tile_broadcast_dims, "runtime_tensor_transforms" : runtime_tensor_transforms} - self.target_input_dc.set_dram_io_push_queues(queues, tile_broadcast_dims, runtime_tensor_transforms) - - elif direction == "intermediates_pop": - assert original_shapes is not None - assert requires_grad is not None - self._io_queues[direction] = {"queues" : queues, "original_shapes" : original_shapes, "requires_grad" : requires_grad, "runtime_tensor_transforms" : runtime_tensor_transforms} - self.intermediates_dc.set_dram_io_pop_queues(queues, original_shapes, requires_grad, runtime_tensor_transforms) - - else: - raise RuntimeError("Unknown direction") - - - - def get_dram_io_queues(self, queue_type: str) -> List["DramIODesc"]: - raise RuntimeError("Only TTDevice implements get_dram_io_queues") - - def run(self, output_dir: str): - """ - Main process loop in concurrent mode. - - The loop receives commands through its command queue, which indicate how many epochs & iterations to - run, whether to run training or inference, and position in the pipeline. - - The loop will run until shutdown command is sent in the command queue, or shutdown event is raised due - to an exception in another process - - Parameters - ---------- - - output_dir: str - Output directory needed by perf trace on every process - """ - - atexit.register(atexit_handler, devices=(self,)) - self._init_concurrent_run() - initialize_child_process(output_dir) - - try: - while True: - cmd = self.get_next_command(self.command_queue) - if cmd is None: - break - done = self.run_next_command(cmd) - if done: - break - - except Exception as e: - if self.shutdown_event is not None: - self.shutdown_event.set() - if self.final_barrier is not None: - self.final_barrier.abort() # prevent deadlock on other processes - logger.debug("Ending process on {} due to exception: {}", self, e) - self.shutdown_device() - raise - - #finally: - # # TODO: this should only be done if we're really done... in concurrent mode, we should - # # keep processes alive - # self.shutdown_device() - - def compile_for(self, training: bool, microbatch_size: int = 0, microbatch_count: int = 1): - """ - Save microbatch size and count - """ - self._microbatch_size = microbatch_size - self._microbatch_count = microbatch_count - self._training = training - - def _get_first_tensors(self, first_tensors: Tuple[Tensor, ...]) -> Tuple[int, Tuple[Tensor, ...] 
]: - - # detect microbatch size - def get_microbatch_size(t): - return t.shape[0] - - first_inputs = first_tensors - microbatch_size = get_microbatch_size(first_inputs[0]) - - for input in first_inputs: - mb_size = get_microbatch_size(input) - if (mb_size != microbatch_size) and (mb_size != 1): - raise RuntimeError("Microbatch size doesn't match for all inputs") - - out_first_inputs = remove_microbatch(first_inputs) - return microbatch_size, out_first_inputs - - def get_first_targets(self) -> List[Tensor]: - """ - Return the tuple of first targets pushed to this device - """ - if self._first_targets is None: - raise RuntimeError("Targets must be pushed into the last device before trying to compile the model.") - - _, first_targets = self._get_first_tensors(self._first_targets) - - return first_targets - - def get_first_inputs(self, peek=False) -> Tuple[int, Tuple[Tensor, ...] ]: - """ - Return the microbatch size, and first input in microbatch pushed into the device. If input_shapes/input_types - are provided, then those will be used to create input tensors. - - This is used to compile and optimize the model for dimensions provided by the first input. - """ - if self._first_inputs is None: - if peek: - return None, None - raise RuntimeError("Inputs must be pushed into the first device before trying to compile the model.") - - microbatch_size, first_inputs = self._get_first_tensors(self._first_inputs) - if not peek: - self._first_inputs = None # release - - return microbatch_size, first_inputs - - def shutdown_device(self): - """ - Check for any mp queues that are not empty, and drain them - """ - if self._sequential: - return # notthing to clean up here if we're not multi-processing - - finish_child_process() - - def _read_from_mp_queue(self, q: queue.Queue): - """ - Read from mp queue, and abort if shutdown event has been received by the process. - - Returns - ------- - Any - Data from the queue, or None if aborted - """ - - while True: - try: - out = q.get(timeout=0.1) - break - except queue.Empty as _: - if self.shutdown_event is not None and self.shutdown_event.is_set(): - logger.trace("_read_from_mp_queue aborting on {}", self) - return None # got a signal to shutdown and end the process - continue - - return out - - def _shutdown_threads(self): - - for d in self.dc_transfer_threads.values(): - d[1].put("quit") - d[0].join() - self.dc_transfer_threads = {} # clear threads - - for dc in [self.forward_dc, self.backward_dc, self.forward_input_dc, self.backward_input_dc, self.target_input_dc]: - if dc: - dc.shutdown() - - # Device connector for forward inputs - def _set_forward_input_dc(self, dc: DeviceConnector): - self.forward_input_dc = dc - - # Device connector for backward inputs - def _set_backward_input_dc(self, dc: DeviceConnector): - self.backward_input_dc = dc - - def cpueval_backward(self, - bw_inputs: List[torch.Tensor], - parameters: Dict[str, torch.Tensor]) -> Tuple[List[torch.Tensor], Dict[str, torch.Tensor]]: - """ - Evaluate backward pass for verification. `cpueval_forward` should've been called first, with - `save_for_backward` set. - - Parameters - ---------- - bw_inputs: List[torch.Tensor] - BW inputs, i.e. 
losses for each fw output - - parameters: Dict[str, torch.Tensor] - Module parameters - - Returns - ------- - List[Tensor] - Gradients on ordered inputs - - Dict[str, Tensor] - Gradients on parameters - """ - - assert self._saved_fw_inputs is not None, "cpueval_forward has not been called with save_for_backward" - assert self._saved_fw_outputs is not None, "cpueval_forward has not been called with save_for_backward" - - fw_outputs = self._saved_fw_outputs - self._saved_fw_outputs = None - fw_inputs = self._saved_fw_inputs - self._saved_fw_inputs = None - - fw_outputs = [t for t in fw_outputs if t.requires_grad] - - if self.loss_module: - for fw_output in fw_outputs: - fw_output.backward() - else: - assert len(bw_inputs) == len(fw_outputs) - - for i, (bw_input, fw_output) in enumerate(zip(bw_inputs, fw_outputs)): - fw_output.backward(bw_input, retain_graph=(i < len(bw_inputs) - 1)) - - param_grads = {name : value.grad.clone() for name, value in parameters.items() if value.requires_grad} - input_grads = [t.grad for t in fw_inputs if t.requires_grad] - - return input_grads, param_grads - - def generate(self, loop_count: int, write_index: int): - """ - Run generate forward pass on each module on this device, in order - - Parameters - ---------- - loop_count: int - Number of micro-batches to run - - write_index: int - Write location for past cache buffers - - """ - raise RuntimeError("Children should implement this") - - def forward(self, loop_count: int): - """ - Run forward pass on each module on this device, in order - - Parameters - ---------- - loop_count: int - Number of micro-batches to run - """ - raise RuntimeError("Children should implement this") - - def backward(self, loop_count: int, zero_grad: bool): - """ - Run backward pass on each module on this device, in reverse order - - Parameters - ---------- - loop_count: int - Each mini-batch is broken into micro-batches. This is necessary to fill a multi-device pipeline, - and should be roughly 4-6x the number of devices in the pipeline for ideal performance. 
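# Minimal torch sketch of the cpueval_backward flow above: seed backward() with the
# incoming gradient for each saved forward output, then collect .grad from parameters
# and inputs (the names here are illustrative, not part of the module).
import torch

w = torch.randn(3, 3, requires_grad=True)   # stands in for a module parameter
x = torch.randn(2, 3, requires_grad=True)   # stands in for a saved forward input
y = x @ w                                   # saved forward output
incoming = torch.ones_like(y)               # gradient arriving from the next device
y.backward(incoming)
param_grads = {"w": w.grad.clone()}
input_grads = [x.grad]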
- - zero_grad: bool - Set to true to have optimizer zero out gradients before the run - """ - raise RuntimeError("Children should implement this") - - def _step_optimizer(self): - """ - Step optimizer - """ - raise RuntimeError("Children should implement this") - - def _step_schedulers(self): - """ - Step schedulers - """ - raise RuntimeError("Child should implement this") - -def atexit_handler(devices: Tuple[Optional[Device], ...]): - """ - Shutdown the device on process exit (if not handled cleanly already) - """ - logger.debug("atexit handler called for {}", devices) - for d in devices: - if d is not None: - d.shutdown_device() - logger.debug("atexit handler completed") diff --git a/pybuda/pybuda/device_connector.py b/pybuda/pybuda/device_connector.py deleted file mode 100644 index 08de7f1ab..000000000 --- a/pybuda/pybuda/device_connector.py +++ /dev/null @@ -1,423 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import threading -from enum import Enum -from typing import List, Optional, Union, Tuple -import queue - -from multiprocessing.synchronize import Event as EventClass -from queue import Queue -import torch -import torch.multiprocessing as mp -from loguru import logger - -from .backend import BackendAPI -from .tensor import Tensor, pytorch_tensor_to_tensor_desc, is_equivalent_data_format, pad_pytorch_tensor_to_buda -from .utils import detach_tensors, align_up -from pybuda._C.backend_api import DramIODesc, PytorchTensorDesc -from pybuda._C.graph import RuntimeTensorTransform, RuntimeTensorTransformType, Shape -from pybuda._C import DataFormat -from .pybudaglobal import TILE_DIM, create_queue - -class TransferType(Enum): - MP_QUEUE = 1 # read from / write to a queue in shared memory (on host) - DIRECT = 2 # read/write directly (tilize/untilize) - NONE = 3 # no explicit transfer (i.e. device will do it on its own), so wrapper does nothing - -class DeviceConnector: - """ - DeviceConnector is a light-weight gasket between two devices, providing mechanism to push/pop data. It - abstracts the mechanism for pushing and popping out, while implementing data transfer through mp queuees, - direct tilize/untilize, etc. - - All structures within the class can be pickled and sent to other processes. 
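# Standalone sketch of the MP_QUEUE transfer style named above: a queue created from a
# 'spawn' multiprocessing context carries host-side tensors between the two ends of the
# connector (producer push on one side, consumer read on the other).
import torch
import torch.multiprocessing as mp

ctx = mp.get_context("spawn")
q = ctx.Queue()
q.put([torch.rand(2, 2)])   # push() side
tensors = q.get()           # read() side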
- """ - def __init__(self, - push_type: TransferType, - pop_type: TransferType, - shutdown_event: Optional[EventClass], - queue: Optional[Queue] = None, - side_queue: Optional[Queue] = None): - - self.push_type = push_type - self.pop_type = pop_type - self.shutdown_event = shutdown_event # if the event fires, any blocking actions should stop - - if queue is not None: - self.queue = queue - elif self.pop_type == TransferType.MP_QUEUE: - mp_context = mp.get_context('spawn') - self.queue = create_queue(mp_context) - - self.side_queue = side_queue - - def shutdown(self): - pass # children will override - - def initialize(self): - pass # children will override - - def push_to_side_queue(self, tensors: List[Tensor], clone: bool = False): - """ - Push to side queue, if one is set, to store debug data - """ - if self.side_queue is not None: - if clone: - tensors = [t.clone() for t in tensors] - tensors = [t.detach() for t in tensors] - while True: - try: - self.side_queue.put(tensors) # TODO: timeout and break on shutdown_event - return - except queue.Full as _: - if self.shutdown_event is not None and self.shutdown_event.is_set(): - logger.debug("Aborting side queue put due to shutdown event") - return [] # got a signal to shutdown and end the process - continue - - - def push(self, tensors: List[Tensor]): - - self.push_to_side_queue(tensors) - if self.push_type == TransferType.MP_QUEUE: - while True: - try: - self.queue.put(tensors) # TODO: timeout and break on shutdown_event - return - except queue.Full as _: - if self.shutdown_event is not None and self.shutdown_event.is_set(): - logger.debug("Aborting queue put due to shutdown event") - return [] # got a signal to shutdown and end the process - continue - - - raise RuntimeError(f"Can't handle push to this type: {type(self)}") - - def read(self) -> List[Tensor]: - - if self.queue is not None: - while True: - try: - data = self.queue.get(timeout=0.1) - return data - except queue.Empty as _: - if self.shutdown_event is not None and self.shutdown_event.is_set(): - logger.debug("Aborting queue get due to shutdown event") - return [] # got a signal to shutdown and end the process - continue - - raise RuntimeError("No queue to read from") - - def pop(self): - if self.queue is not None: - return # no-op - - raise RuntimeError("Can't handle pop") - - def transfer(self, blocking: bool): - pass # NOP by default, implemented by some versions - - def set_dram_io_pop_queues(self, _: List[DramIODesc]): - pass - - def set_dram_io_push_queues(self, _: List[DramIODesc], __: List[List[int]], ___: Optional[List[RuntimeTensorTransform]], ____: Optional[List[Tensor]] = None): - pass - - def empty(self) -> bool: - if self.queue is None: - raise RuntimeError("This type of connector can't be polled for emptiness") - return self.queue.empty() - -class DirectPusherDeviceConnector(DeviceConnector): - """ - Connector in which case one device directly pushes (tilizes) to the other - """ - def __init__(self, shutdown_event: Optional[EventClass], sequential: bool, pop_type: TransferType = TransferType.NONE, side_queue: Optional[queue.Queue] = None, microbatch=1): - super().__init__(push_type=TransferType.DIRECT, pop_type=pop_type, shutdown_event=shutdown_event, side_queue=side_queue) - self.direct_push_queues = None # Will be set after compile - self.sequential = sequential - self.tile_broadcast_dims = None - self.runtime_tensor_transforms : List[RuntimeTensorTransform] = None - self.constant_tensors = None - self.microbatch = microbatch - self.pusher_thread = None - - def 
pusher_thread_main(self, cmdqueue: queue.Queue): - logger.info("Pusher thread on {} starting", self) - while True: - while True: - try: - cmd = cmdqueue.get(timeout=0.1) - break - except queue.Empty as _: - if self.shutdown_event is not None and self.shutdown_event.is_set(): - logger.debug("Ending pusher thread on {} due to shutdown event", self) - return # got a signal to shutdown and end the process - continue - - if cmd == "quit": - return - - logger.debug("Pusher thread pushing tensors") - self._internal_push(cmd) - - def shutdown(self): - if self.pusher_thread: - self.pusher_thread_queue.put("quit") - - def initialize(self): - # Create threads - if not self.sequential and not self.pusher_thread: - self.pusher_thread_queue = queue.Queue(maxsize=3) # don't allow pushes to go too far ahead, or we'll run out of memory - self.pusher_thread = threading.Thread(target=self.pusher_thread_main, args=(self.pusher_thread_queue,)) - self.pusher_thread.start() - - def _convert_tensor_for_tilize(self, tensor: Tensor, q: DramIODesc) -> Tensor: - """ - Convert formats to closest supported format, depending on the destination queue - """ - if is_equivalent_data_format(tensor.pt_data_format, q.data_format): - return tensor - - pt_format = tensor.value().dtype - if not tensor.value().is_floating_point(): - return tensor.to_format(q.data_format) - - if q.data_format in [DataFormat.Float16, DataFormat.Bfp8, DataFormat.Bfp4, DataFormat.Bfp2]: - # tensor has to be float16 - if pt_format != torch.float16: - return tensor.to_format(DataFormat.Float16) - return tensor - - if q.data_format in [DataFormat.Float16_b, DataFormat.Bfp8_b, DataFormat.Bfp4_b, DataFormat.Bfp2_b]: - # tensor can be bfloat or fp32 - if not pt_format in [torch.float32, torch.bfloat16]: - return tensor.to_format(DataFormat.Float16_b) - return tensor - - # Don't know what format it is... 
leave as-is and let back-end convert - return tensor - - def _embedding_index(self, tensor: torch.Tensor, original_shape: Tuple[int, ...], q: DramIODesc) -> Tensor: - assert q.data_format in [DataFormat.RawUInt8, DataFormat.RawUInt16, DataFormat.RawUInt32] - assert len(tensor.shape) <= 2, "Must be a 1d tensor" - assert len(original_shape) <= 1 or original_shape[-2] == 1, "Must be a 1d tensor" - assert len(original_shape) <= 2 or original_shape[-3] == 1, "Must be a 1d tensor" - - q_rt = q.bufq_grid_dim_r * q.mblock_m * q.ublock_rt - w = tensor.shape[0] if len(tensor.shape) > 1 else 1 - pad = align_up(tensor.shape[-1], TILE_DIM) - tensor.shape[-1] - tensor = torch.nn.functional.pad(tensor, (0, pad)) - tensor = tensor.reshape(w, 1, 1, tensor.shape[-1]) - tensor[:, :, :, original_shape[-1]:] = ~torch.tensor(0, dtype=tensor.dtype) - tensor = tensor.view(w, q_rt, -1, TILE_DIM) - pad = align_up(tensor.shape[-2], TILE_DIM) - tensor.shape[-2] - tensor = torch.nn.functional.pad(tensor, (0, 0, 0, pad)) - tensor = tensor.view(w, q_rt, -1, TILE_DIM, TILE_DIM) - tensor = tensor.transpose(2, 3).view(w, 1, q_rt * TILE_DIM, -1) - - assert len(tensor.shape) == 4, "_embedding_index: rank changed" - assert tensor.shape[0] == w, "_embedding_index: w changed" - assert tensor.shape[1] == q.t, "_embedding_index: t changed" - assert tensor.shape[2] == (q.bufq_grid_dim_r * q.mblock_m * q.ublock_rt * TILE_DIM), "_embedding_index: tensor dims mismatch q dims" - assert tensor.shape[3] == (q.bufq_grid_dim_c * q.mblock_n * q.ublock_ct * TILE_DIM), "_embedding_index: tensor dims mismatch q dims" - return tensor - - def _internal_push(self, tensors: List[Tensor]): - - tensor_dtypes = [None] * len(tensors) - if not self.direct_push_queues: - print(f"Direct push queues have not been set for {self}") - assert self.direct_push_queues, "Direct push queues have not been set" - assert self.tile_broadcast_dims is not None - assert len(tensors) == len(self.direct_push_queues), ( - f"Incorrect number of tensors provided on input: {len(tensors)} vs {len(self.direct_push_queues)}") - assert self.runtime_tensor_transforms, "Runtime tensor transforms have not been set" - assert len(tensors) == len(self.runtime_tensor_transforms) - - self.push_to_side_queue(tensors) - - # Convert to supported tilize conversion format, if needed - if isinstance(tensors, tuple): - tensors = list(tensors) - - for i, t in enumerate(tensors): - if isinstance(t, Tensor): - tensors[i] = self._convert_tensor_for_tilize(t, self.direct_push_queues[i]) - else: - tensors[i] = t - - # Handles RuntimeTensorTransform::ReinterpretShape - for i, t in enumerate(tensors): - if self.runtime_tensor_transforms[i].type == RuntimeTensorTransformType.EmbeddingIndex: - if isinstance(tensors[i], Tensor): - t = t.value() - assert t is not None - t = self._embedding_index(t, self.runtime_tensor_transforms[i].original_shape, self.direct_push_queues[i]) - tensors[i] = t - tensor_dtypes[i] = DataFormat.RawUInt32 - elif self.runtime_tensor_transforms[i].type == RuntimeTensorTransformType.ConstantInput: - assert self.constant_tensors[i] is not None - tensors[i] = self.constant_tensors[i] - t = tensors[i] - - if isinstance(tensors[i], torch.Tensor): - if self.runtime_tensor_transforms[i].type == RuntimeTensorTransformType.ReinterpretShape: - # TODO: RuntimeTensorTransform could do this transform (for all the RuntimeTensorTransformTypes) - t = t.contiguous().view(self.runtime_tensor_transforms[i].reinterpreted_shape.as_list()) - elif self.runtime_tensor_transforms[i].type == 
RuntimeTensorTransformType.Prestride: - continue - tile_r = self.tile_dims[i][0] if self.tile_dims is not None else TILE_DIM - tile_c = self.tile_dims[i][1] if self.tile_dims is not None else TILE_DIM - tensors[i] = pad_pytorch_tensor_to_buda( - t, self.tile_broadcast_dims[i], squeeze=True, microbatch=self.microbatch, tile_r=tile_r, tile_c=tile_c) - else: - reinterpreted_shape = None - if self.runtime_tensor_transforms[i].type == RuntimeTensorTransformType.ReinterpretShape: - reinterpreted_shape = self.runtime_tensor_transforms[i].reinterpreted_shape.as_list() - tensors[i] = t.to_buda_shape(self.tile_broadcast_dims[i], reinterpret_shape=reinterpreted_shape, clone=False, squeeze=True, microbatch=self.microbatch) - elif self.runtime_tensor_transforms[i].type == RuntimeTensorTransformType.Prestride: - pass - elif self.runtime_tensor_transforms[i].type == RuntimeTensorTransformType.NoTransform: - tensors[i] = t.to_buda_shape(self.tile_broadcast_dims[i], reinterpret_shape=None, clone=False, squeeze=True, microbatch=self.microbatch) - - def to_tensor_desc(t: Union[Tensor, torch.Tensor], type: Union[DataFormat, None]) -> PytorchTensorDesc: - if isinstance(t, Tensor): - return t.to_tensor_desc() - return pytorch_tensor_to_tensor_desc(t, df=type) - - BackendAPI.push_to_queues(self.direct_push_queues, [to_tensor_desc(t, type) for t, type in zip(tensors, tensor_dtypes)], single_input=False) - self.save_tensors = tensors - - def push(self, tensors: List[Tensor]): - - if not self.sequential: - self.pusher_thread_queue.put(tensors) - else: - self._internal_push(tensors) - - def set_dram_io_push_queues( - self, direct_push_queues: List[DramIODesc], - tile_broadcast_dims: List[List[int]], - runtime_tensor_transforms: Optional[List[RuntimeTensorTransform]], - constant_tensors: Optional[List[Tensor]] = None, - tile_dims: Optional[List[List[int]]] = None): - - self.direct_push_queues = direct_push_queues - self.tile_broadcast_dims = tile_broadcast_dims - self.runtime_tensor_transforms = runtime_tensor_transforms if runtime_tensor_transforms is not None else [RuntimeTensorTransform() for _ in range(len(direct_push_queues))] - self.constant_tensors = constant_tensors if constant_tensors is not None else [None for _ in range(len(direct_push_queues))] - self.tile_dims = tile_dims - -class DirectPopperDeviceConnector(DeviceConnector): - """ - Connector in which case one device produces data directly into queues, and other pops from them - """ - def __init__(self, shutdown_event: Optional[EventClass], side_queue: Optional[queue.Queue] = None): - super().__init__(push_type=TransferType.NONE, pop_type=TransferType.DIRECT, shutdown_event=shutdown_event, side_queue=side_queue) - self.direct_pop_queues = None # Will be set after compile - self.original_shapes = None - self.runtime_tensor_transforms = None - - def read(self) -> List[Tensor]: - assert self.direct_pop_queues is not None, "Direct pop queues have not been set" - if len(self.direct_pop_queues) == 0: - return [] - assert self.original_shapes is not None - ret = BackendAPI.read_queues(self.direct_pop_queues, self.original_shapes, self.runtime_tensor_transforms, requires_grad=self.requires_grad, single_output=False, shutdown_event=self.shutdown_event, clone=False) - self.push_to_side_queue(ret) - return ret - - def pop(self): - assert self.direct_pop_queues is not None, "Direct pop queues have not been set" - if len(self.direct_pop_queues) == 0: - return - BackendAPI.pop_queues(self.direct_pop_queues, single_output=False) - - def set_dram_io_pop_queues(self, 
direct_pop_queues: List[DramIODesc], original_shapes: List[Tuple[int, ...]], requires_grad: List[bool], runtime_tensor_transforms: Optional[List[RuntimeTensorTransform]]): - self.direct_pop_queues = direct_pop_queues - self.original_shapes = original_shapes - self.requires_grad = requires_grad - self.runtime_tensor_transforms = runtime_tensor_transforms - -class DirectPusherPopperDeviceConnector(DirectPusherDeviceConnector): - """ - Connector between two direct devices (i.e. TT devices) - """ - def __init__(self, shutdown_event: Optional[EventClass], sequential: bool, side_queue: Optional[queue.Queue] = None): - super().__init__(pop_type=TransferType.DIRECT, shutdown_event=shutdown_event, sequential=sequential, side_queue=side_queue) - self.direct_pop_queues = None # Will be set after compile - self.original_shapes = None - self.runtime_tensor_transforms = None - - def read(self) -> List[Tensor]: - assert self.direct_pop_queues is not None, "Direct pop queues have not been set" - if len(self.direct_pop_queues) == 0: - return [] - assert self.original_shapes is not None - ret = BackendAPI.read_queues(self.direct_pop_queues, self.original_shapes, self.runtime_tensor_transforms, requires_grad=self.requires_grad, single_output=False, shutdown_event=self.shutdown_event, clone=True) - self.push_to_side_queue(ret) - return ret - - def pop(self): - assert self.direct_pop_queues is not None, "Direct pop queues have not been set" - if len(self.direct_pop_queues) == 0: - return - BackendAPI.pop_queues(self.direct_pop_queues, single_output=False) - - def set_dram_io_pop_queues(self, direct_pop_queues: List[DramIODesc], original_shapes: List[Tuple[int, ...]], requires_grad: List[bool], runtime_tensor_transforms: Optional[List[RuntimeTensorTransform]]): - self.direct_pop_queues = direct_pop_queues - self.original_shapes = original_shapes - self.requires_grad = requires_grad - self.runtime_tensor_transforms = runtime_tensor_transforms - - def transfer(self, blocking: bool): - """ - Transfer a piece of data from src to dest - """ - data = self.read() - self.push(data) - - -class InputQueueDirectPusherDeviceConnector(DirectPusherDeviceConnector): - """ - Connector from which we can read, from the given queue, but there are no pushes. This is typically the first - device in the pipeline. - - It implementes a "transfer" function to transfer 1 set of inputs from the queue into the device. - """ - def __init__(self, q: Queue, shutdown_event: Optional[EventClass], sequential: bool): - super().__init__(shutdown_event, sequential) - self.queue = q - - def transfer(self, blocking: bool): - """ - Transfer a piece of data from queue to device, if there are any. Optionally block. - """ - if not blocking and self.queue.empty(): - return - - data = self.read() - self.push(data) - -class OutputQueueDirectPoppperDeviceConnector(DirectPopperDeviceConnector): - """ - Connector that has an external queue that pushes go to. No reading through this connector is allowed. - - It implementes a "transfer" function to transfer 1 set of outputs from device to the queue - """ - def __init__(self, q: Queue, shutdown_event: Optional[EventClass], side_queue: Optional[queue.Queue] = None): - super().__init__(shutdown_event, side_queue=side_queue) - self.queue = q - - def transfer(self, blocking: bool): - """ - Transfer a piece of data from device to read queue. Optionally blocking. 
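# The transfer() methods above all reduce to "read from the upstream side, push to the
# downstream side"; a standalone sketch with plain queues standing in for the devices:
import queue

def transfer(src: queue.Queue, dst: queue.Queue, blocking: bool) -> None:
    if not blocking and src.empty():
        return                      # nothing to move yet, like the input-queue connector
    dst.put(src.get())

a, b = queue.Queue(), queue.Queue()
a.put([1, 2, 3])
transfer(a, b, blocking=True)
assert b.get() == [1, 2, 3]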
- """ - if not blocking: - raise NotImplementedError("Non-blocking transfer on output not implemented yet") - - data = self.read() - self.queue.put([t.clone().detach() for t in data]) # Need to clone, otherwise popping will erase the tensor - self.pop() diff --git a/pybuda/pybuda/fx/__init__.py b/pybuda/pybuda/fx/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pybuda/pybuda/fx/capture.py b/pybuda/pybuda/fx/capture.py new file mode 100644 index 000000000..6da0d452f --- /dev/null +++ b/pybuda/pybuda/fx/capture.py @@ -0,0 +1,401 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# +# Capture the FX graph and convert to MixedGraph of PyBuda and CPU graphs +# + +from typing import Dict, List, Optional +import copy + +import torch +from loguru import logger + +from pybuda._C.graph import create_op_node, create_data_edge, create_parameter_input, create_activation_input, create_output, create_constant_input, add_subgraph_io_link_edge +from pybuda.tensor import pytorch_dtype_to_buda_dataformat +from pybuda.fx.nodes import get_pybuda_node, torch_constant_ops, is_supported_op, get_unsupported_nodes +from pybuda.config import _get_global_compiler_config +from pybuda.fx.mixed_graph import MixedGraph +from pybuda.fx.schedule import TensorSource, Schedule +from pybuda.fx.graph_utils import reduce_graph, graph_lint + +import pybuda + +class CaptureFX: + def __init__(self): + self.graph : Optional[MixedGraph] = None + self.node_to_id : Dict[torch.fx.Node, int] = {} + self.param_to_id : Dict[str, int] = {} + self.const_to_id : Dict[torch.Tensor, int] = {} + self.id_to_intermed : Dict[int, torch.Tensor] = {} + self.output_nodes_per_subgraph : Dict[int, List] = {} + + def reset_state(self): + self.graph = None + self.node_to_id = {} + self.param_to_id = {} + self.const_to_id = {} + self.id_to_intermed = {} + self.output_nodes_per_subgraph = {} + + def capture_sample_outputs(self, outputs: List[torch.Tensor], subgraph_id: int): + assert self.graph is not None + self.graph.capture_sample_outputs(outputs, subgraph_id) + + def get_buda_graph(self) -> pybuda._C.graph.Graph: + assert self.graph is not None + return self.graph.graph + + def append_to_graph( + self, + module_name: str, + module: torch.nn.Module, + aten_module: torch.nn.Module, + sample_inputs: List[torch.Tensor], + subgraph_id: int): + + if self.graph is None: + self.graph = MixedGraph(module_name) + + self.graph.capture_sample_inputs(inputs=sample_inputs, subgraph_id=subgraph_id) + + activations = [torch.rand(sample_input.shape).to(sample_input.dtype).to("cpu") for sample_input in sample_inputs] + device_graph_changed, graph_inputs, intermediate_tensors, output_tensors, schedule = self._append_to_graph(module, aten_module, activations, subgraph_id) + logger.debug(f"Appending to graph done, captured {len(self.get_buda_graph().nodes())} nodes") + return device_graph_changed, graph_inputs, intermediate_tensors, output_tensors, schedule + + def eval_node(self, node): + assert isinstance(node.target, torch._ops.OpOverloadPacket) + + eval_args = [self.id_to_intermed[self.node_to_id[arg]] if isinstance(arg, torch.fx.node.Node) else arg for arg in node.args] + for idx, arg in enumerate(eval_args): + if isinstance(arg, (list, tuple)): + eval_args[idx] = [self.id_to_intermed[self.node_to_id[a]] if isinstance(a, torch.fx.node.Node) else a for a in arg] + kwargs = {k:v for k, v in node.kwargs.items() if k != "device"} + + return node.target(*eval_args, **kwargs) + + + def add_op(self, 
node, name, pybuda_node, subgraph_idx): + shape = node.meta['tensor_meta'].shape if pybuda_node.shape is None else pybuda_node.shape + dtype = pytorch_dtype_to_buda_dataformat(node.meta['tensor_meta'].dtype) if pybuda_node.dtype is None else pybuda_node.dtype + + logger.trace("add_op: {} shape: {} dtype: {}", name, shape, dtype) + self.add_constants_if_necessary(pybuda_node.args, subgraph_idx) + if "nn_module_stack" in node.meta: + tags = { + "layer": list(node.meta["nn_module_stack"].values())[-1][0], + "stack_trace": "-->".join([str(v) for v in node.meta["nn_module_stack"].values()]) + } + else: + tags = {} + if len(shape) == 0: + shape = [1] + nid = create_op_node( + self.get_buda_graph(), + f"{name}_{subgraph_idx}", + pybuda_node.op, + [int(dim) for dim in shape], + pytorch_dtype_to_buda_dataformat(dtype), + subgraph_idx, + tags) + + for i, input_node in enumerate(pybuda_node.args): + create_data_edge(self.get_buda_graph(), self.node_to_id[input_node], 0, nid, i, []) + + if isinstance(node.target, torch._ops.OpOverloadPacket): + # We will add NOP in cases where input to current subgraph is left on device + # For input nodes, node.target is str + self.id_to_intermed[nid] = self.eval_node(node) + + if (pybuda_node.wrap_tuple): + nid = (nid,) + return nid + + def add_input(self, node, subgraph_idx): + nid = create_activation_input( + self.get_buda_graph(), + f"{node.name}_{subgraph_idx}", + [int(dim) for dim in node.meta['tensor_meta'].shape], + node.meta["tensor_meta"].requires_grad, + pytorch_dtype_to_buda_dataformat(node.meta["tensor_meta"].dtype), + subgraph_idx) + return nid + + + def add_constant(self, name, tensor, subgraph_idx): + if tensor in self.const_to_id: + return self.const_to_id[tensor] + nid = create_constant_input( + self.get_buda_graph(), + f"{name}_{subgraph_idx}", + tensor, + [int(dim) for dim in tensor.shape], + pytorch_dtype_to_buda_dataformat(tensor.dtype), + subgraph_idx) + self.const_to_id[tensor] = nid + return nid + + def add_param(self, name, torch_param, subgraph_idx): + if name in self.param_to_id: + return self.param_to_id[name] + nid = create_parameter_input( + self.get_buda_graph(), + name, + [int(dim) for dim in torch_param.shape], + torch_param.requires_grad, + pytorch_dtype_to_buda_dataformat(torch_param.dtype), + subgraph_idx) + self.param_to_id[name] = nid + return nid + + def add_outputs(self, node, subgraph_idx): + output_nids = [] + output_tensors = [] + output_requires_grad = [] + for index, meta in enumerate(node.meta['tensor_meta']): + arg = node.args[0][index] + nid = create_output( + self.get_buda_graph(), + node.name + "_" + arg.name + "_" + str(subgraph_idx), + [int(dim) for dim in meta.shape], + pytorch_dtype_to_buda_dataformat(meta.dtype), + False, #TODO Loss output + subgraph_idx) + create_data_edge(self.get_buda_graph(), self.node_to_id[arg], 0, nid, index, []) + output_nids.append(nid) + output_requires_grad.append(meta.requires_grad) + output_tensors.append(self.id_to_intermed[self.node_to_id[arg]]) + return output_nids, output_tensors, output_requires_grad + + def add_constants_if_necessary(self, ops, subgraph_idx): + for op in ops: + if isinstance(op, (float, int)): + if op in self.node_to_id: + continue + tensor = torch.ones([1]) * op + self.node_to_id[op] = self.add_constant(f"{op}", tensor, subgraph_idx) + self.id_to_intermed[self.node_to_id[op]] = tensor + + + def map_node_name_to_org_name(self, module, aten_module): + ret = dict() + + # param nodes + aten_params = dict() + for itm in aten_module.named_parameters(): + aten_name 
= itm[0] + aten_tensor = itm[1] + aten_params[id(aten_tensor)] = aten_name + module_params = dict() + for itm in module.named_parameters(): + module_name = itm[0] + mod = itm[1] + module_params[id(mod)] = module_name + if len(module_params) == len(aten_params): + for tensor_id in module_params.keys(): + ret[aten_params[tensor_id]] = module_params[tensor_id] + + # buffers + aten_buffers = dict() + for itm in aten_module.named_buffers(): + aten_name = itm[0] + aten_tensor = itm[1] + if len(aten_tensor.shape) == 0: + continue + aten_buffers[id(aten_tensor)] = aten_name + module_buffers = dict() + for itm in module.named_buffers(): + mod_name = itm[0] + mod_tensor = itm[1] + if len(mod_tensor.shape) == 0: + continue + module_buffers[id(mod_tensor)] = mod_name + if len(module_buffers) == len(aten_buffers): + for tensor_id in module_buffers.keys(): + ret[aten_buffers[tensor_id]] = module_buffers[tensor_id] + + return ret + + def process_function(self, node, subgraph_idx): + op_name = node.target.__name__ + + if op_name in torch_constant_ops: + kwargs = {k:v for k, v in node.kwargs.items() if k != "device"} + tensor = torch_constant_ops[op_name](*node.args, **kwargs) + if len(tensor.shape) == 0: + tensor = tensor.unsqueeze(0) + self.node_to_id[node] = self.add_constant(node.name, tensor.float(), subgraph_idx) + self.id_to_intermed[self.node_to_id[node]] = tensor + elif op_name == "getitem": + assert isinstance(self.node_to_id[node.args[0]], (list, tuple)) + assert node.args[1] == 0, "currently getitem only supported for index = 0" + self.node_to_id[node] = self.node_to_id[node.args[0]][node.args[1]] + self.id_to_intermed[self.node_to_id[node]] = self.id_to_intermed[self.node_to_id[node]][node.args[1]] + elif is_supported_op(op_name, node): + pybuda_node = get_pybuda_node(op_name, node) + self.node_to_id[node] = self.add_op(node, node.name, pybuda_node, subgraph_idx) + else: + # Unsupported function, fall back to CPU + assert False, f"Unsupported function {op_name}" + + #logger.warning(f"Unsupported function {op_name}, falling back to CPU") + #fg = torch.fx.Graph() + #arg_remap = {} + #self.node_to_id[node] = fg.node_copy(node, lambda n : self.node_to_id[n]) + + # Create an input from the fallback + #self.node_to_id[node] = self.add_input(node, subgraph_idx, module_inputs) + + # Add to fallback list, which we're going to use to create fallback graphs later + #fallback_ops.append(node) + + # Record the intermed value + #self.id_to_intermed[self.node_to_id[node]] = self.eval_node(node) + + + def _append_to_graph(self, module, aten_module, activations, subgraph_idx): + + param_name_map = self.map_node_name_to_org_name(module, aten_module) + + tt_act = [a.to("tt") for a in activations] + # Run static shape propagation on aten module + if len(tt_act) > 0: + shape_prop = torch.fx.passes.shape_prop.ShapeProp(aten_module) + if shape_prop.fake_mode is not None: + fake_args = [shape_prop.fake_mode.from_tensor(t, static_shapes=True) if isinstance(t, torch.Tensor) else t for t in tt_act] + else: + fake_args = tt_act + shape_prop.run(*fake_args) + + graph_lint(aten_module.graph) + aten_module = aten_module.to("cpu") # why? 
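+        # Note (hedged): shape propagation above ran on fake tensors built from the "tt" activations,
+        # so the module is moved back to "cpu" here, presumably to keep the real parameters and buffers
+        # on host for the graph deep-copy and CPU-fallback evaluation that follow.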
+ + module_inputs = [] + output_nids = [] + output_requires_grad = [] + output_tensors = [] + + device_graph = copy.deepcopy(aten_module.graph) + + # Remove unused nodes + reduce_graph(device_graph) + + # Find unsupported nodes + fallback_ops, fallback_outputs = get_unsupported_nodes(device_graph, _get_global_compiler_config()) + + # Filter out unsupported nodes into separate FX graphs + device_graphs = self.graph.filter_unsupported_nodes(device_graph, fallback_ops, fallback_outputs, subgraph_idx) + device_graph = None # Zero out the original variable to avoid accidental use + + # Generate the schedule so we can evaluate fallback and get proper inputs into the rest of the graph + schedule = self.graph.generate_schedule(subgraph_idx, aten_module) + if len(device_graphs) == 1 and len(device_graphs[0].nodes) == 0: + # Nothing left in the graph + logger.debug("No nodes left in the device graph after fallback, skipping") + return False, [], self.id_to_intermed, output_tensors, schedule + + graph_inputs = generate_device_inputs_from_sample_inputs(activations, schedule) + + # Now convert whatever is left. Since "subgraph" term is already used for each graph created by torch, + # we'll use "program" for each device graph created after fallbacks. Each of these will run as a separate + # program in the final netlist + for program_id, device_graph in enumerate(device_graphs): + input_index = 0 + graph_idx = MixedGraph.get_program_subgraph_id(subgraph_idx, program_id) + self.output_nodes_per_subgraph[graph_idx] = [] + + schedule_item = schedule.get_device_schedule_item(program_id) + + for node in device_graph.nodes: + + if node.op == "placeholder": + assert self.graph + input_source = schedule_item.inputs[input_index] + + created = False + input_id = None + if input_source.src == TensorSource.INPUT: + uid = self.graph.get_subgraph_input(subgraph_idx, input_source.index) + if uid != -1 and _get_global_compiler_config().enable_pt2_fx_graph_link: + # this input is on device, don't create input node, add edge to corresponding output + input_id = self.add_input(node, graph_idx) + + idx, output_index = self.graph.get_output_index(uid) + add_subgraph_io_link_edge(self.get_buda_graph(), self.output_nodes_per_subgraph[idx][output_index], 0, input_id, 0) + created = True + logger.trace(f"Linking input edge from {self.output_nodes_per_subgraph[idx][output_index]} to {input_id}") + + if not created: + input_id = self.add_input(node, graph_idx) + + assert input_id + + self.node_to_id[node] = input_id + module_inputs.append(input_id) + self.id_to_intermed[input_id] = graph_inputs[program_id][input_index] + input_index +=1 + + elif node.op == "get_attr": + assert node.target in param_name_map, f"Weight node is not mapped to original names: {node.target}" + self.node_to_id[node] = self.add_param(param_name_map[node.target], aten_module.state_dict()[node.target], graph_idx) + self.id_to_intermed[self.node_to_id[node]] = aten_module.state_dict()[node.target] + elif node.op == "call_function": + self.process_function(node, graph_idx) + elif node.op == "output": + o_nids, o_tensors, o_requires_grad = self.add_outputs(node, graph_idx) + output_nids.extend(o_nids) + output_tensors.extend(o_tensors) + output_requires_grad.extend(o_requires_grad) + else: + assert False, f"Unsupported op {node.op}" + + self.output_nodes_per_subgraph[graph_idx].append(output_nids) + + self.get_buda_graph().register_module_inputs(module_inputs, append=True) + self.get_buda_graph().register_module_outputs(output_nids, output_requires_grad, 
append=True) + return True, graph_inputs, self.id_to_intermed, output_tensors, schedule + +def generate_device_inputs_from_sample_inputs(inputs: List[torch.Tensor], schedule: Schedule) -> List[List[torch.Tensor]]: + + # Run through the schedule using sample inputs, and calculate the sample inputs + # for each of the device graphs. + + output_map = {} + intermediates = {} + + device_graph_inputs = [] + + # To avoid unnecessary computation, figure out how many device graphs are there upfront, so we can stop before the last one + num_device_graphs = len([i for i in schedule if not i.fallback]) + + # Run the schedule up to the device + for item in schedule: + + graph_inputs = [] + for i in item.inputs: + if i.src == TensorSource.INTERMEDIATE: + graph_inputs.append(intermediates[i.index]) + elif i.src == TensorSource.INPUT: + graph_inputs.append(inputs[i.index]) + else: + graph_inputs.append(output_map[i.index]) + + if not item.fallback: + device_graph_inputs.append(graph_inputs) + if len(device_graph_inputs) == num_device_graphs: + return device_graph_inputs + + logger.trace(f"Running graph on CPU to generate sample device inputs: {item.graph_index}") + assert item.graph_module + graph_outputs = item.graph_module(*graph_inputs) + + # Record intermediates + for i, output in enumerate(item.outputs): + if output.intermediate: + intermediates[output.index] = graph_outputs[i] + else: + output_map[output.index] = graph_outputs[i] + + assert False, "Something went wrong, we didn't generate enough inputs" diff --git a/pybuda/pybuda/fx/graph_utils.py b/pybuda/pybuda/fx/graph_utils.py new file mode 100644 index 000000000..b278321ff --- /dev/null +++ b/pybuda/pybuda/fx/graph_utils.py @@ -0,0 +1,148 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# +# Various utility functions for working with FX graphs +# +from typing import List, Tuple, Union + +import torch +from loguru import logger + +from pybuda.fx.nodes import call_function_is_nop, call_function_is_reshape + +def reduce_graph(module_or_graph: Union[torch.fx.Graph, torch.fx.GraphModule]): + # Reduce the graph to only the nodes that are used + + # Traverse up the graph from output nodes to populate consumed nodes set + graph = module_or_graph.graph if isinstance(module_or_graph, torch.fx.GraphModule) else module_or_graph + consumed = set() + working_nodes = [] + for node in graph.nodes: + if node.op == "output": + working_nodes.append(node) + consumed.add(node) + + while len(working_nodes) > 0: + node = working_nodes.pop(0) + if not isinstance(node, torch.fx.Node): + continue + for arg in node.all_input_nodes: + if arg not in consumed: + consumed.add(arg) + working_nodes.append(arg) + + for node in reversed(graph.nodes): + if node not in consumed: + logger.debug(f"Removing node {node.name}") + graph.erase_node(node) + + if len(graph.nodes) == 1: + for node in graph.nodes: + if node.op == "output": + # Remove the output node if it's the only one + graph.erase_node(node) + +def get_output_node(graph: torch.fx.Graph) -> torch.fx.Node: + # Find the output node of the graph - any faster way to do this? 
+ for node in reversed(graph.nodes): + if node.op == "output": + return node + return None + +def append_to_output(graph: torch.fx.Graph, src: torch.fx.Node) -> Tuple[torch.fx.Node, int]: + # Append a src node to the output of the graph, and return the index of the output + output_node = get_output_node(graph) + if output_node is None: + # Create a new one + output_node = graph.output((src,)) + output_node.meta["tensor_meta"] = (src.meta["tensor_meta"],) + return (output_node, 0) + + output_node.args = ((*output_node.args[0], src),) + output_node.meta["tensor_meta"] = (*output_node.meta["tensor_meta"] , src.meta["tensor_meta"]) + return (output_node, len(output_node.args) - 1) + +def move_output_to_end(graph: torch.fx.Graph): + if len(graph.nodes) == 0: + return + + # Output should be at the end, topologically, if it's not there already + output_node = None + first = True + for node in reversed(graph.nodes): + if node.op == "output": + if first: + return # already last + output_node = node + break + first = False + + assert output_node is not None + graph.node_copy(output_node, lambda x : x) + graph.erase_node(output_node) + +def remove_output_index(node: torch.fx.Node, idx: int): + # Remove the output index for the list of outputs + args = list(node.args[0]) + del args[idx] + node.args = (tuple(args),) + + meta = list(node.meta["tensor_meta"]) + del meta[idx] + node.meta["tensor_meta"] = tuple(meta) + +def graph_lint(graph: torch.fx.Graph, graph_name: str = "graph"): + # Check that the graph is well formed. Built-in lint doesn't do enough + # Run built-in first + graph.lint() + + def check(cond, msg): + if not cond: + logger.error(f"Lint error in {graph_name}: {msg}") + graph.print_tabular() + raise RuntimeError(f"Lint error in {graph_name}: {msg}") + + # Check that there's only one output node, and that tensor_meta list is matching output list + found_output = False + for node in graph.nodes: + if node.op == "output": + check(not found_output, f"Multiple output nodes found") + check(len(node.args) == 1, f"Output node {node} has more than one argument") + if "tensor_meta" in node.meta: + check(len(node.meta["tensor_meta"]) == len(node.args[0]), f"Output node {node} in has mismatched tensor meta and args: {node.meta['tensor_meta']} vs {node.args[0]}") + +def graph_to_device(graph: torch.fx.Graph, device: Union[str, torch.device]): + # Update any ops in the graph that are explicitly assigning device, and override to the given device + + def device_kwarg_to_cpu(node: torch.fx.Node): + # If the node is a device kwarg, then we need to move it to CPU + if 'device' in node.kwargs: + new_kwargs = node.kwargs.copy() + new_kwargs['device'] = device + node.kwargs = new_kwargs + + for node in graph.nodes: + if not isinstance(node, torch.fx.Node): + continue + + device_kwarg_to_cpu(node) + +def is_nop_graph(graph: torch.fx.Graph) -> bool: + for node in graph.nodes: + if node.op == "call_function" and not call_function_is_nop(node) and not call_function_is_reshape(node): + return False + return True + +def is_constant_graph(graph: torch.fx.Graph) -> bool: + for node in graph.nodes: + if node.op == "placeholder": + return False + return True + +def has_output(graph: torch.fx.Graph) -> bool: + for node in graph.nodes: + if node.op == "output" and len(node.all_input_nodes) > 0: + return True + return False diff --git a/pybuda/pybuda/fx/mixed_graph.py b/pybuda/pybuda/fx/mixed_graph.py new file mode 100644 index 000000000..36562a443 --- /dev/null +++ b/pybuda/pybuda/fx/mixed_graph.py @@ -0,0 +1,577 @@ +# 
SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# +# Mixed graph contains a pybuda graph, and one or more FX graphs that will be executed on the CPU. It is +# generated by capturing a FX graph from pt2. Unsupported ops, or arguments will be dropped down to CPU. +# + +from typing import Dict, List, Tuple, Set +from collections import defaultdict +import os +import copy + +import torch +from loguru import logger + +import pybuda +from pybuda.fx.nodes import torch_constant_ops, call_function_is_nop, call_function_is_reshape +from pybuda.fx.schedule import Schedule +from pybuda.fx.graph_utils import reduce_graph, get_output_node, append_to_output, move_output_to_end, remove_output_index, graph_lint, is_nop_graph, is_constant_graph, has_output, graph_to_device +from pybuda.fx.trace import IOTracer +from pybuda._C.torch_device import unique_id + +class MixedGraph: + def __init__(self, module_name: str): + self.graph = pybuda._C.graph.Graph(module_name) + self.inputs_per_subgraph : Dict[int, List[int]] = {} + self.outputs_per_subgraph : Dict[int, List[int]] = {} + + # Original graph nodes, ordered before the filtering + self.input_nodes_per_subgraph : Dict[int, List[torch.fx.Node]] = {} + self.output_nodes_per_subgraph : Dict[int, List[torch.fx.Node]] = {} + + self.fallback_graphs_per_subgraph: Dict[int, List[torch.fx.Graph]] = {} + self.mappings_per_subgraph: Dict[int, Dict[str, Dict[torch.fx.Node, torch.fx.Node]]] = {} + self.device_graphs_per_subgraph: Dict[int, List[torch.fx.Graph]] = {} + + # A bit hacky - but there's no compiler-level config for log level at the moment + log_env = 'LOGURU_LEVEL' + self.log_trace = log_env in os.environ and os.environ[log_env] == "TRACE" + + @classmethod + def get_program_subgraph_id(cls, subgraph_idx: int, program_idx: int) -> int: + # encode in one number, maybe break it up later + assert program_idx < 100, "Too many programs in a single subgraph" + return subgraph_idx * 100 + program_idx + + def capture_sample_inputs(self, inputs: List[torch.Tensor], subgraph_id: int): + self.inputs_per_subgraph[subgraph_id] = [unique_id(t) for t in inputs] + + def capture_sample_outputs(self, outputs: List[torch.Tensor], subgraph_id: int): + self.outputs_per_subgraph[subgraph_id] = [unique_id(t) for t in outputs] + + def get_subgraph_input(self, subgraph_id: int, input_index: int) -> int: + return self.inputs_per_subgraph[subgraph_id][input_index] + + def get_output_index(self, uid) -> Tuple[int, int]: + # Return the subgraph index and output index for the given uid + for idx in self.outputs_per_subgraph: + if uid in self.outputs_per_subgraph[idx]: + return idx, self.outputs_per_subgraph[idx].index(uid) + + assert False, "Output not found" + + def filter_unsupported_nodes(self, device_graph: torch.fx.Graph, unsupported_ops: Set[torch.fx.Node], unsupported_outputs: Set[torch.fx.Node], subgraph_id: int): + # Move unsupported ops to CPU + + # First, we'll copy all unsupported ops to a new FX graph. For each node that gets its input + # from a supported op, we'll create a new input in the fallback graph, and output in the original graph. + # We'll then record that mapping. 
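+        # Illustrative example (hypothetical op): if an aten `topk` node were unsupported, the node would
+        # be copied into the fallback graph, the device graph would gain an extra output driven by the
+        # op that feeds `topk`, the fallback graph would gain a matching placeholder, and new_io_mapping
+        # would tie that placeholder back to its device-side producer.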
+ + # Mapping of input node on one graph, and node that drives the output on the other graph that should feed this input + new_io_mapping : Dict[torch.fx.Node, torch.fx.Node] = {} + + # Placeholders that are copied into the new graph + placeholder_map : Dict[torch.fx.Node, torch.fx.Node] = {} + + # Mapping of copied nodes + copied_node_mapping : Dict[torch.fx.Node, torch.fx.Node] = {} + + # Mapping of outputs that got moved out of main graph into fallback, to fallback's output + moved_output_mapping : Dict[torch.fx.Node, torch.fx.Node] = {} + + # List of inputs/outputs to create in the original graph, once we're done traversing and it's safe to modify + scheduled_new_outputs : List[Tuple[torch.fx.Node, torch.fx.Node]] = [] # Tuple - source in original graph, dest in new graph + scheduled_new_inputs : List[Tuple[torch.fx.Node, torch.fx.Node, torch.fx.Node]] = [] # Tuple - dest in original graph, source in original graph, source in new graph + + logger.trace("Initial graph:") + if self.log_trace: + device_graph.print_tabular() + + fallback_graph = torch.fx.Graph() + + if subgraph_id not in self.input_nodes_per_subgraph: + self.input_nodes_per_subgraph[subgraph_id] = [] + + if subgraph_id not in self.output_nodes_per_subgraph: + self.output_nodes_per_subgraph[subgraph_id] = [] + + output_node = get_output_node(device_graph) + if output_node is None: + self.device_graphs_per_subgraph[subgraph_id] = [fallback_graph] + self.fallback_graphs_per_subgraph[subgraph_id] = [] + self.mappings_per_subgraph[subgraph_id] = { + "new_io_mapping": new_io_mapping, + "placeholder_map": placeholder_map, + "copied_node_mapping": copied_node_mapping, + "moved_output_mapping": moved_output_mapping + } + return [fallback_graph] # No outputs, nothing to do + + if is_constant_graph(device_graph): + graph_to_device(device_graph, 'cpu') + self.device_graphs_per_subgraph[subgraph_id] = [fallback_graph] + self.fallback_graphs_per_subgraph[subgraph_id] = [device_graph] + self.mappings_per_subgraph[subgraph_id] = { + "new_io_mapping": new_io_mapping, + "placeholder_map": placeholder_map, + "copied_node_mapping": copied_node_mapping, + "moved_output_mapping": moved_output_mapping + } + + # Record outputs + for arg in output_node.args[0]: + self.output_nodes_per_subgraph[subgraph_id].append(arg) + + return [fallback_graph] + + + # Some of the unsupported nodes will have nop arguments, which we want to copy over as well. Let's find those first. 
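+        # For example, a `clone`/`contiguous`/`expand` producer (all mapped to "nop") feeding an
+        # unsupported op is duplicated into the fallback graph instead of being round-tripped through
+        # the device, and reshape users of unsupported ops are pulled over as well (see the loop below).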
+ to_copy_ops = unsupported_ops.copy() + to_copy_ops.update(unsupported_outputs) + for node in device_graph.nodes: + + # While traversing, record original inputs/outputs + if node.op == "placeholder": + self.input_nodes_per_subgraph[subgraph_id].append(node) + continue + + if node.op == "output": + for arg in node.args[0]: + self.output_nodes_per_subgraph[subgraph_id].append(arg) + continue + + if node not in unsupported_ops and node not in unsupported_outputs: + continue + + for arg in node.all_input_nodes: + if arg.op != "call_function": + continue + + op_name = arg.target.__name__ + if call_function_is_nop(arg): + to_copy_ops.add(arg) + + # If any of the users of unsupported ops are reshapes, they are easier to run on CPU anyway + for user in node.users: + if user.op == "call_function" and call_function_is_reshape(user): + to_copy_ops.add(user) + unsupported_ops.add(user) + + logger.trace(f"To copy/move to CPU: ", to_copy_ops) + + # Now go through and copy the nodes that need copying + for node in device_graph.nodes: + + # If the output is driven by an unsupported op, then we need to move it over to the new graph + if node.op == "output": + assert len(node.args) == 1 + for driving_node in node.args[0]: + if driving_node in unsupported_ops or driving_node in unsupported_outputs: + logger.trace(f"Moving output: {driving_node} {hex(id(driving_node))}, to copied node: {hex(id(copied_node_mapping[driving_node]))}") + + # Add to fallback graph + append_to_output(fallback_graph, copied_node_mapping[driving_node]) + moved_output_mapping[driving_node] = copied_node_mapping[driving_node] + + continue + + # If the op is supported, but has an argument that's unsupported, then we need to create an output on fallback graph to get + # the value, and a placeholder in the original graph, and map them + if node not in to_copy_ops: + if not isinstance(node, torch.fx.Node): + continue + for arg in node.all_input_nodes: + if arg not in unsupported_ops: + continue + + # Check if we already have this input scheduled + already_scheduled = False + for _, n, _ in scheduled_new_inputs: + if n == arg: + already_scheduled = True + break + + if already_scheduled: + continue + + # We should've already made a copy of this, so it's safe to get it from copied_node_mapping + new_node = copied_node_mapping[arg] + append_to_output(fallback_graph, new_node) + scheduled_new_inputs.append((node, arg, new_node)) + + continue + + if node not in to_copy_ops: + continue + + # Figure out which of these need to be new inputs, and also create the map for copying + arg_map : Dict[torch.fx.Node, torch.fx.Node] = {} + + def device_kwarg_to_cpu(node: torch.fx.Node): + # If the node is a device kwarg, then we need to move it to CPU + if 'device' in node.kwargs: + new_kwargs = node.kwargs.copy() + new_kwargs['device'] = 'cpu' + node.kwargs = new_kwargs + + for arg in node.all_input_nodes: + + if arg in to_copy_ops: + # We should've already made a copy of this + assert arg in copied_node_mapping, f"Node {arg} not copied, but it's an argument to {node} and in 'to copy ops'" + arg_map[arg] = copied_node_mapping[arg] + continue + + if arg.op == "call_function": + op_name = arg.target.__name__ + + # If the function is a constant, copy it over, no sense it evaluating it on device and then copying + #if op_name in torch_constant_ops or op_name == "getitem": + if op_name in torch_constant_ops: + logger.trace(f"Copying constant op to fallback graph: {arg}") + new_node = fallback_graph.node_copy(arg, lambda x: x) # there should be no node args to 
copy in a constant op + copied_node_mapping[arg] = new_node + arg_map[arg] = new_node + device_kwarg_to_cpu(copied_node_mapping[arg]) + continue + + # Supported op, calculated on device. We need to create inputs and outputs, unless it's already an output + already_output = arg in self.output_nodes_per_subgraph[subgraph_id] + logger.trace(f"Creating new output/input pair to fallback graph: {arg}") + in_node = fallback_graph.placeholder(arg.name) + if not already_output: + scheduled_new_outputs.append((arg, in_node)) + new_io_mapping[in_node] = arg + arg_map[arg] = in_node + continue + + if arg.op == "placeholder": + # We need to create a new placeholder in the fallback graph + logger.trace(f"Copying placeholder to fallback graph: {arg}") + in_node = fallback_graph.placeholder(arg.name) + arg_map[arg] = in_node + placeholder_map[in_node] = arg + continue + + # Explicitly allow other types for now, so that we don't miss anything important. Eventually we can remove the assert + if arg.op in ["int", "float"]: + continue + + assert False, f"Unsupported argument type {arg.op} for node {arg}" + + + logger.trace(f"Falling back unsupported op, or needed nop, to fallback graph: {node}") + copied_node_mapping[node] = fallback_graph.node_copy(node, lambda x: arg_map[x]) + device_kwarg_to_cpu(copied_node_mapping[node]) + + # Create new outputs + # Graph can only have one output, so we need to append it to existing output + for source, dest in scheduled_new_outputs: + append_to_output(device_graph, source) + + # Create new inputs + for dest, source, new_source in scheduled_new_inputs: + with device_graph.inserting_before(dest): + in_node = device_graph.placeholder(source.name) + in_node.meta["tensor_meta"] = source.meta["tensor_meta"] + source.replace_all_uses_with(in_node) + new_io_mapping[in_node] = new_source + + # Remove outputs + output_node = get_output_node(device_graph) + assert output_node is not None + + for driving_node in moved_output_mapping: + remove_output_index(output_node, output_node.args[0].index(driving_node)) + + # Remove the unsupported ops from the original graph + for node in reversed(device_graph.nodes): + if node in unsupported_ops: + device_graph.erase_node(node) + + # Reduce unused stuff + reduce_graph(device_graph) + + # Move outputs to the end + move_output_to_end(device_graph) + move_output_to_end(fallback_graph) + + graph_lint(device_graph, "Device_graph_after_fallback") + graph_lint(fallback_graph, "Merged_fallback") + + logger.trace("After fallback:") + logger.trace("Device graph:") + if self.log_trace: + device_graph.print_tabular() + logger.trace("Fallback graph:") + if self.log_trace: + fallback_graph.print_tabular() + logger.trace("IO Mappings: ", new_io_mapping) + + # Break up device/fallback graphs into multiple graphs if there are any circular dependencies + device_graphs, fallback_graphs = self.break_up_deadlocks(subgraph_id, device_graph, fallback_graph, new_io_mapping, placeholder_map, copied_node_mapping, moved_output_mapping) + + device_graph = None # Clear the original graph variable to avoid accidental use + + """ + print("After breakup:") + for i, g in enumerate(device_graphs): + print(f"Device graph {i}:") + g.print_tabular() + for i, g in enumerate(fallback_graphs): + print(f"Fallback graph {i}:") + g.print_tabular() + """ + + # Update output nodes + for i, node in enumerate(self.output_nodes_per_subgraph[subgraph_id]): + if node in moved_output_mapping: + self.output_nodes_per_subgraph[subgraph_id][i] = moved_output_mapping[node] + + # Clear fallback 
graphs if there's only one empty one + if len(fallback_graphs) == 1 and len(fallback_graphs[0].nodes) == 0: + fallback_graphs = [] + + [graph_lint(d, f"Device_{i}") for i, d in enumerate(device_graphs)] + [graph_lint(f, f"Fallback_{i}") for i, f in enumerate(fallback_graphs)] + + # If any final device graph is constant, let's move it to the cpu + device_graphs_to_remove = [] + for i, device_graph in enumerate(device_graphs): + if len(device_graph.nodes) > 0 and (is_nop_graph(device_graph) or is_constant_graph(device_graph) or not has_output(device_graph)): + logger.debug(f"Device graph {i} is a NOP or constant, moving to CPU") + graph_to_device(device_graph, 'cpu') + fallback_graphs.append(device_graph) + device_graphs_to_remove.append(device_graph) + + for g in device_graphs_to_remove: + device_graphs.remove(g) + + if len(device_graphs) == 0: + device_graphs = [torch.fx.Graph()] # create a blank graph + + logger.trace("=== Final graphs:") + logger.trace("= Device:") + for i, g in enumerate(device_graphs): + logger.trace(f"* Device graph {i}") + if self.log_trace: + g.print_tabular() + logger.trace("= Fallbacks:") + for i, f in enumerate(fallback_graphs): + logger.trace(f"* Falllback graph {i}") + if self.log_trace: + f.print_tabular() + + self.fallback_graphs_per_subgraph[subgraph_id] = fallback_graphs + self.mappings_per_subgraph[subgraph_id] = { + "new_io_mapping": new_io_mapping, + "placeholder_map": placeholder_map, + "copied_node_mapping": copied_node_mapping, + "moved_output_mapping": moved_output_mapping + } + self.device_graphs_per_subgraph[subgraph_id] = device_graphs + + return device_graphs + + def break_up_deadlocks(self, + subgraph_id: int, + device_graph: torch.fx.Graph, + fallback_graph: torch.fx.Graph, + new_io_mapping: Dict[torch.fx.Node, torch.fx.Node], + placeholder_map: Dict[torch.fx.Node, torch.fx.Node], + copied_node_mapping: Dict[torch.fx.Node, torch.fx.Node], + moved_output_mapping: Dict[torch.fx.Node, torch.fx.Node]) -> Tuple[List[torch.fx.Graph], List[torch.fx.Graph]]: + + # Search for any circular dependencies between graphs, and keep breaking them down into multiple graphs until + # there are none left. + # Since the original graph should not legally have any circular dependencies, it should be possible to create + # a set of smaller graphs without circular dependencies. + + fallback_graphs = [fallback_graph] if len(fallback_graph.nodes) > 0 else [] + device_graphs = [device_graph] if len(device_graph.nodes) > 0 else [] + + progress = True + new_graphs = [] + + # Put everything in working graphs, but keep track of which are fallback and which are device graphs above + working_graphs = copy.copy(device_graphs) + copy.copy(fallback_graphs) + + # For each graph, figure out which graphs inputs are coming from, and which graph outputs are going to + graph_inputs : Dict[torch.fx.Graph, Set[torch.fx.Node]] = defaultdict(set) + graph_outputs : Dict[torch.fx.Graph, Set[torch.fx.Node]] = defaultdict(set) + outputs_to_dest_node : Dict[torch.fx.Node, Set[torch.fx.Node]] = defaultdict(set) # map of output node to inputs in other graphs + + def calculate_dependencies(working_graphs: List[torch.fx.Graph]): + # Recalculate from scratch. This is not efficient compared to doing it incrementally on every + # graph change, but that would be error-prone... Let's optimize if it's really needed. 
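+            # After this pass, graph_inputs maps each working graph to its placeholder nodes, and
+            # outputs_to_dest_node maps every output-driving node to the placeholders in other graphs
+            # that consume it (resolved through new_io_mapping).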
+ graph_inputs.clear() + outputs_to_dest_node.clear() + + for graph in working_graphs: + for node in graph.nodes: + if node.op == 'placeholder': + graph_inputs[graph].add(node) + + if node.op == 'output': + for arg in node.all_input_nodes: + # Figure out who needs this output + for k, v in new_io_mapping.items(): + if v == arg: + outputs_to_dest_node[arg].add(k) + + #print("Graph inputs:") + #print(graph_inputs) + #print("Graph outputs:") + #print(graph_outputs) + #print("Inputs to src graph:") + #print(inputs_to_src_graph) + #print("Outputs to dest graph:") + #print(outputs_to_dest_graph) + + tracer = IOTracer(working_graphs) + while progress: + # Keep looking for input/output pair, make a copy of the graph and them remove down from input and up from output + # If either/both graphs end up empty, then they are linked and we have a problem + + # There are probably better algorithms to do this :) - but this is simple and I think it works + working_graphs.extend(new_graphs) + calculate_dependencies(working_graphs) + + new_graphs = [] + progress = False + dependency = None + + # For each input in each graph, trace to final output, or until we reach ourselves again, in which case we need to break the cycle + for _, inputs in graph_inputs.items(): + for input_node in inputs: + output_node_for_cycle = tracer.trace_for_cycle(input_node, outputs_to_dest_node) + if output_node_for_cycle is not None: + progress = True + dependency = (output_node_for_cycle, input_node) + break + + if progress: + break + + if not progress: + break + + assert dependency is not None + assert dependency[0].graph == dependency[1].graph, "Dependency is not a loop" + graph = dependency[0].graph + + starting_input, loopback_input = dependency + + # Create a copy, and separate graphs + logger.debug(f"Graph has a dependency to break: {starting_input} -> {loopback_input}") + graph.print_tabular() + new_graph = torch.fx.Graph() + #new_graph = copy.deepcopy(graph) + copy_map = {} + output_value = new_graph.graph_copy(graph, copy_map) + new_output = new_graph.output(output_value) + new_output.meta["tensor_meta"] = tuple(o.meta["tensor_meta"] for o in output_value) + + # Remove nodes from the original graph + to_remove = [] + to_process = [starting_input] + while to_process: + node = to_process.pop() + to_remove.append(node) + for user in node.users: + if user not in to_remove: + if user.op == "output": + remove_output_index(user, user.args[0].index(node)) + break # done at output + else: + to_process.append(user) + + for node in reversed(to_remove): + graph.erase_node(node) + + reduce_graph(graph) # remove dead code + assert len(graph.nodes) > 0, f"Graph {graph} is empty after trying to disjoin multiple fallback graphs" + + # Removed nodes from the copy + to_remove = [] + to_process = [copy_map[loopback_input]] + assert to_process[0].graph == new_graph + + while to_process: + node = to_process.pop() + to_remove.append(node) + for user in node.users: + if user not in to_remove: + if user.op == "output": + remove_output_index(user, user.args[0].index(node)) + break # done at output + else: + to_process.append(user) + + for node in reversed(to_remove): + new_graph.erase_node(node) + + reduce_graph(new_graph) + + assert len(new_graph.nodes) > 0, f"Graph {new_graph} is empty after trying to disjoin multiple fallback graphs" + + new_graphs.append(new_graph) + + if graph in fallback_graphs: + fallback_graphs.append(new_graph) + else: + device_graphs.append(new_graph) + + # Update mappings, as some nodes have moved into the new graph + 
for node in new_graph.nodes: + if node.op == "output": + continue + + # Reverse lookup, since copy map is old->new, and we need new->old + original_node = None + for org_node, new_node in copy_map.items(): + if new_node == node: + original_node = org_node + break + assert original_node is not None + + for node_map in [new_io_mapping, placeholder_map, copied_node_mapping, moved_output_mapping]: + update_node_map(node_map, original_node, node) + + if original_node in self.output_nodes_per_subgraph[subgraph_id]: + self.output_nodes_per_subgraph[subgraph_id][self.output_nodes_per_subgraph[subgraph_id].index(original_node)] = node + + tracer.remove_graph(graph) + tracer.add_graph(graph) + tracer.add_graph(new_graph) + + return device_graphs, fallback_graphs + + + def generate_schedule(self, subgraph_idx: int, aten_module: torch.fx.GraphModule) -> Schedule: + # For given subgraph, figure out a schedule of FX and Buda graphs that need to run, and how to map inputs to outputs + schedule = Schedule( + self.input_nodes_per_subgraph[subgraph_idx], + self.output_nodes_per_subgraph[subgraph_idx], + aten_module, + self.device_graphs_per_subgraph[subgraph_idx], + self.fallback_graphs_per_subgraph[subgraph_idx], + self.mappings_per_subgraph[subgraph_idx]) + + + return schedule + +def update_node_map(node_map: Dict[torch.fx.Node, torch.fx.Node], org_node: torch.fx.Node, new_node: torch.fx.Node): + # Check both keys and values of the map for org_node and switch them to new_node + if org_node in node_map: + node_map[new_node] = node_map[org_node] + del node_map[org_node] + + for k, v in node_map.items(): + if v == org_node: + node_map[k] = new_node + break + diff --git a/pybuda/pybuda/fx/nodes.py b/pybuda/pybuda/fx/nodes.py new file mode 100644 index 000000000..6d1ab30f5 --- /dev/null +++ b/pybuda/pybuda/fx/nodes.py @@ -0,0 +1,763 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# +# Functions that convert FX nodes to PyBuda +# + +import sys +import math +from typing import List, Set, Tuple + +import torch +from loguru import logger + +from pybuda._C.graph import OpType +from pybuda.tensor import pytorch_dtype_to_buda_dataformat +from pybuda.config import CompilerConfig, _get_global_compiler_config + +class PyBudaNode: + def __init__(self, op: OpType, args: List[torch.fx.node.Node]): + self.op = op + self.args = args + self.shape = None + self.dtype = None + self.wrap_tuple = False + +def process_dummy_no_attr(node, pybuda_op_name): + return PyBudaNode(OpType(pybuda_op_name, []), node.args) + +def process_dummy_attr_in_args(node, pybuda_op_name): + attrs = node.args[1] if len(node.args) == 2 else node.args[1:] + if not isinstance(attrs, (list, tuple)): + attrs = [attrs, ] + return PyBudaNode(OpType(pybuda_op_name, attrs), [node.args[0], ]) + +def process_expand(node, pybuda_op_name): + return PyBudaNode(OpType(pybuda_op_name, []), [node.args[0], ]) + +def process_clamp(node, pybuda_op_name): + assert len(node.args) == 3 + inputs = [node.args[0],] + min_ = node.args[1] + max_ = node.args[2] + + if min_ is None: + assert max_ is not None, "Both min and max attributes for clmap are empty" + return PyBudaNode(OpType("relu", [max_, "max"]), inputs) + elif max_ is None: + assert min_ is not None, "Both min and max attributes for clmap are empty" + return PyBudaNode(OpType("relu", [min_, "min"]), inputs) + else: + return PyBudaNode(OpType(pybuda_op_name, named_attrs = {"min": min_, "max": max_}), inputs) + +def process_flatten(node, pybuda_op_name): + return 
PyBudaNode(OpType(pybuda_op_name, [-1, ]), [node.args[0], ]) + +def process_gelu(node, pybuda_op_name): + return PyBudaNode(OpType(pybuda_op_name, ["none", ]), node.args) + +def process_getitem(node, pybuda_op_name): + num_dims = sum([(isinstance(dim, slice) and (dim.start is not None or dim.stop is not None)) or (not isinstance(dim, slice) and dim is not None) for dim in node.args[1]]) + if num_dims == 0: + return PyBudaNode(OpType("nop", []), [node.args[0], ]) + assert num_dims <= 1, "TODO: Support multi axis getitem" + for dim, slice_index in enumerate(node.args[1]): + if isinstance(slice_index, slice) and slice_index.start is None and slice_index.stop is None: + continue + if isinstance(slice_index, int): + start = slice_index + stop = None + stride = 1 + else: + start = slice_index.start + stop = slice_index.stop + if slice_index.step is not None: + stride = slice_index.step + else: + stride = 1 + + if stop is None: + stop = start + 1 + if stop < 0: + stop += node.args[0].meta['tensor_meta'].shape[dim] + + return PyBudaNode(OpType(pybuda_op_name, [dim, start, stop, stride]), [node.args[0], ]) + +def process_interpolate(node, pybuda_op_name): + assert all([arg in node.kwargs for arg in ["size", "mode", "align_corners"]]) + + output_size = node.kwargs["size"] + align_corners = int(node.kwargs["align_corners"]) + mode_str = node.kwargs["mode"] + if mode_str == "bilinear": + mode = 1 + elif mode_str == "nearest": + mode = 0 + else: + assert False, f"Unsupported interpolate mode: {mode_str}" + + attrs = [output_size, output_size, mode, align_corners, 0] # channel-last is false for pt + return PyBudaNode(OpType(pybuda_op_name, attrs), [node.args[0], ]) + +def process_transpose(node, pybuda_op_name): + torch_op_name = node.target.__name__ + if torch_op_name == "permute": + dim0 = None + dim1 = None + for i, arg in enumerate(node.args[1]): + if arg != i: + if dim0 is None: + dim0 = i + elif dim1 is None: + dim1 = i + else: + assert False, "Multi axis permute needs to be added to pybuda" + + elif torch_op_name == "transpose": + dim0 = node.args[1] + dim1 = node.args[2] + + dims = len(node.args[0].meta['tensor_meta'].shape) + if dim0 > 0: + dim0 -= dims + if dim1 > 0: + dim1 -= dims + if dim0 > dim1: + dim0, dim1 = dim1, dim0 + + named_attrs = {"dim0": dim0, "dim1": dim1, "z_dim_slice": -1} + + return PyBudaNode(OpType(pybuda_op_name, named_attrs=named_attrs), [node.args[0], ]) + +def process_softmax(node, pybuda_op_name): + if len(node.args) == 1: + assert "dim" in node.kwargs, "dim must be specified" + dim = node.kwargs["dim"] + else: + dim = node.args[1] + + if dim >= 0: + dim -= len(node.args[0].meta['tensor_meta'].shape) + stable = 1 + attrs = [dim, stable] + return PyBudaNode(OpType(pybuda_op_name, attrs), [node.args[0], ]) + +def process_conv2d(node, pybuda_op_name): + assert len(node.args) == 9 + + inputs = [node.args[0], node.args[1]] + if node.args[2]: # bias + inputs.append(node.args[2]) + + strides = node.args[3] + if isinstance(node.args[4], list): + if len(node.args[4]) == 2: + padding = [node.args[4][1], node.args[4][1], node.args[4][0], node.args[4][0]] + else: + padding = node.args[4] + else: + padding = [node.args[4]] * 4 + dilation = node.args[5] + group = node.args[8] + assert all([d == dilation[0] for d in dilation]), "Dilation is not same for all-dim, not supported" + attrs = strides + [dilation[0], group] + padding + [False, 0, 0, 0, False] # channel-last = false for pt + + return PyBudaNode(OpType(pybuda_op_name, attrs), inputs) + +def process_maxpool2d(node, 
pybuda_op_name): + assert len(node.args) >= 2 and len(node.args) <= 7, f"Maxpool-2d supposed to have 2~7 args: #args = {len(node.args)}" + inputs = [node.args[0],] + kernel_size = node.args[1] + strides = node.args[1] + padding = [0] * 4 + dilation = 1 + ceil_mode = False + + if len(node.args) >= 3: + strides = node.args[2] + + if len(node.args) >= 4: + if isinstance(node.args[3], list): + if len(node.args[3]) == 2: + padding = [node.args[3][1], node.args[3][1], node.args[3][0], node.args[3][0]] + else: + padding = node.args[3] + else: + padding = [node.args[3]] * 4 + + if len(node.args) >= 5: + dilation = node.args[4] + + if len(node.args) >= 6: + ceil_mode = node.args[5] + + compiler_cfg = _get_global_compiler_config() + add_sub_surround = compiler_cfg.max_pool_add_sub_surround + add_sub_surround_value = compiler_cfg.max_pool_add_sub_surround_value + attrs = kernel_size + strides + [dilation, ceil_mode] + padding + [add_sub_surround, add_sub_surround_value, False] # channel-last = False for pt + + pybuda_node = PyBudaNode(OpType(pybuda_op_name, attrs), inputs) + pybuda_node.shape = node.meta['tensor_meta'][0].shape + pybuda_node.dtype = pytorch_dtype_to_buda_dataformat(node.meta['tensor_meta'][0].dtype) + pybuda_node.wrap_tuple = True + return pybuda_node + +def process_matmul(node, pybuda_op_name): + assert len(node.args) == 2 or len(node.args) == 3 + if len(node.args) == 3: + # Torch addmm inputs are bias, LHS, RHS + args = [node.args[1], node.args[2], node.args[0]] + else: + args = node.args + + return PyBudaNode(OpType(pybuda_op_name, []), args) + +def process_embedding(node, pybuda_op_name): + assert len(node.args) == 2 or len(node.args) == 3 + + #TODO Handle padding index (arg 2) + args = [node.args[0], node.args[1]] + return PyBudaNode(OpType(pybuda_op_name, []), args) + +def process_mean(node, pybuda_op_name): + assert len(node.args) >= 2 + dim = node.args[1] + attrs = [dim,] + args = [node.args[0],] + return PyBudaNode(OpType(pybuda_op_name, attrs), args) + +def process_layernorm(node, pybuda_op_name): + assert len(node.args) == 5 + dim = -1 + epsilon = node.args[4] + attrs = [dim, epsilon] + + args = [node.args[0], node.args[2], node.args[3]] + pybuda_node = PyBudaNode(OpType(pybuda_op_name, attrs), args) + pybuda_node.shape = node.meta['tensor_meta'][0].shape + pybuda_node.dtype = pytorch_dtype_to_buda_dataformat(node.meta['tensor_meta'][0].dtype) + pybuda_node.wrap_tuple = True + return pybuda_node + +def process_batchnorm(node, pybuda_op_name): + assert len(node.args) == 7 + epsilon = node.args[-1] + attrs = [epsilon] + args = [node.args[0], node.args[1], node.args[2], node.args[3], node.args[4]] + pybuda_node = PyBudaNode(OpType(pybuda_op_name, attrs), args) + + pybuda_node.shape = node.meta['tensor_meta'][0].shape + pybuda_node.dtype = pytorch_dtype_to_buda_dataformat(node.meta['tensor_meta'][0].dtype) + pybuda_node.wrap_tuple = True + return pybuda_node + +def process_select(node, pybuda_op_name): + assert len(node.args) == 3 + + dim = node.args[1] + if dim >= 0: + dim -= len(node.args[0].meta['tensor_meta'].shape) + index = node.args[2] + attrs = [dim, index, index+1, 1] + args = [node.args[0], ] + return PyBudaNode(OpType(pybuda_op_name, attrs), args) + +def process_slice(node, pybuda_op_name): + assert len(node.args) == 4 + + dim = node.args[1] + start = node.args[2] + end = node.args[3] + if dim >= 0: + dim -= len(node.args[0].meta['tensor_meta'].shape) + if start == 0 and end == sys.maxsize: + pybuda_node = PyBudaNode(OpType("nop", []), [node.args[0], ]) + else: + 
stride = 1 + attrs = [dim, start, end, stride] + args = [node.args[0], ] + pybuda_node = PyBudaNode(OpType(pybuda_op_name, attrs), args) + return pybuda_node + +def process_unsqueeze(node, pybuda_op_name): + assert len(node.args) == 2 + dim = node.args[1] + input_ndim = len(node.meta['tensor_meta'].shape) - 1 # supopsed to feed input ndim + + if dim >= 0: + dim -= len(node.meta['tensor_meta'].shape) + + attrs = [dim, input_ndim] + return PyBudaNode(OpType(pybuda_op_name, attrs), [node.args[0], ]) + +def process_reshape(node, pybuda_op_name): + attrs = node.args[1].copy() if len(node.args) == 2 else node.args[1:].copy() + if not isinstance(attrs, (list, tuple)): + attrs = [attrs, ] + + input_volume = 1 + for dim in node.args[0].meta['tensor_meta'].shape: + input_volume *= dim + + blank_index = None + reshape_volume = 1 + for i, dim in enumerate(attrs): + if dim == -1: + assert blank_index is None, "Only one dimension can be -1" + blank_index = i + else: + reshape_volume *= dim + + if blank_index is not None: + attrs[blank_index] = input_volume//reshape_volume + + input_volume = node.args[0].meta['tensor_meta'].shape[0] + return PyBudaNode(OpType(pybuda_op_name, attrs), [node.args[0], ]) + +def process_power(node, pybuda_op_name): + if isinstance(node.args[1], int) or isinstance(node.args[1], float) and math.isclose(node.args[1] / int(node.args[1]), 1.0): + attrs = [int(node.args[1]), ] + pybuda_node = PyBudaNode(OpType("pow", attrs), [node.args[0], ]) + else: + pybuda_node = PyBudaNode(OpType("power", []), node.args) + return pybuda_node + +def process_cat(node, pybuda_op_name): + dim = node.args[1] + if dim >= 0: + dim -= len(node.meta['tensor_meta'].shape) + pybuda_node = PyBudaNode(OpType(pybuda_op_name, [dim, ]), node.args[0]) + return pybuda_node + +def process_constant_pad_nd(node, pybuda_op_name): + padding = node.args[1] + value = node.args[2] + if value != 0.0: + raise ValueError("Buda only supports zero padding") # TODO: add to cpu fallback if padding is not 0 + pybuda_node = PyBudaNode(OpType(pybuda_op_name, [*padding, 0, False]), [node.args[0], ]) # mode index 0 = constant + return pybuda_node + +dynamo_to_pybuda_function = { + "_softmax" : (process_softmax, "softmax"), + "add" : (process_dummy_no_attr, "add"), + "add_" : (process_dummy_no_attr, "add"), + "addmm" : (process_matmul, "matmul"), + "_native_batch_norm_legit_no_training" : (process_batchnorm, "batchnorm"), + "bmm" : (process_matmul, "matmul"), + "cat" : (process_cat, "concatenate"), + "clamp" : (process_clamp, "clip"), + "clone" : (process_dummy_no_attr, "nop"), + "contiguous" : (process_dummy_no_attr, "nop"), + "constant_pad_nd" : (process_constant_pad_nd, "pad"), + "convolution" : (process_conv2d, "conv2d"), #TODO: check if conv3d is also mapped to 'convolution' + "div" : (process_matmul, "divide"), + "embedding" : (process_embedding, "embedding"), + "eq" : (process_dummy_no_attr, "equal"), + "expand" : (process_expand, "nop"), + "flatten" : (process_flatten, "reshape"), + "gelu" : (process_gelu, "gelu"), + "getitem" : (process_getitem, "index"), + "gt" : (process_dummy_no_attr, "greater"), + "gte" : (process_dummy_no_attr, "greater_equal"), + "hardtanh" : (process_clamp, "clip"), + "iadd" : (process_dummy_no_attr, "add"), + "interpolate" : (process_interpolate, "resize2d"), + "lt" : (process_dummy_no_attr, "less"), + "lte" : (process_dummy_no_attr, "less_equal"), + "matmul" : (process_dummy_no_attr, "matmul"), + "max_pool2d_with_indices" : (process_maxpool2d, "max_pool2d"), + "mean" : (process_mean, "reduce_avg"), 
+ "mm" : (process_matmul, "matmul"), + "mul" : (process_dummy_no_attr, "multiply"), + "native_layer_norm" : (process_layernorm, "layernorm"), + "permute" : (process_transpose, "transpose"), + "relu" : (process_dummy_no_attr, "relu"), + "relu_" : (process_dummy_no_attr, "relu"), + "select" : (process_select, "index"), + "sigmoid" : (process_dummy_no_attr, "sigmoid"), + "slice" : (process_slice, "index"), + "softmax" : (process_softmax, "softmax"), + "sub" : (process_dummy_no_attr, "subtract"), + "tanh" : (process_dummy_no_attr, "tanh"), + "to" : (process_dummy_no_attr, "nop"), #TODO + "_to_copy" : (process_dummy_no_attr, "nop"), #TODO + "copy_" : (process_dummy_no_attr, "nop"), #TODO + "lift_fresh_copy" : (process_dummy_no_attr, "nop"), #TODO + "alias" : (process_dummy_no_attr, "nop"), #TODO + "transpose" : (process_transpose, "transpose"), + "truediv" : (process_dummy_no_attr, "divide"), + "unsqueeze" : (process_unsqueeze, "unsqueeze"), + "view" : (process_reshape, "reshape"), + "_unsafe_view" : (process_reshape, "reshape"), + "where" : (process_dummy_no_attr, "where"), + "pow" : (process_power, ""), +} + +torch_constant_ops = { + "ones" : torch.ones, + "zeros" : torch.zeros, + "arange" : torch.arange, + "full" : torch.full, + "empty" : torch.empty, + "scalar_tensor" : torch.scalar_tensor, +} + + +def is_supported_op(torch_op_name, node: torch.fx.Node): + if torch_op_name not in dynamo_to_pybuda_function: + return False + + # Check for special cases + if torch_op_name == "cat": + if len(node.args) == 1: + return False # We currently need explicit dim specificed in second arg + + return True + + +def get_pybuda_node(torch_op_name, node): + if not is_supported_op(torch_op_name, node): + print(f"Unsupported op {torch_op_name}") + breakpoint() + assert False, f"Unsupported op {torch_op_name}" + + return dynamo_to_pybuda_function[torch_op_name][0](node, dynamo_to_pybuda_function[torch_op_name][1]) + +# Check to see if subgraph is already on device +def is_on_device(subgraph_idx: int): + pass + +# Remove all nodes associated with subgraph +def remove_subgraph(subgraph_idx: int): + pass + +def add_op(graph, node, name, pybuda_node, subgraph_idx): + global node_to_id + shape = node.meta['tensor_meta'].shape if pybuda_node.shape is None else pybuda_node.shape + dtype = pytorch_dtype_to_buda_dataformat(node.meta['tensor_meta'].dtype) if pybuda_node.dtype is None else pybuda_node.dtype + + add_constants_if_necessary(graph, pybuda_node.args, subgraph_idx) + if "nn_module_stack" in node.meta: + tags = { + "layer": list(node.meta["nn_module_stack"].values())[-1][0], + "stack_trace": "-->".join([str(v) for v in node.meta["nn_module_stack"].values()]) + } + else: + tags = {} + if len(shape) == 0: + shape = [1] + nid = create_op_node( + graph, + f"{name}_{subgraph_idx}", + pybuda_node.op, + [int(dim) for dim in shape], + pytorch_dtype_to_buda_dataformat(dtype), + subgraph_idx, + tags) + + for i, input_node in enumerate(pybuda_node.args): + create_data_edge(graph, node_to_id[input_node], 0, nid, i, []) + + eval_args = [id_to_intermed[node_to_id[arg]] if isinstance(arg, torch.fx.node.Node) else arg for arg in node.args] + for idx, arg in enumerate(eval_args): + if isinstance(arg, (list, tuple)): + eval_args[idx] = [id_to_intermed[node_to_id[a]] if isinstance(a, torch.fx.node.Node) else a for a in arg] + kwargs = {k:v for k, v in node.kwargs.items() if k != "device"} + + if isinstance(node.target, torch._ops.OpOverloadPacket): + # We will add NOP in cases where input to current subgraph is left on device + # 
For input nodes, node.target is str + id_to_intermed[nid] = node.target(*eval_args, **kwargs) + if (pybuda_node.wrap_tuple): + nid = (nid,) + return nid + +def add_input(graph, node, subgraph_idx, module_inputs): + nid = create_activation_input( + graph, + f"{node.name}_{subgraph_idx}", + [int(dim) for dim in node.meta['tensor_meta'].shape], + node.meta["tensor_meta"].requires_grad, + pytorch_dtype_to_buda_dataformat(node.meta["tensor_meta"].dtype), + subgraph_idx) + module_inputs.append(nid) + return nid + + +def add_constant(graph, name, tensor, subgraph_idx): + if tensor in const_to_id: + return const_to_id[tensor] + nid = create_constant_input( + graph, + f"{name}_{subgraph_idx}", + tensor, + [int(dim) for dim in tensor.shape], + pytorch_dtype_to_buda_dataformat(tensor.dtype), + subgraph_idx) + const_to_id[tensor] = nid + return nid + +def add_param(graph, name, torch_param, subgraph_idx): + if name in param_to_id: + return param_to_id[name] + nid = create_parameter_input( + graph, + name, + [int(dim) for dim in torch_param.shape], + torch_param.requires_grad, + pytorch_dtype_to_buda_dataformat(torch_param.dtype), + subgraph_idx) + param_to_id[name] = nid + return nid + +def add_outputs(graph, node, subgraph_idx, output_nids, output_requires_grad, output_tensors): + global node_to_id + for index, meta in enumerate(node.meta['tensor_meta']): + arg = node.args[0][index] + nid = create_output( + graph, + node.name + "_" + arg.name + "_" + str(subgraph_idx), + [int(dim) for dim in meta.shape], + pytorch_dtype_to_buda_dataformat(meta.dtype), + False, #TODO Loss output + subgraph_idx) + create_data_edge(graph, node_to_id[arg], 0, nid, index, []) + output_nids.append(nid) + output_requires_grad.append(meta.requires_grad) + output_tensors.append(id_to_intermed[node_to_id[arg]]) + +def add_constants_if_necessary(graph, ops, subgraph_idx): + global node_to_id + for op in ops: + if isinstance(op, (float, int)): + if op in node_to_id: + continue + tensor = torch.ones([1]) * op + node_to_id[op] = add_constant(graph, f"{op}", tensor, subgraph_idx) + id_to_intermed[node_to_id[op]] = tensor + + +def map_node_name_to_org_name(module, aten_module): + ret = dict() + + # param nodes + aten_params = dict() + for itm in aten_module.named_parameters(): + aten_name = itm[0] + aten_tensor = itm[1] + aten_params[id(aten_tensor)] = aten_name + module_params = dict() + for itm in module.named_parameters(): + module_name = itm[0] + mod = itm[1] + module_params[id(mod)] = module_name + if len(module_params) == len(aten_params): + for tensor_id in module_params.keys(): + ret[aten_params[tensor_id]] = module_params[tensor_id] + + # buffers + aten_buffers = dict() + for itm in aten_module.named_buffers(): + aten_name = itm[0] + aten_tensor = itm[1] + if len(aten_tensor.shape) == 0: + continue + aten_buffers[id(aten_tensor)] = aten_name + module_buffers = dict() + for itm in module.named_buffers(): + mod_name = itm[0] + mod_tensor = itm[1] + if len(mod_tensor.shape) == 0: + continue + module_buffers[id(mod_tensor)] = mod_name + if len(module_buffers) == len(aten_buffers): + for tensor_id in module_buffers.keys(): + ret[aten_buffers[tensor_id]] = module_buffers[tensor_id] + + return ret + + +def append_to_graph(graph, module, aten_module, activations, subgraph_idx, inputs_per_subgraph, outputs_per_subgraph): + param_name_map = map_node_name_to_org_name(module, aten_module) + + tt_act = [a.to("tt") for a in activations] + + # Run static shape propagation on aten module + shape_prop = 
torch.fx.passes.shape_prop.ShapeProp(aten_module) + if shape_prop.fake_mode is not None: + fake_args = [shape_prop.fake_mode.from_tensor(t, static_shapes=True) if isinstance(t, torch.Tensor) else t for t in tt_act] + else: + fake_args = tt_act + shape_prop.run(*fake_args) + + aten_module = aten_module.to("cpu") + + module_inputs = [] + output_nids = [] + output_requires_grad = [] + output_tensors = [] + + def process_function(node): + global node_to_id + op_name = node.target.__name__ + + if op_name in torch_constant_ops: + kwargs = {k:v for k, v in node.kwargs.items() if k != "device"} + tensor = torch_constant_ops[op_name](*node.args, **kwargs) + if len(tensor.shape) == 0: + tensor = tensor.unsqueeze(0) + node_to_id[node] = add_constant(graph, node.name, tensor.float(), subgraph_idx) + id_to_intermed[node_to_id[node]] = tensor + elif op_name == "getitem": + assert isinstance(node_to_id[node.args[0]], (list, tuple)) + assert node.args[1] == 0, "currently getitem only supported for index = 0" + node_to_id[node] = node_to_id[node.args[0]][node.args[1]] + id_to_intermed[node_to_id[node]] = id_to_intermed[node_to_id[node]][node.args[1]] + else: + pybuda_node = get_pybuda_node(op_name, node) + node_to_id[node] = add_op(graph, node, node.name, pybuda_node, subgraph_idx) + + # Traverse up the graph from output nodes to populate consumed nodes set + consumed = set() + working_nodes = [] + for node in aten_module.graph.nodes: + if node.op == "output": + working_nodes.append(node) + consumed.add(node) + + while len(working_nodes) > 0: + node = working_nodes.pop(0) + for arg in node.args: + if isinstance(arg, torch.fx.node.Node) and arg not in consumed: + consumed.add(arg) + working_nodes.append(arg) + elif isinstance(arg, (list, tuple)): + for item in arg: + if isinstance(item, torch.fx.node.Node) and item not in consumed: + consumed.add(item) + working_nodes.append(item) + + + input_index = 0 + for index, node in enumerate(aten_module.graph.nodes): + if node not in consumed: + logger.debug(f"Skipping {node} because it was not consumed") + continue + + if node.op == "placeholder": + uid = inputs_per_subgraph[subgraph_idx][input_index] + if uid != -1: + # this input is on device, don't create input node, add edge to corresponding output + node_to_id[node] = add_input(graph, node, subgraph_idx, module_inputs) + + for idx in range(subgraph_idx): + if uid not in outputs_per_subgraph[idx]: + continue + output_index = outputs_per_subgraph[idx].index(uid) + add_subgraph_io_link_edge(graph, output_nodes_per_subgraph[idx][output_index], 0, node_to_id[node], 0) + else: + node_to_id[node] = add_input(graph, node, subgraph_idx, module_inputs) + id_to_intermed[node_to_id[node]] = activations[index] + input_index +=1 + elif node.op == "get_attr": + assert node.target in param_name_map, f"Weight node is not mapped to original names: {node.target}" + node_to_id[node] = add_param(graph, param_name_map[node.target], aten_module.state_dict()[node.target], subgraph_idx) + id_to_intermed[node_to_id[node]] = aten_module.state_dict()[node.target] + elif node.op == "call_function": + process_function(node) + elif node.op == "output": + add_outputs(graph, node, subgraph_idx, output_nids, output_requires_grad, output_tensors) + else: + assert False, f"Unsupported op {node.op}" + + graph.register_module_inputs(module_inputs, append=True) + graph.register_module_outputs(output_nids, output_requires_grad, append=True) + + output_nodes_per_subgraph[subgraph_idx] = output_nids + return graph, id_to_intermed, output_tensors + + 
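# A minimal standalone sketch, assuming only stock torch, of the dispatch key used by
# process_function() above: node.target.__name__ indexes dynamo_to_pybuda_function, whose
# entries are (handler, pybuda_op_name) pairs. The toy torch.fx trace below only shows the
# kind of names that lookup sees; the real pipeline runs on aten-level graphs produced by
# dynamo, so the exact op set differs.
import torch
import torch.fx

def _demo(x):
    return torch.relu(x + 1.0)

_gm = torch.fx.symbolic_trace(_demo)
for _node in _gm.graph.nodes:
    if _node.op == "call_function":
        # Prints e.g. "add" and "relu" -- both appear as keys in dynamo_to_pybuda_function.
        print(_node.target.__name__)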
+def call_function_is_nop(node): + assert node.op == "call_function" + op_name = node.target.__name__ + if op_name in dynamo_to_pybuda_function: + return dynamo_to_pybuda_function[op_name][1] == "nop" + else: + return False + +def call_function_is_reshape(node): + assert node.op == "call_function" + op_name = node.target.__name__ + if op_name in dynamo_to_pybuda_function: + return dynamo_to_pybuda_function[op_name][1] == "reshape" + else: + return False + +def unsupported_shared_embedding_input(graph: torch.fx.GraphModule, unsupported_nodes: Set[torch.fx.Node], unsupported_outputs: Set[torch.fx.Node]): + # Embedding input is untilized integer input. No other op can handle it, other than a "tilize" op, which currently is not implemented. So, we'll mark it as unsupported. + + def search_up(node: torch.fx.Node, visited: Set[torch.fx.Node]): + if node in visited: + return + + if not isinstance(node, torch.fx.Node): + return + + visited.add(node) + + for user in node.users: + if user in visited: + continue + if user.op == "call_function" and user.target.__name__ == "embedding": + continue + if user.op == "output": + unsupported_outputs.add(raw_input) + continue + unsupported_nodes.add(user) + + for arg in node.all_input_nodes: + search_up(arg, visited) + + + for node in graph.nodes: + if node.op == "call_function" and node.target.__name__ == "embedding": + raw_input = node.args[1] + visited = set() + search_up(raw_input, visited) + +def get_unsupported_nodes(graph: torch.fx.Graph, config: CompilerConfig) -> Tuple[Set[torch.fx.Node], Set[torch.fx.Node]]: + # Traverse the FX graph and find all the nodes that are not supported and should fall back to CPU + # Returns a set of unsupported nodes, and a set of unsupported outputs - since there's only one output node, + # we represent those by nodes that drive the output, and have to be in a separate set + unsupported_nodes = set() + unsupported_outputs = set() + for node in graph.nodes: + if node.op != "call_function": + continue + + op_name = node.target.__name__ + + if op_name in torch_constant_ops: + continue + + if op_name == "getitem": + continue + + if op_name in config.cpu_fallback_ops: + unsuppored_nodes.add(node) + continue + + if is_supported_op(op_name, node): + continue + + unsupported_nodes.add(node) + + # Additional passes to find unsupported patterns + unsupported_shared_embedding_input(graph, unsupported_nodes, unsupported_outputs) + + if len(unsupported_outputs) > 0 or len(unsupported_nodes) > 0: + logger.trace("Unsupported nodes: " + str(unsupported_nodes) + " Unsupported outputs: " + str(unsupported_outputs)) + + return unsupported_nodes, unsupported_outputs diff --git a/pybuda/pybuda/fx/schedule.py b/pybuda/pybuda/fx/schedule.py new file mode 100644 index 000000000..46c59ded7 --- /dev/null +++ b/pybuda/pybuda/fx/schedule.py @@ -0,0 +1,282 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +# +# Schedule describes the order of execution of FX and Buda graphs, and the mapping of inputs and outputs +# + +from typing import List, Dict, Set, Tuple, Optional +from enum import Enum + +import torch +from loguru import logger + +from pybuda.fx.graph_utils import get_output_node + +# Enum to hold the source of a tensor +class TensorSource(Enum): + INPUT = 1 + INTERMEDIATE = 2 + OUTPUT = 3 + +# Convenience tuple indicating whether tensor is intermediate or input/output +class TensorState: + + def __init__(self, src: TensorSource, node: torch.fx.Node): + self.src = src + self.node = node + + def 
__repr__(self): + return f"TensorState({self.src}, {self.node})" + +# Define where the input is coming from for each schedule item - from main inputs, or from other graphs +# If it's from main inputs, then index is the index of the inputs, otherwise it's index into list of intermediates +class InputSource: + + def __init__(self, src: TensorSource, index: int): + self.src = src + self.index = index + + def __repr__(self): + return f"InputSource({self.src}, {self.index})" + +# Define where the output goes, to main outputs or to intermediates +class OutputDest: + def __init__(self, intermediate: bool, index: int): + self.intermediate = intermediate + self.index = index + + def __repr__(self): + return f"{'Intermediate' if self.intermediate else 'Output'} {self.index}" + +# Single graph call, with mappings of inputs and outputs +class ScheduleItem: + def __init__(self, fallback: bool, inputs: List[InputSource], outputs: List[OutputDest], graph: Optional[torch.fx.Graph], graph_index: int): + self.fallback = fallback + self.graph = graph + self.graph_module : Optional[torch.fx.GraphModule] = None + self.graph_index = graph_index + self.inputs = inputs + self.outputs = outputs + + def is_fallback_graph(self) -> bool: + return self.fallback + + def get_subgraph_input_index(self, index: int) -> int: + # filter graph inputs + graph_inputs = [i.index for i in self.inputs if i.src == TensorSource.INPUT] + assert index < len(graph_inputs), f"Index {index} out of range for graph inputs {graph_inputs}" + return graph_inputs[index] + + def __repr__(self) -> str: + if self.is_fallback_graph(): + return f"ScheduleItem(fallback graph={self.graph_index}, inputs={self.inputs}, outputs={self.outputs})" + else: + return f"ScheduleItem(device graph={self.graph_index}, inputs={self.inputs}, outputs={self.outputs})" + +class Schedule: + + def __init__(self, + inputs: List[torch.fx.Node], + outputs: List[torch.fx.Node], + aten_module: torch.fx.GraphModule, + device_graphs: List[torch.fx.Graph], + fallback_graphs: List[torch.fx.Graph], + mappings: Dict[str, Dict[torch.fx.Node, torch.fx.Node]]): + + new_io_mapping = mappings["new_io_mapping"] + placeholder_map = mappings["placeholder_map"] + #copied_node_mapping = mappings["copied_node_mapping"] + #moved_output_mapping = mappings["moved_output_mapping"] + + intermediate_valid: Set[torch.fx.Node] = set() # Set of valid intermediate nodes, after a graph has been executed + outputs_valid: Dict[torch.fx.Node, int] = {} # Map of valid outputs, and their index + + # For each graph, figure out where the inputs are coming from, and which of the outputs it creates + input_mappings: Dict[int, List[TensorState]] = {} # list per subgraph + for i, graph in enumerate(fallback_graphs): + # Find inputs + input_mappings[i] = [] + for node in graph.nodes: + if node.op != "placeholder": + continue; + + # Find the original node + if node in placeholder_map or node in inputs: + # Original input + input_mappings[i].append(TensorState(TensorSource.INPUT, node)) + continue + + # Intermediate or output from another graph + if node in new_io_mapping: + src = new_io_mapping[node] + if src in outputs: + # Output from another graph + input_mappings[i].append(TensorState(TensorSource.OUTPUT, src)) + else: + input_mappings[i].append(TensorState(TensorSource.INTERMEDIATE, src)) + continue + + # No other option is legal + assert False, f"Placeholder {node} not found in any mapping" + + # The device graphs + device_mappings: Dict[int, List[TensorState]] = {} # list per subgraph + for i, device_graph 
in enumerate(device_graphs): + device_mappings[i] = [] + for node in device_graph.nodes: + if node.op != "placeholder": + continue; + + # Original input + if node in inputs: + device_mappings[i].append(TensorState(TensorSource.INPUT, node)) + continue + + # Intermediate or output from another graph + if node in new_io_mapping: + src = new_io_mapping[node] + if src in outputs: + device_mappings[i].append(TensorState(TensorSource.OUTPUT, src)) + else: + device_mappings[i].append(TensorState(TensorSource.INTERMEDIATE, src)) + continue + + # No other option is legal + assert False, f"Placeholder {node} not found in any mapping" + + # Keep figuring out which graphs we can run, i.e. we have all inputs available, until we're done with all of them + self.schedule : List[ScheduleItem] = [] + to_schedule : List[Tuple[bool, int]] = [(True, i) for i in range(len(fallback_graphs))] + + for i in range(len(device_graphs)): + if len(device_graphs[i].nodes) > 0: + to_schedule.append((False, i)) + + # Map intermediate to unique IDs that we can put in the schedule + self.next_intermediate_id = 0 + intermediate_ids : Dict[torch.fx.Node, int] = {} + + # Figure out where outputs go, and set intermediate IDs and valids + def record_outputs(graph: torch.fx.Graph) -> List[OutputDest]: + output_list = [] + output_node = get_output_node(graph) + + for arg in output_node.args[0]: + + # Figure out where this output goes - to an intermediate, or to the main outputs + if arg in outputs: + output_list.append(OutputDest(False, outputs.index(arg))) + outputs_valid[arg] = outputs.index(arg) + continue + + # Intermediate output, assign new ID and record + intermediate_ids[arg] = self.next_intermediate_id + output_list.append(OutputDest(True, self.next_intermediate_id)) + self.next_intermediate_id += 1 + + # Record that the intermediate is valid, for scheduling purposes + intermediate_valid.add(arg) + + return output_list + + def generate_inputs(graph: torch.fx.Graph) -> List[InputSource]: + # Generate list of input sources for this graph + input_list = [] + for node in graph.nodes: + if node.op != "placeholder": + continue + + if node in inputs: + input_list.append(InputSource(TensorSource.INPUT, inputs.index(node))) + continue + + if node in placeholder_map: + input_list.append(InputSource(TensorSource.INPUT, inputs.index(placeholder_map[node]))) + continue + + assert node in new_io_mapping + src = new_io_mapping[node] + if src in outputs: + input_list.append(InputSource(TensorSource.OUTPUT, outputs.index(src))) + else: + input_list.append(InputSource(TensorSource.INTERMEDIATE, intermediate_ids[src])) + + return input_list + + while len(to_schedule) > 0: + progress = False + for fallback, index in to_schedule: + if fallback: + if all([t.node in intermediate_valid for t in input_mappings[index] if t.src == TensorSource.INTERMEDIATE]) and \ + all([t.node in outputs_valid for t in input_mappings[index] if t.src == TensorSource.OUTPUT]): + # We can run this graph + logger.trace(f"Scheduling fallback graph {index}") + self.schedule.append( + ScheduleItem(fallback, generate_inputs(fallback_graphs[index]), + record_outputs(fallback_graphs[index]), fallback_graphs[index], index)) + to_schedule.remove((True, index)) + progress = True + else: + if all([t.node in intermediate_valid for t in device_mappings[index] if t.src == TensorSource.INTERMEDIATE]) and \ + all([t.node in outputs_valid for t in device_mappings[index] if t.src == TensorSource.OUTPUT]): + # We can run device graph + logger.trace(f"Scheduling device graph") + 
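                        # At this point every INTERMEDIATE/OUTPUT operand of this device graph has
                        # already been produced by a previously scheduled item, so it is safe to run;
                        # generate_inputs() records where each operand comes from, and record_outputs()
                        # assigns intermediate IDs and marks the newly produced tensors valid for
                        # later consumers.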
self.schedule.append( + ScheduleItem(fallback, generate_inputs(device_graphs[index]), + record_outputs(device_graphs[index]), device_graphs[index], index)) + to_schedule.remove((False, index)) + progress = True + + if not progress: + print("Intermediate valids", intermediate_valid) + print("Outputs valids", outputs_valid) + print("To schedule", to_schedule) + for i, im in enumerate(input_mappings): + print(f"Input mappings {i}", im) + print("inputs (aten): ", device_mappings) + assert progress, "No progress made in scheduling" + + # Create graph modules for the graphs + for item in self.schedule: + item.graph_module = torch.fx.GraphModule(aten_module, item.graph) + + logger.trace(f"Schedule: {self}") + self.validate(len(inputs), len(outputs)) + + def __iter__(self): + return iter(self.schedule) + + def __repr__(self): + ret = "Schedule:\n" + for item in self.schedule: + ret += f" - {item}\n" + return ret + + def get_device_schedule_item(self, index: int) -> ScheduleItem: + return next(filter(lambda x: x.graph_index == index and not x.fallback, self.schedule)) + + def get_device_program_ids(self) -> List[int]: + return [i.graph_index for i in self.schedule if not i.fallback] + + def validate(self, num_inputs: int, num_outputs: int): + # Check that all inputs are used, and all outputs are generated + unused_inputs = set(range(num_inputs)) + unused_outputs = set(range(num_outputs)) + for item in self.schedule: + for input_source in item.inputs: + if input_source.src != TensorSource.INPUT: + continue + if input_source.index in unused_inputs: # it's ok if the input is used multiple times + unused_inputs.remove(input_source.index) + + for output_dest in item.outputs: + if output_dest.intermediate: + continue + assert output_dest.index in unused_outputs, f"Output {output_dest.index} used multiple times, or beyond the number of outputs" + unused_outputs.remove(output_dest.index) + + assert len(unused_inputs) == 0, f"Inputs {unused_inputs} are not used" + assert len(unused_outputs) == 0, f"Outputs {unused_outputs} are not generated" + diff --git a/pybuda/pybuda/fx/torch_decomp_reconstruct.py b/pybuda/pybuda/fx/torch_decomp_reconstruct.py new file mode 100644 index 000000000..f3c7f50b4 --- /dev/null +++ b/pybuda/pybuda/fx/torch_decomp_reconstruct.py @@ -0,0 +1,80 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +import torch +from torch.fx import subgraph_rewriter +from typing import Any, Tuple + +# Decompose +def decompose_split(self: torch.Tensor, split_size: int, dim: int = 0) -> Tuple[torch.Tensor, ...]: + starts = list(range(0, self.size(dim), split_size)) + stops = starts[1:] + [self.size(dim)] + slices = [] + for start, stop in zip(starts, stops): + slices.append(self.narrow(dim, start, stop - start)) + return slices + +def decompose_matmul(bias, input, weight) -> torch.Tensor: + res = torch.matmul(input, weight) + res = torch.add(res, bias) + return res + +pybuda_decompositions = { + torch.ops.aten.split.Tensor: decompose_split, + torch.ops.aten.addmm.default: decompose_matmul, +} + +def get_pybuda_decompositions(): + return pybuda_decompositions + +# Reconstruct +class ReconstructBilinearResize2d(): + @staticmethod + def pattern(x, scale, output_size, input_size, device): + arange = torch.ops.aten.arange.start_step(0, output_size, dtype = torch.float32, layout = torch.strided, device = device, pin_memory = False) + arange_1 = torch.ops.aten.arange.start_step(0, output_size, dtype = torch.float32, layout = torch.strided, device = device, 
pin_memory = False) + mul = torch.ops.aten.mul.Tensor(arange, scale) + mul_1 = torch.ops.aten.mul.Tensor(arange_1, scale) + _to_copy = torch.ops.aten._to_copy.default(mul, dtype = torch.int64) + ceil = torch.ops.aten.ceil.default(mul) + clamp = torch.ops.aten.clamp.default(ceil, None, input_size) + _to_copy_1 = torch.ops.aten._to_copy.default(clamp, dtype = torch.int64) + _to_copy_2 = torch.ops.aten._to_copy.default(mul_1, dtype = torch.int64) + ceil_1 = torch.ops.aten.ceil.default(mul_1) + clamp_1 = torch.ops.aten.clamp.default(ceil_1, None, input_size) + _to_copy_3 = torch.ops.aten._to_copy.default(clamp_1, dtype = torch.int64) + unsqueeze = torch.ops.aten.unsqueeze.default(mul, 1) + unsqueeze_1 = torch.ops.aten.unsqueeze.default(_to_copy, 1) + unsqueeze_2 = torch.ops.aten.unsqueeze.default(_to_copy_1, 1) + index = torch.ops.aten.index.Tensor(x, [None, None, unsqueeze_1, _to_copy_2]) + index_1 = torch.ops.aten.index.Tensor(x, [None, None, unsqueeze_2, _to_copy_2]) + index_2 = torch.ops.aten.index.Tensor(x, [None, None, unsqueeze_1, _to_copy_3]) + index_3 = torch.ops.aten.index.Tensor(x, [None, None, unsqueeze_2, _to_copy_3]) + sub = torch.ops.aten.sub.Tensor(unsqueeze, unsqueeze_1) + sub_1 = torch.ops.aten.sub.Tensor(1.0, sub) + sub_2 = torch.ops.aten.sub.Tensor(mul_1, _to_copy_2) + sub_3 = torch.ops.aten.sub.Tensor(1.0, sub_2) + mul_2 = torch.ops.aten.mul.Tensor(index, sub_1) + mul_3 = torch.ops.aten.mul.Tensor(index_1, sub) + add = torch.ops.aten.add.Tensor(mul_2, mul_3) + mul_4 = torch.ops.aten.mul.Tensor(index_2, sub_1) + mul_5 = torch.ops.aten.mul.Tensor(index_3, sub) + add_1 = torch.ops.aten.add.Tensor(mul_4, mul_5) + mul_6 = torch.ops.aten.mul.Tensor(add, sub_3) + mul_7 = torch.ops.aten.mul.Tensor(add_1, sub_2) + add_2 = torch.ops.aten.add.Tensor(mul_6, mul_7) + return add_2 + + @staticmethod + def replacement(x, scale, output_size, input_size, device): + return torch.nn.functional.interpolate(x, size=output_size, mode='bilinear', align_corners=True) + +def apply_torch_reconstruct_patterns(aten): + patterns = [ + ReconstructBilinearResize2d(), + ] + + for p in patterns: + subgraph_rewriter.replace_pattern_with_filters(aten, p.pattern, p.replacement) + diff --git a/pybuda/pybuda/fx/trace.py b/pybuda/pybuda/fx/trace.py new file mode 100644 index 000000000..5ecd677f1 --- /dev/null +++ b/pybuda/pybuda/fx/trace.py @@ -0,0 +1,82 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + +import copy +from typing import List, Dict, Set, Optional +from collections import defaultdict + +from loguru import logger +import torch + +class IOTracer: + # For a list of graphs, find which inputs affect which outputs. Cache results to make tracing faster. 
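    # get_output_nodes() traces a graph lazily the first time one of its placeholders is queried,
    # caching the placeholder-to-outputs mapping in input_to_output_map so repeated cycle checks
    # over the same set of graphs stay cheap.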
+ def __init__(self, graphs: List[torch.fx.Graph]): + self.graphs = copy.copy(graphs) + assert len(graphs) == len(set(graphs)), "Graph list must not have duplicates" + self.input_to_output_map : Dict[torch.fx.Node, List[torch.fx.Node]] = {} + + def remove_graph(self, graph: torch.fx.Graph): + assert graph in self.graphs + self.graphs.remove(graph) + to_remove = [] + for input in self.input_to_output_map: + if input.graph == graph: + to_remove.append(input) + + def add_graph(self, graph: torch.fx.Graph): + assert graph not in self.graphs + self.graphs.append(graph) + + def get_output_nodes(self, input: torch.fx.Node) -> List[torch.fx.Node]: + if input not in self.input_to_output_map: + self._trace_graph(input.graph) + + return self.input_to_output_map[input] + + def _trace_graph(self, graph: torch.fx.Node): + # Trace all input to outputs + + # Keep track of visited noted, and which outputs they lead to, to avoid tracing the whole graph again for other inputs + node_to_output : Dict[torch.fx.Node, Set[torch.fx.Node]] = defaultdict(set) + + def trace(node: torch.fx.Node) -> Set[torch.fx.Node]: + for user in node.users: + if user.op == "output": + node_to_output[node].add(node) + elif user in node_to_output: # depth-first, so we should have already reached the outputs if we hit the node again - no cycles + node_to_output[node].update(node_to_output[user]) + else: + node_to_output[node].update(trace(user)) + return node_to_output[node] + + for node in graph.nodes: + if node.op == "placeholder": + self.input_to_output_map[node] = list(trace(node)) + + def trace_for_cycle(self, input_node: torch.fx.Node, outputs_to_dest_node: Dict[torch.fx.Node, Set[torch.fx.Node]]) -> Optional[torch.fx.Node]: + # Given an input node, and mapping of outputs to inputs in other graphs, trace through to see if a cycle exists, i.e. + # if we reach the original graph again. Return the output node through which we reached the cycle, or None if there are none + + logger.trace(f"Trace for cycle from {input_node}") + def trace(output_node: torch.fx.Node, target_graph: torch.fx.Graph) -> torch.fx.Node: + # Trace output node to other graphs, and see if target graph is reached + for input_node in outputs_to_dest_node[output_node]: + if input_node.graph == target_graph: + return input_node + + # Trace further + for output_node in self.get_output_nodes(input_node): + node = trace(output_node, target_graph) + if node: + return node + + return None + + for output_node in self.get_output_nodes(input_node): + node = trace(output_node, input_node.graph) + if node: + logger.trace(f" -- tracing found cycle through output {output_node} to {node}") + return node + + return None diff --git a/pybuda/pybuda/gpudevice.py b/pybuda/pybuda/gpudevice.py deleted file mode 100644 index 444cc558f..000000000 --- a/pybuda/pybuda/gpudevice.py +++ /dev/null @@ -1,22 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -from .cpudevice import CPUDevice - -class GPUDevice(CPUDevice): - """ - GPUDevice represents a GPU processor. It will spawn a process and run local operations on the assigned processor. - """ - - def __init__(self, - *args, - **kwargs - ): - """ - Create a GPU device with a given name. 
- """ - super().__init__(*args, **kwargs) - self.device = "cuda" - - def __repr__(self): - return f"GPUDevice '{self.name}'" diff --git a/pybuda/pybuda/module.py b/pybuda/pybuda/module.py index b6d102dcd..49cb5adb4 100644 --- a/pybuda/pybuda/module.py +++ b/pybuda/pybuda/module.py @@ -16,7 +16,6 @@ from .parameter import Parameter import onnx import onnxruntime -import mxnet as mx import jax.numpy as jnp import numpy as np @@ -185,6 +184,28 @@ def backward(self, *args) -> Tuple[torch.tensor]: outputs = tuple(a[0].backward(a[1]) for a in args) return outputs + def add_parameter(self, name: str, parameter: Parameter): + """ + Adds a new parameter. + + Parameters + ---------- + name: str + Parameter name + + parameter: Parameter + Parameter to add + + prepend_name: Bool + Whether to prepend module name to parameter name + """ + + if isinstance(parameter, pybuda.parameter.Parameter): + parameter = torch.nn.Parameter(parameter.value(), requires_grad=False) + if name in self.module._parameters: + raise RuntimeError(f"Module {self.name} already has parameter '{name}'") + self.module._parameters[name] = parameter + def set_parameters(self, **kwargs): """ Set parameters (weights) in this module, by name. @@ -215,12 +236,12 @@ def get_parameters(self) -> List[Parameter]: """ params = [] recorded_names = [] - all_params = [self.module.named_parameters(), self.module.named_buffers(), self.module.state_dict().items()] + all_params = [self.module.named_parameters(), self.module.named_buffers(), self.module.state_dict().items(), self.module._parameters.items()] for name, param in itertools.chain(*all_params): if name in recorded_names: continue pybuda_param = Parameter( - param, + param.cpu(), requires_grad = param.requires_grad, name=name) params.append(pybuda_param) @@ -473,45 +494,45 @@ def get_parameters(self) -> List[Parameter]: return [] # TODO -class MXNetModule(Module): - """ - A wrapper around a MXNet module. - """ - def __init__(self, name: str, module: mx.gluon.HybridBlock,): - """ - Create MXNet module wrapper. +# class MXNetModule(Module): +# """ +# A wrapper around a MXNet module. +# """ +# def __init__(self, name: str, module: mx.gluon.HybridBlock,): +# """ +# Create MXNet module wrapper. 
- Parameters - ---------- - module: mx.gluon.HybridBlock - MXNet module - """ - super().__init__(name) +# Parameters +# ---------- +# module: mx.gluon.HybridBlock +# MXNet module +# """ +# super().__init__(name) - if not isinstance(module, mx.gluon.HybridBlock): - raise RuntimeError("mx.gluon.HybridBlock module expected, got " + str(type(module))) - self.module = module +# if not isinstance(module, mx.gluon.HybridBlock): +# raise RuntimeError("mx.gluon.HybridBlock module expected, got " + str(type(module))) +# self.module = module - def forward(self, *args, **kwargs): - return self.module(*args) +# def forward(self, *args, **kwargs): +# return self.module(*args) - def call(self, *args, **kwargs): - raise NotImplementedError +# def call(self, *args, **kwargs): +# raise NotImplementedError - def backward(self, *args): +# def backward(self, *args): - raise NotImplementedError +# raise NotImplementedError - def set_parameters(self, **kwargs): - raise NotImplementedError +# def set_parameters(self, **kwargs): +# raise NotImplementedError - def cpu_eval_forward(self, *args, **kwargs): - mxnet_inputs = [mx.nd.array(x.detach().numpy()) for x in args] - outputs = self.module(*mxnet_inputs, **kwargs) - return to_pt_tensors(outputs) +# def cpu_eval_forward(self, *args, **kwargs): +# mxnet_inputs = [mx.nd.array(x.detach().numpy()) for x in args] +# outputs = self.module(*mxnet_inputs, **kwargs) +# return to_pt_tensors(outputs) - def get_parameters(self) -> List[Parameter]: - return [] # TODO +# def get_parameters(self) -> List[Parameter]: +# return [] # TODO class TFGraphDefModule(Module): @@ -929,3 +950,29 @@ def __init__(self, module: PyBudaModule, op_name: str, output_index: int): self.op_name = op_name self.output_index = output_index +def wrap_module(module, name: str)-> Module: + """ + Wrap a module in a PyBuda module + + Parameters + ---------- + module: Any + Module to wrap + + name: str + Name of the module + + Returns + ------- + Module + Wrapped module + """ + if isinstance(module, torch.nn.Module): + return PyTorchModule(name, module) + elif isinstance(module, tf.keras.Model): + return TFModule(name, module) + elif isinstance(module, PyBudaModule): + return module + else: + raise RuntimeError("Unsupported module type: " + str(type(module))) + diff --git a/pybuda/pybuda/op/dram_queue.py b/pybuda/pybuda/op/dram_queue.py index bbd46772b..24a3cc3e5 100644 --- a/pybuda/pybuda/op/dram_queue.py +++ b/pybuda/pybuda/op/dram_queue.py @@ -6,9 +6,8 @@ from ..tensor import Tensor from .common import PyBudaOp as op -DEFAULT_NUM_ENTRIES = 4 # configured as a heuristic to hide DRAM latency -def DRAMQueue(name: str, operandA: Tensor, *, num_entries: int = DEFAULT_NUM_ENTRIES) -> Tensor: +def DRAMQueue(name: str, operandA: Tensor, *, num_entries: int) -> Tensor: """ Explicit operation in the graph to buffer the input operand data through DRAM to its consumer(s). @@ -19,7 +18,11 @@ def DRAMQueue(name: str, operandA: Tensor, *, num_entries: int = DEFAULT_NUM_ENT Op name, unique to the module, or leave blank to autoset num_entries: int - configuration for the number of entries that can be stored in the queue + configuration for the number of entries that can be stored in the queue. + num_entries shouldn't have default value because if queue turns out to be static it should + have num_entries equal to microbatch_size. + Only in special cases, when we are sure we will need less space than microbatch size, we can + set num_entries to something less than microbatch_size. 
Returns ------- diff --git a/pybuda/pybuda/op/eltwise_unary.py b/pybuda/pybuda/op/eltwise_unary.py index 7bab0365f..dbaed489d 100644 --- a/pybuda/pybuda/op/eltwise_unary.py +++ b/pybuda/pybuda/op/eltwise_unary.py @@ -103,7 +103,9 @@ def Pow( def Identity( name: str, - operandA: Tensor) -> Tensor: + operandA: Tensor, + unsqueeze : str = None, + unsqueeze_dim : int = None) -> Tensor: """ Identity operation. @@ -116,13 +118,22 @@ def Identity( operandA: Tensor First operand + unsqueeze: str + If set, the operation returns a new tensor with a dimension of size one inserted at the specified position. + + unsqueeze_dim: int + The index at where singleton dimenion can be inserted + Returns ------- Tensor Buda tensor """ - return op("nop", name, operandA).get_tensor() + if unsqueeze==None and unsqueeze_dim==None: + return op("nop", name, operandA).get_tensor() + else: + return op("nop", name, operandA, unsqueeze=unsqueeze, unsqueeze_dim=unsqueeze_dim).get_tensor() def Buffer( @@ -322,7 +333,10 @@ def Argmax( Buda tensor """ - return op("argmax", name, operandA, attrs=((dim,) if dim is not None else ())).get_tensor() + if dim is not None: + return op("argmax", name, operandA, dim=dim ).get_tensor() + else: + return op("argmax", name, operandA).get_tensor() def Clip( name: str, @@ -352,7 +366,7 @@ def Clip( Buda tensor """ - return op("clip", name, operandA, attrs=(min, max)).get_tensor() + return op("clip", name, operandA, min=min, max=max).get_tensor() def Sine( name: str, @@ -470,7 +484,7 @@ def CumSum( assert not exclusive, "Currently not supported" - return op("cumsum", name, operandA, attrs=(axis, exclusive)).get_tensor() + return op("cumsum", name, operandA, axis=axis, exclusive=exclusive).get_tensor() def LogicalNot( diff --git a/pybuda/pybuda/op/eval/buda/__init__.py b/pybuda/pybuda/op/eval/buda/__init__.py index 41de5f0d4..12e6a5a97 100644 --- a/pybuda/pybuda/op/eval/buda/__init__.py +++ b/pybuda/pybuda/op/eval/buda/__init__.py @@ -6,6 +6,18 @@ from functools import lru_cache from .transpose import TransposeTM from .splice import Splice +from .exp import Exp +from .cosine import Cosine +from .ethernet_datacopy import EthernetDatacopy +from .reciprocal import Reciprocal +from .abs import Abs +from .tanh import Tanh +from .log import Log +from .nop import Nop +from .buffer import Buffer +from .sqrt import Sqrt +from .tilizer import Tilizer +from .clip import Clip op_to_module_map = { "add": "eltwise_binary", @@ -22,27 +34,26 @@ "less_equal": "eltwise_binary", "equal": "eltwise_binary", "not_equal": "eltwise_binary", - - "ethernet_datacopy": "eltwise_unary", - "nop": "eltwise_unary", - "buffer": "eltwise_unary", - "exp": "eltwise_unary", - "reciprocal": "eltwise_unary", - "sqrt": "eltwise_unary", + "ethernet_datacopy": EthernetDatacopy, + "exp": Exp, + "nop": Nop, + "buffer": Buffer, + "reciprocal": Reciprocal, + "sqrt": Sqrt, "lrelu": "eltwise_unary", "gelu": "eltwise_unary", "gelu_derivative": "eltwise_unary", - "log": "eltwise_unary", + "log": Log, "sigmoid": "eltwise_unary", - "clip": "eltwise_unary", + "clip": Clip, "reduce": "eltwise_unary", - "tanh": "eltwise_unary", - "abs": "eltwise_unary", + "abs": Abs, + "tanh": Tanh, "dropout": "eltwise_unary", - "cosine": "eltwise_unary", + "cosine": Cosine, "sine": "eltwise_unary", "power": "eltwise_unary", - "tilizer": "eltwise_unary", + "tilizer": Tilizer, "conv_sum": "eltwise_nary", "hconcat": "eltwise_nary", diff --git a/pybuda/pybuda/op/eval/buda/abs.py b/pybuda/pybuda/op/eval/buda/abs.py new file mode 100644 index 000000000..b68457345 --- 
/dev/null +++ b/pybuda/pybuda/op/eval/buda/abs.py @@ -0,0 +1,75 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +import os + +from ..interface import BudaEltwiseUnaryOp + +import torch +import pybuda +from pybuda.utils import align_up_tile, round_up_div +from .tm import eval as tm_eval +from pybuda.pybudaglobal import TILE_DIM +from pybuda._C.graph import UBlockOrder, Shape + + +class Abs(BudaEltwiseUnaryOp): + @classmethod + def create(cls, vector=None): + self = cls("abs") + if vector is not None: + self.set_buda_attr("vector", vector) + return self + + def eval(self, tensors): + assert len(tensors) == 1, "Abs should have one input" + shape = tensors[0].shape + original_types = [o.dtype for o in tensors] + ret = torch.abs(tensors[0]) + if ret.dtype != original_types[0]: + ret = ret.type(original_types[0]) + + return ret + + def shape(self, tensor_shapes, tile_height, tile_width): + assert len(tensor_shapes) == 1, "Abs should have one input" + shape = tensor_shapes[0] + if tile_height == TILE_DIM: + shape[-2] = align_up_tile(shape[-2]) + elif tile_height < TILE_DIM: + shape[-2] = tile_height + else: + raise RuntimeError( + f"Tile height {tile_height} is larger than max allowed TILE_DIM {TILE_DIM}" + ) + + return shape, [] + + def parallelization(self, op_shape, fracture_factor): + return (op_shape.outputs[0].rt, op_shape.outputs[0].ct) + + def input_ublock_order(self, num_operands): + return None + + def execution_cycles(self, arch_name, op_model) -> int: + op_model_desc = op_model_to_desc("abs", arch_name, op_model) + + compiler_cache_cycles = get_compiler_cached_cycles(op_model_desc) + if compiler_cache_cycles is not None: + return compiler_cache_cycles + + use_legacy_path = bool( + int(os.environ.get("PYBUDA_TEMP_ELT_UNARY_ESTIMATES_LEGACY", "0")) + ) + + if use_legacy_path: + tile_weight = get_op_model_param(op_model_desc, "tile_weight") + output_shape = op_model.op_shape.outputs[0] + num_tiles = (output_shape.z * output_shape.rt * output_shape.ct) / ( + op_model.grid_shape.r * op_model.grid_shape.c + ) + cycle_count = tile_weight * num_tiles + return min(int(cycle_count), 1 << 30) + + return get_op_model_execution_cycles(op_model_desc) diff --git a/pybuda/pybuda/op/eval/buda/buffer.py b/pybuda/pybuda/op/eval/buda/buffer.py new file mode 100644 index 000000000..c15c28e22 --- /dev/null +++ b/pybuda/pybuda/op/eval/buda/buffer.py @@ -0,0 +1,70 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +from ..interface import BudaEltwiseUnaryOp + +import torch +from pybuda.utils import align_up_tile, round_up_div +from .tm import eval as tm_eval +from pybuda.pybudaglobal import TILE_DIM +from pybuda._C.graph import UBlockOrder, Shape + + +class Buffer(BudaEltwiseUnaryOp): + @classmethod + def create(cls): + self = cls("buffer") + return self + + def eval(self, tensors): + assert len(tensors) == 1, "buffer should have one input" + shape = tensors[0].shape + original_types = [o.dtype for o in tensors] + ret = tensors[0] + + if ret.dtype != original_types[0]: + ret = ret.type(original_types[0]) + return ret + + def shape(self, tensor_shapes, tile_height, tile_width): + assert len(tensor_shapes) == 1, "buffer should have one input" + shape = tensor_shapes[0] + if tile_height == TILE_DIM: + shape[-2] = align_up_tile(shape[-2]) + elif tile_height < TILE_DIM: + shape[-2] = tile_height + else: + raise RuntimeError( + f"Tile height {tile_height} is larger than max allowed TILE_DIM {TILE_DIM}" + ) + + 
return shape, [] + + def parallelization(self, op_shape, fracture_factor): + return (op_shape.outputs[0].rt, op_shape.outputs[0].ct) + + def input_ublock_order(self, num_operands): + return None + + def execution_cycles(self, arch_name, op_model) -> int: + op_model_desc = op_model_to_desc("buffer", arch_name, op_model) + + compiler_cache_cycles = get_compiler_cached_cycles(op_model_desc) + if compiler_cache_cycles is not None: + return compiler_cache_cycles + + use_legacy_path = bool( + int(os.environ.get("PYBUDA_TEMP_ELT_UNARY_ESTIMATES_LEGACY", "0")) + ) + + if use_legacy_path: + tile_weight = get_op_model_param(op_model_desc, "tile_weight") + output_shape = op_model.op_shape.outputs[0] + num_tiles = (output_shape.z * output_shape.rt * output_shape.ct) / ( + op_model.grid_shape.r * op_model.grid_shape.c + ) + cycle_count = tile_weight * num_tiles + return min(int(cycle_count), 1 << 30) + + return get_op_model_execution_cycles(op_model_desc) diff --git a/pybuda/pybuda/op/eval/buda/clip.py b/pybuda/pybuda/op/eval/buda/clip.py new file mode 100644 index 000000000..97a077dd9 --- /dev/null +++ b/pybuda/pybuda/op/eval/buda/clip.py @@ -0,0 +1,74 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +import os + +from ..interface import BudaEltwiseUnaryOp + +import torch +from pybuda.utils import align_up_tile, round_up_div +from .tm import eval as tm_eval +from pybuda.pybudaglobal import TILE_DIM +from pybuda._C.graph import UBlockOrder, Shape + + +class Clip(BudaEltwiseUnaryOp): + @classmethod + def create(cls, min=float('-inf'), max=float('inf')): + self = cls("clip") + self.min = min + self.max = max + return self + + def eval(self, tensors): + assert len(tensors) == 1, "clip should have one input" + shape = tensors[0].shape + original_types = [o.dtype for o in tensors] + ret = torch.clip(tensors[0], min=self.min, max=self.max) + if ret.dtype != original_types[0]: + ret = ret.type(original_types[0]) + + return ret + + def shape(self, tensor_shapes, tile_height, tile_width): + assert len(tensor_shapes) == 1, "Clip should have one input" + shape = tensor_shapes[0] + if tile_height == TILE_DIM: + shape[-2] = align_up_tile(shape[-2]) + elif tile_height < TILE_DIM: + shape[-2] = tile_height + else: + raise RuntimeError( + f"Tile height {tile_height} is larger than max allowed TILE_DIM {TILE_DIM}" + ) + + return shape, [] + + def parallelization(self, op_shape, fracture_factor): + return (op_shape.outputs[0].rt, op_shape.outputs[0].ct) + + def input_ublock_order(self, num_operands): + return None + + def execution_cycles(self, arch_name, op_model) -> int: + op_model_desc = op_model_to_desc("clip", arch_name, op_model) + + compiler_cache_cycles = get_compiler_cached_cycles(op_model_desc) + if compiler_cache_cycles is not None: + return compiler_cache_cycles + + use_legacy_path = bool( + int(os.environ.get("PYBUDA_TEMP_ELT_UNARY_ESTIMATES_LEGACY", "0")) + ) + + if use_legacy_path: + tile_weight = get_op_model_param(op_model_desc, "tile_weight") + output_shape = op_model.op_shape.outputs[0] + num_tiles = (output_shape.z * output_shape.rt * output_shape.ct) / ( + op_model.grid_shape.r * op_model.grid_shape.c + ) + cycle_count = tile_weight * num_tiles + return min(int(cycle_count), 1 << 30) + + return get_op_model_execution_cycles(op_model_desc) diff --git a/pybuda/pybuda/op/eval/buda/cosine.py b/pybuda/pybuda/op/eval/buda/cosine.py new file mode 100644 index 000000000..335b9fb54 --- /dev/null +++ b/pybuda/pybuda/op/eval/buda/cosine.py @@ -0,0 +1,77 @@ +# 
SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +import os + +from ..interface import BudaEltwiseUnaryOp + +import torch +import pybuda + +from pybuda.utils import align_up_tile, round_up_div +from .tm import eval as tm_eval +from pybuda.tensor import pad_pytorch_tensor_to_buda +from pybuda.pybudaglobal import TILE_DIM +from pybuda._C.graph import UBlockOrder, Shape + + +class Cosine(BudaEltwiseUnaryOp): + @classmethod + def create(cls, vector=None): + self = cls("cosine") + if vector is not None: + self.set_buda_attr("vector", vector) + return self + + def eval(self, tensors): + assert len(tensors) == 1, "Cosine should have one input" + shape = tensors[0].shape + original_types = [o.dtype for o in tensors] + ret = torch.cos(tensors[0]) + if ret.dtype != original_types[0]: + ret = ret.type(original_types[0]) + + return ret + + def shape(self, tensor_shapes, tile_height, tile_width): + assert len(tensor_shapes) == 1, "Cosine should have one input" + shape = tensor_shapes[0] + if tile_height == TILE_DIM: + shape[-2] = align_up_tile(shape[-2]) + elif tile_height < TILE_DIM: + shape[-2] = tile_height + else: + raise RuntimeError( + f"Tile height {tile_height} is larger than max allowed TILE_DIM {TILE_DIM}" + ) + + return shape, [] + + def parallelization(self, op_shape, fracture_factor): + return (op_shape.outputs[0].rt, op_shape.outputs[0].ct) + + def input_ublock_order(self, num_operands): + return None + + def execution_cycles(self, arch_name, op_model) -> int: + op_model_desc = op_model_to_desc("cosine", arch_name, op_model) + + compiler_cache_cycles = get_compiler_cached_cycles(op_model_desc) + if compiler_cache_cycles is not None: + return compiler_cache_cycles + + use_legacy_path = bool( + int(os.environ.get("PYBUDA_TEMP_ELT_UNARY_ESTIMATES_LEGACY", "0")) + ) + + if use_legacy_path: + tile_weight = get_op_model_param(op_model_desc, "tile_weight") + output_shape = op_model.op_shape.outputs[0] + num_tiles = (output_shape.z * output_shape.rt * output_shape.ct) / ( + op_model.grid_shape.r * op_model.grid_shape.c + ) + cycle_count = tile_weight * num_tiles + return min(int(cycle_count), 1 << 30) + + return get_op_model_execution_cycles(op_model_desc) diff --git a/pybuda/pybuda/op/eval/buda/depthwise.py b/pybuda/pybuda/op/eval/buda/depthwise.py index d0ecaf8b2..942a00f3e 100644 --- a/pybuda/pybuda/op/eval/buda/depthwise.py +++ b/pybuda/pybuda/op/eval/buda/depthwise.py @@ -17,7 +17,7 @@ from pybuda.utils import align_up_tile from pybuda._C.graph import UBlockOrder -from ..common import to_torch_operands, math_fidelity_to_multiplier, data_format_to_int, op_model_to_desc +from ..common import to_torch_operands, math_fidelity_to_multiplier, data_format_to_int, op_model_to_desc, cast_for_cpu_eval def eval(type, attr, ops): @@ -25,6 +25,7 @@ def eval(type, attr, ops): assert len(attr) == 1, f"Unexpected number of attrs for depthwise matmul: {len(attr)}" t_ops = to_torch_operands(*ops) + t_ops, original_type = cast_for_cpu_eval(t_ops, type) in0 = t_ops[0] in1 = t_ops[1] bias = t_ops[2] if len(t_ops) == 3 else None @@ -53,7 +54,7 @@ def eval(type, attr, ops): if bias is not None: result += bias - return result + return result.to(original_type) def shape(type, attr, ops, tile_height, tile_width): diff --git a/pybuda/pybuda/op/eval/buda/ethernet_datacopy.py b/pybuda/pybuda/op/eval/buda/ethernet_datacopy.py new file mode 100644 index 000000000..c722d244e --- /dev/null +++ b/pybuda/pybuda/op/eval/buda/ethernet_datacopy.py @@ -0,0 +1,67 @@ +# 
SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import os + +from ..interface import BudaEltwiseUnaryOp + +import torch +import pybuda +from pybuda.utils import align_up_tile, round_up_div +from .tm import eval as tm_eval +from ..common import to_torch_operands +from pybuda.tensor import pad_pytorch_tensor_to_buda +from pybuda.pybudaglobal import TILE_DIM +from pybuda._C.graph import UBlockOrder, Shape + + +class EthernetDatacopy(BudaEltwiseUnaryOp): + @classmethod + def create(cls): + self = cls("ethernet_datacopy") + return self + + def eval(self, tensors): + assert len(tensors) == 1, "Ethernet Datacopy should have one input" + shape = tensors[0].shape + original_types = [o.dtype for o in tensors] + ret = tensors[0] + if ret.dtype != original_types[0]: + ret = ret.type(original_types[0]) + + return ret + + def shape(self, tensor_shapes, tile_height, tile_width): + shape = tensor_shapes[0] + assert len(tensor_shapes) == 1, "Ethernet Datacopy should have one input" + if tile_height == TILE_DIM: + shape[-2] = align_up_tile(shape[-2]) + elif tile_height < TILE_DIM: + shape[-2] = tile_height + else: + raise RuntimeError( + f"Tile height {tile_height} is larger than max allowed TILE_DIM {TILE_DIM}" + ) + + return shape, [] + + def parallelization(self, op_shape, fracture_factor): + return (op_shape.outputs[0].rt, op_shape.outputs[0].ct) + + def input_ublock_order(self, num_operands): + return None + + def execution_cycles(self, arch_name, op_model) -> int: + op_model_desc = op_model_to_desc("ethernet_datacopy", arch_name, op_model) + + compiler_cache_cycles = get_compiler_cached_cycles(op_model_desc) + if compiler_cache_cycles is not None: + return compiler_cache_cycles + + tile_weight = get_op_model_param(op_model_desc, "tile_weight") + output_shape = op_model.op_shape.outputs[0] + num_tiles = (output_shape.z * output_shape.rt * output_shape.ct) / ( + op_model.grid_shape.r * op_model.grid_shape.c + ) + cycle_count = tile_weight * num_tiles + return min(int(cycle_count), 1 << 30) diff --git a/pybuda/pybuda/op/eval/buda/exp.py b/pybuda/pybuda/op/eval/buda/exp.py new file mode 100644 index 000000000..b881a541a --- /dev/null +++ b/pybuda/pybuda/op/eval/buda/exp.py @@ -0,0 +1,78 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +import os + +from ..interface import BudaEltwiseUnaryOp + +import torch +import pybuda +from pybuda._C import UnsupportedHWOpsError +from pybuda.utils import align_up_tile, round_up_div +from ..common import to_torch_operands +from pybuda.tensor import pad_pytorch_tensor_to_buda +from pybuda.pybudaglobal import TILE_DIM +from pybuda._C.graph import UBlockOrder, Shape + + +class Exp(BudaEltwiseUnaryOp): + @classmethod + def create(cls, approximate_mode=None): + self = cls("exp") + if approximate_mode is not None: + self.set_buda_attr("approximate_mode", approximate_mode) + return self + + def eval(self, tensors): + assert len(tensors) == 1, "Exp should have one input" + shape = tensors[0].shape + original_types = [o.dtype for o in tensors] + ret = torch.exp(tensors[0]) + + if ret.dtype != original_types[0]: + ret = ret.type(original_types[0]) + + return ret + + def shape(self, tensor_shapes, tile_height, tile_width): + assert len(tensor_shapes) == 1, "Exp should have one input" + shape = tensor_shapes[0] + if tile_height == TILE_DIM: + shape[-2] = align_up_tile(shape[-2]) + elif tile_height < TILE_DIM: + shape[-2] = tile_height + else: + raise RuntimeError( + f"Tile height 
{tile_height} is larger than max allowed TILE_DIM {TILE_DIM}" + ) + + return shape, [] + + def parallelization(self, op_shape, fracture_factor): + return (op_shape.outputs[0].rt, op_shape.outputs[0].ct) + + def input_ublock_order(self, num_operands): + return None + + def execution_cycles(self, arch_name, op_model) -> int: + op_model_desc = op_model_to_desc("exp", arch_name, op_model) + + compiler_cache_cycles = get_compiler_cached_cycles(op_model_desc) + if compiler_cache_cycles is not None: + return compiler_cache_cycles + + use_legacy_path = bool( + int(os.environ.get("PYBUDA_TEMP_ELT_UNARY_ESTIMATES_LEGACY", "0")) + ) + + if use_legacy_path: + tile_weight = get_op_model_param(op_model_desc, "tile_weight") + output_shape = op_model.op_shape.outputs[0] + num_tiles = (output_shape.z * output_shape.rt * output_shape.ct) / ( + op_model.grid_shape.r * op_model.grid_shape.c + ) + cycle_count = tile_weight * num_tiles + return min(int(cycle_count), 1 << 30) + + return get_op_model_execution_cycles(op_model_desc) diff --git a/pybuda/pybuda/op/eval/buda/log.py b/pybuda/pybuda/op/eval/buda/log.py new file mode 100644 index 000000000..4306aaa79 --- /dev/null +++ b/pybuda/pybuda/op/eval/buda/log.py @@ -0,0 +1,76 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +import os + +from ..interface import BudaEltwiseUnaryOp + +import torch +from pybuda.utils import align_up_tile, round_up_div +from .tm import eval as tm_eval +from pybuda.pybudaglobal import TILE_DIM +from pybuda._C.graph import UBlockOrder, Shape + + +class Log(BudaEltwiseUnaryOp): + @classmethod + def create(cls, vector=None): + self = cls("log") + if vector is not None: + self.set_buda_attr("vector", vector) + return self + + def eval(self, tensors): + assert len(tensors) == 1, "Log should have one input" + shape = tensors[0].shape + original_types = [o.dtype for o in tensors] + + ret = torch.log(tensors[0] + 1e-10) + + if ret.dtype != original_types[0]: + ret = ret.type(original_types[0]) + + return ret + + def shape(self, tensor_shapes, tile_height, tile_width): + assert len(tensor_shapes) == 1, "Log should have one input" + shape = tensor_shapes[0] + if tile_height == TILE_DIM: + shape[-2] = align_up_tile(shape[-2]) + elif tile_height < TILE_DIM: + shape[-2] = tile_height + else: + raise RuntimeError( + f"Tile height {tile_height} is larger than max allowed TILE_DIM {TILE_DIM}" + ) + + return shape, [] + + def parallelization(self, op_shape, fracture_factor): + return (op_shape.outputs[0].rt, op_shape.outputs[0].ct) + + def input_ublock_order(self, num_operands): + return None + + def execution_cycles(self, arch_name, op_model) -> int: + op_model_desc = op_model_to_desc("log", arch_name, op_model) + + compiler_cache_cycles = get_compiler_cached_cycles(op_model_desc) + if compiler_cache_cycles is not None: + return compiler_cache_cycles + + use_legacy_path = bool( + int(os.environ.get("PYBUDA_TEMP_ELT_UNARY_ESTIMATES_LEGACY", "0")) + ) + + if use_legacy_path: + tile_weight = get_op_model_param(op_model_desc, "tile_weight") + output_shape = op_model.op_shape.outputs[0] + num_tiles = (output_shape.z * output_shape.rt * output_shape.ct) / ( + op_model.grid_shape.r * op_model.grid_shape.c + ) + cycle_count = tile_weight * num_tiles + return min(int(cycle_count), 1 << 30) + + return get_op_model_execution_cycles(op_model_desc) diff --git a/pybuda/pybuda/op/eval/buda/matmul.py b/pybuda/pybuda/op/eval/buda/matmul.py index 32fdd4a9c..3e38b077b 100644 --- a/pybuda/pybuda/op/eval/buda/matmul.py 
+++ b/pybuda/pybuda/op/eval/buda/matmul.py @@ -388,16 +388,55 @@ def input_ublock_order(type, attr, num_operands): def execution_cycles(type, arch_name, op_model, theoretical) -> int: + # Special handling for sparse matmul as the backend API assumes 1x1 grid, but each sparse matmul core can do + # different amount of work, depending on what the encodings (sparse tensor) look like. Call for each core to find + # the slowest one. + # + if op_model.is_sparse_matmul: + # Calculate cycles per core + # + if ( + os.environ.get("PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS", False) + and os.environ.get("PYBUDA_TEMP_SPARSE_ESTIMATE_ARGS_PER_CORE", False) + ): + cycles_to_return = 0 + for r in range(op_model.grid_shape.r): + # Generate op model desc for current core + # + op_model_desc = op_model_to_desc(type, arch_name, op_model, sparse_r=r) + + # Get execution cycles, try from cache first, if miss, then calculate + # + curr_cycles = 0 + compiler_cache_cycles = get_compiler_cached_cycles(op_model_desc) + if compiler_cache_cycles is not None: + curr_cycles = compiler_cache_cycles + else: + curr_cycles = get_op_model_execution_cycles(op_model_desc) + + # Save max cycles + # + cycles_to_return = max(cycles_to_return, curr_cycles) + else: + # Otherwise fallback to default behavior (create single op_model_desc, and let it decide whether to average + # parameters, or to sum and pretend everything is on a single core) + # + op_model_desc = op_model_to_desc(type, arch_name, op_model) + compiler_cache_cycles = get_compiler_cached_cycles(op_model_desc) + if compiler_cache_cycles is not None: + cycles_to_return = compiler_cache_cycles + else: + cycles_to_return = get_op_model_execution_cycles(op_model_desc) + + return cycles_to_return + # End sparse matmul exec cycles calculation + op_model_desc = op_model_to_desc(type, arch_name, op_model) compiler_cache_cycles = get_compiler_cached_cycles(op_model_desc) if compiler_cache_cycles is not None: return compiler_cache_cycles - is_sparse = op_model.is_sparse_matmul - if is_sparse: - return get_op_model_execution_cycles(op_model_desc) - # Math fidelity and data format are just estimated guesses for now math_fid = math_fidelity_to_multiplier(op_model.math_fidelity()) u_kt = op_model.input_buffers[0].block_shape.ublock.ct @@ -418,11 +457,32 @@ def execution_cycles(type, arch_name, op_model, theoretical) -> int: x = [input0_df,output_df,math_fid,t,mblock_m,mblock_n,ublock_rt,ublock_ct,m_k,u_kt,mblock_executions,ublock_executions,0] cycle_count = cyclenet_execution_cycles(type, torch.tensor(x, dtype=torch.float32)) elif theoretical: - tile_weight = 32 if arch_name == 'grayskull' else 18 + tile_weight = 32 if arch_name == 'grayskull' else 16 cycle_count = t * ublock_executions * math_fid * tile_weight # based on max throughput for the chip else: cycle_count = get_op_model_execution_cycles(op_model_desc) + if op_model.input_buffers[0].data_format == DataFormat.Int8: + if op_model.buda_op_attrs().get("bias") is True: + op_model_desc.type = "nop" + op_model_desc.mblock_k = 0 + op_model_desc.ublock_kt = 0 + cycle_count += get_op_model_execution_cycles(op_model_desc) + + if op_model.buda_op_attrs().get("requant") is True: + op_model_desc.type = "requantization" + op_model_desc.mblock_k = 0 + op_model_desc.ublock_kt = 0 + op_model_desc.math_fidelity = MathFidelity.HiFi4 + cycle_count += get_op_model_execution_cycles(op_model_desc) + + if op_model.buda_op_attrs().get("dequant") is True: + op_model_desc.type = "dequantization" + op_model_desc.mblock_k = 0 + op_model_desc.ublock_kt = 
0 + op_model_desc.math_fidelity = MathFidelity.HiFi4 + cycle_count += get_op_model_execution_cycles(op_model_desc) + return cycle_count def cyclenet_execution_cycles(type, X) -> int: diff --git a/pybuda/pybuda/op/eval/buda/nop.py b/pybuda/pybuda/op/eval/buda/nop.py new file mode 100644 index 000000000..a7372a6b9 --- /dev/null +++ b/pybuda/pybuda/op/eval/buda/nop.py @@ -0,0 +1,106 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +import os + +from ..interface import BudaEltwiseUnaryOp + +import torch +from pybuda.utils import align_up_tile, round_up_div +from .tm import eval as tm_eval +from pybuda.pybudaglobal import TILE_DIM +from pybuda._C.graph import UBlockOrder, Shape + + +class Nop(BudaEltwiseUnaryOp): + @classmethod + def create( + cls, + relu_en=False, + relu_threshold=0, + relu_mode=None, + squeeze=None, + squeeze_dim=None, + unsqueeze=None, + unsqueeze_dim=None, + ): + self = cls("nop") + # Adding relu buda attr for Nop relu + if relu_en == True: + self.set_buda_attr("relu_en", relu_en) + self.set_buda_attr("relu_threshold", relu_threshold) + self.set_buda_attr("relu_mode", relu_mode) + # Adding (un)squeeze attr for Nop (un)squeeze + self.squeeze = squeeze + self.squeeze_dim = squeeze_dim + self.unsqueeze = unsqueeze + self.unsqueeze_dim = unsqueeze_dim + + return self + + def eval(self, tensors): + assert len(tensors) == 1, "nop should have one input" + shape = tensors[0].shape + original_types = [o.dtype for o in tensors] + ret = tensors[0] + + if ret.dtype != original_types[0]: + ret = ret.type(original_types[0]) + return ret + + def shape(self, tensor_shapes, tile_height, tile_width): + assert len(tensor_shapes) == 1, "Eltwise unary should have one input" + shape = tensor_shapes[0] + if tile_height == TILE_DIM: + shape[-2] = align_up_tile(shape[-2]) + elif tile_height < TILE_DIM: + shape[-2] = tile_height + else: + raise RuntimeError( + f"Tile height {tile_height} is larger than max allowed TILE_DIM {TILE_DIM}" + ) + + # Add NOP squeeze condition squash 5D -> 4D for squeeze NOP + if hasattr(self, 'squeeze') and hasattr(self, 'squeeze_dim'): + if (self.squeeze and self.squeeze_dim != None): + if self.squeeze_dim == 0: + ops_updated = Shape.create_buda(shape[1:], tile_height, tile_width) + return ops_updated, [] + + # Add NOP unsqueeze condition extend 4D -> 5D for unsqueeze NOP + if hasattr(self, 'unsqueeze') and hasattr(self, 'unsqueeze_dim'): + if (self.unsqueeze is not None and self.unsqueeze_dim is not None): + if self.unsqueeze_dim == 4: + ops_updated = Shape.create_buda([1] + shape, tile_height, tile_width) + return ops_updated, [] + + return shape, [] + + def parallelization(self, op_shape, fracture_factor): + return (op_shape.outputs[0].rt, op_shape.outputs[0].ct) + + def input_ublock_order(self, num_operands): + return None + + def execution_cycles(self, arch_name, op_model) -> int: + op_model_desc = op_model_to_desc("nop", arch_name, op_model) + + compiler_cache_cycles = get_compiler_cached_cycles(op_model_desc) + if compiler_cache_cycles is not None: + return compiler_cache_cycles + + use_legacy_path = bool( + int(os.environ.get("PYBUDA_TEMP_ELT_UNARY_ESTIMATES_LEGACY", "0")) + ) + + if use_legacy_path: + tile_weight = get_op_model_param(op_model_desc, "tile_weight") + output_shape = op_model.op_shape.outputs[0] + num_tiles = (output_shape.z * output_shape.rt * output_shape.ct) / ( + op_model.grid_shape.r * op_model.grid_shape.c + ) + cycle_count = tile_weight * num_tiles + return min(int(cycle_count), 1 << 30) + + 
return get_op_model_execution_cycles(op_model_desc) diff --git a/pybuda/pybuda/op/eval/buda/quantize.py b/pybuda/pybuda/op/eval/buda/quantize.py index 098058941..bc5299ae0 100644 --- a/pybuda/pybuda/op/eval/buda/quantize.py +++ b/pybuda/pybuda/op/eval/buda/quantize.py @@ -17,7 +17,7 @@ from pybuda.utils import align_up_tile, align_up from pybuda._C.graph import UBlockOrder -from ..common import to_torch_operands, math_fidelity_to_multiplier, data_format_to_int, op_model_to_desc +from ..common import op_model_to_desc, get_compiler_cached_cycles from pybuda.op.eval.pybuda.quantize import STRING_TO_LOWER_LIMIT, STRING_TO_UPPER_LIMIT, STRING_TO_TORCH_DTYPE def eval(type, attr, ops): @@ -34,8 +34,8 @@ def eval(type, attr, ops): elif type == "dequantization": zero_point, axis = attr - input_float = ops[0] - scale = ops[1] + input_float = ops[0].float() + scale = ops[1].float() output_float = (input_float - zero_point) * scale return output_float @@ -86,6 +86,17 @@ def input_ublock_order(type, attr, num_operands): return None def execution_cycles(type, arch_name, op_model) -> int: - b = op_model.output_buffers[0].block_shape - cycles_per_tile = 32 * 20 - return b.mblock_m * b.mblock_n * b.ublock.rt * b.ublock.ct * b.t * cycles_per_tile + op_model_desc = op_model_to_desc(type, arch_name, op_model) + + # for dequant and requant input0 format is important, + # output format is always Int8 for requant and Float32 for dequant; + # this is a workaround until we expand the API to accept all data formats for an op + if (op_model_desc.type == "dequantization" or op_model_desc.type == "requantization"): + op_model_desc.data_format = op_model.input_buffers[0].data_format + + compiler_cache_cycles = get_compiler_cached_cycles(op_model_desc) + if compiler_cache_cycles is not None: + return compiler_cache_cycles + + cycle_count = get_op_model_execution_cycles(op_model_desc) + return cycle_count diff --git a/pybuda/pybuda/op/eval/buda/reciprocal.py b/pybuda/pybuda/op/eval/buda/reciprocal.py new file mode 100644 index 000000000..6cadc40d7 --- /dev/null +++ b/pybuda/pybuda/op/eval/buda/reciprocal.py @@ -0,0 +1,79 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +import os + +from ..interface import BudaEltwiseUnaryOp + +import torch +import pybuda +from pybuda.utils import align_up_tile, round_up_div +from .tm import eval as tm_eval +from pybuda.pybudaglobal import TILE_DIM +from pybuda._C.graph import UBlockOrder, Shape + + +class Reciprocal(BudaEltwiseUnaryOp): + @classmethod + def create(cls, approximate_mode=None, vector=None): + self = cls("reciprocal") + if approximate_mode is not None: + self.set_buda_attr("approximate_mode", approximate_mode) + if vector is not None: + self.set_buda_attr("vector", vector) + return self + + def eval(self, tensors): + assert len(tensors) == 1, "Reciprocal should have one input" + shape = tensors[0].shape + original_types = [o.dtype for o in tensors] + + ret = torch.reciprocal(tensors[0] + 1e-10) # add epsilon to avoid infinity + + if ret.dtype != original_types[0]: + ret = ret.type(original_types[0]) + + return ret + + def shape(self, tensor_shapes, tile_height, tile_width): + assert len(tensor_shapes) == 1, "Reciprocal should have one input" + shape = tensor_shapes[0] + if tile_height == TILE_DIM: + shape[-2] = align_up_tile(shape[-2]) + elif tile_height < TILE_DIM: + shape[-2] = tile_height + else: + raise RuntimeError( + f"Tile height {tile_height} is larger than max allowed TILE_DIM {TILE_DIM}" + ) + + return shape, [] 
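As a minimal standalone sketch (plain PyTorch, independent of this diff), the epsilon guard used in Reciprocal.eval above behaves as follows; without the added 1e-10, any zero in the input turns into inf:

import torch

x = torch.tensor([0.0, 0.5, 2.0])
print(torch.reciprocal(x))          # tensor([inf, 2.0000, 0.5000])
print(torch.reciprocal(x + 1e-10))  # tensor([1.0000e+10, 2.0000e+00, 5.0000e-01]) -- finite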
+ + def parallelization(self, op_shape, fracture_factor): + return (op_shape.outputs[0].rt, op_shape.outputs[0].ct) + + def input_ublock_order(self, num_operands): + return None + + def execution_cycles(self, arch_name, op_model) -> int: + op_model_desc = op_model_to_desc("reciprocal", arch_name, op_model) + + compiler_cache_cycles = get_compiler_cached_cycles(op_model_desc) + if compiler_cache_cycles is not None: + return compiler_cache_cycles + + use_legacy_path = bool( + int(os.environ.get("PYBUDA_TEMP_ELT_UNARY_ESTIMATES_LEGACY", "0")) + ) + + if use_legacy_path: + tile_weight = get_op_model_param(op_model_desc, "tile_weight") + output_shape = op_model.op_shape.outputs[0] + num_tiles = (output_shape.z * output_shape.rt * output_shape.ct) / ( + op_model.grid_shape.r * op_model.grid_shape.c + ) + cycle_count = tile_weight * num_tiles + return min(int(cycle_count), 1 << 30) + + return get_op_model_execution_cycles(op_model_desc) diff --git a/pybuda/pybuda/op/eval/buda/splice.py b/pybuda/pybuda/op/eval/buda/splice.py index ecf9e8581..a8e43651a 100644 --- a/pybuda/pybuda/op/eval/buda/splice.py +++ b/pybuda/pybuda/op/eval/buda/splice.py @@ -5,11 +5,8 @@ import torch import pybuda -from pybuda._C.backend_api import get_op_model_execution_cycles -from pybuda._C.balancer import FactorizedInt from pybuda._C import UnsupportedHWOpsError from ....pybudaglobal import TILE_DIM -from ..common import op_model_to_desc from pybuda.utils import align_up_tile, round_up_div from .tm import eval as tm_eval @@ -262,8 +259,8 @@ def shape(self, tensor_shapes, tile_height, tile_width): ), f"Select should have only 1 tensor_shape: len(tensor_shapes) = {len(tensor_shapes)}" assert len(self.canonical_ranges) == 1 shape = list(tensor_shapes[0]) - index, length, stride = self.canonical_ranges[0] - shape[self.dim] = length * round_up_div(shape[self.dim], index + stride) + index, length, stride = self.ranges[0] if self.dim == 1 else self.canonical_ranges[0] + shape[self.dim] = length * round_up_div(shape[self.dim] - index, index + stride) if self.dim >= 2: shape[self.dim] = align_up_tile(shape[self.dim]) return tuple(shape), [] diff --git a/pybuda/pybuda/op/eval/buda/sqrt.py b/pybuda/pybuda/op/eval/buda/sqrt.py new file mode 100644 index 000000000..567d7c99c --- /dev/null +++ b/pybuda/pybuda/op/eval/buda/sqrt.py @@ -0,0 +1,68 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +import os + +from ..interface import BudaEltwiseUnaryOp + +import torch +from pybuda.utils import align_up_tile, round_up_div +from .tm import eval as tm_eval +from pybuda.pybudaglobal import TILE_DIM +from pybuda._C.graph import UBlockOrder, Shape + +class Sqrt(BudaEltwiseUnaryOp): + @classmethod + def create(cls): + self = cls("sqrt") + return self + + def eval(self, tensors): + assert len(tensors) == 1, "Sqrt should have one input" + shape = tensors[0].shape + original_types = [o.dtype for o in tensors] + + ret = torch.sqrt(tensors[0]) + + if ret.dtype != original_types[0]: + ret = ret.type(original_types[0]) + + return ret + + def shape(self, tensor_shapes, tile_height, tile_width): + assert len(tensor_shapes) == 1, "Sqrt should have one input" + shape = tensor_shapes[0] + if tile_height == TILE_DIM: + shape[-2] = align_up_tile(shape[-2]) + elif tile_height < TILE_DIM: + shape[-2] = tile_height + else: + raise RuntimeError(f"Tile height {tile_height} is larger than max allowed TILE_DIM {TILE_DIM}") + + return shape, [] + + def parallelization(self, op_shape, fracture_factor): + return 
(op_shape.outputs[0].rt, op_shape.outputs[0].ct) + + def input_ublock_order(self, num_operands): + return None + + def execution_cycles(self, arch_name, op_model) -> int: + op_model_desc = op_model_to_desc("sqrt", arch_name, op_model) + + compiler_cache_cycles = get_compiler_cached_cycles(op_model_desc) + if compiler_cache_cycles is not None: + return compiler_cache_cycles + + use_legacy_path = bool(int(os.environ.get("PYBUDA_TEMP_ELT_UNARY_ESTIMATES_LEGACY", "0"))) + + if (use_legacy_path ): + tile_weight = get_op_model_param(op_model_desc, "tile_weight") + output_shape = op_model.op_shape.outputs[0] + num_tiles = (output_shape.z * output_shape.rt * output_shape.ct) / (op_model.grid_shape.r * op_model.grid_shape.c) + cycle_count = tile_weight * num_tiles + return min(int(cycle_count), 1 << 30) + + return get_op_model_execution_cycles(op_model_desc) + \ No newline at end of file diff --git a/pybuda/pybuda/op/eval/buda/tanh.py b/pybuda/pybuda/op/eval/buda/tanh.py new file mode 100644 index 000000000..403cec567 --- /dev/null +++ b/pybuda/pybuda/op/eval/buda/tanh.py @@ -0,0 +1,76 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +import os + +from ..interface import BudaEltwiseUnaryOp + +import torch +from pybuda.utils import align_up_tile, round_up_div +from .tm import eval as tm_eval +from pybuda.pybudaglobal import TILE_DIM +from pybuda._C.graph import UBlockOrder, Shape + + +class Tanh(BudaEltwiseUnaryOp): + @classmethod + def create(cls, vector=None): + self = cls("tanh") + if vector is not None: + self.set_buda_attr("vector", vector) + return self + + def eval(self, tensors): + assert len(tensors) == 1, "Tanh should have one input" + shape = tensors[0].shape + original_types = [o.dtype for o in tensors] + + ret = torch.tanh(tensors[0]) + + if ret.dtype != original_types[0]: + ret = ret.type(original_types[0]) + + return ret + + def shape(self, tensor_shapes, tile_height, tile_width): + assert len(tensor_shapes) == 1, "Log should have one input" + shape = tensor_shapes[0] + if tile_height == TILE_DIM: + shape[-2] = align_up_tile(shape[-2]) + elif tile_height < TILE_DIM: + shape[-2] = tile_height + else: + raise RuntimeError( + f"Tile height {tile_height} is larger than max allowed TILE_DIM {TILE_DIM}" + ) + + return shape, [] + + def parallelization(self, op_shape, fracture_factor): + return (op_shape.outputs[0].rt, op_shape.outputs[0].ct) + + def input_ublock_order(self, num_operands): + return None + + def execution_cycles(self, arch_name, op_model) -> int: + op_model_desc = op_model_to_desc("tanh", arch_name, op_model) + + compiler_cache_cycles = get_compiler_cached_cycles(op_model_desc) + if compiler_cache_cycles is not None: + return compiler_cache_cycles + + use_legacy_path = bool( + int(os.environ.get("PYBUDA_TEMP_ELT_UNARY_ESTIMATES_LEGACY", "0")) + ) + + if use_legacy_path: + tile_weight = get_op_model_param(op_model_desc, "tile_weight") + output_shape = op_model.op_shape.outputs[0] + num_tiles = (output_shape.z * output_shape.rt * output_shape.ct) / ( + op_model.grid_shape.r * op_model.grid_shape.c + ) + cycle_count = tile_weight * num_tiles + return min(int(cycle_count), 1 << 30) + + return get_op_model_execution_cycles(op_model_desc) diff --git a/pybuda/pybuda/op/eval/buda/tilizer.py b/pybuda/pybuda/op/eval/buda/tilizer.py new file mode 100644 index 000000000..c14218388 --- /dev/null +++ b/pybuda/pybuda/op/eval/buda/tilizer.py @@ -0,0 +1,71 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: 
Apache-2.0 + +import os + +from ..interface import BudaEltwiseUnaryOp + +import torch +from pybuda.utils import align_up_tile, round_up_div +from .tm import eval as tm_eval +from pybuda.pybudaglobal import TILE_DIM +from pybuda._C.graph import UBlockOrder, Shape + +class Tilizer(BudaEltwiseUnaryOp): + @classmethod + def create(cls): + self = cls("tilizer") + return self + + def eval(self, tensors): + assert len(tensors) == 1, "Tilizer should have one input" + shape = tensors[0].shape + original_types = [o.dtype for o in tensors] + ret = tensors[0] + if ret.dtype != original_types[0]: + ret = ret.type(original_types[0]) + + return ret + + def shape(self, tensor_shapes, tile_height, tile_width): + assert len(tensor_shapes) == 1, "Tilizer should have one input" + shape = tensor_shapes[0] + if tile_height == TILE_DIM: + shape[-2] = align_up_tile(shape[-2]) + elif tile_height < TILE_DIM: + shape[-2] = tile_height + else: + raise RuntimeError( + f"Tile height {tile_height} is larger than max allowed TILE_DIM {TILE_DIM}" + ) + + return shape, [] + + def parallelization(self, op_shape, fracture_factor): + return (op_shape.outputs[0].rt, op_shape.outputs[0].ct) + + def input_ublock_order(self, num_operands): + return [UBlockOrder.R] + + def execution_cycles(self, arch_name, op_model) -> int: + op_model_desc = op_model_to_desc("tilizer", arch_name, op_model) + + compiler_cache_cycles = get_compiler_cached_cycles(op_model_desc) + if compiler_cache_cycles is not None: + return compiler_cache_cycles + + use_legacy_path = bool( + int(os.environ.get("PYBUDA_TEMP_ELT_UNARY_ESTIMATES_LEGACY", "0")) + ) + + if use_legacy_path: + tile_weight = get_op_model_param(op_model_desc, "tile_weight") + output_shape = op_model.op_shape.outputs[0] + num_tiles = (output_shape.z * output_shape.rt * output_shape.ct) / ( + op_model.grid_shape.r * op_model.grid_shape.c + ) + cycle_count = tile_weight * num_tiles + return min(int(cycle_count), 1 << 30) + + return get_op_model_execution_cycles(op_model_desc) \ No newline at end of file diff --git a/pybuda/pybuda/op/eval/buda/tm.py b/pybuda/pybuda/op/eval/buda/tm.py index f80aecd29..187adaaa4 100644 --- a/pybuda/pybuda/op/eval/buda/tm.py +++ b/pybuda/pybuda/op/eval/buda/tm.py @@ -5,7 +5,6 @@ from ..common import to_torch_operands import torch import pybuda -import pybuda._C.balancer as balancer from pybuda.pybudaglobal import TILE_DIM from pybuda.utils import align_up_tile, round_up_div, align_up from ..sparse_utils import bcast_sparse_picker_matrix @@ -76,7 +75,7 @@ def eval(type, attr, ops): zero_shape[dim] = 1 zero_slice = torch.zeros(zero_shape, dtype=dtype).squeeze(dim) result = [] - for offset in range(0, t_ops[0].shape[dim], stride): + for offset in range(0, t_ops[0].shape[dim] - begin, stride): for i in range(begin, begin + length): if offset + i < t_ops[0].shape[dim] or stride == t_ops[0].shape[dim]: result.append(t_ops[0].select(dim, offset + i)) @@ -327,7 +326,7 @@ def shape(type, attr, ops, tile_height, tile_width): shape = ops[0] if dim < 0: dim += max(len(shape), 4) - shape[dim] = length * round_up_div(shape[dim], stride) + shape[dim] = length * round_up_div(shape[dim] - begin, stride) if dim >= 2: shape[dim] = align_up_tile(shape[dim]) return tuple(shape), [] diff --git a/pybuda/pybuda/op/eval/common.py b/pybuda/pybuda/op/eval/common.py index 7d3eb8938..501d92771 100644 --- a/pybuda/pybuda/op/eval/common.py +++ b/pybuda/pybuda/op/eval/common.py @@ -15,8 +15,6 @@ from collections import defaultdict from loguru import logger -from pybuda._C.backend_api import 
OpModelDesc -from pybuda._C.balancer import FusedSubOpModel, OpModel from ...pybudaglobal import TILE_DIM @@ -58,7 +56,7 @@ def cast_for_cpu_eval(t_ops, op_name=None): t_ops[index] = op.to(torch.float32) if op.dtype == torch.int8: t_ops[index] = op.to(torch.float32) - if op_name == "matmul": + if op_name == "matmul" or op_name == "depthwise": original_type = torch.int32 elif op_name == "sparse_matmul": original_type = torch.int8 @@ -331,79 +329,132 @@ def data_format_to_int(df: DataFormat) -> int: return 11 raise RuntimeError(f"Unknown data format {df}") -def op_model_to_desc(type: str, arch_name: str, op_model: OpModel, sub_op_model: FusedSubOpModel = None) -> OpModelDesc: - desc = OpModelDesc() - desc.arch = arch_name - desc.data_format = op_model.data_format - desc.math_fidelity = op_model.math_fidelity() - desc.t = op_model.output_buffers[0].block_shape.t - - if op_model.op_type() == "fused_op": - desc.type = sub_op_model.type - desc.mblock_m = sub_op_model.mblock_m - desc.mblock_n = sub_op_model.mblock_n - desc.ublock_rt = sub_op_model.ublock_rt - desc.ublock_ct = sub_op_model.ublock_ct - - if (desc.type == "matmul"): - desc.mblock_k = sub_op_model.mblock_k - desc.ublock_kt = sub_op_model.ublock_kt - elif (desc.type == "reduce"): - desc.op_attr = sub_op_model.reduce_dim - else: - desc.type = type - desc.mblock_m = op_model.output_buffers[0].block_shape.mblock_m - desc.mblock_n = op_model.output_buffers[0].block_shape.mblock_n - desc.ublock_rt = op_model.output_buffers[0].block_shape.ublock.rt - desc.ublock_ct = op_model.output_buffers[0].block_shape.ublock.ct - - if type == "matmul": - if op_model.is_sparse_matmul: - desc.ublock_kt = op_model.input_buffers[1].block_shape.ublock.rt - desc.mblock_k = op_model.op_shape.inputs[1].rt // desc.ublock_kt - desc.sparse_indices = op_model.sparse_indices - if os.environ.get("PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES", False): - desc.sparse_nz_ublocks = op_model.nz_ublocks - desc.sparse_nz_strips = op_model.nz_strips - - # op model descriptor assumes grid_size [1, 1], so we need to scale down the number of - # sparse tiles, ublocks and strips to what is expected to end up on a single core - if os.environ.get("PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS", False): - if op_model.nz_tiles > 1: - desc.sparse_indices = max(op_model.nz_tiles // op_model.grid_shape.r, 1) - else: - desc.sparse_indices = op_model.nz_tiles - - if op_model.nz_ublocks > 1: - desc.sparse_nz_ublocks = max(op_model.nz_ublocks // op_model.grid_shape.r, 1) - - if op_model.nz_strips > 1: - desc.sparse_nz_strips = max(op_model.nz_strips // op_model.grid_shape.r, 1) - else: - # old sparse estimates - if os.environ.get("PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS", False): - if op_model.sparse_indices > 1: - desc.sparse_indices = max(op_model.sparse_indices // op_model.grid_shape.r, 1) - else: - desc.ublock_kt = op_model.input_buffers[0].block_shape.ublock.ct - desc.mblock_k = op_model.op_shape.inputs[0].ct // desc.ublock_kt - - if type == "depthwise": - desc.mblock_k = op_model.op_shape.inputs[1].rt - desc.ublock_kt = 1 - - desc.op_attr = op_model.get_reduce_dim() - # desc.op_attr is only used to capture the dim of reduce - ideally, we should support tt::BudaOpAttrs in - # tt_op_model_desc - when we do, uncomment the line below - # desc.op_attr = op_model.buda_op_attrs() - - # If reduce_z, we manually copy the "z" param to special field in tt_op_model_desc - we should pass all buda attrs - if type == "reduce" and op_model.buda_op_attrs()["dim"] == "z": - desc.reduce_z = 
op_model.buda_op_attrs()["z"] - - desc.approx_mode = "PYBUDA_EXP_APPROX" in os.environ - - return desc +# def op_model_to_desc( +# type: str, +# arch_name: str, +# op_model: OpModel, +# sub_op_model: FusedSubOpModel = None, +# sparse_grid_row=-1, +# ) -> OpModelDesc: + +# desc = OpModelDesc() +# desc.arch = arch_name +# desc.data_format = op_model.data_format +# desc.math_fidelity = op_model.math_fidelity() +# desc.t = op_model.output_buffers[0].block_shape.t +# desc.approx_mode = False + +# if op_model.op_type() == "fused_op": +# desc.type = sub_op_model.type +# desc.mblock_m = sub_op_model.mblock_m +# desc.mblock_n = sub_op_model.mblock_n +# desc.ublock_rt = sub_op_model.ublock_rt +# desc.ublock_ct = sub_op_model.ublock_ct + +# if (desc.type == "matmul"): +# desc.mblock_k = sub_op_model.mblock_k +# desc.ublock_kt = sub_op_model.ublock_kt +# elif (desc.type == "reduce"): +# desc.op_attr = sub_op_model.reduce_dim + +# desc.approx_mode = "PYBUDA_EXP_APPROX" in os.environ +# else: +# desc.type = type +# desc.mblock_m = op_model.output_buffers[0].block_shape.mblock_m +# desc.mblock_n = op_model.output_buffers[0].block_shape.mblock_n +# desc.ublock_rt = op_model.output_buffers[0].block_shape.ublock.rt +# desc.ublock_ct = op_model.output_buffers[0].block_shape.ublock.ct + +# if type == "matmul": +# if op_model.is_sparse_matmul: +# desc.ublock_kt = op_model.input_buffers[1].block_shape.ublock.rt +# desc.mblock_k = op_model.op_shape.inputs[1].rt // desc.ublock_kt +# desc.sparse_indices = op_model.sparse_indices +# scale_sparse_args = bool(int(os.environ.get("PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS", True))) +# if bool(int(os.environ.get("PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES", True))): +# sparse_metadata = op_model.get_sparse_metadata() +# desc.sparse_indices = sum(sparse_metadata.nz_tiles) +# desc.sparse_nz_ublocks = sum(sparse_metadata.nz_ublocks) +# desc.sparse_nz_strips = sum(sparse_metadata.nz_strips) + +# # Op model descriptor assumes grid_size [1, 1], so we need to scale down the parameters to what is +# # expected to end up on a single core. Initially, we did this by averaging the parameters with the +# # number of cores. However, not all the cores perform the same amount of work, so we need to +# # calculate parameters per core. We keep both of these modes in this transition period. +# # +# # PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS (scale_sparse_args) must be set to true to enable any of +# # the mentioned modes. 
+# # +# # Mode 1: +# # Average the parameters (by default) +# # Mode 2: +# # Scale the parameters by the number of cores (needs the env var +# # "PYBUDA_TEMP_SPARSE_ESTIMATE_ARGS_PER_CORE" to be set to true) +# # +# if scale_sparse_args: +# per_core_mode = os.environ.get("PYBUDA_TEMP_SPARSE_ESTIMATE_ARGS_PER_CORE", False) +# if not per_core_mode: +# # Average mode +# # +# nz_tiles = sum(sparse_metadata.nz_tiles) +# nz_ublocks = sum(sparse_metadata.nz_ublocks) +# nz_strips = sum(sparse_metadata.nz_strips) + +# if nz_tiles > 1: +# desc.sparse_indices = max(nz_tiles // op_model.grid_shape.r, 1) +# else: +# desc.sparse_indices = nz_tiles + +# if nz_ublocks > 1: +# desc.sparse_nz_ublocks = max(nz_ublocks // op_model.grid_shape.r, 1) + +# if nz_strips > 1: +# desc.sparse_nz_strips = max(nz_strips // op_model.grid_shape.r, 1) +# else: +# # Per core mode +# # +# assert sparse_grid_row != -1 # Must provide which row of cores we're fetching the estimates for +# desc.sparse_indices = sparse_metadata.nz_tiles[sparse_grid_row] +# desc.sparse_nz_ublocks = sparse_metadata.nz_ublocks[sparse_grid_row] +# desc.sparse_nz_strips = sparse_metadata.nz_strips[sparse_grid_row] +# else: +# # old sparse estimates +# if scale_sparse_args: +# if op_model.sparse_indices > 1: +# desc.sparse_indices = max(op_model.sparse_indices // op_model.grid_shape.r, 1) +# else: +# desc.ublock_kt = op_model.input_buffers[0].block_shape.ublock.ct +# desc.mblock_k = op_model.op_shape.inputs[0].ct // desc.ublock_kt + +# # requant/dequant part of matmul is calculated separately for now, and we need to pass +# # matmul output format here +# if "requant" in op_model.buda_op_attrs() or "dequant" in op_model.buda_op_attrs(): +# desc.data_format = DataFormat.Int32 + +# if type == "depthwise": +# desc.mblock_k = op_model.op_shape.inputs[1].rt +# desc.ublock_kt = 1 +# if type == "maximum": +# if arch_name == "blackhole": +# desc.version = 1 +# else: +# desc.version = 2 + +# desc.op_attr = op_model.get_reduce_dim() +# # desc.op_attr is only used to capture the dim of reduce - ideally, we should support tt::BudaOpAttrs in +# # tt_op_model_desc - when we do, uncomment the line below +# # desc.op_attr = op_model.buda_op_attrs() + +# # If reduce_z, we manually copy the "z" param to special field in tt_op_model_desc - we should pass all buda attrs +# if type == "reduce" and op_model.buda_op_attrs()["dim"] == "z": +# desc.reduce_z = op_model.buda_op_attrs()["z"] + +# attrs = op_model.buda_op_attrs() +# # If the attributes contain approximate mode set it. 
+# if 'approximate_mode' in attrs: +# desc.approx_mode = attrs['approximate_mode'] == 'true' + +# return desc def calculate_tile_size(val): # We might not even care about large dim size @@ -434,30 +485,30 @@ def calculate_tile_size(val): # Global compiler cache g_compiler_perf_cache : defaultdict = defaultdict(dict) -def get_compiler_cached_cycles(desc: OpModelDesc) -> int: - global g_compiler_perf_cache +# def get_compiler_cached_cycles(desc: OpModelDesc) -> int: +# global g_compiler_perf_cache - if not g_compiler_perf_cache: - cache_file = os.environ.get("PYBUDA_COMPILER_CACHE", None) - if cache_file is not None and os.path.exists(cache_file): - with open(os.environ["PYBUDA_COMPILER_CACHE"], 'rb') as file: - import pickle - g_compiler_perf_cache = pickle.load(file) - else: - return None +# if not g_compiler_perf_cache: +# cache_file = os.environ.get("PYBUDA_COMPILER_CACHE", None) +# if cache_file is not None and os.path.exists(cache_file): +# with open(os.environ["PYBUDA_COMPILER_CACHE"], 'rb') as file: +# import pickle +# g_compiler_perf_cache = pickle.load(file) +# else: +# return None - cached_op_model = g_compiler_perf_cache["op_model"] +# cached_op_model = g_compiler_perf_cache["op_model"] - if desc.type in cached_op_model: - cache_cycles = cached_op_model[desc.type] - shapes = (desc.mblock_m, desc.mblock_n, desc.ublock_rt, desc.ublock_ct, desc.t) +# if desc.type in cached_op_model: +# cache_cycles = cached_op_model[desc.type] +# shapes = (desc.mblock_m, desc.mblock_n, desc.ublock_rt, desc.ublock_ct, desc.t) - if desc.type == 'matmul': # append k dim to lookup - shapes = shapes + (desc.mblock_k, desc.ublock_kt) +# if desc.type == 'matmul': # append k dim to lookup +# shapes = shapes + (desc.mblock_k, desc.ublock_kt) - if shapes in cache_cycles: - cycle_count = cache_cycles[shapes] - # print(f"Using recorded cycle count for {desc.type} of shapes {shapes} -> {cycle_count}") - return cycle_count +# if shapes in cache_cycles: +# cycle_count = cache_cycles[shapes] +# # print(f"Using recorded cycle count for {desc.type} of shapes {shapes} -> {cycle_count}") +# return cycle_count - return None +# return None diff --git a/pybuda/pybuda/op/eval/interface.py b/pybuda/pybuda/op/eval/interface.py index 809ff6c0b..2a43b09ab 100644 --- a/pybuda/pybuda/op/eval/interface.py +++ b/pybuda/pybuda/op/eval/interface.py @@ -3,7 +3,6 @@ # SPDX-License-Identifier: Apache-2.0 import torch from typing import List, Tuple, Dict, Union, Optional -from pybuda._C.balancer import OpModel, OpShape from pybuda._C.graph import NodeContext, OpType from pybuda._C.passes import LoweringContext, DecomposingContext from pybuda._C.autograd import AutogradContext @@ -218,13 +217,13 @@ def shape( ) -> Tuple[Tuple[int], List[int]]: raise NotImplemented() - def parallelization(self, op_shape: OpShape, fracture_factor: int) -> Tuple[int]: + def parallelization(self, op_shape, fracture_factor) -> Tuple[int]: raise NotImplemented() def input_ublock_order(self, num_tensors: int): raise NotImplemented() - def execution_cycles(self, arch_name: str, op_model: OpModel) -> int: + def execution_cycles(self, arch_name, op_model) -> int: raise NotImplemented() def is_tm(self) -> bool: diff --git a/pybuda/pybuda/op/eval/pybuda/__init__.py b/pybuda/pybuda/op/eval/pybuda/__init__.py index c58df8fdf..b1371bec7 100644 --- a/pybuda/pybuda/op/eval/pybuda/__init__.py +++ b/pybuda/pybuda/op/eval/pybuda/__init__.py @@ -5,6 +5,20 @@ from types import ModuleType from functools import lru_cache from .transpose import TransposeTM +from .exp import Exp 
+from .cosine import Cosine +from .ethernet_datacopy import EthernetDatacopy +from .reciprocal import Reciprocal +from .abs import Abs +from .tanh import Tanh +from .log import Log +from .nop import Nop +from .buffer import Buffer +from .sqrt import Sqrt +from .tilizer import Tilizer +from .clip import Clip +from .cumulativesum import CumulativeSum +from .argmax import Argmax op_to_module_map = { "add": "eltwise_binary", @@ -24,29 +38,30 @@ "not_equal": "eltwise_binary", "logical_and": "eltwise_binary", - "nop": "eltwise_unary", - "buffer": "eltwise_unary", - "exp": "eltwise_unary", - "reciprocal": "eltwise_unary", - "sqrt": "eltwise_unary", + "exp": Exp, + "reciprocal": Reciprocal, + "nop": Nop, + "buffer": Buffer, + "sqrt": Sqrt, "relu": "eltwise_unary", "leaky_relu": "eltwise_unary", "gelu": "eltwise_unary", "gelu_derivative": "eltwise_unary", - "log": "eltwise_unary", + "log": Log, "sigmoid": "eltwise_unary", - "clip": "eltwise_unary", - "abs": "eltwise_unary", - "cosine": "eltwise_unary", + "clip": Clip, + "cosine": Cosine, + "abs": Abs, "sine": "eltwise_unary", "tile_broadcast": "eltwise_unary", - "argmax": "eltwise_unary", - "tanh": "eltwise_unary", - "cumsum": "eltwise_unary", + "tanh": Tanh, + "cumsum": CumulativeSum, + "argmax": Argmax, "logical_not": "eltwise_unary", "dropout": "eltwise_unary", "pow": "eltwise_unary", - "tilizer": "eltwise_unary", + "tilizer": Tilizer, + "erf": "eltwise_unary", "conv_sum": "eltwise_nary", "concatenate": "eltwise_nary", @@ -61,6 +76,7 @@ "depthwise": "depthwise", "embedding": "embedding", + "ethernet_datacopy": EthernetDatacopy, "transpose": TransposeTM, "adv_index": "tm", @@ -117,12 +133,14 @@ "mask": "mask", "layernorm": "nn", "layernorm_bw": "nn", + "batchnorm" : "nn", "quantize" : "quantize", "buda_quantize" : "quantize", "dequantize" : "quantize", "requantize" : "quantize", "buda_requantize" : "quantize", + "buda_dequantize" : "quantize", } diff --git a/pybuda/pybuda/op/eval/pybuda/abs.py b/pybuda/pybuda/op/eval/pybuda/abs.py new file mode 100644 index 000000000..426f02057 --- /dev/null +++ b/pybuda/pybuda/op/eval/pybuda/abs.py @@ -0,0 +1,73 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +import os + +import torch +import torch.nn.functional +from ..interface import PyEltwiseUnaryOp +from loguru import logger +from ..common import to_torch_operands +from ....pybudaglobal import TILE_DIM +from ....tensor import buda_dataformat_to_pytorch_dtype +import numpy as np +from pybuda.op.eval.common import calculate_tile_size +from ..buda.abs import Abs as BudaAbs + + +class Abs(PyEltwiseUnaryOp): + @classmethod + def create(cls): + self = cls("abs") + return self + + def eval(self, tensors): + assert len(tensors) == 1, "Abs should have one input" + shape = tensors[0].shape + original_types = [o.dtype for o in tensors] + ret = torch.abs(tensors[0]) + + if ret.dtype != original_types[0]: + ret = ret.type(original_types[0]) + + return ret + + def shape(self, tensor_shapes): + assert len(tensor_shapes) == 1, "Abs should have one input" + shape = tensor_shapes[0] + return shape, [] + + def backward(self, ac, operand, inputs, output, grad): + assert len(inputs) == 1, "Abs should have one input" + assert operand == 0, "Invalid operand index" + heaviside = ac.op("heaviside", (inputs[0], ac.constant(0.5))) + subtract = ac.op("subtract", (heaviside, ac.constant(0.5))) + stretched = ac.op("multiply", (subtract, ac.constant(2.0))) + return ac.op("multiply", (stretched, grad)) + + def lower(self, lc, tensors, 
outputs): + assert len(tensors) == 1, "Abs should have one input" + + if bool(int(os.environ.get("PYBUDA_ENABLE_TINY_TILE", "0"))): + node_shape = list(tensors[0].shape) + tile_height = calculate_tile_size(node_shape[-2]) + tile_width = calculate_tile_size(node_shape[-1]) + vector = "" if tile_height == TILE_DIM else "r" + else: + vector = None + tile_height, tile_width = TILE_DIM, TILE_DIM + + lc.op( + BudaAbs.create(vector=vector), + tensors, + tile_height=tile_height, + tile_width=tile_width, + ) + + def initial_flops_estimate(self, tensor_shapes): + flops = 0 + output_shape = self.shape(tensor_shapes)[0] + flops = np.prod(output_shape) + + return flops diff --git a/pybuda/pybuda/op/eval/pybuda/argmax.py b/pybuda/pybuda/op/eval/pybuda/argmax.py new file mode 100644 index 000000000..e5607e45f --- /dev/null +++ b/pybuda/pybuda/op/eval/pybuda/argmax.py @@ -0,0 +1,113 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +import os + +import torch +import torch.nn.functional +from ..interface import PyEltwiseUnaryOp +from loguru import logger +from ..common import to_torch_operands +from ....pybudaglobal import TILE_DIM +from ....tensor import buda_dataformat_to_pytorch_dtype +import numpy as np +from pybuda.op.eval.common import calculate_tile_size +from ..buda.abs import Abs as BudaAbs + + +class Argmax(PyEltwiseUnaryOp): + @classmethod + def create(cls, dim=None): + self = cls("argmax") + self.dim = dim + + return self + + def eval(self, tensors): + assert len(tensors) == 1, "Argmax should have one input" + shape = tensors[0].shape + original_types = [o.dtype for o in tensors] + + if hasattr(self, 'dim'): + dim=self.dim + else: + dim=None + + ret = torch.argmax(tensors[0], dim, keepdims=True) + + if ret.dtype != original_types[0]: + ret = ret.type(original_types[0]) + + return ret + + def shape(self, tensor_shapes): + assert len(tensor_shapes) == 1, "Argmax should have one input" + + if hasattr(self, 'dim'): + dim = self.dim + else: + dim = None + + if dim is not None: + shape = list(tensor_shapes[0]) + shape[dim] = 1 + else: + shape = [1] * len(tensor_shapes[0]) + return tuple(shape), [] + + def backward(self, ac, operand, inputs, output, grad): + raise RuntimeError( + "Argmax does not require grad and does not have a backwards function" + ) + + def decompose(self, dc, inputs): + inp_node = inputs[0] + + if hasattr(self, 'dim'): + axis = self.dim + else: + axis=None + + if axis is None: + import math + + inp_node = dc.op( + "reshape", [inp_node], (1, math.prod(inp_node.shape.as_list())) + ) + axis = -1 + + input_shape = inp_node.shape.as_list() + if axis >= 0: + axis -= len(input_shape) + + data_type = buda_dataformat_to_pytorch_dtype(inp_node.output_df) + range_shape = [ + dim if i == axis + len(input_shape) else 1 + for i, dim in enumerate(input_shape) + ] + + range = torch.arange(input_shape[axis], dtype=data_type).reshape(range_shape) + range_tensor = dc.tensor(range) + + factor = torch.ones((input_shape), dtype=data_type) * 1e10 + factor_tensor = dc.tensor(factor) + + mult_1 = dc.op( + "multiply", + [inp_node, factor_tensor], + ) + softmax = dc.op("softmax", [mult_1], (axis, 1)) + mult_2 = dc.op("multiply", [softmax, range_tensor]) + reduce_sum = dc.op("reduce_sum", [mult_2], (axis,)) + dc.fuse(reduce_sum) + + def lower(self, lc, tensors, outputs): + return None + + def initial_flops_estimate(self, tensor_shapes): + flops = 0 + output_shape = self.shape(tensor_shapes)[0] + flops = np.prod(output_shape) + + return flops diff --git 
a/pybuda/pybuda/op/eval/pybuda/buffer.py b/pybuda/pybuda/op/eval/pybuda/buffer.py new file mode 100644 index 000000000..8d275fbf4 --- /dev/null +++ b/pybuda/pybuda/op/eval/pybuda/buffer.py @@ -0,0 +1,59 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +import os + +import torch +import torch.nn.functional +from ..interface import PyEltwiseUnaryOp +from loguru import logger +from ..common import to_torch_operands +from ....pybudaglobal import TILE_DIM +import numpy as np +from pybuda.op.eval.common import calculate_tile_size +from ..buda.buffer import Buffer as BudaBuffer + + +class Buffer(PyEltwiseUnaryOp): + @classmethod + def create(cls): + self = cls("buffer") + return self + + def eval(self, tensors): + assert len(tensors) == 1, "buffer should have one input" + shape = tensors[0].shape + original_types = [o.dtype for o in tensors] + ret = tensors[0] + + if ret.dtype != original_types[0]: + ret = ret.type(original_types[0]) + return ret + + def shape(self, tensor_shapes): + assert len(tensor_shapes) == 1, "Eltwise unary should have one input" + shape = tensor_shapes[0] + return shape, [] + + def backward(self, ac, operand, inputs, output, grad): + assert len(inputs) == 1, "Buffer should have one input" + assert operand == 0, "Invalid operand index" + return ac.op( + Buffer.create(), + [grad], + ) + + def lower(self, lc, tensors, outputs): + assert len(tensors) == 1, "Buffer should have one input" + + if bool(int(os.environ.get("PYBUDA_ENABLE_TINY_TILE", "0"))): + node_shape = list(tensors[0].shape) + tile_height = calculate_tile_size(node_shape[-2]) + tile_width = calculate_tile_size(node_shape[-1]) + else: + tile_height, tile_width = TILE_DIM, TILE_DIM + + lc.op( + BudaBuffer.create(), tensors, tile_height=tile_height, tile_width=tile_width + ) diff --git a/pybuda/pybuda/op/eval/pybuda/clip.py b/pybuda/pybuda/op/eval/pybuda/clip.py new file mode 100644 index 000000000..b71921f43 --- /dev/null +++ b/pybuda/pybuda/op/eval/pybuda/clip.py @@ -0,0 +1,114 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +import os + +import torch +import torch.nn.functional +from ..interface import PyEltwiseUnaryOp +from loguru import logger +from ..common import to_torch_operands +from ....pybudaglobal import TILE_DIM +from ....tensor import buda_dataformat_to_pytorch_dtype +import numpy as np +from pybuda.op.eval.common import calculate_tile_size +from ..buda.nop import Nop as BudaNop + + +class Clip(PyEltwiseUnaryOp): + @classmethod + def create(cls, min=float('-inf'), max=float('inf')): + self = cls("clip") + self.min = min + self.max = max + + return self + + def eval(self, tensors): + assert len(tensors) == 1, "Clip should have one input" + shape = tensors[0].shape + original_types = [o.dtype for o in tensors] + ret = torch.clip(tensors[0], min=self.min, max=self.max) + + if ret.dtype != original_types[0]: + ret = ret.type(original_types[0]) + + return ret + + def shape(self, tensor_shapes): + assert len(tensor_shapes) == 1, "Clip should have one input" + shape = tensor_shapes[0] + return shape, [] + + def backward(self, ac, operand, inputs, output, grad): + assert len(inputs) == 1, "Clip should have one input" + assert operand == 0, "Invalid operand index" + heaviside = ac.op("heaviside", (inputs[0], ac.constant(0.5))) + subtract = ac.op("subtract", (heaviside, ac.constant(0.5))) + stretched = ac.op("multiply", (subtract, ac.constant(2.0))) + return ac.op("multiply", (stretched, grad)) + + def 
lower(self, lc, tensors, outputs): + assert len(tensors) == 1, "Clip should have one input" + + min_value = self.min + max_value = self.max + + # Inf protection + if max_value > 65504.0: + max_value = 65504.0 + + if (min_value == 0) and (max_value >= 0): + res = lc.op(BudaNop.create(relu_en=True, relu_threshold=max_value, relu_mode="max"),(tensors[0],)) + return + + shape = list(tensors[0].shape.as_list()) + # Align up to tile + shape[-2] = ((shape[-2] - 1) // TILE_DIM + 1) * TILE_DIM + shape[-1] = ((shape[-1] - 1) // TILE_DIM + 1) * TILE_DIM + # Align up to 4 dimensions + if len(shape) > 4: + raise RuntimeError( + "Operator clip, operand must have number of dimensions less or equal to 4. " + ) + if len(shape) < 4: + shape = (4 - len(shape)) * [1] + shape + + min_value_tensor = lc.tensor(torch.zeros(shape) + min_value) + max_value_tensor = lc.tensor(torch.zeros(shape) + max_value) + diff_tensor = lc.tensor(torch.zeros(shape) + max_value - min_value) + + # General Formula/Algorithm + # y = ReLU(x - min_value) + min_value + # y = ReLU(0.0 - y + max_value) - max_value + # y = 0.0 - y + + res = lc.op("subtract", (tensors[0], min_value_tensor)) + # x - min_value + res = lc.op(BudaNop.create(relu_en=True, relu_threshold=0.0 ,relu_mode="min"),(res,)) + + # ReLU(x - min_value) + res = lc.op("subtract", (diff_tensor, res)) + # diff_value - ReLU(x - min_value), diff = max - min + res = lc.op( + BudaNop.create(relu_en=True, relu_threshold=0.0,relu_mode="min"), + (res,)) + + # ReLU(diff_value - ReLU(x - min_value)) + lc.op("subtract", (max_value_tensor, res)) + # max_value - ReLU(diff_value - ReLU(x - min_value)) + + def backward(self, ac, operand, inputs, output, grad): + x = inputs[0] + shape = x.shape.as_list() + min_value = self.min + max_value = self.max + min_value_tensor = ac.tensor(torch.zeros(shape) + min_value) + max_value_tensor = ac.tensor(torch.zeros(shape) + max_value) + + ge_x = ac.op("greater_equal", (x, min_value_tensor)) + le_x = ac.op("less_equal", (x, max_value_tensor)) + mask = ac.op("multiply", (ge_x, le_x)) + res = ac.op("multiply", (mask, grad)) + return res diff --git a/pybuda/pybuda/op/eval/pybuda/convolution.py b/pybuda/pybuda/op/eval/pybuda/convolution.py index 4325cfcd7..476a87fd2 100644 --- a/pybuda/pybuda/op/eval/pybuda/convolution.py +++ b/pybuda/pybuda/op/eval/pybuda/convolution.py @@ -11,6 +11,7 @@ from pybuda import Tensor from pybuda.config import _get_global_compiler_config from .transpose import TransposeTM +from .buffer import Buffer from ..common import to_torch_operands from ..sparse_utils import ( @@ -165,7 +166,7 @@ def decompose_fracture_conv2d_at_op_level(attr, dc, inputs): # Fracture convs into multiple ops fractured_convs = [] for curr_kH in range(kH): - fractured_weights = dc.op("index", [weights], (-2, curr_kH, curr_kH + 1, 1), dont_decompose=True) + fractured_weights = dc.op("index", [weights], (-2, curr_kH, curr_kH + 1, 1), dont_decompose=True, output_df=weights.output_df) fractured_conv_operands = [activations, fractured_weights] if curr_kH == 0 and bias: fractured_conv_operands.append(bias) @@ -182,7 +183,7 @@ def decompose_fracture_conv2d_at_op_level(attr, dc, inputs): while len(fractured_convs) > 1: left = fractured_convs.pop(0) right = fractured_convs.pop(0) - result = dc.op("add", [left, right]) + result = dc.op("add", [left, right], output_df=left.output_df) fractured_convs.append(result) dc.fuse(fractured_convs[0]) @@ -244,7 +245,10 @@ def rotate_convtranspose2d_weights(dc, weights, cin, cout, depthwise, groups, kH weights = dc.op("hslice", 
[weights], (cout,)) weights = dc.op(TransposeTM.create(2, 3), [weights]) # Transpose weight # Reshape into conv2d weight shape - weights = dc.op("reshape", [weights], (cout, cin, kH, kW)) + if depthwise: + weights = dc.op("reshape", [weights], (cin, cout // groups, kH, kW)) + else: + weights = dc.op("reshape", [weights], (cout // groups, cin, kH, kW)) return weights @@ -279,10 +283,10 @@ def decompose_conv2d_sparse_first(attr, dc, inputs): depthwise = depthwise and not dc.is_training_enabled() and not is_convtranspose2d if channel_last: - activations = dc.op("reshape", [activations], (w, 1, y * x, cin)) + activations = dc.op("reshape", [activations], (w, 1, y * x, cin), output_df=activations.output_df) else: - activations = dc.op("reshape", [activations], (w, 1, cin, y * x)) - activations = dc.op(TransposeTM.create(2, 3), [activations]) + activations = dc.op("reshape", [activations], (w, 1, cin, y * x), output_df=activations.output_df) + activations = dc.op(TransposeTM.create(2, 3), [activations], output_df=activations.output_df) weights = transform_weights_for_conv2d(dc, weights, cin, cout, depthwise, groups, kH, kW) @@ -295,8 +299,8 @@ def decompose_conv2d_sparse_first(attr, dc, inputs): xout = xout_transpose result = activations - result = dc.op("pad_tile", [result], (-1, result.shape[3])) - result = dc.op("pad_tile", [result], (-2, result.shape[2])) + result = dc.op("pad_tile", [result], (-1, result.shape[3]), output_df=result.output_df) + result = dc.op("pad_tile", [result], (-2, result.shape[2]), output_df=result.output_df) padding_same = (padding == [(kW // 2), (kW // 2), (kH // 2), (kH // 2)]) pad_for_factorization = False @@ -318,7 +322,7 @@ def decompose_conv2d_sparse_first(attr, dc, inputs): if dense_c in sparse_weight_padding_concat: padded_dense_c = sparse_weight_padding_concat[dense_c] pad_dense_c = padded_dense_c - dense_c - result = dc.op("pad", [result], (0, pad_dense_c*32, 0, False)) + result = dc.op("pad", [result], (0, pad_dense_c*32, 0, False), output_df=result.output_df) else: # efficientnet-lite lite1 variant padded_dense_c = sparse_weight_padding_mm[dense_c] index = torch.arange(result.shape[-1]).tolist() @@ -361,10 +365,10 @@ def decompose_conv2d_sparse_first(attr, dc, inputs): picker = torch.sparse.mm(picker, transpose_tensor) pickers.append(picker) sparse = dc.tensor(torch.stack(pickers).unsqueeze(0)) - result = dc.op("sparse_matmul", [sparse, result]) + result = dc.op("sparse_matmul", [sparse, result], output_df=result.output_df) if kH * kW > 1: - result = dc.op("hstack", [result], (kH * kW,)) + result = dc.op("hstack", [result], (kH * kW,), output_df=result.output_df) if depthwise: result = dc.op("depthwise", [result, weights], (kH * kW,)) @@ -383,8 +387,8 @@ def decompose_conv2d_sparse_first(attr, dc, inputs): else: result = dc.op("select", [result], (-2, 0, sparse_r * 32, sparse.shape[-2])) - result = dc.op("narrow", [result], (3, 0, cout, result.shape[3])) - result = dc.op("narrow", [result], (2, 0, yout * xout, result.shape[2])) + result = dc.op("narrow", [result], (3, 0, cout, result.shape[3]), output_df=result.output_df) + result = dc.op("narrow", [result], (2, 0, yout * xout, result.shape[2]), output_df=result.output_df) if bias is not None: bias_y, bias_x = 1, 1 @@ -452,14 +456,14 @@ def decompose_conv2d_sparse_second(attr, dc, inputs): v_slice_factor = result.shape[-2] // TILE_DIM if v_slice_factor > 1: result = dc.op("vslice", [result], (v_slice_factor,)) - result = dc.op("buffer", [result]) # HW workaround for: tenstorrent/budabackend#656 + result = 
dc.op(Buffer.create(), [result]) # HW workaround for: tenstorrent/budabackend#656 if (kH * kW) > 1: result = dc.op("hslice", [result], (kH * kW,)) if "PYBUDA_MIDDLE_CNN_BUFFER" in os.environ: # most workloads are ok without it, and perf is much better... so enable only where needed - result = dc.op("buffer", [result]) # HW workaround for: tenstorrent/budabackend#656 + result = dc.op(Buffer.create(), [result]) # HW workaround for: tenstorrent/budabackend#656 result = dc.op("vstack", [result], (kH * kW,)) - result = dc.op("buffer", [result]) # HW workaround for: tenstorrent/budabackend#656 + result = dc.op(Buffer.create(), [result]) # HW workaround for: tenstorrent/budabackend#656 if v_slice_factor > 1: result = dc.op("vstack", [result], (v_slice_factor,)) @@ -693,7 +697,7 @@ def eval(type, attr, ops): # from the network, and thus grad will not be generated for it on the .backwards() calls tconv = torch.nn.ConvTranspose2d( activations.shape[1], - weights.shape[1], + weights.shape[1] * groups, kernel_size=weights.shape[2], stride=stride, padding=padding[0], @@ -768,6 +772,7 @@ def decompose(type, attr, dc, inputs): # We allow prestriding only in inference mode currently compiler_cfg = dc.get_compiler_cfg() is_prestride_enabled = not dc.is_training_enabled() and compiler_cfg.enable_conv_prestride + if is_prestride_enabled and should_prestride(attr, dc, inputs): # Prestride decompose_conv2d_prestride(attr, dc, inputs) @@ -806,7 +811,7 @@ def decompose(type, attr, dc, inputs): w, cin, y, x = (activations.shape.w, activations.shape.z, activations.shape.r, activations.shape.c) _, _, yout, xout = conv2d_out_shape('conv2d_transpose', attr, [activations.shape, weights.shape])[0] - _, cout, kH, kW = (weights.shape.w, weights.shape.z, weights.shape.r, weights.shape.c) + _, cout, kH, kW = (weights.shape.w, weights.shape.z * groups, weights.shape.r, weights.shape.c) # Transform padding from convtranspose2d space to conv2d actual_padding = [ @@ -816,7 +821,6 @@ def decompose(type, attr, dc, inputs): dilation * (kH - 1) - padding[3], ] depthwise = (cin == groups) and (cout == cin) - assert depthwise == False or cin == 1, "Dont support depthwise Conv2d Transpose yet" # stride > 1 means we dilate the input activations if stride > 1: diff --git a/pybuda/pybuda/op/eval/pybuda/cosine.py b/pybuda/pybuda/op/eval/pybuda/cosine.py new file mode 100644 index 000000000..5d3a8061e --- /dev/null +++ b/pybuda/pybuda/op/eval/pybuda/cosine.py @@ -0,0 +1,68 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +import os + +import torch +import torch.nn.functional +from ..interface import PyEltwiseUnaryOp +from loguru import logger +from ..common import to_torch_operands +from ....pybudaglobal import TILE_DIM +from ....tensor import buda_dataformat_to_pytorch_dtype +import numpy as np +from pybuda.op.eval.common import calculate_tile_size +from ..buda.cosine import Cosine as BudaCosine + + +class Cosine(PyEltwiseUnaryOp): + @classmethod + def create(cls): + self = cls("cosine") + return self + + def eval(self, tensors): + assert len(tensors) == 1, "Cosine should have one input" + shape = tensors[0].shape + original_types = [o.dtype for o in tensors] + + ret = torch.cos(tensors[0]) + + if ret.dtype != original_types[0]: + ret = ret.type(original_types[0]) + + return ret + + def shape(self, tensor_shapes): + assert len(tensor_shapes) == 1, "Cosine should have one input" + shape = tensor_shapes[0] + return shape, [] + + def backward(self, ac, operand, inputs, output, grad): + 
assert False, f"Cosine not defined in eltwise unary backward." + + def lower(self, lc, tensors, outputs): + assert len(tensors) == 1, "Cosine should have one input" + if bool(int(os.environ.get("PYBUDA_ENABLE_TINY_TILE", "0"))): + node_shape = list(tensors[0].shape) + tile_height = calculate_tile_size(node_shape[-2]) + tile_width = calculate_tile_size(node_shape[-1]) + vector = "" if tile_height == TILE_DIM else "r" + else: + vector = None + tile_height, tile_width = TILE_DIM, TILE_DIM + + lc.op( + BudaCosine.create(vector=vector), + tensors, + tile_height=tile_height, + tile_width=tile_width, + ) + + def initial_flops_estimate(self, tensor_shapes): + flops = 0 + output_shape = self.shape(tensor_shapes)[0] + flops = np.prod(output_shape) + + return flops diff --git a/pybuda/pybuda/op/eval/pybuda/cumulativesum.py b/pybuda/pybuda/op/eval/pybuda/cumulativesum.py new file mode 100644 index 000000000..2599bb41a --- /dev/null +++ b/pybuda/pybuda/op/eval/pybuda/cumulativesum.py @@ -0,0 +1,62 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +import os + +import torch +import torch.nn.functional +from ..interface import PyEltwiseUnaryOp +from loguru import logger +from ..common import to_torch_operands +from ....pybudaglobal import TILE_DIM +from ....tensor import buda_dataformat_to_pytorch_dtype +import numpy as np +from pybuda.op.eval.common import calculate_tile_size +from ..buda.abs import Abs as BudaAbs +from .nop import Nop + + +class CumulativeSum(PyEltwiseUnaryOp): + @classmethod + def create(cls, axis, exclusive=False): + self = cls("cumsum") + self.axis = axis + self.exclusive = exclusive + return self + + def eval(self, tensors): + assert len(tensors) == 1, "Cumulative Sum should have one input" + shape = tensors[0].shape + original_types = [o.dtype for o in tensors] + ret = torch.cumsum(tensors[0], dim=self.axis) + + if ret.dtype != original_types[0]: + ret = ret.type(original_types[0]) + + return ret + + def shape(self, tensor_shapes): + assert len(tensor_shapes) == 1, "Cumulative Sum should have one input" + shape = tensor_shapes[0] + return shape, [] + + def backward(self, ac, operand, inputs, output, grad): + assert len(inputs) == 1, "Cumulative Sum should have one input" + assert operand == 0, "Invalid operand index" + dim = self.axis + assert ( + dim == 0 + ), "Unsupported dim different then 0 for cumulative sum backward pass" + if dim == 0: + return ac.op(Nop.create(), (grad,)) + + def lower(self, lc, tensors, outputs): + return None + + def initial_flops_estimate(self, tensor_shapes): + flops = 0 + output_shape = self.shape(tensor_shapes)[0] + flops = np.prod(output_shape) + + return flops diff --git a/pybuda/pybuda/op/eval/pybuda/depthwise.py b/pybuda/pybuda/op/eval/pybuda/depthwise.py index 8f274fce2..e77daaa50 100644 --- a/pybuda/pybuda/op/eval/pybuda/depthwise.py +++ b/pybuda/pybuda/op/eval/pybuda/depthwise.py @@ -4,7 +4,7 @@ import torch from pybuda.pybudaglobal import TILE_DIM -from ..common import to_torch_operands +from ..common import to_torch_operands, cast_for_cpu_eval def eval(type, attr, ops): @@ -13,6 +13,7 @@ def eval(type, attr, ops): assert len(attr) == 1, "Depthwise matmul should have one attribute" t_ops = to_torch_operands(*ops) + t_ops, original_type = cast_for_cpu_eval(t_ops, type) in0 = t_ops[0] in1 = t_ops[1] bias = t_ops[2] if len(t_ops) == 3 else None @@ -34,7 +35,7 @@ def eval(type, attr, ops): assert bias is None, "Unexpected fused bias in depthwise, can be added..." 
- return result + return result.to(original_type) def shape(type, attr, ops): diff --git a/pybuda/pybuda/op/eval/pybuda/dram_queue.py b/pybuda/pybuda/op/eval/pybuda/dram_queue.py index e9e7615df..b40b98e63 100644 --- a/pybuda/pybuda/op/eval/pybuda/dram_queue.py +++ b/pybuda/pybuda/op/eval/pybuda/dram_queue.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import torch import torch.nn.functional +from .buffer import Buffer from ..common import to_torch_operands @@ -23,4 +24,4 @@ def lower(type, attr, lc, ops, outputs): def backward(type, attr, ac, operand, inputs, output, grad): - return ac.op("buffer", (grad, )) + return ac.op(Buffer.create(), (grad, )) diff --git a/pybuda/pybuda/op/eval/pybuda/eltwise_binary.py b/pybuda/pybuda/op/eval/pybuda/eltwise_binary.py index 23765423f..f90ead13c 100644 --- a/pybuda/pybuda/op/eval/pybuda/eltwise_binary.py +++ b/pybuda/pybuda/op/eval/pybuda/eltwise_binary.py @@ -9,8 +9,12 @@ import numpy as np import torch from .transpose import TransposeTM - -import os +from ..buda.exp import Exp as BudaExp +from .reciprocal import Reciprocal +from .log import Log +from ..buda.log import Log as BudaLog +from .nop import Nop +from ..buda.nop import Nop as BudaNop from ..common import to_torch_operands from pybuda.utils import align_up_tile @@ -107,7 +111,8 @@ def lower(type, attr, lc, ops, outputs): A = ops[0] B = ops[1] in_shape = A.shape.as_list() - amplification = 1e4 + # we have to amplify the difference between A and B to make sure absolute diff is greater than 1. + amplification = 1e10 if len(in_shape) > 4: raise RuntimeError("Shape size is out of range.") @@ -127,9 +132,9 @@ def ge(A, B): # diff = (A - B) * amplifier diff_one = lc.op("add", (diff, one)) # diff + 1.0 - res = lc.op("nop", (diff_one, ), [], { "relu_en": True, "relu_threshold": 1.0, "relu_mode": "min" }) + res = lc.op(BudaNop.create(relu_en=True, relu_threshold=1.0, relu_mode="min" ), (diff_one, )) # res = ReLU(diff + 1.0, 1.0) - res = lc.op("nop", (res, ), [], { "relu_en": True, "relu_threshold": 1.0, "relu_mode": "max" }) + res = lc.op(BudaNop.create(relu_en=True, relu_threshold=1.0, relu_mode="max"), (res, )) # res = Inv_ReLU(res, 1.0) return res @@ -162,9 +167,10 @@ def eq(A, B): ne(A, B) elif type == "power": #lc.op("power_binary", ops, attr) # 'power' backend op is unary - ln_x = lc.op("log", [ops[0]]) + ln_x = lc.op(BudaLog.create(), [ops[0]]) y_ln_x = lc.op("multiply", (ops[1], ln_x)) - lc.op("exp", [y_ln_x], [], {"approximate_mode": "true" if "PYBUDA_EXP_APPROX" in os.environ else "false"}) + approximate_mode = "true" if "PYBUDA_EXP_APPROX" in os.environ else "false" + lc.op(BudaExp.create(approximate_mode=approximate_mode), [y_ln_x]) else: # Find proper tile sizes if bool(int(os.environ.get("PYBUDA_ENABLE_TINY_TILE", "0"))): @@ -178,11 +184,11 @@ def eq(A, B): ops1_dims = len(ops[1].shape) if ops0_dims == 5 and ops1_dims < 5: while ops1_dims < 5: - ops[1] = lc.op("nop", [ops[1]], ["unsqueeze", ops1_dims], {}, tag="dont_remove") + ops[1] = lc.op(BudaNop.create(unsqueeze = "unsqueeze", unsqueeze_dim=ops1_dims), [ops[1]], tag="dont_remove") ops1_dims += 1 elif ops1_dims == 5 and ops0_dims < 5: while ops0_dims < 5: - ops[0] = lc.op("nop", [ops[0]], ["unsqueeze", ops0_dims], {}, tag="dont_remove") + ops[0] = lc.op(BudaNop.create(unsqueeze = "unsqueeze", unsqueeze_dim=ops0_dims), [ops[0]], tag="dont_remove") ops0_dims += 1 lc.op(type, ops, attr, {}, "", tile_height, TILE_DIM) # straight 1-1 for all other binaries @@ -214,7 +220,7 @@ def backward(op_type, attr, ac, operand, inputs, 
output, grad): if shapes[operand][i] < grad_shape[i]: # Negative indexing for reduce axis grad = ac.op("reduce_sum", (grad,), (i - grad_shape_len,)) - return ac.op("nop", (grad,)) # pass gradient through + return ac.op(Nop.create(), (grad,)) # pass gradient through elif op_type == "subtract": if inputs[operand].shape != grad.shape: @@ -222,7 +228,7 @@ def backward(op_type, attr, ac, operand, inputs, output, grad): if shapes[operand][i] < grad.shape[i]: grad = ac.op("reduce_sum", (grad,), (i,)) if operand == 0: - return ac.op("nop", (grad,)) + return ac.op(Nop.create(), (grad,)) else: return ac.op("multiply", (grad, ac.constant(-1))) @@ -236,15 +242,15 @@ def backward(op_type, attr, ac, operand, inputs, output, grad): elif op_type == "maximum": # TODO - return ac.op("nop", (grad,)) # pass gradient through + return ac.op(Nop.create(), (grad,)) # pass gradient through elif op_type == "power": if operand == 0: # dx = y * (x^y) * recp(x) - recip = ac.op("reciprocal", (inputs[0],)) + recip = ac.op(Reciprocal.create(), (inputs[0],)) partial_grad = ac.op("multiply", (output, recip)) pow_grad = ac.op("multiply", (inputs[1], partial_grad)) if operand == 1: # dy = (x^y) * ln(x) - ln_x = ac.op("log", [inputs[0]]) + ln_x = ac.op(Log.create(), [inputs[0]]) pow_grad = ac.op("multiply", (output, ln_x)) return ac.op("multiply", (pow_grad, grad)) @@ -348,7 +354,7 @@ def decompose(op_type, attr, dc, inputs): raise RuntimeError(f"Found BinaryStack op with axis {axis}") elif op_type == "divide": - recip = dc.op("reciprocal", [inputs[1]]) + recip = dc.op(Reciprocal.create(), [inputs[1]]) result = dc.op("multiply", [inputs[0], recip]) dc.fuse(result) return @@ -392,7 +398,7 @@ def decompose_post_autograd(op_type, attr, dc, inputs): res = dc.op("add", (res, x_gt)) dc.fuse(res) return - elif op_type == "maximum": + elif op_type == "maximum" and os.environ.get("PYBUDA_ENABLE_MAXIMUM_DECOMPOSITION", "0") == "1": operand0, operand1 = inputs[0], inputs[1] orig_op0_shape = operand0.shape.as_list() orig_op1_shape = operand1.shape.as_list() diff --git a/pybuda/pybuda/op/eval/pybuda/eltwise_nary.py b/pybuda/pybuda/op/eval/pybuda/eltwise_nary.py index e721ac07c..c95af595a 100644 --- a/pybuda/pybuda/op/eval/pybuda/eltwise_nary.py +++ b/pybuda/pybuda/op/eval/pybuda/eltwise_nary.py @@ -10,6 +10,8 @@ import pybuda from ..common import to_torch_operands from .transpose import TransposeTM +from .nop import Nop +from .buffer import Buffer from ..buda.splice import Splice from pybuda.pybudaglobal import TILE_DIM, align_up_tile, is_tile_dim_aligned from ..sparse_utils import ( @@ -244,7 +246,7 @@ def decompose(type, attr, dc, inputs): if type == "concatenate": if len(inputs) == 1: - dc.fuse(dc.op("nop", [inputs[0]])) + dc.fuse(dc.op(Nop.create(), [inputs[0]])) from math import gcd from functools import reduce @@ -262,7 +264,7 @@ def decompose_post_optimize(type, attr, dc, inputs): axis -= len(in1.shape) if len(inputs) == 1: - result = dc.op("nop", [in1]) + result = dc.op(Nop.create(), [in1]) dc.fuse(result) return @@ -353,12 +355,12 @@ def decompose_post_optimize(type, attr, dc, inputs): padded_shape_len += padded_inputs[-1].shape[axis] if insert_slice and concat_slice_dim is not None: - padded_inputs[-1] = dc.op("buffer", [padded_inputs[-1], ]) + padded_inputs[-1] = dc.op(Buffer.create(), [padded_inputs[-1], ]) assert padded_inputs[-1].shape[axis] % slice_size == 0 padded_inputs[-1] = dc.op(concat_slice_dim + "slice", [padded_inputs[-1], ], (padded_inputs[-1].shape[axis] // slice_size , )) elif insert_slice and 
non_concat_slice_dim is not None: slices = inputs[0].shape[non_concat_dim] // TILE_DIM - padded_inputs[-1] = dc.op("buffer", [padded_inputs[-1], ]) + padded_inputs[-1] = dc.op(Buffer.create(), [padded_inputs[-1], ]) padded_inputs[-1] = dc.op(non_concat_slice_dim + "slice", [padded_inputs[-1], ], (slices, )) diff --git a/pybuda/pybuda/op/eval/pybuda/eltwise_unary.py b/pybuda/pybuda/op/eval/pybuda/eltwise_unary.py index 553fe47ea..1d5dad84c 100644 --- a/pybuda/pybuda/op/eval/pybuda/eltwise_unary.py +++ b/pybuda/pybuda/op/eval/pybuda/eltwise_unary.py @@ -11,6 +11,15 @@ from ....tensor import buda_dataformat_to_pytorch_dtype import numpy as np from pybuda.op.eval.common import calculate_tile_size +from .tanh import Tanh +from ..buda.log import Log as BudaLog +from .nop import Nop +from ..buda.nop import Nop as BudaNop +from .buffer import Buffer + +from ..buda.exp import Exp as BudaExp +from .exp import Exp +from .reciprocal import Reciprocal M_2_SQRTPI = 1.12837916709551257390 # 2/sqrt(pi) M_SQRT2 = 1.41421356237309504880 # sqrt(2) @@ -187,7 +196,7 @@ def lower(type, attr, lc, ops, outputs): threshold = attr[0] - f32_epsilon if len(attr) > 1: mode = attr[1] - lc.op("nop", ops, [], {"relu_en": True, "relu_threshold": threshold, "relu_mode": mode }) + lc.op(BudaNop.create(relu_en=True, relu_threshold=threshold, relu_mode=mode), ops) elif type == "leaky_relu": lc.op("lrelu", ops, attr, {"slope": attr[0]}) @@ -212,7 +221,7 @@ def lower(type, attr, lc, ops, outputs): node_shape = lc.pybuda_shape() tile_height = calculate_tile_size(node_shape[-2]) if node_shape[-2] % tile_height == 0: - lc.op("nop", ops, [], {}, "", tile_height, TILE_DIM) + lc.op(BudaNop.create(), ops, tile_height=tile_height,tile_width=TILE_DIM) return # Don't need to tile bcast to full tile @@ -268,7 +277,7 @@ def lower(type, attr, lc, ops, outputs): buda_attr = {"p": p, "seed": seed} lc.op(type, ops, attr + [r, c, 1, 1, True, False], buda_attr) # straigh 1-1 for all other unaries else: - lc.op("nop", ops) + lc.op(BudaNop.create(), ops) elif type == "gelu": lc.op("gelu", ops, attr, {"approximate_mode": "true" if attr[0] == "tanh" else "false"}) elif type == "gelu_derivative": @@ -284,7 +293,7 @@ def lower(type, attr, lc, ops, outputs): max_value = 65504.0 if (min_value == 0) and (max_value >= 0): - res = lc.op("nop", (ops[0], ), [], {"relu_en": True, "relu_mode": "max", "relu_threshold": max_value }) + lc.op(BudaNop.create(relu_en=True, relu_threshold=max_value, relu_mode="max"), (ops[0], )) return shape = list(ops[0].shape.as_list()) @@ -308,11 +317,11 @@ def lower(type, attr, lc, ops, outputs): res = lc.op("subtract", (ops[0], min_value_tensor)) # x - min_value - res = lc.op("nop", (res, ), [], {"relu_en": True, "relu_mode": "min", "relu_threshold": 0.0 }) + res = lc.op(BudaNop.create(relu_en=True, relu_threshold=0.0, relu_mode="min"), (res, )) # ReLU(x - min_value) res = lc.op("subtract", (diff_tensor, res)) # diff_value - ReLU(x - min_value), diff = max - min - res = lc.op("nop", (res, ), [], {"relu_en": True, "relu_mode": "min", "relu_threshold": 0.0 }) + res = lc.op(BudaNop.create(relu_en=True, relu_threshold=0.0, relu_mode="min"), (res, )) # ReLU(diff_value - ReLU(x - min_value)) lc.op("subtract", (max_value_tensor, res)) # max_value - ReLU(diff_value - ReLU(x - min_value)) @@ -324,10 +333,10 @@ def lower(type, attr, lc, ops, outputs): else: exponent_value = attr[0] shape = list(ops[0].shape.as_list()) - - ln_x = lc.op("log", ops) + ln_x = lc.op(BudaLog.create(), ops) y_ln_x = lc.op("multiply", 
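The clip lowering above rebuilds clamp from two subtracts and two plain ReLUs (threshold 0, "min" mode): clip(x, min, max) = max - ReLU((max - min) - ReLU(x - min)). A dense PyTorch check of that identity (illustration only):

```python
import torch

def clip_via_relu(x: torch.Tensor, min_value: float, max_value: float) -> torch.Tensor:
    diff = max_value - min_value
    res = torch.relu(x - min_value)     # ReLU(x - min_value)
    res = torch.relu(diff - res)        # ReLU(diff_value - ReLU(x - min_value))
    return max_value - res              # max_value - ReLU(diff_value - ReLU(x - min_value))

x = torch.linspace(-3.0, 3.0, 13)
assert torch.equal(clip_via_relu(x, -1.0, 2.0), torch.clamp(x, -1.0, 2.0))
```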
(lc.tensor(torch.zeros(shape) + exponent_value), ln_x)) - lc.op("exp", [y_ln_x], [], {"approximate_mode": "true" if "PYBUDA_EXP_APPROX" in os.environ else "false"}) + approximate_mode = "true" if "PYBUDA_EXP_APPROX" in os.environ else "false" + lc.op(BudaExp.create(approximate_mode=approximate_mode), [y_ln_x]) else: # Find proper tile sizes @@ -359,16 +368,16 @@ def backward(type, attr, ac, operand, inputs, output, grad): ), "Eltwise unary should have no attributes, execpt for clip, leaky_relu and cumsum" if type == "nop": - return ac.op("nop", (grad, )) + return ac.op(Nop.create(), (grad, )) if type == "tilizer": - return ac.op("nop", (grad, )) + return ac.op(Nop.create(), (grad, )) if type == "tile_broadcast": # the full TM broadcast will generate a reduce - return ac.op("nop", (grad, )) + return ac.op(Nop.create(), (grad, )) if type == "buffer": - return ac.op("buffer", (grad, )) + return ac.op(Buffer.create(), (grad, )) if type == "exp": return ac.op("multiply", (output, grad)) @@ -379,7 +388,7 @@ def backward(type, attr, ac, operand, inputs, output, grad): return ac.op("multiply", (neg, grad)) if type == "sqrt": # 0.5 / f(x) - rec = ac.op("reciprocal", (output,)) + rec = ac.op(Reciprocal.create(), (output,)) mult = ac.op("multiply", (rec, ac.constant(0.5))) return ac.op("multiply", (mult, grad)) @@ -425,7 +434,7 @@ def backward(type, attr, ac, operand, inputs, output, grad): return ac.op("multiply", (gelud, grad)) if type == "log": - recip = ac.op("reciprocal", (inputs[0],)) + recip = ac.op(Reciprocal.create(), (inputs[0],)) return ac.op("multiply", (recip, grad)) if type == "sigmoid": @@ -449,7 +458,7 @@ def backward(type, attr, ac, operand, inputs, output, grad): assert dim == 0, "Unsupported dim different then 0 for cumulative sum backward pass" if dim == 0: - return ac.op("nop", (grad, )) + return ac.op(Nop.create(), (grad, )) return res @@ -473,8 +482,7 @@ def backward(type, attr, ac, operand, inputs, output, grad): elif type == "pow": exponent_value = attr[0] shape = list(inputs[0].shape.as_list()) - - recip = ac.op("reciprocal", (inputs[0],)) + recip = ac.op(Reciprocal.create(), (inputs[0],)) partial_grad = ac.op("multiply", (output, recip)) pow_grad = ac.op("multiply", (ac.tensor(torch.zeros(shape) + exponent_value), partial_grad)) return ac.op("multiply", (pow_grad, grad)) @@ -502,30 +510,62 @@ def decompose(type, attr, dc, inputs): input_shape = inp_node.shape.as_list() if axis >= 0: axis -= len(input_shape) + assert axis < 0, "valid axis should be < 0 after subtracting len(input_shape)" + + # First we want to get array of zeros and ones, with ones standing on the indices of maximums. + # For example, starting array is [1, 3, 5, 2, 0, 5]. We want to get [0, 0, 1, 0, 0, 1]. + # We do that by multiplying array with some large number (10^10), subtracting maximum of the array from array, + # then add 1 to each element to make sure that only maximums are now above 0 (equal to 1). + # Then we threshold the array with ReLu to get [0, 0, 1, 0, 0, 1]. + # Then we multiply that array with array of indices [0,1,2,3,4,5] to get [0,0,2,0,0,5]. + # The rest is manipulation how to extract first maximum index. + # We do that by taking complement of [0, 0, 1, 0, 0, 1] => [1, 1, 0, 1, 1, 0] and multiplying it + # with size(6) and add it to [0,0,2,0,0,5] => [6,6,2,6,6,5] and just find argmin of this array which is 2. 
data_type = buda_dataformat_to_pytorch_dtype(inp_node.output_df) - range_shape = [dim if i == axis + len(input_shape) else 1 for i, dim in enumerate(input_shape)] + indices_shape = [dim if i == axis + len(input_shape) else 1 for i, dim in enumerate(input_shape)] - range = torch.arange(input_shape[axis], dtype=data_type).reshape(range_shape) - range_tensor = dc.tensor(range) + indices = torch.arange(input_shape[axis], dtype=data_type).reshape(indices_shape) + indices_tensor = dc.tensor(indices) factor = torch.ones((input_shape), dtype=data_type) * 1e10 factor_tensor = dc.tensor(factor) - mult_1 = dc.op("multiply", [inp_node, factor_tensor],) - softmax = dc.op("softmax", [mult_1], (axis, 1)) - mult_2 = dc.op("multiply", [softmax, range_tensor]) - reduce_sum = dc.op("reduce_sum", [mult_2], (axis,)) - dc.fuse(reduce_sum) + ones = torch.ones((input_shape), dtype=data_type) + ones_tensor = dc.tensor(ones) + negative_ones = dc.tensor(ones * (-1)) + + # this it the tensor that has all elements equal to input shape on axis on which we do argmax. + offset_tensor = dc.tensor(ones * input_shape[axis]) + + scaled_input = dc.op("multiply", (inp_node, factor_tensor),) + max_1 = dc.op("reduce_max", [scaled_input], [axis]) + scaled_input = dc.op("subtract", (scaled_input, max_1)) + scaled_input = dc.op("add", [scaled_input, ones_tensor],) + + relu_1 = dc.op("relu", (scaled_input,)) + relu_1_complement = dc.op("subtract", (ones_tensor, relu_1)) + + mul_1 = dc.op("multiply", [relu_1, indices_tensor],) + mul_2 = dc.op("multiply", [relu_1_complement, offset_tensor],) + add_1 = dc.op("add", [mul_1, mul_2],) + negative_add_1 = dc.op("multiply", [add_1, negative_ones]) + negative_argmax = dc.op("reduce_max", [negative_add_1], [axis]) + + output_neg_ones = torch.ones((negative_argmax.shape.as_list()), dtype=data_type) * (-1) + output_neg_ones_tensor = dc.tensor(output_neg_ones) + argmax = dc.op("multiply", [negative_argmax, output_neg_ones_tensor]) + + dc.fuse(argmax) elif type == "sigmoid" and bool(int(os.environ.get("PYBUDA_DECOMPOSE_SIGMOID", "0"))): inp = inputs[0] minus_one = dc.tensor(torch.ones([1,1]) * -1) plus_one = dc.tensor(torch.ones([1,1])) neg_ = dc.op("multiply", [inp, minus_one]) - exp_ = dc.op("exp", [neg_]) + exp_ = dc.op(Exp.create(), [neg_]) result = dc.op("add", [plus_one, exp_]) - result = dc.op("reciprocal", [result]) + result = dc.op(Reciprocal.create(), [result]) dc.fuse(result) elif type == "gelu" and bool(int(os.environ.get("PYBUDA_DECOMPOSE_GELU", "0"))): @@ -540,7 +580,7 @@ def decompose(type, attr, dc, inputs): x_cuber_times_const = dc.op("multiply", [x_cubed, const]) plus_x = dc.op("add", [x_cuber_times_const, inp_node]) times_sqrt_2pi = dc.op("multiply", [plus_x, sqrt_2pi]) - tanh = dc.op("tanh", [times_sqrt_2pi]) + tanh = dc.op(Tanh.create(), [times_sqrt_2pi]) plus_one = dc.op("add", [tanh, one]) times_x = dc.op("multiply", [plus_one, inp_node]) result = dc.op("multiply", [times_x, one_half]) diff --git a/pybuda/pybuda/op/eval/pybuda/ethernet_datacopy.py b/pybuda/pybuda/op/eval/pybuda/ethernet_datacopy.py new file mode 100644 index 000000000..283586405 --- /dev/null +++ b/pybuda/pybuda/op/eval/pybuda/ethernet_datacopy.py @@ -0,0 +1,59 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +import os + +import torch +import torch.nn.functional +from ..interface import PyEltwiseUnaryOp +from loguru import logger +from ..common import to_torch_operands +from ....pybudaglobal import TILE_DIM +from ....tensor import 
buda_dataformat_to_pytorch_dtype +import numpy as np +from pybuda.op.eval.common import calculate_tile_size +from ..buda import ethernet_datacopy as BudaEthernetDataCopy + + +class EthernetDatacopy(PyEltwiseUnaryOp): + @classmethod + def create(cls): + self = cls("ethernet_datacopy") + return self + + def eval(self, tensors): + assert len(tensors) == 1, "ethernet_datacopy should have one input" + shape = tensors[0].shape + original_types = [o.dtype for o in tensors] + ret = tensors[0] + + if ret.dtype != original_types[0]: + ret = ret.type(original_types[0]) + + return ret + + def shape(self, tensor_shapes): + assert len(tensor_shapes) == 1, "ethernet_datacopy should have one input" + shape = tensor_shapes[0] + return shape, [] + + def backward(self, ac, operand, inputs, output, grad): + assert False, f"ethernet_datacopy not defined in eltwise unary backward." + + def lower(self, lc, tensors, outputs): + assert len(tensors) == 1, "ethernet_datacopy should have one input" + # Find proper tile sizes + if bool(int(os.environ.get("PYBUDA_ENABLE_TINY_TILE", "0"))): + node_shape = list(tensors[0].shape) + tile_height = calculate_tile_size(node_shape[-2]) + tile_width = calculate_tile_size(node_shape[-1]) + else: + tile_height, tile_width = TILE_DIM, TILE_DIM + + lc.op( + BudaEthernetDataCopy.create(), + tensors, + tile_height=tile_height, + tile_width=tile_width, + ) diff --git a/pybuda/pybuda/op/eval/pybuda/exp.py b/pybuda/pybuda/op/eval/pybuda/exp.py new file mode 100644 index 000000000..424303332 --- /dev/null +++ b/pybuda/pybuda/op/eval/pybuda/exp.py @@ -0,0 +1,56 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +import os + +import torch +import torch.nn.functional +from ..interface import PyEltwiseUnaryOp +from loguru import logger +from ..common import to_torch_operands +from ....pybudaglobal import TILE_DIM +from ....tensor import buda_dataformat_to_pytorch_dtype +import numpy as np +from pybuda.op.eval.common import calculate_tile_size +from ..buda.exp import Exp as BudaExp + + +class Exp(PyEltwiseUnaryOp): + @classmethod + def create(cls): + self = cls("exp") + return self + + def eval(self, tensors): + assert len(tensors) == 1, "Exp should have one input" + shape = tensors[0].shape + original_types = [o.dtype for o in tensors] + ret = torch.exp(tensors[0]) + + if ret.dtype != original_types[0]: + ret = ret.type(original_types[0]) + + return ret + + def shape(self, tensor_shapes): + assert len(tensor_shapes) == 1, "Exp should have one input" + shape = tensor_shapes[0] + return shape, [] + + def backward(self, ac, operand, inputs, output, grad): + assert len(inputs) == 1, "Exp should have one input" + assert operand == 0, "Invalid operand index" + return ac.op("multiply", (output, grad)) + + def lower(self, lc, tensors, outputs): + assert len(tensors) == 1, "Exp should have one input" + approximate_mode = "true" if "PYBUDA_EXP_APPROX" in os.environ else "false" + lc.op(BudaExp.create(approximate_mode=approximate_mode), tensors) + + def initial_flops_estimate(self, tensor_shapes): + flops = 0 + output_shape = self.shape(tensor_shapes)[0] + flops = np.prod(output_shape) + + return flops diff --git a/pybuda/pybuda/op/eval/pybuda/log.py b/pybuda/pybuda/op/eval/pybuda/log.py new file mode 100644 index 000000000..23a2c424f --- /dev/null +++ b/pybuda/pybuda/op/eval/pybuda/log.py @@ -0,0 +1,72 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +import os + +import torch +import 
torch.nn.functional +from ..interface import PyEltwiseUnaryOp +from loguru import logger +from ..common import to_torch_operands +from ....pybudaglobal import TILE_DIM +from ....tensor import buda_dataformat_to_pytorch_dtype +import numpy as np +from pybuda.op.eval.common import calculate_tile_size +from ..buda.log import Log as BudaLog +from .reciprocal import Reciprocal + + +class Log(PyEltwiseUnaryOp): + @classmethod + def create(cls): + self = cls("log") + return self + + def eval(self, tensors): + assert len(tensors) == 1, "Log should have one input" + shape = tensors[0].shape + original_types = [o.dtype for o in tensors] + ret = torch.log(tensors[0] + 1e-10) + + if ret.dtype != original_types[0]: + ret = ret.type(original_types[0]) + + return ret + + def shape(self, tensor_shapes): + assert len(tensor_shapes) == 1, "Log should have one input" + shape = tensor_shapes[0] + return shape, [] + + def backward(self, ac, operand, inputs, output, grad): + assert len(inputs) == 1, "Log should have one input" + assert operand == 0, "Invalid operand index" + recip = ac.op(Reciprocal.create(), (inputs[0],)) + return ac.op("multiply", (recip, grad)) + + def lower(self, lc, tensors, outputs): + assert len(tensors) == 1, "Log should have one input" + + if bool(int(os.environ.get("PYBUDA_ENABLE_TINY_TILE", "0"))): + node_shape = list(tensors[0].shape) + tile_height = calculate_tile_size(node_shape[-2]) + tile_width = calculate_tile_size(node_shape[-1]) + vector = "" if tile_height == TILE_DIM else "r" + else: + vector = None + tile_height, tile_width = TILE_DIM, TILE_DIM + + lc.op( + BudaLog.create(vector=vector), + tensors, + tile_height=tile_height, + tile_width=tile_width, + ) + + def initial_flops_estimate(self, tensor_shapes): + flops = 0 + output_shape = self.shape(tensor_shapes)[0] + flops = np.prod(output_shape) + + return flops diff --git a/pybuda/pybuda/op/eval/pybuda/matmul.py b/pybuda/pybuda/op/eval/pybuda/matmul.py index 206ac97cf..1ddc48f3e 100644 --- a/pybuda/pybuda/op/eval/pybuda/matmul.py +++ b/pybuda/pybuda/op/eval/pybuda/matmul.py @@ -6,7 +6,6 @@ from math import sqrt import os -import pybuda._C.balancer as balancer from pybuda._C import DataFormat import torch @@ -179,7 +178,7 @@ def lower(type, attr, buda_attr, lc, ops, outputs): picker = lc.get_pytorch_tensor(in0) zdim = 1 if len(picker.shape) < 3 else picker.shape[-3] - z_bcast_factor = 1 if len(attr) < 2 else attr[1] + z_bcast_factor = 1 if len(attr) < 2 else attr[1] # set in sparse matmul's decompose # We can fully fracture kH * kW max_fracture_factor = z_bcast_factor if is_kernel_fracturing_candidate(ops, z_bcast_factor) else 1 @@ -200,7 +199,8 @@ def lower(type, attr, buda_attr, lc, ops, outputs): grid_r = round_up_div(picker.shape[-2], TILE_DIM) grid_c = 1 # this is always 1 by default, before balancing, needed for buda eval - sparse_tile_ptr_bits = sparse_buda.get_sparse_tile_ptr_bits(grid_r, t_factor_r, u_rt) # Do we need to calculate this here at all? Can't we push a dummy value to buda attrs? 
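The Exp and Log ops added above keep their backwards in terms of tensors that are already available: exp reuses its forward output (d/dx e^x = e^x), and log multiplies the incoming gradient by reciprocal(x). A quick autograd comparison (illustration only):

```python
import torch

x = torch.rand(4, requires_grad=True) + 0.5
grad = torch.rand(4)

out_exp = torch.exp(x)
out_exp.backward(grad)
assert torch.allclose(x.grad, out_exp.detach() * grad)               # Exp.backward: multiply(output, grad)

x.grad = None
out_log = torch.log(x)
out_log.backward(grad)
assert torch.allclose(x.grad, torch.reciprocal(x.detach()) * grad)   # Log.backward: multiply(reciprocal(x), grad)
```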
+ sparse_tile_ptr_bits = sparse_buda.get_sparse_tile_ptr_bits(grid_r, t_factor_r, u_rt) + sparse_ublock_idx_bits = sparse_buda.get_sparse_ublock_idx_bits(grid_r, t_factor_r, u_rt) sparse, encodings, _s_shape, _e_shape, _num_strips = sparse_buda.get_sparse_tiles_and_encodings(grid_r) sparse, encodings = shapeify_sparse_tiles_and_encodings( sparse=sparse, @@ -227,10 +227,10 @@ def lower(type, attr, buda_attr, lc, ops, outputs): buda_attrs["num_sparse_tiles"] = sparse.shape[-1] // TILE_DIM buda_attrs["num_index_tiles"] = encodings.shape[-1] // TILE_DIM buda_attrs["sparse_tile_ptr_bits"] = sparse_tile_ptr_bits - buda_attrs["sparse_ublock_idx_bits"] = sparse_tile_ptr_bits + buda_attrs["sparse_ublock_idx_bits"] = sparse_ublock_idx_bits buda_attrs["fracture_factor"] = fracture_factor # We need fracture_factor in attributes as well, since shape() function doesn't get buda attrs - lc.op("matmul", [in0, in1, in2], (accumulate, is_sparse, sparse_tile_ptr_bits, 1, zdim, picker.shape[-2], in1.shape[-1], fracture_factor, u_rt, u_kt, u_ct, grid_c, t_factor_r, t_factor_c, sparse_tile_ptr_bits), buda_attrs) + lc.op("matmul", [in0, in1, in2], (accumulate, is_sparse, sparse_tile_ptr_bits, 1, zdim, picker.shape[-2], in1.shape[-1], fracture_factor, u_rt, u_kt, u_ct, grid_c, t_factor_r, t_factor_c, sparse_ublock_idx_bits), buda_attrs) else: # Find proper tile sizes if bool(int(os.environ.get("PYBUDA_ENABLE_TINY_TILE", "0"))): @@ -254,6 +254,8 @@ def decompose(type, attr, dc, inputs): accumulate = (len(attr) >= 1) and bool(attr[0]) z_bcast_factor = zdim if (zdim > 1 and in1.shape[-3] == 1) else 1 + # In case of convolutions, z_bcast_factor is the volume of the conv's kernel (kernel_height * kernel_width) + if z_bcast_factor > 1: picker = torch.cat([picker[0][z] for z in range(z_bcast_factor)]) sparse = dc.tensor(picker) diff --git a/pybuda/pybuda/op/eval/pybuda/nn.py b/pybuda/pybuda/op/eval/pybuda/nn.py index b54dd64b1..9d9187d51 100644 --- a/pybuda/pybuda/op/eval/pybuda/nn.py +++ b/pybuda/pybuda/op/eval/pybuda/nn.py @@ -10,7 +10,10 @@ from ..common import to_torch_operands from . import reduce - +from .exp import Exp +from .reciprocal import Reciprocal +from .log import Log +from .sqrt import Sqrt def eval(op_type, attr, ops): @@ -163,9 +166,38 @@ def eval(op_type, attr, ops): if operand == 2: return beta__.grad.reshape(beta__.shape) - assert False, f"{op_type} is not defined in nn eval." + if op_type == "batchnorm": + + assert len(ops) == 5, "batchnorm should have five operands." + assert len(attr) == 1, "batchnorm should have one attributes." + t_ops = to_torch_operands(*ops) + input_ = t_ops[0] # Input tensor + weight = t_ops[1] # weights, weight re-scaling parameter + bias = t_ops[2] # bias, weight re-centering parameter + running_mean = t_ops[3] + running_var = t_ops[4] + epsilon = attr[0] + + #assert gamma.shape[-1] == input_.shape[-1], "Weights shape must be the same as normalized shape." + #for gdim in gamma.shape[:-1]: + # assert gdim == 1, "All dimensions but the last one must be 1" + #assert beta.shape[-1] == input_.shape[-1], "Bias shape must be the same as normalized shape." + #for bdim in beta.shape[:-1]: + # assert bdim == 1, "All dimensions but the last one must be 1" + + return F.batch_norm( + input=input_, + running_mean=running_mean.shape[-1:], + running_var=running_var.shape[-1:], + normalized_shape=input_.shape[-1:], + weight=weight.reshape(gamma.shape[-1:]), + bias=bias.reshape(beta.shape[-1:]), + eps=epsilon + ) + + assert False, f"{op_type} is not defined in nn eval." 
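The batchnorm handling above (and the decompose further down) targets inference-mode batch norm, y = (x - running_mean) / sqrt(running_var + eps) * weight + bias, which the decompose rearranges into a single per-channel scale and shift. A dense check of that rearrangement against torch.nn.functional.batch_norm (illustration only; shapes are arbitrary):

```python
import torch
import torch.nn.functional as F

N, C, H, W = 2, 3, 4, 4
x = torch.randn(N, C, H, W)
weight, bias = torch.randn(C), torch.randn(C)
running_mean, running_var = torch.randn(C), torch.rand(C) + 0.5
eps = 1e-5

scale = weight * torch.reciprocal(torch.sqrt(running_var + eps))     # "weighted" in the decompose
shift = bias - running_mean * scale                                   # "weighted_bias" in the decompose
decomposed = x * scale.reshape(1, C, 1, 1) + shift.reshape(1, C, 1, 1)

reference = F.batch_norm(x, running_mean, running_var, weight, bias, training=False, eps=eps)
assert torch.allclose(decomposed, reference, atol=1e-6)
```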
def shape(op_type, attr, ops): @@ -230,6 +262,13 @@ def shape(op_type, attr, ops): return ops[operand], [] + if op_type == "batchnorm": + + assert len(ops) == 5, "Layernorm should have five operands." + assert len(attr) == 1, "Layernorm should have one attributes." + + return ops[0], [] + assert False, f"{op_type} is not defined in nn shape." @@ -323,6 +362,9 @@ def backward(op_type, attr, ac, operand, inputs, output, grad): return ac.op("layernorm_bw", inputs, attr) + if op_type == "batchnorm": + raise NotImplementedError("Back propagation for Batchnorm op is not implemented yet") + assert False, f"{op_type} is not defined in nn backward. " def decompose(op_type, attr, dc, inputs): @@ -357,10 +399,41 @@ def decompose(op_type, attr, dc, inputs): dim = attr[0] stable = attr[1] result = dc.op("softmax", (x, ), (dim, stable)) - result = dc.op("log", (result, )) + result = dc.op(Log.create(), (result, )) dc.fuse(result) return - + + if op_type == "batchnorm": + assert len(inputs) == 5, "Batchnorm should have five operands." + assert len(attr) == 1, "Layernorm should have one attributes." + + input_ = inputs[0] + weight = inputs[1] + bias = inputs[2] + running_mean = inputs[3] + running_var = inputs[4] + epsilon = attr[0] + + # const tensor + eps_tensor = dc.tensor(torch.zeros(running_var.shape.as_list()) + epsilon) + neg_one = dc.tensor(torch.zeros(running_mean.shape.as_list()) - 1.0) + + # decompose + var_eps = dc.op("add", (running_var, eps_tensor), ()) + sqrt = dc.op(Sqrt.create(), (var_eps,), ()) + recipro = dc.op(Reciprocal.create(), (sqrt,), ()) + weighted = dc.op("multiply", (recipro, weight), ()) + neg_mean = dc.op("multiply", (neg_one, running_mean), ()) + weighted_mean = dc.op("multiply", (weighted, neg_mean), ()) + weighted_bias = dc.op("add", (weighted_mean, bias), ()) + weighted_bias = dc.op("unsqueeze", [weighted_bias], (1, len(weighted_bias.shape),)) + weighted_bias = dc.op("unsqueeze", [weighted_bias], (1, len(weighted_bias.shape),)) + weighted_var = dc.op("unsqueeze", [weighted], (1, len(weighted.shape),)) + weighted_var = dc.op("unsqueeze", [weighted_var], (1, len(weighted_var.shape),)) + scaled = dc.op("multiply", (input_, weighted_var), ()) + biased = dc.op("add", (scaled, weighted_bias), ()) + dc.fuse(biased) + return def decompose_post_autograd(op_type, attr, dc, inputs): """ @@ -398,13 +471,14 @@ def decompose_post_autograd(op_type, attr, dc, inputs): if stable and dc.get_compiler_cfg().enable_stable_softmax: res_max = dc.op("reduce_max", (x, ), (dim, )) res_x_max = dc.op("subtract", (x, res_max), ()) - res_exp = dc.op("exp", (res_x_max, ), ()) + res_exp = dc.op(Exp.create(), (res_x_max, ), ()) else: - res_exp = dc.op("exp", (x, ), ()) + res_exp = dc.op(Exp.create(), (x, ), ()) + res_exp_sum = dc.op("reduce_sum", (res_exp, ), (dim, )) res_exp_sum = dc.op("add", (res_exp_sum, dc.tensor(torch.zeros(res_exp_sum.shape.as_list()) + 1e-10)), ()) - res_exp_sum_recip = dc.op("reciprocal", (res_exp_sum, ), ()) + res_exp_sum_recip = dc.op(Reciprocal.create(), (res_exp_sum, ), ()) result = dc.op("multiply", (res_exp, res_exp_sum_recip), ()) dc.fuse(result) return @@ -470,9 +544,9 @@ def decompose_post_autograd(op_type, attr, dc, inputs): # var_plus_eps = dc.op("add", (var, epsilon_tensor), ()) var_add = dc.op("add", (var, epsilon_tensor), ()) # std = dc.op("sqrt", (var_plus_eps, ), ()) - std = dc.op("sqrt", (var_add, ), ()) + std = dc.op(Sqrt.create(), (var_add, ), ()) # recip = dc.op("reciprocal", (std, ), ()) - ivar = dc.op("reciprocal", (std, ), ()) + ivar = 
dc.op(Reciprocal.create(), (std, ), ()) # normalized = dc.op("multiply", (x_minus_mean, recip), ()) xhat = dc.op("multiply", (xmu, ivar), ()) # normalized_weighted = dc.op("multiply", (normalized, weights), ()) diff --git a/pybuda/pybuda/op/eval/pybuda/nop.py b/pybuda/pybuda/op/eval/pybuda/nop.py new file mode 100644 index 000000000..532e21913 --- /dev/null +++ b/pybuda/pybuda/op/eval/pybuda/nop.py @@ -0,0 +1,55 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +import os + +import torch +import torch.nn.functional +from ..interface import PyEltwiseUnaryOp +from loguru import logger +from ..common import to_torch_operands +from ....pybudaglobal import TILE_DIM +from ....tensor import buda_dataformat_to_pytorch_dtype +import numpy as np +from pybuda.op.eval.common import calculate_tile_size +from ..buda.nop import Nop as BudaNop + + +class Nop(PyEltwiseUnaryOp): + @classmethod + def create(cls): + self = cls("nop") + return self + + def eval(self, tensors): + assert len(tensors) == 1, "nop should have one input" + shape = tensors[0].shape + original_types = [o.dtype for o in tensors] + ret = tensors[0] + + if ret.dtype != original_types[0]: + ret = ret.type(original_types[0]) + return ret + + def shape(self, tensor_shapes): + assert len(tensor_shapes) == 1, "Eltwise unary should have one input" + shape = tensor_shapes[0] + return shape, [] + + def backward(self, ac, operand, inputs, output, grad): + assert len(inputs) == 1, "Nop should have one input" + assert operand == 0, "Invalid operand index" + return ac.op(Nop.create(), [grad]) + + def lower(self, lc, tensors, outputs): + assert len(tensors) == 1, "Nop should have one input" + + if bool(int(os.environ.get("PYBUDA_ENABLE_TINY_TILE", "0"))): + node_shape = list(tensors[0].shape) + tile_height = calculate_tile_size(node_shape[-2]) + tile_width = calculate_tile_size(node_shape[-1]) + else: + tile_height, tile_width = TILE_DIM, TILE_DIM + + lc.op(BudaNop.create(), tensors, tile_height=tile_height, tile_width=tile_width) diff --git a/pybuda/pybuda/op/eval/pybuda/pooling.py b/pybuda/pybuda/op/eval/pybuda/pooling.py index 136003bef..a4cb14f85 100644 --- a/pybuda/pybuda/op/eval/pybuda/pooling.py +++ b/pybuda/pybuda/op/eval/pybuda/pooling.py @@ -9,6 +9,7 @@ from pybuda.pybudaglobal import TILE_DIM from pybuda.utils import align_up_tile from .transpose import TransposeTM +from .nop import Nop from ..common import to_torch_operands from ..sparse_utils import ( @@ -391,7 +392,7 @@ def decompose(type, attr, dc, inputs): activations = inputs[0] if kernel_size == 1: - dc.fuse(dc.op("nop", [activations])) + dc.fuse(dc.op(Nop.create(), [activations])) return if max_pool_add_sub_surround: @@ -512,7 +513,7 @@ def decompose(type, attr, dc, inputs): activations = inputs[0] if kD == 1 and kH == 1 and kW == 1: - dc.fuse(dc.op("nop", [activations])) + dc.fuse(dc.op(Nop.create(), [activations])) return #if max_pool_add_sub_surround: diff --git a/pybuda/pybuda/op/eval/pybuda/quantize.py b/pybuda/pybuda/op/eval/pybuda/quantize.py index 13ba6ef81..c80011b24 100644 --- a/pybuda/pybuda/op/eval/pybuda/quantize.py +++ b/pybuda/pybuda/op/eval/pybuda/quantize.py @@ -11,6 +11,7 @@ import numpy as np from ..common import to_torch_operands from pybuda.tensor import pytorch_dtype_to_buda_dataformat +from .reciprocal import Reciprocal STRING_TO_TORCH_DTYPE = { "torch.int8": torch.int8, @@ -38,6 +39,7 @@ def eval(type, attr, ops): zero_point, axis, out_dtype = attr input_float = ops[0].float() scale = ops[1].float() + 
output_float = torch.clamp( torch.round(input_float / scale) + zero_point, STRING_TO_LOWER_LIMIT[out_dtype], @@ -56,8 +58,28 @@ def eval(type, attr, ops): elif type == "dequantize": zero_point, axis = attr - input_int8 = ops[0] - scale = ops[1] + input_int8 = ops[0].float() + scale = ops[1].float() + + if axis < 0: + axis = len(input_int8.shape) + axis + left_ndim = axis + right_ndim = len(input_int8.shape) - axis - 1 + if len(scale.shape) == 1: + target_shape = [1] * left_ndim + list(scale.shape) + [1] * right_ndim + + if target_shape[axis] != input_int8.shape[axis]: + assert target_shape[axis] == 1 + scale = torch.broadcast_to(scale, target_shape) + scale = torch.reshape(scale, target_shape) + + output_float = (input_int8 - zero_point) * scale + return output_float + + elif type == "buda_dequantize": + zero_point, axis = attr + input_int8 = ops[0].float() + scale = ops[1].float() output_float = (input_int8 - zero_point) * scale return output_float @@ -66,8 +88,19 @@ def eval(type, attr, ops): input_int32 = ops[0] inp_scale, out_scale, = ops[1], ops[2] output_scale = inp_scale / out_scale - while len(output_scale.shape) != len(input_int32.shape): - output_scale = output_scale.unsqueeze(-1) + + if axis < 0: + axis = len(input_int32.shape) + axis + left_ndim = axis + right_ndim = len(input_int32.shape) - axis - 1 + if len(output_scale.shape) == 1: + target_shape = [1] * left_ndim + list(output_scale.shape) + [1] * right_ndim + + if target_shape[axis] != input_int32.shape[axis]: + assert target_shape[axis] == 1 + output_scale = torch.broadcast_to(output_scale, target_shape) + output_scale = torch.reshape(output_scale, target_shape) + assert inp_zp == 0, "Only support input zero point of 0" output_float = torch.round(output_scale * (input_int32 - inp_zp) + out_zp) @@ -92,35 +125,36 @@ def eval(type, attr, ops): def shape(type, attr, ops): broadcast = [] + op0 = ops[0] + op1 = ops[1] if type == "quantize" or type == "buda_quantize": - op1 = list(ops[1]) - while len(op1) < len(ops[0]): - op1 = [1] + op1 - for dim in range(1, len(ops[0])): - if ops[0][dim] != op1[dim]: - broadcast.append((1, dim - len(ops[0]), ops[0][dim])) - - if type == "buda_requantize": + axis = attr[1] + if axis < 0: + axis = len(ops[0]) + axis + left_ndim = axis + right_ndim = len(ops[0]) - axis - 1 + if len(op1) == 1: + op1 = [1] * left_ndim + list(ops[1]) + [1] * right_ndim + elif len(op1) < len(op0): + while len(op1) < len(op0): + op1 = [1] + op1 + assert len(op1) == len(op0), "Scale and input must have same dimension" + for dim in range(1, len(op0)): + if op0[dim] != op1[dim]: + broadcast.append((1, dim - len(op0), op0[dim])) + + if type == "buda_requantize" or type == "buda_dequantize": for dim in range(1, len(ops[0])): if ops[0][dim] != ops[1][dim]: broadcast.append((1, dim - len(ops[0]), ops[0][dim])) - - if type == "dequantize": - op1 = list(ops[1]) - while len(op1) < len(ops[0]): - op1 = [1] + op1 - for dim in range(1, len(ops[0])): - if ops[0][dim] != op1[dim]: - broadcast.append((1, dim - len(ops[0]), ops[0][dim])) - return ops[0], broadcast def lower(type, attr, lc, ops, outputs): if type == "buda_quantize": lc.op("quantization", ops, attr, {"zero_point": attr[0]}, "", TILE_DIM, TILE_DIM) # straight 1-1 for all other binaries - elif type == "dequantize": + elif type == "buda_dequantize": lc.op("dequantization", ops, attr, {"zero_point": attr[0]}, "", TILE_DIM, TILE_DIM) # straight 1-1 for all other binaries elif type == "buda_requantize": lc.op("requantization", ops, attr, {"zero_point": attr[0]}, "", TILE_DIM, 
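The dequantize eval above now reshapes a 1-D per-channel scale so it broadcasts along the quantization axis before applying (x - zero_point) * scale. A standalone sketch of that broadcasting step (illustration only; the helper name is made up here):

```python
import torch

def dequantize_per_axis(x_q: torch.Tensor, scale: torch.Tensor, zero_point: int, axis: int) -> torch.Tensor:
    if axis < 0:
        axis += x_q.ndim
    left_ndim, right_ndim = axis, x_q.ndim - axis - 1
    # pad the 1-D scale with singleton dims so it lines up with `axis`
    target_shape = [1] * left_ndim + list(scale.shape) + [1] * right_ndim
    scale = scale.reshape(target_shape)
    return (x_q.float() - zero_point) * scale

x_q = torch.randint(-128, 127, (1, 4, 8, 8), dtype=torch.int8)
per_channel_scale = torch.rand(4) * 0.1
out = dequantize_per_axis(x_q, per_channel_scale, zero_point=0, axis=1)
print(out.shape)   # torch.Size([1, 4, 8, 8])
```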
TILE_DIM) @@ -137,7 +171,7 @@ def decompose(type, attr, dc, inputs): torch_dtype = STRING_TO_TORCH_DTYPE[out_dtype] buda_dtype = pytorch_dtype_to_buda_dataformat(torch_dtype) scale = inputs[1] - scale = dc.op("reciprocal", [scale], output_df=scale.output_df) + scale = dc.op(Reciprocal.create(), [scale], output_df=scale.output_df) out = dc.op("buda_quantize", [inputs[0], scale], attrs=attr, output_df=buda_dtype) dc.fuse(out) return @@ -146,38 +180,65 @@ def decompose(type, attr, dc, inputs): act, inp_scale, out_scale = inputs out_zp,inp_zp, axis, rounding, out_dtype = attr inp_scale_shape = inp_scale.shape.as_list() - if len(inp_scale_shape) == 1: - # populate batch dim - inp_scale = dc.op("unsqueeze", [inp_scale], attrs=(0, len(inp_scale_shape)), output_df=inp_scale.output_df) - inp_scale_shape = [1] + inp_scale_shape - - while len(inp_scale_shape) < len(act.shape.as_list()): - inp_scale = dc.op("unsqueeze", [inp_scale], attrs=(len(inp_scale_shape), len(inp_scale_shape)), output_df=inp_scale.output_df) - inp_scale_shape = inp_scale_shape + [1] + if axis < 0: + axis = len(act.shape) + axis + left_ndim = axis + right_ndim = len(act.shape) - axis - 1 + if len(inp_scale_shape) == 1: + # Match ndim with actiavtion + for i in range(0, left_ndim): + inp_scale = dc.op("unsqueeze", [inp_scale], attrs=(0, len(inp_scale_shape)), output_df=inp_scale.output_df) + inp_scale_shape = [1] + inp_scale_shape + for i in range(0, right_ndim): + inp_scale = dc.op("unsqueeze", [inp_scale], attrs=(len(inp_scale_shape), len(inp_scale_shape)), output_df=inp_scale.output_df) + inp_scale_shape = inp_scale_shape + [1] out_scale_shape = out_scale.shape.as_list() if len(out_scale_shape) == 1: - # populate batch dim - out_scale = dc.op("unsqueeze", [out_scale], attrs=(0, len(out_scale_shape)), output_df=out_scale.output_df) - out_scale_shape = [1] + out_scale_shape - - while len(out_scale_shape) < len(act.shape.as_list()): - out_scale = dc.op("unsqueeze", [out_scale], attrs=(len(out_scale_shape), len(out_scale_shape)), output_df=out_scale.output_df) - out_scale_shape = out_scale_shape + [1] + # Match ndim with actiavtion + for i in range(0, left_ndim): + out_scale = dc.op("unsqueeze", [out_scale], attrs=(0, len(out_scale_shape)), output_df=out_scale.output_df) + out_scale_shape = [1] + out_scale_shape + for i in range(0, right_ndim): + out_scale = dc.op("unsqueeze", [out_scale], attrs=(len(out_scale_shape), len(out_scale_shape)), output_df=out_scale.output_df) + out_scale_shape = out_scale_shape + [1] - for i, (left, right) in enumerate(zip(inp_scale_shape, out_scale_shape)): - if i == 0: - continue + if out_scale_shape[axis] != act.shape[axis]: + assert out_scale_shape[axis] == 1 + out_scale = dc.op("broadcast", [out_scale], attrs=(axis - len(out_scale_shape), act.shape[axis]),output_df=out_scale.output_df) + out_scale_shape[axis] = act.shape[axis] - if left != right: - out_scale = dc.op("broadcast", [out_scale], attrs=(i - len(out_scale_shape), left),output_df=out_scale.output_df) - - recip_out_scale = dc.op("reciprocal", [out_scale],output_df=out_scale.output_df,) + recip_out_scale = dc.op(Reciprocal.create(), [out_scale],output_df=out_scale.output_df,) new_scale = dc.op("multiply", [inp_scale, recip_out_scale],output_df=out_scale.output_df,) - out = dc.op("buda_requantize", [act, new_scale], attrs=(out_zp, axis, rounding, out_dtype),) + torch_dtype = STRING_TO_TORCH_DTYPE[out_dtype] + buda_dtype = pytorch_dtype_to_buda_dataformat(torch_dtype) + out = dc.op("buda_requantize", [act, new_scale], attrs=(out_zp, axis, 
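The requantize decompose above folds the two scales into one (new_scale = inp_scale / out_scale, built as a Reciprocal plus a multiply) before emitting a single buda_requantize. Written out densely, the arithmetic looks like the sketch below (illustration only; the int8 clamp range and the zero-point defaults are assumptions made here for concreteness):

```python
import torch

def requantize(x_int32: torch.Tensor, inp_scale: torch.Tensor, out_scale: torch.Tensor,
               inp_zp: int = 0, out_zp: int = 0) -> torch.Tensor:
    new_scale = inp_scale * torch.reciprocal(out_scale)       # inp_scale / out_scale
    out = torch.round(new_scale * (x_int32.float() - inp_zp) + out_zp)
    return torch.clamp(out, -128, 127).to(torch.int8)         # assumed int8 output range

x = torch.randint(-2000, 2000, (2, 4), dtype=torch.int32)
print(requantize(x, torch.tensor(0.02), torch.tensor(0.1)))
```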
rounding, out_dtype),output_df=buda_dtype) dc.fuse(out) return + if type == "dequantize": + zero_point, axis = attr + act = inputs[0] + scale = inputs[1] + if axis < 0: + axis = len(act.shape) + axis + left_ndim = axis + right_ndim = len(act.shape) - axis - 1 + + scale_shape = scale.shape.as_list() + if len(scale_shape) == 1: + # Match ndim with actiavtion + for i in range(0, left_ndim): + scale = dc.op("unsqueeze", [scale], attrs=(0, len(scale_shape)), output_df=scale.output_df) + scale_shape = [1] + scale_shape + for i in range(0, right_ndim): + scale = dc.op("unsqueeze", [scale], attrs=(len(scale_shape), len(scale_shape)), output_df=scale.output_df) + scale_shape = scale_shape + [1] + + + out = dc.op("buda_dequantize", [act, scale], attrs=attr,) + dc.fuse(out) + return diff --git a/pybuda/pybuda/op/eval/pybuda/reciprocal.py b/pybuda/pybuda/op/eval/pybuda/reciprocal.py new file mode 100644 index 000000000..bc3f7e21f --- /dev/null +++ b/pybuda/pybuda/op/eval/pybuda/reciprocal.py @@ -0,0 +1,52 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +import os + +import torch +import torch.nn.functional +from ..interface import PyEltwiseUnaryOp +from loguru import logger +from ..common import to_torch_operands +from ....pybudaglobal import TILE_DIM +from ....tensor import buda_dataformat_to_pytorch_dtype +import numpy as np +from pybuda.op.eval.common import calculate_tile_size +from ..buda.reciprocal import Reciprocal as BudaReciprocal + + +class Reciprocal(PyEltwiseUnaryOp): + @classmethod + def create(cls): + self = cls("reciprocal") + return self + + def eval(self, tensors): + assert len(tensors) == 1, "Reciprocal should have one input" + shape = tensors[0].shape + original_types = [o.dtype for o in tensors] + + ret = torch.reciprocal(tensors[0] + 1e-10) # add epsilon to avoid infinity + + if ret.dtype != original_types[0]: + ret = ret.type(original_types[0]) + + return ret + + def shape(self, tensor_shapes): + assert len(tensor_shapes) == 1, "Reciprocal should have one input" + shape = tensor_shapes[0] + return shape, [] + + def lower(self, lc, tensors, outputs): + assert len(tensors) == 1, "Reciprocal should have one input" + approximate_mode = "true" if "PYBUDA_EXP_APPROX" in os.environ else "false" + lc.op(BudaReciprocal.create(approximate_mode=approximate_mode), tensors) + + def backward(self, ac, operand, inputs, output, grad): + assert len(inputs) == 1, "Reciprocal should have one input" + assert operand == 0, "Invalid operand index" + sq = ac.op("multiply", (output, output)) + neg = ac.op("multiply", (sq, ac.constant(-1))) + return ac.op("multiply", (neg, grad)) diff --git a/pybuda/pybuda/op/eval/pybuda/reduce.py b/pybuda/pybuda/op/eval/pybuda/reduce.py index fba96eafc..18b1070b5 100644 --- a/pybuda/pybuda/op/eval/pybuda/reduce.py +++ b/pybuda/pybuda/op/eval/pybuda/reduce.py @@ -6,6 +6,8 @@ from ....pybudaglobal import TILE_DIM, align_up_tile from ....tensor import buda_dataformat_to_pytorch_dtype from .transpose import TransposeTM +from .nop import Nop +from ..buda.nop import Nop as BudaNop import torch import numpy as np import math @@ -62,7 +64,11 @@ def shape(type, attr, ops): if not attr[2]: ret[attr[0]] = attr[1] else: - ret[attr[0]] = 1 + if isinstance(attr[0], list): + for dim in attr[0]: + ret[dim] = 1 + else: + ret[attr[0]] = 1 return tuple(ret), [] @@ -162,7 +168,7 @@ def pad_to_tile_dim(n): if reduce_len == 1 and not tile_broadcast: # Nothing to reduce - lc.op("nop", ops) + lc.op(BudaNop.create(), ops) return if reduce_len 
% TILE_DIM == 0 and type != "grouped_reduce_avg": @@ -282,7 +288,7 @@ def backward(type, attr, ac, operand, inputs, output, grad): return ac.op("multiply", [grad, mask]) if type == "reduce_sum": - return ac.op("nop", (grad, )) # the broadcast will be implicitly figured out during shape calculations + return ac.op(Nop.create(), (grad, )) # the broadcast will be implicitly figured out during shape calculations if type == "reduce_avg": dim = attr[0] @@ -333,13 +339,21 @@ def decompose(type, attr, dc, inputs): assert len(inputs) == 1, "Reduce should have one input" assert len(attr) == 1 or len(attr) == 2 and type == "reduce_max" or len(attr) == 3 and type == "grouped_reduce_avg", "Reduce should have one dim parameter, and optional stride attr OR mandatory groups attr for grouped reduce." + if isinstance(attr[0], list): + x = inputs[0] + for dim in attr[0]: + x = dc.op("reduce_avg", [x], (dim,)) + dc.fuse(x) + return + inp_shape = inputs[0].shape.as_list() if inp_shape[attr[0]] == 1: # This is a NOP - result = dc.op("nop", inputs, ()) + result = dc.op(Nop.create(), inputs, ()) dc.fuse(result) elif (type == "reduce_sum" or type == "reduce_avg"): dim = attr[0] + if dim >= 0: dim -= len(inputs[0].shape) diff --git a/pybuda/pybuda/op/eval/pybuda/resize.py b/pybuda/pybuda/op/eval/pybuda/resize.py index 19adf0c22..fd75cf7cd 100644 --- a/pybuda/pybuda/op/eval/pybuda/resize.py +++ b/pybuda/pybuda/op/eval/pybuda/resize.py @@ -7,6 +7,7 @@ from pybuda import Tensor from pybuda.op.resize import INT_TO_RESIZE2d_METHOD from .transpose import TransposeTM +from .nop import Nop from ..common import to_torch_operands from ..sparse_utils import create_nearest_neighbor_upsample_picker_matrix, create_bilinear_upsample_picker_matrix, create_nearest_neighbor_downsample_picker_matrix @@ -91,9 +92,6 @@ def shape(type, attr, ops): assert ( attr[0] % shape[-3] == 0 and attr[1] % shape[-2] == 0 ), "Only support upsample with integer scale factor" - assert ( - attr[0] // shape[-3] == attr[1] // shape[-2] - ), "Only support same scale factor for H and W" else: assert attr[1] < shape[-2], "One dim downsamples, the other dim should also downsample" assert ( @@ -110,9 +108,6 @@ def shape(type, attr, ops): assert ( attr[0] % shape[-2] == 0 and attr[1] % shape[-1] == 0 ), "Only support upsample with integer scale factor" - assert ( - attr[0] // shape[-2] == attr[1] // shape[-1] - ), "Only support same scale factor for H and W" else: assert attr[1] < shape[-1], "One dim downsamples, the other dim should also downsample" assert ( @@ -158,9 +153,6 @@ def shape(type, attr, ops): assert ( attr[0] % shape[-3] == 0 and attr[1] % shape[-2] == 0 and attr[2] % shape[-1] == 0 ), "Only support upsample with integer scale factor" - assert ( - attr[0] // shape[-3] == attr[1] // shape[-2] == attr[2] // shape[-1] - ), "Only support same scale factor for H and W" else: assert attr[1] < shape[-2], "One dim downsamples, the other dim should also downsample" assert attr[2] < shape[-1], "One dim downsamples, the other dim should also downsample" @@ -189,7 +181,9 @@ def decompose_upsample_2d(attr, dc, inputs, resize_method): if channel_last: w, y, x, cin = (shape.w, shape.z, shape.r, shape.c) activations = dc.op("reshape", [activations], (w, 1, y * x, cin)) - scale_factor = attr[0] // shape[-3] + scale_factor_y = attr[0] // shape[-3] + scale_factor_x = attr[1] // shape[-2] + scale_factor = (scale_factor_x, scale_factor_y) else: w, cin, y, x = (shape.w, shape.z, shape.r, shape.c) activations = dc.op( @@ -199,7 +193,9 @@ def decompose_upsample_2d(attr, 
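The reduce decompose above handles a list of dims by applying reduce_avg one dim at a time. For mean reductions over independent axes this matches a single joint reduction, which a dense check confirms (illustration only):

```python
import torch

x = torch.randn(2, 3, 4, 5)
dims = [-1, -2]

chained = x
for d in dims:
    chained = chained.mean(dim=d, keepdim=True)   # one reduce_avg per dim, as in the decompose
joint = x.mean(dim=dims, keepdim=True)

assert torch.allclose(chained, joint, atol=1e-6)
```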
dc, inputs, resize_method): ) activations = dc.op(TransposeTM.create(2, 3), [activations]) - scale_factor = attr[0] // shape[-2] + scale_factor_y = attr[0] // shape[-2] + scale_factor_x = attr[1] // shape[-1] + scale_factor = (scale_factor_x, scale_factor_y) if resize_method == "nearest": dident = create_nearest_neighbor_upsample_picker_matrix(scale_factor, shape, channel_last=channel_last) @@ -232,7 +228,7 @@ def decompose_upsample_2d(attr, dc, inputs, resize_method): result = dc.op("sparse_matmul", [dident_tensor, activations]) if channel_last: - result = dc.op("reshape", [result], (w, y * scale_factor, x * scale_factor, cin)) + result = dc.op("reshape", [result], (w, y * scale_factor_y, x * scale_factor_x, cin)) dc.fuse(result) else: result = dc.op(TransposeTM.create(2, 3), [result]) @@ -242,8 +238,8 @@ def decompose_upsample_2d(attr, dc, inputs, resize_method): ( w, cin, - y * scale_factor, - x * scale_factor, + y * scale_factor_y, + x * scale_factor_x, ), ) @@ -261,7 +257,10 @@ def decompose_upsample_3d(attr, dc, inputs, resize_method): w, cin, din, y, x = (shape.v, shape.w, shape.z, shape.r, shape.c) activations = dc.op("reshape", inputs, (w, 1, cin*din, y*x),) activations = dc.op(TransposeTM.create(-2, -1), [activations]) - scale_factor = attr[0] // shape[-3] + scale_factor_d = attr[0] // shape[-3] + scale_factor_y = attr[1] // shape[-2] + scale_factor_x = attr[2] // shape[-1] + scale_factor = (scale_factor_x, scale_factor_y, scale_factor_d) if resize_method == "nearest": dident_yx = create_nearest_neighbor_upsample_picker_matrix(scale_factor, shape, channel_last=channel_last) @@ -279,7 +278,7 @@ def decompose_upsample_3d(attr, dc, inputs, resize_method): # result = dc.op("reshape", [result], (w, y * scale_factor, x * scale_factor, cin)) # dc.fuse(result) #else: - result = dc.op("reshape", [result], (w, cin, din*scale_factor, y*scale_factor, x*scale_factor,),) + result = dc.op("reshape", [result], (w, cin, din * scale_factor_d, y * scale_factor_y, x * scale_factor_x,),) dc.fuse(result) @@ -328,9 +327,6 @@ def decompose_resize2d(attr, dc, inputs, resize_method): assert ( attr[0] % shape[-3] == 0 and attr[1] % shape[-2] == 0 ), "Only support upsample with integer scale factor" - assert ( - attr[0] // shape[-3] == attr[1] // shape[-2] - ), "Only support same scale factor for H and W" else: assert attr[1] < shape[-2], "One dim downsamples, the other dim should also downsample" assert ( @@ -345,9 +341,6 @@ def decompose_resize2d(attr, dc, inputs, resize_method): assert ( attr[0] % shape[-2] == 0 and attr[1] % shape[-1] == 0 ), "Only support upsample with integer scale factor" - assert ( - attr[0] // shape[-2] == attr[1] // shape[-1] - ), "Only support same scale factor for H and W" else: assert attr[1] < shape[-1], "One dim downsamples, the other dim should also downsample" assert ( @@ -357,9 +350,10 @@ def decompose_resize2d(attr, dc, inputs, resize_method): shape[-2] // attr[0] == shape[-1] // attr[1] ), "Only support same scale factor for H and W" - scale_factor = scale_factor = attr[0] // shape[-3] if channel_last else attr[0] // shape[-2] - if scale_factor == 1: - result = dc.op("nop", [activations]) + scale_factor_y = attr[0] // shape[-2] + scale_factor_x = attr[1] // shape[-1] + if scale_factor_x == 1 and scale_factor_y == 1: + result = dc.op(Nop.create(), [activations]) dc.fuse(result) return @@ -400,9 +394,6 @@ def decompose_resize3d(attr, dc, inputs, resize_method): assert ( attr[0] % shape[-3] == 0 and attr[1] % shape[-2] == 0 and attr[2] % shape[-1] == 0 ), "Only support 
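The upsample decompose now carries separate scale factors per spatial dim instead of a single shared factor. For the nearest-neighbor path, the sparse picker it builds behaves like repeating rows and columns independently; the dense comparison below is purely for illustration:

```python
import torch
import torch.nn.functional as F

x = torch.randn(1, 3, 4, 6)
scale_factor_y, scale_factor_x = 2, 3     # independent H and W factors

up = x.repeat_interleave(scale_factor_y, dim=-2).repeat_interleave(scale_factor_x, dim=-1)
ref = F.interpolate(x, scale_factor=(scale_factor_y, scale_factor_x), mode="nearest")

assert torch.equal(up, ref)
print(up.shape)   # torch.Size([1, 3, 8, 18])
```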
upsample with integer scale factor" - assert ( - attr[0] // shape[-3] == attr[1] // shape[-2] == attr[2] // shape[-1] - ), "Only support same scale factor for H and W" else: assert attr[1] < shape[-2], "One dim downsamples, the other dim should also downsample" assert attr[2] < shape[-1], "One dim downsamples, the other dim should also downsample" @@ -413,9 +404,11 @@ def decompose_resize3d(attr, dc, inputs, resize_method): shape[-3] // attr[0] == shape[-2] // attr[1] == shape[-1] // attr[2] ), "Only support same scale factor for H and W" - scale_factor = scale_factor = attr[0] // shape[-4] if channel_last else attr[0] // shape[-3] - if scale_factor == 1: - result = dc.op("nop", [activations]) + scale_factor_d = attr[0] // shape[-3] + scale_factor_y = attr[1] // shape[-2] + scale_factor_x = attr[2] // shape[-1] + if scale_factor_x == 1 and scale_factor_y == 1 and scale_factor_d == 1: + result = dc.op(Nop.create(), [activations]) dc.fuse(result) return diff --git a/pybuda/pybuda/op/eval/pybuda/sqrt.py b/pybuda/pybuda/op/eval/pybuda/sqrt.py new file mode 100644 index 000000000..770e51b23 --- /dev/null +++ b/pybuda/pybuda/op/eval/pybuda/sqrt.py @@ -0,0 +1,68 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +import os + +import torch +import torch.nn.functional +from ..interface import PyEltwiseUnaryOp +from loguru import logger +from ..common import to_torch_operands +from ....pybudaglobal import TILE_DIM +from ....tensor import buda_dataformat_to_pytorch_dtype +import numpy as np +from pybuda.op.eval.common import calculate_tile_size +from ..buda.sqrt import Sqrt as BudaSqrt +from .reciprocal import Reciprocal + + +class Sqrt(PyEltwiseUnaryOp): + @classmethod + def create(cls): + self = cls("sqrt") + return self + + def eval(self, tensors): + assert len(tensors) == 1, "Sqrt should have one input" + shape = tensors[0].shape + original_types = [o.dtype for o in tensors] + + ret = torch.sqrt(tensors[0]) + + if ret.dtype != original_types[0]: + ret = ret.type(original_types[0]) + + return ret + + def shape(self, tensor_shapes): + assert len(tensor_shapes) == 1, "Sqrt should have one input" + shape = tensor_shapes[0] + return shape, [] + + def backward(self, ac, operand, inputs, output, grad): + assert len(inputs) == 1, "Sqrt should have one input" + assert operand == 0, "Invalid operand index" + rec = ac.op(Reciprocal.create(), (output,)) + mult = ac.op("multiply", (rec, ac.constant(0.5))) + return ac.op("multiply", (mult, grad)) + + def lower(self, lc, tensors, outputs): + assert len(tensors) == 1, "Sqrt should have one input" + + if bool(int(os.environ.get("PYBUDA_ENABLE_TINY_TILE", "0"))): + node_shape = list(tensors[0].shape) + tile_height = calculate_tile_size(node_shape[-2]) + tile_width = calculate_tile_size(node_shape[-1]) + else: + tile_height, tile_width = TILE_DIM, TILE_DIM + + lc.op(BudaSqrt.create(), tensors, tile_height=tile_height, tile_width=tile_width) + + def initial_flops_estimate(self, tensor_shapes): + flops = 0 + output_shape = self.shape(tensor_shapes)[0] + flops = np.prod(output_shape) + + return flops + \ No newline at end of file diff --git a/pybuda/pybuda/op/eval/pybuda/tanh.py b/pybuda/pybuda/op/eval/pybuda/tanh.py new file mode 100644 index 000000000..4e5d837e1 --- /dev/null +++ b/pybuda/pybuda/op/eval/pybuda/tanh.py @@ -0,0 +1,71 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +import os + +import torch +import torch.nn.functional +from ..interface import 
PyEltwiseUnaryOp +from loguru import logger +from ..common import to_torch_operands +from ....pybudaglobal import TILE_DIM +from ....tensor import buda_dataformat_to_pytorch_dtype +import numpy as np +from pybuda.op.eval.common import calculate_tile_size +from ..buda.tanh import Tanh as BudaTanh + + +class Tanh(PyEltwiseUnaryOp): + @classmethod + def create(cls): + self = cls("tanh") + return self + + def eval(self, tensors): + assert len(tensors) == 1, "Tanh should have one input" + shape = tensors[0].shape + original_types = [o.dtype for o in tensors] + ret = torch.tanh(tensors[0]) + + if ret.dtype != original_types[0]: + ret = ret.type(original_types[0]) + + return ret + + def shape(self, tensor_shapes): + assert len(tensor_shapes) == 1, "Tanh should have one input" + shape = tensor_shapes[0] + return shape, [] + + def backward(self, ac, operand, inputs, output, grad): + tanh_square = ac.op("multiply", (output, output)) + subtract = ac.op("subtract", (ac.constant(1), tanh_square)) + res = ac.op("multiply", (subtract, grad)) + return res + + def lower(self, lc, tensors, outputs): + assert len(tensors) == 1, "Tanh should have one input" + + if bool(int(os.environ.get("PYBUDA_ENABLE_TINY_TILE", "0"))): + node_shape = list(tensors[0].shape) + tile_height = calculate_tile_size(node_shape[-2]) + tile_width = calculate_tile_size(node_shape[-1]) + vector = "" if tile_height == TILE_DIM else "r" + else: + vector = None + tile_height, tile_width = TILE_DIM, TILE_DIM + + lc.op( + BudaTanh.create(vector=vector), + tensors, + tile_height=tile_height, + tile_width=tile_width, + ) + + def initial_flops_estimate(self, tensor_shapes): + flops = 0 + output_shape = self.shape(tensor_shapes)[0] + flops = np.prod(output_shape) + + return flops diff --git a/pybuda/pybuda/op/eval/pybuda/tilizer.py b/pybuda/pybuda/op/eval/pybuda/tilizer.py new file mode 100644 index 000000000..3a4f18ee2 --- /dev/null +++ b/pybuda/pybuda/op/eval/pybuda/tilizer.py @@ -0,0 +1,45 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +import os + +import torch +import torch.nn.functional +from ..interface import PyEltwiseUnaryOp +from loguru import logger +from ..common import to_torch_operands +from ....pybudaglobal import TILE_DIM +from ....tensor import buda_dataformat_to_pytorch_dtype +import numpy as np +from pybuda.op.eval.common import calculate_tile_size +from .nop import Nop + +class Tilizer(PyEltwiseUnaryOp): + @classmethod + def create(cls): + self = cls("tilizer") + return self + + def eval(self, tensors): + assert len(tensors) == 1, "Tilizer should have one input" + shape = tensors[0].shape + original_types = [o.dtype for o in tensors] + ret = torch.tensors[0] + + if ret.dtype != original_types[0]: + ret = ret.type(original_types[0]) + + return ret + + def shape(self, tensor_shapes): + assert len(tensor_shapes) == 1, "Tilizer should have one input" + shape = tensor_shapes[0] + return shape, [] + + def backward(self, ac, operand, inputs, output, grad): + assert len(inputs) == 1, "Tilizer should have one input" + assert operand == 0, "Invalid operand index" + + return ac.op(Nop.create(), (grad, )) + \ No newline at end of file diff --git a/pybuda/pybuda/op/eval/pybuda/tm.py b/pybuda/pybuda/op/eval/pybuda/tm.py index 77c749b7e..26c019fc8 100644 --- a/pybuda/pybuda/op/eval/pybuda/tm.py +++ b/pybuda/pybuda/op/eval/pybuda/tm.py @@ -29,9 +29,11 @@ from pybuda.tensor import change_rank from pybuda.pybudaglobal import TILE_DIM from pybuda.utils import align_up_tile, round_up_div, 
align_up -from pybuda._C.balancer import FactorizedInt from .transpose import TransposeTM from ..buda.splice import Splice +from .nop import Nop +from ..buda.nop import Nop as BudaNop +from .buffer import Buffer def eval(type, attr, ops): assert len(ops) == 1 or (type == "adv_index" and len(ops) == 2), f"Tensor manipulation ops should have one input {len(ops)} {attr}" @@ -53,7 +55,7 @@ def eval(type, attr, ops): zero_shape[dim] = 1 zero_slice = torch.zeros(zero_shape, dtype=dtype).squeeze(dim) result = [] - for offset in range(0, t_ops[0].shape[dim], stride): + for offset in range(0, t_ops[0].shape[dim] - begin, stride): for i in range(begin, begin + length): if offset + i < t_ops[0].shape[dim] or stride == t_ops[0].shape[dim]: result.append(t_ops[0].select(dim, offset + i)) @@ -90,7 +92,9 @@ def eval(type, attr, ops): if dim >= 0: dim -= len(ops[0].shape) - if dim == -4: + if dim == -5: + return t_ops[0][..., start:stop:stride, :, :, :, :] + elif dim == -4: return t_ops[0][..., start:stop:stride, :, :, :] elif dim == -3: return t_ops[0][..., start:stop:stride, :, :] @@ -459,7 +463,7 @@ def shape(type, attr, ops): assert len(attr) == 4, "Select should have 4 attributes" dim, begin, length, stride = attr shape = list(ops[0]) - shape[dim] = length * round_up_div(shape[dim], stride) + shape[dim] = length * round_up_div(shape[dim] - begin, stride) return tuple(shape), [] if type == "gather": @@ -731,7 +735,7 @@ def lower(type, attr, lc, ops, outputs): # Squeeze / unsqueeze ops that do not reshape a 4d tensor are nops if all([orig == new for orig, new in zip(orig_shape, attr)]): - lc.op("nop", ops, [], {}) + lc.op(BudaNop.create(), ops) else: orig_w = orig_shape[-4] orig_z = orig_shape[-3] @@ -826,7 +830,7 @@ def lower(type, attr, lc, ops, outputs): ) elif type == "pad_tile": - return lc.op("nop", ops, [], {}) + return lc.op(BudaNop.create(), ops) elif type == "narrow": assert len(attr) == 4 @@ -834,7 +838,7 @@ def lower(type, attr, lc, ops, outputs): if dim >= 0: dim -= len(ops[0].shape) if dim >= -2 and align_up_tile(length) == align_up_tile(ops[0].shape[dim]): - return lc.op("nop", ops, [], {}) + return lc.op(BudaNop.create(), ops) else: raise NotImplementedError("Unimplemented narrow in buda") @@ -842,7 +846,7 @@ def lower(type, attr, lc, ops, outputs): assert ((len(attr) == 4 and attr[0] == 0) or (len(attr) == 6 and attr[0] == 0 and attr[2] == 0) or (attr[-2] != 0)), "Nop does not support left/top padding for constant mode" - return lc.op("nop", ops, [], {}) + return lc.op(BudaNop.create(), ops) elif type == "unsqueeze": assert len(attr) == 2 @@ -850,15 +854,20 @@ def lower(type, attr, lc, ops, outputs): #assert input_ndim + 1 <= 4, "Cannot unsqueeze beyond 4D" if input_ndim + 1 > 4: assert attr[0] == 0, f"Unsqueeze 4D tensors to 5D is only supported for the 1st dim: {attr[0]}" - return lc.op("nop", ops, ["unsqueeze", attr[1]], {}, tag="dont_remove") - return lc.op("nop", ops, [], {}) + return lc.op(BudaNop.create(unsqueeze = "unsqueeze", unsqueeze_dim=attr[1]), ops, tag="dont_remove") + + return lc.op(BudaNop.create(), ops) elif type == "squeeze": assert len(attr) == 1 - return lc.op("nop", ops, [], {}) + if len(ops[0].shape) >= 5: + assert attr[0] == 0, f"Squeeze 5D tensors to 4D is only supported for the 1st dim: {attr[0]}" + return lc.op(BudaNop.create(squeeze="squeeze", squeeze_dim=attr[0]), ops, tag="dont_remove") + + return lc.op(BudaNop.create(), ops) elif (type == "hstack" or type == "hslice") and attr[0] == 1: - return lc.op("nop", ops, [], {}) + return lc.op(BudaNop.create(), 
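The select fix above makes both the eval loop and the shape formula account for `begin`: offsets step through shape[dim] - begin, and the output length becomes length * round_up_div(shape[dim] - begin, stride). A small sketch of those semantics (illustration only; it mirrors the happy path of the eval loop and skips the zero-padding branch):

```python
import torch
from math import ceil

def select(x: torch.Tensor, dim: int, begin: int, length: int, stride: int) -> torch.Tensor:
    picks = []
    for offset in range(0, x.shape[dim] - begin, stride):
        for i in range(begin, begin + length):
            if offset + i < x.shape[dim] or stride == x.shape[dim]:
                picks.append(x.select(dim, offset + i))
    return torch.stack(picks, dim=dim)

x = torch.arange(12.0).reshape(1, 12)
out = select(x, dim=-1, begin=2, length=2, stride=4)
print(out)                                      # elements at indices 2, 3, 6, 7, 10, 11
assert out.shape[-1] == 2 * ceil((12 - 2) / 4)  # length * round_up_div(shape[dim] - begin, stride)
```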
ops) elif type == "buda_pad": return lc.tm("buda_pad", ops[0], attr, { "rt": attr[0], "ct": attr[1], "pad_value": attr[2]}) @@ -1009,7 +1018,7 @@ def backward(type, attr, ac, operand, inputs, output, grad): dim = attr[0] if grad.shape.len() == 4: # Cannot unsqueeze beyond 4D - return ac.op("nop", (grad,), attributes=()) + return ac.op(Nop.create(), (grad,)) return ac.op("unsqueeze", (grad,), attributes=(dim, grad.shape.len())) elif type == "broadcast": @@ -1084,11 +1093,11 @@ def decompose(type, attr, dc, inputs): return elif start % TILE_DIM == 0 and stop % TILE_DIM == 0 and stride == 1 and act.shape[dim] % TILE_DIM == 0: result = dc.op("select", [act], (dim, start, length, act.shape[dim])) - result = dc.op("buffer", [result]) # Workaround to enable T-streaming for Splice + result = dc.op(Buffer.create(), [result]) # Workaround to enable T-streaming for Splice dc.fuse(result) return elif act.shape[dim] == 1 and length == 1 and stride == 1: - result = dc.op("nop", [inputs[0]], ()) + result = dc.op(Nop.create(), [inputs[0]]) dc.fuse(result) return elif dim == -2 and stride == 1 and length == stop and "PYBUDA_PAD_MM" in os.environ: @@ -1147,7 +1156,7 @@ def decompose(type, attr, dc, inputs): in0_shape = inputs[0].shape in1_shape = inputs[1].shape if len(in0_shape) == 1 or in0_shape[dim] == 1: - result = dc.op("nop", [inputs[0]], ()) + result = dc.op(Nop.create(), [inputs[0]]) dc.fuse(result) return if dim == 0 and len(in1_shape) <= 2: @@ -1161,7 +1170,7 @@ def decompose(type, attr, dc, inputs): if type == "pad": if all([x == 0 for x in attr[0:-2]]): # Pad size is 0 - result = dc.op("nop", [inputs[0]], ()) + result = dc.op(Nop.create(), [inputs[0]]) dc.fuse(result) @@ -1327,7 +1336,7 @@ def decompose(type, attr, dc, inputs): if type == "broadcast": if attr[1] == 1: - dc.fuse(dc.op("nop", [inputs[0]])) + dc.fuse(dc.op(Nop.create(), [inputs[0]])) if type == "transpose": # canonicalize dims to use negative indexing @@ -1426,7 +1435,7 @@ def decompose_select(attr, dc, inputs): result = inputs[0] if orig_shape[dim] == length: - result = dc.op("nop", [result]) + result = dc.op(Nop.create(), [result]) dc.fuse(result) # select on z dim is supported via splice @@ -1503,17 +1512,17 @@ def decompose_xy_flatten_reshape(inputs, dc, orig_shape, attr): if orig_shape[-3] > 1: result = dc.op("hslice", [result], (orig_shape[-3],)) - # result = dc.op("buffer", [result]) # HW workaround for: tenstorrent/budabackend#656 + # result = dc.op(Buffer.create(), [result]) # HW workaround for: tenstorrent/budabackend#656 rt = align_up_tile(r_new) // TILE_DIM if pad_for_factrization: rt = sparse_r_padding[orig_shape[-2]] result = dc.op("vslice", [result], (rt,)) - # result = dc.op("buffer", [result]) # HW workaround for: tenstorrent/budabackend#656 + # result = dc.op(Buffer.create(), [result]) # HW workaround for: tenstorrent/budabackend#656 result = dc.op("hstack", [result], (rt,)) if orig_shape[-3] > 1: - # result = dc.op("buffer", [result]) # HW workaround for: tenstorrent/budabackend#656 + # result = dc.op(Buffer.create(), [result]) # HW workaround for: tenstorrent/budabackend#656 result = dc.op("vstack", [result], (orig_shape[-3],)) @@ -1763,7 +1772,7 @@ def decompose_post_optimize(type, attr, dc, inputs): rt = align_up_tile(r_new) // TILE_DIM result = dc.op("vslice", [result], (rt,)) - result = dc.op("buffer", [result]) # HW workaround for: tenstorrent/budabackend#656 + result = dc.op(Buffer.create(), [result]) # HW workaround for: tenstorrent/budabackend#656 result = dc.op("hstack", [result], (rt,)) result = 
dc.op(TransposeTM.create(-2, -1), [result]) @@ -2090,6 +2099,25 @@ def decompose_post_optimize(type, attr, dc, inputs): result = dc.op("narrow", [result], (-1, 0, attr[-1], result.shape[-1])) result = dc.op("narrow", [result], (-2, 0, attr[-2], result.shape[-2])) + + + elif len(orig_shape) == 4 and len(orig_attr) < 4 and orig_shape[-4] == 1 and orig_shape[-3] * orig_shape[-2] == attr[-3] * attr[-2] and orig_shape[-1] == attr[-1] and orig_shape[-3] % attr[-3] == 0: + # example: (1, 1024, 4, 128) -> (1, 128, 32, 128) + # in vstack op, slice size is equal to orig_shape[-3] / attr[-3], hence the constraint orig_shape[-3] % attr[-3] == 0 + # TODO: this is a temporary solution for decomposing reshape in pytorch implementation of Grouped Query Attention + # in this particular case, padding is not needed, while adding padding would increase the input tensor size 8 times (dim -2 going from 4 to 32) + # in the future, hard constraint of divisibility of R dim by TILE_DIM should be removed, but removing it currently causes some models to fail compilation + # hint: look into function convert_reshape_into_vslice_or_vstack_if_possible + result = dc.op('vstack', [result], (orig_shape[-3] // attr[-3],)) + + elif len(orig_shape) == 4 and len(attr) == 4 and orig_shape[-4] == attr[-4] and orig_shape[-1] == attr[-1] and attr[-3] == orig_shape[-2] * orig_shape[-3] and attr[-2] == 1 and attr[-3] % orig_shape[-3] == 0: + # example: (1, 6, 8, 128) -> (1, 48, 1, 128) + # in vslice op, slice size is equal to attr[-3] / orig_shape[-3], hence the constraint attr[-3] % orig_shape[-3] == 0 + # TODO: this is a temporary solution for decomposing reshape in pytorch implementation of Grouped Query Attention + # in this particular case, padding is not needed, while adding padding would increase the resulting tensor size 32 times (dim -2 going from 1 to 32) + # in the future, hard constraint of divisibility of R dim by TILE_DIM should be removed, but removing it currently causes some models to fail compilation + # hint: look into function convert_reshape_into_vslice_or_vstack_if_possible + result = dc.op('vslice', [result], (orig_shape[-2],)) elif (len(orig_shape) < 4 or (len(orig_shape) == 4 and orig_shape[0] == 1)) \ and (len(attr) < 4 or (len(attr) == 4 and attr[0] == 1)): # General reshape (only support for w == 1) @@ -2360,7 +2388,7 @@ def decompose_post_autograd(type, attr, dc, inputs): shape = list(attr) if shape == input_shape: - #dc.fuse(dc.op("nop", [inputs[0]])) + #dc.fuse(dc.op(Nop.create(), [inputs[0]])) return rank = 0 diff --git a/pybuda/pybuda/op/eval/sparse_utils.py b/pybuda/pybuda/op/eval/sparse_utils.py index 8dac940ef..59986161e 100644 --- a/pybuda/pybuda/op/eval/sparse_utils.py +++ b/pybuda/pybuda/op/eval/sparse_utils.py @@ -653,11 +653,11 @@ def create_nearest_neighbor_upsample_picker_matrix( if for_din: raise RuntimeError("Resize3d is not supported in channel-last format yet") - rows = torch.arange(shape[-3] * scale_factor * shape[-2] * scale_factor) + rows = torch.arange(shape[-3] * scale_factor[0] * shape[-2] * scale_factor[1]) cols = [] for i in range(shape[-3]): col = ( - torch.arange(shape[-2]).repeat_interleave(scale_factor).repeat(scale_factor) + torch.arange(shape[-2]).repeat_interleave(scale_factor[0]).repeat(scale_factor[1]) + i * (shape[-2]) ) cols.append(col) @@ -675,12 +675,12 @@ def create_nearest_neighbor_upsample_picker_matrix( ) else: if for_din: - rows = torch.arange(shape[-3] * scale_factor * shape[-4]) + rows = torch.arange(shape[-3] * scale_factor[2] * shape[-4]) #cols = 
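The two new Grouped Query Attention branches map the reshape onto vstack/vslice. Assuming vstack(k) folds k consecutive Z slices into the R dim in row-major order and vslice is its inverse, the examples in the comments correspond to these plain-torch equivalences (sketch only):

    import torch

    # vstack branch: (1, 1024, 4, 128) -> (1, 128, 32, 128), slice size 1024 // 128 = 8
    x = torch.randn(1, 1024, 4, 128)
    groups = x.reshape(1, 128, 8, 4, 128)           # 8 consecutive Z slices per output Z index
    vstacked = torch.cat([groups[:, :, i] for i in range(8)], dim=-2)
    assert torch.equal(vstacked, x.reshape(1, 128, 32, 128))

    # vslice branch: (1, 6, 8, 128) -> (1, 48, 1, 128), slice size 48 // 6 = 8
    y = torch.randn(1, 6, 8, 128)
    rows = [y[:, z, r] for z in range(6) for r in range(8)]   # Z-major, then R
    vsliced = torch.stack(rows, dim=1).unsqueeze(-2)
    assert torch.equal(vsliced, y.reshape(1, 48, 1, 128))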
torch.arange(shape[-3]).repeat_interleave(scale_factor) cols = [] for i in range(shape[-4]): col = ( - torch.arange(shape[-3]).repeat_interleave(scale_factor) + torch.arange(shape[-3]).repeat_interleave(scale_factor[2]) + i * shape[-3] ) cols.append(col) @@ -688,11 +688,11 @@ def create_nearest_neighbor_upsample_picker_matrix( sparse_r = rows.shape[0] sparse_c = shape[-3] * shape[-4] else: - rows = torch.arange(shape[-2] * scale_factor * shape[-1] * scale_factor) + rows = torch.arange(shape[-2] * scale_factor[0] * shape[-1] * scale_factor[1]) cols = [] for i in range(shape[-2]): col = ( - torch.arange(shape[-1]).repeat_interleave(scale_factor).repeat(scale_factor) + torch.arange(shape[-1]).repeat_interleave(scale_factor[0]).repeat(scale_factor[1]) + i * shape[-1] ) cols.append(col) @@ -759,23 +759,23 @@ def create_bilinear_upsample_picker_matrix( # Final dident shape num_cols = r * c - num_rows = num_cols * scale_factor * scale_factor + num_rows = num_cols * scale_factor[0] * scale_factor[1] - upsample_c_idx = torch.arange(0, c * scale_factor) - upsample_r_idx = torch.arange(0, r * scale_factor) + upsample_c_idx = torch.arange(0, c * scale_factor[0]) + upsample_r_idx = torch.arange(0, r * scale_factor[1]) if align_corners: upsample_c_idx_adjusted = up_idx_to_orig_idx_align_corners( - upsample_c_idx, c, c * scale_factor + upsample_c_idx, c, c * scale_factor[0] ) upsample_r_idx_adjusted = up_idx_to_orig_idx_align_corners( - upsample_r_idx, r, r * scale_factor + upsample_r_idx, r, r * scale_factor[1] ) else: upsample_c_idx_adjusted = up_idx_to_orig_idx_no_align_corners( - upsample_c_idx, scale_factor + upsample_c_idx, scale_factor[0] ) upsample_r_idx_adjusted = up_idx_to_orig_idx_no_align_corners( - upsample_r_idx, scale_factor + upsample_r_idx, scale_factor[1] ) # Clip index between 0 and c @@ -1544,9 +1544,9 @@ def conv2d_out_shape(type, attr, ops): ) if channel_last == 1: - return (activations[0], y, x, weights[1]), [] + return (activations[0], y, x, weights[1] * groups), [] else: - return (activations[0], weights[1], y, x), [] + return (activations[0], weights[1] * groups, y, x), [] def conv3d_out_shape(type, attr, ops): assert len(ops) <= 3, "Conv3d should have three inputs" diff --git a/pybuda/pybuda/op/nn.py b/pybuda/pybuda/op/nn.py index 6187ed018..4dfe9282c 100644 --- a/pybuda/pybuda/op/nn.py +++ b/pybuda/pybuda/op/nn.py @@ -158,24 +158,29 @@ def Batchnorm( Buda tensor """ + # NOTE: the decomposition below does not assume training context (running_mean/var update is not included) + batchnorm_flag = True if name == "": name = f"batchnorm_{get_unique_node_id()}" - running_mean = Unsqueeze(name + "_mean_unsqueeze_1", running_mean, 1) - running_mean = Unsqueeze(name + "_mean_unsqueeze_2", running_mean, 1) - running_var = Unsqueeze(name + "_var_unsqueeze_1", running_var, 1) - running_var = Unsqueeze(name + "_var_unsqueeze_2", running_var, 1) - - weights = Unsqueeze(name + "_weights_unsqueeze_1", weights, 1) - weights = Unsqueeze(name + "_weights_unsqueeze_2", weights, 1) - bias = Unsqueeze(name + "_bias_unsqueeze_1", bias, 1) - bias = Unsqueeze(name + "_bias_unsqueeze_2", bias, 1) - epsilon_constant = Constant(name + "_eps", constant=epsilon) - x_minus_mean = Subtract(name + "_sub", operandA, running_mean) - var_plus_eps = Add(name + "_var_plus_eps", running_var, epsilon_constant) - recip = Reciprocal(name + "_recip", Sqrt(name + "_sqrt", var_plus_eps)) - out = Multiply(name + "_output", x_minus_mean, recip) - return Add(name + "_bias", Multiply(name + "_weights", out, weights), bias) + 
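The conv2d_out_shape fix multiplies the weight's second dim by groups when reporting output channels; this presumably means the internal layout stores per-group output channels in weights[1] (an assumption — PyTorch itself keeps the full output-channel count in weight dim 0). The underlying counting rule, checked in plain PyTorch:

    import torch

    # With G groups, each group contributes out_channels // G feature maps,
    # so the total output channel count is (per-group out channels) * G.
    G = 4
    conv = torch.nn.Conv2d(in_channels=8, out_channels=16, kernel_size=3, padding=1, groups=G)
    out = conv(torch.randn(1, 8, 32, 32))
    per_group = conv.weight.shape[0] // G           # PyTorch keeps all 16 filters in weight dim 0
    assert out.shape[1] == per_group * G == 16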
if batchnorm_flag: + return op("batchnorm", name, operandA, weights, bias, running_mean, running_var, attrs=(epsilon,)).get_tensor() + else: + running_mean = Unsqueeze(name + "_mean_unsqueeze_1", running_mean, 1) + running_mean = Unsqueeze(name + "_mean_unsqueeze_2", running_mean, 1) + running_var = Unsqueeze(name + "_var_unsqueeze_1", running_var, 1) + running_var = Unsqueeze(name + "_var_unsqueeze_2", running_var, 1) + + weights = Unsqueeze(name + "_weights_unsqueeze_1", weights, 1) + weights = Unsqueeze(name + "_weights_unsqueeze_2", weights, 1) + bias = Unsqueeze(name + "_bias_unsqueeze_1", bias, 1) + bias = Unsqueeze(name + "_bias_unsqueeze_2", bias, 1) + epsilon_constant = Constant(name + "_eps", constant=epsilon) + x_minus_mean = Subtract(name + "_sub", operandA, running_mean) + var_plus_eps = Add(name + "_var_plus_eps", running_var, epsilon_constant) + recip = Reciprocal(name + "_recip", Sqrt(name + "_sqrt", var_plus_eps)) + out = Multiply(name + "_output", x_minus_mean, recip) + return Add(name + "_bias", Multiply(name + "_weights", out, weights), bias) class Linear(PyBudaModule): diff --git a/pybuda/pybuda/optimizers.py b/pybuda/pybuda/optimizers.py index ec266372a..8e25086b5 100644 --- a/pybuda/pybuda/optimizers.py +++ b/pybuda/pybuda/optimizers.py @@ -16,7 +16,6 @@ import pybuda.torch_optimizers from pybuda.torch_optimizers import AdamNoBiasCorrection - class Optimizer: """ Optimizer base class @@ -327,9 +326,10 @@ def generate_op_trace(self, ac, parameter, gradient): weight_decay = ac.constant(self.weight_decay) else: weight_decay = None - + ## import locally to avoid circular dependency from Dataformat, fix it later + from pybuda.op.eval.pybuda.buffer import Buffer # we copy the grad accum. queue since it only accepts a single consumer/pop - gradient_copy = ac.op("buffer", (gradient,)) + gradient_copy = ac.op(Buffer.create(), (gradient,)) if weight_decay and not self.enable_adam_w: weight_decay_times_param = ac.op("multiply", (weight_decay, parameter)) @@ -356,32 +356,35 @@ def generate_op_trace(self, ac, parameter, gradient): updated_variance = ac.op( "add", (variance_times_beta2, gradient_squared_times_one_minus_beta2) ) + from pybuda.op.eval.pybuda.reciprocal import Reciprocal + #import Sqrt module locally to avoid circular dependency + from pybuda.op.eval.pybuda.sqrt import Sqrt if self.bias_correction: # bias_correction1 = 1 - beta1 ** step beta1_one = ac.constant(1.0) beta1_pow = ac.input("beta1_pow", (1,), disable_consteval=True) # stores beta1 ** step updated_beta1_pow = ac.op("multiply", (beta1_pow, beta1)) bias_correction1 = ac.op("subtract", (beta1_one, updated_beta1_pow)) - reciprocal_bias_correction1 = ac.op("reciprocal", (bias_correction1,)) + reciprocal_bias_correction1 = ac.op(Reciprocal.create(), (bias_correction1,)) # bias_correction2 = 1 - beta2 ** step beta2_one = ac.constant(1.0) beta2_pow = ac.input("beta2_pow", (1,), disable_consteval=True) # stores beta2 ** step updated_beta2_pow = ac.op("multiply", (beta2_pow, beta2)) bias_correction2 = ac.op("subtract", (beta2_one, updated_beta2_pow)) - sqrt_bias_correction2 = ac.op("sqrt", (bias_correction2,)) - reciprocal_sqrt_bias_correction2 = ac.op("reciprocal", (sqrt_bias_correction2,)) + sqrt_bias_correction2 = ac.op(Sqrt.create(), (bias_correction2,)) + reciprocal_sqrt_bias_correction2 = ac.op(Reciprocal.create(), (sqrt_bias_correction2,)) # sqrt_of_variance / sqrt_bias_correction2 - sqrt_of_variance_biased = ac.op("sqrt", (updated_variance,)) + sqrt_of_variance_biased = ac.op(Sqrt.create(), 
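The non-batchnorm_flag branch keeps the explicit inference-time decomposition: (x - mean) * 1/sqrt(var + eps) * weight + bias. A quick algebra check against torch.nn.functional.batch_norm (a sketch; the repeated Unsqueeze calls above provide the same broadcasting that reshape does here):

    import torch
    import torch.nn.functional as F

    x = torch.randn(2, 4, 8, 8)
    weight, bias = torch.randn(4), torch.randn(4)
    running_mean, running_var = torch.randn(4), torch.rand(4) + 0.5
    eps = 1e-5

    bcast = lambda t: t.reshape(1, -1, 1, 1)        # broadcast per-channel stats over H, W
    decomposed = (x - bcast(running_mean)) * torch.rsqrt(bcast(running_var) + eps) \
                 * bcast(weight) + bcast(bias)

    reference = F.batch_norm(x, running_mean, running_var, weight, bias, training=False, eps=eps)
    assert torch.allclose(decomposed, reference, atol=1e-5)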
(updated_variance,)) sqrt_of_variance = ac.op("multiply", (sqrt_of_variance_biased, reciprocal_sqrt_bias_correction2)) else: - sqrt_of_variance = ac.op("sqrt", (updated_variance,)) + sqrt_of_variance = ac.op(Sqrt.create(), (updated_variance,)) epsilon = ac.constant(self.epsilon) sqrt_of_variance_plus_epsilon = ac.op("add", (sqrt_of_variance, epsilon)) reciprocal_of_sqrt_of_variance_plus_epsilon = ac.op( - "reciprocal", (sqrt_of_variance_plus_epsilon,) + Reciprocal.create(), (sqrt_of_variance_plus_epsilon,) ) if self.bias_correction: @@ -648,7 +651,9 @@ def generate_op_trace(self, ac, parameter, gradient): # gradinet buffering # g(t) -> gradient at current timestep - grad = ac.op("buffer", (gradient, )) + #temp fix to avoid circular dependency by importing locally + from pybuda.op.eval.pybuda.buffer import Buffer + grad = ac.op(Buffer.create(), (gradient, )) # m(t) <- beta1 * m(t - 1) + (1 - beta1) * g(t) # m(t) : mean at current timestep @@ -689,15 +694,19 @@ def generate_op_trace(self, ac, parameter, gradient): if len(phi_norm_shape) > 1: phi_norm = ac.op("reduce_sum", (phi_norm, ), (-2, )) phi_norm = ac.op("reduce_sum", (phi_norm, ), (-1, )) - phi_norm = ac.op("sqrt", (phi_norm, )) + + #importing locally to avoid circular dependency from Dataformats + from pybuda.op.eval.pybuda.sqrt import Sqrt + phi_norm = ac.op(Sqrt.create(), (phi_norm, )) epsilon = ac.tensor(torch.zeros(param_shape) + self.eps) weight_decay = ac.tensor(torch.zeros(param_shape) + self.weight_decay) # adam ratio, ratio of corrected mean and corrected variance stabilized with epsilon - r_t = ac.op("sqrt", (updated_variance, )) + r_t = ac.op(Sqrt.create(), (updated_variance, )) r_t = ac.op("add", (r_t, epsilon)) - r_t = ac.op("multiply", (updated_mean, ac.op("reciprocal", (r_t, )))) + from pybuda.op.eval.pybuda.reciprocal import Reciprocal + r_t = ac.op("multiply", (updated_mean, ac.op(Reciprocal.create(), (r_t, )))) if self.weight_decay != 0: decayed_param = ac.op("multiply", (parameter, weight_decay)) @@ -708,7 +717,7 @@ def generate_op_trace(self, ac, parameter, gradient): if len(r_t_norm_shape) > 1: r_t_norm = ac.op("reduce_sum", (r_t_norm, ), (-2, )) r_t_norm = ac.op("reduce_sum", (r_t_norm, ), (-1, )) - r_t_norm = ac.op("sqrt", (r_t_norm, )) + r_t_norm = ac.op(Sqrt.create(), (r_t_norm, )) # # IF phi_norm != 0 AND r_t_norm != 0: @@ -737,9 +746,10 @@ def generate_op_trace(self, ac, parameter, gradient): phi_norm_eq = ac.op("equal", (phi_norm, zero)) r_t_norm_ne = ac.op("not_equal", (r_t_norm, zero)) r_t_norm_eq = ac.op("equal", (r_t_norm, zero)) - trust_ratio = ac.op("reciprocal", (r_t_norm, )) + trust_ratio = ac.op(Reciprocal.create(), (r_t_norm, )) trust_ratio = ac.op("multiply", (phi_norm, trust_ratio)) - trust_ratio = ac.op("clip", (trust_ratio, ), (self.clip_value[0], self.clip_value[1])) + from pybuda.op.eval.pybuda.clip import Clip + trust_ratio = ac.op(Clip.create(min=self.clip_value[0], max=self.clip_value[1]), (trust_ratio, )) trust_ratio = ac.op("multiply", (trust_ratio, r_t_norm_ne)) trust_ratio = ac.op("add", (trust_ratio, r_t_norm_eq)) trust_ratio = ac.op("multiply", (trust_ratio, phi_norm_ne)) @@ -960,7 +970,9 @@ def generate_op_trace(self, ac, parameter, gradient): # gradinet buffering # g(t) -> gradient at current timestep - grad = ac.op("buffer", (gradient, )) + #temp fix for circular dependency + from pybuda.op.eval.pybuda.buffer import Buffer + grad = ac.op(Buffer.create(), (gradient, )) # lambda <- || w(t) || / (|| g(t) || + beta * || w(t) ||) weight_norm = ac.op("multiply", (parameter, 
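For reference, the bias-corrected Adam math that the op trace above assembles from Sqrt/Reciprocal ops, written as a plain host-side loop (sketch only; weight decay and the adam_w variant are omitted):

    import torch

    lr, beta1, beta2, eps = 1e-3, 0.9, 0.999, 1e-8
    param, grad = torch.tensor([1.0]), torch.tensor([0.5])
    m, v = torch.zeros(1), torch.zeros(1)

    for step in range(1, 4):
        m = beta1 * m + (1 - beta1) * grad
        v = beta2 * v + (1 - beta2) * grad * grad
        bias_correction1 = 1 - beta1 ** step
        bias_correction2 = 1 - beta2 ** step
        # as in the trace: sqrt(v) is rescaled by 1/sqrt(bias_correction2) before eps is added
        denom = torch.sqrt(v) / (bias_correction2 ** 0.5) + eps
        param = param - lr * (m / bias_correction1) / denom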
parameter)) @@ -968,14 +980,16 @@ def generate_op_trace(self, ac, parameter, gradient): if len(weight_norm_shape) > 1: weight_norm = ac.op("reduce_sum", (weight_norm, ), (-2, )) weight_norm = ac.op("reduce_sum", (weight_norm, ), (-1, )) - weight_norm = ac.op("sqrt", (weight_norm, )) + #importing locally to avoid circular dependency from Dataformats + from pybuda.op.eval.pybuda.sqrt import Sqrt + weight_norm = ac.op(Sqrt.create(), (weight_norm, )) grad_norm = ac.op("multiply", (grad, grad)) grad_norm_shape = grad_norm.shape.as_list() if len(grad_norm_shape) > 1: grad_norm = ac.op("reduce_sum", (grad_norm, ), (-2, )) grad_norm = ac.op("reduce_sum", (grad_norm, ), (-1, )) - grad_norm = ac.op("sqrt", (grad_norm, )) + grad_norm = ac.op(Sqrt.create(), (grad_norm, )) # # IF weight_norm != 0 AND grad_norm != 0: @@ -1016,7 +1030,8 @@ def generate_op_trace(self, ac, parameter, gradient): local_learning_rate = ac.op("multiply", (weight_decay, weight_norm)) local_learning_rate = ac.op("add", (grad_norm, local_learning_rate)) local_learning_rate = ac.op("add", (epsilon, local_learning_rate)) - local_learning_rate = ac.op("reciprocal", (local_learning_rate, )) + from pybuda.op.eval.pybuda.reciprocal import Reciprocal + local_learning_rate = ac.op(Reciprocal.create(), (local_learning_rate, )) local_learning_rate = ac.op("multiply", (weight_norm, local_learning_rate)) local_learning_rate = ac.op("multiply", (lars_coeff, local_learning_rate)) diff --git a/pybuda/pybuda/pybudaglobal.py b/pybuda/pybuda/pybudaglobal.py index 1bfc6de0f..4596f0514 100644 --- a/pybuda/pybuda/pybudaglobal.py +++ b/pybuda/pybuda/pybudaglobal.py @@ -13,7 +13,6 @@ from loguru import logger -from pybuda._C.backend_api import BackendType devices = [] # Ordered list of devices running in a pipeline modules = [] @@ -84,18 +83,11 @@ def pybuda_reset(): global modules global optimizers - for d in devices: - d.shutdown_device() - devices = [] modules = [] from pybuda.config import _clear_global_compiler_config _clear_global_compiler_config() - from pybuda.run.context import context_reset - context_reset() - if os.environ.get("TT_BACKEND_HARVESTED_ROWS", None): - del os.environ["TT_BACKEND_HARVESTED_ROWS"] set_state_changed() def state_changed() -> bool: @@ -160,7 +152,7 @@ def lazy_trace_data(data): """ logger.opt(lazy=True).trace("{x}", x=lambda: data) -def is_silicon(devtype: BackendType): +def is_silicon(): """ Returns true if the device is a "silicon-like" - i.e. 
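The LARS trace derives a per-parameter scaling from the weight and gradient norms. The same quantity as a small host-side helper (a sketch, under the assumption that the scale falls back to 1 when either norm is zero, which is what the equal/not_equal masking appears to implement):

    import torch

    def lars_local_lr(param, grad, weight_decay=1e-4, lars_coeff=1e-3, eps=1e-8):
        # lambda = coeff * ||w|| / (||g|| + weight_decay * ||w|| + eps), as built in the trace above
        w_norm = torch.sqrt((param * param).sum())
        g_norm = torch.sqrt((grad * grad).sum())
        if w_norm == 0 or g_norm == 0:
            return torch.tensor(1.0)   # assumed fallback when either norm is zero
        return lars_coeff * w_norm / (g_norm + weight_decay * w_norm + eps)

    scale = lars_local_lr(torch.randn(16, 16), torch.randn(16, 16))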
a silicon device or versim """ diff --git a/pybuda/pybuda/python_codegen.py b/pybuda/pybuda/python_codegen.py index b954763be..565813a09 100644 --- a/pybuda/pybuda/python_codegen.py +++ b/pybuda/pybuda/python_codegen.py @@ -107,7 +107,7 @@ def import_module_path(self): class PyBudaWriter(PythonWriter): incompatible_np_float_types = [tf.bfloat16, ] - def __init__(self, module_name, framework, contains_incompatible_np_floats=False): + def __init__(self, module_name, framework, contains_incompatible_np_floats=False, delete_inputs=True): super().__init__(module_name) self.framework = framework @@ -115,6 +115,7 @@ def __init__(self, module_name, framework, contains_incompatible_np_floats=False self.const_names = [] self.num_submodels = 0 self.contains_incompatible_np_floats = contains_incompatible_np_floats + self.delete_inputs = delete_inputs self.dev = "TTDevice" def write_header(self): @@ -215,7 +216,6 @@ def write_forward(self, ops, inputs, outputs): set_src_layer = "" if ops[key].src_layer: set_src_layer = f'.set_src_layer("{ops[key].src_layer}")' - if ops[key].is_submodule_call: if len(ops[key].loop_with): if len(ops[key].loop_with) + 1 == self.num_submodels: @@ -234,8 +234,9 @@ def write_forward(self, ops, inputs, outputs): self.wl(f"{ops[key].output_name} = {ops[key].function_name}({activation_names}{arg_text}){set_src_layer}") else: self.wl(f"{ops[key].output_name} = {ops[key].function_name}(\"{ops[key].node_name}\"{activation_names}{arg_text}){set_src_layer}") - for name_to_del in ops[key].inputs_to_delete: - self.wl(f"{name_to_del}._value = None") + if self.delete_inputs: + for name_to_del in ops[key].inputs_to_delete: + self.wl(f"{name_to_del}._value = None") outputs = list(outputs.values()) if len(outputs) == 1: diff --git a/pybuda/pybuda/run/__init__.py b/pybuda/pybuda/run/__init__.py deleted file mode 100644 index 564120eba..000000000 --- a/pybuda/pybuda/run/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -from .api import ( - run_inference, - run_training, - shutdown, - initialize_pipeline, - run_forward, - run_backward, - run_optimizer, - run_schedulers, - get_parameter_checkpoint, - get_parameter_gradients, - update_device_parameters, - error_raised, - get_checkpoint_queue, - get_loss_queue, - get_intermediates_queue, - sync, - run_generate, - run_generative_inference, - detect_available_devices, -) diff --git a/pybuda/pybuda/run/api.py b/pybuda/pybuda/run/api.py deleted file mode 100644 index 7bcfc4b65..000000000 --- a/pybuda/pybuda/run/api.py +++ /dev/null @@ -1,491 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 - -from typing import List, Tuple, Optional, Union, Dict -import queue - -import torch -import torch.multiprocessing as mp - -from ..module import PyBudaModule -from ..tensor import Tensor -from ..verify import VerifyConfig - -from pybuda._C import DataFormat -from pybuda._C.backend_api import DeviceMode - -from ..pybudaglobal import get_devices -from .impl import ( - _run_forward, - _initialize_pipeline, - _run_inference, - _run_devices_training, - _run_generate, - _shutdown, - _run_backward, - _run_optimizer, - _run_schedulers, - _save_parameter_checkpoint, - _get_parameter_checkpoint, - _get_parameter_gradients, - _update_device_parameters, - _error_raised, - _get_checkpoint_queue, - _get_loss_queue, - _get_intermediates_queue, - _sync, - _detect_available_devices, -) - - -def run_inference( - module: 
Optional[PyBudaModule] = None, - inputs: List[Union[Tuple[Union[torch.Tensor, Tensor], ...], Dict[str, Union[torch.Tensor, Tensor]]]] = [], - input_count: int = 1, - output_queue: queue.Queue = None, - _sequential: bool = False, - _perf_trace: bool = False, - _verify_cfg: Optional[VerifyConfig] = None) -> queue.Queue: - """ - Main "run" function for inference. After all modules have been defined and placed on devices, this will - execute the workload. Unless 'sequential' is set, the function will return as soon as the devices are set up - to run, and inference will run as long as new inputs are pushed into the device(s). If sequential mode is on, - the function will run through inputs that are already in the input buffer and return when done. - - Parameters - ---------- - module: PyBudaModule, optional - If provided, place given module on a TT Device and run inference. Alternatively, manually create device(s) and - placed module(s) on them. - - inputs: List[Union[Tuple[Union[torch.Tensor, Tensor], ...], Dict[str, Union[torch.Tensor, Tensor]]]], optional - An optional list of input tensor tuples or dictionaries (passed as args or kwargs to module), to feed into the inference pipeline. - Alternatively, use `device.push_to_inputs` to manually provide inputs outside of this call. - - input_count: int, default=1 - The number of inputs to run inference on. If 0, inference will run "forever", until `shutdown` or `run_inference` - is called again. - - output_queue: queue.Queue, optional - If provided, outputs will be pushed into the queue as they are calculated. Otherwise, one will be created - and returned. - - _sequential: bool, Internal - Don't use. - - _perf_trace: bool, Internal - Don't use. - - _verify_cfg: Internal - Don't use. - - Returns - ------- - queue.Queue - Queue holding the output results. Either the output_queue provided, or one that's created. - - """ - - return _run_inference(module, inputs, input_count, output_queue, _sequential, _perf_trace, _verify_cfg) - -def run_training( - epochs: int = 1, - steps: int = 1, - accumulation_steps: int = 1, - microbatch_count: int = 1, - checkpoint_queue: queue.Queue = None, - loss_queue: queue.Queue = None, - checkpoint_interval: int = 0, - _sequential: bool = False, - _perf_trace: bool = False, - _verify_cfg: Optional[VerifyConfig] = None) -> queue.Queue: - - """ - Main "run" function for training. After all modules have been defined and placed on devices, this will - execute the workload. - - Parameters - ---------- - epochs: int - The number of epoch to run. Scheduler, if provided, will be stepped after each one. - - steps: int - The number of batches to run. After every step, the optimizer will be stepped. - - accumulation_steps: int - The number of mini-batches in a batch. Each mini-batch is limited in size by how much of the - intermediate data can fit in device memory. - - microbatch_count: int - Each mini-batch is optionally further broken into micro-batches. This is necessary to fill a - multi-device pipeline, and should be roughly 4-6x the number of devices in the pipeline for ideal - performance. - - checkpoint_queue: Queue, optional - If provided, weight checkpoints will be pushed into this queue, along with the final set of weights. - If one is not provided, one will be created and returned. - - loss_queue: Queue, optional - If provided, loss values will be pushed into this queeu. 
- - checkpoint_interval: int, optional - The weights will be checkpointed into checkpoint queues on host every `checkpoint_interval` optimizer - steps, if set to non-zero. Zero by default. - - _sequential: Internal - Don't use - - _perf_trace: Internal - Don't use - - _verify_cfg: Internal - Don't use. - - Returns - ------- - queue.Queue - Checkpoint queue, holding weight checkpoints, and final trained weights. - - """ - - if epochs == 0 or steps == 0 or accumulation_steps == 0 or microbatch_count == 0: - raise RuntimeError("Calling run_training with one of the loop indices at 0. Nothing to do.") - - return _run_devices_training(sequential=_sequential, epochs=epochs, steps=steps, accumulation_steps=accumulation_steps, microbatch_count=microbatch_count, checkpoint_interval=checkpoint_interval, perf_trace=_perf_trace, checkpoint_queue=checkpoint_queue, loss_queue=loss_queue, verify_cfg=_verify_cfg) - -def run_generative_inference( - module: Optional[PyBudaModule] = None, - inputs: List[Union[Tuple[Union[torch.Tensor, Tensor], ...], Dict[str, Union[torch.Tensor, Tensor]]]] = [], - input_count: int = 1, - output_queue: queue.Queue = None, - _sequential: bool = False, - _perf_trace: bool = False, - _verify_cfg: Optional[VerifyConfig] = None) -> queue.Queue: - """ - Main "run" function for generative inference. After all modules have been defined and placed on devices, this will - execute the workload. Unless 'sequential' is set, the function will return as soon as the devices are set up - to run, and inference will run as long as new inputs are pushed into the device(s). If sequential mode is on, - the function will run through inputs that are already in the input buffer and return when done. - - Parameters - ---------- - module: PyBudaModule, optional - If provided, place given module on a TT Device and run inference. Alternatively, manually create device(s) and - placed module(s) on them. - - inputs: List[Union[Tuple[Union[torch.Tensor, Tensor], ...], Dict[str, Union[torch.Tensor, Tensor]]]], optional - An optional list of input tensor tuples or dictionaries (passed as args or kwargs to module), to feed into the inference pipeline. - Alternatively, use `device.push_to_inputs` to manually provide inputs outside of this call. - - input_count: int, default=1 - The number of inputs to run inference on. If 0, inference will run "forever", until `shutdown` or `run_inference` - is called again. - - output_queue: queue.Queue, optional - If provided, outputs will be pushed into the queue as they are calculated. Otherwise, one will be created - and returned. - - _sequential: bool, Internal - Don't use. - - _perf_trace: bool, Internal - Don't use. - - _verify_cfg: Internal - Don't use. - - Returns - ------- - queue.Queue - Queue holding the output results. Either the output_queue provided, or one that's created. - - """ - - return _run_generative_inference(module, inputs, input_count, output_queue, _sequential, _perf_trace, _verify_cfg) - -def run_forward(input_count: int = 1, _sequential: bool = False): - """ - Run forward passes on the pre-compiled and initialized pipeline of devices. This API should be - called from custom implementations of inference and training loops, in lieue of calling - `run_inference` and `run_training` APIs. - - If this is a part of an inference run, the results will be placed in the outptut queues which - should have already been setup through `initialize_pipeline` call. 
If this is called as a part - of the training pass, then loss will be pushed to the output queue, if one was set up. - - Parameters - ---------- - input_count: int, default=1 - The number of inputs to run inference on. If 0, inference will run "forever", until `shutdown` or `run_inference` - is called again. - - _sequential: Internal - Don't use - """ - return _run_forward(input_count, _sequential) - -def run_generate(input_count: int = 1, write_index: int = -1, tokens_per_iter: int = -1, token_id: int = -1, _sequential: bool = False): - """ - Run forward passes on the pre-compiled and initialized pipeline of devices and maintain past cache - write and read pointers. This API should be called from custom implementations of inference and - training loops, in lieue of calling `run_inference` and `run_training` APIs. - - If this is a part of an inference run, the results will be placed in the outptut queues which - should have already been setup through `initialize_pipeline` call. If this is called as a part - of the training pass, then loss will be pushed to the output queue, if one was set up. - - Parameters - ---------- - input_count: int, default=1 - The number of inputs to run inference on. If 0, inference will run "forever", until `shutdown` or `run_inference` - is called again. - - _sequential: Internal - Don't use - """ - assert write_index >= 0 or (tokens_per_iter > 0 and token_id >= 0), "Either write_index or tokens_per_iter and token_id should be set." - return _run_generate(input_count, write_index, tokens_per_iter, token_id, _sequential) - -def run_backward(input_count: int = 1, zero_grad: bool = False, _sequential: bool = False): - """ - Run backward passes on the pre-compiled and initialized pipeline of devices. This API should be - called from custom implementations of inference and training loops, in lieue of calling - `run_inference` and `run_training` APIs. - - `zero_grad` should be set for the first backward call of a batch, to zero out accumulated gradients. - - No results will be returned. get_parameter_gradients() can be used to get a snapshot of - gradients after the backward pass has completed. - - Parameters - ---------- - input_count: int, default=1 - The number of inputs to run inference on. If 0, inference will run "forever", until `shutdown` or `run_inference` - is called again. - - zero_grad: bool, optional - If set, acccumulated gradients on device will be zeroed out before the backward pass begins. - - _sequential: Internal - Don't use - """ - return _run_backward(input_count, zero_grad, _sequential) - -def run_optimizer(checkpoint: bool = False, _sequential: bool = False): - """ - Run optimizer on all devices. If `checkpoint` is set, a checkpoint of parameters will be taken and - placed into the checkpoint queue that has been set up during `initialize_pipeline` call. - - Parameters - ---------- - checkpoint: bool, optional - If set, checkpoint of parameters will be placed into checkpoint queue. - - _sequential: Internal - Don't use - """ - _run_optimizer(_sequential) - - if checkpoint: - _save_parameter_checkpoint(_sequential) - -def run_schedulers(_sequential: bool = False): - """ - Run learning rate schedulers on all devices. - - Parameters - ---------- - _sequential: Internal - Don't use - """ - _run_schedulers(_sequential) - -def get_parameter_gradients(device: Optional[Union["CPUDevice", "TTDevice"]] = None, _sequential: bool = False) -> List[Dict[str, Tensor]]: - """ - Return currently accumulated parameter gradients. 
If a device is specified, only gradients for that device - will be returned, otherwise a list of gradients for all devices will come back. - - Parameters - ---------- - device: Union[CPUDevice, TTDevice], Optional - Device to read parameter gradients from. If None, all devices will be read from. - - _sequential: Internal - Don't use - - Returns - ------- - List[Dict[str, Tensor]] - List of parameter checkpoints for devices in the pipeline, or the given device - """ - if device is None: - return [_get_parameter_gradients(d, _sequential) for d in get_devices()] - - return [_get_parameter_gradients(device, _sequential)] - -def get_parameter_checkpoint(device: Optional[Union["CPUDevice", "TTDevice"]] = None, _sequential: bool = False) -> List[Dict[str, Tensor]]: - """ - Return current parameter values. If a device is specified, only parameters for that device will - be returned, otherwise a list of parameters for all devices will come back. - - Parameters - ---------- - device: Union[CPUDevice, TTDevice], Optional - Device to read parameter values from. If None, all devices will be read from. - - _sequential: Internal - Don't use - - Returns - ------- - List[Dict[str, Tensor]] - List of parameter checkpoints for devices in the pipeline, or the given device - """ - if device is None: - return [_get_parameter_checkpoint(d, _sequential) for d in get_devices()] - - return [_get_parameter_checkpoint(device, _sequential)] - -def update_device_parameters(device: Optional[Union["CPUDevice", "TTDevice"]] = None, parameters: List[Dict[str, Tensor]] = [], _sequential: bool = False): - """ - Push new parameters onto given device, or if none is provided, then all devices in the pipeline. - - Parameters - ---------- - device: Union[CPUDevice, TTDevice], Optional - Device to read parameter values from. If None, all devices will be read from. - - parameters: List[Dict[str, torch.Tensor]] - List of dictionaries of parameters to update - - _sequential: Internal - Don't use - """ - devices = [device] if device is not None else get_devices() - return _update_device_parameters(devices, parameters, _sequential) - - -def initialize_pipeline( - training: bool, - output_queue: Optional[queue.Queue] = None, - checkpoint_queue: Optional[queue.Queue] = None, - sample_inputs: Union[Tuple[Union[torch.Tensor, Tensor], ...], Dict[str, Union[torch.Tensor, Tensor]]] = tuple(), - sample_targets: Tuple[Union[torch.Tensor, Tensor], ...] = tuple(), - microbatch_count: int = 1, - d2d_fwd_queues: List[queue.Queue] = [], - d2d_bwd_queues: List[queue.Queue] = [], - _sequential: bool = False, - _verify_cfg: Optional[VerifyConfig] = None, - _device_mode: DeviceMode = DeviceMode.CompileAndRun) -> queue.Queue: - """ - Initialize the pipeline to run inference and training through manual `run_forward`, `run_backward`, `run_optimizer`, etc. calls. This should be not used with - "all-in-one" APIs like `run_inference` and `run_training`, which will initialize the pipeline themselves. - - Parameters - ---------- - training: bool - Set to true to prepare the pipeline for training. - - output_queue: queue.Queue, optional - If provided, inference outputs will be pushed into the queue as they are calculated. Otherwise, one will be created - and returned (in inference mode) - - checkpoint_queue: Queue, optional - If provided, weight checkpoints will be pushed into this queue, along with the final set of weights. 
- If one is not provided, one will be created and returned (in training mode) - - sample_inputs: Tuple[Union[torch.Tensor, Tensor], ...], optional - If calling initialize_pipeline directly to compile models and initialize devices, then a representative sample - of inputs must be provided to accuractely compile the design. Typically, this would be the first input that - will be sent through the model post-compile. The tensors must be of the correct shape and data type. - - sample_targets: Tuple[Union[torch.Tensor, Tensor], ...], optional - If calling initialize_pipeline directly to compile models and initialize devices for training, then a - representative sample of training tagets must be provided to accuractely compile the design. - Typically, this would be the first target that will be sent to the last device post-compile. - The tensors must be of the correct shape and data type. - - microbatch_count: int - Only relevant for training. This represents the number of microbatches that are pushed through - fwd path before bwd path runs. The device will ensure that buffering is large enough to contain - microbatch_count number of microbatch intermediate data. - - d2d_fwd_queues: List[queue.Queue], optional - If provided, device-to-device intermediate data that passes through host will also be stored in the provided - queues. The queues are assigned in order from the first device in the pipeline. The last device will not - be assigned a queue. - - d2d_bwd_queues: List[queue.Queue], optional - If provided, device-to-device intermediate data in the training backward pass, that passes through - host will also be stored in the provided queues. The queues are assigned in order from the - second device in the pipeline. The first device will not be assigned a queue. - - _sequential: Internal - Don't use - - _verify_cfg: Internal - Don't use. - - Returns - ------- - queue.Queue - Output queue for inference, or checkpoint queue for training - - - """ - - if not training: - assert len(sample_targets) == 0, "Sample targets should not be provided unless the training mode is on" - - return _initialize_pipeline(training, output_queue, checkpoint_queue, sample_inputs, sample_targets, microbatch_count, - d2d_fwd_queues, d2d_bwd_queues, _sequential, _verify_cfg, _device_mode) - - -def get_loss_queue() -> queue.Queue: - """ - If a loss queue was not provided for training, one will be automatically created. This call can - be used to retrieve that queue. - """ - return _get_loss_queue() - -def get_checkpoint_queue() -> queue.Queue: - """ - If a checkpoint queue was not provided for training, one will be automatically created. This call can - be used to retrieve that queue. - """ - return _get_checkpoint_queue() - -def get_intermediates_queue() -> queue.Queue: - """ - If intermediates were tagged for saving, they will be pushed into a queue. This call can be used to retrieve that queue. - """ - return _get_intermediates_queue() - -def sync(): - """ - Block until all devices have gone idle. - """ - _sync() - -def shutdown(): - """ - Shutdown running processes and clean up pybuda - """ - return _shutdown() - -def error_raised() -> bool: - """ - Returns True if an unrecoverable error has been raised. A full shutdown / reset is needed to restart. - """ - return _error_raised() - -def detect_available_devices(): - """ - Returns a list of available devices on the system. 
- """ - return _detect_available_devices() - -import atexit -atexit.register(shutdown) - diff --git a/pybuda/pybuda/run/commands.py b/pybuda/pybuda/run/commands.py deleted file mode 100644 index 3b4d057b2..000000000 --- a/pybuda/pybuda/run/commands.py +++ /dev/null @@ -1,144 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 - -from typing import Dict, Tuple, Union, List, Any, Optional -from enum import Enum -from pybuda.tensor import Tensor -import torch - -class CommandType(Enum): - QUIT = 1 - RUN_FORWARD = 2 - RUN_BACKWARD = 3 - RUN_OPTIMIZER = 4 - RUN_SCHEDULERS = 5 - GET_PARAMETER_CHECKPOINT = 6 - GET_PARAMETER_GRADIENTS = 7 - COMPILE = 8 - UPDATE_DEVICE_PARAMETERS = 9 - GET_QUEUES = 11 - SET_QUEUES = 12 - DC_TRANSFER = 13 - CPUEVAL_FORWARD = 14 - CPUEVAL_BACKWARD = 15 - CPUEVAL_LOSS = 16 - SYNC = 17 - RUN_GENERATE = 18 - -class Command: - """ - Command sent to running processes indicating what they need to run, and for how long. - """ - def __init__(self, command_type: CommandType, params: Dict[str, Any] = {}): - self.command_type = command_type - self.params: Dict[str, Any] = params - - def __repr__(self): - return f"{self.command_type}: {self.params}" - - @classmethod - def quit(cls) -> "Command": - return Command(CommandType.QUIT) - - @classmethod - def run_forward(cls, loop_count: int) -> "Command": - return Command(CommandType.RUN_FORWARD, {"loop_count": loop_count}) - - @classmethod - def run_backward(cls, loop_count: int, zero_grad: bool) -> "Command": - return Command(CommandType.RUN_BACKWARD, {"loop_count": loop_count, "zero_grad": zero_grad}) - - @classmethod - def run_generate(cls, loop_count: int, write_index: int, tokens_per_iter: int, token_id: int) -> "Command": - return Command(CommandType.RUN_GENERATE, {"loop_count": loop_count, "write_index": write_index, "tokens_per_iter": tokens_per_iter, "token_id": token_id}) - - @classmethod - def run_optimizer(cls) -> "Command": - return Command(CommandType.RUN_OPTIMIZER, {}) - - @classmethod - def run_schedulers(cls) -> "Command": - return Command(CommandType.RUN_SCHEDULERS, {}) - - @classmethod - def compile(cls, - inputs: Tuple["Tensor", ...], - compiler_cfg: "CompilerConfig", - targets: List["Tensor"], - microbatch_size: int, - microbatch_count: int, - verify_cfg: "VerifyConfig") -> "Command": - - # Detach inputs in case they were calculated through some formulas before being pushed in - if compiler_cfg.compile_subgraphs: - input_groups = [] - for group in inputs: - input_groups.append([t.detach() for t in group]) - detached_inputs = input_groups - else: - detached_inputs = [t.detach() for t in inputs] - return Command(CommandType.COMPILE, - { - "inputs": detached_inputs, - "compiler_cfg": compiler_cfg, - "targets": targets, - "microbatch_size": microbatch_size, - "microbatch_count": microbatch_count, - "verify_cfg": verify_cfg, - }) - - - @classmethod - def get_queues(cls, queue_type: str) -> "Command": - return Command(CommandType.GET_QUEUES, {"queue_type": queue_type}) - - @classmethod - def set_queues(cls, direction: str, queues: List["DramIODesc"], tile_broadcast_dims: Optional[List[List[int]]], - original_shapes: Optional[List[Tuple[int, ...]]], requires_grad: Optional[List[bool]], - runtime_tensor_transforms: Optional[List["RuntimeTensorTransform"]], - constant_inputs: Optional[List[Tensor]], - tile_dims: Optional[List[List[int]]]) -> "Command": - return Command(CommandType.SET_QUEUES, - { - "direction": direction, - "queues": queues, - "tile_broadcast_dims": 
tile_broadcast_dims, - "original_shapes": original_shapes, - "requires_grad": requires_grad, - "runtime_tensor_transforms": runtime_tensor_transforms, - "constant_inputs": constant_inputs, - "tile_dims": tile_dims, - }) - - @classmethod - def dc_transfer(cls, direction: str) -> "Command": - return Command(CommandType.DC_TRANSFER, {"direction": direction}) - - @classmethod - def cpueval_forward(cls, inputs: List[torch.Tensor], parameters: Dict[str, torch.Tensor], save_for_backward: bool, targets: List[torch.Tensor]) -> "Command": - return Command(CommandType.CPUEVAL_FORWARD, {"inputs": inputs, "parameters": parameters, "save_for_backward": save_for_backward, "targets": targets}) - - @classmethod - def cpueval_backward(cls, bw_inputs: List[torch.Tensor], parameters: Dict[str, torch.Tensor]) -> "Command": - return Command(CommandType.CPUEVAL_BACKWARD, { "bw_inputs": bw_inputs, "parameters": parameters }) - - @classmethod - def cpueval_loss(cls, fw_outputs: List[torch.Tensor], targets: List[torch.Tensor], scale_loss: float) -> "Command": - return Command(CommandType.CPUEVAL_LOSS, {"fw_outputs": fw_outputs, "targets": targets, "scale_loss": scale_loss}) - - @classmethod - def get_parameter_checkpoint(cls) -> "Command": - return Command(CommandType.GET_PARAMETER_CHECKPOINT, {}) - - @classmethod - def get_parameter_gradients(cls) -> "Command": - return Command(CommandType.GET_PARAMETER_GRADIENTS, {}) - - @classmethod - def update_device_parameters(cls, params: Dict[str, torch.Tensor]) -> "Command": - return Command(CommandType.UPDATE_DEVICE_PARAMETERS, {"parameters": params}) - - @classmethod - def sync(cls) -> "Command": - return Command(CommandType.SYNC, {}) diff --git a/pybuda/pybuda/run/context.py b/pybuda/pybuda/run/context.py deleted file mode 100644 index 8368a462f..000000000 --- a/pybuda/pybuda/run/context.py +++ /dev/null @@ -1,79 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -""" -Global context for current inference/training runs -""" - -from typing import Dict, Optional, List - -import threading -import torch.multiprocessing as mp -from multiprocessing.synchronize import Event as EventClass -from multiprocessing.synchronize import Barrier as BarrierClass - -g_run_contexts: Dict[str, "RunContext"] = {} -g_current_context: Optional["RunContext"] = None - - -class RunContext: - - def __init__(self, name): - self.name = name - self.active = False - self.training: Optional[bool] = None - self.shutdown_event: Optional[EventClass] = None - self.final_barrier: Optional[BarrierClass] = None - self.input_gradient_queue: Optional[mp.Queue] = None - self.output_queue: Optional[mp.Queue] = None - self.checkpoint_queue: Optional[mp.Queue] = None - self.intermediates_queue: Optional[mp.Queue] = None - self.processes: List[mp.Process] = [] - self.loop_thread: Optional[threading.Thread] = None - self.error: bool = False - - ## For hacked version of FW looping. 
Remove when Hacked FW looping is removed - self.global_input_index = 0 - - @classmethod - def create_new(cls, - training: bool, - shutdown_event: EventClass, - final_barrier: BarrierClass, - name: str = "pybuda_default") -> "RunContext": - """ - Create a new context, register it and make it current - """ - global g_current_context, g_run_contexts - if name in g_run_contexts: - raise RuntimeError("Trying to create a new context when one already exists with the same name") - - ctx = RunContext(name) - ctx.active = True - ctx.training = training - ctx.shutdown_event = shutdown_event - ctx.final_barrier = final_barrier - - g_current_context = ctx - g_run_contexts[name] = ctx - return ctx - -def get_current_context() -> Optional[RunContext]: - """ - Get current run context, or None if there isn't one - """ - return g_current_context - -def clear_current_context(): - global g_current_context, g_run_contexts - if g_current_context is None: - return - - del g_run_contexts[g_current_context.name] - g_current_context = None - -def context_reset(): - global g_current_context, g_run_contexts - g_run_contexts = {} - g_current_context = None - diff --git a/pybuda/pybuda/run/impl.py b/pybuda/pybuda/run/impl.py deleted file mode 100644 index 218cacabb..000000000 --- a/pybuda/pybuda/run/impl.py +++ /dev/null @@ -1,1355 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -from typing import List, Tuple, Optional, Union, Dict -import queue -import os -import threading -import copy - -import torch -import torch.multiprocessing as mp -from loguru import logger - -from .commands import Command -from .context import RunContext, get_current_context, clear_current_context -from ..pybudaglobal import get_devices, profiler, state_changed, clear_state_changed, set_device_pipeline, create_queue -from ..device import Device -from ..ttdevice import TTDevice -from ..cpudevice import CPUDevice -from ..gpudevice import GPUDevice -from ..module import PyBudaModule -from ..tensor import Tensor, remove_microbatch, to_buda_tensors, to_pt_tensors -from ..config import CompilerConfig -from ..verify import VerifyConfig, TestKind -from ..config import _get_global_compiler_config -from ..utils import detach_tensors - -from pybuda.tvm_to_python import generate_pybuda_module, cleanup_temporary_files -from pybuda.tvm_utils import flatten_inputs -from pybuda._C.backend_api import BackendDevice, BackendType, initialize_child_process, finish_child_process, DeviceMode, clear_backend_param_cache, detect_available_silicon_devices - -def _detect_available_devices(): - if "PYBUDA_EMULATE_SILICON_DEVICE" in os.environ: - if "GOLDEN_WORMHOLE_B0" in os.environ: - return [BackendDevice.from_string("wormhole_b0")] - else: - return [BackendDevice.from_string("grayskull")] - else: - return detect_available_silicon_devices() - - -def _translate_framework_modules_on_devices( - sample_inputs: List[Tuple[Union[torch.Tensor, Tensor], ...]], - sample_targets: List[Tuple[Union[torch.Tensor, Tensor], ...]], - sample_input_names: List[str], - compiler_cfg: CompilerConfig, - verify_cfg: VerifyConfig): - - def _wrap_inputs(inputs): - if not isinstance(inputs, (list, tuple)): - inputs = (inputs, ) - return inputs - - prev_state = state_changed() - devices = get_devices() - updated_devices = copy.copy(devices) - _, inputs = _get_device_zero_inputs(sample_inputs, peek=True) - device_index = 0 - while device_index < len(updated_devices): - device = updated_devices[device_index] - if isinstance(device, 
(CPUDevice,)): - for module in device.modules: - if device.loss_module is None: - if isinstance(device, GPUDevice): - inputs = [input.cuda() for input in to_pt_tensors(inputs)] - inputs = module.forward(*inputs) - inputs = [input.cpu() for input in inputs] - inputs = _wrap_inputs(inputs) - else: - module.compilation = True - inputs = _wrap_inputs(module.forward(*to_pt_tensors(inputs))) - - elif isinstance(device, TTDevice): - # Modes of operation: - # 1. compiler_cfg.compile_subgraphs = True: - # - Ensure number of input groups match with number of modules - # - For each module, generate a PyBudaModule, mark subgraph ID - # - ASSERT NO CPU fallback - # 2. compiler_cfg.compile_subgraphs = False: - # - feed output of previous module to next module - # - For each module, generate a PyBudaModule, mark with the same subgraph ID - # - ASSERT NO CPU fallback - - multiple_module_on_one_device = len(device.modules) > 1 - - # Multiple modules on one device - if len(device.modules) > 1 and device.loss_module is None: - - # Compile multiple subgraphs - if (compiler_cfg.compile_subgraphs): - num_modules = len(device.modules) - num_input_groups = len(inputs) - assert num_modules == num_input_groups, "Number of modules on a single TTDevice must match the number of input groups" - assert device.loss_module is None, "Compile subgraph currently does not support loss module on the same device" - assert len(devices) == 1, "Compile subgraph currently does not support multiple devices" - - for module_index, (module, input_group) in enumerate(zip(device.modules, inputs)): - if not isinstance(module, PyBudaModule): - # Generate PybudaModule through TVM - ( - translated_modules, - translated_device_types, - inputs - ) = generate_pybuda_module(module, input_group, verify_cfg=verify_cfg, clean_later=True) - - tt_device = updated_devices[device_index] - - assert (len(translated_device_types) == 1 - and translated_device_types[0] == "TTDevice"), "Compile subgraph currently does not support CPU fallback" - - translated_pybuda_module = translated_modules[0] - inputs = _wrap_inputs(translated_pybuda_module.forward(*to_buda_tensors(inputs))) - translated_pybuda_module.subgraph_idx = module_index - tt_device.modules[module_index] = translated_pybuda_module - else: - module.subgraph_idx = module_index - - # Merge multiple subgraphs into one graph - else: - for module_index, module in enumerate(device.modules): - if not isinstance(module, PyBudaModule): - # Generate PybudaModule through TVM - ( - translated_modules, - translated_device_types, - inputs - ) = generate_pybuda_module(module, inputs, verify_cfg=verify_cfg, clean_later=True) - - tt_device = updated_devices[device_index] - - assert (len(translated_device_types) == 1 - and translated_device_types[0] == "TTDevice"), "Multiple module on 1 device currently does not support CPU fallback" - - translated_pybuda_module = translated_modules[0] - inputs = _wrap_inputs(translated_pybuda_module.forward(*to_buda_tensors(inputs))) - translated_pybuda_module.subgraph_idx = 0 # Multiple modules on 1 device, merge into 1 graph - tt_device.modules[module_index] = translated_pybuda_module - else: - inputs = _wrap_inputs(module.forward(*to_buda_tensors(inputs))) - module.subgraph_idx = 0 # Multiple modules on 1 device, merge into 1 graph - - else: - assert compiler_cfg.compile_subgraphs == False, "Found only 1 module on a TTDevice, but compiler_cfg.compile_subgraphs is set to True" - module_index = 0 - while module_index < len(device.modules): - module = 
updated_devices[device_index].modules[module_index] - is_last_device = device_index == len(updated_devices) - 1 - - if not isinstance(module, PyBudaModule): - is_loss_module = False - if module is device.loss_module: - is_loss_module = True - inputs = tuple(list(inputs + sample_targets)) - translated_modules, translated_device_types, inputs = generate_pybuda_module(module, inputs, verify_cfg=verify_cfg, clean_later=True, input_names=sample_input_names) - tt_device = updated_devices[device_index] - - added_modules = 0 - assert len(translated_device_types) <= 3 - assert any([device_type == "TTDevice" for device_type in translated_device_types]) - for index, (module, device_type) in enumerate(zip(translated_modules, translated_device_types)): - if device_type == "CPUDevice": - inputs = to_pt_tensors(inputs) - input_dtypes = [inp.dtype for inp in inputs] - inputs = _wrap_inputs(module.forward(*inputs)) - cpu_device = CPUDevice(name=f"cpu{index}_fallback", module=module, input_dtypes=input_dtypes) - logger.warning("Unsupported ops found {} main graph, will be executed on {}", 'before' if index == 0 else 'after', cpu_device) - if index == 0: - # if the first device is a fallback device, we want any subsequent inputs pushed to the - # original device to go to cpu_device - while not tt_device._input_buffer.empty(): - logger.debug("Copied input buffer from tt to cpu device") - cpu_device.push_to_inputs(tt_device._input_buffer.get()) - tt_device.cpu_fallback_device_pre = cpu_device - else: - tt_device.cpu_fallback_device_post = cpu_device - if tt_device.loss_module is not None: - logger.warning("Due to CPU fallback, loss module moved to {}", cpu_device) - cpu_device.place_loss_module(tt_device.loss_module) - tt_device.remove_loss_module() - while not tt_device.target_input_queue.empty(): - logger.debug("Copied target buffer from tt to cpu device") - cpu_device.push_to_target_inputs(tt_device.target_input_queue.get()) - - updated_devices.insert(device_index, cpu_device) - device_index += 1 - else: - inputs = _wrap_inputs(module.forward(*to_buda_tensors(inputs))) - tt_device.modules[module_index] = module - if is_loss_module: - tt_device.loss_module = module - added_modules += 1 - device_index += 1 - # if the original device had an optimizer, and we have fallback device(s) we need to create one for the fallback device(s) - if tt_device.optimizer and (tt_device.cpu_fallback_device_pre or tt_device.cpu_fallback_device_post): - fallback_params = {} - if tt_device.cpu_fallback_device_pre: - fallback_params.update({param.get_name() : param.value() for param in tt_device.cpu_fallback_device_pre.modules[0].get_parameters()}) - if tt_device.cpu_fallback_device_post: - fallback_params.update({param.get_name() : param.value() for param in tt_device.cpu_fallback_device_post.modules[0].get_parameters() if not any([param.value() is existing_param for existing_param in fallback_params.values()])}) - - if len(fallback_params): - cpu_optim = tt_device.optimizer.get_pytorch_optimizer(fallback_params) - # optimizer goes on last device in the pipeline - if tt_device.cpu_fallback_device_post: - tt_device.cpu_fallback_device_post.optimizer = cpu_optim - else: - tt_device.cpu_fallback_device_pre.optimizer = cpu_optim - - # incremented for each added device, and will be incremented once again below - device_index -= 1 - module_index += added_modules - 1 - elif (not is_last_device or len(device.modules) > 1) and device.loss_module is None: - inputs = _wrap_inputs(module.forward(*to_buda_tensors(inputs))) - module_index += 1 
- - device_index += 1 - - set_device_pipeline(updated_devices) - if not prev_state: - clear_state_changed() - -def _cleanup_temporary_files(): - cleanup_temporary_files() - -def _run_inference( - module: Optional[PyBudaModule] = None, - inputs: List[Union[Tuple[Union[torch.Tensor, Tensor], ...], Dict[str, Union[torch.Tensor, Tensor]]]] = [], - input_count: int = 1, - output_queue: queue.Queue = None, - sequential: bool = False, - perf_trace: bool = False, - verify_cfg: Optional[VerifyConfig] = None) -> queue.Queue: - """ - Main "run" function for inference. After all modules have been defined and placed on devices, this will - execute the workload. Unless 'sequential' is set, the function will return as soon as the devices are set up - to run, and inference will run as long as new inputs are pushed into the device(s). If sequential mode is on, - the function will run through inputs that are already in the input buffer and return when done. - """ - - resume = False - if module is not None: - # Create a device if one hasn't been created yet - devices = get_devices() - if len(devices) == 0: - _ = TTDevice("auto_tt0") - devices = get_devices() - - # Check if we'r resuming or starting a new run - if (any(len(d.modules) > 0 for d in devices[1:]) - or len(devices[0].modules) != 1 - or (len(devices[0].modules) == 1 and devices[0].modules[0] != module)): - for d in devices: - d.remove_modules() - devices[0].place_module(module) - else: - logger.debug("Resuming previous inference") - resume = True # called with the same module - - elif not state_changed(): - resume = True - - - if len(inputs) > 0: - devices = get_devices() - if len(devices) == 0: - raise RuntimeError("No devices have been created, and no modules provided. There's nothing to run inference on.") - for input in inputs: - devices[0].push_to_inputs(input) - if input_count != 1 and input_count != len(inputs): - raise RuntimeError("Input count should not be provided when a list of inputs exists") - input_count = len(inputs) - - if input_count == 0 and sequential: - raise RuntimeError("In sequential mode, inputs must be pushed ahead of time. Therefore, 'run forever' mode is invalid.") - - clear_state_changed() - return _run_devices_inference( - input_count=input_count, - sequential=sequential, - output_queue=output_queue, - perf_trace=perf_trace, - verify_cfg=verify_cfg, - resume=resume) - -def _run_command(device: Union[CPUDevice, TTDevice], sequential: bool, command: Command, response: bool = False - ) -> Optional[Dict]: - if sequential: - logger.trace("{}: Got command from queue: {}", device, command) - device.run_next_command(command) - else: - device.push_to_command_queue(command) - - if response: - return device.get_command_queue_response() - - return None - -def _sequential_override(sequential: bool) -> bool: - """ - Force sequential on if any of the devices are Golden model - """ - if sequential: - return True - if "PYBUDA_FORCE_SEQUENTIAL" in os.environ: - return True - for d in get_devices(): - if d.devtype == BackendType.Golden or d.devtype == BackendType.Model: - return True - return False - - -def _initialize_pipeline( - training: bool, - output_queue: Optional[queue.Queue] = None, - checkpoint_queue: Optional[queue.Queue] = None, - sample_inputs: Union[Tuple[Union[torch.Tensor, Tensor], ...], Dict[str, Union[torch.Tensor, Tensor]]] = tuple(), - sample_targets: Tuple[Union[torch.Tensor, Tensor], ...] 
= tuple(), - microbatch_count: int = 1, - d2d_fwd_queues: List[queue.Queue] = [], - d2d_bwd_queues: List[queue.Queue] = [], - sequential: bool = False, - verify_cfg: Optional[VerifyConfig] = None, - device_mode: DeviceMode = DeviceMode.CompileAndRun) -> queue.Queue: - """ - Initialize the pipeline to run inference and training through manual `run_forward`, `run_backward`, - `run_optimizer`, etc. calls. This should be not used with "all-in-one" APIs like `run_inference` - and `run_training`, which will initialize the pipeline themselves. - """ - - # If sample_inputs is a dictionary, extract names from its keys - sample_input_names = [] - if isinstance(sample_inputs, dict): - sample_input_names = list(sample_inputs.keys()) - sample_inputs = list(sample_inputs.values()) - - devices = get_devices() - - if len(devices) == 0: - logger.warning("Nothing to do") - return None - - sequential = _sequential_override(sequential) - if not sequential: - initialize_child_process(_get_global_compiler_config().backend_output_dir) - - if training: - for d in devices[:-1]: - if d.loss_module is not None: - raise RuntimeError("Only the last device in the pipieline should have a loss module.") - - if devices[-1].loss_module is None: - raise RuntimeError("The last device in pipeline must have a loss module to be able to train.") - else: - if checkpoint_queue is not None: - raise RuntimeError("Checkpoint queue should only be provided in training mode") - - # Translate framework modules. May increase number of devices due to CPU fallback - # sample_inputs, _, _ = flatten_inputs(sample_inputs) # NESTED INPUT ASSERT NUM GROUP == NUM MODULES ON THAT DEVICE - _translate_framework_modules_on_devices(sample_inputs, sample_targets, sample_input_names, _get_global_compiler_config(), verify_cfg) - devices = get_devices() - - # Initialize & connect devices - shutdown_event, final_barrier = _initialize_devices(devices, sequential, training=training, verify_cfg=verify_cfg) - - # Create a new context - ctx = get_current_context() - if ctx is None: - ctx = RunContext.create_new(training, shutdown_event, final_barrier) - - mp_context = mp.get_context('spawn') - if output_queue is None: - output_queue = create_queue(mp_context) - - if _get_global_compiler_config().save_intermediates: - ctx.intermediates_queue = create_queue(mp_context) - - microbatch, _ = _get_device_zero_inputs(sample_inputs, peek=True) - - # It's possible that no inputs have been provided at this point, just default to 1 - if microbatch is None: - microbatch = 1 - - if training: - input_gradient_queue = create_queue(mp_context) # create a sink so that we can drain it here - _connect_devices( - devices, - sequential=sequential, - training=True, - microbatch=microbatch, - input_gradient_queue=input_gradient_queue, - output_queue=output_queue, - intermediates_queue=ctx.intermediates_queue, - d2d_fwd_queues=d2d_fwd_queues, - d2d_bwd_queues=d2d_bwd_queues) - ctx.input_gradient_queue = input_gradient_queue - else: - _connect_devices( - devices, - sequential=sequential, - training=False, - microbatch=microbatch, - input_gradient_queue=None, - output_queue=output_queue, - intermediates_queue=ctx.intermediates_queue, - d2d_fwd_queues=d2d_fwd_queues) - - ctx.output_queue = output_queue - - # Start device processes - if not sequential: - ctx.processes = _start_device_processes(devices, _get_global_compiler_config().backend_output_dir) - - # Compile all devices - _compile_devices(sequential, training=training, sample_inputs=sample_inputs, sample_targets=sample_targets, 
microbatch_count=microbatch_count, verify_cfg=verify_cfg) - - if device_mode == DeviceMode.CompileOnly: - return output_queue - - # Pass DRAM queue information between compiled devices - _pass_dram_io_descriptors(devices, sequential, training=training, save_intermediates=_get_global_compiler_config().save_intermediates) - - if training: - # Create queues for input/parameter gradients, for verification (if enabled) - if verify_cfg and verify_cfg.enable_input_gradient_checking: - verify_cfg._input_gradient_queue = input_gradient_queue - if verify_cfg and verify_cfg.enable_parameter_gradient_checking: - verify_cfg._parameter_gradient_queue = create_queue(mp_context) - - # Create checkpoint_queue if one is not provided - if checkpoint_queue is None: - checkpoint_queue = queue.Queue() - - ctx.checkpoint_queue = checkpoint_queue - - return checkpoint_queue - - return output_queue - -def _is_active() -> bool: - """ - Return true if a run is active - """ - ctx = get_current_context() - return ctx is not None and ctx.active - -def _run_devices_inference(input_count: int, sequential: bool, output_queue: Optional[queue.Queue], resume: bool, perf_trace: bool, verify_cfg: Optional[VerifyConfig]): - - devices = get_devices() - - if len(devices) == 0: - logger.warning("Nothing to do") - return output_queue - - if resume and not _is_active(): - # can't really resume - resume = False - - if resume: - ctx = get_current_context() - assert ctx is not None - - if resume and ctx.output_queue is None: - logger.warning("Output queue not saved from previous run") - resume = False - - if not resume: - output_queue = _initialize_pipeline(False, output_queue, sequential=sequential, verify_cfg=verify_cfg) - else: - output_queue = ctx.output_queue - - sequential = _sequential_override(sequential) - if sequential: - _run_forward(input_count, sequential) - else: - loop_thread = threading.Thread(target=_run_forward, args=(input_count, sequential)) - ctx = get_current_context() - assert ctx is not None - ctx.loop_thread = loop_thread - loop_thread.start() - - return output_queue - -def _error_shutdown(): - """ - Cleanup on error - """ - if "PYBUDA_TRACE_SHUTDOWN" in os.environ: - import traceback - logger.debug(traceback.format_exc()) - ctx = get_current_context() - if ctx is None: - # There's not context, something went really wrong... 
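For context on how these private helpers are meant to be driven: per the docstrings here, `_initialize_pipeline` and `_run_forward` back the manual-loop API used in lieu of the all-in-one `run_inference`/`run_training` calls. A minimal inference sketch under that assumption; the public names `pybuda.TTDevice`, `pybuda.PyTorchModule`, `pybuda.initialize_pipeline`, and `pybuda.run_forward` are inferred from the docstrings and surrounding code, not confirmed by this diff:

```python
import torch
import pybuda

# Wrap a torch module and place it on a TT device (place_module / push_to_inputs
# mirror the calls that appear in the implementation above)
tt0 = pybuda.TTDevice("tt0")
tt0.place_module(pybuda.PyTorchModule("mlp", torch.nn.Linear(32, 32)))

sample = torch.rand(1, 32)

# Compile and wire up the pipeline, then drive it manually
output_q = pybuda.initialize_pipeline(training=False, sample_inputs=(sample,))
tt0.push_to_inputs((sample,))
pybuda.run_forward(input_count=1)
print(output_q.get())
```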
- logger.warning("No context available for error shutdown.") - return - - ctx.error = True - if ctx.final_barrier: - ctx.final_barrier.abort() - if ctx.shutdown_event: - ctx.shutdown_event.set() - - _shutdown(clear_context=False) - -def _error_raised() -> bool: - ctx = get_current_context() - if ctx is not None and ctx.shutdown_event and ctx.shutdown_event.is_set(): - return True - - return ctx is not None and ctx.error - -def _run_forward_with_fw_looping(ctx: RunContext, microbatch_looping: bool, devices: List[Device], input_count: int, sequential: bool): - num_pushes_per_fwd = int(os.environ["NUM_EXEC_LOOP_ITERATIONS"]) - - logger.info(f"impl.py::_run_forward num_pushes_per_fwd = {num_pushes_per_fwd}") - logger.info(f"impl.py::_run_forward input_count = {input_count}") - - try: - i = ctx.global_input_index - logger.info(f"ctx.global_input_index={ctx.global_input_index}") - if microbatch_looping: - if fw_epoch_looping_enabled: - assert(num_pushes_per_fwd % input_count == 0) - - for d in devices: - invoke_fwd = (i % num_pushes_per_fwd) == 0 or not isinstance(d, TTDevice) - if invoke_fwd: - logger.debug("Running {} device forward: {}", 'sequential' if sequential else 'concurrent', d) - _run_command(d, sequential, Command.run_forward(loop_count=input_count)) - if _error_raised(): - return - - ctx.global_input_index += input_count - - for _ in range(input_count): - if ctx.training: - _run_command(devices[-1], sequential, Command.dc_transfer("target")) - if _error_raised(): - return - - for d in devices: - _run_command(d, sequential, Command.dc_transfer("forward_input")) - if _error_raised(): - return - - # Read out the output - _run_command(devices[-1], sequential, Command.dc_transfer("forward")) - if _error_raised(): - return - else: - for _ in range(input_count): - if ctx.training: - _run_command(devices[-1], sequential, Command.dc_transfer("target")) - if _error_raised(): - return - - for d in devices: - _run_command(d, sequential, Command.dc_transfer("forward_input")) - invoke_fwd = (i % num_pushes_per_fwd) == 0 or not isinstance(d, TTDevice) - if invoke_fwd: - logger.debug("Running {} device forward: {}", 'sequential' if sequential else 'concurrent', d) - _run_command(d, sequential, Command.run_forward(loop_count=1)) - - if _error_raised(): - return - - # Read out the output - _run_command(devices[-1], sequential, Command.dc_transfer("forward")) - ctx.global_input_index += 1 - - except Exception as e: - logger.error("Forward loop error: {}", e) - _error_shutdown() - -def _run_forward(input_count: int = 1, sequential: bool = False): - """ - Run forward passes on the pre-compiled and initialized pipeline of devices. This API should be - called from custom implementations of inference and training loops, in lieue of - calling `run_inference` and `run_training` APIs. - - The result (inference output, or loss if running training) will be placed in the output - queue which should have already been setup through `initialize_pipeline` call. 
- """ - - if _error_raised(): - return - - devices = get_devices() - if len(devices) == 0: - logger.warning("Nothing to do") - return - - sequential = _sequential_override(sequential) - microbatch_looping = False if sequential else "PYBUDA_MICROBATCH_LOOPING" in os.environ - - ctx = get_current_context() - if ctx is None: - raise RuntimeError("Trying to run forward without initializing the pipeline.") - - fw_epoch_looping_enabled = bool("NUM_EXEC_LOOP_ITERATIONS" in os.environ and int(os.environ["NUM_EXEC_LOOP_ITERATIONS"]) > 0) - if fw_epoch_looping_enabled: - _run_forward_with_fw_looping(ctx, microbatch_looping, devices, input_count, sequential) - ## return instead of if-else just to minimize the diff for when we push (and later revert) this change - return - - try: - if microbatch_looping: - for d in devices: - logger.debug("Running {} device forward: {}", 'sequential' if sequential else 'concurrent', d) - _run_command(d, sequential, Command.run_forward(loop_count=input_count)) - if _error_raised(): - return - - for _ in range(input_count): - if ctx.training: - _run_command(devices[-1], sequential, Command.dc_transfer("target")) - if _error_raised(): - return - - for d in devices: - _run_command(d, sequential, Command.dc_transfer("forward_input")) - if _error_raised(): - return - - # Read out the output - _run_command(devices[-1], sequential, Command.dc_transfer("forward")) - if _error_raised(): - return - - # Read out intermediates output - if not ctx.training: - for d in devices: - _run_command(d, sequential, Command.dc_transfer("intermediates")) - if _error_raised(): - return - - else: - for _ in range(input_count): - if ctx.training: - _run_command(devices[-1], sequential, Command.dc_transfer("target")) - if _error_raised(): - return - - for d in devices: - logger.debug("Running {} device forward: {}", 'sequential' if sequential else 'concurrent', d) - _run_command(d, sequential, Command.dc_transfer("forward_input")) - _run_command(d, sequential, Command.run_forward(loop_count=1)) - if _error_raised(): - return - - # Read out the output - _run_command(devices[-1], sequential, Command.dc_transfer("forward")) - if _error_raised(): - return - - # Read out intermediates output - if not ctx.training: - for d in devices: - _run_command(d, sequential, Command.dc_transfer("intermediates")) - if _error_raised(): - return - - except Exception as e: - logger.error("Forward loop error: {}", e) - _error_shutdown() - -def _run_backward(input_count: int, zero_grad: bool, sequential: bool): - - if _error_raised(): - return - - devices = get_devices() - if len(devices) == 0: - logger.warning("Nothing to do") - return - - sequential = _sequential_override(sequential) - microbatch_looping = False if sequential else True - - try: - if microbatch_looping: - for d in reversed(devices): - logger.debug("Running {} device backward: {}", 'sequential' if sequential else 'concurrent', d) - _run_command(d, sequential, Command.run_backward(loop_count=input_count, zero_grad=zero_grad)) - if _error_raised(): - return - - for _ in range(input_count): - _run_command(d, sequential, Command.dc_transfer("backward")) - if _error_raised(): - return - - _run_command(d, sequential, Command.dc_transfer("intermediates")) - if _error_raised(): - return - else: - for i in range(input_count): - for d in reversed(devices): - logger.debug("Running {} device backward: {}", 'sequential' if sequential else 'concurrent', d) - _run_command(d, sequential, Command.run_backward(loop_count=1, zero_grad=zero_grad)) - if _error_raised(): - 
return - - _run_command(d, sequential, Command.dc_transfer("backward")) - if _error_raised(): - return - - _run_command(d, sequential, Command.dc_transfer("intermediates")) - if _error_raised(): - return - zero_grad = False - - except Exception as e: - logger.error("Backward loop error: {}", e) - _error_shutdown() - -def _run_generate(input_count: int, write_index: int, tokens_per_iter: int, token_id: int, sequential: bool): - if _error_raised(): - return - - devices = get_devices() - if len(devices) == 0: - logger.warning("Nothing to do") - return - - sequential = _sequential_override(sequential) - microbatch_looping = False if sequential else "PYBUDA_MICROBATCH_LOOPING" in os.environ - - try: - if microbatch_looping: - for d in devices: - logger.debug("Running {} device forward: {}", 'sequential' if sequential else 'concurrent', d) - _run_command(d, sequential, Command.run_generate(loop_count=input_count, write_index=write_index, tokens_per_iter=tokens_per_iter, token_id=token_id)) - if _error_raised(): - return - - for _ in range(input_count): - for d in devices: - _run_command(d, sequential, Command.dc_transfer("forward_input")) - if _error_raised(): - return - - # Read out the output - _run_command(devices[-1], sequential, Command.dc_transfer("forward")) - if _error_raised(): - return - else: - for _ in range(input_count): - for d in devices: - _run_command(d, sequential, Command.dc_transfer("forward_input")) - logger.debug("Running {} device forward: {}", 'sequential' if sequential else 'concurrent', d) - _run_command(d, sequential, Command.run_generate(loop_count=1, write_index=write_index, tokens_per_iter=tokens_per_iter, token_id=token_id)) - token_id += tokens_per_iter - if _error_raised(): - return - - # Read out the output - _run_command(devices[-1], sequential, Command.dc_transfer("forward")) - if _error_raised(): - return - # if there are no inputs to push, just execute the program once - if input_count == 0: - for d in devices: - _run_command(d, sequential, Command.run_generate(loop_count=1, write_index=write_index, tokens_per_iter=tokens_per_iter, token_id=token_id)) - if _error_raised(): - return - - - except Exception as e: - logger.error("Generate loop error: {}", e) - _error_shutdown() - -def _run_optimizer(sequential: bool): - - if _error_raised(): - return - - devices = get_devices() - if len(devices) == 0: - logger.warning("Nothing to do") - return - - sequential = _sequential_override(sequential) - - try: - for d in devices: - logger.debug("Running {} device optimizer: {}", 'sequential' if sequential else 'concurrent', d) - _run_command(d, sequential, Command.run_optimizer()) - if _error_raised(): - return - except Exception as e: - logger.error("Optimizer loop error: {}", e) - _error_shutdown() - -def _run_schedulers(sequential: bool): - - if _error_raised(): - return - - devices = get_devices() - if len(devices) == 0: - logger.warning("Nothing to do") - return - - sequential = _sequential_override(sequential) - - try: - for d in devices: - logger.debug("Running {} device scheduler: {}", 'sequential' if sequential else 'concurrent', d) - _run_command(d, sequential, Command.run_schedulers()) - if _error_raised(): - return - except Exception as e: - logger.error("Scheduler loop error: {}", e) - _error_shutdown() - -def _get_parameter_checkpoint(device: Union[CPUDevice, TTDevice], sequential: bool) -> Dict[str, Tensor]: - sequential = _sequential_override(sequential) - - try: - ret = _run_command(device, sequential, Command.get_parameter_checkpoint(), response=True) - 
if ret is None: - raise RuntimeError("Error getting parameter checkpoint") - - return ret["checkpoint"] - - except Exception as e: - logger.error("Parameter checkpoint error: {}", e) - _error_shutdown() - return {} - -def _get_parameter_gradients(device: Union[CPUDevice, TTDevice], sequential: bool) -> Dict[str, Tensor]: - sequential = _sequential_override(sequential) - - try: - ret = _run_command(device, sequential, Command.get_parameter_gradients(), response=True) - if ret is None: - raise RuntimeError("Error getting parameter gradients") - - return ret["gradients"] - - except Exception as e: - logger.error("Parameter gradient read error: {}", e) - _error_shutdown() - return {} - -def _get_device_intermediates(device: Union[CPUDevice, TTDevice], sequential: bool) -> Dict[str, Tensor]: - sequential = _sequential_override(sequential) - - try: - ret = _run_command(device, sequential, Command.get_device_intermediates(), response=True) - if ret is None: - raise RuntimeError("Error getting intermediate activations") - - return ret["device_intermediates"] - - except Exception as e: - logger.error("Intermediate activations read error: {}", e) - _error_shutdown() - return {} - - -def _run_training_loop( - sequential: bool, - epochs: int, - steps: int, - accumulation_steps: int, - microbatch_count: int, - checkpoint_interval: int, - checkpoint_queue: queue.Queue, - verify_cfg: Optional[VerifyConfig]): - """ - Run the training loop after everything's been set up. - """ - try: - optimizer_step_count = 0 - checkpointed = False - devices = get_devices() - for epoch in range(epochs): - - if _error_raised(): - return - - logger.info("**** Starting epoch {}", epoch) - - checkpointed = False - for batch in range(steps): - - logger.info("** Starting batch {} in epoch {}", batch, epoch) - for mini_batch in range(accumulation_steps): - logger.info("** Starting mini-batch {}, batch {}, in epoch {}", mini_batch, batch, epoch) - _run_forward(input_count=microbatch_count, sequential=sequential) - - if _error_raised(): - return - - _run_backward(input_count=microbatch_count, zero_grad=(mini_batch==0), sequential=sequential) - - if _error_raised(): - return - - # Drain input gradients if nobody is consuming them - TODO - #if verify_cfg is None or not verify_cfg.enable_input_gradient_checking: - # while not input_gradient_queue.empty(): - # input_gradient_queue.get() - - # Record gradients for verification - if verify_cfg and verify_cfg._parameter_gradient_queue is not None: - - gradient_checkpoint = [_get_parameter_gradients(device, sequential) for device in devices] - verify_cfg._parameter_gradient_queue.put(gradient_checkpoint) - - - _run_optimizer(sequential) - - if _error_raised(): - return - - optimizer_step_count += 1 - if (checkpoint_interval > 0) and (optimizer_step_count % checkpoint_interval == 0): - checkpoint = [_get_parameter_checkpoint(device, sequential) for device in devices] - checkpoint_queue.put(checkpoint) - checkpointed = True - - - for d in devices: - d._step_schedulers() - - if _error_raised(): - return - - # Save final checkpoint - if not checkpointed: # don't double-checkpoint on the last one - checkpoint = [_get_parameter_checkpoint(device, sequential) for device in devices] - checkpoint_queue.put(checkpoint) - - except Exception as e: - logger.error("Training loop error: {}", e) - _error_shutdown() - -def _run_devices_training( - sequential: bool, - epochs: int, - steps: int, - accumulation_steps: int, - microbatch_count: int, - checkpoint_interval: int, - perf_trace: bool, - 
checkpoint_queue: Optional[queue.Queue], - loss_queue: Optional[queue.Queue], - verify_cfg: Optional[VerifyConfig]) -> queue.Queue: - - devices = get_devices() - - if len(devices) == 0: - logger.warning("Nothing to do") - return checkpoint_queue - - checkpoint_queue = _initialize_pipeline(training=True, output_queue=loss_queue, checkpoint_queue=checkpoint_queue, sequential=sequential, verify_cfg=verify_cfg, microbatch_count=microbatch_count) - - sequential = _sequential_override(sequential) - - loop_args = (sequential, epochs, steps, accumulation_steps, microbatch_count, checkpoint_interval, checkpoint_queue, verify_cfg) - if sequential: - _run_training_loop(*loop_args) - if profiler is not None: - profiler.stop() - profiler.print() - html = profiler.output_html() - with open('training_sequential_profile.html', 'w') as f: - f.write(html) - else: - loop_thread = threading.Thread(target=_run_training_loop, args=loop_args) - ctx = get_current_context() - assert ctx is not None - ctx.loop_thread = loop_thread - loop_thread.start() - - return checkpoint_queue - -def _save_parameter_checkpoint(sequential: bool): - """ - Read a checkpoint of parameters and push to checkpoint queue - """ - devices = get_devices() - - if len(devices) == 0: - logger.warning("Nothing to do") - return - - ctx = get_current_context() - if ctx is None: - raise RuntimeError("No current running context") - - if not ctx.training or ctx.checkpoint_queue is None: - raise RuntimeError("Pipeline hasn't been initialized for training") - - try: - checkpoint_queue = ctx.checkpoint_queue - sequential = _sequential_override(sequential) - checkpoint = [_get_parameter_checkpoint(device, sequential) for device in devices] - checkpoint_queue.put(checkpoint) - except Exception as e: - logger.error("Save parameter checkpoint error: {}", e) - _error_shutdown() - -def _initialize_devices(devices: List[Union[CPUDevice, TTDevice]], sequential: bool, training: bool, verify_cfg: Optional[VerifyConfig]): - """ - Setup all devices - """ - if not sequential: - mp_context = mp.get_context('spawn') - shutdown_event = mp_context.Event() - final_barrier = mp_context.Barrier(len(devices) + 1) # plus 1 for this process - else: - final_barrier = None - shutdown_event = mp.Event() - - scale_loss = 1.0 - if verify_cfg is not None and training: - scale_loss = verify_cfg.scale_loss - - d: Union[CPUDevice, TTDevice] - for i, d in enumerate(devices): - d._initialize( - training=training, - sequential=sequential, - final_barrier=final_barrier, - shutdown_event=shutdown_event, - checkpoint_interval=0, - scale_loss=scale_loss) - #perf_dump_mode=buda.PerfDumpMode.SingleDumpPerEpoch if perf_trace else buda.PerfDumpMode.Disable) - if i > 0: - d._first_device = False - - return shutdown_event, final_barrier - -def _connect_devices( - devices: List[Union[CPUDevice, TTDevice]], - sequential: bool, - training: bool, - microbatch: int, - output_queue: queue.Queue, - input_gradient_queue: Optional[queue.Queue], - intermediates_queue: Optional[queue.Queue], - d2d_fwd_queues: List[queue.Queue] = [], - d2d_bwd_queues: List[queue.Queue] = []): - """ - Connect devices by creating device connectors between the appropriate pairs - """ - for i, d in enumerate(devices[:-1]): - target_device = devices[i+1] - - d2d_fwd_queue = d2d_fwd_queues[i] if len(d2d_fwd_queues) > i else None - d._create_forward_device_connector(target_device=target_device, sequential=sequential, d2d_fwd_queue=d2d_fwd_queue, microbatch=microbatch) - if training: - d2d_bwd_queue = d2d_bwd_queues[i] if 
len(d2d_bwd_queues) > i else None - target_device._create_backward_device_connector(d, sequential=sequential, d2d_bwd_queue=d2d_bwd_queue) - - # Connect the first device to input / output of the whole system - devices[0]._create_input_queue_device_connector(devices[0]._input_buffer, sequential=sequential) - - if intermediates_queue is not None: - for device in devices: - if isinstance(device, TTDevice): - device._create_intermediates_queue_device_connector(intermediates_queue) - - if training: - # Input gradient queue if one is provided - if input_gradient_queue is not None: - devices[0]._create_backward_output_queue_device_connector(input_gradient_queue) - - # Target & Loss queues - devices[-1]._create_forward_output_queue_device_connector(output_queue) - devices[-1]._create_target_queue_device_connector(devices[-1].target_input_queue, sequential=sequential) - else: - devices[-1]._create_forward_output_queue_device_connector(output_queue) - -def _pass_dram_io_descriptors(devices: List[Union[CPUDevice, TTDevice]], sequential: bool, training: bool, save_intermediates: bool): - """ - Pass dram io descriptors from TT devices to CPU devices - """ - for i, d in enumerate(devices[:-1]): - target_device = devices[i+1] - - # Get queue descriptors from the target device, if it's a TTDevice - - # Forward - if isinstance(target_device, TTDevice): - # Pushing to TTDevice, we need to set destination queue information - ret = _run_command(target_device, sequential, Command.get_queues("input"), response=True) - if ret is None: - raise RuntimeError("Failed to connect devices.") - _run_command(d, sequential, Command.set_queues("forward_out", ret["queues"], ret["tile_broadcast_dims"], ret["original_shapes"], ret["requires_grad"], ret["runtime_tensor_transforms"], ret["constant_inputs"], ret["tile_dims"])) - - if isinstance(d, TTDevice): - # Reading from TTDevice, need to set TTDevice's output queues - ret = _run_command(d, sequential, Command.get_queues("output"), response=True) - if ret is None: - raise RuntimeError("Failed to connect devices.") - _run_command(target_device, sequential, Command.set_queues("forward_in", ret["queues"], ret["tile_broadcast_dims"], ret["original_shapes"], ret["requires_grad"], ret["runtime_tensor_transforms"], ret["constant_inputs"], ret["tile_dims"])) - - if not training: - continue - - # Backward, the other way around - if isinstance(d, TTDevice): - # Pushing backward to TTDevice, we need to set destination queue information - ret = _run_command(d, sequential, Command.get_queues("bw_input"), response=True) - if ret is None: - raise RuntimeError("Failed to connect devices.") - _run_command(target_device, sequential, Command.set_queues("backward_out_push", ret["queues"], ret["tile_broadcast_dims"], ret["original_shapes"], ret["requires_grad"], ret["runtime_tensor_transforms"], ret["constant_inputs"], ret["tile_dims"])) - - if isinstance(target_device, TTDevice): - # Reading backward from TTDevice, need to set TTDevice's output queues - ret = _run_command(target_device, sequential, Command.get_queues("bw_output"), response=True) - if ret is None: - raise RuntimeError("Failed to connect devices.") - _run_command(d, sequential, Command.set_queues("backward_in", ret["queues"], ret["tile_broadcast_dims"], ret["original_shapes"], ret["requires_grad"], ret["runtime_tensor_transforms"], ret["constant_inputs"], ret["tile_dims"])) - - - # Set it for the first device - if isinstance(devices[0], TTDevice): - ret = _run_command(devices[0], sequential, Command.get_queues("input"), 
response=True) - if ret is None: - raise RuntimeError("Failed to connect devices.") - # Force "sequential" to true to set ret on local process, which will be pushing data in - _run_command(devices[0], sequential, Command.set_queues("forward_in_push", ret["queues"], ret["tile_broadcast_dims"], ret["original_shapes"], ret["requires_grad"], ret["runtime_tensor_transforms"], ret["constant_inputs"], ret["tile_dims"])) - - # Set it for the last device - if training and isinstance(devices[-1], TTDevice): - ret = _run_command(devices[-1], sequential, Command.get_queues("target"), response=True) - if ret is None: - raise RuntimeError("Failed to connect devices.") - # Force "sequential" to true to set ret on local process, which will be pushing data in - _run_command(devices[-1], sequential, Command.set_queues("target_in_push", ret["queues"], ret["tile_broadcast_dims"], ret["original_shapes"], ret["requires_grad"], ret["runtime_tensor_transforms"], ret["constant_inputs"], ret["tile_dims"])) - - if training and isinstance(devices[0], TTDevice): - ret = _run_command(devices[0], sequential, Command.get_queues("bw_output"), response=True) - if ret is None: - raise RuntimeError("Failed to connect devices.") - _run_command(devices[0], sequential, Command.set_queues("backward_out", ret["queues"], ret["tile_broadcast_dims"], ret["original_shapes"], ret["requires_grad"], ret["runtime_tensor_transforms"], ret["constant_inputs"], ret["tile_dims"])) - - if isinstance(devices[-1], TTDevice): - ret = _run_command(devices[-1], sequential, Command.get_queues("output"), response=True) - if ret is None: - raise RuntimeError("Failed to connect devices.") - _run_command(devices[-1], sequential, Command.set_queues("forward_out_pop", ret["queues"], ret["tile_broadcast_dims"], ret["original_shapes"], ret["requires_grad"], ret["runtime_tensor_transforms"], ret["constant_inputs"], ret["tile_dims"])) - - for device in devices: - if save_intermediates and isinstance(device, TTDevice): - ret = _run_command(device, sequential, Command.get_queues("intermediates"), response=True) - if ret is None: - raise RuntimeError("Failed to connect devices.") - _run_command(device, sequential, Command.set_queues("intermediates_pop", ret["queues"], ret["tile_broadcast_dims"], ret["original_shapes"], ret["requires_grad"], ret["runtime_tensor_transforms"], ret["constant_inputs"], ret["tile_dims"])) - - -def _start_device_processes(devices: List[Union[CPUDevice, TTDevice]], output_dir: str) -> List[mp.Process]: - processes: List = [] - mp_context = mp.get_context('spawn') - - try: - for i, d in enumerate(devices): - logger.trace("Creating child process for device {}", d) - - # Only first device should still have first inputs around. Due to automatic CPU fallback, - # TT device that's no longer first could have a stale copy of these, which pytorch will - # try to transfer over when starting the process, causing a "bad fds_to_keep" system error. 
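`_initialize_devices` and `_start_device_processes` above share one shutdown Event and one Barrier (sized to the number of devices plus the main process) across spawn-context processes, with an environment-variable escape hatch to threads. A self-contained sketch of that coordination pattern; the worker body is a placeholder, not the real device loop:

```python
import multiprocessing as mp
import time

def device_loop(idx, shutdown_event, final_barrier):
    # Stand-in for servicing the per-device command queue
    while not shutdown_event.is_set():
        time.sleep(0.01)
    # Every device checks in before teardown proceeds
    final_barrier.wait()

if __name__ == "__main__":
    ctx = mp.get_context("spawn")
    shutdown_event = ctx.Event()
    final_barrier = ctx.Barrier(2 + 1)  # two "devices" plus the main process
    procs = [ctx.Process(target=device_loop, args=(i, shutdown_event, final_barrier))
             for i in range(2)]
    for p in procs:
        p.start()
    shutdown_event.set()
    final_barrier.wait()
    for p in procs:
        p.join()
```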
- if i > 0: - d._first_inputs = None - - # Create python thread instead of another process - if os.environ.get("PYBUDA_FORCE_THREADS", "0") != "0": - processes.append(threading.Thread(target=d.run, args=(output_dir,))) - else: - processes.append(mp_context.Process(target=d.run, args=(output_dir,))) - - for p in processes: - p.start() - except Exception as e: - logger.error("Process spawn error: {}", e) - _error_shutdown() - - return processes - -def _get_device_zero_inputs(sample_inputs, peek=False): - compiler_cfg = _get_global_compiler_config() - devices = get_devices() - if compiler_cfg.compile_subgraphs: - num_input_groups = len(sample_inputs) - num_modules = len(devices[0].modules) - assert num_input_groups == num_modules, \ - "Number of input groups ({}) must match number of modules ({})".format(num_input_groups, num_modules) - microbatch_size = sample_inputs[0][0].shape[0] - - batch_removed_inputs = [] - for i in range(num_input_groups): - assert microbatch_size == sample_inputs[i][0].shape[0], \ - "Microbatch size must be the same for subgraph modules." - batch_removed_inputs.append(remove_microbatch(sample_inputs[i])) - - inputs = batch_removed_inputs - else: - if len(sample_inputs) > 0: - microbatch_size = sample_inputs[0].shape[0] - inputs = remove_microbatch(sample_inputs) - else: - microbatch_size, inputs = devices[0].get_first_inputs(peek) - - return microbatch_size, inputs - -def _compile_devices( - sequential: bool, - training: bool, - sample_inputs: Tuple[Union[torch.Tensor, Tensor], ...] = tuple(), - sample_targets: Tuple[Union[torch.Tensor, Tensor], ...] = tuple(), - microbatch_count: int = 1, - verify_cfg: Optional[VerifyConfig] = None): - """ - Compile modules on TT devices, for inference or training. If input shaes / types are provided, those - will be used... otherwise, first input from the input buffer will be peeked at (one must be there already). 
- """ - - compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_training = training - if verify_cfg is None: - verify_cfg = VerifyConfig.disabled() - else: - verify_cfg.run_golden = False - - # TODO: need to give user ability to set this outside of verify_cfg - for epoch_break in verify_cfg.epoch_breaks: - compiler_cfg.place_on_new_epoch(epoch_break) - - devices = get_devices() - microbatch_size, inputs = _get_device_zero_inputs(sample_inputs) - - targets = [] - if training: - if len(sample_targets) > 0: - targets = remove_microbatch(sample_targets) - else: - targets = devices[-1].get_first_targets() - - for i, d in enumerate(devices): - dev_targets = [] if i < len(devices) - 1 else targets - ret = _run_command(d, sequential, Command.compile(inputs, compiler_cfg, dev_targets, microbatch_size, microbatch_count, verify_cfg), response=True) - if isinstance(ret, Exception): - raise ret - if ret is None: - raise RuntimeError(f"Compile failed for {d}") - - inputs = ret["outputs"] - -def _shutdown(clear_context: bool = True): - """ - Shutdown running processes and clean up - """ - _cleanup_temporary_files() - - ctx = get_current_context() - if ctx is None: - clear_backend_param_cache() - return # nothing to shutdown - - logger.debug("PyBuda shutdown") - - sequential = len(ctx.processes) == 0 - devices = get_devices() - - if not _error_raised(): - for d in devices: - _run_command(d, sequential, Command.quit()) - - if ctx.final_barrier: - logger.trace("Setting final barrier on main process") - ctx.final_barrier.wait() - - if ctx.loop_thread: - ctx.loop_thread.join() - - logger.debug("Waiting until processes done") - if len(ctx.processes) > 0: - - if _error_raised(): - # wait a couple of seconds, then kill processes - import time - start = time.time() - while time.time() - start <= 2: - if not any(p.is_alive() for p in ctx.processes): - break - time.sleep(0.25) - - for p in ctx.processes: - p.terminate() - p.join() - - else: - # clean join - for p in ctx.processes: - if p == mp.current_process(): - continue # don't wait on yourself - p.join() - - finish_child_process() # clean up backend - - if clear_context: - clear_current_context() - -def _update_device_parameters(devices: List[Union["CPUDevice", "TTDevice"]], parameters: List[Dict[str, torch.Tensor]], sequential: bool = False): - """ - Push new parameters onto given device, or if none is provided, then all devices in the pipeline. 
- """ - sequential = _sequential_override(sequential) - for p in parameters: - for name in p: - p[name] = p[name].detach().value() if isinstance(p[name], Tensor) else detach_tensors([p[name]])[0] - for i, d in enumerate(devices): - _run_command(d, sequential, Command.update_device_parameters(parameters[i])) - -def _get_loss_queue() -> Optional[queue.Queue]: - ctx = get_current_context() - if ctx is None or not _is_active() or not ctx.training: - logger.warning("No active training run, no loss queue available.") - return None - return ctx.output_queue - -def _get_checkpoint_queue() -> Optional[queue.Queue]: - ctx = get_current_context() - if ctx is None or not _is_active() or not ctx.training: - logger.warning("No active training run, no checkpoint queue available.") - return None - return ctx.checkpoint_queue - -def _get_intermediates_queue() -> Optional[queue.Queue]: - ctx = get_current_context() - if ctx is None or not _is_active(): - logger.warning("No active training run, no checkpoint queue available.") - return None - return ctx.intermediates_queue - -def _sync(): - """ - Call sync on each device, block until response has been received. - """ - ctx = get_current_context() - if ctx is None: - return # nothing to sync on - - logger.debug("PyBuda sync") - - sequential = len(ctx.processes) == 0 - devices = get_devices() - for d in devices: - _run_command(d, sequential, Command.sync(), response=True) - diff --git a/pybuda/pybuda/tensor.py b/pybuda/pybuda/tensor.py index 0cfc65c0a..035389df4 100644 --- a/pybuda/pybuda/tensor.py +++ b/pybuda/pybuda/tensor.py @@ -7,7 +7,6 @@ import torch import tensorflow as tf -import mxnet import numpy as np import math from loguru import logger @@ -18,9 +17,7 @@ from .pybudaglobal import TILE_DIM, align_up_tile, round_up_div from pybuda._C import DataFormat -from pybuda._C.backend_api import PytorchTensorDesc, TilizedTensorDesc from pybuda._C.graph import OpType, RuntimeTensorTransform, RuntimeTensorTransformType, get_constant_input_value -from pybuda._C.backend_api import DramIODesc from pybuda.utils import detach_tensors from functools import reduce from operator import mul @@ -204,11 +201,11 @@ def to_buda_shape(self) -> "Tensor": """ raise RuntimeError("Children should override") - def to_tensor_desc(self) -> "PytorchTensorDesc": - """ - Return a tensor descriptor, with shapes, strides, and a pointer to data buffer - """ - raise RuntimeError("Children should override") + # def to_tensor_desc(self) -> "PytorchTensorDesc": + # """ + # Return a tensor descriptor, with shapes, strides, and a pointer to data buffer + # """ + # raise RuntimeError("Children should override") def is_constant(self) -> bool: """ @@ -269,12 +266,12 @@ def create_from_trace(cls, src_op: "PyBudaOp", shape: Tuple[int, ...], data_form """ return TensorFromTrace(src_op, shape, data_format) - @classmethod - def create_from_tensor_descriptor(cls, descriptor: "PytorchTensorDesc") -> "TensorFromDescriptor": - """ - New path to creating front-end Tensor - """ - return TensorFromDescriptor(descriptor) + # @classmethod + # def create_from_tensor_descriptor(cls, descriptor: "PytorchTensorDesc") -> "TensorFromDescriptor": + # """ + # New path to creating front-end Tensor + # """ + # return TensorFromDescriptor(descriptor) class TensorFromPytorch(Tensor): """ @@ -328,11 +325,11 @@ def to_buda_shape( value, tile_broadcast_dims, squeeze, microbatch, tile_r, tile_c) return Tensor.create_from_torch(new_tensor) - def to_tensor_desc(self) -> "PytorchTensorDesc": - """ - Creates a fully-populated 
descriptor if a pytorch tensor is set as value. Otherwise, an empty wrapper. - """ - return pytorch_tensor_to_tensor_desc(self._value) + # def to_tensor_desc(self) -> "PytorchTensorDesc": + # """ + # Creates a fully-populated descriptor if a pytorch tensor is set as value. Otherwise, an empty wrapper. + # """ + # return pytorch_tensor_to_tensor_desc(self._value) @property def pt_data_format(self) -> torch.dtype: @@ -422,24 +419,24 @@ def to_format(self, data_format: DataFormat) -> "Tensor": new_t._data_format = data_format return new_t - def to_tensor_desc(self, batch: int = 0, override_data_format: DataFormat = DataFormat.Invalid) -> "PytorchTensorDesc": - """ - Creates a descriptor, but doesn't assign a valid data pointer. - Optionally modify the shape to add a batch value. + # def to_tensor_desc(self, batch: int = 0, override_data_format: DataFormat = DataFormat.Invalid) -> "PytorchTensorDesc": + # """ + # Creates a descriptor, but doesn't assign a valid data pointer. + # Optionally modify the shape to add a batch value. - Parameters - ---------- - t: Tensor - Pybuda tensor to be turned into a descriptor + # Parameters + # ---------- + # t: Tensor + # Pybuda tensor to be turned into a descriptor - batch: int, optional - If batch != 0, set batch dimension to given value - """ + # batch: int, optional + # If batch != 0, set batch dimension to given value + # """ - if self._value: - return pytorch_tensor_to_tensor_desc(self._value) + # if self._value: + # return pytorch_tensor_to_tensor_desc(self._value) - assert False + # assert False def detach(self) -> Tensor: if self.has_value(): @@ -461,103 +458,103 @@ def create_pt_zeros(self) -> torch.Tensor: def to_framework(self, framework: str) -> "Tensor": return super().to_framework(framework) -class TensorFromDescriptor(Tensor): - """ - Tensor wrapper created from tensor descriptor - """ - def __init__(self, descriptor: "PytorchTensorDesc"): - super().__init__() - self.descriptor = descriptor - self.requires_grad = False - - # Cloning a tensor from descriptor creates a pytorch tensor - def clone(self) -> "TensorFromTorch": - return Tensor.create_from_torch(self.value(clone=True)) +# class TensorFromDescriptor(Tensor): +# """ +# Tensor wrapper created from tensor descriptor +# """ +# def __init__(self, descriptor: "PytorchTensorDesc"): +# super().__init__() +# self.descriptor = descriptor +# self.requires_grad = False + +# # Cloning a tensor from descriptor creates a pytorch tensor +# def clone(self) -> "TensorFromTorch": +# return Tensor.create_from_torch(self.value(clone=True)) - def has_value(self) -> bool: - return True +# def has_value(self) -> bool: +# return True - def value(self, clone = False) -> torch.Tensor: - tensor = tensor_desc_to_pytorch_tensor(self.descriptor) - if clone: - return tensor.clone() +# def value(self, clone = False) -> torch.Tensor: +# tensor = tensor_desc_to_pytorch_tensor(self.descriptor) +# if clone: +# return tensor.clone() - return tensor +# return tensor - def to_buda_shape(self) -> "Tensor": - raise RuntimeError("Tensor descriptor should not be converted to buda shape") +# def to_buda_shape(self) -> "Tensor": +# raise RuntimeError("Tensor descriptor should not be converted to buda shape") - def to_tensor_desc(self) -> "PytorchTensorDesc": - return self.descriptor +# def to_tensor_desc(self) -> "PytorchTensorDesc": +# return self.descriptor - # TODO: Can reinterpret shape be moved outside of this method? 
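The `to_buda_shape` and `narrow_to_original_shape` paths around here pad rows and columns up to multiples of the 32x32 tile on the way into the device and narrow back to the original shape on the way out. A minimal illustration of that round trip using plain torch, with simplified stand-ins for `pad_pytorch_tensor_to_buda` / `narrow_buda_tensor_to_pytorch`:

```python
import torch

TILE_DIM = 32

def pad_to_tiles(t: torch.Tensor) -> torch.Tensor:
    # Pad the last two dims up to the next multiple of the tile size
    pad_c = (-t.shape[-1]) % TILE_DIM
    pad_r = (-t.shape[-2]) % TILE_DIM
    return torch.nn.functional.pad(t, (0, pad_c, 0, pad_r))

def narrow_to_original(t: torch.Tensor, original_shape) -> torch.Tensor:
    # Only rows and columns were padded, so only they get narrowed back
    return t[..., :original_shape[-2], :original_shape[-1]]

x = torch.rand(1, 1, 50, 70)
padded = pad_to_tiles(x)                      # -> (1, 1, 64, 96)
restored = narrow_to_original(padded, x.shape)
assert torch.equal(x, restored)
```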
- def narrow_to_original_shape(self, original_shape: Tuple[int, ...], reinterpret_shape: Optional[Tuple[int, ...]] = None, has_microbatch_dim: bool = False, unpadded_shape: Optional[Tuple[int, ...]] = None) -> "Tensor": - """ - Narrow the tensor to a smaller one, if original shape is smaller - """ - assert type(original_shape) == tuple, "original_shape must be a tuple" +# # TODO: Can reinterpret shape be moved outside of this method? +# def narrow_to_original_shape(self, original_shape: Tuple[int, ...], reinterpret_shape: Optional[Tuple[int, ...]] = None, has_microbatch_dim: bool = False, unpadded_shape: Optional[Tuple[int, ...]] = None) -> "Tensor": +# """ +# Narrow the tensor to a smaller one, if original shape is smaller +# """ +# assert type(original_shape) == tuple, "original_shape must be a tuple" - tensor = self.value() +# tensor = self.value() - if self.shape.get_pytorch_shape() == original_shape and (reinterpret_shape is None or len(reinterpret_shape) == 0): - return Tensor.create_from_torch(tensor) +# if self.shape.get_pytorch_shape() == original_shape and (reinterpret_shape is None or len(reinterpret_shape) == 0): +# return Tensor.create_from_torch(tensor) - shape_transform = original_shape if (reinterpret_shape is None or len(reinterpret_shape) == 0) else reinterpret_shape +# shape_transform = original_shape if (reinterpret_shape is None or len(reinterpret_shape) == 0) else reinterpret_shape - new_shape = list(self.shape.get_pytorch_shape()) - # Only R/C get narrowed - new_shape[-1] = shape_transform[-1] - if len(shape_transform) > 1: - new_shape[-2] = shape_transform[-2] - new_shape = (*shape_transform[:-2], new_shape[-2], new_shape[-1]) - new_tensor = narrow_buda_tensor_to_pytorch(tensor, new_shape, has_microbatch_dim=has_microbatch_dim) - else: - new_shape = (new_shape[-1],) - new_tensor = narrow_buda_tensor_to_pytorch(tensor, new_shape, has_microbatch_dim=has_microbatch_dim) +# new_shape = list(self.shape.get_pytorch_shape()) +# # Only R/C get narrowed +# new_shape[-1] = shape_transform[-1] +# if len(shape_transform) > 1: +# new_shape[-2] = shape_transform[-2] +# new_shape = (*shape_transform[:-2], new_shape[-2], new_shape[-1]) +# new_tensor = narrow_buda_tensor_to_pytorch(tensor, new_shape, has_microbatch_dim=has_microbatch_dim) +# else: +# new_shape = (new_shape[-1],) +# new_tensor = narrow_buda_tensor_to_pytorch(tensor, new_shape, has_microbatch_dim=has_microbatch_dim) - new_tensor = new_tensor.reshape(original_shape) +# new_tensor = new_tensor.reshape(original_shape) - # Reshape the rest - return Tensor.create_from_torch(new_tensor) +# # Reshape the rest +# return Tensor.create_from_torch(new_tensor) - @property - def pt_data_format(self) -> torch.dtype: - return buda_dataformat_to_pytorch_dtype(self.descriptor.format) +# @property +# def pt_data_format(self) -> torch.dtype: +# return buda_dataformat_to_pytorch_dtype(self.descriptor.format) - @property - def data_format(self) -> DataFormat: - return self.descriptor.format +# @property +# def data_format(self) -> DataFormat: +# return self.descriptor.format - def to_format(self, data_format: DataFormat) -> "Tensor": - """ - Convert this tensor to data_format - """ - new_pt_tensor = self.value().type(buda_dataformat_to_pytorch_dtype(data_format)) - new_pt_tensor.requires_grad = self.requires_grad - return Tensor.create_from_torch(new_pt_tensor, dev_data_format=data_format) +# def to_format(self, data_format: DataFormat) -> "Tensor": +# """ +# Convert this tensor to data_format +# """ +# new_pt_tensor = 
self.value().type(buda_dataformat_to_pytorch_dtype(data_format)) +# new_pt_tensor.requires_grad = self.requires_grad +# return Tensor.create_from_torch(new_pt_tensor, dev_data_format=data_format) - @property - def shape(self): - return TensorShape(*self.descriptor.shape) +# @property +# def shape(self): +# return TensorShape(*self.descriptor.shape) - def to_framework(self, framework: str) -> "Tensor": - return super().to_framework(framework) +# def to_framework(self, framework: str) -> "Tensor": +# return super().to_framework(framework) -def verify_tile_dims(data, msg = "Dim check"): - """ - Verify that data tensor, or all tensors in data list have rows and columns divisible with tile dimensions - """ - if isinstance(data, (list, tuple)): - for d in data: - verify_tile_dims(d, msg) - return +# def verify_tile_dims(data, msg = "Dim check"): +# """ +# Verify that data tensor, or all tensors in data list have rows and columns divisible with tile dimensions +# """ +# if isinstance(data, (list, tuple)): +# for d in data: +# verify_tile_dims(d, msg) +# return - if data.shape[-1] % TILE_DIM != 0: - raise RuntimeError(f"{msg}: Shape {data.shape}: Column of {data.shape[-1]} encountered, which is not divisible with tile dimension of {TILE_DIM}") +# if data.shape[-1] % TILE_DIM != 0: +# raise RuntimeError(f"{msg}: Shape {data.shape}: Column of {data.shape[-1]} encountered, which is not divisible with tile dimension of {TILE_DIM}") - if data.shape[-2] % TILE_DIM != 0: - raise RuntimeError(f"{msg}: Shape {data.shape}: Row of {data.shape[-2]} encountered, which is not divisible with tile dimension of {TILE_DIM}") +# if data.shape[-2] % TILE_DIM != 0: +# raise RuntimeError(f"{msg}: Shape {data.shape}: Row of {data.shape[-2]} encountered, which is not divisible with tile dimension of {TILE_DIM}") def pytorch_dtype_to_buda_dataformat(dtype: torch.dtype, fp32_fallback: Optional[DataFormat] = None) -> DataFormat: @@ -649,125 +646,138 @@ def is_equivalent_data_format(pt_df: torch.dtype, tt_df: DataFormat) -> bool: return False -def pytorch_tensor_to_tensor_desc(t: torch.Tensor, df: DataFormat = None, element_size=None) -> "PytorchTensorDesc": - if isinstance(t, PytorchTensorDesc) or isinstance(t, TilizedTensorDesc): - return t - - if not t.is_contiguous(): - t = t.contiguous() - - if df is None: - if t.dtype == torch.float32: - format = DataFormat.Float32 - elif t.dtype == torch.bfloat16: - format = DataFormat.Float16_b - elif t.dtype == torch.float16: - format = DataFormat.Float16 - elif t.dtype == torch.int32: - format = DataFormat.Int32 - elif t.dtype == torch.int8: - format = DataFormat.Int8 - elif t.dtype == torch.int64: - logger.warning("Converting int64 to int32 for tilization") - t = t.to(torch.int32) # TODO: Fix this hack - format = DataFormat.RawUInt32 - else: - raise RuntimeError("Unsupported torch tensor type for tilization: " + str(t.dtype)) - else: - # If we already know dataformat, don't infer - format = df - - tilize_ndim = 4 - shape = list(t.shape) - dim = len(shape) - if (dim == 2): - dim = 3 - while len(shape) > tilize_ndim: - if shape[0] != 1: - raise RuntimeError("Dropping a dimension that's not 1 to reduce shape to 4D: " + str(t.shape)) - shape = shape[1:] - - while len(shape) < tilize_ndim: - shape = [1] + shape - - strides = list(t.stride()) - while len(strides) > tilize_ndim: - strides = strides[1:] - - while len(strides) < tilize_ndim: - strides = [strides[0]] + strides - - if element_size is None: - element_size = t.element_size() - - strides = [s * element_size for s in strides] - 
desc = PytorchTensorDesc( - t, - element_size, - format, - dim, - shape, - strides, - ) - - return desc - - -def tensor_desc_to_pytorch_tensor(desc: "PytorchTensorDesc") -> torch.Tensor: - if desc.format == DataFormat.Float32: - dtype = torch.float32 - elif desc.format == DataFormat.Float16_b: - dtype = torch.bfloat16 - elif desc.format == DataFormat.Float16: - dtype = torch.float16 - elif desc.format == DataFormat.RawUInt32: - dtype = torch.int - else: - raise RuntimeError(f"Unsupported tensor type({desc.format}) for untilization") - - t = torch.frombuffer(desc, dtype=dtype) - t = torch.reshape(t, desc.shape) +# def pytorch_tensor_to_tensor_desc(t: torch.Tensor, df: DataFormat = None, element_size=None) -> "PytorchTensorDesc": +# if isinstance(t, PytorchTensorDesc) or isinstance(t, TilizedTensorDesc): +# return t + +# if not t.is_contiguous(): +# t = t.contiguous() + +# if df is None: +# if t.dtype == torch.float32: +# format = DataFormat.Float32 +# elif t.dtype == torch.bfloat16: +# format = DataFormat.Float16_b +# elif t.dtype == torch.float16: +# format = DataFormat.Float16 +# elif t.dtype == torch.int32: +# format = DataFormat.Int32 +# elif t.dtype == torch.int8: +# format = DataFormat.Int8 +# elif t.dtype == torch.int64: +# logger.warning("Converting int64 to int32 for tilization") +# t = t.to(torch.int32) # TODO: Fix this hack +# format = DataFormat.RawUInt32 +# else: +# raise RuntimeError("Unsupported torch tensor type for tilization: " + str(t.dtype)) +# else: +# # If we already know dataformat, don't infer +# format = df + +# # Before we push the tensors to the queue, we need to make sure that the +# # tensors are in the right format and aligned between PyBuda and PyTorch. +# # If this isn't the case, expected shapes on the queues will be invalid +# # and the runtime will crash. +# # +# # Therefore, when we know the data format, we should check if the tensor +# # is appropriate/supported PyTorch format. If that isn't the case, we should +# # convert it to the appropriate PyTorch aligned format. 
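The (now commented-out) descriptor path above infers a backend DataFormat from the torch dtype, demoting int64 to int32 before tilization. A standalone sketch of that inference using a plain dict in place of the pybuda._C DataFormat enum; the string values are placeholders:

```python
import torch

# Placeholder names standing in for the pybuda._C.DataFormat members used above
TORCH_TO_BUDA_FORMAT = {
    torch.float32: "Float32",
    torch.bfloat16: "Float16_b",
    torch.float16: "Float16",
    torch.int32: "Int32",
    torch.int8: "Int8",
}

def infer_format(t: torch.Tensor):
    if t.dtype == torch.int64:
        # Mirrors the warning path above: int64 is narrowed to int32 before tilization
        return t.to(torch.int32), "RawUInt32"
    if t.dtype not in TORCH_TO_BUDA_FORMAT:
        raise RuntimeError(f"Unsupported torch tensor type for tilization: {t.dtype}")
    return t, TORCH_TO_BUDA_FORMAT[t.dtype]

t, fmt = infer_format(torch.ones(2, 2, dtype=torch.int64))
assert t.dtype == torch.int32 and fmt == "RawUInt32"
```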
+# pytorch_dtype = buda_dataformat_to_pytorch_dtype(format) +# if t.dtype != pytorch_dtype: +# logger.warning(f"Converting tensor from {t.dtype} to {pytorch_dtype}") +# t = t.type(pytorch_dtype) + +# tilize_ndim = 4 +# shape = list(t.shape) +# dim = len(shape) +# if (dim == 2): +# dim = 3 +# while len(shape) > tilize_ndim: +# if shape[0] != 1: +# raise RuntimeError("Dropping a dimension that's not 1 to reduce shape to 4D: " + str(t.shape)) +# shape = shape[1:] + +# while len(shape) < tilize_ndim: +# shape = [1] + shape + +# strides = list(t.stride()) +# while len(strides) > tilize_ndim: +# strides = strides[1:] + +# while len(strides) < tilize_ndim: +# strides = [strides[0]] + strides + +# if element_size is None: +# element_size = t.element_size() + +# strides = [s * element_size for s in strides] +# desc = PytorchTensorDesc( +# t, +# element_size, +# format, +# dim, +# shape, +# strides, +# ) + +# return desc + + +# def tensor_desc_to_pytorch_tensor(desc: "PytorchTensorDesc") -> torch.Tensor: +# if desc.format == DataFormat.Float32: +# dtype = torch.float32 +# elif desc.format == DataFormat.Float16_b: +# dtype = torch.bfloat16 +# elif desc.format == DataFormat.Float16: +# dtype = torch.float16 +# elif desc.format == DataFormat.RawUInt32: +# dtype = torch.int +# else: +# raise RuntimeError(f"Unsupported tensor type({desc.format}) for untilization") + +# t = torch.frombuffer(desc, dtype=dtype) +# t = torch.reshape(t, desc.shape) - return t - -def buffer_to_pytorch_tensor(buf_ptr:int, shape: Tuple, format: DataFormat) -> "PytorchTensorDesc": - """ - Convert buffer point to pytorch tensor, given shape and data format. - The assumption is that the buffer is in row-major format. - """ - - tilize_ndim = 4 - dim = len(shape) - while len(shape) < tilize_ndim: - shape = [1] + shape - while len(shape) > tilize_ndim: - if shape[0] != 1: - raise RuntimeError("Trimming a dimension that's not 1") - shape = shape[1:] - - if format == DataFormat.Float32: - element_size = 4 - elif format == DataFormat.Float16_b: - element_size = 2 - elif format == DataFormat.Float16: - element_size = 2 - else: - raise RuntimeError("Unsupported format") - - strides = [element_size] - for i in range(tilize_ndim-1): - strides = [shape[-1-i] * strides[0]] + strides - - desc = PytorchTensorDesc( - buf_ptr, - element_size, - format, - dim, - shape, - strides, - ) - - return tensor_desc_to_pytorch_tensor(desc) +# return t + +# def buffer_to_pytorch_tensor(buf_ptr:int, shape: Tuple, format: DataFormat) -> "PytorchTensorDesc": +# """ +# Convert buffer point to pytorch tensor, given shape and data format. +# The assumption is that the buffer is in row-major format. 
+# """ + +# tilize_ndim = 4 +# dim = len(shape) +# while len(shape) < tilize_ndim: +# shape = [1] + shape +# while len(shape) > tilize_ndim: +# if shape[0] != 1: +# raise RuntimeError("Trimming a dimension that's not 1") +# shape = shape[1:] + +# if format == DataFormat.Float32: +# element_size = 4 +# elif format == DataFormat.Float16_b: +# element_size = 2 +# elif format == DataFormat.Float16: +# element_size = 2 +# else: +# raise RuntimeError("Unsupported format") + +# strides = [element_size] +# for i in range(tilize_ndim-1): +# strides = [shape[-1-i] * strides[0]] + strides + +# desc = PytorchTensorDesc( +# buf_ptr, +# element_size, +# format, +# dim, +# shape, +# strides, +# ) + +# return tensor_desc_to_pytorch_tensor(desc) def pad_sparse_pytorch_tensor_to_buda(sparse: torch.Tensor) -> torch.Tensor: @@ -1063,8 +1073,6 @@ def to_pt_tensors(tensors: Union[Tuple[Union[torch.Tensor, Tensor, tf.Tensor], . elif isinstance(t, np.ndarray): pytorch_tensors.append(torch.Tensor(t)) - elif isinstance(t, mxnet.ndarray.ndarray.NDArray): - pytorch_tensors.append(torch.Tensor(t.asnumpy())) elif isinstance(t, jaxlib.xla_extension.DeviceArray): pytorch_tensors.append(torch.Tensor(np.array(t))) else: @@ -1107,8 +1115,6 @@ def to_jax_tensors(tensors: Union[Tuple[Union[torch.Tensor, Tensor, tf.Tensor], elif isinstance(t, np.ndarray): jax_tensors.append(jnp.asarray(t)) - elif isinstance(t, mxnet.ndarray.ndarray.NDArray): - jax_tensors.append(jnp.asarray(t.asnumpy())) elif isinstance(t, jaxlib.xla_extension.DeviceArray): jax_tensors.append(t) else: @@ -1363,48 +1369,64 @@ def get_post_const_eval_tensors(graph, device_constant_and_parameters, consteval return post_const_eval_constants -def _embedding_index(tensor: torch.Tensor, original_shape: Tuple[int, ...], queue: DramIODesc): - assert queue.data_format in [DataFormat.RawUInt8, DataFormat.RawUInt16, DataFormat.RawUInt32] - assert len(tensor.shape) <= 2, "Must be a 1d tensor" - assert len(original_shape) <= 1 or original_shape[-2] == 1, "Must be a 1d tensor" - assert len(original_shape) <= 2 or original_shape[-3] == 1, "Must be a 1d tensor" - - q_rt = queue.bufq_grid_dim_r * queue.mblock_m * queue.ublock_rt - w = tensor.shape[0] if len(tensor.shape) > 1 else 1 - pad = align_up(tensor.shape[-1], TILE_DIM) - tensor.shape[-1] - tensor = torch.nn.functional.pad(tensor, (0, pad)) - tensor = tensor.reshape(w, 1, 1, tensor.shape[-1]) - tensor[:, :, :, original_shape[-1]:] = ~torch.tensor(0, dtype=tensor.dtype) - tensor = tensor.view(w, q_rt, -1, TILE_DIM) - pad = align_up(tensor.shape[-2], TILE_DIM) - tensor.shape[-2] - tensor = torch.nn.functional.pad(tensor, (0, 0, 0, pad)) - tensor = tensor.view(w, q_rt, -1, TILE_DIM, TILE_DIM) - tensor = tensor.transpose(2, 3).view(w, 1, q_rt * TILE_DIM, -1) - - assert len(tensor.shape) == 4, "_embedding_index: rank changed" - assert tensor.shape[0] == w, "_embedding_index: w changed" - assert tensor.shape[1] == queue.t, "_embedding_index: t changed" - assert tensor.shape[2] == (queue.bufq_grid_dim_r * queue.mblock_m * queue.ublock_rt * TILE_DIM), "_embedding_index: tensor dims mismatch q dims" - assert tensor.shape[3] == (queue.bufq_grid_dim_c * queue.mblock_n * queue.ublock_ct * TILE_DIM), "_embedding_index: tensor dims mismatch q dims" - return tensor - -def _reinterpret_shape(tensor: torch.Tensor, shape: List[int]): - breakpoint() - -def do_runtime_transform(transform, tensor, q): +# def _embedding_index(tensor: torch.Tensor, original_shape: Tuple[int, ...], queue: DramIODesc): +# assert queue.data_format in 
[DataFormat.RawUInt8, DataFormat.RawUInt16, DataFormat.RawUInt32] +# assert len(tensor.shape) <= 2, "Must be a 1d tensor" +# assert len(original_shape) <= 1 or original_shape[-2] == 1, "Must be a 1d tensor" +# assert len(original_shape) <= 2 or original_shape[-3] == 1, "Must be a 1d tensor" + +# q_rt = queue.bufq_grid_dim_r * queue.mblock_m * queue.ublock_rt +# w = tensor.shape[0] if len(tensor.shape) > 1 else 1 +# pad = align_up(tensor.shape[-1], TILE_DIM) - tensor.shape[-1] +# tensor = torch.nn.functional.pad(tensor, (0, pad)) +# tensor = tensor.reshape(w, 1, 1, tensor.shape[-1]) +# tensor[:, :, :, original_shape[-1]:] = ~torch.tensor(0, dtype=tensor.dtype) +# tensor = tensor.view(w, q_rt, -1, TILE_DIM) +# pad = align_up(tensor.shape[-2], TILE_DIM) - tensor.shape[-2] +# tensor = torch.nn.functional.pad(tensor, (0, 0, 0, pad)) +# tensor = tensor.view(w, q_rt, -1, TILE_DIM, TILE_DIM) +# tensor = tensor.transpose(2, 3).view(w, 1, q_rt * TILE_DIM, -1) + +# assert len(tensor.shape) == 4, "_embedding_index: rank changed" +# assert tensor.shape[0] == w, "_embedding_index: w changed" +# assert tensor.shape[1] == queue.t, "_embedding_index: t changed" +# assert tensor.shape[2] == (queue.bufq_grid_dim_r * queue.mblock_m * queue.ublock_rt * TILE_DIM), "_embedding_index: tensor dims mismatch q dims" +# assert tensor.shape[3] == (queue.bufq_grid_dim_c * queue.mblock_n * queue.ublock_ct * TILE_DIM), "_embedding_index: tensor dims mismatch q dims" +# return tensor + +# def _reinterpret_shape(tensor: torch.Tensor, shape: List[int], queue: DramIODesc, tile_bcast_dims: List[int]): +# tensor = tensor.contiguous().view(shape) +# tile_r = queue.tile_height +# tile_c = queue.tile_width +# microbatch = queue.input_count +# tensor = pad_pytorch_tensor_to_buda(tensor, tile_bcast_dims, squeeze=True, microbatch=microbatch, tile_r=tile_r, tile_c=tile_c) +# return tensor, queue + +# def _prestride_shape(tensor: torch.Tensor, stride_height: int, stride_width: int, queue: DramIODesc): +# assert stride_height == stride_width, "Backend supports only square strides for prestriding transform" +# stride = stride_height +# stride_desc = StrideDescriptor() +# stride_desc.stride = stride +# stride_desc.xy_offsets = [(x, y) for y in range(stride) for x in range(stride)] +# queue.s_descriptor = stride_desc +# return tensor, queue + +def do_runtime_transform(transform, tensor, q, tile_bcast_dims): + assert False if transform.type == RuntimeTensorTransformType.EmbeddingIndex: - return _embedding_index(tensor, transform.original_shape, q) + return _embedding_index(tensor, transform.original_shape, q), q elif transform.type == RuntimeTensorTransformType.ReinterpretShape: - return _reinterpret_shape(tensor, transform.reinterpreted_shape.as_list()) + return _reinterpret_shape(tensor, transform.reinterpreted_shape.as_list(), q, tile_bcast_dims) elif transform.type == RuntimeTensorTransformType.NoTransform: - return tensor + return tensor, q + elif transform.type == RuntimeTensorTransformType.Prestride: + return _prestride_shape(tensor, transform.stride_height, transform.stride_width, q) else: - assert False, "Unsupported runtime transform type" - + assert False, f"Unsupported runtime transform type: {transform.type}" -def eval_runtime_transform(transform, inp, q): +def eval_runtime_transform(transform, inp, q, tile_bcast_dims): if isinstance(transform, str): transform = json.loads(transform) transform = RuntimeTensorTransform.from_json(transform) logger.info(f"Aplying runtime transform {transform}") - return 
do_runtime_transform(transform, inp, q) \ No newline at end of file + return do_runtime_transform(transform, inp, q, tile_bcast_dims) diff --git a/pybuda/pybuda/tools/net2reportify.py b/pybuda/pybuda/tools/net2reportify.py index 8d8a5a607..af021f7e4 100755 --- a/pybuda/pybuda/tools/net2reportify.py +++ b/pybuda/pybuda/tools/net2reportify.py @@ -135,8 +135,6 @@ def net2placement( if device_yaml is None: if netlist["devices"]["arch"] == "grayskull": device_yaml = "third_party/budabackend/device/grayskull_120_arch.yaml" - elif netlist["devices"]["arch"] == "wormhole": - device_yaml = "third_party/budabackend/device/wormhole_80_arch.yaml" elif netlist["devices"]["arch"] == "wormhole_b0": device_yaml = "third_party/budabackend/device/wormhole_b0_80_arch.yaml" else: diff --git a/pybuda/pybuda/tools/perf_analysis.py b/pybuda/pybuda/tools/perf_analysis.py index 59e9129bf..e0a79a8b5 100755 --- a/pybuda/pybuda/tools/perf_analysis.py +++ b/pybuda/pybuda/tools/perf_analysis.py @@ -17,11 +17,93 @@ from collections import defaultdict from loguru import logger + +__MAX_NUM_INPUTS = 8 +__DEFAULT_SCREEN_DUMP_DIRECTORY = "perf_analysis_screens" + +def get_epoch_details_mapping(data, config): + """ + Return table header mapping for epoch details screen + """ + epoch_details_mapping = { + 'name': 'op_name' if config['full_op_names'] else 'op_name_short', + 'grid': 'grid_size_str', + 'mb': 'mblock_str', + 'ub': 'ublock_str', + 't' : 't_value', + 'm/u_kt': 'm_k/u_kt', + 'est': 'estimated_cycles', + 'kernel': 'kernel_single_runtime', + 'util': 'kernel_math_utilization', + 'est_lim': 'estimated_lim_cycles', + 'bw_kernel': 'bw_bound_single_runtime', + 'bw_util': 'bw_bound_math_utilization', + 'bw problem': 'bw_problem', + 'out_req': 'required_output_pipe_bw_0', + 'out_est': 'estimated_output_bw_0', + 'out_bw': 'output_pipe_bw_0', + 'in0_req': 'required_input_bw_0', + 'in0_est': 'estimated_input_bw_0', + 'in0_bw': 'input_pipe_bw_0', + 'in1_req': 'required_input_bw_1', + 'in1_est': 'estimated_input_bw_1', + 'in1_bw': 'input_pipe_bw_1', + 'in2_req': 'required_input_bw_2', + 'in2_est': 'estimated_input_bw_2', + 'in2_bw': 'input_pipe_bw_2', + 'in3_req': 'required_input_bw_3', + 'in3_est': 'estimated_input_bw_3', + 'in3_bw': 'input_pipe_bw_3', + 'in4_req': 'required_input_bw_4', + 'in4_est': 'estimated_input_bw_4', + 'in4_bw': 'input_pipe_bw_4', + 'in5_req': 'required_input_bw_5', + 'in5_est': 'estimated_input_bw_5', + 'in5_bw': 'input_pipe_bw_5', + 'in6_req': 'required_input_bw_6', + 'in6_est': 'estimated_input_bw_6', + 'in6_bw': 'input_pipe_bw_6', + 'in7_req': 'required_input_bw_7', + 'in7_est': 'estimated_input_bw_7', + 'in7_bw': 'input_pipe_bw_7', + } + + if all(d["balancer_util"] == 0 for d in data['epoch_summary']): # we had no balancer util numbers loaded + del epoch_details_mapping["est"] + del epoch_details_mapping["est_lim"] + for i in range(__MAX_NUM_INPUTS): + del epoch_details_mapping[f"in{i}_est"] + del epoch_details_mapping["out_est"] + + return epoch_details_mapping + +def get_epoch_summary_mapping(data, config): + """ + Return table header mapping for epoch summary screen + """ + epoch_summary_mapping = { + 'epoch': 'epoch', + 'slowest op': 'slowest_op' if config['full_op_names'] else 'slowest_op_short', + 'cycles': 'pipeline_cycles', + 'speed': 'inputs_per_second', + 'util': 'real_utilization', + 'mm cores': 'matmul_cores', + 'balancer util': 'balancer_util', + 'req noc bw GB/s': 'required_noc_bw', + 'act noc bw GB/s': 'actual_noc_bw', + 'req dram bw GB/s': 'required_dram_bw', + 'act dram bw GB/s': 
'actual_dram_bw', + } + if all(d["balancer_util"] == 0 for d in data['epoch_summary']): # we had no balancer util numbers loaded + del epoch_summary_mapping["balancer util"] + + return epoch_summary_mapping + def arch_clk(arch): """ Return clock speed for an arch TODO: get this from somewhere? """ - if arch == "wormhole_b0" or arch == "wormhole": + if arch == "wormhole_b0": return 10**9 if arch == "grayskull": return 1.2 * 10**9 @@ -190,7 +272,7 @@ def load_netlist(config): # Figure out the number of cores. This is wonky, we need a more reliable source... for now, assume 1 row harvested, and then upsize if # netlists uses more rows or columns - if config["arch"] == "wormhole_b0" or config["arch"] == "wormhole": + if config["arch"] == "wormhole_b0": grid = [9, 8] elif config["arch"] == "grayskull": grid = [11, 10] @@ -242,6 +324,31 @@ def load_estimated_cycles(): return data_table +def load_balancer_score(): + """ + Load balancer score for every epoch and for total solution. + """ + file_path = "balancer_score.csv" + + # Check if file exists + if not os.path.exists(file_path): + print(f"{file_path} does not exist. Run with PYBUDA_OP_PERF=1 to generate it, if running with pybuda. Loading will continue without it.") + return {} + + print(f"Loading {file_path}...") + data_table = {} + try: + with open(file_path, 'r') as file: + reader = csv.DictReader(file) + for row in reader: + row[' score'] = float(row[' score']) + data_table[row["epoch"]] = row + except Exception as e: + print(f"An error occurred while reading the file: {str(e)}.") + sys.exit(6) + + return data_table + def verify_data(netlist_data, perf_data, estimated_data): """ Verify that the data is consistent between the 3 sources. @@ -258,6 +365,12 @@ def verify_data(netlist_data, perf_data, estimated_data): print("Verified!") +def round_parse_float(float_str: str, default_value: float = 0.0, decimal_places: int = 2) -> float: + """ + Try to parse given string to a float and round it to the given number of decimal places. 
+ """ + return round(try_parse_float(float_str, default_value), decimal_places) + def merge_data(netlist_data, perf_data, estimated_data, config): """ Merge data into one, per-epoch, table of ops with all data @@ -270,14 +383,27 @@ def merge_data(netlist_data, perf_data, estimated_data, config): data_table[op['op_name']] = op for k, d in netlist_data[epoch_idx][op['op_name']].items(): data_table[op['op_name']][k] = d + + has_input_on_idx = lambda idx: f'input_pipe_bw_{idx}' in data_table[op['op_name']] and try_parse_float(data_table[op['op_name']][f'input_pipe_bw_{idx}']) + if len(estimated_data) > 0: data_table[op['op_name']]['estimated_cycles'] = estimated_data[op['op_name']][' cycles'] data_table[op['op_name']]['estimated_lim_cycles'] = estimated_data[op['op_name']][' limiter_cycles'] data_table[op['op_name']]['tiles'] = estimated_data[op['op_name']][' tiles'] + for i in range(__MAX_NUM_INPUTS): + if has_input_on_idx(i): + data_table[op['op_name']][f'estimated_input_bw_{i}'] = round_parse_float( + estimated_data[op['op_name']][f' estimated_input_bw_{i}']) + data_table[op['op_name']]['estimated_output_bw_0'] = round_parse_float( + estimated_data[op['op_name']][' estimated_output_bw_0']) else: data_table[op['op_name']]['estimated_cycles'] = 0 data_table[op['op_name']]['estimated_lim_cycles'] = 0 data_table[op['op_name']]['tiles'] = 0 + for i in range(__MAX_NUM_INPUTS): + if has_input_on_idx(i): + data_table[op['op_name']][f'estimated_input_bw_{i}'] = 0 + data_table[op['op_name']]['estimated_output_bw_0'] = 0 merged_data.append(data_table) @@ -295,7 +421,7 @@ def merge_data(netlist_data, perf_data, estimated_data, config): return merged_data -def summarize_epoch(epoch, epoch_data, config): +def summarize_epoch(epoch, epoch_data, balancer_score_data, config): """ Summarize data for ops in one epoch to get a summary - epoch speed, utilization, slowest op, etc. """ @@ -362,6 +488,12 @@ def summarize_epoch(epoch, epoch_data, config): required_dram_bw = int(10 * as_gb_sec(clock_speed, required_dram_bw)) / 10 # 1 decimal place actual_noc_bw = int(10 * as_gb_sec(clock_speed, actual_noc_bw)) / 10 # 1 decimal place actual_dram_bw = int(10 * as_gb_sec(clock_speed, actual_dram_bw)) / 10 # 1 decimal place + + if str(epoch) in balancer_score_data: + balancer_epoch_score = balancer_score_data[str(epoch)][' score'] + else: + balancer_epoch_score = 0.0 + summary["real_utilization"] = util summary["balancer_util"] = balancer_util summary["matmul_cores"] = matmul_cores @@ -373,9 +505,10 @@ def summarize_epoch(epoch, epoch_data, config): summary["sum_estimated_lim_err"] = sum_estimated_lim_err summary["sum_kernel"] = sum_kernel summary["sum_kernel_bw"] = sum_kernel_bw + summary["balancer_epoch_score"] = balancer_epoch_score return summary -def summarize_model(epoch_summary_data, config): +def summarize_model(epoch_summary_data, balancer_score_data, config): """ Calculate overall model summary - total speed, utilization, etc. 
""" @@ -395,19 +528,25 @@ def summarize_model(epoch_summary_data, config): kernel_estimation_error = int((sum_estimated_kernel_err / float(sum_kernel)) * 100) kernel_bw_estimation_error = int((sum_estimated_lim_err / float(sum_kernel_bw)) * 100) + if "total" in balancer_score_data: + balancer_solution_score = balancer_score_data["total"][' score'] + else: + balancer_solution_score = 0.0 + return { "overall_speed": overall_speed, "overall_util": overall_util, + "balancer_solution_score": balancer_solution_score, "kernel_estimation_error": kernel_estimation_error, "kernel_bw_estimation_error": kernel_bw_estimation_error } -def summarize_data(data, config): +def summarize_data(data, balancer_score_data, config): """ Summarize data for ops in epochs to get a per-epoch summary - epoch speed, utilization, slowest op, etc. """ - epoch_summary_data = [summarize_epoch(i, d, config) for i, d in enumerate(data)] - model_summary_data = summarize_model(epoch_summary_data, config) + epoch_summary_data = [summarize_epoch(i, d, balancer_score_data, config) for i, d in enumerate(data)] + model_summary_data = summarize_model(epoch_summary_data, balancer_score_data, config) return epoch_summary_data, model_summary_data def process_epoch_data(epoch_data, config): @@ -478,11 +617,12 @@ def load_data(config): epoch_count = len(netlist_data) perf_data = load_perf_analysis(epoch_count, config) estimated_data = load_estimated_cycles() + balancer_score_data = load_balancer_score() verify_data(netlist_data, perf_data, estimated_data) epoch_data = merge_data(netlist_data, perf_data, estimated_data, config) epoch_data = process_epoch_data(epoch_data, config) - epoch_summary_data, model_summary_data = summarize_data(epoch_data, config) + epoch_summary_data, model_summary_data = summarize_data(epoch_data, balancer_score_data, config) model_summary_data['netlist'] = config["netlist"] data = { @@ -598,7 +738,7 @@ def bad_u_kt(x): 'est': lambda x, d: curses.color_pair(1) if abs(x-d['kernel']) >= 0.5 * d['kernel'] else curses.color_pair(3) if abs(x-d['kernel']) >= 0.2 * d['kernel'] else curses.color_pair(0), 'est_lim': lambda x, d: curses.color_pair(1) if abs(x-d['bw_kernel']) >= 0.5 * d['bw_kernel'] else curses.color_pair(3) if abs(x-d['bw_kernel']) >= 0.2 * d['bw_kernel'] else curses.color_pair(0) } -for i in range(8): +for i in range(__MAX_NUM_INPUTS): highlight_funcs[f"in{i}_bw"] = (lambda x, d, i=i: curses.color_pair(1) if f"in{i}_req" in d and try_parse_float(x) < try_parse_float(d[f"in{i}_req"]) else curses.color_pair(0)) def draw_epoch_summary(win, epoch, epoch_summary_data): @@ -609,14 +749,18 @@ def draw_epoch_summary(win, epoch, epoch_summary_data): win.addstr(f"{data['inputs_per_second']}/s", curses.A_BOLD) win.addstr(", Utilization: ") win.addstr(f"{data['real_utilization']}%", curses.A_BOLD) - win.addstr(", Balancer Utilization: ") + win.addstr(", Balancer utilization: ") win.addstr(f"{data['balancer_util']}%", curses.A_BOLD) + win.addstr(", Balancer score: ") + win.addstr(f"{data['balancer_epoch_score']}", curses.A_BOLD) def draw_model_summary(win, data): win.addstr(0, 0, "Netlist: ") win.addstr(data['netlist'], curses.A_BOLD) win.addstr(" Approximate performance: ") win.addstr(f"{data['overall_speed']}/s", curses.A_BOLD) + win.addstr(" Balancer score: ") + win.addstr(f"{data['balancer_solution_score']}", curses.A_BOLD) win.addstr(" Approximate utilization: ") win.addstr(f"{data['overall_util']}%", curses.A_BOLD) win.addstr(" Kernel estimate error: ") @@ -702,7 +846,21 @@ def draw_help(win, config): 
status_prompt = "[E] epoch [P] previous [N] next [S] summary [F] op names [R] reload [H] help [Q] quit [ARROWS] scroll" epoch_prompt = " Epoch #: " -def display_screen(table_data, mapping, stdscr, config): +def serialize_screen(stdscr, output_file_name): + """ + Serializes curses lib console screen to output txt file + """ + max_y, max_x = stdscr.getmaxyx() + with open(output_file_name, "w") as output_file: + for y in range(max_y): + for x in range(max_x): + # more details here: https://stackoverflow.com/a/43584573 + decoded_char = stdscr.inch(y, x) & 0xFF + output_file.write(chr(decoded_char)) + output_file.write("\n") + + +def display_screen(table_data, mapping, stdscr, config, save_screen=False, base_dir=None): """ Main drawing function, repeatedly called on every key press """ @@ -744,7 +902,62 @@ def display_screen(table_data, mapping, stdscr, config): max_y, max_x = stdscr.getmaxyx() stdscr.addnstr(max_y - 1, 0, key_map, max_x-1) -def main(stdscr, data): + if save_screen: + assert base_dir is not None, "Expecting base_dir argument to be passed in" + + screen_name = config.get('screen_name', 'screen_output') + output_file_name = f"{screen_name}.txt" + file_dir = os.path.join(base_dir, config.get("netlist", "netlist.yaml").strip(".yaml")) + os.makedirs(file_dir, exist_ok=True) + + serialize_screen(stdscr, os.path.join(file_dir, output_file_name)) + + +def get_screen_description(data, config): + """ + Return data required for drawing the screen described by the config and data maps + """ + # Figure out which data to display + if config['epoch'] is not None: + # Epoch data + table_data = data['epochs'][config['epoch']].values() + + # Order and names of columns to show on the epoch screen + mapping = get_epoch_details_mapping(data, config) + max_rows = len(data['epochs'][config['epoch']]) - 2 + else: + # Summary columns + mapping = get_epoch_summary_mapping(data, config) + + table_data = data['epoch_summary'] + max_rows = len(data['epoch_summary']) - 2 + + max_columns = len(mapping) - 2 + + return mapping, table_data, max_rows, max_columns + + +def save_epoch_screens_to_files(stdscr, data, config, epoch_sreens_save_dir): + """ + Save epoch summary and details screens to txt files in a given folder + """ + num_epochs = len(data['epochs']) + + # Save epochs summary screen + config["epoch"] = None + config["screen_name"] = "epoch_summary" + mapping, table_data, _, _ = get_screen_description(data, config) + display_screen(table_data, mapping, stdscr, config, save_screen=True, base_dir=epoch_sreens_save_dir) + + # Save details screens per epoch + for epoch in range(num_epochs): + config["epoch"] = epoch + config["screen_name"] = f"epoch_{epoch}_details" + mapping, table_data, _, _ = get_screen_description(data, config) + display_screen(table_data, mapping, stdscr, config, save_screen=True, base_dir=epoch_sreens_save_dir) + + +def main(stdscr, data, save_epoch_screens=False, epoch_sreens_save_dir=None): # Curses config curses.init_pair(1, curses.COLOR_RED, curses.COLOR_BLACK) @@ -758,78 +971,18 @@ def main(stdscr, data): 'full_op_names': False, # show full op names (vs. 
shortened) 'row_offset': 0, # row scrolling offset in tables 'col_offset': 0, # column scrolling offset in tables - 'epoch': None, # current epoch, or None if we're on the summary screen + 'epoch': None, # current epoch, or None if we're on the summary screen, + "netlist": data['model_summary']['netlist'] # netlist for which perf data is displayed } - while key != ord('q') and key != ord('Q'): - - # Figure out which data to display - if config['epoch'] is not None: - # Epoch data - table_data = data['epochs'][config['epoch']].values() - - # Order and names of columns to show on the epoch screen - mapping = { - 'name': 'op_name' if config['full_op_names'] else 'op_name_short', - 'grid': 'grid_size_str', - 'mb': 'mblock_str', - 'ub': 'ublock_str', - 't' : 't_value', - 'm/u_kt': 'm_k/u_kt', - 'est': 'estimated_cycles', - 'kernel': 'kernel_single_runtime', - 'util': 'kernel_math_utilization', - 'est_lim': 'estimated_lim_cycles', - 'bw_kernel': 'bw_bound_single_runtime', - 'bw_util': 'bw_bound_math_utilization', - 'bw problem': 'bw_problem', - 'out_req': 'required_output_pipe_bw_0', - 'out_bw': 'output_pipe_bw_0', - 'in0_req': 'required_input_bw_0', - 'in0_bw': 'input_pipe_bw_0', - 'in1_req': 'required_input_bw_1', - 'in1_bw': 'input_pipe_bw_1', - 'in2_req': 'required_input_bw_2', - 'in2_bw': 'input_pipe_bw_2', - 'in3_req': 'required_input_bw_3', - 'in3_bw': 'input_pipe_bw_3', - 'in4_req': 'required_input_bw_4', - 'in4_bw': 'input_pipe_bw_4', - 'in5_req': 'required_input_bw_5', - 'in5_bw': 'input_pipe_bw_5', - 'in6_req': 'required_input_bw_6', - 'in6_bw': 'input_pipe_bw_6', - 'in7_req': 'required_input_bw_7', - 'in7_bw': 'input_pipe_bw_7', - } - max_rows = len(data['epochs'][config['epoch']]) - 2 - if all(d["balancer_util"] == 0 for d in data['epoch_summary']): # we had no balancer util numbers loaded - del mapping["est"] - del mapping["est_lim"] - else: - # Summary columns - mapping = { - 'epoch': 'epoch', - 'slowest op': 'slowest_op' if config['full_op_names'] else 'slowest_op_short', - 'cycles': 'pipeline_cycles', - 'speed': 'inputs_per_second', - 'util': 'real_utilization', - 'mm cores': 'matmul_cores', - 'balancer util': 'balancer_util', - 'req noc bw GB/s': 'required_noc_bw', - 'act noc bw GB/s': 'actual_noc_bw', - 'req dram bw GB/s': 'required_dram_bw', - 'act dram bw GB/s': 'actual_dram_bw', - } - if all(d["balancer_util"] == 0 for d in data['epoch_summary']): # we had no balancer util numbers loaded - del mapping["balancer util"] - - table_data = data['epoch_summary'] - max_rows = len(data['epoch_summary']) - 2 - - max_columns = len(mapping) - 2 + if save_epoch_screens: + save_epoch_screens_to_files(stdscr, data, config, epoch_sreens_save_dir) + return False + + while key != ord('q') and key != ord('Q'): + mapping, table_data, max_rows, max_columns = get_screen_description(data, config) - display_screen(table_data, mapping, stdscr, config) + display_screen(table_data, mapping, stdscr, config, save_screen=False) key = stdscr.getch() if key == ord('R') or key == ord('r'): @@ -860,7 +1013,7 @@ def main(stdscr, data): elif key == ord('E') or key == ord('e'): config['prompt_epoch'] = True - display_screen(table_data, mapping, stdscr, config) + display_screen(table_data, mapping, stdscr, config, save_screen=False) curses.echo() # Enable echo to display input epoch_num = stdscr.getstr(curses.LINES - 1, len(status_prompt + epoch_prompt)) curses.noecho() # Disable echo @@ -906,6 +1059,7 @@ def main(stdscr, data): parser.add_argument( '--save', help='Save collected data into provided file') 
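# For reference, a sketch of the on-disk layout the screen-dump path above is expected to produce
# (the netlist name and epoch count are illustrative; the top-level folder comes from
# __DEFAULT_SCREEN_DUMP_DIRECTORY unless another directory is passed to the screen-dump option below):
#
#   perf_analysis_screens/
#       my_netlist/                  # config["netlist"] with the ".yaml" suffix stripped
#           epoch_summary.txt        # summary screen, one character grid per file
#           epoch_0_details.txt      # per-epoch details screens
#           epoch_1_details.txt
#
# Each file is the raw curses screen written row by row by serialize_screen(), so two runs can be
# compared offline with an ordinary text diff.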
parser.add_argument( '--load', help='Load data from a previously saved file, instead of from current workspace') parser.add_argument('-c', '--cache', help='Cache performance results in a file, to aid future compiles') + parser.add_argument( '--save_epoch_screens', help='Save epoch summary and detail screen states to txt files in the given folder for easier offline comparison', nargs='?', const=__DEFAULT_SCREEN_DUMP_DIRECTORY) args = parser.parse_args() logger.add("perf_analysis_debug.log") @@ -952,6 +1106,9 @@ def main(stdscr, data): if ui: print("Done loading data. Let's analyze!") + should_save_epoch_screens = args.save_epoch_screens is not None + if should_save_epoch_screens: + print(f"Saving epoch screens to folder: '{args.save_epoch_screens}'") reload = True while reload: - reload = curses.wrapper(main, data) + reload = curses.wrapper(main, data, should_save_epoch_screens, args.save_epoch_screens) diff --git a/pybuda/pybuda/tools/run_net2pipe.py b/pybuda/pybuda/tools/run_net2pipe.py index f10fe54f7..8d11421d3 100755 --- a/pybuda/pybuda/tools/run_net2pipe.py +++ b/pybuda/pybuda/tools/run_net2pipe.py @@ -55,7 +55,7 @@ def generate_blobgen_cmd( physical_grid_size_y = int(device_descriptor_yaml["physical"]["y_size"]) arch_name = str(device_descriptor_yaml["arch_name"]).lower() overlay_version = int(device_descriptor_yaml["features"]["overlay"]["version"]) - tensix_memsize = 1499136 if "wormhole" in arch_name else 1024 * 1024 + tensix_memsize = 1499136 if "wormhole_b0" == arch_name else 1024 * 1024 noc_translation_id_enabled = False if "noc" in device_descriptor_yaml["features"] and "translation_id_enabled" in device_descriptor_yaml["features"]["noc"]: noc_translation_id_enabled = bool(device_descriptor_yaml["features"]["noc"]["translation_id_enabled"]) @@ -80,7 +80,7 @@ def generate_blobgen_cmd( eth_cores = device_descriptor_yaml["eth"] if len(eth_cores) > 0: l1_overlay_blob_base = 128 + 140 * 1024 - if "wormhole" in arch_name: + if "wormhole_b0" == arch_name: eth_max_memsize = 256 * 1024 eth_overlay_blob_base = 0x9000 + 92 * 1024 + 128 eth_data_buffer_space_base = 0x9000 + 124 * 1024 @@ -247,6 +247,7 @@ def net2pipe( ): net2pipe_output_dir = "net2pipe_output" stdout = f"{net2pipe_output_dir}/net2pipe.stdout" + stderr = f"{net2pipe_output_dir}/net2pipe.stderr" subprocess.run(["rm", "-rf", net2pipe_output_dir]) subprocess.run(["mkdir", "-p", net2pipe_output_dir]) if "BUDA_HOME" not in os.environ: @@ -270,10 +271,13 @@ def net2pipe( try: with open(stdout, "wb") as fd: fd.write(p.stdout) + with open(stderr, "wb") as fd: + fd.write(p.stderr) except: pass if verbose: sys.stdout.buffer.write(p.stdout) + sys.stderr.buffer.write(p.stderr) if p.returncode != 0: pytest = "" repro = " ".join(cmd) @@ -288,6 +292,9 @@ def net2pipe( found |= "ERROR" in l if found and l != "": error_message += l + for l in p.stderr.decode("utf-8").split("\n"): + if l != "": + error_message += l if p.returncode == 0 and stats: net2pipe_stats(net2pipe_output_dir) diff --git a/pybuda/pybuda/tools/tti_merge.py b/pybuda/pybuda/tools/tti_merge.py index d240d08e0..178792e46 100644 --- a/pybuda/pybuda/tools/tti_merge.py +++ b/pybuda/pybuda/tools/tti_merge.py @@ -17,131 +17,152 @@ # Track all temp directories used for intermediate steps # Delete them as part of cleanup temp_directories = [] -def str_to_format(format_str): - if(format_str == "Float32"): - return DataFormat.Float32 - elif(format_str == "Float16"): - return DataFormat.Float16 - elif(format_str == "Float16_b"): - return DataFormat.Float16_b - 
elif(format_str == "Bfp8"): - return DataFormat.Bfp8 - elif(format_str == "Bfp8_b"): - return DataFormat.Bfp8_b - elif(format_str == "Bfp4"): - return DataFormat.Bfp4 - elif(format_str == "Bfp4_b"): - return DataFormat.Bfp4_b - elif(format_str == "Bfp2"): - return DataFormat.Bfp2 - elif(format_str == "Bfp2_b"): - return DataFormat.Bfp2_b - elif(format_str == "RawUInt32"): - return DataFormat.RawUInt32 - elif(format_str == "RawUInt16"): - return DataFormat.RawUInt16 - elif(format_str == "RawUInt8"): - return DataFormat.RawUInt8 - else: - assert False, "Invalid Format" -def uniquify_global_structures(model_paths): +def uniquify_and_merge_netlists(model_paths, inter_model_connections, consumer_to_producers_map, overlay_size_per_model): temp_dir = tempfile.mkdtemp() temp_directories.append(temp_dir) - uniquified_netlist_paths = [] - for i in range(0, len(model_paths)): - unique_global_struct_names = {} - model_path = model_paths[i] - - with open(model_path, 'r') as file: - file_content = file.read() - - with open(model_path, 'r') as file: - netlist_data = yaml.load(file, Loader = yaml.FullLoader) - for queue in netlist_data["queues"].keys(): - unique_global_struct_names[queue] = "model_" + str(i) + "_" + queue - for graph in netlist_data["graphs"].keys(): - unique_global_struct_names[graph] = "model_" + str(i) + "_" + graph - for op in netlist_data["graphs"][graph]: - if(op == "target_device" or op == "input_count"): - continue - unique_global_struct_names[op] = "model_" + str(i) + "_" + op - - for program in netlist_data["programs"]: - program_name = list(program.keys())[0] - unique_global_struct_names[program_name] = "model_" + str(i) + "_" + program_name - if "fused_ops" in netlist_data: - for sched in netlist_data["fused_ops"].keys(): - for op in netlist_data["fused_ops"][sched]["schedules"][0]: - op_name = list(op.keys())[0] - unique_global_struct_names[op_name] = "model_" + str(i) + "_" + op_name - replacement_keys = list(unique_global_struct_names.keys()) - replacement_keys.reverse() - for unique_key in replacement_keys: - pattern = re.compile(r'\b' + re.escape(unique_key) + r'\b') - file_content = pattern.sub(unique_global_struct_names[unique_key], file_content) - - indexed_model_path = str(model_path).split(".yaml")[0] + "_" + str(i) + ".yaml" - base_filename = os.path.basename(indexed_model_path) - temp_file_path = os.path.join(temp_dir, base_filename) - - with open(temp_file_path, "w+") as file: - file.write(file_content) - uniquified_netlist_paths.append(temp_file_path) - return uniquified_netlist_paths - -def merge_unique_netlists(unique_netlist_paths, overlay_blob_size_per_model): + + consumer_inputs = list(inter_model_connections.keys()) + producer_outputs = list(inter_model_connections.values()) + producer_queue_shapes = {} + producer_data_format= {} + producer_tile_dims = {} + model_input_counts = {} + fused_op_counter = 0 merged_model = {"devices" : [], "queues" : {}, "graphs" : {}, "programs" : []} + # Track if the IO queues specified in the dependency list are actually present in the netlist + sub_graph_nodes_visited = {} + for queue in consumer_inputs + producer_outputs: + sub_graph_nodes_visited[queue] = False - fused_op_counter = 0 - for (i, netlist) in enumerate(unique_netlist_paths): + for i in range(0, len(model_paths)): + unique_queue_names = {} + unique_graph_names = {} + unique_op_names = {} fused_op_idx_updates = {} - with open(netlist, 'r') as file: + + model_path = model_paths[i] + with open(model_path, 'r') as file: netlist_data = yaml.load(file, Loader = 
yaml.FullLoader) - if(i == 0): - merged_model["devices"] = netlist_data["devices"] + merged_model["devices"] = netlist_data["devices"] for queue in netlist_data["queues"].keys(): - merged_model["queues"][queue] = netlist_data["queues"][queue] + updated_queue_name = "model_" + str(i) + "_" + queue + if updated_queue_name in sub_graph_nodes_visited: + # This queue was specified as a model-to-model queue in the dependency list + # Mark it as visited, since it was found + sub_graph_nodes_visited[updated_queue_name] = True + # Keep track of queue parameters for queues feeding downstream models + if updated_queue_name in producer_outputs: + num_tiles_y = netlist_data["queues"][queue]["ublock"][0] * netlist_data["queues"][queue]["mblock"][0] * netlist_data["queues"][queue]["grid_size"][0] + num_tiles_x = netlist_data["queues"][queue]["ublock"][1] * netlist_data["queues"][queue]["mblock"][1] * netlist_data["queues"][queue]["grid_size"][1] + producer_queue_shapes[updated_queue_name] = [netlist_data["queues"][queue]["entries"], netlist_data["queues"][queue]["t"], num_tiles_y, num_tiles_x] + producer_data_format[updated_queue_name] = netlist_data["queues"][queue]["df"] + if "tile_dim" in netlist_data["queues"][queue]: + producer_tile_dims[updated_queue_name] = netlist_data["queues"][queue]["tile_dim"] + else: + producer_tile_dims[updated_queue_name] = [32, 32] + + if updated_queue_name in inter_model_connections: + # This queue is being tied to a queue from a previous model. + # Alias this queue with the feeder after ensuring that the producer and consumer are compatible. + # Since this queue is aliased with its producer queue (which has already been added to the merged netlist) don't add this queue again + num_tiles_y = netlist_data["queues"][queue]["ublock"][0] * netlist_data["queues"][queue]["mblock"][0] * netlist_data["queues"][queue]["grid_size"][0] + num_tiles_x = netlist_data["queues"][queue]["ublock"][1] * netlist_data["queues"][queue]["mblock"][1] * netlist_data["queues"][queue]["grid_size"][1] + consumer_queue_shape = [netlist_data["queues"][queue]["entries"], netlist_data["queues"][queue]["t"], num_tiles_y, num_tiles_x] + tile_dim = [32, 32] + if "tile_dim" in netlist_data["queues"][queue]: + tile_dim = netlist_data["queues"][queue]["tile_dim"] + assert consumer_queue_shape == producer_queue_shapes[inter_model_connections[updated_queue_name]], "Consumer " + queue + " shape is incompatible with the producer." + assert netlist_data["queues"][queue]["df"] == producer_data_format[inter_model_connections[updated_queue_name]], "Consumer " + queue + " data format is incompatible with the producer." + assert tile_dim == producer_tile_dims[inter_model_connections[updated_queue_name]], "Consumer " + queue + " tile dimensions are incompatible with the producer." + updated_queue_name = inter_model_connections[updated_queue_name] + else: + # This queue is not tied to a queue from a previous model. Add an unaliased version of it to the merged netlist. + input_name = netlist_data["queues"][queue]["input"] + # Queues can only be fed by ops in the same model or by host. + updated_input_name = "HOST" if (input_name.lower() == "host") else "model_" + str(i) + "_" + input_name + merged_model["queues"][updated_queue_name] = netlist_data["queues"][queue] + merged_model["queues"][updated_queue_name]["input"] = updated_input_name + # Track the updated queue name, for modifying graph and program structures. 
+ unique_queue_names[queue] = updated_queue_name + + for graph in netlist_data["graphs"].keys(): + updated_graph_name = "model_" + str(i) + "_" + graph + unique_graph_names[graph] = updated_graph_name + merged_model["graphs"][updated_graph_name] = {} for op in netlist_data["graphs"][graph]: if(op == "target_device" or op == "input_count"): - continue - if netlist_data["graphs"][graph][op]["type"] == "fused_op": - local_id = netlist_data["graphs"][graph][op]["attributes"]["fused_op_id"] - if not local_id in fused_op_idx_updates: - fused_op_idx_updates[local_id] = fused_op_counter - fused_op_counter = fused_op_counter + 1 - netlist_data["graphs"][graph][op]["attributes"]["fused_op_id"] = fused_op_idx_updates[local_id] - - if "attributes" in netlist_data["graphs"][graph][op]: - if "kernel_broadcast" in netlist_data["graphs"][graph][op]["attributes"]: - updated_kernel_bcast = {} - for input in netlist_data["graphs"][graph][op]["attributes"]["kernel_broadcast"]: - updated_kernel_bcast[input.replace("model_" + str(i) + "_", "")] = netlist_data["graphs"][graph][op]["attributes"]["kernel_broadcast"][input] - netlist_data["graphs"][graph][op]["attributes"]["kernel_broadcast"] = updated_kernel_bcast - if i in overlay_blob_size_per_model: - netlist_data["graphs"][graph][op]["overlay_size"] = int(overlay_blob_size_per_model[i]) - - merged_model["graphs"][graph] = netlist_data["graphs"][graph] - for program in netlist_data["programs"]: + if (op == "input_count"): + model_input_counts["model_" + str(i)] = netlist_data["graphs"][graph][op] + if "model_" + str(i) in consumer_to_producers_map: + # Model has producers + for producer in consumer_to_producers_map["model_" + str(i)]: + assert netlist_data["graphs"][graph][op] == model_input_counts[producer], "The microbatch sizes across producers and consumers are not consistent." + + merged_model["graphs"][updated_graph_name][op] = netlist_data["graphs"][graph][op] + + else: + for input_idx in range(len(netlist_data["graphs"][graph][op]["inputs"])): + if netlist_data["graphs"][graph][op]["inputs"][input_idx] in unique_queue_names: + netlist_data["graphs"][graph][op]["inputs"][input_idx] = unique_queue_names[netlist_data["graphs"][graph][op]["inputs"][input_idx]] + elif netlist_data["graphs"][graph][op]["inputs"][input_idx] in unique_op_names: + netlist_data["graphs"][graph][op]["inputs"][input_idx] = unique_op_names[netlist_data["graphs"][graph][op]["inputs"][input_idx]] + else: + assert False, "Input to op " + op + " is not another op or a queue." 
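# A minimal sketch of the renaming scheme this merge relies on (the names below are illustrative,
# not taken from a real netlist): every queue, graph, op and program coming from the i-th model is
# prefixed with "model_<i>_", and consumer-side references are rewritten through the
# unique_queue_names / unique_op_names lookup tables maintained by this loop, e.g. for i == 0:
#
#   unique_queue_names == {"act_in": "model_0_act_in"}
#   unique_op_names    == {"matmul_5": "model_0_matmul_5"}
#   # an op whose netlist inputs were ["act_in", "matmul_5"] is stored in the merged netlist
#   # under "model_0_<op_name>" with inputs ["model_0_act_in", "model_0_matmul_5"]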
+ + if i in overlay_size_per_model: + netlist_data["graphs"][graph][op]["overlay_size"] = int(overlay_size_per_model[i]) + + if netlist_data["graphs"][graph][op]["type"] == "fused_op": + local_id = netlist_data["graphs"][graph][op]["attributes"]["fused_op_id"] + if not local_id in fused_op_idx_updates: + fused_op_idx_updates[local_id] = fused_op_counter + fused_op_counter = fused_op_counter + 1 + netlist_data["graphs"][graph][op]["attributes"]["fused_op_id"] = fused_op_idx_updates[local_id] + updated_op_name = "model_" + str(i) + "_" + op + unique_op_names[op] = updated_op_name + merged_model["graphs"][updated_graph_name][updated_op_name] = netlist_data["graphs"][graph][op] + + for prog_idx, program in enumerate(netlist_data["programs"]): program_name = list(program.keys())[0] - program_dict = {program_name : program[program_name]} - merged_model["programs"].append(program_dict) + updated_program_name = "model_" + str(i) + "_" + program_name + for instrn_idx, instrn in enumerate(netlist_data["programs"][prog_idx][program_name]): + if(type(instrn) == dict): + instrn_code = list(instrn.keys())[0] + if instrn_code == "execute": + netlist_data["programs"][prog_idx][program_name][instrn_idx][instrn_code]["graph_name"] = unique_graph_names[netlist_data["programs"][prog_idx][program_name][instrn_idx][instrn_code]["graph_name"]] + queue_settings = netlist_data["programs"][prog_idx][program_name][instrn_idx][instrn_code]["queue_settings"] + netlist_data["programs"][prog_idx][program_name][instrn_idx][instrn_code]["queue_settings"] = {} + for queue in queue_settings: + updated_queue = unique_queue_names[queue] + netlist_data["programs"][prog_idx][program_name][instrn_idx][instrn_code]["queue_settings"][updated_queue] = queue_settings[queue] + if instrn_code == "allocate_queue" or instrn_code == "deallocate_queue": + for queue_idx in range(len(netlist_data["programs"][prog_idx][program_name][instrn_idx][instrn_code])): + queue_name = netlist_data["programs"][prog_idx][program_name][instrn_idx][instrn_code][queue_idx] + netlist_data["programs"][prog_idx][program_name][instrn_idx][instrn_code][queue_idx] = unique_queue_names[queue_name] + + merged_model["programs"].append({updated_program_name : netlist_data["programs"][prog_idx][program_name]}) + if "fused_ops" in netlist_data: if not "fused_ops" in merged_model: merged_model["fused_ops"] = {} - for sched in netlist_data["fused_ops"].keys(): - merged_model["fused_ops"][fused_op_idx_updates[sched]] = netlist_data["fused_ops"][sched] - for op in merged_model["fused_ops"][fused_op_idx_updates[sched]]["schedules"][0]: - for op_idx in range(len(op[list(op.keys())[0]]["inputs"])): - input_name = op[list(op.keys())[0]]["inputs"][op_idx] - op[list(op.keys())[0]]["inputs"][op_idx] = input_name.replace("model_" + str(i) + "_", "") - - merged_model["fused_ops"][fused_op_idx_updates[sched]] - + for group in netlist_data["fused_ops"].keys(): + schedules = netlist_data["fused_ops"][group]["schedules"] + netlist_data["fused_ops"][group]["schedules"] = [] + for sched_idx in range(len(schedules)): + netlist_data["fused_ops"][group]["schedules"].append([]) + for op_idx, op in enumerate(schedules[sched_idx]): + updated_op_name = "model_" + str(i) + "_" + list(op.keys())[0] + netlist_data["fused_ops"][group]["schedules"][sched_idx].append({updated_op_name : schedules[sched_idx][op_idx][list(op.keys())[0]]}) + merged_model["fused_ops"][fused_op_idx_updates[group]] = netlist_data["fused_ops"][group] + + # Assert if the queues specified in the dependency list are not found 
in the appropriate netlists + for queue in sub_graph_nodes_visited: + assert sub_graph_nodes_visited[queue], "Queue " + queue + " was specified in the dependency list but was not found in any netlist." return merged_model def update_buffer_ranges(merged_model, queue, buf_info, buf_group_ranges): @@ -197,7 +218,7 @@ def get_queue_size(netlist, queue): is_untilized = False if("layout" in netlist["queues"][queue]): is_untilized = (netlist["queues"][queue]["layout"] != "tilized") - format = str_to_format(netlist["queues"][queue]["df"]) + format = backend_api.get_format_from_string(netlist["queues"][queue]["df"]) ublock_ct = netlist["queues"][queue]["ublock"][0] ublock_rt = netlist["queues"][queue]["ublock"][1] mblock_m = netlist["queues"][queue]["mblock"][0] @@ -211,23 +232,28 @@ def get_queue_size(netlist, queue): tile_width = netlist["queues"][queue]["tile_dim"][1] return backend_api.get_io_size_in_bytes(format, is_untilized, ublock_ct, ublock_rt, mblock_m, mblock_n, t, entries, tile_height, tile_width) -def reallocate_queues(merged_model, dynamic_queues, start_offset_to_queue_buf_per_model, soc_descriptor, switch_chans_if_capacity_hit, overlap_dynamic_queues): - dev_cfg = backend_api.DeviceConfig("wormhole_b0", +def reallocate_queues(arch, merged_model, dynamic_queues, start_offset_to_queue_buf_per_model, soc_descriptor, switch_chans_if_capacity_hit, overlap_dynamic_queues): + dev_cfg = backend_api.DeviceConfig(arch, soc_descriptor, "", "", "", False, []) + max_reserved_backend_space = dev_cfg.get_dram_backend_reserved_max() - backend_reserved_dram_memory = {0 : max_reserved_backend_space, 1 : max_reserved_backend_space, 2 : max_reserved_backend_space, 3 : max_reserved_backend_space, - 4 : max_reserved_backend_space, 5 : max_reserved_backend_space} + # Constants derived from the SOC descriptor. These will be unchanged for the arch. + max_dram_space = dev_cfg.get_dram_channel_capacity() + backend_reserved_dram_memory = {} memory_consumed_per_host_channel = {} + + for chan in range(dev_cfg.get_dram_num_channels()): + backend_reserved_dram_memory[chan] = max_reserved_backend_space + for host_chan in range(dev_cfg.get_host_memory_num_channels()): memory_consumed_per_host_channel[host_chan] = dev_cfg.get_host_memory_channel_start_address() static_queue_dram_space = copy.copy(backend_reserved_dram_memory) - MAX_DRAM_SPACE = 2**31 if not switch_chans_if_capacity_hit: logger.warning("Memory Optimization Allowing Buffer Channels to be Reallocated is disabled") @@ -259,16 +285,16 @@ def reallocate_queues(merged_model, dynamic_queues, start_offset_to_queue_buf_pe queue_size = get_queue_size(merged_model, queue) if(merged_model["queues"][queue]["loc"].lower() == "dram"): for alloc in merged_model["queues"][queue]["dram"]: - if static_queue_dram_space[alloc[0]] + queue_size > MAX_DRAM_SPACE: + if static_queue_dram_space[alloc[0]] + queue_size > max_dram_space: if switch_chans_if_capacity_hit: logger.info("DRAM Channel {} capacity hit. Bytes Used: {}. 
Reallocating queue to a different channel", alloc[0], static_queue_dram_space[alloc[0]]) for i in static_queue_dram_space: - if static_queue_dram_space[i] + queue_size <= MAX_DRAM_SPACE: + if static_queue_dram_space[i] + queue_size <= max_dram_space: alloc[0] = i alloc[1] = static_queue_dram_space[alloc[0]] static_queue_dram_space[alloc[0]] += queue_size - assert static_queue_dram_space[alloc[0]] <= MAX_DRAM_SPACE, "DRAM space exceeded for DRAM channel " + str(alloc[0]) + " when trying to allocate memory for queue " + queue + " Bytes used: " + str(static_queue_dram_space[alloc[0]]) + assert static_queue_dram_space[alloc[0]] <= max_dram_space, "DRAM space exceeded for DRAM channel " + str(alloc[0]) + " when trying to allocate memory for queue " + queue + " Bytes used: " + str(static_queue_dram_space[alloc[0]]) static_queue_dram_space[alloc[0]] = backend_api.get_next_aligned_address(static_queue_dram_space[alloc[0]]) else: for (alloc_idx, alloc) in enumerate(merged_model["queues"][queue]["host"]): @@ -309,7 +335,7 @@ def uniquify_tensor_bin_names(unzipped_tti_paths, merged_tti_path): os.path.join(merged_tti_path, "unzipped_tti", "tensors", "model_" + str(i) + "_" + tensor_bin)) -def merge_device_metadata(unzipped_tti_paths, merged_tti_path): +def merge_device_metadata(unzipped_tti_paths, merged_tti_path, inter_model_connections): logger.info("Generating Metadata for merged model...") netlist_names = [] merged_md = { @@ -327,6 +353,13 @@ def merge_device_metadata(unzipped_tti_paths, merged_tti_path): "devtype" : {} } + intermediate_inputs = set() + intermediate_outputs = set() + + for connection in inter_model_connections: + intermediate_inputs.add(connection) + intermediate_outputs.add(inter_model_connections[connection]) + for (i, tti_path) in enumerate(unzipped_tti_paths): with open(os.path.join(tti_path, "unzipped_tti", "device.json"), "r") as file: device_md = json.load(file) @@ -335,9 +368,11 @@ def merge_device_metadata(unzipped_tti_paths, merged_tti_path): for name in device_md["compiled_graph_state"]["ordered_parameter_node_names"]: merged_md["compiled_graph_state"]["ordered_parameter_node_names"].append("model_" + str(i) + "_" + name) for name in device_md["compiled_graph_state"]["ordered_input_names"]: - merged_md["compiled_graph_state"]["ordered_input_names"].append("model_" + str(i) + "_" + name) + if not "model_" + str(i) + "_" + name in intermediate_inputs: + merged_md["compiled_graph_state"]["ordered_input_names"].append("model_" + str(i) + "_" + name) for name in device_md["compiled_graph_state"]["ordered_output_names"]: - merged_md["compiled_graph_state"]["ordered_output_names"].append("model_" + str(i) + "_" + name) + if not "model_" + str(i) + "_" + name in intermediate_outputs: + merged_md["compiled_graph_state"]["ordered_output_names"].append("model_" + str(i) + "_" + name) for name in device_md["compiled_graph_state"]["post_const_eval_parameters"]: merged_md["compiled_graph_state"]["post_const_eval_parameters"]["model_" + str(i) + "_" + name] = device_md["compiled_graph_state"]["post_const_eval_parameters"][name] tensor_bin = merged_md["compiled_graph_state"]["post_const_eval_parameters"]["model_" + str(i) + "_" + name]["bin"].split("/")[1] @@ -414,25 +449,43 @@ def unzip_ttis_and_generate_output_dir(tti_file_paths, output_tti_dir): sp.run(['tar', '-xf', tti, '-C', unzipped_tti_directory]) return unzipped_tti_directories -def merge_netlists(netlist_paths, merged_tti_path, unzipped_tti_paths, overlay_blob_size_per_model, switch_chans_if_capacity_hit, 
overlap_dynamic_queues): +def merge_netlists(arch, netlist_paths, merged_tti_path, unzipped_tti_paths, overlay_blob_size_per_model, switch_chans_if_capacity_hit, overlap_dynamic_queues, inter_model_connections, consumer_to_producers_map): logger.info("Merging Netlists...") - soc_descriptor = os.path.join(unzipped_tti_paths[0], "unzipped_tti/backend_build_binaries/device_desc_runtime/0.yaml") - if not os.path.exists(soc_descriptor): + + wh_soc_desc_dir = os.path.join(unzipped_tti_paths[0], "unzipped_tti/backend_build_binaries/device_desc_runtime") + gs_golden_soc_desc_dir = os.path.join(unzipped_tti_paths[0], "unzipped_tti/backend_build_binaries/device_descs") + soc_descriptor = "" + + soc_desc_dir = wh_soc_desc_dir # Expect files in WH device desc location + if not os.path.exists(wh_soc_desc_dir): + # If WH device desc dir does not exist, check GS silicon or Golden (All archs) location + soc_desc_dir = gs_golden_soc_desc_dir + # If device desc dir does not exist, set it to default device_descs.yaml + if os.path.exists(soc_desc_dir): + soc_desc_files = os.listdir(soc_desc_dir) + if len(soc_desc_files): + soc_descriptor = os.path.join(soc_desc_dir, soc_desc_files[0]) + + if not soc_descriptor: soc_descriptor = os.path.join(unzipped_tti_paths[0], "unzipped_tti/backend_build_binaries/device_desc.yaml") - uniquifed_netlist = merge_unique_netlists(uniquify_global_structures(netlist_paths), overlay_blob_size_per_model) - dynamic_queues, start_offset_to_queue_buf_per_model = get_dynamic_queue_info(uniquifed_netlist) - merged_model = reallocate_queues(uniquifed_netlist, dynamic_queues, start_offset_to_queue_buf_per_model, soc_descriptor, switch_chans_if_capacity_hit, overlap_dynamic_queues) + + assert os.path.exists(soc_descriptor), "Could not find SOC Descriptor in Unzipped TTI Files" + + merged_netlist = uniquify_and_merge_netlists(netlist_paths, inter_model_connections, consumer_to_producers_map, overlay_blob_size_per_model) + dynamic_queues, start_offset_to_queue_buf_per_model = get_dynamic_queue_info(merged_netlist) + merged_model = reallocate_queues(arch, merged_netlist, dynamic_queues, start_offset_to_queue_buf_per_model, soc_descriptor, switch_chans_if_capacity_hit, overlap_dynamic_queues) + + yaml_output = yaml.dump(merged_model, default_flow_style=False, sort_keys=False) netlist_path = os.path.join(merged_tti_path, "unzipped_tti/merged_netlist.yaml") with open(netlist_path, "w+") as file: file.write(yaml_output) return netlist_path -def compile_backend_binaries(merged_tti_path, netlist_path): +def compile_backend_binaries(arch, merged_tti_path, netlist_path): logger.info("Compiling TT Binaries for merged model...") os.makedirs(os.path.join(merged_tti_path, "unzipped_tti/backend_build_binaries/")) bcfg = backend_api.BackendConfig(backend_api.BackendType.Silicon, - backend_api.BackendDevice.Wormhole_B0, + backend_api.BackendDevice.from_string(arch.capitalize()), backend_api.DeviceMode.CompileOnly, 0, os.path.join(merged_tti_path, "unzipped_tti/backend_build_binaries/"), @@ -450,23 +503,81 @@ def cleanup(): logger.info("Cleaning up intermediate state and exiting") for dir in temp_directories: shutil.rmtree(dir) - -def merge_models(model_binaries, arch, merged_model_location = "", switch_chans_if_capacity_hit = True, overlap_dynamic_queues = True): + +def check_model_dep_constraints(models, dep_list): + for model in dep_list: + for input in dep_list[model]["inputs"]: + if type(dep_list[model]["inputs"][input]) == str: + assert dep_list[model]["inputs"][input].lower() == "host", "If input for 
model " + str(model) + " is not host, the feeder must be specified in format [feeder_model_name, feeder_queue_name]." + else: + assert type(dep_list[model]["inputs"][input]) == list, "The feeder for model " + str(model) + " must be specified in format [feeder_model_name, feeder_queue_name]." + assert dep_list[model]["inputs"][input][0] in models, "Feeder model " + str(dep_list[model]["inputs"][input][0]) + " to consumer model " + model + " is not specified." + +def parse_model_deps(models, dependency_list_file): + ordered_models = [] + model_connections = {} + model_name_remap = {} + consumer_to_producers_map = {} + with open(dependency_list_file, "r") as dep_file: + dep_list = yaml.load(dep_file, Loader = yaml.FullLoader) + + check_model_dep_constraints(models, dep_list) + while len(models) != len(ordered_models): + for model in models: + if not model in ordered_models: + if not model in dep_list: + logger.warning("Could not find model {} in dependency list. Assuming that this model is fed by host.", model) + ordered_models.append(model) + else: + feeders_found = True + for input in dep_list[model]["inputs"]: + if not type(dep_list[model]["inputs"][input]) == str: + feeder_model = dep_list[model]["inputs"][input][0] + feeders_found = feeder_model in ordered_models + if feeders_found: + ordered_models.append(model) + + for model_idx in range(len(ordered_models)): + model_name_remap[ordered_models[model_idx]] = "model_" + str(model_idx) + if ordered_models[model_idx] in dep_list: + for input in dep_list[ordered_models[model_idx]]["inputs"]: + if type(dep_list[ordered_models[model_idx]]["inputs"][input]) == str: + continue + feeder_model = model_name_remap[dep_list[ordered_models[model_idx]]["inputs"][input][0]] + feeder_queue = dep_list[ordered_models[model_idx]]["inputs"][input][1] + model_connections["model_" + str(model_idx) + "_" + input] = feeder_model + "_" + feeder_queue + if not "model_" + str(model_idx) in consumer_to_producers_map: + consumer_to_producers_map["model_" + str(model_idx)] = [] + consumer_to_producers_map["model_" + str(model_idx)].append(feeder_model) + return ordered_models, model_connections, consumer_to_producers_map + +def merge_models(model_bin_location, models, arch = "wormhole_b0", merged_model_location = "", switch_chans_if_capacity_hit = True, + overlap_dynamic_queues = True): # Main API that gets exported to other files try: - assert arch == "grayskull" or arch == "wormhole_b0", "Expected arch to be Grayskull or Wormhole_B0" + assert arch == "grayskull" or arch == "wormhole_b0", "Expected arch to be grayskull or wormhole_b0" output_loc = merged_model_location if not output_loc: output_loc = "merged_model.tti" - merged_binary_dir = tempfile.mkdtemp() temp_directories.append(merged_binary_dir) + + # Parse dependency file, topologically sort models, and infer connections + ordered_models = models + inter_model_connections = {} + consumer_to_producers_map = {} + dependency_file = "" # Explicitly set dependency file to empty, since we don't have compiler support for pipelined models + if dependency_file: + ordered_models, inter_model_connections, consumer_to_producers_map = parse_model_deps(models, dependency_file) + + model_binaries = [os.path.join(model_bin_location, x + ".tti") for x in ordered_models] unzipped_tti_paths = unzip_ttis_and_generate_output_dir(model_binaries, merged_binary_dir) overlay_blob_size_per_model = verify_and_copy_config_json(unzipped_tti_paths, merged_binary_dir) - netlist_names = merge_device_metadata(unzipped_tti_paths, 
merged_binary_dir) + netlist_names = merge_device_metadata(unzipped_tti_paths, merged_binary_dir, inter_model_connections) uniquify_tensor_bin_names(unzipped_tti_paths, merged_binary_dir) - merged_netlist_path = merge_netlists(netlist_names, merged_binary_dir, unzipped_tti_paths, overlay_blob_size_per_model, switch_chans_if_capacity_hit, overlap_dynamic_queues) - compile_backend_binaries(merged_binary_dir, merged_netlist_path) + merged_netlist_path = merge_netlists(arch, netlist_names, merged_binary_dir, unzipped_tti_paths, overlay_blob_size_per_model, + switch_chans_if_capacity_hit, overlap_dynamic_queues, inter_model_connections, consumer_to_producers_map) + compile_backend_binaries(arch, merged_binary_dir, merged_netlist_path) create_merged_tti(output_loc, merged_binary_dir) logger.info("Binaries for the merged model are stored in: " + output_loc) logger.info("Done!") @@ -474,14 +585,23 @@ def merge_models(model_binaries, arch, merged_model_location = "", switch_chans_ except Exception as e: logger.exception(e) cleanup() - + if __name__ == "__main__": # Interface to run tool directly parser = argparse.ArgumentParser() - parser.add_argument("--model_binaries", type = str, help = "List of model binaries (tti files) to merge.", required = True, nargs = "*") - parser.add_argument("--arch", type = str, help = "Target TT architecture.", default="wormhole_b0") - parser.add_argument("--merged_model_location", type = str, help = "Filesystem location where the merged model binaries are stored.") - parser.add_argument("--skip_channel_reallocation", type = bool, help = "Skip memory usage optimization that reallocates buffers on different DRAM channels, once channel capacity is hit.", default = False) - parser.add_argument("--dynamic_queue_overlap_off", type = bool, help = "Turn off memory usage optimization that overlaps dynamic queues", default = False) + parser.add_argument("--model_binaries_location", "-mbl", type=str, help="Location of model binaries (tti files) to merge.", required=True) + parser.add_argument("--models", "-mdl", type=str, help="List of models to merge", required=True, nargs="*") + # Disable passing in dependency files for now, since we don't have compiler support for pipelined models + # parser.add_argument("--dependency_file", "-df", type=str, help="YAML file describing IO dependencies between models") + parser.add_argument("--arch", "-a", type=str, help="Target TT architecture.", default="wormhole_b0") + parser.add_argument("--merged_model_location", "-mml", type=str, help="Filesystem location where the merged model binaries are stored.") + parser.add_argument("--skip_channel_reallocation", "-scr", type=bool, help="Skip memory usage optimization that reallocates buffers on different DRAM channels, once channel capacity is hit.", default=False) + parser.add_argument("--dynamic_queue_overlap_off", "-dqo", type=bool, help="Turn off memory usage optimization that overlaps dynamic queues", default=False) args = parser.parse_args() - merge_models(args.model_binaries, args.arch.lower(), args.merged_model_location, not args.skip_channel_reallocation, not args.dynamic_queue_overlap_off) + + merge_models(args.model_binaries_location, + args.models, + args.arch.lower(), + args.merged_model_location, + not args.skip_channel_reallocation, + not args.dynamic_queue_overlap_off) diff --git a/pybuda/pybuda/torch_compile.py b/pybuda/pybuda/torch_compile.py index 9e956fe32..d758ddf11 100644 --- a/pybuda/pybuda/torch_compile.py +++ b/pybuda/pybuda/torch_compile.py @@ -1,29 +1,32 @@ # 
SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC # SPDX-License-Identifier: Apache-2.0 -import copy import hashlib import os import pybuda -import sys import torch -import types import io import json +from typing import List from contextlib import redirect_stdout -from pybuda._C.graph import get_constant_input_value, Graph -from pybuda._C.backend_api import translate_addresses -from pybuda._C.torch_device import get_default_device, push_tensor, is_created_on_device, original_shape, PyBudaTensorDesc, CompileRequest, Program + from loguru import logger -from pybuda.capture_fx_graph import append_to_graph -from pybuda.tensor import const_eval_tensor, do_runtime_transform + +from pybuda._C.torch_device import get_default_device, unique_id, PyBudaTensorDesc from pybuda.compiled_graph_state import CompiledGraphState +from pybuda.fx.capture import CaptureFX +from pybuda.fx.schedule import TensorSource +from pybuda.fx.mixed_graph import MixedGraph + + _tt0 = None _compile_cache = None _compile_cache_dir = os.environ.get("PYBUDA_COMPILE_CACHE_DIR", "tt_build") -_graph = None +_capture: CaptureFX = CaptureFX() _subgraph_index = 0 _module_index = 0 +_tensor_to_unique_id = {} +_link_subgraph_unique_tensor_ids = [] """ There are dummy enums defined in pytorch, like PrivateUse1 that can be used for bringing up new device types. Eventually we should mainline an enum for @@ -39,9 +42,8 @@ def reset_state(): _tt0 = None global _compile_cache _compile_cache = None - global _graph - _graph = None global _subgraph_index + _capture.reset_state() _subgraph_index = 0 # do not reset module index, we need unique name for bbe compile in case filename cannot be extracted logger.debug("Resetting state") @@ -73,10 +75,10 @@ def torch_device(index=0): return get_available_devices()[index].torch_device() -def _build_backend_compile_request(device, compiler_cfg, compiled_graph_state): +def _build_backend_compile_request(device, compiler_cfg, compiled_graph_state, subgraph_idx: int, program_ids: List[int]): soc_desc_yaml = ( compiler_cfg.backend_device_descriptor_path - if compiler_cfg.backend_cluster_descriptor_path == "" + if compiler_cfg.backend_device_descriptor_path == "" else device.soc_desc_yaml ) @@ -97,16 +99,35 @@ def _build_backend_compile_request(device, compiler_cfg, compiled_graph_state): cluster_yaml, ) - inputs = [ - PyBudaTensorDesc(name, shape) - for name, shape in zip( - compiled_graph_state.ordered_input_names, compiled_graph_state.ordered_input_shapes - ) - ] - - input_runtime_transforms = [ - json.dumps(transform.to_json()) for transform in compiled_graph_state.ordered_input_runtime_tensor_transforms - ] + inputs = {} + for program_id in program_ids: + graph_idx = MixedGraph.get_program_subgraph_id(subgraph_idx, program_id) + program_inputs = [ + PyBudaTensorDesc(name, shape) + for name, shape in zip( + compiled_graph_state.get_ordered_input_names_for_subgraph(graph_idx), compiled_graph_state.get_ordered_input_shapes_for_subgraph(graph_idx) + ) + ] + inputs[graph_idx] = program_inputs + + #input_runtime_transforms = {} + #for i in range(subgraph_idx + 1): + # input_runtime_transforms[i] = [ + # json.dumps(transform.to_json()) for transform in compiled_graph_state.get_ordered_input_runtime_transforms_for_subgraph(i) + # ] + input_runtime_transforms = device.input_runtime_transforms # append to existing ones + for program_id in program_ids: + graph_idx = MixedGraph.get_program_subgraph_id(subgraph_idx, program_id) + input_runtime_transforms[graph_idx] = [ + json.dumps(transform.to_json()) for 
transform in compiled_graph_state.get_ordered_input_runtime_transforms_for_subgraph(graph_idx) + ] + + input_tile_bcast_dims = device.input_tile_bcast_dims # append to existing ones + for program_id in program_ids: + graph_idx = MixedGraph.get_program_subgraph_id(subgraph_idx, program_id) + input_tile_bcast_dims[graph_idx] = compiled_graph_state.get_ordered_input_tile_broadcast_dims_for_subgraph(graph_idx) + #for i in range(subgraph_idx + 1): + # input_tile_bcast_dims[i] = compiled_graph_state.get_ordered_input_tile_broadcast_dims_for_subgraph(i) constants = [ PyBudaTensorDesc( @@ -122,15 +143,28 @@ def _build_backend_compile_request(device, compiler_cfg, compiled_graph_state): for name, param in compiled_graph_state.post_const_eval_parameters.items() ] - outputs = [ - PyBudaTensorDesc(name, shape) - for name, shape in zip( - compiled_graph_state.ordered_output_names, compiled_graph_state.ordered_output_shapes - ) - ] - output_runtime_transforms = [ - json.dumps(transform.to_json()) for transform in compiled_graph_state.ordered_output_runtime_tensor_transforms - ] + outputs = {} + for program_id in program_ids: + graph_idx = MixedGraph.get_program_subgraph_id(subgraph_idx, program_id) + program_outputs = [ + PyBudaTensorDesc(name, shape) + for name, shape in zip( + compiled_graph_state.get_ordered_output_names_for_subgraph(graph_idx), compiled_graph_state.get_ordered_output_shapes_for_subgraph(graph_idx) + ) + ] + outputs[graph_idx] = program_outputs + + output_runtime_transforms = device.output_runtime_transforms # append to existing ones + #for i in range(subgraph_idx + 1): + # output_runtime_transforms[i] = [ + # json.dumps(transform.to_json()) for transform in compiled_graph_state.get_ordered_output_runtime_transforms_for_subgraph(i) + # ] + for program_id in program_ids: + graph_idx = MixedGraph.get_program_subgraph_id(subgraph_idx, program_id) + output_runtime_transforms[graph_idx] = [ + json.dumps(transform.to_json()) for transform in compiled_graph_state.get_ordered_output_runtime_transforms_for_subgraph(graph_idx) + ] + logger.debug("Build CompileRequest") return CompileRequest( @@ -139,6 +173,7 @@ def _build_backend_compile_request(device, compiler_cfg, compiled_graph_state): bcfg, inputs, input_runtime_transforms, + input_tile_bcast_dims, constants, parameters, outputs, @@ -146,58 +181,42 @@ def _build_backend_compile_request(device, compiler_cfg, compiled_graph_state): ) -def _compile(module, aten_module, module_name, sample_inputs, aten_sample_inputs, device, compiler_cfg): +def _compile(module, aten_module, module_name, sample_inputs, device, compiler_cfg): global _tt0 global _subgraph_index - global _graph - - if _tt0 is None: - _tt0 = pybuda.TTDevice("tt0", arch=device.arch) - else: - _tt0.remove_modules() - torch_device = list(module.parameters())[0].device if len(list(module.parameters())) > 0 else "tt" - module = module.to("cpu") # PyBuda compile on CPU - _tt0.place_module(pybuda.module.PyTorchModule(module_name, module)) - - if _graph is None: - logger.debug("Creating New graph") - _graph = Graph(module_name) - - assert ( - _tt0.arch == device.arch - ), f"Mismatch in the arch compiling for vs the currently bound device {_tt0.arch} != {device.arch}" - assert ( - _tt0.devtype == device.type - ), f"Mismatch in the arch compiling for vs the currently bound device {_tt0.devtype} != {device.type}" + if os.environ.get("PRINT_PT2_GRAPH", "0") == "1": + logger.info("Compiling pt2 graph:") + aten_module.graph.print_tabular() # Frontend Compile logger.debug("Appending to Graph") - 
_graph, intermediate_tensors, output_tensors = append_to_graph(_graph, module, aten_module, aten_sample_inputs, sample_inputs, _subgraph_index) - logger.debug(f"Appending to graph done, captured {len(_graph.nodes())} nodes") + device_graph_changed, graph_inputs, intermediate_tensors, output_tensors, schedule = _capture.append_to_graph( + module_name, module, aten_module, sample_inputs, _subgraph_index) + _subgraph_index += 1 - _tt0.graph = _graph.clone() - _tt0.intermediate_tensors = intermediate_tensors - _tt0.output_tensors = [pybuda.Tensor.create_from_torch(output_tensor) for output_tensor in output_tensors] + + if not device_graph_changed: + return None, None, schedule + logger.debug("Frontend Compile") - fe_compile_result = pybuda.compile.pybuda_compile( - _tt0, + module = module.to("cpu") + + fe_compile_result = pybuda.pybuda_compile_torch( module_name, - *list(map(pybuda.Tensor.create_from_torch, sample_inputs)), - compiler_cfg=compiler_cfg, - microbatch_size=sample_inputs[0].shape[0], - # TODO: support all arguments + module, + _capture.get_buda_graph(), + *[pybuda.Tensor.create_from_torch(sample_input.to("cpu")) for sample_input in [g for gs in graph_inputs for g in gs]] ) # Backend Compile logger.debug("Backend Compile") - compiled_graph_state = CompiledGraphState.from_compiled_graph(_tt0, fe_compile_result) + compiled_graph_state = CompiledGraphState.from_compiled_graph(module, fe_compile_result) workload = device.compile( - _build_backend_compile_request(device, compiler_cfg, compiled_graph_state) + _build_backend_compile_request(device, compiler_cfg, compiled_graph_state, _subgraph_index - 1, schedule.get_device_program_ids()) ) - - module = module.to(torch_device) # PyBuda compile on CPU - return workload, compiled_graph_state + + return workload, compiled_graph_state, schedule def _create_compile_key(module, module_name, sample_inputs, device, compiler_cfg): @@ -218,7 +237,7 @@ def _populate_compile_cache(): return compile_cache -def _compile_cached(module, aten_module, module_name, sample_inputs, aten_sample_inputs, device, compiler_cfg, cache): +def _compile_cached(module, aten_module, module_name, sample_inputs, device, compiler_cfg, cache): global _compile_cache global _tt0 @@ -242,26 +261,27 @@ def _compile_cached(module, aten_module, module_name, sample_inputs, aten_sample else: compiler_cfg.backend_output_dir = pybuda.utils.resolve_output_build_directory() - workload, compiled_graph_state = _compile(module, aten_module, module_name, sample_inputs, aten_sample_inputs, device, compiler_cfg) + workload, compiled_graph_state, schedule = _compile(module, aten_module, module_name, sample_inputs, device, compiler_cfg) - if key is not None: - _compile_cache[key] = workload - return workload, compiled_graph_state + if key is not None and workload is not None: + _compile_cache[key] = (workload, compiled_graph_state, schedule) + return workload, compiled_graph_state, schedule class compiledModel(torch.nn.Module): - def __init__(self, module, device, workload, compiled_graph_state, index): + def __init__(self, module, device, workload, compiled_graph_state, schedule, index): super().__init__() self.module = module self.device = device self.workload = workload self.compiled_graph_state = compiled_graph_state + self.schedule = schedule self.index = index + self.is_compile = True # Submit work to device def forward(self, *inputs, **kwargs): logger.debug("Invoke Submit") assert type(inputs) is tuple - inputs = tuple([i.to(self.device.torch_device()) for i in inputs]) for i, input in 
enumerate(inputs): if input.device != self.device.torch_device(): @@ -270,98 +290,151 @@ def forward(self, *inputs, **kwargs): ) loop_count = 1 - program_params = {"$p_loop_count": str(loop_count)} - program = Program(f"run_fwd_{self.index}", program_params) - logger.info(f"Running run_fwd_{self.index}") - - outputs = self.device.dispatch(self.workload, [program], list(inputs), self.compiled_graph_state.output_host_tms) - + if self.compiled_graph_state is not None and not self.compiled_graph_state.has_cache_buffers: + program_params = {"$p_loop_count": str(loop_count)} + else: + program_params = { + "$p_cache_write_index": str(0), + "$p_inner_loop_count": str(1), + "$p_inner_increment": str(1), + "$p_outer_loop_count": str(1), + "$p_outer_increment": str(1), + } + output_map = {} + intermediates = {} + + # Run the schedule + outputs_generated = set() + logger.info(f"Running subgraph {self.index}") + for item in self.schedule: + + graph_inputs = [] + for i in item.inputs: + if i.src == TensorSource.INTERMEDIATE: + graph_inputs.append(intermediates[i.index].to('tt')) + elif i.src == TensorSource.INPUT: + graph_inputs.append(inputs[i.index]) + else: + graph_inputs.append(output_map[i.index]) + + #print("graph inputs:") + #for i, p in enumerate(graph_inputs): + # print(" - ", i, ": ", p.to('cpu')) + + if item.fallback: + # CPU graph + logger.trace(f"Running fallback graph on CPU: {item.graph_index}") + #graph_module = torch.fx.GraphModule({}, item.graph) + graph_inputs = [i.to('cpu') for i in graph_inputs] + graph_outputs = item.graph_module(*graph_inputs) + logger.trace(f"Done, produced {len(graph_outputs)} outputs.") + graph_outputs = tuple(t.to('tt') for t in graph_outputs) + else: + # Device - dispatch to device + program_index = MixedGraph.get_program_subgraph_id(self.index, item.graph_index) + logger.debug(f"Running program[{program_index}] on device") + + graph_outputs = self.device.dispatch( + self.workload, program_index, list(graph_inputs), self.compiled_graph_state.output_host_tms, self.is_compile) + + for i, output in enumerate(graph_outputs): + if torch.isnan(output.to('cpu')).any(): # debug + logger.error(f"Output {i} has NaNs:") + logger.error(output.to('cpu')) + raise RuntimeError(f"Output {i} is NaN") + """ + for i, output in enumerate(graph_outputs): + print(f"Graph output {i} of {len(graph_outputs)}: shape={output.to('cpu').shape}, desired shape={self.workload.outputs[program_index][i].shape}, item.outputs={item.outputs}") + + for i, output in enumerate(graph_outputs): + narrowed = False + for dim in range(len(output.shape)): + # TODO: reproduce in a smaller test than StableDiffusion + if output.shape[dim] != self.workload.outputs[item.graph_index][i].shape[dim]: + narrowed = True + graph_outputs[i] = output.narrow(dim, 0, self.workload.outputs[item.graph_index][i].shape[dim]).to('cpu').to('tt') + + # TODO: + # If narrowed, the tensor is now on CPU, and can't be used to link to the next graph.. 
so that needs disabling somehow + if not narrowed: + graph_outputs[i] = graph_outputs[i].clone() + """ + + #print("graph outputs:") + #for i, p in enumerate(graph_outputs): + # print(" - ", i, ": ", p.to('cpu')) + + + # Record outputs + for i, output in enumerate(item.outputs): + if output.intermediate: + intermediates[output.index] = graph_outputs[i] + else: + assert output.index not in outputs_generated + output_map[output.index] = graph_outputs[i] + outputs_generated.add(output.index) + + # Flatten output map into list + outputs = [output_map[i] for i in range(len(output_map))] + + for out in outputs: + _tensor_to_unique_id[unique_id(out)] = out + + _capture.capture_sample_outputs(outputs, self.index) + # Check previous outputs and push to new param queue + + # TODO: We need to do something to clone this in case backend is going to deallocate.... but we don't yet know if it will :(. + Instead, we copy to cpu. + #outputs = [o.to('cpu') for o in outputs] return outputs def to(self, dev): - for desc in self.workload.parameters: - name = desc.name - value = self.compiled_graph_state.post_const_eval_parameters[name] - # value = self.compiled_graph_state.post_const_eval_parameters[name].to(dev) - # value = const_eval_tensor({name: self.module.state_dict()[name]}, self.consteval_trace, self.parameter_to_tile_dims, name).to(dev) - push_tensor(self.workload.backend, desc, value, "") - - for desc in self.workload.constants: - name = desc.name - value = self.compiled_graph_state.post_const_eval_constants[name] - # value = self.compiled_graph_state.post_const_eval_constants[name].to(dev) - # value = const_eval_tensor({name: get_constant_input_value(constant, True)}, self.consteval_trace, self.constant_to_tile_dims, name).to(dev) - push_tensor(self.workload.backend, desc, value, "") - # self.module.to(dev) - -from typing import Any, Tuple -def decompose_split(self: torch.Tensor, split_size: int, dim: int = 0) -> Tuple[torch.Tensor, ...]: - starts = list(range(0, self.size(dim), split_size)) - stops = starts[1:] + [self.size(dim)] - slices = [] - for start, stop in zip(starts, stops): - slices.append(self.narrow(dim, start, stop - start)) - return slices - -def decompose_matmul(bias, input, weight) -> torch.Tensor: - res = torch.matmul(input, weight) - res = torch.add(res, bias) - return res - -from torch._decomp import core_aten_decompositions, get_decompositions -from torch._functorch.aot_autograd import aot_module_simplified -from torch.fx.experimental.proxy_tensor import make_fx -from torch.fx.experimental.proxy_tensor import DecompositionInterpreter -from torch.fx.interpreter import Interpreter + pass + +from torch._decomp import core_aten_decompositions +from torch.fx.experimental.proxy_tensor import make_fx from torch._functorch.compile_utils import strip_overloads -pybuda_decompositions = { - torch.ops.aten.split.Tensor: decompose_split, - torch.ops.aten.addmm.default: decompose_matmul, - } +from pybuda.fx.torch_decomp_reconstruct import get_pybuda_decompositions, apply_torch_reconstruct_patterns + def compile_torch( module, sample_inputs, options=None, ): - class ATENStorer: - def __init__(self): - self.aten_module = None - self.sample_inputs = None - - def __call__(self, *args: Any, **kwds: Any) -> Any: - self.aten_module = args[0] - self.sample_inputs = args[1] - + torch_device = list(module.parameters())[0].device if len(list(module.parameters())) > 0 else "tt" with torch.no_grad(): + pybuda_decompositions = get_pybuda_decompositions() decompositions = {**core_aten_decompositions(),
**pybuda_decompositions} - aten_storer = ATENStorer() - aot_module_simplified(module, sample_inputs, fw_compiler=aten_storer, decompositions=decompositions, keep_inference_input_mutations=True) - return _torch_compile(module, sample_inputs, aten_storer.aten_module, aten_storer.sample_inputs) - - + fake_tensor_mode = torch._dynamo.utils.detect_fake_mode(sample_inputs) + fake_tensor_mode.allow_non_fake_inputs = True + aten = make_fx(module, tracing_mode="symbolic", decomposition_table=decompositions, _allow_non_fake_inputs=True)(*sample_inputs) + apply_torch_reconstruct_patterns(aten) + return _torch_compile(module, sample_inputs, aten, original_torch_device=torch_device) + +_device = None def _torch_compile( module, sample_inputs, aten_module, - aten_sample_inputs, device=None, compiler_cfg=None, module_name=None, cache=False, # Disabled for now + original_torch_device=None, ): - """ - Ideally we can remove having to pass in tt0 (the ttdevice.py) object here, - but currently it's so coupled to our compile flow that it's too much work to - remove its dependency for this proof of concept. - Ideally pybuda.compile.pybuda_compile just takes a device_config dataclass - which has the device target information to decouple it from the runtime device. - """ logger.info("Torch Compile") + #global _ordered_inputs_per_subgraph + #_ordered_inputs_per_subgraph[_subgraph_index] = [unique_id(inp) for inp in sample_inputs] + strip_overloads(aten_module) - if device is None: - device = get_default_device() + + global _device + if _device is None: + _device = get_default_device() + assert _device is get_default_device() + device = _device if compiler_cfg is None: compiler_cfg = pybuda.config._get_global_compiler_config() @@ -378,14 +451,17 @@ def _torch_compile( cache &= not bool(os.environ.get("PYBUDA_DISABLE_COMPILE_CACHE", "0")) - rand_inputs = [torch.rand(sample_input.shape).to(sample_input.dtype) for sample_input in sample_inputs] - rand_atan_inputs = [torch.rand(aten_sample_input.shape).to(aten_sample_input.dtype).to(aten_sample_input.device) for aten_sample_input in aten_sample_inputs] - - workload, compiled_graph_state = _compile_cached( - module, aten_module, module_name, rand_inputs, rand_atan_inputs, device, compiler_cfg, cache + workload, compiled_graph_state, schedule = _compile_cached( + module, aten_module, module_name, sample_inputs, device, compiler_cfg, cache ) - compiled_model = compiledModel(module, device, workload, compiled_graph_state, _subgraph_index-1) + compiled_model = compiledModel(module, device, workload, compiled_graph_state, schedule, _subgraph_index-1) # Push parameters and constants to device compiled_model.to(device.torch_device()) + logger.info("Done Torch Compile") + if original_torch_device is not None: + module = module.to(original_torch_device) return compiled_model + + +# compile_torch = aot_autograd(fw_compiler=_torch_compile, decompositions={**core_aten_decompositions(), **pybuda_decompositions}) diff --git a/pybuda/pybuda/transformers/pipeline.py b/pybuda/pybuda/transformers/pipeline.py index 1dec61e64..db36cfcfa 100644 --- a/pybuda/pybuda/transformers/pipeline.py +++ b/pybuda/pybuda/transformers/pipeline.py @@ -120,6 +120,26 @@ def prepare_inputs_for_generation(self, *inputs, **kwargs): total_length = self.max_length pad_len = total_length - orig_len + + if "cache_position" in kwargs: + # Cache positions indicate the positions of input sequence tokens within the sequence. They're utilized to + # update cache positions and to deduce the complete sequence length. 
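+ # For example (illustrative values only, not taken from this change): for a 5-token prompt, transformers would internally + # build cache_position = torch.arange(5) on the first generation step, and roughly cache_position = torch.tensor([5]) on + # the step that follows, i.e. cache_position always tracks the unpadded token positions.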
+ # + # However, cache_position is presumed to be unpadded (for accurate sequence length calculations). Consequently, + # it poses issues during compilation, particularly as we assume tile-aligned dimensions at some point. Therefore, + # cache_position is expected to be an arange over (1, orig_len) and serves as a constant for the model if not defined + # as input. Hence, we're removing it from the kwargs and relying on the model's default. + # + # For more details, refer to the following code snippet: + # - build/python_env/lib/python3.10/site-packages/transformers/generation/utils.py:2404 => + # => model_kwargs["cache_position"] = torch.arange(cur_len, device=input_ids.device) + # + # This shows that cache_position is generated internally during pipeline setup, and is not expected to be + # provided as input for the model. + + logger.warning("Removing cache_position from kwargs. It is not expected to be provided as input for the model.") + kwargs.pop("cache_position", None) + input_ids = torch.nn.functional.pad(input_ids, (0, pad_len), value=self.pad_token_id) attention_mask = torch.ones_like(input_ids) attention_mask[:, -pad_len:] = 0 diff --git a/pybuda/pybuda/ttcluster.py b/pybuda/pybuda/ttcluster.py deleted file mode 100644 index 1ca3df3d2..000000000 --- a/pybuda/pybuda/ttcluster.py +++ /dev/null @@ -1,244 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import os -import threading -import queue -import time - -from typing import Optional, Union, List - -from loguru import logger - -from .ttdevice import TTDevice -from .optimizers import Optimizer -from .pybudaglobal import is_silicon -from pybuda._C import NodeEpochType, DataFormat -from pybuda._C.backend_api import BackendDevice, BackendType - -class TTCluster(TTDevice): - """ - TTCluster represents a group of Tenstorrent devices on one system. A single model can be spread over all - devices in a cluster. - """ - def __init__(self, - name: str, - cluster_size: int = 0, - arch: Optional[BackendDevice] = None, - devtype: Optional[BackendType] = None, - optimizer: Optional[Optimizer] = None, - fp32_fallback: DataFormat = DataFormat.Float16_b, - param_fp32_fallback: DataFormat = DataFormat.Float16_b, - mp_context = None): - """ - Initialize a cluster of Tenstorrent devices on same system. All parameters pass through to underlying - individual devices. - - For development and debug purposes, device model or golden-model can be used instead of silicon. Device - model is the default in development mode. - - Parameters - ---------- - name: str - Device name - - cluster_size: int, optional - Number of devices from the current system to be used in this cluster. If not provided, the maximum will - be automatically assigned. - - devtype: BackendType, optional - Type of Tenstorrent device. Only used to run testing on models of devices instead of a real silicon chip. - - optimizer: Optimizer, optional - PyBuda optimizer to be used on this device. Mandatory if running in training mode. - - fp32_fallback: DataFormat - If TT device doesn't support FP32, tensors will fall back to this format. Bfloat16 is the default.
- - mp_context: mp.context, optional - Optioanlly override Python multi-processing context when creating mp queues - """ - super().__init__(name, 0, arch, devtype, optimizer, fp32_fallback, param_fp32_fallback, mp_context) - self.cluster_size = cluster_size - - # List of op-names to map to ordered chip breaks - self.device_start_ops: List[str] = [] - - def get_cluster_size(self) -> int: - """ - Return the number of devices in the given cluster. - - Returns - ------- - int - Number of devices in the cluster - """ - # Figure out the number of devices on the system, and make sure that we're only setting - # breaks for less or equal to that - - if not is_silicon(self.devtype): - return 1 - - if self.cluster_size == 0: - return buda.get_number_of_chips(self._get_gstate()) - - max_chips = buda.get_number_of_chips(self._get_gstate()) - if self.cluster_size > max_chips: - raise RuntimeError(f"Cluster size ({self.cluster_size}) set to more than available devices ({max_chips})") - return self.cluster_size - - def _init_concurrent_run(self): - """ - Callback before concurrent processes are launched - """ - if self.devtype == TTDeviceType.Silicon: - if self.cluster_size == 0: - self.cluster_size = self.get_cluster_size() - self.feeder_thread_queues = [queue.Queue() for _ in range(self.cluster_size)] - self.feeder_thread = [ - threading.Thread(target=self._run_feeder_thread, args=(self.feeder_thread_queues[device], device)) - for device in range(self.cluster_size)] - - for t in self.feeder_thread: - t.start() - - def _send_to_feeder_thread(self, cmd: str, count: int = 1, wait: bool = False): - """ - Push command `count` times to the feeder threads - """ - assert self.feeder_thread_queues is not None - for i in range(count): - for q in self.feeder_thread_queues: - q.put(cmd) - - if wait: - for q in self.feeder_thread_queues: - while not q.empty(): - if self.shutdown_event is not None and self.shutdown_event.is_set(): - if self.final_barrier is not None: - self.final_barrier.abort() - return # got a signal to shutdown and end the process - time.sleep(0.01) - - def _run_forward(self, copy_inputs: bool): - """ - Sequential run forward - """ - assert self.cluster_size > 0 - for device_index in range(self.cluster_size): - buda.run(self.gstate, 0, NodeEpochType.Forward, self.devtype, copy_inputs, False, self._perf_desc, device_index) - - def _run_backward(self, copy_inputs: bool): - """ - Sequential run backward - """ - assert self.cluster_size > 0 - for device_index in reversed(range(self.cluster_size)): - buda.run(self.gstate, 0, NodeEpochType.Backward, self.devtype, copy_inputs, False, self._perf_desc, device_index) - - def _run_optimizer(self): - """ - Sequential run of optimizer - """ - assert self.cluster_size > 0 - for device_index in reversed(range(self.cluster_size)): - buda.run(self.gstate, 0, NodeEpochType.Optimizer, self.devtype, True, False, self._perf_desc, device_index) - - def _run_forward_on_device(self, device_index: int, copy_inputs: bool = True): - """ - Run from feeder thread - """ - buda.run(self.gstate, 0, NodeEpochType.Forward, self.devtype, copy_inputs, False, self._perf_desc, device_index) - - def _run_backward_on_device(self, device_index: bool, copy_inputs: bool = True): - """ - Run from feeder thread - """ - buda.run(self.gstate, 0, NodeEpochType.Backward, self.devtype, copy_inputs, False, self._perf_desc, device_index) - - def _run_optimizer_on_device(self, device_index: int): - """ - Run from feeder thread - """ - buda.run(self.gstate, 0, NodeEpochType.Optimizer, self.devtype, 
True, False, self._perf_desc, device_index) - - def _run_zero_grad_on_device(self, device_index: int): - buda.zero_grad(gstate=self._get_gstate(), graph_id=0, device_name=self.devtype, chip_id=device_index) - for name, value in self.optimizer.get_param_dict().items(): - buda.push_optimizer_parameter(self._get_gstate(), 0, value, name, self.devtype, device_index) - - def _run_feeder_thread(self, cmdqueue: queue.Queue, device_index: int): - """ - A thread that feeds the epoch programs into the device, in background - """ - - logger.info("Feeder thread on {}, device index {} starting", self, device_index) - while True: - cmd = cmdqueue.get() - logger.info("Run feeder thread {} cmd: {}", device_index, cmd) - if cmd == "fwd": - self._run_forward_on_device(copy_inputs=True, device_index=device_index) - elif cmd == "bwd": - self._run_backward_on_device(copy_inputs=True, device_index=device_index) - elif cmd == "opt": - self._run_optimizer_on_device(device_index=device_index) - elif cmd == "zero_grad": - self._run_zero_grad_on_device(device_index=device_index) - elif cmd == "quit": - break - else: - raise RuntimeError(f"Invalid feeder thread command: {cmd}") - - def _post_graph_callback(self): - """ - Called after buda graph has been generated, but the compile process hasn't yet happened. - """ - TTDevice._post_graph_callback(self) - - if is_silicon(self.devtype): - # Figure out if we're trying to use too many chips - self.cluster_size = self.get_cluster_size() - if len(self.device_start_ops) >= self.cluster_size: - raise RuntimeError(f"Too many chip breaks ({len(self.device_start_ops)}) set for devices available ({self.cluster_size})") - if len(self.device_start_ops) + 1 < self.cluster_size: - self.cluster_size = len(self.device_start_ops) + 1 - logger.info("Reducing cluster size to {} due to smaller number of chip break ops.", self.cluster_size) - - # Register chip breaks with gstate - for op_on_new_chip in self.device_start_ops: - buda.place_on_new_chip(self._get_gstate(), op_on_new_chip) - - - def set_device_start_op(self, op_name: Union[str, List[str]]): - """ - This call allows manual paritition of a single model across multiple devices on this cluster. Each op provided - (by name) will the "first" op on the next device in sequence. - - Parameters - ---------- - op_name: Union[str, List[str]] - An op, or a list of ops - """ - if isinstance(op_name, str): - self.device_start_ops.append(op_name) - else: - if not isinstance(op_name, (list, tuple)): - raise RuntimeError("Invalid op_name parameter") - self.device_start_ops.extend(op_name) - - def _get_bw_tilizer_target_device_id(self): - """ - Return the device_id that we push backward inputs to. That's the last device in chain. 
- """ - if self.cluster_size == 0: - self.cluster_size = self.get_cluster_size() - assert self.cluster_size > 0 - return self.cluster_size - 1 - - def _shutdown_threads(self): - if self.feeder_thread is not None: - for q in self.feeder_thread_queues: - q.put("quit") - for t in self.feeder_thread: - t.join() - self.feeder_thread = None diff --git a/pybuda/pybuda/ttdevice.py b/pybuda/pybuda/ttdevice.py deleted file mode 100644 index e5f13b5e9..000000000 --- a/pybuda/pybuda/ttdevice.py +++ /dev/null @@ -1,1613 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -from typing import Optional, List, Tuple, Union, Dict, Set -from collections import deque -import os -import queue -import inspect -import copy - -import torch -import torch.multiprocessing as mp -from multiprocessing.synchronize import Event as EventClass -from multiprocessing.synchronize import Barrier as BarrierClass -from loguru import logger - -from .device import Device -from .pybudaglobal import PYBUDA_DEVMODE, lazy_trace_data, is_silicon, profiler, state_changed, set_state_changed, clear_state_changed, start_tracing, stop_tracing, reset_unique_node_id -from .module import Module, PyBudaModule -from .tensor import Tensor, to_pt_tensors, remove_microbatch, consteval_input_bw, consteval_shape -from .parameter import Parameter -from .optimizers import Optimizer -from .schedulers import LearningRateScheduler - -from pybuda._C.graph import Graph, create_op_node, create_data_edge, create_parameter_input, create_activation_input, create_output, create_constant_input, create_target_input, add_partial_datacopy_edge, RuntimeTensorTransform, RuntimeTensorTransformType, Shape, OpType -from pybuda._C.graph import eval as graph_eval -from pybuda._C import DataFormat -from pybuda.tvm_utils import flatten_inputs - -from .pybudaglobal import TILE_DIM, create_queue -from .verify import VerifyConfig -from .config import CompilerConfig, _get_global_compiler_config -from .backend import BackendAPI -from pybuda._C.backend_api import BackendDevice, BackendType, DeviceMode, StrideDescriptor, DramIODesc, DeviceConfig, get_device_descs_for_available_devices, get_custom_device_desc, get_device_cluster_yaml, load_cached_sys_param -from .device_connector import ( - DeviceConnector, - TransferType, - DirectPopperDeviceConnector, - OutputQueueDirectPoppperDeviceConnector, - InputQueueDirectPusherDeviceConnector, - DirectPusherPopperDeviceConnector) - - -class TTDevice(Device): - """ - TTDevice represents one or more Tenstorrent devices that will receive modules to run. - """ - - def __init__(self, - name: str, - num_chips: int = None, - chip_ids: Union[List[int], List[Tuple[int]]] = None, - arch: Optional[BackendDevice] = None, - devtype: Optional[BackendType] = None, - device_mode: Optional[DeviceMode] = None, - optimizer: Optional[Optimizer] = None, - scheduler: Optional[LearningRateScheduler] = None, - fp32_fallback: DataFormat = DataFormat.Float16_b, - mp_context = None, - module: Union[Module, List[Module]] = None, - ): - """ - Initialize a new Tenstorrent device. - - For development and debug purposes, device model or golden-model can be used instead of silicon. Device - model is the default in development mode. - - Parameters - ---------- - name: str - Device name - - num_chips: int, optional - On a system with multiple Tenstorrent silicon devices available, one TTDevice can span more than one chip by setting this parameter to more than 1. - This allows a larger model to be spread over multiple chips. 
- - To take all available devices, set num_chips to 0. - - chip_ids: Union[List[int], List[Tuple[int]]], optional - By default, TTDevice will allocate the first available set of chips. If the application requires a particular chip, or set of chips, to be used, - chip_ids allows the user to pick particular ones. - The user can directly provide the chip_ids or the coordinates of the chips on Nebula/Galaxy systems. - - arch: BackendDevice, optional - Which Tenstorrent chip arch (GRAYSKULL, WORMHOLE etc.) - - devtype: BackendType, optional - Type of Tenstorrent device. Only used to run testing on models of devices instead of a real silicon chip. - - optimizer: Optimizer, optional - PyBuda optimizer to be used on this device. Mandatory if running in training mode. - - fp32_fallback: DataFormat - If TT device doesn't support FP32, tensors will fall back to this format. Bfloat16 is the default. - - mp_context: mp.context, optional - Optioanlly override Python multi-processing context when creating mp queues - - module: Union[Module, List[Module]], optional - Optionally place given module(s) one the device has been created - """ - super().__init__(name, mp_context) - self.optimizer = optimizer - self.scheduler = scheduler - self.fp32_fallback = fp32_fallback - - self._checkpoint_interval = 0 - self._unused_parameters = set() # populated during `self.generate_graph`; records unused params - - #self._perf_dump_mode: buda.PerfDumpMode = buda.PerfDumpMode.Disable - #self._perf_desc: Optional[buda.PerfDesc] = None - - self.backend_api : Optional[BackendAPI] = None - - self.devtype = (BackendType.Golden if PYBUDA_DEVMODE() else BackendType.Silicon) if devtype is None else devtype - if self.devtype != BackendType.Silicon: - if "GOLDEN_WORMHOLE_B0" in os.environ: - if arch is None: - arch = BackendDevice.Wormhole_B0 - elif "GOLDEN_WORMHOLE" in os.environ: - if arch is None: - arch = BackendDevice.Wormhole - else: - if arch is None: - arch = BackendDevice.Grayskull - - # chip_ids num_chips chip_ids used - # --------------------------------------------- - # None or [] None [0] - # None or [] 0 [0..num_devices_detected-1] - # None or [] 2 [0,1] - # [0,1] None or 2 [0,1] - # [0,1] not 2 error - - if chip_ids is None or len(chip_ids) == 0: - if num_chips is None: - self.chip_ids = [0] - self.num_chips = 1 - else: - self.chip_ids = list(range(num_chips)) - self.num_chips = num_chips - else: - assert num_chips is None or len(chip_ids) == num_chips, f"num_chips:{num_chips} does not match chip_ids:{chip_ids}" - self.chip_ids = chip_ids - self.num_chips = len(chip_ids) - - self.arch = arch - - # device_mode is modified when we execute device save/load - self.device_mode = DeviceMode.CompileAndRun if device_mode is None else device_mode - - self.graph = None - self.intermediate_tensors = {} - self.compiled_netlists = [] - self.allocated_blocks = [] - self.current_host_address = 0 - self._active_subgraph = 0 - reset_unique_node_id() - - if module is not None: - if not isinstance(module, list): - module = [module] - for m in module: - self.place_module(m) - - def __repr__(self): - return f"TTDevice '{self.name}'" - - def get_device_config(self, compiler_cfg=None): - """ - Figure out which silicon devices will be used, if in silicon mode - """ - - # Call get_device_config here without device.yaml if: - # (1) it's running on compute machine with targeting Golden backend (devtype = Golden) - # (2) it's running on compute machine but targeting Silicon device (eg. 
generating TTI on compute machine) - # For the following case, get_device_config is not called here, but later with device.yaml obtained from backend - # (3) it's running on Silicon machine with setting device-mode to CompileOnly (eg. generating TTI on silion machine) - device_descs = get_device_descs_for_available_devices(compiler_cfg.backend_output_dir) # possibly modify here - if self.devtype != BackendType.Silicon or (self.device_mode == DeviceMode.CompileOnly and len(device_descs) == 0): - assert self.arch is not None, "Unknown arch for non-silicon compile" - - # retrieve harvested cfg if devtype is set to Silicon (i.e. TTI) - harvesting_mask = 0 - if len(compiler_cfg.backend_runtime_params_path) > 0: - cached_syslevel_runtime_param = load_cached_sys_param(compiler_cfg.backend_runtime_params_path) - harvesting_mask = int(cached_syslevel_runtime_param["system-device0-harvesting_mask"]) - default_device_desc = get_custom_device_desc(self.arch, mmio=True, harvesting_mask=harvesting_mask) - return get_device_config(self.arch, - self.chip_ids, - compiler_cfg.backend_cluster_descriptor_path, - compiler_cfg.backend_runtime_params_path, - compiler_cfg.store_backend_db_to_yaml, - self.devtype, - default_device_desc.soc_desc_yaml, - backend_output_dir=compiler_cfg.backend_output_dir, - backend_device_descriptor_path_override=compiler_cfg.backend_device_descriptor_path) - - device_list = [d.arch for d in device_descs if d.mmio] - if len(device_list) == 0: - raise RuntimeError("No Tenstorrent devices present.") - - for desc in device_descs: - assert desc.arch == device_descs[0].arch, f"Device {desc.arch} architecture doesn't match the system" - detected_arch = device_list[0] - - if self.arch: - assert detected_arch == self.arch, f"User constructed a TTDevice of {self.arch} but detected: {detected_arch}" - self.arch = detected_arch - - # Pick chips ids based on the arch - if len(self.chip_ids) == 0: - self.num_chips = len(device_list) - self.chip_ids = list(range(self.num_chips)) - - first_id = 0 - # if PYBUDA_NEBULA_GALAXY_PLACER is specified, use soc_desc of unharvested_chip - if "PYBUDA_NEBULA_GALAXY_PLACER" in os.environ: - for device_id, desc in enumerate(device_descs): - if desc.harvesting_mask == 0: - first_id = device_id - break - - soc_desc = device_descs[first_id].soc_desc_yaml - cluster_yaml = get_device_cluster_yaml(compiler_cfg.backend_output_dir) if compiler_cfg.backend_cluster_descriptor_path == "" else compiler_cfg.backend_cluster_descriptor_path - dev_cfg = get_device_config(self.arch, - chip_ids=self.chip_ids, - backend_cluster_descriptor_path=cluster_yaml, - backend_runtime_params_path=compiler_cfg.backend_runtime_params_path, - store_backend_db_to_yaml=compiler_cfg.store_backend_db_to_yaml, - backend_type=self.devtype, - device_yaml=soc_desc, - backend_output_dir=compiler_cfg.backend_output_dir, - backend_device_descriptor_path_override=compiler_cfg.backend_device_descriptor_path) - - # NOTE: followings should be removed when decweek3 uplift is merged - if "PYBUDA_FORCE_EMULATE_HARVESTED" in os.environ and dev_cfg.grid_size.r == 10: # non-harvested - if "TT_BACKEND_HARVESTED_ROWS" not in os.environ: - if self.arch == BackendDevice.Wormhole_B0 or self.arch == BackendDevice.Wormhole: - os.environ["TT_BACKEND_HARVESTED_ROWS"] = "2048" - else: - os.environ["TT_BACKEND_HARVESTED_ROWS"] = "2050" - dev_cfg = get_device_config(self.arch, - chip_ids=self.chip_ids, - backend_cluster_descriptor_path=cluster_yaml, - backend_runtime_params_path=compiler_cfg.backend_runtime_params_path, - 
store_backend_db_to_yaml=compiler_cfg.store_backend_db_to_yaml, - backend_type=self.devtype, - device_yaml=soc_desc, - backend_output_dir=compiler_cfg.backend_output_dir, - backend_device_descriptor_path_override=compiler_cfg.backend_device_descriptor_path) - return dev_cfg - - - def place_module(self, module: Union[Module, Tuple[Module], List[Module]]): - - if not isinstance(module, (tuple, list)): - module = (module,) - - for m in module: - if not isinstance(m, Module): - raise RuntimeError("Only PyBuda modules can be placed on TTDevices at this time.") - - Device.place_module(self, module) - - def _initialize(self, - training: bool, - sequential: bool, - final_barrier: Optional[BarrierClass] = None, - shutdown_event: Optional[EventClass] = None, - scale_loss: float = 1.0, - checkpoint_interval: int = 0, - perf_trace: bool = False): - """ - Initialize the Tenstorrent device. - - Parameters - ---------- - training: bool - If true, create optimizer and schedulers for training, linking them to the modules on the device - - sequential: bool - Set sequential/concurrent mode for this device - - final_barrier: mp.Event, optional - If provided, forward will wait for the wait event before completing, allowing processes and queues to - be alive until everything has completed. - - shutdown_event: mp.Event, optional - If provided, forward will trigger the event in case of an exception, letting other processes know to - shut down. This should always be set in concurrent mode. - - scale_loss: float, optional - If this device is calculating loss, multiply the value with scale_loss after calculating it - NOT CURRENTLY SUPPORTED ON TTDEVICE - - checkpoint_interval: int, optional - The weights will be checkpointed into checkpoint queues on host every `checkpoint_interval` optimizer - steps, if set to non-zero. Zero by default. - - perf_trace: bool, optional - Set performance tracing mode when running on silicon - """ - - Device._initialize(self, sequential, final_barrier, shutdown_event) - - self._training = training - self._checkpoint_interval = checkpoint_interval - self._perf_trace = perf_trace - - if self._checkpoint_interval > 0: - self._checkpoint_queues: Dict[str, queue.Queue] = {} - self._optimizer_state_checkpoint_queues: Dict[str, queue.Queue] = {} - # Create queues for each of the parameters - mp_context = mp.get_context('spawn') - for module in self.modules: - for parameter in module.get_parameters(): - name = parameter.get_name() - if name in self._checkpoint_queues or name in self._optimizer_state_checkpoint_queues: - raise RuntimeError(f"Duplicate parameter name found on device {self}: {name}") - - self._checkpoint_queues[name] = create_queue(mp_context) - self._optimizer_state_checkpoint_queues[name] = create_queue(mp_context) - - def remove_modules(self): - """ - Remove placed modules, and clear the device - """ - self._compiled = False - self._compile_output = {} - - if self.backend_api: - self.backend_api.shutdown() - self.backend_api = None - - Device.remove_modules(self) - - def set_active_subgraph(self, subgraph_index: int): - """ - Set the currently active subgraph by limiting the io queues. 
- """ - full_io_queues = copy.copy(self._io_queues) - self._active_subgraph = subgraph_index - forward_in_push = {} - for k, v in self._io_queues["forward_in_push"].items(): - forward_in_push[k] = [] - for i, sgi in enumerate(self._compiled_graph_state.ordered_input_subgraph_indices): - if (sgi == subgraph_index): - forward_in_push[k].append(self._io_queues["forward_in_push"][k][i]) - - forward_out_pop = {} - for k, v in self._io_queues["forward_out_pop"].items(): - forward_out_pop[k] = [] - for i, sgi in enumerate(self._compiled_graph_state.ordered_output_subgraph_indices): - if (sgi == subgraph_index): - forward_out_pop[k].append(self._io_queues["forward_out_pop"][k][i]) - - self.set_dram_io_queues("forward_in_push", **forward_in_push) - self.set_dram_io_queues("forward_out_pop", **forward_out_pop) - - # restore to the full set - self._io_queues = full_io_queues - - def get_active_subgraph(self): - """ - Gets the currently active subgraph. - """ - return self._active_subgraph - - - def generate_graph(self, - *inputs: Tensor, - target_tensors: List[Tensor] = [], - return_intermediate: bool = False, - graph_name: str = "default_graph", - compiler_cfg: Optional[CompilerConfig] = None, - trace_only: bool = False, - verify_cfg: Optional[VerifyConfig] = None) -> Tuple[Graph, Tuple[Tensor, ...], Dict[str, Tensor], Tuple[Tensor, ...], Optional[Tensor]]: - """ - Generate a buda graph from the modules on the device, and return the graph and output tensors. - If input tensors have a value set, the output tensor will also have the calculated output value - set. - - Parameters - ---------- - inputs: Tuple[Tensor, ....] - Input tensors - - target_tensors: List[Tensor] - Target inputs. Optional, if trace_only is set. Otherwise, value must be provided. - - return_intermediate: bool - Optional. If set, a dictionary of node IDs -> tensors will be return with intermediate values, for data mismatch debug. - - trace_only: bool - If set, the graph is made for a quick trace only and shouldn't have side-effects - - Returns - ------- - Graph, Tuple[Tensor, ...], Dict[str, Tensor], Tuple[Tensor, ...], Optional[Tensor] - Buda graph, outputs, optional intermediates, original inputs, target tensor - """ - - output_to_module_name_prefix = {} - output_to_subgraph_index = {} - - # Create the graph - graph = Graph(graph_name) - graph.set_microbatch(1) - - if compiler_cfg is None: - compiler_cfg = _get_global_compiler_config() - - graph.set_enable_training(compiler_cfg.enable_training) - - reset_unique_node_id() - - # Trace through the modules - all_subgraph_outputs = [] - outputs = inputs - for idx, module in enumerate(self.modules): - if compiler_cfg.compile_subgraphs: - outputs = inputs[idx] - - if not isinstance(module, PyBudaModule): - # TODO multiple modules and mixing of pybuda and pytorch modules. 
- from .tvm import compile_tvm_for_pybuda - - - # Convert to target format, and fall-back from fp32 if that's what left - # Getting "unsupported scalar BFloat16 error" - #pytorch_inputs = (t.to_format(t.data_format).value() if isinstance(t, Tensor) else t for t in inputs) - #pytorch_inputs = tuple(t.type(buda_dataformat_to_pytorch_dtype(self.fp32_fallback)) if t.dtype == torch.float32 else t for t in pytorch_inputs) - pytorch_inputs = to_pt_tensors(inputs) - - prev_state = state_changed() - graph, buda_module, inputs, outputs, intermediate = compile_tvm_for_pybuda(graph, module, pytorch_inputs, compiler_cfg, graph_name, verify_cfg=verify_cfg) - if not trace_only: - self.modules.remove(module) - self.modules.insert(0, buda_module) - if not(prev_state): - clear_state_changed() - return graph, outputs, intermediate, inputs, target_tensors - - start_tracing() - if module == self.loss_module: - if len(target_tensors) == 0: - assert trace_only, "Target tensors must be provided for each output if generate_graph is not in trace only mode" - target_tensors = [Tensor.create_from_trace(None, out.shape, out.data_format) for out in outputs] - - assert len(target_tensors) == len(outputs), "Invalid number of target tensor for outputs" - if len(outputs) == 1: - outputs = module.forward(outputs[0], target_tensors[0]) - else: - outputs = module.forward(tuple(outputs), tuple(target_tensors)) - else: - outputs = module.forward(*outputs) - stop_tracing() - if isinstance(outputs, Tensor): - outputs = (outputs,) # Force a tuple - - for output in outputs: - output_to_module_name_prefix[output] = module.get_name() - if compiler_cfg.compile_subgraphs: - assert output not in output_to_subgraph_index, "Output tensor {} is produced by multiple modules".format(output) - - output_to_subgraph_index[output] = module.subgraph_idx - - if compiler_cfg.compile_subgraphs == False and idx == len(self.modules) - 1: - all_subgraph_outputs += outputs - elif compiler_cfg.compile_subgraphs == True: - all_subgraph_outputs += outputs - - - if trace_only: - return graph, all_subgraph_outputs, {}, inputs, target_tensors - - visited_tensors = {} - pending_tensors = deque() - intermediate = {} - module_input_tensor_to_node: Dict[str, Tensor] = {} - module_output_tensor_to_node: Dict[str, Tensor] = {} - module_target_tensor_to_node: Dict[str, Tensor] = {} - module_loopback_tensor_to_node: Dict[str, Tensor] = {} - passthroughs: Set = set() - - input_node_names = [] - input_names_known = True - if isinstance(inputs[0], Tensor): - inputs = (inputs,) - for index, (module, submodule_input) in enumerate(zip(self.modules, inputs)): - submodule_input_node_names = list(inspect.signature(super(PyBudaModule, module).__getattribute__("forward")).parameters.keys()) - if len(self.modules) > 1: - submodule_input_node_names = [f"{input_name}_{index}" for input_name in submodule_input_node_names] - input_node_names += submodule_input_node_names - if len(submodule_input_node_names) != len(submodule_input): - input_names_known = False - inputs, _, _ = flatten_inputs(inputs) - - for out in all_subgraph_outputs: - is_loss_output = self.loss_module is not None - if out.src_op is None: - - # No source op. 
It could be a pass-through, so let's compare to inputs - found = False - for input in inputs: - if input == out: - # Found a passthrough - outq = create_output(graph, - output_to_module_name_prefix.get(out, "") + f".output_passthrough_{len(passthroughs)}", - out.shape.get_pytorch_shape(), - out.data_format, - is_loss_output, - output_to_subgraph_index.get(out, 0)) - passthroughs.add(input) - found = True - break - - if not found: - raise RuntimeError("Untraced output tensor encountered") - - else: - outq = create_output(graph, - output_to_module_name_prefix.get(out, "") + ".output_" + out.src_op.name, - out.shape.get_pytorch_shape(), - out.data_format, - is_loss_output, - output_to_subgraph_index.get(out, 0)) - module_output_tensor_to_node[out] = outq - pending_tensors.append( (out, outq, 0, [], output_to_subgraph_index.get(out, 0)) ) - - recorded_parameters = {} - - while pending_tensors: - - tensor, output, port_index, operand_broadcast, subgraph_idx = pending_tensors.popleft() - - if tensor in visited_tensors: - # Already created the note - let's add the edge and move on - create_data_edge(graph, visited_tensors[tensor], 0, output, port_index, operand_broadcast) - continue - - if isinstance(tensor, int): - # integer constant. Don't add to visited tensors. - assert False # not supported any more - - if isinstance(tensor, Parameter): - # parameter tensor - if tensor.get_name() is not None: - name = tensor.get_name() - else: - name = "parameter_" + graph.get_node_name(output) - - if name in recorded_parameters: - # Multiple subgraphs might use the same parameter. If it is used in the same subgraph, - # we should have already found it in the visited_tensors dictionary. Putting an assert here - # to catch fallouts. - assert graph.get_subgraph_id_for_node(recorded_parameters[name]) != subgraph_idx, \ - "Trying to add parameter with name: {} that is used in the same subgraph".format(name) - create_data_edge(graph, recorded_parameters[name], 0, output, port_index, operand_broadcast) - continue - - inq = create_parameter_input( - graph, - name, - tensor.shape.get_pytorch_shape(), - tensor.requires_grad, - tensor.data_format, - subgraph_idx) - create_data_edge(graph, inq, 0, output, port_index, operand_broadcast) - visited_tensors[tensor] = inq - recorded_parameters[name] = inq - continue - - if tensor.src_op is None: - input_name = input_node_names[inputs.index(tensor)] if input_names_known and tensor in inputs else "input_" + str(port_index) + "_" + graph.get_node_name(output) - if tensor in passthroughs: - # passthrough input->output, add a nop - inq = create_activation_input( - graph, - input_name, - tensor.shape.get_pytorch_shape(), - tensor.requires_grad, - tensor.data_format, - subgraph_idx) - - nop = create_op_node(graph, f"_passthrough_nop_{output}", - OpType("nop"), tensor.shape.get_pytorch_shape(), tensor.data_format, subgraph_idx, {}) - - create_data_edge(graph, inq, 0, nop, 0, operand_broadcast) - create_data_edge(graph, nop, 0, output, 0, operand_broadcast) - visited_tensors[tensor] = inq - module_input_tensor_to_node[tensor] = inq - continue - - elif tensor in target_tensors: - # Target input - inq = create_target_input( - graph, - input_name, - tensor.shape.get_pytorch_shape(), - tensor.requires_grad, - tensor.data_format, - subgraph_idx) - create_data_edge(graph, inq, 0, output, port_index, operand_broadcast) - visited_tensors[tensor] = inq - module_target_tensor_to_node[tensor] = inq - continue - - elif tensor.is_constant(): - # Target input - inq = create_constant_input( - 
graph, - input_name, - tensor.value(), - tensor.shape.get_pytorch_shape(), - tensor.data_format, - subgraph_idx) - create_data_edge(graph, inq, 0, output, port_index, operand_broadcast) - visited_tensors[tensor] = inq - module_target_tensor_to_node[tensor] = inq - continue - - else: - # input tensor - input_creator = create_activation_input if input_name not in compiler_cfg.loopback_outputs else create_parameter_input - - if input_name in compiler_cfg.loopback_outputs: - module.add_parameter(input_name, Parameter(tensor.value(), requires_grad=True, name=input_name)) - - inq = input_creator( - graph, - input_name, - tensor.shape.get_pytorch_shape(), - tensor.requires_grad, - tensor.data_format, - subgraph_idx) - create_data_edge(graph, inq, 0, output, port_index, operand_broadcast) - visited_tensors[tensor] = inq - if input_name not in compiler_cfg.loopback_outputs: - module_input_tensor_to_node[tensor] = inq - elif input_name in compiler_cfg.loopback_outputs: - module_loopback_tensor_to_node[tensor] = inq - recorded_parameters[input_name] = inq - continue - - elif tensor.src_op.op_type == "constant": - constant_value = tensor.src_op.attrs[0] - constant = create_constant_input( - graph, - "constant_" + str(port_index) + "_" + graph.get_node_name(output), - constant_value, - tensor.data_format, - subgraph_idx) - - create_data_edge(graph, constant, 0, output, port_index, operand_broadcast) - visited_tensors[tensor] = constant - continue - - ''' - print("ttdevice.py, create_op_node") - print(f"graph type: {type(graph)}") - print(f"src_op name: {tensor.src_op.name}") - print(f"src_op op_type: {tensor.src_op.op_type}") - print(f"src_op attrs: {tensor.src_op.attrs}") - print(f"shape: {tensor.shape.get_pytorch_shape()}") - print(f"data format: {tensor.data_format}") - ''' - - tags = {} - if tensor.src_layer is not None: - tags["layer"] = tensor.src_layer - op = create_op_node(graph, tensor.src_op.name, tensor.src_op.cpp_op_type, tensor.shape.get_pytorch_shape(), tensor.data_format, subgraph_idx, tags) - - visited_tensors[tensor] = op - if return_intermediate and tensor.has_value(): - intermediate[op] = tensor.value() - - create_data_edge(graph, op, 0, output, port_index, operand_broadcast) - - for i, t in enumerate(tensor.src_op.operands): - pending_tensors.append( (t, op, i, tensor.src_op.operand_broadcast, subgraph_idx) ) - - # Register input/output order of the module to the graph now that the nodes are created - module_inputs = [module_input_tensor_to_node[input_tensor] for input_tensor in inputs if input_tensor in module_input_tensor_to_node] - module_outputs = [module_output_tensor_to_node[output_tensor] for output_tensor in all_subgraph_outputs if output_tensor in module_output_tensor_to_node] - module_targets = [module_target_tensor_to_node[target_tensor] for target_tensor in target_tensors] - out_requires_grad = [output_tensor.requires_grad for output_tensor in all_subgraph_outputs if output_tensor in module_output_tensor_to_node] - - # Remove unused inputs from list of module inputs - inputs = [input_tensor for input_tensor in inputs if input_tensor in module_input_tensor_to_node or input_tensor in module_output_tensor_to_node] - - # Remove loopback inputs from list of module inputs - inputs = [input_tensor for input_tensor in inputs if input_tensor not in module_loopback_tensor_to_node] - - if len(compiler_cfg.loopback_outputs): - output_to_remove = [] - out_requires_grad_to_remove = [] - for input_name, output_indices in compiler_cfg.loopback_outputs.items(): - if 
isinstance(output_indices, int): - output_indices = [output_indices] - for output_index in output_indices: - input_id = graph.get_node_id(input_name) - output_id = module_outputs[output_index] - add_partial_datacopy_edge(graph, output_id, 0, input_id, 0) - output_to_remove.append(module_outputs[output_index]) - out_requires_grad_to_remove.append(out_requires_grad[output_index]) - [module_outputs.remove(value) for value in output_to_remove] - [out_requires_grad.remove(value) for value in out_requires_grad_to_remove] - - graph.register_module_inputs(module_inputs) - graph.register_module_targets(module_targets) - graph.register_module_outputs(module_outputs, out_requires_grad) - - for parameter in self.get_parameters(): - parameter_name = parameter.get_name() - if parameter_name not in recorded_parameters: - self._unused_parameters.add(parameter_name) - - if return_intermediate: - return graph, outputs, intermediate, inputs, target_tensors - - return graph, outputs, {}, inputs, target_tensors - - def compile_for(self, - inputs: Tuple[Tensor, ...], - compiler_cfg: CompilerConfig, - targets: List[Tensor] = [], - microbatch_size: int = 0, - microbatch_count: int = 1, - verify_cfg: Optional[VerifyConfig] = None, - ) -> Tuple[Tensor, ...]: - - """ - Compile modules placed on this device, with given input shapes, input formats, and microbatch size. - - Parameters - ---------- - training: bool - Specify whether to compile for training or inference. If set to true, autograd will be executed - before the compile. - - inputs: Tuple[Tensor, ...] - Tuple of input tensors. They must have shape and format set, but do not need to hold data unless - auto-verification is set. - - compiler_cfg: CompilerConfig - Compiler configuration - - targets: List[Tensor], optional - Optional list of target tensors, if this device has a loss module - - microbatch_size: int, optional - The size of microbatch. Must be non-zero for training mode. - - microbatch_count: int - Only relevant for training. This represents the number of microbatches that are pushed through - fwd path before bwd path runs. The device will ensure that buffering is large enough to contain - microbatch_count number of microbatch intermediate data. - - verify_cfg: Optional[VerifyConfig] - Optional auto-verification of compile process - - Returns - ------- - Tuple[Tensor, ...] 
- Output tensors - - - """ - if self.device_mode != DeviceMode.RunOnly: - assert not self._compiled, "Trying to compile a design that's already been compiled" - - training = compiler_cfg.enable_training - if compiler_cfg.compile_subgraphs: - input_shapes_group = [] - for input in inputs: - input_shapes_group.append(tuple(i.shape for i in input)) - input_shapes = tuple(input_shapes_group) - else: - input_shapes = tuple(i.shape for i in inputs) - # input_formats = tuple(i.data_format for i in inputs) - Device.compile_for(self, training, microbatch_size, microbatch_count) - - if training: - logger.debug("Compiling for Training mode on {}", self) - else: - logger.debug("Compiling for Inference mode on {}", self) - - self.input_shapes = input_shapes # record for checking later - - if verify_cfg is None: - verify_cfg = VerifyConfig.disabled() # no verification config provided, disable by default - - losses = None - from .compile import pybuda_compile - from .compiled_graph_state import CompiledGraphState - if self.device_mode == DeviceMode.CompileAndRun or self.device_mode == DeviceMode.CompileOnly: - graph_name = self.modules[0].get_name() - self._compile_output = pybuda_compile( - self, - graph_name, - *inputs, - compiler_cfg = compiler_cfg, - verify_cfg = verify_cfg, - losses = losses, - targets = targets, - microbatch_size = microbatch_size, - microbatch_count = microbatch_count) - - self._compiled_graph_state = CompiledGraphState.from_compiled_graph(self, self._compile_output) - - device_mode_for_backend = DeviceMode.RunOnly if "PYBUDA_SKIP_BACKEND_COMPILE" in os.environ else self.device_mode - backend_runtime_args = compiler_cfg.backend_runtime_args if "PYBUDA_FORCE_SEQUENTIAL" in os.environ else compiler_cfg.backend_runtime_args + " --concurrent-mode" - - # Set some perf defaults for WH - if self.arch == BackendDevice.Wormhole_B0: - os.environ["TT_BACKEND_MULTI_THREADED_PUSH"] = "1" - - self.backend_api = BackendAPI( - self.devtype, - self.arch, - self, - self._compiled_graph_state.netlist_filename, - self._compiled_graph_state, - not self._sequential, - None, - None, - compiler_cfg.performance_trace, - device_mode_for_backend, - verify_cfg.golden_ignore_df_precision, - compiler_cfg.backend_opt_level, - compiler_cfg.backend_output_dir, - # for nebula+galaxy, backend_device_descriptor_path is for unharvested device_desc - # creating backend with it will cause crashes when runtime tries to reset the harvested cores in nebulas - # not passing device_desc allows runtime to create unharvested&harvested device_desc's for each chip - compiler_cfg.backend_device_descriptor_path if "PYBUDA_NEBULA_GALAXY_PLACER" not in os.environ else "", - compiler_cfg.backend_cluster_descriptor_path, - backend_runtime_args) - - if self.device_mode == DeviceMode.CompileAndRun or self.device_mode == DeviceMode.RunOnly: - # Copy constants and parameters to device - probably shouldn't be part of compile, but explicit on run! - self.backend_api.push_constants_and_parameters(translate=True) - self.backend_api.push_optimizer_parameters(translate=True) - - self._compiled = True - if self._compile_output and self._compile_output.outputs: - return [t.detach() for t in self._compile_output.outputs] # detach so it can pushed into mp queues - else: - assert self.device_mode == DeviceMode.RunOnly, ( - "We should only be returning empty tensors when configuring the device from image." 
- ) - # don't necessarily need to contain any contents, but disallow auto-verification - return [ - Tensor.create_from_trace(None, shape, data_format) - for shape, data_format in - zip(self._compiled_graph_state.ordered_output_shapes, self._compiled_graph_state.ordered_output_data_formats) - ] - - def update_device_parameters(self, parameters: Dict[str, torch.Tensor]): - - assert self.backend_api - self.sync() # wait until queued up commands have completed - self.backend_api.update_device_paramaters(parameters) - - def _post_graph_callback(self): - """ - Called after buda graph has been generated, but the compile process hasn't yet happened. - """ - pass - - def forward(self, loop_count: int): - """ - Run forward pass on each module on this device, in order - - Parameters - ---------- - loop_count: int - Number of micro-batches to run - - """ - - logger.debug("Starting forward on {}", self) - assert self._compiled, f"Module not compiled yet on {self}" - assert self.backend_api is not None - - self.backend_api.schedule_run_forward(loop_count) - - def generate(self, loop_count: int, write_index: int, tokens_per_iter: int, token_id: int): - """ - Run forward pass on each module on this device, in order - - Parameters - ---------- - loop_count: int - Number of micro-batches to run - - """ - - logger.debug("Starting generate on {}", self) - assert self._compiled, f"Module not compiled yet on {self}" - assert self.backend_api is not None - - if tokens_per_iter != -1: - if tokens_per_iter == TILE_DIM: #pre-populating cache - write_index = token_id // TILE_DIM - inner_loop_count = loop_count - outer_loop_count = 1 - inner_increment = 1 - outer_increment = 0 - elif tokens_per_iter != 1: #last pre-population step - assert loop_count == 1 - write_index = token_id // TILE_DIM - inner_loop_count = 1 - outer_loop_count = 1 - inner_increment = 0 - outer_increment = 0 - else: #token generation - write_index = token_id // TILE_DIM - outer_loop_count = loop_count // TILE_DIM if loop_count > TILE_DIM else 1 - inner_loop_count = loop_count % TILE_DIM if loop_count < TILE_DIM else TILE_DIM - inner_increment = 0 - outer_increment = 1 - else: #manual_mode - inner_loop_count = loop_count - inner_increment = 0 - outer_loop_count = 1 - outer_increment = 0 - logger.debug("Generating: write_index: {}, inner_loop_count: {}, inner_increment: {}, outer_loop_count: {}, outer_increment: {}", write_index, inner_loop_count, inner_increment, outer_loop_count, outer_increment) - self.backend_api.schedule_run_generate(write_index, inner_loop_count, inner_increment, outer_loop_count, outer_increment) - - - def cpueval_forward(self, inputs: List[torch.Tensor], parameters: Dict[str, torch.Tensor], save_for_backward: bool, targets: List[torch.Tensor] = []) -> List[torch.Tensor]: - """ - Evaluate forward pass for verification - - Parameters - ---------- - inputs: List[torch.Tensor] - One input into the model (for each ordered input node) - - parameters: Dict[str, torch.Tensor] - Map of model parameters - - save_for_backward: bool - If set, input and output tensors will be saved so we can run the backward pass later. - - targets: List[torch.Tensor], optional - If we're running training, and there's a loss module on this device, provide target - - Returns - ------- - List[Tensor] - Forward graph output - """ - assert self.device_mode != DeviceMode.RunOnly, ( - "Device has been configured from image. We disallow auto-verification in this mode." 
- ) - assert self._compile_output is not None and self._compile_output.initial_graph is not None - - if save_for_backward: - self._saved_fw_inputs = inputs - - microbatch_size = self._compile_output.initial_graph.get_microbatch() - assert inputs[0].shape[0] == microbatch_size - output_list = [] - for i in range(microbatch_size): - if microbatch_size > 1: - mb_inputs = tuple(input[i:i+1] for input in inputs) - else: - mb_inputs = inputs - - if targets is not None: - mb_targets = tuple(target[i:i+1] for target in targets) - else: - mb_targets = None - output, *_ = graph_eval(self._compile_output.initial_graph, mb_inputs, parameters, self, 0.1, 1.00, targets=mb_targets) - - output_list.append(output) - - outputs = [] - for out in zip(*output_list): - outputs.append(torch.cat(out, 0)) - outputs = tuple(outputs) - - if save_for_backward: - self._saved_fw_outputs = outputs - - return outputs - - def backward(self, loop_count: int, zero_grad: bool): - """ - Run backward pass on each module on this device, in reverse order - - Parameters - ---------- - loop_count: int - Each mini-batch is broken into micro-batches. This is necessary to fill a multi-device pipeline, - and should be roughly 4-6x the number of devices in the pipeline for ideal performance. - - zero_grad: bool - Set to true to have optimizer zero out gradients before the run - """ - logger.debug("Starting backward on {}", self) - - # Since we don't support loss on ttdevice yet, we will always do a forward first, which will compile - assert self._compiled, "Model not compiled yet" - assert self.backend_api is not None - - self.backend_api.schedule_run_backward(loop_count, zero_grad) - - def _step_optimizer(self): - """ - Step optimizer - """ - logger.debug("Stepping optimizer on {}", self) - assert self.backend_api is not None - self.backend_api.schedule_run_optimizer() - - def _step_schedulers(self): - """ - Step schedulers - """ - if self.scheduler: - logger.debug("Stepping schedulers on {}", self) - assert self.backend_api is not None - self.backend_api.schedule_run_schedulers(self) - - def get_parameter_checkpoint(self) -> Dict[str, Tensor]: - """ - Return a dictionary of current parameter values for the models on this device - """ - self.sync() # wait until queued up commands have completed - assert self.backend_api is not None - ret = {} - queues = [] - shapes = [] - names = [] - for module in self.modules: - for parameter in module.get_parameters(): - if parameter.requires_grad: - name = parameter.get_name() - names.append(name) - queues.append(self.backend_api.be_api.get_queue_descriptor(name)) - constevaled_shape = consteval_shape(self._compiled_graph_state, name, parameter.value()) - shapes.append(constevaled_shape) - - values = BackendAPI.read_queues(queues, shapes, runtime_tensor_transforms=None, requires_grad= [False] * len(queues), single_output=True, rd_ptr=0, - shutdown_event=self.shutdown_event, clone=True, has_microbatch_dim=False) - for name, value in zip(names, values): - if self._training: - ret[name] = Tensor.create_from_torch(consteval_input_bw(self._compiled_graph_state, name, value.value(), False)) - else: - # We don't have a backward consteval graph recorded, return the raw paramater value - ret[name] = Tensor.create_from_torch(value.value()) - - return ret - - def get_all_parameters(self) -> Dict[str, Tensor]: - """ - Return a dictionary of current parameter values for the models on this device - """ - self.sync() # wait until queued up commands have completed - assert self.backend_api is not None - ret = {} 
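The cpueval_forward method above evaluates the compiler's initial graph one micro-batch at a time and then stitches the per-slice outputs back together along the batch dimension. A minimal sketch of that split-and-stitch pattern, with the graph evaluation replaced by a hypothetical `eval_fn` stand-in (not part of the codebase):

```python
import torch
from typing import Callable, Dict, List, Tuple

def eval_per_microbatch(
    inputs: List[torch.Tensor],
    parameters: Dict[str, torch.Tensor],
    eval_fn: Callable[..., List[torch.Tensor]],  # stand-in for the real graph evaluation
    microbatch_size: int,
) -> Tuple[torch.Tensor, ...]:
    # Evaluate each micro-batch (a size-1 slice along dim 0) independently.
    output_list = []
    for i in range(microbatch_size):
        mb_inputs = tuple(inp[i:i + 1] for inp in inputs) if microbatch_size > 1 else tuple(inputs)
        output_list.append(eval_fn(mb_inputs, parameters))
    # Group the i-th output of every micro-batch and concatenate along the batch dim.
    return tuple(torch.cat(outs, 0) for outs in zip(*output_list))
```

Targets, when a loss module is present, would be sliced per micro-batch in the same way.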
- queues = [] - shapes = [] - names = [] - for name, param in self._compiled_graph_state.post_const_eval_parameters.items(): - names.append(name) - queues.append(self.backend_api.be_api.get_queue_descriptor(name)) - shapes.append(param.shape) - - values = BackendAPI.read_queues(queues, shapes, runtime_tensor_transforms=None, requires_grad= [False] * len(queues), single_output=True, rd_ptr=0, - shutdown_event=self.shutdown_event, clone=True, has_microbatch_dim=False) - for name, value in zip(names, values): - if self._training: - ret[name] = Tensor.create_from_torch(consteval_input_bw(self._compiled_graph_state, name, value.value(), False)) - else: - # We don't have a backward consteval graph recorded, return the raw parameter value - ret[name] = Tensor.create_from_torch(value.value()) - - return ret - - - def get_parameter_gradients(self) -> Dict[str, Tensor]: - """ - Return a dictionary of currently accumulated gradient values for the models on this device - """ - self.sync() # wait until queued up commands have completed - assert self.backend_api is not None - ret = {} - queues = [] - shapes = [] - names = [] - for module in self.modules: - for parameter in module.get_parameters(): - if parameter.requires_grad: - name = parameter.get_name() - queue_name = "grad_acc_" + name - constevaled_shape = consteval_shape(self._compiled_graph_state, name, parameter.value()) - names.append(name) - shapes.append(constevaled_shape) - queues.append(self.backend_api.get_output_queue_descriptor(queue_name)) - - values = BackendAPI.read_queues(queues, shapes, runtime_tensor_transforms=None, requires_grad = [False] * len(queues), single_output=True, rd_ptr=0, - shutdown_event=self.shutdown_event, clone=True, has_microbatch_dim=False) - for name, value in zip(names, values): - ret[name] = Tensor.create_from_torch(consteval_input_bw(self._compiled_graph_state, name, value.value(), False)) - - return ret - - def _model_pop_optimizer_state_checkpoint(self) -> Dict: - """ - """ - - if len(self.optimizer.get_optimizer_state_keys()) == 0: - return {} - - ret = {} - for module in self.modules: - for parameter in module.get_parameters(): - if parameter.requires_grad: - name = parameter.get_name() - tensor = parameter.get_empty_tensor().tensor - optimizer_states = buda.pop_optimizer_state_checkpoint( - self._get_gstate(), - 0, - tensor, - self.devtype - ) - - ret[name] = optimizer_states - return ret - - def _get_fw_tilizer_target_device_id(self): - """ - Return the device_id that we push forward inputs to. In single-device setup, that's always 0 - """ - return 0 - - def _get_bw_tilizer_target_device_id(self): - """ - Return the device_id that we push backward inputs to. In single-device setup, that's always 0 - """ - return 0 - - def get_parameters(self, ignore_unused_parameters: bool = True) -> List[Parameter]: - """ - Parameters - ---------- - ignore_unused_parameters: bool - If true, any parameter not being recorded by the graph-trace (i.e. parameter is unused in - graph execution) is not included in the list returned to the user.
- """ - ret: List[Parameter] = [] - for module in self.modules: - ret.extend(module.get_parameters()) - - if ignore_unused_parameters: - ret = [parameter for parameter in ret if parameter.get_name() not in self._unused_parameters] - - return ret - - def get_optimizer(self) -> Optional[Optimizer]: - return self.optimizer - - def get_optimizer_params(self, is_buda: bool) -> Dict[str, Dict[str, Tensor]]: - """ - Return a dictionary of dictionaries of optimizer parameters for each model parameter. - """ - if not self.optimizer: - return {} - - ret = {} - for param in self.get_parameters(): - if not param.requires_grad: - continue - - name = param.get_name() - optimizer_params = self.optimizer.get_optimizer_params(name, is_buda) - if optimizer_params is None: - continue - - ret[name] = optimizer_params - - return ret - - def get_scheduler_params(self, is_buda: bool) -> Dict[str, Dict[str, Tensor]]: - """ - Return a dictionary of dictionaries of optimizer parameters used by scheduler. - """ - if not self.optimizer: - return {} - - ret = {} - for param in self.get_parameters(): - if not param.requires_grad: - continue - - name = param.get_name() - optimizer_params = self.scheduler.get_scheduler_params(name, is_buda) - if optimizer_params is None: - continue - - ret[name] = optimizer_params - - return ret - - def _get_fwd_inputs_tile_broadcast_dims(self) -> List[List[int]]: - """ - Return a list of tile broadcast dims for each direct input into the device (fwd) - """ - assert self._compiled_graph_state - return self._compiled_graph_state.ordered_input_tile_broadcast_dims - - def _get_target_inputs_tile_broadcast_dims(self) -> List[List[int]]: - """ - Return a list of tile broadcast dims for each target input into the device - """ - assert self._compiled_graph_state - return self._compiled_graph_state.ordered_target_tile_broadcast_dims - - def _get_bwd_inputs_tile_broadcast_dims(self) -> List[List[int]]: - """ - Return a list of tile broadcast dims for each direct input into the device (bwd) - """ - assert self._compiled_graph_state - return self._compiled_graph_state.ordered_bw_input_tile_broadcast_dims - - def _get_input_shapes(self, grad_only: bool) -> List[Tuple[int, ...]]: - """ - Return a list of original input shapes. If `grad_only`, only return those that have requires_grad set - """ - assert self._compiled_graph_state - input_shapes = self._compiled_graph_state.ordered_input_shapes - requires_grad = self._compiled_graph_state.ordered_input_requires_grad - microbatch = self._compiled_graph_state.microbatch - - for i, in_shape in enumerate(input_shapes): - if in_shape[0] == 1: - in_shape[0] = microbatch - - if grad_only: - input_shapes = [s for i, s in enumerate(input_shapes) if requires_grad[i]] - return input_shapes - - def _adjust_shapes_for_microbatch(self, shapes: List[Tuple[int, ...]], microbatch: int) -> List[Tuple[int, ...]]: - for i, out_shape in enumerate(shapes): - if out_shape[0] != 1 and out_shape[0] != microbatch: - out_shape.insert(0, 1) - out_shape[0] = microbatch - return shapes - - - def _get_output_shapes(self, grad_only: bool) -> List[Tuple[int, ...]]: - """ - Return a list of original output shapes. 
If `grad_only`, only return those that have requires_grad set - """ - assert self._compiled_graph_state - output_shapes = self._compiled_graph_state.ordered_output_shapes - requires_grad = self._compiled_graph_state.ordered_output_requires_grad - output_shapes = self._adjust_shapes_for_microbatch(output_shapes, self._compiled_graph_state.microbatch) - - if grad_only: - output_shapes = [s for i, s in enumerate(output_shapes) if requires_grad[i]] - - return output_shapes - - def _get_intermediate_shapes(self) -> List[Tuple[int, ...]]: - assert self._compiled_graph_state - shapes = self._compiled_graph_state.ordered_intermediate_shapes - return self._adjust_shapes_for_microbatch(shapes, self._compiled_graph_state.microbatch) - - def _get_input_runtime_tensor_transforms(self) -> List["RuntimeTensorTransform"]: - assert self._compiled_graph_state - - input_runtime_tensor_transforms = self._compiled_graph_state.ordered_input_runtime_tensor_transforms - microbatch = self._compiled_graph_state.microbatch - - # If RuntimeTensorTransform is a ReinterpretShape: - # If microbatch 1, reinterpret input shapes will drop it... we need - # that unary dimension because the backwards reinterpret shapes uses - # this and the narrow tensor code expects activations and activation - # gradients to have a microbatch dimension - for i, transform in enumerate(input_runtime_tensor_transforms): - if transform.type != RuntimeTensorTransformType.ReinterpretShape: - continue - - while len(transform.reinterpreted_shape) < 3: - transform.reinterpreted_shape = Shape.create_with_type_from_other(transform.reinterpreted_shape, [1] + transform.reinterpreted_shape.as_list()) - - if transform.reinterpreted_shape.as_list()[0] not in [1, microbatch]: - transform.reinterpreted_shape = Shape.create_with_type_from_other(transform.reinterpreted_shape, [1] + transform.reinterpreted_shape.as_list()) - - if transform.reinterpreted_shape.as_list()[0] == 1: - reinterpreted_shape_as_list = transform.reinterpreted_shape.as_list() - reinterpreted_shape_as_list[0] = microbatch - transform.reinterpreted_shape = Shape.create_with_type_from_other(transform.reinterpreted_shape, reinterpreted_shape_as_list) - - return input_runtime_tensor_transforms - - def _get_output_runtime_tensor_transforms(self) -> List[List[int]]: - assert self._compiled_graph_state - - output_runtime_tensor_transforms = self._compiled_graph_state.ordered_output_runtime_tensor_transforms - microbatch = self._compiled_graph_state.microbatch - - # # If RuntimeTensorTransform is a ReinterpretShape: - # # If microbatch 1, reinterpret input shapes will drop it... 
we need - # # that unary dimension because the backwards reinterpret shapes uses - # # this and the narrow tensor code expects activations and activation - # # gradients to have a microbatch dimension - for i, transform in enumerate(output_runtime_tensor_transforms): - if transform.type == RuntimeTensorTransformType.ReinterpretShape: - if transform.reinterpreted_shape.as_list()[0] not in [1, microbatch]: - transform.reinterpreted_shape = Shape.create_with_type_from_other(transform.reinterpreted_shape, [1] + transform.reinterpreted_shape.as_list()) - - reinterpreted_shape_as_list = transform.reinterpreted_shape.as_list() - reinterpreted_shape_as_list[0] = microbatch - transform.reinterpreted_shape = Shape.create_with_type_from_other(transform.reinterpreted_shape, reinterpreted_shape_as_list) - elif transform.type == RuntimeTensorTransformType.Unpad: - if transform.unpadded_shape.as_list()[0] not in [1, microbatch]: - transform.unpadded_shape = Shape.create_with_type_from_other(transform.unpadded_shape, [1] + transform.unpadded_shape.as_list()) - - unpadded_shape_as_list = transform.unpadded_shape.as_list() - unpadded_shape_as_list[0] = microbatch - transform.unpadded_shape = Shape.create_with_type_from_other(transform.unpadded_shape, unpadded_shape_as_list) - - return output_runtime_tensor_transforms - - def _get_output_requires_grad(self) -> List[bool]: - """ - Return a list of requires_grad flags on each output - """ - assert self._compiled_graph_state - return self._compiled_graph_state.ordered_output_requires_grad - - def _get_input_requires_grad(self) -> List[bool]: - """ - Return a list of requires_grad flags on each input - """ - assert self._compiled_graph_state - return self._compiled_graph_state.ordered_input_requires_grad - - - def _create_forward_device_connector(self, target_device: Union["TTDevice", "CPUDevice"], sequential: bool, d2d_fwd_queue: Optional[queue.Queue] = None, microbatch = 1): - - logger.debug("Creating forward device connector from {} to {}", self, target_device) - if isinstance(target_device, TTDevice): - # direct transfer both ways - self.forward_dc = DirectPusherPopperDeviceConnector(self.shutdown_event, sequential, side_queue=d2d_fwd_queue) - else: - # TTDevice copies directly to host, no pushing - self.forward_dc = DirectPopperDeviceConnector(self.shutdown_event, side_queue=d2d_fwd_queue) - - target_device._set_forward_input_dc(self.forward_dc) - - def _create_backward_device_connector(self, target_device: Device, sequential: bool, d2d_bwd_queue: Optional[queue.Queue] = None, microbatch = 1): - - logger.debug("Creating backward device connector from {} to {}", self, target_device) - if isinstance(target_device, TTDevice): - # direct transfer both ways - self.backward_dc = DirectPusherPopperDeviceConnector(self.shutdown_event, sequential, side_queue=d2d_bwd_queue) - else: - # TTDevice copies directly to host, no pushing - self.backward_dc = DirectPopperDeviceConnector(self.shutdown_event, side_queue=d2d_bwd_queue) - target_device._set_backward_input_dc(self.backward_dc) - - # Create device connector for the last device, pushing forward - def _create_forward_output_queue_device_connector(self, q: queue.Queue): - logger.debug("Creating forward output queue connector on {}", self) - self.forward_dc = OutputQueueDirectPoppperDeviceConnector(q, self.shutdown_event) - - # Create device connector for the first device, pushing backward - def _create_backward_output_queue_device_connector(self, q: queue.Queue): - logger.debug("Creating backward output queue 
connector on {}", self) - self.backward_dc = OutputQueueDirectPoppperDeviceConnector(q, self.shutdown_event) - - # Create device connector for the first device, reading from a Queue - def _create_input_queue_device_connector(self, q: queue.Queue, sequential: bool): - logger.debug("Creating input queue connector on {}", self) - self.forward_input_dc = InputQueueDirectPusherDeviceConnector(q, self.shutdown_event, sequential) - - # Create device connector for the last device, reading from a Queue - def _create_target_queue_device_connector(self, q: queue.Queue, sequential: bool): - logger.debug("Creating input queue connector on {}", self) - self.target_input_dc = InputQueueDirectPusherDeviceConnector(q, self.shutdown_event, sequential) - - # Create device connector for the last device, reading from a Queue - def _create_intermediates_queue_device_connector(self, q: queue.Queue): - logger.debug("Creating fwd intermediates queue connector on {}", self) - self.intermediates_dc = OutputQueueDirectPoppperDeviceConnector(q, self.shutdown_event) - - - def get_dram_io_queues(self, queue_type: str) -> Tuple[List[DramIODesc], Optional[List[List[int]]], Optional[List], Optional[List[bool]], Optional[List[Tensor]]]: - """ - Returns the appropriate queue description, tile broadcast information, and original shapes, where applicable - """ - assert self.backend_api - if (queue_type == "input"): - input_qs = self.backend_api.get_ordered_input_queues() - transforms = self._get_input_runtime_tensor_transforms() - constant_inputs = [None for _ in self._compiled_graph_state.ordered_input_names] - input_tile_dims = [ - self._compiled_graph_state.input_to_tile_dims[name] - for name in self._compiled_graph_state.ordered_input_names - ] - - for idx, transform in enumerate(transforms): - if transform.type == RuntimeTensorTransformType.Prestride: - assert transform.stride_height == transform.stride_width, "Backend supports only square strides for prestriding transform" - stride = transform.stride_height - stride_desc = StrideDescriptor() - stride_desc.stride = stride - stride_desc.xy_offsets = [(x, y) for y in range(stride) for x in range(stride)] - - input_qs[idx].s_descriptor = stride_desc - elif transform.type == RuntimeTensorTransformType.ConstantInput: - constant_inputs[idx] = self._compiled_graph_state.constant_to_tensor[self._compiled_graph_state.ordered_input_names[idx]] - return input_qs, self._get_fwd_inputs_tile_broadcast_dims(), None, None, transforms, constant_inputs, input_tile_dims - - if (queue_type == "target"): - return self.backend_api.get_ordered_target_queues(), self._get_target_inputs_tile_broadcast_dims(), None, None, None, None, None - - if (queue_type == "output"): - return self.backend_api.get_ordered_output_queues(), None, self._get_output_shapes(grad_only=False), self._get_output_requires_grad(), self._get_output_runtime_tensor_transforms(), None, None - - if (queue_type == "bw_input"): - return self.backend_api.get_ordered_bw_input_queues(), self._get_bwd_inputs_tile_broadcast_dims(), None, None, self._get_output_runtime_tensor_transforms(), None, None - - if (queue_type == "bw_output"): - qs = self.backend_api.get_ordered_bw_output_queues() - return qs, None, self._get_input_shapes(grad_only=True), [True] * len(qs), self._get_input_runtime_tensor_transforms(), None, None - - if (queue_type == "intermediates"): - intermediate_shapes = self._get_intermediate_shapes() - requires_grad = [False] * len(intermediate_shapes) - qs = self.backend_api.get_intermediate_activation_queues() - - return 
qs, None, intermediate_shapes, requires_grad, None, None, None - - raise RuntimeError("Unknown type of queue") - - def get_pytorch_optimizer(self, parameters: Dict[str, torch.Tensor], lr = None) -> Optional[torch.optim.Optimizer]: - if self.optimizer is None or len(parameters) == 0: - return None - return self.optimizer.get_pytorch_optimizer(parameters, lr) - - def get_pytorch_scheduler(self): - if self.scheduler is None: - return None - return self.scheduler.get_pytorch_scheduler(self.optimizer.torch_optimizer) - - def shutdown_device(self): - """ - Shutdown device at the end of the workload - """ - self.remove_modules() - - logger.trace("start shutdown threads") - self._shutdown_threads() - - Device.shutdown_device(self) - - def sync(self): - """ - Block until queued up commands have completed and the device is idle. - """ - assert self.backend_api is not None - self.backend_api.sync() - - def get_compiled_results(self) -> Optional["CompileResults"]: - from .compile import CompileResults - if not self._compiled or not self._compile_output: - logger.error(f"User has not yet compiled a device") - return None - return self._compile_output - - def compile_to_image( - self, - *, - img_path: str = None, - training: bool = False, - sample_inputs: Tuple[Union[torch.Tensor, Tensor], ...] = tuple(), - sample_targets: Tuple[Union[torch.Tensor, Tensor], ...] = tuple(), - microbatch_count: int = 1, - verify_cfg: Optional[VerifyConfig] = None, - cpueval_outputs: Optional[List[torch.Tensor]] = None, - ) -> "TTDeviceImage": - - assert self.arch, "When compiling to image, TTDevice must be explicitly constructed with target-arch" - assert self.devtype, "When compiling to image, TTDevice must be explicitly constructed with dev_type" - - compiler_cfg = _get_global_compiler_config() - if not self._compiled: - self.device_mode = DeviceMode.CompileOnly - from .run import initialize_pipeline - initialize_pipeline( - training=training, - sample_inputs=sample_inputs, - sample_targets=sample_targets, - microbatch_count=microbatch_count, - _sequential=True, - _verify_cfg=verify_cfg, - _device_mode=self.device_mode - ) - - from .tti import TTDeviceImage - device_image = TTDeviceImage.create_image_from_device( - self, - training, - microbatch_count, - verify_cfg, - compiler_cfg, - cpueval_outputs=cpueval_outputs, - ) - TTDeviceImage.save_to_disk(device_image, img_path, self.backend_api) - - return device_image - - - @staticmethod - def load_image(*, img: Optional["TTDeviceImage"] = None, img_path: Optional[str] = None) -> "TTDevice": - from .tti import TTDeviceImage - if img and img_path: - logger.error("only one of image/image-path should be specified") - if img is None: - img = TTDeviceImage.load_from_disk(img_path) - return TTDeviceImage.create_device_from_image(img) - - -def budabackend_path() -> str: - if "BUDA_HOME" in os.environ: - return os.environ["BUDA_HOME"] - - if os.path.exists(os.getcwd() + '/third_party/budabackend'): - # must be in pybuda root - return "third_party/budabackend/" - else: - return "" - - -def get_backend_string(backend_type: BackendType) -> str: - BACKEND_TYPE_TO_DEVICE_GRID = { - BackendType.Golden: "golden", - BackendType.Model: "model", - BackendType.NoBackend: "nobackend", - BackendType.Silicon: "silicon", - } - if backend_type in BACKEND_TYPE_TO_DEVICE_GRID: - return BACKEND_TYPE_TO_DEVICE_GRID[backend_type] - else: - raise Exception("Running pybuda_compile with unknown backend_type config") - - -def get_default_device_yaml(arch: BackendDevice, device_yaml: str, backend_output_dir: 
str, device_yaml_override: Optional[str]) -> str: - if device_yaml_override: - if arch not in {BackendDevice.Grayskull, BackendDevice.Wormhole, BackendDevice.Wormhole_B0}: - raise RuntimeError("Running pybuda_compile with unknown arch config") - if os.path.isfile(device_yaml_override): - return device_yaml_override - elif os.path.isfile(budabackend_path() + f"device/{device_yaml_override}"): - return budabackend_path() + f"device/{device_yaml_override}" - - # NOTE: followings should be removed when decweek3 uplift is merged - harvested_rows_manual = os.environ.get("TT_BACKEND_HARVESTED_ROWS", None) - if device_yaml != "" and harvested_rows_manual is None: - return device_yaml - - harvesting_mask = 0 - if harvested_rows_manual is not None: - masks = harvested_rows_manual.split(",") - for mask in masks: - assert mask == masks[0], "We currently only support identical harvest masks for all chips" - harvesting_mask = int(masks[0]) - - default_device_desc = get_custom_device_desc(arch, mmio=True, harvesting_mask=harvesting_mask, out_dir=backend_output_dir) - return default_device_desc.soc_desc_yaml - -def get_default_cluster_descriptor(backend_output_dir: str, backend_cluster_descriptor_path: str = "") -> str: - cluster_override = os.environ.get("PYBUDA_OVERRIDE_CLUSTER_YAML", None) - if cluster_override: - if os.path.isfile(cluster_override): - return cluster_override - elif os.path.isfile(budabackend_path() + f"/{cluster_override}"): - return budabackend_path() + f"/{cluster_override}" - else: - raise RuntimeError(f"PYBUDA_OVERRIDE_CLUSTER_YAML={cluster_override} is not a valid file.") - elif backend_cluster_descriptor_path == "": - backend_cluster_descriptor_path = get_device_cluster_yaml(backend_output_dir) - - return backend_cluster_descriptor_path - -def get_device_config(arch: BackendDevice, - chip_ids: Union[List[int], List[Tuple[int]]] = None, - backend_cluster_descriptor_path = "", - backend_runtime_params_path = "", - store_backend_db_to_yaml = False, - backend_type = BackendType.NoBackend, - device_yaml = "", - backend_output_dir = "./tt_build", - backend_device_descriptor_path_override = None) -> str: - return DeviceConfig( - arch.to_string(), - get_default_device_yaml(arch, device_yaml, backend_output_dir, backend_device_descriptor_path_override), - get_default_cluster_descriptor(backend_output_dir, backend_cluster_descriptor_path), - backend_runtime_params_path, - get_backend_string(backend_type), - store_backend_db_to_yaml, - chip_ids, - ) - diff --git a/pybuda/pybuda/tti/__init__.py b/pybuda/pybuda/tti/__init__.py deleted file mode 100644 index 2c29d994a..000000000 --- a/pybuda/pybuda/tti/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -from .tti import TTDeviceImage diff --git a/pybuda/pybuda/tti/archive.py b/pybuda/pybuda/tti/archive.py deleted file mode 100644 index 0ca316152..000000000 --- a/pybuda/pybuda/tti/archive.py +++ /dev/null @@ -1,538 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import os -import shutil -import importlib -import subprocess -import inspect -import packaging -import struct -import sys -import tempfile -import time -import functools -from loguru import logger -import pathlib - -from collections.abc import Iterable -from pybuda.config import _set_global_compiler_config, TTIDumpFormat -from pybuda.module import PyBudaModule -from pybuda.tensor import pytorch_tensor_to_tensor_desc, tensor_desc_to_pytorch_tensor 
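Both get_default_device_yaml and get_default_cluster_descriptor, removed just above, resolve an override path the same way: accept it verbatim if the file exists, otherwise retry relative to the budabackend tree. A condensed, illustrative sketch of that lookup order (the helper name `resolve_override_yaml` is ours, not part of the codebase; the raise mirrors the cluster-descriptor variant):

```python
import os
from typing import Optional

def resolve_override_yaml(override: Optional[str], backend_root: str) -> Optional[str]:
    # Resolution order for override yaml paths:
    # 1) the override as given, 2) the override relative to the backend tree.
    if not override:
        return None
    if os.path.isfile(override):
        return override
    candidate = os.path.join(backend_root, override)
    if os.path.isfile(candidate):
        return candidate
    raise RuntimeError(f"{override} is not a valid file.")

# e.g. resolve_override_yaml(os.environ.get("PYBUDA_OVERRIDE_CLUSTER_YAML"), budabackend_path())
```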
-from pybuda.utils import generate_hash, get_current_pytest, write_buda_envs_configs -from pybuda.tti.utils import ( - compute_file_checksum, - write_checksum_to_file, - read_checksum_from_file, -) - -import torch -import json -import pickle -from typing import Dict, List, Optional, Union -from pybuda.optimizers import Optimizer -from pybuda.backend import BackendAPI -from pybuda._C.backend_api import ( - BackendType, - PytorchTensorDesc, - TilizedTensorDesc, - get_device_descs_for_available_devices, - binarize_tensor, - debinarize_tensor, - tilize_tensor, -) -from pybuda._C import DataFormat - - -def is_version_at_least(v, *, min_version="1.1.0"): - return packaging.version.parse(v) >= packaging.version.parse(min_version) - -def load_tensor_from_disk(filepath, value): - if filepath.endswith(TTIDumpFormat.BACKEND_TILIZED.extension()): - desc = TilizedTensorDesc() - desc.format = DataFormat.from_json(value["format"]) - desc.num_buffers = value["num_buffers"] - desc.buf_size_bytes = value["buf_size_bytes"] - - else: - desc = PytorchTensorDesc() - desc.itemsize = value["itemsize"] - desc.format = DataFormat.from_json(value["format"]) - desc.shape = value["shape"] - desc.strides = value["strides"] - debinarize_tensor(desc, filepath) - return desc - - -class TTDeviceImageJsonEncoder(json.JSONEncoder): - DTYPE_TO_BIN_FORMAT = { - torch.half: "f", - torch.float16: "f", - torch.bfloat16: "f", - torch.float32: "f", - torch.int: "i", - torch.int32: "i", - torch.short: "h", - torch.int16: "h", - torch.int8: "b", - torch.uint8: "B", - } - @staticmethod - def encode_descriptor(filename: str, tensor_desc: PytorchTensorDesc, tilized_tensor_desc: Optional[TilizedTensorDesc] = None): - encoding = { - "bin": filename, - "itemsize": tensor_desc.itemsize, - "shape": tensor_desc.shape, - "strides": tensor_desc.strides, - "dim": tensor_desc.dim, - "format": tensor_desc.format, - } - if tilized_tensor_desc: - encoding.update({ - "format": tilized_tensor_desc.format, - "num_buffers": tilized_tensor_desc.num_buffers, - "buf_size_bytes": tilized_tensor_desc.buf_size_bytes, - }) - return encoding - - @staticmethod - def rehash_as_pickled_object( - d, key, object_value, filename_encoding, base_directory - ): - with open(os.path.join(base_directory, filename_encoding), "wb") as pkl_file: - pickle.dump(object_value, pkl_file, pickle.HIGHEST_PROTOCOL) - d[key] = filename_encoding - - @staticmethod - def rehash_tensor_as_pickled_object(d, key, object_value, base_directory): - filename_encoding = os.path.join( - "tensors", f"torch.Tensor.{key}.pkl".replace("/", "_") - ) - TTDeviceImageJsonEncoder.rehash_as_pickled_object( - d, key, object_value, filename_encoding, base_directory - ) - - @staticmethod - def rehash_tensor_as_bin_object(d, key, object_value, base_directory, tti_dump_format=Optional[TTIDumpFormat], backend_api: Optional[BackendAPI] = None): - filename_encoding = os.path.join( - "tensors", f"torch.Tensor.{key}.{tti_dump_format.extension()}".replace("/", "_") - ) - - assert isinstance( - object_value, torch.Tensor - ), "rehash_tensor_as_bin_object expects a torch.Tensor" - - from .tti import TTDeviceImage - - tensor = object_value.contiguous() # contiguous row-major memory layout - if is_version_at_least(TTDeviceImage.TTI_VERSION, min_version="1.1.0"): - qdesc = backend_api.be_api.get_queue_descriptor(key) - tensor_desc = pytorch_tensor_to_tensor_desc(tensor) - tilized_tensor_desc = tilize_tensor(qdesc, tensor_desc) if tti_dump_format == TTIDumpFormat.BACKEND_TILIZED else None - desc_to_binarize = 
tilized_tensor_desc if tilized_tensor_desc else tensor_desc - - binarize_tensor(desc_to_binarize, os.path.join(base_directory, filename_encoding)) - d[key] = TTDeviceImageJsonEncoder.encode_descriptor(filename_encoding, tensor_desc, tilized_tensor_desc) - - else: - tensor_desc = pytorch_tensor_to_tensor_desc(tensor) - fmt = TTDeviceImageJsonEncoder.DTYPE_TO_BIN_FORMAT[object_value.dtype] - with open( - os.path.join(base_directory, filename_encoding), "wb" - ) as bin_file: - for val in tensor.ravel().tolist(): - bin_file.write(struct.pack(fmt, val)) - d[key] = TTDeviceImageJsonEncoder.encode_descriptor(filename_encoding, desc, tilized_tensor_desc) - - @staticmethod - def preprocess_keys(d, base_directory: str, tti_dump_format: Optional[TTIDumpFormat] = None, backend_api: Optional[BackendAPI] = None): - """Convert a dict's keys to strings if they are not.""" - kvs = list(d.items()) - for key, value in kvs: - if not isinstance(key, str) and isinstance(key, torch.dtype): - d[str(key)] = value - del d[key] - elif isinstance(value, torch.Tensor) or ( - isinstance(value, Iterable) - and any(isinstance(sub_value, torch.Tensor) for sub_value in value) - ): - use_backend_format = tti_dump_format in (TTIDumpFormat.BACKEND, TTIDumpFormat.BACKEND_TILIZED) - if use_backend_format and key != "cpueval_outputs": - TTDeviceImageJsonEncoder.rehash_tensor_as_bin_object( - d, key, value, base_directory, tti_dump_format=tti_dump_format, backend_api=backend_api - ) - else: - TTDeviceImageJsonEncoder.rehash_tensor_as_pickled_object( - d, key, value, base_directory - ) - elif isinstance(value, Optimizer): - pkl_filepath = f"Optimizer.{value.get_type()}.pkl" - TTDeviceImageJsonEncoder.rehash_as_pickled_object( - d, key, value, pkl_filepath, base_directory - ) - elif isinstance(value, dict): - d[key] = TTDeviceImageJsonEncoder.preprocess_keys( - value, base_directory, tti_dump_format, backend_api - ) - - return d - - def default(self, obj): - if hasattr(obj, "to_json"): - return obj.to_json() - try: - return json.JSONEncoder.default(self, obj) - except TypeError as e: - raise RuntimeError( - f"JSON Serialization failed: {e}. {obj.__class__.__name__}.to_json(..) method needs to be implemented. Object={obj}." 
- ) - - -class TTDeviceImageJsonDecoder(json.JSONDecoder): - DICT_KEY_DECODING = { - "torch.": torch, - } - DATA_FORMAT_TO_DTYPE = { - DataFormat.Float32: torch.float32, - DataFormat.Float16_b: torch.bfloat16, - DataFormat.Float16: torch.float16, - DataFormat.RawUInt32: torch.int, - DataFormat.RawUInt16: torch.int16, - DataFormat.RawUInt8: torch.uint8, - DataFormat.Int8: torch.int8, - } - - def __init__(self, *args, **kwargs): - json.JSONDecoder.__init__(self, object_hook=self.object_hook, *args, **kwargs) - - def object_hook(self, dct): - return dct - - @staticmethod - def rehash_as_tensor(value, directory): - from .tti import TTDeviceImage - - if is_version_at_least(TTDeviceImage.TTI_VERSION, min_version="1.1.0"): - filepath= value["bin"] - lazy_load_callable = functools.partial(load_tensor_from_disk, os.path.join(directory, filepath), value) - return lazy_load_callable - else: - dtype = TTDeviceImageJsonDecoder.DATA_FORMAT_TO_DTYPE[ - DataFormat.from_json(value["format"]) - ] - fmt = TTDeviceImageJsonEncoder.DTYPE_TO_BIN_FORMAT[dtype] - itemsize = struct.calcsize(fmt) - tensor_data = [] - bin_filepath = value["bin"] - with open(os.path.join(directory, bin_filepath), "rb") as bin_file: - while True: - bytes_data = bin_file.read(itemsize) - if not bytes_data: - break - (val,) = struct.unpack(fmt, bytes_data) - tensor_data.append(val) - - tensor = torch.tensor(tensor_data, dtype=dtype).reshape(*value["shape"]) - return tensor - - @staticmethod - def postprocess_keys(d, directory): - """Convert a encoded dict's keys to back to original type .""" - kvs = list(d.items()) - for key, value in kvs: - if isinstance(d[key], dict): - if "bin" in d[key]: - d[key] = TTDeviceImageJsonDecoder.rehash_as_tensor(value, directory) - else: - value = TTDeviceImageJsonDecoder.postprocess_keys(value, directory) - - # convert nonstring to string if needed - for ( - encoded_string, - decoded_type, - ) in TTDeviceImageJsonDecoder.DICT_KEY_DECODING.items(): - if isinstance(key, str) and key.startswith(encoded_string): - decoded_type = getattr(decoded_type, key[len(encoded_string) :]) - d[decoded_type] = value - del d[key] - if isinstance(value, str) and value.endswith(".pkl"): - with open(os.path.join(directory, value), "rb") as pkl_file: - d[key] = pickle.load(pkl_file) - - return d - - -class TTIArchive: - TTI_UNZIPPED_DIR_NAME = "unzipped_tti" - - @staticmethod - def _create_tti_archive(device_images_directory: str, device_img_path: str) -> None: - tti_absolute_file_path = os.path.realpath(device_img_path) - - try: - subprocess.run( - [ - "tar", - "-cf", - tti_absolute_file_path, - "-C", - device_images_directory, - TTIArchive.TTI_UNZIPPED_DIR_NAME, - ] - ) - except subprocess.CalledProcessError as e: - logger.error(f"Command failed with error {e}") - - @staticmethod - def _copy_backend_build_files(*, src_dir: str, dst_dir: str): - logger.info( - "TTDeviceImage: copying backend build files from {} to {}", src_dir, dst_dir - ) - os.makedirs(src_dir, exist_ok=True) - try: - if not src_dir.endswith("/"): - src_dir += "/" - cmd = [ - "rsync", - "-a", - "--exclude=*.log", - "--exclude=blob.yaml", - "--exclude=*.d", - src_dir, - dst_dir, - ] - logger.info("Running command: {}", " ".join(cmd)) - subprocess.run(cmd, check=True) - except subprocess.CalledProcessError as e: - logger.error(f"Command failed with error {e}") - - @staticmethod - def _copy_netlist_yaml(*, netlist_yaml: str, dst_dir: str): - shutil.copy(src=netlist_yaml, dst=dst_dir) - - @staticmethod - def _copy_module_file(*, module_file, dst_dir): - src = 
os.path.relpath(module_file, start=os.curdir) - dst = os.path.join(dst_dir, src) - dstfolder = os.path.dirname(dst) - if not os.path.exists(dstfolder): - os.makedirs(dstfolder, exist_ok=True) - shutil.copy(src=src, dst=dst) - - @staticmethod - def _get_device_img_path(device_img_path_override: Optional[str] = None): - if device_img_path_override: - device_img_path = device_img_path_override - else: - DEFAULT_DEVICE_PATH = ( - f"device_images/tt_{generate_hash(get_current_pytest())}.tti" - ) - device_img_path = DEFAULT_DEVICE_PATH - return device_img_path - - @staticmethod - def get_instantiate_modules( - module_name_to_metadata: Dict[str, Dict[str, str]], unzipped_tti_directory: str - ) -> List[PyBudaModule]: - instantiated_modules = [] - for name, metadata in module_name_to_metadata.items(): - unzipped_directory_module_path = os.path.join( - unzipped_tti_directory, "module_files", metadata["module_file_path"] - ) - expected_module_path = os.path.join( - os.getcwd(), metadata["module_file_path"] - ) - - # if module does not exist during execution, we'll copy over the module file to import location - if not pathlib.Path(expected_module_path).is_file(): - logger.info( - "TTDeviceImage: copying module file from {} to {}", - unzipped_tti_directory, - expected_module_path, - ) - os.makedirs(os.path.dirname(expected_module_path), exist_ok=True) - shutil.copy(unzipped_directory_module_path, expected_module_path) - - # Create a new module object and load - module = importlib.import_module(metadata["module"]) - - # Fetch class from module and instantiate a new object from the class - obj_class = getattr(module, metadata["class"]) - instantiated_modules.append(obj_class(name)) - - return instantiated_modules - - @staticmethod - def construct_device_image(unzipped_tti_directory: str) -> "TTDeviceImage": - from .tti import TTDeviceImage - - device_image = None - with open( - os.path.join(unzipped_tti_directory, "device.json"), "r" - ) as json_file: - device_image_dict = json.load(json_file, cls=TTDeviceImageJsonDecoder) - TTDeviceImageJsonDecoder.postprocess_keys( - device_image_dict, unzipped_tti_directory - ) - - try: - device_image = TTDeviceImage.from_dict(device_image_dict) - except KeyError as e: - raise ValueError( - f"TTI failed to deserialize. TTDeviceImage not contain key: {e}. TTI recompilation required." - ) - - sys.path.append( - "." 
- ) # We need this line because the tvm->python code path does and pickle requires a match - device_image.modules = TTIArchive.get_instantiate_modules( - device_image.module_name_to_metadata, unzipped_tti_directory - ) - netlist_file_basename = os.path.basename( - device_image.compiled_graph_state.netlist_filename - ) - device_image.compiled_graph_state.netlist_filename = os.path.join( - unzipped_tti_directory, netlist_file_basename - ) - - return device_image - - @staticmethod - def load_from_disk(tti_file_path: str) -> "TTDeviceImage": - tti_file_path = TTIArchive._get_device_img_path(tti_file_path) - absolute_device_image_path = os.path.realpath(tti_file_path) - logger.info("TTDeviceImage::loading from {}", absolute_device_image_path) - absolute_device_image_directory = os.path.dirname(absolute_device_image_path) - unzipped_tti_directory = os.path.join( - absolute_device_image_directory, TTIArchive.TTI_UNZIPPED_DIR_NAME - ) - - def contains_matching_checksum(tti_checksum) -> bool: - directory_checksum = read_checksum_from_file( - os.path.join(unzipped_tti_directory, "checksum.txt") - ) - return tti_checksum == directory_checksum - - tti_checksum = compute_file_checksum(absolute_device_image_path) - if contains_matching_checksum(tti_checksum): - logger.info( - f"TTI: Netlist checksum matches - populating TTDevice from pre-existing dir {unzipped_tti_directory}" - ) - else: - logger.info( - f"TTI: No matching checksum found - extracting TTI to dir {os.path.realpath(unzipped_tti_directory)} " - ) - shutil.rmtree(unzipped_tti_directory, ignore_errors=True) - try: - subprocess.run( - ["tar", "-xf", tti_file_path, "-C", absolute_device_image_directory] - ) - except subprocess.CalledProcessError as e: - logger.error(f"Command failed with error {e}") - write_checksum_to_file( - tti_checksum, os.path.join(unzipped_tti_directory, "checksum.txt") - ) - - device_image = TTIArchive.construct_device_image(unzipped_tti_directory) - device_image.compiler_cfg.backend_output_dir = os.path.join( - absolute_device_image_directory, - device_image.compiler_cfg.backend_output_dir, - ) - device_image.compiler_cfg.backend_runtime_params_path = "" - - # Check if the loaded device-desc.yaml exists, generate new one if not - # (temporary measure till budabackend#2066 resolved) - if not os.path.exists(device_image.compiler_cfg.backend_device_descriptor_path): - if device_image.devtype == BackendType.Silicon: - soc_files = get_device_descs_for_available_devices(device_image.compiler_cfg.backend_output_dir) - first_id = ( - device_image.chip_ids[0] if len(device_image.chip_ids) > 0 else 0 - ) - soc_file_new = soc_files[first_id].soc_desc_yaml - device_image.compiler_cfg.backend_device_descriptor_path = soc_file_new - - _set_global_compiler_config(device_image.compiler_cfg) - - return device_image - - @staticmethod - def save_to_disk( - device_image: "TTDeviceImage", device_img_path_override: Optional[str] = None, backend_api: Optional[BackendAPI] = None - ): - from .tti import TTDeviceImage - - device_img_path = TTIArchive._get_device_img_path(device_img_path_override) - logger.info("TTI: Saving device image to {}", device_img_path) - - start_time = time.time() - dst_relative_directory_tti = os.path.dirname(device_img_path) - os.makedirs(os.path.realpath(dst_relative_directory_tti), exist_ok=True) - - with tempfile.TemporaryDirectory() as tmp: - src_tti_directory_to_zip = os.path.join( - tmp, TTIArchive.TTI_UNZIPPED_DIR_NAME - ) - os.makedirs(src_tti_directory_to_zip, exist_ok=True) - - copy_start = time.time() - 
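load_from_disk, just above, only re-extracts the .tti archive when its checksum differs from the one recorded next to the previously unzipped contents. A minimal sketch of that guard, with the checksum helpers passed in as callables (the real code imports compute_file_checksum, read_checksum_from_file, and write_checksum_to_file from pybuda.tti.utils):

```python
import os
import shutil
import subprocess

def extract_if_stale(tti_path: str, unzip_dir: str,
                     compute_checksum, read_checksum, write_checksum) -> None:
    # Skip re-extraction when the archive checksum matches the one recorded
    # alongside the previously unzipped contents; otherwise extract fresh.
    checksum_file = os.path.join(unzip_dir, "checksum.txt")
    current = compute_checksum(tti_path)
    if current == read_checksum(checksum_file):  # read_checksum is expected to handle a missing file
        return  # cached extraction is up to date
    shutil.rmtree(unzip_dir, ignore_errors=True)
    subprocess.run(["tar", "-xf", tti_path, "-C", os.path.dirname(unzip_dir)], check=True)
    write_checksum(current, checksum_file)
```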
relative_backend_output_dir = os.path.join( - TTIArchive.TTI_UNZIPPED_DIR_NAME, "backend_build_binaries" - ) - TTIArchive._copy_backend_build_files( - src_dir=device_image.compiler_cfg.backend_output_dir, - dst_dir=os.path.join( - src_tti_directory_to_zip, "backend_build_binaries" - ), - ) - logger.debug( - "TTI: Copying backend build files took {} seconds", - time.time() - copy_start, - ) - - netlist_path = device_image.compiled_graph_state.netlist_filename - device_image.compiler_cfg.backend_output_dir = relative_backend_output_dir - device_image.compiler_cfg.backend_runtime_params_path = "" - netlist_file_basename = os.path.basename(netlist_path) - device_image.compiled_graph_state.netlist_filename = os.path.join( - relative_backend_output_dir, netlist_file_basename - ) - - tensors_directory = os.path.join(src_tti_directory_to_zip, "tensors") - os.makedirs(tensors_directory, exist_ok=True) - - with open(os.path.join(src_tti_directory_to_zip, "device.json"), "w") as f: - device_image_state_dict = TTDeviceImage.to_dict(device_image) - del device_image_state_dict["modules"] - TTDeviceImageJsonEncoder.preprocess_keys( - device_image_state_dict, - src_tti_directory_to_zip, - device_image.compiler_cfg.tti_dump_format, - backend_api=backend_api, - ) - device_image_state_json = json.dumps( - device_image_state_dict, - cls=TTDeviceImageJsonEncoder, - indent=4, - skipkeys=True, - ) - f.write(device_image_state_json) - - module_files_directory = os.path.join( - src_tti_directory_to_zip, "module_files" - ) - os.makedirs(module_files_directory, exist_ok=True) - - for pybuda_module in device_image.modules: - module_file = inspect.getfile(pybuda_module.__class__) - TTIArchive._copy_module_file( - module_file=module_file, dst_dir=module_files_directory - ) - - TTIArchive._copy_netlist_yaml( - netlist_yaml=netlist_path, dst_dir=src_tti_directory_to_zip - ) - write_buda_envs_configs(src_tti_directory_to_zip) - TTIArchive._create_tti_archive(tmp, device_img_path) - logger.info( - "TTI: Saving device image took {} seconds", time.time() - start_time - ) diff --git a/pybuda/pybuda/tti/runtime_param_yamls/galaxy_syslevel.yaml b/pybuda/pybuda/tti/runtime_param_yamls/galaxy_syslevel.yaml deleted file mode 100644 index 80358cec2..000000000 --- a/pybuda/pybuda/tti/runtime_param_yamls/galaxy_syslevel.yaml +++ /dev/null @@ -1,103 +0,0 @@ -system_level_params: - system-device-chip_locations: 28,2,7,0,0,-27,1,7,0,0,-26,0,7,0,0,-6,0,6,0,0,-7,0,5,0,0,-8,0,4,0,0,-9,0,3,0,0,-10,0,2,0,0,-11,0,1,0,0,-12,0,0,0,0,-3,2,5,0,0,-29,2,6,0,0,-0,3,5,0,0,-4,1,5,0,0,-30,3,6,0,0,-1,3,4,0,0,-5,1,6,0,0,-31,3,7,0,0,-2,2,4,0,0,-13,1,0,0,0,-14,1,1,0,0,-15,1,2,0,0,-16,1,3,0,0,-17,1,4,0,0,-18,2,3,0,0,-19,2,2,0,0,-20,2,1,0,0,-21,2,0,0,0,-22,3,0,0,0,-23,3,1,0,0,-24,3,2,0,0,-25,3,3,0,0,- - system-device-chips_with_mmio: 0- - system-device-cluster_descriptor: "" - system-device-ethernet_connections: 
23,4,20,12,-23,5,20,13,-23,6,20,14,-23,7,20,15,-23,0,22,0,-23,1,22,1,-23,2,22,2,-23,3,22,3,-23,8,24,8,-23,9,24,9,-23,10,24,10,-23,11,24,11,-24,4,19,12,-24,5,19,13,-24,6,19,14,-24,7,19,15,-24,8,23,8,-24,9,23,9,-24,10,23,10,-24,11,23,11,-24,0,25,0,-24,1,25,1,-24,2,25,2,-24,3,25,3,-19,12,24,4,-19,11,20,11,-19,10,20,10,-19,15,24,7,-19,2,18,2,-19,14,24,6,-19,1,18,1,-19,13,24,5,-19,0,18,0,-19,4,15,12,-19,5,15,13,-19,6,15,14,-19,7,15,15,-19,3,18,3,-19,8,20,8,-19,9,20,9,-1,0,0,0,-1,1,0,1,-1,2,0,2,-1,3,0,3,-1,4,2,12,-1,5,2,13,-1,6,2,14,-1,7,2,15,-1,8,25,8,-1,9,25,9,-1,10,25,10,-1,11,25,11,-30,8,0,8,-30,9,0,9,-30,10,0,10,-30,11,0,11,-30,4,29,12,-30,5,29,13,-30,6,29,14,-30,7,29,15,-30,0,31,0,-30,1,31,1,-30,2,31,2,-30,3,31,3,-17,7,8,15,-17,6,8,14,-17,5,8,13,-17,3,16,3,-17,2,16,2,-17,15,2,7,-17,1,16,1,-17,14,2,6,-17,12,2,4,-17,13,2,5,-17,0,16,0,-17,8,4,8,-17,9,4,9,-17,10,4,10,-17,11,4,11,-17,4,8,12,-4,8,17,8,-4,7,7,15,-4,6,7,14,-4,11,17,11,-4,10,17,10,-4,9,17,9,-4,12,3,4,-4,13,3,5,-4,0,5,0,-4,14,3,6,-4,1,5,1,-4,15,3,7,-4,2,5,2,-4,3,5,3,-4,4,7,12,-4,5,7,13,-25,8,1,8,-25,9,1,9,-25,10,1,10,-25,11,1,11,-25,4,18,12,-25,5,18,13,-25,6,18,14,-25,7,18,15,-25,0,24,0,-25,1,24,1,-25,2,24,2,-25,3,24,3,-7,12,4,4,-7,13,4,5,-7,0,6,0,-7,14,4,6,-7,1,6,1,-7,15,4,7,-7,2,6,2,-7,3,6,3,-7,8,8,8,-7,9,8,9,-7,10,8,10,-7,11,8,11,-6,12,5,4,-6,13,5,5,-6,0,7,0,-6,14,5,6,-6,1,7,1,-6,15,5,7,-6,2,7,2,-6,3,7,3,-6,8,26,8,-6,9,26,9,-6,10,26,10,-6,11,26,11,-22,4,21,12,-22,5,21,13,-22,6,21,14,-22,7,21,15,-22,0,23,0,-22,1,23,1,-22,2,23,2,-22,3,23,3,-2,8,18,8,-2,7,17,15,-2,6,17,14,-2,11,18,11,-2,10,18,10,-2,9,18,9,-2,12,1,4,-2,13,1,5,-2,0,3,0,-2,14,1,6,-2,1,3,1,-2,15,1,7,-2,2,3,2,-2,3,3,3,-2,4,17,12,-2,5,17,13,-31,4,28,12,-31,5,28,13,-31,6,28,14,-31,7,28,15,-31,0,30,0,-31,1,30,1,-31,2,30,2,-31,3,30,3,-28,4,27,12,-28,5,27,13,-28,6,27,14,-28,7,27,15,-28,0,29,0,-28,13,31,5,-28,1,29,1,-28,14,31,6,-28,2,29,2,-28,15,31,7,-28,3,29,3,-28,12,31,4,-18,12,25,4,-18,3,19,3,-18,2,19,2,-18,15,25,7,-18,14,25,6,-18,1,19,1,-18,13,25,5,-18,0,19,0,-18,8,2,8,-18,9,2,9,-18,10,2,10,-18,11,2,11,-18,4,16,12,-18,5,16,13,-18,6,16,14,-18,7,16,15,-5,12,29,4,-5,11,27,11,-5,10,27,10,-5,15,29,7,-5,2,4,2,-5,14,29,6,-5,1,4,1,-5,13,29,5,-5,0,4,0,-5,3,4,3,-5,4,6,12,-5,5,6,13,-5,6,6,14,-5,7,6,15,-5,8,27,8,-5,9,27,9,-29,12,30,4,-29,3,28,3,-29,2,28,2,-29,15,30,7,-29,14,30,6,-29,1,28,1,-29,13,30,5,-29,0,28,0,-29,8,3,8,-29,9,3,9,-29,10,3,10,-29,11,3,11,-29,4,5,12,-29,5,5,13,-29,6,5,14,-29,7,5,15,-0,0,1,0,-0,1,1,1,-0,2,1,2,-0,3,1,3,-0,4,3,12,-0,5,3,13,-0,6,3,14,-0,7,3,15,-0,8,30,8,-0,9,30,9,-0,10,30,10,-0,11,30,11,-3,8,29,8,-3,7,4,15,-3,6,4,14,-3,11,29,11,-3,10,29,10,-3,9,29,9,-3,12,0,4,-3,13,0,5,-3,0,2,0,-3,14,0,6,-3,1,2,1,-3,15,0,7,-3,2,2,2,-3,3,2,3,-3,4,4,12,-3,5,4,13,-27,8,5,8,-27,9,5,9,-27,10,5,10,-27,11,5,11,-27,4,26,12,-27,5,26,13,-27,6,26,14,-27,7,26,15,-27,12,28,4,-27,13,28,5,-27,14,28,6,-27,15,28,7,-26,8,6,8,-26,9,6,9,-26,10,6,10,-26,11,6,11,-26,12,27,4,-26,13,27,5,-26,14,27,6,-26,15,27,7,-8,8,7,8,-8,9,7,9,-8,10,7,10,-8,11,7,11,-8,0,9,0,-8,13,17,5,-8,1,9,1,-8,14,17,6,-8,2,9,2,-8,15,17,7,-8,3,9,3,-8,12,17,4,-9,0,8,0,-9,13,16,5,-9,1,8,1,-9,14,16,6,-9,2,8,2,-9,15,16,7,-9,3,8,3,-9,8,10,8,-9,9,10,9,-9,10,10,10,-9,11,10,11,-9,12,16,4,-10,8,9,8,-10,9,9,9,-10,10,9,10,-10,11,9,11,-10,0,11,0,-10,13,15,5,-10,1,11,1,-10,14,15,6,-10,2,11,2,-10,15,15,7,-10,3,11,3,-10,12,15,4,-16,12,18,4,-16,3,17,3,-16,2,17,2,-16,15,18,7,-16,14,18,6,-16,1,17,1,-16,13,18,5,-16,0,17,0,-16,4,9,12,-16,5,9,13,-16,6,9,14,-16,7,9,15,-16,8,15,8,-16,9,15,9,-16,10,15,10,-16,11,15,11,-11,0,10,0,-11,13,14,5,-11,1,10,
1,-11,14,14,6,-11,2,10,2,-11,15,14,7,-11,3,10,3,-11,8,12,8,-11,9,12,9,-11,10,12,10,-11,11,12,11,-11,12,14,4,-15,12,19,4,-15,11,16,11,-15,10,16,10,-15,15,19,7,-15,2,14,2,-15,14,19,6,-15,1,14,1,-15,13,19,5,-15,0,14,0,-15,4,10,12,-15,5,10,13,-15,6,10,14,-15,7,10,15,-15,3,14,3,-15,8,16,8,-15,9,16,9,-12,8,11,8,-12,9,11,9,-12,10,11,10,-12,11,11,11,-12,12,13,4,-12,13,13,5,-12,14,13,6,-12,15,13,7,-14,12,20,4,-14,3,15,3,-14,2,15,2,-14,15,20,7,-14,14,20,6,-14,1,15,1,-14,13,20,5,-14,0,15,0,-14,4,11,12,-14,5,11,13,-14,6,11,14,-14,7,11,15,-14,8,13,8,-14,9,13,9,-14,10,13,10,-14,11,13,11,-13,4,12,12,-13,5,12,13,-13,6,12,14,-13,7,12,15,-13,8,14,8,-13,9,14,9,-13,10,14,10,-13,11,14,11,-13,12,21,4,-13,13,21,5,-13,14,21,6,-13,15,21,7,-21,4,13,12,-21,5,13,13,-21,6,13,14,-21,7,13,15,-21,0,20,0,-21,13,22,5,-21,1,20,1,-21,14,22,6,-21,2,20,2,-21,15,22,7,-21,3,20,3,-21,12,22,4,-20,12,23,4,-20,3,21,3,-20,2,21,2,-20,15,23,7,-20,14,23,6,-20,1,21,1,-20,13,23,5,-20,0,21,0,-20,4,14,12,-20,5,14,13,-20,6,14,14,-20,7,14,15,-20,8,19,8,-20,9,19,9,-20,10,19,10,-20,11,19,11,- - system-device-num_mmio_devices: 32 - system-device-number_of_chips: 32 - system-device0-harvesting_mask: 0 - system-device0-num_harvested_rows: 0 - system-device0-type: WORMHOLE_B0 - system-device1-harvesting_mask: 0 - system-device1-num_harvested_rows: 0 - system-device1-type: WORMHOLE_B0 - system-device10-harvesting_mask: 0 - system-device10-num_harvested_rows: 0 - system-device10-type: WORMHOLE_B0 - system-device11-harvesting_mask: 0 - system-device11-num_harvested_rows: 0 - system-device11-type: WORMHOLE_B0 - system-device12-harvesting_mask: 0 - system-device12-num_harvested_rows: 0 - system-device12-type: WORMHOLE_B0 - system-device13-harvesting_mask: 0 - system-device13-num_harvested_rows: 0 - system-device13-type: WORMHOLE_B0 - system-device14-harvesting_mask: 0 - system-device14-num_harvested_rows: 0 - system-device14-type: WORMHOLE_B0 - system-device15-harvesting_mask: 0 - system-device15-num_harvested_rows: 0 - system-device15-type: WORMHOLE_B0 - system-device16-harvesting_mask: 0 - system-device16-num_harvested_rows: 0 - system-device16-type: WORMHOLE_B0 - system-device17-harvesting_mask: 0 - system-device17-num_harvested_rows: 0 - system-device17-type: WORMHOLE_B0 - system-device18-harvesting_mask: 0 - system-device18-num_harvested_rows: 0 - system-device18-type: WORMHOLE_B0 - system-device19-harvesting_mask: 0 - system-device19-num_harvested_rows: 0 - system-device19-type: WORMHOLE_B0 - system-device2-harvesting_mask: 0 - system-device2-num_harvested_rows: 0 - system-device2-type: WORMHOLE_B0 - system-device20-harvesting_mask: 0 - system-device20-num_harvested_rows: 0 - system-device20-type: WORMHOLE_B0 - system-device21-harvesting_mask: 0 - system-device21-num_harvested_rows: 0 - system-device21-type: WORMHOLE_B0 - system-device22-harvesting_mask: 0 - system-device22-num_harvested_rows: 0 - system-device22-type: WORMHOLE_B0 - system-device23-harvesting_mask: 0 - system-device23-num_harvested_rows: 0 - system-device23-type: WORMHOLE_B0 - system-device24-harvesting_mask: 0 - system-device24-num_harvested_rows: 0 - system-device24-type: WORMHOLE_B0 - system-device25-harvesting_mask: 0 - system-device25-num_harvested_rows: 0 - system-device25-type: WORMHOLE_B0 - system-device26-harvesting_mask: 0 - system-device26-num_harvested_rows: 0 - system-device26-type: WORMHOLE_B0 - system-device27-harvesting_mask: 0 - system-device27-num_harvested_rows: 0 - system-device27-type: WORMHOLE_B0 - system-device28-harvesting_mask: 0 - 
system-device28-num_harvested_rows: 0 - system-device28-type: WORMHOLE_B0 - system-device29-harvesting_mask: 0 - system-device29-num_harvested_rows: 0 - system-device29-type: WORMHOLE_B0 - system-device3-harvesting_mask: 0 - system-device3-num_harvested_rows: 0 - system-device3-type: WORMHOLE_B0 - system-device30-harvesting_mask: 0 - system-device30-num_harvested_rows: 0 - system-device30-type: WORMHOLE_B0 - system-device31-harvesting_mask: 0 - system-device31-num_harvested_rows: 0 - system-device31-type: WORMHOLE_B0 - system-device4-harvesting_mask: 0 - system-device4-num_harvested_rows: 0 - system-device4-type: WORMHOLE_B0 - system-device5-harvesting_mask: 0 - system-device5-num_harvested_rows: 0 - system-device5-type: WORMHOLE_B0 - system-device6-harvesting_mask: 0 - system-device6-num_harvested_rows: 0 - system-device6-type: WORMHOLE_B0 - system-device7-harvesting_mask: 0 - system-device7-num_harvested_rows: 0 - system-device7-type: WORMHOLE_B0 - system-device8-harvesting_mask: 0 - system-device8-num_harvested_rows: 0 - system-device8-type: WORMHOLE_B0 - system-device9-harvesting_mask: 0 - system-device9-num_harvested_rows: 0 - system-device9-type: WORMHOLE_B0 diff --git a/pybuda/pybuda/tti/runtime_param_yamls/gs_e150_syslevel.yaml b/pybuda/pybuda/tti/runtime_param_yamls/gs_e150_syslevel.yaml deleted file mode 100644 index f920f3a65..000000000 --- a/pybuda/pybuda/tti/runtime_param_yamls/gs_e150_syslevel.yaml +++ /dev/null @@ -1,6 +0,0 @@ -system_level_params: - system-device-num_mmio_devices: 1 - system-device-number_of_chips: 1 - system-device0-harvesting_mask: 0 - system-device0-num_harvested_rows: 0 - system-device0-type: GRAYSKULL diff --git a/pybuda/pybuda/tti/runtime_param_yamls/gs_e300_syslevel.yaml b/pybuda/pybuda/tti/runtime_param_yamls/gs_e300_syslevel.yaml deleted file mode 100644 index 45eec30b0..000000000 --- a/pybuda/pybuda/tti/runtime_param_yamls/gs_e300_syslevel.yaml +++ /dev/null @@ -1,6 +0,0 @@ -system_level_params: - system-device-num_mmio_devices: 1 - system-device-number_of_chips: 1 - system-device0-harvesting_mask: 2050 - system-device0-num_harvested_rows: 2 - system-device0-type: GRAYSKULL diff --git a/pybuda/pybuda/tti/runtime_param_yamls/nebula_x1_syslevel.yaml b/pybuda/pybuda/tti/runtime_param_yamls/nebula_x1_syslevel.yaml deleted file mode 100644 index 97e05949e..000000000 --- a/pybuda/pybuda/tti/runtime_param_yamls/nebula_x1_syslevel.yaml +++ /dev/null @@ -1,10 +0,0 @@ -system_level_params: - system-device-chip_locations: 0,0,0,0,0,- - system-device-chips_with_mmio: 0- - system-device-cluster_descriptor: "" - system-device-ethernet_connections: "" - system-device-num_mmio_devices: 1 - system-device-number_of_chips: 1 - system-device0-harvesting_mask: 2048 - system-device0-num_harvested_rows: 1 - system-device0-type: WORMHOLE_B0 diff --git a/pybuda/pybuda/tti/runtime_param_yamls/nebula_x2_syslevel.yaml b/pybuda/pybuda/tti/runtime_param_yamls/nebula_x2_syslevel.yaml deleted file mode 100644 index 5ab57ebe7..000000000 --- a/pybuda/pybuda/tti/runtime_param_yamls/nebula_x2_syslevel.yaml +++ /dev/null @@ -1,13 +0,0 @@ -system_level_params: - system-device-chip_locations: 0,0,0,0,0,-1,1,0,0,0,- - system-device-chips_with_mmio: 0- - system-device-cluster_descriptor: "" - system-device-ethernet_connections: 0,8,1,0,-0,9,1,1,-1,0,0,8,-1,1,0,9,- - system-device-num_mmio_devices: 2 - system-device-number_of_chips: 2 - system-device0-harvesting_mask: 2052 - system-device0-num_harvested_rows: 2 - system-device0-type: WORMHOLE_B0 - system-device1-harvesting_mask: 2176 - 
system-device1-num_harvested_rows: 2 - system-device1-type: WORMHOLE_B0 diff --git a/pybuda/pybuda/tti/tti.py b/pybuda/pybuda/tti/tti.py deleted file mode 100644 index 30fe90d0f..000000000 --- a/pybuda/pybuda/tti/tti.py +++ /dev/null @@ -1,262 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -from typing import Dict, List, Optional, Set, Any, Tuple, ClassVar -from dataclasses import dataclass, field - -from .archive import TTIArchive -import importlib -import inspect -import os -from pybuda.ttdevice import TTDevice, get_device_config - -from pybuda.config import CompilerConfig, _get_global_compiler_config -from pybuda.verify.config import VerifyConfig -from pybuda.utils import get_pybuda_git_hash, get_budabackend_git_hash, as_json, dict_as_json, list_as_json, optional_as_json - -from pybuda._C import DataFormat -from pybuda._C.backend_api import BackendDevice, DeviceMode, BackendType -from ..run.api import detect_available_devices - -from pybuda.optimizers import Optimizer -from pybuda.compiled_graph_state import CompiledGraphState - -import dataclasses -from dataclasses_json import dataclass_json, config - -from loguru import logger - -import torch - -@dataclass_json -@dataclasses.dataclass() -class TTDeviceImage: - """ - A TTDeviceImage defines all required state sourced from TTDevice to produce a TTI-archive. - """ - TTI_VERSION: ClassVar[str] = "1.1.0" - - # Static device state - version: str - device_image_name: str - arch: BackendDevice = field(metadata=as_json(BackendDevice)) - devtype: BackendType = field(metadata=as_json(BackendType)) - chip_ids: List[int] - fp32_fallback: DataFormat = field(metadata=as_json(DataFormat)) - optimizer: Optional[Optimizer] - training: bool - microbatch_size: int - microbatch_count: int - grid_size: List[int] - harvested_rows: Optional[List[int]] - - # snapshot of the state generated from pybuda compile - compiled_graph_state: CompiledGraphState - - # We probably don't need to serialize this but we'll do so to get a static - # snapshot of both sets of configs. 
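The TTDeviceImage dataclass being deleted here relies on dataclasses_json, so the entire image (minus tensors and module objects) can be dumped to device.json and rebuilt on load. A toy round trip using a hypothetical MiniImage with only plain fields, to show the to_dict/from_dict mechanics the archive depends on:

```python
from dataclasses import dataclass, field
from dataclasses_json import dataclass_json
from typing import List

@dataclass_json
@dataclass
class MiniImage:
    # Hypothetical, trimmed-down analogue of TTDeviceImage: plain fields
    # round-trip through to_dict()/from_dict() the same way device.json does.
    version: str
    device_image_name: str
    chip_ids: List[int] = field(default_factory=list)

img = MiniImage(version="1.1.0", device_image_name="tt0", chip_ids=[0])
blob = img.to_dict()                  # what gets serialized alongside the tensors
restored = MiniImage.from_dict(blob)  # what load_from_disk rebuilds
assert restored == img
```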
- verify_cfg: Optional[VerifyConfig] - compiler_cfg: CompilerConfig - - # snapshot of placed ops/modules onto device - loss_module: Optional[str] - module_name_to_metadata: Dict[str, Dict[str, str]] = field(default_factory=dict) - modules: List[Any] = field(default_factory=list) - - # generated config: - # Note: validation; error by default -> override with flag into warning - pybuda_pip_version_id: int = field(init=False) - pybuda_commit_hash: str = field(init=False) - budabackend_commit_hash: str = field(init=False) - - def __post_init__(self): - # generated attributes set here since TTDeviceImage default frozen - object.__setattr__(self, "pybuda_pip_version_id", importlib.metadata.version('pybuda')) - object.__setattr__(self, "pybuda_commit_hash", get_pybuda_git_hash()) - object.__setattr__(self, "budabackend_commit_hash", get_budabackend_git_hash()) - - @staticmethod - def get_harvested_rows(device: "TTDevice", device_cfg: "DeviceConfig") -> List[int]: - if device.devtype == BackendType.Golden: - harvested_rows = [] - else: - harvested_rows = device_cfg.get_harvested_cfg() - - if len(harvested_rows) > 0: - harvested_rows = [harvested_rows[c_id] for c_id in device.chip_ids] - - return harvested_rows - - - @staticmethod - def create_image_from_device( - device: "TTDevice", - training: bool, - microbatch_count: int, - verify_cfg: VerifyConfig, - compiler_cfg: CompilerConfig, - cpueval_outputs: Optional[List[torch.Tensor]] = None, - ) -> "TTDeviceImage": - device_cfg = device.get_device_config(compiler_cfg=compiler_cfg) - grid_size = device_cfg.grid_size - device._compiled_graph_state.cpueval_outputs = cpueval_outputs - - device_image = TTDeviceImage( - version=TTDeviceImage.TTI_VERSION, - device_image_name=device.name, - arch=device.arch, - devtype=device.devtype, - chip_ids=device.chip_ids, - fp32_fallback=device.fp32_fallback, - optimizer=device.optimizer, - training=training, - microbatch_size=device._compiled_graph_state.microbatch, - microbatch_count=microbatch_count, - compiled_graph_state=device._compiled_graph_state, - verify_cfg=verify_cfg, - compiler_cfg=compiler_cfg, - module_name_to_metadata={ - module.get_name(): { - "module": module.__module__, - "class": module.__class__.__name__, - "module_file_path": os.path.relpath(inspect.getfile(module.__class__), start=os.curdir) - } for module in device.modules - }, - loss_module=device.loss_module.get_name() if device.loss_module else None, - modules=device.modules, - grid_size=[grid_size.r, grid_size.c], - harvested_rows=TTDeviceImage.get_harvested_rows(device, device_cfg), - ) - return device_image - - @staticmethod - def load_from_disk(tti_file_path: str) -> "TTDeviceImage": - from .archive import TTIArchive - return TTIArchive.load_from_disk(tti_file_path) - - @staticmethod - def save_to_disk( - device_image, - tti_file_path: Optional[str] = None, - *args, - **kwargs, - ) -> "TTDeviceImage": - from .archive import TTIArchive - return TTIArchive.save_to_disk(device_image, tti_file_path, *args, **kwargs) - - @staticmethod - def validate_image_version_compatibility(device_image: "TTDeviceImage"): - runtime_pybuda_public_version_id = importlib.metadata.version('pybuda') - image_pybuda_public_version_id = device_image.pybuda_pip_version_id - assert runtime_pybuda_public_version_id == image_pybuda_public_version_id , ( - "Error: Saved image pybuda version does not match with runtime version" - ) - expected_version, actual_version = TTDeviceImage.TTI_VERSION, device_image.version - if actual_version is None: - raise ValueError(f"TTI 
Version mismatch: expected {expected_version}, but version not found in TTI. TTI recompilation required.") - - if not hasattr(device_image, "version") or expected_version != actual_version: - raise ValueError(f"TTI Version mismatch: expected {expected_version}, got {actual_version}. TTI recompilation required.") - - runtime_pybuda_commit_hash = get_pybuda_git_hash() - image_pybuda_commit_hash = device_image.pybuda_commit_hash - if runtime_pybuda_commit_hash and image_pybuda_commit_hash and runtime_pybuda_commit_hash != image_pybuda_commit_hash: - logger.warning( - f"Warning: runtime pybuda_commit_hash is {runtime_pybuda_commit_hash} but " - "device_image pybuda_commit_hash is {image_pybuda_commit_hash}" - ) - - runtime_budabackend_commit_hash = get_budabackend_git_hash() - image_budabackend_commit_hash = device_image.budabackend_commit_hash - if runtime_budabackend_commit_hash and image_budabackend_commit_hash and runtime_budabackend_commit_hash != image_budabackend_commit_hash: - logger.warning( - f"Warning: runtime budabackend_commit_hash is {runtime_budabackend_commit_hash} but " - "device_image budabackend_commit_hash is {image_budabackend_commit_hash}" - ) - - @staticmethod - def validate_grid_size(device: "TTDevice", device_image: "TTDeviceImage"): - compiler_cfg = _get_global_compiler_config() - detected_device_cfg = device.get_device_config(compiler_cfg) - detected_grid_size = [detected_device_cfg.grid_size.r, detected_device_cfg.grid_size.c] - if device.arch == BackendDevice.Wormhole_B0: - assert detected_grid_size[0] >= device_image.grid_size[0], f"Grid-size in device image do not match this device's grid-size. detected-grid-size: {detected_grid_size}, cached-grid-size: {device_image.grid_size}" - if detected_grid_size[0] > device_image.grid_size[0]: - logger.info(f"Detected grid size has more rows than cached grid size, overlays will be recompiled later. detected-grid-size: {detected_grid_size}, cached-grid-size: {device_image.grid_size}") - else: - assert detected_grid_size == device_image.grid_size, f"Grid-size in device image do not match this device's grid-size. detected-grid-size: {detected_grid_size}, cached-grid-size: {device_image.grid_size}" - - @staticmethod - def create_device_from_image(image: "TTDeviceImage") -> "TTDevice": - """ - Construct a fully-formed TTDevice back to the user. - """ - TTDeviceImage.validate_image_version_compatibility(image) - - device = TTDevice( - name=image.device_image_name, - chip_ids=image.chip_ids, - arch=image.arch, - devtype=image.devtype, - optimizer=image.optimizer, - fp32_fallback=image.fp32_fallback, - ) - device._compiled = True - device._compiled_graph_state = image.compiled_graph_state - device.modules = image.modules - device.device_mode = DeviceMode.RunOnly - - TTDeviceImage.validate_grid_size(device, image) - - for module in device.modules: - module._set_device(device) - if module.get_name() == image.loss_module: - device.loss_module = module - - return device - - def is_compiled_for_training(self): - return self.training - - def info(self): - """ - Return summary info for the compiled device image back to the user. - """ - print( - f""" - Image Info... - - Version Info: - - pybuda_version: {self.pybuda_pip_version_id} - - pybuda_commit: {self.pybuda_commit_hash} - - buda_backend_commit: {self.budabackend_commit_hash} - - Device Name: {self.device_image_name} - - Device Info... 
- - arch: {self.arch} - - chip_ids: {self.chip_ids} - - backend device type: {self.devtype} - - grid size: {self.grid_size} - - harvested rows: {self.harvested_rows if self.harvested_rows else "None"} - - Compilation Graph State... - - training: {self.training} - - ordered input shapes: {self.compiled_graph_state.ordered_input_shapes} - - ordered targets shapes: {self.compiled_graph_state.ordered_target_shapes} - """ - ) - - @staticmethod - def _get_original_shapes(shapes, microbatch) -> List[List[int]]: - original_shapes = [] - for input_shape in shapes: - original_shape = input_shape.copy() - if original_shape and original_shape[0] == 1: - original_shape[0] = microbatch - original_shapes.append(original_shape) - return original_shapes - - def get_input_shapes(self) -> List[List[int]]: - return TTDeviceImage._get_original_shapes(self.compiled_graph_state.ordered_input_shapes, self.compiled_graph_state.microbatch) - - def get_target_shapes(self) -> List[List[int]]: - return TTDeviceImage._get_original_shapes(self.compiled_graph_state.ordered_target_shapes, self.compiled_graph_state.microbatch) diff --git a/pybuda/pybuda/tti/utils.py b/pybuda/pybuda/tti/utils.py deleted file mode 100644 index 10deeec2a..000000000 --- a/pybuda/pybuda/tti/utils.py +++ /dev/null @@ -1,27 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import hashlib -import os - -def compute_file_checksum(file_path, chunk_size=8192): - hasher = hashlib.sha256() - with open(file_path, 'rb') as f: - while True: - chunk = f.read(chunk_size) # Read in chunks (default: 8KB) - if not chunk: - break - hasher.update(chunk) - return hasher.hexdigest() - - -def write_checksum_to_file(checksum, output_file_path): - with open(output_file_path, 'w') as f: - f.write(checksum) - -def read_checksum_from_file(checksum_file_name): - if not os.path.exists(checksum_file_name): - return None - with open(checksum_file_name, 'r') as f: - checksum = f.read().strip() - return checksum \ No newline at end of file diff --git a/pybuda/pybuda/tvm_to_python.py b/pybuda/pybuda/tvm_to_python.py index c3189c688..21f0c79dc 100644 --- a/pybuda/pybuda/tvm_to_python.py +++ b/pybuda/pybuda/tvm_to_python.py @@ -9,11 +9,8 @@ import torch import numpy as np -import tensorflow as tf -import onnx -import mxnet as mx -import pybuda._C.pattern_matcher as pypattern_matcher +# import pybuda._C.pattern_matcher as pypattern_matcher from pybuda.module import OnnxModule, PyBudaModule, TFLiteModule from pybuda.config import _get_global_compiler_config from pybuda.verify.config import _get_global_verify_config @@ -359,6 +356,38 @@ def populate_torch_tile_args(graph, nid, compiler_cfg): return args +def populate_torch_layernorm_args(graph, nid, compiler_cfg): + curr_node, args = _populate_torch_init_args(graph, nid) + + epsilon = float(curr_node["attrs"]["epsilon"][0][0]) + epsilon = round(epsilon, 10) + input_shape = curr_node["attrs"]["shape"][0][0] + dim = int(curr_node["attrs"]["axis"][0][0]) + normalized_shape = input_shape[dim] + args['attr'] = { + "normalized_shape":{ + "val": (normalized_shape,), + "inp_pos": 1, + }, + "epsilon" :{ + "val": epsilon, + "inp_pos": 4, + }, + } + + return args + +def populate_torch_dropout_args(graph, nid, training): + curr_node, args = _populate_torch_init_args(graph, nid) + + args['attr'] = { + "training": { + "val": training, + "inp_pos": 2, + }, + } + return args + def _populate_torch_init_args(graph, nid): curr_node = graph["nodes"][nid] @@ -425,6 +454,8 @@ def 
_populate_torch_init_args(graph, nid): "transpose" : "transpose", # "take" : "take", "where" : "where", + "layernorm" : "layernorm", + "pybuda_cpudevice.dropout" : "dropout", } pytorch_op_to_function_name = { @@ -477,6 +508,8 @@ def _populate_torch_init_args(graph, nid): "transpose" : "torch.transpose", # "take" : "torch.take", "where" : "torch.where", + "layernorm" : "torch.nn.functional.layer_norm", + "dropout" : "torch.nn.functional.dropout", } pytorch_ops_needing_arguments = { @@ -501,6 +534,8 @@ def _populate_torch_init_args(graph, nid): "tile" : populate_torch_tile_args, "transpose" : populate_torch_transpose_args, # "power" : populate_torch_power_args, + "layernorm" : populate_torch_layernorm_args, + "dropout" : populate_torch_dropout_args, } def populate_binary_stack_args(graph, nid, compiler_cfg): @@ -592,12 +627,12 @@ def populate_conv2d_transpose_args(graph, nid, compiler_cfg): assert all([x == 1 for x in dilation]), "Only supports dilation of 1" args.append(("dilation", f"{dilation[0]}",)) - assert int(node["attrs"]["groups"][0][0]) == 1, "Only supports group of 1" - kernel_size = [int(kernel) for kernel in node["attrs"]["kernel_size"][0]] - + in_channel = next((n['attrs']['shape'][0][0][0] for n in graph['nodes'] if n['name'] == 'model.weight'), None) groups = int(node["attrs"]["groups"][0][0]) + assert groups == 1 or (in_channel is not None and groups == in_channel), "Only supports group of 1 or in_channel" args.append(("groups", f"{groups}",)) - + + kernel_size = [int(kernel) for kernel in node["attrs"]["kernel_size"][0]] channel_last = int(node["attrs"]["data_layout"][0][0] == "NHWC") args.append(("channel_last", f"{channel_last}")) @@ -1190,7 +1225,6 @@ def populate_quantize_args(graph, nid, compiler_cfg): args.append(("out_dtype", "torch." + node['attrs']['out_dtype'][0][0])) args.append(("axis", f"{int(node['attrs']['axis'][0][0])}")) - print(f"Node quantize out dtype {node['attrs']['out_dtype'][0][0]}") return args def populate_dequantize_args(graph, nid, compiler_cfg): @@ -1206,7 +1240,6 @@ def populate_requantize_args(graph, nid, compiler_cfg): node = graph["nodes"][nid] args.append(("axis", f"{int(node['attrs']['axis'][0][0])}")) args.append(("out_dtype", "torch." 
+ node['attrs']['out_dtype'][0][0])) - print(f"Node Requantize out dtype {node['attrs']['out_dtype'][0][0]}") return args @@ -1771,7 +1804,7 @@ def make_parser_friendly_name(node, node_type): ) else: if torch.numel(tensor) == 1 and len(tensor.shape) == 0: - tensor = tensor.reshape((1, 1)) + tensor = tensor.reshape((1,)) if len(tensor.shape) > 4 and all([x == 1 for x in tensor.shape[0:-4]]): tensor = tensor.reshape(tensor.shape[-4:]) if requires_grad: @@ -1807,7 +1840,7 @@ def make_parser_friendly_name(node, node_type): ) else: if torch.numel(tensor) == 1 and len(tensor.shape) == 0: - tensor = tensor.reshape((1, 1)) + tensor = tensor.reshape((1,)) if len(tensor.shape) > 4 and all([x == 1 for x in tensor.shape[0:-4]]): tensor = tensor.reshape(tensor.shape[-4:]) params_from_tvm[node["buda_name"]] = tensor @@ -1832,10 +1865,12 @@ def make_parser_friendly_name(node, node_type): args = () argument_getter = pybuda_ops_needing_arguments if json_graph["device"] == "tt" else pytorch_ops_needing_arguments if op_type in argument_getter: - # if op_type == "dropout": - # args = argument_getter[op_type](graph=graph, nid=nid, training=is_training) - # else: - args = argument_getter[op_type](graph=graph, nid=nid, compiler_cfg=compiler_cfg) + if op_type == "dropout" and json_graph["device"] != "tt": + if is_training: + logger.warning("Dropout op cannot be cpu fallback in training mode due to the absence of rate/p(probability) argument and it may also result in pcc mismatch") + args = argument_getter[op_type](graph=graph, nid=nid, training=is_training) + else: + args = argument_getter[op_type](graph=graph, nid=nid, compiler_cfg=compiler_cfg) assert args is not None if args == () and json_graph["device"] == "cpu" and op_type not in argument_getter: @@ -1905,6 +1940,13 @@ def make_parser_friendly_name(node, node_type): input_node["users"] = [] input_node["users"].append(nid) input_names.append(input_node["buda_name"]) + # Handle concatenate case when a single node name in referenced twice in the input list + if node["name"] == "pybuda.concatenate" and len(input_names) == 1: + inp_shape = graph["nodes"][node["inputs"][input_port][0]]["attrs"]["shape"][0][0] + out_shape = node["attrs"]["shape"][0][0] + + if inp_shape[:2] == out_shape[:2] and inp_shape[2] * 2 == out_shape[2]: + input_names = [input_names[0], input_names[0]] ops[node["nid"]] = Operation( function_name=function_name, @@ -1948,110 +1990,110 @@ def replace_node_name(orig, new): submodule = False param_names = {} const_names = {} - if compiler_cfg.tvm_module_to_num_patterns.get(framework_mod.get_name(), None): - match_subgraph_patterns = compiler_cfg.tvm_module_to_num_patterns[framework_mod.get_name()] + # if compiler_cfg.tvm_module_to_num_patterns.get(framework_mod.get_name(), None): + # match_subgraph_patterns = compiler_cfg.tvm_module_to_num_patterns[framework_mod.get_name()] - ret = pypattern_matcher.lower_json_to_pattern_matcher(graph, match_subgraph_patterns) - subgraph_matches = ret.subgraph_matches + # ret = pypattern_matcher.lower_json_to_pattern_matcher(graph, match_subgraph_patterns) + # subgraph_matches = ret.subgraph_matches - if len(subgraph_matches) > 1: - submodule = True - - matched_params = {} - matched_consts = {} - matched_ops = {} - submodule_input_ports = {} - submodule_outputs = {} - submodule_outputs_requiring_batch_dim_fix = [] - submodule_output_shapes = {} - - # Collect submodule IOs - for orig_nid in subgraph_matches[0].keys(): - node = graph["nodes"][orig_nid] - if "num_inputs" in node["attrs"]: - for input_port in 
range(int(node["attrs"]["num_inputs"])): - if node["inputs"][input_port][0] not in subgraph_matches[0]: - submodule_input_ports[orig_nid] = input_port - - node = graph["nodes"][orig_nid] - if "users" in node: - for user in node["users"]: - if user not in subgraph_matches[0] and node["op"] != "*": - submodule_outputs[node["nid"]] = node["buda_name"] - if node["buda_shape"][0] != 1: - submodule_outputs_requiring_batch_dim_fix.append(node["buda_name"]) - - # add ops for each submodule call - idx = max(sorted(submodule_input_ports)) + 0.5 - input_nids = list(sorted(submodule_input_ports.keys())) - - input_nodes = [graph["nodes"][input_nid] if submodule_input_ports[input_nid] == -1 else graph["nodes"][graph["nodes"][input_nid]["inputs"][submodule_input_ports[input_nid]][0]] for input_nid in input_nids] - submodule_inputs = {input_node["nid"]:input_node["buda_name"] for input_node in input_nodes} - activations = [input_node_name for _, input_node_name in sorted(graph_input_names.items())] + # if len(subgraph_matches) > 1: + # submodule = True + + # matched_params = {} + # matched_consts = {} + # matched_ops = {} + # submodule_input_ports = {} + # submodule_outputs = {} + # submodule_outputs_requiring_batch_dim_fix = [] + # submodule_output_shapes = {} + + # # Collect submodule IOs + # for orig_nid in subgraph_matches[0].keys(): + # node = graph["nodes"][orig_nid] + # if "num_inputs" in node["attrs"]: + # for input_port in range(int(node["attrs"]["num_inputs"])): + # if node["inputs"][input_port][0] not in subgraph_matches[0]: + # submodule_input_ports[orig_nid] = input_port + + # node = graph["nodes"][orig_nid] + # if "users" in node: + # for user in node["users"]: + # if user not in subgraph_matches[0] and node["op"] != "*": + # submodule_outputs[node["nid"]] = node["buda_name"] + # if node["buda_shape"][0] != 1: + # submodule_outputs_requiring_batch_dim_fix.append(node["buda_name"]) + + # # add ops for each submodule call + # idx = max(sorted(submodule_input_ports)) + 0.5 + # input_nids = list(sorted(submodule_input_ports.keys())) + + # input_nodes = [graph["nodes"][input_nid] if submodule_input_ports[input_nid] == -1 else graph["nodes"][graph["nodes"][input_nid]["inputs"][submodule_input_ports[input_nid]][0]] for input_nid in input_nids] + # submodule_inputs = {input_node["nid"]:input_node["buda_name"] for input_node in input_nodes} + # activations = [input_node_name for _, input_node_name in sorted(graph_input_names.items())] - ops[idx] = Operation( - function_name="self.layers[0]", - output_name="layer_0", - input_names=activations, - ) - ops[idx].is_submodule_call = True - - output_nids = list(submodule_outputs.keys()) - assert len(output_nids) == 1, "TODO" + # ops[idx] = Operation( + # function_name="self.layers[0]", + # output_name="layer_0", + # input_names=activations, + # ) + # ops[idx].is_submodule_call = True + + # output_nids = list(submodule_outputs.keys()) + # assert len(output_nids) == 1, "TODO" - for i in range(1, len(subgraph_matches)): - #if the input node is in the submodule - activations = [] - for input_nid in input_nids: - if submodule_input_ports[input_nid] == -1: - matched_nid = subgraph_matches[i][input_nid] - else: - matched_user = subgraph_matches[i][input_nid] - matched_nid = graph["nodes"][matched_user]["inputs"][submodule_input_ports[input_nid]][0] - - idx = matched_nid + 0.5 - activations.append(graph["nodes"][matched_nid]["buda_name"]) - - # unlike ops, submodules should not have repeated inputs - activations = list(dict.fromkeys(activations)) - ops[idx] = 
Operation( - function_name=f"self.layers[{i}]", - output_name=f"layer_{i}", - input_names=activations, - ) - ops[idx].is_submodule_call = True - - # build submodule param / op dicts, remove from main - for orig_nid in subgraph_matches[0].keys(): - if orig_nid in params: - matched_params[orig_nid] = params[orig_nid] - param_name = params[orig_nid][0] - param_names[param_name] = (f"layer_{0}", param_name) - del params[orig_nid] - for index, subgraph in enumerate(subgraph_matches[1:]): - param_names[params[subgraph[orig_nid]][0]] = (f"layer_{index + 1}", param_name) - del params[subgraph[orig_nid]] - - if orig_nid in constants: - matched_consts[orig_nid] = constants[orig_nid] - const_name = constants[orig_nid][0] - const_names[const_name] = (f"layer_{0}", const_name) - del constants[orig_nid] - for index, subgraph in enumerate(subgraph_matches[1:]): - const_names[constants[subgraph[orig_nid]][0]] = (f"layer_{index + 1}", const_name) - del constants[subgraph[orig_nid]] - - if orig_nid in ops: - matched_ops[orig_nid] = ops[orig_nid] - del ops[orig_nid] - for subgraph in subgraph_matches[1:]: - del ops[subgraph[orig_nid]] - - #replace references to outputs of each submodule with submodule - for idx, subgraph in enumerate(subgraph_matches): - name_to_replace = graph["nodes"][subgraph[output_nids[0]]]["buda_name"] - - replace_node_name(name_to_replace, f"layer_{idx}") + # for i in range(1, len(subgraph_matches)): + # #if the input node is in the submodule + # activations = [] + # for input_nid in input_nids: + # if submodule_input_ports[input_nid] == -1: + # matched_nid = subgraph_matches[i][input_nid] + # else: + # matched_user = subgraph_matches[i][input_nid] + # matched_nid = graph["nodes"][matched_user]["inputs"][submodule_input_ports[input_nid]][0] + + # idx = matched_nid + 0.5 + # activations.append(graph["nodes"][matched_nid]["buda_name"]) + + # # unlike ops, submodules should not have repeated inputs + # activations = list(dict.fromkeys(activations)) + # ops[idx] = Operation( + # function_name=f"self.layers[{i}]", + # output_name=f"layer_{i}", + # input_names=activations, + # ) + # ops[idx].is_submodule_call = True + + # # build submodule param / op dicts, remove from main + # for orig_nid in subgraph_matches[0].keys(): + # if orig_nid in params: + # matched_params[orig_nid] = params[orig_nid] + # param_name = params[orig_nid][0] + # param_names[param_name] = (f"layer_{0}", param_name) + # del params[orig_nid] + # for index, subgraph in enumerate(subgraph_matches[1:]): + # param_names[params[subgraph[orig_nid]][0]] = (f"layer_{index + 1}", param_name) + # del params[subgraph[orig_nid]] + + # if orig_nid in constants: + # matched_consts[orig_nid] = constants[orig_nid] + # const_name = constants[orig_nid][0] + # const_names[const_name] = (f"layer_{0}", const_name) + # del constants[orig_nid] + # for index, subgraph in enumerate(subgraph_matches[1:]): + # const_names[constants[subgraph[orig_nid]][0]] = (f"layer_{index + 1}", const_name) + # del constants[subgraph[orig_nid]] + + # if orig_nid in ops: + # matched_ops[orig_nid] = ops[orig_nid] + # del ops[orig_nid] + # for subgraph in subgraph_matches[1:]: + # del ops[subgraph[orig_nid]] + + # #replace references to outputs of each submodule with submodule + # for idx, subgraph in enumerate(subgraph_matches): + # name_to_replace = graph["nodes"][subgraph[output_nids[0]]]["buda_name"] + + # replace_node_name(name_to_replace, f"layer_{idx}") # Some float types (e.g. 
tf.bfloat16) are not compatible with numpy # We must signal to the PyBudaWriter if the model contains these types so it can implement the workaround @@ -2069,7 +2111,10 @@ def replace_node_name(orig, new): current_module_name += f"_{json_graph['device']}_{graph_index}" if json_graph["device"] == "tt": - writer = PyBudaWriter(current_module_name, framework, contains_incompatible_np_floats=contains_incompatible_np_floats) + delete_inputs = not ((verify_cfg is not None and verify_cfg.verify_all) or compiler_cfg.enable_op_level_comparision) + if not delete_inputs: + logger.warning("Preserving intermediate tensor values in PyBudaModule forward may cause out-of-memory issues") + writer = PyBudaWriter(current_module_name, framework, contains_incompatible_np_floats=contains_incompatible_np_floats, delete_inputs=delete_inputs) else: writer = PyTorchWriter(current_module_name, source_framework=framework) diff --git a/pybuda/pybuda/utils.py b/pybuda/pybuda/utils.py index 004624bd8..84fb40e5a 100644 --- a/pybuda/pybuda/utils.py +++ b/pybuda/pybuda/utils.py @@ -212,6 +212,26 @@ def get_budabackend_git_hash() -> Optional[str]: except: return None +def budabackend_path() -> str: + if "BUDA_HOME" in os.environ: + return os.environ["BUDA_HOME"] + + if os.path.exists(os.getcwd() + '/third_party/budabackend'): + # must be in pybuda root + return "third_party/budabackend/" + else: + return "" + + +def resolve_device_descriptor_path(device_yaml_override: str) -> str: + if os.path.isfile(device_yaml_override): + return device_yaml_override + elif os.path.isfile(budabackend_path() + f"device/{device_yaml_override}"): + return budabackend_path() + f"device/{device_yaml_override}" + else: + raise FileNotFoundError(f"Device descriptor file not found: {device_yaml_override}") + + def get_buda_compile_and_runtime_configs() -> Dict[str, str]: """ Capture compile-time and runtime environment variables used to compile and run on the device.
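The budabackend_path / resolve_device_descriptor_path helpers added to pybuda/pybuda/utils.py above centralize how a device descriptor override is located: an existing file path is returned as-is, otherwise the name is looked up under the backend's device/ directory, and anything else raises FileNotFoundError. A minimal usage sketch, assuming BUDA_HOME is set and using a hypothetical descriptor file name that exists under that directory:

import os
from pybuda.utils import resolve_device_descriptor_path

# Hypothetical backend checkout; BUDA_HOME takes precedence over third_party/budabackend.
os.environ["BUDA_HOME"] = "/opt/budabackend/"

# A bare file name that is not a file on disk is resolved against <budabackend>/device/.
descriptor = resolve_device_descriptor_path("my_device_descriptor.yaml")
print(descriptor)  # expected: /opt/budabackend/device/my_device_descriptor.yaml

Note that budabackend_path() returns BUDA_HOME verbatim, so the trailing slash matters when the two are concatenated.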
diff --git a/pybuda/pybuda/verify/__init__.py b/pybuda/pybuda/verify/__init__.py index 4e45a15dd..96a50965c 100644 --- a/pybuda/pybuda/verify/__init__.py +++ b/pybuda/pybuda/verify/__init__.py @@ -2,5 +2,4 @@ # SPDX-License-Identifier: Apache-2.0 from .config import VerifyConfig, TestKind -from .verify import verify_net2pipe, do_verify, verify_golden, _generate_random_losses, _run_pytorch_backward, get_intermediate_tensors -from .backend import verify_module, verify_module_pipeline +from .verify import do_verify, verify_golden, _generate_random_losses, _run_pytorch_backward, get_intermediate_tensors diff --git a/pybuda/pybuda/verify/backend.py b/pybuda/pybuda/verify/backend.py deleted file mode 100644 index 39ebd705c..000000000 --- a/pybuda/pybuda/verify/backend.py +++ /dev/null @@ -1,868 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -""" -Verify modules, or pipeline of modules, using the backend -""" -import os - -from typing import Tuple, Dict, List, Optional -import queue - -from loguru import logger -from pybuda.tvm_utils import map_tf_dtype_to_pt, flatten_inputs -import torch -import tensorflow as tf - -import pybuda -from pybuda import Module -from ..tensor import Tensor, TensorShape, to_pt_tensors, to_buda_tensors, remove_microbatch -from .config import VerifyConfig, should_waive_gradient, _get_global_verify_config -from pybuda._C.backend_api import BackendType, DeviceMode -from ..module import PyTorchModule, Module, PyBudaModule -from ..config import CompilerConfig, _get_global_compiler_config, CompileDepth, _set_global_compiler_config -from ..parameter import Parameter -from ..device import Device -from .cpueval import cpueval_inference, cpueval_training -from .utils import CPUCombiner -from ..pybudaglobal import get_devices - -def pybuda_override_veto(compiler_cfg): - import json - - # 1. Tackle with global compiler configurations - new_g_compiler_config = CompilerConfig() - - # Override the default compiler config with the user specified config - logger.info("Overriding general compiler configs (ones not specified are all removed):") - for key, value in json.loads(os.environ["PYBUDA_OVERRIDES_VETO"])["general_conf"].items(): - if value is not None and value != "": - logger.info(" Overriding '{}' key with '{}'", key, value) - new_g_compiler_config.__setattr__(key, value) - elif value is not None and value == "": - current_value = getattr(compiler_cfg, key) - logger.info(" Using default key '{}' with '{}' value", key, current_value) - new_g_compiler_config.__setattr__(key, current_value) - else: - assert False, "Shouldn't hit this case" - - compiler_cfg = new_g_compiler_config - _set_global_compiler_config(new_g_compiler_config) - - # 2. 
Tackle with compiler configurations set through environment - # variables - - # Get currently set compiler configurations as environment variables - initial_env_var_config_state = json.loads(os.environ["PYBUDA_OVERRIDES_VETO"])["environ_conf"] - current_env_var_config_state = {key: value for key, value in os.environ.items() if key.startswith("PYBUDA_") and key != "PYBUDA_OVERRIDES_VETO"} - - # Remove and update reference env configs - logger.info("Overriding env var compiler configs:") - for key, value in current_env_var_config_state.items(): - if key not in initial_env_var_config_state: - logger.info(" Removing '{}' key from env var config", key) - del os.environ[key] - elif key in initial_env_var_config_state and initial_env_var_config_state[key] != "" and initial_env_var_config_state[key] != value: - logger.info(" Overriding '{}' key with '{}'", key, initial_env_var_config_state[key]) - os.environ[key] = initial_env_var_config_state[key] - elif key in initial_env_var_config_state and initial_env_var_config_state[key] == "": - logger.info(" Using default key '{}' with '{}' value", key, value) - os.environ[key] = value - else: - logger.info(" Keeping '{}' key as '{}' value", key, value) - - # Add new env configs - for key, value in initial_env_var_config_state.items(): - os.environ[key] = value - if value == "": - continue - if key not in current_env_var_config_state: - logger.info(" Adding '{}' key with '{}' value", key, value) - os.environ[key] = value - - return compiler_cfg - - -def _generate_random_inputs(input_shapes: List[Tuple], input_params: List[Dict], verify_cfg: VerifyConfig, uniform_inputs: bool, inputs_centered_on_zero: bool) -> List[Tuple[Tensor, ...]]: - """ - Generate random inputs with shapes and input parameters provided - """ - inputs = [] - for _ in range(verify_cfg.total_number_of_inputs()): - - def create_tensor(shape, i): - dtype = torch.float32 - requires_grad = True - dev_data_format = None - if len(input_params) > i: - if "data_format" in input_params[i]: - dtype = input_params[i]["data_format"] - if "requires_grad" in input_params[i]: - requires_grad = input_params[i]["requires_grad"] - if "dev_data_format" in input_params[i]: - dev_data_format = input_params[i]["dev_data_format"] - - if type(dtype) == tf.DType: - dtype = map_tf_dtype_to_pt(dtype) - - if dtype in [torch.int8, torch.int, torch.int64]: - return Tensor.create_from_torch(torch.randint(high=25000, size=shape, dtype=dtype), dev_data_format=dev_data_format) - - # avoid zeros - if uniform_inputs: - t = torch.rand(*shape, dtype=dtype) - if inputs_centered_on_zero: - t = t - 0.5 - else: - mean = 0.0 if inputs_centered_on_zero else 0.5 - t = torch.abs(torch.normal(mean=mean, std=0.1, size=shape, dtype=dtype)) + 0.00001 - t = t.detach() - t.requires_grad = requires_grad - return Tensor.create_from_torch(t) - - inputs.append(tuple(create_tensor(shape, i) for i, shape in enumerate(input_shapes))) - - return inputs - -def _translate_framework_modules( - modules: List[Module], - device_types: List[str], - module_to_device: Dict[int, int], - pipe_inputs: Tuple[Tensor, ...], - dev_count: int, - verify_cfg: VerifyConfig, - compiler_cfg: CompilerConfig): - """ - Translate any framework modules to PyBuda python - """ - modules_copy = [] - i = 0 - while i < len(modules): - module = modules[i] - if verify_cfg.verify_pipeline_result_vs_framework: - modules_copy.append(module) - - pt_inputs = to_pt_tensors(pipe_inputs) - if device_types[module_to_device[i]] == "CPUDevice": - pipe_inputs = module.forward(*pt_inputs) - 
elif not isinstance(module, PyBudaModule) and device_types[module_to_device[i]] == "TTDevice": - from pybuda.tvm_to_python import generate_pybuda_module - translated_modules, translated_device_types, _ = generate_pybuda_module(module, pt_inputs, compiler_cfg, module.name, verify_cfg) - modules[i:i+1] = translated_modules - device_types[module_to_device[i]:module_to_device[i]+1] = translated_device_types - if len(translated_modules) > 1: - extra_devices = len(translated_modules) - 1 - dev_count += extra_devices - updated_module_to_device = {} - for k, v in module_to_device.items(): - if k < i: - updated_module_to_device[k] = v - elif k == i: - for new_device in range(len(translated_modules)): - updated_module_to_device[k + new_device] = v + new_device - else: - updated_module_to_device[k + extra_devices] = v + extra_devices - module_to_device = updated_module_to_device - i += extra_devices - i += 1 - - return modules, device_types, module_to_device, dev_count, modules_copy - - - - -def _update_and_randomize_params( - modules: List[Module], - dev_count: int, - module_to_device: Dict[int, int], - all_parameters: List[List[Parameter]], - params_centered_on_zero: bool, - scale_params: float, - uniform_inputs: bool) -> List[Dict[str, torch.Tensor]]: - """ - Randomize parameters, and pick up new ones from changes in compilcation - """ - params_changed = False - for m in modules: - params_changed = params_changed or any([param not in all_parameters[0] for param in m.get_parameters()]) - - if params_changed: - new_all_parameters: List[List[Parameter]] = [ [] for _ in range(dev_count) ] - for i, m in enumerate(modules): - if hasattr(m.device, "get_parameters"): - new_all_parameters[module_to_device[i]].extend(m.device.get_parameters()) - else: - new_all_parameters[module_to_device[i]].extend(m.get_parameters()) - all_parameters = new_all_parameters - - # Randomize (if values are not set already), and save parameters - all_saved_parameters: List[Dict[str, torch.Tensor]] = [] - for dev_index in range(dev_count): - all_saved_parameters.append( {} ) - for p in all_parameters[dev_index]: - if not p.has_value(): - if uniform_inputs: - t = torch.rand(*p.shape.get_pytorch_shape(), dtype=p.pt_data_format) - if params_centered_on_zero: - t -= 0.5 - else: - if params_centered_on_zero: - t = torch.normal(mean=0.0, std=0.1, size=p.shape.get_pytorch_shape(), dtype=p.pt_data_format) - else: - t = torch.abs(torch.normal(mean=0.5, std=0.1, size=p.shape.get_pytorch_shape(), dtype=p.pt_data_format)) + 0.00001 - assert scale_params > 0 - t /= scale_params - - # TODO: add data types, control over requires_grad - t = t.detach() - t.requires_grad = True - p.set_value(t) - else: - t = p.value() - #saved_t = t.type(torch.bfloat16) - #saved_t = saved_t.detach() - saved_t = t.clone().detach() - - saved_t.requires_grad = t.requires_grad - all_saved_parameters[dev_index][p.get_name()] = saved_t - - return all_saved_parameters - -def _update_parameters_from_checkpoint(all_saved_parameters: List[Dict[str, torch.Tensor]], device_index: int, checkpoint: Dict[str, Tensor]): - """ - Updated saved parameters from device checkpoint, bringing the golden back into "sync" - """ - device_parameters = all_saved_parameters[device_index] - for name in device_parameters: - assert name in checkpoint - t = checkpoint[name].value().detach() - #t = t.type(torch.float32) - t.requires_grad = True - # NB: we want to update the parameter in-place so any saved optimizer state is retained - all_saved_parameters[device_index][name].data.copy_(t.data) - 
-def _create_devices(modules: List[Module], device_types: List[str], module_to_device: Dict[int, int], sample_inputs: List[Tensor] ,verify_cfg: VerifyConfig, all_parameters: List[List[Parameter]], loss_module: Module) -> List[Device]: - """ - Create devices to run the modules on - """ - def _wrap_inputs(inputs): - if not isinstance(inputs, (list, tuple)): - inputs = (inputs, ) - return inputs - - inputs = sample_inputs - from ..cpudevice import CPUDevice - from ..gpudevice import GPUDevice - devices: List[Device] = [] - for index, device_type in enumerate(device_types): - module = modules[module_to_device[index]] - if verify_cfg.test_kind.is_training(): - if verify_cfg.optimizer: - optimizer_type = pybuda.optimizers.get_optimizer_type_from_string(verify_cfg.optimizer["type"]) - optimizer = optimizer_type( - parameters=all_parameters[index], **verify_cfg.optimizer["params"] - ) - else: - optimizer = None - - scheduler = verify_cfg.scheduler["type"]( - optimizer - ) if verify_cfg.scheduler else None - - if device_type == "CPUDevice" or device_type == "GPUDevice": - def optimizer_f(m: torch.nn.Module): - assert verify_cfg.optimizer["type"] == "sgd" - lr = verify_cfg.optimizer["params"]["learning_rate"] - opt_klass = torch.optim.SGD - if isinstance(m, torch.nn.Module): - opt_klass = torch.optim.SGD - return opt_klass(m.parameters(), lr=lr) - elif isinstance(m, (tf.keras.Model, tf.keras.layers.Layer)): - opt_klass = tf.keras.optimizers.legacy.SGD - return opt_klass(lr=lr) - else: - return opt_klass(m, lr=lr) - - def scheduler_f(o): - if scheduler is None: - return - elif isinstance(o, torch.optim.Optimizer): - return scheduler.get_pytorch_scheduler(o) - else: - logger.warning("Scheduler not yet supported for other types of optimizers") - - if device_type == "CPUDevice": - tt0 = CPUDevice(f"cpu{index}", optimizer_f=optimizer_f, scheduler_f=scheduler_f) - devices.append(tt0) - elif device_type == "GPUDevice": - tt0 = GPUDevice(f"gpu{index}", optimizer_f=optimizer_f, scheduler_f=scheduler_f) - devices.append(tt0) - elif device_type == "TTDevice": - if verify_cfg.devmode == DeviceMode.RunOnly: - tt0 = pybuda.TTDevice.load_image(img_path=verify_cfg.tti_path) - else: - tt0 = pybuda.TTDevice(f"tt{index}", devtype=verify_cfg.devtype, arch=verify_cfg.arch, device_mode=verify_cfg.devmode, optimizer=optimizer, scheduler=scheduler, num_chips=verify_cfg.num_chips, chip_ids=verify_cfg.chip_ids, fp32_fallback=verify_cfg.fp32_fallback) - devices.append(tt0) - elif device_type == "-": - assert len(devices) > 0, "At least one device must be chosen before '-' is used" - else: - raise RuntimeError("Unsupported device type") - - if optimizer: - optimizer.set_optimizer_parameters() - - else: - if device_type == "CPUDevice": - inputs = to_pt_tensors(inputs) - input_dtypes = [inp.dtype for inp in inputs] - tt0 = CPUDevice(f"cpu{index}", input_dtypes=input_dtypes) - devices.append(tt0) - elif device_type == "GPUDevice": - inputs = to_pt_tensors(inputs) - tt0 = GPUDevice(f"gpu{index}") - devices.append(tt0) - elif device_type == "TTDevice": - inputs = to_buda_tensors(inputs) - if verify_cfg.devmode == DeviceMode.RunOnly: - tt0 = pybuda.TTDevice.load_image(img_path=verify_cfg.tti_path) - else: - tt0 = pybuda.TTDevice(f"tt{index}", devtype=verify_cfg.devtype, arch=verify_cfg.arch, device_mode=verify_cfg.devmode, num_chips=verify_cfg.num_chips, chip_ids=verify_cfg.chip_ids, fp32_fallback=verify_cfg.fp32_fallback) - devices.append(tt0) - elif device_type == "-": - assert len(devices) > 0, "At least one device must be chosen 
before '-' is used" - else: - raise RuntimeError("Unsupported device type") - - inputs = _wrap_inputs(module.forward(*inputs)) - - for index, module in enumerate(modules): - target_device = devices[module_to_device[index]] - if verify_cfg.devmode == DeviceMode.RunOnly and isinstance(target_device, pybuda.TTDevice): - continue - target_device.place_module(module) - - if verify_cfg.test_kind.is_training() and loss_module is not None: - if not (verify_cfg.devmode == DeviceMode.RunOnly and isinstance(devices[-1], pybuda.TTDevice)): - devices[-1].place_loss_module(loss_module) - return devices - -def _setup_training(devices: List[Device], first_inputs: Tuple[Tensor], verify_cfg: VerifyConfig, add_loss: bool) -> List[Tensor]: - """ - Create loss device / module, generate random targets. - """ - from ..cpudevice import CPUDevice - from ..gpudevice import GPUDevice - target_tensors = None - outputs = remove_microbatch(first_inputs) - for d in devices: - if type(d) == CPUDevice: - outputs = to_pt_tensors(outputs) - outputs = d._modules_forward(*outputs) - elif type(d) == GPUDevice: - outputs = to_pt_tensors(outputs) - outputs = (output.to(d.device) for output in outputs) - outputs = d._modules_forward(*outputs) - outputs = (output.to("cpu") for output in outputs) - elif isinstance(d, pybuda.TTDevice): - outputs = to_buda_tensors(outputs) - _, outputs, _, _, target_tensors = d.generate_graph(*outputs, return_intermediate="False", graph_name="PreVerify", compiler_cfg=CompilerConfig(), trace_only=True, verify_cfg=verify_cfg) - else: - raise RuntimeError(f"Unsupported device type: {type(d)}") - output_shapes = [out.shape for out in outputs] - - # Create loss modules - if add_loss: - # Need to add CPU loss calculation - cpu = CPUDevice("cpu_loss", - optimizer_f = None, # No parameters in cpu modules - scheduler_f = None) - - identity = PyTorchModule("combiner0", CPUCombiner()) - cpu.place_module(identity) - - loss_module = PyTorchModule("l1loss", torch.nn.L1Loss()) - cpu.place_loss_module(loss_module) - - - # Generate random targets - targets = [] - for _ in range(verify_cfg.total_number_of_inputs()): - if add_loss: - # If we must add a loss (i.e no loss module is provided), CPUCombiner - # sill pad all outputs to the shape of the largest output, and return the - # sum of those tensors. And so the target must match that shape too. - target = [] - def max_shape(shapes): - mshp = [1]*4 - for i in range(-1, -4, -1): - mx = 1 - for shape in shapes: - if len(shape) < -i: - continue - if shape[i] > mx: - mx = shape[i] - mshp[i] = mx - return tuple(mshp) - - output_shape = max_shape(output_shapes) - if isinstance(output_shape, TensorShape): - output_shape = output_shape.get_pytorch_shape() - target.append(torch.normal(mean=0.0, std=0.5, size=output_shape)) - cpu.push_to_target_inputs(tuple(target)) - else: - assert len(target_tensors) > 0, "Target tensors are missing even though loss module exists on this device" - target = [torch.normal(mean=0.0, std=0.5, size=target_tensor.shape.get_pytorch_shape()) for target_tensor in target_tensors] - devices[-1].push_to_target_inputs(tuple(target)) - - targets.append(target) - - return targets - -def _verify_training( - devices: List[Device], - inputs: List[Tuple[Tensor, ...]], - targets: List[Tensor], - all_saved_parameters: List[Dict[str, torch.Tensor]], - checkpoint_q: queue.Queue, - verify_cfg: VerifyConfig): - """ - Verify training results vs. 
cpu eval - """ - assert len(inputs) % verify_cfg.steps == 0, "Total inputs should be divisible by number of steps" - step_size = len(inputs) // (verify_cfg.steps * verify_cfg.epochs) - - assert step_size % verify_cfg.accumulation_steps == 0, "Step size should be divisible by number of accumulation_steps" - acc_step_size = step_size // verify_cfg.accumulation_steps - - from pybuda.op.eval import compare_tensor_to_golden # avoid circular import - test_pass = True - fail_on_first = "SHOW_ALL_FAILS" not in os.environ - - opt_lr = verify_cfg.optimizer['params']['learning_rate'] if verify_cfg.optimizer else None - - optimizer_index = 0 - try: - for epoch in range(verify_cfg.epochs): - for step in range(verify_cfg.steps): - logger.debug("Running cpueval training for step {}", step) - step_inputs = inputs[step*step_size+epoch*verify_cfg.steps*step_size : (step+1)*step_size+epoch*verify_cfg.steps*step_size] - step_targets = targets[step*step_size+epoch*verify_cfg.steps*step_size : (step+1)*step_size+epoch*verify_cfg.steps*step_size] - - eval_res = cpueval_training(step_inputs, all_saved_parameters, step_targets, verify_cfg.sequential, verify_cfg.scale_loss, lr=opt_lr if epoch==0 else None) - - # Check input gradients - if verify_cfg.enable_input_gradient_checking: - logger.debug("Verifying input gradients") - for iter in range(step_size): - res = verify_cfg._input_gradient_queue.get() - for i, (golden, result) in enumerate(zip(eval_res.devices[0].grad[iter].inputs, res)): - - if isinstance(result, torch.Tensor): - result = Tensor.create_from_torch(result) - - test_pass &= compare_tensor_to_golden(f"Iteration {iter} - Input {i} gradient", golden, result.value().type(golden.dtype), is_buda=True, verify_cfg=verify_cfg) - if fail_on_first: - assert test_pass, f"Data mismatch on iteration {iter} - Input {i} gradient" - - # Check parameter gradients - if verify_cfg.enable_parameter_gradient_checking: - logger.debug("Verifying parameter gradients") - for acc_step in range(step_size // acc_step_size): - res = verify_cfg._parameter_gradient_queue.get() - for device_index, _ in enumerate(devices): - gradients = res[device_index] - - for name in gradients: - calc = gradients[name].value() - # cpu eval gives us gradients on every step, but run_training gives us one on each - # accumulation step. So, we need to compare the last one in the step. 
- grad_index = (acc_step + 1) * acc_step_size - 1 - golden = eval_res.devices[device_index].grad[grad_index].parameters[name] - - warning_only = should_waive_gradient(name, verify_cfg) - test_pass &= compare_tensor_to_golden(f"Acc step {acc_step}, device {device_index} - Gradient for parameter {name}", golden, calc.type(golden.dtype), - is_buda=True, verify_cfg=verify_cfg, - warning_only=warning_only) - if fail_on_first: - assert test_pass, f"Data mismatch on acc step {acc_step}, device {device_index} - Gradient for parameter {name}" - - # Check weights (checkpoints and last) - optimizer_index += 1 - if (step == verify_cfg.steps - 1) or (verify_cfg.checkpoint_interval > 0 and optimizer_index % verify_cfg.checkpoint_interval == 0): - checkpoint_name = "Final" if (step == verify_cfg.steps - 1) else f"Optimizer step {optimizer_index}" - logger.debug("Verifying parameter checkpoint {}", checkpoint_name) - checkpoint = checkpoint_q.get() - for device_index, _ in enumerate(devices): - param_checkpoint = checkpoint[device_index] - - for name in param_checkpoint: - calc = param_checkpoint[name].value() - golden = eval_res.devices[device_index].final_parameters[name] - warning_only = should_waive_gradient(name, verify_cfg) - test_pass &= compare_tensor_to_golden(f"{checkpoint_name} parameter {name}, device {device_index}", golden, calc, - is_buda=True, verify_cfg=verify_cfg, - warning_only=warning_only) - if fail_on_first: - assert test_pass, f"Data mismatch on {checkpoint_name} parameter {name}, device {device_index}" - - if step != verify_cfg.steps - 1: - # Not the last. Let's sync weights back to cpueval to continue to track - logger.debug("Syncing device param checkpoint to pytorch golden") - _update_parameters_from_checkpoint(all_saved_parameters, device_index, param_checkpoint) - - # epoch boundary scheduler step - devices = get_devices() - for d in devices: - scheduler = d.get_pytorch_scheduler() - if scheduler: - scheduler.step() - - finally: - assert test_pass, f"Training data mismatch detected" - - -def get_framework_pipeline_outputs(inputs: Tuple[Tensor, ...], modules: List[Module]): - i = 0 - while i < len(modules): - pt_inputs = to_pt_tensors(inputs) - - inputs = modules[i].cpu_eval_forward(*pt_inputs) - i += 1 - - return inputs - -def _verify_inference( - inputs: List[Tuple[Tensor, ...]], - all_saved_parameters: List[Dict[str, torch.Tensor]], - result_q: queue.Queue, - verify_cfg: VerifyConfig, - modules_copy: List[Module], - inputs_copy: List[Tuple[Tensor, ...]],): - """ - Verify inference vs cpueval - """ - from pybuda.op.eval import compare_tensor_to_golden, calculate_pcc # avoid circular import - test_pass = True - fail_on_first = "SHOW_ALL_FAILS" not in os.environ - contains_framework_module = any([not isinstance(x, PyBudaModule) for x in modules_copy]) - if verify_cfg.devmode == DeviceMode.RunOnly: - # device._compile_output.initial_graph is not archived in TTI, - # check if cpu-evaluated outputs are cached and loaded from TTI - devices = get_devices() - tt_device = devices[0] if isinstance(devices[0], pybuda.TTDevice) else devices[1] - assert tt_device._compiled_graph_state.cpueval_outputs, "cpueval-output are not loaded from TTI although device is set to RunOnly mode, exit" - loaded_eval_outputs = list(list([tt_device._compiled_graph_state.cpueval_outputs])) - - try: - for iter, single_input in enumerate(inputs): - result = result_q.get() - if verify_cfg.devmode == DeviceMode.RunOnly: - eval_outputs = loaded_eval_outputs[iter] - if not isinstance(eval_outputs[0], list): - 
eval_outputs[0] = [eval_outputs[0]] - else: - eval_outputs = cpueval_inference([single_input], all_saved_parameters, verify_cfg.sequential) - - if verify_cfg.override_module_outptus is not None: - contains_framework_module = False - eval_outputs = verify_cfg.override_module_outptus - - if contains_framework_module: - framework_outputs = get_framework_pipeline_outputs(inputs_copy[iter], modules_copy) - assert len(framework_outputs) == len(result), "Number of framework outputs doesn't match number of Buda outputs" - - for i, (eval_out, result_out) in enumerate(zip(eval_outputs[0], result)): - if contains_framework_module and verify_cfg.enabled: - test_pass &= compare_tensor_to_golden(f"Iteration {iter} - Framework Output {i}", framework_outputs[i], result_out.value(), is_buda=True, verify_cfg=verify_cfg) - - if fail_on_first: - pcc_value = calculate_pcc(framework_outputs[i], result_out.value().to(framework_outputs[i].dtype)) - assert test_pass, f"Data mismatch on iteration {iter} - Eval Output {i}. PCC got {pcc_value}, required={verify_cfg.pcc}" - - # Temporary workaround for microbatch dim being different between backend and frontend - eval_out = eval_out.reshape(result_out.shape.get_pytorch_shape()) - - if verify_cfg.enabled: - test_pass &= compare_tensor_to_golden(f"Iteration {iter} - Eval Output {i}", eval_out, result_out.value(), - is_buda=True, verify_cfg=verify_cfg) - - if fail_on_first: - pcc_value = calculate_pcc(eval_out, result_out.value().to(eval_out.dtype)) - assert test_pass, f"Data mismatch on iteration {iter} - Eval Output {i}. PCC got {pcc_value}, required={verify_cfg.pcc}" - - finally: - assert test_pass, f"Data mismatch detected" - -def verify_module_pipeline(modules: List[Module], input_shapes: List[Tuple], verify_cfg: VerifyConfig, - input_params: List[Dict] = [], device_types = ["TTDevice"], params_centered_on_zero: bool = False, - scale_params: float = 1.0, inputs: Optional[List[Tuple[Tensor]]] = None, - loss_module: Optional[Module] = None, uniform_inputs: bool = False, inputs_centered_on_zero: bool = False) -> List[Device]: - """ - Test a pipeline of modules, with given verification requirements in verify_cfg, and given input shapes. - - This can do full inference and training testing on graph-level, golden, model, silicon, etc. - compared to pytorch equivalent model. - - input_params can be used to specify dtype and requires_grad for inputs - - Parameters - ---------- - modules: List[Module] - Pipeline of modules to test - - input_shapes: List[Tuple] - List of input shapes to feed into the first module - - verify_cfg: VerifyConfig - Verification parameters - - inputs_params: List[Dict] - Optional parameters for each of the inputs - such as requires_grad, data_format, dev_data_format - - device_types: List[str] - List of device types (TTDevice, CPUDevice) to place modules on. Modules will be matched with list - of devices 1-to-1, and if list of modules is longer than list of devices, all subsequent modules will - be placed on the last device. If device type string is "-", the module will be placed on the previous - device. This can't be the first device type. - - params_centered_on_zero: bool - If set, parameters will be picked in -0.5, 0.5 range instead of 0, 1 - - scale_params: float - Divide parameters with this value - - inputs: List[Tuple[Tensor]] - Optional list of model activations to run - - loss_module: Optional[Module] - Optional loss module for training. 
If none is set (and training is enabled), pytorch L1Loss will be - added, and a CPUDevice will be instantiated to calculate it. - - uniform_inputs: bool (default False) - If set, random inputs will be uniformly distributed from 0 to 1 (i.e. default torch.rand) - - inputs_centered_on_zero: bool (default False) - If set, random inputs will be cenetred around zero - - Returns - ------- - List[Device, ...] - List of generated devices with modules placed on them - - - """ - - if verify_cfg.enabled and verify_cfg.test_kind.is_training() and not verify_cfg.sequential: - logger.warning("Concurrent training is NOT stable in the verify flow. Likely to run into errors during _verify_training") - - compiler_cfg = _get_global_compiler_config() - - if "PYBUDA_OVERRIDES_VETO" in os.environ: - compiler_cfg = pybuda_override_veto(compiler_cfg) - - assert verify_cfg is not None, "VerifyConfig must be provided for verify_module flow" - force_full = bool(int(os.environ.get("PYBUDA_FORCE_FULL_COMPILE_DEPTH", "0"))) - if force_full: - compiler_cfg.compile_depth = CompileDepth.FULL - run_backend = verify_cfg.devtype != BackendType.NoBackend and compiler_cfg.compile_depth == CompileDepth.FULL and verify_cfg.devmode != DeviceMode.CompileOnly - - # Figure out module placement so we can extract parameters - module_to_device = {} - dev_index = 0 - for i, m in enumerate(modules): - module_to_device[i] = dev_index - if i < len(device_types) - 1: - if device_types[i] != "-": - dev_index += 1 - - dev_count = dev_index + 1 - - # Generate inputs, if they are not already provided - if inputs is None: - inputs = _generate_random_inputs(input_shapes, input_params, verify_cfg, uniform_inputs, inputs_centered_on_zero) - else: - for i in range(verify_cfg.total_number_of_inputs()): - inputs[i] = pybuda.tensor.to_buda_tensors(to_pt_tensors(inputs[i])) - - # Translate framework module if needed - if compiler_cfg.compile_tvm_to_python: - single_microbatch_inputs = remove_microbatch(inputs[0]) - import copy - inputs_copy = copy.deepcopy(inputs) - modules, device_types, module_to_device, dev_count, modules_copy = _translate_framework_modules(modules, device_types, module_to_device, single_microbatch_inputs, dev_count, verify_cfg, compiler_cfg) - - # Pybuda module will only accept flattened inputs - for i in range(len(inputs)): - flattened, _, _ = flatten_inputs(to_pt_tensors(inputs[i])) - inputs[i] = pybuda.tensor.to_buda_tensors(flattened) - - # Extract parameters - all_parameters: List[List[Parameter]] = [ [] for _ in range(dev_count) ] - for i, m in enumerate(modules): - all_parameters[module_to_device[i]].extend(m.get_parameters()) - - # Create devices / place modules - devices = _create_devices(modules, device_types, module_to_device, remove_microbatch(inputs[0]), verify_cfg, all_parameters, loss_module) - - all_saved_parameters = _update_and_randomize_params(modules, dev_count, module_to_device, all_parameters, params_centered_on_zero, scale_params, uniform_inputs) - - if verify_cfg.test_kind.is_training(): - all_saved_parameters.append( {} ) # one for cpu loss - - if not run_backend: - # Compile and exit - from ..compile import pybuda_compile - compiler_cfg.enable_training = verify_cfg.test_kind.is_training() - for epoch_break in verify_cfg.epoch_breaks: - compiler_cfg.place_on_new_epoch(epoch_break) - - compiled_devices = [] - with torch.no_grad(): - single_microbatch_inputs = remove_microbatch(inputs[0]) - pipe_inputs = single_microbatch_inputs - - if verify_cfg.devmode == DeviceMode.CompileOnly: - # Compile first - from 
..run import initialize_pipeline - initialize_pipeline( - training=verify_cfg.test_kind.is_training(), - sample_inputs=to_buda_tensors(to_pt_tensors(pipe_inputs)), - _sequential=True, - _verify_cfg=verify_cfg, - _device_mode=verify_cfg.devmode, - ) - - # Pre-calculate cpu-evauated outputs, append it to compiled_graph_state - cpueval_outputs = [] - for single_input in inputs: - eval_output = cpueval_inference([single_input], all_saved_parameters, verify_cfg.sequential) - cpueval_outputs.append(eval_output[0][0]) - - for device in devices: - if isinstance(device, pybuda.cpudevice.CPUDevice): - for module in device.modules: - pipe_inputs = module.forward(*to_pt_tensors(pipe_inputs)) - - # Cast outputs from CPU-fallback to FP32 - pipe_inputs = [t.float() for t in pipe_inputs] - else: - if verify_cfg.devmode == DeviceMode.CompileOnly: - # Note: below does not re-compile the model (already compiled above), but just generate TTI - device.compile_to_image( - img_path=verify_cfg.tti_path, - training=verify_cfg.test_kind.is_training(), - sample_inputs= to_buda_tensors(to_pt_tensors(pipe_inputs)), - cpueval_outputs=cpueval_outputs, - ) - else: - device._compile_output = pybuda_compile(device, device.modules[0].get_name(), *to_buda_tensors(to_pt_tensors(pipe_inputs)), - compiler_cfg=compiler_cfg, - verify_cfg=verify_cfg) - pipe_inputs = device._compile_output.outputs - - return devices - # Generate the graph, to get output shapes (if training) - # TODO for device pipeline - if verify_cfg.test_kind.is_training(): - targets = _setup_training(devices, inputs[0], verify_cfg, loss_module is None) - - # Push inputs - # TODO: concurrent pushing in a thread? - for input in inputs: - devices[0].push_to_inputs(*input) - - pybuda.set_configuration_options(enable_recompute=verify_cfg.test_kind.is_recompute()) - checkpoint_q = pybuda.run_training( - epochs=verify_cfg.epochs, - steps=verify_cfg.steps, - accumulation_steps=verify_cfg.accumulation_steps, - microbatch_count=verify_cfg.microbatch_count, - _sequential=verify_cfg.sequential, - checkpoint_interval=verify_cfg.checkpoint_interval, - _perf_trace=False, - _verify_cfg=verify_cfg) - - assert not pybuda.error_raised(), "Error during training" - - # if parameters were updated as part of compile (by tvm), update them - # FIXME: can't pick up new parameters here, since we already ran training and got them changed! - #all_saved_parameters = update_and_randomize_params(all_parameters) - #all_saved_parameters.append( {} ) # one for cpu loss - - _verify_training(devices, inputs, targets, all_saved_parameters, checkpoint_q, verify_cfg) - - else: - - # Push inputs - # TODO: concurrent pushing in a thread? 
- for input in inputs: - devices[0].push_to_inputs(*input) - - result_q = pybuda.run_inference(_sequential=verify_cfg.sequential, _verify_cfg=verify_cfg, input_count=len(inputs)) - assert not pybuda.error_raised(), "Error during inference" - - # if parameters were updated as part of compile (by tvm), update them - all_saved_parameters = _update_and_randomize_params(modules, dev_count, module_to_device, all_parameters, params_centered_on_zero, scale_params, uniform_inputs) - - _verify_inference(inputs, all_saved_parameters, result_q, verify_cfg, modules_copy, inputs_copy) - - return devices - - -def verify_module(module: Module, input_shapes: List[Tuple], verify_cfg: VerifyConfig, input_params: List[Dict] = [], - device_type: str = "TTDevice", params_centered_on_zero: bool = False, scale_params: float = 1.0, - inputs: Optional[List[Tuple[Tensor]]] = None, - loss_module: Optional[Module] = None, uniform_inputs: bool = False, inputs_centered_on_zero: bool = False): - """ - Test a module on one device, with given verification requirements in verify_cfg, and given input shapes. - - This can do full inference and training testing on graph-level, golden, model, silicon, etc. - compared to pytorch equivalent model. - - input_params can be used to specify dtype and requires_grad for inputs - - Parameters - ---------- - module: Module - Module to test - - input_shapes: List[Tuple] - List of input shapes to feed into the first module - - verify_cfg: VerifyConfig - Verification parameters - - inputs_params: List[Dict] - Optional parameters for each of the inputs - such as requires_grad, data_format, dev_data_format - - device_type: str - Device type (TTDevice, CPUDevice) to place module on. - - params_centered_on_zero: bool - If set, parameters will be picked in -0.5, 0.5 range instead of 0, 1 - - scale_params: float - Divide parameters with this value - - inputs: List[Tuple[Tensor]] - Optional list of model activations to run - - loss_module: Optional[Module] - Optional loss module for training. If none is set (and training is enabled), pytorch L1Loss will be - added, and a CPUDevice will be instantiated to calculate it. - - uniform_inputs: bool (default False) - If set, random inputs will be uniformly distributed from 0 to 1 (i.e. default torch.rand) - - inputs_centered_on_zero: bool (default False) - If set, random inputs will be cenetred around zero - - Returns - ------- - List[Device, ...] 
- List of generated devices with modules placed on them - - - """ - return verify_module_pipeline([module], input_shapes, verify_cfg, input_params, [device_type], params_centered_on_zero, scale_params, inputs, loss_module, uniform_inputs, inputs_centered_on_zero) - diff --git a/pybuda/pybuda/verify/config.py b/pybuda/pybuda/verify/config.py index 686d66066..a38db5bf0 100644 --- a/pybuda/pybuda/verify/config.py +++ b/pybuda/pybuda/verify/config.py @@ -8,8 +8,6 @@ import torch -import pybuda.optimizers -from pybuda._C.backend_api import BackendType, BackendDevice, DeviceMode from pybuda._C import DataFormat from dataclasses_json import dataclass_json from pybuda.utils import as_json @@ -86,9 +84,6 @@ class VerifyConfig: # For auto-testing sequential: bool = True test_kind: TestKind = field(default=TestKind.INFERENCE, metadata=as_json(TestKind)) - devtype: BackendType = field(default=BackendType.Golden, metadata=as_json(BackendType)) - devmode: DeviceMode = field(default=DeviceMode.CompileAndRun, metadata=as_json(DeviceMode)) - arch: BackendDevice = field(default=BackendDevice.Grayskull if "GOLDEN_WORMHOLE_B0" not in os.environ else BackendDevice.Wormhole_B0, metadata=as_json(BackendDevice)) scale_loss: float = 50.0 # Loss-scaling to make gradients bigger and easier to verify optimizer: Optional[Dict[str, Any]] = field(default_factory=lambda : {"type": "sgd", "params": {"learning_rate": 50.0 } }) scheduler: Optional[Dict] = None @@ -123,6 +118,8 @@ class VerifyConfig: if verify_all: verify_pipeline_result_vs_framework = True verify_pybuda_codegen_vs_framework = True + verify_tvm_compile = True + verify_each_buda_pass = True def __post_init__(self): # set defaults if not set explicitly by user. Relax under silicon, focus on pcc more. @@ -162,9 +159,6 @@ def __post_init__(self): if "TT_BACKEND_GOLDEN_QUANTIZE" in os.environ: self.golden_ignore_df_precision = False - if self.arch == BackendDevice.Grayskull: - assert self.fp32_fallback != DataFormat.Float32, "Fallback for f32 cannot be f32 itself" - @classmethod def disabled(cls) -> "VerifyConfig": diff --git a/pybuda/pybuda/verify/verify.py b/pybuda/pybuda/verify/verify.py index 6e2ffef09..4121cf653 100644 --- a/pybuda/pybuda/verify/verify.py +++ b/pybuda/pybuda/verify/verify.py @@ -17,9 +17,6 @@ from .config import VerifyConfig, should_waive_gradient from ..config import PerfTraceLevel import pybuda._C.graph as pygraph -from pybuda._C.backend_api import BackendType, DeviceMode -from pybuda.tensor import pytorch_tensor_to_tensor_desc -from ..backend import BackendAPI from pybuda.tools.run_net2pipe import net2pipe def _generate_random_losses(outputs, is_buda): @@ -71,7 +68,6 @@ def do_verify( parameters: Dict[str, torch.Tensor], golden_input_grads: Tuple[torch.Tensor, ...], outputs: Tuple[Tensor, ...], - device: "TTDevice", intermediate_golden_tensors: Dict, verify_cfg: VerifyConfig, is_buda: bool, @@ -219,64 +215,4 @@ def verify_golden( outputs: Tuple[torch.Tensor], verify_cfg: VerifyConfig): - logger.info("Running golden backend verify") - - backend_api = None - try: - from pybuda.compiled_graph_state import CompiledGraphState - compiled_graph_state = CompiledGraphState.from_compiled_graph(device, compile_results) - backend_api = BackendAPI( - BackendType.Golden, - device.arch, - device, - netlist_filename, - compiled_graph_state, - False, - None, - None, - PerfTraceLevel.NONE, - DeviceMode.CompileAndRun) - - backend_api.push_constants_and_parameters() - backend_api.push_optimizer_parameters() - - iqs = backend_api.get_ordered_input_queues() - 
assert len(inputs) == len(iqs) - - padded_inputs = [] - for i, t in enumerate(inputs): - padded_tensor = pad_pytorch_tensor_to_buda(t.value(), compiled_graph_state.ordered_input_tile_broadcast_dims[i]) - padded_inputs.append(pytorch_tensor_to_tensor_desc(padded_tensor)) - BackendAPI.push_to_queues(iqs, padded_inputs, single_input=True) - - # Run fwd program - backend_api.schedule_run_forward(loop_count=1) - - # Get outputs, and check them - from pybuda.op.eval import compare_tensor_to_golden # avoid circular import - oq = backend_api.get_ordered_output_queues() - assert len(oq) == len(outputs) - calculated_outputs = BackendAPI.read_queues(oq, [g.value().shape for g in outputs], None, [False] * len(oq), single_output=True, has_microbatch_dim=False) - - ok = True - for i, (golden, calculated) in enumerate(zip(outputs, calculated_outputs)): - output_tensor = calculated.value() - golden = golden.value().type(output_tensor.dtype) - output_tensor = narrow_buda_tensor_to_pytorch(output_tensor, golden.shape) - ok &= compare_tensor_to_golden(f"Output {i}", golden, output_tensor, verify_cfg=verify_cfg) - - BackendAPI.pop_queues(oq, single_output=True) - - finally: - # Make sure to clean up - if backend_api is not None: - backend_api.shutdown() - - assert ok, "Verify Golden: Data mismatch detected" - - -def verify_net2pipe(netlist, device_yaml, cluster_cfg_yaml): - level = int(os.environ.get("PYBUDA_VERIFY_NET2PIPE", "0")) - returncode, error_message = net2pipe(netlist, device_yaml=device_yaml, cluster_cfg_yaml=cluster_cfg_yaml, stats=(level > 3), run_pipegen=(level > 1), run_blobgen=(level > 2)) - assert returncode == 0, f"net2pipe failed: {error_message}" - logger.info("net2pipe success!") + assert False #Run ttnn golden diff --git a/pybuda/test/README.debug.md b/pybuda/test/README.debug.md new file mode 100644 index 000000000..0e13ca546 --- /dev/null +++ b/pybuda/test/README.debug.md @@ -0,0 +1,6 @@ + +*Test-specific environment variables that can be used to fine-tune the default behavior of PyBuda tests.* + +## Parameters + * RANDOM\_TEST\_COUNT: Number of random tests to generate and execute. The parameter generates a test_index in the range from 0 to RANDOM\_TEST\_COUNT-1. (default: 5) + * RANDOM\_TESTS\_SELECTED: Limits the random tests to a selected subset, given as a comma-separated list of test indexes, e.g. "3,4,6". If not specified or empty, no limitation is applied and all generated tests run.
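As a rough usage sketch of the two variables documented above (the pytest target below is a hypothetical placeholder, not part of this change), a run that generates ten random tests but executes only three of them could look like:

# Generate indexes 0..9, but execute only tests 3, 4 and 6.
export RANDOM_TEST_COUNT=10
export RANDOM_TESTS_SELECTED="3,4,6"
pytest pybuda/test   # hypothetical target; point this at the suite that consumes these variables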
diff --git a/pybuda/test/backend/models/test_bert.py b/pybuda/test/backend/models/test_bert.py index 429665a64..57d1b79d2 100644 --- a/pybuda/test/backend/models/test_bert.py +++ b/pybuda/test/backend/models/test_bert.py @@ -504,7 +504,7 @@ def test_pt_encoder(test_kind, test_device, size, encoder_count, num_chips): # set pcc values to highest seen on each type of architecture if test_device.is_grayskull(): pcc = 0.68 - elif test_device.is_wormhole(): + elif test_device.is_wormhole_b0(): pcc = 0.9 import os diff --git a/pybuda/test/backend/models/test_gpt2.py b/pybuda/test/backend/models/test_gpt2.py index 873ded365..af230f7ff 100644 --- a/pybuda/test/backend/models/test_gpt2.py +++ b/pybuda/test/backend/models/test_gpt2.py @@ -56,7 +56,6 @@ def forward(self, input_ids, attention_mask): def test_pt_gpt2_fallback(test_kind, test_device): model = GPT2LMHeadModel.from_pretrained("gpt2") tokenizer = GPT2Tokenizer.from_pretrained("gpt2") - _get_global_compiler_config().enable_t_streaming = True tokenizer.pad_token = tokenizer.eos_token prefix_text = "My name is Bert, and I am" diff --git a/pybuda/test/backend/test_backend.py b/pybuda/test/backend/test_backend.py index 04e79ff90..5c5de2453 100644 --- a/pybuda/test/backend/test_backend.py +++ b/pybuda/test/backend/test_backend.py @@ -42,7 +42,7 @@ def test_basic_wormhole(test_kind): if test_kind == TestKind.TRAINING_RECOMPUTE: pytest.skip() # tenstorrent/budabackend#382 verify_module(BudaTest("verify_module"), [(1, 64, 64), (1, 64, 64)], - VerifyConfig(test_kind=test_kind, run_net2pipe=True, arch=BackendDevice.Wormhole)) + VerifyConfig(test_kind=test_kind, run_net2pipe=True)) def test_multi_input_inference(): verify_module(BudaTest("verify_module"), [(1, 64, 64), (1, 64, 64)], diff --git a/pybuda/test/benchmark/benchmark.py b/pybuda/test/benchmark/benchmark.py index ca137d027..ac19bbd59 100755 --- a/pybuda/test/benchmark/benchmark.py +++ b/pybuda/test/benchmark/benchmark.py @@ -4,6 +4,7 @@ # SPDX-License-Identifier: Apache-2.0 import os import inspect +from loguru import logger from typing import List, Tuple, Union, Optional, Dict import time import threading @@ -27,7 +28,6 @@ import benchmark.models.inception_v4 import benchmark.models.mobilenet_v1 import benchmark.models.mobilenet_v2 -import benchmark.models.mobilenet_v2_timm import benchmark.models.mobilenet_v3_timm import benchmark.models.openpose_body import benchmark.models.openpose_hand @@ -35,14 +35,15 @@ import benchmark.models.resnet import benchmark.models.t5 import benchmark.models.unet -import benchmark.models.unit import benchmark.models.vit -import benchmark.models.vovnet_v1 import benchmark.models.vovnet_v2 import benchmark.models.whisper import benchmark.models.yolo_v3 import benchmark.models.yolo_v5 +import benchmark.models.custom.custom_resnet_highres +import benchmark.models.custom.custom_vit_highres + def single_thread_generative_model_run(args, first_device, last_device, inputs, targets, output_q, num_tokens_to_generate, first_current_index, pad_token_id, write_index): print("Executing in single-threaded generative model mode") @@ -412,9 +413,9 @@ def run( parser.add_argument('-df', '--dataformat', choices=['Fp32', 'Fp16', 'Fp16_b', 'Bfp8', 'Bfp8_b', 'Bfp4', 'Bfp4_b'], default='Bfp8_b', help='Set data format') parser.add_argument('-mf', '--math_fidelity', choices=['LoFi', 'HiFi2', 'HiFi3', 'HiFi4'], default='LoFi', help='Set math fidelity') parser.add_argument('-opt', '--backend_opt_level', choices=[0, 1, 2, 3, 4], default=4, type=int, help='Set backend optimization level') 
- parser.add_argument( '--loop_count', default=32, type=int, help='Set the number of times to loop through the model. By default, it will be 5x the number of chips.') + parser.add_argument( '--loop_count', default=128, type=int, help='Set the number of times to loop through the model. By default, it will be 5x the number of chips.') parser.add_argument( '--microbatch_count', default=1, type=int, help='Set the number of times to loop within each program.') - parser.add_argument('-mb', '--microbatch', default=64, type=int, help='The microbatch size to run the benchmark on. The model should set its own reasonable default if no microbatch is forced here.') + parser.add_argument('-mb', '--microbatch', default=128, type=int, help='The microbatch size to run the benchmark on. The model should set its own reasonable default if no microbatch is forced here.') parser.add_argument( '--chips', default=1, type=int, help='Number of chips to run benchmark on. 0 to run on all available.') parser.add_argument( '--recompute', action='store_true', help='Enable recompute in training') parser.add_argument( '--layers', default=0, type=int, help='Number of layers to run on models where this is applicable (i.e. nlp encoders/decoders)') @@ -425,10 +426,10 @@ def run( parser.add_argument( '--disable_output', default=0, type=int, choices=[0, 1], help='Disables the generation of the output json file') parser.add_argument( '--load_tti', default="", type=str, help='Skip compile and load from TTI-archive configured for silicon (specify path to TTI).') parser.add_argument( '--save_tti', default="", type=str, help='Save compilation for TTDevice into a TTI-archive configured for silicon to file and exit program. (speciy path to save to).') - parser.add_argument( '--arch', choices=['grayskull', 'wormhole', 'wormhole_b0'], default=None, help='Set arch for offline TTI compilation.') + parser.add_argument( '--arch', choices=['grayskull', 'wormhole_b0'], default=None, help='Set arch for offline TTI compilation.') parser.add_argument( '--device', choices=['silicon', 'golden', 'model'], default=None, help='Set device.') parser.add_argument( '--runtime_params_yaml', default=None, help='Set runtime params yaml for offline compile of WH devices.') - parser.add_argument( '--device-config', choices=['galaxy', 'wh_nebula_x1', 'wh_nebula_x2', 'gs_e150', 'gs_e300'], default=None, type=str, help='Runtime params yaml for offline compile of WH devices would be configured based on that.') + parser.add_argument( '--device-config', choices=['galaxy', 'wh_n150', 'wh_n300', 'gs_e150', 'gs_e300'], default=None, type=str, help='Runtime params yaml for offline compile of WH devices would be configured based on that.') parser.add_argument( '--auto_transpose', action='store_true', help='Enable auto-transpose on placement') parser.add_argument('-bp', '--balancer-policy', choices=['default', 'CNN', 'Ribbon', 'NLP'], default='default', help='Set balancer policy.') parser.add_argument( '--perf_analysis', action='store_true', help='Enable backend perf analyzer and op estimates in compiler') @@ -496,7 +497,12 @@ def run( if args.model == "bert" and args.chips > 1: os.environ["PYBUDA_MULTICHIP_BERT"] = str(args.chips) - kwargs = {"training": args.training, "microbatch": args.microbatch, "data_type": args.dataformat} + kwargs = { + "training": args.training, + "microbatch": args.microbatch, + "data_type": args.dataformat, + "math_fidelity": args.math_fidelity + } device_list = pybuda.detect_available_devices() if device_list: @@ -534,9 +540,12 @@ def run( if 
model_config is None: print("The model configuration is empty. ") - exit(0) + exit(1) duts, inputs, targets, other = model_config + implied_microbatch = inputs[0].shape[0] + if (implied_microbatch != args.microbatch): + logger.warning(f"Model configuration implies microbatch size of {implied_microbatch}, but command line specifies {args.microbatch}. Overriding microbatch size to {implied_microbatch}.") try: result = run(args, duts, inputs, targets, other) @@ -548,6 +557,7 @@ def run( "machine_name": socket.gethostname() } print("Error encountered while running benchmark: ", e) + exit(1) import subprocess short_hash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode('ascii').strip() diff --git a/pybuda/test/benchmark/benchmark/models/bert.py b/pybuda/test/benchmark/benchmark/models/bert.py index e25ac2ce4..04fa14984 100644 --- a/pybuda/test/benchmark/benchmark/models/bert.py +++ b/pybuda/test/benchmark/benchmark/models/bert.py @@ -9,40 +9,39 @@ from pybuda import PyTorchModule from pybuda.config import _get_global_compiler_config -from transformers import BertModel, BertConfig +from transformers import BertModel, BertConfig, BertForSequenceClassification + from ..common import benchmark_model -# Embedding wrapper that extends and passes attention mask through - to run on host -class EmbWrapper(torch.nn.Module): +class BertEncoderWrapper(torch.nn.Module): + def __init__(self, bert): super().__init__() self.bert = bert + def forward(self, input_ids, attention_mask, token_type_ids): attention_mask = attention_mask * 1.0 emb_output = self.bert.embeddings(input_ids, token_type_ids) - # input_ids = input_ids * 1.0 - # emb_output = input_ids.unsqueeze(2).repeat(1, 1, 1024) extended_attention_mask = (1.0 - attention_mask) * -10000.0 extended_attention_mask = extended_attention_mask.unsqueeze(dim=-2) - return emb_output, extended_attention_mask + return self.bert.encoder(emb_output, extended_attention_mask) + class BertWrapper(torch.nn.Module): - def __init__(self, bert): + + def __init__(self, model): super().__init__() - self.bert = bert + self.model = model + + def forward(self, input_ids, token_type_ids): + return self.model(input_ids, token_type_ids) - def forward(self, input_ids, attention_mask, token_type_ids): - attention_mask = attention_mask * 1.0 - emb_output = self.bert.embeddings(input_ids, token_type_ids) - # input_ids = input_ids * 1.0 - # emb_output = input_ids.unsqueeze(2).repeat(1, 1, 1024) - extended_attention_mask = (1.0 - attention_mask) * -10000.0 - extended_attention_mask = extended_attention_mask.unsqueeze(dim=-2) - return self.bert.encoder(emb_output, extended_attention_mask) @benchmark_model(configs=["tiny", "base", "large", "base_tc", "large_tc"]) -def bert(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str, force_num_layers: Optional[int] = None): +def bert(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str, math_fidelity: str, force_num_layers: Optional[int] = None): + + from pybuda._C.backend_api import BackendDevice compiler_cfg = _get_global_compiler_config() @@ -52,14 +51,10 @@ def bert(training: bool, config: str, microbatch: int, devtype: str, arch: str, target_microbatch = 512 compiler_cfg.enable_auto_transposing_placement = True os.environ["PYBUDA_EXP_APPROX"] = "1" - os.environ["PYBUDA_DISABLE_DYNAMIC_DRAM"] = "1" elif config == "base": model_name = "bert-base-uncased" seq_len = 128 target_microbatch = 128 - # Testing! 
- #if not force_num_layers: - # pybuda.config._get_global_compiler_config().place_on_new_epoch(f"matmul_518") elif config == "large": model_name = "bert-large-uncased" if training: @@ -105,9 +100,11 @@ def bert(training: bool, config: str, microbatch: int, devtype: str, arch: str, if compiler_cfg.balancer_policy == "default": compiler_cfg.balancer_policy = "Ribbon" os.environ["PYBUDA_RIBBON2"] = "1" - if data_type != "Bfp8_b": - os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "1" + os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1" + os.environ["PYBUDA_ENABLE_HOST_INPUT_NOP_BUFFERING"] = "1" if data_type == "Bfp8_b": + if pybuda.detect_available_devices()[0] != BackendDevice.Grayskull: + os.environ["PYBUDA_FORK_JOIN_BUF_QUEUES"] = "1" os.environ["PYBUDA_EXP_APPROX"] = "1" pybuda.config.configure_mixed_precision(op_type="add", output_df=pybuda.DataFormat.Float16_b) pybuda.config.configure_mixed_precision(op_type="subtract", output_df=pybuda.DataFormat.Float16_b) @@ -118,52 +115,43 @@ def bert(training: bool, config: str, microbatch: int, devtype: str, arch: str, if microbatch == 0: microbatch = target_microbatch - cfg = BertConfig.from_pretrained(model_name) - if force_num_layers: - cfg.num_hidden_layers = force_num_layers - model = BertModel(config=cfg) - if not training: - model.eval() + if config == "large_tc": + model = BertForSequenceClassification.from_pretrained(model_name) + else: + cfg = BertConfig.from_pretrained(model_name) + if force_num_layers: + cfg.num_hidden_layers = force_num_layers + model = BertModel(config=cfg) - # - # Apply functional workarounds - # + # Configure model mode for training or evaluation if training: - #_get_global_compiler_config().enable_broadcast_splitting = True # fork error workaround - pass + model.train() + else: + model.eval() # # Create inputs, targets, models # - if "PYBUDA_SKIP_EMBEDDINGS" in os.environ: - inputs = [ - torch.rand(microbatch, seq_len, cfg.hidden_size), # emb_output - torch.rand(microbatch, 1, 1, seq_len), # extended_attention_mask - ] - models = {"tt": PyTorchModule("bert_encoders", model.encoder)} - elif bool(int(os.environ.get("PYBUDA_DEVICE_EMBEDDINGS", "0"))) and not training: + if config == "large_tc": inputs = [ torch.randint(high=25000, size=(microbatch, seq_len), dtype=torch.int), # input tokens - torch.randint(high=2, size=(microbatch, 1, seq_len), dtype=torch.float), # mask torch.randint(high=2, size=(microbatch, seq_len), dtype=torch.int), # token type IDs ] models = {"tt": PyTorchModule("bert", BertWrapper(model))} - pybuda.config._get_global_compiler_config().cpu_fallback_ops.remove("embedding") else: inputs = [ torch.randint(high=25000, size=(microbatch, seq_len), dtype=torch.int), # input tokens - torch.randint(high=2, size=(microbatch, seq_len), dtype=torch.int), # mask + torch.randint(high=2, size=(microbatch, 1, seq_len), dtype=torch.float), # mask torch.randint(high=2, size=(microbatch, seq_len), dtype=torch.int), # token type IDs ] - models = {"cpu-pre": PyTorchModule("bert_emb", EmbWrapper(model)), "tt": PyTorchModule("bert_encoders", model.encoder)} + models = {"tt": PyTorchModule("bert", BertEncoderWrapper(model))} + pybuda.config._get_global_compiler_config().cpu_fallback_ops.remove("embedding") targets = tuple() if training: targets = [torch.rand(microbatch, seq_len, cfg.hidden_size)] models["cpu-loss"] = PyTorchModule("l1loss", torch.nn.L1Loss()) - #pybuda.config.set_num_repeated_patterns(models["tt"].get_name(), len(models["tt"].module.layer)) - return models, inputs, targets, {} diff 
--git a/pybuda/test/benchmark/benchmark/models/custom/custom_resnet_highres.py b/pybuda/test/benchmark/benchmark/models/custom/custom_resnet_highres.py new file mode 100644 index 000000000..f76aff8e7 --- /dev/null +++ b/pybuda/test/benchmark/benchmark/models/custom/custom_resnet_highres.py @@ -0,0 +1,58 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import os +import onnx +import torch +from loguru import logger + +import pybuda +from pybuda import OnnxModule +from pybuda.config import _get_global_compiler_config +from ...common import benchmark_model + + +@benchmark_model(configs=["default"]) +def custom_resnet_highres(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str): + + # Set up ONNX model path + model_path = "third_party/confidential_customer_models/model_0/files/resnet50_fp32_w1280h960.onnx" + if not os.path.exists(model_path): + logger.error("Model not found in path \"{0}\"! Exiting...", model_path) + exit(1) + + # Load ONNX model + onnx_model = onnx.load(model_path) + onnx.checker.check_model(onnx_model) + pybuda_onnx_model = OnnxModule( + "CUSTOM_ResNet_HighRes", + onnx_model, + model_path, + ) + + # Configurations + compiler_cfg = _get_global_compiler_config() + compiler_cfg.enable_t_streaming = True + compiler_cfg.enable_auto_transposing_placement = True + compiler_cfg.balancer_policy = "Ribbon" + + # Overrides + os.environ["PYBUDA_RIBBON2"] = "1" + #os.environ["PYBUDA_RIBBON2_DISABLE_CLEANUP_BUF_NOPS"] = "1" + #os.environ["PYBUDA_SPARSE_ENABLE_LAYOUT_DATAFLOW"] = "1" + #os.environ["PYBUDA_MAXMIZE_SPARSE_UBLOCK"] = "1" + #os.environ["PYBUDA_DISABLE_CAP_SPARSE_MM_FIDELITY"] = "1" + #os.environ["PYBUDA_TEMP_ELT_UNARY_ESTIMATES_LEGACY"] = "1" + #os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{77*1024}" + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1" + + models = {"tt" : pybuda_onnx_model} + dimension = onnx_model.graph.input[0].type.tensor_type.shape + input_shape = [d.dim_value for d in dimension.dim] + inputs = [torch.rand(*input_shape)] + targets = tuple() + if training: + targets = [torch.rand(1, 100)] + + return models, inputs, targets, {} diff --git a/pybuda/test/benchmark/benchmark/models/custom/custom_vit_highres.py b/pybuda/test/benchmark/benchmark/models/custom/custom_vit_highres.py new file mode 100644 index 000000000..886e99818 --- /dev/null +++ b/pybuda/test/benchmark/benchmark/models/custom/custom_vit_highres.py @@ -0,0 +1,48 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import os +import onnx +import torch +from loguru import logger + +import pybuda +from pybuda import OnnxModule +from pybuda.config import _get_global_compiler_config +from ...common import benchmark_model + + +@benchmark_model(configs=["default"]) +def custom_vit_highres(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str): + + # Set up ONNX model path + model_path = "third_party/confidential_customer_models/model_0/files/vit_b_16_w1280h800.onnx" + if not os.path.exists(model_path): + logger.error("Model not found! 
Exiting...") + exit(1) + + # Load ONNX model + onnx_model = onnx.load(model_path) + onnx.checker.check_model(onnx_model) + pybuda_onnx_model = OnnxModule( + "CUSTOM_ViT_HighRes", + onnx_model, + model_path, + ) + + # Configurations + compiler_cfg = _get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + + # Overrides + os.environ["PYBUDA_RIBBON2"] = "1" + + models = {"tt" : pybuda_onnx_model} + dimension = onnx_model.graph.input[0].type.tensor_type.shape + input_shape = [d.dim_value for d in dimension.dim] + inputs = [torch.rand(*input_shape)] + targets = tuple() + if training: + targets = [torch.rand(1, 100)] + + return models, inputs, targets, {} diff --git a/pybuda/test/benchmark/benchmark/models/deit.py b/pybuda/test/benchmark/benchmark/models/deit.py index 7ad722add..f6c6eba67 100644 --- a/pybuda/test/benchmark/benchmark/models/deit.py +++ b/pybuda/test/benchmark/benchmark/models/deit.py @@ -12,7 +12,7 @@ @benchmark_model(configs=["base", "small"]) -def deit(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str): +def deit(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str, math_fidelity: str): compiler_cfg = _get_global_compiler_config() compiler_cfg.enable_auto_transposing_placement = True @@ -20,16 +20,18 @@ def deit(training: bool, config: str, microbatch: int, devtype: str, arch: str, if compiler_cfg.balancer_policy == "default": compiler_cfg.balancer_policy = "Ribbon" os.environ["PYBUDA_RIBBON2"] = "1" - os.environ["PYBUDA_RIBBON2_OPTIMIZATION_ITERATIONS"] = "10" - # These are about to be enabled by default. - # - if data_type != "Bfp8_b": - os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "1" - os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "1" + # These are about to be enabled by default. + # + os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1" + + if data_type == "Fp16_b": + os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES_APPLY_FILTERING"] = "1" if data_type == "Bfp8_b": + os.environ["PYBUDA_FORK_JOIN_BUF_QUEUES"] = "1" pybuda.config.configure_mixed_precision(op_type="reciprocal", output_df=pybuda.DataFormat.Float16_b) + os.environ["PYBUDA_FUSE_DF_OVERRIDE"] = "0" # Determine model variant if config == "base": diff --git a/pybuda/test/benchmark/benchmark/models/hrnet.py b/pybuda/test/benchmark/benchmark/models/hrnet.py index 17763efee..de08357e5 100644 --- a/pybuda/test/benchmark/benchmark/models/hrnet.py +++ b/pybuda/test/benchmark/benchmark/models/hrnet.py @@ -27,24 +27,24 @@ "v2_w64", ] ) -def hrnet(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str): +def hrnet(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str, math_fidelity: str): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True + compiler_cfg.enable_auto_transposing_placement = True if compiler_cfg.balancer_policy == "default": compiler_cfg.balancer_policy = "Ribbon" os.environ["PYBUDA_RIBBON2"] = "1" - os.environ["PYBUDA_RIBBON2_OPTIMIZATION_ITERATIONS"] = "10" - os.environ["PYBUDA_SUPRESS_T_FACTOR_MM"] = "46" # removing causes hang #2139 - os.environ["PYBUDA_LEGACY_KERNEL_BROADCAST"] = "1" # #2130 - # These are about to be enabled by default. 
- # - if data_type != "Bfp8_b": - os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "1" - os.environ["PYBUDA_RIBBON2_DISABLE_NON_MATMUL_UTIL"] = "1" - os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "1" + os.environ["PYBUDA_SUPRESS_T_FACTOR_MM"] = "46" # removing causes hang #2139 + os.environ["PYBUDA_ENABLE_HOST_INPUT_NOP_BUFFERING"] = "1" + + # These are about to be enabled by default. + # + os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1" + if data_type == "Fp16_b": + # Hangs with autotranspose on #2542 + compiler_cfg.enable_auto_transposing_placement = False # Manually enable amp light for Ribbon if compiler_cfg.balancer_policy == "Ribbon": @@ -75,7 +75,7 @@ def hrnet(training: bool, config: str, microbatch: int, devtype: str, arch: str, available_devices = pybuda.detect_available_devices() if available_devices: if available_devices[0] == BackendDevice.Grayskull: - pybuda.config.insert_buffering_nop('add_312', ['add_341'], nop_count=2) + pybuda.config._internal_insert_fj_buffering_nop('add_312', ['add_341'], nop_count=2) else: raise RuntimeError("Unknown config") diff --git a/pybuda/test/benchmark/benchmark/models/inception_v4.py b/pybuda/test/benchmark/benchmark/models/inception_v4.py index 6156bbed0..3254260da 100644 --- a/pybuda/test/benchmark/benchmark/models/inception_v4.py +++ b/pybuda/test/benchmark/benchmark/models/inception_v4.py @@ -11,25 +11,14 @@ @benchmark_model(configs=["224"]) -def inception_v4(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str): +def inception_v4(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str, math_fidelity: str): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True + compiler_cfg.enable_auto_transposing_placement = True + if compiler_cfg.balancer_policy == "default": compiler_cfg.balancer_policy = "Ribbon" os.environ["PYBUDA_RIBBON2"] = "1" - if data_type != "Bfp8_b": - os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "1" - os.environ["PYBUDA_OP_MODEL_COMPARE_VERSION"] = "1" - - os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "1" - - if compiler_cfg.balancer_policy == "Ribbon": - available_devices = pybuda.detect_available_devices() - from pybuda._C.backend_api import BackendDevice - if available_devices: - if available_devices[0] == BackendDevice.Grayskull: - pybuda.config.insert_buffering_nop("conv2d_28.dc.matmul.11", ["conv2d_43.dc.sparse_matmul.9.dc.sparse_matmul.1.lc2"], nop_count=3) # Set model parameters based on chosen task and model configuration if config == "224": diff --git a/pybuda/test/benchmark/benchmark/models/mobilenet_v1.py b/pybuda/test/benchmark/benchmark/models/mobilenet_v1.py index 2cfa202f3..47069a556 100644 --- a/pybuda/test/benchmark/benchmark/models/mobilenet_v1.py +++ b/pybuda/test/benchmark/benchmark/models/mobilenet_v1.py @@ -11,30 +11,32 @@ @benchmark_model(configs=["192", "224"]) -def mobilenet_v1(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str): +def mobilenet_v1(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str, math_fidelity: str): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_auto_transposing_placement = True if compiler_cfg.balancer_policy == "default": compiler_cfg.balancer_policy = "Ribbon" os.environ["PYBUDA_RIBBON2"] = "1" - os.environ["PYBUDA_SUPRESS_T_FACTOR_MM"] = "8" + os.environ["PYBUDA_ENABLE_HOST_INPUT_NOP_BUFFERING"] = "1" # These 
are about to be enabled by default. # - os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "1" - if data_type != "Bfp8_b": - os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "1" + os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1" + + if data_type == "Fp16_b": + os.environ["PYBUDA_SUPRESS_T_FACTOR_MM"] = "40" + os.environ["PYBUDA_TEMP_DISABLE_MODEL_KB_PROLOGUE_BW"] = "1" if data_type == "Bfp8_b": - # tenstorrent/pybuda#2228 - os.environ["PYBUDA_LEGACY_KERNEL_BROADCAST"] = "1" + os.environ["PYBUDA_FUSE_DF_OVERRIDE"] = "0" pybuda.config.configure_mixed_precision(name_regex="input.*add.*", output_df=pybuda.DataFormat.Float16_b) pybuda.config.configure_mixed_precision(op_type="add", output_df=pybuda.DataFormat.Float16_b) - pybuda.config.configure_mixed_precision(op_type="depthwise", output_df=pybuda.DataFormat.Float16_b) + pybuda.config.configure_mixed_precision(op_type="multiply", math_fidelity=pybuda.MathFidelity.HiFi2) + pybuda.config.configure_mixed_precision(op_type="depthwise", output_df=pybuda.DataFormat.Float16_b, math_fidelity=pybuda.MathFidelity.HiFi2) + # Set model parameters based on chosen task and model configuration model_name = "" diff --git a/pybuda/test/benchmark/benchmark/models/mobilenet_v2.py b/pybuda/test/benchmark/benchmark/models/mobilenet_v2.py index 02f33d6fc..919e1bd56 100644 --- a/pybuda/test/benchmark/benchmark/models/mobilenet_v2.py +++ b/pybuda/test/benchmark/benchmark/models/mobilenet_v2.py @@ -11,34 +11,35 @@ @benchmark_model(configs=["224", "160", "96"]) -def mobilenet_v2(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str): +def mobilenet_v2(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str, math_fidelity: str): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_auto_transposing_placement = True if compiler_cfg.balancer_policy == "default": compiler_cfg.balancer_policy = "Ribbon" os.environ["PYBUDA_RIBBON2"] = "1" + os.environ["PYBUDA_ENABLE_HOST_INPUT_NOP_BUFFERING"] = "1" + # These are about to be enabled by default. 
# - os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "1" - if data_type != "Bfp8_b": - os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "1" - os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" - os.environ["PYBUDA_BALANCER_PREPASS_DISABLED"] = "1" - - if arch == "grayskull": - os.environ["PYBUDA_MAXIMIZE_SPARSE_UBLOCK"] = "1" - os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1" - os.environ["PYBUDA_RIBBON2_OPTIMIZATION_ITERATIONS"] = "10" - os.environ["PYBUDA_TEMP_ELT_UNARY_ESTIMATES_LEGACY"] = "1" + os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1" + + if data_type == "Fp16_b": + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" if data_type == "Bfp8_b": pybuda.config.configure_mixed_precision(name_regex="input.*add.*", output_df=pybuda.DataFormat.Float16_b) pybuda.config.configure_mixed_precision(op_type="add", output_df=pybuda.DataFormat.Float16_b) - pybuda.config.configure_mixed_precision(op_type="depthwise", input_df={1: (pybuda.DataFormat.Float16_b, False),}, output_df=pybuda.DataFormat.Float16_b) + pybuda.config.configure_mixed_precision( + op_type="depthwise", + input_df={1: (pybuda.DataFormat.Float16_b, False),}, + output_df=pybuda.DataFormat.Float16_b, + math_fidelity=pybuda.MathFidelity.HiFi2 + ) + pybuda.config.configure_mixed_precision(op_type="multiply", math_fidelity=pybuda.MathFidelity.HiFi2) + pybuda.config.configure_mixed_precision(op_type="matmul", math_fidelity=pybuda.MathFidelity.HiFi2) # Set model parameters based on chosen task and model configuration if config == "224": diff --git a/pybuda/test/benchmark/benchmark/models/mobilenet_v2_timm.py b/pybuda/test/benchmark/benchmark/models/mobilenet_v2_timm.py deleted file mode 100644 index 4acfef672..000000000 --- a/pybuda/test/benchmark/benchmark/models/mobilenet_v2_timm.py +++ /dev/null @@ -1,53 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import os -import pybuda -import torch -import timm - -from ..common import benchmark_model -from pybuda.config import _get_global_compiler_config - - -@benchmark_model(configs=["224"]) -def mobilenet_v2_timm(training: bool, config: str, microbatch: int, devtype: str, arch: str): - compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True - - if compiler_cfg.balancer_policy == "default": - compiler_cfg.balancer_policy = "Ribbon" - - # os.environ["PYBUDA_PAD_SPARSE_MM"] = "{25:26}" - - # Set model parameters based on chosen task and model configuration - if config != "224": - raise RuntimeError("Unknown config") - model_name = "mobilenetv2_100" - img_res = 224 - - # Configure microbatch, if none provided - if microbatch == 0: - microbatch = 32 # default - - # Load model - model = timm.create_model(model_name, pretrained=True) - - # Configure model mode for training or evaluation - if training: - model.train() - else: - model.eval() - - modules = {"tt": pybuda.PyTorchModule(f"pt_mobilenet_v2_timm_{config}", model)} - - input_shape = (microbatch, 3, img_res, img_res) - inputs = [torch.rand(*input_shape)] - targets = tuple() - - # Add loss function, if training - if training: - model["cpu-loss"] = pybuda.PyTorchModule("l1loss", torch.nn.L1Loss()) - targets = [torch.rand(1, 100)] - - return modules, inputs, targets, {} diff --git a/pybuda/test/benchmark/benchmark/models/mobilenet_v3_timm.py b/pybuda/test/benchmark/benchmark/models/mobilenet_v3_timm.py index d2f159bc9..4d6d08075 100644 --- a/pybuda/test/benchmark/benchmark/models/mobilenet_v3_timm.py +++ 
b/pybuda/test/benchmark/benchmark/models/mobilenet_v3_timm.py @@ -11,26 +11,16 @@ @benchmark_model(configs=["small", "large"]) -def mobilenet_v3_timm(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str): +def mobilenet_v3_timm(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str, math_fidelity: str): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True if compiler_cfg.balancer_policy == "default": compiler_cfg.balancer_policy = "Ribbon" os.environ["PYBUDA_RIBBON2"] = "1" - os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" - os.environ["PYBUDA_BALANCER_PREPASS_DISABLED"] = "1" - # These are about to be enabled by default. - # - os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "1" - if data_type != "Bfp8_b": - os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "1" - - if arch == "grayskull": - os.environ["PYBUDA_MAXIMIZE_SPARSE_UBLOCK"] = "1" - os.environ["PYBUDA_SUPRESS_T_FACTOR_MM"] = "16" - os.environ["PYBUDA_FUSED_OP_MULTIPLIER"] = "7" + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + os.environ["PYBUDA_BALANCER_PREPASS_DISABLED"] = "1" + os.environ["PYBUDA_ENABLE_HOST_INPUT_NOP_BUFFERING"] = "1" # Set model parameters based on chosen task and model configuration model_name = "" diff --git a/pybuda/test/benchmark/benchmark/models/openpose_body.py b/pybuda/test/benchmark/benchmark/models/openpose_body.py index 60b3262b0..f5c19df5b 100644 --- a/pybuda/test/benchmark/benchmark/models/openpose_body.py +++ b/pybuda/test/benchmark/benchmark/models/openpose_body.py @@ -11,28 +11,17 @@ @benchmark_model(configs=["2d", "3d"]) -def openpose_osmr_body(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str): +def openpose_osmr_body(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str, math_fidelity: str): # Configurations compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_auto_transposing_placement = True if compiler_cfg.balancer_policy == "default": compiler_cfg.balancer_policy = "Ribbon" os.environ["PYBUDA_RIBBON2"] = "1" - - os.environ["PYBUDA_SUPRESS_T_FACTOR_MM"] = "13" - os.environ["PYBUDA_OP_MODEL_COMPARE_VERSION"] = "1" - - # These are about to be enabled by default. 
- # - if data_type != "Bfp8_b": - os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "1" - else: - # tenstorrent/pybuda#2228 - os.environ["PYBUDA_LEGACY_KERNEL_BROADCAST"] = "1" - os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "1" + os.environ["PYBUDA_SUPRESS_T_FACTOR_MM"] = "13" + os.environ["PYBUDA_ENABLE_HOST_INPUT_NOP_BUFFERING"] = "1" # Set model parameters based on chosen task and model configuration model_name = "" diff --git a/pybuda/test/benchmark/benchmark/models/openpose_hand.py b/pybuda/test/benchmark/benchmark/models/openpose_hand.py index 402359ed2..26b0d843b 100644 --- a/pybuda/test/benchmark/benchmark/models/openpose_hand.py +++ b/pybuda/test/benchmark/benchmark/models/openpose_hand.py @@ -10,36 +10,32 @@ from ..common import benchmark_model @benchmark_model(configs=["basic"]) -def openpose_hand(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str): +def openpose_hand(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str, math_fidelity: str): # Import confidential model implementation sys.path.append(os.path.join(os.path.dirname(__file__), '../../../../../', 'third_party/confidential_customer_models/')) from benchmarks.openpose import OpenPoseHandModel, transfer - available_devices = pybuda.detect_available_devices() - if available_devices: - # Hang, this piece of code should be removed when we consume the budabackend fix - # https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/merge_requests/1684 - if available_devices[0] == BackendDevice.Grayskull: - return - # Configurations compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_auto_transposing_placement = True if compiler_cfg.balancer_policy == "default": compiler_cfg.balancer_policy = "Ribbon" os.environ["PYBUDA_RIBBON2"] = "1" - # These are about to be enabled by default. - # - # os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "1" causes hang - os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "1" + os.environ["PYBUDA_ENABLE_HOST_INPUT_NOP_BUFFERING"] = "1" + + # These are about to be enabled by default. 
+ # + os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1" # Manually enable amp light for Ribbon if compiler_cfg.balancer_policy == "Ribbon": compiler_cfg.enable_amp_light() + if pybuda.detect_available_devices()[0] == BackendDevice.Grayskull: + pybuda.set_epoch_break("conv2d_99.dc.sparse_matmul.9.dc.sparse_matmul.1.lc2") + # Set model parameters based on chosen task and model configuration model_name = "" img_res = 224 diff --git a/pybuda/test/benchmark/benchmark/models/other.py b/pybuda/test/benchmark/benchmark/models/other.py index 6c06e1ca6..4f7fb74e0 100644 --- a/pybuda/test/benchmark/benchmark/models/other.py +++ b/pybuda/test/benchmark/benchmark/models/other.py @@ -5,7 +5,7 @@ Catch-all for random perf testing """ -import numpy as np +import os import pybuda import torch @@ -84,7 +84,7 @@ def forward(self, x): @benchmark_model(configs=["224"]) -def big_conv(training: bool, config: str, microbatch: int, devtype: str, arch: str): +def big_conv(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str, math_fidelity: str): if config == "224": input_size = (224, 224) cin = 3 @@ -113,8 +113,8 @@ def big_conv(training: bool, config: str, microbatch: int, devtype: str, arch: s bias=False) compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True + compiler_cfg.balancer_policy = "Ribbon" + os.environ["PYBUDA_RIBBON2"] = "1" models = {"tt": mod} inputs = [torch.rand(microbatch, cin, input_size[0], input_size[1])] diff --git a/pybuda/test/benchmark/benchmark/models/resnet.py b/pybuda/test/benchmark/benchmark/models/resnet.py index 7d450551e..f760b7e46 100644 --- a/pybuda/test/benchmark/benchmark/models/resnet.py +++ b/pybuda/test/benchmark/benchmark/models/resnet.py @@ -14,26 +14,27 @@ from transformers import ResNetForImageClassification @benchmark_model(configs=["resnet18", "resnet50"]) -def resnet(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str): +def resnet(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str, math_fidelity: str): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_auto_transposing_placement = True if compiler_cfg.balancer_policy == "default": compiler_cfg.balancer_policy = "Ribbon" os.environ["PYBUDA_RIBBON2"] = "1" - # These are about to be enabled by default. - # - os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "1" - if data_type != "Bfp8_b": - os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "1" + os.environ["PYBUDA_ENABLE_HOST_INPUT_NOP_BUFFERING"] = "1" + os.environ["PYBUDA_ALLOW_MULTICOLUMN_SPARSE_MATMUL"] = "1" - os.environ["PYBUDA_DISABLE_DYNAMIC_DRAM"] = "1" + # These are about to be enabled by default. 
+ # + os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1" - if arch != "wormhole_b0": - os.environ["PYBUDA_EXTRA_L1_MARGIN"] = "100000" + if data_type == "Fp16_b": + os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES_APPLY_FILTERING"] = "1" + + if data_type == "Bfp8_b": + pybuda.config.configure_mixed_precision(name_regex="input.*add.*", output_df=pybuda.DataFormat.Float16_b) # Set model parameters based on chosen task and model configuration if config == "resnet18": @@ -59,11 +60,10 @@ def resnet(training: bool, config: str, microbatch: int, devtype: str, arch: str return models, inputs, targets, {} @benchmark_model(configs=["resnet50"]) -def resnet_quant(training: bool, config: str, microbatch: int, devtype: str, arch: str): +def resnet_quant(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str, math_fidelity: str): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_auto_fusing = False compiler_cfg.graph_solver_self_cut_type = "FastCut" @@ -75,7 +75,7 @@ def resnet_quant(training: bool, config: str, microbatch: int, devtype: str, arc # Set model parameters based on chosen task and model configuration if config == "resnet50": # Download ONNX model - save_path = "pybuda/test/quantized/simple_models/ResNet50-v1.5-Int8.onnx" + save_path = "third_party/confidential_customer_models/quantized/ResNet50-v1.5-Int8.onnx" if not os.path.exists(save_path): raise RuntimeError("Model not found") @@ -280,7 +280,6 @@ def resnet50_layer(training: bool, config: str, microbatch: int, devtype: str, a layer = config compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True # verify_cfg.verify_pybuda_codegen_vs_framework = False # hacking 7x7 to 1x1 will cause mismatches if compiler_cfg.balancer_policy == "default": @@ -297,7 +296,7 @@ def resnet50_layer(training: bool, config: str, microbatch: int, devtype: str, a fractured_conv_sparse_mms = [f"conv2d_0.dc.conv2d.3.dc.conv2d.{1 + i * 2}.dc.sparse_matmul.9.dc.sparse_matmul.1.lc2" for i in range(fracture_factor)] fractured_conv_dense_mms = [f"conv2d_0.dc.conv2d.3.dc.conv2d.{1 + i * 2}.dc.matmul.11" for i in range(fracture_factor)] - pybuda.insert_buffering_nop( + pybuda.insert_nop( "input_1", fractured_conv_sparse_mms, hoist_tms=True) diff --git a/pybuda/test/benchmark/benchmark/models/t5.py b/pybuda/test/benchmark/benchmark/models/t5.py index ba05fd19b..3c9ec955f 100644 --- a/pybuda/test/benchmark/benchmark/models/t5.py +++ b/pybuda/test/benchmark/benchmark/models/t5.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 import os - import pybuda from pybuda._C.backend_api import BackendDevice @@ -12,21 +11,29 @@ @benchmark_model(configs=["base", "large"]) -def t5(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str): +def t5(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str, math_fidelity: str): compiler_cfg = _get_global_compiler_config() if compiler_cfg.balancer_policy == "default": - compiler_cfg.balancer_policy = "NLP" + compiler_cfg.balancer_policy = "Ribbon" + os.environ["PYBUDA_RIBBON2"] = "1" + + # These are about to be enabled by default. 
+ # + os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1" + os.environ["PYBUDA_EXP_APPROX"] = "1" + + if data_type == "Bfp8_b": + pybuda.config.configure_mixed_precision(op_type="add", output_df=pybuda.DataFormat.Float16_b) + pybuda.config.configure_mixed_precision(op_type="subtract", output_df=pybuda.DataFormat.Float16_b) + pybuda.config.configure_mixed_precision(op_type="reciprocal", output_df=pybuda.DataFormat.Float16_b) available_devices = pybuda.detect_available_devices() # Determine model variant if config == "base": variant = "t5-base" - if available_devices and available_devices[0] == BackendDevice.Grayskull: - # Hang, this piece of code should be removed when we consume the budabackend fix - # https://yyz-gitlab.local.tenstorrent.com/tenstorrent/budabackend/-/merge_requests/1684 - return + elif config == "large": variant = "t5-large" else: @@ -44,12 +51,18 @@ def t5(training: bool, config: str, microbatch: int, devtype: str, arch: str, da @benchmark_model(configs=["base", "large"]) -def flan_t5(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str): +def flan_t5(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str, math_fidelity: str): compiler_cfg = _get_global_compiler_config() if compiler_cfg.balancer_policy == "default": - compiler_cfg.balancer_policy = "NLP" + compiler_cfg.balancer_policy = "Ribbon" + os.environ["PYBUDA_RIBBON2"] = "1" + + # These are about to be enabled by default. + # + os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1" + os.environ["PYBUDA_EXP_APPROX"] = "1" # Determine model variant if config == "base": @@ -60,8 +73,6 @@ def flan_t5(training: bool, config: str, microbatch: int, devtype: str, arch: st raise RuntimeError("Unknown config") if data_type == "Bfp8_b": - # tenstorrent/pybuda#2228 - os.environ["PYBUDA_LEGACY_KERNEL_BROADCAST"] = "1" pybuda.config.configure_mixed_precision(op_type="add", output_df=pybuda.DataFormat.Float16_b) pybuda.config.configure_mixed_precision(op_type="subtract", output_df=pybuda.DataFormat.Float16_b) pybuda.config.configure_mixed_precision(op_type="reciprocal", output_df=pybuda.DataFormat.Float16_b) diff --git a/pybuda/test/benchmark/benchmark/models/unet.py b/pybuda/test/benchmark/benchmark/models/unet.py index efb17718d..4d20606dc 100644 --- a/pybuda/test/benchmark/benchmark/models/unet.py +++ b/pybuda/test/benchmark/benchmark/models/unet.py @@ -10,28 +10,26 @@ @benchmark_model(configs=["256"]) -def unet(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str): +def unet(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str, math_fidelity: str): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_constant_prop = True + compiler_cfg.enable_auto_transposing_placement = True if compiler_cfg.balancer_policy == "default": compiler_cfg.balancer_policy = "Ribbon" os.environ["PYBUDA_RIBBON2"] = "1" - # These are about to be enabled by default. 
- # - os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "1" - if data_type != "Bfp8_b": - os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "1" - # Manually enable amp light for Ribbon if compiler_cfg.balancer_policy == "Ribbon": compiler_cfg.enable_amp_light() - if data_type == "Bfp8_b": - pybuda.config.configure_mixed_precision(op_type="matmul", output_df=pybuda.DataFormat.Float16_b) - pybuda.config.configure_mixed_precision(op_type="add", output_df=pybuda.DataFormat.Float16_b) + os.environ["PYBUDA_ENABLE_HOST_INPUT_NOP_BUFFERING"] = "1" + os.environ["PYBUDA_ALLOW_MULTICOLUMN_SPARSE_MATMUL"] = "1" + os.environ["PYBUDA_SUPRESS_T_FACTOR_MM"] = "60" + + # These are about to be enabled by default. + # + os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1" # Set model parameters based on chosen task and model configuration if config == "256": diff --git a/pybuda/test/benchmark/benchmark/models/unit.py b/pybuda/test/benchmark/benchmark/models/unit.py deleted file mode 100644 index dfddfcbff..000000000 --- a/pybuda/test/benchmark/benchmark/models/unit.py +++ /dev/null @@ -1,77 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -""" -Various non-real unit testing models for performance sanity and debug -""" - -import pybuda -import torch - -from ..common import benchmark_model - -# NLP feedforward bock -class NLPFF(pybuda.PyBudaModule): - def __init__(self, name: str, c: int): - super().__init__(name) - self.ff1_w = pybuda.Parameter(torch.rand(c, c * 4), requires_grad=True) - self.ff1_bias = pybuda.Parameter(torch.rand(1, c*4), requires_grad=True) - self.ff2_w = pybuda.Parameter(torch.rand(c * 4, c), requires_grad=True) - self.ff2_bias = pybuda.Parameter(torch.rand(1, c), requires_grad=True) - - def forward(self, act): - act = pybuda.op.Matmul("ff1", act, self.ff1_w) - act = pybuda.op.Add("ff1_bias", act, self.ff1_bias) - act = pybuda.op.Gelu("gelu", act) - act = pybuda.op.Matmul("ff2", act, self.ff2_w) - act = pybuda.op.Add("ff2_bias", act, self.ff2_bias) - return act - -# NLP self-attention batched matmul -class BMM(pybuda.PyBudaModule): - def __init__(self, name: str): - super().__init__(name) - self.bias = pybuda.Parameter(torch.rand(1, 1024), requires_grad=True) - - def forward(self, k, q, v): - k = pybuda.op.HSlice("k_slice", k, 16) - q = pybuda.op.HSlice("q_slice", q, 16) - v = pybuda.op.HSlice("v_slice", v, 16) - qt = pybuda.op.Transpose("qt", q, -1, -2) - out = pybuda.op.Matmul("bmm0", pybuda.op.Matmul("bmm1", k, qt), v) - out = pybuda.op.HStack("hstack", out) - return out + self.bias - - -@benchmark_model(configs=["nlp-ff-base", "nlp-ff-large", "nlp-bmm"]) -def unit(training: bool, config: str, microbatch: int, devtype: str, arch: str): - - if microbatch == 0: - microbatch = 64 - - targets = [] - if config == "nlp-ff-base": - models = {"tt": NLPFF(config, 768)} - inputs = [torch.rand(microbatch, 128, 768)] - if training: - targets = [torch.rand(microbatch, 128, 768)] - elif config == "nlp-ff-large": - models = {"tt": NLPFF(config, 1024)} - inputs = [torch.rand(microbatch, 384, 1024)] - if training: - targets = [torch.rand(microbatch, 384, 1024)] - - elif config == "nlp-bmm": - models = {"tt": BMM("bmm")} - inputs = [torch.rand(microbatch, 384, 1024), torch.rand(microbatch, 384, 1024), torch.rand(microbatch, 384, 1024)] - if training: - targets = [torch.rand(microbatch, 384, 1024)] - - else: - assert(False) - - if training: - models["cpu-loss"] = pybuda.PyTorchModule("l1loss", torch.nn.L1Loss()) - - return models, 
inputs, targets, {} - diff --git a/pybuda/test/benchmark/benchmark/models/vit.py b/pybuda/test/benchmark/benchmark/models/vit.py index f45779afd..8c93d9ba9 100644 --- a/pybuda/test/benchmark/benchmark/models/vit.py +++ b/pybuda/test/benchmark/benchmark/models/vit.py @@ -11,25 +11,26 @@ @benchmark_model(configs=["base", "large"]) -def vit(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str): +def vit(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str, math_fidelity: str): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_auto_transposing_placement = True if compiler_cfg.balancer_policy == "default": compiler_cfg.balancer_policy = "Ribbon" os.environ["PYBUDA_RIBBON2"] = "1" - os.environ["PYBUDA_RIBBON2_OPTIMIZATION_ITERATIONS"] = "10" - # These are about to be enabled by default. - # - if data_type != "Bfp8_b": - os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "1" - os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "1" + # These are about to be enabled by default. + # + os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1" + + if data_type == "Fp16_b": + os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES_APPLY_FILTERING"] = "1" if data_type == "Bfp8_b": + os.environ["PYBUDA_FORK_JOIN_BUF_QUEUES"] = "1" pybuda.config.configure_mixed_precision(op_type="reciprocal", output_df=pybuda.DataFormat.Float16_b) + os.environ["PYBUDA_FUSE_DF_OVERRIDE"] = "0" # Set model parameters based on chosen task and model configuration img_res = 224 diff --git a/pybuda/test/benchmark/benchmark/models/vovnet_v1.py b/pybuda/test/benchmark/benchmark/models/vovnet_v1.py deleted file mode 100644 index 507cb5cfd..000000000 --- a/pybuda/test/benchmark/benchmark/models/vovnet_v1.py +++ /dev/null @@ -1,57 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import pybuda -import torch - -from ..common import benchmark_model -from pybuda.config import _get_global_compiler_config -from pytorchcv.model_provider import get_model as ptcv_get_model - - -@benchmark_model(configs=["27s", "39", "57"]) -def vovnet_v1(training: bool, config: str, microbatch: int, devtype: str, arch: str): - compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True - - if compiler_cfg.balancer_policy == "default": - compiler_cfg.balancer_policy = "Ribbon" - - # Set model parameters based on chosen task and model configuration - img_res = 224 - - model_name = "" - if config == "27s": - model_name = "vovnet27s" - elif config == "39": - model_name = "vovnet39" - elif config == "57": - model_name = "vovnet57" - else: - raise RuntimeError("Unknown config") - - # Configure microbatch, if none provided - if microbatch == 0: - microbatch = 32 # default - - # Load model - model = ptcv_get_model(model_name, pretrained=True) - - # Configure model mode for training or evaluation - if training: - model.train() - else: - model.eval() - - modules = {"tt": pybuda.PyTorchModule(f"pt_vovnet_v1_{config}_{compiler_cfg.balancer_policy}", model)} - - input_shape = (microbatch, 3, img_res, img_res) - inputs = [torch.rand(*input_shape)] - targets = tuple() - - # Add loss function, if training - if training: - model["cpu-loss"] = pybuda.PyTorchModule("l1loss", torch.nn.L1Loss()) - targets = [torch.rand(1, 100)] - - return modules, inputs, targets, {} diff --git a/pybuda/test/benchmark/benchmark/models/vovnet_v2.py 
b/pybuda/test/benchmark/benchmark/models/vovnet_v2.py index 3dbb84ea6..7b5c0a9f7 100644 --- a/pybuda/test/benchmark/benchmark/models/vovnet_v2.py +++ b/pybuda/test/benchmark/benchmark/models/vovnet_v2.py @@ -11,30 +11,29 @@ @benchmark_model(configs=["19", "39", "99"]) -def vovnet_v2(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str): +def vovnet_v2(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str, math_fidelity: str): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True + from pybuda._C.backend_api import BackendDevice + available_devices = pybuda.detect_available_devices() + if available_devices[0] != BackendDevice.Grayskull: + compiler_cfg.enable_auto_transposing_placement = True if compiler_cfg.balancer_policy == "default": compiler_cfg.balancer_policy = "Ribbon" - os.environ["PYBUDA_RIBBON2"] = "1" + os.environ["PYBUDA_RIBBON2"] = "1" - os.environ["PYBUDA_DISABLE_EXPLICIT_DRAM_IO"] = "1" - os.environ["PYBUDA_RIBBON2_DISABLE_NON_MATMUL_UTIL"] = "1" + os.environ["PYBUDA_ALLOW_MULTICOLUMN_SPARSE_MATMUL"] = "1" + os.environ["PYBUDA_FORK_JOIN_BUF_QUEUES"] = "1" + os.environ["PYBUDA_SUPRESS_T_FACTOR_MM"] = "60" # These are about to be enabled by default. # - # os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "1" tenstorrent/budabackend#2459 - os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "1" + os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1" if config == "39" and data_type != "Bfp8_b": compiler_cfg.enable_amp_light() - if data_type == "Bfp8_b": - # tenstorrent/pybuda#2228 - os.environ["PYBUDA_LEGACY_KERNEL_BROADCAST"] = "1" - # Set model parameters based on chosen task and model configuration img_res = 224 diff --git a/pybuda/test/benchmark/benchmark/models/whisper.py b/pybuda/test/benchmark/benchmark/models/whisper.py index d91e4ea5b..d2ea32946 100644 --- a/pybuda/test/benchmark/benchmark/models/whisper.py +++ b/pybuda/test/benchmark/benchmark/models/whisper.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import pybuda import torch +import os from ..common import benchmark_model, generate_test_device from pybuda.config import _get_global_compiler_config @@ -11,7 +12,7 @@ @benchmark_model(configs=["tiny","small"]) -def whisper_decoder(training: bool, config: str, microbatch: int, devtype: str, arch: str): +def whisper_decoder(training: bool, config: str, microbatch: int, devtype: str, arch: str, math_fidelity: str): # Determine model variant if config == "tiny": variant = "openai/whisper-tiny" @@ -33,12 +34,13 @@ def whisper_decoder(training: bool, config: str, microbatch: int, devtype: str, @benchmark_model(configs=["small"]) -def whisper(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str): +def whisper(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str, math_fidelity: str): compiler_cfg = _get_global_compiler_config() if compiler_cfg.balancer_policy == "default": - compiler_cfg.balancer_policy = "NLP" + compiler_cfg.balancer_policy = "Ribbon" + os.environ["PYBUDA_RIBBON2"] = "1" # Determine model variant if config == "small": diff --git a/pybuda/test/benchmark/benchmark/models/yolo_v3.py b/pybuda/test/benchmark/benchmark/models/yolo_v3.py index 809ad2a37..eff53873e 100644 --- a/pybuda/test/benchmark/benchmark/models/yolo_v3.py +++ b/pybuda/test/benchmark/benchmark/models/yolo_v3.py @@ -14,28 +14,26 @@ @benchmark_model(configs=["default", "tiny"]) -def yolo_v3(training: bool, config: 
str, microbatch: int, devtype: str, arch: str, data_type: str): +def yolo_v3(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str, math_fidelity: str): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_auto_transposing_placement = True if compiler_cfg.balancer_policy == "default": compiler_cfg.balancer_policy = "Ribbon" os.environ["PYBUDA_RIBBON2"] = "1" - os.environ["PYBUDA_OVERRIDE_INPUT_QUEUE_ENTRIES"] = "32" + os.environ["PYBUDA_ENABLE_HOST_INPUT_NOP_BUFFERING"] = "1" # These are about to be enabled by default. # - os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "1" - if data_type != "Bfp8_b": - os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "1" + os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1" if data_type == "Bfp8_b": os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1" + os.environ["PYBUDA_ALLOW_MULTICOLUMN_SPARSE_MATMUL"] = "1" - # TODO: Due to issue tenstorrent/pybuda#1297 - pybuda.config.override_op_size("_fused_op_1", (2, 2)) + if data_type == "Fp16_b": + os.environ["PYBUDA_OVERRIDE_INPUT_QUEUE_ENTRIES"] = "32" # Set model parameters based on chosen task and model configuration config_name = "" diff --git a/pybuda/test/benchmark/benchmark/models/yolo_v5.py b/pybuda/test/benchmark/benchmark/models/yolo_v5.py index 8147b9a4d..3af749ea5 100644 --- a/pybuda/test/benchmark/benchmark/models/yolo_v5.py +++ b/pybuda/test/benchmark/benchmark/models/yolo_v5.py @@ -11,31 +11,34 @@ @benchmark_model(configs=["s", "m"]) -def yolo_v5(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str): +def yolo_v5(training: bool, config: str, microbatch: int, devtype: str, arch: str, data_type: str, math_fidelity: str): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True - - from pybuda._C.backend_api import BackendDevice + compiler_cfg.enable_auto_transposing_placement = True if compiler_cfg.balancer_policy == "default": compiler_cfg.balancer_policy = "Ribbon" os.environ["PYBUDA_RIBBON2"] = "1" - # These are about to be enabled by default. - # - os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "1" - if data_type != "Bfp8_b": - os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "1" - - if data_type == "Bfp8_b": - os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1" - + from pybuda._C.backend_api import BackendDevice available_devices = pybuda.detect_available_devices() + + # Temp perf workaround for tenstorrent/bbe#2595 + os.environ["PYBUDA_PAD_OUTPUT_BUFFER"] = "1" + + if data_type == "Fp16_b": + if available_devices[0] != BackendDevice.Grayskull: + os.environ["PYBUDA_FORK_JOIN_BUF_QUEUES"] = "1" + + if data_type == "Bfp8_b": + os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1" + # Temp workaround for tenstorrent/bbe#2595, output BW is unpredictable. 
+ os.environ["PYBUDA_DISABLE_STREAM_OUTPUT"] = "1" + if available_devices[0] == BackendDevice.Grayskull: compiler_cfg.enable_tm_cpu_fallback = True - - elif available_devices[0] == BackendDevice.Wormhole_B0: - os.environ["PYBUDA_SUPRESS_T_FACTOR_MM"] = "49" + compiler_cfg.enable_tm_cpu_fallback = True + compiler_cfg.enable_auto_fusing = False # required to fix accuracy + os.environ["PYBUDA_DECOMPOSE_SIGMOID"] = "1" # Set model parameters based on chosen task and model configuration config_name = "" diff --git a/pybuda/test/benchmark/run_benchmark_df_bfp8 b/pybuda/test/benchmark/run_benchmark_df_bfp8 deleted file mode 100644 index 3359c651d..000000000 --- a/pybuda/test/benchmark/run_benchmark_df_bfp8 +++ /dev/null @@ -1,63 +0,0 @@ -rm perf.json - -# ------------------------------------------------------------------------------------------------------------------------------------------------------------ # -# Data Format Bfp8_b -# ------------------------------------------------------------------------------------------------------------------------------------------------------------ # - -# Default data format (-df) is Bfp8_b, default math fidelity (-mf) is LoFi - -# ResNet -pybuda/test/benchmark/benchmark.py -m resnet -c resnet50 -o perf.json - -# Mobilenet v1 -pybuda/test/benchmark/benchmark.py -m mobilenet_v1 -c 224 -mf HiFi2 -o perf.json - -# Mobilenet v2 -pybuda/test/benchmark/benchmark.py -m mobilenet_v2 -c 224 -mf HiFi2 -o perf.json - -# Mobilenet v3 -pybuda/test/benchmark/benchmark.py -m mobilenet_v3_timm -c large -o perf.json - -# Vit -pybuda/test/benchmark/benchmark.py -m vit -c base -o perf.json - -# Deit -pybuda/test/benchmark/benchmark.py -m deit -c base -o perf.json - -# VoVNet v2 -pybuda/test/benchmark/benchmark.py -m vovnet_v2 -c 39 -o perf.json - -# OpenPose Body -pybuda/test/benchmark/benchmark.py -m openpose_osmr_body -c 2d -o perf.json - -# OpenPose Hand -pybuda/test/benchmark/benchmark.py -m openpose_hand -c basic -o perf.json - -# HRNet -pybuda/test/benchmark/benchmark.py -m hrnet -c v2_w64 -o perf.json - -# YOLOv3 -pybuda/test/benchmark/benchmark.py -m yolo_v3 -c default -mb 32 -o perf.json - -# YOLOv5 -pybuda/test/benchmark/benchmark.py -m yolo_v5 -c s -o perf.json - -# Inception v4 -pybuda/test/benchmark/benchmark.py -m inception_v4 -c 224 -o perf.json - -# UNet -pybuda/test/benchmark/benchmark.py -m unet -c 256 -mb 48 -mf HiFi2 -o perf.json - -# Bert large -pybuda/test/benchmark/benchmark.py -m bert -c large_tc -o perf.json - -# Whisper -pybuda/test/benchmark/benchmark.py -m whisper -c small --loop_count 1 -mb 1 --single-thread --generative -o perf.json - -# T5 -pybuda/test/benchmark/benchmark.py -m t5 -c base --loop_count 1 -mb 1 --single-thread --generative -o perf.json -pybuda/test/benchmark/benchmark.py -m t5 -c large --loop_count 1 -mb 1 --single-thread --generative -o perf.json - -# Flan-T5 -pybuda/test/benchmark/benchmark.py -m flan_t5 -c base --loop_count 1 -mb 1 -mf HiFi2 --single-thread --generative -o perf.json -pybuda/test/benchmark/benchmark.py -m flan_t5 -c large --loop_count 1 -mb 1 -mf HiFi2 --single-thread --generative -o perf.json \ No newline at end of file diff --git a/pybuda/test/benchmark/run_benchmark_df_fp16 b/pybuda/test/benchmark/run_benchmark_df_fp16 deleted file mode 100644 index dd46e559e..000000000 --- a/pybuda/test/benchmark/run_benchmark_df_fp16 +++ /dev/null @@ -1,67 +0,0 @@ -rm perf.json - - -# 
------------------------------------------------------------------------------------------------------------------------------------------------------------ # -# Data Format Fp16, Fp16_b -# ------------------------------------------------------------------------------------------------------------------------------------------------------------ # - -# ResNet -pybuda/test/benchmark/benchmark.py -m resnet -c resnet50 -df Fp16_b -mf HiFi3 -o perf.json -pybuda/test/benchmark/benchmark.py -m resnet_quant -c resnet50 -df Fp32 -mf HiFi4 -o perf.json - -# Mobilenet v1 -pybuda/test/benchmark/benchmark.py -m mobilenet_v1 -c 224 -df Fp16_b -mf HiFi2 -o perf.json - -# Mobilenet v2 -pybuda/test/benchmark/benchmark.py -m mobilenet_v2 -c 224 -df Fp16_b -mf HiFi2 -o perf.json - -# Mobilenet v3 -pybuda/test/benchmark/benchmark.py -m mobilenet_v3_timm -c large -df Fp16_b -mf HiFi2 -o perf.json - -# Vit -pybuda/test/benchmark/benchmark.py -m vit -c base -df Fp16_b -mf HiFi2 -o perf.json - -# Deit -pybuda/test/benchmark/benchmark.py -m deit -c base -df Fp16_b -mf HiFi2 -o perf.json - -# VoVNet v2 -pybuda/test/benchmark/benchmark.py -m vovnet_v2 -c 39 -df Fp16_b -mf HiFi3 -o perf.json - -# OpenPose Body -pybuda/test/benchmark/benchmark.py -m openpose_osmr_body -c 2d -df Fp16 -mf HiFi3 -o perf.json - -# OpenPose Hand -pybuda/test/benchmark/benchmark.py -m openpose_hand -c basic -df Fp16_b -mf HiFi3 -o perf.json - -# HRNet -pybuda/test/benchmark/benchmark.py -m hrnet -c v2_w64 -df Fp16_b -mf HiFi3 -o perf.json - -# YOLOv3 -# Issue to remove grid size override from model tenstorrent/pybuda#1297 -# Issue to make it run with mb 64 tenstorrent/pybuda#1298 -# Issue to remove PYBUDA_OVERRIDE_INPUT_QUEUE_ENTRIES=32 tenstorrent/pybuda#1299 -pybuda/test/benchmark/benchmark.py -m yolo_v3 -c default -mb 32 -df Fp16_b -mf HiFi3 -o perf.json - -# YOLOv5 -pybuda/test/benchmark/benchmark.py -m yolo_v5 -c s -df Fp16_b -mf HiFi3 -o perf.json - -# Inception v4 -pybuda/test/benchmark/benchmark.py -m inception_v4 -c 224 -df Fp16_b -mf HiFi3 -o perf.json - -# UNet -pybuda/test/benchmark/benchmark.py -m unet -c 256 -mb 48 -df Fp16_b -mf HiFi3 -o perf.json - -# Bert large -pybuda/test/benchmark/benchmark.py -m bert -c large_tc -df Fp16_b -mf HiFi3 -o perf.json - -# Whisper -pybuda/test/benchmark/benchmark.py -m whisper -c small --loop_count 1 -mb 1 -df Fp16_b -mf HiFi3 --single-thread --generative -o perf.json - -# T5 -pybuda/test/benchmark/benchmark.py -m t5 -c base --loop_count 1 -mb 1 -df Fp16_b -mf HiFi3 --single-thread --generative -o perf.json -pybuda/test/benchmark/benchmark.py -m t5 -c large --loop_count 1 -mb 1 -df Fp16_b -mf HiFi3 --single-thread --generative -o perf.json - -# Flan-T5 -pybuda/test/benchmark/benchmark.py -m flan_t5 -c base --loop_count 1 -mb 1 -df Fp16_b -mf HiFi3 --single-thread --generative -o perf.json -pybuda/test/benchmark/benchmark.py -m flan_t5 -c large --loop_count 1 -mb 1 -df Fp16_b -mf HiFi3 --single-thread --generative -o perf.json - diff --git a/pybuda/test/benchmark/run_benchmark_gs_e150_df_bfp8 b/pybuda/test/benchmark/run_benchmark_gs_e150_df_bfp8 new file mode 100644 index 000000000..27b22ec72 --- /dev/null +++ b/pybuda/test/benchmark/run_benchmark_gs_e150_df_bfp8 @@ -0,0 +1,49 @@ +# ------------------------------------------------------------------------------------------------------------------------------------------------------------ # +# Data Format Bfp8_b +# 
------------------------------------------------------------------------------------------------------------------------------------------------------------ # + +# ------------------------------------------------------- # +# Grayskull e150, unharvested chip, grid size: 10x12 +# ------------------------------------------------------- # + +# Resnet +pybuda/test/benchmark/benchmark.py -m resnet -c resnet50 -o pybuda-silicon-gs-e150-perf-all-perf.json + +# Mobilenet v1 +pybuda/test/benchmark/benchmark.py -m mobilenet_v1 -c 224 -mf HiFi2 -o pybuda-silicon-gs-e150-perf-all-perf.json + +# Mobilenet v2 +pybuda/test/benchmark/benchmark.py -m mobilenet_v2 -c 224 -o pybuda-silicon-gs-e150-perf-all-perf.json + +# Mobilenet v3 +pybuda/test/benchmark/benchmark.py -m mobilenet_v3_timm -c large -o pybuda-silicon-gs-e150-perf-all-perf.json + +# Vovnet v2 +pybuda/test/benchmark/benchmark.py -m vovnet_v2 -c 39 -o pybuda-silicon-gs-e150-perf-all-perf.json + +# Openpose Body +pybuda/test/benchmark/benchmark.py -m openpose_osmr_body -c 2d -o pybuda-silicon-gs-e150-perf-all-perf.json + +# Openpose Hand +pybuda/test/benchmark/benchmark.py -m openpose_hand -c basic -o pybuda-silicon-gs-e150-perf-all-perf.json + +# YOLOv3 +pybuda/test/benchmark/benchmark.py -m yolo_v3 -c default -mb 32 -o pybuda-silicon-gs-e150-perf-all-perf.json + +# YOLOv5 +pybuda/test/benchmark/benchmark.py -m yolo_v5 -c s -o pybuda-silicon-gs-e150-perf-all-perf.json + +# Inception v4 +pybuda/test/benchmark/benchmark.py -m inception_v4 -c 224 -o pybuda-silicon-gs-e150-perf-all-perf.json + +# Unet +pybuda/test/benchmark/benchmark.py -m unet -c 256 -mb 64 -o pybuda-silicon-gs-e150-perf-all-perf.json + +# Whisper +pybuda/test/benchmark/benchmark.py -m whisper -c small --loop_count 1 -mb 1 --single-thread --generative -o pybuda-silicon-gs-e150-perf-all-perf.json + +# T5 +pybuda/test/benchmark/benchmark.py -m t5 -c large --loop_count 1 -mb 1 --single-thread --generative -o pybuda-silicon-gs-e150-perf-all-perf.json + +# Flan-T5 +pybuda/test/benchmark/benchmark.py -m flan_t5 -c large --loop_count 1 -mb 1 -mf HiFi2 --single-thread --generative -o pybuda-silicon-gs-e150-perf-all-perf.json \ No newline at end of file diff --git a/pybuda/test/benchmark/run_benchmark_gs_e150_df_fp16 b/pybuda/test/benchmark/run_benchmark_gs_e150_df_fp16 new file mode 100644 index 000000000..7f1e7c1a3 --- /dev/null +++ b/pybuda/test/benchmark/run_benchmark_gs_e150_df_fp16 @@ -0,0 +1,19 @@ +# ------------------------------------------------------------------------------------------------------------------------------------------------------------ # +# Data Format Fp16, Fp16_b +# ------------------------------------------------------------------------------------------------------------------------------------------------------------ # + +# ------------------------------------------------------- # +# Grayskull e150, unharvested chip, grid size: 10x12 +# ------------------------------------------------------- # + +# Vit +pybuda/test/benchmark/benchmark.py -m vit -c base -df Fp16_b -mf HiFi2 -o pybuda-silicon-gs-e150-perf-all-perf.json + +# Deit +pybuda/test/benchmark/benchmark.py -m deit -c base -df Fp16_b -mf HiFi2 -o pybuda-silicon-gs-e150-perf-all-perf.json + +# Hrnet +pybuda/test/benchmark/benchmark.py -m hrnet -c v2_w64 -df Fp16_b -mf HiFi3 -o pybuda-silicon-gs-e150-perf-all-perf.json + +# Bert +pybuda/test/benchmark/benchmark.py -m bert -c large_tc -df Fp16_b -mf HiFi3 -o pybuda-silicon-gs-e150-perf-all-perf.json \ No newline at end of file diff --git 
a/pybuda/test/benchmark/run_benchmark_gs_e150_release b/pybuda/test/benchmark/run_benchmark_gs_e150_release new file mode 100644 index 000000000..e94f4f55a --- /dev/null +++ b/pybuda/test/benchmark/run_benchmark_gs_e150_release @@ -0,0 +1,61 @@ +# ------------------------------------------------------------------------------------------------------------------------------------------------------------ # +# Models with data Formats that have good accuracy on Grayskull and that we release as official numbers +# ------------------------------------------------------------------------------------------------------------------------------------------------------------ # + +# ------------------------------------------------------- # +# Grayskull e150, unharvested chip, grid size: 10x12 +# ------------------------------------------------------- # + +# ResNet fp16_b +pybuda/test/benchmark/benchmark.py -m resnet -c resnet50 -mb 64 -df Fp16_b -mf HiFi3 -o pybuda-silicon-gs-e150-perf-all-perf.json + +# Mobilenet v1 fp16_b +pybuda/test/benchmark/benchmark.py -m mobilenet_v1 -c 224 -mb 64 -df Fp16_b -mf HiFi3 -o pybuda-silicon-gs-e150-perf-all-perf.json + +# Mobilenet v2 fp16_b +pybuda/test/benchmark/benchmark.py -m mobilenet_v2 -c 224 -mb 64 -df Fp16_b -mf HiFi3 -o pybuda-silicon-gs-e150-perf-all-perf.json + +# Mobilenet v3 fp16_b +pybuda/test/benchmark/benchmark.py -m mobilenet_v3_timm -c large -mb 64 -df Fp16_b -mf HiFi3 -o pybuda-silicon-gs-e150-perf-all-perf.json + +# Vit bfp8_b +pybuda/test/benchmark/benchmark.py -m vit -c base -mb 64 -o pybuda-silicon-gs-e150-perf-all-perf.json + +# Deit bfp8_b +pybuda/test/benchmark/benchmark.py -m deit -c base -mb 64 -o pybuda-silicon-gs-e150-perf-all-perf.json + +# VoVNet v2 fp16_b +pybuda/test/benchmark/benchmark.py -m vovnet_v2 -c 39 -mb 64 -df Fp16_b -mf HiFi3 -o pybuda-silicon-gs-e150-perf-all-perf.json + +# OpenPose Body fp16 +pybuda/test/benchmark/benchmark.py -m openpose_osmr_body -c 2d -mb 64 -df Fp16 -mf HiFi3 -o pybuda-silicon-gs-e150-perf-all-perf.json + +# OpenPose Hand fp16_b +pybuda/test/benchmark/benchmark.py -m openpose_hand -c basic -mb 64 -df Fp16_b -mf HiFi3 -o pybuda-silicon-gs-e150-perf-all-perf.json + +# HRNet bfp8_b +pybuda/test/benchmark/benchmark.py -m hrnet -c v2_w64 -mb 64 -o pybuda-silicon-gs-e150-perf-all-perf.json + +# YOLOv3 fp16_b +pybuda/test/benchmark/benchmark.py -m yolo_v3 -c default -mb 32 -df Fp16_b -mf HiFi3 -o pybuda-silicon-gs-e150-perf-all-perf.json + +# YOLOv5 fp16_b +pybuda/test/benchmark/benchmark.py -m yolo_v5 -c s -mb 64 -df Fp16_b -mf HiFi3 -o pybuda-silicon-gs-e150-perf-all-perf.json + +# Inception v4 fp16_b +pybuda/test/benchmark/benchmark.py -m inception_v4 -c 224 -mb 64 -df Fp16_b -mf HiFi3 -o pybuda-silicon-gs-e150-perf-all-perf.json + +# UNet fp16_b +pybuda/test/benchmark/benchmark.py -m unet -c 256 -mb 64 -df Fp16_b -mf HiFi3 -o pybuda-silicon-gs-e150-perf-all-perf.json + +# Bert large bfp8_b +pybuda/test/benchmark/benchmark.py -m bert -c large_tc -mb 64 -o pybuda-silicon-gs-e150-perf-all-perf.json + +# Whisper fp16_b +pybuda/test/benchmark/benchmark.py -m whisper -c small --loop_count 1 -mb 1 -df Fp16_b -mf HiFi3 --single-thread --generative -o pybuda-silicon-gs-e150-perf-all-perf.json + +# T5 fp16_b +pybuda/test/benchmark/benchmark.py -m t5 -c large --loop_count 1 -mb 1 -df Fp16_b -mf HiFi3 --single-thread --generative -o pybuda-silicon-gs-e150-perf-all-perf.json + +# Flan-T5 fp16_b +pybuda/test/benchmark/benchmark.py -m flan_t5 -c large --loop_count 1 -mb 1 -df Fp16_b -mf HiFi3 --single-thread 
--generative -o pybuda-silicon-gs-e150-perf-all-perf.json diff --git a/pybuda/test/benchmark/run_benchmark_gs_e75_df_bfp8 b/pybuda/test/benchmark/run_benchmark_gs_e75_df_bfp8 new file mode 100644 index 000000000..2cc788587 --- /dev/null +++ b/pybuda/test/benchmark/run_benchmark_gs_e75_df_bfp8 @@ -0,0 +1,51 @@ +# ------------------------------------------------------------------------------------------------------------------------------------------------------------ # +# Data Format Bfp8_b +# ------------------------------------------------------------------------------------------------------------------------------------------------------------ # + +# ------------------------------------------------------- # +# Grayskull e75, two-row harvested chip, grid size: 8x12 +# ------------------------------------------------------- # + +export PYBUDA_FORCE_EMULATE_HARVESTED=2 + +# Resnet +pybuda/test/benchmark/benchmark.py -m resnet -c resnet50 -o pybuda-silicon-gs-e75-perf-all-perf.json + +# Mobilenet v1 +pybuda/test/benchmark/benchmark.py -m mobilenet_v1 -c 224 -mf HiFi2 -o pybuda-silicon-gs-e75-perf-all-perf.json + +# Mobilenet v2 +pybuda/test/benchmark/benchmark.py -m mobilenet_v2 -c 224 -o pybuda-silicon-gs-e75-perf-all-perf.json + +# Mobilenet v3 +pybuda/test/benchmark/benchmark.py -m mobilenet_v3_timm -c large -o pybuda-silicon-gs-e75-perf-all-perf.json + +# Vovnet v2 +pybuda/test/benchmark/benchmark.py -m vovnet_v2 -c 39 -o pybuda-silicon-gs-e75-perf-all-perf.json + +# Openpose Body +pybuda/test/benchmark/benchmark.py -m openpose_osmr_body -c 2d -o pybuda-silicon-gs-e75-perf-all-perf.json + +# Openpose Hand +pybuda/test/benchmark/benchmark.py -m openpose_hand -c basic -o pybuda-silicon-gs-e75-perf-all-perf.json + +# YOLOv3 +pybuda/test/benchmark/benchmark.py -m yolo_v3 -c default -mb 32 -o pybuda-silicon-gs-e75-perf-all-perf.json + +# YOLOv5 +pybuda/test/benchmark/benchmark.py -m yolo_v5 -c s -o pybuda-silicon-gs-e75-perf-all-perf.json + +# Inception v4 +pybuda/test/benchmark/benchmark.py -m inception_v4 -c 224 -o pybuda-silicon-gs-e75-perf-all-perf.json + +# Unet +pybuda/test/benchmark/benchmark.py -m unet -c 256 -mb 64 -o pybuda-silicon-gs-e75-perf-all-perf.json + +# Whisper +pybuda/test/benchmark/benchmark.py -m whisper -c small --loop_count 1 -mb 1 --single-thread --generative -o pybuda-silicon-gs-e75-perf-all-perf.json + +# T5 +pybuda/test/benchmark/benchmark.py -m t5 -c large --loop_count 1 -mb 1 --single-thread --generative -o pybuda-silicon-gs-e75-perf-all-perf.json + +# Flan-T5 +pybuda/test/benchmark/benchmark.py -m flan_t5 -c large --loop_count 1 -mb 1 -mf HiFi2 --single-thread --generative -o pybuda-silicon-gs-e75-perf-all-perf.json \ No newline at end of file diff --git a/pybuda/test/benchmark/run_benchmark_gs_e75_df_fp16 b/pybuda/test/benchmark/run_benchmark_gs_e75_df_fp16 new file mode 100644 index 000000000..7dc2461e7 --- /dev/null +++ b/pybuda/test/benchmark/run_benchmark_gs_e75_df_fp16 @@ -0,0 +1,21 @@ +# ------------------------------------------------------------------------------------------------------------------------------------------------------------ # +# Data Format Fp16, Fp16_b +# ------------------------------------------------------------------------------------------------------------------------------------------------------------ # + +# ------------------------------------------------------- # +# Grayskull e75, two-row harvested chip, grid size: 8x12 +# ------------------------------------------------------- # + +export PYBUDA_FORCE_EMULATE_HARVESTED=2 + 
+# Vit +pybuda/test/benchmark/benchmark.py -m vit -c base -df Fp16_b -mf HiFi2 -o pybuda-silicon-gs-e75-perf-all-perf.json + +# Deit +pybuda/test/benchmark/benchmark.py -m deit -c base -df Fp16_b -mf HiFi2 -o pybuda-silicon-gs-e75-perf-all-perf.json + +# Hrnet +pybuda/test/benchmark/benchmark.py -m hrnet -c v2_w64 -df Fp16_b -mf HiFi3 -o pybuda-silicon-gs-e75-perf-all-perf.json + +# Bert +pybuda/test/benchmark/benchmark.py -m bert -c large_tc -df Fp16_b -mf HiFi3 -o pybuda-silicon-gs-e75-perf-all-perf.json \ No newline at end of file diff --git a/pybuda/test/benchmark/run_benchmark_gs_e75_release b/pybuda/test/benchmark/run_benchmark_gs_e75_release new file mode 100644 index 000000000..264a62091 --- /dev/null +++ b/pybuda/test/benchmark/run_benchmark_gs_e75_release @@ -0,0 +1,63 @@ +# ------------------------------------------------------------------------------------------------------------------------------------------------------------ # +# Models with data Formats that have good accuracy on Grayskull and that we release as official numbers +# ------------------------------------------------------------------------------------------------------------------------------------------------------------ # + +# ------------------------------------------------------- # +# Grayskull e75, two-row harvested chip, grid size: 8x12 +# ------------------------------------------------------- # + +export PYBUDA_FORCE_EMULATE_HARVESTED=2 + +# ResNet fp16_b +pybuda/test/benchmark/benchmark.py -m resnet -c resnet50 -mb 64 -df Fp16_b -mf HiFi3 -o pybuda-silicon-gs-e75-perf-all-perf.json + +# Mobilenet v1 fp16_b +pybuda/test/benchmark/benchmark.py -m mobilenet_v1 -c 224 -mb 64 -df Fp16_b -mf HiFi3 -o pybuda-silicon-gs-e75-perf-all-perf.json + +# Mobilenet v2 fp16_b +pybuda/test/benchmark/benchmark.py -m mobilenet_v2 -c 224 -mb 64 -df Fp16_b -mf HiFi3 -o pybuda-silicon-gs-e75-perf-all-perf.json + +# Mobilenet v3 fp16_b +pybuda/test/benchmark/benchmark.py -m mobilenet_v3_timm -c large -mb 64 -df Fp16_b -mf HiFi3 -o pybuda-silicon-gs-e75-perf-all-perf.json + +# Vit bfp8_b +pybuda/test/benchmark/benchmark.py -m vit -c base -mb 64 -o pybuda-silicon-gs-e75-perf-all-perf.json + +# Deit bfp8_b +pybuda/test/benchmark/benchmark.py -m deit -c base -mb 64 -o pybuda-silicon-gs-e75-perf-all-perf.json + +# VoVNet v2 fp16_b +pybuda/test/benchmark/benchmark.py -m vovnet_v2 -c 39 -mb 64 -df Fp16_b -mf HiFi3 -o pybuda-silicon-gs-e75-perf-all-perf.json + +# OpenPose Body fp16 +pybuda/test/benchmark/benchmark.py -m openpose_osmr_body -c 2d -mb 64 -df Fp16 -mf HiFi3 -o pybuda-silicon-gs-e75-perf-all-perf.json + +# OpenPose Hand fp16_b +pybuda/test/benchmark/benchmark.py -m openpose_hand -c basic -mb 64 -df Fp16_b -mf HiFi3 -o pybuda-silicon-gs-e75-perf-all-perf.json + +# HRNet bfp8_b +pybuda/test/benchmark/benchmark.py -m hrnet -c v2_w64 -mb 64 -o pybuda-silicon-gs-e75-perf-all-perf.json + +# YOLOv3 fp16_b +pybuda/test/benchmark/benchmark.py -m yolo_v3 -c default -mb 32 -df Fp16_b -mf HiFi3 -o pybuda-silicon-gs-e75-perf-all-perf.json + +# YOLOv5 fp16_b +pybuda/test/benchmark/benchmark.py -m yolo_v5 -c s -mb 64 -df Fp16_b -mf HiFi3 -o pybuda-silicon-gs-e75-perf-all-perf.json + +# Inception v4 fp16_b +pybuda/test/benchmark/benchmark.py -m inception_v4 -c 224 -mb 64 -df Fp16_b -mf HiFi3 -o pybuda-silicon-gs-e75-perf-all-perf.json + +# UNet fp16_b +pybuda/test/benchmark/benchmark.py -m unet -c 256 -mb 64 -df Fp16_b -mf HiFi3 -o pybuda-silicon-gs-e75-perf-all-perf.json + +# Bert large bfp8_b +pybuda/test/benchmark/benchmark.py -m bert 
-c large_tc -mb 64 -o pybuda-silicon-gs-e75-perf-all-perf.json + +# Whisper fp16_b +pybuda/test/benchmark/benchmark.py -m whisper -c small --loop_count 1 -mb 1 -df Fp16_b -mf HiFi3 --single-thread --generative -o pybuda-silicon-gs-e75-perf-all-perf.json + +# T5 fp16_b +pybuda/test/benchmark/benchmark.py -m t5 -c large --loop_count 1 -mb 1 -df Fp16_b -mf HiFi3 --single-thread --generative -o pybuda-silicon-gs-e75-perf-all-perf.json + +# Flan-T5 fp16_b +pybuda/test/benchmark/benchmark.py -m flan_t5 -c large --loop_count 1 -mb 1 -df Fp16_b -mf HiFi3 --single-thread --generative -o pybuda-silicon-gs-e75-perf-all-perf.json diff --git a/pybuda/test/benchmark/run_benchmark_tti b/pybuda/test/benchmark/run_benchmark_tti index 771ae1fb0..247ba4cc8 100644 --- a/pybuda/test/benchmark/run_benchmark_tti +++ b/pybuda/test/benchmark/run_benchmark_tti @@ -4,8 +4,8 @@ unset PYBUDA_CI_DIR # TTI Save pybuda/test/benchmark/benchmark.py -m bert -c tiny -opt 4 -o perf.json --env "PYBUDA_EXP_APPROX=1 PYBUDA_DISABLE_DYNAMIC_DRAM=1 PYBUDA_FORCE_INTERMED_TO_OUTPUT_DF=1" --auto_transpose --save_tti device_images/bert_tiny.tti -pybuda/test/benchmark/benchmark.py -m mobilenet_v1 -c 224 -opt 4 --loop_count 32 -mb 64 -bp Ribbon -df Fp16_b -mf HiFi2 --env "PYBUDA_RIBBON2=1 PYBUDA_LEGACY_UBLOCK_SHAPE=1 PYBUDA_MAXIMIZE_SPARSE_UBLOCK=1 PYBUDA_ENABLE_L1_ACCUMULATE=1 PYBUDA_EXTRA_L1_MARGIN=65536 PYBUDA_FUSED_OP_MULTIPLIER=20 PYBUDA_ENABLE_DEPTHWISE=1 " -o perf.json --auto_transpose --save_tti device_images/mobilenet_v1.tti +pybuda/test/benchmark/benchmark.py -m mobilenet_v1 -c 224 -opt 4 --loop_count 32 -mb 64 -bp Ribbon -df Fp16_b -mf HiFi2 --env "PYBUDA_RIBBON2=1 PYBUDA_LEGACY_UBLOCK_SHAPE=1 PYBUDA_MAXIMIZE_SPARSE_UBLOCK=1 PYBUDA_ENABLE_L1_ACCUMULATE=1 PYBUDA_EXTRA_L1_MARGIN=65536 PYBUDA_FUSED_OP_MULTIPLIER=20 PYBUDA_ENABLE_DEPTHWISE=1" -o perf.json --auto_transpose --save_tti device_images/mobilenet_v1.tti # TTI Load pybuda/test/benchmark/benchmark.py -m bert -c tiny -opt 4 -o perf.json --env "PYBUDA_EXP_APPROX=1 PYBUDA_DISABLE_DYNAMIC_DRAM=1 PYBUDA_FORCE_INTERMED_TO_OUTPUT_DF=1" --auto_transpose --load_tti device_images/bert_tiny.tti -pybuda/test/benchmark/benchmark.py -m mobilenet_v1 -c 224 -opt 4 --loop_count 32 -mb 64 -bp Ribbon -df Fp16_b -mf HiFi2 --env "PYBUDA_RIBBON2=1 PYBUDA_LEGACY_UBLOCK_SHAPE=1 PYBUDA_MAXIMIZE_SPARSE_UBLOCK=1 PYBUDA_ENABLE_L1_ACCUMULATE=1 PYBUDA_EXTRA_L1_MARGIN=65536 PYBUDA_FUSED_OP_MULTIPLIER=20 PYBUDA_ENABLE_DEPTHWISE=1 " -o perf.json --auto_transpose --load_tti device_images/mobilenet_v1.tti +pybuda/test/benchmark/benchmark.py -m mobilenet_v1 -c 224 -opt 4 --loop_count 32 -mb 64 -bp Ribbon -df Fp16_b -mf HiFi2 --env "PYBUDA_RIBBON2=1 PYBUDA_LEGACY_UBLOCK_SHAPE=1 PYBUDA_MAXIMIZE_SPARSE_UBLOCK=1 PYBUDA_ENABLE_L1_ACCUMULATE=1 PYBUDA_EXTRA_L1_MARGIN=65536 PYBUDA_FUSED_OP_MULTIPLIER=20 PYBUDA_ENABLE_DEPTHWISE=1" -o perf.json --auto_transpose --load_tti device_images/mobilenet_v1.tti diff --git a/pybuda/test/benchmark/run_benchmark_wh_df_bfp8 b/pybuda/test/benchmark/run_benchmark_wh_df_bfp8 new file mode 100644 index 000000000..9a731bf0f --- /dev/null +++ b/pybuda/test/benchmark/run_benchmark_wh_df_bfp8 @@ -0,0 +1,25 @@ +# ------------------------------------------------------------------------------------------------------------------------------------------------------------ # +# Data Format Bfp8_b +# ------------------------------------------------------------------------------------------------------------------------------------------------------------ # + +# Default data format (-df) is 
Bfp8_b, default math fidelity (-mf) is LoFi + +# Mobilenet v3 +pybuda/test/benchmark/benchmark.py -m mobilenet_v3_timm -c large -o pybuda-silicon-wh-b0-perf-all-bfp8_b-perf.json + +# OpenPose Body +pybuda/test/benchmark/benchmark.py -m openpose_osmr_body -c 2d -o pybuda-silicon-wh-b0-perf-all-bfp8_b-perf.json + +# YOLOv5 +pybuda/test/benchmark/benchmark.py -m yolo_v5 -c s -o pybuda-silicon-wh-b0-perf-all-bfp8_b-perf.json + +# Whisper +pybuda/test/benchmark/benchmark.py -m whisper -c small --loop_count 1 -mb 1 --single-thread --generative -o pybuda-silicon-wh-b0-perf-all-bfp8_b-perf.json + +# T5 +# Low accuracy. +pybuda/test/benchmark/benchmark.py -m t5 -c large --loop_count 1 -mb 1 --single-thread --generative -o pybuda-silicon-wh-b0-perf-all-bfp8_b-perf.json + +# Flan-T5 +# Low accuracy. +pybuda/test/benchmark/benchmark.py -m flan_t5 -c large --loop_count 1 -mb 1 -mf HiFi2 --single-thread --generative -o pybuda-silicon-wh-b0-perf-all-bfp8_b-perf.json diff --git a/pybuda/test/benchmark/run_benchmark_wh_df_fp16 b/pybuda/test/benchmark/run_benchmark_wh_df_fp16 new file mode 100644 index 000000000..385a366ce --- /dev/null +++ b/pybuda/test/benchmark/run_benchmark_wh_df_fp16 @@ -0,0 +1,41 @@ +# ------------------------------------------------------------------------------------------------------------------------------------------------------------ # +# Data Format Fp16, Fp16_b +# ------------------------------------------------------------------------------------------------------------------------------------------------------------ # + +# ResNet +pybuda/test/benchmark/benchmark.py -m resnet -c resnet50 -df Fp16_b -mf HiFi3 -o pybuda-silicon-wh-b0-perf-all-fp16-perf.json + +# Mobilenet v1 +pybuda/test/benchmark/benchmark.py -m mobilenet_v1 -c 224 -df Fp16_b -mf HiFi2 -o pybuda-silicon-wh-b0-perf-all-fp16-perf.json + +# Mobilenet v2 +pybuda/test/benchmark/benchmark.py -m mobilenet_v2 -c 224 -df Fp16_b -mf HiFi2 -o pybuda-silicon-wh-b0-perf-all-fp16-perf.json + +# Vit +pybuda/test/benchmark/benchmark.py -m vit -c base -df Fp16_b -mf HiFi2 -o pybuda-silicon-wh-b0-perf-all-fp16-perf.json + +# Deit +pybuda/test/benchmark/benchmark.py -m deit -c base -df Fp16_b -mf HiFi2 -o pybuda-silicon-wh-b0-perf-all-fp16-perf.json + +# VoVNet v2 +pybuda/test/benchmark/benchmark.py -m vovnet_v2 -c 39 -df Fp16_b -mf HiFi3 -o pybuda-silicon-wh-b0-perf-all-fp16-perf.json + +# OpenPose Hand +pybuda/test/benchmark/benchmark.py -m openpose_hand -c basic -df Fp16_b -mf HiFi3 -o pybuda-silicon-wh-b0-perf-all-fp16-perf.json + +# HRNet +pybuda/test/benchmark/benchmark.py -m hrnet -c v2_w64 -df Fp16_b -mf HiFi3 -o pybuda-silicon-wh-b0-perf-all-fp16-perf.json + +# YOLOv3 +# Issue to make it run with mb 64 tenstorrent/pybuda#1298 +# Issue to remove PYBUDA_OVERRIDE_INPUT_QUEUE_ENTRIES=32 tenstorrent/pybuda#1299 +pybuda/test/benchmark/benchmark.py -m yolo_v3 -c default -mb 32 -df Fp16_b -mf HiFi3 -o pybuda-silicon-wh-b0-perf-all-fp16-perf.json + +# Inception v4 +pybuda/test/benchmark/benchmark.py -m inception_v4 -c 224 -df Fp16_b -mf HiFi3 -o pybuda-silicon-wh-b0-perf-all-fp16-perf.json + +# UNet +pybuda/test/benchmark/benchmark.py -m unet -c 256 -df Fp16_b -mf HiFi3 -o pybuda-silicon-wh-b0-perf-all-fp16-perf.json + +# Bert large +pybuda/test/benchmark/benchmark.py -m bert -c large_tc -df Fp16_b -mf HiFi3 -o pybuda-silicon-wh-b0-perf-all-fp16-perf.json diff --git a/pybuda/test/benchmark/run_benchmark_wh_release b/pybuda/test/benchmark/run_benchmark_wh_release new file mode 100644 index 000000000..13c1cd319 --- /dev/null +++ 
b/pybuda/test/benchmark/run_benchmark_wh_release @@ -0,0 +1,62 @@ +# ------------------------------------------------------------------------------------------------------------------------------------------------------------ # +# Models with data Formats that have good accuracy on Wormhole B0 and that we release as official numbers +# ------------------------------------------------------------------------------------------------------------------------------------------------------------ # + + +# ResNet bfp8_b +pybuda/test/benchmark/benchmark.py -m resnet -c resnet50 -mb 256 -o pybuda-silicon-wh-b0-perf-all-perf.json + +#ResNet quant fp32 +pybuda/test/benchmark/benchmark.py -m resnet_quant -c resnet50 -df Fp32 -mf HiFi4 -mb 64 -o pybuda-silicon-wh-b0-perf-all-perf.json + +# Mobilenet v1 bfp8_b +pybuda/test/benchmark/benchmark.py -m mobilenet_v1 -c 224 -mb 256 -o pybuda-silicon-wh-b0-perf-all-perf.json + +# Mobilenet v2 bfp8_b +pybuda/test/benchmark/benchmark.py -m mobilenet_v2 -c 224 -mb 256 -o pybuda-silicon-wh-b0-perf-all-perf.json + +# Mobilenet v3 fp16_b +pybuda/test/benchmark/benchmark.py -m mobilenet_v3_timm -c large -df Fp16_b -mf HiFi2 -mb 64 -o pybuda-silicon-wh-b0-perf-all-perf.json + +# Vit bfp8_b +pybuda/test/benchmark/benchmark.py -m vit -c base -mb 256 -o pybuda-silicon-wh-b0-perf-all-perf.json + +# Deit bfp8_b +pybuda/test/benchmark/benchmark.py -m deit -c base -mb 256 -o pybuda-silicon-wh-b0-perf-all-perf.json + +# VoVNet v2 bfp8_b +pybuda/test/benchmark/benchmark.py -m vovnet_v2 -c 39 -o pybuda-silicon-wh-b0-perf-all-perf.json + +# OpenPose Body fp16 +pybuda/test/benchmark/benchmark.py -m openpose_osmr_body -c 2d -df Fp16 -mf HiFi3 -o pybuda-silicon-wh-b0-perf-all-perf.json + +# OpenPose Hand bfp8_b +pybuda/test/benchmark/benchmark.py -m openpose_hand -c basic -o pybuda-silicon-wh-b0-perf-all-perf.json + +# HRNet bfp8_b +pybuda/test/benchmark/benchmark.py -m hrnet -c v2_w64 -o pybuda-silicon-wh-b0-perf-all-perf.json + +# YOLOv3 bfp8_b +# Issue to make it run with mb 64 tenstorrent/pybuda#1298 +pybuda/test/benchmark/benchmark.py -m yolo_v3 -c default -mb 32 -o pybuda-silicon-wh-b0-perf-all-perf.json + +# YOLOv5 fp16_b +pybuda/test/benchmark/benchmark.py -m yolo_v5 -c s -df Fp16_b -mf HiFi3 -o pybuda-silicon-wh-b0-perf-all-perf.json + +# Inception v4 bfp8_b +pybuda/test/benchmark/benchmark.py -m inception_v4 -c 224 -o pybuda-silicon-wh-b0-perf-all-perf.json + +# UNet bfp8_b +pybuda/test/benchmark/benchmark.py -m unet -c 256 -mb 64 -o pybuda-silicon-wh-b0-perf-all-perf.json + +# Bert large bfp8_b +pybuda/test/benchmark/benchmark.py -m bert -c large_tc -mb 64 -o pybuda-silicon-wh-b0-perf-all-perf.json + +# Whisper fp16_b +pybuda/test/benchmark/benchmark.py -m whisper -c small --loop_count 1 -mb 1 -df Fp16_b -mf HiFi3 --single-thread --generative -o pybuda-silicon-wh-b0-perf-all-perf.json + +# T5 fp16_b +pybuda/test/benchmark/benchmark.py -m t5 -c large --loop_count 1 -mb 1 -df Fp16_b -mf HiFi3 --single-thread --generative -o pybuda-silicon-wh-b0-perf-all-perf.json + +# Flan-T5 fp16_b +pybuda/test/benchmark/benchmark.py -m flan_t5 -c large --loop_count 1 -mb 1 -df Fp16_b -mf HiFi3 --single-thread --generative -o pybuda-silicon-wh-b0-perf-all-perf.json diff --git a/pybuda/test/conftest.py b/pybuda/test/conftest.py index 7f8a1ee58..4ef3670fb 100644 --- a/pybuda/test/conftest.py +++ b/pybuda/test/conftest.py @@ -7,15 +7,25 @@ import subprocess import pytest +import _pytest.skipping import torch.multiprocessing as mp import torch import tensorflow as tf +# This is a workaround to 
set RTLD_GLOBAL flag to load emulation ZeBu library. +# Essentially symbol names have to be unique in global scope to work with ZeBu, +# hence need to be set as GLOBAL. This is a requirement for ZeBu. +import sys +original_flags = sys.getdlopenflags() +if (os.environ.get("PYBUDA_ENABLE_EMULATION_DEVICE") == "1"): + sys.setdlopenflags(os.RTLD_LAZY | os.RTLD_GLOBAL) +# Import code that requires os.RTLD_GLOBAL goes here +# Reset the flags to their original value +if (os.environ.get("PYBUDA_ENABLE_EMULATION_DEVICE") == "1"): + sys.setdlopenflags(original_flags) + import pybuda -from pybuda._C.backend_api import BackendType, BackendDevice, DeviceMode from pybuda.verify.config import TestKind -import pybuda.compile as COMPILE_INFO -from pybuda.run.api import detect_available_devices from pybuda.torch_compile import reset_state collect_ignore = ["legacy_tests"] @@ -33,7 +43,27 @@ def pytest_sessionstart(session): tf.config.threading.set_inter_op_parallelism_threads(num_threads) torch._dynamo.reset() reset_state() - + # If specified by env variable, print the environment variables + # It can be useful in CI jobs to get the state of the enviroment variables before test session starts + print_env_variables = bool(int(os.environ.get("PYTEST_PRINT_ENV_VARIABLES", "0"))) + if print_env_variables: + pybuda_specific_vars = {} + tt_backend_specific_vars = {} + print(f"####### Environment variables - Count: {len(os.environ)} #######") + for key, value in os.environ.items(): + print(f"{key}={value}") + if key.startswith("PYBUDA_") or key.startswith("GOLDEN_"): + pybuda_specific_vars[key] = value + elif key.startswith("TT_BACKEND_"): + tt_backend_specific_vars[key] = value + + print(f"####### PYBUDA specific enviroment variables - Count: {len(pybuda_specific_vars)} #######") + for key, value in pybuda_specific_vars.items(): + print(f"{key}={value}") + + print(f"####### TT_BACKEND specific enviroment variables - Count: {len(tt_backend_specific_vars)} #######") + for key, value in tt_backend_specific_vars.items(): + print(f"{key}={value}") @pytest.fixture(autouse=True) def clear_pybuda(): @@ -56,7 +86,6 @@ def clear_pybuda(): yield # clean up after each test - pybuda.shutdown() pybuda.pybuda_reset() torch._dynamo.reset() reset_state() @@ -133,79 +162,104 @@ def no_skip(*args, **kwargs): return pytest.skip = no_skip + _pytest.skipping.skip = no_skip # can't run skipped tests with decorator @pytest.mark.skip without this + +# DEVICE_CONFIG_TO_BACKEND_DEVICE_TYPE = { +# "gs_e150": BackendDevice.Grayskull, +# "gs_e300": BackendDevice.Grayskull, +# "wh_n150": BackendDevice.Wormhole_B0, +# "wh_n300": BackendDevice.Wormhole_B0, +# "galaxy": BackendDevice.Wormhole_B0, +# } + +# @dataclass +# class TestDevice: +# devtype: BackendType +# arch: BackendDevice +# devmode: DeviceMode +# tti_path: str = None + +# @classmethod +# def from_str(cls, name: str, devmode: DeviceMode, tti_path: str = None, device_config=None) -> "TestDevice": +# if name == "Golden": +# if device_config and DEVICE_CONFIG_TO_BACKEND_DEVICE_TYPE.get(device_config, None): +# return TestDevice(devtype=BackendType.Golden, arch=DEVICE_CONFIG_TO_BACKEND_DEVICE_TYPE[device_config], devmode=devmode, tti_path=tti_path) +# elif "GOLDEN_WORMHOLE_B0" in os.environ: +# return TestDevice(devtype=BackendType.Golden, arch=BackendDevice.Wormhole_B0, devmode=devmode, tti_path=tti_path) +# elif "PYBUDA_GOLDEN_BLACKHOLE" in os.environ: +# return TestDevice(devtype=BackendType.Golden, arch=BackendDevice.Blackhole, devmode=devmode, tti_path=tti_path) +# return 
TestDevice(devtype=BackendType.Golden, arch=BackendDevice.Grayskull, devmode=devmode, tti_path=tti_path) +# if name == "Model": +# return TestDevice(devtype=BackendType.Model, arch=BackendDevice.Grayskull, devmode=devmode, tti_path=tti_path) +# if name == "Versim": +# # Set default versim device arch to Grayskull +# versim_backend_device = BackendDevice.Grayskull +# # If PYBUDA_VERSIM_DEVICE_ARCH is set, use that arch for Versim device +# versim_arch_name = os.environ.get("PYBUDA_VERSIM_DEVICE_ARCH", None) +# if versim_arch_name != None: +# versim_backend_device = BackendDevice.from_string(versim_arch_name) +# return TestDevice(devtype=BackendType.Versim, arch=versim_backend_device, devmode=devmode, tti_path=tti_path) +# if name == "Emulation": +# # Set default emulation device arch to Grayskull +# emulation_backend_device = BackendDevice.Grayskull +# # If PYBUDA_EMULATION_DEVICE_ARCH is set, use that arch for Emulation device +# emulation_arch_name = os.environ.get("PYBUDA_EMULATION_DEVICE_ARCH", None) +# if emulation_arch_name != None: +# emulation_backend_device = BackendDevice.from_string(emulation_arch_name) +# return TestDevice(devtype=BackendType.Emulation, arch=emulation_backend_device, devmode=devmode, tti_path=tti_path) +# if name == "Grayskull": +# return TestDevice(devtype=BackendType.Silicon, arch=BackendDevice.Grayskull, devmode=devmode, tti_path=tti_path) +# if name == "Wormhole_B0": +# return TestDevice(devtype=BackendType.Silicon, arch=BackendDevice.Wormhole_B0, devmode=devmode, tti_path=tti_path) +# if name == "Blackhole": +# return TestDevice(devtype=BackendType.Silicon, arch=BackendDevice.Blackhole, devmode=devmode, tti_path=tti_path) +# raise RuntimeError("Unknown test device: " + name) + +# def is_available(self, device_list: List[BackendDevice], silicon_only: bool, no_silicon: bool, devtype: Optional[BackendType], devmode: DeviceMode) -> bool: +# """ +# Return true if this kind of device is available on the current host. Expect a list of devices from +# `detect_available_devices`. 
+# """ +# if devtype is not None and self.devtype != devtype: +# return False + +# if self.devtype == BackendType.Golden: +# return not silicon_only + +# if self.devtype == BackendType.Model: +# return bool(int(os.environ.get("PYBUDA_ENABLE_MODEL_DEVICE", "0"))) + +# if self.devtype == BackendType.Versim: +# return bool(int(os.environ.get("PYBUDA_ENABLE_VERSIM_DEVICE", "0"))) -DEVICE_CONFIG_TO_BACKEND_DEVICE_TYPE = { - "gs_e150": BackendDevice.Grayskull, - "gs_e300": BackendDevice.Grayskull, - "wh_nebula_x1": BackendDevice.Wormhole_B0, - "wh_nebula_x2": BackendDevice.Wormhole_B0, - "galaxy": BackendDevice.Wormhole_B0, -} - -@dataclass -class TestDevice: - devtype: BackendType - arch: BackendDevice - devmode: DeviceMode - tti_path: str = None - - @classmethod - def from_str(cls, name: str, devmode: DeviceMode, tti_path: str = None, device_config=None) -> "TestDevice": - if name == "Golden": - if device_config and DEVICE_CONFIG_TO_BACKEND_DEVICE_TYPE.get(device_config, None): - return TestDevice(devtype=BackendType.Golden, arch=DEVICE_CONFIG_TO_BACKEND_DEVICE_TYPE[device_config], devmode=devmode, tti_path=tti_path) - elif "GOLDEN_WORMHOLE_B0" in os.environ: - return TestDevice(devtype=BackendType.Golden, arch=BackendDevice.Wormhole_B0, devmode=devmode, tti_path=tti_path) - elif "GOLDEN_WORMHOLE" in os.environ: - return TestDevice(devtype=BackendType.Golden, arch=BackendDevice.Wormhole, devmode=devmode, tti_path=tti_path) - return TestDevice(devtype=BackendType.Golden, arch=BackendDevice.Grayskull, devmode=devmode, tti_path=tti_path) - if name == "Model": - return TestDevice(devtype=BackendType.Model, arch=BackendDevice.Grayskull, devmode=devmode, tti_path=tti_path) - if name == "Grayskull": - return TestDevice(devtype=BackendType.Silicon, arch=BackendDevice.Grayskull, devmode=devmode, tti_path=tti_path) - if name == "Wormhole": - return TestDevice(devtype=BackendType.Silicon, arch=BackendDevice.Wormhole, devmode=devmode, tti_path=tti_path) - if name == "Wormhole_B0": - return TestDevice(devtype=BackendType.Silicon, arch=BackendDevice.Wormhole_B0, devmode=devmode, tti_path=tti_path) - raise RuntimeError("Unknown test device: " + name) - - def is_available(self, device_list: List[BackendDevice], silicon_only: bool, no_silicon: bool, devtype: Optional[BackendType], devmode: DeviceMode) -> bool: - """ - Return true if this kind of device is available on the current host. Expect a list of devices from - `detect_available_devices`. 
- """ - if devtype is not None and self.devtype != devtype: - return False - - if self.devtype == BackendType.Golden: - return not silicon_only - - if self.devtype == BackendType.Model: - return bool(int(os.environ.get("PYBUDA_ENABLE_MODEL_DEVICE", "0"))) - - if self.devtype == BackendType.Silicon: - compiled_arch_name = os.environ.get("BACKEND_ARCH_NAME", None) or os.environ.get("ARCH_NAME", None) - if compiled_arch_name == "wormhole_b0": - compiled_arch = BackendDevice.Wormhole_B0 - else: - compiled_arch = BackendDevice.Grayskull +# if self.devtype == BackendType.Emulation: +# return bool(int(os.environ.get("PYBUDA_ENABLE_EMULATION_DEVICE", "0"))) + +# if self.devtype == BackendType.Silicon: +# compiled_arch_name = os.environ.get("BACKEND_ARCH_NAME", None) or os.environ.get("ARCH_NAME", None) +# if compiled_arch_name == "wormhole_b0": +# compiled_arch = BackendDevice.Wormhole_B0 +# elif compiled_arch_name == "blackhole": +# compiled_arch = BackendDevice.Blackhole +# else: +# compiled_arch = BackendDevice.Grayskull - is_offline_silicon_compile = devmode == DeviceMode.CompileOnly and self.arch == compiled_arch - return (self.arch in device_list and not no_silicon) or is_offline_silicon_compile +# is_offline_silicon_compile = devmode == DeviceMode.CompileOnly and self.arch == compiled_arch +# return (self.arch in device_list and not no_silicon) or is_offline_silicon_compile - return False +# return False - def is_silicon(self): - return self.devtype == BackendType.Silicon +# def is_silicon(self): +# return self.devtype == BackendType.Silicon - def is_grayskull(self): - return self.arch == BackendDevice.Grayskull +# def is_grayskull(self): +# return self.arch == BackendDevice.Grayskull - def is_wormhole(self): - return self.arch == BackendDevice.Wormhole or self.arch == BackendDevice.Wormhole_B0 +# def is_wormhole_b0(self): +# return self.arch == BackendDevice.Wormhole_B0 - def is_wormhole_b0(self): - return self.arch == BackendDevice.Wormhole_B0 +# def is_blackhole(self): +# return self.arch == BackendDevice.Blackhole device_cfg_global = None def pytest_generate_tests(metafunc): @@ -218,7 +272,7 @@ def pytest_generate_tests(metafunc): metafunc.parametrize("training", (False, True), ids=["inference", "training"]) if "test_device" in metafunc.fixturenames: - names = ["Golden", "Model", "Grayskull", "Wormhole", "Wormhole_B0"] + names = ["Golden", "Model", "Versim", "Emulation", "Grayskull", "Wormhole_B0", "Blackhole"] # Set device-mode for the test compile_only = metafunc.config.getoption("--compile-only") @@ -287,23 +341,29 @@ def pytest_runtest_logreport(report): # - Level 2 - set by dev in test; we want to remove them (e.g. enable/disable by default, redefine as more user friendly, etc.) 
# - Level 3 - set by dev in test; we want to remove them entirely (purely for testing purposes) # - _set_pybuda_override_veto({ - # Level 0 overrides (These should not be set as part of compiler config; runtime based) - "chip_ids": "", - "backend_device_descriptor_path": "", - - # Level 1 overrides - "balancer_policy": "", - "enable_t_streaming": "", - "default_df_override": "", - }, - { - # Level 2 overrides - "PYBUDA_RIBBON2": "", - "PYBUDA_DISABLE_STREAM_OUTPUT": "", - "PYBUDA_PAD_OUTPUT_BUFFER": "", - "PYBUDA_OVERRIDE_DEVICE_YAML": "" # Mostly used for 1x1 model overrides - }) + if "PYBUDA_OVERRIDES_VETO_CUSTOM_SETUP" in os.environ: + _set_pybuda_override_veto({ + "backend_output_dir": "", + }, {}) + else: + _set_pybuda_override_veto({ + "backend_output_dir": "", + "backend_runtime_params_path": "", + "harvesting_mask": "", + "cpu_fallback_ops": "", + + # Level 1 overrides + "balancer_policy": "", + "enable_t_streaming": "", + "default_df_override": "", + }, + { + # Level 2 overrides + "PYBUDA_RIBBON2": "", + "PYBUDA_DISABLE_STREAM_OUTPUT": "", + "PYBUDA_PAD_OUTPUT_BUFFER": "", + "PYBUDA_OVERRIDE_DEVICE_YAML": "" # Mostly used for 1x1 model overrides + }) elif report.when == "teardown": environ_before_test_keys = set(environ_before_test.keys()) @@ -319,8 +379,3 @@ def pytest_runtest_logreport(report): if os.environ.get(key, "") != default_value: os.environ[key] = default_value - if report.failed: - last_stage = COMPILE_INFO.LAST_SUCCESSFUL_STAGE - if not last_stage: - last_stage = "failed before compile" - print(f"\nLAST SUCCESSFUL COMPILE STAGE: {last_stage}\n") diff --git a/pybuda/test/data_formats/test_df.py b/pybuda/test/data_formats/test_df.py index 96f4fd68b..dab898917 100644 --- a/pybuda/test/data_formats/test_df.py +++ b/pybuda/test/data_formats/test_df.py @@ -182,7 +182,7 @@ def bwd_op_format_promotio(act, *, ff1_weights): ) module = ModuleBuilder(bwd_op_format_promotio, ff1_weights=pybuda.Parameter(1,1,64,64)) - verify_module(module, [(1, 1, 64, 64)], VerifyConfig(test_kind=config.TestKind.TRAINING, arch=BackendDevice.Wormhole)) + verify_module(module, [(1, 1, 64, 64)], VerifyConfig(test_kind=config.TestKind.TRAINING)) def test_eltwise_binary_mixed_ab_inputs(test_device): shape = (1, 1, 32, 32) diff --git a/pybuda/test/emulation/test_emulation_basic_ops.py b/pybuda/test/emulation/test_emulation_basic_ops.py new file mode 100644 index 000000000..ce7a37f0e --- /dev/null +++ b/pybuda/test/emulation/test_emulation_basic_ops.py @@ -0,0 +1,69 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +# +# Emulation-related tests for end-to-end emulation +# +from pybuda import pybuda +from pybuda._C.backend_api import BackendType +from pybuda.module import PyTorchModule +from pybuda.verify.backend import verify_module +from pybuda.tensor import Tensor +from test.utils import download_model +from pybuda.verify.config import TestKind, VerifyConfig +import pytest +import torch +from test.common import run +from transformers import BertModel + +def test_emulation_simple_matmul(test_device): + # Run only emulation tests + if test_device.devtype != BackendType.Emulation: + pytest.skip() + + @run( + verify_cfg=VerifyConfig( + test_kind=TestKind.INFERENCE, + devtype=test_device.devtype, + arch=test_device.arch, + pcc=0.99), + ) + def simple_matmul(a, b): + c = pybuda.op.Matmul("matmul0", a, b) + return c + + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.input_queues_on_host = False + compiler_cfg.output_queues_on_host = 
False + + shape = (1, 1, 128, 128) + a = Tensor.create_from_torch(torch.randn(shape)) + b = Tensor.create_from_torch(torch.randn(shape)) + simple_matmul(a, b) + +def test_bert_tiny(test_device): + # Run only emulation tests + if test_device.devtype != BackendType.Emulation: + pytest.skip() + + input_shape = (1, 128) + model = download_model(BertModel.from_pretrained, "prajjwal1/bert-tiny", add_pooling_layer=False) + + pt_module = PyTorchModule("bert", model) + + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.retain_tvm_python_files = True + compiler_cfg.input_queues_on_host = False + compiler_cfg.output_queues_on_host = False + + verify_module( + pt_module, + (input_shape,), + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + test_kind=TestKind.INFERENCE, + pcc=0.9, + ), + input_params=[{"requires_grad": False, "data_format": torch.int}], + ) \ No newline at end of file diff --git a/pybuda/test/falcon/pybudify.py b/pybuda/test/falcon/pybudify.py index 6ad1eed80..9223ec811 100644 --- a/pybuda/test/falcon/pybudify.py +++ b/pybuda/test/falcon/pybudify.py @@ -41,10 +41,6 @@ def __init__(self, pt_module, device='silicon', arch='wormhole_b0', precision='f # os.environ["PYBUDA_DISABLE_INTERACTIVE_PLACER"] = "1" # Until interactive placer supports multi-chip placement overrides # os.environ["PYBUDA_PLACER_SNAKE"] = "1" # os.environ["PYBUDA_ETH_LINKS_NEBULA"] = "1" - # os.environ["PYBUDA_DISABLE_DYNAMIC_DRAM"] = "1" - - if self.odkv or self.masked_odkv: - os.environ["PYBUDA_DISABLE_DYNAMIC_DRAM"]= "1" # much better performance, not sure why? pybuda = self.pybuda = __import__('pybuda') # let us set log levels before importing pybuda @@ -312,11 +308,8 @@ def __init__(self, pt_module, device='silicon', arch='wormhole_b0', precision='f # input_df={0: [pybuda.DataFormat.Bfp8_b, True], 1: [pybuda.DataFormat.Bfp8_b, True], 2: [pybuda.DataFormat.Bfp8_b, True]}) compiler_cfg.loopback_outputs = names_dict - else: - compiler_cfg.enable_t_streaming = True pybuda_arch = { 'grayskull': pybuda.BackendDevice.Grayskull, - 'wormhole': pybuda.BackendDevice.Wormhole, 'wormhole_b0': pybuda.BackendDevice.Wormhole_B0 }[arch] if tti_load is not None: diff --git a/pybuda/test/falcon/tests/falcon_modules/falcon.py b/pybuda/test/falcon/tests/falcon_modules/falcon.py index d4a874433..78918dd52 100644 --- a/pybuda/test/falcon/tests/falcon_modules/falcon.py +++ b/pybuda/test/falcon/tests/falcon_modules/falcon.py @@ -1258,7 +1258,6 @@ def __init__(self, args): enable_auto_transposing_placement=True, # backend_cluster_descriptor_path="/proj_sw/user_dev/jrock/pybuda-falcon-stable-avx/pybuda/third_party/budabackend/wormhole_2chip_cluster.yaml" if args.num_chips > 1 else None, ) - pybuda.config._get_global_compiler_config().enable_t_streaming = True pybuda.config._get_global_compiler_config().use_interactive_placer = True self.cpu0 = pybuda.CPUDevice("cpu0", module=self.embeddings_module) diff --git a/pybuda/test/fx/__init__.py b/pybuda/test/fx/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pybuda/test/fx/conftest.py b/pybuda/test/fx/conftest.py new file mode 100644 index 000000000..28352b9ff --- /dev/null +++ b/pybuda/test/fx/conftest.py @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import torch +import pytest + +from pybuda.torch_compile import compile_torch +from pybuda.config import remove_cpu_fallback_ops + +@pytest.fixture(autouse=True) +def disable_embedding_fallback(): + 
remove_cpu_fallback_ops("embedding") + yield + +torch._dynamo.reset() +def generic_model_test(src_model, num_inputs = 1, num_outputs = 1, inputs = []): + # Generic runner for models + model = torch.compile(src_model.to('tt'), backend=compile_torch) + + for _ in range(3): + if len(inputs) == 0: + inputs = [torch.rand(1, 128, 768) for _ in range(num_inputs)] + device = 'tt' + tt_inputs = [i.to(device) for i in inputs] + tt_res = model(*tt_inputs) + if num_outputs > 0: + tt_res = tuple([tt_res.to('cpu')]) if isinstance(tt_res, torch.Tensor) else tuple([t.to('cpu') for t in tt_res]) + + cpu_res = src_model.to('cpu')(*inputs) + if isinstance(cpu_res, torch.Tensor): + cpu_res = tuple([cpu_res]) + + for i in range(num_outputs): + assert torch.allclose(cpu_res[i], tt_res[i], atol=0, rtol=1e-2), f"** MISMATCH **\nCPU:\n{cpu_res[i]}\nTT:\n{tt_res[i]}" + + diff --git a/pybuda/test/fx/test_basics.py b/pybuda/test/fx/test_basics.py new file mode 100644 index 000000000..45f07cabf --- /dev/null +++ b/pybuda/test/fx/test_basics.py @@ -0,0 +1,85 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import pytest +import torch +from pybuda.torch_compile import compile_torch + +from .conftest import generic_model_test + +class NoOutputModel(torch.nn.Module): + def forward(self, a): + a = a + 1 + a = 3 * a + +def test_no_output_graph(): + # Test the case where the model has no outputs + generic_model_test(NoOutputModel(), num_outputs=0) + +class NoInputModel(torch.nn.Module): + def forward(self): + return torch.tensor([1]) + +@pytest.mark.skip(reason="https://yyz-gitlab.local.tenstorrent.com/tenstorrent/pybuda/-/issues/2475") +def test_no_input_model(): + # Test the case where the model has no inputs + generic_model_test(NoInputModel(), num_inputs=0) + +class EmptyModelNoOutput(torch.nn.Module): + def forward(self, a): + pass + +def test_empty_model_no_output(): + # Test the case where the model has no operations, and no output + generic_model_test(EmptyModel()) + +class EmptyModel(torch.nn.Module): + def forward(self, a): + return a + +def test_empty_model(): + # Test the case where the model has no operations + generic_model_test(EmptyModel()) + +class DanglingOps(torch.nn.Module): + def forward(self, a): + a = a + 1 + b = a + 2 + c = b * 12 + return a + +def test_dangling_ops(): + # Test the case where the model has an op who output goes nowhere + generic_model_test(DanglingOps()) + + +def test_to_double(): + tensor = torch.rand(32, 32).to('tt') + tensor.to(dtype=torch.double) + +def test_print(): + tensor = torch.rand(32, 32).to('tt') + print(tensor) + +@pytest.mark.skip(reason="https://yyz-gitlab.local.tenstorrent.com/tenstorrent/pybuda/-/issues/2438") +def test_longint(): + original_data = torch.randint(0, 10, (1, 8)) + tensor = original_data.to('tt').to(dtype=torch.int).to('cpu') + + original_data = original_data.to(dtype=torch.int) + assert torch.allclose(original_data, tensor) + + +class NonAlignedSize(torch.nn.Module): + def forward(self, a): + return a + 1 + +@pytest.mark.parametrize("rows", [1, 32]) +def test_return_non_aligned_sizes(rows): + model = torch.compile(NonAlignedSize(), backend=compile_torch) + input = torch.rand(1, rows, 33) + input_tt = input.to('tt') + tt_res = model(input_tt).to('cpu') + cpu_res = NonAlignedSize()(input) + assert torch.allclose(cpu_res, tt_res, atol=0, rtol=1e-3) + diff --git a/pybuda/test/fx/test_features.py b/pybuda/test/fx/test_features.py new file mode 100644 index 000000000..f5cb4bc15 --- /dev/null +++ 
b/pybuda/test/fx/test_features.py @@ -0,0 +1,279 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +from typing import Tuple +import os + +import pytest +import torch +import torch.nn as nn + +import pybuda +from pybuda.torch_compile import compile_torch +from pybuda.config import _get_global_compiler_config + +from .conftest import generic_model_test + +# +# TODO: Tests here depend on the fact that argmax/index are not supported at the moment. If that changes, and they are added to the device, many of +# these tests will be irrelevant, and need to be updated with a different fallback op. Ideally something that we would (almost) never support. +# + +def test_link(): + class Linear(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(32, 32, bias=True) + + def forward(self, x1): + m1 = self.linear(x1) + return m1 + + _get_global_compiler_config().enable_pt2_fx_graph_link = True + os.environ["PYBUDA_DEVMODE"] = "1" + input = torch.rand(1, 32, 32) + input2 = torch.rand(1, 32, 32) + input3 = torch.rand(1, 32, 32) + + input = input.to("tt") + + pybuda_mod = torch.compile(Linear().to("tt"), backend=compile_torch) + result_c = pybuda_mod(input) + pybuda_mod_2 = torch.compile(Linear().to("tt"), backend=compile_torch) + result__ = pybuda_mod_2(result_c) + + result_c = pybuda_mod(input) + result = pybuda_mod_2(result_c) + + result = result.to("cpu") + +def test_decomp(): + pytest.skip() #TODO fix: FATAL | Always - Unsupported (for now) _copy_from TTDevice[0] to TTDevice[0] + os.environ["PYBUDA_DEVMODE"] = "1" + class BasicModule(nn.Module): + def forward(self, x): + x = x * 2 + a,b,c = torch.split(x, 3, dim=-1) + return a + b + c + + mod, input = BasicModule(), torch.randn(2, 9).to(dtype=torch.float16) + + pybuda_mod = torch.compile(mod, backend=compile_torch, dynamic=False) + out = pybuda_mod(input) + +@pytest.mark.parametrize("shape", [(1024, 1024)]) +@pytest.mark.parametrize("mb", [1, 8, 16]) +@pytest.mark.parametrize("loop", [1, 8, 16]) +@pytest.mark.parametrize("native", [True, False]) +def test_push(shape, mb, loop, native): + if mb != 1: + pytest.skip() #TODO + os.environ["PYBUDA_DEVMODE"] = "1" + import time + + pybuda.config.set_configuration_options( + default_df_override=pybuda.config.DataFormat.Float32 + ) + + class Add(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x1, x2): + return x1 + x2 + + model = Add() + sample_inputs = [torch.empty(mb, 1, *shape), torch.empty(mb, 1, *shape)] + inputs = [(torch.ones(mb, 1, *shape), torch.ones(mb, 1, *shape))] * loop + + if native: + model = model.to("tt") + pybuda_mod = pybuda_mod = torch.compile(model, backend=compile_torch, dynamic=False) + comp_inputs = [i.to("tt") for i in inputs[0]] + result = pybuda_mod(*comp_inputs) # compile + start = time.perf_counter() + for args in inputs: + args = [a.to("tt") for a in args] + result = pybuda_mod(*args) + result.to("cpu") + elapsed = time.perf_counter() - start + else: + tt0 = pybuda.TTDevice("tt0") + tt0.place_module(pybuda.module.PyTorchModule("add", model)) + output_q = pybuda.initialize_pipeline( + training=False, sample_inputs=sample_inputs + ) + + start = time.perf_counter() + for i in range(loop): + tt0.push_to_inputs(inputs[i]) + pybuda.run_forward(input_count=loop) + for i in range(loop): + result = output_q.get(timeout=30) + elapsed = time.perf_counter() - start + + float32_size = 4 + data = mb * shape[0] * shape[1] * float32_size / (1024 * 1024) + + print( + f"Batch[{mb:2}] Loop[{loop:2}] 
Native[{native:1}] Data[{data}mB] Elapsed[{elapsed:2.4}sec]" + ) + + +# Clip-like argmax code that does argmax followed by index +class ClipArgmax(torch.nn.Module): + def __init__(self, eltwise_before, eltwise_after): + super().__init__() + self.eltwise_before = eltwise_before + self.eltwise_after = eltwise_after + + def forward(self, last_hidden_state, input_ids): + if self.eltwise_before: + last_hidden_state = last_hidden_state * last_hidden_state # something to do on device + pooled_output = last_hidden_state[ + torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device), + input_ids.to(device=last_hidden_state.device).argmax(dim=-1), + ] + if self.eltwise_after: + pooled_output = pooled_output * pooled_output # something to do on device + return pooled_output + +@pytest.mark.parametrize("eltwise_before", [True, False]) +@pytest.mark.parametrize("eltwise_after", [True, False]) +def test_fallback(eltwise_before, eltwise_after): + shape = (1, 128, 768) + generic_model_test(ClipArgmax(eltwise_before, eltwise_after), inputs=(torch.rand(*shape), torch.randint(0, shape[1], (1, shape[1])).int())) + +class ClipArgmaxSandwich(torch.nn.Module): + def forward(self, last_hidden_state, input_ids): + pooled_output = last_hidden_state[ + torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device), + input_ids.to(device=last_hidden_state.device).argmax(dim=-1), + ] + pooled_output = pooled_output * pooled_output # something to do on device + pooled_output = pooled_output[ + torch.arange(pooled_output.shape[0], device=last_hidden_state.device), + input_ids.to(device=last_hidden_state.device).argmax(dim=-1), + ] + return pooled_output + +def test_fallback_before_and_after(): + # Fallback before and after, with device in the middle + shape = (1, 128, 768) + generic_model_test(ClipArgmaxSandwich(), inputs=(torch.rand(*shape), torch.randint(0, shape[1], (shape[0], shape[1])).int())) + + +class RawIntOutput(nn.Module): + def __init__(self): + super().__init__() + embed_dim = 128 + vocab_size = 1024 + self.token_embedding = nn.Embedding(vocab_size, embed_dim) + + def forward(self, input_ids: torch.LongTensor) -> Tuple[torch.Tensor, torch.LongTensor]: + seq_length = input_ids.shape[-1] + input_ids = input_ids[:, :seq_length] + emb = self.token_embedding(input_ids) + return emb, input_ids + +def test_fallback_on_raw_int(): + # Test the case where the raw int output into embedding is also passed through to output, through some kind of nop/reshape/slice + # We want to fall back to CPU for the raw int output + generic_model_test(RawIntOutput(), inputs=[torch.randint(0, 1024, (1, 128)).int()]) + +class FallbackOutputReuse(nn.Module): + def forward(self, a): + b = a * a + c = torch.argmax(b, dim=-1) + return c, b + +def test_fallback_with_output_reuse(): + # Test the case where the fallback graph is using one of the graph outputs as its input + generic_model_test(FallbackOutputReuse(), num_outputs=2) + +class ForkedInput(torch.nn.Module): + def forward(self, last_hidden_state, input_ids): + pooled_output = last_hidden_state[ + torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device), + input_ids.to(device=last_hidden_state.device).argmax(dim=-1), + ] + device_output = last_hidden_state * last_hidden_state # something to do on device + return pooled_output, device_output + +def test_forked_input(): + # Test the case where the input is used in both fallback and device graph + generic_model_test(ForkedInput(), inputs=(torch.rand(1, 128, 768), torch.randint(0, 128, (1, 
128)).int()), num_outputs=2) + +class ForkedInputToNop(torch.nn.Module): + def forward(self, last_hidden_state, input_ids): + pooled_output = last_hidden_state[ + torch.arange(last_hidden_state.shape[0], device=last_hidden_state.device), + input_ids.to(device=last_hidden_state.device).argmax(dim=-1), + ] + device_output = last_hidden_state + return pooled_output, device_output + +def test_forked_input_to_nop(): + # Test the case where the input is used in both fallback and device graph, but device graph is NOP so it also falls back to CPU + generic_model_test(ForkedInputToNop(), inputs=(torch.rand(1, 128, 768), torch.randint(0, 128, (1, 128)).int()), num_outputs=2) + +foobar = 5.0 +class DisjointedGraphs(torch.nn.Module): + def forward(self, a): + a = a + 1 + a = a.to('cpu') + if a[:, 0] > foobar: + b = a + 2 + else: + b = a + 3 + + return b + +def test_disjointed_graphs(): + # Test the case where pt2 generates two completely independent graphs + generic_model_test(DisjointedGraphs(), inputs=(torch.Tensor([[4.0]]),)) + +class DisjointedGraphsWithParams(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear1 = torch.nn.Linear(1, 1, bias=False) + self.linear2 = torch.nn.Linear(1, 1, bias=False) + def forward(self, a): + a = self.linear1(a) + a = a.to('cpu') + if a[0] > 1: + b = a + 2 + else: + b = self.linear2(a) + + return b + +@pytest.mark.skip(reason="Fails in shape handling, Allan is working on it.. or we need to cause disjointed graphs differently") +def test_disjointed_graphs_with_params(): + generic_model_test(DisjointedGraphsWithParams(), inputs=(torch.tensor([4.0]),)) + +class ModelWithTensorAttributes(nn.Module): + def __init__(self, a): + super().__init__() + self.a = a + def forward(self, x): + return x + self.a + +@pytest.mark.skip(reason="Input 0 for op add_0 is uninitialized, missing queue settings could cause us to access out of bounds queue.") +def test_model_with_attributes(): + # Test the case where the model has attributes that are used in the calculation + shape = (32, 64) + generic_model_test(ModelWithTensorAttributes(torch.rand(*shape).to('tt')), inputs=(torch.rand(*shape),)) + +class ModelWithTensorAttributesNoInput(nn.Module): + def __init__(self, a): + super().__init__() + self.a = a + def forward(self): + return self.a * 2 + +def test_model_with_attributes_no_input(): + # Test the case where the model has attributes that are used in the calculation + shape = (32, 64) + generic_model_test(ModelWithTensorAttributesNoInput(torch.rand(*shape)), num_inputs=0) + diff --git a/pybuda/test/fx/test_models.py b/pybuda/test/fx/test_models.py new file mode 100644 index 000000000..ed01a332d --- /dev/null +++ b/pybuda/test/fx/test_models.py @@ -0,0 +1,287 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import pytest +import pybuda +import torch +import torch.nn as nn +import os +from pytorchcv.model_provider import get_model as ptcv_get_model +from transformers import BertModel, GPT2LMHeadModel, GPT2Config, GPT2Model, AutoFeatureExtractor, ResNetForImageClassification +from pybuda.torch_compile import compile_torch + +def test_unet_osmr_cityscape_pytorch(): + # STEP 1: Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object + compiler_cfg.cpu_fallback_ops = set() + compiler_cfg.enable_t_streaming = True + compiler_cfg.enable_auto_fusing = False + compiler_cfg.enable_enumerate_u_kt = False + compiler_cfg.default_dram_parameters = False 
+ compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + os.environ["PYBUDA_FORCE_RESIZE_DENSE_MM"] = "1" + os.environ["PYBUDA_RIBBON2"] = "1" + #if test_device.arch == BackendDevice.Wormhole_B0: + compiler_cfg.balancer_policy = "Ribbon" + os.environ["PYBUDA_BALANCER_PREPASS_DISABLED"] = "1" + #elif test_device.arch == BackendDevice.Grayskull: + # compiler_cfg.balancer_policy = "CNN" + + # STEP 2: Create PyBuda module from PyTorch model + unet_osmr = ptcv_get_model("unet_cityscapes", pretrained=False) + unet_osmr.eval() + + # STEP 3: Run inference on Tenstorrent device + img_tensor = torch.randn(1, 3, 224, 224) + + # Run the model on cpu + golden = unet_osmr(img_tensor) + + # Run the model on TT device + unet_osmr.to("tt") + img_tensor = img_tensor.to("tt") + pybuda_mod = torch.compile(unet_osmr, backend=compile_torch, dynamic=False) + result = pybuda_mod(img_tensor) + output = result[0].to("cpu") + + # Compare the result + assert pybuda.op.eval.compare_tensor_to_golden(f"pt_unet_osmr_cityscape", golden[0], output, is_buda=True, pcc=0.99) + + +def test_resnet(): + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.cpu_fallback_ops = set() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.enable_t_streaming = True + compiler_cfg.enable_training = False + compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + os.environ["PYBUDA_PAD_OUTPUT_BUFFER"] = "1" + + # Load ResNet feature extractor and model checkpoint from HuggingFace + feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/resnet-50", torchscript=True) + resnet = ResNetForImageClassification.from_pretrained("microsoft/resnet-50", torchscript=True) + resnet.eval() + + # Load data sample + # url = "https://datasets-server.huggingface.co/assets/imagenet-1k/--/default/train/18/image/image.jpg" + # image = Image.open(requests.get(url, stream=True).raw) + image = torch.rand(1, 3, 256, 256) + + # Data preprocessing + inputs = feature_extractor(image, return_tensors="pt") + pixel_values = inputs["pixel_values"] + + # Run the model on cpu + resnet_cpu = ResNetForImageClassification.from_pretrained("microsoft/resnet-50", torchscript=True) + golden = resnet_cpu(pixel_values) + + # Run the model on TT device + resnet.to("tt") + pixel_values = pixel_values.to("tt") + pybuda_mod = torch.compile(resnet, backend=compile_torch, dynamic=False) + result = pybuda_mod(pixel_values) + output = result[0].to("cpu") + + # Compare the result + assert pybuda.op.eval.compare_tensor_to_golden(f"pt_resnet50", golden[0], output, is_buda=True, pcc=0.99) + +def test_gpt2(): + config = GPT2Config.from_pretrained("gpt2") + config.num_hidden_layers = 2 + + os.environ["PYBUDA_DEVMODE"] = "1" + compile_cfg = pybuda.config._get_global_compiler_config() + compile_cfg.enable_link_past_cache_ios = True + compile_cfg.cpu_fallback_ops = set() + compile_cfg.default_df_override = pybuda._C.Float16_b + + gpt2 = GPT2LMHeadModel(config).eval() + input_ids = torch.randint(0, 10000, (1, 32)).int() + golden = gpt2(input_ids) + + pybuda_mod = torch.compile(gpt2, backend=compile_torch, dynamic=False) + result = pybuda_mod(input_ids) + + next_token_logits = result[0] + next_token_logits = next_token_logits.to("cpu") + + res = result[0].to("cpu") + assert pybuda.op.eval.compare_tensor_to_golden(f"gpt2", golden[0], res, is_buda=True, pcc=0.99) + +def test_gen(): + torch.set_num_threads(1) + torch.set_num_interop_threads(1) + config = GPT2Config.from_pretrained("gpt2") + config.num_hidden_layers = 1 + config.return_dict = 
False + + os.environ["PYBUDA_DEVMODE"] = "1" + compile_cfg = pybuda.config._get_global_compiler_config() + compile_cfg.enable_link_past_cache_ios = True + compile_cfg.cpu_fallback_ops = set() + compile_cfg.default_df_override = pybuda._C.Float16_b + + gpt2 = GPT2Model(config).eval() + gpt2.to("tt") + + input_ids = torch.randint(0, 10000, (1, 32)).int().to("tt") + + pybuda_mod = torch.compile(gpt2, backend=compile_torch, dynamic=False) + result = pybuda_mod(input_ids) + + res = result[0].to("cpu") + inp2 = torch.randint(0, 10000, (1, 32)).int() + inp2 = inp2.to("tt") + result = pybuda_mod(inp2, result[1]) + rs2 = result[0].to("cpu") + +def test_bert(): + os.environ["PYBUDA_DEVMODE"] = "1" + compile_cfg = pybuda.config._get_global_compiler_config() + compile_cfg.cpu_fallback_ops = set() + + bert = BertModel.from_pretrained("prajjwal1/bert-tiny", torchscript=True) + bert_cpu = BertModel.from_pretrained("prajjwal1/bert-tiny", torchscript=True) + + + input_ids = torch.randint(0, 10000, (1, 128)).int() + golden = bert_cpu(input_ids) + + print("Copying model") + bert.to("tt") + + print("Copying inputs") + input_ids = input_ids.to("tt") + + print("Compiling Model") + pybuda_mod = torch.compile(bert, backend=compile_torch, dynamic=False) + result = pybuda_mod(input_ids) + print("Copying outputs") + + result = [r.to("cpu") for r in result] + for i, (g, r) in enumerate(zip(golden, result)): + assert pybuda.op.eval.compare_tensor_to_golden(f"bert_{i}", g, r, is_buda=True, pcc=0.99) + + inp2 = torch.randint(0, 10000, (1, 128)).int() + golden = bert_cpu(inp2) + + inp2 = inp2.to("tt") + result = pybuda_mod(inp2) + result = [r.to("cpu") for r in result] + for i, (g, r) in enumerate(zip(golden, result)): + assert pybuda.op.eval.compare_tensor_to_golden(f"bert_{i}", g, r, is_buda=True, pcc=0.99) + + inp3 = torch.randint(0, 10000, (1, 64)).int() + golden = bert_cpu(inp3) + inp3 = inp3.to("tt") + result = pybuda_mod(inp3) + result = [r.to("cpu") for r in result] + for i, (g, r) in enumerate(zip(golden, result)): + assert pybuda.op.eval.compare_tensor_to_golden(f"bert_{i}", g, r, is_buda=True, pcc=0.99) + + inp4 = torch.randint(0, 10000, (1, 128)).int() + golden = bert_cpu(inp4) + inp4 = inp4.to("tt") + result = pybuda_mod(inp4) + result = [r.to("cpu") for r in result] + for i, (g, r) in enumerate(zip(golden, result)): + assert pybuda.op.eval.compare_tensor_to_golden(f"bert_{i}", g, r, is_buda=True, pcc=0.99) + + inp5 = torch.randint(0, 10000, (1, 64)).int() + golden = bert_cpu(inp5) + inp5 = inp5.to("tt") + result = pybuda_mod(inp5) + result = [r.to("cpu") for r in result] + for i, (g, r) in enumerate(zip(golden, result)): + assert pybuda.op.eval.compare_tensor_to_golden(f"bert_{i}", g, r, is_buda=True, pcc=0.99) + +from diffusers import StableDiffusionPipeline + +def test_sd(): + model = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4") + prompt = "a photo of an astronaut riding a horse on mars" + model = model.to("tt") + pybuda_mod = torch.compile(model, backend=compile_torch) + image = pybuda_mod(prompt=prompt, num_images_per_prompt=1, output_type="pil").images[0] + +from transformers import MobileNetV2FeatureExtractor, MobileNetV2ForImageClassification +from PIL import Image +import requests + +def test_mobilenet_v2(): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + + feature_extractor = MobileNetV2FeatureExtractor.from_pretrained("Matthijs/mobilenet_v2_1.0_224") + model = 
MobileNetV2ForImageClassification.from_pretrained("Matthijs/mobilenet_v2_1.0_224") + + inputs = feature_extractor(images=image, return_tensors="pt") + + outputs = model(**inputs) + logits = outputs.logits + + # model predicts one of the 1000 ImageNet classes + predicted_class_idx_cpu = logits.argmax(-1).item() + #print("Predicted class:", model.config.id2label[predicted_class_idx]) + + pybuda_mod = torch.compile(model.to('tt'), backend=compile_torch) + for k, v in inputs.items(): + inputs[k] = v.to("tt") + outputs = pybuda_mod(**inputs) + logits = outputs.logits + + # model predicts one of the 1000 ImageNet classes + predicted_class_idx_tt = logits.argmax(-1).item() + #print("Predicted class:", model.config.id2label[predicted_class_idx]) + + assert predicted_class_idx_cpu == predicted_class_idx_tt + +# need to pip install ultralytics +#from ultralytics import YOLO +@pytest.mark.skip(reason="WIP") +def test_yolo_v8(): + + # Load a model + model = YOLO("yolov8n.pt") # load a pretrained model (recommended for training) + + # Use the model + results = model("https://ultralytics.com/images/bus.jpg") # predict on an image + print(results) + + + model.to('tt') + tt_model = torch.compile(model.model, backend=compile_torch) + model.model = tt_model + tt_results = model("https://ultralytics.com/images/bus.jpg") # predict on an image + print(tt_results) + +class TTAmpModule: + def get_amp_supported_dtype(self): + return [] + + def is_autocast_enabled(self): + return False + + def set_autocast_enabled(self, enable): + pass + + def get_autocast_dtype(self): + return torch.float32 + + def set_autocast_dtype(self, dtype): + pass + +from transformers import AutoTokenizer, AutoModelForCausalLM +def test_gemma_2b(): + + torch._register_device_module("tt", TTAmpModule()) + + tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b") + model = AutoModelForCausalLM.from_pretrained("google/gemma-2b") + model = torch.compile(model.to("tt"), backend=compile_torch) + + input_text = "Write me a poem about Machine Learning." 
+ input_ids = tokenizer(input_text, return_tensors="pt").to('tt') + + outputs = model.generate(**input_ids) + print(tokenizer.decode(outputs[0])) diff --git a/pybuda/test/fx/test_ops.py b/pybuda/test/fx/test_ops.py new file mode 100644 index 000000000..8ff77edd3 --- /dev/null +++ b/pybuda/test/fx/test_ops.py @@ -0,0 +1,106 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import pytest +import pybuda +import torch +import torch.nn as nn +import os +from pybuda.torch_compile import compile_torch + +def test_add(): + class Add(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x1, x2): + return x1 + x2, x2 + x1 + 2 + + os.environ["PYBUDA_DEVMODE"] = "1" + model = Add() + inputs = [torch.rand(1, 32, 32), torch.rand(1, 32, 32)] + golden = model(*inputs) + pybuda_mod = torch.compile(model, backend="tt") + # inputs = [i.to("tt") for i in inputs] + result = pybuda_mod(*inputs) + result = [r.to("cpu") for r in result] + + assert all(torch.allclose(g, r) for g, r in zip(golden, result)) + +def test_conv2d(): + class Conv2d(nn.Module): + def __init__(self): + super().__init__() + self.conv = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1) + + def forward(self, x): + x = self.conv(x) + return x + + os.environ["PYBUDA_DEVMODE"] = "1" + model = Conv2d() + inputs = torch.rand(1, 3, 32, 32) + golden = model(inputs) + + if True: + pybuda_mod = torch.compile(model, backend=compile_torch, dynamic=False) + result = pybuda_mod(inputs) + result = result.to("cpu") + assert pybuda.op.eval.compare_tensor_to_golden(f"conv2d", golden, result, is_buda=True, pcc=0.99) + else: + from pybuda.verify.backend import verify_module + mod = pybuda.PyTorchModule("conv", model) + verify_module( + mod, + ([1,3,32,32],), + verify_cfg=pybuda.VerifyConfig( + arch=pybuda.BackendDevice.Wormhole_B0, + devtype=pybuda.BackendType.Golden, + test_kind=pybuda.verify.TestKind.INFERENCE, + pcc=0.99 + ), + ) + +def test_bn(): + class BN(nn.Module): + def __init__(self): + super().__init__() + self.bn = nn.BatchNorm2d(64) + + def forward(self, x): + x = self.bn(x) + return x + + os.environ["PYBUDA_DEVMODE"] = "1" + model = BN() + model.eval() + + inputs = torch.rand(1, 64, 32, 32) + golden = model(inputs) + # inputs = [i.to("tt") for i in inputs] + pybuda_mod = torch.compile(model, backend=compile_torch) + result = pybuda_mod(inputs) + result = result.to("cpu") + + assert pybuda.op.eval.compare_tensor_to_golden(f"bn", golden, result, is_buda=True, pcc=0.99) + +def test_linear(): + class Linear(nn.Module): + def __init__(self): + super().__init__() + self.linear = nn.Linear(32, 64, bias=True) + + def forward(self, x1, x2): + m1 = self.linear(x1) + return m1 + x2 + + os.environ["PYBUDA_DEVMODE"] = "1" + model = Linear() + inputs = [torch.rand(1, 32, 32), torch.rand(1, 32, 64)] + golden = model(*inputs) + # inputs = [i.to("tt") for i in inputs] + pybuda_mod = torch.compile(model.to("tt"), backend=compile_torch) + result = pybuda_mod(*[i.to("tt") for i in inputs]) + result = result.to("cpu") + + assert pybuda.op.eval.compare_tensor_to_golden(f"linear", golden, result, is_buda=True, pcc=0.99) diff --git a/pybuda/test/galaxy/bert/run_squad_wh.py b/pybuda/test/galaxy/bert/run_squad_wh.py index debcbdce9..7448ffcfb 100644 --- a/pybuda/test/galaxy/bert/run_squad_wh.py +++ b/pybuda/test/galaxy/bert/run_squad_wh.py @@ -79,30 +79,30 @@ def encoder_output_buffering_single_chip(): config = pybuda.config # input_1 -> matmul_2, matmul_8, matmul_22 -
config.insert_buffering_nop("input_1", ["matmul_2", "matmul_8", "matmul_22"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_8", ["matmul_55", "matmul_61", "matmul_75"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_17", ["matmul_108", "matmul_114", "matmul_128"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_26", ["matmul_161", "matmul_167", "matmul_181"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_35", ["matmul_214", "matmul_220", "matmul_234"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_44", ["matmul_267", "matmul_273", "matmul_287"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_53", ["matmul_320", "matmul_326", "matmul_340"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_62", ["matmul_373", "matmul_379", "matmul_393"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_71", ["matmul_426", "matmul_432", "matmul_446"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_80", ["matmul_479", "matmul_485", "matmul_499"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_89", ["matmul_532", "matmul_538", "matmul_552"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_98", ["matmul_585", "matmul_591", "matmul_605"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_107", ["matmul_638", "matmul_644", "matmul_658"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_116", ["matmul_691", "matmul_697", "matmul_711"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_125", ["matmul_744", "matmul_750", "matmul_764"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_134", ["matmul_797", "matmul_803", "matmul_817"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_143", ["matmul_850", "matmul_856", "matmul_870"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_152", ["matmul_903", "matmul_909", "matmul_923"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_161", ["matmul_956", "matmul_962", "matmul_976"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_170", ["matmul_1009", "matmul_1015", "matmul_1029"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_179", ["matmul_1062", "matmul_1068", "matmul_1082"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_188", ["matmul_1115", "matmul_1121", "matmul_1135"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_197", ["matmul_1168", "matmul_1174", "matmul_1188"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_206", ["matmul_1221", "matmul_1227", "matmul_1241"], hoist_tms=False) + config.insert_nop("input_1", ["matmul_2", "matmul_8", "matmul_22"], hoist_tms=False) + config.insert_nop("_fused_op_8", ["matmul_55", "matmul_61", "matmul_75"], hoist_tms=False) + config.insert_nop("_fused_op_17", ["matmul_108", "matmul_114", "matmul_128"], hoist_tms=False) + config.insert_nop("_fused_op_26", ["matmul_161", "matmul_167", "matmul_181"], hoist_tms=False) + config.insert_nop("_fused_op_35", ["matmul_214", "matmul_220", "matmul_234"], hoist_tms=False) + config.insert_nop("_fused_op_44", ["matmul_267", "matmul_273", "matmul_287"], hoist_tms=False) + config.insert_nop("_fused_op_53", ["matmul_320", "matmul_326", "matmul_340"], hoist_tms=False) + config.insert_nop("_fused_op_62", ["matmul_373", "matmul_379", "matmul_393"], hoist_tms=False) + config.insert_nop("_fused_op_71", ["matmul_426", "matmul_432", "matmul_446"], hoist_tms=False) + config.insert_nop("_fused_op_80", ["matmul_479", "matmul_485", "matmul_499"], hoist_tms=False) + 
config.insert_nop("_fused_op_89", ["matmul_532", "matmul_538", "matmul_552"], hoist_tms=False) + config.insert_nop("_fused_op_98", ["matmul_585", "matmul_591", "matmul_605"], hoist_tms=False) + config.insert_nop("_fused_op_107", ["matmul_638", "matmul_644", "matmul_658"], hoist_tms=False) + config.insert_nop("_fused_op_116", ["matmul_691", "matmul_697", "matmul_711"], hoist_tms=False) + config.insert_nop("_fused_op_125", ["matmul_744", "matmul_750", "matmul_764"], hoist_tms=False) + config.insert_nop("_fused_op_134", ["matmul_797", "matmul_803", "matmul_817"], hoist_tms=False) + config.insert_nop("_fused_op_143", ["matmul_850", "matmul_856", "matmul_870"], hoist_tms=False) + config.insert_nop("_fused_op_152", ["matmul_903", "matmul_909", "matmul_923"], hoist_tms=False) + config.insert_nop("_fused_op_161", ["matmul_956", "matmul_962", "matmul_976"], hoist_tms=False) + config.insert_nop("_fused_op_170", ["matmul_1009", "matmul_1015", "matmul_1029"], hoist_tms=False) + config.insert_nop("_fused_op_179", ["matmul_1062", "matmul_1068", "matmul_1082"], hoist_tms=False) + config.insert_nop("_fused_op_188", ["matmul_1115", "matmul_1121", "matmul_1135"], hoist_tms=False) + config.insert_nop("_fused_op_197", ["matmul_1168", "matmul_1174", "matmul_1188"], hoist_tms=False) + config.insert_nop("_fused_op_206", ["matmul_1221", "matmul_1227", "matmul_1241"], hoist_tms=False) config.override_op_placement(op_name="buffer_0_input_1_matmul_2", start=[1, 2]) config.override_op_placement(op_name="buffer_0__fused_op_8_matmul_55", start=[1, 2]) @@ -189,30 +189,30 @@ def encoder_output_buffering_galaxy(): config = pybuda.config # input_1 -> matmul_2, matmul_8, matmul_22, add_37 - config.insert_buffering_nop("input_1", ["matmul_2", "matmul_8", "matmul_22", "add_37"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_8", ["matmul_55", "matmul_61", "matmul_75", "add_90"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_17", ["matmul_108", "matmul_114", "matmul_128", "add_143"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_26", ["matmul_161", "matmul_167", "matmul_181", "add_196"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_35", ["matmul_214", "matmul_220", "matmul_234", "add_249"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_44", ["matmul_267", "matmul_273", "matmul_287", "add_302"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_53", ["matmul_320", "matmul_326", "matmul_340", "add_355"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_62", ["matmul_373", "matmul_379", "matmul_393", "add_408"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_71", ["matmul_426", "matmul_432", "matmul_446", "add_461"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_80", ["matmul_479", "matmul_485", "matmul_499", "add_514"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_89", ["matmul_532", "matmul_538", "matmul_552", "add_567"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_98", ["matmul_585", "matmul_591", "matmul_605", "add_620"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_107", ["matmul_638", "matmul_644", "matmul_658", "add_673"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_116", ["matmul_691", "matmul_697", "matmul_711", "add_726"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_125", ["matmul_744", "matmul_750", "matmul_764", "add_779"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_134", ["matmul_797", "matmul_803", "matmul_817", "add_832"], 
hoist_tms=False) - config.insert_buffering_nop("_fused_op_143", ["matmul_850", "matmul_856", "matmul_870", "add_885"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_152", ["matmul_903", "matmul_909", "matmul_923", "add_938"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_161", ["matmul_956", "matmul_962", "matmul_976", "add_991"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_170", ["matmul_1009", "matmul_1015", "matmul_1029", "add_1044"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_179", ["matmul_1062", "matmul_1068", "matmul_1082", "add_1097"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_188", ["matmul_1115", "matmul_1121", "matmul_1135", "add_1150"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_197", ["matmul_1168", "matmul_1174", "matmul_1188", "add_1203"], hoist_tms=False) - config.insert_buffering_nop("_fused_op_206", ["matmul_1221", "matmul_1227", "matmul_1241", "add_1256"], hoist_tms=False) + config.insert_nop("input_1", ["matmul_2", "matmul_8", "matmul_22", "add_37"], hoist_tms=False) + config.insert_nop("_fused_op_8", ["matmul_55", "matmul_61", "matmul_75", "add_90"], hoist_tms=False) + config.insert_nop("_fused_op_17", ["matmul_108", "matmul_114", "matmul_128", "add_143"], hoist_tms=False) + config.insert_nop("_fused_op_26", ["matmul_161", "matmul_167", "matmul_181", "add_196"], hoist_tms=False) + config.insert_nop("_fused_op_35", ["matmul_214", "matmul_220", "matmul_234", "add_249"], hoist_tms=False) + config.insert_nop("_fused_op_44", ["matmul_267", "matmul_273", "matmul_287", "add_302"], hoist_tms=False) + config.insert_nop("_fused_op_53", ["matmul_320", "matmul_326", "matmul_340", "add_355"], hoist_tms=False) + config.insert_nop("_fused_op_62", ["matmul_373", "matmul_379", "matmul_393", "add_408"], hoist_tms=False) + config.insert_nop("_fused_op_71", ["matmul_426", "matmul_432", "matmul_446", "add_461"], hoist_tms=False) + config.insert_nop("_fused_op_80", ["matmul_479", "matmul_485", "matmul_499", "add_514"], hoist_tms=False) + config.insert_nop("_fused_op_89", ["matmul_532", "matmul_538", "matmul_552", "add_567"], hoist_tms=False) + config.insert_nop("_fused_op_98", ["matmul_585", "matmul_591", "matmul_605", "add_620"], hoist_tms=False) + config.insert_nop("_fused_op_107", ["matmul_638", "matmul_644", "matmul_658", "add_673"], hoist_tms=False) + config.insert_nop("_fused_op_116", ["matmul_691", "matmul_697", "matmul_711", "add_726"], hoist_tms=False) + config.insert_nop("_fused_op_125", ["matmul_744", "matmul_750", "matmul_764", "add_779"], hoist_tms=False) + config.insert_nop("_fused_op_134", ["matmul_797", "matmul_803", "matmul_817", "add_832"], hoist_tms=False) + config.insert_nop("_fused_op_143", ["matmul_850", "matmul_856", "matmul_870", "add_885"], hoist_tms=False) + config.insert_nop("_fused_op_152", ["matmul_903", "matmul_909", "matmul_923", "add_938"], hoist_tms=False) + config.insert_nop("_fused_op_161", ["matmul_956", "matmul_962", "matmul_976", "add_991"], hoist_tms=False) + config.insert_nop("_fused_op_170", ["matmul_1009", "matmul_1015", "matmul_1029", "add_1044"], hoist_tms=False) + config.insert_nop("_fused_op_179", ["matmul_1062", "matmul_1068", "matmul_1082", "add_1097"], hoist_tms=False) + config.insert_nop("_fused_op_188", ["matmul_1115", "matmul_1121", "matmul_1135", "add_1150"], hoist_tms=False) + config.insert_nop("_fused_op_197", ["matmul_1168", "matmul_1174", "matmul_1188", "add_1203"], hoist_tms=False) + config.insert_nop("_fused_op_206", ["matmul_1221", "matmul_1227", 
"matmul_1241", "add_1256"], hoist_tms=False) config.override_op_placement(op_name="buffer_0_input_1_matmul_2", start=[1, 2]) config.override_op_placement(op_name="buffer_0__fused_op_8_matmul_55", start=[1, 2]) @@ -300,30 +300,30 @@ def attention_mask_buffering_galaxy(): compiler_cfg = _get_global_compiler_config() config = pybuda.config - config.insert_buffering_nop("attention_mask", ["_fused_op_0", "_fused_op_9", "_fused_op_18", "_fused_op_27", "_fused_op_36", "_fused_op_45", "_fused_op_54", "_fused_op_63", "_fused_op_72", "_fused_op_81", "_fused_op_90", "_fused_op_99", "_fused_op_108", "_fused_op_117", "_fused_op_126", "_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) - config.insert_buffering_nop("buffer_0_attention_mask__fused_op_0", ["_fused_op_9", "_fused_op_18", "_fused_op_27", "_fused_op_36", "_fused_op_45", "_fused_op_54", "_fused_op_63", "_fused_op_72", "_fused_op_81", "_fused_op_90", "_fused_op_99", "_fused_op_108", "_fused_op_117", "_fused_op_126", "_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) - config.insert_buffering_nop("buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9", ["_fused_op_18", "_fused_op_27", "_fused_op_36", "_fused_op_45", "_fused_op_54", "_fused_op_63", "_fused_op_72", "_fused_op_81", "_fused_op_90", "_fused_op_99", "_fused_op_108", "_fused_op_117", "_fused_op_126", "_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) - config.insert_buffering_nop("buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18", ["_fused_op_27", "_fused_op_36", "_fused_op_45", "_fused_op_54", "_fused_op_63", "_fused_op_72", "_fused_op_81", "_fused_op_90", "_fused_op_99", "_fused_op_108", "_fused_op_117", "_fused_op_126", "_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) - config.insert_buffering_nop("buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27", ["_fused_op_36", "_fused_op_45", "_fused_op_54", "_fused_op_63", "_fused_op_72", "_fused_op_81", "_fused_op_90", "_fused_op_99", "_fused_op_108", "_fused_op_117", "_fused_op_126", "_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) - config.insert_buffering_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36", ["_fused_op_45", "_fused_op_54", "_fused_op_63", "_fused_op_72", "_fused_op_81", "_fused_op_90", "_fused_op_99", "_fused_op_108", "_fused_op_117", "_fused_op_126", "_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) - config.insert_buffering_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45", ["_fused_op_54", "_fused_op_63", "_fused_op_72", "_fused_op_81", "_fused_op_90", "_fused_op_99", "_fused_op_108", "_fused_op_117", "_fused_op_126", "_fused_op_135", "_fused_op_144", 
"_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) - config.insert_buffering_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54", ["_fused_op_63", "_fused_op_72", "_fused_op_81", "_fused_op_90", "_fused_op_99", "_fused_op_108", "_fused_op_117", "_fused_op_126", "_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) - config.insert_buffering_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63", ["_fused_op_72", "_fused_op_81", "_fused_op_90", "_fused_op_99", "_fused_op_108", "_fused_op_117", "_fused_op_126", "_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) - config.insert_buffering_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72", ["_fused_op_81", "_fused_op_90", "_fused_op_99", "_fused_op_108", "_fused_op_117", "_fused_op_126", "_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) - config.insert_buffering_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72__fused_op_81", ["_fused_op_90", "_fused_op_99", "_fused_op_108", "_fused_op_117", "_fused_op_126", "_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) - config.insert_buffering_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72__fused_op_81__fused_op_90", ["_fused_op_99", "_fused_op_108", "_fused_op_117", "_fused_op_126", "_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) - config.insert_buffering_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72__fused_op_81__fused_op_90__fused_op_99", ["_fused_op_108", "_fused_op_117", "_fused_op_126", "_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) - 
config.insert_buffering_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72__fused_op_81__fused_op_90__fused_op_99__fused_op_108", ["_fused_op_117", "_fused_op_126", "_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) - config.insert_buffering_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72__fused_op_81__fused_op_90__fused_op_99__fused_op_108__fused_op_117", ["_fused_op_126", "_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) - config.insert_buffering_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72__fused_op_81__fused_op_90__fused_op_99__fused_op_108__fused_op_117__fused_op_126", ["_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) - config.insert_buffering_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72__fused_op_81__fused_op_90__fused_op_99__fused_op_108__fused_op_117__fused_op_126__fused_op_135", ["_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) - config.insert_buffering_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72__fused_op_81__fused_op_90__fused_op_99__fused_op_108__fused_op_117__fused_op_126__fused_op_135__fused_op_144", ["_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) - config.insert_buffering_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72__fused_op_81__fused_op_90__fused_op_99__fused_op_108__fused_op_117__fused_op_126__fused_op_135__fused_op_144__fused_op_153", ["_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) - 
config.insert_buffering_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72__fused_op_81__fused_op_90__fused_op_99__fused_op_108__fused_op_117__fused_op_126__fused_op_135__fused_op_144__fused_op_153__fused_op_162", ["_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) - config.insert_buffering_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72__fused_op_81__fused_op_90__fused_op_99__fused_op_108__fused_op_117__fused_op_126__fused_op_135__fused_op_144__fused_op_153__fused_op_162__fused_op_171", ["_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) - config.insert_buffering_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72__fused_op_81__fused_op_90__fused_op_99__fused_op_108__fused_op_117__fused_op_126__fused_op_135__fused_op_144__fused_op_153__fused_op_162__fused_op_171__fused_op_180", ["_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) - config.insert_buffering_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72__fused_op_81__fused_op_90__fused_op_99__fused_op_108__fused_op_117__fused_op_126__fused_op_135__fused_op_144__fused_op_153__fused_op_162__fused_op_171__fused_op_180__fused_op_189", ["_fused_op_198", "_fused_op_207"], hoist_tms=False) - config.insert_buffering_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72__fused_op_81__fused_op_90__fused_op_99__fused_op_108__fused_op_117__fused_op_126__fused_op_135__fused_op_144__fused_op_153__fused_op_162__fused_op_171__fused_op_180__fused_op_189__fused_op_198", ["_fused_op_207"], hoist_tms=False) + config.insert_nop("attention_mask", ["_fused_op_0", "_fused_op_9", "_fused_op_18", "_fused_op_27", "_fused_op_36", "_fused_op_45", "_fused_op_54", "_fused_op_63", "_fused_op_72", "_fused_op_81", "_fused_op_90", "_fused_op_99", "_fused_op_108", "_fused_op_117", "_fused_op_126", "_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) + config.insert_nop("buffer_0_attention_mask__fused_op_0", ["_fused_op_9", "_fused_op_18", "_fused_op_27", "_fused_op_36", "_fused_op_45", "_fused_op_54", "_fused_op_63", 
"_fused_op_72", "_fused_op_81", "_fused_op_90", "_fused_op_99", "_fused_op_108", "_fused_op_117", "_fused_op_126", "_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) + config.insert_nop("buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9", ["_fused_op_18", "_fused_op_27", "_fused_op_36", "_fused_op_45", "_fused_op_54", "_fused_op_63", "_fused_op_72", "_fused_op_81", "_fused_op_90", "_fused_op_99", "_fused_op_108", "_fused_op_117", "_fused_op_126", "_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) + config.insert_nop("buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18", ["_fused_op_27", "_fused_op_36", "_fused_op_45", "_fused_op_54", "_fused_op_63", "_fused_op_72", "_fused_op_81", "_fused_op_90", "_fused_op_99", "_fused_op_108", "_fused_op_117", "_fused_op_126", "_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) + config.insert_nop("buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27", ["_fused_op_36", "_fused_op_45", "_fused_op_54", "_fused_op_63", "_fused_op_72", "_fused_op_81", "_fused_op_90", "_fused_op_99", "_fused_op_108", "_fused_op_117", "_fused_op_126", "_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) + config.insert_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36", ["_fused_op_45", "_fused_op_54", "_fused_op_63", "_fused_op_72", "_fused_op_81", "_fused_op_90", "_fused_op_99", "_fused_op_108", "_fused_op_117", "_fused_op_126", "_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) + config.insert_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45", ["_fused_op_54", "_fused_op_63", "_fused_op_72", "_fused_op_81", "_fused_op_90", "_fused_op_99", "_fused_op_108", "_fused_op_117", "_fused_op_126", "_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) + config.insert_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54", ["_fused_op_63", "_fused_op_72", "_fused_op_81", "_fused_op_90", "_fused_op_99", "_fused_op_108", "_fused_op_117", "_fused_op_126", "_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) + config.insert_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63", ["_fused_op_72", "_fused_op_81", "_fused_op_90", "_fused_op_99", "_fused_op_108", "_fused_op_117", "_fused_op_126", "_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", 
"_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) + config.insert_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72", ["_fused_op_81", "_fused_op_90", "_fused_op_99", "_fused_op_108", "_fused_op_117", "_fused_op_126", "_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) + config.insert_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72__fused_op_81", ["_fused_op_90", "_fused_op_99", "_fused_op_108", "_fused_op_117", "_fused_op_126", "_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) + config.insert_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72__fused_op_81__fused_op_90", ["_fused_op_99", "_fused_op_108", "_fused_op_117", "_fused_op_126", "_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) + config.insert_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72__fused_op_81__fused_op_90__fused_op_99", ["_fused_op_108", "_fused_op_117", "_fused_op_126", "_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) + config.insert_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72__fused_op_81__fused_op_90__fused_op_99__fused_op_108", ["_fused_op_117", "_fused_op_126", "_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) + config.insert_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72__fused_op_81__fused_op_90__fused_op_99__fused_op_108__fused_op_117", ["_fused_op_126", "_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) + 
config.insert_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72__fused_op_81__fused_op_90__fused_op_99__fused_op_108__fused_op_117__fused_op_126", ["_fused_op_135", "_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) + config.insert_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72__fused_op_81__fused_op_90__fused_op_99__fused_op_108__fused_op_117__fused_op_126__fused_op_135", ["_fused_op_144", "_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) + config.insert_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72__fused_op_81__fused_op_90__fused_op_99__fused_op_108__fused_op_117__fused_op_126__fused_op_135__fused_op_144", ["_fused_op_153", "_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) + config.insert_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72__fused_op_81__fused_op_90__fused_op_99__fused_op_108__fused_op_117__fused_op_126__fused_op_135__fused_op_144__fused_op_153", ["_fused_op_162", "_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) + config.insert_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72__fused_op_81__fused_op_90__fused_op_99__fused_op_108__fused_op_117__fused_op_126__fused_op_135__fused_op_144__fused_op_153__fused_op_162", ["_fused_op_171", "_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) + config.insert_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72__fused_op_81__fused_op_90__fused_op_99__fused_op_108__fused_op_117__fused_op_126__fused_op_135__fused_op_144__fused_op_153__fused_op_162__fused_op_171", ["_fused_op_180", "_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) + 
config.insert_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72__fused_op_81__fused_op_90__fused_op_99__fused_op_108__fused_op_117__fused_op_126__fused_op_135__fused_op_144__fused_op_153__fused_op_162__fused_op_171__fused_op_180", ["_fused_op_189", "_fused_op_198", "_fused_op_207"], hoist_tms=False) + config.insert_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72__fused_op_81__fused_op_90__fused_op_99__fused_op_108__fused_op_117__fused_op_126__fused_op_135__fused_op_144__fused_op_153__fused_op_162__fused_op_171__fused_op_180__fused_op_189", ["_fused_op_198", "_fused_op_207"], hoist_tms=False) + config.insert_nop("buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18__fused_op_27__fused_op_36__fused_op_45__fused_op_54__fused_op_63__fused_op_72__fused_op_81__fused_op_90__fused_op_99__fused_op_108__fused_op_117__fused_op_126__fused_op_135__fused_op_144__fused_op_153__fused_op_162__fused_op_171__fused_op_180__fused_op_189__fused_op_198", ["_fused_op_207"], hoist_tms=False) config.add_schedule_constraint(["matmul_55", "buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9"]) config.add_schedule_constraint(["matmul_108", "buffer_0_buffer_0_buffer_0_attention_mask__fused_op_0__fused_op_9__fused_op_18"]) @@ -956,12 +956,12 @@ def op_overrides(parameters): # buffer insertion: # attention_mask -> # fused_op_0 -> fused_op_2 buffer insertion - config.insert_buffering_nop(f"_fused_op_{0+9*i}", [f"_fused_op_{2+9*i}"], hoist_tms=False) + config.insert_nop(f"_fused_op_{0+9*i}", [f"_fused_op_{2+9*i}"], hoist_tms=False) config.override_op_size(f"buffer_0__fused_op_{0+9*i}__fused_op_{2+9*i}", [2, 1]) config.override_op_placement(op_name=f"buffer_0__fused_op_{0+9*i}__fused_op_{2+9*i}", start=[3, 3], transpose_op=True) # fused_op_5 -> add_51 buffer insertion - config.insert_buffering_nop(f"_fused_op_{5+9*i}", [f"add_{51+53*i}"], hoist_tms=False) + config.insert_nop(f"_fused_op_{5+9*i}", [f"add_{51+53*i}"], hoist_tms=False) config.override_op_size(f"buffer_0__fused_op_{5+9*i}_add_{51+53*i}", [1, 2]) config.override_op_placement(op_name=f"buffer_0__fused_op_{5+9*i}_add_{51+53*i}", start=[8, 6]) diff --git a/pybuda/test/galaxy/test_galaxy_bert_demo.py b/pybuda/test/galaxy/test_galaxy_bert_demo.py index f81f3f1de..b047ad882 100644 --- a/pybuda/test/galaxy/test_galaxy_bert_demo.py +++ b/pybuda/test/galaxy/test_galaxy_bert_demo.py @@ -83,7 +83,7 @@ def apply_galaxy_am_buffering(config): for i in range(num_enc): am_consumer_ops.append(f"add_{17+53*i}") - config.insert_buffering_nop("attention_mask", am_consumer_ops) + config.insert_nop("attention_mask", am_consumer_ops) def apply_config_overrides(config): for i in range(num_enc): @@ -145,10 +145,10 @@ def apply_galaxy_am_buffering(config): for i in range(num_enc): am_consumer_ops.append(f"add_{17+53*i}") - 
config.insert_buffering_nop("attention_mask", am_consumer_ops) + config.insert_nop("attention_mask", am_consumer_ops) curr_buffer_name = am_consumer_ops[0] for i in range(num_enc): - config.insert_buffering_nop(curr_buffer_name, am_consumer_ops[i+1:], hoist_tms=False) + config.insert_nop(curr_buffer_name, am_consumer_ops[i+1:], hoist_tms=False) curr_buffer_name = f"buffer_0_{curr_buffer_name}_{am_consumer_ops[i+1]}" def apply_config_overrides(config): diff --git a/pybuda/test/galaxy/test_galaxy_multichip.py b/pybuda/test/galaxy/test_galaxy_multichip.py index 709505ae1..1d28f9ccd 100644 --- a/pybuda/test/galaxy/test_galaxy_multichip.py +++ b/pybuda/test/galaxy/test_galaxy_multichip.py @@ -124,7 +124,7 @@ def test_pt_encoder(test_kind, test_device, size, encoder_count, num_chips): relative_atol, pcc = get_relaxed_atol_pcc(test_kind, test_device, size, microbatch) if test_device.is_silicon() and test_kind.is_training() and size == "base": - if test_device.is_wormhole(): + if test_device.is_wormhole_b0(): pcc = 0.9 if test_device.is_silicon() and test_kind.is_training() and size == "large": @@ -175,7 +175,7 @@ def test_pt_encoder(test_kind, test_device, size, encoder_count, num_chips): @pytest.mark.parametrize( "num_chips", [2, 4, 8, 12, 32], ids=["chip2", "chip4", "chip8", "chip12", "chip32"], ) -def test_multichip_wormhole_multi_encoder_split_concurrent( +def test_multichip_wormhole_b0_multi_encoder_split_concurrent( test_kind, cfg, test_device, encoder_count, num_chips ): hidden_dim = cfg[0] @@ -221,7 +221,7 @@ def test_multichip_wormhole_multi_encoder_split_concurrent( VerifyConfig( test_kind=test_kind, devtype=test_device.devtype, - arch=BackendDevice.Wormhole, + arch=BackendDevice.Wormhole_B0, relative_atol=relative_atol, pcc=pcc, accumulation_steps=1, @@ -541,13 +541,13 @@ def two_chip_multi_temporal_unary_to_unary(act): ] devtype = BackendType.Silicon - arch = BackendDevice.Wormhole + arch = BackendDevice.Wormhole_B0 compiler_cfg = _get_global_compiler_config() # pybuda.set_configuration_options( # backend_cluster_descriptor_path=eth_connections_file # ) - # Only run this on WH silicon, where create-ethernet-map can be called + # Only run this on WH_B0 silicon, where create-ethernet-map can be called device_cfg = get_device_config( arch, [], # chip_ids diff --git a/pybuda/test/llama/decode.py b/pybuda/test/llama/decode.py index 53af31dc9..cd212df6b 100644 --- a/pybuda/test/llama/decode.py +++ b/pybuda/test/llama/decode.py @@ -24,7 +24,7 @@ def main(): parser.add_argument('-d', '--device', choices=['huggingface', 'pytorch', 'golden', 'silicon'], default='huggingface', help='huggingface: run using HF code only, pytorch: use our shim but run in PyTorch, golden/silicon: run via pybuda') parser.add_argument('--no-kv-cache', action='store_true', help='Do not use a kv-cache and only generate the first 32 tokens') - parser.add_argument('--arch', choices=['greyskull', 'wormhole', 'wormhole_b0'], default='wormhole', help='Architecture to use for silicon') + parser.add_argument('--arch', choices=['greyskull', 'wormhole_b0'], default='wormhole_b0', help='Architecture to use for silicon') parser.add_argument('--num-chips', type=int, default=1, help='Number of chips to use') parser.add_argument('--fuse', action='store_true', help='Fuse layers') parser.add_argument('--perf', choices=['none', 'light', 'verbose'], default=None, help='Performance tracing') diff --git a/pybuda/test/llama/hang.py b/pybuda/test/llama/hang.py index f461776ad..541a2f632 100644 --- a/pybuda/test/llama/hang.py +++ 
b/pybuda/test/llama/hang.py @@ -19,7 +19,7 @@ def main(): parser = ArgumentParser('Generate text token-by-token starting with a pre-filled KV cache') parser.add_argument('-m', '--model', type=str, default='decapoda-research/llama-7b-hf', help='Model name') parser.add_argument('-d', '--device', choices=['huggingface', 'pytorch', 'golden', 'silicon'], default='huggingface', help='huggingface: run using HF code only, pytorch: use our shim but run in PyTorch, golden/silicon: run via pybuda') - parser.add_argument('--arch', choices=['greyskull', 'wormhole', 'wormhole_b0'], default='wormhole', help='Architecture to use for silicon') + parser.add_argument('--arch', choices=['greyskull', 'wormhole_b0'], default='wormhole_b0', help='Architecture to use for silicon') parser.add_argument('--precision', choices=['fp32', 'fp16', 'bf16', 'fp8', 'fp8b'], default='fp32', help='Precision to use for all silicon tensors') parser.add_argument('--amp-level', type=int, default=0, choices=[0, 1, 2], help='Automatic mixed precision level (0=off, 1=mixed b-formats, 2=mixed a-formats)') parser.add_argument('--num-chips', type=int, default=1, help='Number of chips to use') diff --git a/pybuda/test/llama/llama_test.py b/pybuda/test/llama/llama_test.py index c23cb7dd7..18f55d4ac 100644 --- a/pybuda/test/llama/llama_test.py +++ b/pybuda/test/llama/llama_test.py @@ -11,7 +11,7 @@ @pytest.mark.parametrize("device", ["silicon"]) -@pytest.mark.parametrize("arch", ["greyskull", "wormhole", "wormhole_b0"]) +@pytest.mark.parametrize("arch", ["greyskull", "wormhole_b0"]) @pytest.mark.parametrize("chips_to_use", ["chip1", "chip2", "chip32"]) def test_llama(device, arch, chips_to_use): ''' diff --git a/pybuda/test/llama/placement.py b/pybuda/test/llama/placement.py index 587feed73..b445ba4f4 100644 --- a/pybuda/test/llama/placement.py +++ b/pybuda/test/llama/placement.py @@ -51,7 +51,7 @@ def manual_placer(config, filename, loops=1, non_fuse_offset=0): } } and uses config.override_op_size, config.override_op_placement, - config.insert_buffering_nop, config.add_schedule_constraint, + config.insert_nop, config.add_schedule_constraint, config.set_chip_break, config.set_epoch_break and INSERT_DRAM_QUEUES env variable. Loop offsets are used to apply the same placement to multiple epochs from a single file. YMMV. See example_placement.json. 
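For orientation, a minimal sketch of how one placement entry maps onto PyBuda config calls, using only call patterns that already appear in this diff; the names op_a and op_b and the numeric size/start values are placeholders, not taken from example_placement.json.
# `config` is the PyBuda config object handed to manual_placer; op names below are hypothetical.
config.override_op_size("op_a", [2, 1])                                        # "size" entry
config.override_op_placement(op_name="op_a", start=[3, 3], transpose_op=True)  # grid-location entry
config.insert_nop("op_a", ["op_b"], hoist_tms=False)                           # nop-buffer entry
config.override_op_size("buffer_0_op_a_op_b", [1, 2])                          # resize the inserted buffer
config.add_schedule_constraint(["op_a", "buffer_0_op_a_op_b"])                 # schedule-constraint entry
The inserted nop is addressed as buffer_0_<src>_<dest>, which is why the follow-up size override targets buffer_0_op_a_op_b.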
@@ -112,7 +112,7 @@ def manual_placer(config, filename, loops=1, non_fuse_offset=0): transpose = grid_location_properties.get("transpose", False) print("Insert nop buffer between {} and {}, hoist_tms: {}".format(op_name_src, op_name_dest, hoist_tms)) - config.insert_buffering_nop(op_name_src, op_names_dest, hoist_tms=hoist_tms) + config.insert_nop(op_name_src, op_names_dest, hoist_tms=hoist_tms) if size is not None: config.override_op_size(buffer_node_name, size) @@ -157,7 +157,7 @@ def manual_placer(config, filename, loops=1, non_fuse_offset=0): op_names_dest = all_op_names_dest[loop:] print("Insert daisy chain nop buffer between {} and {}, hoist_tms: {}".format(buffer_name, op_names_dest, hoist_tms)) - config.insert_buffering_nop(buffer_name, op_names_dest, hoist_tms=hoist_tms) + config.insert_nop(buffer_name, op_names_dest, hoist_tms=hoist_tms) new_buffer_name = "buffer_0_{}_{}".format(buffer_name, op_names_dest[0]) diff --git a/pybuda/test/llama/pybudify_caching.py b/pybuda/test/llama/pybudify_caching.py index 8de6934f3..05554c389 100644 --- a/pybuda/test/llama/pybudify_caching.py +++ b/pybuda/test/llama/pybudify_caching.py @@ -9,7 +9,7 @@ class PyBudify(torch.nn.Module): - def __init__(self, pt_module, device='silicon', arch='wormhole', precision='fp32', amp_config_file=None, micro_batch_size=1, fuse=False, num_chips=1, perf=None, verify=False, log_level='ERROR', tti_save=None, tti_load=None, + def __init__(self, pt_module, device='silicon', arch='wormhole_b0', precision='fp32', amp_config_file=None, micro_batch_size=1, fuse=False, num_chips=1, perf=None, verify=False, log_level='ERROR', tti_save=None, tti_load=None, prefill_kvs=[], write_index=0, num_layers=None, netlist_name="pybudify_module", opt_level=0, nlp_target_cycles=-1, placement_config_file=None): super().__init__() @@ -22,7 +22,7 @@ def __init__(self, pt_module, device='silicon', arch='wormhole', precision='fp32 if device != 'pytorch': # pybuda workarounds - os.environ["GOLDEN_WORMHOLE"] = "1" + os.environ["GOLDEN_WORMHOLE_B0"] = "1" # os.environ["PYBUDA_ENABLE_BROADCAST_SPLITTING"] = "1" #os.environ["PYBUDA_DISABLE_FORK_JOIN_BUF"] = "1" # os.environ["PYBUDA_DRAM_PICK_CAPACITY"] = "1" @@ -39,7 +39,6 @@ def __init__(self, pt_module, device='silicon', arch='wormhole', precision='fp32 if nlp_target_cycles > 0: os.environ["PYBUDA_NLP_MANUAL_TARGET"] = str(nlp_target_cycles) - # os.environ["PYBUDA_ENABLE_T_STREAMING"] = "1" pybuda = self.pybuda = __import__('pybuda') # let us set log levels before importing pybuda @@ -95,7 +94,6 @@ def __init__(self, pt_module, device='silicon', arch='wormhole', precision='fp32 pybuda.set_configuration_options(enable_auto_fusing=fuse, performance_trace=perf_level, backend_opt_level=opt_level, input_queues_on_host=False) pybuda_arch = { 'grayskull': pybuda.BackendDevice.Grayskull, - 'wormhole': pybuda.BackendDevice.Wormhole, 'wormhole_b0': pybuda.BackendDevice.Wormhole_B0 }[arch] if tti_load is not None: diff --git a/pybuda/test/llama/tt_eval.py b/pybuda/test/llama/tt_eval.py index 790e16202..4938b4131 100644 --- a/pybuda/test/llama/tt_eval.py +++ b/pybuda/test/llama/tt_eval.py @@ -19,7 +19,7 @@ def main(): parser = ArgumentParser('Generate text token-by-token starting with a pre-filled KV cache') parser.add_argument('-m', '--model', type=str, default='decapoda-research/llama-7b-hf', help='Model name') parser.add_argument('-d', '--device', choices=['pytorch', 'golden', 'silicon'], default='huggingface', help='huggingface: run using HF code only, pytorch: use our shim but run in PyTorch, 
golden/silicon: run via pybuda') - parser.add_argument('--arch', choices=['greyskull', 'wormhole', 'wormhole_b0'], default='wormhole', help='Architecture to use for silicon') + parser.add_argument('--arch', choices=['greyskull', 'wormhole_b0'], default='wormhole_b0', help='Architecture to use for silicon') parser.add_argument('--num-chips', type=int, default=1, help='Number of chips to use') parser.add_argument('--fuse', action='store_true', help='Fuse layers') parser.add_argument('--perf', choices=['none', 'light', 'verbose'], default=None, help='Performance tracing') diff --git a/pybuda/test/mlir/mnist/__init__.py b/pybuda/test/mlir/mnist/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pybuda/test/mlir/mnist/test_inference.py b/pybuda/test/mlir/mnist/test_inference.py new file mode 100644 index 000000000..0722e626e --- /dev/null +++ b/pybuda/test/mlir/mnist/test_inference.py @@ -0,0 +1,17 @@ +import torch +from torch import nn + +from .utils import * + + +def test_mnist_inference(): + inputs = [torch.rand(1, 784)] + + framework_model = MNISTLinear() + fw_out = framework_model(*inputs) + + compiled_model = torch.compile(framework_model.to("tt"), backend="tt") + co_out = compiled_model(*[i.to("tt") for i in inputs]) + + co_out = [co.to("cpu") for co in co_out] + assert [torch.allclose(fo, co) for fo, co in zip(fw_out, co_out)] diff --git a/pybuda/test/mlir/mnist/test_training.py b/pybuda/test/mlir/mnist/test_training.py new file mode 100644 index 000000000..0a03792e4 --- /dev/null +++ b/pybuda/test/mlir/mnist/test_training.py @@ -0,0 +1,65 @@ +import torch +from torch import nn + +import pybuda +from .utils import * + +def test_mnist_training(): + torch.manual_seed(0) + + # Config + num_epochs = 9 + batch_size = 64 + learning_rate = 0.005 + + # Load dataset + test_loader, train_loader = load_dataset(batch_size) + + # Load TensorBoard writer (for logging) + writer = load_tb_writer() + + # Define model and instruct it to compile and run on TT device + framework_model = MNISTLinear() + tt_model = pybuda.compile(framework_model) + tt_model.to("tt") + + # Create a torch loss and leave on CPU + tt_loss = torch.nn.L1Loss() + + # Define optimizer and instruct it to compile and run on TT device + framework_optimizer = torch.optim.SGD(framework_model.parameters(), lr=learning_rate) + tt_optimizer = pybuda.compile(framework_optimizer) + tt_optimizer.to("tt") + + for epoch_idx in range(num_epochs): + for batch_idx, (data, target) in enumerate(train_loader): + # Put inputs on device + data = data.to("tt") + + # Create target tensor and leave on CPU + target = nn.functional.one_hot(target, num_classes=10).float() + + # Reset gradients (every batch) + tt_optimizer.zero_grad() + + # Forward pass (prediction) on device + pred = tt_model(data) + + # Pull output back to CPU + pred = pred.to("cpu") + + # Compute loss on CPU + loss = tt_loss(pred, target) + + # Run backward pass on device + loss.backward() + + # Adjust weights (on device) + tt_optimizer.step() + + # Log gradients + for name, param in tt_model.named_parameters(): + writer.add_histogram(f"{name}.grad", param.grad, batch_idx) + + # Log loss + writer.add_scalar("Loss", loss.item(), batch_idx) diff --git a/pybuda/test/mlir/mnist/utils.py b/pybuda/test/mlir/mnist/utils.py new file mode 100644 index 000000000..68c679ef5 --- /dev/null +++ b/pybuda/test/mlir/mnist/utils.py @@ -0,0 +1,60 @@ +from datetime import datetime + +import torch +from torch import nn +from torch.utils.data import DataLoader +import torchvision.transforms as
transforms +from torch.utils.tensorboard import SummaryWriter +from torchvision.datasets import MNIST as mnist_dataset + + +# Model definition +class MNISTLinear(nn.Module): + def __init__(self, input_size=784, output_size=10, hidden_size=256): + super(MNISTLinear, self).__init__() + self.l1 = nn.Linear(input_size, hidden_size) + self.relu = nn.ReLU() + self.l2 = nn.Linear(hidden_size, output_size) + + def forward(self, x): + x = self.l1(x) + x = self.relu(x) + x = self.l2(x) + + return nn.functional.softmax(x) + + +def load_tb_writer(): + """ + Load TensorBoard writer for logging + """ + current_time = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + log_dir = f"runs/gradient_visualization/{current_time}/" + writer = SummaryWriter(log_dir) + + return writer + + +def load_dataset(batch_size): + """ + Load and normalize MNIST dataset + """ + transform = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize((0.1307,), (0.3081,)), # Mean and std for MNIST + transforms.Lambda(lambda x: x.view(-1)), # Flatten image + ] + ) + + train_dataset = mnist_dataset( + root="./data", train=True, download=True, transform=transform + ) + test_dataset = mnist_dataset( + root="./data", train=False, download=True, transform=transform + ) + + train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True) + test_loader = DataLoader(test_dataset, batch_size=1000, shuffle=False) + + return test_loader, train_loader diff --git a/pybuda/test/mlir/test_ops.py b/pybuda/test/mlir/test_ops.py new file mode 100644 index 000000000..41b1d3afe --- /dev/null +++ b/pybuda/test/mlir/test_ops.py @@ -0,0 +1,128 @@ +import os + +import torch +from torch import nn + +import pybuda + +def test_add(): + class Add(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, a, b): + return a + b + + inputs = [torch.rand(1, 32, 32), torch.rand(1, 32, 32)] + + framework_model = Add() + fw_out = framework_model(*inputs) + + compiled_model = pybuda.compile(framework_model, sample_inputs=inputs) + co_out = compiled_model(*inputs) + + co_out = [co.to("cpu") for co in co_out] + assert [torch.allclose(fo, co) for fo, co in zip(fw_out, co_out)] + + +def test_subtract(): + class Subtract(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, a, b): + return a - b + + inputs = [torch.rand(1, 32, 32), torch.rand(1, 32, 32)] + + framework_model = Subtract() + fw_out = framework_model(*inputs) + + compiled_model = pybuda.compile(framework_model, sample_inputs=inputs) + co_out = compiled_model(*inputs) + + co_out = [co.to("cpu") for co in co_out] + assert [torch.allclose(fo, co) for fo, co in zip(fw_out, co_out)] + + +def test_multiply(): + class Multiply(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, a, b): + return a * b + + inputs = [torch.rand(1, 32, 32), torch.rand(1, 32, 32)] + + framework_model = Multiply() + fw_out = framework_model(*inputs) + + compiled_model = pybuda.compile(framework_model, sample_inputs=inputs) + co_out = compiled_model(*inputs) + + co_out = [co.to("cpu") for co in co_out] + assert [torch.allclose(fo, co) for fo, co in zip(fw_out, co_out)] + + +def test_relu(): + class ReLU(nn.Module): + def __init__(self): + super().__init__() + self.relu = nn.ReLU() + + def forward(self, a): + return self.relu(a) + + inputs = [torch.rand(1, 32)] + + framework_model = ReLU() + fw_out = framework_model(*inputs) + + compiled_model = pybuda.compile(framework_model, sample_inputs=inputs) + co_out = compiled_model(*inputs) + + co_out = 
[co.to("cpu") for co in co_out] + assert [torch.allclose(fo, co) for fo, co in zip(fw_out, co_out)] + + +def test_linear(): + class Linear(nn.Module): + def __init__(self): + super().__init__() + self.l1 = nn.Linear(20, 30, bias=True) + + def forward(self, a): + return self.l1(a) + + inputs = [torch.rand(1, 128, 20)] + + framework_model = Linear() + fw_out = framework_model(*inputs) + + compiled_model = pybuda.compile(framework_model, sample_inputs=inputs) + co_out = compiled_model(*inputs) + + co_out = [co.to("cpu") for co in co_out] + assert [torch.allclose(fo, co) for fo, co in zip(fw_out, co_out)] + + +def test_softmax(): + class Softmax(nn.Module): + def __init__(self): + super().__init__() + self.softmax = nn.Softmax(dim=1) + + def forward(self, a): + return self.softmax(a) + + inputs = [torch.rand(1, 128)] + + framework_model = Softmax() + fw_out = framework_model(*inputs) + + compiled_model = pybuda.compile(framework_model, sample_inputs=inputs) + co_out = compiled_model(*inputs) + + co_out = [co.to("cpu") for co in co_out] + assert [torch.allclose(fo, co) for fo, co in zip(fw_out, co_out)] diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_ddrnet.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_ddrnet.py new file mode 100644 index 000000000..3e41e0887 --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_ddrnet.py @@ -0,0 +1,175 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import pybuda, os +import pytest +from torchvision import transforms +import requests +from PIL import Image +import onnx +from pybuda.verify.backend import verify_module +from pybuda import VerifyConfig +from pybuda.verify.config import TestKind +from pybuda._C.backend_api import BackendDevice + +variants = ["ddrnet23s", "ddrnet23", "ddrnet39"] + + +@pytest.mark.parametrize("variant", variants) +def test_ddrnet(variant, test_device): + + # STEP 1: Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_RIBBON2"] = "1" + + if test_device.arch == BackendDevice.Wormhole_B0: + # These overrides are planned to be ON by default + os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1" + + if test_device.arch == BackendDevice.Grayskull: + # Temp mitigations for net2pipe errors, should be removed. 
+ # + os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "0" + os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "0" + os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "0" + + # STEP 2: # Create PyBuda module from onnx weights + model_name = f"{variant}_onnx" + + load_path = ( + f"third_party/confidential_customer_models/generated/files/{variant}.onnx" + ) + + model = onnx.load(load_path) + tt_model = pybuda.OnnxModule(model_name, model, load_path) + + # STEP 3: Prepare input + url = "https://raw.githubusercontent.com/pytorch/hub/master/images/dog.jpg" + input_image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + + preprocess = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] + ) + input_tensor = preprocess(input_image) + img_tensor = input_tensor.unsqueeze(0) + + verify_module( + tt_model, + input_shapes=([img_tensor.shape]), + inputs=([img_tensor]), + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + pcc=( + 0.98 + if test_device.arch == BackendDevice.Grayskull + and variant != "ddrnet23s" + else 0.99 + ), + ), + ) + + +variants = ["ddrnet_23_slim_1024"] + + +@pytest.mark.parametrize("variant", variants) +def test_ddrnet_semantic_segmentation_onnx(variant, test_device): + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_RIBBON2"] = "1" + + if test_device.arch == BackendDevice.Wormhole_B0: + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "36864" + compiler_cfg.balancer_op_override( + "conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone931.dc.sparse_matmul.4.lc2", + "t_stream_shape", + (1, 8), + ) + compiler_cfg.balancer_op_override( + "conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone925.dc.sparse_matmul.4.lc2", + "t_stream_shape", + (1, 8), + ) + compiler_cfg.balancer_op_override( + "conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone11803.dc.sparse_matmul.4.lc2", + "t_stream_shape", + (1, 8), + ) + compiler_cfg.balancer_op_override( + "conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone11809.dc.sparse_matmul.4.lc2", + "t_stream_shape", + (1, 8), + ) + compiler_cfg.balancer_op_override( + "conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone11986.dc.sparse_matmul.4.lc2", + "t_stream_shape", + (1, 16), + ) + compiler_cfg.balancer_op_override( + "conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone11980.dc.sparse_matmul.4.lc2", + "t_stream_shape", + (1, 8), + ) + compiler_cfg.balancer_op_override( + "conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone11872.dc.sparse_matmul.4.lc2", + "t_stream_shape", + (1, 8), + ) + compiler_cfg.balancer_op_override( + "conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone11866.dc.sparse_matmul.4.lc2", + "t_stream_shape", + (1, 8), + ) + + if test_device.arch == BackendDevice.Grayskull: + compiler_cfg.balancer_op_override( + "conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone931.dc.sparse_matmul.4.lc2", + "t_stream_shape", + (1, 32), + ) + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "24576" + compiler_cfg.balancer_op_override( + "conv2d_197.dc.conv2d.5.dc.reshape.0_operand_commute_clone11915.dc.sparse_matmul.4.lc2", + "t_stream_shape", + (1, 
32), + ) + + # Load and validate the model + load_path = f"third_party/confidential_customer_models/customer/model_0/files/cnn/ddrnet/{variant}.onnx" + model = onnx.load(load_path) + onnx.checker.check_model(model) + model_name = f"onnx_{variant}" + tt_model = pybuda.OnnxModule(model_name, model, load_path) + + # Prepare input + image_path = "third_party/confidential_customer_models/cv_demos/ddrnet/semantic_segmentation/image/road_scenes.png" + input_image = Image.open(image_path) + input_image = transforms.Resize((1024, 1024))(input_image) + input_tensor = transforms.ToTensor()(input_image) + input_batch = input_tensor.unsqueeze(0) + + # Inference + verify_module( + tt_model, + input_shapes=([input_batch.shape]), + inputs=([input_batch]), + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + ), + ) diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_dla.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_dla.py new file mode 100644 index 000000000..c8f84725c --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_dla.py @@ -0,0 +1,96 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import pybuda +import onnx +import os +from pybuda.verify.backend import verify_module +from pybuda import VerifyConfig +import requests +import pytest +from pybuda.verify.config import TestKind +from pybuda._C.backend_api import BackendDevice +import torchvision.transforms as transforms +from PIL import Image + + +variants = [ + "dla34", + "dla46_c", + "dla46x_c", + "dla60x_c", + "dla60", + "dla60x", + "dla102", + "dla102x", + "dla102x2", + "dla169", +] + + +@pytest.mark.parametrize("variant", variants) +def test_dla_onnx(test_device, variant): + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda._C.Float16_b + + os.environ["PYBUDA_RIBBON2"] = "1" + + # Load data sample + url = "https://images.rawpixel.com/image_1300/cHJpdmF0ZS9sci9pbWFnZXMvd2Vic2l0ZS8yMDIyLTA1L3BkMTA2LTA0Ny1jaGltXzEuanBn.jpg" + image = Image.open(requests.get(url, stream=True).raw) + + # Preprocessing + transform = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] + ) + img_tensor = transform(image).unsqueeze(0) + + onnx_dir_path = "dla" + onnx_model_path = f"dla/{variant}_Opset18.onnx" + if not os.path.exists(onnx_model_path): + if not os.path.exists("dla"): + os.mkdir("dla") + url = f"https://github.com/onnx/models/raw/main/Computer_Vision/{variant}_Opset18_timm/{variant}_Opset18.onnx?download=" + response = requests.get(url, stream=True) + with open(onnx_model_path, "wb") as f: + f.write(response.content) + + # Load DLA model + model_name = f"dla_{variant}_onnx" + onnx_model = onnx.load(onnx_model_path) + tt_model = pybuda.OnnxModule(model_name, onnx_model, onnx_model_path) + + pcc = 0.99 + if test_device.arch == BackendDevice.Wormhole_B0: + if variant == "dla34": + pcc = 0.98 + elif variant == "dla169": + pcc = 0.96 + elif test_device.arch == BackendDevice.Grayskull: + if variant == "dla46_c": + pcc = 0.97 + if variant == "dla102x2": + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + + verify_module( + tt_model, + input_shapes=[img_tensor.shape], + inputs=[(img_tensor, )], + verify_cfg=VerifyConfig( + arch=test_device.arch, + 
devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + pcc=pcc, + ), + ) + + # Cleanup model files + os.remove(onnx_model_path) + os.rmdir(onnx_dir_path) diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_fpn.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_fpn.py new file mode 100644 index 000000000..e6947ca20 --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_fpn.py @@ -0,0 +1,39 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import torch +import pybuda +import onnx +import os +from pybuda.verify.backend import verify_module +from pybuda import VerifyConfig + + +def test_fpn_onnx(test_device, test_kind): + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda._C.Float16_b + + os.environ["PYBUDA_RIBBON2"] = "1" + os.environ["PYBUDA_FORCE_EMULATE_HARVESTED"] = "1" + + # Load FPN model + onnx_model_path = "third_party/confidential_customer_models/generated/files/fpn.onnx" + model = onnx.load(onnx_model_path) + tt_model = pybuda.OnnxModule("onnx_fpn", model, onnx_model_path) + + feat0 = torch.rand(1, 10, 64, 64) + feat1 = torch.rand(1, 20, 16, 16) + feat2 = torch.rand(1, 30, 8, 8) + + verify_module( + tt_model, + input_shapes=[feat0.shape, feat1.shape, feat2.shape], + inputs=[(feat0, feat1, feat2)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=test_kind, + ), + ) diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_hardnet.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_hardnet.py new file mode 100644 index 000000000..5667a054c --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_hardnet.py @@ -0,0 +1,77 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import pybuda, os +import onnx +from PIL import Image +from torchvision import transforms +import urllib +from pybuda.verify.backend import verify_module +import pytest +from pybuda import VerifyConfig +from pybuda.verify.config import TestKind +from pybuda._C.backend_api import BackendDevice + +variants = ["hardnet68", "hardnet85", "hardnet68ds", "hardnet39ds"] + + +@pytest.mark.parametrize("variant", variants) +def test_hardnet_onnx(variant, test_device): + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_RIBBON2"] = "1" + + if variant == "hardnet68ds": + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + + if variant == "hardnet85" and test_device.arch == BackendDevice.Grayskull: + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + + # Download an example image + url, filename = ( + "https://github.com/pytorch/hub/raw/master/images/dog.jpg", + "dog.jpg", + ) + try: + urllib.URLopener().retrieve(url, filename) + except: + urllib.request.urlretrieve(url, filename) + + # Preprocessing + input_image = Image.open(filename) + preprocess = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] + ) + input_tensor = preprocess(input_image) + img_tensor = input_tensor.unsqueeze(0) + + load_path = ( + 
f"third_party/confidential_customer_models/generated/files/{variant}.onnx" + ) + model_name = f"{variant}_onnx" + + # Create PyBuda module from onnx weights + model = onnx.load(load_path) + tt_model = pybuda.OnnxModule(model_name, model, load_path) + + # Run inference on Tenstorrent device + verify_module( + tt_model, + input_shapes=([img_tensor.shape]), + inputs=([img_tensor]), + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + pcc=0.98, + ), + ) diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_lstm_valence.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_lstm_valence.py index 5ee8a115a..8256da206 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_lstm_valence.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_lstm_valence.py @@ -20,7 +20,6 @@ def test_lstm_valence_onnx(test_device): # Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b os.environ["PYBUDA_RIBBON2"] = "1" diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio.py new file mode 100644 index 000000000..4077b68e8 --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_perceiverio.py @@ -0,0 +1,113 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import pybuda +import onnx + +import os +import requests +from PIL import Image +import pytest + +from transformers import AutoImageProcessor + +from pybuda.verify.backend import verify_module +from pybuda import VerifyConfig +from pybuda.verify.config import TestKind + + +def get_sample_data(model_name): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + + image_processor = AutoImageProcessor.from_pretrained(model_name) + pixel_values = image_processor(images=image, return_tensors="pt").pixel_values + return pixel_values + + +@pytest.mark.parametrize( + "model_name", + [ + "deepmind/vision-perceiver-conv", + "deepmind/vision-perceiver-learned", + "deepmind/vision-perceiver-fourier", + ], +) +def test_perceiver_for_image_classification_onnx(test_device, model_name): + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + compiler_cfg.enable_auto_fusing = False + os.environ["PYBUDA_RIBBON2"] = "1" + verify_enabled = True + + if test_device.arch == pybuda.BackendDevice.Wormhole_B0: + + if model_name == "deepmind/vision-perceiver-learned": + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{105*1024}" + + elif model_name == "deepmind/vision-perceiver-conv": + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{10*1024}" + compiler_cfg.balancer_op_override("multiply_19", "t_stream_shape", (1, 1)) + compiler_cfg.balancer_op_override("multiply_142", "t_stream_shape", (1, 1)) + compiler_cfg.balancer_op_override("multiply_3103", "t_stream_shape", (1, 1)) + compiler_cfg.balancer_op_override("multiply_3123", "t_stream_shape", (1, 1)) + compiler_cfg.balancer_op_override("multiply_2745", "t_stream_shape", (1, 1)) + compiler_cfg.balancer_op_override("multiply_2934", 
"t_stream_shape", (1, 1)) + + elif model_name == "deepmind/vision-perceiver-fourier": + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{101*1024}" + + elif test_device.arch == pybuda.BackendDevice.Grayskull: + + if test_device.devtype == pybuda.BackendType.Silicon: + verify_enabled = False + + if model_name == "deepmind/vision-perceiver-learned": + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{101*1024}" + + elif model_name == "deepmind/vision-perceiver-fourier": + os.environ["PYBUDA_DISABLE_PADDING_PASS"] = "1" + compiler_cfg.place_on_new_epoch("hslice_50.dc.sparse_matmul.2.lc2") + compiler_cfg.place_on_new_epoch("matmul_47") + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{101*1024}" + compiler_cfg.balancer_op_override( + "hslice_50.dc.sparse_matmul.2.lc2", "t_stream_shape", (1, 7) + ) + + onnx_model_path = ( + "third_party/confidential_customer_models/generated/files/" + + str(model_name).split("/")[-1].replace("-", "_") + + ".onnx" + ) + + # Sample Image + pixel_values = get_sample_data(model_name) + + # Load the onnx model + onnx_model = onnx.load(onnx_model_path) + onnx.checker.check_model(onnx_model) + + # Create PyBuda module from Onnx model + tt_model = pybuda.OnnxModule( + str(model_name.split("/")[-1].replace("-", "_")) + "_onnx", + onnx_model, + onnx_model_path, + ) + + # Run inference on Tenstorrent device + verify_module( + tt_model, + input_shapes=(pixel_values.shape,), + inputs=[(pixel_values,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + enabled=verify_enabled, # pcc drops in silicon devicetype + pcc=0.96, + ), + ) diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_retinanet.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_retinanet.py index 98ddf8468..c30c23a99 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_retinanet.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_retinanet.py @@ -1,6 +1,7 @@ # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC # SPDX-License-Identifier: Apache-2.0 + # STEP 0: import PyBuda library import pytest @@ -18,6 +19,9 @@ from PIL import Image import numpy as np +import requests +from torchvision import transforms + ## https://github.com/onnx/models/tree/main/vision/object_detection_segmentation/retinanet ######## @@ -44,9 +48,6 @@ def img_preprocess(scal_val=1): ######### def test_retinanet_r101_640x480_onnx(test_device): - if test_device.arch == BackendDevice.Grayskull: - pytest.skip("Failing on GS by exceding resource constraints and blobgen error respectively") - os.environ["PYBUDA_DECOMPOSE_SIGMOID"] = "1" os.environ["PYBUDA_DISABLE_CONV_MULTI_OP_FRACTURE"] = "1" os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{76*1024}" @@ -54,10 +55,15 @@ def test_retinanet_r101_640x480_onnx(test_device): os.environ["PYBUDA_BALANCER_PREPASS_DISABLED"] = "1" os.environ["PYBUDA_LEGACY_UBLOCK_SHAPE"] = "1" + # Temp mitigations for net2pipe errors, should be removed. 
+ # + os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "0" + os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "0" + os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "0" + # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.graph_solver_self_cut_type = "ConsumerOperandDataEdgesFirst" compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b compiler_cfg.conv_multi_op_fracture_factor_override["conv2d_356"] = 3 @@ -71,6 +77,7 @@ def test_retinanet_r101_640x480_onnx(test_device): img_tensor = img_preprocess() # STEP 3: Run inference on Tenstorrent device + pcc = 0.97 if test_device.arch == BackendDevice.Grayskull and test_device.devtype == BackendType.Silicon else 0.99 verify_module( tt_model, input_shapes=([img_tensor.shape]), @@ -80,5 +87,110 @@ def test_retinanet_r101_640x480_onnx(test_device): arch=test_device.arch, devtype=test_device.devtype, devmode=test_device.devmode, + pcc=pcc, ) ) + +def img_preprocessing(): + + url = "https://i.ytimg.com/vi/q71MCWAEfL8/maxresdefault.jpg" + pil_img = Image.open(requests.get(url, stream=True).raw) + new_size = (640, 480) + pil_img = pil_img.resize(new_size, resample=Image.BICUBIC) + preprocess = transforms.Compose( + [ + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] + ) + img = preprocess(pil_img) + img = img.unsqueeze(0) + return img + +variants = [ + "retinanet_rn18fpn", + "retinanet_rn34fpn", + "retinanet_rn50fpn", + "retinanet_rn152fpn", +] + +@pytest.mark.parametrize("variant", variants) +def test_retinanet_onnx(variant, test_device): + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_DECOMPOSE_SIGMOID"] = "1" + os.environ["PYBUDA_RIBBON2"] = "1" + + if test_device.arch == BackendDevice.Wormhole_B0: + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "73728" + + if variant == "retinanet_rn18fpn": + compiler_cfg.place_on_new_epoch("conv2d_117.dc.matmul.11") + compiler_cfg.balancer_op_override("conv2d_82.dc.matmul.11", "t_stream_shape", (1,1)) + compiler_cfg.balancer_op_override("conv2d_60.dc.matmul.11", "grid_shape", (1,1)) + + elif variant == "retinanet_rn34fpn": + compiler_cfg.place_on_new_epoch("conv2d_157.dc.matmul.11") + compiler_cfg.balancer_op_override("conv2d_122.dc.matmul.11", "t_stream_shape", (1,1)) + compiler_cfg.balancer_op_override("conv2d_100.dc.matmul.11", "grid_shape", (1,1)) + + elif variant == "retinanet_rn50fpn": + compiler_cfg.place_on_new_epoch("conv2d_190.dc.matmul.11") + compiler_cfg.balancer_op_override("conv2d_155.dc.matmul.11", "t_stream_shape", (1,1)) + compiler_cfg.balancer_op_override("conv2d_133.dc.matmul.11", "grid_shape", (1,1)) + + elif variant == "retinanet_rn152fpn": + compiler_cfg.place_on_new_epoch("conv2d_428.dc.matmul.11") + compiler_cfg.balancer_op_override("conv2d_393.dc.matmul.11", "t_stream_shape", (1,1)) + compiler_cfg.balancer_op_override("conv2d_371.dc.matmul.11", "grid_shape", (1,1)) + + if test_device.arch == BackendDevice.Grayskull: + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "69632" + # Temp mitigations for net2pipe errors, should be removed. 
+ # + os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "0" + os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "0" + os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "0" + + if variant == "retinanet_rn18fpn": + compiler_cfg.balancer_op_override("conv2d_82.dc.matmul.11", "t_stream_shape", (1,1)) + compiler_cfg.balancer_op_override("conv2d_60.dc.matmul.11", "t_stream_shape", (1,1)) + + elif variant == "retinanet_rn34fpn": + compiler_cfg.balancer_op_override("conv2d_122.dc.matmul.11", "t_stream_shape", (1,1)) + compiler_cfg.balancer_op_override("conv2d_100.dc.matmul.11", "t_stream_shape", (1,1)) + + elif variant == "retinanet_rn50fpn": + compiler_cfg.balancer_op_override("conv2d_155.dc.matmul.11", "t_stream_shape", (1,1)) + compiler_cfg.balancer_op_override("conv2d_133.dc.matmul.11", "t_stream_shape", (1,1)) + + elif variant == "retinanet_rn152fpn": + compiler_cfg.balancer_op_override("conv2d_393.dc.matmul.11", "t_stream_shape", (1,1)) + compiler_cfg.balancer_op_override("conv2d_371.dc.matmul.11", "t_stream_shape", (1,1)) + + # Prepare model + load_path = ( + f"third_party/confidential_customer_models/generated/files/{variant}.onnx" + ) + model_name = f"onnx_{variant}" + model = onnx.load(load_path) + tt_model = pybuda.OnnxModule(model_name, model, load_path) + + # Prepare input + input_batch = img_preprocessing() + + # Inference + verify_module( + tt_model, + input_shapes=([input_batch.shape]), + inputs=([input_batch]), + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + ), + ) diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_imgcls.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_imgcls.py new file mode 100644 index 000000000..75928a091 --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_imgcls.py @@ -0,0 +1,80 @@ +import pybuda +from pybuda.verify.backend import verify_module +from pybuda import VerifyConfig +from pybuda.verify.config import TestKind +from transformers import AutoImageProcessor +import os +import pytest +import requests +from PIL import Image +import onnx + + +def get_sample_data(model_name): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + + image_processor = AutoImageProcessor.from_pretrained(model_name) + pixel_values = image_processor(images=image, return_tensors="pt").pixel_values + return pixel_values + + +variants_img_classification = [ + "nvidia/mit-b0", + "nvidia/mit-b1", + "nvidia/mit-b2", + "nvidia/mit-b3", + "nvidia/mit-b4", + "nvidia/mit-b5", +] + + +@pytest.mark.parametrize("variant", variants_img_classification) +def test_segformer_image_classification_onnx(test_device, variant): + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_RIBBON2"] = "1" + os.environ["PYBUDA_DISABLE_PADDING_PASS"] = "1" + pcc_value = 0.99 + + if test_device.arch == pybuda.BackendDevice.Wormhole_B0: + + if variant in [ + "nvidia/mit-b1", + "nvidia/mit-b2", + "nvidia/mit-b3", + "nvidia/mit-b4", + "nvidia/mit-b5", + ]: + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + + if variant == "nvidia/mit-b0" and test_device.devtype == pybuda.BackendType.Silicon: + pcc_value = 0.97 + + # Load the sample image + pixel_values = get_sample_data(variant) + + 
onnx_model_path = "third_party/confidential_customer_models/generated/files/" + str(variant).split("/")[-1].replace("-", "_") + ".onnx" + model = onnx.load(onnx_model_path) + onnx.checker.check_model(model) + + tt_model = pybuda.OnnxModule(str(variant).split("/")[-1].replace("-", "_"), model, onnx_model_path) + + # Run inference on Tenstorrent device + verify_module( + tt_model, + input_shapes=[(pixel_values.shape,)], + inputs=[(pixel_values,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + verify_pybuda_codegen_vs_framework=True, + verify_tvm_compile=True, + pcc=pcc_value, + ), + ) diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_semseg.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_semseg.py new file mode 100644 index 000000000..84cea846e --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_segformer_semseg.py @@ -0,0 +1,105 @@ +import pybuda +from pybuda.verify.backend import verify_module +from pybuda import VerifyConfig +from pybuda.verify.config import TestKind +from transformers import AutoImageProcessor +import os +import pytest +import requests +from PIL import Image +import onnx + + +def get_sample_data(model_name): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + + image_processor = AutoImageProcessor.from_pretrained(model_name) + pixel_values = image_processor(images=image, return_tensors="pt").pixel_values + return pixel_values + + +variants_semseg = [ + "nvidia/segformer-b0-finetuned-ade-512-512", + "nvidia/segformer-b1-finetuned-ade-512-512", + "nvidia/segformer-b2-finetuned-ade-512-512", + "nvidia/segformer-b3-finetuned-ade-512-512", + "nvidia/segformer-b4-finetuned-ade-512-512", +] + + +@pytest.mark.parametrize("variant", variants_semseg) +def test_segformer_semantic_segmentation_onnx(test_device, variant): + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_RIBBON2"] = "1" + os.environ["PYBUDA_DISABLE_PADDING_PASS"] = "1" + pcc_value = 0.99 + + if test_device.arch == pybuda.BackendDevice.Wormhole_B0: + if variant in [ + "nvidia/segformer-b1-finetuned-ade-512-512", + "nvidia/segformer-b2-finetuned-ade-512-512", + "nvidia/segformer-b3-finetuned-ade-512-512", + "nvidia/segformer-b4-finetuned-ade-512-512", + ]: + + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + + if variant == "nvidia/segformer-b2-finetuned-ade-512-512" and test_device.devtype == pybuda.BackendType.Silicon: + pcc_value = 0.98 + + elif test_device.arch == pybuda.BackendDevice.Grayskull: + compiler_cfg.enable_auto_fusing = False + + if variant == "nvidia/segformer-b2-finetuned-ade-512-512": + compiler_cfg.place_on_new_epoch("add_1423") + compiler_cfg.place_on_new_epoch("concatenate_1427.dc.concatenate.0") + + if variant == "nvidia/segformer-b3-finetuned-ade-512-512": + compiler_cfg.place_on_new_epoch("add_2431") + compiler_cfg.place_on_new_epoch("concatenate_2435.dc.concatenate.0") + + if variant == "nvidia/segformer-b4-finetuned-ade-512-512": + compiler_cfg.place_on_new_epoch("add_3523") + compiler_cfg.place_on_new_epoch("concatenate_3527.dc.concatenate.0") + + if test_device.devtype == pybuda.BackendType.Silicon: + + if variant in [ + "nvidia/segformer-b0-finetuned-ade-512-512", + 
"nvidia/segformer-b2-finetuned-ade-512-512", + "nvidia/segformer-b4-finetuned-ade-512-512", + ]: + pcc_value = 0.98 + + if variant == "nvidia/segformer-b1-finetuned-ade-512-512": + pcc_value = 0.97 + + # Load the sample image + pixel_values = get_sample_data(variant) + + onnx_model_path = "third_party/confidential_customer_models/generated/files/" + str(variant).split("/")[-1].replace("-", "_") + ".onnx" + model = onnx.load(onnx_model_path) + onnx.checker.check_model(model) + + tt_model = pybuda.OnnxModule(str(variant).split("/")[-1].replace("-", "_"), model, onnx_model_path) + + # Run inference on Tenstorrent device + verify_module( + tt_model, + input_shapes=[(pixel_values.shape,)], + inputs=[(pixel_values,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + verify_pybuda_codegen_vs_framework=True, + verify_tvm_compile=True, + pcc=pcc_value, + ), + ) diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_v3.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_v3.py index 76a5830cd..b2ca57804 100644 --- a/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_v3.py +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_v3.py @@ -41,14 +41,11 @@ def preprocess(img): return image_data ######### +@pytest.mark.skip(reason="While loop in model, not supported yet") def test_yolov3_tiny_onnx(test_device): - - pytest.skip("While loop in model, not supported yet") - # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True # STEP 2: Create PyBuda module from PyTorch model load_path = "third_party/confidential_customer_models/model_2/onnx/saved/yolo_v3/tiny-yolov3-11.onnx" @@ -75,13 +72,11 @@ def test_yolov3_tiny_onnx(test_device): ) ) +@pytest.mark.skip(reason="While loop in model, not supported yet") def test_yolov3_onnx(test_device): - pytest.skip("While loop in model, not supported yet") - # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True # STEP 2: Create PyBuda module from PyTorch model load_path = "third_party/confidential_customer_models/model_2/onnx/saved/yolo_v3/yolov3-10.onnx" @@ -107,4 +102,3 @@ def test_yolov3_onnx(test_device): test_kind=TestKind.INFERENCE, ) ) - diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_v5.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_v5.py new file mode 100644 index 000000000..96deb8905 --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_v5.py @@ -0,0 +1,291 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import pybuda, os +import requests +import torch +from PIL import Image +from pathlib import Path +import cv2 +import numpy as np +from yolov5.utils.dataloaders import exif_transpose, letterbox +import onnx, pytest +from pybuda.verify.backend import verify_module +from pybuda import VerifyConfig +from pybuda.verify.config import TestKind +from pybuda._C.backend_api import BackendDevice + + +def data_preprocessing(ims: Image.Image, size: tuple) -> tuple: + """Data preprocessing function for YOLOv5 object detection. 
+ + Parameters + ---------- + ims : Image.Image + Input image + size : tuple + Desired image size + + Returns + ------- + tuple + List of images, number of samples, filenames, image size, inference size, preprocessed images + """ + + _, ims = ( + (len(ims), list(ims)) if isinstance(ims, (list, tuple)) else (1, [ims]) + ) # number, list of images + shape0, shape1, files = [], [], [] # image and inference shapes, filenames + + for i, im in enumerate(ims): + f = f"image{i}" # filename + im, f = np.asarray(exif_transpose(im)), getattr(im, "filename", f) or f + files.append(Path(f).with_suffix(".jpg").name) + if im.shape[0] < 5: # image in CHW + im = im.transpose((1, 2, 0)) # reverse dataloader .transpose(2, 0, 1) + im = ( + im[..., :3] if im.ndim == 3 else cv2.cvtColor(im, cv2.COLOR_GRAY2BGR) + ) # enforce 3ch input + s = im.shape[:2] # HWC + shape0.append(s) # image shape + g = max(size) / max(s) # gain + shape1.append([int(y * g) for y in s]) + ims[i] = im if im.data.contiguous else np.ascontiguousarray(im) # update + shape1 = [size[0] for _ in np.array(shape1).max(0)] # inf shape + x = [letterbox(im, shape1, auto=False)[0] for im in ims] # pad + x = np.ascontiguousarray( + np.array(x).transpose((0, 3, 1, 2)) + ) # stack and BHWC to BCHW + x = torch.from_numpy(x) / 255 # uint8 to fp16/32 + return x + + +variants = ["yolov5n", "yolov5s", "yolov5m", "yolov5l", "yolov5x"] + + +@pytest.mark.parametrize("variant", variants) +def test_yolo_v5_320x320_onnx(test_device, variant): + + # pybuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_RIBBON2"] = "1" + + input_size = 320 + + if test_device.arch == BackendDevice.Grayskull: + compiler_cfg.enable_tm_cpu_fallback = True + if variant == "yolov5x": + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{112*1024}" + + # Load the ONNX model + onnx_model_path = f"./third_party/confidential_customer_models/generated/files/{variant}_{input_size}.onnx" + onnx_model = onnx.load(onnx_model_path) + model_name = f"{variant}_{input_size}_onnx" + + # Load data sample + url = "http://images.cocodataset.org/val2017/000000397133.jpg" + image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + + # Data preprocessing on Host + pixel_values = data_preprocessing(image, size=(input_size, input_size)) + + # Run inference on Tenstorrent device + verify_module( + pybuda.OnnxModule(model_name, onnx_model, onnx_model_path), + input_shapes=([pixel_values.shape]), + inputs=([pixel_values]), + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + ), + ) + + +variants = ["yolov5n", "yolov5s", "yolov5m", "yolov5l", "yolov5x"] + + +@pytest.mark.parametrize("variant", variants) +def test_yolo_v5_480x480_onnx(test_device, variant): + + # pybuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_RIBBON2"] = "1" + compiler_cfg.enable_tm_cpu_fallback = True + # Temp mitigations for net2pipe errors, should be removed. 
+ # + os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "0" + os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "0" + os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "0" + + if test_device.arch == BackendDevice.Wormhole_B0: + + os.environ["PYBUDA_INSERT_SLICE_FOR_CONCAT"] = "1" + os.environ["PYBUDA_CONCAT_SLICE_Y"] = "10" + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{112*1024}" + if variant == "yolov5m": + compiler_cfg.balancer_op_override( + "concatenate_19.dc.concatenate.30.dc.concatenate.0.dc.concatenate.12", + "grid_shape", + (1, 1), + ) + elif test_device.arch == BackendDevice.Grayskull: + + if variant in ["yolov5n", "yolov5s"]: + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + + if variant in ["yolov5m", "yolov5x"]: + os.environ["PYBUDA_INSERT_SLICE_FOR_CONCAT"] = "1" + os.environ["PYBUDA_CONCAT_SLICE_Y"] = "10" + if variant == "yolov5m": + compiler_cfg.balancer_op_override( + "concatenate_26.dc.concatenate.30.dc.concatenate.0.dc.concatenate.12", + "grid_shape", + (1, 1), + ) + if variant == "yolov5x": + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{112*1024}" + compiler_cfg.balancer_op_override( + "concatenate_40.dc.concatenate.30.dc.concatenate.0.dc.concatenate.12", + "grid_shape", + (1, 1), + ) + + input_size = 480 + + # Load the ONNX model + onnx_model_path = f"./third_party/confidential_customer_models/generated/files/{variant}_{input_size}.onnx" + onnx_model = onnx.load(onnx_model_path) + model_name = f"{variant}_{input_size}_onnx" + + # Load data sample + url = "http://images.cocodataset.org/val2017/000000397133.jpg" + image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + + # Data preprocessing on Host + pixel_values = data_preprocessing(image, size=(input_size, input_size)) + + # Run inference on Tenstorrent device + verify_module( + pybuda.OnnxModule(model_name, onnx_model, onnx_model_path), + input_shapes=([pixel_values.shape]), + inputs=([pixel_values]), + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + ), + ) + + +variants = ["yolov5n", "yolov5s", "yolov5m", "yolov5l", "yolov5x"] + + +@pytest.mark.parametrize("variant", variants) +def test_yolo_v5_640x640_onnx(test_device, variant): + + # pybuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_RIBBON2"] = "1" + + if test_device.arch == BackendDevice.Wormhole_B0: + + os.environ["PYBUDA_INSERT_SLICE_FOR_CONCAT"] = "1" + os.environ["PYBUDA_CONCAT_SLICE_Y"] = "10" + + if variant in ["yolov5n", "yolov5s"]: + if variant == "yolov5s": + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{112*1024}" + compiler_cfg.balancer_op_override( + "concatenate_259.dc.concatenate.7", "grid_shape", (1, 1) + ) + + if variant == "yolov5m": + compiler_cfg.balancer_op_override( + "concatenate_332.dc.concatenate.7", "grid_shape", (1, 1) + ) + compiler_cfg.balancer_op_override( + "concatenate_332.dc.concatenate.7", "t_stream_shape", (1, 1) + ) + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{112*1024}" + + if variant == "yolov5l": + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{112*1024}" + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + compiler_cfg.balancer_op_override( + "concatenate_405.dc.concatenate.7", "grid_shape", 
(1, 1) + ) + + if variant == "yolov5x": + compiler_cfg.balancer_op_override( + "concatenate_478.dc.concatenate.7", "grid_shape", (1, 1) + ) + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{150*1024}" + + elif test_device.arch == BackendDevice.Grayskull: + + compiler_cfg.enable_tm_cpu_fallback = True + + if variant=="yolov5n": + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "4096" + + if variant == "yolov5l": + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{112*1024}" + + if variant in ["yolov5m", "yolov5x"]: + os.environ["PYBUDA_INSERT_SLICE_FOR_CONCAT"] = "1" + os.environ["PYBUDA_CONCAT_SLICE_Y"] = "10" + + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + + if variant == "yolov5m": + compiler_cfg.balancer_op_override( + "concatenate_26.dc.concatenate.30.dc.concatenate.0.dc.concatenate.12", + "grid_shape", + (1, 1), + ) + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{150*1024}" + + if variant == "yolov5x": + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{112*1024}" + compiler_cfg.balancer_op_override( + "concatenate_40.dc.concatenate.30.dc.concatenate.0.dc.concatenate.12", + "grid_shape", + (1, 1), + ) + + input_size = 640 + + # Load the ONNX model + onnx_model_path = f"./third_party/confidential_customer_models/generated/files/{variant}_{input_size}.onnx" + onnx_model = onnx.load(onnx_model_path) + model_name = f"{variant}_{input_size}_onnx" + + # Load data sample + url = "http://images.cocodataset.org/val2017/000000397133.jpg" + image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + + # Data preprocessing on Host + pixel_values = data_preprocessing(image, size=(input_size, input_size)) + + # Run inference on Tenstorrent device + verify_module( + pybuda.OnnxModule(model_name, onnx_model, onnx_model_path), + input_shapes=([pixel_values.shape]), + inputs=([pixel_values]), + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + ), + ) diff --git a/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_x.py b/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_x.py new file mode 100644 index 000000000..9d3e893fa --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/onnx/test_yolo_x.py @@ -0,0 +1,245 @@ +import pybuda, os +import pytest +import cv2, torch +import numpy as np +import onnx +from pybuda.verify.backend import verify_module +from pybuda import VerifyConfig +from pybuda.verify.config import TestKind +import requests +from pybuda._C.backend_api import BackendDevice + + +def preprocess(img, input_size, swap=(2, 0, 1)): + if len(img.shape) == 3: + padded_img = np.ones((input_size[0], input_size[1], 3), dtype=np.uint8) * 114 + else: + padded_img = np.ones(input_size, dtype=np.uint8) * 114 + + r = min(input_size[0] / img.shape[0], input_size[1] / img.shape[1]) + resized_img = cv2.resize( + img, + (int(img.shape[1] * r), int(img.shape[0] * r)), + interpolation=cv2.INTER_LINEAR, + ).astype(np.uint8) + padded_img[: int(img.shape[0] * r), : int(img.shape[1] * r)] = resized_img + + padded_img = padded_img.transpose(swap) + padded_img = np.ascontiguousarray(padded_img, dtype=np.float32) + padded_img = torch.from_numpy(padded_img) + return padded_img + + +variants = ["yolox_nano", "yolox_tiny", "yolox_s", "yolox_m", "yolox_l", "yolox_darknet", "yolox_x"] + + +@pytest.mark.parametrize("variant", variants) +def test_yolox_onnx(variant, test_device): + + # pybuda configuration parameters + compiler_cfg = 
pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_RIBBON2"] = "1" + os.environ["PYBUDA_DECOMPOSE_SIGMOID"] = "1" + + if test_device.arch == BackendDevice.Wormhole_B0: + + if variant in ["yolox_nano", "yolox_tiny"]: + + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 2)) + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "81920" + + elif variant == "yolox_s": + + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 4)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 4)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 4)) + compiler_cfg.balancer_op_override("conv2d_33.dc.matmul.8", "t_stream_shape", (1, 1)) + compiler_cfg.place_on_new_epoch("concatenate_275.dc.sparse_matmul.11.lc2") + + elif variant == "yolox_m": + + os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "0" + os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "0" + os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "0" + + compiler_cfg.place_on_new_epoch("conv2d_187.dc.matmul.8") + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 4)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.sparse_matmul.9.dc.sparse_matmul.1.lc2", "t_stream_shape", (5, 1)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 4)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 4)) + compiler_cfg.place_on_new_epoch("concatenate_354.dc.sparse_matmul.11.lc2") + + elif variant in ["yolox_l", "yolox_darknet", "yolox_x"]: + + os.environ["PYBUDA_FORK_JOIN_BUF_QUEUES"] = "1" + os.environ["PYBUDA_FORK_JOIN_EXPAND_OUTPUT_BUFFERS"] = "1" + os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1" + + if variant == "yolox_l": + + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 4)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 4)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 4)) + compiler_cfg.place_on_new_epoch("concatenate_433.dc.sparse_matmul.11.lc2") + + elif variant == "yolox_darknet": + + compiler_cfg.balancer_op_override("conv2d_28.dc.matmul.8", "t_stream_shape", (1, 1)) + compiler_cfg.balancer_op_override("conv2d_33.dc.matmul.8", "t_stream_shape", (1, 1)) + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "53248" + compiler_cfg.place_on_new_epoch("concatenate_222.dc.sparse_matmul.11.lc2") + + elif variant == "yolox_x": + + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 4)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 4)) + 
compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 4)) + compiler_cfg.place_on_new_epoch("concatenate_512.dc.sparse_matmul.11.lc2") + + elif test_device.arch == BackendDevice.Grayskull: + + if variant in ["yolox_nano", "yolox_tiny"]: + + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (8, 1)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (8, 1)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (8, 1)) + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "81920" + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.14.lc2", "t_stream_shape", (1, 13)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.14.lc2", "grid_shape", (1, 8)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.14.lc2", "grid_shape", (1, 8)) + + elif variant == "yolox_s": + + compiler_cfg.balancer_op_override( + "concatenate_275.dc.concatenate.7_to_concatenate_275.dc.sparse_matmul.11.lc2_1_serialized_dram_queue.before_padded_node.nop_0", + "grid_shape", + (1, 1), + ) + compiler_cfg.balancer_op_override("concatenate_275.dc.concatenate.7.before_padded_node.nop_0", "grid_shape", (1, 1)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 5)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 5)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (8, 1)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 5)) + compiler_cfg.place_on_new_epoch("concatenate_275.dc.sparse_matmul.11.lc2") + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "4096" + + elif variant == "yolox_m": + + compiler_cfg.balancer_op_override("concatenate_354.dc.concatenate.7.before_padded_node.nop_0", "grid_shape", (1, 1)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 5)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 5)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 5)) + compiler_cfg.place_on_new_epoch("concatenate_354.dc.sparse_matmul.11.lc2") + + elif variant in ["yolox_l", "yolox_darknet", "yolox_x"]: + + os.environ["PYBUDA_FORK_JOIN_BUF_QUEUES"] = "1" + os.environ["PYBUDA_FORK_JOIN_EXPAND_OUTPUT_BUFFERS"] = "1" + os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1" + + if variant == "yolox_l": + + 
compiler_cfg.place_on_new_epoch("conv2d_372.dc.matmul.11") + compiler_cfg.balancer_op_override("concatenate_433.dc.concatenate.7.before_padded_node.nop_0", "grid_shape", (1, 1)) + compiler_cfg.balancer_op_override( + "concatenate_433.dc.concatenate.7_to_concatenate_433.dc.sparse_matmul.11.lc2_1_serialized_dram_queue.before_padded_node.nop_0", + "grid_shape", + (1, 1), + ) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.4.lc2", "t_stream_shape", (1, 60)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (5, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 8)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 5)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 8)) + compiler_cfg.place_on_new_epoch("concatenate_433.dc.sparse_matmul.11.lc2") + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "4096" + compiler_cfg.place_on_new_epoch("_fused_op_163") + + elif variant == "yolox_darknet": + + compiler_cfg.place_on_new_epoch("conv2d_199.dc.matmul.11") + compiler_cfg.balancer_op_override("concatenate_222.dc.concatenate.7.before_padded_node.nop_0", "grid_shape", (1, 1)) + compiler_cfg.place_on_new_epoch("concatenate_222.dc.sparse_matmul.11.lc2") + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "4096" + + elif variant == "yolox_x": + + os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "0" + os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "0" + os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "0" + + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + compiler_cfg.place_on_new_epoch("conv2d_385.dc.conv2d.5.dc.matmul.11") + compiler_cfg.place_on_new_epoch("conv2d_385.dc.conv2d.1.dc.matmul.11") + compiler_cfg.place_on_new_epoch("conv2d_385.dc.conv2d.3.dc.matmul.11") + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.1.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 5)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (4, 2)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.5.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 8)) + compiler_cfg.balancer_op_override("conv2d_7.dc.conv2d.3.dc.reshape.0.dc.sparse_matmul.10.lc2", "t_stream_shape", (1, 8)) + compiler_cfg.place_on_new_epoch("concatenate_512.dc.sparse_matmul.11.lc2") + + # prepare input + if variant in ["yolox_nano", "yolox_tiny"]: + input_shape = (416, 416) + else: + input_shape = (640, 640) + + url = "http://images.cocodataset.org/val2017/000000397133.jpg" + response = requests.get(url) + with open("input.jpg", "wb") as f: + f.write(response.content) + img = cv2.imread("input.jpg") + img_tensor = preprocess(img, input_shape) + img_tensor = img_tensor.unsqueeze(0) + + # Load and validate the ONNX model + onnx_model_path = 
f"third_party/confidential_customer_models/generated/files/{variant}.onnx" + onnx_model = onnx.load(onnx_model_path) + onnx.checker.check_model(onnx_model) + model_name = f"onnx_{variant}" + tt_model = pybuda.OnnxModule(model_name, onnx_model, onnx_model_path) + + # PCC + if test_device.arch == BackendDevice.Wormhole_B0: + if variant == "yolox_nano": + pcc = 0.93 + else: + pcc = 0.99 + elif test_device.arch == BackendDevice.Grayskull: + if variant == "yolox_nano": + pcc = 0.91 + elif variant in ["yolox_m", "yolox_darknet"]: + pcc = 0.92 + elif variant in ["yolox_s", "yolox_l"]: + pcc = 0.93 + elif variant == "yolox_x": + pcc = 0.94 + else: + pcc = 0.99 + + # Inference + verify_module( + tt_model, + input_shapes=([img_tensor.shape]), + inputs=([img_tensor]), + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + pcc=pcc, + ), + ) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_alexnet.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_alexnet.py index cfc8258ed..e1218db39 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_alexnet.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_alexnet.py @@ -7,6 +7,7 @@ import pytest from PIL import Image from torchvision import transforms +from loguru import logger import pybuda from pybuda import VerifyConfig @@ -16,13 +17,11 @@ from pytorchcv.model_provider import get_model as ptcv_get_model +@pytest.mark.skip(reason="Not supported") def test_alexnet_torchhub(test_device): - pytest.skip("Working; Not priority for Phase 1") - # Configurations compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True os.environ["PYBUDA_CONV2D_SPARSE_SECOND"] = "1" @@ -34,19 +33,23 @@ def test_alexnet_torchhub(test_device): pybuda_model = pybuda.PyTorchModule("pt_alexnet_torchhub", framework_model) # Load and pre-process image - torch.hub.download_url_to_file( - "https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg" - ) - input_image = Image.open("dog.jpg") - preprocess = transforms.Compose( - [ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - ] - ) - img_tensor = preprocess(input_image).unsqueeze(0) + try: + torch.hub.download_url_to_file( + "https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg" + ) + input_image = Image.open("dog.jpg") + preprocess = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] + ) + img_tensor = preprocess(input_image).unsqueeze(0) + except: + logger.warning("Failed to download the image file, replacing input with random tensor. 
Please check if the URL is up to date") + img_tensor = torch.rand(1, 3, 224, 224) # Sanity run # os.system( @@ -75,13 +78,11 @@ def test_alexnet_torchhub(test_device): ) +@pytest.mark.skip(reason="Not supported") def test_alexnet_osmr(test_device): - pytest.skip("Fails on TVM side; Not priority for Phase 1") - # Configurations compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True os.environ["PYBUDA_CONV2D_SPARSE_SECOND"] = "1" @@ -91,19 +92,23 @@ def test_alexnet_osmr(test_device): pybuda_model = pybuda.PyTorchModule("pt_alexnet_osmr", framework_model) # Load and pre-process image - torch.hub.download_url_to_file( - "https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg" - ) - input_image = Image.open("dog.jpg") - preprocess = transforms.Compose( - [ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - ] - ) - img_tensor = preprocess(input_image).unsqueeze(0) + try: + torch.hub.download_url_to_file( + "https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg" + ) + input_image = Image.open("dog.jpg") + preprocess = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] + ) + img_tensor = preprocess(input_image).unsqueeze(0) + except: + logger.warning("Failed to download the image file, replacing input with random tensor. Please check if the URL is up to date") + img_tensor = torch.rand(1, 3, 224, 224) # Sanity run os.system( diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_autoencoder.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_autoencoder.py index bad32c2aa..1689c7150 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_autoencoder.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_autoencoder.py @@ -94,7 +94,6 @@ def forward(self, x): def test_conv_ae_pytorch(test_device): # Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "Ribbon" compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b @@ -132,7 +131,6 @@ def test_conv_ae_pytorch(test_device): def test_linear_ae_pytorch(test_device): # Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "Ribbon" compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_blazepose.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_blazepose.py index 6d4053c75..b8a2e5560 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_blazepose.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_blazepose.py @@ -21,15 +21,14 @@ from mediapipepytorch.visualization import POSE_CONNECTIONS, draw_landmarks +@pytest.mark.skip(reason="Only test 1x1 grid") def test_blazepose_detector_pytorch(test_device): - pytest.skip("Only test 1x1 grid") if test_device.arch == BackendDevice.Grayskull: pytest.skip("Grayskull test failing with TM ERROR (producer = conv2d_163.dc.add.11_fused_tm_op_0.dc.matmul.7, consumer = conv2d_163.dc.add.11_fused_tm_op_0.dc.matmul.12): TM order does't satisfy constraints for stacking with phased pipes, buf_size_mb must be a multiple 
of the total stack factor or producer t") # Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True # Load BlazePose Detector pose_detector = BlazePose() @@ -57,8 +56,8 @@ def test_blazepose_detector_pytorch(test_device): ) ) +@pytest.mark.skip(reason="Only test 1x1 grid") def test_blazepose_regressor_pytorch(test_device): - pytest.skip("Only test 1x1 grid") if test_device.arch == BackendDevice.Grayskull: pytest.skip("Grayskull test failing with data mismatch") @@ -68,7 +67,6 @@ def test_blazepose_regressor_pytorch(test_device): # Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True # Load BlazePose Landmark Regressor @@ -88,9 +86,8 @@ def test_blazepose_regressor_pytorch(test_device): ) ) - +@pytest.mark.skip(reason="Not supported") def test_blazepose_detector_pytorch_1x1(test_device): - pytest.skip() if test_device.arch == BackendDevice.Grayskull: pytest.skip("Grayskull test failing with TM ERROR (producer = conv2d_163.dc.add.11_fused_tm_op_0.dc.matmul.7, consumer = conv2d_163.dc.add.11_fused_tm_op_0.dc.matmul.12): TM order does't satisfy constraints for stacking with phased pipes, buf_size_mb must be a multiple of the total stack factor or producer t") @@ -100,7 +97,6 @@ def test_blazepose_detector_pytorch_1x1(test_device): # Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True # Load BlazePose Detector pose_detector = BlazePose() @@ -128,9 +124,8 @@ def test_blazepose_detector_pytorch_1x1(test_device): ) ) +@pytest.mark.skip(reason="Not supported") def test_blazepose_regressor_pytorch_1x1(test_device): - pytest.skip() - if test_device.arch == BackendDevice.Grayskull: pytest.skip("Grayskull test failing with data mismatch") @@ -141,8 +136,6 @@ def test_blazepose_regressor_pytorch_1x1(test_device): # Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True - # Load BlazePose Landmark Regressor pose_regressor = BlazePoseLandmark() @@ -173,7 +166,6 @@ def test_blaze_palm_pytorch_1x1(test_device): # Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.cpu_fallback_ops = set(["concatenate"]) # Load BlazePalm Detector @@ -219,7 +211,6 @@ def test_blaze_hand_pytorch_1x1(test_device): # Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.conv_multi_op_fracture_factor_override["conv2d_33"] = -1 compiler_cfg.conv_multi_op_fracture_factor_override["conv2d_112"] = -1 diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ddrnet.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ddrnet.py new file mode 100644 index 000000000..647c0bc45 --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ddrnet.py @@ -0,0 +1,169 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import pybuda, os +import torch +from torchvision import transforms +import requests +from PIL import Image +import pytest 
+from pybuda.verify.backend import verify_module +from pybuda.verify.config import TestKind +from pybuda import VerifyConfig +import sys +from pybuda._C.backend_api import BackendDevice + +sys.path.append("third_party/confidential_customer_models/generated/scripts/") +from model_ddrnet import DualResNet_23, DualResNet_39, BasicBlock + +sys.path.append( + "third_party/confidential_customer_models/cv_demos/ddrnet/semantic_segmentation/model" +) +from semseg import DualResNet, BasicBlock_seg + +variants = ["ddrnet23s", "ddrnet23", "ddrnet39"] + + +@pytest.mark.parametrize("variant", variants) +def test_ddrnet_pytorch(variant, test_device): + + # STEP 1: Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_RIBBON2"] = "1" + + # STEP 2: Create PyBuda module from PyTorch model + if variant == "ddrnet23s": + + model = DualResNet_23( + block=BasicBlock, layers=[2, 2, 2, 2], planes=32, last_planes=1024 + ) + + elif variant == "ddrnet23": + + model = DualResNet_23( + block=BasicBlock, layers=[2, 2, 2, 2], planes=64, last_planes=2048 + ) + + elif variant == "ddrnet39": + + model = DualResNet_39( + block=BasicBlock, layers=[3, 4, 6, 3], planes=64, last_planes=2048 + ) + + state_dict_path = ( + f"third_party/confidential_customer_models/generated/files/{variant}.pth" + ) + + state_dict = torch.load(state_dict_path, map_location=torch.device("cpu")) + + model.load_state_dict(state_dict, strict=False) + + model.eval() + + model_name = f"pt_{variant}" + + tt_model = pybuda.PyTorchModule(model_name, model) + + # STEP 3: Prepare input + url = "https://raw.githubusercontent.com/pytorch/hub/master/images/dog.jpg" + input_image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + + preprocess = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] + ) + input_tensor = preprocess(input_image) + input_batch = input_tensor.unsqueeze(0) + + verify_module( + tt_model, + input_shapes=([input_batch.shape]), + inputs=([input_batch]), + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + pcc=( + 0.98 + if test_device.arch == BackendDevice.Grayskull + and variant != "ddrnet23s" + else 0.99 + ), + ), + ) + + +variants = ["ddrnet23s_cityscapes", "ddrnet23_cityscapes"] + + +@pytest.mark.parametrize("variant", variants) +def test_ddrnet_semantic_segmentation_pytorch(variant, test_device): + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_RIBBON2"] = "1" + + if ( + variant == "ddrnet23s_cityscapes" + and test_device.arch == BackendDevice.Wormhole_B0 + ): + compiler_cfg.enable_auto_fusing = False + compiler_cfg.amp_level = 2 + + # prepare model + if variant == "ddrnet23s_cityscapes": + model = DualResNet( + BasicBlock_seg, + [2, 2, 2, 2], + num_classes=19, + planes=32, + spp_planes=128, + head_planes=64, + augment=True, + ) + + elif variant == "ddrnet23_cityscapes": + model = DualResNet( + BasicBlock_seg, + [2, 2, 2, 2], + num_classes=19, + planes=64, + spp_planes=128, + head_planes=128, + augment=True, + ) + + state_dict_path = 
f"third_party/confidential_customer_models/cv_demos/ddrnet/semantic_segmentation/weights/{variant}.pth" + state_dict = torch.load(state_dict_path, map_location=torch.device("cpu")) + model.load_state_dict(state_dict, strict=False) + model.eval() + model_name = f"pt_{variant}" + tt_model = pybuda.PyTorchModule(model_name, model) + + # prepare input + image_path = "third_party/confidential_customer_models/cv_demos/ddrnet/semantic_segmentation/image/road_scenes.png" + input_image = Image.open(image_path) + input_tensor = transforms.ToTensor()(input_image) + input_batch = input_tensor.unsqueeze(0) + + # Inference + verify_module( + tt_model, + input_shapes=([input_batch.shape]), + inputs=([input_batch]), + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + ), + ) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_densenet.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_densenet.py index e2fe05ce8..3869d60ab 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_densenet.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_densenet.py @@ -6,6 +6,7 @@ import pybuda import os import urllib.request +from loguru import logger import skimage import torch @@ -29,40 +30,48 @@ ############ def get_input_img(): - url, filename = ("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg") - urllib.request.urlretrieve(url, filename) - img = Image.open(filename).convert('RGB') - - transform = Compose([ - Resize(256), - CenterCrop(224), - PILToTensor(), - ConvertImageDtype(torch.float32), - Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - ]) - - # Preprocessing - img_tensor = transform(img).unsqueeze(0) + try: + url, filename = ("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg") + urllib.request.urlretrieve(url, filename) + img = Image.open(filename).convert('RGB') + + transform = Compose([ + Resize(256), + CenterCrop(224), + PILToTensor(), + ConvertImageDtype(torch.float32), + Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + ]) + + # Preprocessing + img_tensor = transform(img).unsqueeze(0) + except: + logger.warning("Failed to download the image file, replacing input with random tensor. 
Please check if the URL is up to date") + img_tensor = torch.rand(1, 3, 224, 224) print(img_tensor.shape) return img_tensor ############# def get_input_img_hf_xray(): - img_url = "https://huggingface.co/spaces/torchxrayvision/torchxrayvision-classifier/resolve/main/16747_3_1.jpg" - img_path = "xray.jpg" - urllib.request.urlretrieve(img_url, img_path) - img = skimage.io.imread(img_path) - img = xrv.datasets.normalize(img, 255) - # Check that images are 2D arrays - if len(img.shape) > 2: - img = img[:, :, 0] - if len(img.shape) < 2: - print("error, dimension lower than 2 for image") - # Add color channel - img = img[None, :, :] - transform = torchvision.transforms.Compose([xrv.datasets.XRayCenterCrop(), xrv.datasets.XRayResizer(224)]) - img = transform(img) - img_tensor = torch.from_numpy(img).unsqueeze(0) + try: + img_url = "https://huggingface.co/spaces/torchxrayvision/torchxrayvision-classifier/resolve/main/16747_3_1.jpg" + img_path = "xray.jpg" + urllib.request.urlretrieve(img_url, img_path) + img = skimage.io.imread(img_path) + img = xrv.datasets.normalize(img, 255) + # Check that images are 2D arrays + if len(img.shape) > 2: + img = img[:, :, 0] + if len(img.shape) < 2: + print("error, dimension lower than 2 for image") + # Add color channel + img = img[None, :, :] + transform = torchvision.transforms.Compose([xrv.datasets.XRayCenterCrop(), xrv.datasets.XRayResizer(224)]) + img = transform(img) + img_tensor = torch.from_numpy(img).unsqueeze(0) + except: + logger.warning("Failed to download the image file, replacing input with random tensor. Please check if the URL is up to date") + img_tensor = torch.rand(1, 1, 224, 224) return img_tensor @@ -77,7 +86,6 @@ def test_densenet_121_pytorch(variant, test_device): # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b os.environ["PYBUDA_DISABLE_CONSTANT_FOLDING"] = "1" @@ -116,19 +124,11 @@ def test_densenet_121_pytorch(variant, test_device): def test_densenet_161_pytorch(test_device): - if test_device.arch == BackendDevice.Grayskull: - pytest.skip("Grayskull test failing with exceeding dram queues error in net2pipe") - # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object - compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True + compiler_cfg.balancer_policy = "Ribbon" + os.environ["PYBUDA_RIBBON2"] = "1" compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b - compiler_cfg.place_on_new_epoch("concatenate_131.dc.sparse_matmul.7.lc2") - os.environ["PYBUDA_DISABLE_CONSTANT_FOLDING"] = "1" - os.environ["PYBUDA_GRAPHSOLVER_SELF_CUT_TYPE"] = "ConsumerOperandDataEdgesFirst" - os.environ["PYBUDA_LEGACY_UBLOCK_SHAPE"] = "1" - # STEP 2: Create PyBuda module from PyTorch model model = download_model(torch.hub.load, "pytorch/vision:v0.10.0", "densenet161", pretrained=True) @@ -147,13 +147,9 @@ def test_densenet_161_pytorch(test_device): devtype=test_device.devtype, devmode=test_device.devmode, test_kind=TestKind.INFERENCE, - pcc=0.95, + pcc=0.92 if test_device.devtype == BackendType.Silicon and test_device.arch == BackendDevice.Grayskull else 0.95, ) ) - - - - def test_densenet_169_pytorch(test_device): @@ -163,7 +159,6 @@ def test_densenet_169_pytorch(test_device): # STEP 1: Set PyBuda configuration 
parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b os.environ["PYBUDA_DISABLE_CONSTANT_FOLDING"] = "1" os.environ["PYBUDA_GRAPHSOLVER_SELF_CUT_TYPE"] = "ConsumerOperandDataEdgesFirst" @@ -197,7 +192,6 @@ def test_densenet_201_pytorch(test_device): # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b os.environ["PYBUDA_DISABLE_CONSTANT_FOLDING"] = "1" os.environ["PYBUDA_GRAPHSOLVER_SELF_CUT_TYPE"] = "ConsumerOperandDataEdgesFirst" diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_dla.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_dla.py new file mode 100644 index 000000000..9dbdb42f5 --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_dla.py @@ -0,0 +1,95 @@ +import os + +import pybuda +from pybuda.verify.backend import verify_module +from pybuda import VerifyConfig +from pybuda.verify import TestKind +from pybuda._C.backend_api import BackendDevice +import requests +import pytest +import torchvision.transforms as transforms +from PIL import Image + +from test.model_demos.models.dla import ( + dla34, + dla46_c, + dla46x_c, + dla60x_c, + dla60, + dla60x, + dla102, + dla102x, + dla102x2, + dla169, +) + + +variants_func = { + "dla34": dla34, + "dla46_c": dla46_c, + "dla46x_c": dla46x_c, + "dla60x_c": dla60x_c, + "dla60": dla60, + "dla60x": dla60x, + "dla102": dla102, + "dla102x": dla102x, + "dla102x2": dla102x2, + "dla169": dla169, +} +variants = list(variants_func.keys()) + + +@pytest.mark.parametrize("variant", variants, ids=variants) +def test_dla_pytorch(variant, test_device): + # PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda._C.Float16_b + os.environ["PYBUDA_RIBBON2"] = "1" + + func = variants_func[variant] + model_name = f"dla_{variant}_pytorch" + + pcc = 0.99 + if test_device.arch == BackendDevice.Wormhole_B0: + if variant == ("dla60", "dla60x"): + compiler_cfg.place_on_new_epoch("concatenate_776.dc.concatenate.0") + elif test_device.arch == BackendDevice.Grayskull: + if func.__name__ in ("dla102x2", "dla169"): + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + if func.__name__ == "dla46_c": + pcc = 0.97 + + # Load data sample + url = "https://images.rawpixel.com/image_1300/cHJpdmF0ZS9sci9pbWFnZXMvd2Vic2l0ZS8yMDIyLTA1L3BkMTA2LTA0Ny1jaGltXzEuanBn.jpg" + image = Image.open(requests.get(url, stream=True).raw) + + # Preprocessing + transform = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] + ) + img_tensor = transform(image).unsqueeze(0) + + pytorch_model = func(pretrained="imagenet") + pytorch_model.eval() + + # Create pybuda.PyTorchModule using the loaded Pytorch model + tt_model = pybuda.PyTorchModule(model_name, pytorch_model) + + verify_module( + tt_model, + input_shapes=[img_tensor.shape], + inputs=[(img_tensor,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + 
test_kind=TestKind.INFERENCE, + pcc=pcc, + ), + ) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_efficientnet.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_efficientnet.py index ea0a55b29..23ad86cb6 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_efficientnet.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_efficientnet.py @@ -10,6 +10,7 @@ import torchvision.models as models from timm.data import resolve_data_config from timm.data.transforms_factory import create_transform +from loguru import logger import pybuda from pybuda import VerifyConfig @@ -40,7 +41,6 @@ def test_efficientnet_timm(variant, test_device): # Configuration compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_auto_fusing = False if variant == "efficientnet_b0": @@ -68,15 +68,19 @@ def test_efficientnet_timm(variant, test_device): pybuda_model = pybuda.PyTorchModule("pt_effnet_timm", framework_model) # Load and pre-process image - url, filename = ( - "https://github.com/pytorch/hub/raw/master/images/dog.jpg", - "dog.jpg", - ) - urllib.request.urlretrieve(url, filename) - img = Image.open(filename).convert("RGB") - config = resolve_data_config({}, model=framework_model) - transform = create_transform(**config) - img_tensor = transform(img).unsqueeze(0) + try: + url, filename = ( + "https://github.com/pytorch/hub/raw/master/images/dog.jpg", + "dog.jpg", + ) + urllib.request.urlretrieve(url, filename) + img = Image.open(filename).convert("RGB") + config = resolve_data_config({}, model=framework_model) + transform = create_transform(**config) + img_tensor = transform(img).unsqueeze(0) + except: + logger.warning("Failed to download the image file, replacing input with random tensor. Please check if the URL is up to date") + img_tensor = torch.rand(1, 3, 224, 224) # Sanity run # cpu_output = framework_model(img_tensor) @@ -123,7 +127,6 @@ def test_efficientnet_torchvision(variant, test_device): # Configuration compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_auto_fusing = False # Until #844 is resolved compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b @@ -171,15 +174,19 @@ def test_efficientnet_torchvision(variant, test_device): pybuda_model = pybuda.PyTorchModule("pt_effnet_torchvis", framework_model) # Load and pre-process image - url, filename = ( - "https://github.com/pytorch/hub/raw/master/images/dog.jpg", - "dog.jpg", - ) - urllib.request.urlretrieve(url, filename) - img = Image.open(filename).convert("RGB") - config = resolve_data_config({}, model=framework_model) - transform = create_transform(**config) - img_tensor = transform(img).unsqueeze(0) + try: + url, filename = ( + "https://github.com/pytorch/hub/raw/master/images/dog.jpg", + "dog.jpg", + ) + urllib.request.urlretrieve(url, filename) + img = Image.open(filename).convert("RGB") + config = resolve_data_config({}, model=framework_model) + transform = create_transform(**config) + img_tensor = transform(img).unsqueeze(0) + except: + logger.warning("Failed to download the image file, replacing input with random tensor. 
Please check if the URL is up to date") + img_tensor = torch.rand(1, 3, 224, 224) # Sanity run # cpu_output = framework_model(img_tensor) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_efficientnet_lite.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_efficientnet_lite.py index 85f969fbe..c5cf80d01 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_efficientnet_lite.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_efficientnet_lite.py @@ -47,7 +47,6 @@ def test_efficientnet_lite_0_pytorch(test_device): # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True if test_device.arch == BackendDevice.Wormhole_B0: os.environ["PYBUDA_GRAPHSOLVER_SELF_CUT_TYPE"] = "ConsumerOperandDataEdgesFirst" @@ -87,7 +86,6 @@ def test_efficientnet_lite_1_pytorch(test_device): # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b compiler_cfg.amp_level = 2 os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" @@ -128,7 +126,6 @@ def test_efficientnet_lite_2_pytorch(test_device): # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b compiler_cfg.amp_level = 2 compiler_cfg.balancer_op_override("conv2d_99.dc.conv2d.1.dc.matmul.12", "grid_shape", (7,5)) @@ -174,7 +171,6 @@ def test_efficientnet_lite_3_pytorch(test_device): # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True os.environ["PYBUDA_PAD_SPARSE_MM"] = "{613:640, 39:48, 11:12}" os.environ["PYBUDA_MANUAL_SPLICE_DECOMP_TH"] = "613" os.environ["PYBUDA_GRAPHSOLVER_SELF_CUT_TYPE"] = "ConsumerOperandDataEdgesFirst" @@ -217,7 +213,6 @@ def test_efficientnet_lite_4_pytorch(test_device): # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True os.environ["PYBUDA_PAD_SPARSE_MM"] = "{46:48}" os.environ["PYBUDA_GRAPHSOLVER_SELF_CUT_TYPE"] = "ConsumerOperandDataEdgesFirst" os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_fpn.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_fpn.py new file mode 100644 index 000000000..1c0b0d740 --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_fpn.py @@ -0,0 +1,57 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import torch +import torch.nn as nn +import pybuda +import os +from pybuda.verify.backend import verify_module +from pybuda import VerifyConfig +from torchvision.ops import FeaturePyramidNetwork +from collections import OrderedDict + + +class FPNWrapper(nn.Module): + def __init__( + self, in_channels_list, out_channels, extra_blocks=None, norm_layer=None + ): + super().__init__() + 
self.fpn = FeaturePyramidNetwork( + in_channels_list, out_channels, extra_blocks, norm_layer + ) + + def forward(self, feat0, feat1, feat2): + x = OrderedDict() + x["feat0"] = feat0 + x["feat1"] = feat1 + x["feat2"] = feat2 + return self.fpn(x) + + +def test_fpn_pytorch(test_device, test_kind): + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda._C.Float16_b + + os.environ["PYBUDA_RIBBON2"] = "1" + os.environ["PYBUDA_FORCE_EMULATE_HARVESTED"] = "1" + + # Load FPN model + model = FPNWrapper([10, 20, 30], 5) + tt_model = pybuda.PyTorchModule("pytorch_fpn", model) + + feat0 = torch.rand(1, 10, 64, 64) + feat1 = torch.rand(1, 20, 16, 16) + feat2 = torch.rand(1, 30, 8, 8) + + verify_module( + tt_model, + input_shapes=[feat0.shape, feat1.shape, feat2.shape], + inputs=[(feat0, feat1, feat2)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=test_kind, + ), + ) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ghostnet.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ghostnet.py index 3e8fad00e..a5b905a01 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ghostnet.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ghostnet.py @@ -10,6 +10,7 @@ from PIL import Image from timm.data import resolve_data_config from timm.data.transforms_factory import create_transform +from loguru import logger import pybuda from pybuda import VerifyConfig @@ -35,7 +36,6 @@ def test_ghostnet_timm(variant, test_device): # Configurations compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b # Load model @@ -44,15 +44,19 @@ def test_ghostnet_timm(variant, test_device): pybuda_model = pybuda.PyTorchModule("pt_ghostnet_timm", framework_model) # Load and pre-process image - url, filename = ( - "https://github.com/pytorch/hub/raw/master/images/dog.jpg", - "dog.jpg", - ) - urllib.request.urlretrieve(url, filename) - img = Image.open(filename).convert("RGB") - config = resolve_data_config({}, model=framework_model) - transform = create_transform(**config) - img_tensor = transform(img).unsqueeze(0) + try: + url, filename = ( + "https://github.com/pytorch/hub/raw/master/images/dog.jpg", + "dog.jpg", + ) + urllib.request.urlretrieve(url, filename) + img = Image.open(filename).convert("RGB") + config = resolve_data_config({}, model=framework_model) + transform = create_transform(**config) + img_tensor = transform(img).unsqueeze(0) + except: + logger.warning("Failed to download the image file, replacing input with random tensor. 
Please check if the URL is up to date") + img_tensor = torch.rand(1, 3, 224, 224) # Sanity run # os.system( diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_googlenet.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_googlenet.py index 3998cd515..2e0ea97ea 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_googlenet.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_googlenet.py @@ -14,7 +14,7 @@ import torch from PIL import Image from torchvision import models, transforms - +from loguru import logger def test_googlenet_pytorch(test_device): # Set PyBuda configuration parameters @@ -22,7 +22,6 @@ def test_googlenet_pytorch(test_device): pybuda.config._get_global_compiler_config() ) # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b # Create PyBuda module from PyTorch model @@ -33,22 +32,26 @@ def test_googlenet_pytorch(test_device): tt_model = pybuda.PyTorchModule("pt_googlenet", model) # Image preprocessing - torch.hub.download_url_to_file( - "https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg" - ) - input_image = Image.open("dog.jpg") - preprocess = transforms.Compose( - [ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize( - mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] - ), - ] - ) - input_tensor = preprocess(input_image) - input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model + try: + torch.hub.download_url_to_file( + "https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg" + ) + input_image = Image.open("dog.jpg") + preprocess = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + input_tensor = preprocess(input_image) + input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model + except: + logger.warning("Failed to download the image file, replacing input with random tensor. 
Please check if the URL is up to date") + input_batch = torch.rand(1, 3, 224, 224) # Run inference on Tenstorrent device verify_module( diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_hardnet.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_hardnet.py new file mode 100644 index 000000000..3da1ee88f --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_hardnet.py @@ -0,0 +1,91 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import pybuda, os +import torch +import pytest +import urllib +from PIL import Image +from torchvision import transforms +from pybuda.verify.backend import verify_module +from pybuda.verify.config import TestKind +from pybuda import VerifyConfig +from pybuda._C.backend_api import BackendDevice + +variants = ["hardnet68", "hardnet85", "hardnet68ds", "hardnet39ds"] + + +@pytest.mark.parametrize("variant", variants) +def test_hardnet_pytorch(test_device, variant): + + # STEP 1: Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_RIBBON2"] = "1" + + if variant == "hardnet85" and test_device.arch == BackendDevice.Wormhole_B0: + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + + # load only the model architecture without pre-trained weights. + model = torch.hub.load("PingoLH/Pytorch-HarDNet", variant, pretrained=False) + + # load the weights downloaded from https://github.com/PingoLH/Pytorch-HarDNet + checkpoint_path = ( + f"third_party/confidential_customer_models/generated/files/{variant}.pth" + ) + + # Load weights from the checkpoint file and maps tensors to CPU, ensuring compatibility even without a GPU. 
+ state_dict = torch.load(checkpoint_path, map_location=torch.device("cpu")) + + # Inject weights into model + model.load_state_dict(state_dict) + model.eval() + + # STEP 2: Create PyBuda module from PyTorch model + model_name = f"pt_{variant}" + tt_model = pybuda.PyTorchModule(model_name, model) + + # STEP 3: Prepare input + url, filename = ( + "https://github.com/pytorch/hub/raw/master/images/dog.jpg", + "dog.jpg", + ) + try: + urllib.URLopener().retrieve(url, filename) + except: + urllib.request.urlretrieve(url, filename) + + # Preprocessing + input_image = Image.open(filename) + + preprocess = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] + ) + input_tensor = preprocess(input_image) + input_batch = input_tensor.unsqueeze(0) + + pcc = ( + 0.99 + if variant in ["hardnet68ds", "hardnet39ds"] + and test_device.arch == BackendDevice.Wormhole_B0 + else 0.97 + ) + + verify_module( + tt_model, + input_shapes=([input_batch.shape]), + inputs=([input_batch]), + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + pcc=pcc, + ), + ) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_hrnet.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_hrnet.py index af1532023..b4f7fc582 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_hrnet.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_hrnet.py @@ -11,7 +11,7 @@ from pybuda.verify.config import TestKind import os - +from loguru import logger import pybuda import torch import torch.multiprocessing @@ -34,7 +34,6 @@ def generate_model_hrnet_imgcls_osmr_pytorch(test_device, variant): # tenstorrent/pybuda#950 compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b # STEP 2: Create PyBuda module from PyTorch model @@ -56,25 +55,26 @@ def generate_model_hrnet_imgcls_osmr_pytorch(test_device, variant): tt_model = pybuda.PyTorchModule(f"pt_hrnet_osmr_{variant}", model) # Model load - os.system( - "wget -nc https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" - ) - torch.hub.download_url_to_file( - "https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg" - ) - input_image = Image.open("dog.jpg") - preprocess = transforms.Compose( - [ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - ] - ) - input_tensor = preprocess(input_image) - input_batch = input_tensor.unsqueeze( - 0 - ) # create a mini-batch as expected by the model + try: + torch.hub.download_url_to_file( + "https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg" + ) + input_image = Image.open("dog.jpg") + preprocess = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ] + ) + input_tensor = preprocess(input_image) + input_batch = input_tensor.unsqueeze( + 0 + ) # create a mini-batch as expected by the model + except: + logger.warning("Failed to download the image file, replacing input with random tensor. 
Please check if the URL is up to date") + input_batch = torch.rand(1, 3, 224, 224) print(input_batch.shape) return tt_model, [input_batch], {} @@ -120,7 +120,6 @@ def generate_model_hrnet_imgcls_timm_pytorch(test_device, variant): # tenstorrent/pybuda#950 compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b # STEP 2: Create PyBuda module from PyTorch model @@ -142,15 +141,19 @@ def generate_model_hrnet_imgcls_timm_pytorch(test_device, variant): tt_model = pybuda.PyTorchModule(f"pt_hrnet_timm_{variant}", model) ## Preprocessing - config = resolve_data_config({}, model=model) - transform = create_transform(**config) - url, filename = ( - "https://github.com/pytorch/hub/raw/master/images/dog.jpg", - "dog.jpg", - ) - urllib.request.urlretrieve(url, filename) - img = Image.open(filename).convert("RGB") - input_tensor = transform(img).unsqueeze(0) # transform and add batch dimension + try: + config = resolve_data_config({}, model=model) + transform = create_transform(**config) + url, filename = ( + "https://github.com/pytorch/hub/raw/master/images/dog.jpg", + "dog.jpg", + ) + urllib.request.urlretrieve(url, filename) + img = Image.open(filename).convert("RGB") + input_tensor = transform(img).unsqueeze(0) # transform and add batch dimension + except: + logger.warning("Failed to download the image file, replacing input with random tensor. Please check if the URL is up to date") + input_tensor = torch.rand(1, 3, 224, 224) print(input_tensor.shape) return tt_model, [input_tensor], {} diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_inception_v4.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_inception_v4.py index 9b1a51d8a..79ca4a4a5 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_inception_v4.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_inception_v4.py @@ -8,6 +8,7 @@ import pybuda import os import urllib +from loguru import logger from test.utils import download_model import torch from pytorchcv.model_provider import get_model as ptcv_get_model @@ -29,7 +30,6 @@ def generate_model_inceptionV4_imgcls_osmr_pytorch(test_device, variant): # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b os.environ["PYBUDA_PAD_SPARSE_MM"] = "{694:704, 676:704, 167:182, 158:160, 39:48}" os.environ["PYBUDA_MANUAL_SPLICE_DECOMP_TH"] = "158" @@ -38,6 +38,11 @@ def generate_model_inceptionV4_imgcls_osmr_pytorch(test_device, variant): compiler_cfg.balancer_op_override("_fused_op_7", "t_stream_shape", (158,1)) # TM error if test_device.arch == BackendDevice.Wormhole_B0: compiler_cfg.balancer_op_override("conv2d_551.dc.sparse_matmul.10.dc.sparse_matmul.1.lc2", "grid_shape", (1,4)) + # Temp mitigations for net2pipe errors, should be removed. 
+ # + os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "0" + os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "0" + os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "0" elif test_device.arch == BackendDevice.Grayskull: compiler_cfg.balancer_op_override("_fused_op_2", "t_stream_shape", (676,1)) # TM error (ref pybuda#1527) @@ -57,32 +62,38 @@ def generate_model_inceptionV4_imgcls_osmr_pytorch(test_device, variant): def preprocess_timm_model(model_name): model = timm.create_model(model_name, pretrained=True) model.eval() - config = resolve_data_config({}, model=model) - transform = create_transform(**config) - url, filename = ("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg") - urllib.request.urlretrieve(url, filename) - img = Image.open(filename).convert('RGB') - img_tensor = transform(img).unsqueeze(0) # transform and add batch dimension + try: + config = resolve_data_config({}, model=model) + transform = create_transform(**config) + url, filename = ("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg") + urllib.request.urlretrieve(url, filename) + img = Image.open(filename).convert('RGB') + img_tensor = transform(img).unsqueeze(0) # transform and add batch dimension + except: + logger.warning("Failed to download the image file, replacing input with random tensor. Please check if the URL is up to date") + img_tensor = torch.rand(1, 3, 299, 299) return model, img_tensor def get_image(): - if not os.path.exists("imagenet_classes.txt"): - os.system("wget -nc https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt") - if not os.path.exists("dog.jpg"): - torch.hub.download_url_to_file("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg") - input_image = Image.open("dog.jpg") - preprocess = transforms.Compose( - [ - transforms.Resize(299), - transforms.CenterCrop(299), - transforms.ToTensor(), - transforms.Normalize( - mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] - ), - ] - ) - img_tensor = preprocess(input_image) - img_tensor = img_tensor.unsqueeze(0) + try: + if not os.path.exists("dog.jpg"): + torch.hub.download_url_to_file("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg") + input_image = Image.open("dog.jpg") + preprocess = transforms.Compose( + [ + transforms.Resize(299), + transforms.CenterCrop(299), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + img_tensor = preprocess(input_image) + img_tensor = img_tensor.unsqueeze(0) + except: + logger.warning("Failed to download the image file, replacing input with random tensor. 
Please check if the URL is up to date") + img_tensor = torch.rand(1, 3, 299, 299) return img_tensor @@ -112,7 +123,6 @@ def generate_model_inceptionV4_imgcls_timm_pytorch(test_device, variant): # Configurations compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b os.environ["PYBUDA_PAD_SPARSE_MM"] = "{694:704, 676:704, 167:182, 158:160, 39:48}" os.environ["PYBUDA_MANUAL_SPLICE_DECOMP_TH"] = "158" @@ -121,6 +131,11 @@ def generate_model_inceptionV4_imgcls_timm_pytorch(test_device, variant): compiler_cfg.balancer_op_override("_fused_op_7", "t_stream_shape", (158,1)) # TM error if test_device.arch == BackendDevice.Wormhole_B0: compiler_cfg.balancer_op_override("conv2d_551.dc.sparse_matmul.10.dc.sparse_matmul.1.lc2", "grid_shape", (1,4)) + # Temp mitigations for net2pipe errors, should be removed. + # + os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "0" + os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "0" + os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "0" elif test_device.arch == BackendDevice.Grayskull: compiler_cfg.balancer_op_override("_fused_op_2", "t_stream_shape", (676,1)) # TM error (ref pybuda#1527) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_mlp_mixer.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_mlp_mixer.py index 8705d1d38..518940d4c 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_mlp_mixer.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_mlp_mixer.py @@ -17,6 +17,7 @@ from torchvision import transforms import timm import requests +from loguru import logger from PIL import Image from timm.data import resolve_data_config from timm.data.transforms_factory import create_transform @@ -69,16 +70,19 @@ def test_mlp_mixer_timm_pytorch(variant, test_device): # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True - url = "https://datasets-server.huggingface.co/assets/imagenet-1k/--/default/train/18/image/image.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + try: + url = "https://images.rawpixel.com/image_1300/cHJpdmF0ZS9sci9pbWFnZXMvd2Vic2l0ZS8yMDIyLTA1L3BkMTA2LTA0Ny1jaGltXzEuanBn.jpg" + image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + except: + logger.warning("Failed to download the image file, replacing input with random tensor. 
Please check if the URL is up to date") + image = torch.rand(1, 3, 256, 256) pixel_values = transform(image).unsqueeze(0) - label = "tiger" # STEP 2: Create PyBuda module from PyTorch model tt_model = pybuda.PyTorchModule(variant+"_pt", model) + pcc = 0.92 if test_device.arch == BackendDevice.Grayskull and variant == "mixer_b16_224_miil" else 0.99 verify_module( tt_model, input_shapes=[(pixel_values.shape,)], @@ -88,7 +92,7 @@ def test_mlp_mixer_timm_pytorch(variant, test_device): devtype=test_device.devtype, devmode=test_device.devmode, test_kind=TestKind.INFERENCE, - pcc=0.99, + pcc=pcc, ), ) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v1.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v1.py index 3fe7f0b64..7f7b532c9 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v1.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v1.py @@ -208,7 +208,6 @@ def generate_model_mobilenetv1_imgcls_hf_pytorch(test_device, variant): pybuda.config._get_global_compiler_config() ) # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b # Create PyBuda module from PyTorch model @@ -255,7 +254,6 @@ def generate_model_mobilenetV1I224_imgcls_hf_pytorch(test_device, variant): # Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b os.environ["PYBUDA_RIBBON2"] = "1" diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v1_ssd.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v1_ssd.py index 1145726d5..3c2348be1 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v1_ssd.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v1_ssd.py @@ -27,7 +27,6 @@ def test_mobilenet_v1_ssd_pytorch_1x1(test_device): pybuda.config._get_global_compiler_config() ) # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b compiler_cfg.conv_multi_op_fracture_factor_override["conv2d_102"] = -1 compiler_cfg.conv_multi_op_fracture_factor_override["conv2d_131"] = -1 diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v2.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v2.py index 55d006244..186ecb517 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v2.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v2.py @@ -17,20 +17,19 @@ from timm.data import resolve_data_config from timm.data.transforms_factory import create_transform import requests +from loguru import logger from transformers import AutoImageProcessor, AutoModelForImageClassification from transformers import MobileNetV2FeatureExtractor, MobileNetV2ForSemanticSegmentation def generate_model_mobilenetV2_imgcls_torchhub_pytorch(test_device, variant): - if test_device.arch == BackendDevice.Grayskull: - pytest.skip("Failing on GS with: Core (c=0,y=8,x=1) [routing] (c=0,y=6,x=0) [worker] [op_name=conv2d_483.dc.matmul.12] exceeded resource constraints: active dram queues used: 63 limit: 40") - # STEP 1: Set PyBuda configuration parameters compiler_cfg = ( pybuda.config._get_global_compiler_config() ) # load 
global compiler config object compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + if test_device.arch == BackendDevice.Grayskull: + compiler_cfg.balancer_policy = "CNN" # STEP 2: Create PyBuda module from PyTorch model model = download_model(torch.hub.load, @@ -75,7 +74,6 @@ def generate_model_mobilenetV2I96_imgcls_hf_pytorch(test_device, variant): pybuda.config._get_global_compiler_config() ) # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b # Create PyBuda module from PyTorch model @@ -121,7 +119,6 @@ def generate_model_mobilenetV2I160_imgcls_hf_pytorch(test_device, variant): pybuda.config._get_global_compiler_config() ) # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b # Create PyBuda module from PyTorch model @@ -162,16 +159,14 @@ def test_mobilenetv2_160(test_device): ) def generate_model_mobilenetV2I244_imgcls_hf_pytorch(test_device, variant): - if test_device.arch == BackendDevice.Grayskull: - pytest.skip("Failing on GS with: Core (c=0,y=8,x=1) [routing] (c=0,y=6,x=0) [worker] [op_name=conv2d_496.dc.matmul.12] exceeded resource constraints: active dram queues used: 63 limit: 40") - # Set PyBuda configuration parameters compiler_cfg = ( pybuda.config._get_global_compiler_config() ) # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + if test_device.arch == BackendDevice.Grayskull: + os.environ["PYBUDA_RIBBON2"] = "1" # Create PyBuda module from PyTorch model preprocessor = download_model(AutoImageProcessor.from_pretrained, @@ -212,33 +207,36 @@ def test_mobilenetv2_224(test_device): ) def generate_model_mobilenetV2_imgcls_timm_pytorch(test_device, variant): - if test_device.arch == BackendDevice.Grayskull: - pytest.skip("Failing on GS with: Core (c=0,y=8,x=1) [routing] (c=0,y=6,x=0) [worker] [op_name=conv2d_483.dc.matmul.12] exceeded resource constraints: active dram queues used: 63 limit: 40") - # Set PyBuda configuration parameters compiler_cfg = ( pybuda.config._get_global_compiler_config() ) # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + if test_device.arch == BackendDevice.Grayskull: + os.environ["PYBUDA_RIBBON2"] = "1" # Create PyBuda module from PyTorch model model = download_model(timm.create_model, variant, pretrained=True) tt_model = pybuda.PyTorchModule("mobilenet_v2__hf_timm", model) # Image load and pre-processing into pixel_values - config = resolve_data_config({}, model=model) - transform = create_transform(**config) - url, filename = ( - "https://github.com/pytorch/hub/raw/master/images/dog.jpg", - "dog.jpg", - ) - urllib.request.urlretrieve(url, filename) - img = Image.open(filename).convert("RGB") - image_tensor = transform(img).unsqueeze( - 0 - ) # transform and add batch dimension + try: + config = resolve_data_config({}, model=model) + transform = create_transform(**config) + url, filename = ( + "https://github.com/pytorch/hub/raw/master/images/dog.jpg", + "dog.jpg", + ) + urllib.request.urlretrieve(url, filename) + img = 
Image.open(filename).convert("RGB") + image_tensor = transform(img).unsqueeze( + 0 + ) # transform and add batch dimension + except: + logger.warning("Failed to download the image file, replacing input with random tensor. Please check if the URL is up to date") + image_tensor = torch.rand(1, 3, 224, 224) + return tt_model, [image_tensor], {} @@ -274,7 +272,6 @@ def generate_model_mobilenetV2_semseg_hf_pytorch(test_device, variant): # Configurations compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b os.environ["PYBUDA_RIBBON2"] = "1" @@ -291,15 +288,19 @@ def generate_model_mobilenetV2_semseg_hf_pytorch(test_device, variant): # II 3x224x224 # Load and pre-process image - config = resolve_data_config({}, model=framework_model) - transform = create_transform(**config) - url, filename = ( - "https://github.com/pytorch/hub/raw/master/images/dog.jpg", - "dog.jpg", - ) - urllib.request.urlretrieve(url, filename) - img = Image.open(filename).convert("RGB") - img_tensor = transform(img).unsqueeze(0) + try: + config = resolve_data_config({}, model=framework_model) + transform = create_transform(**config) + url, filename = ( + "https://github.com/pytorch/hub/raw/master/images/dog.jpg", + "dog.jpg", + ) + urllib.request.urlretrieve(url, filename) + img = Image.open(filename).convert("RGB") + img_tensor = transform(img).unsqueeze(0) + except: + logger.warning("Failed to download the image file, replacing input with random tensor. Please check if the URL is up to date") + img_tensor = torch.rand(1, 3, 224, 224) # Sanity run # cpu_out = framework_model(img_tensor) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v3.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v3.py index f79ab951c..1c92a8d1d 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v3.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_mobilenet_v3.py @@ -19,6 +19,7 @@ import timm from timm.data import resolve_data_config from timm.data.transforms_factory import create_transform +from loguru import logger def generate_model_mobilenetV3_imgcls_torchhub_pytorch(test_device, variant): # Set PyBuda configuration parameters @@ -26,7 +27,6 @@ def generate_model_mobilenetV3_imgcls_torchhub_pytorch(test_device, variant): pybuda.config._get_global_compiler_config() ) # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b if test_device.arch == BackendDevice.Grayskull: os.environ["PYBUDA_RIBBON2"] = "1" @@ -74,7 +74,6 @@ def generate_model_mobilenetV3_imgcls_timm_pytorch(test_device, variant): # Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b # Create PyBuda module from PyTorch model @@ -92,17 +91,21 @@ def generate_model_mobilenetV3_imgcls_timm_pytorch(test_device, variant): tt_model = pybuda.PyTorchModule(variant, model) # Image load and pre-processing into pixel_values - config = resolve_data_config({}, model=model) - transform = create_transform(**config) - url, filename = ( - "https://github.com/pytorch/hub/raw/master/images/dog.jpg", - "dog.jpg", - ) - urllib.request.urlretrieve(url, filename) - img = 
Image.open(filename).convert("RGB") - image_tensor = transform(img).unsqueeze( - 0 - ) # transform and add batch dimension + try: + config = resolve_data_config({}, model=model) + transform = create_transform(**config) + url, filename = ( + "https://github.com/pytorch/hub/raw/master/images/dog.jpg", + "dog.jpg", + ) + urllib.request.urlretrieve(url, filename) + img = Image.open(filename).convert("RGB") + image_tensor = transform(img).unsqueeze( + 0 + ) # transform and add batch dimension + except: + logger.warning("Failed to download the image file, replacing input with random tensor. Please check if the URL is up to date") + image_tensor = torch.rand(1, 3, 224, 224) return tt_model, [image_tensor], {} @@ -116,6 +119,8 @@ def test_mobilenetv3_timm(variant, test_device): test_device, variant, ) + os.environ["PYBUDA_LEGACY_KERNEL_BROADCAST"] = "1" + verify_module( model, input_shapes=[inputs[0].shape], @@ -132,6 +137,7 @@ def test_mobilenetv3_timm(variant, test_device): variants = ["mobilenetv3_large_100", "mobilenetv3_small_100"] @pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.skip(reason="Not supported") def test_mobilenetv3_timm_1x1(variant, test_device): pytest.skip() os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_openpose.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_openpose.py index 22d027509..89b87b7ed 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_openpose.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_openpose.py @@ -312,7 +312,6 @@ def generate_model_openpose_posdet_custom_pytorch(test_device, variant): # Configurations compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_enumerate_u_kt = False compiler_cfg.graph_solver_self_cut_type = "FastCut" compiler_cfg.default_df_override = DataFormat.Float16_b @@ -353,6 +352,7 @@ def generate_model_openpose_posdet_custom_pytorch(test_device, variant): @pytest.mark.parametrize("variant", variants) +@pytest.mark.skip(reason="Not needed for release") def test_openpose_basic(variant, test_device): model, inputs, _ = generate_model_openpose_posdet_custom_pytorch( test_device, variant, @@ -390,7 +390,6 @@ def generate_model_openpose_posdet_osmr_pytorch(test_device, variant): # Configurations compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_auto_fusing = False compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16 diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio.py new file mode 100644 index 000000000..fb28fa897 --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_perceiverio.py @@ -0,0 +1,146 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import pybuda +import torch +import os +import requests +from PIL import Image +import pytest +from loguru import logger +from transformers import ( + AutoImageProcessor, + PerceiverForImageClassificationConvProcessing, + PerceiverForImageClassificationLearned, + PerceiverForImageClassificationFourier, +) + +from pybuda.verify.backend import verify_module +from pybuda import VerifyConfig +from pybuda.verify.config import TestKind + + +def get_sample_data(model_name): + image_processor 
= AutoImageProcessor.from_pretrained(model_name) + try: + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + pixel_values = image_processor(images=image, return_tensors="pt").pixel_values + except: + logger.warning( + "Failed to download the image file, replacing input with random tensor. Please check if the URL is up to date" + ) + height = image_processor.to_dict()["size"]["height"] + width = image_processor.to_dict()["size"]["width"] + pixel_values = torch.rand(1, 3, height, width).to(torch.float32) + return pixel_values + + +variants = [ + "deepmind/vision-perceiver-conv", + "deepmind/vision-perceiver-learned", + "deepmind/vision-perceiver-fourier", +] + + +@pytest.mark.parametrize("variant", variants) +def test_perceiverio_for_image_classification_pytorch(test_device, variant): + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_RIBBON2"] = "1" + verify_enabled = True + pcc_value = 0.99 + + # Temp mitigations for net2pipe errors, should be removed. + # + if variant == "deepmind/vision-perceiver-conv": + os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "0" + os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "0" + os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "0" + + if test_device.arch == pybuda.BackendDevice.Wormhole_B0: + + if variant == "deepmind/vision-perceiver-conv": + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{10*1024}" + + if variant in [ + "deepmind/vision-perceiver-learned", + "deepmind/vision-perceiver-fourier", + ]: + os.environ["PYBUDA_DISABLE_PADDING_PASS"] = "1" + compiler_cfg.enable_auto_fusing = False + + if variant == "deepmind/vision-perceiver-fourier": + compiler_cfg.balancer_op_override( + "hslice_41.dc.sparse_matmul.2.lc2", "t_stream_shape", (1, 2) + ) + if test_device.devtype == pybuda.BackendType.Silicon: + pcc_value = 0.97 + + if variant == "deepmind/vision-perceiver-learned": + if test_device.devtype == pybuda.BackendType.Silicon: + pcc_value = 0.92 + + elif test_device.arch == pybuda.BackendDevice.Grayskull: + + if test_device.devtype == pybuda.BackendType.Silicon: + verify_enabled = False + + if variant in [ + "deepmind/vision-perceiver-conv", + "deepmind/vision-perceiver-learned", + "deepmind/vision-perceiver-fourier", + ]: + compiler_cfg.enable_auto_fusing = False + + if variant in [ + "deepmind/vision-perceiver-learned", + "deepmind/vision-perceiver-fourier", + ]: + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{101*1024}" + os.environ["PYBUDA_DISABLE_PADDING_PASS"] = "1" + + if variant == "deepmind/vision-perceiver-fourier": + compiler_cfg.balancer_op_override( + "hslice_41.dc.sparse_matmul.2.lc2", "t_stream_shape", (1, 7) + ) + + # Sample Image + pixel_values = get_sample_data(variant) + + # Load the model from HuggingFace + if variant == "deepmind/vision-perceiver-learned": + model = PerceiverForImageClassificationLearned.from_pretrained(variant) + + elif variant == "deepmind/vision-perceiver-conv": + model = PerceiverForImageClassificationConvProcessing.from_pretrained(variant) + + elif variant == "deepmind/vision-perceiver-fourier": + model = PerceiverForImageClassificationFourier.from_pretrained(variant) + + else: + logger.info(f"The model {variant} is not supported") + + model.eval() + + tt_model = pybuda.PyTorchModule( + "pt_" + 
str(variant.split("/")[-1].replace("-", "_")), model + ) + + # Run inference on Tenstorrent device + verify_module( + tt_model, + input_shapes=[(pixel_values.shape,)], + inputs=[(pixel_values)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + enabled=verify_enabled, # pcc drops in silicon devicetype + pcc=pcc_value, + ), + ) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_rcnn.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_rcnn.py new file mode 100644 index 000000000..c3277c43a --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_rcnn.py @@ -0,0 +1,96 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import pybuda +import cv2 +import torch.nn.init as init +import torchvision +import torch.nn as nn +import torchvision.transforms as transforms +import os, sys +from pybuda.verify.backend import verify_module +from pybuda import VerifyConfig +from pybuda.verify.config import TestKind + + +def test_rcnn_pytorch(test_device): + + # Load Alexnet Model + model = torchvision.models.alexnet(pretrained=True) + num_classes = 2 + num_features = model.classifier[6].in_features + + # Create class specific linear SVMs [Refer Section 2 in paper] + svm_layer = nn.Linear(num_features, num_classes) + + # Replacing the Alexnet's ImageNet specific 1000-way classification layer with a randomly initialized (N + 1)-way classification layer(where N is the number of object classes, plus 1 for background) + # [Refer Section 2.3.Domain-specific fine-tuning in Paper] + init.normal_(svm_layer.weight, mean=0, std=0.01) + init.constant_(svm_layer.bias, 0) + model.classifier[6] = svm_layer + + model.eval() + + # Cancel gradient tracking + for param in model.parameters(): + param.requires_grad = False + + # Image + img = cv2.imread("pybuda/test/model_demos/utils/cnn/pytorch/images/car.jpg") + + transform = transforms.Compose( + [ + transforms.ToPILImage(), + transforms.Resize((227, 227)), + transforms.RandomHorizontalFlip(), + transforms.ToTensor(), + transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)), + ] + ) + + # Selective search - A separate tool for generating proposals(potential regions that might contain objects) which can be fed to actual model + # As it is a pre-processing step,it is implemented on cpu + gs = cv2.ximgproc.segmentation.createSelectiveSearchSegmentation() + gs.setBaseImage(img) + gs.switchToSelectiveSearchFast() + rects = gs.process() + rects[:, 2] += rects[:, 0] + rects[:, 3] += rects[:, 1] + print("Suggested number of proposals: %d" % len(rects)) + + # PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda._C.Float16_b + os.environ["PYBUDA_RIBBON2"] = "1" + + # Pybuda model + tt_model = pybuda.PyTorchModule("rcnn", model) + + # Proposals generated by selective search were fed to a model in a loop manner to compute features. 
+ # [Refer line No.151 in https://github.com/object-detection-algorithm/R-CNN/blob/master/py/car_detector.py] + for rect in rects: + + xmin, ymin, xmax, ymax = rect + rect_img = img[ymin:ymax, xmin:xmax] + + rect_transform = transform(rect_img) + inputs = rect_transform.unsqueeze(0) + + verify_module( + tt_model, + input_shapes=[inputs.shape], + inputs=[(inputs)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + ), + ) + + break # As generated proposals will be around 2000, halt inference after getting result from single proposal. + + +# Paper - https://arxiv.org/abs/1311.2524 +# Repo - https://github.com/object-detection-algorithm/R-CNN diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_resnet.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_resnet.py index 21815fa7e..22b297fde 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_resnet.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_resnet.py @@ -18,6 +18,7 @@ import torch from timm.data import resolve_data_config from timm.data.transforms_factory import create_transform +from loguru import logger def generate_model_resnet_imgcls_hf_pytorch(test_device, variant): # Load ResNet feature extractor and model checkpoint from HuggingFace @@ -28,20 +29,20 @@ def generate_model_resnet_imgcls_hf_pytorch(test_device, variant): # Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b - os.environ["PYBUDA_DISABLE_STREAM_OUTPUT"] = "1" # Disable streaming for output queue (perf) os.environ["PYBUDA_PAD_OUTPUT_BUFFER"] = "1" # Load data sample - url = "https://datasets-server.huggingface.co/assets/imagenet-1k/--/default/train/18/image/image.jpg" - image = Image.open(requests.get(url, stream=True).raw) - label = "tiger" + try: + url = "https://images.rawpixel.com/image_1300/cHJpdmF0ZS9sci9pbWFnZXMvd2Vic2l0ZS8yMDIyLTA1L3BkMTA2LTA0Ny1jaGltXzEuanBn.jpg" + image = Image.open(requests.get(url, stream=True).raw) + except: + logger.warning("Failed to download the image file, replacing input with random tensor. Please check if the URL is up to date") + image = torch.rand(1, 3, 256, 256) # Data preprocessing inputs = feature_extractor(image, return_tensors="pt") pixel_values = inputs["pixel_values"] - model = pybuda.PyTorchModule("pt_resnet50", model) return model, [pixel_values], {} @@ -84,16 +85,18 @@ def generate_model_resnet_imgcls_timm_pytorch(test_device, variant): pybuda.config._get_global_compiler_config() ) # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b # Load data sample - url = "https://datasets-server.huggingface.co/assets/imagenet-1k/--/default/train/18/image/image.jpg" - image = Image.open(requests.get(url, stream=True).raw).convert("RGB") - label = "tiger" + try: + url = "https://images.rawpixel.com/image_1300/cHJpdmF0ZS9sci9pbWFnZXMvd2Vic2l0ZS8yMDIyLTA1L3BkMTA2LTA0Ny1jaGltXzEuanBn.jpg" + image = Image.open(requests.get(url, stream=True).raw).convert("RGB") + except: + logger.warning("Failed to download the image file, replacing input with random tensor. 
Please check if the URL is up to date") + image = torch.rand(1, 3, 256, 256) # Data preprocessing - pixel_values = transform(image).unsqueeze(0) + pixel_values = transform(image).unsqueeze(0) model = pybuda.PyTorchModule("pt_resnet50", model) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_resnext.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_resnext.py index 99507e9bf..61b856c0e 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_resnext.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_resnext.py @@ -9,7 +9,7 @@ from pybuda.verify.config import TestKind import os - +from loguru import logger import pybuda import torch from PIL import Image @@ -20,43 +20,42 @@ ############# def get_image_tensor(): # Image processing - os.system( - "wget -nc https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" - ) - torch.hub.download_url_to_file( - "https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg" - ) - input_image = Image.open("dog.jpg") - preprocess = transforms.Compose( - [ - transforms.Resize(224), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize( - mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] - ), - ] - ) - input_tensor = preprocess(input_image) - input_batch = input_tensor.unsqueeze( - 0 - ) # create a mini-batch as expected by the model + try: + torch.hub.download_url_to_file( + "https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg" + ) + input_image = Image.open("dog.jpg") + preprocess = transforms.Compose( + [ + transforms.Resize(224), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + input_tensor = preprocess(input_image) + input_batch = input_tensor.unsqueeze( + 0 + ) # create a mini-batch as expected by the model + except: + logger.warning("Failed to download the image file, replacing input with random tensor. 
Please check if the URL is up to date") + input_batch = torch.rand(1, 3, 224, 224) return input_batch def test_resnext_50_torchhub_pytorch(test_device): - - if test_device.arch == BackendDevice.Grayskull: - pytest.skip("Failing on GS with: Core (c=0,y=8,x=1) [routing] (c=0,y=6,x=0) [worker] [op_name=conv2d_412.dc.matmul.12] exceeded resource constraints: active dram queues used: 56 limit: 40") - # STEP 1: Set PyBuda configuration parameters - compiler_cfg = ( - pybuda.config._get_global_compiler_config() - ) # load global compiler config object + compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" os.environ["PYBUDA_RIBBON2"] = "1" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + if test_device.arch == BackendDevice.Grayskull: + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{72*1024}" + + os.environ["PYBUDA_TEMP_DISABLE_MODEL_KB_PROLOGUE_BW"] = "1" + # STEP 2: Create PyBuda module from PyTorch model model = download_model(torch.hub.load, "pytorch/vision:v0.10.0", "resnext50_32x4d", pretrained=True @@ -83,19 +82,19 @@ def test_resnext_50_torchhub_pytorch(test_device): ) def test_resnext_101_torchhub_pytorch(test_device): - if test_device.arch == BackendDevice.Grayskull: - pytest.skip("Grayskull failing with: Chip = 0, Core x = 1, y = 7(logical x = 0, y = 5): has more than 24 prefetch buf streams") - # STEP 1: Set PyBuda configuration parameters - compiler_cfg = ( - pybuda.config._get_global_compiler_config() - ) # load global compiler config object + compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" os.environ["PYBUDA_RIBBON2"] = "1" - os.environ["PYBUDA_BALANCER_PREPASS_DISABLED"] = "1" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + if test_device.arch == BackendDevice.Wormhole_B0: + os.environ["PYBUDA_BALANCER_PREPASS_DISABLED"] = "1" + elif test_device.arch == BackendDevice.Grayskull: + compiler_cfg.enable_auto_fusing = False + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{80*1024}" + # STEP 2: Create PyBuda module from PyTorch model model = download_model(torch.hub.load, "pytorch/vision:v0.10.0", "resnext101_32x8d", pretrained=True @@ -127,13 +126,10 @@ def test_resnext_101_32x8d_fb_wsl_pytorch(test_device): pytest.skip("Grayskull failing with: Chip = 0, Core x = 1, y = 7(logical x = 0, y = 5): has more than 24 prefetch buf streams") # STEP 1: Set PyBuda configuration parameters - compiler_cfg = ( - pybuda.config._get_global_compiler_config() - ) # load global compiler config object + compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" os.environ["PYBUDA_RIBBON2"] = "1" os.environ["PYBUDA_BALANCER_PREPASS_DISABLED"] = "1" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b # STEP 2: Create PyBuda module from PyTorch model @@ -165,14 +161,11 @@ def test_resnext_101_32x8d_fb_wsl_pytorch(test_device): def test_resnext_14_osmr_pytorch(test_device): # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object - compiler_cfg.enable_t_streaming = True + compiler_cfg.balancer_policy = "Ribbon" + 
os.environ["PYBUDA_RIBBON2"] = "1" + compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b - if test_device.arch == BackendDevice.Wormhole_B0: - compiler_cfg.balancer_policy = "Ribbon" - os.environ["PYBUDA_RIBBON2"] = "1" - compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b - else: - compiler_cfg.balancer_policy = "CNN" + if test_device.arch == BackendDevice.Grayskull: compiler_cfg.enable_auto_fusing = False os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{24*1024}" @@ -202,19 +195,16 @@ def test_resnext_14_osmr_pytorch(test_device): ) def test_resnext_26_osmr_pytorch(test_device): - - if test_device.arch == BackendDevice.Grayskull: - pytest.skip("Failing on GS with: Core (c=0,y=8,x=1) [routing] (c=0,y=6,x=0) [worker] [op_name=conv2d_283.dc.matmul.12] exceeded resource constraints: active dram queues used: 56 limit: 40") - # STEP 1: Set PyBuda configuration parameters - compiler_cfg = ( - pybuda.config._get_global_compiler_config() - ) # load global compiler config object + compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b os.environ["PYBUDA_RIBBON2"] = "1" - os.environ["PYBUDA_BALANCER_PREPASS_DISABLED"] = "1" + + if test_device.arch == BackendDevice.Grayskull: + compiler_cfg.enable_auto_fusing = False + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{72*1024}" # STEP 2: Create PyBuda module from PyTorch model model = download_model(ptcv_get_model, "resnext26_32x4d", pretrained=True) @@ -241,18 +231,15 @@ def test_resnext_26_osmr_pytorch(test_device): ) def test_resnext_50_osmr_pytorch(test_device): - - if test_device.arch == BackendDevice.Grayskull: - pytest.skip("Failing on GS with: Core (c=0,y=8,x=1) [routing] (c=0,y=6,x=0) [worker] [op_name=conv2d_412.dc.matmul.12] exceeded resource constraints: active dram queues used: 56 limit: 40") - # STEP 1: Set PyBuda configuration parameters - compiler_cfg = ( - pybuda.config._get_global_compiler_config() - ) # load global compiler config object + compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" os.environ["PYBUDA_RIBBON2"] = "1" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + if test_device.arch == BackendDevice.Grayskull: + compiler_cfg.enable_auto_fusing = False + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{72*1024}" # STEP 2: Create PyBuda module from PyTorch model model = download_model(ptcv_get_model, "resnext50_32x4d", pretrained=True) @@ -278,19 +265,18 @@ def test_resnext_50_osmr_pytorch(test_device): ) def test_resnext_101_osmr_pytorch(test_device): - - if test_device.arch == BackendDevice.Grayskull: - pytest.skip("Grayskull failing with: Chip = 0, Core x = 1, y = 7(logical x = 0, y = 5): has more than 24 prefetch buf streams") - # STEP 1: Set PyBuda configuration parameters - compiler_cfg = ( - pybuda.config._get_global_compiler_config() - ) # load global compiler config object + compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" os.environ["PYBUDA_RIBBON2"] = "1" - 
os.environ["PYBUDA_BALANCER_PREPASS_DISABLED"] = "1" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + + if test_device.arch == BackendDevice.Wormhole_B0: + os.environ["PYBUDA_BALANCER_PREPASS_DISABLED"] = "1" + elif test_device.arch == BackendDevice.Grayskull: + compiler_cfg.enable_auto_fusing = False + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{80*1024}" # STEP 2: Create PyBuda module from PyTorch model model = download_model(ptcv_get_model, "resnext101_64x4d", pretrained=True) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_retinanet.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_retinanet.py index d6660600c..d275c5639 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_retinanet.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_retinanet.py @@ -1,149 +1,160 @@ # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC # SPDX-License-Identifier: Apache-2.0 + +import pybuda +from PIL import Image +import requests +from torchvision import transforms +import os import pytest -from test.utils import download_model from pybuda.verify.backend import verify_module -from pybuda import VerifyConfig, PyTorchModule -from pybuda._C.backend_api import BackendType, BackendDevice from pybuda.verify.config import TestKind +from pybuda import VerifyConfig +import sys -import pybuda -import os +sys.path.append("third_party/confidential_customer_models/cv_demos/retinanet/model/") +from model_implementation import Model +from pybuda._C.backend_api import BackendDevice -import torch -import torchvision -from PIL import Image -from torchvision import transforms -torch.multiprocessing.set_sharing_strategy("file_system") - -def get_image(): - os.system( - "wget -nc https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" - ) - torch.hub.download_url_to_file( - "https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg" - ) - input_image = Image.open("dog.jpg") +def img_preprocess(): + + url = "https://i.ytimg.com/vi/q71MCWAEfL8/maxresdefault.jpg" + pil_img = Image.open(requests.get(url, stream=True).raw) + new_size = (640, 480) + pil_img = pil_img.resize(new_size, resample=Image.BICUBIC) preprocess = transforms.Compose( [ - transforms.Resize(800), - transforms.CenterCrop(800), transforms.ToTensor(), - transforms.Normalize( - mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] - ), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), ] ) - img_tensor = preprocess(input_image) - img_tensor = img_tensor.unsqueeze(0) - - return img_tensor - - - -class RetinaNetModelWrapper(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.model = model - - def forward(self, image_tensors): - - images, targets = self.model.transform(image_tensors, None) - features = self.model.backbone(images.tensors) - if isinstance(features, torch.Tensor): - features = OrderedDict([('0', features)]) - - features = list(features.values()) - head_outputs = self.model.head(features) - # import pdb; pdb.set_trace() - return image_tensors, features[0], features[1], features[2], features[3], features[4], head_outputs['cls_logits'], head_outputs['bbox_regression'] - -from torchvision.models.detection.image_list import ImageList -class RetinaNetModelPostProcessing(torch.nn.Module): - def __init__(self, model): - super().__init__() - self.model = model - - def forward(self, image_tensors, feat0, feat1, 
feat2, feat3, feat4, cls_logits, bbox_regression): - # get the original image sizes - original_image_sizes: List[Tuple[int, int]] = [] - for img in image_tensors: - val = img.shape[-2:] - assert len(val) == 2 - original_image_sizes.append((val[0], val[1])) - - features = [feat0, feat1, feat2, feat3, feat4] - head_outputs = {'cls_logits': cls_logits, 'bbox_regression': bbox_regression} - image_sizes = [tuple(img.shape[-2:]) for img in image_tensors] - images = ImageList(image_tensors, image_sizes) - anchors = self.model.anchor_generator(images, features) - - detections: List[Dict[str, Tensor]] = [] - # recover level sizes - num_anchors_per_level = [x.size(2) * x.size(3) for x in features] - HW = 0 - for v in num_anchors_per_level: - HW += v - HWA = head_outputs['cls_logits'].size(1) - A = HWA // HW - num_anchors_per_level = [hw * A for hw in num_anchors_per_level] - - # split outputs per level - split_head_outputs: Dict[str, List[Tensor]] = {} - for k in head_outputs: - split_head_outputs[k] = list(head_outputs[k].split(num_anchors_per_level, dim=1)) - split_anchors = [list(a.split(num_anchors_per_level)) for a in anchors] - - # compute the detections - detections = self.model.postprocess_detections(split_head_outputs, split_anchors, images.image_sizes) - detections = self.model.transform.postprocess(detections, images.image_sizes, original_image_sizes) - - return detections - -def test_retinanet_r50_fpn_v1_torchvision_pytorch(test_device): - pytest.skip("Under development") - - # STEP 1: Set PyBuda configuration parameters - compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object - compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True - compiler_cfg.default_df_override = pybuda._C.DataFormat.Bfp8_b - compiler_cfg.amp_level = 2 - compiler_cfg.enable_auto_fusing = False - compiler_cfg.conv_multi_op_fracture_factor_override["conv2d_8"] = 7 - compiler_cfg.conv_multi_op_fracture_factor_override["conv2d_167"] = 3 - compiler_cfg.conv_multi_op_fracture_factor_override["conv2d_219"] = 3 - compiler_cfg.conv_multi_op_fracture_factor_override["conv2d_259"] = 3 - compiler_cfg.conv_multi_op_fracture_factor_override["conv2d_299"] = 3 - compiler_cfg.conv_multi_op_fracture_factor_override["conv2d_717"] = 3 - compiler_cfg.conv_multi_op_fracture_factor_override["conv2d_723"] = 3 - compiler_cfg.conv_multi_op_fracture_factor_override["conv2d_725"] = 3 - compiler_cfg.conv_multi_op_fracture_factor_override["conv2d_727"] = 3 - compiler_cfg.conv_multi_op_fracture_factor_override["conv2d_729"] = 3 - compiler_cfg.conv_multi_op_fracture_factor_override["conv2d_789"] = 3 - compiler_cfg.conv_multi_op_fracture_factor_override["conv2d_791"] = 3 - compiler_cfg.conv_multi_op_fracture_factor_override["conv2d_793"] = 3 - compiler_cfg.conv_multi_op_fracture_factor_override["conv2d_795"] = 3 - compiler_cfg.conv_multi_op_fracture_factor_override["conv2d_797"] = 3 - os.environ["PYBUDA_GRAPHSOLVER_SELF_CUT_TYPE"] = "ConsumerOperandDataEdgesFirst" - - # STEP 2: Create PyBuda module from PyTorch model - model = download_model(torchvision.models.detection.retinanet_resnet50_fpn, pretrained=True) + img = preprocess(pil_img) + img = img.unsqueeze(0) + return img + + +variants = [ + "retinanet_rn18fpn", + "retinanet_rn34fpn", + "retinanet_rn50fpn", + "retinanet_rn101fpn", + "retinanet_rn152fpn", +] + + +@pytest.mark.parametrize("variant", variants) +def test_retinanet(variant, test_device): + + # Set PyBuda configuration parameters + compiler_cfg = 
pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_DECOMPOSE_SIGMOID"] = "1" + os.environ["PYBUDA_RIBBON2"] = "1" + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "73728" + + if test_device.arch == BackendDevice.Wormhole_B0: + + if variant == "retinanet_rn18fpn": + compiler_cfg.place_on_new_epoch("conv2d_357.dc.matmul.11") + compiler_cfg.balancer_op_override( + "conv2d_322.dc.matmul.11", "t_stream_shape", (1, 1) + ) + compiler_cfg.balancer_op_override( + "conv2d_300.dc.matmul.11", "grid_shape", (1, 1) + ) + + elif variant == "retinanet_rn34fpn": + compiler_cfg.place_on_new_epoch("conv2d_589.dc.matmul.11") + compiler_cfg.balancer_op_override( + "conv2d_554.dc.matmul.11", "t_stream_shape", (1, 1) + ) + compiler_cfg.balancer_op_override( + "conv2d_532.dc.matmul.11", "grid_shape", (1, 1) + ) + + elif variant == "retinanet_rn50fpn": + compiler_cfg.place_on_new_epoch("conv2d_826.dc.matmul.11") + compiler_cfg.balancer_op_override( + "conv2d_791.dc.matmul.11", "t_stream_shape", (1, 1) + ) + compiler_cfg.balancer_op_override( + "conv2d_769.dc.matmul.11", "grid_shape", (1, 1) + ) + + elif variant == "retinanet_rn101fpn": + compiler_cfg.place_on_new_epoch("conv2d_1557.dc.matmul.11") + compiler_cfg.balancer_op_override( + "conv2d_1522.dc.matmul.11", "t_stream_shape", (1, 1) + ) + compiler_cfg.balancer_op_override( + "conv2d_1500.dc.matmul.11", "grid_shape", (1, 1) + ) + + elif variant == "retinanet_rn152fpn": + compiler_cfg.place_on_new_epoch("conv2d_2288.dc.matmul.11") + compiler_cfg.balancer_op_override( + "conv2d_2253.dc.matmul.11", "t_stream_shape", (1, 1) + ) + compiler_cfg.balancer_op_override( + "conv2d_2231.dc.matmul.11", "grid_shape", (1, 1) + ) + + if test_device.arch == BackendDevice.Grayskull: + # Temp mitigations for net2pipe errors, should be removed. 
+ # + os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "0" + os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "0" + os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "0" + + if variant == "retinanet_rn18fpn": + compiler_cfg.balancer_op_override( + "conv2d_322.dc.matmul.11", "t_stream_shape", (1, 1) + ) + + elif variant == "retinanet_rn34fpn": + compiler_cfg.balancer_op_override( + "conv2d_554.dc.matmul.11", "t_stream_shape", (1, 1) + ) + + elif variant == "retinanet_rn50fpn": + compiler_cfg.balancer_op_override( + "conv2d_791.dc.matmul.11", "t_stream_shape", (1, 1) + ) + + elif variant == "retinanet_rn101fpn": + compiler_cfg.balancer_op_override( + "conv2d_1522.dc.matmul.11", "t_stream_shape", (1, 1) + ) + + elif variant == "retinanet_rn152fpn": + compiler_cfg.balancer_op_override( + "conv2d_2253.dc.matmul.11", "t_stream_shape", (1, 1) + ) + + # Prepare model + + checkpoint_path = ( + f"third_party/confidential_customer_models/cv_demos/retinanet/weights/{variant}.pth" + ) + model = Model.load(checkpoint_path) model.eval() - tt_model = pybuda.PyTorchModule("retinanet_v1_pt", RetinaNetModelWrapper(model)) - # import pdb; pdb.set_trace() - # STEP 3: Run inference on Tenstorrent device - img_tensor = get_image() - output = model(img_tensor) - - tt_model = RetinaNetModelWrapper(model) - cpu_model = RetinaNetModelPostProcessing(model) - tt_output = cpu_model(*tt_model(img_tensor)) - tt0 = pybuda.TTDevice("tt0", devtype=test_device.devtype, arch=test_device.arch, module=PyTorchModule("retinanet_pt", tt_model)) - cpu1 = pybuda.CPUDevice("cpu1", module=PyTorchModule("retinanet_pt", cpu_model)) - - tt0.push_to_inputs(img_tensor) - output_q = pybuda.run_inference(_verify_cfg=VerifyConfig(relative_atol=0.3), _sequential=True) - + tt_model = pybuda.PyTorchModule(f"pt_{variant}", model) + + # Prepare input + input_batch = img_preprocess() + + # Inference + verify_module( + tt_model, + input_shapes=([input_batch.shape]), + inputs=([input_batch]), + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + ), + ) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_imgcls.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_imgcls.py new file mode 100644 index 000000000..0cb708966 --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_imgcls.py @@ -0,0 +1,96 @@ +import pybuda +from pybuda.verify.backend import verify_module +from pybuda import VerifyConfig +from pybuda.verify.config import TestKind +from transformers import ( + AutoImageProcessor, + SegformerForImageClassification, + SegformerConfig, +) + +import os +import requests +import pytest +from PIL import Image + + +def get_sample_data(model_name): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + + image_processor = AutoImageProcessor.from_pretrained(model_name) + pixel_values = image_processor(images=image, return_tensors="pt").pixel_values + return pixel_values + + +variants_img_classification = [ + "nvidia/mit-b0", + "nvidia/mit-b1", + "nvidia/mit-b2", + "nvidia/mit-b3", + "nvidia/mit-b4", + "nvidia/mit-b5", +] + + +@pytest.mark.parametrize("variant", variants_img_classification) +def test_segformer_image_classification_pytorch(test_device, variant): + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + 
compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_RIBBON2"] = "1" + os.environ["PYBUDA_DISABLE_PADDING_PASS"] = "1" + pcc_value = 0.99 + + if test_device.arch == pybuda.BackendDevice.Wormhole_B0: + + if variant in [ + "nvidia/mit-b1", + "nvidia/mit-b2", + "nvidia/mit-b3", + "nvidia/mit-b4", + "nvidia/mit-b5", + ]: + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + + if variant == "nvidia/mit-b0" and test_device.devtype == pybuda.BackendType.Silicon: + pcc_value = 0.97 + + elif test_device.arch == pybuda.BackendDevice.Grayskull: + + if variant in ["nvidia/mit-b1"] and test_device.devtype == pybuda.BackendType.Silicon: + pcc_value = 0.97 + + # Set model configurations + config = SegformerConfig.from_pretrained(variant) + config_dict = config.to_dict() + config_dict["return_dict"] = False + config = SegformerConfig(**config_dict) + + # Load the model from HuggingFace + model = SegformerForImageClassification.from_pretrained(variant, config=config) + model.eval() + + # Load the sample image + pixel_values = get_sample_data(variant) + + # Create PyBuda module from PyTorch model + tt_model = pybuda.PyTorchModule("pt_" + str(variant.split("/")[-1].replace("-", "_")), model) + + # Run inference on Tenstorrent device + verify_module( + tt_model, + input_shapes=[(pixel_values.shape,)], + inputs=[(pixel_values,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + verify_pybuda_codegen_vs_framework=True, + verify_tvm_compile=True, + pcc=pcc_value, + ), + ) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_semseg.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_semseg.py new file mode 100644 index 000000000..46c4ba62b --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_segformer_semseg.py @@ -0,0 +1,103 @@ +import pybuda +from pybuda.verify.backend import verify_module +from pybuda import VerifyConfig +from pybuda.verify.config import TestKind +from transformers import ( + AutoImageProcessor, + SegformerForSemanticSegmentation, +) + +import os +import requests +import pytest +from PIL import Image + + +def get_sample_data(model_name): + url = "http://images.cocodataset.org/val2017/000000039769.jpg" + image = Image.open(requests.get(url, stream=True).raw) + + image_processor = AutoImageProcessor.from_pretrained(model_name) + pixel_values = image_processor(images=image, return_tensors="pt").pixel_values + return pixel_values + + +variants_semseg = [ + "nvidia/segformer-b0-finetuned-ade-512-512", + "nvidia/segformer-b1-finetuned-ade-512-512", + "nvidia/segformer-b2-finetuned-ade-512-512", + "nvidia/segformer-b3-finetuned-ade-512-512", + "nvidia/segformer-b4-finetuned-ade-512-512", +] + + +@pytest.mark.parametrize("variant", variants_semseg) +def test_segformer_semantic_segmentation_pytorch(test_device, variant): + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_RIBBON2"] = "1" + os.environ["PYBUDA_DISABLE_PADDING_PASS"] = "1" + pcc_value = 0.99 + + if test_device.arch == pybuda.BackendDevice.Wormhole_B0: + if variant in [ + "nvidia/segformer-b1-finetuned-ade-512-512", + "nvidia/segformer-b2-finetuned-ade-512-512", + "nvidia/segformer-b3-finetuned-ade-512-512", + "nvidia/segformer-b4-finetuned-ade-512-512", + ]: + + 
os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + + if ( + variant + in [ + "nvidia/segformer-b0-finetuned-ade-512-512", + "nvidia/segformer-b2-finetuned-ade-512-512", + ] + and test_device.devtype == pybuda.BackendType.Silicon + ): + pcc_value = 0.98 + + elif test_device.arch == pybuda.BackendDevice.Grayskull: + + if variant == "nvidia/segformer-b2-finetuned-ade-512-512": + compiler_cfg.place_on_new_epoch("concatenate_1098.dc.concatenate.0") + + if variant == "nvidia/segformer-b3-finetuned-ade-512-512": + compiler_cfg.place_on_new_epoch("concatenate_1890.dc.concatenate.0") + + if variant == "nvidia/segformer-b4-finetuned-ade-512-512": + compiler_cfg.place_on_new_epoch("concatenate_2748.dc.concatenate.0") + + if test_device.devtype == pybuda.BackendType.Silicon: + pcc_value = 0.98 + + # Load the model from HuggingFace + model = SegformerForSemanticSegmentation.from_pretrained(variant) + model.eval() + + # Load the sample image + pixel_values = get_sample_data(variant) + + # Create PyBuda module from PyTorch model + tt_model = pybuda.PyTorchModule("pt_" + str(variant.split("/")[-1].replace("-", "_")), model) + + # Run inference on Tenstorrent device + verify_module( + tt_model, + input_shapes=[(pixel_values.shape,)], + inputs=[(pixel_values,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + verify_pybuda_codegen_vs_framework=True, + verify_tvm_compile=True, + pcc=pcc_value, + ), + ) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ssd300_resnet50.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ssd300_resnet50.py new file mode 100644 index 000000000..3d030863c --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_ssd300_resnet50.py @@ -0,0 +1,113 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import pybuda +import numpy as np +import torch +import os +import skimage +import requests +from pybuda.verify.backend import verify_module +from pybuda.verify.config import TestKind +from pybuda import VerifyConfig +from pybuda._C.backend_api import BackendDevice + + +def load_image(image_path): + """Code from Loading_Pretrained_Models.ipynb - a Caffe2 tutorial""" + mean, std = 128, 128 + img = skimage.img_as_float(skimage.io.imread(image_path)) + if len(img.shape) == 2: + img = np.array([img, img, img]).swapaxes(0, 2) + return img + + +def rescale(img, input_height, input_width): + """Code from Loading_Pretrained_Models.ipynb - a Caffe2 tutorial""" + aspect = img.shape[1] / float(img.shape[0]) + if aspect > 1: + # landscape orientation - wide image + res = int(aspect * input_height) + imgScaled = skimage.transform.resize(img, (input_width, res)) + if aspect < 1: + # portrait orientation - tall image + res = int(input_width / aspect) + imgScaled = skimage.transform.resize(img, (res, input_height)) + if aspect == 1: + imgScaled = skimage.transform.resize(img, (input_width, input_height)) + return imgScaled + + +def crop_center(img, cropx, cropy): + """Code from Loading_Pretrained_Models.ipynb - a Caffe2 tutorial""" + y, x, c = img.shape + startx = x // 2 - (cropx // 2) + starty = y // 2 - (cropy // 2) + return img[starty : starty + cropy, startx : startx + cropx] + + +def normalize(img, mean=128, std=128): + img = (img * 256 - mean) / std + return img + + +def prepare_input(img_uri): + img = load_image(img_uri) + img = rescale(img, 300, 300) + img = crop_center(img, 300, 300) + img = normalize(img) + 
return img + + +def test_pytorch_ssd300_resnet50(test_device): + + # STEP 1 : Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + compiler_cfg.amp_level = 1 + + if test_device.arch == BackendDevice.Grayskull: + os.environ["PYBUDA_RIBBON2"] = "1" + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "90112" + + if test_device.arch == BackendDevice.Wormhole_B0: + compiler_cfg.place_on_new_epoch("conv2d_766.dc.matmul.11") + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "45056" + + # STEP 2 : prepare model + model = torch.hub.load( + "NVIDIA/DeepLearningExamples:torchhub", "nvidia_ssd", pretrained=False + ) + url = "https://api.ngc.nvidia.com/v2/models/nvidia/ssd_pyt_ckpt_amp/versions/19.09.0/files/nvidia_ssdpyt_fp16_190826.pt" + checkpoint_path = "nvidia_ssdpyt_fp16_190826.pt" + + response = requests.get(url) + with open(checkpoint_path, "wb") as f: + f.write(response.content) + + checkpoint = torch.load(checkpoint_path, map_location=torch.device("cpu")) + model.load_state_dict(checkpoint["model"]) + model.eval() + tt_model = pybuda.PyTorchModule("ssd300_resnet50", model) + + # STEP 3 : prepare input + img = "http://images.cocodataset.org/val2017/000000397133.jpg" + HWC = prepare_input(img) + CHW = np.swapaxes(np.swapaxes(HWC, 0, 2), 1, 2) + batch = np.expand_dims(CHW, axis=0) + input_batch = torch.from_numpy(batch).float() + + # STEP 4 : Inference + verify_module( + tt_model, + input_shapes=[(input_batch.shape,)], + inputs=[(input_batch,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + pcc=0.96 if test_device.arch == BackendDevice.Wormhole_B0 else 0.98, + ), + ) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_swin.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_swin.py index a838e70a2..0a0ce5d1d 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_swin.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_swin.py @@ -25,7 +25,6 @@ def test_swin_v1_tiny_4_224_hf_pytorch(test_device): compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.retain_tvm_python_files = True compiler_cfg.enable_tvm_constant_prop = True - compiler_cfg.enable_t_streaming = True os.environ["PYBUDA_ENABLE_STABLE_SOFTMAX"] = "1" os.environ["TVM_BACKTRACE"]="1" diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_tri_basic_2.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_tri_basic_2.py new file mode 100644 index 000000000..704665c38 --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_tri_basic_2.py @@ -0,0 +1,95 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import pybuda +import torch + +from types import SimpleNamespace + +import cv2 +import os + +from test.model_demos.models.tri_basic_2.model.semseg import resnet34_semseg + +from pybuda.verify.backend import verify_module +from pybuda import VerifyConfig +from pybuda.verify.config import TestKind + + +def test_tri_basic_2_sematic_segmentation_pytorch(test_device): + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.enable_t_streaming = True + compiler_cfg.default_dram_parameters = False + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + 
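+ # Auto op fusing is disabled and the t_stream_shape overrides below pin specific add/sparse_matmul ops to work around TM errors (see the inline comments).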
compiler_cfg.enable_auto_fusing = False + + compiler_cfg.balancer_op_override("add_114", "t_stream_shape", (1, 1)) # TM error + compiler_cfg.balancer_op_override("add_142", "t_stream_shape", (1, 1)) # TM error + compiler_cfg.balancer_op_override("add_171", "t_stream_shape", (1, 1)) # TM error + + if test_device.arch == pybuda.BackendDevice.Wormhole_B0: + compiler_cfg.balancer_op_override( + "add_156", "t_stream_shape", (1, 1) + ) # TM error + compiler_cfg.balancer_op_override( + "add_200", "t_stream_shape", (1, 1) + ) # TM error + compiler_cfg.balancer_op_override( + "add_214", "t_stream_shape", (1, 1) + ) # TM error + + elif test_device.arch == pybuda.BackendDevice.Grayskull: + compiler_cfg.balancer_op_override( + "add_200", "t_stream_shape", (1, 1) + ) # TM error + compiler_cfg.balancer_op_override( + "add_229", "t_stream_shape", (1, 1) + ) # TM error + compiler_cfg.balancer_op_override( + "conv2d_15.dc.conv2d.3.dc.sparse_matmul.9.dc.sparse_matmul.1.lc2", + "t_stream_shape", + (10, 1), + ) + + os.environ["PYBUDA_RIBBON2"] = "1" + os.environ["PYBUDA_TEMP_DISABLE_MODEL_KB_PROLOGUE_BW"] = "1" + + # Sample Input + image_w = 800 + image_h = 800 + image = cv2.imread( + "third_party/confidential_customer_models/cv_demos/tri_basic_2/images/left.png" + ) + image = cv2.resize(image, (image_w, image_h), interpolation=cv2.INTER_LINEAR) + image_tensor = ( + torch.from_numpy(image).permute(2, 0, 1).unsqueeze(0).to(torch.float32) / 255.0 + ).contiguous() + + # Load the model and weights + hparams = SimpleNamespace(num_classes=24) + model = resnet34_semseg(hparams) + state_dict = torch.load( + "third_party/confidential_customer_models/cv_demos/tri_basic_2/weights/basic_semseg.ckpt", + map_location="cpu", + ) + model.load_state_dict(state_dict) + model.eval() + + # Create PyBuda module from PyTorch model + tt_model = pybuda.PyTorchModule("pt_tri_basic_2_semseg", model) + + # Run inference on Tenstorrent device + verify_module( + tt_model, + input_shapes=[(image_tensor.shape,)], + inputs=[(image_tensor,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + ), + ) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_unet.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_unet.py index e45755aea..f550e9190 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_unet.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_unet.py @@ -11,7 +11,7 @@ from torchvision import transforms from torchvision.transforms import Compose, ConvertImageDtype, Normalize, PILToTensor, Resize, CenterCrop import requests - +from loguru import logger from PIL import Image import numpy as np @@ -30,8 +30,6 @@ def generate_model_unet_imgseg_osmr_pytorch(test_device, variant): # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object - compiler_cfg.enable_t_streaming = True - compiler_cfg.enable_auto_fusing = False compiler_cfg.enable_enumerate_u_kt = False compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b os.environ["PYBUDA_FORCE_RESIZE_DENSE_MM"] = "1" @@ -39,6 +37,11 @@ def generate_model_unet_imgseg_osmr_pytorch(test_device, variant): if test_device.arch == BackendDevice.Wormhole_B0: compiler_cfg.balancer_policy = "Ribbon" os.environ["PYBUDA_BALANCER_PREPASS_DISABLED"] = "1" + # Temp mitigations for net2pipe errors, should be removed. 
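+ # (the three PYBUDA_TEMP_* settings below appear to fall back to the older fused/sparse op cycle estimates; assumption based on the variable names)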
+ # + os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "0" + os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "0" + os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "0" elif test_device.arch == BackendDevice.Grayskull: compiler_cfg.balancer_policy = "CNN" @@ -74,33 +77,35 @@ def test_unet_osmr_cityscape_pytorch(test_device): def get_imagenet_sample(): - url, filename = ("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg") - urllib.request.urlretrieve(url, filename) - img = Image.open(filename).convert('RGB') - - # Preprocessing - transform = Compose([ - Resize(256), - CenterCrop(224), - PILToTensor(), - ConvertImageDtype(torch.float32), - Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) - ]) - - # Preprocessing - img_tensor = transform(img).unsqueeze(0) + try: + url, filename = ("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg") + urllib.request.urlretrieve(url, filename) + img = Image.open(filename).convert('RGB') + + # Preprocessing + transform = Compose([ + Resize(256), + CenterCrop(224), + PILToTensor(), + ConvertImageDtype(torch.float32), + Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]) + ]) + + # Preprocessing + img_tensor = transform(img).unsqueeze(0) + except: + logger.warning("Failed to download the image file, replacing input with random tensor. Please check if the URL is up to date") + img_tensor = torch.rand(1, 3, 224, 224) return img_tensor -@pytest.mark.xfail() +@pytest.mark.skip(reason="Not supported") def test_unet_holocron_pytorch(test_device): - pytest.skip() # Not needed in phase 1 from holocron.models.segmentation.unet import unet_tvvgg11 # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b # STEP 2: Create PyBuda module from PyTorch model @@ -143,7 +148,6 @@ def generate_model_unet_imgseg_smp_pytorch(test_device, variant): # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b os.environ["PYBUDA_GRAPHSOLVER_SELF_CUT_TYPE"]= "FastCut" compiler_cfg.conv_multi_op_fracture_factor_override["conv2d_1488"] = 3 @@ -198,7 +202,6 @@ def generate_model_unet_imgseg_torchhub_pytorch(test_device, variant): # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" if test_device.arch == BackendDevice.Grayskull: diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_vgg.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_vgg.py index 1437dab76..35965dc3d 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_vgg.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_vgg.py @@ -17,6 +17,7 @@ from PIL import Image from torchvision import transforms from vgg_pytorch import VGG +from loguru import logger import timm from timm.data import resolve_data_config from timm.data.transforms_factory import create_transform @@ -30,10 +31,15 @@ 
def test_vgg_osmr_pytorch(variant, test_device): # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b if (test_device.arch == BackendDevice.Wormhole_B0): os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "65536" + os.environ["PYBUDA_LEGACY_KERNEL_BROADCAST"] = "1" + # Temp mitigations for net2pipe errors, should be removed. + # + os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "0" + os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "0" + os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "0" # STEP 2: Create PyBuda module from PyTorch model # Variants: @@ -46,17 +52,20 @@ def test_vgg_osmr_pytorch(variant, test_device): tt_model = pybuda.PyTorchModule(f"pt_{variant}_osmr", model) # Image preprocessing - os.system("wget -nc https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt") - torch.hub.download_url_to_file("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg") - input_image = Image.open("dog.jpg") - preprocess = transforms.Compose([ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - ]) - input_tensor = preprocess(input_image) - input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model + try: + torch.hub.download_url_to_file("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg") + input_image = Image.open("dog.jpg") + preprocess = transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + input_tensor = preprocess(input_image) + input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model + except: + logger.warning("Failed to download the image file, replacing input with random tensor. 
Please check if the URL is up to date") + input_batch = torch.rand(1, 3, 224, 224) verify_module( tt_model, @@ -78,10 +87,10 @@ def test_vgg_19_hf_pytorch(test_device): # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b if test_device.arch == BackendDevice.Wormhole_B0: os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "65536" + os.environ["PYBUDA_LEGACY_KERNEL_BROADCAST"] = "1" ''' # https://pypi.org/project/vgg-pytorch/ @@ -97,17 +106,20 @@ def test_vgg_19_hf_pytorch(test_device): tt_model = pybuda.PyTorchModule("pt_vgg_19_hf", model) # Image preprocessing - os.system("wget -nc https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt") - torch.hub.download_url_to_file("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg") - input_image = Image.open("dog.jpg") - preprocess = transforms.Compose([ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - ]) - input_tensor = preprocess(input_image) - input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model + try: + torch.hub.download_url_to_file("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg") + input_image = Image.open("dog.jpg") + preprocess = transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + input_tensor = preprocess(input_image) + input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model + except: + logger.warning("Failed to download the image file, replacing input with random tensor. Please check if the URL is up to date") + input_batch = torch.rand(1, 3, 224, 224) verify_module( tt_model, @@ -126,13 +138,16 @@ def test_vgg_19_hf_pytorch(test_device): def preprocess_timm_model(model_name): model = timm.create_model(model_name, pretrained=True) model.eval() - config = resolve_data_config({}, model=model) - transform = create_transform(**config) - - url, filename = ("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg") - urllib.request.urlretrieve(url, filename) - img = Image.open(filename).convert('RGB') - img_tensor = transform(img).unsqueeze(0) # transform and add batch dimension + try: + config = resolve_data_config({}, model=model) + transform = create_transform(**config) + url, filename = ("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg") + urllib.request.urlretrieve(url, filename) + img = Image.open(filename).convert('RGB') + img_tensor = transform(img).unsqueeze(0) # transform and add batch dimension + except: + logger.warning("Failed to download the image file, replacing input with random tensor. 
Please check if the URL is up to date") + img_tensor = torch.rand(1, 3, 224, 224) return model, img_tensor @@ -144,7 +159,6 @@ def test_vgg_bn19_timm_pytorch(test_device): # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b if test_device.arch == BackendDevice.Wormhole_B0: os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "65536" @@ -172,7 +186,6 @@ def test_vgg_bn19_torchhub_pytorch(test_device): # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b if test_device.arch == BackendDevice.Wormhole_B0: @@ -193,17 +206,20 @@ def test_vgg_bn19_torchhub_pytorch(test_device): # Image preprocessing - os.system("wget -nc https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt") - torch.hub.download_url_to_file("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg") - input_image = Image.open("dog.jpg") - preprocess = transforms.Compose([ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), - ]) - input_tensor = preprocess(input_image) - input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model + try: + torch.hub.download_url_to_file("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg") + input_image = Image.open("dog.jpg") + preprocess = transforms.Compose([ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]), + ]) + input_tensor = preprocess(input_image) + input_batch = input_tensor.unsqueeze(0) # create a mini-batch as expected by the model + except: + logger.warning("Failed to download the image file, replacing input with random tensor. 
Please check if the URL is up to date") + input_batch = torch.rand(1, 3, 224, 224) verify_module( tt_model, diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_vilt.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_vilt.py index 3568be780..d0d568d42 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_vilt.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_vilt.py @@ -100,7 +100,6 @@ def generate_model_vilt_question_answering_hf_pytorch(test_device, variant): # Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b compiler_cfg.balancer_policy = "Ribbon" os.environ["PYBUDA_RIBBON2"] = "1" @@ -157,7 +156,6 @@ def generate_model_vilt_maskedlm_hf_pytorch(test_device, variant): # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b compiler_cfg.balancer_policy = "Ribbon" os.environ["PYBUDA_RIBBON2"] = "1" @@ -197,7 +195,6 @@ def test_vilt_maskedlm_hf_pytorch(variant, test_device): model, inputs, _ = generate_model_vilt_maskedlm_hf_pytorch( test_device, variant, ) - pcc = 0.98 if test_device.arch == BackendDevice.Wormhole_B0 else 0.99 verify_module( model, input_shapes=[(inputs[0].shape,inputs[1].shape)], @@ -207,6 +204,6 @@ def test_vilt_maskedlm_hf_pytorch(variant, test_device): devtype=test_device.devtype, devmode=test_device.devmode, test_kind=TestKind.INFERENCE, - pcc=pcc, + pcc=0.98, ) ) \ No newline at end of file diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_vit.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_vit.py index 4f1642398..95229ca30 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_vit.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_vit.py @@ -17,10 +17,16 @@ from PIL import Image from transformers import AutoImageProcessor, ViTForImageClassification + +dataset = load_dataset("huggingface/cats-image") +image_1 = dataset["test"]["image"][0] +url = "http://images.cocodataset.org/val2017/000000039769.jpg" +image_2 = Image.open(requests.get(url, stream=True).raw) + + def generate_model_vit_imgcls_hf_pytorch(test_device, variant): # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b compiler_cfg.balancer_policy = "Ribbon" @@ -40,11 +46,6 @@ def generate_model_vit_imgcls_hf_pytorch(test_device, variant): return tt_model, [img_tensor], {} -dataset = load_dataset("huggingface/cats-image") -image_1 = dataset["test"]["image"][0] -url = "http://images.cocodataset.org/val2017/000000039769.jpg" -image_2 = Image.open(requests.get(url, stream=True).raw) - variants = ["google/vit-base-patch16-224", "google/vit-large-patch16-224"] @pytest.mark.parametrize("variant", variants, ids=variants) def test_vit_classify_224_hf_pytorch(variant, test_device): @@ -73,6 +74,7 @@ def test_vit_classify_224_hf_pytorch(variant, test_device): variants = ["google/vit-base-patch16-224", "google/vit-large-patch16-224"] @pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.skip(reason="Redundant, already tested with test_vit_classification_1x1_demo") def test_vit_classify_224_hf_pytorch_1x1(variant, test_device): if test_device.arch == 
BackendDevice.Grayskull: pytest.skip() @@ -98,3 +100,62 @@ def test_vit_classify_224_hf_pytorch_1x1(variant, test_device): pcc=0.9 ) ) + +modes = [ + "verify", + "demo" +] +variants = [ + "google/vit-base-patch16-224", + "google/vit-large-patch16-224", +] +@pytest.mark.parametrize("mode", modes, ids=modes) +@pytest.mark.parametrize("variant", variants, ids=variants) +def test_vit_classification_1x1_demo(test_device, mode, variant): + if test_device.arch == BackendDevice.Grayskull: + pytest.skip("Not supported") + + # Setup for 1x1 grid + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + + # Configurations + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + os.environ["PYBUDA_RIBBON2"] = "1" + compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + compiler_cfg.enable_tvm_cpu_fallback = False + + # Load image preprocessor and model + image_processor = download_model(AutoImageProcessor.from_pretrained, variant) + framework_model = download_model(ViTForImageClassification.from_pretrained, variant) + model_name = "_".join(variant.split('/')[-1].split('-')[:2]) + f"_{mode}" + tt_model = pybuda.PyTorchModule(model_name, framework_model) + + # Load and preprocess image + dataset = load_dataset("huggingface/cats-image") + input_image = dataset["test"]["image"][0] + input_image = image_processor(input_image, return_tensors="pt").pixel_values + + if mode == "verify": + # Verify model on Tenstorrent device + verify_module( + tt_model, + input_shapes=[(input_image.shape,)], + inputs=[(input_image,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + ) + ) + elif mode == "demo": + # Run inference on Tenstorrent device + output_q = pybuda.run_inference(tt_model, inputs=([input_image])) + output = output_q.get()[0].value().detach().float().numpy() + + # Postprocessing + predicted_class_idx = output.argmax(-1).item() + + # Print output + print("Predicted class:", framework_model.config.id2label[predicted_class_idx]) diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_vovnet.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_vovnet.py index 138fd84be..7217c8b2a 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_vovnet.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_vovnet.py @@ -19,30 +19,32 @@ import urllib from timm.data import resolve_data_config from timm.data.transforms_factory import create_transform +from loguru import logger import sys def get_image(): - os.system( - "wget -nc https://raw.githubusercontent.com/pytorch/hub/master/imagenet_classes.txt" - ) - torch.hub.download_url_to_file( - "https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg" - ) - input_image = Image.open("dog.jpg") - preprocess = transforms.Compose( - [ - transforms.Resize(256), - transforms.CenterCrop(224), - transforms.ToTensor(), - transforms.Normalize( - mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] - ), - ] - ) - img_tensor = preprocess(input_image) - img_tensor = img_tensor.unsqueeze(0) + try: + torch.hub.download_url_to_file( + "https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg" + ) + input_image = Image.open("dog.jpg") + preprocess = transforms.Compose( + [ + transforms.Resize(256), + transforms.CenterCrop(224), + transforms.ToTensor(), + transforms.Normalize( + mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225] + ), + ] + ) + img_tensor = 
preprocess(input_image) + img_tensor = img_tensor.unsqueeze(0) + except: + logger.warning("Failed to download the image file, replacing input with random tensor. Please check if the URL is up to date") + img_tensor = torch.rand(1, 3, 224, 224) return img_tensor @@ -51,7 +53,6 @@ def generate_model_vovnet_imgcls_osmr_pytorch(test_device, variant): # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b # STEP 2: Create PyBuda module from PyTorch model @@ -93,10 +94,14 @@ def preprocess_steps(model_type): config = resolve_data_config({}, model=model) transform = create_transform(**config) - url, filename = ("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg") - urllib.request.urlretrieve(url, filename) - img = Image.open(filename).convert('RGB') - img_tensor = transform(img).unsqueeze(0) # transform and add batch dimension + try: + url, filename = ("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg") + urllib.request.urlretrieve(url, filename) + img = Image.open(filename).convert('RGB') + img_tensor = transform(img).unsqueeze(0) # transform and add batch dimension + except: + logger.warning("Failed to download the image file, replacing input with random tensor. Please check if the URL is up to date") + img_tensor = torch.rand(1, 3, 224, 224) return model, img_tensor @@ -104,8 +109,8 @@ def preprocess_steps(model_type): def generate_model_vovnet39_imgcls_stigma_pytorch(test_device, variant): # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object - compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True + compiler_cfg.balancer_policy = "Ribbon" + os.environ["PYBUDA_RIBBON2"] = "1" compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b # STEP 2: Create PyBuda module from PyTorch model @@ -120,7 +125,7 @@ def test_vovnet_v1_39_stigma_pytorch(test_device, enable_default_dram_parameters model, inputs, _ = generate_model_vovnet39_imgcls_stigma_pytorch( test_device, None, ) - + compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.default_dram_parameters = enable_default_dram_parameters @@ -136,14 +141,13 @@ def test_vovnet_v1_39_stigma_pytorch(test_device, enable_default_dram_parameters ) ) -from src_vovnet_stigma import vovnet57 +from src_vovnet_stigma import vovnet57 def generate_model_vovnet57_imgcls_stigma_pytorch(test_device, variant): # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b # STEP 2: Create PyBuda module from PyTorch model @@ -179,10 +183,14 @@ def preprocess_timm_model(model_name): config = resolve_data_config({}, model=model) transform = create_transform(**config) - url, filename = ("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg") - urllib.request.urlretrieve(url, filename) - img = Image.open(filename).convert('RGB') - img_tensor = transform(img).unsqueeze(0) # transform and add batch dimension + try: + url, filename = ("https://github.com/pytorch/hub/raw/master/images/dog.jpg", "dog.jpg") + urllib.request.urlretrieve(url, 
filename) + img = Image.open(filename).convert('RGB') + img_tensor = transform(img).unsqueeze(0) # transform and add batch dimension + except: + logger.warning("Failed to download the image file, replacing input with random tensor. Please check if the URL is up to date") + img_tensor = torch.rand(1, 3, 224, 224) return model, img_tensor @@ -192,20 +200,21 @@ def generate_model_vovnet_imgcls_timm_pytorch(test_device, variant): # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b # tenstorrent/pybuda#915 - compiler_cfg.enable_auto_fusing = False + if test_device.arch == BackendDevice.Grayskull and variant == "ese_vovnet39b": + compiler_cfg.balancer_policy = "Ribbon" + os.environ["PYBUDA_RIBBON2"] = "1" # STEP 2: Create PyBuda module from PyTorch model tt_model = pybuda.PyTorchModule(variant+"_pt", model) return tt_model, [image_tensor], {} - -varaints = ["ese_vovnet19b_dw", "ese_vovnet39b", "ese_vovnet99b"] -@pytest.mark.parametrize("variant", varaints, ids=varaints) + +variants = ["ese_vovnet19b_dw", "ese_vovnet39b", "ese_vovnet99b"] +@pytest.mark.parametrize("variant", variants, ids=variants) def test_vovnet_timm_pytorch(variant, test_device): model, inputs, _ = generate_model_vovnet_imgcls_timm_pytorch( test_device, variant, diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_wideresnet.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_wideresnet.py index db2a4b6d6..75c276ab5 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_wideresnet.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_wideresnet.py @@ -25,6 +25,8 @@ def test_wideresnet_pytorch(variant, test_device): variant, ) + os.environ["PYBUDA_TEMP_DISABLE_MODEL_KB_PROLOGUE_BW"] = "1" + verify_module( model, input_shapes=[(inputs[0].shape,)], @@ -53,6 +55,8 @@ def test_wideresnet_timm(variant, test_device): variant, ) + os.environ["PYBUDA_TEMP_DISABLE_MODEL_KB_PROLOGUE_BW"] = "1" + verify_module( model, input_shapes=[(inputs[0].shape,)], diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_xception.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_xception.py index 4e83b330b..546dc7d2b 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_xception.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_xception.py @@ -6,6 +6,7 @@ from pybuda.verify.backend import verify_module from pybuda import VerifyConfig from pybuda.verify.config import TestKind +from pybuda._C.backend_api import BackendDevice from test.model_demos.models.xception import generate_model_xception_imgcls_timm variants = ["xception", "xception41", "xception65", "xception71"] @@ -13,6 +14,13 @@ @pytest.mark.parametrize("variant", variants, ids=variants) def test_xception_timm(variant, test_device): + if test_device.arch == BackendDevice.Grayskull and variant == "xception": + os.environ["PYBUDA_TEMP_DISABLE_MODEL_KB_PROLOGUE_BW"] = "1" + # Temp mitigations for net2pipe errors, should be removed. 
+ # + os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "0" + os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "0" + os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "0" ( model, inputs, diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v3.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v3.py index 0bc4b4777..cb0a6cf0c 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v3.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v3.py @@ -27,10 +27,7 @@ def generate_model_yolotinyV3_imgcls_holli_pytorch(test_device, variant): # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.Float16_b - if test_device.devtype == BackendType.Golden: - compiler_cfg.enable_auto_fusing = False model = Yolov3Tiny(num_classes=80, use_wrong_previous_anchors=True) model.load_state_dict(torch.load('third_party/confidential_customer_models/model_2/pytorch/yolo_v3/weights/yolov3_tiny_coco_01.h5')) @@ -72,11 +69,8 @@ def generate_model_yoloV3_imgcls_holli_pytorch(test_device, variant): # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.Float16_b - if test_device.devtype == BackendType.Golden: - compiler_cfg.enable_auto_fusing = False - + os.environ["PYBUDA_RIBBON2"] = "1" model = Yolov3(num_classes=80) model.load_state_dict(torch.load('third_party/confidential_customer_models/model_2/pytorch/yolo_v3/weights/yolov3_coco_01.h5', map_location=torch.device('cpu'))) model.eval() diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v5.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v5.py index 0e55b751b..2ca404b34 100644 --- a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v5.py +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v5.py @@ -24,7 +24,6 @@ def generate_model_yoloV5I320_imgcls_torchhub_pytorch(test_device, variant, size): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tm_cpu_fallback = False compiler_cfg.enable_conv_prestride = True compiler_cfg.enable_tvm_constant_prop = True @@ -87,7 +86,6 @@ def generate_model_yoloV5I640_imgcls_torchhub_pytorch(test_device, variant, size # Add required env vars as per: https://yyz-gitlab.local.tenstorrent.com/tenstorrent/model-demos/-/issues/46 compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_auto_fusing = False os.environ["PYBUDA_DECOMPOSE_SIGMOID"] = "1" os.environ["PYBUDA_DISABLE_CAP_SPARSE_MM_FIDELITY"] = "1" @@ -112,10 +110,27 @@ def generate_model_yoloV5I640_imgcls_torchhub_pytorch(test_device, variant, size os.environ["PYBUDA_RIBBON2"] = "1" if size in ["x"]: compiler_cfg.place_on_new_epoch("conv2d_210.dc.matmul.11") + os.environ["PYBUDA_TEMP_BALANCER_DISABLE_TARGET_PROXIMITY"] = "1" + os.environ["PYBUDA_TEMP_RIBBON2_LEGACY_UTIL_EVAL"] = "1" + # Temp mitigations for net2pipe errors, should be removed. 
+ # + os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "0" + os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "0" + os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "0" if size in ["m"]: os.environ["PYBUDA_RIBBON2"] = "1" os.environ["PYBUDA_INSERT_SLICE_FOR_CONCAT"] = "1" os.environ["PYBUDA_CONCAT_SLICE_Y"] = "10" + os.environ["PYBUDA_TEMP_BALANCER_DISABLE_TARGET_PROXIMITY"] = "1" + compiler_cfg.place_on_new_epoch("conv2d_27.dc.matmul.8") + if size in ["l"]: + compiler_cfg.place_on_new_epoch("conv2d_313.dc.matmul.8") + # Temp mitigations for net2pipe errors, should be removed. + # + os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "0" + os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "0" + os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "0" + elif test_device.arch == BackendDevice.Wormhole_B0: os.environ["PYBUDA_PAD_SPARSE_MM"] = "{13:16, 3:4}" @@ -137,10 +152,10 @@ def generate_model_yoloV5I640_imgcls_torchhub_pytorch(test_device, variant, size if size == "n": compiler_cfg.balancer_op_override("concatenate_19.dc.concatenate.30.dc.concatenate.1.dc.buffer.0", "t_stream_shape", (3,1)) if size == "m": - #compiler_cfg.balancer_op_override("concatenate_260.dc.concatenate.0", "grid_shape", (1,1)) compiler_cfg.balancer_op_override("concatenate_332.dc.concatenate.7", "grid_shape", (1,1)) os.environ["PYBUDA_RIBBON2"] = "1" os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{112*1024}" + os.environ["PYBUDA_TEMP_RIBBON2_LEGACY_UTIL_EVAL"] = "1" if size == "l": compiler_cfg.enable_auto_transposing_placement = True compiler_cfg.enable_tm_cpu_fallback = True @@ -152,6 +167,12 @@ def generate_model_yoloV5I640_imgcls_torchhub_pytorch(test_device, variant, size os.environ["PYBUDA_RIBBON2"] = "1" compiler_cfg.enable_tm_cpu_fallback = True os.environ["PYBUDA_DISABLE_CAP_SPARSE_MM_FIDELITY"] = "0" + os.environ["PYBUDA_TEMP_BALANCER_DISABLE_TARGET_PROXIMITY"] = "1" + # Temp mitigations for net2pipe errors, should be removed. 
+ # + os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "0" + os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "0" + os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "0" name = "yolov5" + size model = download_model(torch.hub.load, variant, name, pretrained=True) @@ -167,6 +188,9 @@ def generate_model_yoloV5I640_imgcls_torchhub_pytorch(test_device, variant, size "size", size, ids=["yolov5" + s for s in size] ) def test_yolov5_640x640(test_device, size): + if size in ["l"] and test_device.arch == BackendDevice.Grayskull: + os.environ["PYBUDA_TEMP_DISABLE_MODEL_KB_PROLOGUE_BW"] = "1" + model, inputs, _ = generate_model_yoloV5I640_imgcls_torchhub_pytorch( test_device, "ultralytics/yolov5", size=size, @@ -188,7 +212,6 @@ def test_yolov5_640x640(test_device, size): def generate_model_yoloV5I480_imgcls_torchhub_pytorch(test_device, variant, size): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tm_cpu_fallback = True os.environ["PYBUDA_DECOMPOSE_SIGMOID"] = "1" os.environ["PYBUDA_LEGACY_UBLOCK_SHAPE"] = "1" @@ -211,12 +234,13 @@ def generate_model_yoloV5I480_imgcls_torchhub_pytorch(test_device, variant, size elif test_device.arch == BackendDevice.Wormhole_B0: # Add required env vars as per: https://yyz-gitlab.local.tenstorrent.com/tenstorrent/model-demos/-/issues/46 - os.environ["PYBUDA_RIBBON2"] = "1" - os.environ["PYBUDA_PAD_SPARSE_MM"] = "{13:16, 3:4}" - os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{64*1024}" compiler_cfg.default_df_override = DataFormat.Float16_b - compiler_cfg.enable_auto_fusing = False - + + os.environ["PYBUDA_RIBBON2"] = "1" + if size != "x": + os.environ["PYBUDA_PAD_SPARSE_MM"] = "{13:16, 3:4}" + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{64*1024}" + if size == "s": compiler_cfg.default_dram_parameters = False else: @@ -227,12 +251,14 @@ def generate_model_yoloV5I480_imgcls_torchhub_pytorch(test_device, variant, size os.environ["PYBUDA_CONCAT_SLICE_Y"] = "10" compiler_cfg.balancer_op_override("concatenate_26.dc.concatenate.30.dc.concatenate.1.dc.buffer.0", "t_stream_shape", (6,1)) elif size == "l": + compiler_cfg.enable_auto_fusing = False compiler_cfg.place_on_new_epoch("concatenate_208.dc.concatenate.0") elif size == "x": - os.environ["PYBUDA_INSERT_SLICE_FOR_CONCAT"] = "1" - os.environ["PYBUDA_CONCAT_SLICE_Y"] = "10" os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + # These are planned to be on by default + os.environ["PYBUDA_RIBBON2_CALCULATE_TARGET_CYCLES"] = "1" + name = "yolov5" + size model = download_model(torch.hub.load, variant, name, pretrained=True) module = PyTorchModule("pt_" + name + "_480x480", model) @@ -248,6 +274,11 @@ def generate_model_yoloV5I480_imgcls_torchhub_pytorch(test_device, variant, size def test_yolov5_480x480(test_device, size): if test_device.arch == BackendDevice.Grayskull: os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1" + if size in ["m", "l"] and test_device.arch == BackendDevice.Wormhole_B0: + os.environ["PYBUDA_LEGACY_KERNEL_BROADCAST"] = "1" + if size in ["s"] and test_device.arch == BackendDevice.Wormhole_B0: + os.environ["PYBUDA_TEMP_DISABLE_MODEL_KB_PROLOGUE_BW"] = "1" + model, inputs, _ = generate_model_yoloV5I480_imgcls_torchhub_pytorch( test_device, "ultralytics/yolov5", size=size, @@ -267,9 +298,8 @@ def test_yolov5_480x480(test_device, size): ) +@pytest.mark.skip(reason="Not supported") def test_yolov5_1280x1280(test_device): - pytest.skip("Currently not 
supporting high-res CV models.") - # Add required env vars as per: https://yyz-gitlab.local.tenstorrent.com/tenstorrent/model-demos/-/issues/46 os.environ["PYBUDA_PAD_SPARSE_MM"] = "{13:16}" os.environ["PYBUDA_INSERT_SLICE_FOR_CONCAT"] = "1" @@ -278,7 +308,6 @@ def test_yolov5_1280x1280(test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tm_cpu_fallback = True compiler_cfg.default_df_override = DataFormat.Float16_b compiler_cfg.paddings = { diff --git a/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v6.py b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v6.py new file mode 100644 index 000000000..060ab1076 --- /dev/null +++ b/pybuda/test/model_demos/high_prio/cnn/pytorch/test_yolo_v6.py @@ -0,0 +1,179 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import pybuda +import os +import pytest +import requests +import math, cv2 +import numpy as np +import torch +from PIL import Image +from yolov6 import YOLOV6 +from pybuda.verify.backend import verify_module +from pybuda.verify.config import TestKind +from pybuda import VerifyConfig +from pybuda._C.backend_api import BackendDevice + +# preprocessing steps referred form https://github.com/meituan/YOLOv6/blob/main/inference.ipynb + + +def letterbox( + im, new_shape=(640, 640), color=(114, 114, 114), auto=True, scaleup=True, stride=32 +): + """Resize and pad image while meeting stride-multiple constraints.""" + shape = im.shape[:2] # current shape [height, width] + if isinstance(new_shape, int): + new_shape = (new_shape, new_shape) + elif isinstance(new_shape, list) and len(new_shape) == 1: + new_shape = (new_shape[0], new_shape[0]) + + # Scale ratio (new / old) + r = min(new_shape[0] / shape[0], new_shape[1] / shape[1]) + if not scaleup: # only scale down, do not scale up (for better val mAP) + r = min(r, 1.0) + + # Compute padding + new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r)) + dw, dh = new_shape[1] - new_unpad[0], new_shape[0] - new_unpad[1] # wh padding + + if auto: # minimum rectangle + dw, dh = np.mod(dw, stride), np.mod(dh, stride) # wh padding + + dw /= 2 # divide padding into 2 sides + dh /= 2 + + if shape[::-1] != new_unpad: # resize + im = cv2.resize(im, new_unpad, interpolation=cv2.INTER_LINEAR) + top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1)) + left, right = int(round(dw - 0.1)), int(round(dw + 0.1)) + im = cv2.copyMakeBorder( + im, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color + ) # add border + + return im, r, (left, top) + + +def check_img_size(img_size, s=32, floor=0): + def make_divisible(x, divisor): + # Upward revision the value x to make it evenly divisible by the divisor. + return math.ceil(x / divisor) * divisor + + """Make sure image size is a multiple of stride s in each dimension, and return a new shape list of image.""" + if isinstance(img_size, int): # integer i.e. img_size=640 + new_size = max(make_divisible(img_size, int(s)), floor) + elif isinstance(img_size, list): # list i.e. 
img_size=[640, 480] + new_size = [max(make_divisible(x, int(s)), floor) for x in img_size] + else: + raise Exception(f"Unsupported type of img_size: {type(img_size)}") + + if new_size != img_size: + print( + f"WARNING: --img-size {img_size} must be multiple of max stride {s}, updating to {new_size}" + ) + return new_size if isinstance(img_size, list) else [new_size] * 2 + + +def process_image(path, img_size, stride, half): + """Process image before image inference.""" + + img_src = np.asarray(Image.open(requests.get(path, stream=True).raw)) + image = letterbox(img_src, img_size, stride=stride)[0] + # Convert + image = image.transpose((2, 0, 1)) # HWC to CHW + image = torch.from_numpy(np.ascontiguousarray(image)) + image = image.half() if half else image.float() # uint8 to fp16/32 + image /= 255 # 0 - 255 to 0.0 - 1.0 + + return image, img_src + + +# Didn't dealt with yolov6n6,yolov6s6,yolov6m6,yolov6l6 variants because of its higher input size(1280) +variants = ["yolov6n", "yolov6s", "yolov6m", "yolov6l"] + + +@pytest.mark.parametrize("variant", variants) +def test_yolo_v6_pytorch(variant, test_device): + + # STEP 1 : Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + os.environ["PYBUDA_RIBBON2"] = "1" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + + if variant in ["yolov6m", "yolov6l"]: + os.environ["PYBUDA_FORK_JOIN_BUF_QUEUES"] = "1" + os.environ["PYBUDA_FORK_JOIN_EXPAND_OUTPUT_BUFFERS"] = "1" + os.environ["PYBUDA_FORK_JOIN_SKIP_EXPANDING_BUFFERS"] = "1" + os.environ["PYBUDA_MAX_FORK_JOIN_BUF"] = "1" + + # Temp mitigations for net2pipe errors, should be removed. + # + os.environ["PYBUDA_TEMP_ENABLE_NEW_FUSED_ESTIMATES"] = "0" + os.environ["PYBUDA_TEMP_SCALE_SPARSE_ESTIMATE_ARGS"] = "0" + os.environ["PYBUDA_TEMP_ENABLE_NEW_SPARSE_ESTIMATES"] = "0" + + if test_device.arch == BackendDevice.Grayskull and variant == "yolov6m": + compiler_cfg.balancer_op_override( + "conv2d_258.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (1, 1) + ) + compiler_cfg.balancer_op_override( + "conv2d_258.dc.reshape.12.dc.sparse_matmul.3.lc2", + "t_stream_shape", + (2, 1), + ) + + if test_device.arch == BackendDevice.Wormhole_B0 and variant == "yolov6l": + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + + if test_device.arch == BackendDevice.Grayskull and variant == "yolov6l": + compiler_cfg.balancer_op_override( + "conv2d_484.dc.reshape.0.dc.sparse_matmul.4.lc2", "grid_shape", (1, 1) + ) + compiler_cfg.balancer_op_override( + "conv2d_484.dc.reshape.12.dc.sparse_matmul.3.lc2", + "t_stream_shape", + (2, 1), + ) + + # STEP 2 :prepare model + url = f"https://github.com/meituan/YOLOv6/releases/download/0.3.0/{variant}.pt" + weights = f"{variant}.pt" + + try: + response = requests.get(url) + with open(weights, "wb") as file: + file.write(response.content) + print(f"Downloaded {url} to {weights}") + except Exception as e: + print(f"Error downloading {url}: {e}") + + model = YOLOV6(weights) + model = model.model + model.eval() + + tt_model = pybuda.PyTorchModule(f"{variant}_pt", model) + + # STEP 3 : prepare input + url = "http://images.cocodataset.org/val2017/000000397133.jpg" + stride = 32 + input_size = 640 + img_size = check_img_size(input_size, s=stride) + img, img_src = process_image(url, img_size, stride, half=False) + input_batch = img.unsqueeze(0) + + # STEP 4 : Inference + verify_module( + tt_model, + input_shapes=([input_batch.shape]), + inputs=([input_batch]), + 
verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + ), + ) + + # STEP 5 : remove downloaded weights + os.remove(weights) diff --git a/pybuda/test/model_demos/high_prio/cnn/tflite/test_efficientnet_lite.py b/pybuda/test/model_demos/high_prio/cnn/tflite/test_efficientnet_lite.py index 565cb1bfb..48e51296b 100644 --- a/pybuda/test/model_demos/high_prio/cnn/tflite/test_efficientnet_lite.py +++ b/pybuda/test/model_demos/high_prio/cnn/tflite/test_efficientnet_lite.py @@ -32,7 +32,6 @@ def test_efficientnet_lite0_1x1(test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_constant_prop = True compiler_cfg.graph_solver_self_cut_type = "FastCut" compiler_cfg.default_df_override = pybuda.DataFormat.Float16 @@ -63,7 +62,6 @@ def test_efficientnet_lite4_1x1(test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_constant_prop = True compiler_cfg.graph_solver_self_cut_type = "FastCut" compiler_cfg.default_df_override = pybuda.DataFormat.Float16 @@ -88,12 +86,10 @@ def test_efficientnet_lite4_1x1(test_device): - +@pytest.mark.skip(reason="Not supported") def test_efficientnet_lite0(test_device): - pytest.skip("Only test 1x1 grid") compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_constant_prop = True compiler_cfg.graph_solver_self_cut_type = "FastCut" @@ -123,11 +119,11 @@ def test_efficientnet_lite0(test_device): ), ) + +@pytest.mark.skip(reason="Not supported") def test_efficientnet_lite1(test_device): - pytest.skip() compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_constant_prop = True compiler_cfg.default_df_override = DataFormat.Float16_b @@ -158,11 +154,10 @@ def test_efficientnet_lite1(test_device): ), ) +@pytest.mark.skip(reason="Not supported") def test_efficientnet_lite2(test_device): - pytest.skip() compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_constant_prop = True compiler_cfg.amp_level = 2 compiler_cfg.default_df_override = DataFormat.Float16_b @@ -191,11 +186,10 @@ def test_efficientnet_lite2(test_device): ), ) +@pytest.mark.skip(reason="Not supported") def test_efficientnet_lite3(test_device): - pytest.skip() compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_constant_prop = True compiler_cfg.amp_level = 2 compiler_cfg.default_df_override = DataFormat.Float16_b @@ -224,14 +218,13 @@ def test_efficientnet_lite3(test_device): ), ) +@pytest.mark.skip(reason="Not supported") def test_efficientnet_lite4(test_device): - pytest.skip() if test_device.arch == BackendDevice.Grayskull: pytest.skip("Grayskull failing with: Error! 
The overlay blob for chip_0__y_1__x_12 does not fit, the max size is 65408, however we tried to allocate 71240.") compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_constant_prop = True compiler_cfg.enable_conv_prestride = True compiler_cfg.graph_solver_self_cut_type = "FastCut" diff --git a/pybuda/test/model_demos/high_prio/cnn/tflite/test_hand_landmarker.py b/pybuda/test/model_demos/high_prio/cnn/tflite/test_hand_landmarker.py index c504a293a..af46db2ad 100644 --- a/pybuda/test/model_demos/high_prio/cnn/tflite/test_hand_landmarker.py +++ b/pybuda/test/model_demos/high_prio/cnn/tflite/test_hand_landmarker.py @@ -23,8 +23,6 @@ import os - - def test_hand_landmark_lite_1x1(test_device): if test_device.arch == BackendDevice.Grayskull: pytest.skip() @@ -35,7 +33,6 @@ def test_hand_landmark_lite_1x1(test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_constant_prop = True compiler_cfg.graph_solver_self_cut_type = "ConsumerOperandDataEdgesFirst" compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b @@ -59,6 +56,7 @@ def test_hand_landmark_lite_1x1(test_device): ), ) + def test_palm_detection_lite_1x1(test_device): if test_device.arch == BackendDevice.Grayskull: pytest.skip() @@ -68,7 +66,6 @@ def test_palm_detection_lite_1x1(test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_constant_prop = True compiler_cfg.graph_solver_self_cut_type = "ConsumerOperandDataEdgesFirst" diff --git a/pybuda/test/model_demos/high_prio/cnn/tflite/test_mobilenet_ssd.py b/pybuda/test/model_demos/high_prio/cnn/tflite/test_mobilenet_ssd.py index 636d0d013..f6cbbe39a 100644 --- a/pybuda/test/model_demos/high_prio/cnn/tflite/test_mobilenet_ssd.py +++ b/pybuda/test/model_demos/high_prio/cnn/tflite/test_mobilenet_ssd.py @@ -23,8 +23,6 @@ import os - - def test_mobilenet_ssd_1x1(test_device): if test_device.arch == BackendDevice.Grayskull: pytest.skip() @@ -34,7 +32,6 @@ def test_mobilenet_ssd_1x1(test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_constant_prop = True compiler_cfg.graph_solver_self_cut_type = "FastCut" compiler_cfg.default_df_override=pybuda.DataFormat.Float16_b diff --git a/pybuda/test/model_demos/high_prio/cnn/tflite/test_pose_landmark.py b/pybuda/test/model_demos/high_prio/cnn/tflite/test_pose_landmark.py index e17e6024e..98ec9790c 100644 --- a/pybuda/test/model_demos/high_prio/cnn/tflite/test_pose_landmark.py +++ b/pybuda/test/model_demos/high_prio/cnn/tflite/test_pose_landmark.py @@ -23,8 +23,6 @@ import os - - def test_pose_landmark_lite_1x1(test_device): if test_device.arch == BackendDevice.Grayskull: pytest.skip() @@ -36,7 +34,6 @@ def test_pose_landmark_lite_1x1(test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_constant_prop = True compiler_cfg.graph_solver_self_cut_type = "FastCut" compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b @@ -73,7 +70,6 @@ def test_pose_landmark_heavy_1x1(test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True 
compiler_cfg.enable_tvm_constant_prop = True compiler_cfg.graph_solver_self_cut_type = "FastCut" compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b @@ -100,15 +96,13 @@ def test_pose_landmark_heavy_1x1(test_device): ), ) - +@pytest.mark.skip(reason="Not supported") def test_pose_landmark_lite(test_device): - pytest.skip() if test_device.devtype == BackendType.Silicon: pytest.skip("silicon run hangs") compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_constant_prop = True compiler_cfg.graph_solver_self_cut_type = "ConsumerOperandDataEdgesFirst" if test_device.arch == BackendDevice.Grayskull: @@ -139,12 +133,10 @@ def test_pose_landmark_lite(test_device): ), ) - +@pytest.mark.skip(reason="Not supported") def test_pose_landmark_heavy(test_device): - pytest.skip() compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_constant_prop = True compiler_cfg.graph_solver_self_cut_type = "ConsumerOperandDataEdgesFirst" diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_albert.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_albert.py index 18f2e7b1b..133d7e903 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_albert.py +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_albert.py @@ -29,10 +29,11 @@ def test_albert_masked_lm_pytorch(size, variant, test_device): default_df_override=pybuda.DataFormat.Float16, amp_level=2, ) + compiler_cfg = pybuda.config._get_global_compiler_config() + if ("xxlarge" in model_ckpt): if test_device.arch == BackendDevice.Grayskull: compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_auto_fusing = False compiler_cfg.amp_level = 2 os.environ["PYBUDA_NLP_MANUAL_TARGET"] = "2000000" @@ -42,7 +43,6 @@ def test_albert_masked_lm_pytorch(size, variant, test_device): elif test_device.arch == BackendDevice.Wormhole_B0: # until tenstorrent/budabackend#1120 is resolved pybuda.config.set_configuration_options( - enable_t_streaming=True, enable_auto_fusing=False, enable_enumerate_u_kt=False, amp_level=1, @@ -54,16 +54,16 @@ def test_albert_masked_lm_pytorch(size, variant, test_device): os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{8*1024}" if test_device.arch == BackendDevice.Grayskull: - compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True os.environ["PYBUDA_NLP_MANUAL_TARGET"] = "2000000" elif "large" == size: if test_device.arch == BackendDevice.Grayskull: - compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_auto_fusing = False os.environ["PYBUDA_TEMP_ELT_UNARY_ESTIMATES_LEGACY"] = "1" - + elif test_device.arch == BackendDevice.Wormhole_B0: + os.environ["PYBUDA_LEGACY_KERNEL_BROADCAST"] = "1" + elif "base" == size: + if test_device.arch == BackendDevice.Wormhole_B0: + os.environ["PYBUDA_LEGACY_KERNEL_BROADCAST"] = "1" # Load data sample sample_text = "The capital of France is [MASK]." @@ -104,6 +104,8 @@ def test_albert_token_classification_pytorch(size, variant, test_device): amp_level=2, ) + compiler_cfg = pybuda.config._get_global_compiler_config() + # NOTE: These model variants are pre-trined only. They need to be fine-tuned # on a downstream task. Code is for demonstration purposes only. 
# Variants: albert-base-v1, albert-large-v1, albert-xlarge-v1, albert-xxlarge-v1 @@ -111,7 +113,6 @@ def test_albert_token_classification_pytorch(size, variant, test_device): model_ckpt = f"albert-{size}-{variant}" if "xxlarge" in model_ckpt: pybuda.config.set_configuration_options( - enable_t_streaming=True, enable_auto_fusing=False, enable_enumerate_u_kt=False, ) @@ -121,15 +122,17 @@ def test_albert_token_classification_pytorch(size, variant, test_device): os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{8*1024}" if test_device.arch == BackendDevice.Grayskull: - compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True os.environ["PYBUDA_NLP_MANUAL_TARGET"] = "2000000" elif "large" == size: if test_device.arch == BackendDevice.Grayskull: - compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_auto_fusing = False os.environ["PYBUDA_TEMP_ELT_UNARY_ESTIMATES_LEGACY"] = "1" + elif test_device.arch == BackendDevice.Wormhole_B0: + os.environ["PYBUDA_LEGACY_KERNEL_BROADCAST"] = "1" + elif "base" == size: + if test_device.arch == BackendDevice.Wormhole_B0: + os.environ["PYBUDA_LEGACY_KERNEL_BROADCAST"] = "1" + # Load ALBERT tokenizer and model from HuggingFace tokenizer = AlbertTokenizer.from_pretrained(model_ckpt) diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_bart.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_bart.py index 35b57e6fa..e0c669d0c 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_bart.py +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_bart.py @@ -53,6 +53,7 @@ def forward(self, input_ids, attention_mask, decoder_input_ids): out = self.model(input_ids, attention_mask, decoder_input_ids)[0] return out +@pytest.mark.skip(reason="Not supported for release") def test_pt_bart_classifier(test_device): compiler_cfg = _get_global_compiler_config() diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_bert.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_bert.py index 25bda68cd..7cc2bc2d3 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_bert.py +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_bert.py @@ -67,6 +67,8 @@ def generate_model_bert_qa_hf_pytorch(test_device, variant): compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + compiler_cfg.balancer_policy = "Ribbon" + os.environ["PYBUDA_RIBBON2"] = "1" # Load data sample from SQuADv1.1 context = """Super Bowl 50 was an American football game to determine the champion of the National Football League @@ -146,6 +148,9 @@ def test_bert_sequence_classification_pytorch(test_device): test_device, "textattack/bert-base-uncased-SST-2", ) + if test_device.arch == BackendDevice.Wormhole_B0: + os.environ["PYBUDA_LEGACY_KERNEL_BROADCAST"] = "1" + verify_module( model, input_shapes=[(inputs[0].shape,)], diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_codegen.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_codegen.py index d85fb3c4d..5fce40d7d 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_codegen.py +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_codegen.py @@ -17,14 +17,14 @@ variants = [ "Salesforce/codegen-350M-mono", + # "Salesforce/codegen-350M-multi", # Currently not supported + # "Salesforce/codegen-350M-nl", # Currently not supported ] - @pytest.mark.parametrize("variant", 
variants, ids=variants) def test_codegen(test_device, variant): # Configurations compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_cpu_fallback = False compiler_cfg.default_dram_parameters = False compiler_cfg.enable_enumerate_u_kt = False @@ -82,8 +82,7 @@ def forward(self, input_ids, attention_mask): inputs=[(input_ids, attn_mask,)], verify_cfg=VerifyConfig( arch=test_device.arch, - # tenstorrent/pybuda#1031 - devtype=BackendType.NoBackend if test_device.devtype == BackendType.Golden else test_device.devtype, + devtype=test_device.devtype, devmode=test_device.devmode, test_kind=TestKind.INFERENCE, chip_ids=NebulaGalaxy.chip_ids if "PYBUDA_NEB_GALAXY_CI" in os.environ and int(os.environ.get("PYBUDA_NEB_GALAXY_CI"))==1 else [0], diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_distilbert.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_distilbert.py index 7eeda2798..af509ab64 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_distilbert.py +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_distilbert.py @@ -27,7 +27,7 @@ def test_distilbert_masked_lm_pytorch(variant, test_device): compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b - + # Load data sample sample_text = "The capital of France is [MASK]." diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_dpr.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_dpr.py index 68b83ac63..f97d86bf7 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_dpr.py +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_dpr.py @@ -100,6 +100,9 @@ def test_dpr_reader_pytorch(variant, test_device): compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + + if test_device.arch == BackendDevice.Wormhole_B0: + os.environ["PYBUDA_LEGACY_KERNEL_BROADCAST"] = "1" # Data preprocessing input_tokens = tokenizer( diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_fuyu_8b.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_fuyu_8b.py index 2079e8583..e2211967f 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_fuyu_8b.py +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_fuyu_8b.py @@ -147,7 +147,6 @@ def test_fuyu8b(test_device): # Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() #compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b #compiler_cfg.enable_tvm_constant_prop = True compiler_cfg.enable_tvm_cpu_fallback = False @@ -204,7 +203,6 @@ def test_fuyu8b_past_cache(test_device): # Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b compiler_cfg.enable_tvm_cpu_fallback = False compiler_cfg.compile_subgraphs = True @@ -267,9 +265,7 @@ def test_fuyu8b_past_cache(test_device): # Prepare inputs text_prompt = "Generate a coco-style caption. 
" - url = "https://huggingface.co/adept/fuyu-8b/resolve/main/bus.png" - #image_path = "bus.png" - #image_pil = Image.open(image_path) + url = "https://huggingface.co/adept/fuyu-8b/resolve/main/bus.png" image_pil = Image.open(requests.get(url, stream=True).raw).convert("RGB") model_inputs = processor(text=text_prompt, images=[image_pil], device="cpu", return_tensor='pt') diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_gemma_2b.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_gemma_2b.py new file mode 100644 index 000000000..1bbcfdf6d --- /dev/null +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_gemma_2b.py @@ -0,0 +1,575 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import os + +import torch +import pytest +from transformers import GemmaModel, GemmaConfig +from transformers import AutoTokenizer, GemmaForCausalLM +from transformers import AutoTokenizer, AutoModelForCausalLM + +import pybuda +from pybuda import ( + VerifyConfig, + PyTorchModule, + CompileDepth, +) +from test.utils import download_model +from pybuda.pybudaglobal import TILE_DIM +from pybuda.verify.config import TestKind +from pybuda._C import DataFormat, MathFidelity +from pybuda._C.backend_api import BackendDevice +from pybuda._C.backend_api import BackendType +from pybuda.verify.backend import verify_module +from pybuda.transformers.pipeline import pipeline as pybuda_pipeline + + +def cpu_sanity_run_0(): + tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b") + model = AutoModelForCausalLM.from_pretrained("google/gemma-2b") + + input_text = "Write me a poem about Machine Learning." + input_ids = tokenizer(input_text, return_tensors="pt") + + outputs = model.generate(**input_ids) + print(tokenizer.decode(outputs[0])) + + +def cpu_sanity_run_1(): + model = GemmaForCausalLM.from_pretrained("google/gemma-2b") + tokenizer = AutoTokenizer.from_pretrained("google/gemma-2b") + + prompt = "What is your favorite city?" 
+ inputs = tokenizer(prompt, return_tensors="pt") + + # Generate + generate_ids = model.generate(inputs.input_ids, max_length=30) + generated_text = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + + print(generated_text) + + +variants = [ + "google/gemma-2b", +] + + +@pytest.mark.skip(reason="Tested as part of full model test run") +@pytest.mark.parametrize("variant", variants, ids=variants) +def test_gemma_2b_rotary_embedding(test_device, variant): + # Random seed for reproducibility + torch.manual_seed(42) + + # Configurations + compiler_cfg = pybuda.config._get_global_compiler_config() + + # Load model + class Wrapper(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model.model.layers[0].self_attn.rotary_emb + + def forward(self, x, pos_ids): + cos, sin = self.model(x, pos_ids) + + return cos, sin + + config = download_model(GemmaConfig.from_pretrained, variant) + config_dict = config.to_dict() + config_dict["return_dict"] = False + config_dict["use_cache"] = False + config = GemmaConfig(**config_dict) + pytorch_model = download_model(GemmaForCausalLM.from_pretrained, variant, config=config) + pytorch_model = Wrapper(pytorch_model) + tt_model = PyTorchModule("pytorch_gemma_2b_rotary_embedding", pytorch_model) + + # Define inputs + x = torch.rand((1, 1, 7, 256)).to(torch.float32) + pos_ids = torch.arange(7).unsqueeze(0).to(torch.float32) + + # Sanity run + out = pytorch_model(x, pos_ids) + print(out) + + verify_module( + tt_model, + input_shapes=[(x.shape, pos_ids.shape,)], + inputs=[(x, pos_ids,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + ), + ) + + +@pytest.mark.skip(reason="Tested as part of full model test run") +@pytest.mark.parametrize("variant", variants, ids=variants) +def test_gemma_2b_rms_norm(test_device, variant): + # Random seed for reproducibility + torch.manual_seed(42) + + # Configurations + compiler_cfg = pybuda.config._get_global_compiler_config() + + # Load model + class Wrapper(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model.model.layers[0].input_layernorm + + def forward(self, x): + out = self.model(x) + + return out + + config = download_model(GemmaConfig.from_pretrained, variant) + config_dict = config.to_dict() + config_dict["return_dict"] = False + config_dict["use_cache"] = False + config = GemmaConfig(**config_dict) + pytorch_model = download_model(GemmaForCausalLM.from_pretrained, variant, config=config) + pytorch_model = Wrapper(pytorch_model) + tt_model = PyTorchModule("pytorch_gemma_2b_rms_norm", pytorch_model) + + # Define inputs + x = torch.rand((1, 7, 2048)).to(torch.float32) + + # Sanity run + out = pytorch_model(x) + print(out) + + verify_module( + tt_model, + input_shapes=[(x.shape,)], + inputs=[(x,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + ), + ) + + +@pytest.mark.skip(reason="Tested as part of full model test run") +@pytest.mark.parametrize("variant", variants, ids=variants) +def test_gemma_2b_attention(test_device, variant): + # Random seed for reproducibility + torch.manual_seed(42) + + # Configurations + compiler_cfg = pybuda.config._get_global_compiler_config() + + # Load model + class Wrapper(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model.model.layers[0].self_attn + 
+ def forward(self, hidden_states, attn_mask, pos_ids): + attn_output, attn_weights, past_key_value = self.model(hidden_states, attn_mask, pos_ids) + + return attn_output + + config = download_model(GemmaConfig.from_pretrained, variant) + config_dict = config.to_dict() + config_dict["return_dict"] = False + config_dict["use_cache"] = False + config = GemmaConfig(**config_dict) + pytorch_model = download_model(GemmaForCausalLM.from_pretrained, variant, config=config) + pytorch_model = Wrapper(pytorch_model) + tt_model = PyTorchModule("pytorch_gemma_2b_attention", pytorch_model) + + # Define inputs + hidden_states = torch.rand((1, 7, 2048)).to(torch.float32) + attn_mask = torch.ones((1, 1, 7, 7)).to(torch.float32) + pos_ids = torch.arange(7).unsqueeze(0).to(torch.float32) + + # Sanity run + out = pytorch_model(hidden_states, attn_mask, pos_ids) + print(out) + + verify_module( + tt_model, + input_shapes=[(hidden_states.shape, attn_mask.shape, pos_ids.shape,)], + inputs=[(hidden_states, attn_mask, pos_ids,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + ), + ) + + +@pytest.mark.skip(reason="Tested as part of full model test run") +@pytest.mark.parametrize("variant", variants, ids=variants) +def test_gemma_2b_mlp(test_device, variant): + # Random seed for reproducibility + torch.manual_seed(42) + + # Configurations + compiler_cfg = pybuda.config._get_global_compiler_config() + + # Load model + class Wrapper(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model.model.layers[0].mlp + + def forward(self, hidden_states): + out = self.model(hidden_states) + + return out + + config = download_model(GemmaConfig.from_pretrained, variant) + config_dict = config.to_dict() + config_dict["return_dict"] = False + config_dict["use_cache"] = False + config = GemmaConfig(**config_dict) + pytorch_model = download_model(GemmaForCausalLM.from_pretrained, variant, config=config) + pytorch_model = Wrapper(pytorch_model) + tt_model = PyTorchModule("pytorch_gemma_2b_mlp", pytorch_model) + + # Define inputs + x = torch.rand((1, 7, 2048)).to(torch.float32) + + # Sanity run + out = pytorch_model(x) + print(out) + + verify_module( + tt_model, + input_shapes=[(x.shape,)], + inputs=[(x,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + ), + ) + + +@pytest.mark.skip(reason="Tested as part of full model test run") +@pytest.mark.parametrize("variant", variants, ids=variants) +def test_gemma_2b_single_decoder(test_device, variant): + # Random seed for reproducibility + torch.manual_seed(42) + + # Configurations + compiler_cfg = pybuda.config._get_global_compiler_config() + + # Load model + class Wrapper(torch.nn.Module): + def __init__(self, model): + super().__init__() + self.model = model.model.layers[0] + + def forward(self, hidden_states, attn_mask, pos_ids): + out = self.model(hidden_states, attn_mask, pos_ids) + + return out + + config = download_model(GemmaConfig.from_pretrained, variant) + config_dict = config.to_dict() + config_dict["return_dict"] = False + config_dict["use_cache"] = False + config = GemmaConfig(**config_dict) + pytorch_model = download_model(GemmaForCausalLM.from_pretrained, variant, config=config) + pytorch_model = Wrapper(pytorch_model) + tt_model = PyTorchModule("pytorch_gemma_2b_single_decoder", pytorch_model) + + # Define inputs + hidden_states = 
torch.rand((1, 7, 2048)).to(torch.float32) + attn_mask = torch.ones((1, 1, 7, 7)).to(torch.float32) + pos_ids = torch.arange(7).unsqueeze(0).to(torch.float32) + + # Sanity run + out = pytorch_model(hidden_states, attn_mask, pos_ids) + print(out) + + verify_module( + tt_model, + input_shapes=[(hidden_states.shape, attn_mask.shape, pos_ids.shape,)], + inputs=[(hidden_states, attn_mask, pos_ids,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + ), + ) + + +@pytest.mark.skip(reason="Tested as part of a full generative model run") +@pytest.mark.parametrize("variant", variants, ids=variants) +def test_gemma_2b(test_device, variant): + # Random seed for reproducibility + torch.manual_seed(42) + + # Configurations + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + os.environ["PYBUDA_RIBBON2"] = "1" + + config = download_model(GemmaConfig.from_pretrained, variant) + config_dict = config.to_dict() + config_dict["return_dict"] = False + config_dict["use_cache"] = False + config = GemmaConfig(**config_dict) + pytorch_model = download_model(GemmaForCausalLM.from_pretrained, variant, config=config) + tt_model = PyTorchModule("pytorch_gemma_2b", pytorch_model) + + # Load tokenizer + tokenizer = download_model(AutoTokenizer.from_pretrained, variant) + tokenizer.pad_token = tokenizer.eos_token + + # Sample input + prompt = "What is your favorite city?" + inputs = tokenizer(prompt, return_tensors="pt") + + # Sanity run + generate_ids = pytorch_model.generate(inputs.input_ids, max_length=30) + generated_text = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + + print(f"Sanity run generated text: {generated_text}") + + input_ids = inputs["input_ids"] + attn_mask = inputs["attention_mask"] + + verify_module( + tt_model, + input_shapes=[(input_ids.shape, attn_mask.shape,)], + inputs=[(input_ids, attn_mask,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + ), + ) + + +@pytest.mark.skip(reason="Tested as part of a full generative model run") +@pytest.mark.parametrize("variant", variants, ids=variants) +def test_gemma_2b_1x1(test_device, variant): + pytest.xfail("Passing locally, failing on CI. Keeping as XFail to be able to track potential regressions.") + + # Random seed for reproducibility + torch.manual_seed(42) + + # Configurations + compiler_cfg = pybuda.config._get_global_compiler_config() + + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + compiler_cfg.balancer_policy = "Ribbon" + os.environ["PYBUDA_RIBBON2"] = "1" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + + config = download_model(GemmaConfig.from_pretrained, variant) + config_dict = config.to_dict() + config_dict["return_dict"] = False + config_dict["use_cache"] = False + config = GemmaConfig(**config_dict) + pytorch_model = download_model(GemmaForCausalLM.from_pretrained, variant, config=config) + tt_model = PyTorchModule("pytorch_gemma_2b_1x1", pytorch_model) + + # Load tokenizer + tokenizer = download_model(AutoTokenizer.from_pretrained, variant) + tokenizer.pad_token = tokenizer.eos_token + + # Sample input + prompt = "What is your favorite city?" 
+ inputs = tokenizer(prompt, return_tensors="pt") + + # Sanity run + generate_ids = pytorch_model.generate(inputs.input_ids, max_length=30) + generated_text = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + + print(f"Sanity run generated text: {generated_text}") + + input_ids = inputs["input_ids"] + attn_mask = inputs["attention_mask"] + + verify_module( + tt_model, + input_shapes=[(input_ids.shape, attn_mask.shape,)], + inputs=[(input_ids, attn_mask,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + ), + ) + + +@pytest.mark.parametrize("variant", variants, ids=variants) +def test_gemma_2b_gen(test_device, variant): + # Random seed for reproducibility + torch.manual_seed(42) + + # Configurations + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + os.environ["PYBUDA_RIBBON2"] = "1" + + if test_device.arch != BackendDevice.Grayskull: + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + + # Configure all matmul ops to operate on HiFi4 with Bfp8_b inputs/params and Float16 accumulation + pybuda.config.configure_mixed_precision( + op_type='matmul', + math_fidelity=MathFidelity.HiFi4, + input_df={0:[DataFormat.Bfp8_b, False], 1:[DataFormat.Bfp8_b, False]}, + accumulate_df=DataFormat.Float16_b + ) + + # Configure all other ops to run on HiFi4 with Float16 accumulation + pybuda.config.configure_mixed_precision( + op_type='^((?!matmul).)*$', + math_fidelity=MathFidelity.HiFi4, + accumulate_df=DataFormat.Float16_b + ) + + if test_device.arch == BackendDevice.Grayskull: + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{65*1024}" + + config = download_model(GemmaConfig.from_pretrained, variant) + config_dict = config.to_dict() + config_dict["return_dict"] = False + config_dict["use_cache"] = False + + config = GemmaConfig(**config_dict) + pytorch_model = download_model(GemmaForCausalLM.from_pretrained, variant, config=config) + + # Load tokenizer + tokenizer = download_model(AutoTokenizer.from_pretrained, variant) + tokenizer.pad_token = tokenizer.eos_token + + # Sample input + prompt = "What is your favorite city?" 
+ inputs = tokenizer(prompt, return_tensors="pt") + + # Sanity run + generate_ids = pytorch_model.generate(inputs.input_ids, max_length=30) + generated_pt_text = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + + print("Based on prompt:") + print(f"{prompt}") + print(f"\nPyTorch (sanity) generated:") + pt_ans = generated_pt_text.split('\n\n')[1] + print(f"{pt_ans}") + + # Initialize and Run text2text generator on Tenstorrent device + text2text_generator = pybuda_pipeline( + "text2text-generation", + model=pytorch_model, + tokenizer=tokenizer, + pybuda_max_length=32, + ) + generated_tt_text = text2text_generator( + prompt, + max_length=32, + num_beams=1, + num_return_sequences=1, + no_repeat_ngram_size=2, + ) + + print("Based on prompt:") + print(f"{prompt}") + print(f"\nTT generated:") + for sequence in generated_tt_text: + tt_ans = sequence['generated_text'][len(prompt):] + print(f"{tt_ans}") + + +@pytest.mark.parametrize("variant", variants, ids=variants) +def test_gemma_2b_1x1_gen(test_device, variant): + if test_device.arch == BackendDevice.Grayskull: + pytest.skip("Not supporting the Grayskull 1x1 overlay yet") + + # Random seed for reproducibility + torch.manual_seed(42) + + compiler_cfg = pybuda.config._get_global_compiler_config() + + if test_device.devtype == BackendType.Silicon and "CI_PROJECT_DIR" in os.environ: + pytest.skip("Failing on CI with Read 0xffffffff from ARC scratch[6]: you should reset the board") + + # Configurations + compiler_cfg.balancer_policy = "Ribbon" + os.environ["PYBUDA_RIBBON2"] = "1" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_OVERRIDE_DEVICE_YAML"] = "wormhole_b0_1x1.yaml" + + # Configure all matmul ops to operate on HiFi4 with Bfp8_b inputs/params and Float16 accumulation + pybuda.config.configure_mixed_precision( + op_type='matmul', + math_fidelity=MathFidelity.HiFi4, + input_df={0:[DataFormat.Bfp8_b, False], 1:[DataFormat.Bfp8_b, False]}, + accumulate_df=DataFormat.Float16_b + ) + + # Configure all other ops to run on HiFi4 with Float16 accumulation + pybuda.config.configure_mixed_precision( + op_type='^((?!matmul).)*$', + math_fidelity=MathFidelity.HiFi4, + accumulate_df=DataFormat.Float16_b + ) + + config = download_model(GemmaConfig.from_pretrained, variant) + config_dict = config.to_dict() + config_dict["return_dict"] = False + config_dict["use_cache"] = False + + config = GemmaConfig(**config_dict) + pytorch_model = download_model(GemmaForCausalLM.from_pretrained, variant, config=config) + + # Load tokenizer + tokenizer = download_model(AutoTokenizer.from_pretrained, variant) + tokenizer.pad_token = tokenizer.eos_token + + # Sample input + prompt = "What is your favorite city?" 
+ inputs = tokenizer(prompt, return_tensors="pt") + + # Sanity run + generate_ids = pytorch_model.generate(inputs.input_ids, max_length=30) + generated_pt_text = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + + print("Based on prompt:") + print(f"{prompt}") + print(f"\nPyTorch (sanity) generated:") + pt_ans = generated_pt_text.split('\n\n')[1] + print(f"{pt_ans}") + + # Initialize and Run text2text generator on Tenstorrent device + text2text_generator = pybuda_pipeline( + "text2text-generation", + model=pytorch_model, + tokenizer=tokenizer, + pybuda_max_length=32, + ) + generated_tt_text = text2text_generator( + prompt, + max_length=32, + num_beams=1, + num_return_sequences=1, + no_repeat_ngram_size=2, + ) + + print("Based on prompt:") + print(f"{prompt}") + print(f"\nTT generated:") + for sequence in generated_tt_text: + tt_ans = sequence['generated_text'][len(prompt):] + print(f"{tt_ans}") + + +if __name__ == "__main__": + test_gemma_2b_gen() diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_gpt2.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_gpt2.py index 796db6adf..2cbcecf6f 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_gpt2.py +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_gpt2.py @@ -79,7 +79,6 @@ def test_gpt2_past_cache(test_device): compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.compile_subgraphs = True compiler_cfg.enable_tvm_cpu_fallback = False - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_auto_fusing = False model = GPT2LMHeadModel.from_pretrained("gpt2", return_dict=False) diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_gptneo.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_gptneo.py index 53fa7cd72..23a29af84 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_gptneo.py +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_gptneo.py @@ -38,12 +38,8 @@ def test_gptneo_causal_lm(variant, test_device): compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b - if variant == "EleutherAI/gpt-neo-2.7B" and test_device.arch == BackendDevice.Grayskull: - compiler_cfg.balancer_policy = "Ribbon" - - if variant == "EleutherAI/gpt-neo-1.3B" and test_device.arch == BackendDevice.Wormhole_B0: - os.environ["PYBUDA_RIBBON2"] = "1" - compiler_cfg.balancer_policy = "Ribbon" + os.environ["PYBUDA_RIBBON2"] = "1" + compiler_cfg.balancer_policy = "Ribbon" if variant == "EleutherAI/gpt-neo-2.7B" and test_device.arch == BackendDevice.Wormhole_B0: os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "65536" @@ -127,6 +123,9 @@ def test_gptneo_sequence_classification(variant, test_device): compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + if variant in ["EleutherAI/gpt-neo-1.3B", "EleutherAI/gpt-neo-2.7B"]: + os.environ["PYBUDA_LEGACY_KERNEL_BROADCAST"] = "1" + tokenizer = download_model(AutoTokenizer.from_pretrained, variant) tokenizer.pad_token = tokenizer.eos_token model = download_model( diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_mistral.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_mistral.py new file mode 100644 index 000000000..a7460d812 --- /dev/null +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_mistral.py @@ -0,0 +1,322 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 + 
+import time +import pytest +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer, MistralConfig + +import pybuda +from pybuda import VerifyConfig +from pybuda import PyTorchModule +from pybuda._C.backend_api import BackendDevice, DeviceMode +from pybuda._C import DataFormat, MathFidelity +from pybuda.verify.backend import verify_module +from pybuda.verify.config import TestKind +from typing import Optional +from pybuda.transformers.pipeline import NLPPipelineWrapper + + +variants = ['mistralai/Mistral-7B-v0.1'] +@pytest.mark.skip(reason="Tested as part of full model test run") +@pytest.mark.parametrize("variant", variants, ids=variants) +def test_mistral_decoder_layer(variant, test_device): + + if test_device.arch != BackendDevice.Wormhole_B0: + pytest.skip("Currently only supported on Wormhole B0 N150 device") + + model = AutoModelForCausalLM.from_pretrained(variant, device_map="auto") + model.eval() + module = model.model.layers[0] + + # test should work for batch size 1 and seqlen <= 128 + # for larger seqlen, a problem with valid node placement can occur + batch_size = 1 + hidden_dim = 4096 + seqlen = 128 + + sample_inputs = torch.randn(batch_size, seqlen, hidden_dim) + + verify_module( + pybuda.PyTorchModule( + f"mistral_decoder_layer_seqlen_{seqlen}_bs_{batch_size}", module), + input_shapes=[(sample_inputs.shape,)], + inputs=[(sample_inputs,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + ) + ) + + +variants = ['mistralai/Mistral-7B-v0.1'] +@pytest.mark.parametrize("variant", variants, ids=variants) +def test_mistral(variant, test_device): + if test_device.arch != BackendDevice.Wormhole_B0: + pytest.skip("Currently only supported on Wormhole B0 N150 device") + + configuration = MistralConfig() + + configuration.sliding_window = None + configuration.use_cache = False + configuration.return_dict = False + + pybuda.set_configuration_options(default_df_override=pybuda.DataFormat.Float16_b, balancer_policy='Ribbon') + + # configuration for all ops that are not matmul + pybuda.config.configure_mixed_precision( + op_type='^((?!matmul).)*$', + math_fidelity=MathFidelity.HiFi4, + accumulate_df=DataFormat.Float16_b + ) + + # configuration for all matmul ops + # when inputs to matmuls are Bfp8_b, the whole model can fit to single chip + pybuda.config.configure_mixed_precision( + op_type='matmul', + math_fidelity=MathFidelity.HiFi4, + input_df={0:[DataFormat.Bfp8_b, False], 1:[DataFormat.Bfp8_b, False]}, + accumulate_df=DataFormat.Float16_b + ) + + module = AutoModelForCausalLM.from_pretrained(variant, device_map="auto", config = configuration) + tokenizer = AutoTokenizer.from_pretrained(variant) + + module.eval() + for param in module.parameters(): + param.requires_grad = False + + # test should work for batch size 1 and seqlen <= 128 + # for larger seqlen, a DRAM allocation problem might occur (this model is already near maximum model size for single chip) + batch_size = 1 + prompt = "Of course, fancy writing doesn't just conceal ideas. It can also conceal the lack of them. That's why some people write that way, to conceal the fact that they have nothing to say. Whereas writing simply keeps you honest. If you say nothing simply, it will be obvious to everyone, including you. Simple writing also lasts better. People reading your stuff in the future will be in much the same position as people from other countries reading it today. 
The culture and the language will have changed. It's not vain to care about that, any more than it's vain for " + sample_inputs = tokenizer(prompt, return_tensors = 'pt')['input_ids'] + + verify_module( + pybuda.PyTorchModule( + f"full_model_seqlen_{sample_inputs.shape[-1]}_bs_{batch_size}_layers_{configuration.num_hidden_layers}", module), + input_shapes=[(sample_inputs.shape,)], + inputs=[(sample_inputs, )], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + ) + ) + +variants = ['mistralai/Mistral-7B-v0.1'] +@pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.skip(reason="This test currently serves the same purpose as test_mistral") +def test_mistral_decode(variant, test_device): + if test_device.arch != BackendDevice.Wormhole_B0: + pytest.skip("Currently only supported on Wormhole B0 N150 device") + + configuration = MistralConfig() + configuration.sliding_window = None + configuration.use_cache = False + configuration.return_dict = False + + pybuda.set_configuration_options(default_df_override=pybuda.DataFormat.Float16_b, balancer_policy='Ribbon') + + # configuration for all ops that are not matmul + pybuda.config.configure_mixed_precision( + op_type='^((?!matmul).)*$', + math_fidelity=MathFidelity.HiFi4, + accumulate_df=DataFormat.Float16_b + ) + + # configuration for all matmul ops + # when inputs to matmuls are Bfp8_b, the whole model can fit to single chip + pybuda.config.configure_mixed_precision( + op_type='matmul', + math_fidelity=MathFidelity.HiFi4, + input_df={0:[DataFormat.Bfp8_b, False], 1:[DataFormat.Bfp8_b, False]}, + accumulate_df=DataFormat.Float16_b + ) + + pytorch_model = AutoModelForCausalLM.from_pretrained(variant, device_map="auto", config = configuration) + tokenizer = AutoTokenizer.from_pretrained(variant) + + pytorch_model.eval() + for param in pytorch_model.parameters(): + param.requires_grad = False + + tokenizer.pad_token = tokenizer.eos_token + + prompt = "Of course, fancy writing doesn't just conceal ideas. It can also conceal the lack of them. That's why some people write that way, to conceal the fact that they have nothing to say. 
Whereas writing simply keeps" + inputs = tokenizer(prompt, return_tensors="pt") + + max_generated_tokens = 100 + + generate_ids = pytorch_model.generate(inputs.input_ids, max_length=max_generated_tokens) + generated_pt_text = tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0] + + print("Based on prompt:") + print(f"{prompt}") + print(f"\nPyTorch (sanity) generated:") + pt_ans = generated_pt_text.split('\n\n') + print(f"{pt_ans}") + + wrapper = NLPPipelineWrapper( + pytorch_model, + tokenizer, + pytorch_model.__class__.__name__, + use_cache=None, + forward_fn=None, + max_length=max_generated_tokens + ) + + pytorch_model.prepare_inputs_for_generation = wrapper.prepare_inputs_for_generation + + # this generates sample text, to trigger model compilation, so it is not factored during latency measurement + outputs = pytorch_model.generate(inputs['input_ids'][:,0:1], do_sample=False, max_length=max_generated_tokens) + output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True) + + start = time.time() + outputs = pytorch_model.generate(inputs['input_ids'], do_sample=False, max_length=max_generated_tokens) + output_text = tokenizer.batch_decode(outputs, skip_special_tokens=True) + end = time.time() + + num_generated_tokens = outputs.shape[-1] - inputs['input_ids'].shape[-1] + print('TT generated:') + print(output_text[0]) + print(f'Tokens / s: {num_generated_tokens / (end-start)}') + +variants = ['mistralai/Mistral-7B-v0.1'] +@pytest.mark.parametrize("variant", variants, ids=variants) +def test_mistral_kv_cache(variant, test_device): + if test_device.arch != BackendDevice.Wormhole_B0: + pytest.skip("Currently only supported on Wormhole B0 N150 device") + + configuration = MistralConfig() + configuration.sliding_window = None + configuration.use_cache = True + configuration.return_dict = False + + max_new_tokens = 10 + pybuda.set_configuration_options(default_df_override=pybuda.DataFormat.Float16_b, balancer_policy='Ribbon') + + # configuration for all ops that are not matmul + pybuda.config.configure_mixed_precision( + op_type='^((?!matmul).)*$', + math_fidelity=MathFidelity.HiFi4, + accumulate_df=DataFormat.Float16_b + ) + + # configuration for all matmul ops + # when inputs to matmuls are Bfp8_b, the whole model can fit to single chip + pybuda.config.configure_mixed_precision( + op_type='matmul', + math_fidelity=MathFidelity.HiFi4, + input_df={0:[DataFormat.Bfp8_b, False], 1:[DataFormat.Bfp8_b, False]}, + accumulate_df=DataFormat.Float16_b + ) + + model = AutoModelForCausalLM.from_pretrained(variant, device_map="auto", config = configuration) + tokenizer = AutoTokenizer.from_pretrained(variant) + + model.eval() + for param in model.parameters(): + param.requires_grad = False + + tokenizer.pad_token = tokenizer.eos_token + + prompt = "Of course, fancy writing doesn't just conceal ideas. It can also conceal the lack of them. That's why some people write that way, to conceal the fact that they have nothing to say. 
Whereas writing simply keeps" + + inputs = tokenizer(prompt, return_tensors='pt') + + T = inputs['input_ids'].shape[-1] + output_ids = inputs['input_ids'].clone() + position_ids = torch.arange(T) + inputs = tuple(inputs.values()) + inputs += (position_ids,) + + # perform prefill with torch model on cpu + logits, past_key_values = model(*inputs) + + tt1 = pybuda.TTDevice("tt1", devtype=test_device.devtype, arch=test_device.arch, module=PyTorchModule("mistral_model_base", BaseModelWrapper(model))) + + next_token = sample(logits) + output_ids = torch.cat([output_ids, next_token], axis=1) + position_ids = torch.tensor([[T]]) + mask = torch.ones(1, T + 1) + + inputs = (next_token, mask, position_ids, ) + for i in range(configuration.num_hidden_layers): + inputs += (past_key_values[i][0], past_key_values[i][1]) + + # compile model before measuring perf + output_q = pybuda.initialize_pipeline(training=False, sample_inputs=inputs, _sequential=True, _device_mode = DeviceMode.CompileAndRun) + + start_time = time.time() + for i in range(max_new_tokens): + + position_ids = torch.tensor([[T]]) + mask = torch.ones(1, T + 1) + if i > 0: # for i = 0 we have already defined inputs + inputs = (next_token, mask, position_ids, *past_key_values) + + tt1.push_to_inputs(inputs) + pybuda.run_forward(input_count=1, _sequential=True) + outputs = output_q.get() + + logits = outputs[0].value().to(dtype=torch.float) + + next_token = sample(logits) + output_ids = torch.cat([output_ids, next_token], axis=1) + past_key_values = [el.value() for el in outputs[1:]] + T += 1 + + duration = time.time() - start_time + + tokens_per_second = max_new_tokens / duration + generated_text = tokenizer.decode(output_ids[0].numpy().tolist()) + print(generated_text) + print(f'Tokens per second: {tokens_per_second}') + + +class BaseModelWrapper(torch.nn.Module): + def __init__(self, model: torch.nn.Module) -> None: + super().__init__() + self.model = model + + def forward(self, + input_ids: torch.Tensor, + attention_mask: torch.Tensor, + position_ids: torch.Tensor, + *kv): + """ + input_ids: Shape [bs, 1] + attention_mask: Shape [bs, seqlen] + position_ids: Shape [1, 1] + kv: KV cache in format (k0, v0, k1, v1, ..., k_{L-1}, v_{L-1}) where L is the number of layers/blocks + """ + kv = tuple(zip(kv[:-1:2], kv[1::2])) # making tuple of pairs (key_cache, value_cache) + outputs = self.model(input_ids, attention_mask, position_ids, kv) + # flattening past key values because TT compiler expects flattened output in format tuple(torch.Tensor, ..., torch.Tensor) + outputs = [outputs[0]] + [el for subl in outputs[1] for el in subl] + return tuple(outputs) + + +def multinomial_sample_one_no_sync(probs_sort): + q = torch.empty_like(probs_sort).exponential_(1) + return torch.argmax(probs_sort / q, dim=-1, keepdim=True).to(dtype=torch.int) + +def logits_to_probs(logits, temperature: float = 1.0, top_k: Optional[int] = None): + logits = logits / max(temperature, 1e-5) + + if top_k is not None: + v, _ = torch.topk(logits, min(top_k, logits.size(-1))) + pivot = v.select(-1, -1).unsqueeze(-1) + logits = torch.where(logits < pivot, -float("Inf"), logits) + probs = torch.nn.functional.softmax(logits, dim=-1) + return probs + +def sample(logits, temperature: float = 1.0, top_k: Optional[int] = None): + probs = logits_to_probs(logits[0, -1], temperature, top_k) + idx_next = multinomial_sample_one_no_sync(probs) + return idx_next.unsqueeze(0) \ No newline at end of file diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_opt.py 
b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_opt.py index c3a491bbf..c377e0007 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_opt.py +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_opt.py @@ -21,15 +21,13 @@ def test_opt_causal_lm(variant, test_device): compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.default_df_override = DataFormat.Float16_b - if variant == "facebook/opt-1.3b" or variant == "facebook/opt-350m": - compiler_cfg.enable_auto_fusing = False - if variant == "facebook/opt-1.3b": - compiler_cfg.amp_level = 2 + if variant == "facebook/opt-1.3b": + compiler_cfg.amp_level = 2 - # Disable expanding output buffer of fork nodes - causes out of memory issue in blobgen. - os.environ["PYBUDA_FORK_JOIN_EXPAND_FORK_OUTPUT_BUF"] = "0" - if variant == "facebook/opt-350m": - os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "65536" + # Disable expanding output buffer of fork nodes - causes out of memory issue in blobgen. + os.environ["PYBUDA_FORK_JOIN_EXPAND_FORK_OUTPUT_BUF"] = "0" + if variant == "facebook/opt-350m": + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "65536" config = OPTConfig.from_pretrained(variant) config_dict = config.to_dict() @@ -73,10 +71,8 @@ def test_opt_qa(variant, test_device): compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.default_df_override = DataFormat.Float16_b - if variant == "facebook/opt-1.3b" or variant == "facebook/opt-350m": - compiler_cfg.enable_auto_fusing = False - if variant == "facebook/opt-1.3b": - compiler_cfg.default_df_override = DataFormat.Float16 + if variant == "facebook/opt-1.3b": + compiler_cfg.default_df_override = DataFormat.Float16 tokenizer = download_model(AutoTokenizer.from_pretrained, variant) model = download_model(OPTForQuestionAnswering.from_pretrained, diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_roberta.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_roberta.py index e535e7023..73aa2080f 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_roberta.py +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_roberta.py @@ -62,14 +62,6 @@ def test_roberta_sentiment_pytorch(test_device): compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b - # Load label mapping - labels = [] - mapping_link = "https://raw.githubusercontent.com/cardiffnlp/tweeteval/main/datasets/sentiment/mapping.txt" - with urllib.request.urlopen(mapping_link) as f: - html = f.read().decode("utf-8").split("\n") - csvreader = csv.reader(html, delimiter="\t") - labels = [row[1] for row in csvreader if len(row) > 1] - # Example from multi-nli validation set text = """Great road trip views! 
@ Shartlesville, Pennsylvania""" diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_t5.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_t5.py index 77541e970..f9eadb41a 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_t5.py +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_t5.py @@ -16,21 +16,18 @@ from pybuda.transformers.pipeline import pipeline as pybuda_pipeline from transformers import T5ForConditionalGeneration, T5Tokenizer, T5Config - +@pytest.mark.skip(reason="Not supported") def test_t5_loop_tiny_tile(test_device): - pytest.skip() import os os.environ["PYBUDA_ENABLE_TINY_TILE"] = "1" # Add PyBUDA configurations os.environ["PYBUDA_PAD_OUTPUT_BUFFER"] = "1" os.environ["TT_BACKEND_MULTI_THREADED_PUSH"] = "1" - os.environ["PYBUDA_DISABLE_DYNAMIC_DRAM"] = "1" os.environ["PYBUDA_FORCE_SEQUENTIAL"] = "1" os.environ["TT_BACKEND_DRAM_POLLING_FREQUENCY"] = "64" # os.environ["TT_BACKEND_PROFILER"] = "1" compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_cpu_fallback = False compiler_cfg.default_df_override = pybuda._C.Float16_b compiler_cfg.default_dram_parameters = False @@ -74,7 +71,6 @@ def forward(self, decoder_input_ids, encoder_outputs): print("TIME: ", time.time() - start_time) - variants = ["t5-small", "t5-base", "t5-large", "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large"] @pytest.mark.parametrize("variant", variants, ids=variants) def test_t5_generation(variant, test_device): @@ -86,7 +82,6 @@ def test_t5_generation(variant, test_device): # os.environ["PYBUDA_ENABLE_TINY_TILE"] = "1" # Add PyBUDA configurations compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_cpu_fallback = False compiler_cfg.enable_auto_fusing = False # tenstorrent/pybuda#844 compiler_cfg.amp_level = 1 @@ -141,7 +136,6 @@ def forward(self, decoder_input_ids, encoder_outputs): ) - class T5_encoder(torch.nn.Module): def __init__(self, model): super().__init__() @@ -183,7 +177,6 @@ def test_t5_past_cache_enc_dec(variant, test_device): import os os.environ["PYBUDA_PAD_OUTPUT_BUFFER"] = "1" os.environ["TT_BACKEND_MULTI_THREADED_PUSH"] = "1" - os.environ["PYBUDA_DISABLE_DYNAMIC_DRAM"] = "1" os.environ["PYBUDA_EXTRA_L1_MARGIN"] = "120000" os.environ["PYBUDA_FORCE_SEQUENTIAL"] = "1" if "flan" in variant: @@ -195,7 +188,6 @@ def test_t5_past_cache_enc_dec(variant, test_device): os.environ["TT_BACKEND_EPOCH_BIN_NUM_SLOTS"] = "64" os.environ["PYBUDA_ROTATE_PAST_CACHE_PARAMS"] = "1" compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_cpu_fallback = False compiler_cfg.default_df_override = pybuda._C.Float16_b compiler_cfg.default_dram_parameters = False @@ -208,6 +200,7 @@ def test_t5_past_cache_enc_dec(variant, test_device): if test_device.arch == BackendDevice.Grayskull: os.environ["PYBUDA_TEMP_ELT_UNARY_ESTIMATES_LEGACY"] = "1" + compiler_cfg.balancer_op_override("matmul_5865", "t_stream_shape", (1, 1)) if test_device.arch == BackendDevice.Wormhole_B0: if variant == "google/flan-t5-large": @@ -332,19 +325,16 @@ def test_t5_past_cache_enc_dec(variant, test_device): variants = ["t5-small", "t5-base", "t5-large", "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large"] @pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.skip(reason="Redundant") def test_t5_past_cache_pybuda_pipeline(variant, test_device): - 
pytest.skip() # tested in test_t5_past_cache_enc_dec import os - os.environ["PYBUDA_DISABLE_STREAM_OUTPUT"] = "1" os.environ["PYBUDA_PAD_OUTPUT_BUFFER"] = "1" os.environ["TT_BACKEND_MULTI_THREADED_PUSH"] = "1" - os.environ["PYBUDA_DISABLE_DYNAMIC_DRAM"] = "1" os.environ["PYBUDA_EXTRA_L1_MARGIN"] = "169536" os.environ["PYBUDA_FORCE_SEQUENTIAL"] = "1" os.environ["PYBUDA_NLP_MANUAL_TARGET"] = "30000" os.environ["TT_BACKEND_DRAM_POLLING_FREQUENCY"] = "64" compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_cpu_fallback = False compiler_cfg.default_df_override = pybuda._C.Float16_b compiler_cfg.default_dram_parameters = False @@ -502,23 +492,19 @@ def wrap_generate(inputs): print(answer) - variants = ["t5-small", "t5-base", "t5-large", "google/flan-t5-small", "google/flan-t5-base", "google/flan-t5-large"] @pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.skip(reason="Redundant") def test_t5_pybuda_pipeline(variant, test_device): - pytest.skip() # tested in test_t5_past_cache_enc_dec # Too slow for post-commit ci import os - os.environ["PYBUDA_DISABLE_STREAM_OUTPUT"] = "1" os.environ["PYBUDA_PAD_OUTPUT_BUFFER"] = "1" os.environ["TT_BACKEND_MULTI_THREADED_PUSH"] = "1" - os.environ["PYBUDA_DISABLE_DYNAMIC_DRAM"] = "1" os.environ["PYBUDA_FORCE_SEQUENTIAL"] = "1" os.environ["PYBUDA_NLP_MANUAL_TARGET"] = "30000" os.environ["TT_BACKEND_DRAM_POLLING_FREQUENCY"] = "64" compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_cpu_fallback = False compiler_cfg.default_df_override = pybuda._C.Float16_b compiler_cfg.default_dram_parameters = False @@ -555,8 +541,8 @@ def test_t5_pybuda_pipeline(variant, test_device): ) print(answer) + def test_t5_small_tiny_tile(test_device): - if test_device.arch == BackendDevice.Grayskull: pytest.skip("Grayskull test failing with TM ERROR (producer = matmul_49, consumer = matmul_53): input using kernel_broadcast but post-TM input canonical form is not periodic") @@ -565,7 +551,6 @@ def test_t5_small_tiny_tile(test_device): os.environ["PYBUDA_LEGACY_UBLOCK_SHAPE"] = "1" # Add PyBUDA configurations compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_cpu_fallback = False compiler_cfg.enable_auto_fusing = False # tenstorrent/pybuda#844 compiler_cfg.amp_level = 1 diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_whisper_0.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_whisper_0.py index 46bcde2b4..31192c5c4 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_whisper_0.py +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_whisper_0.py @@ -44,7 +44,6 @@ def generate_model_whisper_congen_hf_pytorch(test_device, variant): # Configurations compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_cpu_fallback = False # Run full model on silicon compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b os.environ["PYBUDA_PAD_OUTPUT_BUFFER"] = "1" @@ -53,8 +52,6 @@ def generate_model_whisper_congen_hf_pytorch(test_device, variant): if test_device.arch == BackendDevice.Wormhole_B0: compiler_cfg.amp_level = 1 - os.environ["PYBUDA_DISABLE_STREAM_OUTPUT"] = "1" # Disable streaming for LM head to output queue (perf) - class Wrapper(torch.nn.Module): def __init__(self, model): @@ -129,7 +126,7 @@ def forward(self, decoder_input_ids, encoder_hidden_states): 
return pybuda_model, [decoder_input_ids, encoder_outputs], {"pcc": pcc} - +@pytest.mark.skip(reason="Redundant") @pytest.mark.parametrize("variant", variants, ids=variants) def test_whisper(test_device, variant): pytest.skip("Already tested with past-cache and separated encoder-decoder") @@ -160,6 +157,7 @@ def test_whisper(test_device, variant): @pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.skip(reason="Redundant") def test_whisper_pipeline(test_device, variant): pytest.skip("Already tested with past-cache and separated encoder-decoder") if test_device.arch == BackendDevice.Grayskull: @@ -167,7 +165,6 @@ def test_whisper_pipeline(test_device, variant): # Configurations compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_auto_fusing = False # tenstorrent/pybuda#844 compiler_cfg.amp_level = 2 compiler_cfg.enable_link_past_cache_ios = False @@ -221,6 +218,7 @@ def test_whisper_pipeline(test_device, variant): assert cpu_out["text"] == tt_out["text"] @pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.skip(reason="Redundant") def test_whisper_encoder(test_device, variant): pytest.skip("Already tested with past-cache and separated encoder-decoder") @@ -229,7 +227,6 @@ def test_whisper_encoder(test_device, variant): # Configurations compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_cpu_fallback = False compiler_cfg.input_queues_on_host = True compiler_cfg.enable_link_past_cache_ios = True @@ -239,11 +236,9 @@ def test_whisper_encoder(test_device, variant): if test_device.arch == BackendDevice.Wormhole_B0: compiler_cfg.amp_level = 1 compiler_cfg.default_dram_parameters = False - os.environ["PYBUDA_DISABLE_STREAM_OUTPUT"] = "1" # Disable streaming for LM head to output queue (perf) os.environ["PYBUDA_PAD_OUTPUT_BUFFER"] = "1" os.environ["PYBUDA_PAD_OUTPUT_BUFFER_THRESHOLD_TILES"] = "1536" os.environ["PYBUDA_NLP_MANUAL_TARGET"] = "35000" - os.environ["PYBUDA_DISABLE_DYNAMIC_DRAM"] = "1" os.environ["TT_BACKEND_MULTI_THREADED_PUSH"] = "1" os.environ["TT_BACKEND_DRAM_POLLING_FREQUENCY"] = "64" os.environ["PYBUDA_NOP_ON_DIRECT_SHORT_PATH"] = "1" diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_whisper_1.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_whisper_1.py index de452e929..5253923a6 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_whisper_1.py +++ b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_whisper_1.py @@ -41,6 +41,7 @@ @pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.skip(reason="Redundant") def test_whisper_dec_past_cache(test_device, variant): pytest.skip("Already tested with past-cache and separated encoder-decoder") model, inputs, other = generate_model_whisper_decoder_past_cache(test_device, variant) @@ -73,45 +74,40 @@ def test_whisper_dec_past_cache(test_device, variant): if test_device.devtype != BackendType.Silicon: break + @pytest.mark.parametrize("variant", variants, ids=variants) def test_whisper_enc_dec(test_device, variant): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_cpu_fallback = False # Run full model on silicon compiler_cfg.input_queues_on_host = True compiler_cfg.compile_subgraphs = True compiler_cfg.enable_link_past_cache_ios = True compiler_cfg.backend_opt_level = 4 compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + compiler_cfg.balancer_policy 
= "Ribbon" os.environ["PYBUDA_FORCE_SEQUENTIAL"] = "1" - + os.environ["PYBUDA_RIBBON2"] = "1" if test_device.arch == BackendDevice.Wormhole_B0: compiler_cfg.amp_level = 1 - os.environ["PYBUDA_DISABLE_STREAM_OUTPUT"] = "1" # Disable streaming for LM head to output queue (perf) os.environ["PYBUDA_PAD_OUTPUT_BUFFER"] = "1" os.environ["PYBUDA_PAD_OUTPUT_BUFFER_THRESHOLD_TILES"] = "1536" - os.environ["PYBUDA_DISABLE_DYNAMIC_DRAM"] = "1" os.environ["TT_BACKEND_MULTI_THREADED_PUSH"] = "1" os.environ["TT_BACKEND_DRAM_POLLING_FREQUENCY"] = "64" os.environ["PYBUDA_NOP_ON_DIRECT_SHORT_PATH"] = "1" - os.environ["PYBUDA_NLP_MANUAL_TARGET_PER_SUBGRAPH"] = "0, 35000, 1, 35000, 2, 23000" os.environ["PYBUDA_SKIP_SMALL_UKT"] = "1" if variant == "openai/whisper-base": - os.environ["PYBUDA_NLP_MANUAL_TARGET_PER_SUBGRAPH"] = "0, 55000, 1, 35000, 2, 23000" os.environ["PYBUDA_GRAPHSOLVER_SELF_CUT_TYPE"] = "None" compiler_cfg.enable_auto_fusing = False if variant == "openai/whisper-small": - os.environ["PYBUDA_NLP_MANUAL_TARGET_PER_SUBGRAPH"] = "0, 35000, 1, 50000, 2, 23000" os.environ["PYBUDA_DISABLE_SELF_CUT_FOR_SUBGRAPHS"] = "1, 2" if variant == "openai/whisper-medium": os.environ["PYBUDA_GRAPHSOLVER_SELF_CUT_TYPE"] = "None" compiler_cfg.enable_auto_fusing = False - os.environ["PYBUDA_NLP_MANUAL_TARGET_PER_SUBGRAPH"] = "0, 125000, 1, 80000, 2, 30000" compiler_cfg.balancer_op_override("layernorm_66.dc.add.14", "t_stream_shape", (1,1)) compiler_cfg.balancer_op_override("layernorm_1193.dc.add.14", "t_stream_shape", (1,1)) @@ -121,11 +117,15 @@ def test_whisper_enc_dec(test_device, variant): elif test_device.arch == BackendDevice.Grayskull: compiler_cfg.enable_auto_fusing = False + if "large" in variant: + compiler_cfg.place_on_new_epoch("matmul_2805") + if "medium" in variant: + compiler_cfg.place_on_new_epoch("matmul_3295") if variant == "openai/whisper-base": compiler_cfg.amp_level = 1 else: - compiler_cfg.enable_enumerate_u_kt = False - os.environ["PYBUDA_NLP_MANUAL_TARGET"] = "2000000" + # compiler_cfg.enable_enumerate_u_kt = False + os.environ["PYBUDA_TEMP_RIBBON2_LEGACY_UTIL_EVAL"] = "1" run_encoder_on_tt = ("tiny" in variant) or ("base" in variant) or ("small" in variant) or ("medium" in variant) @@ -311,20 +311,18 @@ def test_whisper_enc_dec(test_device, variant): @pytest.mark.parametrize("variant", variants, ids=variants) +@pytest.mark.skip(reason="Redundant") def test_whisper_enc_dec_pipeline(test_device, variant): pytest.skip("Already tested with past-cache and separated encoder-decoder") compiler_cfg = _get_global_compiler_config() # compiler_cfg.amp_level = 1 - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_cpu_fallback = False # Run full model on silicon compiler_cfg.input_queues_on_host = True compiler_cfg.compile_subgraphs = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b compiler_cfg.enable_link_past_cache_ios = True os.environ["PYBUDA_FORCE_SEQUENTIAL"] = "1" - os.environ["PYBUDA_DISABLE_STREAM_OUTPUT"] = "1" # Disable streaming for LM head to output queue (perf) os.environ["PYBUDA_PAD_OUTPUT_BUFFER"] = "1" - os.environ["PYBUDA_DISABLE_DYNAMIC_DRAM"] = "1" os.environ["TT_BACKEND_MULTI_THREADED_PUSH"] = "1" os.environ["TT_BACKEND_DRAM_POLLING_FREQUENCY"] = "64" os.environ["TT_BACKEND_PROFILER"] = "1" diff --git a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_xglm.py b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_xglm.py index f438ca83d..1610372c5 100644 --- a/pybuda/test/model_demos/high_prio/nlp/pytorch/test_xglm.py +++ 
b/pybuda/test/model_demos/high_prio/nlp/pytorch/test_xglm.py @@ -52,6 +52,7 @@ def test_xglm_causal_lm(variant, test_device): return_tensors="pt", ) + pcc = 0.98 if test_device.devtype == BackendType.Silicon and test_device.arch == BackendDevice.Wormhole_B0 else 0.99 verify_module( pybuda.PyTorchModule("pt_xglm_causal_lm", model), input_shapes=[(input_tokens['input_ids'].shape, input_tokens['attention_mask'].shape,)], @@ -62,5 +63,6 @@ def test_xglm_causal_lm(variant, test_device): devmode=test_device.devmode, test_kind=TestKind.INFERENCE, chip_ids=NebulaGalaxy.chip_ids if "PYBUDA_NEB_GALAXY_CI" in os.environ and int(os.environ.get("PYBUDA_NEB_GALAXY_CI"))==1 else [0], + pcc=pcc, ) -) + ) diff --git a/pybuda/test/model_demos/models/deit.py b/pybuda/test/model_demos/models/deit.py index 93032b45d..ae8452029 100644 --- a/pybuda/test/model_demos/models/deit.py +++ b/pybuda/test/model_demos/models/deit.py @@ -13,7 +13,6 @@ def generate_model_deit_imgcls_hf_pytorch(test_device, variant): # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b compiler_cfg.balancer_policy = "Ribbon" diff --git a/pybuda/test/model_demos/models/dla.py b/pybuda/test/model_demos/models/dla.py new file mode 100644 index 000000000..dee9c7ffd --- /dev/null +++ b/pybuda/test/model_demos/models/dla.py @@ -0,0 +1,593 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +""" +# Code adapted from: +# https://github.com/ucbdrive/dla + +BSD 3-Clause License + +Copyright (c) 2018, Fisher Yu +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+""" + + +import torch +from torch import nn +import torch.utils.model_zoo as model_zoo + +from os.path import join +import math +from collections import namedtuple + + +BatchNorm = nn.BatchNorm2d +WEB_ROOT = "http://dl.yf.io/dla/models" +Dataset = namedtuple( + "Dataset", ["model_hash", "classes", "mean", "std", "eigval", "eigvec", "name"] +) + +imagenet = Dataset( + name="imagenet", + classes=1000, + mean=[0.485, 0.456, 0.406], + std=[0.229, 0.224, 0.225], + eigval=[55.46, 4.794, 1.148], + eigvec=[ + [-0.5675, 0.7192, 0.4009], + [-0.5808, -0.0045, -0.8140], + [-0.5836, -0.6948, 0.4203], + ], + model_hash={ + "dla34": "ba72cf86", + "dla46_c": "2bfd52c3", + "dla46x_c": "d761bae7", + "dla60x_c": "b870c45c", + "dla60": "24839fc4", + "dla60x": "d15cacda", + "dla102": "d94d9790", + "dla102x": "ad62be81", + "dla102x2": "262837b6", + "dla169": "0914e092", + }, +) + +datasets = {"imagenet": imagenet} + + +def get_model_url(data, name): + return join(WEB_ROOT, data.name, "{}-{}.pth".format(name, data.model_hash[name])) + + +def conv3x3(in_planes, out_planes, stride=1): + "3x3 convolution with padding" + return nn.Conv2d( + in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False + ) + + +class BasicBlock(nn.Module): + def __init__(self, inplanes, planes, stride=1, dilation=1): + super(BasicBlock, self).__init__() + self.conv1 = nn.Conv2d( + inplanes, + planes, + kernel_size=3, + stride=stride, + padding=dilation, + bias=False, + dilation=dilation, + ) + self.bn1 = BatchNorm(planes) + self.relu = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d( + planes, + planes, + kernel_size=3, + stride=1, + padding=dilation, + bias=False, + dilation=dilation, + ) + self.bn2 = BatchNorm(planes) + self.stride = stride + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + + out += residual + out = self.relu(out) + + return out + + +class Bottleneck(nn.Module): + expansion = 2 + + def __init__(self, inplanes, planes, stride=1, dilation=1): + super(Bottleneck, self).__init__() + expansion = Bottleneck.expansion + bottle_planes = planes // expansion + self.conv1 = nn.Conv2d(inplanes, bottle_planes, kernel_size=1, bias=False) + self.bn1 = BatchNorm(bottle_planes) + self.conv2 = nn.Conv2d( + bottle_planes, + bottle_planes, + kernel_size=3, + stride=stride, + padding=dilation, + bias=False, + dilation=dilation, + ) + self.bn2 = BatchNorm(bottle_planes) + self.conv3 = nn.Conv2d(bottle_planes, planes, kernel_size=1, bias=False) + self.bn3 = BatchNorm(planes) + self.relu = nn.ReLU(inplace=True) + self.stride = stride + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + out += residual + out = self.relu(out) + + return out + + +class BottleneckX(nn.Module): + expansion = 2 + cardinality = 32 + + def __init__(self, inplanes, planes, stride=1, dilation=1): + super(BottleneckX, self).__init__() + cardinality = BottleneckX.cardinality + # dim = int(math.floor(planes * (BottleneckV5.expansion / 64.0))) + # bottle_planes = dim * cardinality + bottle_planes = planes * cardinality // 32 + self.conv1 = nn.Conv2d(inplanes, bottle_planes, kernel_size=1, bias=False) + self.bn1 = BatchNorm(bottle_planes) + self.conv2 = nn.Conv2d( + bottle_planes, + bottle_planes, + 
kernel_size=3, + stride=stride, + padding=dilation, + bias=False, + dilation=dilation, + groups=cardinality, + ) + self.bn2 = BatchNorm(bottle_planes) + self.conv3 = nn.Conv2d(bottle_planes, planes, kernel_size=1, bias=False) + self.bn3 = BatchNorm(planes) + self.relu = nn.ReLU(inplace=True) + self.stride = stride + + def forward(self, x, residual=None): + if residual is None: + residual = x + + out = self.conv1(x) + out = self.bn1(out) + out = self.relu(out) + + out = self.conv2(out) + out = self.bn2(out) + out = self.relu(out) + + out = self.conv3(out) + out = self.bn3(out) + + out += residual + out = self.relu(out) + + return out + + +class Root(nn.Module): + def __init__(self, in_channels, out_channels, kernel_size, residual): + super(Root, self).__init__() + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size, + stride=1, + bias=False, + padding=(kernel_size - 1) // 2, + ) + self.bn = BatchNorm(out_channels) + self.relu = nn.ReLU(inplace=True) + self.residual = residual + + def forward(self, *x): + children = x + x = self.conv(torch.cat(x, 1)) + x = self.bn(x) + if self.residual: + x += children[0] + x = self.relu(x) + + return x + + +class Tree(nn.Module): + def __init__( + self, + levels, + block, + in_channels, + out_channels, + stride=1, + level_root=False, + root_dim=0, + root_kernel_size=1, + dilation=1, + root_residual=False, + ): + super(Tree, self).__init__() + if root_dim == 0: + root_dim = 2 * out_channels + if level_root: + root_dim += in_channels + if levels == 1: + self.tree1 = block(in_channels, out_channels, stride, dilation=dilation) + self.tree2 = block(out_channels, out_channels, 1, dilation=dilation) + else: + self.tree1 = Tree( + levels - 1, + block, + in_channels, + out_channels, + stride, + root_dim=0, + root_kernel_size=root_kernel_size, + dilation=dilation, + root_residual=root_residual, + ) + self.tree2 = Tree( + levels - 1, + block, + out_channels, + out_channels, + root_dim=root_dim + out_channels, + root_kernel_size=root_kernel_size, + dilation=dilation, + root_residual=root_residual, + ) + if levels == 1: + self.root = Root(root_dim, out_channels, root_kernel_size, root_residual) + self.level_root = level_root + self.root_dim = root_dim + self.downsample = None + self.project = None + self.levels = levels + if stride > 1: + self.downsample = nn.MaxPool2d(stride, stride=stride) + if in_channels != out_channels: + self.project = nn.Sequential( + nn.Conv2d( + in_channels, out_channels, kernel_size=1, stride=1, bias=False + ), + BatchNorm(out_channels), + ) + + def forward(self, x, residual=None, children=None): + children = [] if children is None else children + bottom = self.downsample(x) if self.downsample else x + residual = self.project(bottom) if self.project else bottom + if self.level_root: + children.append(bottom) + x1 = self.tree1(x, residual) + if self.levels == 1: + x2 = self.tree2(x1) + x = self.root(x2, x1, *children) + else: + children.append(x1) + x = self.tree2(x1, children=children) + return x + + +class DLA(nn.Module): + def __init__( + self, + levels, + channels, + num_classes=1000, + block=BasicBlock, + residual_root=False, + return_levels=False, + pool_size=7, + linear_root=False, + ): + super(DLA, self).__init__() + self.channels = channels + self.return_levels = return_levels + self.num_classes = num_classes + self.base_layer = nn.Sequential( + nn.Conv2d(3, channels[0], kernel_size=7, stride=1, padding=3, bias=False), + BatchNorm(channels[0]), + nn.ReLU(inplace=True), + ) + self.level0 = 
self._make_conv_level(channels[0], channels[0], levels[0]) + self.level1 = self._make_conv_level( + channels[0], channels[1], levels[1], stride=2 + ) + self.level2 = Tree( + levels[2], + block, + channels[1], + channels[2], + 2, + level_root=False, + root_residual=residual_root, + ) + self.level3 = Tree( + levels[3], + block, + channels[2], + channels[3], + 2, + level_root=True, + root_residual=residual_root, + ) + self.level4 = Tree( + levels[4], + block, + channels[3], + channels[4], + 2, + level_root=True, + root_residual=residual_root, + ) + self.level5 = Tree( + levels[5], + block, + channels[4], + channels[5], + 2, + level_root=True, + root_residual=residual_root, + ) + + self.avgpool = nn.AvgPool2d(pool_size) + self.fc = nn.Conv2d( + channels[-1], num_classes, kernel_size=1, stride=1, padding=0, bias=True + ) + + for m in self.modules(): + if isinstance(m, nn.Conv2d): + n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels + m.weight.data.normal_(0, math.sqrt(2.0 / n)) + elif isinstance(m, BatchNorm): + m.weight.data.fill_(1) + m.bias.data.zero_() + + def _make_level(self, block, inplanes, planes, blocks, stride=1): + downsample = None + if stride != 1 or inplanes != planes: + downsample = nn.Sequential( + nn.MaxPool2d(stride, stride=stride), + nn.Conv2d(inplanes, planes, kernel_size=1, stride=1, bias=False), + BatchNorm(planes), + ) + + layers = [] + layers.append(block(inplanes, planes, stride, downsample=downsample)) + for i in range(1, blocks): + layers.append(block(inplanes, planes)) + + return nn.Sequential(*layers) + + def _make_conv_level(self, inplanes, planes, convs, stride=1, dilation=1): + modules = [] + for i in range(convs): + modules.extend( + [ + nn.Conv2d( + inplanes, + planes, + kernel_size=3, + stride=stride if i == 0 else 1, + padding=dilation, + bias=False, + dilation=dilation, + ), + BatchNorm(planes), + nn.ReLU(inplace=True), + ] + ) + inplanes = planes + return nn.Sequential(*modules) + + def forward(self, x): + y = [] + x = self.base_layer(x) + for i in range(6): + x = getattr(self, "level{}".format(i))(x) + y.append(x) + if self.return_levels: + return y + else: + x = self.avgpool(x) + x = self.fc(x) + x = x.view(x.size(0), -1) + + return x + + def load_pretrained_model(self, data_name, name): + data = datasets[data_name] + fc = self.fc + if self.num_classes != data.classes: + self.fc = nn.Conv2d( + self.channels[-1], + data.classes, + kernel_size=1, + stride=1, + padding=0, + bias=True, + ) + try: + model_url = get_model_url(data, name) + except KeyError: + raise ValueError("{} trained on {} does not exist.".format(data.name, name)) + self.load_state_dict(model_zoo.load_url(model_url)) + self.fc = fc + + +def dla34(pretrained=None, **kwargs): # DLA-34 + model = DLA( + [1, 1, 1, 2, 2, 1], [16, 32, 64, 128, 256, 512], block=BasicBlock, **kwargs + ) + if pretrained is not None: + model.load_pretrained_model(pretrained, "dla34") + return model + + +def dla46_c(pretrained=None, **kwargs): # DLA-46-C + Bottleneck.expansion = 2 + model = DLA( + [1, 1, 1, 2, 2, 1], [16, 32, 64, 64, 128, 256], block=Bottleneck, **kwargs + ) + if pretrained is not None: + model.load_pretrained_model(pretrained, "dla46_c") + return model + + +def dla46x_c(pretrained=None, **kwargs): # DLA-X-46-C + BottleneckX.expansion = 2 + model = DLA( + [1, 1, 1, 2, 2, 1], [16, 32, 64, 64, 128, 256], block=BottleneckX, **kwargs + ) + if pretrained is not None: + model.load_pretrained_model(pretrained, "dla46x_c") + return model + + +def dla60x_c(pretrained=None, **kwargs): # DLA-X-60-C + 
BottleneckX.expansion = 2 + model = DLA( + [1, 1, 1, 2, 3, 1], [16, 32, 64, 64, 128, 256], block=BottleneckX, **kwargs + ) + if pretrained is not None: + model.load_pretrained_model(pretrained, "dla60x_c") + return model + + +def dla60(pretrained=None, **kwargs): # DLA-60 + Bottleneck.expansion = 2 + model = DLA( + [1, 1, 1, 2, 3, 1], [16, 32, 128, 256, 512, 1024], block=Bottleneck, **kwargs + ) + if pretrained is not None: + model.load_pretrained_model(pretrained, "dla60") + return model + + +def dla60x(pretrained=None, **kwargs): # DLA-X-60 + BottleneckX.expansion = 2 + model = DLA( + [1, 1, 1, 2, 3, 1], [16, 32, 128, 256, 512, 1024], block=BottleneckX, **kwargs + ) + if pretrained is not None: + model.load_pretrained_model(pretrained, "dla60x") + return model + + +def dla102(pretrained=None, **kwargs): # DLA-102 + Bottleneck.expansion = 2 + model = DLA( + [1, 1, 1, 3, 4, 1], + [16, 32, 128, 256, 512, 1024], + block=Bottleneck, + residual_root=True, + **kwargs, + ) + if pretrained is not None: + model.load_pretrained_model(pretrained, "dla102") + return model + + +def dla102x(pretrained=None, **kwargs): # DLA-X-102 + BottleneckX.expansion = 2 + model = DLA( + [1, 1, 1, 3, 4, 1], + [16, 32, 128, 256, 512, 1024], + block=BottleneckX, + residual_root=True, + **kwargs, + ) + if pretrained is not None: + model.load_pretrained_model(pretrained, "dla102x") + return model + + +def dla102x2(pretrained=None, **kwargs): # DLA-X-102 64 + BottleneckX.cardinality = 64 + model = DLA( + [1, 1, 1, 3, 4, 1], + [16, 32, 128, 256, 512, 1024], + block=BottleneckX, + residual_root=True, + **kwargs, + ) + if pretrained is not None: + model.load_pretrained_model(pretrained, "dla102x2") + return model + + +def dla169(pretrained=None, **kwargs): # DLA-169 + Bottleneck.expansion = 2 + model = DLA( + [1, 1, 2, 3, 5, 1], + [16, 32, 128, 256, 512, 1024], + block=Bottleneck, + residual_root=True, + **kwargs, + ) + if pretrained is not None: + model.load_pretrained_model(pretrained, "dla169") + return model diff --git a/pybuda/test/model_demos/models/falcon/pybudify.py b/pybuda/test/model_demos/models/falcon/pybudify.py index 499919b7e..8a783b4e7 100644 --- a/pybuda/test/model_demos/models/falcon/pybudify.py +++ b/pybuda/test/model_demos/models/falcon/pybudify.py @@ -77,12 +77,6 @@ def __init__( # os.environ["PYBUDA_DISABLE_INTERACTIVE_PLACER"] = "1" # Until interactive placer supports multi-chip placement overrides # os.environ["PYBUDA_PLACER_SNAKE"] = "1" # os.environ["PYBUDA_ETH_LINKS_NEBULA"] = "1" - # os.environ["PYBUDA_DISABLE_DYNAMIC_DRAM"] = "1" - - if self.odkv or self.masked_odkv: - os.environ[ - "PYBUDA_DISABLE_DYNAMIC_DRAM" - ] = "1" # much better performance, not sure why? 
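# --- Illustrative aside (not part of the patch): a minimal sketch, assuming only
# the PyBuda API names that appear elsewhere in these tests, of what a per-test
# configuration reduces to after this cleanup. enable_t_streaming and the forced
# PYBUDA_DISABLE_DYNAMIC_DRAM override are deliberately absent, since the patch
# removes them across the model demos; the Ribbon balancer settings below appear
# verbatim in the updated tests.
import os
import pybuda

compiler_cfg = pybuda.config._get_global_compiler_config()
compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b
compiler_cfg.balancer_policy = "Ribbon"
os.environ["PYBUDA_RIBBON2"] = "1"
# --- end aside ---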
pybuda = self.pybuda = __import__( "pybuda" @@ -372,7 +366,6 @@ def __init__( if self.masked_odkv: # print('masked_odkv') - # compiler_cfg.enable_t_streaming = True # compiler_cfg.manual_t_streaming = True # pybuda.config.override_t_stream_dir(f"concatenate_50.dc.sparse_matmul.4.lc2", "c") @@ -416,10 +409,8 @@ def __init__( compiler_cfg.loopback_outputs = names_dict elif self.odkv: - - # compiler_cfg.enable_t_streaming = True # compiler_cfg.manual_t_streaming = True - + # pybuda.config.override_t_stream_dir(f"concatenate_50.dc.sparse_matmul.4.lc2", "c") # pybuda.config.override_t_stream_dir(f"concatenate_67.dc.sparse_matmul.4.lc2", "c") @@ -450,12 +441,9 @@ def __init__( # input_df={0: [pybuda.DataFormat.Bfp8_b, True], 1: [pybuda.DataFormat.Bfp8_b, True], 2: [pybuda.DataFormat.Bfp8_b, True]}) compiler_cfg.loopback_outputs = names_dict - else: - compiler_cfg.enable_t_streaming = True pybuda_arch = { "grayskull": pybuda.BackendDevice.Grayskull, - "wormhole": pybuda.BackendDevice.Wormhole, "wormhole_b0": pybuda.BackendDevice.Wormhole_B0, }[arch] diff --git a/pybuda/test/model_demos/models/ghostnet.py b/pybuda/test/model_demos/models/ghostnet.py index 46c4cf99a..a9feae6d2 100644 --- a/pybuda/test/model_demos/models/ghostnet.py +++ b/pybuda/test/model_demos/models/ghostnet.py @@ -14,7 +14,6 @@ def generate_model_ghostnet_imgcls_timm(test_device, variant): # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b compiler_cfg.balancer_policy = "Ribbon" os.environ["PYBUDA_RIBBON2"] = "1" diff --git a/pybuda/test/model_demos/models/t5.py b/pybuda/test/model_demos/models/t5.py index 72060b9b4..9be295689 100644 --- a/pybuda/test/model_demos/models/t5.py +++ b/pybuda/test/model_demos/models/t5.py @@ -49,28 +49,16 @@ def forward(self, decoder_input_ids, decoder_attention_mask, encoder_last_hidden def generate_t5_past_cache_enc_dec(test_device, variant): os.environ["PYBUDA_PAD_OUTPUT_BUFFER"] = "1" - os.environ["TT_BACKEND_MULTI_THREADED_PUSH"] = "1" - os.environ["PYBUDA_DISABLE_DYNAMIC_DRAM"] = "1" - os.environ["PYBUDA_EXTRA_L1_MARGIN"] = "120000" os.environ["PYBUDA_FORCE_SEQUENTIAL"] = "1" - if "flan" in variant: - os.environ["PYBUDA_NLP_MANUAL_TARGET"] = "35000" - else: - os.environ["PYBUDA_NLP_MANUAL_TARGET"] = "26000" os.environ["TT_BACKEND_DRAM_POLLING_FREQUENCY"] = "64" - os.environ["TT_BACKEND_PROFILER"] = "1" os.environ["TT_BACKEND_EPOCH_BIN_NUM_SLOTS"] = "64" os.environ["PYBUDA_ROTATE_PAST_CACHE_PARAMS"] = "1" compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_cpu_fallback = False compiler_cfg.default_df_override = pybuda._C.Float16_b compiler_cfg.default_dram_parameters = False - compiler_cfg.input_queues_on_host = True - #compiler_cfg.enable_auto_fusing = False compiler_cfg.enable_amp_light() compiler_cfg.compile_subgraphs = True - #compiler_cfg.enable_enumerate_u_kt = False compiler_cfg.enable_link_past_cache_ios = True model_name = variant diff --git a/pybuda/test/model_demos/models/tri_basic_2/model/__init__.py b/pybuda/test/model_demos/models/tri_basic_2/model/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pybuda/test/model_demos/models/tri_basic_2/model/semseg.py b/pybuda/test/model_demos/models/tri_basic_2/model/semseg.py new file mode 100644 index 000000000..f575869c4 --- /dev/null +++ 
b/pybuda/test/model_demos/models/tri_basic_2/model/semseg.py @@ -0,0 +1,145 @@ +# Copyright 2023 Toyota Research Institute. All rights reserved. + +import math +from typing import List + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class BasicResidualBlock(nn.Module): + + def __init__(self, *, in_channels, out_channels, stride=1, dilation_rate=1): + super().__init__() + if in_channels == out_channels and stride == 1: + self.shortcut = nn.Identity() + else: + self.shortcut = nn.Sequential( + nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(out_channels), + ) + + self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=stride, padding=dilation_rate, + dilation=dilation_rate, bias=False) + self.norm1 = nn.BatchNorm2d(out_channels) + self.activation = nn.ReLU(inplace=True) + self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=dilation_rate, dilation=dilation_rate, + bias=False) + self.norm2 = nn.BatchNorm2d(out_channels) + + def forward(self, inputs): + shortcut = self.shortcut(inputs) + + outputs = self.activation(self.norm1(self.conv1(inputs))) + outputs = self.norm2(self.conv2(outputs)) + outputs = outputs + shortcut + outputs = self.activation(outputs) + return outputs + + +def resnet_group(*, block_func, in_channels, out_channels, stride, num_blocks, dilation_rates=[1]): + assert len(dilation_rates) > 0 + + residual_blocks = [ + block_func(in_channels=in_channels, out_channels=out_channels, stride=stride, dilation_rate=dilation_rates[0]) + ] + for idx in range(1, num_blocks): + residual_blocks.append(block_func(in_channels=out_channels, out_channels=out_channels, stride=1, + dilation_rate=dilation_rates[idx % len(dilation_rates)])) + return nn.Sequential(*residual_blocks) + + +class Fpn(nn.Module): + + def __init__(self, *, in_channels, out_channels): + super().__init__() + + idxs = [] + convs = [] + for idx, channels in enumerate(in_channels): + idxs.append(idx) + convs.append(nn.Conv2d(channels, out_channels, kernel_size=1, bias=True)) + self.idxs = idxs[::-1] + self.convs = nn.ModuleList(convs[::-1]) + + self.upsample2 = nn.Upsample(scale_factor=2, mode="bilinear", align_corners=False) + + def forward(self, group_outputs: List[torch.Tensor]): + outputs = None + for idx, module in enumerate(self.convs): + current = module(group_outputs[self.idxs[idx]]) + if outputs is None: + outputs = current + else: + outputs = self.upsample2(outputs) + current + + return outputs + + +class BasicResNet(nn.Module): + + def __init__(self, hparams, *, num_blocks, num_channels, dilation_rates): + super().__init__() + assert len(num_blocks) == 4 + assert len(num_channels) == len(num_blocks) + assert len(dilation_rates) == len(num_blocks) + + self.num_channels = num_channels + + self.conv_in = nn.Conv2d(3, num_channels[0], kernel_size=7, padding=3, stride=2, bias=False) + self.norm_in = nn.BatchNorm2d(num_channels[0]) + self.activation_in = nn.ReLU(inplace=True) + self.pool_in = nn.MaxPool2d(kernel_size=2) + + self.group1 = resnet_group(block_func=BasicResidualBlock, in_channels=num_channels[0], + out_channels=num_channels[0], stride=1, num_blocks=num_blocks[0], + dilation_rates=dilation_rates[0]) + self.group2 = resnet_group(block_func=BasicResidualBlock, in_channels=num_channels[0], + out_channels=num_channels[1], stride=2, num_blocks=num_blocks[1], + dilation_rates=dilation_rates[1]) + self.group3 = resnet_group(block_func=BasicResidualBlock, in_channels=num_channels[1], + 
out_channels=num_channels[2], stride=2, num_blocks=num_blocks[2], + dilation_rates=dilation_rates[2]) + self.group4 = resnet_group(block_func=BasicResidualBlock, in_channels=num_channels[2], + out_channels=num_channels[3], stride=2, num_blocks=num_blocks[3], + dilation_rates=dilation_rates[3]) + + self.head = Fpn(in_channels=num_channels, out_channels=hparams.num_classes) + + self.upsample = nn.Upsample(scale_factor=4, mode="bilinear", align_corners=False) + + def get_output_channels(self): + return self.num_channels + + def forward(self, inputs): + _, _, h, w = inputs.shape + + vpad = math.ceil(h / 32) * 32 - h + top_pad = vpad // 2 + bottom_pad = vpad - top_pad + hpad = math.ceil(w / 32) * 32 - w + left_pad = hpad // 2 + right_pad = hpad - left_pad + + inputs = F.pad(inputs, (left_pad, right_pad, top_pad, bottom_pad)) + + outputs = self.pool_in(self.activation_in(self.norm_in(self.conv_in(inputs)))) + group1_outputs = self.group1(outputs) + group2_outputs = self.group2(group1_outputs) + group3_outputs = self.group3(group2_outputs) + group4_outputs = self.group4(group3_outputs) + + outputs = [group1_outputs, group2_outputs, group3_outputs, group4_outputs] + logits = self.upsample(self.head(outputs)) + + logits = logits[:, :, top_pad:top_pad + h, left_pad:left_pad + w] + + return logits + + +def resnet34_semseg(hparams): + return BasicResNet(hparams, + num_blocks=[3, 4, 6, 3], + num_channels=[64, 128, 256, 512], + dilation_rates=[[1], [1], [1, 1, 2, 5, 9, 17], [1]]) diff --git a/pybuda/test/model_demos/models/whisper.py b/pybuda/test/model_demos/models/whisper.py index be67080a2..a3ab883e1 100644 --- a/pybuda/test/model_demos/models/whisper.py +++ b/pybuda/test/model_demos/models/whisper.py @@ -69,7 +69,6 @@ def forward(self, decoder_input_ids, decoder_attention_mask, encoder_last_hidden def generate_model_whisper_decoder_past_cache(test_device, variant): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_cpu_fallback = False # Run full model on silicon compiler_cfg.input_queues_on_host = True compiler_cfg.enable_link_past_cache_ios = True @@ -80,9 +79,7 @@ def generate_model_whisper_decoder_past_cache(test_device, variant): if test_device.arch == BackendDevice.Wormhole_B0: compiler_cfg.amp_level = 1 - os.environ["PYBUDA_DISABLE_STREAM_OUTPUT"] = "1" # Disable streaming for LM head to output queue (perf) os.environ["PYBUDA_PAD_OUTPUT_BUFFER"] = "1" - os.environ["PYBUDA_DISABLE_DYNAMIC_DRAM"] = "1" os.environ["TT_BACKEND_MULTI_THREADED_PUSH"] = "1" os.environ["TT_BACKEND_DRAM_POLLING_FREQUENCY"] = "64" @@ -137,33 +134,19 @@ def generate_model_whisper_decoder_past_cache(test_device, variant): def generate_model_whisper_enc_dec(test_device, variant): compiler_cfg = _get_global_compiler_config() compiler_cfg.amp_level = 1 - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_cpu_fallback = False # Run full model on silicon - compiler_cfg.input_queues_on_host = True compiler_cfg.compile_subgraphs = True compiler_cfg.enable_link_past_cache_ios = True - compiler_cfg.backend_opt_level = 3 - #compiler_cfg.enable_auto_fusing = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b os.environ["PYBUDA_FORCE_SEQUENTIAL"] = "1" - os.environ["PYBUDA_DISABLE_STREAM_OUTPUT"] = "1" # Disable streaming for LM head to output queue (perf) os.environ["PYBUDA_PAD_OUTPUT_BUFFER"] = "1" os.environ["PYBUDA_PAD_OUTPUT_BUFFER_THRESHOLD_TILES"] = "1536" - os.environ["PYBUDA_DISABLE_DYNAMIC_DRAM"] = "1" - 
os.environ["TT_BACKEND_MULTI_THREADED_PUSH"] = "1" - os.environ["TT_BACKEND_DRAM_POLLING_FREQUENCY"] = "64" - os.environ["TT_BACKEND_PROFILER"] = "1" - os.environ["PYBUDA_NOP_ON_DIRECT_SHORT_PATH"] = "1" - if variant == "openai/whisper-base": os.environ["PYBUDA_GRAPHSOLVER_SELF_CUT_TYPE"] = "None" compiler_cfg.enable_auto_fusing = False - if variant == "openai/whisper-medium" or variant == "openai/whisper-large": - os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = "0" - run_encoder_on_tt = ("tiny" in variant) or ("base" in variant) or ("small" in variant) pad_model = True diff --git a/pybuda/test/model_demos/models/wideresnet.py b/pybuda/test/model_demos/models/wideresnet.py index a1e352dba..68ff26bfd 100644 --- a/pybuda/test/model_demos/models/wideresnet.py +++ b/pybuda/test/model_demos/models/wideresnet.py @@ -16,7 +16,6 @@ def generate_model_wideresnet_imgcls_pytorch(test_device, variant): # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b compiler_cfg.balancer_policy = "Ribbon" os.environ["PYBUDA_RIBBON2"] = "1" @@ -46,8 +45,7 @@ def generate_model_wideresnet_imgcls_pytorch(test_device, variant): def generate_model_wideresnet_imgcls_timm(test_device, variant): # STEP 1: Set PyBuda configuration parameters - compiler_cfg = (pybuda.config._get_global_compiler_config()) - compiler_cfg.enable_t_streaming = True + compiler_cfg = (pybuda.config._get_global_compiler_config()) compiler_cfg.balancer_policy = "Ribbon" compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b os.environ["PYBUDA_RIBBON2"] = "1" diff --git a/pybuda/test/model_demos/models/xception.py b/pybuda/test/model_demos/models/xception.py index c40e4673b..65ef958a8 100644 --- a/pybuda/test/model_demos/models/xception.py +++ b/pybuda/test/model_demos/models/xception.py @@ -15,7 +15,6 @@ def generate_model_xception_imgcls_timm(test_device, variant): # STEP 1: Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b if variant == "xception" and test_device.arch == BackendDevice.Wormhole_B0: diff --git a/pybuda/test/model_demos/utils/cnn/pytorch/images/car.jpg b/pybuda/test/model_demos/utils/cnn/pytorch/images/car.jpg new file mode 100644 index 000000000..b829107b8 Binary files /dev/null and b/pybuda/test/model_demos/utils/cnn/pytorch/images/car.jpg differ diff --git a/pybuda/test/nightly/cnn/building_blocks/test_mobilenet.py b/pybuda/test/nightly/cnn/building_blocks/test_mobilenet.py index d0091fe4d..971ee9079 100644 --- a/pybuda/test/nightly/cnn/building_blocks/test_mobilenet.py +++ b/pybuda/test/nightly/cnn/building_blocks/test_mobilenet.py @@ -121,8 +121,8 @@ def forward(self, x): ) @pytest.mark.parametrize( "arch", - (BackendDevice.Grayskull, BackendDevice.Wormhole), - ids=["Grayskull", "Wormhole"] + (BackendDevice.Grayskull, BackendDevice.Wormhole_B0), + ids=["Grayskull", "Wormhole_B0"] ) def test_mobilenet_v1_depthwise_separable_conv( image_size, @@ -143,18 +143,18 @@ def test_mobilenet_v1_depthwise_separable_conv( (224, 2, 32, 64, 1, 0.25, BackendDevice.Grayskull), (192, 2, 32, 64, 1, 0.25, BackendDevice.Grayskull), (160, 2, 32, 64, 1, 0.25, BackendDevice.Grayskull), - (224, 2, 32, 64, 1, 1, BackendDevice.Wormhole), - (192, 2, 32, 64, 1, 1, BackendDevice.Wormhole), - (160, 2, 32, 64, 1, 1, 
BackendDevice.Wormhole), - (128, 2, 32, 64, 1, 1, BackendDevice.Wormhole), - (224, 2, 32, 64, 1, 0.75, BackendDevice.Wormhole), - (192, 2, 32, 64, 1, 0.75, BackendDevice.Wormhole), - (160, 2, 32, 64, 1, 0.75, BackendDevice.Wormhole), - (128, 2, 32, 64, 1, 0.75, BackendDevice.Wormhole), - (224, 2, 32, 64, 1, 0.25, BackendDevice.Wormhole), - (192, 2, 32, 64, 1, 0.25, BackendDevice.Wormhole), - (160, 2, 32, 64, 1, 0.25, BackendDevice.Wormhole), - (128, 2, 32, 64, 1, 0.25, BackendDevice.Wormhole), + (224, 2, 32, 64, 1, 1, BackendDevice.Wormhole_B0), + (192, 2, 32, 64, 1, 1, BackendDevice.Wormhole_B0), + (160, 2, 32, 64, 1, 1, BackendDevice.Wormhole_B0), + (128, 2, 32, 64, 1, 1, BackendDevice.Wormhole_B0), + (224, 2, 32, 64, 1, 0.75, BackendDevice.Wormhole_B0), + (192, 2, 32, 64, 1, 0.75, BackendDevice.Wormhole_B0), + (160, 2, 32, 64, 1, 0.75, BackendDevice.Wormhole_B0), + (128, 2, 32, 64, 1, 0.75, BackendDevice.Wormhole_B0), + (224, 2, 32, 64, 1, 0.25, BackendDevice.Wormhole_B0), + (192, 2, 32, 64, 1, 0.25, BackendDevice.Wormhole_B0), + (160, 2, 32, 64, 1, 0.25, BackendDevice.Wormhole_B0), + (128, 2, 32, 64, 1, 0.25, BackendDevice.Wormhole_B0), ] if (image_size, input_size_divider, in_channels_base, out_channels_base, @@ -162,7 +162,6 @@ def test_mobilenet_v1_depthwise_separable_conv( pytest.skip(msg="This combination is expected to fail, moved to _xfail version of the function.") compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" in_channels = int(in_channels_base * width_multiplier) @@ -199,18 +198,18 @@ def test_mobilenet_v1_depthwise_separable_conv( (224, 2, 32, 64, 1, 0.25, BackendDevice.Grayskull), (192, 2, 32, 64, 1, 0.25, BackendDevice.Grayskull), (160, 2, 32, 64, 1, 0.25, BackendDevice.Grayskull), - (224, 2, 32, 64, 1, 1, BackendDevice.Wormhole), - (192, 2, 32, 64, 1, 1, BackendDevice.Wormhole), - (160, 2, 32, 64, 1, 1, BackendDevice.Wormhole), - (128, 2, 32, 64, 1, 1, BackendDevice.Wormhole), - (224, 2, 32, 64, 1, 0.75, BackendDevice.Wormhole), - (192, 2, 32, 64, 1, 0.75, BackendDevice.Wormhole), - (160, 2, 32, 64, 1, 0.75, BackendDevice.Wormhole), - (128, 2, 32, 64, 1, 0.75, BackendDevice.Wormhole), - (224, 2, 32, 64, 1, 0.25, BackendDevice.Wormhole), - (192, 2, 32, 64, 1, 0.25, BackendDevice.Wormhole), - (160, 2, 32, 64, 1, 0.25, BackendDevice.Wormhole), - (128, 2, 32, 64, 1, 0.25, BackendDevice.Wormhole), + (224, 2, 32, 64, 1, 1, BackendDevice.Wormhole_B0), + (192, 2, 32, 64, 1, 1, BackendDevice.Wormhole_B0), + (160, 2, 32, 64, 1, 1, BackendDevice.Wormhole_B0), + (128, 2, 32, 64, 1, 1, BackendDevice.Wormhole_B0), + (224, 2, 32, 64, 1, 0.75, BackendDevice.Wormhole_B0), + (192, 2, 32, 64, 1, 0.75, BackendDevice.Wormhole_B0), + (160, 2, 32, 64, 1, 0.75, BackendDevice.Wormhole_B0), + (128, 2, 32, 64, 1, 0.75, BackendDevice.Wormhole_B0), + (224, 2, 32, 64, 1, 0.25, BackendDevice.Wormhole_B0), + (192, 2, 32, 64, 1, 0.25, BackendDevice.Wormhole_B0), + (160, 2, 32, 64, 1, 0.25, BackendDevice.Wormhole_B0), + (128, 2, 32, 64, 1, 0.25, BackendDevice.Wormhole_B0), ] ) def test_mobilenet_v1_depthwise_separable_conv_xfail( @@ -223,7 +222,6 @@ def test_mobilenet_v1_depthwise_separable_conv_xfail( arch ): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" in_channels = int(in_channels_base * width_multiplier) @@ -269,8 +267,8 @@ def test_mobilenet_v1_depthwise_separable_conv_xfail( ) @pytest.mark.parametrize( "arch", - (BackendDevice.Grayskull, 
BackendDevice.Wormhole), - ids=["Grayskull", "Wormhole"] + (BackendDevice.Grayskull, BackendDevice.Wormhole_B0), + ids=["Grayskull", "Wormhole_B0"] ) def test_mobilenet_v2_inverted_residual( image_size, @@ -286,11 +284,11 @@ def test_mobilenet_v2_inverted_residual( (192, 16, 64, 96, 1, 0.25, 6, BackendDevice.Grayskull), (224, 32, 160, 320, 1, 0.75, 7, BackendDevice.Grayskull), (192, 32, 160, 320, 1, 0.75, 7, BackendDevice.Grayskull), - (192, 16, 64, 96, 1, 0.25, 6, BackendDevice.Wormhole), - (224, 32, 160, 320, 1, 0.75, 7, BackendDevice.Wormhole), - (192, 32, 160, 320, 1, 0.75, 7, BackendDevice.Wormhole), - (160, 32, 160, 320, 1, 0.75, 7, BackendDevice.Wormhole), - (128, 32, 160, 320, 1, 0.75, 7, BackendDevice.Wormhole) + (192, 16, 64, 96, 1, 0.25, 6, BackendDevice.Wormhole_B0), + (224, 32, 160, 320, 1, 0.75, 7, BackendDevice.Wormhole_B0), + (192, 32, 160, 320, 1, 0.75, 7, BackendDevice.Wormhole_B0), + (160, 32, 160, 320, 1, 0.75, 7, BackendDevice.Wormhole_B0), + (128, 32, 160, 320, 1, 0.75, 7, BackendDevice.Wormhole_B0) ] if (image_size, input_size_divider, in_channels_base, out_channels_base, @@ -298,7 +296,6 @@ def test_mobilenet_v2_inverted_residual( pytest.skip(msg="This combination is expected to fail, moved to _xfail version of the function.") compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" in_channels = int(in_channels_base * width_multiplier) @@ -329,11 +326,11 @@ def test_mobilenet_v2_inverted_residual( (192, 16, 64, 96, 1, 0.25, 6, BackendDevice.Grayskull), (224, 32, 160, 320, 1, 0.75, 7, BackendDevice.Grayskull), (192, 32, 160, 320, 1, 0.75, 7, BackendDevice.Grayskull), - (192, 16, 64, 96, 1, 0.25, 6, BackendDevice.Wormhole), - (224, 32, 160, 320, 1, 0.75, 7, BackendDevice.Wormhole), - (192, 32, 160, 320, 1, 0.75, 7, BackendDevice.Wormhole), - (160, 32, 160, 320, 1, 0.75, 7, BackendDevice.Wormhole), - (128, 32, 160, 320, 1, 0.75, 7, BackendDevice.Wormhole) + (192, 16, 64, 96, 1, 0.25, 6, BackendDevice.Wormhole_B0), + (224, 32, 160, 320, 1, 0.75, 7, BackendDevice.Wormhole_B0), + (192, 32, 160, 320, 1, 0.75, 7, BackendDevice.Wormhole_B0), + (160, 32, 160, 320, 1, 0.75, 7, BackendDevice.Wormhole_B0), + (128, 32, 160, 320, 1, 0.75, 7, BackendDevice.Wormhole_B0) ] ) def test_mobilenet_v2_inverted_residual_xfail( @@ -347,7 +344,6 @@ def test_mobilenet_v2_inverted_residual_xfail( arch ): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" in_channels = int(in_channels_base * width_multiplier) diff --git a/pybuda/test/nightly/cnn/building_blocks/test_resnet.py b/pybuda/test/nightly/cnn/building_blocks/test_resnet.py index 578ee2178..5b083be23 100644 --- a/pybuda/test/nightly/cnn/building_blocks/test_resnet.py +++ b/pybuda/test/nightly/cnn/building_blocks/test_resnet.py @@ -132,10 +132,9 @@ def __init__(self, block_expansion=1, num_classes=1000): @pytest.mark.parametrize("input_size", [128, 256, 512]) @pytest.mark.parametrize("input_channels", [1, 3]) -@pytest.mark.parametrize("arch", [BackendDevice.Grayskull, BackendDevice.Wormhole]) +@pytest.mark.parametrize("arch", [BackendDevice.Grayskull, BackendDevice.Wormhole_B0]) def test_resnet_input_block(input_size, input_channels, arch): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" model = ResnetInputBlock(input_channels) @@ -158,10 +157,9 @@ def test_resnet_input_block(input_size, input_channels, arch): 
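# --- Illustrative aside (not part of the patch): the recurring change in these
# nightly building-block tests is retiring BackendDevice.Wormhole in favour of
# BackendDevice.Wormhole_B0. A minimal sketch of the resulting arch lookup,
# mirroring the mapping the patch leaves behind in falcon/pybudify.py:
import pybuda

pybuda_arch = {
    "grayskull": pybuda.BackendDevice.Grayskull,
    "wormhole_b0": pybuda.BackendDevice.Wormhole_B0,
}["wormhole_b0"]  # only Grayskull and Wormhole_B0 remain valid targets
# --- end aside ---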
@pytest.mark.parametrize("input_channels", [512, 2048]) @pytest.mark.parametrize("num_classes", [10, 100, 1000]) -@pytest.mark.parametrize("arch", [BackendDevice.Grayskull, BackendDevice.Wormhole]) +@pytest.mark.parametrize("arch", [BackendDevice.Grayskull, BackendDevice.Wormhole_B0]) def test_resnet_output_block(input_channels, num_classes, arch): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" model = ResnetOutputBlock(input_channels//512, num_classes) @@ -189,23 +187,22 @@ def test_resnet_output_block(input_channels, num_classes, arch): ) @pytest.mark.parametrize( "arch", - [BackendDevice.Grayskull, BackendDevice.Wormhole] + [BackendDevice.Grayskull, BackendDevice.Wormhole_B0] ) def test_resnet_basic_block(input_size, in_channels, out_channels, stride, arch): expected_to_fail = [ (14, 256, 256, 1, BackendDevice.Grayskull), (7, 512, 512, 1, BackendDevice.Grayskull), - (56, 64, 64, 1, BackendDevice.Wormhole), - (28, 128, 128, 1, BackendDevice.Wormhole), - (14, 256, 256, 1, BackendDevice.Wormhole), - (7, 512, 512, 1, BackendDevice.Wormhole) + (56, 64, 64, 1, BackendDevice.Wormhole_B0), + (28, 128, 128, 1, BackendDevice.Wormhole_B0), + (14, 256, 256, 1, BackendDevice.Wormhole_B0), + (7, 512, 512, 1, BackendDevice.Wormhole_B0) ] if (input_size, in_channels, out_channels, stride, arch) in expected_to_fail: pytest.skip(msg="This combination is expected to fail, moved to _xfail version of the function.") compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" model = BasicResidualBlock(in_channels, out_channels, stride) @@ -231,15 +228,14 @@ def test_resnet_basic_block(input_size, in_channels, out_channels, stride, arch) [ (14, 256, 256, 1, BackendDevice.Grayskull), (7, 512, 512, 1, BackendDevice.Grayskull), - (56, 64, 64, 1, BackendDevice.Wormhole), - (28, 128, 128, 1, BackendDevice.Wormhole), - (14, 256, 256, 1, BackendDevice.Wormhole), - (7, 512, 512, 1, BackendDevice.Wormhole) + (56, 64, 64, 1, BackendDevice.Wormhole_B0), + (28, 128, 128, 1, BackendDevice.Wormhole_B0), + (14, 256, 256, 1, BackendDevice.Wormhole_B0), + (7, 512, 512, 1, BackendDevice.Wormhole_B0) ] ) def test_resnet_basic_block_xfail(input_size, in_channels, out_channels, stride, arch): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" model = BasicResidualBlock(in_channels, out_channels, stride) @@ -267,7 +263,7 @@ def test_resnet_basic_block_xfail(input_size, in_channels, out_channels, stride, ) @pytest.mark.parametrize( "arch", - [BackendDevice.Grayskull, BackendDevice.Wormhole] + [BackendDevice.Grayskull, BackendDevice.Wormhole_B0] ) def test_resnet_bottleneck_block(input_size, in_channels, out_channels, stride, arch): expected_to_fail = [ @@ -275,17 +271,16 @@ def test_resnet_bottleneck_block(input_size, in_channels, out_channels, stride, (28, 512, 512, 1, BackendDevice.Grayskull), (14, 1024, 1024, 1, BackendDevice.Grayskull), (7, 2048, 2048, 1, BackendDevice.Grayskull), - (56, 256, 256, 1, BackendDevice.Wormhole), - (28, 512, 512, 1, BackendDevice.Wormhole), - (14, 1024, 1024, 1, BackendDevice.Wormhole), - (7, 2048, 2048, 1, BackendDevice.Wormhole) + (56, 256, 256, 1, BackendDevice.Wormhole_B0), + (28, 512, 512, 1, BackendDevice.Wormhole_B0), + (14, 1024, 1024, 1, BackendDevice.Wormhole_B0), + (7, 2048, 2048, 1, BackendDevice.Wormhole_B0) ] if (input_size, in_channels, out_channels, stride, arch) in 
expected_to_fail: pytest.skip(msg="This combination is expected to fail, moved to _xfail version of the function.") compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" model = BottleneckResidualBlock(in_channels, out_channels, stride) @@ -315,15 +310,14 @@ def test_resnet_bottleneck_block(input_size, in_channels, out_channels, stride, (28, 512, 512, 1, BackendDevice.Grayskull), (14, 1024, 1024, 1, BackendDevice.Grayskull), (7, 2048, 2048, 1, BackendDevice.Grayskull), - (56, 256, 256, 1, BackendDevice.Wormhole), - (28, 512, 512, 1, BackendDevice.Wormhole), - (14, 1024, 1024, 1, BackendDevice.Wormhole), - (7, 2048, 2048, 1, BackendDevice.Wormhole) + (56, 256, 256, 1, BackendDevice.Wormhole_B0), + (28, 512, 512, 1, BackendDevice.Wormhole_B0), + (14, 1024, 1024, 1, BackendDevice.Wormhole_B0), + (7, 2048, 2048, 1, BackendDevice.Wormhole_B0) ] ) def test_resnet_bottleneck_block_xfail(input_size, in_channels, out_channels, stride, arch): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" model = BottleneckResidualBlock(in_channels, out_channels, stride) @@ -351,7 +345,7 @@ def test_resnet_bottleneck_block_xfail(input_size, in_channels, out_channels, st ) @pytest.mark.parametrize( "arch", - [BackendDevice.Grayskull, BackendDevice.Wormhole] + [BackendDevice.Grayskull, BackendDevice.Wormhole_B0] ) def test_resnext_bottleneck_block(input_size, in_channels, out_channels, stride, arch): expected_to_fail = [ @@ -359,17 +353,16 @@ def test_resnext_bottleneck_block(input_size, in_channels, out_channels, stride, (28, 512, 512, 1, BackendDevice.Grayskull), (14, 1024, 1024, 1, BackendDevice.Grayskull), (7, 2048, 2048, 1, BackendDevice.Grayskull), - (56, 256, 256, 1, BackendDevice.Wormhole), - (28, 512, 512, 1, BackendDevice.Wormhole), - (14, 1024, 1024, 1, BackendDevice.Wormhole), - (7, 2048, 2048, 1, BackendDevice.Wormhole) + (56, 256, 256, 1, BackendDevice.Wormhole_B0), + (28, 512, 512, 1, BackendDevice.Wormhole_B0), + (14, 1024, 1024, 1, BackendDevice.Wormhole_B0), + (7, 2048, 2048, 1, BackendDevice.Wormhole_B0) ] if (input_size, in_channels, out_channels, stride, arch) in expected_to_fail: pytest.skip(msg="This combination is expected to fail, moved to _xfail version of the function.") compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" model = BottleneckResidualBlock(in_channels, out_channels, stride, 32, 2) @@ -399,15 +392,14 @@ def test_resnext_bottleneck_block(input_size, in_channels, out_channels, stride, (28, 512, 512, 1, BackendDevice.Grayskull), (14, 1024, 1024, 1, BackendDevice.Grayskull), (7, 2048, 2048, 1, BackendDevice.Grayskull), - (56, 256, 256, 1, BackendDevice.Wormhole), - (28, 512, 512, 1, BackendDevice.Wormhole), - (14, 1024, 1024, 1, BackendDevice.Wormhole), - (7, 2048, 2048, 1, BackendDevice.Wormhole) + (56, 256, 256, 1, BackendDevice.Wormhole_B0), + (28, 512, 512, 1, BackendDevice.Wormhole_B0), + (14, 1024, 1024, 1, BackendDevice.Wormhole_B0), + (7, 2048, 2048, 1, BackendDevice.Wormhole_B0) ] ) def test_resnext_bottleneck_block_xfail(input_size, in_channels, out_channels, stride, arch): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" model = BottleneckResidualBlock(in_channels, out_channels, stride, 32, 2) diff --git a/pybuda/test/nightly/cnn/building_blocks/test_unet.py 
b/pybuda/test/nightly/cnn/building_blocks/test_unet.py index d412d7b5c..a00fac1a2 100644 --- a/pybuda/test/nightly/cnn/building_blocks/test_unet.py +++ b/pybuda/test/nightly/cnn/building_blocks/test_unet.py @@ -145,7 +145,7 @@ def forward(self, x): @pytest.mark.parametrize("input_size", [128, 256, 512]) @pytest.mark.parametrize("in_channels, out_channels", [(3, 32), (32, 64), (64, 128), (128, 256), (256, 512)]) -@pytest.mark.parametrize("arch", [BackendDevice.Grayskull, BackendDevice.Wormhole]) +@pytest.mark.parametrize("arch", [BackendDevice.Grayskull, BackendDevice.Wormhole_B0]) def test_unet_double_conv_batchnorm_relu(input_size, in_channels, out_channels, arch): expected_to_fail = [ (128, 32, 64, BackendDevice.Grayskull), @@ -160,25 +160,24 @@ def test_unet_double_conv_batchnorm_relu(input_size, in_channels, out_channels, (128, 256, 512, BackendDevice.Grayskull), (256, 256, 512, BackendDevice.Grayskull), (512, 256, 512, BackendDevice.Grayskull), - (128, 32, 64, BackendDevice.Wormhole), - (256, 32, 64, BackendDevice.Wormhole), - (512, 32, 64, BackendDevice.Wormhole), - (128, 64, 128, BackendDevice.Wormhole), - (256, 64, 128, BackendDevice.Wormhole), - (512, 64, 128, BackendDevice.Wormhole), - (128, 128, 256, BackendDevice.Wormhole), - (256, 128, 256, BackendDevice.Wormhole), - (512, 128, 256, BackendDevice.Wormhole), - (128, 256, 512, BackendDevice.Wormhole), - (256, 256, 512, BackendDevice.Wormhole), - (512, 256, 512, BackendDevice.Wormhole) + (128, 32, 64, BackendDevice.Wormhole_B0), + (256, 32, 64, BackendDevice.Wormhole_B0), + (512, 32, 64, BackendDevice.Wormhole_B0), + (128, 64, 128, BackendDevice.Wormhole_B0), + (256, 64, 128, BackendDevice.Wormhole_B0), + (512, 64, 128, BackendDevice.Wormhole_B0), + (128, 128, 256, BackendDevice.Wormhole_B0), + (256, 128, 256, BackendDevice.Wormhole_B0), + (512, 128, 256, BackendDevice.Wormhole_B0), + (128, 256, 512, BackendDevice.Wormhole_B0), + (256, 256, 512, BackendDevice.Wormhole_B0), + (512, 256, 512, BackendDevice.Wormhole_B0) ] if (input_size, in_channels, out_channels, arch) in expected_to_fail: pytest.skip(msg="This combination is expected to fail, moved to _xfail version of the function.") compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" model = DoubleConvBatchnormRelu(in_channels, out_channels) @@ -214,23 +213,22 @@ def test_unet_double_conv_batchnorm_relu(input_size, in_channels, out_channels, (128, 256, 512, BackendDevice.Grayskull), (256, 256, 512, BackendDevice.Grayskull), (512, 256, 512, BackendDevice.Grayskull), - (128, 32, 64, BackendDevice.Wormhole), - (256, 32, 64, BackendDevice.Wormhole), - (512, 32, 64, BackendDevice.Wormhole), - (128, 64, 128, BackendDevice.Wormhole), - (256, 64, 128, BackendDevice.Wormhole), - (512, 64, 128, BackendDevice.Wormhole), - (128, 128, 256, BackendDevice.Wormhole), - (256, 128, 256, BackendDevice.Wormhole), - (512, 128, 256, BackendDevice.Wormhole), - (128, 256, 512, BackendDevice.Wormhole), - (256, 256, 512, BackendDevice.Wormhole), - (512, 256, 512, BackendDevice.Wormhole) + (128, 32, 64, BackendDevice.Wormhole_B0), + (256, 32, 64, BackendDevice.Wormhole_B0), + (512, 32, 64, BackendDevice.Wormhole_B0), + (128, 64, 128, BackendDevice.Wormhole_B0), + (256, 64, 128, BackendDevice.Wormhole_B0), + (512, 64, 128, BackendDevice.Wormhole_B0), + (128, 128, 256, BackendDevice.Wormhole_B0), + (256, 128, 256, BackendDevice.Wormhole_B0), + (512, 128, 256, BackendDevice.Wormhole_B0), + (128, 256, 512, BackendDevice.Wormhole_B0), + 
(256, 256, 512, BackendDevice.Wormhole_B0), + (512, 256, 512, BackendDevice.Wormhole_B0) ] ) def test_unet_double_conv_batchnorm_relu_xfail(input_size, in_channels, out_channels, arch): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" model = DoubleConvBatchnormRelu(in_channels, out_channels) @@ -254,15 +252,14 @@ def test_unet_double_conv_batchnorm_relu_xfail(input_size, in_channels, out_chan @pytest.mark.parametrize("input_size", [128, 256, 512]) @pytest.mark.parametrize("in_channels, out_channels", [(3, 32), (32, 64), (64, 128), (128, 256), (256, 512)]) -@pytest.mark.parametrize("arch", [BackendDevice.Grayskull, BackendDevice.Wormhole]) +@pytest.mark.parametrize("arch", [BackendDevice.Grayskull, BackendDevice.Wormhole_B0]) def test_unet_double_conv_relu(input_size, in_channels, out_channels, arch): - expected_to_fail = [(512, 256, 512, BackendDevice.Wormhole)] + expected_to_fail = [(512, 256, 512, BackendDevice.Wormhole_B0)] if (input_size, in_channels, out_channels, arch) in expected_to_fail: pytest.skip(msg="This combination is expected to fail, moved to _xfail version of the function.") compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" model = DoubleConvRelu(in_channels, out_channels) @@ -285,11 +282,10 @@ def test_unet_double_conv_relu(input_size, in_channels, out_channels, arch): "tenstorrent/pybuda#422") @pytest.mark.parametrize( "input_size, in_channels, out_channels, arch", - [(512, 256, 512, BackendDevice.Wormhole)] + [(512, 256, 512, BackendDevice.Wormhole_B0)] ) def test_unet_double_conv_relu_xfail(input_size, in_channels, out_channels, arch): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" model = DoubleConvRelu(in_channels, out_channels) @@ -313,10 +309,9 @@ def test_unet_double_conv_relu_xfail(input_size, in_channels, out_channels, arch @pytest.mark.parametrize("input_size", [128, 256, 512]) @pytest.mark.parametrize("in_channels, out_channels", [(3, 32), (32, 64), (64, 128), (128, 256), (256, 512)]) -@pytest.mark.parametrize("arch", [BackendDevice.Grayskull, BackendDevice.Wormhole]) +@pytest.mark.parametrize("arch", [BackendDevice.Grayskull, BackendDevice.Wormhole_B0]) def test_unet_double_conv_batchnorm_relu_maxpool(input_size, in_channels, out_channels, arch): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" model = DoubleConvBatchnormReluMaxpool(in_channels, out_channels) @@ -343,10 +338,9 @@ def test_unet_double_conv_batchnorm_relu_maxpool(input_size, in_channels, out_ch @pytest.mark.parametrize("input_size", [128, 256, 512]) @pytest.mark.parametrize("input_channels", [1, 3, 32, 64, 128, 256]) -@pytest.mark.parametrize("arch", [BackendDevice.Grayskull, BackendDevice.Wormhole]) +@pytest.mark.parametrize("arch", [BackendDevice.Grayskull, BackendDevice.Wormhole_B0]) def test_unet_maxpool(input_size, input_channels, arch): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" model = Maxpool() @@ -373,10 +367,9 @@ def test_unet_maxpool(input_size, input_channels, arch): @pytest.mark.parametrize("input_size", [128, 256, 512]) @pytest.mark.parametrize("input_channels", [256, 128, 64, 32]) -@pytest.mark.parametrize("arch", [BackendDevice.Grayskull, BackendDevice.Wormhole]) +@pytest.mark.parametrize("arch", 
[BackendDevice.Grayskull, BackendDevice.Wormhole_B0]) def test_unet_upconv(input_size, input_channels, arch): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" model = Upconv(input_channels) @@ -403,10 +396,9 @@ def test_unet_upconv(input_size, input_channels, arch): @pytest.mark.parametrize("input_size", [128, 256, 512]) @pytest.mark.parametrize("input_channels", [256, 128, 64, 32]) -@pytest.mark.parametrize("arch", [BackendDevice.Grayskull, BackendDevice.Wormhole]) +@pytest.mark.parametrize("arch", [BackendDevice.Grayskull, BackendDevice.Wormhole_B0]) def test_unet_upconv_double_conv_relu(input_size, input_channels, arch): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" model = UpconvDoubleConvRelu(input_channels) @@ -434,10 +426,9 @@ def test_unet_upconv_double_conv_relu(input_size, input_channels, arch): @pytest.mark.parametrize("input_size", [128, 256, 512]) @pytest.mark.parametrize("encoder_activations_channels", [3, 32, 64]) @pytest.mark.parametrize("upconv_activations_channels", [3, 32, 64]) -@pytest.mark.parametrize("arch", [BackendDevice.Grayskull, BackendDevice.Wormhole]) +@pytest.mark.parametrize("arch", [BackendDevice.Grayskull, BackendDevice.Wormhole_B0]) def test_unet_concat(input_size, encoder_activations_channels, upconv_activations_channels, arch): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" model = Concat() @@ -462,10 +453,9 @@ def test_unet_concat(input_size, encoder_activations_channels, upconv_activation @pytest.mark.parametrize("input_size", [128, 256, 512]) @pytest.mark.parametrize("input_channels", [128, 64, 32]) @pytest.mark.parametrize("output_channels", [3, 2, 1]) -@pytest.mark.parametrize("arch", [BackendDevice.Grayskull, BackendDevice.Wormhole]) +@pytest.mark.parametrize("arch", [BackendDevice.Grayskull, BackendDevice.Wormhole_B0]) def test_unet_unityconv(input_size, input_channels, output_channels, arch): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" model = UnityConv(input_channels, output_channels) diff --git a/pybuda/test/nightly/cnn/building_blocks/test_vit.py b/pybuda/test/nightly/cnn/building_blocks/test_vit.py index ef3cdca7a..519e3c962 100644 --- a/pybuda/test/nightly/cnn/building_blocks/test_vit.py +++ b/pybuda/test/nightly/cnn/building_blocks/test_vit.py @@ -43,7 +43,7 @@ @pytest.mark.parametrize("num_hidden_layers", [6, 8]) @pytest.mark.parametrize("num_attention_heads", [8, 16]) @pytest.mark.parametrize("intermed_expansion_factor", [3, 4]) -@pytest.mark.parametrize("arch", [BackendDevice.Grayskull, BackendDevice.Wormhole]) +@pytest.mark.parametrize("arch", [BackendDevice.Grayskull, BackendDevice.Wormhole_B0]) def test_vit_encoder( image_size, num_channels, @@ -54,16 +54,16 @@ def test_vit_encoder( arch ): expected_to_fail = [ - (256, 3, 32, 6, 8, 3, BackendDevice.Wormhole), - (224, 3, 32, 8, 8, 3, BackendDevice.Wormhole), - (256, 3, 32, 6, 16, 3, BackendDevice.Wormhole), - (224, 3, 32, 8, 16, 3, BackendDevice.Wormhole), - (256, 3, 32, 6, 8, 4, BackendDevice.Wormhole), - (224, 3, 32, 8, 8, 4, BackendDevice.Wormhole), - (256, 3, 32, 8, 8, 4, BackendDevice.Wormhole), - (224, 3, 32, 8, 16, 4, BackendDevice.Wormhole), - (256, 3, 32, 8, 16, 4, BackendDevice.Wormhole), - (256, 3, 32, 6, 16, 4, BackendDevice.Wormhole) + (256, 3, 32, 6, 8, 3, 
BackendDevice.Wormhole_B0), + (224, 3, 32, 8, 8, 3, BackendDevice.Wormhole_B0), + (256, 3, 32, 6, 16, 3, BackendDevice.Wormhole_B0), + (224, 3, 32, 8, 16, 3, BackendDevice.Wormhole_B0), + (256, 3, 32, 6, 8, 4, BackendDevice.Wormhole_B0), + (224, 3, 32, 8, 8, 4, BackendDevice.Wormhole_B0), + (256, 3, 32, 8, 8, 4, BackendDevice.Wormhole_B0), + (224, 3, 32, 8, 16, 4, BackendDevice.Wormhole_B0), + (256, 3, 32, 8, 16, 4, BackendDevice.Wormhole_B0), + (256, 3, 32, 6, 16, 4, BackendDevice.Wormhole_B0) ] if (image_size, num_channels, patch_size, num_hidden_layers, num_attention_heads, @@ -71,7 +71,6 @@ def test_vit_encoder( pytest.skip(msg="This combination is expected to fail, moved to _xfail version of the function.") compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" hidden_size = patch_size * patch_size * num_channels @@ -107,16 +106,16 @@ def test_vit_encoder( @pytest.mark.parametrize( "image_size, num_channels, patch_size, num_hidden_layers, num_attention_heads, intermed_expansion_factor, arch", [ - (256, 3, 32, 6, 8, 3, BackendDevice.Wormhole), - (224, 3, 32, 8, 8, 3, BackendDevice.Wormhole), - (256, 3, 32, 6, 16, 3, BackendDevice.Wormhole), - (224, 3, 32, 8, 16, 3, BackendDevice.Wormhole), - (256, 3, 32, 6, 8, 4, BackendDevice.Wormhole), - (224, 3, 32, 8, 8, 4, BackendDevice.Wormhole), - (256, 3, 32, 8, 8, 4, BackendDevice.Wormhole), - (224, 3, 32, 8, 16, 4, BackendDevice.Wormhole), - (256, 3, 32, 8, 16, 4, BackendDevice.Wormhole), - (256, 3, 32, 6, 16, 4, BackendDevice.Wormhole) + (256, 3, 32, 6, 8, 3, BackendDevice.Wormhole_B0), + (224, 3, 32, 8, 8, 3, BackendDevice.Wormhole_B0), + (256, 3, 32, 6, 16, 3, BackendDevice.Wormhole_B0), + (224, 3, 32, 8, 16, 3, BackendDevice.Wormhole_B0), + (256, 3, 32, 6, 8, 4, BackendDevice.Wormhole_B0), + (224, 3, 32, 8, 8, 4, BackendDevice.Wormhole_B0), + (256, 3, 32, 8, 8, 4, BackendDevice.Wormhole_B0), + (224, 3, 32, 8, 16, 4, BackendDevice.Wormhole_B0), + (256, 3, 32, 8, 16, 4, BackendDevice.Wormhole_B0), + (256, 3, 32, 6, 16, 4, BackendDevice.Wormhole_B0) ] ) def test_vit_encoder_xfail( @@ -129,7 +128,6 @@ def test_vit_encoder_xfail( arch ): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" hidden_size = patch_size * patch_size * num_channels @@ -166,7 +164,7 @@ def test_vit_encoder_xfail( @pytest.mark.parametrize("num_channels", [3, 1]) @pytest.mark.parametrize("patch_size", [16, 32]) @pytest.mark.parametrize("num_attention_heads", [4, 8, 16]) -@pytest.mark.parametrize("arch", [BackendDevice.Grayskull, BackendDevice.Wormhole]) +@pytest.mark.parametrize("arch", [BackendDevice.Grayskull, BackendDevice.Wormhole_B0]) def test_vit_pooler( image_size, num_channels, @@ -175,7 +173,6 @@ def test_vit_pooler( arch ): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" hidden_size = patch_size * patch_size * num_channels diff --git a/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py b/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py index 5b6c2b768..8363a17ca 100644 --- a/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py +++ b/pybuda/test/operators/eltwise_binary/test_eltwise_binary.py @@ -50,9 +50,9 @@ @pytest.mark.parametrize("shape", shape, ids=[f"shape{'x'.join([str(jtem) for jtem in item])}" for item in shape]) -@pytest.mark.parametrize("operation", ["Add", "Subtract", "Multiply", "Heaviside", "Greater", 
"GreaterEqual", "Less", "LessEqual", "Equal", "NotEqual"]) +@pytest.mark.parametrize("operation", ["Add", "Max", "Min", "Power", "Subtract", "Multiply", "Heaviside", "Greater", "GreaterEqual", "Less", "LessEqual", "Equal", "NotEqual"]) @pytest.mark.parametrize("recompute", (True, False), ids=["Recompute", "NoRecompute"]) -@pytest.mark.parametrize("mode", ["Training", "Inference"]) +@pytest.mark.parametrize("mode", ["Inference"]) @pytest.mark.parametrize("model", [item.split(".")[0] for item in os.listdir(MODELS_PATH) if "model" in item]) def test_eltwise_binary( mode, diff --git a/pybuda/test/operators/eltwise_binary_comparison/test_eltwise_binary_comparison.py b/pybuda/test/operators/eltwise_binary_comparison/test_eltwise_binary_comparison.py index e37ad486c..06a96b42d 100644 --- a/pybuda/test/operators/eltwise_binary_comparison/test_eltwise_binary_comparison.py +++ b/pybuda/test/operators/eltwise_binary_comparison/test_eltwise_binary_comparison.py @@ -62,7 +62,7 @@ @pytest.mark.parametrize("shape", shape, ids=[f"shape{'x'.join([str(jtem) for jtem in item])}" for item in shape]) @pytest.mark.parametrize("recompute", (True, False), ids=["Recompute", "NoRecompute"]) @pytest.mark.parametrize("model", [item.split(".")[0] for item in os.listdir(MODELS_PATH) if "model" in item]) -@pytest.mark.parametrize("mode", ["Training", "Inference"]) +@pytest.mark.parametrize("mode", ["Inference"]) def test_comparison( mode, recompute, diff --git a/pybuda/test/operators/eltwise_unary/conftest.py b/pybuda/test/operators/eltwise_unary/conftest.py index f70c4cc98..c4e500fc4 100644 --- a/pybuda/test/operators/eltwise_unary/conftest.py +++ b/pybuda/test/operators/eltwise_unary/conftest.py @@ -5,6 +5,9 @@ # Conftest for parameters setup for element-wise unary operators # +import json + + def pytest_addoption(parser): # model parser.addoption( @@ -41,6 +44,13 @@ def pytest_addoption(parser): default='Sqrt', help="Unary element-wise operation which we want to perform." ) + # kwargs + parser.addoption( + "--un_kwargs_json", + action="store", + default='{}', + help="Additional arguents, in JSON format, for given operation. If they are needed." 
+ ) def pytest_generate_tests(metafunc): @@ -62,4 +72,8 @@ def pytest_generate_tests(metafunc): option_op = metafunc.config.option.un_op if 'un_op' in metafunc.fixturenames and option_op is not None: - metafunc.parametrize("un_op", [option_op]) \ No newline at end of file + metafunc.parametrize("un_op", [option_op]) + + option_kwargs = metafunc.config.option.un_kwargs_json + if 'un_kwargs' in metafunc.fixturenames and option_kwargs is not None: + metafunc.parametrize("un_kwargs", [json.loads(option_kwargs)]) \ No newline at end of file diff --git a/pybuda/test/operators/eltwise_unary/test_command.sh b/pybuda/test/operators/eltwise_unary/test_command.sh index 685ccbd49..76a061d44 100644 --- a/pybuda/test/operators/eltwise_unary/test_command.sh +++ b/pybuda/test/operators/eltwise_unary/test_command.sh @@ -19,6 +19,13 @@ pytest -svv test_eltwise_unary_single.py --un_model model_2 --un_train True --un pytest -svv test_eltwise_unary_single.py --un_model model_5 --un_train False --un_op 'Gelu' pytest -svv test_eltwise_unary_single.py --un_model model_4 --un_shape '[1, 32, 256, 2048]' +pytest -svv pybuda/test/operators/eltwise_unary/test_eltwise_unary_single.py --un_model model_3 --un_train False --un_recompute False --un_shape '[1, 32, 96, 128]' --un_op 'Clip' --un_kwargs_json='{"min": 0.234, "max": 0.982}' +pytest -svv pybuda/test/operators/eltwise_unary/test_eltwise_unary_single.py --un_model model_3 --un_train False --un_recompute False --un_shape '[1, 32, 96, 128]' --un_op 'LogicalNot' +pytest -svv pybuda/test/operators/eltwise_unary/test_eltwise_unary_single.py --un_model model_3 --un_train False --un_recompute False --un_shape '[1, 32, 96, 128]' --un_op 'CumSum' --un_kwargs_json='{"axis": 2}' +pytest -svv pybuda/test/operators/eltwise_unary/test_eltwise_unary_single.py --un_model model_5 --un_train False --un_recompute False --un_shape '[19, 20, 16]' --un_op 'Pow' --un_kwargs_json='{"exponent": 0.54881352186203}' +pytest -svv pybuda/test/operators/eltwise_unary/test_eltwise_unary_single.py --un_model model_5 --un_train False --un_recompute False --un_shape '[1, 1, 24, 9]' --un_op 'Pow' --un_kwargs_json='{"exponent": 0.5488135039273248}' +pytest -svv pybuda/test/operators/eltwise_unary/test_eltwise_unary_single.py --un_model model_5 --un_train False --un_recompute False --un_shape '[1, 1, 24, 9]' --un_op 'Tilize' + # Issues pytest -svv test_eltwise_unary_single.py --un_model model_4 --un_train True --un_recompute False --un_op 'Exp' --un_shape '[21, 127, 102, 19]' diff --git a/pybuda/test/operators/eltwise_unary/test_eltwise_unary.py b/pybuda/test/operators/eltwise_unary/test_eltwise_unary.py index 298170eea..44e9778e3 100644 --- a/pybuda/test/operators/eltwise_unary/test_eltwise_unary.py +++ b/pybuda/test/operators/eltwise_unary/test_eltwise_unary.py @@ -13,6 +13,7 @@ import pybuda.op from pybuda import TTDevice, BackendType, pybuda_compile, VerifyConfig, CompilerConfig +from pybuda.verify.config import TestKind from . 
import models @@ -49,14 +50,16 @@ @pytest.mark.parametrize("shape", shape, ids=[f"shape{'x'.join([str(jtem) for jtem in item])}" for item in shape]) -@pytest.mark.parametrize("operation", ["Abs", "LeakyRelu", "Exp", "Identity", "Reciprocal", "Sigmoid", "Sqrt", "Gelu", "Log", "Relu", "Buffer"]) +@pytest.mark.parametrize("operation", ["Abs", "LeakyRelu", "Exp", "Identity", "Reciprocal", "Sigmoid", "Sqrt", "Gelu", "Log", "Relu", "Buffer", "Tanh", "Dropout", "Sine", "Cosine", "Argmax", "Clip"]) @pytest.mark.parametrize("model", [item.split(".")[0] for item in os.listdir(MODELS_PATH) if "model" in item]) +@pytest.mark.parametrize("op_test_kind", [TestKind.INFERENCE]) def test_eltwise_unary( - test_kind, + op_test_kind, operation, model, shape ): + test_kind = op_test_kind if model == "model_9" and operation == "Reciprocal": pytest.xfail("tenstorrent/pybuda#18") @@ -67,6 +70,9 @@ def test_eltwise_unary( kwargs['alpha'] = np.random.rand() if test_kind.is_training(): pcc = 0.95 + if operation == "Clip": + kwargs['min'] = np.random.rand() + kwargs['max'] = np.random.rand() architecture = f'models.{model}.BudaElementWiseUnaryTest(operator=pybuda.op.{operation}, opname="{operation}", shape={shape}' for k, v in kwargs.items(): diff --git a/pybuda/test/operators/eltwise_unary/test_eltwise_unary_single.py b/pybuda/test/operators/eltwise_unary/test_eltwise_unary_single.py index 02c577960..3a89066a5 100644 --- a/pybuda/test/operators/eltwise_unary/test_eltwise_unary_single.py +++ b/pybuda/test/operators/eltwise_unary/test_eltwise_unary_single.py @@ -28,7 +28,8 @@ def test_eltwise_unary( un_recompute, un_op, un_model, - un_shape + un_shape, + un_kwargs ): print("\n") @@ -37,6 +38,7 @@ def test_eltwise_unary( print(f"un_op --> {un_op}") print(f"un_model --> {un_model}") print(f"un_shape --> {un_shape}") + print(f"un_kwargs --> {un_kwargs}") print("\n") if not un_train and un_recompute: @@ -56,15 +58,27 @@ def test_eltwise_unary( model = un_model shape = eval(un_shape) if type(un_shape) == str else un_shape + kwargs = un_kwargs + pcc = 0.99 + if operation == "LeakyRelu": + if un_train: + pcc = 0.95 + + print("\n") print(f"Training --> {training}") print(f"Recompute --> {recompute}") print(f"Operation --> {operation}") print(f"Model --> {model}") print(f"Shape --> {shape}") + print(f"Kwargs --> {kwargs}") print("\n") - architecture = f'models.{model}.BudaElementWiseUnaryTest(operator=pybuda.op.{operation}, opname="{operation}", shape={shape})' + architecture = f'models.{model}.BudaElementWiseUnaryTest(operator=pybuda.op.{operation}, opname="{operation}", shape={shape}' + for k, v in kwargs.items(): + architecture = f'{architecture}, {k}={v}' + architecture = f'{architecture})' + model = eval(architecture) tt0 = TTDevice("tt0", devtype=BackendType.Golden) tt0.place_module(model) @@ -76,5 +90,5 @@ def test_eltwise_unary( enable_training=training, enable_recompute=recompute ), - verify_cfg=VerifyConfig() + verify_cfg=VerifyConfig(pcc=pcc) ) \ No newline at end of file diff --git a/pybuda/test/operators/eltwise_unary_attr/clip/test_clip.py b/pybuda/test/operators/eltwise_unary_attr/clip/test_clip.py index 2a18f6368..44b54106d 100644 --- a/pybuda/test/operators/eltwise_unary_attr/clip/test_clip.py +++ b/pybuda/test/operators/eltwise_unary_attr/clip/test_clip.py @@ -60,7 +60,7 @@ @pytest.mark.parametrize("shape", shape, ids=["shape=" + "x".join([str(item) for item in sh]) for sh in shape]) @pytest.mark.parametrize("recompute", (True, False), ids=["Recompute", "NoRecompute"]) @pytest.mark.parametrize("model", 
[item.split(".")[0] for item in os.listdir(MODELS_PATH) if "model" in item]) -@pytest.mark.parametrize("mode", ["Training", "Inference"]) +@pytest.mark.parametrize("mode", ["Inference"]) def test_clip( mode, recompute, diff --git a/pybuda/test/operators/eltwise_unary_attr/leaky_relu/test_leaky_relu.py b/pybuda/test/operators/eltwise_unary_attr/leaky_relu/test_leaky_relu.py index e1f2b5e78..b21d44933 100644 --- a/pybuda/test/operators/eltwise_unary_attr/leaky_relu/test_leaky_relu.py +++ b/pybuda/test/operators/eltwise_unary_attr/leaky_relu/test_leaky_relu.py @@ -55,7 +55,7 @@ @pytest.mark.parametrize("shape", shape, ids=["shape=" + "x".join([str(item) for item in sh]) for sh in shape]) @pytest.mark.parametrize("recompute", (True, False), ids=["Recompute", "NoRecompute"]) @pytest.mark.parametrize("model", [item.split(".")[0] for item in os.listdir(MODELS_PATH) if "model" in item]) -@pytest.mark.parametrize("mode", ["Training", "Inference"]) +@pytest.mark.parametrize("mode", ["Inference"]) def test_leaky_relu( mode, recompute, diff --git a/pybuda/test/operators/grouped_reduce/test_grouped_reduce.py b/pybuda/test/operators/grouped_reduce/test_grouped_reduce.py index e9d169bf4..a8dbb7bad 100644 --- a/pybuda/test/operators/grouped_reduce/test_grouped_reduce.py +++ b/pybuda/test/operators/grouped_reduce/test_grouped_reduce.py @@ -19,6 +19,8 @@ import pybuda.op from pybuda import TTDevice, BackendType, pybuda_compile, VerifyConfig, CompilerConfig +from pybuda.verify.config import TestKind + from . import models MODELS_PATH = "./pybuda/test/operators/grouped_reduce/models/" @@ -53,14 +55,16 @@ def factors(n): @pytest.mark.parametrize("model", [item.split(".")[0] for item in os.listdir(MODELS_PATH) if "model" in item]) @pytest.mark.parametrize("dim", [2, 3]) @pytest.mark.parametrize("keep_dims", [True, False]) +@pytest.mark.parametrize("op_test_kind", [TestKind.INFERENCE]) def test_grouped_reduce( - test_kind, + op_test_kind, operation, model, shape, dim, keep_dims, ): + test_kind = op_test_kind facs = factors(shape[dim]) if len(facs) < 3: pytest.skip("Not enough factors") diff --git a/pybuda/test/operators/matmul/test_matmul.py b/pybuda/test/operators/matmul/test_matmul.py index b1e264f35..cb2345b0a 100644 --- a/pybuda/test/operators/matmul/test_matmul.py +++ b/pybuda/test/operators/matmul/test_matmul.py @@ -15,6 +15,8 @@ import pybuda.op from pybuda import TTDevice, BackendType, pybuda_compile, VerifyConfig, CompilerConfig +from pybuda.verify.config import TestKind + from .models import generic from .models import custom @@ -59,11 +61,13 @@ #) @pytest.mark.parametrize("shape", shape, ids=[f"shape{'x'.join([str(jtem) for jtem in item])}" for item in shape]) @pytest.mark.parametrize("model", [item.split(".")[0] for item in os.listdir(MODELS_GENERIC_PATH) if "model" in item]) +@pytest.mark.parametrize("op_test_kind", [TestKind.INFERENCE]) def test_matmul_generic( - test_kind, + op_test_kind, model, shape ): + test_kind = op_test_kind if test_kind.is_training() and len(shape) >= 3 and shape[-3] > 1: pytest.skip("Matmul with gradient accumulate must have t=1") diff --git a/pybuda/test/tti/__init__.py b/pybuda/test/operators/nary/__init__.py similarity index 100% rename from pybuda/test/tti/__init__.py rename to pybuda/test/operators/nary/__init__.py diff --git a/pybuda/test/operators/nary/test_eltwise_nary.py b/pybuda/test/operators/nary/test_eltwise_nary.py new file mode 100644 index 000000000..14b07462a --- /dev/null +++ b/pybuda/test/operators/nary/test_eltwise_nary.py @@ -0,0 +1,149 @@ +# 
SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +# +# Tests for testing of element-wise nary operators +# + +import pybuda.tensor +import pytest + +import torch +import os + +import pybuda +import pybuda.op +from pybuda import PyBudaModule, Tensor, VerifyConfig +from test.common import run +from pybuda.verify import TestKind, verify_module + +verify_cfg = VerifyConfig( + run_golden=True, run_net2pipe=True +) # Run backend golden check on all tests in here + + +@pytest.mark.parametrize( + "input_shape", + [ + (1, 10, 32, 32), + (1, 32, 16, 16), + ], +) +@pytest.mark.parametrize("axis", [-3]) +@pytest.mark.parametrize("stride", [1]) +@pytest.mark.parametrize("num_operands", [2, 3]) +def test_interleave(test_kind, test_device, input_shape, axis, stride, num_operands): + class Model(PyBudaModule): + def __init__(self, name, axis, stride): + super().__init__(name) + self.axis = axis + self.stride = stride + + def forward(self, *operands): + x = pybuda.op.Interleave( + "interleave0", *operands, axis=self.axis, stride=self.stride + ) + return x + + input_shapes = tuple([input_shape for _ in range(num_operands)]) + mod = Model("interleave_test", axis, stride) + verify_module( + mod, + input_shapes, + verify_cfg=VerifyConfig( + test_kind=test_kind, + devtype=test_device.devtype, + arch=test_device.arch, + ), + ) + + +@pytest.mark.parametrize("dim", [1, 2, -1]) +@pytest.mark.parametrize("aligned", [True, False]) +def test_concat(test_kind, test_device, dim, aligned): + @run( + VerifyConfig( + test_kind=test_kind, devtype=test_device.devtype, arch=test_device.arch + ), + ) + def simple_concat(a, b): + return pybuda.op.Concatenate("", a, b, axis=dim) + + if aligned: + shapes = { + -1: (1, 3, 128, 96), + 2: (1, 3, 1024, 32), + 1: (1, 1, 128, 32), + } + a = Tensor.create_from_torch( + torch.randn((1, 3, 128, 32), requires_grad=test_kind.is_training()) + ) + else: + shapes = { + -1: (1, 3, 128, 6), + 2: (1, 3, 128, 6), + 1: (1, 1, 128, 6), + } + a = Tensor.create_from_torch( + torch.randn((1, 3, 128, 6), requires_grad=test_kind.is_training()) + ) + b = Tensor.create_from_torch( + torch.randn(shapes[dim], requires_grad=test_kind.is_training()) + ) + c = simple_concat(a, b) + + +def test_concat_two_kinds_pad(test_device): + class Module(PyBudaModule): + def __init__(self, name): + super().__init__(name) + self.add_parameter( + "w", pybuda.Parameter(*(1, 1, 352, 192), requires_grad=True) + ) + + def forward(self, in0, in1, in2, in3, in4, in5, y): + in0 = pybuda.op.Multiply("m0", in0, in0) + in1 = pybuda.op.Multiply("m1", in1, in2) + in2 = pybuda.op.Multiply("m2", in2, in3) + in3 = pybuda.op.Multiply("m3", in3, in4) + in4 = pybuda.op.Multiply("m4", in4, in4) + in5 = pybuda.op.Multiply("m5", in5, in1) + x = pybuda.op.Concatenate("", in0, in1, in2, in3, in4, in5, axis=-1) + x = pybuda.op.Multiply("m6", x, y) + x = pybuda.op.PadTile("p0", x, -1, 336) + x = pybuda.op.Matmul("mm0", x, self.get_parameter("w")) + return x + + compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object + compiler_cfg.balancer_policy = "CNN" + # compiler_cfg.place_on_new_epoch("m6_transpose_nop_0") + os.environ["PYBUDA_DISABLE_CONSTANT_FOLDING"] = "1" + os.environ["PYBUDA_PAD_SPARSE_MM"] = "{11:12}" + os.environ["PYBUDA_GRAPHSOLVER_SELF_CUT_TYPE"] = "ConsumerOperandDataEdgesFirst" + + # input shape + common_len = 3136 + input_shapes = ( + (1, 1, common_len, 96), + (1, 1, common_len, 48), + (1, 1, common_len, 48), + (1, 1, common_len, 48), + (1, 1, 
common_len, 48), + (1, 1, common_len, 48), + (1, 1, common_len, 336), + ) + mod = Module("test_concat_two_kinds_pad") + verify_module( + mod, + input_shapes, + verify_cfg=VerifyConfig( + test_kind=TestKind.INFERENCE, + devtype=test_device.devtype, + arch=test_device.arch, + ), + ) + + os.environ["PYBUDA_PAD_SPARSE_MM"] = "{}" + + + diff --git a/pybuda/test/operators/nary/test_where.py b/pybuda/test/operators/nary/test_where.py new file mode 100644 index 000000000..f9fce3f27 --- /dev/null +++ b/pybuda/test/operators/nary/test_where.py @@ -0,0 +1,190 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +# +# Tests for testing of where operator +# + +import pytest + +import torch + +import pybuda +import pybuda.op +import pybuda.tensor + +from pybuda import PyBudaModule, VerifyConfig +from pybuda.verify import TestKind, verify_module + +@pytest.mark.skip(reason="This test is failing due to not supporting 'BoolTensor' for a condition") +def test_cond_bool_tensor_manual_inputs(test_device): + class Model(PyBudaModule): + def __init__(self, name): + super().__init__(name) + + def forward(self, cond, x, y): + return pybuda.op.Where("Where0", cond, x, y) + + mod = Model("where_test_model") + + # manual creation of input tensors + # contidion_tensor is a boolean tensor what it should be + condition_tensor = pybuda.tensor.Tensor.create_from_torch( + torch.tensor([[[1, 0], + [1, 0], + [1, 0]]], dtype=torch.bool) + ) + x_tensor = pybuda.tensor.Tensor.create_from_torch( + torch.tensor([[[0.1490, 0.3861], + [1.4934, 0.4805], + [-0.3992, -1.1574]]]) + ) + y_tensor = pybuda.tensor.Tensor.create_from_torch( + torch.tensor([[[1.0, 1.0], + [1.0, 1.0], + [1.0, 1.0]]]) + ) + + verify_module( + mod, + input_shapes=None, + verify_cfg=VerifyConfig( + test_kind=TestKind.INFERENCE, + devtype=test_device.devtype, + arch=test_device.arch, + ), + inputs=[(condition_tensor, x_tensor, y_tensor)], + ) + +@pytest.mark.skip(reason="This test is failing when condition_tensor elements have values <> 0.0 or 1.0") +def test_cond_non_bool_tensor_manual_inputs(test_device): + class Model(PyBudaModule): + def __init__(self, name): + super().__init__(name) + + def forward(self, cond, x, y): + return pybuda.op.Where("Where0", cond, x, y) + + mod = Model("where_test_model") + + condition_tensor = pybuda.tensor.Tensor.create_from_torch( + torch.tensor([[[0.2, 1.0], + [0.0, 1.0], + [1.1, 1.0]]]) + ) + x_tensor = pybuda.tensor.Tensor.create_from_torch( + torch.tensor([[[0.1490, 0.3861], + [1.4934, 0.4805], + [-0.3992, -1.1574]]]) + ) + y_tensor = pybuda.tensor.Tensor.create_from_torch( + torch.tensor([[[1.0, 1.0], + [1.0, 1.0], + [1.0, 1.0]]]) + ) + + verify_module( + mod, + input_shapes=None, + verify_cfg=VerifyConfig( + test_kind=TestKind.INFERENCE, + devtype=test_device.devtype, + arch=test_device.arch, + ), + inputs=[(condition_tensor, x_tensor, y_tensor)], + ) + +@pytest.mark.skip(reason="This test is failing due assertion error - data mismatch detected") +@pytest.mark.parametrize("input_shape", [(1, 3, 3)]) +def test_where_input_shapes(test_device, input_shape): + class Model(PyBudaModule): + def __init__(self, name): + super().__init__(name) + + def forward(self, cond, x, y): + return pybuda.op.Where("Where0", cond, x, y) + + mod = Model("where_test_model") + input_shapes = tuple([input_shape for _ in range(3)]) + + verify_module( + mod, + input_shapes, + verify_cfg=VerifyConfig( + test_kind=TestKind.INFERENCE, + devtype=test_device.devtype, + arch=test_device.arch, + ), + ) + +# 
Manually test where operator with PyTorch and PyBuda. +# Results are same for both, but verify_module fails due to different pcc values. + +# working +cond_values_1 = [[[0., 0.], + [1., 0.], + [1., 0.]]] + +# not working +cond_values_2 = [[[0.2, 0.], + [1., 0.], + [1., 0.]]] + +@pytest.mark.skip(reason="This test is failing due to verify_module calculates wrong pcc") +@pytest.mark.parametrize("cond_values", [cond_values_1, cond_values_2]) +def test_where_verify_module(test_device, cond_values): + class Model(PyBudaModule): + def __init__(self, name): + super().__init__(name) + + def forward(self, cond, x, y): + v = pybuda.op.Where("Where0", cond, x, y) + # PyBuda always works as expected: + print(f"\n\nPyBuda output value: {v}\n\n") + return v + + mod = Model("where_test_model") + + condition_torch = torch.tensor(cond_values, dtype=torch.bool) # torch works only with bool type - explicit define dtype + condition_buda = pybuda.tensor.Tensor.create_from_torch(torch.tensor(cond_values)) # buda can work also with other types + + print(f"condition_torch:\n{condition_torch}") # condition is a boolean tensor + print(f"condition_buda:\n{condition_buda}") # condition is a float tensor + + x_tensor = pybuda.tensor.Tensor.create_from_torch( + torch.tensor([[[1000., 1000.], + [1000., 1000.], + [1000., 1000.]]]) + ) + y_tensor = pybuda.tensor.Tensor.create_from_torch( + torch.tensor([[[5.0, 5.0], + [5.0, 5.0], + [5.0, 5.0]]]) + ) + + result_torch = torch.where(condition_torch, pybuda.tensor.Tensor.to_pytorch(x_tensor), pybuda.tensor.Tensor.to_pytorch(y_tensor)) + print(f"result_torch:\n{result_torch}") + result_buda = pybuda.op.Where("Where0", condition_buda, x_tensor, y_tensor) + print(f"result_buda:\n{result_buda}") + + output_are_the_same = torch.eq(result_torch, pybuda.tensor.Tensor.to_pytorch(result_buda)).all() + print(f"\nAre results equal: {output_are_the_same}") + if not output_are_the_same: + # never failing here + pytest.fail("Results are not equal") + + # verify_module calculates wrong pcc value for failing case + # This is the error message for failing case: + # "AssertionError: Data mismatch on iteration 0 - Eval Output 0. PCC got 0.8087360843031886, required=0.99" + # ... + # "1 failed, 1 passed in 0.89s" + verify_module( + mod, + input_shapes=None, + verify_cfg=VerifyConfig( + test_kind=TestKind.INFERENCE, + devtype=test_device.devtype, + arch=test_device.arch, + ), + inputs=[(condition_buda, x_tensor, y_tensor)], + ) diff --git a/pybuda/test/operators/reduce/test_reduce_4d.py b/pybuda/test/operators/reduce/test_reduce_4d.py index e2bcbc76d..9769ba4a9 100644 --- a/pybuda/test/operators/reduce/test_reduce_4d.py +++ b/pybuda/test/operators/reduce/test_reduce_4d.py @@ -18,6 +18,8 @@ import pybuda.op from pybuda import TTDevice, BackendType, pybuda_compile, VerifyConfig, CompilerConfig +from pybuda.verify.config import TestKind + from . 
import models_4d MODELS_PATH = "./pybuda/test/operators/reduce/models_4d/" @@ -48,13 +50,16 @@ @pytest.mark.parametrize("shape", shape, ids=[f"shape={'x'.join([str(jtem) for jtem in item])}" for item in shape]) @pytest.mark.parametrize("operation", ["ReduceSum", "ReduceAvg", "ReduceMax"]) @pytest.mark.parametrize("model", [item.split(".")[0] for item in os.listdir(MODELS_PATH) if "model" in item]) +@pytest.mark.parametrize("op_test_kind", [TestKind.INFERENCE]) def test_reduce( - test_kind, + op_test_kind, operation, model, shape ): + test_kind = op_test_kind + if operation == "ReduceMax" and test_kind.is_training(): pytest.xfail() diff --git a/pybuda/test/operators/reduce/test_reduce_nd.py b/pybuda/test/operators/reduce/test_reduce_nd.py index f0b5f89cc..bdbc7934b 100644 --- a/pybuda/test/operators/reduce/test_reduce_nd.py +++ b/pybuda/test/operators/reduce/test_reduce_nd.py @@ -58,7 +58,7 @@ @pytest.mark.parametrize("operation", ["ReduceSum", "ReduceAvg"]) @pytest.mark.parametrize("recompute", (True, False), ids=["Recompute", "NoRecompute"]) @pytest.mark.parametrize("model", [item.split(".")[0] for item in os.listdir(MODELS_PATH) if "model" in item]) -@pytest.mark.parametrize("mode", ["Training", "Inference"]) +@pytest.mark.parametrize("mode", ["Inference"]) def test_reduce( mode, recompute, diff --git a/pybuda/test/operators/tm/fuse/__init__.py b/pybuda/test/operators/tm/fuse/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/pybuda/test/operators/tm/fuse/test_fuse_tm_sequence.py b/pybuda/test/operators/tm/fuse/test_fuse_tm_sequence.py new file mode 100644 index 000000000..a749bba34 --- /dev/null +++ b/pybuda/test/operators/tm/fuse/test_fuse_tm_sequence.py @@ -0,0 +1,111 @@ +import pybuda +import pybuda.op +from pybuda import PyBudaModule + +import torch +import os + +from pybuda.verify.backend import verify_module +from pybuda import VerifyConfig +from pybuda.verify.config import TestKind + + + +class PtFuseTMMultiUser(PyBudaModule): + def __init__(self, name): + super().__init__(name) + self.add_parameter("segformer.encoder.layer_norm.0.weight", pybuda.Parameter(*(32,), requires_grad=True, dev_data_format=pybuda.DataFormat.Float32)) + self.add_parameter("segformer.encoder.layer_norm.0.bias", pybuda.Parameter(*(32,), requires_grad=True, dev_data_format=pybuda.DataFormat.Float32)) + self.add_parameter("segformer.encoder.patch_embeddings.1.proj.weight", pybuda.Parameter(*(64, 32, 3, 3), requires_grad=True, dev_data_format=pybuda.DataFormat.Float32)) + self.add_parameter("segformer.encoder.patch_embeddings.1.proj.bias", pybuda.Parameter(*(64,), requires_grad=True, dev_data_format=pybuda.DataFormat.Float32)) + + def forward(self, input): + layernorm_340 = pybuda.op.Layernorm("", input, self.get_parameter("segformer.encoder.layer_norm.0.weight"), self.get_parameter("segformer.encoder.layer_norm.0.bias"), dim=-1, epsilon=1e-05) + reshape_341 = pybuda.op.Reshape("", layernorm_340, shape=(1, 128, 128, 32)) + transpose_342 = pybuda.op.Transpose("", reshape_341, dim0=-3, dim1=-1, z_dim_slice=32, out_dtype=torch.float32) + transpose_343 = pybuda.op.Transpose("", transpose_342, dim0=-2, dim1=-1, out_dtype=torch.float32) + conv2d_344 = pybuda.op.Conv2d("", transpose_343, self.get_parameter("segformer.encoder.patch_embeddings.1.proj.weight"), self.get_parameter("segformer.encoder.patch_embeddings.1.proj.bias"), stride=[2, 2], padding=[1, 1, 1, 1], dilation=1, groups=1, channel_last=0) + reshape_783 = pybuda.op.Reshape("", transpose_343, shape=(1, 32, 16384)) + transpose_784 = 
pybuda.op.Transpose("", reshape_783, dim0=-2, dim1=-1, out_dtype=torch.float32) + reshape_785 = pybuda.op.Reshape("", transpose_784, shape=(16384, 32)) + return conv2d_344, reshape_785 + +def test_fuse_tm_sequence_multi_user(test_device): + """ + Test case to fuse tm sequence when there are multiple user for the matched pattern in optimization buda passes + + Pattern to Match: + vslice + transpose(-3, -1) + transpose(-2, -1) + reshape + + + Pattern to Replace: + transpose(-2, -1) + + + Graph before fusing: + + Input + [1, 1, 16384, 32] + | + | + vslice_1(128,) + [1, 128, 128, 32] + | + | + transpose_1(-3, -1, 32) + [1, 32, 128, 128] + | + | + transpose_2(-2, -1, -1) + [1, 32, 128, 128] + /\ + / \ + / \ + reshape_1 reshape_2 + [1, 1, 32, 16384] [1, 32, 16384] + + + Graph after fusing: + + Input + [1, 1, 16384, 32] + | + | + fused_op_transpose_1(-2, -1, -1) + [1, 1, 32, 16384] + \ + \ + reshape_2 + [1, 32, 16384] + + + If there are multiple user at the last pattern matched node which are same op and same shape + (i.e reshape_1(1, 1, 32, 16384) and reshape_2(1, 32, 16384)), in that cases reshape_1 will be fused + and reshape_2 will be connected to the fused_op_transpose_1. + + """ + + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_RIBBON2"]="1" + + tt_model = PtFuseTMMultiUser("fuse_tm_sequence_multi_user") + + pt_tensor = pybuda.Tensor.create_from_torch(torch.rand((1, 16384, 32))) + + verify_module( + tt_model, + input_shapes=(pt_tensor.shape,), + inputs=[(pt_tensor,)], + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + ), + ) + diff --git a/pybuda/test/operators/tm/hstack_hslice/test_hstack_hslice.py b/pybuda/test/operators/tm/hstack_hslice/test_hstack_hslice.py index a65c40e18..64dbef931 100644 --- a/pybuda/test/operators/tm/hstack_hslice/test_hstack_hslice.py +++ b/pybuda/test/operators/tm/hstack_hslice/test_hstack_hslice.py @@ -55,7 +55,7 @@ @pytest.mark.parametrize("shape, slice", zip(shape, slices), ids=["shape=" + "x".join([str(item) for item in sh])+ "-slice=" + str(sl) for sh, sl in zip(shape, slices)]) @pytest.mark.parametrize("recompute", (True, False), ids=["Recompute", "NoRecompute"]) @pytest.mark.parametrize("model", [item.split(".")[0] for item in os.listdir(MODELS_PATH) if "model" in item]) -@pytest.mark.parametrize("mode", ["Training", "Inference"]) +@pytest.mark.parametrize("mode", ["Inference"]) def test_hstack_hslice( mode, recompute, diff --git a/pybuda/test/operators/tm/pad/test_pad.py b/pybuda/test/operators/tm/pad/test_pad.py index dcee56460..9e06ee61b 100644 --- a/pybuda/test/operators/tm/pad/test_pad.py +++ b/pybuda/test/operators/tm/pad/test_pad.py @@ -65,7 +65,7 @@ @pytest.mark.parametrize("shape", shape, ids=["shape" + "x".join([str(jtem) for jtem in item]) for item in shape]) @pytest.mark.parametrize("recompute", (True, False), ids=["Recompute", "NoRecompute"]) @pytest.mark.parametrize("model", [item.split(".")[0] for item in os.listdir(MODELS_PATH) if "model" in item]) -@pytest.mark.parametrize("mode", ["Training", "Inference"]) +@pytest.mark.parametrize("mode", ["Inference"]) def test_hstack_hslice( test_device, mode, diff --git a/pybuda/test/operators/tm/reshape/test_reshape.py b/pybuda/test/operators/tm/reshape/test_reshape.py index 51da60263..590a7b74f 100644 --- a/pybuda/test/operators/tm/reshape/test_reshape.py +++ 
b/pybuda/test/operators/tm/reshape/test_reshape.py @@ -14,6 +14,7 @@ import pybuda.op from pybuda import TTDevice, BackendType, pybuda_compile, VerifyConfig, CompilerConfig +from pybuda.verify.config import TestKind from . import models @@ -47,13 +48,16 @@ @pytest.mark.parametrize("old_shape, new_shape", zip(old_shape, new_shape), ids=["old_shape=" + "x".join([str(item) for item in old]) + "-new_shape=" + "x".join([str(item) for item in new]) for old, new in zip(old_shape, new_shape)]) @pytest.mark.parametrize("model", [item.split(".")[0] for item in os.listdir(MODELS_PATH) if "model" in item]) +@pytest.mark.parametrize("op_test_kind", [TestKind.INFERENCE]) def test_reshape( - test_kind, + op_test_kind, model, old_shape, new_shape ): + test_kind = op_test_kind + if model == "model_3": pytest.skip("These models return intermediate nodes. That's not supported today." "Autograd is trying to do backward pass twice for the same subpath in the graph and that's not correct. ") diff --git a/pybuda/test/operators/tm/vstack_vslice/test_vstack_vslice.py b/pybuda/test/operators/tm/vstack_vslice/test_vstack_vslice.py index 7463ae190..dea76799c 100644 --- a/pybuda/test/operators/tm/vstack_vslice/test_vstack_vslice.py +++ b/pybuda/test/operators/tm/vstack_vslice/test_vstack_vslice.py @@ -57,7 +57,7 @@ @pytest.mark.parametrize("shape, slice", zip(shape, slices), ids=["shape=" + "x".join([str(item) for item in sh])+ "-slice=" + str(sl) for sh, sl in zip(shape, slices)]) @pytest.mark.parametrize("recompute", (True, False), ids=["Recompute", "NoRecompute"]) @pytest.mark.parametrize("model", [item.split(".")[0] for item in os.listdir(MODELS_PATH) if "model" in item]) -@pytest.mark.parametrize("mode", ["Training", "Inference"]) +@pytest.mark.parametrize("mode", ["Inference"]) def test_vstack_vslice( mode, recompute, diff --git a/pybuda/test/quantized/simple_models/conv2d_with_bias-Int8.onnx b/pybuda/test/quantized/simple_models/conv2d_with_bias-Int8.onnx deleted file mode 100644 index 62048e8dd..000000000 Binary files a/pybuda/test/quantized/simple_models/conv2d_with_bias-Int8.onnx and /dev/null differ diff --git a/pybuda/test/quantized/simple_models/conv_model-QOperator.onnx b/pybuda/test/quantized/simple_models/conv_model-QOperator.onnx deleted file mode 100644 index 7862e3447..000000000 Binary files a/pybuda/test/quantized/simple_models/conv_model-QOperator.onnx and /dev/null differ diff --git a/pybuda/test/quantized/simple_models/conv_model.onnx b/pybuda/test/quantized/simple_models/conv_model.onnx deleted file mode 100644 index a8fe8fa67..000000000 Binary files a/pybuda/test/quantized/simple_models/conv_model.onnx and /dev/null differ diff --git a/pybuda/test/quantized/simple_models/matmul_no_bias-Int8.onnx b/pybuda/test/quantized/simple_models/matmul_no_bias-Int8.onnx deleted file mode 100644 index 94af59dbf..000000000 Binary files a/pybuda/test/quantized/simple_models/matmul_no_bias-Int8.onnx and /dev/null differ diff --git a/pybuda/test/quantized/simple_models/matmul_no_bias-UInt8.onnx b/pybuda/test/quantized/simple_models/matmul_no_bias-UInt8.onnx deleted file mode 100644 index 9c83b7922..000000000 Binary files a/pybuda/test/quantized/simple_models/matmul_no_bias-UInt8.onnx and /dev/null differ diff --git a/pybuda/test/quantized/simple_models/matmul_no_bias-infer.onnx b/pybuda/test/quantized/simple_models/matmul_no_bias-infer.onnx deleted file mode 100644 index 7eaa81172..000000000 Binary files a/pybuda/test/quantized/simple_models/matmul_no_bias-infer.onnx and /dev/null differ diff --git 
a/pybuda/test/quantized/simple_models/matmul_no_bias.onnx b/pybuda/test/quantized/simple_models/matmul_no_bias.onnx deleted file mode 100644 index c106f960a..000000000 --- a/pybuda/test/quantized/simple_models/matmul_no_bias.onnx +++ /dev/null @@ -1,13 +0,0 @@ -pytorch2.1.0: -8 -onnx::MatMul_0 -onnx::MatMul_43 /fc1/MatMul"MatMul -main_graph*(Bonnx::MatMul_4J0=v*>>Z -onnx::MatMul_0 -  - -b -3 -  - -B \ No newline at end of file diff --git a/pybuda/test/quantized/simple_models/matmul_with_bias-Int8.onnx b/pybuda/test/quantized/simple_models/matmul_with_bias-Int8.onnx deleted file mode 100644 index ae8bb496d..000000000 Binary files a/pybuda/test/quantized/simple_models/matmul_with_bias-Int8.onnx and /dev/null differ diff --git a/pybuda/test/quantized/simple_models/mlp-QOperator.onnx b/pybuda/test/quantized/simple_models/mlp-QOperator.onnx deleted file mode 100644 index a2848cb9e..000000000 Binary files a/pybuda/test/quantized/simple_models/mlp-QOperator.onnx and /dev/null differ diff --git a/pybuda/test/quantized/simple_models/mlp.onnx b/pybuda/test/quantized/simple_models/mlp.onnx deleted file mode 100644 index ccb07928a..000000000 Binary files a/pybuda/test/quantized/simple_models/mlp.onnx and /dev/null differ diff --git a/pybuda/test/quantized/simple_models/mlp_gelu-QOperator.onnx b/pybuda/test/quantized/simple_models/mlp_gelu-QOperator.onnx deleted file mode 100644 index 4a0ca0009..000000000 Binary files a/pybuda/test/quantized/simple_models/mlp_gelu-QOperator.onnx and /dev/null differ diff --git a/pybuda/test/quantized/simple_models/mlp_gelu.onnx b/pybuda/test/quantized/simple_models/mlp_gelu.onnx deleted file mode 100644 index 6e4916bf5..000000000 Binary files a/pybuda/test/quantized/simple_models/mlp_gelu.onnx and /dev/null differ diff --git a/pybuda/test/quantized/test_onnx_quantized.py b/pybuda/test/quantized/test_onnx_quantized.py index dcd0a05ea..648eb0991 100644 --- a/pybuda/test/quantized/test_onnx_quantized.py +++ b/pybuda/test/quantized/test_onnx_quantized.py @@ -20,11 +20,8 @@ from pybuda.verify.config import TestKind from pybuda.config import _get_global_compiler_config -def test_onnx_quantized_mlp_gelu(test_kind, test_device): +def test_onnx_quantized_mlp_gelu(test_device): pytest.skip() - # Skip training - if test_kind.is_training(): - pytest.skip() # Download ONNX model save_path = "pybuda/test/quantized/simple_models/mlp_gelu-QOperator.onnx" @@ -42,7 +39,6 @@ def test_onnx_quantized_mlp_gelu(test_kind, test_device): # Configurations compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_auto_fusing = False # Sanity run @@ -59,16 +55,14 @@ def test_onnx_quantized_mlp_gelu(test_kind, test_device): verify_cfg=VerifyConfig( arch=test_device.arch, devtype=test_device.devtype, - test_kind=test_kind, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, verify_pybuda_codegen_vs_framework=True, ), ) -def test_onnx_quantized_mlp(test_kind, test_device): +def test_onnx_quantized_mlp(test_device): pytest.skip() - # Skip training - if test_kind.is_training(): - pytest.skip() # Download ONNX model save_path = "pybuda/test/quantized/simple_models/mlp-QOperator.onnx" @@ -86,7 +80,6 @@ def test_onnx_quantized_mlp(test_kind, test_device): # Configurations compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_auto_fusing = False # Sanity run @@ -103,17 +96,15 @@ def test_onnx_quantized_mlp(test_kind, 
test_device): verify_cfg=VerifyConfig( arch=test_device.arch, devtype=test_device.devtype, - test_kind=test_kind, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, verify_pybuda_codegen_vs_framework=True, ), ) -def test_onnx_quantized_conv(test_kind, test_device): +def test_onnx_quantized_conv(test_device): pytest.skip() - # Skip training - if test_kind.is_training(): - pytest.skip() # Download ONNX model save_path = "pybuda/test/quantized/simple_models/conv2d_with_bias-Int8.onnx" @@ -131,7 +122,6 @@ def test_onnx_quantized_conv(test_kind, test_device): # Configurations compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_auto_fusing = False # Sanity run @@ -169,16 +159,14 @@ def test_onnx_quantized_conv(test_kind, test_device): # verify_cfg=VerifyConfig( # arch=test_device.arch, # devtype=test_device.devtype, + # devmode=test_device.devmode, # test_kind=test_kind, # verify_pybuda_codegen_vs_framework=True, # ), # ) -def test_onnx_quantized_mm_int8_no_bias(test_kind, test_device): +def test_onnx_quantized_mm_int8_no_bias(test_device): pytest.skip() - # Skip training - if test_kind.is_training(): - pytest.skip() # Download ONNX model save_path = "pybuda/test/quantized/simple_models/matmul_no_bias-Int8.onnx" @@ -196,7 +184,6 @@ def test_onnx_quantized_mm_int8_no_bias(test_kind, test_device): # Configurations compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_auto_fusing = False # Sanity run @@ -213,17 +200,15 @@ def test_onnx_quantized_mm_int8_no_bias(test_kind, test_device): verify_cfg=VerifyConfig( arch=test_device.arch, devtype=test_device.devtype, - test_kind=test_kind, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, verify_pybuda_codegen_vs_framework=True, # verify_all=True, # need to update matmul eval in buda ), ) -def test_onnx_quantized_mm_int8_bias(test_kind, test_device): +def test_onnx_quantized_mm_int8_bias(test_device): pytest.skip() - # Skip training - if test_kind.is_training(): - pytest.skip() # Download ONNX model save_path = "pybuda/test/quantized/simple_models/matmul_with_bias-Int8.onnx" @@ -241,7 +226,6 @@ def test_onnx_quantized_mm_int8_bias(test_kind, test_device): # Configurations compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_auto_fusing = False # Sanity run @@ -258,17 +242,15 @@ def test_onnx_quantized_mm_int8_bias(test_kind, test_device): verify_cfg=VerifyConfig( arch=test_device.arch, devtype=test_device.devtype, - test_kind=test_kind, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, verify_pybuda_codegen_vs_framework=True, # verify_all=True, ), ) -def test_onnx_quantized_mm_uint8_no_bias(test_kind, test_device): +def test_onnx_quantized_mm_uint8_no_bias(test_device): pytest.skip() - # Skip training - if test_kind.is_training(): - pytest.skip() # Download ONNX model save_path = "pybuda/test/quantized/simple_models/matmul_no_bias-UInt8.onnx" @@ -286,7 +268,6 @@ def test_onnx_quantized_mm_uint8_no_bias(test_kind, test_device): # Configurations compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_auto_fusing = False # Sanity run @@ -303,88 +284,12 @@ def test_onnx_quantized_mm_uint8_no_bias(test_kind, test_device): verify_cfg=VerifyConfig( 
arch=test_device.arch, devtype=test_device.devtype, - test_kind=test_kind, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, verify_pybuda_codegen_vs_framework=True, verify_all=True, ), ) -def test_onnx_quantized_resnet(test_kind, test_device): - if test_device.arch == BackendDevice.Grayskull: - pytest.skip() - - # Skip training - if test_kind.is_training(): - pytest.skip() - - # Download ONNX model - save_path = "third_party/confidential_customer_models/model_2/onnx/quant/ResNet50-v1.5-Int8.onnx" - if not os.path.exists(save_path): - raise RuntimeError("Model not found") - - # LOAD ONNX model - onnx_model = onnx.load(save_path) - onnx.checker.check_model(onnx_model) - pybuda_onnx_model = OnnxModule( - "onnx_quantized_ResNet50", - onnx_model, - save_path, - ) - # Configurations - compiler_cfg = _get_global_compiler_config() - compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True - compiler_cfg.enable_auto_fusing = False - compiler_cfg.graph_solver_self_cut_type = "FastCut" - compiler_cfg.default_df_override = DataFormat.Float32 - - os.environ["PYBUDA_DISABLE_CONV_MULTI_OP_FRACTURE"] = "1" - os.environ["PYBUDA_FRACTURIZATION_DISABLE"] = "1" - # os.environ["PYBUDA_REPRODUCE_SUBGRAPH"] = "1" - # os.environ["PYBUDA_REPRODUCE_SUBGRAPH_INPUT"] = "quantize_0.dc.buda_quantize.1" - # os.environ["PYBUDA_REPRODUCE_SUBGRAPH_OUTPUT"] = "conv2d_1.dc.matmul.11" - - # Sanity run - input_shape = [] - for i in range(len(onnx_model.graph.input)): - dimension = onnx_model.graph.input[i].type.tensor_type.shape - i_shape = [d.dim_value for d in dimension.dim] - input_shape.append(i_shape) - - # tti_path = "onnx_int8_resnet50_epoch_0.tti" - # if not os.path.exists(tti_path): - # tt_module = pybuda_onnx_model - # device = pybuda.TTDevice( - # "tt0", module=tt_module,arch=pybuda.BackendDevice.Wormhole_B0, devtype=pybuda.BackendType.Silicon) - # tti_img = device.compile_to_image( - # img_path=tti_path, - # training=False, - # sample_inputs=[torch.randn(shape) for shape in input_shape], - # ) - - - # device_img: pybuda.TTDeviceImage = pybuda.TTDeviceImage.load_from_disk(tti_path) - # ttdevice = pybuda.TTDevice.load_image(img=device_img) - - # inputs = [torch.randn(shape) for shape in input_shape] - # ttdevice.push_to_inputs(*inputs) - # output_q = pybuda.run_inference(_sequential=True) - # output = output_q.get()[0].value().detach() - - # golden_output = pybuda_onnx_model.forward(*inputs) - # assert np.allclose(output, golden_output[0], atol=1e-3, rtol=1e-3) - # Compile and verify - verify_module( - pybuda_onnx_model, - input_shape, - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - test_kind=test_kind, - verify_pybuda_codegen_vs_framework=True, - verify_all=True, - ), - ) - diff --git a/pybuda/test/quantized/test_onnx_quantized_mobilenet.py b/pybuda/test/quantized/test_onnx_quantized_mobilenet.py new file mode 100644 index 000000000..8225c3fc5 --- /dev/null +++ b/pybuda/test/quantized/test_onnx_quantized_mobilenet.py @@ -0,0 +1,127 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import os +import urllib + +import onnx +import pytest +import numpy as np +import onnxruntime +import torch +import pybuda +from pybuda import ( + OnnxModule, + VerifyConfig, + DataFormat, + BackendDevice, + BackendType, +) +from pybuda.verify import verify_module +from pybuda.verify.config import TestKind +from pybuda.config import _get_global_compiler_config + +def test_onnx_quantized_mb_v2_depth(test_device): + # 
Skip test on blackhole until we have support for quantized models on blackhole pybuda#2700 + if test_device.arch == BackendDevice.Blackhole: + pytest.skip("Blackhole does not support quantized models") + + # Download ONNX model + save_path = "third_party/confidential_customer_models/quantized/mb_v2_depthwise-Int8.onnx" + if not os.path.exists(save_path): + raise RuntimeError("Model not found") + + # LOAD ONNX model + onnx_model = onnx.load(save_path) + onnx.checker.check_model(onnx_model) + pybuda_onnx_model = OnnxModule( + "onnx_quantized_mb_v2_depthwise", + onnx_model, + save_path, + ) + # Configurations + compiler_cfg = _get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.enable_t_streaming = True + compiler_cfg.enable_auto_fusing = False + os.environ["PYBUDA_RIBBON2"] = "1" + if test_device.devtype == BackendType.Silicon: + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{80*1024}" + + # Sanity run + input_shape = [] + for i in range(len(onnx_model.graph.input)): + dimension = onnx_model.graph.input[i].type.tensor_type.shape + i_shape = [d.dim_value for d in dimension.dim] + input_shape.append(i_shape) + + # Compile and verify + verify_module( + pybuda_onnx_model, + input_shape, + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + enabled = False if test_device.devtype == BackendType.Silicon else True, + # verify_pybuda_codegen_vs_framework=True, + # verify_all=True + ), + ) + + +def test_onnx_quantized_mb_v2(test_device): + # Skip test on blackhole until we have support for quantized models on blackhole pybuda#2700 + if test_device.arch == BackendDevice.Blackhole: + pytest.skip("Blackhole does not support quantized models") + + # Download ONNX model + save_path = "third_party/confidential_customer_models/quantized/mobilenet_v2-Int8.onnx" + if not os.path.exists(save_path): + raise RuntimeError("Model not found") + + # LOAD ONNX model + onnx_model = onnx.load(save_path) + onnx.checker.check_model(onnx_model) + pybuda_onnx_model = OnnxModule( + "onnx_quantized_mb_v2", + onnx_model, + save_path, + ) + # Configurations + compiler_cfg = _get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.enable_t_streaming = True + compiler_cfg.enable_auto_fusing = False + compiler_cfg.graph_solver_self_cut_type = "ConsumerOperandDataEdgesFirst" + compiler_cfg.place_on_new_epoch("conv2d_118.dc.reshape.0.dc.sparse_matmul.14.lc2") + os.environ["PYBUDA_RIBBON2"] = "1" + os.environ["PYBUDA_DISABLE_CONV_MULTI_OP_FRACTURE"] = "1" + os.environ["PYBUDA_FRACTURIZATION_DISABLE"] = "1" + os.environ["PYBUDA_DISABLE_PADDING_PASS"] = "1" + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{80*1024}" + if test_device.devtype == BackendType.Silicon: + os.environ["TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE"] = f"{96*1024}" + + # Sanity run + input_shape = [] + for i in range(len(onnx_model.graph.input)): + dimension = onnx_model.graph.input[i].type.tensor_type.shape + i_shape = [d.dim_value for d in dimension.dim] + input_shape.append(i_shape) + + # Compile and verify + verify_module( + pybuda_onnx_model, + input_shape, + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + enabled = False if test_device.devtype == BackendType.Silicon else True, + # verify_pybuda_codegen_vs_framework=True, + # verify_all=True + ), + ) \ No newline at end of file diff --git 
a/pybuda/test/quantized/test_onnx_quantized_resnet.py b/pybuda/test/quantized/test_onnx_quantized_resnet.py new file mode 100644 index 000000000..bfbf16c00 --- /dev/null +++ b/pybuda/test/quantized/test_onnx_quantized_resnet.py @@ -0,0 +1,102 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import os +import urllib + +import onnx +import pytest +import numpy as np +import onnxruntime +import torch +import pybuda +from pybuda import ( + OnnxModule, + VerifyConfig, + DataFormat, + BackendDevice, +) +from pybuda.verify import verify_module +from pybuda.verify.config import TestKind +from pybuda.config import _get_global_compiler_config + + + + +def test_onnx_quantized_resnet(test_device): + # Skip test on blackhole until we have support for quantized models on blackhole pybuda#2700 + if test_device.arch == BackendDevice.Blackhole: + pytest.skip("Blackhole does not support quantized models") + + if test_device.arch == BackendDevice.Grayskull: + pytest.skip() + + # Download ONNX model + save_path = "third_party/confidential_customer_models/quantized/ResNet50-v1.5-Int8.onnx" + if not os.path.exists(save_path): + raise RuntimeError("Model not found") + + # LOAD ONNX model + onnx_model = onnx.load(save_path) + onnx.checker.check_model(onnx_model) + pybuda_onnx_model = OnnxModule( + "onnx_quantized_ResNet50", + onnx_model, + save_path, + ) + # Configurations + compiler_cfg = _get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.enable_auto_fusing = False + compiler_cfg.graph_solver_self_cut_type = "FastCut" + compiler_cfg.default_df_override = DataFormat.Float32 + + # os.environ["PYBUDA_DISABLE_CONV_MULTI_OP_FRACTURE"] = "1" + os.environ["PYBUDA_FRACTURIZATION_DISABLE"] = "1" + # os.environ["PYBUDA_REPRODUCE_SUBGRAPH"] = "1" + # os.environ["PYBUDA_REPRODUCE_SUBGRAPH_INPUT"] = "quantize_0.dc.buda_quantize.1" + # os.environ["PYBUDA_REPRODUCE_SUBGRAPH_OUTPUT"] = "conv2d_1.dc.matmul.11" + + # Sanity run + input_shape = [] + for i in range(len(onnx_model.graph.input)): + dimension = onnx_model.graph.input[i].type.tensor_type.shape + i_shape = [d.dim_value for d in dimension.dim] + input_shape.append(i_shape) + + # tti_path = "onnx_int8_resnet50_epoch_0.tti" + # if not os.path.exists(tti_path): + # tt_module = pybuda_onnx_model + # device = pybuda.TTDevice( + # "tt0", module=tt_module,arch=pybuda.BackendDevice.Wormhole_B0, devtype=pybuda.BackendType.Silicon) + # tti_img = device.compile_to_image( + # img_path=tti_path, + # training=False, + # sample_inputs=[torch.randn(shape) for shape in input_shape], + # ) + + + # device_img: pybuda.TTDeviceImage = pybuda.TTDeviceImage.load_from_disk(tti_path) + # ttdevice = pybuda.TTDevice.load_image(img=device_img) + + # inputs = [torch.randn(shape) for shape in input_shape] + # ttdevice.push_to_inputs(*inputs) + # output_q = pybuda.run_inference(_sequential=True) + # output = output_q.get()[0].value().detach() + + # golden_output = pybuda_onnx_model.forward(*inputs) + # assert np.allclose(output, golden_output[0], atol=1e-3, rtol=1e-3) + # Compile and verify + verify_module( + pybuda_onnx_model, + input_shape, + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + # verify_pybuda_codegen_vs_framework=True, + verify_all=True, + ), + ) + diff --git a/pybuda/test/quantized/test_onnx_quantized_vit.py b/pybuda/test/quantized/test_onnx_quantized_vit.py new file mode 100644 index 000000000..7601c77d8 
--- /dev/null +++ b/pybuda/test/quantized/test_onnx_quantized_vit.py @@ -0,0 +1,71 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import os + +import onnx +import pytest +from pybuda import ( + OnnxModule, + VerifyConfig, + DataFormat, + BackendDevice, + BackendType, +) +from pybuda.verify import verify_module +from pybuda.verify.config import TestKind +from pybuda.config import _get_global_compiler_config + +def test_int8_onnx_vit_calibrated(test_device): + # Skip test on blackhole until we have support for quantized models on blackhole pybuda#2700 + if test_device.arch == BackendDevice.Blackhole: + pytest.skip("Blackhole does not support quantized models") + + if test_device.arch == BackendDevice.Grayskull: + pytest.skip() + + # Download ONNX model + save_path = "third_party/confidential_customer_models/quantized/vit-Int8-calibrated.onnx" + if not os.path.exists(save_path): + raise RuntimeError("Model not found") + + # LOAD ONNX model + onnx_model = onnx.load(save_path) + onnx.checker.check_model(onnx_model) + pybuda_onnx_model = OnnxModule( + "onnx_quantized_vit_calibrated", + onnx_model, + save_path, + ) + # Configurations + compiler_cfg = _get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.enable_t_streaming = True + compiler_cfg.enable_auto_fusing = False + compiler_cfg.graph_solver_self_cut_type = "FastCut" + compiler_cfg.default_df_override = DataFormat.Float32 + + # os.environ["PYBUDA_DISABLE_CONV_MULTI_OP_FRACTURE"] = "1" + os.environ["PYBUDA_FRACTURIZATION_DISABLE"] = "1" + + # Sanity run + input_shape = [] + for i in range(len(onnx_model.graph.input)): + dimension = onnx_model.graph.input[i].type.tensor_type.shape + i_shape = [d.dim_value for d in dimension.dim] + input_shape.append(i_shape) + + + # Compile and verify + pcc = 0.97 if test_device.devtype == BackendType.Silicon else 0.99 + verify_module( + pybuda_onnx_model, + input_shape, + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + pcc=pcc, + ), + ) diff --git a/pybuda/test/random/conftest.py b/pybuda/test/random/conftest.py index 73220a606..640b16f2b 100644 --- a/pybuda/test/random/conftest.py +++ b/pybuda/test/random/conftest.py @@ -11,7 +11,7 @@ @pytest.fixture(autouse=True) def run_test(test_index, random_seeds): - pybuda.config.set_configuration_options(balancer_policy="Random", enable_t_streaming=True, use_interactive_placer=True) + pybuda.config.set_configuration_options(balancer_policy="Random", use_interactive_placer=True) os.environ["PYBUDA_BALANCER_RANDOM_POLICY_SEED"] = str(random_seeds[test_index]) rng = random.Random(random_seeds[test_index]) @@ -32,7 +32,20 @@ def pytest_generate_tests(metafunc): test_count = int(os.environ["RANDOM_TEST_COUNT"]) else: test_count = 5 - metafunc.parametrize("test_index", range(test_count)) + tests_selected_indecies = [] + if "RANDOM_TESTS_SELECTED" in os.environ: + tests_selected = os.environ["RANDOM_TESTS_SELECTED"] + tests_selected = tests_selected.strip() + if len(tests_selected) > 0: + tests_selected_indecies = tests_selected.split(",") + tests_selected_indecies = [int(i) for i in tests_selected_indecies] + if len(tests_selected_indecies) > 0: + metafunc.parametrize("test_index", tests_selected_indecies) + last_test_selected = max(tests_selected_indecies) + if test_count < last_test_selected + 1: + test_count = last_test_selected + 1 + else: + metafunc.parametrize("test_index", 
range(test_count)) global seeds if len(seeds) > 0: diff --git a/pybuda/test/random/test_three_ops.py b/pybuda/test/random/test_three_ops.py index 116b74a7d..2091dc801 100644 --- a/pybuda/test/random/test_three_ops.py +++ b/pybuda/test/random/test_three_ops.py @@ -30,7 +30,8 @@ def forward(self, act): raise Exception("Unknown op1") if self.op2 == "sqrt": - b = torch.sqrt(a) + # sqrt accepts only non-negative numbers + b = torch.sqrt(torch.relu(a)) elif self.op2 == "tanh": b = torch.tanh(a) elif self.op2 == "add": @@ -39,7 +40,12 @@ def forward(self, act): raise Exception("Unknown op2") if self.op3 == "matmul": - c = torch.matmul(a, torch.transpose(b, 1, 2)) + # if the first operation was conv2d, the last dim must move to second + if self.op1 == "conv2d": + a = a.permute(0, 3, 1, 2) + b = b.permute(0, 3, 1, 2) + # transpose should use the last 2 dims; in case of conv there are 4 dimensions + c = torch.matmul(a, torch.transpose(b, b.dim()-2, b.dim()-1)) elif self.op3 == "eltwise": c = a + b else: @@ -49,10 +55,14 @@ def forward(self, act): def test_three_ops(test_index, random_seeds, test_device): - rng = random.Random(random_seeds[test_index]) - rows = rng.randint(16, 512) - cols1 = rng.randint(16, 512) - cols2 = rng.randint(16, 512) + random_seed = random_seeds[test_index] + rng = random.Random(random_seed) + # smaller feature_size_factor results in fewer failed tests + feature_size_factor = 2 + # feature_size_factor = 16 + rows = rng.randint(16, 32 * feature_size_factor) + cols1 = rng.randint(16, 32 * feature_size_factor) + cols2 = rng.randint(16, 32 * feature_size_factor) microbatch_size = rng.randint(1, 8) model = ThreeOpModel(rng, cols1, cols2) diff --git a/pybuda/test/santacoder/decode.py b/pybuda/test/santacoder/decode.py index 12433f3ac..9309566f1 100644 --- a/pybuda/test/santacoder/decode.py +++ b/pybuda/test/santacoder/decode.py @@ -16,7 +16,7 @@ # Pytest to run santacoder model @pytest.mark.parametrize("tokens", [10, 100]) @pytest.mark.parametrize("device", ["silicon"]) -@pytest.mark.parametrize("arch", ["greyskull", "wormhole", "wormhole_b0"]) +@pytest.mark.parametrize("arch", ["greyskull", "wormhole_b0"]) @pytest.mark.parametrize("precision", ["fp32", "fp16", "bf16", "fp8", "fp8b"]) @pytest.mark.parametrize("amp_level", ["amp0", "amp1", "amp2"]) @pytest.mark.parametrize("num_chips", ["chip1", "chip2", "chip32"]) @@ -153,7 +153,7 @@ def main(parameters): parser.add_argument('--output-at-end', action='store_true', help='Output at the end of generation instead of token by token') parser.add_argument('-d', '--device', choices=['huggingface', 'pytorch', 'golden', 'silicon'], default='huggingface', help='huggingface: run using HF code only, pytorch: use our shim but run in PyTorch, golden/silicon: run via pybuda') - parser.add_argument('--arch', choices=['greyskull', 'wormhole', 'wormhole_b0'], default='wormhole', help='Architecture to use for silicon') + parser.add_argument('--arch', choices=['greyskull', 'wormhole_b0'], default='wormhole_b0', help='Architecture to use for silicon') parser.add_argument('--precision', choices=['fp32', 'fp16', 'bf16', 'fp8', 'fp8b'], default='fp32', help='Precision to use for all silicon tensors') parser.add_argument('--amp-level', type=int, choices=[0, 1, 2], help='Automatic mixed precision level (0=off, 1=mixed b-formats, 2=mixed a-formats)') parser.add_argument('--num-chips', type=int, default=1, help='Number of chips to use') diff --git a/pybuda/test/santacoder/pybudify.py b/pybuda/test/santacoder/pybudify.py index b71c1b79a..786018f5f 100644 ---
a/pybuda/test/santacoder/pybudify.py +++ b/pybuda/test/santacoder/pybudify.py @@ -7,7 +7,7 @@ class PyBudify(torch.nn.Module): - def __init__(self, pt_module, device='silicon', arch='wormhole', precision='fp32', amp_level=0, micro_batch_size=1, fuse=False, num_chips=1, perf=None, verify=False, log_level='ERROR', tti_save=None, tti_load=None): + def __init__(self, pt_module, device='silicon', arch='wormhole_b0', precision='fp32', amp_level=0, micro_batch_size=1, fuse=False, num_chips=1, perf=None, verify=False, log_level='ERROR', tti_save=None, tti_load=None): super().__init__() self.device = device @@ -17,7 +17,7 @@ def __init__(self, pt_module, device='silicon', arch='wormhole', precision='fp32 if device != 'pytorch': # pybuda workarounds - os.environ["GOLDEN_WORMHOLE"] = "1" + os.environ["GOLDEN_WORMHOLE_B0"] = "1" os.environ["PYBUDA_ENABLE_BROADCAST_SPLITTING"] = "1" #os.environ["PYBUDA_DISABLE_FORK_JOIN_BUF"] = "1" os.environ["PYBUDA_DRAM_PICK_CAPACITY"] = "1" @@ -67,7 +67,6 @@ def __init__(self, pt_module, device='silicon', arch='wormhole', precision='fp32 pybuda.set_configuration_options(default_df_override=fallback, accumulate_df=fallback, amp_level=amp_level, enable_auto_fusing=fuse, performance_trace=perf_level, backend_opt_level=3) pybuda_arch = { 'grayskull': pybuda.BackendDevice.Grayskull, - 'wormhole': pybuda.BackendDevice.Wormhole, 'wormhole_b0': pybuda.BackendDevice.Wormhole_B0 }[arch] if tti_load is not None: diff --git a/pybuda/test/serve/qa_serve.py b/pybuda/test/serve/qa_serve.py index c0f51d9f2..851a2e07f 100644 --- a/pybuda/test/serve/qa_serve.py +++ b/pybuda/test/serve/qa_serve.py @@ -65,7 +65,7 @@ def __init__(self): tokenizer = BertTokenizer.from_pretrained(model_name, pad_to_max_length=True) self.nlp = pipeline('question-answering', model=model, tokenizer=tokenizer) - test_device = TestDevice(devtype=BackendType.Golden, arch=BackendDevice.Wormhole) + test_device = TestDevice(devtype=BackendType.Golden, arch=BackendDevice.Wormhole_B0) # Create pipeline, with encoders on TT diff --git a/pybuda/test/test_api.py b/pybuda/test/test_api.py new file mode 100644 index 000000000..f181f2b32 --- /dev/null +++ b/pybuda/test/test_api.py @@ -0,0 +1,57 @@ +import os +import pytest +import torch +import torch.nn as nn + +import tensorflow as tf + +import pybuda +import pybuda.config + +def test_torch(): + class Add(nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x1, x2): + return torch.add(x1, x2) + + model = Add() + shape = (1, 1024, 32) + inputs = [torch.rand(shape), torch.rand(shape)] + + golden = model(*inputs) + + compiled_model = pybuda.compile(model, sample_inputs=[torch.rand(shape), torch.rand(shape)]) + + output = compiled_model(*inputs) + + print(f"golden: {golden}") + print(f"output: {output}") + if not torch.allclose(output[0], golden, rtol=1e-1): + raise ValueError("Output does not match the golden output") + +def test_tf(): + class TFAdd(tf.keras.Model): + def __init__(self): + super().__init__() + + def call(self, x1, x2): + return x1 + x2 + + model = TFAdd() + shape = (1, 1024, 32) + inputs = [torch.rand(shape), torch.rand(shape)] + + inputs_tf = [tf.convert_to_tensor(x) for x in inputs] + golden = model(inputs_tf[0], inputs_tf[1]) + golden = torch.tensor(golden.numpy()) + + compiled_model = pybuda.compile(model, sample_inputs=[torch.rand(shape), torch.rand(shape)]) + + output = compiled_model(*inputs) + + print(f"golden: {golden}") + print(f"output: {output}") + if not torch.allclose(output[0], golden, rtol=1e-1): + raise 
ValueError("Output does not match the golden output") diff --git a/pybuda/test/test_constraints.py b/pybuda/test/test_constraints.py index 18a2b305c..b78b990ff 100644 --- a/pybuda/test/test_constraints.py +++ b/pybuda/test/test_constraints.py @@ -108,7 +108,7 @@ def max_fork_streams(a, b): def test_stream_stacking_rotate(): - pybuda.config.set_configuration_options(balancer_policy="MaximizeTMinimizeGrid", enable_t_streaming=True) + pybuda.config.set_configuration_options(balancer_policy="MaximizeTMinimizeGrid") @compile( verify_cfg=VerifyConfig(run_golden=False, run_net2pipe=True) @@ -129,7 +129,7 @@ def stream_stacking_rotate(a, b, c): def test_stream_stacking_transpose(): - pybuda.config.set_configuration_options(balancer_policy="MaximizeTMinimizeGrid", enable_t_streaming=True) + pybuda.config.set_configuration_options(balancer_policy="MaximizeTMinimizeGrid") @compile( verify_cfg=VerifyConfig(run_golden=False, run_net2pipe=True) @@ -148,10 +148,7 @@ def stream_stacking_transpose(a, b, c): def test_r_stream_mm_rhs(): - pybuda.config.set_configuration_options( - balancer_policy="MaximizeTMinimizeGrid", - enable_t_streaming=True, - ) + pybuda.config.set_configuration_options(balancer_policy="MaximizeTMinimizeGrid") pybuda.config._get_global_compiler_config().insert_queues = [("exp0", "mm1", 1)] @compile( diff --git a/pybuda/test/test_conv2d.py b/pybuda/test/test_conv2d.py index 5157fa59c..8d93cf030 100644 --- a/pybuda/test/test_conv2d.py +++ b/pybuda/test/test_conv2d.py @@ -95,14 +95,13 @@ def test_conv2d( relative_atol = 0.3 if test_kind.is_training() and test_device.devtype == BackendType.Silicon else 0.1 pcc = 0.96 if test_device.devtype == BackendType.Silicon else 0.99 - pybuda.config.set_configuration_options(enable_conv_prestride=False, enable_t_streaming=False) + pybuda.config.set_configuration_options(enable_conv_prestride=False) try: pybuda.verify.verify_module(mod, [(1, in_channels, original_shape[0], original_shape[1])], VerifyConfig(test_kind=test_kind, devtype=test_device.devtype, arch=test_device.arch, relative_atol=relative_atol, pcc=pcc)) except RuntimeError as e: if ( "Compile failed for TTDevice" in str(e) or - "Nodes have no valid grids, exiting" in str(e) or "Could not satisfy all constraints for edge" in str(e) ): pytest.xfail("tenstorrent/pybuda#185") @@ -137,7 +136,6 @@ def test_convtranspose2d( padding_mode, ): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True training = test_kind.is_training() if training: @@ -177,14 +175,13 @@ def test_convtranspose2d( pybuda.verify.verify_module(mod, [(1, in_channels, original_shape[0], original_shape[1])], VerifyConfig(test_kind=test_kind, devtype=test_device.devtype, arch=test_device.arch, relative_atol=relative_atol, pcc=pcc)) except RuntimeError as e: - if "Compile failed for TTDevice" in str(e) or "nodes have no valid grids, exiting" in str(e): + if "Compile failed for TTDevice" in str(e): pytest.xfail("tenstorrent/pybuda#185") raise def test_convtranspose2d_data_mismatch_repro(test_device): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True # Fracturing the conv causes the data mismatch # Forcing the fracturing here, so the mismatch repros with small input @@ -263,7 +260,6 @@ def test_conv2d_t_streaming( compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "MaximizeTMinimizeGrid" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_conv_prestride = False pybuda.verify.verify_module(mod, [(1, in_channels, original_shape[0], 
original_shape[1])], @@ -326,7 +322,6 @@ def test_conv2d_fractured( compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "MaximizeTMinimizeGrid" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_conv_prestride = False pybuda.config.override_fracture_factor("conv2d_fractured.conv.dc.sparse_matmul.9.dc.sparse_matmul.1.lc2", kernel_size[0]) @@ -397,7 +392,6 @@ def test_conv2d_multi_op_fractured( compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "MaximizeTMinimizeGrid" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_conv_prestride = False os.environ["PYBUDA_FORCE_DISALLOW_FRACTURING"] = "1" # Disables "within-op" fracturing @@ -469,7 +463,6 @@ def test_conv2d_prestrided( compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "MaximizeTMinimizeGrid" - compiler_cfg.enable_t_streaming = True os.environ["PYBUDA_FORCE_DISALLOW_FRACTURING"] = "1" @@ -548,7 +541,6 @@ def test_conv2d_resnet_prestrided_fractured( compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "MaximizeTMinimizeGrid" - compiler_cfg.enable_t_streaming = True pybuda.config.override_fracture_factor("conv2d_resnet_prestrided_fractured.conv.dc.conv2d.3.dc.sparse_matmul.9.dc.sparse_matmul.1.lc2", 8) devices = pybuda.verify.verify_module(mod, [(1, in_channels, original_shape[0], original_shape[1])], @@ -625,7 +617,6 @@ def test_conv2d_fractured_multi_c( compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "MaximizeTMinimizeGrid" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_conv_prestride = False # pybuda.config.override_fracture_factor("conv2d_fractured_multi_c.conv.dc.sparse_matmul.9.dc.sparse_matmul.1.lc2", fracture_factor) # pybuda.config.override_op_size("?", (?, fracture_factor * 2)) @@ -737,7 +728,6 @@ def test_max_pool2d( compiler_cfg = _get_global_compiler_config() compiler_cfg.enable_broadcast_splitting = True # tenstorrent/budabackend#694 - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_conv_prestride = False if training: @@ -826,7 +816,7 @@ def test_max_pool2d_stream_through_queue(test_device): @pytest.mark.parametrize("producer_stream_factor", [1, 2, 4]) def test_max_pool2d_stream_through_queue_1x1(test_device, producer_stream_factor): - if not test_device.is_wormhole(): + if not test_device.is_wormhole_b0(): pytest.skip() in_channels = 64 kernel_size = 2 @@ -884,7 +874,6 @@ def test_avg_pool2d( def test_conv2d_stream_data_mismatch(test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "MaximizeTMinimizeGrid" - compiler_cfg.enable_t_streaming = True return test_conv2d( TestKind.INFERENCE, test_device, @@ -904,7 +893,6 @@ def test_conv2d_stream_data_mismatch(test_device): def test_conv2d_stream_through_queue(test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "MaximizeTMinimizeGrid" - compiler_cfg.enable_t_streaming = True compiler_cfg.place_on_new_epoch("conv2d.dc.matmul.11") return test_conv2d( TestKind.INFERENCE, @@ -925,7 +913,6 @@ def test_conv2d_stream_through_queue(test_device): def test_conv2d_vgg_head(test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "MaximizeTMinimizeGrid" - compiler_cfg.enable_t_streaming = True pybuda.config.override_t_stream_shape("conv2d.dc.sparse_matmul.9.dc.sparse_matmul.1.lc2", (28, 1)) return test_conv2d( TestKind.INFERENCE, diff --git a/pybuda/test/test_fork_join.py b/pybuda/test/test_fork_join.py index 
6bbdc757b..1059e6384 100644 --- a/pybuda/test/test_fork_join.py +++ b/pybuda/test/test_fork_join.py @@ -74,8 +74,6 @@ def test_fork_join_variant(test_kind, test_device, input_shape, config): if config is "m" then the appropriate node is matmul, and if it is "e", then node is element-wise op. """ num_in_channels = 1 - compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = False relative_atol, pcc = get_relaxed_atol_pcc(test_kind, test_device) verify_module(ForkJoinVariant("test_fork_join_variant", input_shape, config), [(1, num_in_channels, input_shape[0], input_shape[1])], VerifyConfig(test_kind=test_kind, devtype=test_device.devtype, arch=test_device.arch, pcc=pcc, relative_atol=relative_atol)) @@ -111,6 +109,9 @@ def forward(self, act1): @pytest.mark.parametrize("format", [DataFormat.Bfp8_b, DataFormat.Float16_b], ids=["bfp8", "fp16"]) def test_fork_join(test_kind, test_device, format): + if test_device.arch == pybuda.BackendDevice.Blackhole: + pytest.skip("Skip until BudaBackend#2628 is consumed.") + microbatch_count = 16 relative_atol, pcc = get_relaxed_atol_pcc(test_kind, test_device) @@ -314,7 +315,6 @@ def test_multilevel_fork_join_vovnet(test_kind, test_device, format): compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = format # Op overrides pybuda.config.override_op_size("conv2d_0.dc.sparse_matmul.9.dc.sparse_matmul.1.lc2", (1, 4)) @@ -686,7 +686,6 @@ def test_fork_join_yolo_v3(test_kind, test_device): compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = DataFormat.Float16_b compiler_cfg.enable_auto_transposing_placement = True @@ -822,7 +821,6 @@ def test_fork_join_hrnet(test_kind, test_device): width = 224 compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "Ribbon" compiler_cfg.default_df_override = DataFormat.Float16_b @@ -835,3 +833,28 @@ def test_fork_join_hrnet(test_kind, test_device): verify_module(HRNetForkJoin("test_fork_join_hrnet"), [(1, channels, height, width)], VerifyConfig(test_kind=test_kind, devtype=test_device.devtype, arch=test_device.arch, pcc=pcc, relative_atol=relative_atol)) +class ForkJoinExpandOutputBuffer(pybuda.PyBudaModule): + def __init__(self, name): + super().__init__(name) + self.weights0 = pybuda.Parameter(1, 64, 128, requires_grad=False) + + def forward(self, act1): + fork = pybuda.op.Matmul("matmul", act1, self.weights0) + left = pybuda.op.Exp("exp", fork) + right = pybuda.op.Buffer("buffer", fork) + join = pybuda.op.Add("add", left, right) + return join + +# Test implementation of Backend constraints for buf_size_mb.
+def test_fork_join_expand_output_buffer_constraints(test_kind, test_device): + if test_kind.is_training(): + pytest.skip("Skipping training test") + + pybuda.config.override_op_size("matmul", (2, 1)) + pybuda.config.override_op_size("exp", (2, 4)) + pybuda.config.override_t_stream_shape("matmul", (10, 1)) + pybuda.config.override_t_stream_shape("exp", (1, 1)) + + relative_atol, pcc = get_relaxed_atol_pcc(test_kind, test_device) + verify_module(ForkJoinExpandOutputBuffer("test_fork_join_expand_output_buffer_constraints"), [(1, 1, 6400, 64)], + VerifyConfig(test_kind=test_kind, devtype=test_device.devtype, arch=test_device.arch, pcc=pcc, relative_atol=relative_atol)) diff --git a/pybuda/test/test_fracturing.py b/pybuda/test/test_fracturing.py index 979fc85c7..a0929a404 100644 --- a/pybuda/test/test_fracturing.py +++ b/pybuda/test/test_fracturing.py @@ -41,7 +41,7 @@ def simple_fracture(x, param=None): @pytest.mark.parametrize("dim", [-1, -2]) @pytest.mark.parametrize("factor", [2, 3, 4]) def test_fracture_multichip(test_kind, test_device, dim, factor): - if test_device.arch == pybuda.BackendDevice.Wormhole_B0: + if test_device.arch == pybuda.BackendDevice.Wormhole_B0 or test_device.arch == pybuda.BackendDevice.Blackhole: pytest.skip("Skip until #736 is solved") if test_device.arch == pybuda.BackendDevice.Grayskull: diff --git a/pybuda/test/test_fusing.py b/pybuda/test/test_fusing.py index 395f763ab..498f7a1a4 100644 --- a/pybuda/test/test_fusing.py +++ b/pybuda/test/test_fusing.py @@ -454,7 +454,7 @@ def forward(self, act1): def test_matmul_gelu(test_device, test_kind): relative_atol, pcc = get_relaxed_atol_pcc(True, test_device) verify_module(FuseMatmulGelu("fuse_matmul_gelu"), [FuseMatmulGelu.shape], - VerifyConfig(test_kind=test_kind, skip_shutdown=True, arch=BackendDevice.Wormhole, devtype=test_device.devtype, + VerifyConfig(test_kind=test_kind, skip_shutdown=True, arch=test_device.arch, devtype=test_device.devtype, relative_atol=relative_atol, pcc=pcc)) diff --git a/pybuda/test/test_indexing.py b/pybuda/test/test_indexing.py index 6b71ee685..675427946 100644 --- a/pybuda/test/test_indexing.py +++ b/pybuda/test/test_indexing.py @@ -38,7 +38,7 @@ def test_index( pytest.skip("temporarily skip it, fails on pipeline but cannot reproduce it locally") @compile( - compiler_cfg=CompilerConfig(enable_training=training, enable_t_streaming=False), + compiler_cfg=CompilerConfig(enable_training=training, enable_t_streaming=True), verify_cfg=VerifyConfig(), ) def index(x): @@ -172,7 +172,7 @@ def test_index3d( start2, stop2, stride2 = index2 @compile( - compiler_cfg=CompilerConfig(enable_training=training, enable_t_streaming=False), + compiler_cfg=CompilerConfig(enable_training=training, enable_t_streaming=True), verify_cfg=VerifyConfig(), ) def index3d(x): diff --git a/pybuda/test/test_long_short_path.py b/pybuda/test/test_long_short_path.py index 003800369..1ab46cf21 100644 --- a/pybuda/test/test_long_short_path.py +++ b/pybuda/test/test_long_short_path.py @@ -18,7 +18,8 @@ def test_intra_epoch_relay_queue(mode, microbatch_size): def linked_list(activations): activations = pybuda.op.Buffer(f"buffer_pre", activations) - activations = pybuda.op.DRAMQueue(f"buffering_queue", activations) + # num_entries=microbatch_size, so if the queue is statically allocated, it still has enough memory + activations = pybuda.op.DRAMQueue(f"buffering_queue", activations, num_entries=microbatch_size) activations = pybuda.op.Buffer(f"buffer_post", activations) return activations @@ -91,7 +92,8 @@ def 
test_two_branch_fork_join_branch_asymmetry_with_buffering_queue( mode, num_ops_left_branch, num_ops_right_branch ): training = mode == "training" - shape = (1, 1, 64, 64) + microbatch_size = 1 + shape = (microbatch_size, 1 , 64, 64) @compile( compiler_cfg=pybuda.CompilerConfig(enable_training=training), @@ -104,7 +106,8 @@ def two_branch_fork_join_branch_asymmetry(act1): for i in range(num_ops_left_branch): left_branch = pybuda.op.Buffer(f"buffer_left_{i}", left_branch) - left_branch = pybuda.op.DRAMQueue(f"buffering_queue", left_branch) + # num_entries=microbatch_size, so if the queue is statically allocated, it still has enough memory + left_branch = pybuda.op.DRAMQueue(f"buffering_queue", left_branch, num_entries=microbatch_size) for i in range(num_ops_right_branch): right_branch = pybuda.op.Buffer(f"buffer_right_{i}", right_branch) diff --git a/pybuda/test/test_multichip.py b/pybuda/test/test_multichip.py index 8cefc92cd..dbc658323 100644 --- a/pybuda/test/test_multichip.py +++ b/pybuda/test/test_multichip.py @@ -19,8 +19,8 @@ backend_devices = { "grayskull" : BackendDevice.Grayskull, - "wormhole" : BackendDevice.Wormhole, "wormhole_b0": BackendDevice.Wormhole_B0, + "blackhole": BackendDevice.Blackhole } # Currently only guarded for Grayskull: @@ -137,8 +137,8 @@ def linked_list_two_chips(act): module = ModuleBuilder(linked_list_two_chips) verify_module(module, [(1, 1, 64, 64)], # chip_ids=[0, 1] fails in net2pipe bbe_issue#2331 - # VerifyConfig(test_kind=TestKind.INFERENCE, run_net2pipe=True, arch=BackendDevice.Wormhole, chip_ids=[0,1])) - VerifyConfig(test_kind=TestKind.INFERENCE, run_net2pipe=True, arch=BackendDevice.Wormhole)) + # VerifyConfig(test_kind=TestKind.INFERENCE, run_net2pipe=True, arch=BackendDevice.Wormhole_B0, chip_ids=[0,1])) + VerifyConfig(test_kind=TestKind.INFERENCE, run_net2pipe=True)) def test_four_chip_wormhole_sanity(): pytest.skip("Skip until BBE commit 42d9685b1 is consumed") @@ -155,7 +155,7 @@ def linked_list_four_chips(act): module = ModuleBuilder(linked_list_four_chips) verify_module(module, [(1, 1, 64, 64)], - VerifyConfig(test_kind=TestKind.INFERENCE, run_net2pipe=True, arch=BackendDevice.Wormhole, chip_ids=list(range(8)))) + VerifyConfig(test_kind=TestKind.INFERENCE, run_net2pipe=True, arch=BackendDevice.Wormhole_B0, chip_ids=list(range(8)))) diff --git a/pybuda/test/test_padding/other/test_padding_pass_a.py b/pybuda/test/test_padding/other/test_padding_pass_a.py index 5d91ce6eb..65732d980 100644 --- a/pybuda/test/test_padding/other/test_padding_pass_a.py +++ b/pybuda/test/test_padding/other/test_padding_pass_a.py @@ -368,7 +368,6 @@ def forward(self, x1, x2): TEST_A_FRACTURING_FLAG = False TEST_A_RESOURCE_USAGE_FALLBACK_MODE = True -TEST_A_CHIP_PLACEMENT_T_STREAMING_FLAG = True TEST_A_CHIP_PLACEMENT_FORCE_INTERMED_FLAG = False TEST_A_CHIP_PLACEMENT_LEGALIZER_DETAILED_FLAG = False TEST_A_CHIP_PLACEMENT_SELF_CUT_TYPE_FLAG = True @@ -428,8 +427,6 @@ def set_environment(): os.environ["LOGGER_LEVEL"] = "DEBUG" # Include or not environment variables for debugging chip placement module - if TEST_A_CHIP_PLACEMENT_T_STREAMING_FLAG: - os.environ["PYBUDA_ENABLE_T_STREAMING"] = "1" if TEST_A_CHIP_PLACEMENT_LEGALIZER_DETAILED_FLAG: os.environ["PYBUDA_LEGALIZER_DETAILED_DEBUGGING"] = "1" if TEST_A_CHIP_PLACEMENT_LEGALIZER_NODE_NAME: @@ -572,7 +569,6 @@ def test_padding_pass_a( act_shape = (1, 3, original_shape[0], original_shape[1]) compiler_cfg = CompilerConfig( - enable_t_streaming=True, enable_training=test_kind.is_training(), balancer_policy="Ribbon" ) diff 
--git a/pybuda/test/test_padding/other/test_padding_pass_b.py b/pybuda/test/test_padding/other/test_padding_pass_b.py index 794872002..7405eefeb 100644 --- a/pybuda/test/test_padding/other/test_padding_pass_b.py +++ b/pybuda/test/test_padding/other/test_padding_pass_b.py @@ -467,7 +467,6 @@ def forward(self, x1, x2): TEST_B_PRINT_GRAPH_AT_FLAG = False TEST_B_FRACTURING_FLAG = False -TEST_B_CHIP_PLACEMENT_T_STREAMING_FLAG = False TEST_B_CHIP_PLACEMENT_FORCE_INTERMED_FLAG = False TEST_B_CHIP_PLACEMENT_LEGALIZER_DETAILED_FLAG = False TEST_B_CHIP_PLACEMENT_SELF_CUT_TYPE_FLAG = False @@ -518,8 +517,6 @@ def set_environment(): os.environ["LOGGER_LEVEL"] = "DEBUG" # Include or not environment variables for debugging chip placement module - if TEST_B_CHIP_PLACEMENT_T_STREAMING_FLAG: - os.environ["PYBUDA_ENABLE_T_STREAMING"] = "1" if TEST_B_CHIP_PLACEMENT_LEGALIZER_DETAILED_FLAG: os.environ["PYBUDA_LEGALIZER_DETAILED_DEBUGGING"] = "1" if TEST_B_CHIP_PLACEMENT_LEGALIZER_NODE_NAME: @@ -643,7 +640,6 @@ def test_padding_pass_b( act_shape = (1, in_channels, original_shape[0], original_shape[1]) compiler_cfg = CompilerConfig( - enable_t_streaming = True, enable_training=test_kind.is_training() ) verify_cfg = VerifyConfig( diff --git a/pybuda/test/test_padding/other/test_padding_pass_c.py b/pybuda/test/test_padding/other/test_padding_pass_c.py index 861cd2e17..d11689aec 100644 --- a/pybuda/test/test_padding/other/test_padding_pass_c.py +++ b/pybuda/test/test_padding/other/test_padding_pass_c.py @@ -330,7 +330,6 @@ def forward(self, x1, x2): TEST_C_PRINT_GRAPH_AT_FLAG = False TEST_C_FRACTURING_FLAG = False -TEST_C_CHIP_PLACEMENT_T_STREAMING_FLAG = True TEST_C_CHIP_PLACEMENT_FORCE_INTERMED_FLAG = True TEST_C_CHIP_PLACEMENT_LEGALIZER_DETAILED_FLAG = True TEST_C_CHIP_PLACEMENT_SELF_CUT_TYPE_FLAG = True @@ -376,8 +375,6 @@ def set_environment(): os.environ["LOGGER_LEVEL"] = "DEBUG" # Include or not environment variables for debugging chip placement module - if TEST_C_CHIP_PLACEMENT_T_STREAMING_FLAG: - os.environ["PYBUDA_ENABLE_T_STREAMING"] = "1" if TEST_C_CHIP_PLACEMENT_LEGALIZER_DETAILED_FLAG: os.environ["PYBUDA_LEGALIZER_DETAILED_DEBUGGING"] = "1" if TEST_C_CHIP_PLACEMENT_LEGALIZER_NODE_NAME: @@ -491,7 +488,6 @@ def test_padding_pass_c( act_shape = (1, in_channels, original_shape[0], original_shape[1]) compiler_cfg = CompilerConfig( - enable_t_streaming = True, enable_training=test_kind.is_training() ) verify_cfg = VerifyConfig( diff --git a/pybuda/test/test_padding/other/test_padding_pass_d.py b/pybuda/test/test_padding/other/test_padding_pass_d.py index 318e89163..8f8c12c5b 100644 --- a/pybuda/test/test_padding/other/test_padding_pass_d.py +++ b/pybuda/test/test_padding/other/test_padding_pass_d.py @@ -275,7 +275,6 @@ def forward(self, x1, x2, x3): TEST_D_PRINT_GRAPH_AT_FLAG = False TEST_D_FRACTURING_FLAG = False -TEST_D_CHIP_PLACEMENT_T_STREAMING_FLAG = True TEST_D_CHIP_PLACEMENT_FORCE_INTERMED_FLAG = True TEST_D_CHIP_PLACEMENT_LEGALIZER_DETAILED_FLAG = True TEST_D_CHIP_PLACEMENT_SELF_CUT_TYPE_FLAG = True @@ -321,8 +320,6 @@ def set_environment(): os.environ["LOGGER_LEVEL"] = "DEBUG" # Include or not environment variables for debugging chip placement module - if TEST_D_CHIP_PLACEMENT_T_STREAMING_FLAG: - os.environ["PYBUDA_ENABLE_T_STREAMING"] = "1" if TEST_D_CHIP_PLACEMENT_LEGALIZER_DETAILED_FLAG: os.environ["PYBUDA_LEGALIZER_DETAILED_DEBUGGING"] = "1" if TEST_D_CHIP_PLACEMENT_LEGALIZER_NODE_NAME: @@ -415,7 +412,6 @@ def test_padding_pass_d( act_shape = (1, in_channels, original_shape[0], 
original_shape[1]) compiler_cfg = CompilerConfig( - enable_t_streaming = True, enable_training=test_kind.is_training() ) verify_cfg = VerifyConfig( diff --git a/pybuda/test/test_padding/other/test_padding_pass_e.py b/pybuda/test/test_padding/other/test_padding_pass_e.py index b063f7620..db6ecf48b 100644 --- a/pybuda/test/test_padding/other/test_padding_pass_e.py +++ b/pybuda/test/test_padding/other/test_padding_pass_e.py @@ -242,7 +242,6 @@ def forward(self, x): TEST_E_PRINT_GRAPH_AT_FLAG = False TEST_E_FRACTURING_FLAG = False -TEST_E_CHIP_PLACEMENT_T_STREAMING_FLAG = True TEST_E_CHIP_PLACEMENT_FORCE_INTERMED_FLAG = False TEST_E_CHIP_PLACEMENT_LEGALIZER_DETAILED_FLAG = False TEST_E_CHIP_PLACEMENT_SELF_CUT_TYPE_FLAG = True @@ -305,8 +304,6 @@ def set_environment(): os.environ["LOGGER_LEVEL"] = "DEBUG" # Include or not environment variables for debugging chip placement module - if TEST_E_CHIP_PLACEMENT_T_STREAMING_FLAG: - os.environ["PYBUDA_ENABLE_T_STREAMING"] = "1" if TEST_E_CHIP_PLACEMENT_LEGALIZER_DETAILED_FLAG: os.environ["PYBUDA_LEGALIZER_DETAILED_DEBUGGING"] = "1" if TEST_E_CHIP_PLACEMENT_LEGALIZER_NODE_NAME: diff --git a/pybuda/test/test_padding/other/test_padding_pass_f.py b/pybuda/test/test_padding/other/test_padding_pass_f.py index 2e83c5ccf..8b3959915 100644 --- a/pybuda/test/test_padding/other/test_padding_pass_f.py +++ b/pybuda/test/test_padding/other/test_padding_pass_f.py @@ -246,7 +246,6 @@ def forward(self, x1, x2, x3): TEST_F_PRINT_GRAPH_AT_FLAG = False TEST_F_FRACTURING_FLAG = False -TEST_F_CHIP_PLACEMENT_T_STREAMING_FLAG = True TEST_F_CHIP_PLACEMENT_FORCE_INTERMED_FLAG = True TEST_F_CHIP_PLACEMENT_LEGALIZER_DETAILED_FLAG = True TEST_F_CHIP_PLACEMENT_SELF_CUT_TYPE_FLAG = True @@ -293,8 +292,6 @@ def set_environment(): os.environ["LOGGER_LEVEL"] = "DEBUG" # Include or not environment variables for debugging chip placement module - if TEST_F_CHIP_PLACEMENT_T_STREAMING_FLAG: - os.environ["PYBUDA_ENABLE_T_STREAMING"] = "1" if TEST_F_CHIP_PLACEMENT_LEGALIZER_DETAILED_FLAG: os.environ["PYBUDA_LEGALIZER_DETAILED_DEBUGGING"] = "1" if TEST_F_CHIP_PLACEMENT_SELF_CUT_TYPE_FLAG: @@ -429,7 +426,6 @@ def test_padding_pass_f( act_shape = (1, in_channels, original_shape[0], original_shape[1]) compiler_cfg = CompilerConfig( - enable_t_streaming = True, enable_training=test_kind.is_training() ) verify_cfg = VerifyConfig( diff --git a/pybuda/test/test_padding/other/test_padding_pass_g.py b/pybuda/test/test_padding/other/test_padding_pass_g.py index c01a7ae03..c56cb4ba9 100644 --- a/pybuda/test/test_padding/other/test_padding_pass_g.py +++ b/pybuda/test/test_padding/other/test_padding_pass_g.py @@ -156,7 +156,6 @@ def forward(self, x1, x2): TEST_G_PRINT_GRAPH_AT_FLAG = False TEST_G_FRACTURING_FLAG = False -TEST_G_CHIP_PLACEMENT_T_STREAMING_FLAG = True TEST_G_CHIP_PLACEMENT_FORCE_INTERMED_FLAG = True TEST_G_CHIP_PLACEMENT_LEGALIZER_DETAILED_FLAG = True TEST_G_CHIP_PLACEMENT_SELF_CUT_TYPE_FLAG = True @@ -212,8 +211,6 @@ def set_environment(): os.environ["LOGGER_LEVEL"] = "DEBUG" # Include or not environment variables for debugging chip placement module - if TEST_G_CHIP_PLACEMENT_T_STREAMING_FLAG: - os.environ["PYBUDA_ENABLE_T_STREAMING"] = "1" if TEST_G_CHIP_PLACEMENT_LEGALIZER_DETAILED_FLAG: os.environ["PYBUDA_LEGALIZER_DETAILED_DEBUGGING"] = "1" if TEST_G_CHIP_PLACEMENT_LEGALIZER_NODE_NAME: @@ -320,8 +317,7 @@ def test_padding_pass_g( model.name, *inputs, compiler_cfg=CompilerConfig( - enable_training=test_kind.is_training(), - enable_t_streaming=True + 
enable_training=test_kind.is_training() ), verify_cfg=VerifyConfig() ) \ No newline at end of file diff --git a/pybuda/test/test_padding/other/test_padding_pass_h.py b/pybuda/test/test_padding/other/test_padding_pass_h.py index 374e3241c..84a767df2 100644 --- a/pybuda/test/test_padding/other/test_padding_pass_h.py +++ b/pybuda/test/test_padding/other/test_padding_pass_h.py @@ -172,7 +172,6 @@ def forward(self, x): TEST_H_PRINT_GRAPH_AT_FLAG = False TEST_H_FRACTURING_FLAG = False -TEST_H_CHIP_PLACEMENT_T_STREAMING_FLAG = True TEST_H_CHIP_PLACEMENT_FORCE_INTERMED_FLAG = True TEST_H_CHIP_PLACEMENT_LEGALIZER_DETAILED_FLAG = True TEST_H_CHIP_PLACEMENT_SELF_CUT_TYPE_FLAG = True @@ -232,8 +231,6 @@ def set_environment(): os.environ["LOGGER_LEVEL"] = "DEBUG" # Include or not environment variables for debugging chip placement module - if TEST_H_CHIP_PLACEMENT_T_STREAMING_FLAG: - os.environ["PYBUDA_ENABLE_T_STREAMING"] = "1" if TEST_H_CHIP_PLACEMENT_LEGALIZER_DETAILED_FLAG: os.environ["PYBUDA_LEGALIZER_DETAILED_DEBUGGING"] = "1" if TEST_H_CHIP_PLACEMENT_SELF_CUT_TYPE_FLAG: @@ -353,7 +350,6 @@ def test_padding_pass_h( *model.inputs, compiler_cfg=CompilerConfig( enable_training=test_kind.is_training(), - enable_t_streaming=True, ), verify_cfg=verify_cfg ) @@ -389,7 +385,6 @@ def test_padding_pass_h_argument( compiler_cfg = CompilerConfig( enable_training=test_kind.is_training(), - enable_t_streaming=True, paddings=paddings ) diff --git a/pybuda/test/test_padding/other/test_padding_pass_i.py b/pybuda/test/test_padding/other/test_padding_pass_i.py index 32f950620..559798d62 100644 --- a/pybuda/test/test_padding/other/test_padding_pass_i.py +++ b/pybuda/test/test_padding/other/test_padding_pass_i.py @@ -185,7 +185,6 @@ def forward(self, x): TEST_I_PRINT_GRAPH_AT_FLAG = False TEST_I_FRACTURING_FLAG = False -TEST_I_CHIP_PLACEMENT_T_STREAMING_FLAG = True TEST_I_CHIP_PLACEMENT_FORCE_INTERMED_FLAG = True TEST_I_CHIP_PLACEMENT_LEGALIZER_DETAILED_FLAG = True TEST_I_CHIP_PLACEMENT_SELF_CUT_TYPE_FLAG = True @@ -240,8 +239,6 @@ def set_environment(): os.environ["LOGGER_LEVEL"] = "DEBUG" # Include or not environment variables for debugging chip placement module - if TEST_I_CHIP_PLACEMENT_T_STREAMING_FLAG: - os.environ["PYBUDA_ENABLE_T_STREAMING"] = "1" if TEST_I_CHIP_PLACEMENT_LEGALIZER_DETAILED_FLAG: os.environ["PYBUDA_LEGALIZER_DETAILED_DEBUGGING"] = "1" if TEST_I_CHIP_PLACEMENT_SELF_CUT_TYPE_FLAG: @@ -352,8 +349,7 @@ def test_padding_pass_i( model.name, *model.inputs, compiler_cfg=CompilerConfig( - enable_training=test_kind.is_training(), - enable_t_streaming=True + enable_training=test_kind.is_training() ), verify_cfg=VerifyConfig() ) \ No newline at end of file diff --git a/pybuda/test/test_padding/other/test_padding_pass_k.py b/pybuda/test/test_padding/other/test_padding_pass_k.py index 4ae421ef2..83c2dfe8a 100644 --- a/pybuda/test/test_padding/other/test_padding_pass_k.py +++ b/pybuda/test/test_padding/other/test_padding_pass_k.py @@ -421,7 +421,6 @@ def forward(self, x): TEST_K_PRINT_GRAPH_AT_FLAG = False TEST_K_FRACTURING_FLAG = False -TEST_K_CHIP_PLACEMENT_T_STREAMING_FLAG = True TEST_K_CHIP_PLACEMENT_FORCE_INTERMED_FLAG = False TEST_K_CHIP_PLACEMENT_LEGALIZER_DETAILED_FLAG = True TEST_K_CHIP_PLACEMENT_SELF_CUT_TYPE_FLAG = False @@ -474,8 +473,6 @@ def set_environment(): os.environ["LOGGER_LEVEL"] = "DEBUG" # Include or not environment variables for debugging chip placement module - if TEST_K_CHIP_PLACEMENT_T_STREAMING_FLAG: - os.environ["PYBUDA_ENABLE_T_STREAMING"] = "1" if 
TEST_K_CHIP_PLACEMENT_LEGALIZER_DETAILED_FLAG: os.environ["PYBUDA_LEGALIZER_DETAILED_DEBUGGING"] = "1" if TEST_K_CHIP_PLACEMENT_LEGALIZER_NODE_NAME: @@ -621,7 +618,6 @@ def test_padding_pass_k( act_shape = (1, in_channels, original_shape[0], original_shape[1]) compiler_cfg = CompilerConfig( - enable_t_streaming = True, enable_training=test_kind.is_training() ) verify_cfg = VerifyConfig( diff --git a/pybuda/test/test_placer_apis.py b/pybuda/test/test_placer_apis.py index a9028c6c8..2d5b914c2 100644 --- a/pybuda/test/test_placer_apis.py +++ b/pybuda/test/test_placer_apis.py @@ -241,37 +241,6 @@ def conflicting_placement_overrides(act, *, ff1_weights, ff2_weights): verify.verify_module(module, [(1, 1, 64, 64)], VerifyConfig(test_kind=test_kind)) -def test_wh_pin_op_to_chip_id(test_kind): - def wh_pin_op_to_chip_id(act, *, ff1_weights, ff2_weights): - op0 = pybuda.op.Matmul(f"ff1", act, ff1_weights) - op1 = pybuda.op.Buffer(f"gelu", op0) - op2 = pybuda.op.Matmul(f"ff2", op1, ff2_weights) - return op2 - - pybuda.config.set_epoch_break("gelu") - pybuda.config.override_op_placement("gelu", start=[2,2], chip_id=1) - pybuda.config.override_op_placement("ff2", start=[2,2]) - - module = ModuleBuilder(wh_pin_op_to_chip_id, ff1_weights=pybuda.Parameter(1,1,64,64), ff2_weights=pybuda.Parameter(1,1,64,64)) - verify.verify_module(module, [(1, 1, 64, 64)], VerifyConfig(test_kind=test_kind, arch=BackendDevice.Wormhole, chip_ids=range(2))) - - -def test_wh_temporal_break(test_kind): - def wh_pin_op_to_chip_id(act, *, ff1_weights, ff2_weights): - op0 = pybuda.op.Matmul(f"ff1", act, ff1_weights) - op1 = pybuda.op.Buffer(f"gelu", op0) - op2 = pybuda.op.Matmul(f"ff2", op1, ff2_weights) - op3 = pybuda.op.Buffer(f"buffer", op2) - return op3 - - pybuda.config.set_epoch_break("gelu") - pybuda.config.set_epoch_break("ff2") - pybuda.config.override_op_placement("buffer", temporal_epoch_break=True) - - module = ModuleBuilder(wh_pin_op_to_chip_id, ff1_weights=pybuda.Parameter(1,1,64,64), ff2_weights=pybuda.Parameter(1,1,64,64)) - verify.verify_module(module, [(1, 1, 64, 64)], VerifyConfig(test_kind=test_kind, arch=BackendDevice.Wormhole, chip_ids=range(2))) - - def test_dram_allocator_api(test_device): shape = (1, 1, 32, 32) test_kind = verify.TestKind.INFERENCE diff --git a/pybuda/test/test_recompile.py b/pybuda/test/test_recompile.py new file mode 100644 index 000000000..79b7b1e4b --- /dev/null +++ b/pybuda/test/test_recompile.py @@ -0,0 +1,52 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +import os +import pybuda +import pytest +from pybuda.verify import verify_module, VerifyConfig, TestKind + +class FusingStreamLimitsStress(pybuda.PyBudaModule): + """ + Module which tests recompile when fused op doesn't satisfy stream constraints. + """ + + shape = (1, 1, 3200, 128) + + def __init__(self, name): + super().__init__(name) + self.weights = pybuda.Parameter(self.shape[-1], self.shape[-1], requires_grad=True) + + def forward(self, act1, act2): + matmuls = [] + for i in range(10): + matmuls.append(pybuda.op.Matmul(f"matmul_{i}", act1, self.weights)) + + for i in range(10): + matmuls.append(pybuda.op.Matmul(f"matmul_{i+10}", act2, self.weights)) + + # Expecting fusing of ops below + add = pybuda.op.Add("", matmuls[0], matmuls[1]) + for i in range(2, 20): + add = pybuda.op.Add("", add, matmuls[i]) + + return add + +def test_recompile_fuse_stream_limits(test_device): + pytest.skip() + + # Setting target cycles to 0 causes us to hit stream constraints on fused op.
+ os.environ["PYBUDA_RIBBON_TARGET_CYCLES"] = "0" + os.environ["PYBUDA_RIBBON2"] = "1" + os.environ["PYBUDA_TEMP_BALANCER_MODEL_PCIE_BW"] = "0" + os.environ["PYBUDA_TEMP_DISABLE_MODEL_KB_PROLOGUE_BW"] = "1" + + # Enable recompilation to recover from net2pipe failure. + os.environ["PYBUDA_AUTO_RECOMPILE"] = "1" + + run_net2pipe = not test_device.is_silicon() + + pybuda.config.set_configuration_options(balancer_policy="Ribbon") + verify_module(FusingStreamLimitsStress("recompile_fuse_stream_limits"), [FusingStreamLimitsStress.shape, FusingStreamLimitsStress.shape], + VerifyConfig(test_kind=TestKind.INFERENCE, arch=test_device.arch, devtype=test_device.devtype, run_net2pipe=run_net2pipe)) + diff --git a/pybuda/test/test_sanity.py b/pybuda/test/test_sanity.py index db081ffcd..f948b07bb 100644 --- a/pybuda/test/test_sanity.py +++ b/pybuda/test/test_sanity.py @@ -27,6 +27,7 @@ CompileDepth, VerifyConfig, PyTorchModule, + ci ) from pybuda.ttdevice import get_device_config from pybuda.config import CompileDepth, _get_global_compiler_config @@ -42,8 +43,8 @@ backend_devices = { "grayskull" : BackendDevice.Grayskull, - "wormhole" : BackendDevice.Wormhole, "wormhole_b0": BackendDevice.Wormhole_B0, + "blackhole": BackendDevice.Blackhole } class BudaTestAdd(PyBudaModule): @@ -346,7 +347,6 @@ def test_select(test_kind, test_device, shape, dim_index_length): length = shape[dim] - index compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True #if test_kind.is_training(): # compiler_cfg.compile_depth = CompileDepth.BALANCER_PASS @@ -361,6 +361,32 @@ def simple_select(x): x = Tensor.create_from_torch(torch.rand(*shape, requires_grad=test_kind.is_training())) simple_select(x) +@pytest.mark.parametrize("shape", [(1, 3, 288, 124),(1, 6, 288, 124)]) +@pytest.mark.parametrize("dim_index_length", [(-3, 1, 1, 2),]) +def test_single_select(test_kind, test_device, shape, dim_index_length): + dim, index, length, stride = dim_index_length + if index + length > shape[dim]: + pytest.skip() + + if length == -1: + length = shape[dim] - index + + compiler_cfg = _get_global_compiler_config() + compiler_cfg.enable_t_streaming = True + compiler_cfg.manual_t_streaming = True + # pybuda.config.override_t_stream_shape("index.dc.select.0", (9, 1)) + + @compile( + compiler_cfg = CompilerConfig(enable_t_streaming=True, manual_t_streaming = True), + verify_cfg = VerifyConfig(test_kind=test_kind, devtype=test_device.devtype, arch=test_device.arch, verify_all=True), + ) + + def simple_select(x): + ret = pybuda.op.Select("select0", x, dim, (index, length), stride=stride) + return ret + + x = Tensor.create_from_torch(torch.rand(*shape, requires_grad=test_kind.is_training())) + simple_select(x) @pytest.mark.parametrize("mode", ["inference", "training"]) @pytest.mark.parametrize("shape", [(1, 1, 384, 384), (1, 12, 384, 384), (1, 1, 384, 96)]) @@ -457,6 +483,26 @@ def simple_argmax(x): x = Tensor.create_from_torch(torch.rand((1, 2, 384, 384), requires_grad=False)) simple_argmax(x) +@pytest.mark.parametrize("dim", [-1, 0]) +@pytest.mark.parametrize("input_shape", [(1,1,1,32), (1,1,3,32)]) +@pytest.mark.parametrize("max_value", [0.5, 0, 1, 5]) +def test_argmax_multiple_maximums(dim, input_shape, max_value): + pytest.skip("Skipping since the test is broken, issue #2477") + verify_cfg=VerifyConfig(run_golden=False) # argmax not supported + x = torch.zeros(input_shape) + for i in range(input_shape[0]): + x[0,0,i,2] = max_value + x[0,0,i,4] = max_value + x[0,0,i,6] = max_value + x = Tensor.create_from_torch(x) + 
@run( + verify_cfg=verify_cfg, + ) + def simple_argmax(x): + return pybuda.op.Argmax("argmax0", x, dim=dim) + + simple_argmax(x) + def test_passthrough(): @compile(compiler_cfg=CompilerConfig(enable_training=False)) @@ -498,34 +544,6 @@ def simple_reduce_tile_broadcast(a, b): b = Tensor.create_from_torch(torch.randn((1, 4, 4, 4), requires_grad=test_kind.is_training())) simple_reduce_tile_broadcast(a, b) - -@pytest.mark.parametrize("dim", [1, 2, -1]) -@pytest.mark.parametrize("aligned", [True, False]) -def test_concat(test_kind, test_device, dim, aligned): - @run( - VerifyConfig(test_kind=test_kind, devtype=test_device.devtype, arch=test_device.arch), - ) - def simple_concat(a, b): - return pybuda.op.Concatenate("", a, b, axis=dim) - - if aligned: - shapes = { - -1: (1, 3, 128, 96), - 2: (1, 3, 1024, 32), - 1: (1, 1, 128, 32), - } - a = Tensor.create_from_torch(torch.randn((1, 3, 128, 32), requires_grad=test_kind.is_training())) - else: - shapes = { - -1: (1, 3, 128, 6), - 2: (1, 3, 128, 6), - 1: (1, 1, 128, 6), - } - a = Tensor.create_from_torch(torch.randn((1, 3, 128, 6), requires_grad=test_kind.is_training())) - b = Tensor.create_from_torch(torch.randn(shapes[dim], requires_grad=test_kind.is_training())) - c = simple_concat(a, b) - - class MultiEpochModule(pybuda.PyBudaModule): def __init__(self, name: str, num_matmuls: int): super().__init__(name) @@ -558,7 +576,6 @@ def test_sparse_matmul(test_device, config): from pybuda.op.eval.sparse_utils import create_conv2d_sparse_picker_matrix compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "MaximizeTMinimizeGrid" if config == "3x3conv": @@ -656,7 +673,6 @@ def simple_sparse_matmul(act, sparse=None): def test_simple_clip(test_device): compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "MaximizeTMinimizeGrid" @run( @@ -1040,6 +1056,10 @@ def consumer_ops_belonging_to_different_chips(act, *, weights): pybuda.set_epoch_break("buffer_c") arch = backend_devices[os.environ.get("BACKEND_ARCH_NAME", "grayskull")] + + if arch == BackendDevice.Blackhole: + pytest.skip("Blackhole doesn't support chip breaks. 
Skipping until BudaBackend#2650 is fixed.") + compiler_cfg = _get_global_compiler_config() # tenstorrent/pybuda#480 compiler_cfg.use_interactive_placer = False if arch is BackendDevice.Grayskull else True @@ -1083,7 +1103,6 @@ def forward(self, x): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "MaximizeTMinimizeGrid" - compiler_cfg.enable_t_streaming = True pybuda.verify.verify_module( Model(), @@ -1203,12 +1222,14 @@ def forward(self, act1, act2): p1 = pybuda.op.Greater("greater", act1, act2) elif self.mode == "ne": p1 = pybuda.op.NotEqual("ne", act1, act2) + elif self.mode == "maximum": + p1 = pybuda.op.Max("maximum", act1, act2) else: p1 = pybuda.op.Equal("eq", act1, act2) return p1 -@pytest.mark.parametrize("mode", ["less", "greater", "lteq", "gteq", "ne", "eq", "heaviside"]) +@pytest.mark.parametrize("mode", ["less", "greater", "lteq", "gteq", "ne", "eq", "heaviside", "maximum"]) def test_binary(test_device, mode): x = Tensor.create_from_torch(torch.randn((1, 1, 64, 64), requires_grad=True)) y = Tensor.create_from_torch(torch.randn((1, 1, 64, 64), requires_grad=True)) @@ -1231,7 +1252,7 @@ def test_large_reshape(shape): block_size = shape[2] @compile( - compiler_cfg=CompilerConfig(enable_training=False, enable_t_streaming=True, compile_depth=CompileDepth.BUDA_GRAPH_PRE_PLACER), + compiler_cfg=CompilerConfig(enable_training=False, compile_depth=CompileDepth.BUDA_GRAPH_PRE_PLACER), verify_cfg=VerifyConfig(run_golden=True), # reshape not supported by backend ) def simple_large_reshape(x, y): @@ -1307,7 +1328,6 @@ def forward(self, inp): return trans compiler_config = _get_global_compiler_config() - compiler_config.enable_t_streaming = True mod = InterVer("Intermediate_verification") verify_module(mod, [(1, 64, 1024)], VerifyConfig(test_kind=test_kind, verify_all=True)) @@ -1344,7 +1364,6 @@ def forward(self, inp): return m1 compiler_config = _get_global_compiler_config() - compiler_config.enable_t_streaming = True mod = channel_select_fusion("channel_select_fusion") verify_module(mod, [(1, 3, 224, 224)], VerifyConfig(test_kind=test_kind, verify_all=True)) @@ -1433,33 +1452,6 @@ def simple_embedding(x, table=None): table = pybuda.Parameter.create_from_torch(torch.nn.Parameter(torch.randn((dictionary_size, hidden_dim)))) simple_embedding(x, table=table) -@pytest.mark.parametrize("input_shape", [(1, 10, 32, 32), (1, 32, 16, 16),]) -@pytest.mark.parametrize("axis", [-3]) -@pytest.mark.parametrize("stride", [1]) -@pytest.mark.parametrize("num_operands", [2,3]) -def test_interleave(test_kind, test_device, input_shape, axis, stride, num_operands): - class Model(PyBudaModule): - def __init__(self, name, axis, stride): - super().__init__(name) - self.axis = axis - self.stride = stride - - def forward(self, *operands): - x = pybuda.op.Interleave("interleave0", *operands, axis=self.axis, stride=self.stride) - return x - - input_shapes = tuple([input_shape for _ in range(num_operands)]) - mod = Model("interleave_test", axis, stride) - verify_module( - mod, - input_shapes, - verify_cfg=VerifyConfig( - test_kind=test_kind, - devtype=test_device.devtype, - arch=test_device.arch, - ), - ) - @pytest.mark.parametrize("mode", ["hslice", "hstack", "vslice", "vstack"]) def test_slice_stack_non_tile_aligned(test_kind, test_device, mode): class SliceStackModule(PyBudaModule): @@ -1503,50 +1495,6 @@ def forward(self, activations): ) ) -def test_concat_two_kinds_pad(test_device): - class Module(PyBudaModule): - def __init__(self, name): - super().__init__(name) - 
self.add_parameter("w", pybuda.Parameter(*(1, 1, 352, 192), requires_grad=True)) - - def forward(self, in0, in1, in2, in3, in4, in5, y): - in0 = pybuda.op.Multiply("m0", in0, in0) - in1 = pybuda.op.Multiply("m1", in1, in2) - in2 = pybuda.op.Multiply("m2", in2, in3) - in3 = pybuda.op.Multiply("m3", in3, in4) - in4 = pybuda.op.Multiply("m4", in4, in4) - in5 = pybuda.op.Multiply("m5", in5, in1) - x = pybuda.op.Concatenate("", in0, in1, in2, in3, in4, in5, axis=-1) - x = pybuda.op.Multiply("m6", x, y) - x = pybuda.op.PadTile("p0", x, -1, 336) - x = pybuda.op.Matmul("mm0", x, self.get_parameter("w")) - return x - - compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object - compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True - # compiler_cfg.place_on_new_epoch("m6_transpose_nop_0") - os.environ["PYBUDA_DISABLE_CONSTANT_FOLDING"] = "1" - os.environ["PYBUDA_PAD_SPARSE_MM"] = "{11:12}" - os.environ["PYBUDA_GRAPHSOLVER_SELF_CUT_TYPE"] = "ConsumerOperandDataEdgesFirst" - - # input shape - common_len = 3136 - input_shapes = ((1, 1, common_len, 96), (1, 1, common_len, 48), (1, 1, common_len, 48), (1, 1, common_len, 48), (1, 1, common_len, 48), (1, 1, common_len, 48), (1, 1, common_len, 336)) - mod = Module("test_concat_two_kinds_pad") - verify_module( - mod, - input_shapes, - verify_cfg=VerifyConfig( - test_kind=TestKind.INFERENCE, - devtype=test_device.devtype, - arch=test_device.arch, - ) - ) - - os.environ["PYBUDA_PAD_SPARSE_MM"] = "{}" - - def test_negative_reduce_max(test_device): df = pybuda.config.DataFormat.Float16 pybuda.config.set_configuration_options(default_df_override=df, accumulate_df=df) @@ -1624,6 +1572,9 @@ def test_grad_eltwise_op(test_device): shape = (1, 1, 512, 512) test_kind = TestKind.TRAINING + if test_device.arch == pybuda.BackendDevice.Blackhole: + pytest.skip("Skip until BudaBackend#2628 is consumed.") + @run( verify_cfg=VerifyConfig( test_kind=test_kind, @@ -1660,7 +1611,6 @@ def forward(self, x): return x compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object - compiler_cfg.enable_t_streaming = True #compiler_cfg.balancer_op_override("multiply_0", "t_stream_shape", (1,1)) #compiler_cfg.balancer_op_override("matmul_4", "t_stream_shape", (2,1)) #compiler_cfg.balancer_op_override("matmul_4", "t_stream_dir", "r") @@ -1738,7 +1688,6 @@ def forward(self, x): return x compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "Ribbon" compiler_cfg.balancer_op_override("conv2d_0.dc.sparse_matmul.9.dc.sparse_matmul.1.lc2", "t_stream_shape", (2,1)) import os @@ -1781,7 +1730,6 @@ def forward(self, x): return x compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "Ribbon" compiler_cfg.balancer_op_override("sparse_matmul_1.dc.sparse_matmul.1.lc2", "t_stream_shape", (2,1)) compiler_cfg.balancer_op_override("sparse_matmul_1.dc.sparse_matmul.1.lc2", "grid_shape", (2,1)) @@ -1829,6 +1777,7 @@ def test_read_back_intermediates(test_kind, test_device): else: op_intermediates = ["matmul_intermediate"] + os.environ["PYBUDA_DISABLE_STREAM_OUTPUT"] = "1" #issue #2657 pybuda.set_configuration_options(op_intermediates_to_save=op_intermediates) num_inputs = 4 @@ -1884,11 +1833,11 @@ def daisy_chain_2d(x): # insert daisy-chain along each column for j in range(columns): 
gelu_rows = [f"gelu_{i}_{j}" for i in range(rows)] - pybuda.insert_buffering_nop(input, gelu_rows, daisy_chain=True) + pybuda.insert_nop(input, gelu_rows, daisy_chain=True) # insert daisy-chain across first row gelu_first_row = [f"buffer_0_inputs_gelu_{0}_{j}" for j in range(columns)] - pybuda.insert_buffering_nop(input, gelu_first_row, daisy_chain=True) + pybuda.insert_nop(input, gelu_first_row, daisy_chain=True) return outputs @@ -1924,8 +1873,7 @@ def forward(self, x): x = pybuda.op.Conv3d("", x, self.get_parameter("weight"), None, stride=[stride, stride, stride], padding=[padding, padding, padding, padding, padding, padding], dilation=dilation, groups=1, channel_last=0) return x - compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object - compiler_cfg.enable_t_streaming = True + compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" input_shapes = ((1, inC, inD, inH, inW),) @@ -1958,8 +1906,7 @@ def forward(self, x): x = pybuda.op.MaxPool3d("", x, (kD, kH, kW), stride=stride, padding=padding, dilation=dilation, channel_last=0) return x - compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object - compiler_cfg.enable_t_streaming = True + compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "Ribbon" input_shapes = ((1, inC, inD, inH, inW),) @@ -1989,7 +1936,6 @@ def forward(self, x): return x compiler_cfg = pybuda.config._get_global_compiler_config() # load global compiler config object - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "Ribbon" input_shapes = ((1, 3, inD, inH, inW),) @@ -2021,7 +1967,7 @@ def forward(self, x): input_shapes = ((1, 3, 32, 32),) # input shape - mod = Module("test_emulate_harvested") + mod = Module("test") verify_module( mod, input_shapes, @@ -2032,3 +1978,132 @@ def forward(self, x): ) ) +def test_blackhole_golden_sanity(): + class Module(PyBudaModule): + def __init__(self, name): + super().__init__(name) + + def forward(self, a, b, c): + x = pybuda.op.Add("add0", a, b) + x = pybuda.op.Matmul("matmul0", x, c) + return x + + input_shapes = ((1, 3, 64, 64),(1, 3, 64, 64), (1, 3, 64, 64)) + + # input shape + module = Module("test_blackhole_golden_sanity") + verify_module( + module, + input_shapes, + verify_cfg=VerifyConfig( + test_kind=TestKind.INFERENCE, + devtype=BackendType.Golden, + arch=BackendDevice.Blackhole, + ) + ) + +def test_conv2d_transpose_0(test_device): + class Conv2d_transpose_model(torch.nn.Module): + def __init__(self, in_channel,out_channel,kernel_size,stride,padding,groups): + super().__init__() + self.model = torch.nn.ConvTranspose2d(in_channels=in_channel, out_channels=out_channel, + kernel_size=kernel_size, stride=stride, padding=padding, + output_padding=0, groups=groups, bias=False) + + def forward(self, input): + return self.model(input) + + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_RIBBON2"] = "1" + os.environ["PYBUDA_FORCE_CONV_MULTI_OP_FRACTURE"] = "1" + + # Different in_channel and out_channel + model = Conv2d_transpose_model(in_channel=256,out_channel=512,kernel_size=(4, 4),stride=(2, 2),padding=(1, 1),groups=1) + model.eval() + + tt_model = pybuda.PyTorchModule("conv2d_transpose", model) + input_shape = (1, 256, 12, 40) + + 
verify_module( + tt_model, + input_shapes=(input_shape,), + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + ) + ) + +def test_conv2d_transpose_1(test_device): + class Conv2d_transpose_model(torch.nn.Module): + def __init__(self, in_channel,out_channel,kernel_size,stride,padding,groups): + super().__init__() + self.model = torch.nn.ConvTranspose2d(in_channels=in_channel, out_channels=out_channel, + kernel_size=kernel_size, stride=stride, padding=padding, + output_padding=0, groups=groups, bias=False) + + def forward(self, input): + return self.model(input) + + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.balancer_policy = "Ribbon" + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + os.environ["PYBUDA_RIBBON2"] = "1" + + # Same in_channel and out_channel, but different groups + model = Conv2d_transpose_model(in_channel=256,out_channel=256,kernel_size=(4, 4),stride=(2, 2),padding=(1, 1),groups=256) + model.eval() + + tt_model = pybuda.PyTorchModule("conv2d_transpose", model) + input_shape = (1, 256, 12, 40) + + verify_module( + tt_model, + input_shapes=(input_shape,), + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + ) + ) + +# Verify that create sym link function creates a lock file in /tmp/user directory +def test_symlink_creation_per_user_lock(): + # create a simple file in the working sub directory + # working_directory/subdir/file.txt + working_directory = os.getcwd() + subdir = os.path.join(working_directory, "subdir") + os.makedirs(subdir, exist_ok=True) + file_path = os.path.join(subdir, "file.txt") + with open(file_path, "w") as f: + f.write("hello world") + + # create a symlink to the file in the working sub directory + # working_directory/symlink.txt -> working_directory/subdir/file.txt + symlink_path = os.path.join(working_directory, "symlink.txt") + ci.create_symlink(file_path, symlink_path) + + # check if the symlink was created + assert os.path.islink(symlink_path) + + # check if there is a lock file in /tmp/user directory + # /tmp/user/symlink.txt.lock + import pwd + user = pwd.getpwuid(os.getuid()).pw_name + assert user is not None + lock_file_path = f"/tmp/{user}/symlink.txt.lock" + # check if lock_file_path exists + assert os.path.exists(lock_file_path) + + # Test cleanup + # remove the symlink + os.remove(symlink_path) + # remove subdir and its file content + os.remove(file_path) + os.rmdir(subdir) + # remove the lock file + os.remove(lock_file_path) \ No newline at end of file diff --git a/pybuda/test/test_shapes.py b/pybuda/test/test_shapes.py index 679d803b2..ffc1443d1 100644 --- a/pybuda/test/test_shapes.py +++ b/pybuda/test/test_shapes.py @@ -58,7 +58,6 @@ ( (1, 32), (1, 1) ), ( (32, 1), (1, 1) ), ( (1, 1, 1, 32, 32), (1, 1, 1, 1, 32) ), - ( (1, 1, 1, 32, 32), (1, 1, 1, 32, 1) ), # not divisible by 32 ( (1, 1, 32, 33), (1, 1, 1, 33) ), diff --git a/pybuda/test/test_streaming.py b/pybuda/test/test_streaming.py index 141e6b65c..aedb576a0 100644 --- a/pybuda/test/test_streaming.py +++ b/pybuda/test/test_streaming.py @@ -25,7 +25,6 @@ def stream_transpose(a, b, param=None): return x compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True pybuda.config.override_op_size("add0", (1, 1)) pybuda.config.override_op_size("transpose0", (1, 1)) @@ -58,7 +57,6 @@ def stream_to_slice(x): return x 
compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True pybuda.config.override_op_size("buf0", (1, 1)) pybuda.config.override_op_size("buf1", (1, 1)) @@ -87,7 +85,6 @@ def stream_slice_transpose(x): return x compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True if mode == "producer_streaming": pybuda.config.override_t_stream_shape("producer", (2, 1)) @@ -108,7 +105,7 @@ def stream_slice_transpose(x): @pytest.mark.parametrize("dir", ["r", "c"]) def test_stream_interleave(test_device, dir): - pybuda.config.set_configuration_options(enable_t_streaming=True, balancer_policy="MaximizeTMinimizeGrid") + pybuda.config.set_configuration_options(balancer_policy="MaximizeTMinimizeGrid") pybuda.config.override_t_stream_dir("interleave", dir) @run(test_device) @@ -131,7 +128,6 @@ def manual_stream(x): return x compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.manual_t_streaming = True pybuda.config.override_t_stream_shape("buf1", (4, 1)) diff --git a/pybuda/test/test_torch_device.py b/pybuda/test/test_torch_device.py deleted file mode 100644 index 888ec577a..000000000 --- a/pybuda/test/test_torch_device.py +++ /dev/null @@ -1,248 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import pytest -import pybuda -import torch -import torch.nn as nn -import os -from transformers import BertModel, GPT2LMHeadModel, GPT2Config, GPT2Model -from pybuda.torch_compile import compile_torch -from typing import Tuple - -def test_gpt2(): - config = GPT2Config.from_pretrained("gpt2") - config.num_hidden_layers = 2 - - os.environ["PYBUDA_DEVMODE"] = "1" - compile_cfg = pybuda.config._get_global_compiler_config() - compile_cfg.enable_link_past_cache_ios = True - compile_cfg.cpu_fallback_ops = set() - compile_cfg.default_df_override = pybuda._C.Float16_b - compile_cfg.enable_t_streaming = True - - gpt2 = GPT2LMHeadModel(config).eval() - input_ids = torch.randint(0, 10000, (1, 32)).int() - golden = gpt2(input_ids) - - pybuda_mod = torch.compile(gpt2, backend=compile_torch, dynamic=False) - result = pybuda_mod(input_ids) - - next_token_logits = result[0] - next_token_logits = next_token_logits.to("cpu") - - res = result[0].to("cpu") - assert pybuda.op.eval.compare_tensor_to_golden(f"gpt2", golden[0], res, is_buda=True, pcc=0.99) - -def test_gen(): - pytest.skip() # Working on it - config = GPT2Config.from_pretrained("gpt2") - config.num_hidden_layers = 1 - - os.environ["PYBUDA_DEVMODE"] = "1" - compile_cfg = pybuda.config._get_global_compiler_config() - # compile_cfg.enable_link_past_cache_ios = True - compile_cfg.cpu_fallback_ops = set() - compile_cfg.default_df_override = pybuda._C.Float16_b - compile_cfg.enable_t_streaming = True - - gpt2 = GPT2LMHeadModel(config).eval() - gpt2.to("tt") - - input_ids = torch.randint(0, 10000, (1, 32)).int().to("tt") - # past_cache_shape = (1, 12, 96, 64) - # past_cache = [] - # for _ in range(config.num_hidden_layers): - # past_cache.append((torch.zeros(past_cache_shape).to("tt"), torch.zeros(past_cache_shape).to("tt"))) - # past_cache = tuple(past_cache) - - pybuda_mod = torch.compile(gpt2, backend=compile_torch, dynamic=False) - result = pybuda_mod(input_ids) - - res = result[0].to("cpu") - breakpoint() - inp2 = torch.randint(0, 10000, (1, 32)).int() - inp2 = inp2.to("tt") - result = pybuda_mod(inp2, result[1]) - -def test_add(): - class Add(nn.Module): - def __init__(self): - 
super().__init__() - - def forward(self, x1, x2): - return x1 + x2, x2 + x1 + 2 - - os.environ["PYBUDA_DEVMODE"] = "1" - model = Add() - inputs = [torch.rand(1, 32, 32), torch.rand(1, 32, 32)] - golden = model(*inputs) - pybuda_mod = torch.compile(model, backend=compile_torch) - # inputs = [i.to("tt") for i in inputs] - result = pybuda_mod(*inputs) - result = [r.to("cpu") for r in result] - - assert [torch.allclose(g, r) for g, r in zip(golden, result)] - -def test_linear(): - class Linear(nn.Module): - def __init__(self): - super().__init__() - self.linear = nn.Linear(32, 64, bias=True) - - def forward(self, x1, x2): - m1 = self.linear(x1) - return m1 + x2 - - os.environ["PYBUDA_DEVMODE"] = "1" - model = Linear() - inputs = [torch.rand(1, 32, 32), torch.rand(1, 32, 64)] - golden = model(*inputs) - # inputs = [i.to("tt") for i in inputs] - pybuda_mod = torch.compile(model, backend=compile_torch) - result = pybuda_mod(*inputs) - result = result.to("cpu") - - assert pybuda.op.eval.compare_tensor_to_golden(f"linear", golden, result, is_buda=True, pcc=0.99) - -def test_bert(): - os.environ["PYBUDA_DEVMODE"] = "1" - compile_cfg = pybuda.config._get_global_compiler_config() - compile_cfg.cpu_fallback_ops = set() - - bert = BertModel.from_pretrained("prajjwal1/bert-tiny", torchscript=True) - bert_cpu = BertModel.from_pretrained("prajjwal1/bert-tiny", torchscript=True) - - - input_ids = torch.randint(0, 10000, (1, 128)).int() - golden = bert_cpu(input_ids) - - print("Copying model") - bert.to("tt") - - print("Copying inputs") - input_ids = input_ids.to("tt") - - print("Compiling Model") - pybuda_mod = torch.compile(bert, backend=compile_torch, dynamic=False) - result = pybuda_mod(input_ids) - print("Copying outputs") - - result = [r.to("cpu") for r in result] - for i, (g, r) in enumerate(zip(golden, result)): - assert pybuda.op.eval.compare_tensor_to_golden(f"bert_{i}", g, r, is_buda=True, pcc=0.99) - - inp2 = torch.randint(0, 10000, (1, 128)).int() - golden = bert_cpu(inp2) - - inp2 = inp2.to("tt") - result = pybuda_mod(inp2) - result = [r.to("cpu") for r in result] - for i, (g, r) in enumerate(zip(golden, result)): - assert pybuda.op.eval.compare_tensor_to_golden(f"bert_{i}", g, r, is_buda=True, pcc=0.99) - - inp3 = torch.randint(0, 10000, (1, 64)).int() - golden = bert_cpu(inp3) - inp3 = inp3.to("tt") - result = pybuda_mod(inp3) - result = [r.to("cpu") for r in result] - for i, (g, r) in enumerate(zip(golden, result)): - assert pybuda.op.eval.compare_tensor_to_golden(f"bert_{i}", g, r, is_buda=True, pcc=0.99) - - inp4 = torch.randint(0, 10000, (1, 128)).int() - golden = bert_cpu(inp4) - inp4 = inp4.to("tt") - result = pybuda_mod(inp4) - result = [r.to("cpu") for r in result] - for i, (g, r) in enumerate(zip(golden, result)): - assert pybuda.op.eval.compare_tensor_to_golden(f"bert_{i}", g, r, is_buda=True, pcc=0.99) - - inp5 = torch.randint(0, 10000, (1, 64)).int() - golden = bert_cpu(inp5) - inp5 = inp5.to("tt") - result = pybuda_mod(inp5) - result = [r.to("cpu") for r in result] - for i, (g, r) in enumerate(zip(golden, result)): - assert pybuda.op.eval.compare_tensor_to_golden(f"bert_{i}", g, r, is_buda=True, pcc=0.99) - - -from torch._dynamo import export -from torch._decomp import register_decomposition -import torch -import torch.nn as nn - -torch._dynamo.reset() -import torch._dynamo as dynamo - - -def test_decomp(): - pytest.skip() #TODO fix: FATAL | Always - Unsupported (for now) _copy_from TTDevice[0] to TTDevice[0] - os.environ["PYBUDA_DEVMODE"] = "1" - class BasicModule(nn.Module): - 
def forward(self, x): - x = x * 2 - a,b,c = torch.split(x, 3, dim=-1) - return a + b + c - - mod, input = BasicModule(), torch.randn(2, 9).to(dtype=torch.float16) - - pybuda_mod = torch.compile(mod, backend=compile_torch, dynamic=False) - out = pybuda_mod(input) - -@pytest.mark.parametrize("shape", [(1024, 1024)]) -@pytest.mark.parametrize("mb", [1, 8, 16]) -@pytest.mark.parametrize("loop", [1, 8, 16]) -@pytest.mark.parametrize("native", [True, False]) -def test_push(shape, mb, loop, native): - if mb != 1: - pytest.skip() #TODO - os.environ["PYBUDA_DEVMODE"] = "1" - import time - - pybuda.config.set_configuration_options( - default_df_override=pybuda.config.DataFormat.Float32 - ) - - class Add(nn.Module): - def __init__(self): - super().__init__() - - def forward(self, x1, x2): - return x1 + x2 - - model = Add() - sample_inputs = [torch.empty(mb, 1, *shape), torch.empty(mb, 1, *shape)] - inputs = [(torch.ones(mb, 1, *shape), torch.ones(mb, 1, *shape))] * loop - - if native: - model = model.to("tt") - pybuda_mod = pybuda_mod = torch.compile(model, backend=compile_torch, dynamic=False) - comp_inputs = [i.to("tt") for i in inputs[0]] - result = pybuda_mod(*comp_inputs) # compile - start = time.perf_counter() - for args in inputs: - args = [a.to("tt") for a in args] - result = pybuda_mod(*args) - result.to("cpu") - elapsed = time.perf_counter() - start - else: - tt0 = pybuda.TTDevice("tt0") - tt0.place_module(pybuda.module.PyTorchModule("add", model)) - output_q = pybuda.initialize_pipeline( - training=False, sample_inputs=sample_inputs - ) - - start = time.perf_counter() - for i in range(loop): - tt0.push_to_inputs(inputs[i]) - pybuda.run_forward(input_count=loop) - for i in range(loop): - result = output_q.get(timeout=30) - elapsed = time.perf_counter() - start - - float32_size = 4 - data = mb * shape[0] * shape[1] * float32_size / (1024 * 1024) - - print( - f"Batch[{mb:2}] Loop[{loop:2}] Native[{native:1}] Data[{data}mB] Elapsed[{elapsed:2.4}sec]" - ) diff --git a/pybuda/test/test_transpose_ops_placement.py b/pybuda/test/test_transpose_ops_placement.py index 12a913fc4..aa4d210e3 100644 --- a/pybuda/test/test_transpose_ops_placement.py +++ b/pybuda/test/test_transpose_ops_placement.py @@ -113,8 +113,8 @@ def forward(self, act1, act2): @pytest.mark.parametrize("r", [x+1 for x in range(10)]) @pytest.mark.parametrize("c", [x+1 for x in range(10)]) def test_manual_op_transpose(test_device, r, c): - if test_device.arch == pybuda.BackendDevice.Wormhole_B0 and (r > 8 or c > 8): - pytest.skip("Wormhole has 8 columns, skip the op-test with c = 9 or 10") + if (test_device.arch == pybuda.BackendDevice.Wormhole_B0 or test_device.arch == pybuda.BackendDevice.Blackhole) and (r > 8 or c > 8): + pytest.skip(f"{test_device.arch.to_string()} has 8 columns, skip the op-test with c = 9 or 10") compiler_cfg = _get_global_compiler_config() dev_cfg = get_device_config(test_device.arch, [0], compiler_cfg.backend_cluster_descriptor_path, compiler_cfg.backend_runtime_params_path, compiler_cfg.store_backend_db_to_yaml, test_device.devtype) @@ -161,8 +161,8 @@ def test_auto_op_transpose_case1(test_device): grid_transpose = placer_solution.name_to_op_placement["add"].grid_transpose expected_placed_core_rows = (0,1) - # if WH_B0, there is 1 less op, since transpose combined with srcA - expected_placed_core_cols = (4,6) if test_device.is_wormhole_b0() else (5,7) + # if grayskull, there is 1 more op, since transpose combined with srcA isn't supported + expected_placed_core_cols = (5,7) if test_device.is_grayskull() else 
(4,6) assert (placed_core.start.row, placed_core.end.row) == expected_placed_core_rows, f"(placed_core.start.row, placed_core.end.row) = {(placed_core.start.row, placed_core.end.row)} != expected_placed_core_rows = ({expected_placed_core_rows})" assert (placed_core.start.col, placed_core.end.col) == expected_placed_core_cols, f"(placed_core.start.col, placed_core.end.col) = {(placed_core.start.col, placed_core.end.col)} != expected_placed_core_cols = ({expected_placed_core_cols})" @@ -227,7 +227,7 @@ def test_auto_op_transpose_case3(test_device): def test_auto_op_transpose_multi_rows1(test_device): - if test_device.arch == pybuda.BackendDevice.Wormhole_B0: + if test_device.arch != pybuda.BackendDevice.Grayskull: pytest.skip("Targetting grid-size of GS only") compiler_cfg = _get_global_compiler_config() @@ -259,7 +259,7 @@ def test_auto_op_transpose_multi_rows1(test_device): def test_auto_op_transpose_multi_rows2(test_device): - if test_device.arch == pybuda.BackendDevice.Wormhole_B0: + if test_device.arch != pybuda.BackendDevice.Grayskull: pytest.skip("Targetting grid-size of GS only") compiler_cfg = _get_global_compiler_config() diff --git a/pybuda/test/test_user.py b/pybuda/test/test_user.py index e4efe316b..72122742a 100644 --- a/pybuda/test/test_user.py +++ b/pybuda/test/test_user.py @@ -18,7 +18,6 @@ from pybuda.schedulers import LearningRateScheduler from pybuda.pybudaglobal import pybuda_reset -from pybuda._C.backend_api import BackendDevice, BackendType, DeviceMode from test.utils import download_model # https://github.com/pytorch/pytorch/wiki/Autograd-and-Fork @@ -846,40 +845,40 @@ def test_parallel_chips(): for i, p in enumerate(procs): p.join() -def test_tti_inference_save_and_load(): - available_devices = pybuda.detect_available_devices() - if available_devices and available_devices[0] == BackendDevice.Grayskull: - tt0 = pybuda.TTDevice( - "tt0", - arch=BackendDevice.Grayskull, - devtype=BackendType.Golden, - ) - else: - tt0 = pybuda.TTDevice( - "tt0", - arch=BackendDevice.Wormhole_B0, - devtype=BackendType.Golden, - ) - - - module = PyBudaTestModule("test_pybuda_module") - tt0.place_module(module) - - # Saving to Archive - input_shape = (1, 1, 32, 32) - input1, input2 = torch.rand(*input_shape), torch.rand(*input_shape) - device_img = tt0.compile_to_image( - img_path="device_images/test_tt0.tti", - training=False, - sample_inputs=(input1, input2), - ) - pybuda_reset() # flush the global state that lingers around for test - - # Loading from Archive - tt1 = pybuda.TTDevice.load_image(img_path="device_images/test_tt0.tti") - tt1.push_to_inputs((input1, input2)) - output_q = pybuda.run_inference() - output = _safe_read(output_q) +# def test_tti_inference_save_and_load(): +# available_devices = pybuda.detect_available_devices() +# if available_devices and available_devices[0] == BackendDevice.Grayskull: +# tt0 = pybuda.TTDevice( +# "tt0", +# arch=BackendDevice.Grayskull, +# devtype=BackendType.Golden, +# ) +# else: +# tt0 = pybuda.TTDevice( +# "tt0", +# arch=BackendDevice.Wormhole_B0, +# devtype=BackendType.Golden, +# ) + + +# module = PyBudaTestModule("test_pybuda_module") +# tt0.place_module(module) + +# # Saving to Archive +# input_shape = (1, 1, 32, 32) +# input1, input2 = torch.rand(*input_shape), torch.rand(*input_shape) +# device_img = tt0.compile_to_image( +# img_path="device_images/test_tt0.tti", +# training=False, +# sample_inputs=(input1, input2), +# ) +# pybuda_reset() # flush the global state that lingers around for test + +# # Loading from Archive +# tt1 = 
pybuda.TTDevice.load_image(img_path="device_images/test_tt0.tti") +# tt1.push_to_inputs((input1, input2)) +# output_q = pybuda.run_inference() +# output = _safe_read(output_q) @pytest.mark.parametrize("hoist_tms", [True, False]) @@ -887,7 +886,7 @@ def test_nop_insertion_api(hoist_tms): tt0 = pybuda.TTDevice("tt0", module=PyBudaTestQueryKeyModule(f"query_key_module_hoist_tms_{hoist_tms}")) # Use API to set manual data format override on an op - pybuda.insert_buffering_nop("mha_key", "mha_as", hoist_tms=hoist_tms) + pybuda.insert_nop("mha_key", "mha_as", hoist_tms=hoist_tms) microbatch_size, seq_len, hidden_dim = (1, 128, 128) encoder_input = torch.rand(microbatch_size, seq_len, hidden_dim) @@ -899,7 +898,7 @@ def test_nop_fork_insertion_api(hoist_tms): tt0 = pybuda.TTDevice("tt0", module=PyBudaTestQueryKeyModule(f"forking_nop_insertion{hoist_tms}")) # Use API to set manual data format override on an op - pybuda.insert_buffering_nop("encoder_input", ["mha_key", "mha_query"], hoist_tms=hoist_tms) + pybuda.insert_nop("encoder_input", ["mha_key", "mha_query"], hoist_tms=hoist_tms) microbatch_size, seq_len, hidden_dim = (1, 128, 128) encoder_input = torch.rand(microbatch_size, seq_len, hidden_dim) @@ -911,9 +910,9 @@ def test_nop_daily_chain_insertion_api(hoist_tms): tt0 = pybuda.TTDevice("tt0", module=PyBudaTestForkWithThreeUsers(f"daisy_chain_nop_insertion{hoist_tms}")) # Use API to set manual data format override on an op - pybuda.insert_buffering_nop("encoder_input", ["mm_a", "mm_b", "mm_c"], hoist_tms=hoist_tms) - pybuda.insert_buffering_nop("buffer_0_encoder_input_mm_a", ["mm_b", "mm_c"], hoist_tms=hoist_tms) - pybuda.insert_buffering_nop("buffer_0_buffer_0_encoder_input_mm_a_mm_b", ["mm_c"], hoist_tms=hoist_tms) + pybuda.insert_nop("encoder_input", ["mm_a", "mm_b", "mm_c"], hoist_tms=hoist_tms) + pybuda.insert_nop("buffer_0_encoder_input_mm_a", ["mm_b", "mm_c"], hoist_tms=hoist_tms) + pybuda.insert_nop("buffer_0_buffer_0_encoder_input_mm_a_mm_b", ["mm_c"], hoist_tms=hoist_tms) microbatch_size, seq_len, hidden_dim = (1, 128, 128) encoder_input = torch.rand(microbatch_size, seq_len, hidden_dim) diff --git a/pybuda/test/tti/test_tti.py b/pybuda/test/tti/test_tti.py deleted file mode 100644 index 5c8611142..000000000 --- a/pybuda/test/tti/test_tti.py +++ /dev/null @@ -1,504 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -from loguru import logger -from pybuda.device import Device -from pybuda.module import PyBudaModule -from pybuda.pybudaglobal import pybuda_reset -import pytest - -import queue -import random -import torch -import os - -import pybuda -import pybuda.op -from pybuda import PyTorchModule, TTDeviceImage, VerifyConfig -from pybuda._C.backend_api import BackendDevice, BackendType, DeviceMode, detect_available_silicon_devices -from transformers import BertModel, BertConfig -from ..common import ModuleBuilder, TestDevice, run, ModuleBuilder, device -from test.bert.modules import PyBudaFeedForward -from pybuda.ttdevice import get_device_config -from pybuda.config import _get_global_compiler_config -from test.utils import download_model - -class PyBudaTestModule(pybuda.PyBudaModule): - shape = (1, 1, 32, 32) - - def __init__(self, name): - super().__init__(name) - self.weights1 = pybuda.Parameter( - torch.rand(*PyBudaTestModule.shape, requires_grad=True) - ) - self.weights2 = pybuda.Parameter( - torch.rand(*PyBudaTestModule.shape, requires_grad=True) - ) - - def forward(self, act1, act2): - m1 = pybuda.op.Matmul("matmul1", act1, 
self.weights1) - m2 = pybuda.op.Matmul("matmul2", act2, self.weights2) - - add = pybuda.op.Add("add_mm", m1, m2) - constant = pybuda.op.Constant("constant", constant=2.0) - - add_constant = pybuda.op.Add("add_constant", add, constant) - - return add_constant - - @staticmethod - def _save_device_image( - *, - device_name, - arch, - backend_type, - device_mode, - module, - input_shapes, - target_shapes=tuple(), - enable_training=False, - enable_optimizer=False, - num_chips=1, - chip_ids=[] - ): - optimizer = ( - pybuda.optimizers.SGD(learning_rate=0.1, device_params=True) - if enable_optimizer - else None - ) - - # Create a TT device - tt0 = pybuda.TTDevice( - get_device_name(device_name, backend_type), - arch=arch, - devtype=backend_type, - optimizer=optimizer, - num_chips=num_chips, - chip_ids=chip_ids, - ) - - # Place a module on the device - tt0.place_module(module) - - if enable_training: - tt0.place_loss_module(pybuda.op.loss.L1Loss("l1_loss")) - - sample_inputs = [torch.rand(*shape) for shape in input_shapes] - sample_targets = [torch.rand(*shape) for shape in target_shapes] - - if device_mode == DeviceMode.CompileAndRun and enable_training: - loss_q, checkpoint_q = mp_context.Queue(), mp_context.Queue() - tt0.push_to_inputs(sample_inputs) - tt0.push_to_target_inputs(sample_targets) - pybuda.run_training(checkpoint_queue=checkpoint_q, loss_queue=loss_q) - elif device_mode == DeviceMode.CompileAndRun: - tt0.push_to_inputs(sample_inputs) - output_q = pybuda.run_inference() - output = _safe_read(output_q) - - # save device_image - device_img = tt0.compile_to_image( - img_path=f"device_images/{get_device_name(device_name, backend_type)}.tti", - training=enable_training, - sample_inputs=sample_inputs, - sample_targets=sample_targets, - ) - pybuda_reset() # NB: note the reset; invoke to clear the global state that lingers around - return device_img - - @staticmethod - def _load_device_image( - device_name, backend_type, *, set_module_params=False, enable_training=False - ): - # load device_image - img_path = f"device_images/{get_device_name(device_name, backend_type)}.tti" - img = TTDeviceImage.load_from_disk(img_path) - tt1 = pybuda.TTDevice.load_image(img=img) - - if set_module_params: - module = tt1.modules[-1] - module.set_parameter( - "weights1", torch.rand(*PyBudaTestModule.shape, requires_grad=True) - ) - module.set_parameter( - "weights2", torch.rand(*PyBudaTestModule.shape, requires_grad=True) - ) - - loss_q = mp_context.Queue() - checkpoint_q = mp_context.Queue() - - inputs = [torch.rand(shape) for shape in img.get_input_shapes()] - tt1.push_to_inputs(inputs) - - if enable_training: - targets = [torch.rand(shape) for shape in img.get_target_shapes()] - tt1.push_to_target_inputs(*targets) - pybuda.run_training(checkpoint_queue=checkpoint_q, loss_queue=loss_q) - print("checkpoint: ", _safe_read(checkpoint_q)) - print("loss: ", _safe_read(loss_q)) - else: - output_q = pybuda.run_inference() - output = _safe_read(output_q) - print(output) - - -class MyLinear(torch.nn.Module): - def __init__(self): - super().__init__() - self.lin = torch.nn.Linear(64, 128, bias=True) - - def forward(self, act): - x = self.lin(act) - return x - -class MatMulRelu(PyBudaModule): - shape = (1, 128, 128) - def __init__(self, name): - super().__init__(name) - self.weights1 = pybuda.Parameter( - torch.rand(*MatMulRelu.shape, requires_grad=True) - ) - - def forward(self, act): - matmul = pybuda.op.Matmul("matmul1", act, self.weights1) - relu = pybuda.op.Relu(f"relu", matmul) - return relu - - -def 
get_bert_encoder_module(): - config = download_model(BertConfig.from_pretrained, "prajjwal1/bert-tiny") - config.num_hidden_layers = 1 - model = BertModel(config=config) - module = PyTorchModule("bert_encoder", model.encoder) - return module - - -# https://github.com/pytorch/pytorch/wiki/Autograd-and-Fork -mp_context = torch.multiprocessing.get_context("spawn") - - -def _safe_read(q): - """ - Read a queue, but return None if an error was raised in the meantime, preventing a hang on error. - """ - while True: - try: - data = q.get(timeout=0.5) - return data - except queue.Empty as _: - if pybuda.error_raised(): - raise RuntimeError("Error raised in pybuda") - except KeyboardInterrupt: - return None - - -def get_device_name(device_name, backend_type: BackendType): - if backend_type == BackendType.Silicon: - backend_type_suffix = "silicon" - else: - backend_type_suffix = "golden" - return f"{device_name}_{backend_type_suffix}" - - -@pytest.fixture -def pybuda_module(): - return PyBudaTestModule("test_pybuda_module") - -def test_inference_compile_to_image_and_run_then_rerun_from_image(test_device, pybuda_module): - pybuda_module._save_device_image( - device_name = "tt0", - arch = test_device.arch, - backend_type = BackendType.Golden, - device_mode = DeviceMode.CompileAndRun, - module = pybuda_module, - input_shapes = [(4, 32, 32), (4, 32, 32)], - ) - pybuda_module._load_device_image("tt0", BackendType.Golden) - -def test_inference_compile_only_then_run_from_image(test_device, pybuda_module): - pybuda_module._save_device_image( - device_name = "tt1", - arch = test_device.arch, - backend_type = BackendType.Golden, - device_mode = DeviceMode.CompileOnly, - module = pybuda_module, - input_shapes = [(4, 32, 32), (4, 32, 32)], - ) - pybuda_module._load_device_image("tt1", BackendType.Golden) - - -def test_inference_compile_only_silicon_target_device(test_device, pybuda_module): - pybuda_module._save_device_image( - device_name = "tt2", - arch = test_device.arch, - backend_type = BackendType.Silicon, - device_mode = DeviceMode.CompileOnly, - module = pybuda_module, - input_shapes = [(4, 32, 32), (4, 32, 32)], - ) - -@pytest.mark.parametrize("num_harvested_rows", [x+1 for x in range(3)]) -def test_inference_compile_only_silicon_target_device_harvested_manual(test_device, num_harvested_rows, pybuda_module): - compiler_cfg = _get_global_compiler_config() - dev_cfg = get_device_config(test_device.arch, [0], compiler_cfg.backend_cluster_descriptor_path, compiler_cfg.backend_runtime_params_path, compiler_cfg.store_backend_db_to_yaml, BackendType.Golden) - if num_harvested_rows < 10 - dev_cfg.grid_size.r: - pytest.skip("Simulated harveted rows are less than actually harvested rows on the silicon") - - detected_harvested_rows = [] - harvesting_rows_available = [1,2,3,4,5,7,8,9,10,11] - if dev_cfg.grid_size.r < 10: - row_coordinate = 0 - harvesting_mask = dev_cfg.get_harvested_cfg() - while harvesting_mask: - if (harvesting_mask & 1): - detected_harvested_rows.append(row_coordinate) - harvesting_rows_available.remove(row_coordinate) - harvesting_mask = (harvesting_mask >> 1) - row_coordinate += 1 - harvested_rows = random.sample(harvesting_rows_available, num_harvested_rows-len(detected_harvested_rows))+detected_harvested_rows - pybuda.set_configuration_options(harvested_rows=[harvested_rows]) - device_image = pybuda_module._save_device_image( - device_name = "tt2-harvested-manual", - arch = test_device.arch, - backend_type = BackendType.Silicon, - device_mode = DeviceMode.CompileOnly, - module = 
pybuda_module, - input_shapes = [(4, 32, 32), (4, 32, 32)], - ) - pybuda_reset() - device_image.info() - - -@pytest.mark.skip(reason="currently local testing only") -def test_inference_compile_only_silicon_target_device_harvested_auto(test_device, pybuda_module): - device_image = pybuda_module._save_device_image( - device_name = "tt2-harvested-auto", - arch = test_device.arch, - backend_type = BackendType.Golden, - device_mode = DeviceMode.CompileOnly, - module = pybuda_module, - input_shapes = [(4, 32, 32), (4, 32, 32)], - ) - device_image.info() - - -def test_inference_compile_only_module_params_unset_on_save(test_device, pybuda_module): - pybuda_module._save_device_image( - device_name = "tt3", - arch = test_device.arch, - backend_type = BackendType.Golden, - device_mode = DeviceMode.CompileOnly, - module = pybuda_module, - input_shapes = [(4, 32, 32), (4, 32, 32)], - ) - pybuda_module._load_device_image("tt3", BackendType.Golden, set_module_params=True) - - -def test_inference_output_host_tms(test_device, pybuda_module): - """ - def gelu_relu(act): - op0 = pybuda.op.Gelu(f"op0", act) - op1 = pybuda.op.Relu(f"op1", op0) - return op1 - module = ModuleBuilder(gelu_relu) - """ - module = MatMulRelu("matmul_relu") - - pybuda_module._save_device_image( - device_name = "output_host_tms", - arch = test_device.arch, - backend_type = BackendType.Golden, - device_mode = DeviceMode.CompileOnly, - module = module, - input_shapes = [(1, 128, 128),], - ) - pybuda_module._load_device_image("output_host_tms", BackendType.Golden) - - -def test_training_compile_to_image_and_run_then_rerun_from_image(test_device, pybuda_module): - pybuda_module._save_device_image( - device_name = "tt4", - arch = test_device.arch, - backend_type = BackendType.Golden, - device_mode = DeviceMode.CompileAndRun, - enable_training = True, - module = pybuda_module, - input_shapes = [(4, 32, 32), (4, 32, 32)], - target_shapes = [ - (4, 32, 32), - ], - ) - pybuda_module._load_device_image("tt4", BackendType.Golden, enable_training=True) - - -def test_training_compile_only_then_run_from_image(test_device, pybuda_module): - pybuda_module._save_device_image( - device_name = "tt5", - arch = test_device.arch, - backend_type = BackendType.Golden, - device_mode = DeviceMode.CompileOnly, - enable_training = True, - module = pybuda_module, - input_shapes = [(4, 32, 32), (4, 32, 32)], - target_shapes = [ - (4, 32, 32), - ], - ) - pybuda_module._load_device_image("tt5", BackendType.Golden, enable_training=True) - - -def test_training_compile_only_silicon_target_device(test_device, pybuda_module): - pybuda_module._save_device_image( - device_name = "tt6", - arch = test_device.arch, - backend_type = BackendType.Silicon, - device_mode = DeviceMode.CompileOnly, - module = pybuda_module, - input_shapes = [(4, 32, 32), (4, 32, 32)], - ) - - -def test_training_compile_only_then_run_from_image_with_optimizer(test_device, pybuda_module): - pybuda_module._save_device_image( - device_name = "tt7", - arch = test_device.arch, - backend_type = BackendType.Golden, - device_mode = DeviceMode.CompileOnly, - enable_optimizer = True, - enable_training = True, - module = pybuda_module, - input_shapes = [(4, 32, 32), (4, 32, 32)], - target_shapes = [ - (4, 32, 32), - ], - ) - pybuda_module._load_device_image("tt7", BackendType.Golden, enable_training=True) - - -def test_device_image_apis(test_device, pybuda_module): - inference_img = pybuda_module._save_device_image( - device_name = "tt_inference", - arch = test_device.arch, - backend_type = BackendType.Golden, 
- device_mode = DeviceMode.CompileOnly, - module = pybuda_module, - input_shapes = [(4, 32, 32), (4, 32, 32)], - ) - training_img = pybuda_module._save_device_image( - device_name = "tt_training", - arch = test_device.arch, - backend_type = BackendType.Golden, - device_mode = DeviceMode.CompileOnly, - enable_training = True, - module = pybuda_module, - input_shapes = [(4, 32, 32), (4, 32, 32)], - target_shapes = [ - (4, 32, 32), - ], - ) - - assert not inference_img.is_compiled_for_training() - assert training_img.is_compiled_for_training() - - -def test_const_eval_save_and_load(test_device, pybuda_module): - pybuda_module._save_device_image( - device_name = "tt9", - arch = test_device.arch, - backend_type = BackendType.Golden, - device_mode = DeviceMode.CompileAndRun, - module = PyTorchModule("pt_linear", MyLinear()), - input_shapes = [ - (1, 128, 64), - ], - ) - pybuda_module._load_device_image("tt9", BackendType.Golden) - - -def test_pt_encoder_silicon_save_and_inspect(test_device, pybuda_module): - device_img = pybuda_module._save_device_image( - device_name = "tt10", - arch = test_device.arch, - backend_type = BackendType.Silicon, - device_mode = DeviceMode.CompileOnly, - module = get_bert_encoder_module(), - input_shapes = [(1, 128, 128), (1, 1, 128, 128)], - ) - device_img.info() - - -def test_pt_encoder_golden_save_and_load(test_device, pybuda_module): - pybuda_module._save_device_image( - device_name = "tt11", - arch = test_device.arch, - backend_type = BackendType.Golden, - device_mode = DeviceMode.CompileOnly, - module = get_bert_encoder_module(), - input_shapes = [(1, 128, 128), (1, 1, 128, 128)], - ) - pybuda_module._load_device_image("tt11", BackendType.Golden) - - -def test_tti_buffer_queue(test_device, pybuda_module): - pybuda.config.set_configuration_options() - pybuda.config.override_op_size("matmul_22", (2,2)) - pybuda.config.insert_buffering_nop("matmul_22", "matmul_29", hoist_tms = False) - pybuda.config.override_dram_queue_placement("layer.0.attention.self.query.weight", channel=4) - pybuda_module._save_device_image( - device_name = "tti_buffer_queue", - arch = test_device.arch, - backend_type = test_device.devtype, - device_mode = DeviceMode.CompileOnly, - module = get_bert_encoder_module(), - input_shapes = [(1, 128, 128), (1, 1, 128, 128)], - ) - pybuda_module._load_device_image("tti_buffer_queue", test_device.devtype) - - -class simple_matmul(PyBudaModule): - def __init__(self, name): - super().__init__(name) - self.weights = pybuda.Parameter(*(1,1,32,128), requires_grad=True) - - def forward(self, x): - x = pybuda.op.Matmul("", x, self.weights) - return x - -@pytest.mark.parametrize("two_rows_harvested", [True, False]) -def test_tti_save_load_verify_module(test_device, two_rows_harvested): - if test_device.arch == BackendDevice.Grayskull: - pytest.skip("Under development") - - tt0 = simple_matmul("test_tti_save_load_verify_module") - input_shape = (1, 1, 32, 32) - pybuda.verify.verify_module( - tt0, - (input_shape,), - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - devmode=test_device.devmode, - test_kind=pybuda.verify.config.TestKind.INFERENCE, - ), - ) - - -def test_tti_bert_encoder(test_device): - input_shapes = [(1, 128, 128), (1, 1, 128, 128)], - pybuda.verify.verify_module( - get_bert_encoder_module(), - *input_shapes, - verify_cfg=VerifyConfig( - arch=test_device.arch, - devtype=test_device.devtype, - devmode=test_device.devmode, - test_kind=pybuda.verify.config.TestKind.INFERENCE, - ), - ) - -if __name__ == "__main__": 
- import os - - os.environ["LOGURU_LEVEL"] = "TRACE" - - test_pt_encoder_golden_save_and_load() diff --git a/pybuda/test/tti/test_tti_merge.py b/pybuda/test/tti/test_tti_merge.py deleted file mode 100644 index 0d141a05d..000000000 --- a/pybuda/test/tti/test_tti_merge.py +++ /dev/null @@ -1,79 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -import subprocess as sp -import argparse -import os -from loguru import logger -from pybuda.tools.tti_merge import merge_models - -if __name__ == "__main__": - try: - # Remove this config var, else dir names are too long in CI - if "PYBUDA_CI_DIR" in os.environ: - print("Found var") - del os.environ["PYBUDA_CI_DIR"] - - parser = argparse.ArgumentParser() - parser.add_argument("--models_to_merge", type = str, help = "List of models to generate TTIs for and merge.", default = "*", nargs = "*") - parser.add_argument("--device_cfg", type = str, help = "Choose between wh_nebula_x1 and wh_nebula_x2", required = True) - parser.add_argument("--merge_ttis", type = bool, help = "Merge Generated TTIs into a single image", default = False) - parser.add_argument("--disable_host_queues", type = bool, help = "Disable activations in host memory", default = False) - parser.add_argument("--disable_memory_optimizations", type = bool, help = "Disable low level memory optimizations done during model fusion", default = False) - args = parser.parse_args() - - tti_build_dir = "device_images_to_merge/" - valid_models = { - "resnet" : os.path.abspath("pybuda/test/benchmark/benchmark.py") + " -m resnet -c resnet50 -df Fp16_b -mf HiFi3 -o perf.json -mb 128 --save_tti " + tti_build_dir + "/resnet.tti --arch wormhole_b0 --device-config ", - "mobilenet_v1" : os.path.abspath("pybuda/test/benchmark/benchmark.py") + " -m mobilenet_v1 -c 224 -df Fp16_b -mf HiFi2 -o perf.json -mb 256 --save_tti " + tti_build_dir + "/mobilenet_v1.tti --arch wormhole_b0 --device-config ", - "mobilenet_v2" : "pybuda/test/benchmark/benchmark.py -m mobilenet_v2 -c 224 -df Fp16_b -mf HiFi2 -o perf.json -mb 256 --save_tti " + tti_build_dir + "/mobilenet_v2.tti --arch wormhole_b0 --device-config ", - "mobilenet_v3" : "pybuda/test/benchmark/benchmark.py -m mobilenet_v3_timm -c large -df Fp16_b -mf HiFi2 -o perf.json -mb 256 --save_tti " + tti_build_dir + "/mobilenet_v3.tti --arch wormhole_b0 --device-config ", - "hrnet" : "pybuda/test/benchmark/benchmark.py -m hrnet -c v2_w64 -df Fp16_b -mf HiFi3 -o perf.json -mb 128 --save_tti " + tti_build_dir + "/hrnet.tti --arch wormhole_b0 --device-config ", - "vit" : "pybuda/test/benchmark/benchmark.py -m vit -c base -df Fp16_b -mf HiFi2 -o perf.json -mb 64 --save_tti " + tti_build_dir + "/vit.tti --arch wormhole_b0 --device-config ", - "deit" : "pybuda/test/benchmark/benchmark.py -m deit -c base -df Fp16_b -mf HiFi2 -o perf.json -mb 128 --save_tti " + tti_build_dir + "/deit.tti --arch wormhole_b0 --device-config ", - "unet" : "pybuda/test/benchmark/benchmark.py -m unet -c 256 -mb 48 -df Fp16_b -mf HiFi3 -o perf.json -mb 16 --save_tti " + tti_build_dir + "/unet.tti --arch wormhole_b0 --device-config ", - "inception" : "pybuda/test/benchmark/benchmark.py -m inception_v4 -c 224 -df Fp16_b -mf HiFi3 -o perf.json -mb 32 --save_tti " + tti_build_dir + "/inception.tti --arch wormhole_b0 --device-config ", - "bert_large" : "pybuda/test/benchmark/benchmark.py -m bert -c large_tc -df Fp16_b -mf HiFi3 -o perf.json -mb 64 --save_tti " + tti_build_dir + "/bert_large.tti --arch wormhole_b0 --device-config ", - } - - 
os.makedirs(tti_build_dir, exist_ok=True) - harvesting_flag = ['--env'] - if args.device_cfg == "wh_nebula_x1": - harvesting_flag = harvesting_flag + ['PYBUDA_FORCE_EMULATE_HARVESTED=1 TT_BACKEND_HARVESTED_ROWS=2048'] - elif args.device_cfg == "wh_nebula_x2": - harvesting_flag = harvesting_flag + ['PYBUDA_FORCE_EMULATE_HARVESTED=1 TT_BACKEND_HARVESTED_ROWS=2050'] - else: - logger.exception("Unsupported device cfg: {}", args.device_cfg) - - # # Generate TTIs - tti_locations = [] - if args.disable_host_queues: - os.environ["PYBUDA_ENABLE_INPUT_QUEUES_ON_HOST"] = "0" - os.environ["PYBUDA_ENABLE_OUTPUT_QUEUES_ON_HOST"] = "0" - os.environ["PYBUDA_TTI_BACKEND_FORMAT"] = "1" - - models_to_merge = [] - if args.models_to_merge == "*": - models_to_merge = valid_models - else: - for model in args.models_to_merge: - assert model in valid_models, "Model: " + model + "is not in the list of valid models." - models_to_merge.append(model) - # Generate TTIs - for model in models_to_merge: - assert model in valid_models, "Model: " + model + "is not in the list of valid models." - logger.info("Generating TTI for {}", model) - cmd = "python3 " + valid_models[model] + args.device_cfg - cmd = cmd.split(" ") - cmd = cmd + harvesting_flag - sp.run(cmd) - tti_locations.append(os.path.abspath(os.path.join(tti_build_dir, model + ".tti"))) - # Merge TTIs - logger.info("Merging TTIs") - if args.disable_memory_optimizations: - merge_models(tti_locations, "wormhole_b0", "merged_model.tti", False, False) - else: - merge_models(tti_locations, "wormhole_b0", "merged_model.tti") - except Exception as e: - logger.exception(e) - \ No newline at end of file diff --git a/pybuda/test/tvm/cnn/onnx/test_resnet.py b/pybuda/test/tvm/cnn/onnx/test_resnet.py index 2925941de..b0d5cad3f 100644 --- a/pybuda/test/tvm/cnn/onnx/test_resnet.py +++ b/pybuda/test/tvm/cnn/onnx/test_resnet.py @@ -47,8 +47,6 @@ def test_resnet_onnx(test_kind, test_device): # Configurations compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True - # Sanity run input_shape = (1, 3, 224, 224) diff --git a/pybuda/test/tvm/cnn/pytorch/tests_A/test_alexnet.py b/pybuda/test/tvm/cnn/pytorch/tests_A/test_alexnet.py index c3142b9c6..66d93b9bd 100644 --- a/pybuda/test/tvm/cnn/pytorch/tests_A/test_alexnet.py +++ b/pybuda/test/tvm/cnn/pytorch/tests_A/test_alexnet.py @@ -32,7 +32,6 @@ def test_tvm_alexnet(test_kind, test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True if test_kind.is_training(): compiler_cfg.compile_depth = ( CompileDepth.GENERATE_INITIAL_GRAPH diff --git a/pybuda/test/tvm/cnn/pytorch/tests_A/test_autoencoder.py b/pybuda/test/tvm/cnn/pytorch/tests_A/test_autoencoder.py index 972457cb2..8ad70db9d 100644 --- a/pybuda/test/tvm/cnn/pytorch/tests_A/test_autoencoder.py +++ b/pybuda/test/tvm/cnn/pytorch/tests_A/test_autoencoder.py @@ -99,7 +99,6 @@ def test_linear_autoencoder(test_kind, test_device): pytest.skip() compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" framework_model = LinearAE() @@ -126,7 +125,6 @@ def test_conv_autoencoder(test_kind, test_device): pytest.skip() compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" if test_kind.is_training(): compiler_cfg.compile_depth = CompileDepth.PRE_LOWERING_PASS diff --git 
a/pybuda/test/tvm/cnn/pytorch/tests_A/test_convnext.py b/pybuda/test/tvm/cnn/pytorch/tests_A/test_convnext.py index 63935c333..3b8dd25a2 100644 --- a/pybuda/test/tvm/cnn/pytorch/tests_A/test_convnext.py +++ b/pybuda/test/tvm/cnn/pytorch/tests_A/test_convnext.py @@ -37,7 +37,6 @@ def test_convnext_tiny(test_kind, test_device): # tenstorrent/pybuda#365 compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True compiler_cfg.retain_tvm_python_files = True framework_model = download_model(ConvNextModel.from_pretrained, "facebook/convnext-tiny-224", torchscript=True) diff --git a/pybuda/test/tvm/cnn/pytorch/tests_A/test_dalle_vae.py b/pybuda/test/tvm/cnn/pytorch/tests_A/test_dalle_vae.py index b70eb6964..cea0ed347 100644 --- a/pybuda/test/tvm/cnn/pytorch/tests_A/test_dalle_vae.py +++ b/pybuda/test/tvm/cnn/pytorch/tests_A/test_dalle_vae.py @@ -53,7 +53,6 @@ def test_tvm_dalle_Decoder(test_kind, test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" os.environ["PYBUDA_RIBBON2"] = "1" - compiler_cfg.enable_t_streaming = True if test_kind.is_training(): compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER diff --git a/pybuda/test/tvm/cnn/pytorch/tests_A/test_efficientnet.py b/pybuda/test/tvm/cnn/pytorch/tests_A/test_efficientnet.py index 05ebb3ff8..254163b78 100644 --- a/pybuda/test/tvm/cnn/pytorch/tests_A/test_efficientnet.py +++ b/pybuda/test/tvm/cnn/pytorch/tests_A/test_efficientnet.py @@ -59,7 +59,6 @@ def test_efficientnet_stem(test_kind, test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True model = download_model(torch.hub.load, "NVIDIA/DeepLearningExamples:torchhub", @@ -89,7 +88,6 @@ def test_efficientnet_b0(test_kind, test_device): import timm compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True model = timm.create_model('efficientnet_b0', pretrained=True) module = PyTorchModule("efficientnet_b0", model) @@ -113,7 +111,6 @@ def test_efficientnet_b4(test_kind, test_device): import timm compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True model = timm.create_model('efficientnet_b4', pretrained=True) module = PyTorchModule("efficientnet_b0", model) diff --git a/pybuda/test/tvm/cnn/pytorch/tests_A/test_fcn.py b/pybuda/test/tvm/cnn/pytorch/tests_A/test_fcn.py index 3a7c9adcb..f272af846 100644 --- a/pybuda/test/tvm/cnn/pytorch/tests_A/test_fcn.py +++ b/pybuda/test/tvm/cnn/pytorch/tests_A/test_fcn.py @@ -31,7 +31,6 @@ def test_fcn_pytorch(test_kind, test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True # Issue below is still valid, though it doesn't trigger when fracturing is turned on # tenstorrent/pybuda#310 diff --git a/pybuda/test/tvm/cnn/pytorch/tests_A/test_googlenet.py b/pybuda/test/tvm/cnn/pytorch/tests_A/test_googlenet.py index 1af3b5560..a1d35688b 100644 --- a/pybuda/test/tvm/cnn/pytorch/tests_A/test_googlenet.py +++ b/pybuda/test/tvm/cnn/pytorch/tests_A/test_googlenet.py @@ -30,7 +30,6 @@ def test_tvm_googlenet(test_kind, test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True if test_kind.is_training(): compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH @@ 
-69,7 +68,6 @@ def test_googlenet_torchvision(test_kind, test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True model = download_model(models.googlenet, pretrained=True) module = PyTorchModule("googlenet_pt", model) diff --git a/pybuda/test/tvm/cnn/pytorch/tests_A/test_hrnet.py b/pybuda/test/tvm/cnn/pytorch/tests_A/test_hrnet.py index 7647e9f0f..754826d15 100644 --- a/pybuda/test/tvm/cnn/pytorch/tests_A/test_hrnet.py +++ b/pybuda/test/tvm/cnn/pytorch/tests_A/test_hrnet.py @@ -29,7 +29,6 @@ def test_hrnet_full_model(test_kind, test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True if test_kind.is_training(): compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH @@ -53,7 +52,7 @@ def test_hrnet_full_model(test_kind, test_device): def test_hrnet_basic_block(test_kind, test_device): if ( - test_kind == TestKind.TRAINING + test_kind.is_training() ): # Always run with recompute in post-commit CI. Nightly tests both pytest.skip() @@ -84,7 +83,7 @@ def test_hrnet_basic_block(test_kind, test_device): def test_hrnet_fuse_layer(test_kind, test_device): if ( - test_kind == TestKind.TRAINING + test_kind.is_training() ): # Always run with recompute in post-commit CI. Nightly tests both pytest.skip() diff --git a/pybuda/test/tvm/cnn/pytorch/tests_B/test_ghostnet.py b/pybuda/test/tvm/cnn/pytorch/tests_B/test_ghostnet.py index 23c10dbe4..5eab8f1d9 100644 --- a/pybuda/test/tvm/cnn/pytorch/tests_B/test_ghostnet.py +++ b/pybuda/test/tvm/cnn/pytorch/tests_B/test_ghostnet.py @@ -23,7 +23,6 @@ def test_ghostnet(test_kind, test_device): pytest.skip() # Backward is currently unsupported compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" #Fusing disabled due to tenstorrent/pybuda#800 @@ -62,7 +61,6 @@ def test_ghostnet_v2(test_kind, test_device): # STEP 1: Set PyBuda configuration parameters compiler_cfg = _get_global_compiler_config() # load global compiler config object compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True # Model load localfile, _ = urllib.request.urlretrieve("https://github.com/huawei-noah/ghostnet/raw/master/ghostnetv2_pytorch/model/ghostnetv2_torch.py") diff --git a/pybuda/test/tvm/cnn/pytorch/tests_B/test_mobilenet_v2.py b/pybuda/test/tvm/cnn/pytorch/tests_B/test_mobilenet_v2.py index 30cd1e9c2..9287d34e7 100644 --- a/pybuda/test/tvm/cnn/pytorch/tests_B/test_mobilenet_v2.py +++ b/pybuda/test/tvm/cnn/pytorch/tests_B/test_mobilenet_v2.py @@ -34,7 +34,6 @@ def test_mobilenetv2_pytorch(test_kind, test_device): os.environ["PYBUDA_DISABLE_CONSTANT_FOLDING"] = "1" compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" model = download_model(torch.hub.load, @@ -79,7 +78,6 @@ def test_mobilenetv2_deeplab(test_kind, test_device): os.environ["PYBUDA_PAD_SPARSE_MM"] = "{25:26}" compiler_cfg = _get_global_compiler_config() - # compiler_cfg.enable_t_streaming = True # setting this allows the test to compile, with a 0.97 PCC on golden output compiler_cfg.balancer_policy = "CNN" model = download_model(MobileNetV2ForSemanticSegmentation.from_pretrained, "Matthijs/deeplabv3_mobilenet_v2_1.0_513") diff --git a/pybuda/test/tvm/cnn/pytorch/tests_B/test_mobilenet_v3.py b/pybuda/test/tvm/cnn/pytorch/tests_B/test_mobilenet_v3.py index ad35d5986..7a88b92ca 100644 --- 
a/pybuda/test/tvm/cnn/pytorch/tests_B/test_mobilenet_v3.py +++ b/pybuda/test/tvm/cnn/pytorch/tests_B/test_mobilenet_v3.py @@ -29,7 +29,6 @@ def test_mobilenet_v3_small(test_kind, test_device): pytest.skip() compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" # tenstorrent/pybuda#392 diff --git a/pybuda/test/tvm/cnn/pytorch/tests_B/test_resnet.py b/pybuda/test/tvm/cnn/pytorch/tests_B/test_resnet.py index 85c7f7ed2..ce8c3a25e 100644 --- a/pybuda/test/tvm/cnn/pytorch/tests_B/test_resnet.py +++ b/pybuda/test/tvm/cnn/pytorch/tests_B/test_resnet.py @@ -40,7 +40,7 @@ def get_relaxed_atol_pcc(test_kind, test_device, microbatch_size=1): def test_resnet_pytorch(test_kind, test_device): # Always run with recompute in post-commit CI. Nightly tests both - if test_kind == TestKind.TRAINING: + if test_kind.is_training(): pytest.skip() compiler_cfg = _get_global_compiler_config() @@ -50,7 +50,6 @@ def test_resnet_pytorch(test_kind, test_device): compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER # compiler_cfg.compile_depth = CompileDepth.FULL compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True # compiler_cfg.place_on_new_epoch("max_pool2d_14.dc.reshape.0_operand_commute_clone411.dc.sparse_matmul.4.lc2") # Issue below is still valid, though it doesn't trigger when fracturing is turned on @@ -96,7 +95,6 @@ def test_resnet_pytorch_instance_norm(test_kind, test_device): # compiler_cfg.compile_depth = CompileDepth.BALANCER_PASS compiler_cfg.compile_depth = CompileDepth.FULL compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True compiler_cfg.place_on_new_epoch("conv2d_0.dc.reshape.15.dc.sparse_matmul.1.lc2") from torchvision.models import resnet18 diff --git a/pybuda/test/tvm/cnn/pytorch/tests_B/test_resnext.py b/pybuda/test/tvm/cnn/pytorch/tests_B/test_resnext.py index dd8f418eb..4a7f2656d 100644 --- a/pybuda/test/tvm/cnn/pytorch/tests_B/test_resnext.py +++ b/pybuda/test/tvm/cnn/pytorch/tests_B/test_resnext.py @@ -33,7 +33,6 @@ def test_resnext(test_kind, test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True if test_kind.is_training(): compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER diff --git a/pybuda/test/tvm/cnn/pytorch/tests_B/test_vgg.py b/pybuda/test/tvm/cnn/pytorch/tests_B/test_vgg.py index 26f72a9c4..7e1b2add2 100644 --- a/pybuda/test/tvm/cnn/pytorch/tests_B/test_vgg.py +++ b/pybuda/test/tvm/cnn/pytorch/tests_B/test_vgg.py @@ -28,7 +28,6 @@ def test_vgg_pytorch(test_kind, test_device): pytest.skip() # Backward is currently unsupported compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "Ribbon" model = download_model(torch.hub.load, "pytorch/vision:v0.10.0", "vgg11", pretrained=True) diff --git a/pybuda/test/tvm/cnn/pytorch/tests_B/test_vit.py b/pybuda/test/tvm/cnn/pytorch/tests_B/test_vit.py index 9d9912d92..86540857d 100644 --- a/pybuda/test/tvm/cnn/pytorch/tests_B/test_vit.py +++ b/pybuda/test/tvm/cnn/pytorch/tests_B/test_vit.py @@ -27,7 +27,6 @@ def test_tvm_visual_transformer(test_kind, test_device): # Compiler configurations compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True # Load model config = ViTConfig() diff --git a/pybuda/test/tvm/cnn/pytorch/tests_C/test_densenet.py b/pybuda/test/tvm/cnn/pytorch/tests_C/test_densenet.py index 
7b8a2fc4b..b11a45465 100644 --- a/pybuda/test/tvm/cnn/pytorch/tests_C/test_densenet.py +++ b/pybuda/test/tvm/cnn/pytorch/tests_C/test_densenet.py @@ -25,7 +25,6 @@ def test_densenet_121(test_kind, test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True # Add required env vars as per https://yyz-gitlab.local.tenstorrent.com/tenstorrent/model-demos/-/issues/48 import os @@ -56,7 +55,6 @@ def test_densenet_169(test_kind, test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True # Add required env vars as per https://yyz-gitlab.local.tenstorrent.com/tenstorrent/model-demos/-/issues/48 import os @@ -90,7 +88,6 @@ def test_densenet_201(test_kind, test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True # Add required env vars as per https://yyz-gitlab.local.tenstorrent.com/tenstorrent/model-demos/-/issues/48 import os @@ -124,7 +121,6 @@ def test_densenet_161(test_kind, test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True # Add required env vars as per https://yyz-gitlab.local.tenstorrent.com/tenstorrent/model-demos/-/issues/48 import os diff --git a/pybuda/test/tvm/cnn/pytorch/tests_C/test_yolov5.py b/pybuda/test/tvm/cnn/pytorch/tests_C/test_yolov5.py index 68fc63591..663d6c582 100644 --- a/pybuda/test/tvm/cnn/pytorch/tests_C/test_yolov5.py +++ b/pybuda/test/tvm/cnn/pytorch/tests_C/test_yolov5.py @@ -48,7 +48,6 @@ def test_yolov5_320x320(test_kind, test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tm_cpu_fallback = True model = download_model(torch.hub.load, "ultralytics/yolov5", "yolov5s", pretrained=True) @@ -89,7 +88,6 @@ def test_yolov5_480x480(test_kind, test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tm_cpu_fallback = True model = download_model(torch.hub.load, "ultralytics/yolov5", "yolov5s", pretrained=True) @@ -133,7 +131,6 @@ def test_yolov5m_640x640(test_kind, test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tm_cpu_fallback = True model = download_model(torch.hub.load, "ultralytics/yolov5", "yolov5m", pretrained=True) @@ -172,7 +169,6 @@ def test_yolov5_1280x1280(test_kind, test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tm_cpu_fallback = True model = download_model(torch.hub.load, "ultralytics/yolov5", "yolov5s", pretrained=True) diff --git a/pybuda/test/tvm/cnn/tensorflow/tests_A/test_convnext.py b/pybuda/test/tvm/cnn/tensorflow/tests_A/test_convnext.py index 574b05fce..d0e3cc513 100644 --- a/pybuda/test/tvm/cnn/tensorflow/tests_A/test_convnext.py +++ b/pybuda/test/tvm/cnn/tensorflow/tests_A/test_convnext.py @@ -31,7 +31,6 @@ def test_tvm_convnext(test_kind, test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True # tenstorrent/pybuda#842 compiler_cfg.compile_depth = ( CompileDepth.BUDA_GRAPH_PRE_PLACER diff --git 
a/pybuda/test/tvm/cnn/tensorflow/tests_B/test_alexnet.py b/pybuda/test/tvm/cnn/tensorflow/tests_B/test_alexnet.py index 266cf5f83..7e6532458 100644 --- a/pybuda/test/tvm/cnn/tensorflow/tests_B/test_alexnet.py +++ b/pybuda/test/tvm/cnn/tensorflow/tests_B/test_alexnet.py @@ -110,7 +110,6 @@ def call(self, x): return x compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" framework_model = AlexNet() diff --git a/pybuda/test/tvm/cnn/tensorflow/tests_B/test_efficientnet.py b/pybuda/test/tvm/cnn/tensorflow/tests_B/test_efficientnet.py index 5a490d590..d66b54f8d 100644 --- a/pybuda/test/tvm/cnn/tensorflow/tests_B/test_efficientnet.py +++ b/pybuda/test/tvm/cnn/tensorflow/tests_B/test_efficientnet.py @@ -80,7 +80,6 @@ def test_efficientnet_layer(test_kind, test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.compile_depth = CompileDepth.PRE_LOWERING_PASS - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" blocks_args = [{ diff --git a/pybuda/test/tvm/cnn/tensorflow/tests_B/test_inception.py b/pybuda/test/tvm/cnn/tensorflow/tests_B/test_inception.py index 4bb6c1b76..f01b2790b 100644 --- a/pybuda/test/tvm/cnn/tensorflow/tests_B/test_inception.py +++ b/pybuda/test/tvm/cnn/tensorflow/tests_B/test_inception.py @@ -20,7 +20,6 @@ def test_inceptionv3_tf(test_kind, test_device): pytest.skip() compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" if test_kind.is_training(): compiler_cfg.compile_depth = CompileDepth.GENERATE_INITIAL_GRAPH diff --git a/pybuda/test/tvm/cnn/tensorflow/tests_B/test_mobilenet.py b/pybuda/test/tvm/cnn/tensorflow/tests_B/test_mobilenet.py index 40cc26b31..750670c12 100644 --- a/pybuda/test/tvm/cnn/tensorflow/tests_B/test_mobilenet.py +++ b/pybuda/test/tvm/cnn/tensorflow/tests_B/test_mobilenet.py @@ -26,7 +26,6 @@ def test_mobilenetv1_tf(test_kind, test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.compile_depth = CompileDepth.PRE_LOWERING_PASS - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" input_shape = (1, 224, 224, 3) @@ -54,7 +53,6 @@ def test_mobilenetv2_tf(test_kind, test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.compile_depth = CompileDepth.PRE_LOWERING_PASS - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" input_shape = (1, 224, 224, 3) @@ -80,7 +78,6 @@ def test_mobilenetv3_tf(test_kind, test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.compile_depth = CompileDepth.PRE_LOWERING_PASS - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_policy = "CNN" input_shape = (1, 224, 224, 3) diff --git a/pybuda/test/tvm/cnn/tflite/test_efficientnet_lite.py b/pybuda/test/tvm/cnn/tflite/test_efficientnet_lite.py index 3fe2f3d6f..968e286b6 100644 --- a/pybuda/test/tvm/cnn/tflite/test_efficientnet_lite.py +++ b/pybuda/test/tvm/cnn/tflite/test_efficientnet_lite.py @@ -25,7 +25,6 @@ def test_efficientnet_lite0(test_device): pytest.skip() compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_constant_prop = True pybuda.config.override_op_size("conv2d_29.dc.sparse_matmul.7.dc.sparse_matmul.1.lc2", (7, 1)) @@ -49,12 +48,11 @@ def test_efficientnet_lite0(test_device): ) def test_efficientnet_lite4(test_device): - if test_device.arch == pybuda.BackendDevice.Wormhole_B0: + if 
test_device.arch == pybuda.BackendDevice.Wormhole_B0 or test_device.arch == pybuda.BackendDevice.Blackhole: pytest.skip() compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_constant_prop = True compiler_cfg.enable_conv_prestride = True compiler_cfg.graph_solver_self_cut_type = "ConsumerOperandDataEdgesFirst" diff --git a/pybuda/test/tvm/cnn/tflite/test_pose_landmark.py b/pybuda/test/tvm/cnn/tflite/test_pose_landmark.py index a9de7c0a5..bea2d8308 100644 --- a/pybuda/test/tvm/cnn/tflite/test_pose_landmark.py +++ b/pybuda/test/tvm/cnn/tflite/test_pose_landmark.py @@ -26,7 +26,6 @@ def test_pose_landmark_lite(test_device): pytest.skip("Resize2d DenseMM const too big") compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_constant_prop = True compiler_cfg.graph_solver_self_cut_type = "ConsumerOperandDataEdgesFirst" @@ -51,7 +50,6 @@ def test_pose_landmark_heavy(test_device): pytest.skip("Resize2d DenseMM const too big") compiler_cfg = _get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_constant_prop = True compiler_cfg.graph_solver_self_cut_type = "ConsumerOperandDataEdgesFirst" diff --git a/pybuda/test/tvm/nightly/get_tensorflow_model_with_activations.py b/pybuda/test/tvm/nightly/get_tensorflow_model_with_activations.py index 5aa9b34bb..952906a1a 100644 --- a/pybuda/test/tvm/nightly/get_tensorflow_model_with_activations.py +++ b/pybuda/test/tvm/nightly/get_tensorflow_model_with_activations.py @@ -171,7 +171,6 @@ def get_gptj_full(training, recompute): compile_cfg = CompilerConfig( enable_training=training, enable_recompute=recompute, - enable_t_streaming=True, ) return model, [ act1, diff --git a/pybuda/test/tvm/nlp/onnx/tests_A/test_roberta.py b/pybuda/test/tvm/nlp/onnx/tests_A/test_roberta.py index 288cce970..fab372aea 100644 --- a/pybuda/test/tvm/nlp/onnx/tests_A/test_roberta.py +++ b/pybuda/test/tvm/nlp/onnx/tests_A/test_roberta.py @@ -3,6 +3,7 @@ # SPDX-License-Identifier: Apache-2.0 import onnx import onnxruntime as ort +from pybuda._C.backend_api import BackendDevice import pytest import torch from pybuda import ( @@ -119,6 +120,9 @@ def test_tvm_roberta(test_kind, test_device): if test_kind == TestKind.TRAINING: pytest.skip() + if test_device.arch == BackendDevice.Blackhole: + pytest.skip("Skip until BudaBackend#2628 is consumed.") + input_shape = (1, 256, 256) roberta_model = RobertaModel.from_pretrained("arampacha/roberta-tiny", torchscript=True) model = roberta_model.encoder diff --git a/pybuda/test/tvm/nlp/onnx/tests_B/test_albert.py b/pybuda/test/tvm/nlp/onnx/tests_B/test_albert.py index afdb73c65..7d97bd64b 100644 --- a/pybuda/test/tvm/nlp/onnx/tests_B/test_albert.py +++ b/pybuda/test/tvm/nlp/onnx/tests_B/test_albert.py @@ -2,7 +2,6 @@ # SPDX-License-Identifier: Apache-2.0 -from distutils.command.config import config import pybuda import pytest diff --git a/pybuda/test/tvm/nlp/onnx/tests_C/test_xglm.py b/pybuda/test/tvm/nlp/onnx/tests_C/test_xglm.py index 2a36c8cb9..f7f1978ac 100644 --- a/pybuda/test/tvm/nlp/onnx/tests_C/test_xglm.py +++ b/pybuda/test/tvm/nlp/onnx/tests_C/test_xglm.py @@ -1,6 +1,7 @@ # SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC # SPDX-License-Identifier: Apache-2.0 +from pybuda._C.backend_api import BackendDevice from pybuda.config import CompileDepth from pybuda.verify.config import 
TestKind import pytest @@ -28,6 +29,9 @@ def test_tvm_xglm(test_kind, test_device): if test_kind == TestKind.TRAINING: pytest.skip() + if test_device.arch == BackendDevice.Blackhole: + pytest.skip("Skip until BudaBackend#2628 is consumed.") + config = XGLMConfig() input_shape = (1, 32, 1024) config.num_layers = 1 diff --git a/pybuda/test/tvm/nlp/pytorch/tests_A/test_albert.py b/pybuda/test/tvm/nlp/pytorch/tests_A/test_albert.py index 410c308ec..c239bd5c4 100644 --- a/pybuda/test/tvm/nlp/pytorch/tests_A/test_albert.py +++ b/pybuda/test/tvm/nlp/pytorch/tests_A/test_albert.py @@ -5,7 +5,6 @@ # Some basic bring-up tests of tracing functionality # import configparser -from distutils.command.config import config import pybuda import pytest diff --git a/pybuda/test/tvm/nlp/pytorch/tests_A/test_detr.py b/pybuda/test/tvm/nlp/pytorch/tests_A/test_detr.py index c9f45957b..10ec33df1 100644 --- a/pybuda/test/tvm/nlp/pytorch/tests_A/test_detr.py +++ b/pybuda/test/tvm/nlp/pytorch/tests_A/test_detr.py @@ -49,7 +49,6 @@ def forward(self, hidden_states): else: compiler_cfg.compile_depth = CompileDepth.FULL compiler_cfg.enable_tvm_constant_prop = True - compiler_cfg.enable_t_streaming = True # Enable T streaming for backbone ResNet compiler_cfg.balancer_policy = "CNN" # Inputs @@ -121,7 +120,6 @@ def forward(self, hidden_states): compiler_cfg.enable_tvm_constant_prop = True # compiler_cfg.cpu_fallback_ops.add("zeros") # verify_cfg.verify_pybuda_codegen_vs_framework = False # PCC is over 0.992 - compiler_cfg.enable_t_streaming = True # Enable T streaming for backbone ResNet # Inputs input_shape = (1, 3, 256, 256) diff --git a/pybuda/test/tvm/nlp/pytorch/tests_A/test_t5_small.py b/pybuda/test/tvm/nlp/pytorch/tests_A/test_t5_small.py index 78b9c2dd1..b98ce44bd 100644 --- a/pybuda/test/tvm/nlp/pytorch/tests_A/test_t5_small.py +++ b/pybuda/test/tvm/nlp/pytorch/tests_A/test_t5_small.py @@ -384,16 +384,13 @@ def test_t5_past_cache(variant, test_device): pytest.skip() import os - os.environ["PYBUDA_DISABLE_STREAM_OUTPUT"] = "1" os.environ["PYBUDA_PAD_OUTPUT_BUFFER"] = "1" os.environ["TT_BACKEND_MULTI_THREADED_PUSH"] = "1" - os.environ["PYBUDA_DISABLE_DYNAMIC_DRAM"] = "1" os.environ["PYBUDA_EXTRA_L1_MARGIN"] = "169536" os.environ["PYBUDA_FORCE_SEQUENTIAL"] = "1" os.environ["PYBUDA_NLP_MANUAL_TARGET"] = "30000" os.environ["TT_BACKEND_DRAM_POLLING_FREQUENCY"] = "64" compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_cpu_fallback = False compiler_cfg.default_df_override = pybuda._C.Float16_b compiler_cfg.default_dram_parameters = False @@ -526,16 +523,13 @@ def test_t5_past_cache_pybuda_pipeline(variant, test_device): pytest.skip() import os - os.environ["PYBUDA_DISABLE_STREAM_OUTPUT"] = "1" os.environ["PYBUDA_PAD_OUTPUT_BUFFER"] = "1" os.environ["TT_BACKEND_MULTI_THREADED_PUSH"] = "1" - os.environ["PYBUDA_DISABLE_DYNAMIC_DRAM"] = "1" os.environ["PYBUDA_EXTRA_L1_MARGIN"] = "169536" os.environ["PYBUDA_FORCE_SEQUENTIAL"] = "1" os.environ["PYBUDA_NLP_MANUAL_TARGET"] = "30000" os.environ["TT_BACKEND_DRAM_POLLING_FREQUENCY"] = "64" compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_cpu_fallback = False compiler_cfg.default_df_override = pybuda._C.Float16_b compiler_cfg.default_dram_parameters = False @@ -712,7 +706,6 @@ def test_t5_past_cache_enc_dec(variant, test_device): import os os.environ["PYBUDA_PAD_OUTPUT_BUFFER"] = "1" os.environ["TT_BACKEND_MULTI_THREADED_PUSH"] = "1" - 
os.environ["PYBUDA_DISABLE_DYNAMIC_DRAM"] = "1" os.environ["PYBUDA_EXTRA_L1_MARGIN"] = "120000" # os.environ["PYBUDA_EXTRA_L1_MARGIN"] = "169536" os.environ["PYBUDA_FORCE_SEQUENTIAL"] = "1" @@ -722,7 +715,6 @@ def test_t5_past_cache_enc_dec(variant, test_device): os.environ["TT_BACKEND_EPOCH_BIN_NUM_SLOTS"] = "64" os.environ["PYBUDA_ROTATE_PAST_CACHE_PARAMS"] = "1" compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_cpu_fallback = False compiler_cfg.default_df_override = pybuda._C.Float16_b compiler_cfg.default_dram_parameters = False @@ -931,7 +923,6 @@ def test_t5_small_tiny_tile(test_device): os.environ["PYBUDA_ENABLE_TINY_TILE"] = "1" # Add PyBUDA configurations compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_cpu_fallback = False compiler_cfg.enable_auto_fusing = False # tenstorrent/pybuda#844 compiler_cfg.amp_level = 1 diff --git a/pybuda/test/tvm/nlp/pytorch/tests_C/test_trocr.py b/pybuda/test/tvm/nlp/pytorch/tests_C/test_trocr.py index f5c98d19a..e39471b54 100644 --- a/pybuda/test/tvm/nlp/pytorch/tests_C/test_trocr.py +++ b/pybuda/test/tvm/nlp/pytorch/tests_C/test_trocr.py @@ -37,7 +37,6 @@ def forward(self, pixel_values): # Compile configuration compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_cpu_fallback = False compiler_cfg.enable_tm_cpu_fallback = False compiler_cfg.balancer_policy = "Ribbon" diff --git a/pybuda/test/tvm/nlp/pytorch/tests_D/test_bart.py b/pybuda/test/tvm/nlp/pytorch/tests_D/test_bart.py index 663d1e387..908aa0f5f 100644 --- a/pybuda/test/tvm/nlp/pytorch/tests_D/test_bart.py +++ b/pybuda/test/tvm/nlp/pytorch/tests_D/test_bart.py @@ -6,7 +6,6 @@ # import os from base64 import encode -from distutils.config import PyPIRCCommand import pytest import torch diff --git a/pybuda/test/tvm/nlp/pytorch/tests_D/test_bloom.py b/pybuda/test/tvm/nlp/pytorch/tests_D/test_bloom.py index 4b898c681..3a4566fb7 100644 --- a/pybuda/test/tvm/nlp/pytorch/tests_D/test_bloom.py +++ b/pybuda/test/tvm/nlp/pytorch/tests_D/test_bloom.py @@ -93,7 +93,7 @@ def test_bloom_hf(test_kind, test_device): # output mismatch pytest.skip() - if test_device.arch == BackendDevice.Wormhole_B0: + if test_device.arch == BackendDevice.Wormhole_B0 or test_device.arch == BackendDevice.Blackhole: pytest.skip() # see tenstorrent/pybuda#969 compiler_cfg = _get_global_compiler_config() diff --git a/pybuda/test/tvm/nlp/pytorch/tests_D/test_gpt2.py b/pybuda/test/tvm/nlp/pytorch/tests_D/test_gpt2.py index 9fcbe7561..b3298fc7f 100644 --- a/pybuda/test/tvm/nlp/pytorch/tests_D/test_gpt2.py +++ b/pybuda/test/tvm/nlp/pytorch/tests_D/test_gpt2.py @@ -382,7 +382,6 @@ def forward(self, input_gen, prefill_output): past_shape = (1, 1, 480, 32) compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.compile_subgraphs = True compiler_cfg.balancer_op_override("matmul_3_output_nop_0", "t_stream_shape", (15,1)) # compiler_cfg.loopback_outputs = {"prefill_output": (0, 1)} @@ -578,7 +577,6 @@ def forward(self, hidden_states): compiler_cfg = _get_global_compiler_config() compiler_cfg.enable_tvm_constant_prop = True - compiler_cfg.enable_t_streaming = True compiler_cfg.tvm_constnat_prop_mask={"attn.c_attn.weight", "attn.c_attn.bias"} diff --git a/pybuda/test/tvm/nlp/pytorch/tests_D/test_gptj.py b/pybuda/test/tvm/nlp/pytorch/tests_D/test_gptj.py index 11ec9ad78..15e0b24f8 100644 --- 
a/pybuda/test/tvm/nlp/pytorch/tests_D/test_gptj.py +++ b/pybuda/test/tvm/nlp/pytorch/tests_D/test_gptj.py @@ -38,10 +38,8 @@ def test_gptj_block(test_kind, test_device): if test_kind.is_training(): compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER - if test_device.arch == BackendDevice.Wormhole_B0: + if test_device.arch == BackendDevice.Wormhole_B0 or test_device.arch == BackendDevice.Blackhole: pytest.skip() # see tenstorrent/pybuda#969 - - compiler_cfg.enable_t_streaming = True #Fusing disabled due to tenstorrent/pybuda#789 if (test_kind == TestKind.INFERENCE): @@ -105,7 +103,6 @@ def test_tvm_rotate_every_two(test_kind, test_device): input_shape = (1, 128, 16, 256) compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True class GPTJRotateEveryTwo(nn.Module): def __init__(self, config): diff --git a/pybuda/test/tvm/nlp/pytorch/tests_E/test_codegen.py b/pybuda/test/tvm/nlp/pytorch/tests_E/test_codegen.py index fbdda71be..d32ec8ed8 100644 --- a/pybuda/test/tvm/nlp/pytorch/tests_E/test_codegen.py +++ b/pybuda/test/tvm/nlp/pytorch/tests_E/test_codegen.py @@ -32,7 +32,6 @@ def test_codegen_single_layer_fallback(test_kind, test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.enable_tvm_constant_prop = True compiler_cfg.retain_tvm_python_files = True - compiler_cfg.enable_t_streaming = True framework_model = download_model(CodeGenForCausalLM.from_pretrained, "Salesforce/codegen-350M-mono", use_cache=False, n_layer=1, return_dict=False) diff --git a/pybuda/test/tvm/nlp/pytorch/tests_E/test_whisper.py b/pybuda/test/tvm/nlp/pytorch/tests_E/test_whisper.py index 5331645a5..c2ce0a268 100644 --- a/pybuda/test/tvm/nlp/pytorch/tests_E/test_whisper.py +++ b/pybuda/test/tvm/nlp/pytorch/tests_E/test_whisper.py @@ -39,7 +39,6 @@ def test_whisper_encoder(test_device, variant): # Configurations compiler_cfg = _get_global_compiler_config() compiler_cfg.amp_level = 1 - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_cpu_fallback = False # Run full model on silicon compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b pcc = 0.93 if test_device.devtype == BackendType.Silicon else 0.99 @@ -105,10 +104,8 @@ def test_whisper_decoder(test_device, variant): # Configurations compiler_cfg = _get_global_compiler_config() compiler_cfg.amp_level = 1 - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_cpu_fallback = False # Run full model on silicon compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b - os.environ["PYBUDA_DISABLE_STREAM_OUTPUT"] = "1" # Disable streaming for LM head to output queue (perf) os.environ["PYBUDA_PAD_OUTPUT_BUFFER"] = "1" class Wrapper(torch.nn.Module): @@ -240,16 +237,13 @@ def forward(self, decoder_input_ids, decoder_attention_mask, encoder_last_hidden def test_whisper_enc_dec(test_device, variant): compiler_cfg = _get_global_compiler_config() compiler_cfg.amp_level = 1 - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tvm_cpu_fallback = False # Run full model on silicon compiler_cfg.input_queues_on_host = True compiler_cfg.compile_subgraphs = True compiler_cfg.enable_link_past_cache_ios = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b os.environ["PYBUDA_FORCE_SEQUENTIAL"] = "1" - os.environ["PYBUDA_DISABLE_STREAM_OUTPUT"] = "1" # Disable streaming for LM head to output queue (perf) os.environ["PYBUDA_PAD_OUTPUT_BUFFER"] = "1" - os.environ["PYBUDA_DISABLE_DYNAMIC_DRAM"] = "1" os.environ["TT_BACKEND_MULTI_THREADED_PUSH"] = "1" 
os.environ["TT_BACKEND_DRAM_POLLING_FREQUENCY"] = "64" os.environ["TT_BACKEND_PROFILER"] = "1" diff --git a/pybuda/test/tvm/nlp/tensorflow/tests_B/test_bart.py b/pybuda/test/tvm/nlp/tensorflow/tests_B/test_bart.py index affcaffce..1b7ac4262 100644 --- a/pybuda/test/tvm/nlp/tensorflow/tests_B/test_bart.py +++ b/pybuda/test/tvm/nlp/tensorflow/tests_B/test_bart.py @@ -5,7 +5,6 @@ # Some basic bring-up tests of tracing functionality # from base64 import encode -from distutils.config import PyPIRCCommand import pytest import tensorflow as tf diff --git a/pybuda/test/tvm/nlp/tensorflow/tests_B/test_gpt2.py b/pybuda/test/tvm/nlp/tensorflow/tests_B/test_gpt2.py index d478fa267..11fc4c87a 100644 --- a/pybuda/test/tvm/nlp/tensorflow/tests_B/test_gpt2.py +++ b/pybuda/test/tvm/nlp/tensorflow/tests_B/test_gpt2.py @@ -65,7 +65,6 @@ def call(self, hidden_states): ) def test_tvm_gpt2_fallback(test_kind, test_device): - _get_global_compiler_config().enable_t_streaming = True if test_kind.is_training(): pytest.skip() #TODO: Fix tvm .14 regressions: tenstorrent/pybuda#2099 diff --git a/pybuda/test/tvm/nlp/tensorflow/tests_B/test_gptj_tf.py b/pybuda/test/tvm/nlp/tensorflow/tests_B/test_gptj_tf.py index edab3d119..680f89dff 100644 --- a/pybuda/test/tvm/nlp/tensorflow/tests_B/test_gptj_tf.py +++ b/pybuda/test/tvm/nlp/tensorflow/tests_B/test_gptj_tf.py @@ -100,7 +100,6 @@ def test_gptj_block(test_kind, test_device): pytest.skip() compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True config = GPTJConfig(n_layer=1) # for faster loading config.rotary_dim = 64 @@ -130,8 +129,7 @@ def test_gptj_fallback(test_kind, test_device): pytest.skip() # see tenstorrent/pybuda#969 compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_tvm_constant_prop = True - compiler_cfg.enable_t_streaming = True + compiler_cfg.enable_tvm_constant_prop = True #Fusing disabled due to tenstorrent/pybuda#789 compiler_cfg.enable_auto_fusing=False diff --git a/pybuda/test/tvm/sanity/tests_A/test_sanity_pytorch.py b/pybuda/test/tvm/sanity/tests_A/test_sanity_pytorch.py index 3537d78aa..b1cae4108 100644 --- a/pybuda/test/tvm/sanity/tests_A/test_sanity_pytorch.py +++ b/pybuda/test/tvm/sanity/tests_A/test_sanity_pytorch.py @@ -488,10 +488,101 @@ def forward(self, a): test_kind=test_kind, ) ) + +input_shapes = [(1, 3, 8, 32, 32), (1, 16, 128, 10, 10)] +scale_factors = [(1, 1, 1), (2, 3, 5), (4, 7, 9), (2, 8, 6)] +upsample_modes = ["nearest"] + + +@pytest.mark.parametrize( + "input_shape", input_shapes, ids=[f"input{str(s)}" for s in input_shapes] +) +@pytest.mark.parametrize( + "scale_factors", scale_factors, ids=[f"sfactor({str(s)})" for s in scale_factors] +) +@pytest.mark.parametrize( + "upsample_mode", upsample_modes, ids=[f"umode({str(u)})" for u in upsample_modes] +) +def test_tvm_upsample3d(test_kind, test_device, input_shape, scale_factors, upsample_mode): + if test_kind.is_training(): + pytest.xfail() # Backward is currently unsupported + + _get_global_compiler_config().compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER + + class Upsample3d(nn.Module): + def __init__(self, scale_factors, upsample_mode): + super().__init__() + self.resize = torch.nn.Upsample( + scale_factor=scale_factors, + mode=upsample_mode, + ) + def forward(self, a): + b = self.resize(a) + + return b + + model = Upsample3d(scale_factors, upsample_mode) + mod = PyTorchModule("Upsample3d", model) + verify_module( + mod, + (input_shape,), + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + 
test_kind=test_kind, + ) + ) + +input_shapes = [(1, 3, 4, 4), (1, 5, 2, 2) ,(1, 7, 2, 3), (1, 4, 5, 5), (1, 5, 6, 7)] +scale_factors = [(6, 12), (5, 10), (7, 12), (8, 25), (10, 24)] +upsample_modes = ["nearest_neighbor", "bilinear"] + +@pytest.mark.parametrize( + "input_shapes", input_shapes, ids=[f"input{str(s)}" for s in input_shapes] +) +@pytest.mark.parametrize( + "scale_factors", scale_factors, ids=[f"sfactor({str(s)})" for s in scale_factors] +) +@pytest.mark.parametrize( + "upsample_mode", upsample_modes, ids=[f"umode({str(u)})" for u in upsample_modes] +) +@pytest.mark.parametrize("align_corners", (True, False), ids=["align", "no_align"]) +def test_tvm_upsample2d_channel_last(test_kind, test_device, input_shapes, scale_factors, upsample_mode, align_corners): + if test_kind.is_training(): + pytest.xfail() # Backward is currently unsupported + _get_global_compiler_config().compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER + + if scale_factors[0] % input_shapes[-3] != 0 or scale_factors[1] % input_shapes[-2] != 0: + pytest.skip() + + if align_corners and upsample_mode != "bilinear": + pytest.skip() + class Upsample2d(PyBudaModule): + def __init__(self, name): + super().__init__(name) + def forward(self, input): + if upsample_mode == "nearest_neighbor": + return pybuda.op.Resize2d("", input, sizes=scale_factors, method=upsample_mode, channel_last=1) + else: + return pybuda.op.Resize2d("", input, sizes=scale_factors, method=upsample_mode, align_corners=align_corners, channel_last=1) + + model = Upsample2d("channel_last") + + verify_module( + model, + (input_shapes,), + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + test_kind=test_kind, + ) + ) + + input_shapes = [(1, 128, 10, 10), (1, 16, 34, 60)] -scale_factors = [2, 3] +scale_factors = [(1, 1), (1, 2), (2, 3), (5, 2), (6, 3), (4, 6), (5,4)] upsample_modes = ["nearest", "bilinear"] @@ -2265,6 +2356,38 @@ def forward(self, x, mask): ), ) +@pytest.mark.parametrize("dim", (0, 1, 2, 3, -1, -2, -3, -4)) +@pytest.mark.parametrize("input_shape", ((1, 4, 4), (1, 3, 7), (1, 7, 4), (1, 4, 7), (1, 8, 7, 9), (1, 8, 7, 9, 5))) +def test_tvm_torch_flip(test_kind, test_device, input_shape, dim): + if dim >= len(input_shape[1:]) or (dim < 0 and abs(dim) > len(input_shape[1:])): + pytest.skip() + # Set PyBuda configurations + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.default_df_override = pybuda.DataFormat.Float16_b + compiler_cfg.compile_depth = CompileDepth.CONSTEVAL_GRAPH + class Flip(torch.nn.Module): + def __init__(self,dim,feature_size): + super().__init__() + self.dim = dim + self.l1 = torch.nn.Linear(feature_size, feature_size) + def forward(self,input): + input = self.l1(input) + input = input[0] + output = torch.flip(input, [self.dim]) + return output + model = Flip(dim=dim,feature_size=input_shape[-1]) + model.eval() + tt_model = pybuda.PyTorchModule("flip_tvm_decompose_adv_index", model) + verify_module( + tt_model, + (input_shape,), + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + test_kind=test_kind, + verify_tvm_compile = True, + ), + ) def test_tvm_adv_indexing_batch1(test_kind, test_device): # reproduce the decomposition of adv_index op at the end of gpt_neo model @@ -2428,7 +2551,75 @@ def forward(self, x): ) ) - +def test_tvm_layernorm_cpu(test_device): + + # Configurations + compiler_cfg = _get_global_compiler_config() + compiler_cfg.cpu_fallback_ops.add("layernorm") + + class Module(torch.nn.Module): + def __init__(self): + 
super().__init__() + + self.l1 = torch.nn.Linear(9, 9) + self.layer_norm = torch.nn.LayerNorm(9) + + def forward(self, x): + x = self.l1(x) + x = self.layer_norm(x) + return x + + framework_module = Module() + framework_module.eval() + pybuda_module = PyTorchModule("pt_layermorm_cpu", framework_module) + + input_shape = (1, 9, 9) + + verify_module( + pybuda_module, + (input_shape,), + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + ), + ) + + +def test_tvm_dropout_cpu(test_device): + + # Configurations + compiler_cfg = _get_global_compiler_config() + compiler_cfg.cpu_fallback_ops.add("nn.dropout") + + class Module(torch.nn.Module): + def __init__(self): + super().__init__() + self.dropout = torch.nn.Dropout() + def forward(self, x): + x = self.dropout(x) + x = torch.add(x, x) + return x + + framework_module = Module() + framework_module.eval() + pybuda_module = PyTorchModule("pt_dropout_cpu", framework_module) + + input_shape = (1, 9, 9) + + verify_module( + pybuda_module, + (input_shape,), + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + ), + ) + + def test_tvm_adv_index_bool_cpu_0(test_kind, test_device): # Only run recompute test in post-commit if test_kind == TestKind.TRAINING: @@ -2571,6 +2762,43 @@ def forward(self, x): ) +@pytest.mark.parametrize("input_shape", ((1, 1, 256, 256), (1, 256, 256))) +def test_tvm_simplifyreshape(test_device, input_shape): + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + + class Model(nn.Module): + def __init__(self, new_shape_1, new_shape_2): + super().__init__() + self.new_shape_1 = new_shape_1 + self.new_shape_2 = new_shape_2 + + def forward(self,input): + input = torch.reshape(input, self.new_shape_1) + input = torch.transpose(input, 1, 3) + input = torch.transpose(input, 2, 3) + input = torch.reshape(input, self.new_shape_2) + return input + + new_shape_1 = (1, 16, 16, 256) + new_shape_2 = input_shape + model = Model(new_shape_1, new_shape_2) + tt_model = PyTorchModule("simplifyreshape", model) + + verify_module( + tt_model, + (input_shape,), + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + verify_tvm_compile=True, + ) + ) + def test_tvm_hslice_a(test_kind, test_device): # Only run recompute test in post-commit if test_kind.is_training(): @@ -3106,12 +3334,11 @@ def test_kernel_fracturing_with_grouped_conv(test_kind, test_device): pytest.skip() import os - if test_device.is_wormhole(): + if test_device.is_wormhole_b0(): os.environ["PYBUDA_EXTRA_L1_MARGIN"] = "60000" compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True compiler_cfg.conv_multi_op_fracture_factor_override["conv2d_1"] = 2 class Module(nn.Module): @@ -3163,7 +3390,6 @@ def test_BN_no_stats(test_kind, test_device): compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.balancer_policy = "CNN" - compiler_cfg.enable_t_streaming = True #Fusing disabled due to tenstorrent/pybuda#789 compiler_cfg.enable_auto_fusing=False class ModelBN(nn.Module): @@ -3215,7 +3441,7 @@ def forward(self, x): ), ) finally: - if test_device.is_wormhole(): + if test_device.is_wormhole_b0(): del 
os.environ["PYBUDA_EXTRA_L1_MARGIN"] @@ -3396,7 +3622,6 @@ def forward(self, act): # General compiler configuration overrides start_compiler_cfg = _get_global_compiler_config() start_compiler_cfg.balancer_policy = "CNN" - start_compiler_cfg.enable_t_streaming = True start_compiler_cfg.amp_level = 1 # Environement variable compiler configuration overrides @@ -3443,7 +3668,6 @@ def forward(self, act): def test_torch_conv3d(test_device): compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True inC, inD, inH, inW = (2, 5, 5, 5) outC, kD, kH, kW = (4, 3, 3, 3) @@ -3477,7 +3701,6 @@ def forward(self, act): def test_torch_maxpool3d(test_device): compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True inC, inD, inH, inW = (3, 8, 8, 8) outC, kD, kH, kW = (3, 3, 3, 3) @@ -3511,8 +3734,6 @@ def forward(self, act): def test_reflection_pad(test_device): compiler_cfg = pybuda.config._get_global_compiler_config() compiler_cfg.balancer_policy = "Ribbon" - compiler_cfg.enable_t_streaming = True - class Module(nn.Module): def __init__(self): diff --git a/pybuda/test/tvm/sanity/tests_A/test_tvm.py b/pybuda/test/tvm/sanity/tests_A/test_tvm.py index e05f389a4..090e215cc 100644 --- a/pybuda/test/tvm/sanity/tests_A/test_tvm.py +++ b/pybuda/test/tvm/sanity/tests_A/test_tvm.py @@ -676,7 +676,6 @@ def forward(self, x): compiler_cfg = _get_global_compiler_config() compiler_cfg.retain_tvm_python_files = True - compiler_cfg.enable_t_streaming = True framework_module = Module() pybuda_module = PyTorchModule("pt_reshape_transpose_into_hslice", framework_module) @@ -738,7 +737,6 @@ def forward(self, x): compiler_cfg = _get_global_compiler_config() compiler_cfg.retain_tvm_python_files = True - compiler_cfg.enable_t_streaming = True framework_module = Module() pybuda_module = PyTorchModule("pt_transpose_reshape_into_hstack", framework_module) @@ -804,7 +802,6 @@ def forward(self, x): compiler_cfg = _get_global_compiler_config() compiler_cfg.retain_tvm_python_files = True - compiler_cfg.enable_t_streaming = True framework_module = Module() pybuda_module = PyTorchModule("pt_reshape_into_vslice", framework_module) @@ -865,7 +862,6 @@ def forward(self, x): compiler_cfg = _get_global_compiler_config() compiler_cfg.retain_tvm_python_files = True - compiler_cfg.enable_t_streaming = True framework_module = Module() pybuda_module = PyTorchModule("pt_reshape_into_vslice", framework_module) @@ -914,7 +910,6 @@ def forward(self, x): return x compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tm_cpu_fallback = True framework_module = Module() @@ -965,7 +960,6 @@ def forward(self, x): compiler_cfg = _get_global_compiler_config() compiler_cfg.retain_tvm_python_files = True - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tm_cpu_fallback = True framework_module = Module() @@ -1019,7 +1013,6 @@ def forward(self, x): compiler_cfg = _get_global_compiler_config() compiler_cfg.retain_tvm_python_files = True - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_tm_cpu_fallback = True framework_module = Module() @@ -1178,7 +1171,6 @@ def forward(self, x): return x compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.retain_tvm_python_files = True framework_model = Model() @@ -1218,7 +1210,6 @@ def forward(self, x): return x compiler_cfg = 
_get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER framework_module = Module() @@ -1571,7 +1562,6 @@ def forward(self, x): return x compiler_cfg = _get_global_compiler_config() - compiler_cfg.enable_t_streaming = True # compiler_cfg.compile_depth = CompileDepth.BUDA_GRAPH_PRE_PLACER framework_model = Model() diff --git a/pybuda/test/tvm/sanity/tests_B/test_pattern_matcher.py b/pybuda/test/tvm/sanity/tests_B/test_pattern_matcher.py index 0984b52b8..0a987176a 100644 --- a/pybuda/test/tvm/sanity/tests_B/test_pattern_matcher.py +++ b/pybuda/test/tvm/sanity/tests_B/test_pattern_matcher.py @@ -128,7 +128,6 @@ def forward(self, act1): def test_swin_roll(): # Set PyBuda configuration parameters compiler_cfg = pybuda.config._get_global_compiler_config() - compiler_cfg.enable_t_streaming = True compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b compiler_cfg.balancer_policy = "Ribbon" @@ -153,3 +152,44 @@ def forward(self,hidden_state): test_kind=TestKind.INFERENCE, ) ) + +@pytest.mark.parametrize("tranpose_dims", ((2, 0), (0, 1), (1, 2), (3, 1), (4, 1),(-1, -6))) +def test_reshape_transpose_reshape_tvm(test_device, tranpose_dims): + + # Set PyBuda configuration parameters + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.default_df_override = pybuda._C.DataFormat.Float16_b + compiler_cfg.compile_depth=CompileDepth.GENERATE_INITIAL_GRAPH + class Model(nn.Module): + def __init__(self, new_shape_1, dim0, dim1, new_shape_2): + super().__init__() + self.new_shape_1 = new_shape_1 + self.dim0 = dim0 + self.dim1 = dim1 + self.new_shape_2 = new_shape_2 + + def forward(self,input): + input = torch.reshape(input, self.new_shape_1) + input = torch.transpose(input, self.dim0, self.dim1) + input = torch.reshape(input, self.new_shape_2) + return input + + new_shape_1 = (1, 4, 1, 1, 4, 9) + dim0, dim1 = tranpose_dims + new_shape_2 = (16, 9) + + input_shape = (1, 16, 9) + model = Model(new_shape_1, dim0, dim1, new_shape_2) + tt_model = PyTorchModule("pt_reshape_transpose_reshape", model) + + verify_module( + tt_model, + (input_shape,), + verify_cfg=VerifyConfig( + arch=test_device.arch, + devtype=test_device.devtype, + devmode=test_device.devmode, + test_kind=TestKind.INFERENCE, + verify_tvm_compile=True, + ) + ) diff --git a/pybuda/test/tvm/stable_diffusion/run_stable_diffusion.py b/pybuda/test/tvm/stable_diffusion/run_stable_diffusion.py index 1d8b3c305..9aa129017 100644 --- a/pybuda/test/tvm/stable_diffusion/run_stable_diffusion.py +++ b/pybuda/test/tvm/stable_diffusion/run_stable_diffusion.py @@ -124,7 +124,6 @@ def initialize_compiler_overrides(): compiler_cfg = _get_global_compiler_config() compiler_cfg.enable_tvm_constant_prop = True - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_enumerate_u_kt = False compiler_cfg.graph_solver_self_cut_type = "FastCut" compiler_cfg.retain_tvm_python_files = True diff --git a/pybuda/test/tvm/stable_diffusion/test_stable_diffusion.py b/pybuda/test/tvm/stable_diffusion/test_stable_diffusion.py index f0706ffb2..cdb7e962a 100644 --- a/pybuda/test/tvm/stable_diffusion/test_stable_diffusion.py +++ b/pybuda/test/tvm/stable_diffusion/test_stable_diffusion.py @@ -55,7 +55,6 @@ def test_unet(test_device): mod = PyTorchModule("sd_unet", UnetWrapper(pipe.unet)) compiler_cfg = _get_global_compiler_config() compiler_cfg.enable_tvm_constant_prop = True - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_enumerate_u_kt = False 
compiler_cfg.graph_solver_self_cut_type = "FastCut" compiler_cfg.retain_tvm_python_files = True @@ -101,7 +100,6 @@ def test_unet_CrossAttention(test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.enable_tvm_constant_prop = True - compiler_cfg.enable_t_streaming = True compiler_cfg.balancer_op_override("softmax_11.dc.subtract.1", "t_stream_shape", (16,1)) compiler_cfg.place_on_new_epoch("softmax_11.dc.subtract.1") compiler_cfg.place_on_new_epoch("matmul_17") @@ -249,8 +247,6 @@ def test_unet_up_block(test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.enable_tvm_constant_prop = True - compiler_cfg.enable_t_streaming = True - compiler_cfg.enable_auto_fusing = False compiler_cfg.retain_tvm_python_files = True compiler_cfg.enable_enumerate_u_kt = False compiler_cfg.balancer_policy = "Ribbon" @@ -353,7 +349,6 @@ def test_unet_down_block(test_device): compiler_cfg = _get_global_compiler_config() compiler_cfg.enable_tvm_constant_prop = True - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_auto_fusing = False compiler_cfg.balancer_policy = "Ribbon" compiler_cfg.graph_solver_self_cut_type = "ConsumerOperandDataEdgesFirst" @@ -465,7 +460,6 @@ def initialize_compiler_overrides(): compiler_cfg = _get_global_compiler_config() compiler_cfg.enable_tvm_constant_prop = True - compiler_cfg.enable_t_streaming = True compiler_cfg.enable_enumerate_u_kt = False compiler_cfg.graph_solver_self_cut_type = "FastCut" compiler_cfg.retain_tvm_python_files = True diff --git a/pybuda/test/versim/test_versim_basic_ops.py b/pybuda/test/versim/test_versim_basic_ops.py new file mode 100644 index 000000000..30ce4d1a2 --- /dev/null +++ b/pybuda/test/versim/test_versim_basic_ops.py @@ -0,0 +1,38 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC + +# SPDX-License-Identifier: Apache-2.0 +# +# Versim-related tests for end-to-end simulation +# +from pybuda import pybuda +from pybuda._C.backend_api import BackendDevice, BackendType +from pybuda.tensor import Tensor +from pybuda.verify.config import TestKind, VerifyConfig +import pytest +import torch +from test.common import run + +def test_versim_simple_add(test_device): + # Run only versim tests + if test_device.devtype != BackendType.Versim: + pytest.skip() + + @run( + verify_cfg=VerifyConfig( + test_kind=TestKind.INFERENCE, + devtype=test_device.devtype, + arch=test_device.arch), + ) + def simple_add(a, b): + c = pybuda.op.Add("add0", a, b) + return c + + compiler_cfg = pybuda.config._get_global_compiler_config() + compiler_cfg.input_queues_on_host = False + compiler_cfg.output_queues_on_host = False + compiler_cfg.balancer_op_override("add0", "grid_shape", (1,1)) + + shape = (1, 3, 128, 128) + a = Tensor.create_from_torch(torch.randn(shape)) + b = Tensor.create_from_torch(torch.randn(shape)) + simple_add(a, b) \ No newline at end of file diff --git a/python_env/module.mk b/python_env/module.mk deleted file mode 100644 index af99023a7..000000000 --- a/python_env/module.mk +++ /dev/null @@ -1,28 +0,0 @@ -# Every variable in subdir must be prefixed with subdir (emulating a namespace) -PYTHON_ENV = $(OUT)/python_env -PYTHON_VERSION ?= python3.8 - -# Each module has a top level target as the entrypoint which must match the subdir name -python_env: $(PYTHON_ENV)/.installed - -.PRECIOUS: $(PYTHON_ENV)/.installed $(PYTHON_ENV)/% -$(PYTHON_ENV)/.installed: python_env/requirements.txt - $(PYTHON_VERSION) -m venv $(PYTHON_ENV) - bash -c "unset LD_PRELOAD; source $(PYTHON_ENV)/bin/activate && pip3 install wheel==0.37.1" - bash -c 
"unset LD_PRELOAD; source $(PYTHON_ENV)/bin/activate && pip3 install -r python_env/requirements.txt -f https://download.pytorch.org/whl/cpu/torch_stable.html" - touch $@ - -# Reference for adding PyTorch Geometrics library support. -# Pausing for now as support for Graph NNs is postponed untill Conv based -# NNs have better support. -# -# Note: Think about how installation and build for PyTorch Geometrics (and -# related dependencies) can be optimized for better performance as it now -# almost doubles the PyBuda build time. -# -# Code reference for section above: -# python_env/requirements_ext.txt -# bash -c "source $(PYTHON_ENV)/bin/activate && pip3 install -r python_env/requirements_ext.txt -f https://download.pytorch.org/whl/cpu/torch_stable.html" - -# If you depend on anything (headers, libs, etc) in the python env, build env first -$(PYTHON_ENV)/%: $(PYTHON_ENV)/.installed ; diff --git a/run_tests.py b/run_tests.py deleted file mode 100644 index a43a1f6b3..000000000 --- a/run_tests.py +++ /dev/null @@ -1,301 +0,0 @@ -# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC - -# SPDX-License-Identifier: Apache-2.0 -# Convenience script to run tests, reset on hangs and produce summary. - -import os -import glob -import socket -import random -import string -import argparse -import subprocess -from datetime import date - -import pybuda -from pybuda import ( - BackendType, - BackendDevice, -) - -# Warm reset command for WH B0 -tt_root_smi_path = "/home/software/syseng" -reset_command_gs = tt_root_smi_path + "/gs/tt-smi -tr all" -reset_command_wh_b0 = tt_root_smi_path + "/wh/tt-smi -lr all wait -er" - -# High priority directory -high_prio_dir = "pybuda/test/model_demos/high_prio/" - -# Override desired testlist (if not defined, it'll be automatically generated to run full test suite based on high_prio_dir) -testlist = [ - -] - -# Test variants to ignore -testlist_to_ignore = [ - -] - -# Globals -hostname = socket.gethostname().replace("-", "_") - -def set_env_vars_to_match_ci(device_type): - # General - pytest_addopts = "-svv --durations=0" - pytest_addopts += " --silicon-only" if device_type == BackendType.Silicon else " --no-silicon" - os.environ["PYTEST_ADDOPTS"] = pytest_addopts - - # PyBuda - os.environ["PYBUDA_FORCE_EMULATE_HARVESTED"] = "1" - os.environ["PYBUDA_VERIFY_POST_AUTOGRAD_PASSES"] = "1" - os.environ["PYBUDA_VERIFY_POST_PLACER"] = "1" - os.environ["PYBUDA_VERIFY_NET2PIPE"] = "3" - - -def get_git_hash(): - try: - git_hash = ( - subprocess.check_output(["git", "rev-parse", "--short", "HEAD"], stderr=subprocess.STDOUT) - .decode("utf-8") - .strip() - ) - if git_hash.isalnum(): - return git_hash - else: - return None - except: - return None - - -def generate_test_list(): - global high_prio_dir - - # pytest pybuda/test/model_demos/high_prio --setup-plan - res = subprocess.check_output(["pytest", high_prio_dir, "--setup-plan"], stderr=subprocess.STDOUT).decode("utf-8") - - test_list = [] - test_count = 0 - - lines = res.split("\n") - for line in lines: - if "warnings summary" in line or "slowest durations" in line: - break - - if line and line.startswith(" " + high_prio_dir) and "::" in line: - line = line.strip() - line = line.split(" (fixtures used:")[0] if " (fixtures used:" in line else line - - test_list.append(line) - test_count += 1 - - return test_list - - -def print_test_start_info(test_name, current_i, test_count): - if current_i == 0: - print("Test progress & run details\n") - print("#" * 32 + "\n") - print(f"Running: {test_name}") - print(f"Current progress 
{current_i}/{test_count}\n") - print("#" * 32 + "\n") - - -def write_test_info_to_log_file(test_log_file, test, test_log_file_path): - test_log_file.write("Test details\n") - test_log_file.write("#" * 32 + "\n\n") - test_log_file.write("Hostname:\n") - test_log_file.write(hostname + "\n\n") - test_log_file.write("Test command:\n") - test_log_file.write("pytest -svv --durations=0 " + test + "\n\n") - test_log_file.write("Log path:\n") - test_log_file.write(test_log_file_path + "\n\n") - test_log_file.write("#" * 32 + "\n\n") - test_log_file.flush() - - -def write_test_out_to_file(test_log_file, res): - """ - Write stdout to log file. - - For some reason, loguru (python) logs are captured as stderr. Therefore - these are not fully in sync when changing between different loggers (e.g - python or C++ one). However, there are more logs then before, so it's - still beneficial to use as is. - """ - if res.stderr is not None: - test_log_file.write(res.stderr.decode("utf-8")) - if res.stdout is not None: - test_log_file.write(res.stdout.decode("utf-8")) - test_log_file.flush() - test_log_file.close() - - -def extract_test_path_info(full_test_path, info_type): - try: - if info_type == "file_name": - res = full_test_path.split("::")[0].split("/")[-1].split(".")[0] - elif info_type == "test_name": - res = full_test_path.split("::")[1].split("[")[0] - elif info_type == "test_variant": - res = full_test_path.split("::")[1].split("[")[1].split("]")[0].replace("-", "_").replace("/", "_").replace(" ", "_").replace(".", "_") - else: - raise ValueError("Invalid info_type") - - return res - except Exception as ex: - print("RunScriptError: ", ex) - return "unknown" - - -def extract_test_details(test_log_dir_path, full_test_path): - test_log_file_name = extract_test_path_info(full_test_path, "file_name") - test_log_test_name = extract_test_path_info(full_test_path, "test_name") - test_log_test_variant = extract_test_path_info(full_test_path, "test_variant") - - test_log_file_path = test_log_dir_path + "/" + test_log_file_name + "_" + test_log_test_name + "_" + test_log_test_variant + ".log" - - return test_log_file_path - - -def collect_error_logs(run_date, commit_sha): - logs_path = "logs" + "/" + run_date + "/" + commit_sha - - # Find all summary files - summary_files = glob.glob(f"{logs_path}/summary*.log") - assert len(summary_files) > 0 - - failed_variants = [] - for summary_file in summary_files: - with open(summary_file, "r") as f: - lines = f.readlines() - for line in lines: - test_variant, results = line.split(": ") - result = results.strip() - if result != "0": - test_log_file_path = extract_test_details("", test_variant) - failed_variants.append(test_log_file_path[1:]) - - for root, dirs, files in os.walk(logs_path): - for file in files: - if file.endswith(".log"): - file_path = os.path.join(root, file) - if file_path.split('/')[-1] in failed_variants: - print(file_path) - with open(file_path, "r") as f: - lines = f.readlines() - count = 0 - for i, line in enumerate(lines[::-1]): - if "ERROR" in line or "error" in line or "Error" in line: - count += 1 - if count > 10: - break - print(line) - print() - -def run_tests(): - global reset_command_wh_b0, reset_command_gs, testlist - - parser = argparse.ArgumentParser(description="Script for manually running all high priority model demos (instead of using CI)") - # General run functionality - parser.add_argument("-r", "--reverse", help="Run reversely sorted test list", action="store_true") - parser.add_argument("-np", "--netlist-prefix", help="Specify short 
commit sha on which this script is run. Has to be 9 char long (e.g. a5d778af5)") - - # Helper functionality (e.g. log collection) - parser.add_argument("-co", "--collect-only", help="Collect error logs based on failed variants", action="store_true") - parser.add_argument("-d", "--date", help="Specify date of run in format dd_mm (e.g. 27_03)") - parser.add_argument("-s", "--sha", help="Specify short commit sha on which this script is run. Has to be 9 char long (e.g. a5d778af5)") - - args = parser.parse_args() - - if args.collect_only: - assert args.date and args.sha, "Date and commit sha has to be specified when collecting logs" - - if len(args.date) != 5 and args.date[2] != "_": - raise ValueError("Date has to be in format dd_mm (e.g. 27_03)") - - if len(args.sha) != 9: - raise ValueError("Commit sha has to be 9 char long") - - collect_error_logs(args.date, args.sha) - - return - - detected_devices = pybuda.detect_available_devices() - device_type = BackendType.Golden if len(detected_devices) == 0 else BackendType.Silicon - if device_type == BackendType.Silicon: - if detected_devices[0] == BackendDevice.Grayskull: - reset_command = reset_command_gs - elif detected_devices[0] == BackendDevice.Wormhole_B0: - reset_command = reset_command_wh_b0 - else: - raise ValueError("Unknown device") - else: - reset_command = "" - - # Sanity reset run if machine is in bad state - if device_type == BackendType.Silicon: - os.system(reset_command) - - # Set needed env vars - set_env_vars_to_match_ci(device_type) - - # Get commit hash and date-time references - commit = get_git_hash() - if commit is None: - commit = "unknown" - run_date = date.today().strftime("%d_%m") - - # Setup result summary file and directory - sum_log_dir_path = "logs" + "/" + run_date + "/" + commit - if not os.path.exists(sum_log_dir_path): - os.makedirs(sum_log_dir_path) - sum_log_file_suffix = ''.join(random.choice(string.ascii_lowercase + string.digits) for _ in range(9)) - sum_log_file_path = sum_log_dir_path + f"/summary_{hostname}_{sum_log_file_suffix}.log" - test_sum_file = open(sum_log_file_path, "w") - - # Generate or fetch test list - if testlist == []: - testlist = generate_test_list() - test_count = len(testlist) - - if args.reverse: - testlist = testlist[::-1] - - # Run each test variant as subprocess - for i, test in enumerate(testlist): - if test in testlist_to_ignore: - print(f"Skipping {test}") - continue - - print_test_start_info(test, i, test_count) - - # Setup log file and directory - test_log_dir_path = sum_log_dir_path + "/" + "/".join(test.split("::")[0].split("/")[:-1]) - if not os.path.exists(test_log_dir_path): - os.makedirs(test_log_dir_path) - - test_log_file_path = extract_test_details(test_log_dir_path, test) - test_log_file = open(test_log_file_path, "w") - - write_test_info_to_log_file(test_log_file, test, test_log_file_path) - - # Override graph name (Netlist name) - os.environ["PYBUDA_GRAPH_NAME_SUFFIX"] = args.netlist_prefix if args.netlist_prefix else "" + test_log_file_path.split('/')[-2] + "_" + test_log_file_path.split('/')[-1].split('.')[0] - - # Run PyTest as subprocess - res = subprocess.run(["pytest", "-svv", "--durations=0", test], capture_output=True) - - write_test_out_to_file(test_log_file, res) - - new_line_char = "\n" - test_sum_file.write(f"{test}: {res.returncode}{new_line_char}") - test_sum_file.flush() - if res.returncode != 0: - if device_type == BackendType.Silicon: - os.system(reset_command) - - test_sum_file.close() - - -if __name__ == "__main__": - run_tests() diff --git 
a/scripts/bisect.sh b/scripts/bisect.sh
new file mode 100644
index 000000000..65f1686c3
--- /dev/null
+++ b/scripts/bisect.sh
@@ -0,0 +1,333 @@
+#!/bin/bash
+
+: << 'COMMENT'
+SAMPLE:
+Script run command : bash ./scripts/bisect.sh
+
+INPUTS:
+Enter Pytest Command:
+pytest --devtype golden pybuda/test/model_demos/high_prio/cnn/pytorch/test_xception.py::test_xception_timm[Golden-xception] --device-config gs_e150
+Enter Passing Commit Id:
+8e576abe7fdc250ba88775322b448fa05acf52d1 #passing commit id
+Enter Failing Commit Id:
+6c2a0f68aab744ce08174f5c59abc946be6b8395 #failing commit id
+Enter Architecture(grayskull/wormhole_b0):
+grayskull
+Device config(e150/e300):
+e150
+Enter Run type(compile/silicon):
+compile
+
+COMMENT
+
+# Enable the required flags based on the architecture and run type
+set_evn_flags() {
+    local arch=$1
+    local runtype=$2
+    local device_config=$3
+    export PYBUDA_VERIFY_POST_AUTOGRAD_PASSES=1
+    export PYBUDA_VERIFY_POST_PLACER=1
+    export PYBUDA_VERIFY_NET2PIPE=3
+    export PYTEST_ADDOPTS=" -svv"
+
+    if [ "$arch" = "wormhole_b0" ] ; then
+        export BACKEND_ARCH_NAME=wormhole_b0
+        export ARCH_NAME=wormhole_b0
+
+        if [ "$device_config" = "no" ] ; then
+            export PYBUDA_FORCE_EMULATE_HARVESTED=1
+        fi
+
+        if [ "$runtype" = "compile" ] ; then
+            export GOLDEN_WORMHOLE_B0=1
+            export PYBUDA_DEVMODE=1
+            export PYBUDA_EMULATE_SILICON_DEVICE=1
+            export PYBUDA_VERIFY_GOLDEN=1
+        else
+            export PYTEST_ADDOPTS=" -svv --silicon-only"
+        fi
+    fi
+
+    if [ "$arch" = "grayskull" ] ; then
+        export BACKEND_ARCH_NAME=grayskull
+        export ARCH_NAME=grayskull
+
+        if [ "$device_config" = "e300" ] ; then
+            export PYBUDA_FORCE_EMULATE_HARVESTED=1
+        fi
+
+        if [ "$runtype" = "compile" ] ; then
+            export PYBUDA_DEVMODE=1
+            export PYBUDA_EMULATE_SILICON_DEVICE=1
+            export PYBUDA_VERIFY_GOLDEN=1
+        else
+            export PYTEST_ADDOPTS=" -svv --silicon-only"
+        fi
+
+    fi
+}
+
+# Get the inputs from the user
+get_inputs() {
+    local pytest_cmd
+    read -p "Enter Pytest Command: " pytest_cmd
+    read -p "Enter Passing Commit Id: " pass_id
+    read -p "Enter Failing Commit Id: " fail_id
+    read -p "Enter Architecture(grayskull/wormhole_b0): " arch
+
+    if [ "$arch" = "wormhole_b0" ] ; then
+        read -p "Is it 1x1 config(yes/no): " device_config
+    else
+        read -p "Device config(e150/e300): " device_config
+    fi
+    read -p "Enter Run type(compile/silicon): " runtype
+
+    echo "$pytest_cmd,$pass_id,$fail_id,$arch,$runtype,$device_config"
+}
+
+# If the given stage failed, print the error and exit
+error_handling() {
+    if [ "$1" -ne 0 ]; then
+        local stage="$2"
+        echo "Error: $stage Command failed"
+        exit 1
+    fi
+}
+
+# Clean all previous caches and build folders, then build for the given architecture
+# Input :
+# $1: Architecture (based on that build will be run)
+env_clean_and_build() {
+    local arch="$1"
+    git submodule update --init --recursive >/dev/null 2>&1
+    git submodule update --init --checkout --depth 1 -f third_party/confidential_customer_models >/dev/null 2>&1
+    echo "Submodules Updated"
+    if [ -d "build" ]; then
+        echo "Build directory exists. Doing a clean up..."
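+        # Remove caches, generated outputs, logs, and stray test artifacts left over from previous runs before `make clean` and a fresh build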
+        rm -rf .pkl_memoize_py3
+        rm -rf .pytest_cache
+        rm -rf device_images/
+        rm -rf .hlkc_cache
+        rm -rf wheel_out/
+        rm -rf wheel_env/
+        rm -rf pybuda.egg-info/
+        rm -rf wheele_env/
+        rm -rf generated_modules
+        rm -rf tt_build
+        rm -rf net2pipe_output
+        rm -rf tensor_binaries
+        rm -rf imagenet_classes*
+        rm -rf core*
+        rm -rf *.log
+        rm -rf *.summary
+        rm -rf *.yaml
+        rm -rf *.png
+        rm -rf *.jpg
+        rm -rf *.pt
+        rm -rf *.xml
+        rm -rf *.json
+        make clean >/dev/null 2>&1
+        error_handling "$?" "Clean"
+        echo "Build and cache are cleaned!"
+    fi
+
+    # Disable this code if you're testing an old regression
+    if [ "$arch" = "wormhole_b0" ] ; then
+        source env_for_wormhole_b0.sh >/dev/null 2>&1
+    else
+        source env_for_silicon.sh >/dev/null 2>&1
+    fi
+    error_handling "$?" "Build"
+
+    # Enable the code below for old regressions
+
+    #export TORCH_VISION_INSTALL=0
+    #if [ "$arch" = "wormhole_b0" ] ; then
+    #    source env_for_wormhole_b0.sh >/dev/null 2>&1
+    #else
+    #    source env_for_silicon.sh >/dev/null 2>&1
+    #fi
+    #error_handling "$?" "Build"
+    #if [ -d "vision" ]; then
+    #    echo "Vision Directory exists. Doing a clean up..."
+    #    rm -rf vision
+    #fi
+    #export TORCH_VISION_INSTALL=1
+    #make torchvision >/dev/null 2>&1
+    #error_handling "$?" "Torchvision"
+
+    echo "Build completed successfully"
+
+}
+
+# Run the pytest command and store all logs at the given log path
+# Input :
+# $1: Pytest Command
+# $2: Log path
+# Output:
+# Last line of the pytest result
+pytest_run() {
+    local cmd="$1"
+    local log_path=$2
+    command="$cmd >$log_path"
+    eval "$command" >/dev/null 2>&1
+    result=$(tail -1 "$log_path")
+    echo "$result"
+}
+
+# Based on the pytest result, mark the current commit good or bad for bisect
+# Input :
+# $1: pytest results
+# $2: Expected string to be replicated
+# Output:
+# Whether the current test case passed or failed
+# First line of the bisect output
+comparison_result() {
+    local pytest_result=$1
+    local expecting_string=$2
+    local replication=1
+    local bis_out
+
+    if [ "$expecting_string" = "NA" ]; then
+        replication=0
+        expecting_string="passed"
+    fi
+
+    if echo "$pytest_result" | grep -q "skipped" ; then
+        echo "============================= Testcase got skipped ============================="
+        exit 1
+    fi
+
+    if echo "$pytest_result" | grep -q "$expecting_string" ; then
+        if [ "$replication" -eq 0 ] ; then
+            echo "============================= Test case got $expecting_string ============================="
+        else
+            echo "============================= $expecting_string case got replicated ============================="
+        fi
+    else
+        if [ "$replication" -eq 0 ] ; then
+            echo "============================= Test case got failed ============================="
+            expecting_string="failed"
+        else
+            echo "============================= Not able to replicate $expecting_string case ============================="
+            exit 1
+        fi
+    fi
+
+    echo "Bisect results"
+    if [ "$expecting_string" = "passed" ] ; then
+        bis_out=$(git bisect good | head -n 1 )
+    else
+        if [ "$expecting_string" = "failed" ] ; then
+            bis_out=$(git bisect bad | head -n 1 )
+        fi
+    fi
+    echo "$bis_out"
+}
+
+# This function calls env_clean_and_build, pytest_run and comparison_result in sequence
+# Input :
+# $1: Expected string to be replicated
+# $2: Architecture
+# $3: Pytest Command
+# $4: Log Path
+# $5: Run count
+# Output:
+# First line of the bisect output
+bisect_run() {
+    replica_string=$1
+    arch=$2
+    pytest_command=$3
+    local Log_path
+    if [ "$replica_string" = "NA" ]; then
+        run_count=$5
+        extension="_$run_count.txt"
+        Log_path="$4/revision$extension"
+    else
extension="_replication.txt" + Log_path="$4/$replica_string$extension" + fi + + env_clean_and_build "$arch" + pytest_result=$(pytest_run "$pytest_command" "$Log_path") + bisect_output=$(comparison_result "$pytest_result" "$replica_string") + echo "$bisect_output" + deactivate +} + +########################### main ################################# + +#INPUTS +# get_inputs function get 6 inputs from user and returns 4 outputs +# Parameters: +# $1: Pytest Command +# $2: Passing Commit Id +# $3: Failing Commit Id +# $4: Architecture +# $5: Device config +# $6: Run type +# Returns: +# pytest_command, pass_id, fail_id, arch, runtype, device_config + +inputs=$(get_inputs) +IFS=',' read -r pytest_command pass_id fail_id arch runtype device_config <<< "$inputs" + +# set_evn_flags function is to set all environmental flags based on the architecture, runtype and device config +# Parameters: +# $1: architecture +# $2: runtype +# $3: device config +set_evn_flags "$arch" "$runtype" "$device_config" +run_count=0 + +#Creating folder for dumping the logs +file_path=$(echo "$pytest_command" | cut -d'.' -f1) +model_name=$(echo "$file_path" | awk -F'/' '{print $NF}') +if ! [ -d "Logs" ]; then + mkdir "Logs" +fi +folder_path="Logs/$model_name" +if ! [ -d "$folder_path" ]; then + mkdir "$folder_path" +else + echo "Log Directory exists. Doing a clean up and creating new one..." + rm -rf "$folder_path" + mkdir "$folder_path" +fi + +#To Avoid clash with previous bisect run we are resetting and starting. +git bisect reset >/dev/null 2>&1 +git bisect start + +# bisect_run function get 3 inputs from user and returns result +# $1: Expected string to be replicated +# $2: Architecture +# $3: Pytest Command +# $4: Folder Path +# $5: Run count +# Returns: +# bisect result + +#Replicating Pipeline last passing commit id in local run +echo -e "\nGoing to replicate pass case in last passing commit id..." +git checkout $pass_id >/dev/null 2>&1 +bisect_run "passed" "$arch" "$pytest_command" "$folder_path" "$run_count" + +#Replicating Pipeline first failing commit id in local run +echo "Going to replicate fail case in first failing commit id..." +git checkout "$fail_id" >/dev/null 2>&1 +bisect_output=$(bisect_run "failed" "$arch" "$pytest_command" "$folder_path" "$run_count") +echo "$bisect_output" +result=$(echo "$bisect_output" | awk '/Bisect results/{p=1; next} p') + +#This loop will be continued untill we are getting first regressed commit id +while ! 
echo "$result" | grep -q "first bad commit"; do + run_count=$((run_count+1)) + bisect_output=$(bisect_run "NA" "$arch" "$pytest_command" "$folder_path" "$run_count") + echo "$bisect_output" + result=$(echo "$bisect_output" | awk '/Bisect results/{p=1; next} p') + sleep 1 +done + +extension="/bisect_log.txt" +git bisect log | tee "$folder_path$extension" \ No newline at end of file diff --git a/scripts/compare_perf.py b/scripts/compare_perf.py new file mode 100644 index 000000000..284e5073f --- /dev/null +++ b/scripts/compare_perf.py @@ -0,0 +1,112 @@ +# SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +# SPDX-License-Identifier: Apache-2.0 + +import sys +import os +import math +import argparse +from elasticsearch import Elasticsearch +import pandas as pd + +# add project root to search path +project_root_path = os.path.join(os.path.dirname(__file__), "..") +sys.path.append(project_root_path) + +from third_party.confidential_keys.elastic_search import ES_ENDPOINT, ES_USERNAME, ES_PASSWORD + + +def get_perf_from_es(es, build_id): + """ Get the perf from elastic search for the specified build id. """ + query = { + "query": { + "term": { + "build_id.keyword": build_id + } + }, + "size": 100 + } + perf_res = es.search(index="pybuda-perf-ci", body=query) + hits = perf_res['hits']['hits'] + return [{ + "build_id": h["_source"]["build_id"], + "model": h["_source"]["args"]['model'], + "config": h["_source"]["args"]['config'], + "dataformat": h["_source"]["args"]['dataformat'], + "arch": h["_source"]["args"]['arch'], + "samples_per_sec": h["_source"]["samples_per_sec"], + "index": "_".join([h["_source"]["args"]['model'], h["_source"]["args"]['config'], h["_source"]["args"]['arch'], h["_source"]["args"]['dataformat']]) + } for h in hits] + + +def compare_perf(build_ids: list): + """ Compare the perf for the specified build ids. """ + es = Elasticsearch([ES_ENDPOINT], http_auth=(ES_USERNAME, ES_PASSWORD)) + print("\nGetting perf from elastic search\n") + + # Load perf data from elastic search + data = [] + for build_id in build_ids: + es_res = get_perf_from_es(es, build_id) + data.extend(es_res) + print(f"Got perf for build id: {build_id} with {len(es_res)} records") + + df = pd.DataFrame.from_records(data) + ids = df['build_id'].unique() + + # pivot table on build_id + pivot_table = df.pivot_table(index=['index', 'arch', 'config', 'dataformat', 'model'], columns='build_id', values='samples_per_sec', aggfunc='max') + df = pd.DataFrame(pivot_table.to_records()) + + # add pct_diff column + df.drop(columns=['index'], inplace=True) + df['pct_diff'] = ((df[ids[1]] - df[ids[0]]) / df[ids[0]]) * 100 + + return df + +def print_diff(df: pd.DataFrame): + """ Print the perf diff to console. 
""" + # format the pct_diff column + def format_value(value): + if not math.isnan(value): + if value > 1: + return f'\x1b[32m{value:.2f}%\x1b[0m' # ANSI code for green color + elif value < -1: + return f'\x1b[31m{value:.2f}%\x1b[0m' # ANSI code for red color + else: + return f'\x1b[37m{value:.2f}%\x1b[0m' # ANSI code for red color + else: + return 'N/A' + + df['pct_diff'] = df['pct_diff'].apply(format_value) + df = df.round(2) + pd.set_option("display.max_rows", None) + print(df) + +def main(): + parser = argparse.ArgumentParser(description='Compare performance for two builds.') + parser.add_argument('build_ids', nargs=2, help='Build IDs to compare') + parser.add_argument('-o', '--output', help='Output file path (CSV format)', default=None) + args = parser.parse_args() + + build_ids = args.build_ids + + # correct the build ids prefix + prefix = "gitlab-pipeline-" + build_ids = [(x if x.startswith(prefix) else prefix + str(x)) for x in build_ids] + + # compare + df = compare_perf(build_ids) + + # save to file + if args.output: + df.to_csv(args.output, index=False) + + print_diff(df) + +if __name__ == "__main__": + main() + + +def test_compare_perf(): + df = compare_perf(["gitlab-pipeline-479274", "gitlab-pipeline-479323"]) + print_diff(df) diff --git a/scripts/gitlab_single_job_history.py b/scripts/gitlab_single_job_history.py new file mode 100644 index 000000000..d260a1e5c --- /dev/null +++ b/scripts/gitlab_single_job_history.py @@ -0,0 +1,135 @@ +import datetime + +import gitlab +import tabulate + +# GitLab configurations +gitlab_url = "https://yyz-gitlab.local.tenstorrent.com/" +private_token = "[YOUR_GITLAB_PRIVATE_TOKEN]" +project_id = "tenstorrent/pybuda" + +# Other configurations +gitlab_datetime_format = "%Y-%m-%dT%H:%M:%S.%f%z" +print_datetime_format = "%d-%m-%Y %H:%M:%S" + +# Filter conditions +schedule_name = "PyBuda Dev Models" # Name of the scheduled pipeline to reference +job_name = "[job name]" # Job name, e.g. 
silicon-nlp-pytorch-xglm-wh-b0-n150 +pipeline_limit_num = 9 # History limit for specified job + + +def filter_pipeline_condition(pipeline): + condition = True + + # Reference only "main" branch + if pipeline.ref.lower() != "main": + condition = False + + # Reference only scheduled pipelines (nightlies, weeklies) + if pipeline.source.lower() != "schedule": + condition = False + + if pipeline.status.lower() == "canceled": + condition = False + + return condition + + +def collect_pipeline_details(pipeline): + details = {} + + details["id"] = pipeline.id + details["status"] = pipeline.status + + started_at = datetime.datetime.strptime(pipeline.started_at, gitlab_datetime_format) if pipeline.started_at else None + details["started_at"] = started_at.strftime(print_datetime_format) if started_at else "N/A" + finished_at = datetime.datetime.strptime(pipeline.finished_at, gitlab_datetime_format) if pipeline.finished_at else None + details["finished_at"] = finished_at.strftime(print_datetime_format) if started_at and finished_at else "N/A" + details["duration"] = pipeline.duration + + details["ref"] = pipeline.ref + details["sha"] = pipeline.sha + details["source"] = pipeline.source + details["web_url"] = pipeline.web_url + + return details + + +def filter_job_condition(job, job_name=None): + condition = True + + # Reference only specific job + if job_name != "" and job.name.lower() != job_name.lower(): + condition = False + + return condition + + +def collect_job_details(job): + details = {} + + details["name"] = job.name + details["ref"] = job.ref + details["stage"] = job.stage + details["status"] = "❌ " + job.status if job.status == "failed" else "✅ " + job.status + details["web_url"] = job.web_url + + started_at = datetime.datetime.strptime(job.started_at, gitlab_datetime_format) if job.started_at else None + details["started_at"] = started_at.strftime(print_datetime_format) if started_at else "N/A" + finished_at = datetime.datetime.strptime(job.finished_at, gitlab_datetime_format) if job.finished_at else None + details["finished_at"] = finished_at.strftime(print_datetime_format) if started_at else "N/A" + details["duration"] = job.duration + + details["short_commit"] = job.commit["short_id"] + + return details + + +def print_job_history_table(table_rows): + job_heading = f"| History for: {job_name} |" + table_headers = ["#", "Pipeline ID", "Job Status", "Job Duration", "Job Started At", "Job Finished At", "Job Short Commit", "Job Web URL"] + + print() + print("-" * len(job_heading)) + print(job_heading) + print(tabulate.tabulate(table_rows, headers=table_headers, tablefmt="grid")) + + +# Function to filter and print scheduled jobs +def list_scheduled_jobs(project_id, job_name=""): + # Create a GitLab client + gl = gitlab.Gitlab(url=gitlab_url, private_token=private_token) + + project = gl.projects.get(project_id) + schedules = project.pipelineschedules.list(all=True) + filtered_schedules = [schedule for schedule in schedules if schedule.description == schedule_name] + assert len(filtered_schedules) == 1, f"Found {len(filtered_schedules)} schedules with name {schedule_name}" + + # Fetch detailed pipeline information + table_rows = [] + pipelines = filtered_schedules[0].pipelines.list(all=True)[::-1] + + table_rows_num = pipeline_limit_num if len(pipelines) > pipeline_limit_num else len(pipelines) + + for i, pipeline in enumerate(pipelines): + if i >= pipeline_limit_num: + break + pipeline = project.pipelines.get(pipeline.id) + + if filter_pipeline_condition(pipeline): + print(f"Processing {i + 
1}/{table_rows_num}") + pipeline_details = collect_pipeline_details(pipeline) + + # List jobs for the scheduled pipeline + jobs = pipeline.jobs.list(all=True) + for job in jobs: + if filter_job_condition(job, job_name): + job_details = collect_job_details(job) + + table_row = [i + 1, pipeline_details["id"], job_details["status"], job_details["duration"], job_details["started_at"], job_details["finished_at"], job_details["short_commit"], job_details["web_url"]] + table_rows.append(table_row) + + print_job_history_table(table_rows) + + +list_scheduled_jobs(project_id, job_name) diff --git a/setup.py b/setup.py index 2d915c348..fcd0a0af0 100644 --- a/setup.py +++ b/setup.py @@ -8,135 +8,11 @@ import platform import subprocess -from distutils.version import LooseVersion +__requires__ = ['pip >= 24.0'] + from setuptools import setup, Extension, find_packages from setuptools.command.build_ext import build_ext -# BudaBackend files to be copied over -bbe_files = { - "lib": { - "path": "build/lib" , - "files": ["libtt.so", "libdevice.so"], - }, - "bin": { - "path": "build/bin" , - "files": ["net2pipe", "pipegen2", "op_model"], - }, - "device_descriptors": { - "path": "device", - "files": [ - "grayskull_120_arch.yaml", - "grayskull_10x12.yaml", - "wormhole_8x10.yaml", - "wormhole_80_arch.yaml", - "wormhole_b0_8x10.yaml", - "wormhole_b0_8x10_harvested.yaml", - "wormhole_80_harvested.yaml", - "wormhole_b0_80_arch.yaml", - "wormhole_b0_80_harvested.yaml", - "wormhole_b0_1x1.yaml", - "grayskull_10x12.yaml", - "wormhole_b0_4x6.yaml", - ] - }, - "params": { - "path": "perf_lib/op_model/params", - "files": "*" - }, - "device_silicon_wormhole_bin": { - "path": "device/bin/silicon/wormhole", - "files": [ - "create-ethernet-map" - ] - }, - "misc": { - "path": "infra", - "files": [ - "common.mk" - ] - }, - "firmware": { - "path": "src/firmware/riscv", - "files": "*" - }, - "firmware_brisc_hex": { - "path": "build/src/firmware/riscv/targets/brisc/out", - "files": [ - "brisc.hex" - ] - }, - "kernels": { - "path": "src/ckernels", # TODO clean up, maybe we don't need *everything* here? 
- "files": "*" - }, - "kernel_gen": { - "path": "build/src/ckernels/gen/out", - "files": "*", - }, - "hlk": { - "path": "hlks", - "files": "*", - }, - "perf_lib": { - "path": "perf_lib", - "files": [ - "scratch_api.h", - "__init__.py", - "data_movement_perf_sweep.py", - "logger_utils.py", - "op_perf_test.py", - "ops.py", - "overlay_decouple.py", - "perf_analysis.py", - "perf_analysis_base.py", - "perf_analyzer_api.py", - "perf_analyzer_summary.py", - "perf_comparison.py", - "perf_graph.py", - "perf_report.py", - "perf_sweep.py", - "perf_test_base.py", - "perf_to_vcd.py", - "postprocess_api.py", - "run_perf_test.py", - "sweep_params.py", - "vcdparse.py", - ] - }, - "overlay": { - "path": "src/overlay", - "files": "*" # TODO, clean-up, don't need everything - }, - "versim_lib": { # TODO, remove - "path": "common_lib", - "files": "*", - }, - "sfpi": { - "path": "third_party/sfpi", - "files": "*" - } -} - -# Only copy eric if we are building Wormhole -if "BACKEND_ARCH_NAME" in os.environ and os.environ["BACKEND_ARCH_NAME"] == "wormhole": - bbe_files["firmware_erisc_hex"] = { - "path": "build/src/firmware/riscv/targets/erisc_app/out", - "files": [ - "erisc_app.hex" - ] - } - -if "BACKEND_ARCH_NAME" in os.environ and os.environ["BACKEND_ARCH_NAME"] == "wormhole_b0": - bbe_files["firmware_erisc_hex"] = { - "path": "build/src/firmware/riscv/targets/erisc_app/out", - "files": [ - "erisc_app.hex", - "erisc_app.iram.hex", - "erisc_app.l1.hex", - "split_iram_l1" - ] - } - pybuda_files = { "test" : { @@ -181,7 +57,6 @@ def run(self): src = "build/lib/libpybuda_csrc.so" self.copy_file(src, os.path.join(build_lib, filename)) - self._copy_budabackend(build_lib + "/budabackend") self._copy_pybuda(build_lib) def _copy_pybuda(self, target_path): @@ -197,20 +72,6 @@ def _copy_pybuda(self, target_path): for f in d["files"]: self.copy_file(src_path + "/" + f, path + "/" + f) - def _copy_budabackend(self, target_path): - - src_root = "third_party/budabackend" - - for t, d in bbe_files.items(): - path = target_path + "/" + d["path"] - os.makedirs(path, exist_ok=True) - - src_path = src_root + "/" + d["path"] - if d["files"] == "*": - self.copy_tree(src_path, path) - else: - for f in d["files"]: - self.copy_file(src_path + "/" + f, path + "/" + f) with open("README.md", "r") as f: long_description = f.read() @@ -228,8 +89,6 @@ def _copy_budabackend(self, target_path): # pybuda._C pybuda_c = TTExtension("pybuda._C") -# budabackend -#budabackend = CMakeExtension("budabackend", "pybuda/csrc") ext_modules = [pybuda_c] @@ -238,7 +97,7 @@ def _copy_budabackend(self, target_path): short_hash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD']).decode('ascii').strip() date = subprocess.check_output(['git', 'show', '-s', '--format=%cd', "--date=format:%y%m%d", 'HEAD']).decode('ascii').strip() -arch_codes = {"wormhole": "wh_a0", "wormhole_b0": "wh_b0", "grayskull": "gs"} +arch_codes = {"wormhole_b0": "wh_b0", "grayskull": "gs", "blackhole": "bh"} arch_code = arch_codes[os.environ["BACKEND_ARCH_NAME"]] version = "0.1." + date + "+dev." + arch_code + "." 
+ short_hash @@ -268,6 +127,5 @@ def _copy_budabackend(self, target_path): "Intended Audience :: Developers", "Intended Audience :: Science/Research", "Topic :: Scientific/Engineering :: Artificial Intelligence" - ], - + ] ) diff --git a/silicon_sanity.sh b/silicon_sanity.sh deleted file mode 100644 index 74f30657c..000000000 --- a/silicon_sanity.sh +++ /dev/null @@ -1,6 +0,0 @@ -pytest -v pybuda/test/test_user.py \ - pybuda/test/backend/test_silicon.py \ - pybuda/test/backend/models/test_bert.py::test_ff \ - pybuda/test/backend/models/test_bert.py::test_pt_encoder \ - --silicon-only - diff --git a/third_party/CMakeLists.txt b/third_party/CMakeLists.txt new file mode 100644 index 000000000..e9f50fc22 --- /dev/null +++ b/third_party/CMakeLists.txt @@ -0,0 +1,29 @@ +### Build tt-mlir ### + +set(METAL_LIB_DIR "${CMAKE_SOURCE_DIR}/third_party/tt-mlir/third_party/tt-metal/src/tt-metal-build/lib") + +add_custom_target(build_tt_mlir ALL + COMMAND ${CMAKE_COMMAND} -E env + BUILD_TYPE=${CMAKE_BUILD_TYPE} # Export build type to use + CXX_COMPILER=${CMAKE_CXX_COMPILER} # Export CXX compiler to use + C_COMPILER=${CMAKE_C_COMPILER} # Export C compiler to use + TTMLIR_TOOLCHAIN_DIR=${TTMLIR_TOOLCHAIN_DIR} # Export TTMLIR_TOOLCHAIN_DIR to use + TTMLIR_VENV_DIR=${TTMLIR_VENV_DIR} # Export TTMLIR_VENV_DIR to use + TTMLIR_ENABLE_RUNTIME=ON # Always build runtime + bash ${CMAKE_CURRENT_SOURCE_DIR}/build_mlir.sh + WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/tt-mlir + BYPRODUCTS ${METAL_LIB_DIR}/_ttnn.so # Workaround how Ninja handles dependencies + USES_TERMINAL +) + +### end build tt-mlir ### + +### Build TVM ### + +add_custom_target(build_tvm ALL + COMMAND bash -c ${CMAKE_CURRENT_SOURCE_DIR}/tvm/install.sh + COMMENT "Installing TVM" + USES_TERMINAL +) + +### end build TVM ### diff --git a/third_party/buda-model-demos b/third_party/buda-model-demos new file mode 160000 index 000000000..2de3d7607 --- /dev/null +++ b/third_party/buda-model-demos @@ -0,0 +1 @@ +Subproject commit 2de3d7607fc973446bef0664cb5d32aa8e0198a9 diff --git a/third_party/budabackend b/third_party/budabackend deleted file mode 160000 index 0f1a90832..000000000 --- a/third_party/budabackend +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 0f1a90832f2c3c3e9943044b058f18718a1e273b diff --git a/third_party/build_mlir.sh b/third_party/build_mlir.sh new file mode 100644 index 000000000..8078e095f --- /dev/null +++ b/third_party/build_mlir.sh @@ -0,0 +1,18 @@ +#!/bin/bash +# SPDX-FileCopyrightText: (c) 2024 Tenstorrent AI ULC +# +# SPDX-License-Identifier: Apache-2.0 + +source env/activate + +cmake -B env/build env +cmake --build env/build + +build_type=${BUILD_TYPE:-Release} +c_compiler=${C_COMPILER:-clang} +cxx_compiler=${CXX_COMPILER:-clang++} + +source env/activate +cmake -G Ninja -B build -DCMAKE_BUILD_TYPE=$build_type -DCMAKE_C_COMPILER=$c_compiler -DCMAKE_CXX_COMPILER=$cxx_compiler -DCMAKE_CXX_COMPILER_LAUNCHER=ccache -DTTMLIR_ENABLE_RUNTIME=ON + +cmake --build build diff --git a/third_party/public-tt-buda b/third_party/public-tt-buda deleted file mode 160000 index 949be3875..000000000 --- a/third_party/public-tt-buda +++ /dev/null @@ -1 +0,0 @@ -Subproject commit 949be3875a63a280da03e417c6046fb7fc5c2626 diff --git a/third_party/tt-mlir b/third_party/tt-mlir new file mode 160000 index 000000000..7345e481e --- /dev/null +++ b/third_party/tt-mlir @@ -0,0 +1 @@ +Subproject commit 7345e481e4a0c503e81780c2ee5094242e1bc4fd diff --git a/third_party/tvm b/third_party/tvm index fea9b3799..e2e618b0b 160000 --- a/third_party/tvm +++ b/third_party/tvm
@@ -1 +1 @@ -Subproject commit fea9b37996ee3bf5699dd0b848b9311831750a2c +Subproject commit e2e618b0bd7db089d227aa78d4c9c1c0bba9dcc4 diff --git a/utils/assert.hpp b/utils/assert.hpp index 42d3e5f11..b8a9c91b4 100644 --- a/utils/assert.hpp +++ b/utils/assert.hpp @@ -32,12 +32,11 @@ std::ostream& operator<<(std::ostream& os, tt::OStreamJoin const& join) { } // namespace tt namespace tt::assert { - inline std::string demangle(const char* str) { size_t size = 0; int status = 0; - std::string rt(256, '\0'); - if (1 == sscanf(str, "%*[^(]%*[^_]%255[^)+]", &rt[0])) { + std::string rt(1025, '\0'); + if (1 == sscanf(str, "%*[^(]%*[^_]%1024[^)+]", &rt[0])) { char* v = abi::__cxa_demangle(&rt[0], nullptr, &size, &status); if (v) { std::string result(v); @@ -63,6 +62,7 @@ inline std::vector backtrace(int size, int skip) { std::cout << "backtrace_symbols error." << std::endl; return bt; } + for (size_t i = skip; i < s; ++i) { bt.push_back(demangle(strings[i])); } @@ -117,11 +117,17 @@ void tt_assert( trace_message_ss << "info:" << std::endl; tt_assert_message(trace_message_ss, messages...); } + + if (env_as("TT_ASSERT_ABORT")) + { + // Just abort, the signal handler will print the stack trace. + abort(); + } + trace_message_ss << "backtrace:\n"; trace_message_ss << tt::assert::backtrace_to_string(100, 3, " --- "); trace_message_ss << std::flush; - if (env_as("TT_ASSERT_ABORT")) - abort(); + throw std::runtime_error(trace_message_ss.str()); } @@ -135,3 +141,12 @@ void tt_assert( ::tt::assert::tt_assert(__FILE__, __LINE__, "TT_ASSERT", #condition, f, ##__VA_ARGS__) : void() #define TT_THROW(...) \ ::tt::assert::tt_assert(__FILE__, __LINE__, "TT_THROW", "tt::exception", std::string_view{}, ##__VA_ARGS__) + +#ifndef DEBUG +// Do nothing in release mode. +#define TT_DBG_ASSERT(condition, ...) ((void)0) +#else +#define TT_DBG_ASSERT(condition, ...) \ + __builtin_expect(not (condition), 0) ? 
\ + ::tt::assert::tt_assert(__FILE__, __LINE__, "TT_DBG_ASSERT", #condition, std::string_view{}, ##__VA_ARGS__) : void() +#endif diff --git a/utils/logger.hpp b/utils/logger.hpp index aae8ccab7..e3819a345 100644 --- a/utils/logger.hpp +++ b/utils/logger.hpp @@ -64,7 +64,8 @@ constexpr LoggerABI kLoggerABI = LoggerABI::CXX11; X(Profile) \ X(TMFusion) \ X(TTDevice) \ - X(TorchDevice) + X(TorchDevice) \ + X(MLIRGenerator) enum LogType : uint32_t { @@ -145,7 +146,7 @@ class Logger if ((1 << type) & mask) { #if defined(UTILS_LOGGER_PYTHON_OSTREAM_REDIRECT) && (UTILS_LOGGER_PYTHON_OSTREAM_REDIRECT == 1) - pybind11::scoped_ostream_redirect stream(*fd); +// pybind11::scoped_ostream_redirect stream(*fd); #endif std::string timestamp_str = get_current_time(); fmt::terminal_color timestamp_color = fmt::terminal_color::green; diff --git a/utils/signal_handlers.hpp b/utils/signal_handlers.hpp new file mode 100644 index 000000000..5320cd0ca --- /dev/null +++ b/utils/signal_handlers.hpp @@ -0,0 +1,86 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include +#include +#include + +#include "utils/assert.hpp" +#include "runtime/tt_device.hpp" + +inline void pybuda_signal_handler(int sig) +{ + std::string signal_name; + switch (sig) + { + case SIGSEGV: + signal_name = "segmentation fault"; + break; + case SIGILL: + signal_name = "illegal instruction"; + break; + case SIGFPE: + signal_name = "floating point exception"; + break; + case SIGABRT: + signal_name = "abort"; + break; + default: + signal_name = std::to_string(sig); + break; + } + + std::cerr << "pybuda_signal_handler - signal: " << sig << " (" << signal_name << ")" << std::endl; + std::cerr << "stacktrace: " << std::endl; + + std::vector bt = tt::assert::backtrace(100, 0); + const std::string prefix = " --- "; + bool in_python_section = false; + for (const auto& frame : bt) + { + bool python_frame = frame.find("/python") != std::string::npos; + if (!in_python_section && python_frame) + { + // We are entering the python section of the backtrace. + in_python_section = true; + std::cerr << prefix << std::endl; + std::cerr << prefix << "Python frame(s)" << std::endl; + std::cerr << prefix << std::endl; + } + + if (python_frame) + { + // Skip python frames. + continue; + } + + in_python_section = false; + std::cerr << prefix << frame << std::endl; + } + + tt::TTSystem::get_system().close_devices(); + + // Restore the default signal handler and raise the signal again. + // The default signal handler will generate a core dump (if enabled). + std::signal(sig, SIG_DFL); + std::raise(sig); +} + +class SignalHandlers +{ + public: + SignalHandlers() + { + // For SIGSEGV, SIGILL and SIGFPE we register our own signal handlers, + // to print the stacktrace before the program crashes. 
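+ // SIGABRT is handled as well, so that aborts (e.g. tt_assert() with TT_ASSERT_ABORT set, which calls abort() and relies on this handler) also produce a stacktrace.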
+ for (auto sig : {SIGSEGV, SIGILL, SIGFPE, SIGABRT}) + { + if (std::signal(sig, pybuda_signal_handler) == SIG_ERR) + { + std::cerr << "Failed to register signal handler for signal " << sig << std::endl; + } + } + } +}; diff --git a/utils/yaml_utils.hpp b/utils/yaml_utils.hpp new file mode 100644 index 000000000..2e40981db --- /dev/null +++ b/utils/yaml_utils.hpp @@ -0,0 +1,41 @@ +// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC +// +// SPDX-License-Identifier: Apache-2.0 +#pragma once + +#include <array> +#include <string> + +namespace tt +{ + +#define WRITE_YAML_LINE(out_stream, num_indents, content) (out_stream << yaml_indent(num_indents) << content << '\n') + +#define YAML_KV_PAIR(key, value) key << ": " << value + +template <unsigned int MAX_YAML_INDENT> +class YamlIndentLookup +{ + public: + YamlIndentLookup() + { + for (unsigned int i = 0; i < yaml_indents_.size(); ++i) + { + yaml_indents_[i] = std::string(i, ' '); + } + } + + const std::string &get_yaml_indent(const int yaml_indent) const { return yaml_indents_.at(yaml_indent); } + + private: + std::array<std::string, MAX_YAML_INDENT> yaml_indents_; +}; + +static const std::string &yaml_indent(const int num_spaces) +{ + const static YamlIndentLookup<20 /* MAX_YAML_INDENT */> yaml_indent_lookup; + + return yaml_indent_lookup.get_yaml_indent(num_spaces); +} + +} // namespace tt \ No newline at end of file
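For reference, a minimal usage sketch of the utils/yaml_utils.hpp helpers introduced above (not part of the diff; main() and the example keys/values are illustrative assumptions):

#include <iostream>
#include <sstream>
#include "utils/yaml_utils.hpp"  // header added in this diff

using namespace tt;  // yaml_indent(), referenced by WRITE_YAML_LINE, lives in namespace tt

int main()
{
    std::ostringstream out;
    // WRITE_YAML_LINE inserts the requested number of leading spaces (cached by YamlIndentLookup) and appends a newline.
    WRITE_YAML_LINE(out, 0, "device:");
    WRITE_YAML_LINE(out, 2, YAML_KV_PAIR("arch", "grayskull"));   // example key/value only
    WRITE_YAML_LINE(out, 2, YAML_KV_PAIR("chip_count", 1));       // example key/value only
    std::cout << out.str();
    // Prints:
    // device:
    //   arch: grayskull
    //   chip_count: 1
    return 0;
}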