diff --git a/.github/actions/install-deps/action.yml b/.github/actions/install-deps/action.yml
new file mode 100644
index 000000000..1f13767d9
--- /dev/null
+++ b/.github/actions/install-deps/action.yml
@@ -0,0 +1,31 @@
+name: "Install dependencies"
+description: "Installs dependencies on GitHub Actions runners"
+
+inputs:
+ os:
+ description: 'Runner OS'
+ required: true
+
+runs:
+ using: "composite"
+ steps:
+ - name: Verify ubuntu only
+ shell: bash
+ run: |
+ if ! echo ${{ inputs.os }} | grep -q "ubuntu"; then
+ echo "${{ inputs.os }} does not seem to be ubuntu"
+ fi
+ - name: Assert requested os exists in dependencies
+ shell: bash
+ run: |
+ if ! jq -e ".\"${{ inputs.os }}\" != null" $GITHUB_ACTION_PATH/dependencies.json; then
+ echo "${{ inputs.os }} does not exist as a supported os in $GITHUB_ACTION_PATH/dependencies.json"
+ exit 1
+ fi
+ - name: Retrieve and install pkg deps based on OS
+ id: retrieve-pkg-deps
+ shell: bash
+ run: |
+ DEPENDENCIES=$(jq -r --arg os "${{ inputs.os }}" '.[$os] | .[]' $GITHUB_ACTION_PATH/dependencies.json)
+ echo $DEPENDENCIES
+ sudo apt update
+ sudo apt install -y $DEPENDENCIES
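+# Example usage from a workflow (see .github/workflows/build-artifacts.yml in this change):
+#   - uses: ./.github/actions/install-deps
+#     with:
+#       os: ubuntu-22.04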
diff --git a/.github/actions/install-deps/dependencies.json b/.github/actions/install-deps/dependencies.json
new file mode 100644
index 000000000..2faab1678
--- /dev/null
+++ b/.github/actions/install-deps/dependencies.json
@@ -0,0 +1,12 @@
+{
+ "ubuntu-22.04": [
+ "software-properties-common",
+ "build-essential",
+ "python3.10-venv",
+ "libyaml-cpp-dev",
+ "libboost-all-dev",
+ "libsndfile1",
+ "libhwloc-dev",
+ "libzmq3-dev"
+ ]
+}
diff --git a/.github/workflows/build-artifacts.yml b/.github/workflows/build-artifacts.yml
new file mode 100644
index 000000000..5e52e7e8f
--- /dev/null
+++ b/.github/workflows/build-artifacts.yml
@@ -0,0 +1,24 @@
+name: Build artifacts
+
+on:
+ workflow_dispatch:
+ workflow_call:
+
+env:
+ PYTHON_VERSION: "python3.10"
+
+jobs:
+ build-artifacts:
+ strategy:
+ matrix:
+ arch: ["grayskull"]
+ runs-on: ubuntu-22.04
+ steps:
+ - uses: actions/checkout@v4
+ - uses: ./.github/actions/install-deps
+ with:
+ os: ubuntu-22.04
+ - name: Update submodule
+ run: git submodule update --init --recursive
+ - name: Build for ${{ matrix.arch }}
+ run: source env_for_silicon.sh
\ No newline at end of file
diff --git a/.github/workflows/post-commit-workflow.yml b/.github/workflows/post-commit-workflow.yml
new file mode 100644
index 000000000..ceb7d58da
--- /dev/null
+++ b/.github/workflows/post-commit-workflow.yml
@@ -0,0 +1,13 @@
+name: Post commit workflow
+
+on:
+ workflow_dispatch:
+ workflow_call:
+ push:
+ branches:
+ - main
+
+jobs:
+ build-artifacts:
+ uses: ./.github/workflows/build-artifacts.yml
+ secrets: inherit
\ No newline at end of file
diff --git a/.github/workflows/pull-request-workflow.yml b/.github/workflows/pull-request-workflow.yml
new file mode 100644
index 000000000..c5fbe7958
--- /dev/null
+++ b/.github/workflows/pull-request-workflow.yml
@@ -0,0 +1,13 @@
+name: Pull request workflow
+
+on:
+ workflow_dispatch:
+ workflow_call:
+ pull_request:
+ branches:
+ - main
+
+jobs:
+ build-artifacts:
+ uses: ./.github/workflows/build-artifacts.yml
+ secrets: inherit
\ No newline at end of file
diff --git a/.gitignore b/.gitignore
index 690531c0f..4dff4f040 100644
--- a/.gitignore
+++ b/.gitignore
@@ -48,132 +48,8 @@ bp_data
third_party/llvm
device_images
generated_modules
+build_deps/
# ClangD
compile_commands.json
\n\n# Exclude LFS files to keep the public repo small
-third_party/budabackend/common_lib/libboost_filesystem.so.1.65.1
-third_party/budabackend/common_lib/libboost_regex.so.1.65.1
-third_party/budabackend/common_lib/libboost_system.so.1.65.1
-third_party/budabackend/common_lib/libboost_thread.so.1.65.1
-third_party/budabackend/common_lib/libicudata.so.60
-third_party/budabackend/common_lib/libicui18n.so.60
-third_party/budabackend/common_lib/libicuuc.so.60
-third_party/budabackend/common_lib/libsqlite3.so.0
-third_party/budabackend/common_lib/libz.so.1
-third_party/budabackend/dbd/docs-md/images/debuda-buda.png
-third_party/budabackend/dbd/docs-md/images/debuda-debuda-server.png
-third_party/budabackend/dbd/docs-md/images/debuda-export-db.png
-third_party/budabackend/dbd/docs-md/images/debuda-speed-dial.png
-third_party/budabackend/dbd/docs-md/images/debuda-start-and-repl.png
-third_party/budabackend/dbd/docs-md/images/debuda.png
-third_party/budabackend/dbd/docs-md/images/tenstorrent-pdf-titlepage.png
-third_party/budabackend/device/bin/silicon/grayskull/clkctl
-third_party/budabackend/device/bin/silicon/grayskull/init
-third_party/budabackend/device/bin/silicon/grayskull/pm_en.ttx
-third_party/budabackend/device/bin/silicon/grayskull/setup_interface
-third_party/budabackend/device/bin/silicon/grayskull/tt-script
-third_party/budabackend/device/bin/silicon/reset-chip
-third_party/budabackend/device/bin/silicon/tensix-reset
-third_party/budabackend/device/bin/silicon/wormhole/boot
-third_party/budabackend/device/bin/silicon/wormhole/create-ethernet-map
-third_party/budabackend/device/bin/silicon/wormhole/flash-spi
-third_party/budabackend/device/bin/silicon/wormhole/get-wormhole-interfaces
-third_party/budabackend/device/bin/silicon/wormhole/imx8-reset
-third_party/budabackend/device/bin/silicon/wormhole/load-eth-fw
-third_party/budabackend/device/bin/silicon/wormhole/noc-overlay-dump
-third_party/budabackend/device/bin/silicon/wormhole/setup-interface
-third_party/budabackend/device/bin/silicon/wormhole/temp
-third_party/budabackend/device/bin/silicon/wormhole/tt-flash
-third_party/budabackend/device/bin/silicon/wormhole/tt-script
-third_party/budabackend/docs/public/images/1.png
-third_party/budabackend/docs/public/images/10.png
-third_party/budabackend/docs/public/images/perf_ui_candlestick.png
-third_party/budabackend/docs/public/images/11.png
-third_party/budabackend/docs/public/images/perf_ui_wft.png
-third_party/budabackend/docs/public/images/12.png
-third_party/budabackend/docs/public/images/13.png
-third_party/budabackend/docs/public/images/14.png
-third_party/budabackend/docs/public/images/16.png
-third_party/budabackend/docs/public/images/17.png
-third_party/budabackend/docs/public/images/18.png
-third_party/budabackend/docs/public/images/2.png
-third_party/budabackend/docs/public/images/20.png
-third_party/budabackend/docs/public/images/21.png
-third_party/budabackend/docs/public/images/22.png
-third_party/budabackend/docs/public/images/23.png
-third_party/budabackend/docs/public/images/24.png
-third_party/budabackend/docs/public/images/25.png
-third_party/budabackend/docs/public/images/26.png
-third_party/budabackend/docs/public/images/27.png
-third_party/budabackend/docs/public/images/28.png
-third_party/budabackend/docs/public/images/29.png
-third_party/budabackend/docs/public/images/3.png
-third_party/budabackend/docs/public/images/30.png
-third_party/budabackend/docs/public/images/31.png
-third_party/budabackend/docs/public/images/32.png
-third_party/budabackend/docs/public/images/33.png
-third_party/budabackend/docs/public/images/34.png
-third_party/budabackend/docs/public/images/35.png
-third_party/budabackend/docs/public/images/36.png
-third_party/budabackend/docs/public/images/37.png
-third_party/budabackend/docs/public/images/4.png
-third_party/budabackend/docs/public/images/5.png
-third_party/budabackend/docs/public/images/6.png
-third_party/budabackend/docs/public/images/7.png
-third_party/budabackend/docs/public/images/perf_ui_local_select.png
-third_party/budabackend/docs/public/images/8.png
-third_party/budabackend/docs/public/images/9.png
-third_party/budabackend/docs/public/images/Screen_Shot_2023-05-24_at_11.56.12_PM.png
-third_party/budabackend/docs/public/images/bfp-efficiency.png
-third_party/budabackend/docs/public/images/bfp2_blocks.png
-third_party/budabackend/docs/public/images/bfp4_blocks.png
-third_party/budabackend/docs/public/images/bfp8_blocks.png
-third_party/budabackend/docs/public/images/cropped-favicon-32x32.png
-third_party/budabackend/docs/public/images/data-formats.png
-third_party/budabackend/docs/public/images/logo.png
-third_party/budabackend/docs/public/images/perf_ui_device.png
-third_party/budabackend/docs/public/images/perf_ui_diff.png
-third_party/budabackend/docs/public/images/perf_ui_front_page.png
-third_party/budabackend/docs/public/images/perf_ui_host.png
-third_party/budabackend/docs/public/images/perf_ui_inputs.png
-third_party/budabackend/docs/public/images/perf_ui_select_test.png
-third_party/budabackend/docs/public/images/perf_ui_select_workspace.png
-third_party/budabackend/docs/public/images/tt_logo.png
-third_party/budabackend/docs/public/images/tt_logo.svg
-third_party/budabackend/docs/public/images/unpack_math_pack.png
-third_party/budabackend/loader/tests/reference_tensor_conv_bin/expected_tensor_for_conv_bfp8_s2.bin
-third_party/budabackend/loader/tests/reference_tensor_conv_bin/expected_tensor_for_conv_fp16b_s2.bin
-third_party/budabackend/loader/tests/reference_tensor_conv_bin/expected_tensor_for_conv_fp32_s2.bin
-third_party/budabackend/loader/tests/reference_tensor_conv_bin_mt/input_tensor_for_conv_fp16b.0.bin
-third_party/budabackend/loader/tests/reference_tensor_conv_bin_mt/input_tensor_for_conv_fp16b_s1.0.bin
-third_party/budabackend/loader/tests/reference_tensor_conv_bin_mt/input_tensor_for_conv_fp32.0.bin
-third_party/budabackend/perf_lib/graph_tests/grayskull/inference/bert_large_hifi3_fp16b.yaml
-third_party/budabackend/perf_lib/graph_tests/grayskull/inference/bert_large_lofi_bfp8b.yaml
-third_party/budabackend/verif/graph_tests/netlists/pregenerated/group_1.zip
-third_party/budabackend/verif/graph_tests/netlists/pregenerated/group_2.zip
-third_party/budabackend/verif/graph_tests/netlists/pregenerated/group_3.zip
-third_party/budabackend/verif/graph_tests/netlists/pregenerated/group_4.zip
-third_party/budabackend/verif/graph_tests/netlists/pregenerated/group_5.zip
-third_party/budabackend/verif/graph_tests/netlists/pregenerated/group_6.zip
-third_party/budabackend/verif/graph_tests/netlists/pregenerated/group_7.zip
-third_party/budabackend/verif/graph_tests/netlists/t5_large_wormhole_b0.yaml
-third_party/budabackend/verif/graph_tests/netlists/wormhole_b0/netlist_bert_12x_encoder_1x_whb0_base_training.yaml
-third_party/budabackend/verif/graph_tests/netlists/wormhole_b0/netlist_bert_24x_encoder_1x_whb0_large_training.yaml
-third_party/budabackend/verif/multichip_tests/wh_multichip/large_cluster/falcon_60l_8chip.yaml
-third_party/budabackend/verif/multichip_tests/wh_multichip/large_cluster/midbloom_inference_12chip_lab68.yaml
-third_party/budabackend/verif/multichip_tests/wh_multichip/large_cluster/midbloom_inference_32chip_lab78.no_snakeplace.yaml
-third_party/budabackend/verif/multichip_tests/wh_multichip/large_cluster/midbloom_inference_32chip_lab78.yaml
-third_party/budabackend/verif/multichip_tests/wh_multichip/large_cluster/midbloom_inference_4chip_jb11.yaml
-third_party/budabackend/verif/multichip_tests/wh_multichip/large_cluster/midbloom_inference_8chip_jb11.yaml
-third_party/budabackend/verif/multichip_tests/wh_multichip/netlist_bert_concurrent_24x_encoder_2x_wh_large_inference.yaml
-third_party/budabackend/verif/pipegen_tests/netlists/grayskull/nightly/baseline.zip
-third_party/budabackend/verif/pipegen_tests/netlists/grayskull/push/baseline.zip
-third_party/budabackend/verif/pipegen_tests/netlists/wormhole_b0/nightly/baseline.zip
-third_party/budabackend/verif/pipegen_tests/netlists/wormhole_b0/push/baseline.zip
-third_party/budabackend/verif/template_netlist/netlists/multi_tm_tests/weekly/wormhole_b0/test_dram_input_matmul_3tms_and_reblock.zip
-third_party/budabackend/verif/template_netlist/netlists/test_datacopy_matmul_2tms_and_reblock_pregenerated.zip
-third_party/budabackend/verif/template_netlist/netlists/test_datacopy_matmul_3tms_and_reblock_000_force_grayskull.zip
-third_party/budabackend/verif/template_netlist/netlists/test_dram_input_matmul_3tms_and_reblock_grayskull.zip
-third_party/budabackend/verif/tm_tests/directed/bert_large_inference_hifi.yaml
-third_party/budabackend/verif/tm_tests/directed/packer_mcast.tar.gz
diff --git a/.gitlab-ci.perf.yml b/.gitlab-ci.perf.yml
new file mode 100644
index 000000000..d683dc410
--- /dev/null
+++ b/.gitlab-ci.perf.yml
@@ -0,0 +1,44 @@
+include:
+ - .gitlab-ci.wheels.yml
+
+ # PyBuda repo, Grayskull e150
+ - ci/gitlab-test-lists/.gitlab-ci.grayskull_e150_perf_bfp8_b_nightly.yml
+ - ci/gitlab-test-lists/.gitlab-ci.grayskull_e150_perf_fp16_nightly.yml
+ - ci/gitlab-test-lists/.gitlab-ci.grayskull_e150_perf_release_nightly.yml
+
+ # PyBuda repo, Grayskull e75
+ - ci/gitlab-test-lists/.gitlab-ci.grayskull_e75_perf_bfp8_b_nightly.yml
+ - ci/gitlab-test-lists/.gitlab-ci.grayskull_e75_perf_fp16_nightly.yml
+ - ci/gitlab-test-lists/.gitlab-ci.grayskull_e75_perf_release_nightly.yml
+
+ # PyBuda repo, Wormhole B0
+ - ci/gitlab-test-lists/.gitlab-ci.wormhole_b0_silicon_perf_bfp8_b_nightly.yml
+ - ci/gitlab-test-lists/.gitlab-ci.wormhole_b0_silicon_perf_fp16_nightly.yml
+ - ci/gitlab-test-lists/.gitlab-ci.wormhole_b0_silicon_perf_release_nightly.yml
+
+ - ci/gitlab-test-lists/.gitlab-ci.wormhole_b0_silicon_perf_bfp8_b_manual.yml
+ - ci/gitlab-test-lists/.gitlab-ci.wormhole_b0_silicon_perf_fp16_manual.yml
+ - ci/gitlab-test-lists/.gitlab-ci.wormhole_b0_silicon_perf_release_manual.yml
+
+ # Benchmarking repo
+ # Grayskull e75, Grayskull e150, Wormhole B0
+ - ci/gitlab-test-lists/benchmarking/.gitlab-ci.wormhole_b0_silicon_perf_release_public.yml
+ - ci/gitlab-test-lists/benchmarking/.gitlab-ci.grayskull_e75_perf_release_public.yml
+ - ci/gitlab-test-lists/benchmarking/.gitlab-ci.grayskull_e150_perf_release_public.yml
+
+# Disable other jobs from .gitlab-ci.wheels.yml
+pybuda-gs-latest-bbe-wheel:
+ rules:
+ - if: ($CI_PIPELINE_SOURCE == "push" && $CI_COMMIT_MESSAGE !~ /\[no_ci_perf/)
+
+pybuda-wh-b0-latest-bbe-wheel:
+ rules:
+ - if: ($CI_PIPELINE_SOURCE == "push" && $CI_COMMIT_MESSAGE !~ /\[no_ci_perf/)
+
+pybuda-gs-unittests:
+ rules:
+ - if: ($CI_PIPELINE_SOURCE == "push" && $CI_COMMIT_MESSAGE !~ /\[no_ci_perf/)
+
+pybuda-wh-b0-unittests:
+ rules:
+ - if: ($CI_PIPELINE_SOURCE == "push" && $CI_COMMIT_MESSAGE !~ /\[no_ci_perf/)
\ No newline at end of file
diff --git a/.gitmodules b/.gitmodules
index 58e4da86d..8b0954ebb 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -1,12 +1,13 @@
[submodule "third_party/tvm"]
path = third_party/tvm
- url = ../tt-tvm.git
-[submodule "third_party/budabackend"]
- path = third_party/budabackend
- url = ../tt-budabackend.git
+ url = https://github.com/tenstorrent/tt-tvm
[submodule "third_party/pybind11"]
path = third_party/pybind11
- url = https://github.com/pybind/pybind11
-[submodule "third_party/public-tt-buda"]
- path = third_party/public-tt-buda
- url = ../tt-buda-demos.git
+ url = https://github.com/pybind/pybind11.git
+[submodule "third_party/buda-model-demos"]
+ path = third_party/buda-model-demos
+ url = https://github.com/tenstorrent/tt-buda-demos
+
+[submodule "third_party/tt-mlir"]
+ path = third_party/tt-mlir
+ url = git@github.com:tenstorrent/tt-mlir.git
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 000000000..f12eafca7
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,51 @@
+cmake_minimum_required(VERSION 3.20.0)
+project(pybuda LANGUAGES CXX)
+
+find_program(CLANG_17 clang++-17)
+find_program(CLANG clang)
+if(CLANG_17 AND CLANG)
+ message(STATUS "Found Clang-17 here: ${CLANG_17}")
+ set(CMAKE_CXX_COMPILER "${CLANG_17}")
+ set(CMAKE_C_COMPILER "${CLANG}")
+else()
+ message(WARNING "Clang++-17 or clang not found!!!")
+endif()
+
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
+
+list(APPEND CMAKE_MODULE_PATH "${CMAKE_SOURCE_DIR}/cmake")
+include(Utils)
+
+check_required_env_var(PYBUDA_PYTHON_VERSION)
+check_required_env_var(PYBUDA_TOOLCHAIN_DIR)
+check_required_env_var(PYBUDA_VENV_DIR)
+
+set(PYBUDA_PYTHON_VERSION $ENV{PYBUDA_PYTHON_VERSION})
+set(PYBUDA_TOOLCHAIN_DIR $ENV{PYBUDA_TOOLCHAIN_DIR})
+set(PYBUDA_VENV_DIR $ENV{PYBUDA_VENV_DIR})
+
+find_package(Python COMPONENTS REQUIRED Interpreter Development)
+
+set(TTMLIR_TOOLCHAIN_DIR $ENV{TTMLIR_TOOLCHAIN_DIR})
+set(TTMLIR_VENV_DIR $ENV{TTMLIR_VENV_DIR})
+
+if (NOT CMAKE_BUILD_TYPE)
+ set(CMAKE_BUILD_TYPE Release)
+endif()
+
+add_compile_options(-Wall -Wextra -Wpedantic -Werror -Wno-unused-parameter)
+
+set(PYBUDA_CSRC_WARNINGS -Wall -Wextra -Wno-pragmas -Wno-unused-parameter)
+set(CFLAGS_NO_WARN -DFMT_HEADER_ONLY)
+set(PYBUDA_CSRC_CFLAGS ${CFLAGS_NO_WARN} ${PYBUDA_CSRC_WARNINGS} -DUTILS_LOGGER_PYTHON_OSTREAM_REDIRECT=1)
+
+set(CONFIG_LDFLAGS "")
+
+set(STATIC_LIB_FLAGS -fPIC)
+set(SHARED_LIB_FLAGS -fPIC)
+
+add_subdirectory(third_party)
+add_subdirectory(pybuda)
diff --git a/Makefile b/Makefile
index 757d5bd1d..79a5e78be 100644
--- a/Makefile
+++ b/Makefile
@@ -1,6 +1,14 @@
.SUFFIXES:
+OS ?= $(shell uname)
+
+ifeq ($(OS),Linux)
MAKEFLAGS := --jobs=$(shell nproc) --output-sync=target
+else ifeq ($(OS),Darwin)
+MAKEFLAGS := --jobs=$(shell sysctl -n hw.physicalcpu) --output-sync=target
+else
+$(error "Unknown OS: $(OS)")
+endif
# Setup CONFIG, DEVICE_RUNNER, and out/build dirs first
CONFIG ?= assert
@@ -12,9 +20,11 @@ PREFIX ?= $(OUT)
CONFIG_CFLAGS =
CONFIG_LDFLAGS =
+CONFIG_CXXFLAGS =
ifeq ($(CONFIG), release)
CONFIG_CFLAGS += -O3
+CONFIG_CXXFLAGS = -fvisibility-inlines-hidden
else ifeq ($(CONFIG), ci) # significantly smaller artifacts
CONFIG_CFLAGS += -O3 -DDEBUG -Werror
else ifeq ($(CONFIG), assert)
@@ -46,16 +56,21 @@ TESTDIR = $(OUT)/test
DOCSDIR = $(OUT)/docs
SUBMODULESDIR = $(OUT)/submodules
+# Python version
+PYTHON_VERSION ?= python3.8
+PYTHON_INCLUDES = $(shell $(PYTHON_VERSION)-config --includes)
+PYTHON_LDFLAGS = $(shell $(PYTHON_VERSION)-config --ldflags)
+
# Top level flags, compiler, defines etc.
#WARNINGS ?= -Wall -Wextra
WARNINGS ?= -Wdelete-non-virtual-dtor -Wreturn-type -Wswitch -Wuninitialized -Wno-unused-parameter
-CC ?= gcc
-CXX ?= g++
-CFLAGS_NO_WARN ?= -MMD -I. $(CONFIG_CFLAGS) -mavx2 -DBUILD_DIR=\"$(OUT)\" -I$(INCDIR) -DFMT_HEADER_ONLY -Ithird_party/fmt -Ithird_party/pybind11/include
+CC ?= clang
+CXX ?= clang++
+CFLAGS_NO_WARN ?= -MMD -I. $(CONFIG_CFLAGS) -DBUILD_DIR=\"$(OUT)\" -I$(INCDIR) -DFMT_HEADER_ONLY -Ithird_party/fmt -Ithird_party/pybind11/include $(PYTHON_INCLUDES)
CFLAGS ?= $(CFLAGS_NO_WARN) $(WARNINGS)
-CXXFLAGS ?= --std=c++17 -fvisibility-inlines-hidden -maes -mavx
-LDFLAGS ?= $(CONFIG_LDFLAGS) -Wl,-rpath,$(PREFIX)/lib -L$(LIBDIR) -Ldevice/lib
+CXXFLAGS ?= --std=c++17 $(CONFIG_CXXFLAGS)
+LDFLAGS ?= $(CONFIG_LDFLAGS) -Wl,-rpath,$(PREFIX)/lib -L$(LIBDIR) -v
SHARED_LIB_FLAGS = -shared -fPIC
STATIC_LIB_FLAGS = -fPIC
ifeq ($(findstring clang,$(CC)),clang)
@@ -66,11 +81,16 @@ WARNINGS += -Wmaybe-uninitialized
LDFLAGS += -lstdc++
endif
GIT_COMMON_DIR=$(shell git rev-parse --git-common-dir)
-SUBMODULES=$(wildcard $(GIT_COMMON_DIR)/modules/third_party/*)
-SUBMODULES_UPDATED=$(addprefix $(SUBMODULESDIR)/, $(SUBMODULES:$(GIT_COMMON_DIR)/modules/%=%.checkout))
+SUBMODULES=$(shell git submodule status | grep -o "third_party/[^ ]*")
+SUBMODULES_UPDATED=$(addprefix $(SUBMODULESDIR)/, $(SUBMODULES:%=%.checkout))
SKIP_BBE_UPDATE ?= 0
SKIP_SUBMODULE_UPDATE ?= $(SKIP_BBE_UPDATE)
+ifeq ($(EMULATION_DEVICE_EN), 1)
+ TENSIX_EMULATION_ZEBU = $(TENSIX_EMULATION_ROOT)/zebu
+ TENSIX_EMULATION_ZCUI_WORK = $(TENSIX_EMULATION_ROOT)/targets/tensix_2x2_1dram_BH/zcui.work
+endif
+
all: update_submodules build ;
# These must be in dependency order (enforces no circular deps)
@@ -78,34 +98,45 @@ include python_env/module.mk
include pybuda/module.mk
include docs/public/module.mk
-update_submodules: $(SUBMODULES_UPDATED) ;
+update_submodules: $(SUBMODULES_UPDATED) emulation_device_links ;
-$(SUBMODULESDIR)/%.checkout: $(SUBMODULESDIR)/%
+$(SUBMODULESDIR)/%.checkout:
@mkdir -p $(dir $@)
+ifeq ($(SKIP_SUBMODULE_UPDATE), 0)
+ git submodule update --init --recursive $(@:$(SUBMODULESDIR)/%.checkout=%)
+ #git -C $(@:$(SUBMODULESDIR)/%.checkout=%) submodule foreach --recursive git lfs install || true
+ #git -C $(@:$(SUBMODULESDIR)/%.checkout=%) submodule foreach --recursive git lfs pull
+ #git -C $(@:$(SUBMODULESDIR)/%.checkout=%) submodule foreach --recursive git lfs checkout HEAD
+endif
touch $@
-$(SUBMODULESDIR)/%: $(GIT_COMMON_DIR)/modules/%/HEAD
- @mkdir -p $(dir $@)
-ifeq ($(SKIP_SUBMODULE_UPDATE), 0)
- git submodule update --init --recursive $(@:$(SUBMODULESDIR)/%=%)
- git -C $(@:$(SUBMODULESDIR)/%=%) submodule foreach --recursive git lfs install || true
- git -C $(@:$(SUBMODULESDIR)/%=%) submodule foreach --recursive git lfs pull
- git -C $(@:$(SUBMODULESDIR)/%=%) submodule foreach --recursive git lfs checkout HEAD
+emulation_device_links:
+ifeq ($(EMULATION_DEVICE_EN), 1)
+ @echo "Linking and copying emulation device files..."
+ ln -sf $(TENSIX_EMULATION_ZCUI_WORK) $(OUT)/.
+ ln -sf $(TENSIX_EMULATION_ZCUI_WORK) ../
+ cp -f $(TENSIX_EMULATION_ZEBU)/scripts/designFeatures ./
endif
- touch -r $^ $@
- touch $@.checkout
build: pybuda third_party/tvm ;
third_party/tvm: $(SUBMODULESDIR)/third_party/tvm.build ;
-$(SUBMODULESDIR)/third_party/tvm.build: python_env $(SUBMODULESDIR)/third_party/tvm.checkout
- bash -c "source $(PYTHON_ENV)/bin/activate && ./third_party/tvm/install.sh"
+$(SUBMODULESDIR)/third_party/tvm.build: $(PYTHON_ENV) $(SUBMODULESDIR)/third_party/tvm.checkout
+ bash -c "source $(PYTHON_ENV_ROOT)/bin/activate && ./third_party/tvm/install.sh"
touch $@
-clean: third_party/budabackend/clean
+build_tests: pybuda/csrc/graph_lib/tests pybuda/csrc/passes/tests ;
+
+run_tests: build_tests
+ @echo "Running tests..."
+ $(TESTDIR)/pybuda/csrc/graph_lib/tests/graphlib_unit_tests
+ $(TESTDIR)/pybuda/csrc/passes/tests/passes_unit_tests
+
+clean:
rm -rf $(OUT)
rm -rf third_party/tvm/build
+ rm -rf build_deps/
clean_no_python:
find $(OUT)/ -maxdepth 1 -mindepth 1 -type d -not -name 'python_env' -print0 | xargs -0 -I {} rm -Rf {}
@@ -136,7 +167,7 @@ build_tvm: third_party/tvm ;
.PHONY: stubs
stubs:
pip install mypy
- stubgen -m pybuda._C -m pybuda._C.autograd -m pybuda._C.balancer -m pybuda._C.graph -m pybuda._C.backend_api -m pybuda._C.pattern_matcher -m pybuda._C.scheduler -m pybuda._C.torch_device -o pybuda
+ stubgen -m pybuda._C -m pybuda._C.autograd -m pybuda._C.graph -m pybuda._C.torch_device -o pybuda
# Cleaning PyBuda and BBE artifacts
.PHONY: clean_tt
diff --git a/README.debug.md b/README.debug.md
deleted file mode 100644
index 73e040fed..000000000
--- a/README.debug.md
+++ /dev/null
@@ -1,115 +0,0 @@
-
-*PyBuda monitors many environment variables to modify default behavior. These can be used to debug or analyze problems.*
-
-## Overrides
- * PYBUDA\_BUILD\_DIR: Override the build directory for the compiler.
- * LOGURU\_LEVEL: set Python logger level - default is DEBUG, valid values are INFO, DEBUG, TRACE, NONE
- * LOGGER\_LEVEL: set C++ logger level - values are the same as Python logger
- * PYBUDA\_DEVMODE: set to make Golden/Sequential default run mode if one isn't specified explicitly
- * PYBUDA\_PROFILE: enable Python profiler
- * PYBUDA\_ASSERT\_UNSUPPORTED\_HW\_OP: assert if an unsupported op is found
- * PYBUDA\_BALANCER\_PLACER\_DATA: prints balancer/placer visual info, prints chip op group info
- * PYBUDA\_BALANCER\_POLICY\_TYPE: override balancer policy
- * PYBUDA\_SCHEDULER\_POLICY: override scheduler policy
- * PYBUDA\_BALANCER\_ONE\_ROW: limit placement to one row
- * PYBUDA\_ENABLE\_T\_STREAMING: enable t-streaming (i.e. streaming ops with small output buffers)
- * PYBUDA\_ENABLE\_TVM\_CACHE: Cache tvm graphs instead of re-compiling
- * PYBUDA\_FORCE\_FULL\_COMPILE\_DEPTH: Force each test to run to compile depth "FULL"
- * PYBUDA\_RELOAD\_GENERATED\_MODULES: Reload previously generated modules instead of recompiling through tvm.
- * PYBUDA\_SKIP\_L1\_USAGE\_VALIDATION: allows ops to use more L1 than available
- * PYBUDA\_ENFORCE\_SAME\_UBLOCK\_OPERANDS: ???
- * PYBUDA\_VERIFY\_NET2PIPE: verify produced netlist using net2pipe
- * PYBUDA\_CI\_DIR: ???
- * PYTEST\_CURRENT\_TEST: ???
- * PYBUDA\_CI\_CAPTURE\_TENSORS: save tensors used in the test so they can be used in stand-alone back-end tests
- * PYBUDA\_FORCE\_SEQUENTIAL: override test/script to run everything in sequential mode
- * PYBUDA\_TRACE\_SHUTDOWN: show stack trace on shutdown due to error
- * PYBUDA\_OVERRIDE\_NUM\_CHIPS: force the number of chips to use, instead of the auto-detected number
- * PYBUDA\_DISABLE\_DYNAMIC\_DRAM: disable dynamic allocation of e2e queues in inference
- * PYBUDA\_DISABLE\_FORK\_JOIN\_BUF: disable fork-join buffering
- * PYBUDA\_DISABLE\_FORK\_JOIN\_NOPS: don't insert nops if there's not enough buffering. Just add what's available in L1. This should only be used for debug.
- * PYBUDA\_FORK\_JOIN\_DEBUG\_INFO: print debug logs related to fork-join buffering
- * PYBUDA\_FORK\_JOIN\_DEBUG\_FORK\_NAME: filter debug logs (generated by PYBUDA\_FORK\_JOIN\_DEBUG\_INFO) by fork node name
- * PYBUDA\_FORK\_JOIN\_DEBUG\_JOIN\_NAME: filter debug logs (generated by PYBUDA\_FORK\_JOIN\_DEBUG\_INFO) by join node name
- * PYBUDA\_FORK\_JOIN\_SKIP\_EXPANDING\_BUFFERS: don't expand buffers in L1 - this will cause algorithm to add buffering nops/queues any time a fork-join needs to be buffered.
- * PYBUDA\_FORK\_JOIN\_EXPAND\_OUTPUT\_BUFFERS: expand only output buffers (instead of input buffers) for fork-join buffering
- * GOLDEN\_WORMHOLE: run Golden with Wormhole as target device instead of Grayskull (default)
- * SHOW\_ALL\_FAILS: don't assert on the first data mismatch, but show all fails before failing the test
- * PYBUDA\_EXP\_APPROX: run exp in approximate mode
- * PYBUDA\_VERIFY\_RESULTS\_OFF\_BY\_DEFAULT: disable result verification (tensor comparison of processed and golden module done via forward pass)
- * PYBUDA\_ENABLE\_STABLE\_SOFTMAX: enable stable Softmax (disabled by default)
- * EVAL\_DEBUG: prints inputs/outputs during module evaluation
- * TT\_BACKEND\_GOLDEN\_QUANTIZE: ???
- * PYBUDA\_RESET\_DEV\_BEFORE\_TEST: resets device between tests (pytest must be called with --forked in order to work)
- * PYBUDA\_PERF\_SIMULATOR: run performance simulator to estimate performance of the model
- * PYBUDA\_PERF\_SIMULATOR\_LOG: dump log of all events in perf simulator (will slow down the run)
- * PYBUDA\_PERF\_SIMULATOR\_TRACE: create trace file to be loaded into routeagui
- * PYBUDA\_OP\_PERF: dump op\_perf.csv file with op grid choices and estimated cycle counts
- * PYBUDA\_BENCHMARK\_NO\_RESET\_ON\_ERROR: from the comments seems that it doesn't work, should we remove this one?
- * PYBUDA\_SKIP\_BACKEND\_COMPILE: configure backend device to run in DeviceMode.RunOnly, picking up build binaries from previous run
- * PYBUDA\_PLACER\_BWD\_GROUPS: use bwd groups when placing so that fwd and bwd ops are placed together
- * PYBUDA\_TRIPLET\_PLACEMENT: try to place bwd groups in "triplet" placement strategy
- * PYBUDA\_EXP\_APPROX: force exp and exponent in gelu\_derivative to run in approximate mode (i.e. faster, but less accurate)
- * PYBUDA\_AMP\_LEVEL: configure the AMP (Automatic Mixed Precision) optimization level.
- * PYBUDA\_NO\_FUSE\_MATMUL\_BIAS: disable fusing of matmul+add into matmul
- * PYBUDA\_ENABLE\_OUTPUT\_QUEUES\_ON\_HOST: configures whether whether output queues are placed on HOST (default: true)
- * PYBUDA\_FORCE\_VERIFY\_ALL: ensure that verification is run after each compile stage, overrides VerifyCondig.disabled()
- * PYBUDA\_VERIFY\_POST\_AUTOGRAD\_PASSES: verify graph after post autograd passes, unless the verify config is VeifyConfig.disabled()
- * PYBUDA\_VERIFY\_POST\_PLACER: verify graph after post placer pass, unless the verify config is VeifyConfig.disabled()
- * PYBUDA\_GALAXY\_LINEAR\_ROUTE: place graphs sequentially in a snake route around the Galaxy modules
- * PYBUDA\_NEBULA\_GALAXY\_PLACER: only place output nop on mmio chip for untilizing
- * PYBUDA\_ENABLE\_AUTO\_TRANSPOSE: configures whether auto-transpose is enabled while op placement (default: false)
- * PYBUDA\_MINIMIZE\_REMOTE\_DRAM\_QUEUES: configures behaviour for data forking to remote chips - create single e2e queue on producer or e2e queue per consumer chip (default)
- * PYBUDA\_SPARSE\_MM\_ENCODING\_ESTIMATES\_OFF: when on, turns off estimation logic for in0/in2 for sparse mm, but gets slower
- * PYBUDA\_REBLOCK\_INPUT\_ACT: when enabled, we reblock input activations to the smallest grid across all users instead of forcing 1x1. (default: disabled)
- * PYBUDA\_DUMP\_MIXED\_PRECISION: when on, dump json with a per-op info about fidelity, data-formats (default: off). Default directory: reportify dump directory.
- * PYBUDA\_FRACTURIZATION\_DISABLE: disables kernel fracturing for convolutions
- * PYBUDA\_PRESTRIDE\_DISABLE: disables prestriding transform for convs
- * PYBUDA\_LEGALIZER\_DETAILED\_DEBUGGING: when on provides detailed debugging information and statistics about legalizer OpModel selection process including GraphSolver. Works only in DEBUG(default: off).
- * PYBUDA\_LEGALIZER\_DEBUG\_NODE\_NAME: used together with legalizer detailed debugging to narrow down debugging info to single node. Works only in DEBUG(default: off).
- * PYBUDA\_GRAPHSOLVER\_SELF\_CUT\_TYPE: Override for graph_solver_self_cut_type in BalancerConfig. Valid values: None, ConsumerOperandDataEdgesFirst, ProducerUserDataEdgesFirst, FastCut. When switched on(not None) graphsolver will cut edges for which it cannot produce valid paths. (default: None)
- * PYBUDA\_MAX\_GRAPH\_CUT\_RETRY: Override for default_resolve_retry_count_self_cutting in GraphSolver::resolve. This sets the max retry step if GraphSolver self cut is turned on.
- * PYBUDA\_REPLACE\_INF\_IN\_TVM\_PARAMS: Replace -inf and inf values from TVM parameters during PyBuda code generation.
- * PYBUDA\_FORCE\_ALLOW\_FRACTURING: All convs will be candidates for fracturing - this does NOT mean that all convs will fracture (search for is_kernel_fracturing_candidate)
- * PYBUDA\_DISABLE\_FUSE\_TAGS: Specify a list of ops (comma delimited) by original_op_type/op_type that will be exempt from fusion (e.g. PYBUDA\_DISABLE\_FUSE\_TAGS="reciprocal,softmax").
- * PYBUDA\_SINGLE\_OP\_EPOCHS: Place every single op on a new epoch.
- * PYBUDA\_FORK\_JOIN\_BUF\_QUEUES: Turn on adding buffering queues instead of nops in fork joins that need a lot of buffering (have one path much larger than the other).
- * PYBUDA\_RESNET\_BUFF\_QUEUE\_OVERRIDE: Turn off adding buffering queues in graph solver cut. Temporal fix for ResNet perf.
- * PYBUDA\_OVERRIDE\_DEVICE\_YAML: Override the soc device descriptor to compile against different device configurations.
- * PYBUDA\_DISABLE\_INTERACTIVE\_PLACER: Override balancer policy not to use Interactive placer and to fallback to legacy placer instead. (default: 0/False)
- * PYBUDA\_DISABLE\_INTERACTIVE\_FJ\_BUFFERING: Override balancer policy not to use inlined fork-join buffering. (default: 0/False)
- * PYBUDA\_DISABLE\_PADDING\_PASS\: Disable running of padding pass.
- * PYBUDA\_PADDING\_PASS\_ELEMENT\_WISE: In padding pass pad elementwise ops.
- * PYBUDA\_PADDING\_PASS\_MATMUL: In padding pass pad matmul ops.
- * PYBUDA\_PADDING\_PASS\_SPARSE\_MATMUL: In padding pass pad sparse matmul ops. Needs to have matmul ops enabled for padding too in order to enable this.
- * PYBUDA\_PADDING\_PASS\_BUFFER\_QUEUE": Enable padding pass, insert buffer queue
- * PYBUDA\_ENABLE\_STOCHASTIC\_ROUNDING": Enable stochastic rounding for all supported ops.
- * PYBUDA\_PADDING\_PASS\_CONCAT": Enable padding pass, for concatenate operation
- * PYBUDA\_FORCE\_CONV\_MULTI\_OP\_FRACTURE: Forces all convs to be fractured (during decompose pass) according to heuristic defined in `pybuda/pybuda/op/eval/pybuda/convolution.py`.
- * PYBUDA\_COLLECT\_CONSTRAINT\_INFO: Enables constraint info collection on every graphsolver resolve.
- * PYBUDA\_GRAPHSOLVER\_FAST: Enables partial re-resolve on cut and buffer, much faster at cost of not enabling all possible valid OpModels.
- * NUM\_EXEC\_LOOP\_ITERATIONS: For single temporal epoch tests, you can specify a # here that will rerun the epoch the specified # of times. Each rerun is initiated by FW rather than requiring host interaction, to improve performance.
- * PYBUDA\_PADDING\_PASS\_DISABLE\_BUDA\_OP: Disable padding logic that uses buda implementation for pad and unpad.
- * PYBUDA\_ENABLE\_ETH\_SERIALIZATION: Enable the ethernet stream reduction pass, using the ethernet datacopy op to implement the stream reduction
- * PYBUDA\_ENABLE\_ETH\_DATACOPY\_SERIALIZATION: Enable the ethernet stream reduction pass, using the tensix datacopy/nop op to implement the stream reduction. Will only insert datacopy ops if there are free tensix cores
- * PYBUDA\_SUPRESS\_T\_FACTOR\_MM: Enables a condition in calculate_op_model in legalizer that limits the t factor of sparse/dense matmul ops to be less than the flag's value. Valid values: any positive int value (eg. 16)
- * PYBUDA\_AMP\_LIGHT: Enable a "light" version of mixed precision to minimize accuracy impact (default: 0/False; 1: bfp8/hifi2, 2: bfp4/hifi2, 3: bfp4/LoFi)
- * PYBUDA\_GRAPH\_NAME\_SUFFIX: Suffix to add to the graph name (helps to generate unique netlist names)
- * PYBUDA\_DISABLE\_L1\_ACCUMULATE: Flag for disabling and debugging L1 accumaulation feature.
- * PYBUDA\_OVERRIDE\_VETO: Used to Add/Remove/Update general and env var based compiler configurations.
- * PYBUDA\_DISABLE\_REPORTIFY\_DUMP: Disable generating reportify graph.
- * PYBUDA\_DISABLE\_CAP\_SPARSE\_MM\_FIDELITY: Disables an optimization to cap the fidelity phases of sparse matmul to at most HiFi2.
- * PYBUDA\_DISABLE\_EXPLICIT\_DRAM\_IO: Disables the FE from programming netlist attribute `input_dram_io_buf_size_tiles`. Instead the FE will leave this attribute as `0` which implicitly means that the backend will handle the allocation of this buffer.
- * PYBUDA\_CONCAT\_ON\_HOST: Lower concatenate ops on output nodes into runtime transforms so that they're done on host.
- * PYBUDA\_BALANCER\_LEGACY\_CYCLES\_CALC: Use kernel cycles instead of limiter cycles(kernel + mem BW) for estimation.
- * PYBUDA\_OP\_MODEL\_COMPARE\_VERSION: Version of op model comparision function. Can be used to compare effect of different comparison logic on performance.
- * PYBUDA\_RIBBON1\_PREPASS\_ENABLED: Whether to use or not suboptimal opmodel invalidation prepass. Default value is False.
- * PYBUDA\_RIBBON2\_OPTIMIZATION\_ITERATIONS: Number of optimization iterations in Ribbon2 balancing policy. Default value is 0.
- * PYBUDA\_RIBBON2\_DISABLE\_CLEANUP\_BUF\_NOPS: Disable cleanup of unneeded buffering nops in Ribbon2. (default: 0/False)
-
-## Temp overrides
-* PYBUDA\_TEMP\_ENABLE\_NEW\_SPARSE\_ESTIMATES: Apply new formula to estimate the cycle count of sparse matmul ops (currently only support LoFi and HiFi2 fidelities)
-* PYBUDA\_TEMP\_SCALE\_ESTIMATE\_ARGS: Scale counts of non-zero tiles, ublocks and strips to reflect the numbers that would end up on a single core, since BBE estimates always assume grid_size [1,1].
-* PYBUDA\_TEMP\_ELT\_UNARY\_ESTIMATES\_LEGACY: Force legacy path of calculating execution cycles for eltwise unary ops - instead of calling into BBE, use hand-crafted FE-side logic
-* PYBUDA\_TEMP\_ENABLE\_NEW\_FUSED\_ESTIMATES: Apply new formula to estimate the cycle count of fused ops. The formula calls BBE to estimate each subop and sums up the results.
-* PYBUDA\_LEGACY\_KERNEL\_BROADCAST: Use legacy kernel broadcast detection path. Will detect fewer kernel broadcasts, and will oftentimes use more tiles (longer KBs).
\ No newline at end of file
diff --git a/README.md b/README.md
index 6509de5ed..fe1d360e3 100644
--- a/README.md
+++ b/README.md
@@ -1,32 +1,28 @@
-# TT-Buda
-
-## Introduction
-
-The TT-Buda software stack can compile AI/ML models from several different frameworks such as PyTorch and Tensorflow, and execute them in many different ways on Tenstorrent hardware.
-
-**Note on terminology:**
-
-TT-Buda is the official Tenstorrent AI/ML compiler stack and PyBuda is the Python interface for TT-Buda. PyBuda allows users to access and utilize TT-Buda's features directly from Python. This includes directly importing model architectures and weights from PyTorch, TensorFlow, ONNX, and TFLite.
-
-## Model Demos
-
-Model demos are now part of a separate repo:
-
-https://github.com/tenstorrent/tt-buda-demos
-
-## Docs
-
-See: [Docs](https://docs.tenstorrent.com/tenstorrent/v/tt-buda)
-
-## Build
-
-https://docs.tenstorrent.com/tenstorrent/v/tt-buda/installation
-
-## Env setup
-
-Set `LD_LIBRARY_PATH` to the location of `third_party/budabackend/build/lib` - preferrably the absolute path to allow scripts to find them from anywhere.
-
-## Silicon
-
-See README.silicon.md for details on how to run on silicon.
-
+### Building dependencies
+* `cmake`
+* `clang`
+* `Ninja` - `sudo apt-get install ninja-build`
+
+### Building environment
+This is a one-off step. It pulls all dependencies needed for tt-forge.
+
+* `git submodule update --init --recursive -f`
+* `source env/activate`
+* `cmake -B env/build env`
+* `cmake --build env/build`
+
+### Build tt-forge
+* `source env/activate`
+* `cmake -G Ninja -B build .`
+* `cmake --build build`
+
+### Cleanup
+* `rm -rf build` - to clean up tt-forge build artifacts.
+* `./clean_all.sh` - to clean up all build artifacts (tt-forge/tvm/tt-mlir/tt-metal). This will not remove toolchain dependencies.
+
+### Environment variables
+* `TTMLIR_TOOLCHAIN_DIR` - points to the toolchain dir where TTMLIR dependencies will be installed. If not defined, it defaults to /opt/ttmlir-toolchain
+* `TTMLIR_VENV_DIR` - points to the TTMLIR virtual environment directory. If not defined, it defaults to /opt/ttmlir-toolchain/venv
+* `PYBUDA_TOOLCHAIN_DIR` - points to the toolchain dir where PyBuda dependencies will be installed. If not defined, it defaults to /opt/pybuda-toolchain
+* `PYBUDA_VENV_DIR` - points to the tt-forge virtual environment directory. If not defined, it defaults to /opt/pybuda-toolchain/venv
+* `PYBUDA_PYTHON_VERSION` - set to override the Python version. If not defined, it defaults to python3.10 (see the example below)
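+
+For example (a minimal sketch assuming the default locations above), a from-scratch build that overrides the Python version looks like:
+* `export PYBUDA_PYTHON_VERSION=python3.10`
+* `source env/activate`
+* `cmake -G Ninja -B build .`
+* `cmake --build build`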
diff --git a/README.silicon.md b/README.silicon.md
deleted file mode 100644
index f2eb254a1..000000000
--- a/README.silicon.md
+++ /dev/null
@@ -1,25 +0,0 @@
-
-# How to run pybuda on silicon
-
-## Docker
-
-To create a docker, run `bin/run-docker.sh` from `third_party/budabackend`. Then, enter it use `docker exec -u $USER -it special-$USER bash`.
-
-## Env
-
-Grayskull and Wormhole machines require slightly different setups, so there are setup scripts for each. To build everything run the appropriate script:
-
-* `source env_for_silicon.sh` (Grayskull)
-
-or
-
-* `source env_for_wormhole.sh` (Wormhole)
-
-Now, you should be able to run pybuda pytests and python scripts.
-
-## Run
-
-For example, try:
-
-`pytest -svv pybuda/test/backend/test_silicon.py::test_basic_training[Grayskull-acc1-mbc1-microbatch1-s1]`
-
diff --git a/bisect.sh b/bisect.sh
deleted file mode 100755
index e9d72d88a..000000000
--- a/bisect.sh
+++ /dev/null
@@ -1,46 +0,0 @@
-#!/bin/bash
-
-function show_usage(){
- echo "Usage bisect.sh GOOD_REV BAD_REV TEST_COMMAND_0"
- exit 1
-}
-if [ ! $# -eq 3 ]
-then
- show_usage
-fi
-
-git fetch
-
-if ! git cat-file -e $1 2> /dev/null
-then
- echo "$1 not valid git rev"
- show_usage
-fi
-
-if ! git cat-file -e $2 2> /dev/null
-then
- echo "$2 not valid git rev"
- show_usage
-fi
-
-echo "Running auto_bisect with passing revision: $1, failing revision $2"
-echo "Test commands:"
-
-for ((i = 3; i <= $#; i++ )); do
- printf '%s\n' " ${!i}"
-done
-
-read -n1 -s -r -p $'Press c to continue, q to quit\n' key
-
-if [ "$key" = 'q' ]
-then
- echo "Exiting"
- exit 0
-fi
-
-git bisect start
-git bisect bad $2
-git bisect good $1
-git bisect run bash -c ". compile_and_run_test.sh $3"
-git bisect log
-git bisect reset
\ No newline at end of file
diff --git a/clean_all.sh b/clean_all.sh
new file mode 100755
index 000000000..6818a1b3c
--- /dev/null
+++ b/clean_all.sh
@@ -0,0 +1,6 @@
+rm -rf build
+rm -rf env/build
+rm -rf third_party/tt-mlir/build
+rm -rf third_party/tt-mlir/env/build
+rm -rf third_party/tt-mlir/third_party/tt-metal
+rm -rf third_party/tvm/build
diff --git a/cmake/Utils.cmake b/cmake/Utils.cmake
new file mode 100644
index 000000000..da33ba8e7
--- /dev/null
+++ b/cmake/Utils.cmake
@@ -0,0 +1,17 @@
+### Utility functions for pybuda ###
+
+### Check if an environment variable exists ###
+function(check_env_variable_internal VARIABLE_NAME ret)
+ if(NOT DEFINED ENV{${VARIABLE_NAME}})
+ set(${ret} "false" PARENT_SCOPE)
+ endif()
+endfunction()
+
+### Check that a required environment variable exists; abort configuration if it does not ###
+function(check_required_env_var VARIABLE_NAME)
+ set(VARIABLE_EXISTS "true")
+ check_env_variable_internal(${VARIABLE_NAME} VARIABLE_EXISTS)
+ if(NOT ${VARIABLE_EXISTS})
+ message(FATAL_ERROR "${VARIABLE_NAME} does not exist. Did you run source env/activate?")
+ endif()
+endfunction()
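+
+### Example usage (as in this repo's top-level CMakeLists.txt): fail configuration early if the environment is not set up ###
+#   check_required_env_var(PYBUDA_VENV_DIR)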
diff --git a/compile_and_run_test.sh b/compile_and_run_test.sh
deleted file mode 100755
index a28439e9e..000000000
--- a/compile_and_run_test.sh
+++ /dev/null
@@ -1,33 +0,0 @@
-#!/bin/bash
-
-export BACKEND_ARCH_NAME=wormhole_b0
-git reset --hard
-git submodule update --init --recursive
-make pybuda
-if [ ! $? -eq 0 ]
-then
- echo "Make pybuda failed"
- make clean_no_python
- cd third_party/budabackend
- git clean -fxd
- cd ../..
- make pybuda
- if [ ! $? -eq 0 ]
- then
- echo "Clean build failed bad rev 1"
- return 1
- fi
-fi
-source build/python_env/bin/activate
-source third_party/tvm/install.sh
-source third_party/tvm/enable.sh
-echo "Evaluating $*"
-eval "$*"
-if [ ! $? -eq 0 ]
-then
- echo "Test failed"
- exit 1
-fi
-
-echo "All passed"
-exit 0
diff --git a/compile_flags.txt b/compile_flags.txt
index f66eece24..b2630aca4 100644
--- a/compile_flags.txt
+++ b/compile_flags.txt
@@ -13,4 +13,3 @@
-Ithird_party/json
-Ipybuda/csrc
-Ithird_party/fmt
--Ithird_party/budabackend
diff --git a/docs/CI/.gitlab-ci.build-docs.yml b/docs/CI/.gitlab-ci.build-docs.yml
index 63335c462..b92211213 100644
--- a/docs/CI/.gitlab-ci.build-docs.yml
+++ b/docs/CI/.gitlab-ci.build-docs.yml
@@ -5,7 +5,7 @@
- docs/public/md
needs:
- pybuda-wh-b0-wheel
- - seeded-venv
+ - !reference [.common_deps, needs]
tags:
- 8-core
script:
@@ -25,8 +25,10 @@ build-docs-main:
extends: .build-docs
rules:
- if: $CI_COMMIT_TAG
- when: always
+ - if: $CI_COMMIT_REF_NAME == "main"
+ when: manual
- when: never
+ allow_failure: true
script:
- !reference [.build-docs, script]
- cp -r ../docs/public/md/* pybuda
@@ -38,7 +40,6 @@ build-docs-staging:
extends: .build-docs
rules:
- if: $CI_COMMIT_BRANCH == "main"
- when: always
script:
- !reference [.build-docs, script]
- git checkout staging
@@ -46,4 +47,37 @@ build-docs-staging:
- git add pybuda
- git commit -m "update docs from pipeline $CI_PIPELINE_ID"
- git push
-
\ No newline at end of file
+
+# Job for building and pushing markdown to the new docsite repo
+# Note: the repo name will need to be updated when the docsite goes live, as the name will be changed from docs-test
+build-docs-docsite:
+ stage: docs
+ artifacts:
+ paths:
+ - docs/public/md
+ needs:
+ - pybuda-wh-b0-wheel
+ - !reference [.common_deps, needs]
+ tags:
+ - 8-core
+ rules:
+ - if: $CI_COMMIT_BRANCH == "main" && $CI_COMMIT_TAG =~ /^v[0-9]+\.[0-9]+\.[0-9]+$/
+ script:
+ - !reference [.common_prep, script]
+ - pip3 install sphinx
+ - pip3 install sphinx-markdown-builder
+ - sphinx-build -M markdown docs/public docs/public/md
+ - cp -r docs/public/images docs/public/md/markdown
+ - git config --global user.email "tenstorrent-github-bot@tenstorrent.com"
+ - git config --global user.name "tenstorrent-github-bot"
+ - git clone https://tenstorrent-github-bot:${GITHUB_BOT_TOKEN}@github.com/tenstorrent/docs-test.git
+ - cd docs-test
+ - git remote set-url origin https://tenstorrent-github-bot:${GITHUB_BOT_TOKEN}@github.com/tenstorrent/docs-test.git
+ - git checkout main
+ - pip install -r requirements.txt
+ - cp -r ../docs/public/md/* pybuda
+ - python update_tags.py pybuda $$CI_COMMIT_TAG
+ - git add .
+ - git commit -m "update pybuda docs from pipeline $CI_PIPELINE_ID with tag $CI_COMMIT_TAG"
+ - git tag -a $CI_COMMIT_TAG -m "pybuda documentation version $CI_COMMIT_TAG"
+ - git push && git push --tags
diff --git a/docs/public/api.rst b/docs/public/api.rst
index 21de6a121..5336bc92f 100644
--- a/docs/public/api.rst
+++ b/docs/public/api.rst
@@ -21,10 +21,6 @@ Python Runtime API
.. automodule:: pybuda
:members: run_inference, run_training, shutdown, initialize_pipeline, run_forward, run_backward, run_optimizer, get_parameter_checkpoint, get_parameter_gradients, update_device_parameters
-C++ Runtime API
-******************
-
-The BUDA Backend used by Python Runtime can be optionally used stand-alone to run pre-compiled TTI models. The API reference for stand-alone BUDA Backend Runtime can be found `here `_.
Configuration and Placement
***************************
diff --git a/docs/public/images/tt_buda_w_logo.png b/docs/public/images/tt_buda_w_logo.png
new file mode 100644
index 000000000..553d8a194
Binary files /dev/null and b/docs/public/images/tt_buda_w_logo.png differ
diff --git a/docs/public/images/tt_logo.png b/docs/public/images/tt_logo.png
new file mode 100644
index 000000000..191e7f857
Binary files /dev/null and b/docs/public/images/tt_logo.png differ
diff --git a/docs/public/installation.rst b/docs/public/installation.rst
index 27d292d7c..cfa7d589e 100644
--- a/docs/public/installation.rst
+++ b/docs/public/installation.rst
@@ -19,7 +19,7 @@ Prerequisites
OS Compatibility
----------------
-Presently, Tenstorrent software is only supported on the **Ubuntu 20.04 LTS (Focal Fossa)** operating system.
+Currently, Tenstorrent software is fully supported and tested on **Ubuntu 22.04 LTS (Jammy Jellyfish)**. It is also functional on **Ubuntu 20.04 LTS (Focal Fossa)**, but that version is not tested as extensively, so regressions there are possible.
Download
********
@@ -63,6 +63,7 @@ To install a PyBUDA release, follow these steps:
source env/bin/activate
* Step 4. Pip install PyBuda and TVM
+
If you have downloaded the latest release wheel files, you can install them directly with pip.
.. code-block:: bash
@@ -70,8 +71,11 @@ If you have downloaded the latest release wheel files, you can install them dire
pip install pybuda-.whl tvm-.whl
To compile PyBUDA from source, follow these steps:
+
* Step 1. Clone PyBUDA from https://github.com/tenstorrent/tt-buda/
+
* Step 2. Update submodules
+
.. code-block:: bash
cd tt-buda
@@ -80,6 +84,7 @@ To compile PyBUDA from source, follow these steps:
* Step 3. Compile. PyBUDA's make system will automatically create the needed venv
.. code-block:: bash
+
make
source build/python_env/bin/activate
@@ -162,8 +167,14 @@ You may need to append each ``apt-get`` command with ``sudo`` if you do not have
apt-get install -y python3.8-venv libboost-all-dev libgoogle-glog-dev libgl1-mesa-glx ruby
apt-get install -y build-essential clang-6.0 libhdf5-serial-dev libzmq3-dev
+The environment variable PYTHON_VERSION sets the path to the Python executable. The build system uses it to determine which version of Python to use and where to look for Python.h.
+For example, on Ubuntu 22.04 PYTHON_VERSION should be set to "python3.10", while on Ubuntu 20.04 it should be set to "python3.8".
+
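+For example, before running ``make`` on Ubuntu 22.04 you would typically export:
+
+.. code-block:: bash
+
+    export PYTHON_VERSION=python3.10
+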
Additional PyBUDA Compile Dependencies
------------------------------
+--------------------------------------
+
+OS Level Dependencies
+^^^^^^^^^^^^^^^^^^^^^
Additional dependencies to compile PyBUDA from source after running `Backend Compiler Dependencies <#backend-compiler-dependencies>`_
@@ -172,9 +183,36 @@ You may need to append each ``apt-get`` command with ``sudo`` if you do not have
.. code-block:: bash
apt-get install -y libyaml-cpp-dev python3-pip sudo git git-lfs
- apt-get install -y wget cmake cmake-data libgtest-dev libgmock-dev
+ apt-get install -y wget cmake cmake-data
pip3 install pyyaml
+Package Level Dependencies
+^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+In addition, if you intend to utilize ``torchvision`` for your model development, we strongly recommend using the ``torchvision`` version built with the CXX11 ABI, which ensures optimal compatibility with PyBUDA.
+
+When building PyBUDA from source, the recommended version of ``torchvision`` is built and installed by default.
+
+.. note::
+
+ For your convenience, the ``torchvision`` wheel file is already included in the PyBUDA release bundle. This means that if you're using the release bundle, you won't need to build ``torchvision`` from source unless you want to use a different version or need to modify the source code. Simply install the provided wheel file using pip to add ``torchvision`` to your Python environment.
+
+ Here's an example of how you can install the ``torchvision`` wheel file:
+
+ .. code-block:: bash
+
+ pip install /path/to/your/wheel/file/torchvision*.whl
+
+ Replace ``/path/to/your/wheel/file/torchvision*.whl`` with the actual path to the ``torchvision`` wheel file in the PyBUDA release bundle.
+
+.. note::
+
+ To run the existing unit tests of PyBUDA components, e.g. after compiling it from source, you need to install the following packages.
+
+ .. code-block:: bash
+
+ apt-get install -y wget libgtest-dev libgmock-dev
+
TT-SMI
------
diff --git a/docs/public/module.mk b/docs/public/module.mk
index df363655d..78bd4db4c 100644
--- a/docs/public/module.mk
+++ b/docs/public/module.mk
@@ -8,7 +8,7 @@ docs/public: $(DOCS_PUBLIC_DIR)
.PHONY: foo
-$(DOCS_PUBLIC_DIR): $(DOCS_PUBLIC_BUILD_SCRIPT) $(DOCS_PUBLIC_SRCS) python_env foo
+$(DOCS_PUBLIC_DIR): $(DOCS_PUBLIC_BUILD_SCRIPT) $(DOCS_PUBLIC_SRCS) $(PYTHON_ENV) foo
LD_LIBRARY_PATH=$(LD_LIBRARY_PATH):$(LIBDIR) \
PYTHON_ENV=$(PYTHON_ENV) \
BUILDER=$(DOCS_PUBLIC_SPHINX_BUILDER) \
@@ -19,7 +19,7 @@ $(DOCS_PUBLIC_DIR): $(DOCS_PUBLIC_BUILD_SCRIPT) $(DOCS_PUBLIC_SRCS) python_env f
docs/public/publish: docs/public
rsync --delete -avz $(DOCS_PUBLIC_DIR)/html/ yyz-webservice-02:/var/www/html/docs/pybuda-docs
-docs/pdf: python_env foo
+docs/pdf: $(PYTHON_ENV) foo
LD_LIBRARY_PATH=$(LD_LIBRARY_PATH):$(LIBDIR) \
PYTHON_ENV=$(PYTHON_ENV) \
BUILDER=latexpdf \
diff --git a/docs/public/pybuda/training.rst b/docs/public/pybuda/training.rst
index 0ac5eb659..4c9a8794d 100644
--- a/docs/public/pybuda/training.rst
+++ b/docs/public/pybuda/training.rst
@@ -22,7 +22,7 @@ by a CPU device calculating the loss and backpropagating it to Grayskull.
# Create a device and model, and place it
tt0 = TTDevice("tt0", devtype=devtype)
- matmul0 = BudaLin("matmul0")
+ matmul0 = BudaLinear("matmul0")
tt0.place_module(matmul0)
act_dim = (1, 1, 32, 32)
diff --git a/docs/public/user_guide.rst b/docs/public/user_guide.rst
index d6e7f1500..8a1d1b836 100644
--- a/docs/public/user_guide.rst
+++ b/docs/public/user_guide.rst
@@ -231,18 +231,31 @@ Such a dictionary can also be pushed back onto the device using :py:func:`update
TensTorrent Device Image (TTI): Saving/Loading
**********************************************
-A Tenstorrent Device Image (TTI) is a standalone zip/archive file that snapshots the :py:class:`TTDevice` configuration,
-the compiled models/modules placed and any backend build files. There are multiple benefits with the usage a TTI archive:
+A Tenstorrent Device Image (TTI) is a standalone archive file that captures the entire compiled state of a
+model. The contents of the archive include the device configuration, compiler configuration, compiled model artifacts,
+backend build files (e.g. overlay and risc binaries), and model parameter tensors. There are several advantages
+to using a TTI archive:
1) Offline target compilation of models on arbitrary device targets (i.e. target device does not have to be present/available on the machine to compile and save a TTI).
2) Loading a TTI archive allows the user to skip any long front-end and backend compilations of models onto the device
and directly begin executing the graph/module that was packaged in the `*.tti` after pushing inputs to queues.
+3) TTI archives can be shared and loaded across different machines and environments.
+4) When we save a TTI archive, we can configure the serialization format for the model parameters. This can be useful
+ when the user wants to save the model parameters in a tilized-binary format to avoid tilizing during model inference.
+ By default, the serialization format is pickle. To use an alternate serialization format, the user can set either the
+ `PYBUDA_TTI_BACKEND_FORMAT=1` or the `PYBUDA_TTI_BACKEND_TILIZED_FORMAT=1` environment variable.
-We can save a TTI archive by invoking the `compile_to_image` method on :py:class:`TTDevice`
+For example, from a machine without a silicon device, we can save a TTI archive intended to be deployed on a silicon device.
+We need to configure the device type and architecture of the target device and compile the model to a TTI archive.
+This can be done by invoking the `compile_to_image` method on :py:class:`TTDevice`.
.. code-block:: python
- tt0 = pybuda.TTDevice("tt0",arch=BackendDevice.Grayskull, devtype=BackendType.Silicon)
+ tt0 = pybuda.TTDevice(
+ name="tt0",
+ arch=BackendDevice.Wormhole_B0,
+ devtype=BackendType.Silicon
+ )
tt0.place_module(...)
device_img: TTDeviceImage = tt0.compile_to_image(
img_path="device_images/tt0.tti",
@@ -256,14 +269,19 @@ This will create the archive file `device_images/tt0.tti`. The contents of a TTI
.. code-block::
/unzipped_tti_directory
- ├── device.json # json file capturing device state
+ ├── device.json # Device state and compiled model metadata
├── .yaml # netlist yaml
- ├── backend_build_binaries # backend build files from tt_build/
- │ ├── blob_init
+ ├── compile_and_runtime_config.json # compiler and runtime configurations
+ ├── backend_build_binaries # backend build binaries
+ │ ├── device_desc.yaml
+ │ ├── cluster_desc.yaml
│ ├── brisc
- │ ├── ...
- ├── *tensor*.pkl # pickled constant/parameter tensors
- ├── *module*.pkl # pickled PyBuda module object
+ │ ├── erisc
+ │ ├── nrisc
+ │ ├── hlks
+ │ ├── epoch_programs
+ ├── tensors # directory containing serialized tensors
+ ├── module_files # Python file containing the PybudaModule of the model
To load the TTI archive and inspect the contents:
@@ -291,11 +309,10 @@ The :py:class:`TTDeviceImage::info()` method provides a su
- chip_ids: [0]
- backend device type: BackendType.Silicon
- grid size: [10, 12]
- - harvested rows: 0
+ - harvested rows: [0]
Compilation Graph State...
- training: False
- - modules: ['bert_encoder']
- ordered input shapes: [[1, 128, 128], [1, 1, 128, 128]]
- ordered targets shapes: []
@@ -312,8 +329,35 @@ We can now configure :py:class:`TTDevice` by using our image ob
output_q = pybuda.run_inference()
-Create TTI Targeting Row-Harvested Silicon Devices
-**************************************************
+Create TTI: Targeting Supported Silicon Devices
+***********************************************
+
+In the example above, we saved a TTI file targeting a silicon device with the default (unharvested) configuration. There
+are also convenience labels available that can be used to target specific silicon devices in our supported product spec.
+The currently supported labels are: {gs_e150, gs_e300, wh_n150, wh_n300}.
+
+To target a specific silicon device, we can set the device type and architecture using :py:func:`set_configuration_options`.
+
+
+.. code-block:: python
+
+ pybuda.set_configuration_options(device_config="wh_n150")
+
+ tt0 = pybuda.TTDevice(
+ name="tt0",
+ arch=BackendDevice.Wormhole_B0,
+ devtype=BackendType.Silicon
+ )
+ tt0.place_module(...)
+ device_img: TTDeviceImage = tt0.compile_to_image(
+ img_path="device_images/tt0.tti",
+ training=training,
+ sample_inputs=(...),
+ )
+
+
+Create TTI: Targeting Custom Row-Harvested Silicon Devices
+************************************************************
We can also save a TTI file targeting a machine with silicon devices with harvested rows offline.
The only difference from the above is we need to manually induce the harvested rows before saving TTI.
@@ -348,32 +392,65 @@ Accordingly, part of the TTI file slightly changes as well:
Note that only rows 1-5 and 7-11 are harvestable, and TTI loading will raise an error if the manually harvested rows in TTI does not match with that of the loaded silicon device.
+Create TTI: Targeting Custom Device Descriptor
+**************************************************
+
+We can also save a TTI file targeting a machine with silicon devices that use a custom device descriptor (specified with a file path).
+This can be done by setting the device descriptor using :py:func:`set_configuration_options` with the `backend_device_descriptor_path` argument.
+
+.. code-block:: python
+
+ pybuda.set_configuration_options(backend_device_descriptor_path="/wormhole_b0_4x6.yaml")
+
+ tt0 = pybuda.TTDevice("tt0",arch=BackendDevice.Wormhole_B0, devtype=BackendType.Silicon)
+ tt0.place_module(...)
+ device_img: TTDeviceImage = tt0.compile_to_image(
+ img_path="device_images/tt0.tti",
+ training=training,
+ sample_inputs=(...),
+ )
+
+The device descriptor used during the offline compilation process will be embedded in the TTI archive.
+This device descriptor will then be used to configure the device during the TTI-loading process.
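+
+For example, loading such an archive back does not require re-specifying the descriptor; it is read from the image. A minimal sketch, assuming the ``TTDeviceImage.load_from_disk`` and ``TTDevice.load_image`` loading entry points:
+
+.. code-block:: python
+
+    # Hedged sketch: the embedded device descriptor configures the device on load.
+    img = pybuda.TTDeviceImage.load_from_disk("device_images/tt0.tti")
+    tt1 = pybuda.TTDevice.load_image(img=img)
+    # ... push inputs and call pybuda.run_inference() as shown earlier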
+
+
Embedded TTI Loading
********************
-Here's an example of loading a TTI model from c++ for environments that do not have a packaged python interpreter.
+Here's an example of loading a generic TTI model from C++ for environments that do not have a packaged Python interpreter.
.. code-block:: cpp
#include
#include
#include
+ #include <filesystem>
#include "tt_backend.hpp"
#include "tt_backend_api.hpp"
#include "tt_backend_api_types.hpp"
+ #include "io_utils.h"
- // Populate a queue descriptor from a queue name
- tt::tt_dram_io_desc get_queue_descriptor(const std::shared_ptr &backend, const std::string &queue_name);
+ namespace fs = std::filesystem;
- // Populate a tensor descriptor from a queue name
- tt::tt_PytorchTensorDesc get_tensor_descriptor(const std::string &name);
+ int main(int argc, char **argv) {
+
+ if (argc <= 1) {
+ throw std::runtime_error("TTI path not specified on the command line");
+ }
+ else if (argc > 3) {
+ throw std::runtime_error("Incorrect number of arguments specified to inference harness. Supported args: TTI_PATH NUM_INFERENCE_LOOPS");
+ }
- int main(int argc, char** argv)
- {
// Define path to pre-compiled model and output artifacts
- std::string output_path = "tt_build/base_encoders";
- std::string model_path = "base_encoders.tti";
+ std::string output_path = "tt_build/test_standalone_runtime";
+ fs::create_directories(output_path);
+ uint32_t inference_loops = 1;
+ std::string model_path = argv[1]; // eg. "/home_mnt/software/spatial2/backend/binaries/CI_TTI_TEST_BINARIES_WH/bert.tti"
+
+ if (argc == 3) {
+ inference_loops = std::stoi(argv[2]);
+ }
// Create a pre-compiled model object and a backend object from it using default config
std::shared_ptr model = std::make_shared(model_path, output_path);
@@ -388,23 +465,32 @@ Here's an example of loading a TTI model from c++ for environments that do not h
}
// The following code must execute between initialize() and finish()
- {
+ for (uint32_t i = 0; i < inference_loops; i++) {
// - Push a microbatch of inputs to device
for (const std::string &name : model->get_graph_input_names()) {
- tt::tt_dram_io_desc io_desc = get_queue_descriptor(backend, name);
- tt::tt_PytorchTensorDesc tensor_desc = get_tensor_descriptor(name);
-
+ tt::tt_dram_io_desc io_desc = tt::io::utils::get_queue_descriptor(backend, name);
+ tt::tt_PytorchTensorDesc tensor_desc = tt::io::utils::get_tensor_descriptor(name, model, io_desc);
+ // Fill the tensor descriptor with data. We choose to allocate dummy memory using the TT backend for this tensor.
+ // The user is free to use previously allocated memory, or use the backend to allocate memory that is then filled with actual data.
+ tt::io::utils::fill_tensor_with_data(name, tensor_desc);
// DMA the input tensor from host to device
assert(tt::backend::push_input(io_desc, tensor_desc, false, 1) == tt::DEVICE_STATUS_CODE::Success);
+ // Optional: Host memory management
+ // - free releases storage on host (tensor data freed), since host is done with pushing data for this activation
+ // - The user can choose not to free this memory and use it even after the data is in device DRAM
+ std::cout << "Pushed Input tensor " << name << " data ptr: " << tensor_desc.ptr << std::endl;
+ assert(tt::backend::free_tensor(tensor_desc) == tt::DEVICE_STATUS_CODE::Success);
}
// - Run inference program, p_loop_count is the number of microbatches executed
std::map<std::string, std::string> program_parameters = {{"$p_loop_count", "1"}};
- backend->run_program("run_fwd_0", program_parameters);
-
+ for (const auto& prog_name : backend -> get_programs()) {
+ assert(backend->run_program(prog_name, program_parameters) == tt::DEVICE_STATUS_CODE::Success);
+ }
+
// - Pop a microbatch of outputs from device
for (const std::string &name : model->get_graph_output_names()) {
- tt::tt_dram_io_desc io_desc = get_queue_descriptor(backend, name);
+ tt::tt_dram_io_desc io_desc = tt::io::utils::get_queue_descriptor(backend, name);
tt::tt_PytorchTensorDesc tensor_desc = {}; // passed into get_tensor below to be populated
// DMA the output tensor from device to host
@@ -416,11 +502,11 @@ Here's an example of loading a TTI model from c++ for environments that do not h
// Host memory management
// - free releases storage on host (tensor data freed), host is done with the output data
- std::cout << "Output tensor " << name << " data ptr: " << tensor_desc.ptr << std::endl;
+ // - The user can choose not to free this memory and use it for downstream tasks
+ std::cout << "Got Output tensor " << name << " data ptr: " << tensor_desc.ptr << std::endl;
assert(tt::backend::free_tensor(tensor_desc) == tt::DEVICE_STATUS_CODE::Success);
}
}
-
// - Teardown the backend
if (backend->finish() != tt::DEVICE_STATUS_CODE::Success) {
throw std::runtime_error("Failed to shutdown device");
@@ -428,56 +514,6 @@ Here's an example of loading a TTI model from c++ for environments that do not h
return 0;
}
- // Populate an queue descriptor from a queue name
- tt::tt_dram_io_desc get_queue_descriptor(const std::shared_ptr &backend, const std::string &queue_name) {
- tt::tt_dram_io_desc queue_desc = backend->get_queue_descriptor(queue_name);
- // (optional) maps device address to a contiguous user-space address in tt::tt_dram_io_desc::bufq_mapping
- // - push_input will use this mapping for memcpy-based fast DMA if it exists
- // - push_input will use user-mode driver for DMA if mapping does not exist
- tt::backend::translate_addresses(queue_desc);
- return queue_desc;
- }
-
- // Populate a tensor descriptor from raw data + metadata
- template
- tt::tt_PytorchTensorDesc to_tensor_descptor(
- const T *array,
- unsigned int w_dim,
- unsigned int z_dim,
- unsigned int r_dim,
- unsigned int c_dim,
- tt::DataFormat format,
- unsigned int dim = tt::PY_TENSOR_DIMS) {
- tt::tt_PytorchTensorDesc tensor_desc;
- tensor_desc.owner = tt::OWNERSHIP::Backend;
- tensor_desc.ptr = array;
- tensor_desc.itemsize = sizeof(T);
- tensor_desc.format = format;
- tensor_desc.shape = {w_dim, z_dim, r_dim, c_dim};
- tensor_desc.dim = dim;
-
- tensor_desc.strides[3] = sizeof(T);
- tensor_desc.strides[2] = c_dim * tensor_desc.strides[3];
- tensor_desc.strides[1] = r_dim * tensor_desc.strides[2];
- tensor_desc.strides[0] = z_dim * tensor_desc.strides[1];
- return tensor_desc;
- }
-
- // Populate a tensor descriptor from a queue name
- tt::tt_PytorchTensorDesc get_tensor_descriptor(const std::string &name) {
- // The following code is an example for BERT base encoder input:
- // - activation: [microbatch, channels = 1, height = 128, width = 768]
- // - atten_mask: [microbatch, channels = 1, height = 32, width = 128]
- if (name == "input_1") {
- static std::vector tensor_data(128 * 1 * 128 * 768, 0);
- return to_tensor_descptor(tensor_data.data(), 128, 1, 128, 768, tt::DataFormat::Float16_b);
- } else if (name == "attention_mask") {
- static std::vector tensor_data(128 * 1 * 32 * 128, 0);
- return to_tensor_descptor(tensor_data.data(), 128, 1, 32, 128, tt::DataFormat::Float16_b);
- }
- throw std::runtime_error("Tensor is not a valid input");
- }
-
Pybuda Automatic Mixed Precision
--------------------------------------
@@ -626,6 +662,137 @@ However, ``num_chips`` and ``chip_ids`` parameters can be used to select a subse
See :py:class:`TTDevice` for more details.
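
A minimal sketch of selecting a subset of chips, assuming ``chip_ids`` is accepted by the :py:class:`TTDevice` constructor (the values below are illustrative):

.. code-block:: python

    # Hedged sketch: target two specific chips of a multi-chip system.
    tt0 = pybuda.TTDevice(
        name="tt0",
        arch=BackendDevice.Wormhole_B0,
        devtype=BackendType.Silicon,
        chip_ids=[0, 1],   # alternatively, num_chips=2
    )
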
+Pybuda Multi-Model Support (Embedded Applications Only)
+-------------------------------------------------------
+
+Introduction
+*******************
+
+PyBuda allows users to merge several models into a single Tenstorrent Device Image, with minimal workflow overhead. The TTI can then be consumed by the C++ Backend and run on a Tenstorrent Device.
+
+A typical process to generate and execute a Multi-Model workload is as follows:
+
+**Compilation: Either Offline or on a Tenstorrent Device**
+
+#. Generate TTIs for each model in the workload.
+#. Run the Model-Merging tool to consolidate all models into a single TTI.
+
+**Execution: On a Tenstorrent Device**
+
+#. Spawn an application using the C++ backend APIs to deploy the workload contained in the TTI. An example application is provided in the `Embedded TTI Loading` section.
+
+Fusing multiple independent models is well tested with several state-of-the-art models (including ViT, MobileNet, ResNet-50, and others). Support for pipelined models is currently under active development.
+
+Below, we describe the APIs and associated tools used to fuse models that have no dependencies on each other.
+
+Usage
+*****
+
+Pybuda exposes two entry points for users to run the Model Merging Tool:
+
+#. Command Line Interface to specify the list of models to merge along with optional arguments. These include parameters enabling/disabling certain optimizations.
+#. Python API to be consumed by user applications. Usage of this API is very similar to the Command Line Tool.
+
+**Command Line Interface**
+
+.. code-block:: bash
+
+ python3 pybuda/pybuda/tools/tti_merge.py [-h] [-mbl {dirname}]
+ [-mdl {models}] [-a {arch}]
+ [-mml {filename}] [-scr]
+ [-dqo]
+
+The following arguments are available when using `tti_merge.py`:
+
+.. list-table:: Table 1. tti_merge.py optional arguments.
+ :header-rows: 1
+
+ * - Argument
+ - Function
+ * - -h, --help
+ - Show help message and exit
+ * - -mbl, --model_binaries_location
+ - Relative path to where model TTIs are stored [Required]
+ * - -mdl, --models
+ - List of models to be merged (names must match TTI filenames) [Required]
+ * - -a, --arch
+ - Target Tenstorrent Architecture (default = wormhole_b0) [Optional]
+ * - -mml, --merged_model_location
+ - Relative path to where the Multi-Model TTI will be emitted (default = merged_model.tti) [Optional]
+ * - -scr, --skip_channel_reallocation
+ - Disable memory optimization that switches channels for queues when OOM during memory allocation (default = False) [Optional]
+ * - -dqo, --dynamic_queue_overlap_off
+ - Disable memory optimization allowing dynamic queues to overlap in memory channels (default = False) [Optional]
+
+As an example, given the following directory structure in the Pybuda root directory:
+
+.. code-block:: bash
+
+ device_images_to_merge/
+ ├-- bert_large.tti
+ ├-- deit.tti
+ ├-- hrnet.tti
+ ├-- inception.tti
+ ├-- mobilenet_v1.tti
+ ├-- mobilenet_v2.tti
+ ├-- mobilenet_v3.tti
+ ├-- resnet.tti
+ ├-- unet.tti
+ ├-- vit.tti
+
+The following command will generate a Multi-Model TTI (with memory optimizations enabled) and store it in `multi_model_workload.tti`:
+
+.. code-block:: bash
+
+ python3 pybuda/pybuda/tools/tti_merge.py -mbl device_images_to_merge/ -mdl bert_large deit hrnet inception mobilenet_v1 mobilenet_v2 mobilenet_v3 resnet unet vit -mml multi_model_workload.tti
+
+**Python API**
+
+This API provides identical functionality to the command-line interface, for cases where the model-merging step needs to be automated.
+
+.. code-block:: python
+
+ # API Declaration
+ def merge_models(model_bin_location, models, arch = "wormhole_b0", merged_model_location = "", switch_chans_if_capacity_hit = True, overlap_dynamic_queues = True)
+
+Here, the arguments `switch_chans_if_capacity_hit` and `overlap_dynamic_queues` correspond to memory optimizations, which are enabled by default.
+
+The following Python code generates a Multi-Model TTI in a manner identical to the command listed in the previous section:
+
+.. code-block:: python
+
+ from pybuda.tools.tti_merge import merge_models
+
+ model_binary_loc = "device_images_to_merge"
+ models_to_merge = ["bert_large", "deit", "hrnet", "inception", "mobilenet_v1", "mobilenet_v2", "mobilenet_v3", "resnet", "unet", "vit"]
+ target_arch = "wormhole_b0"
+ merged_model_location = "multi_model_workload.tti"
+
+ # Individual Model Generation Code Goes Here
+
+ merge_models(model_binary_loc, models_to_merge, target_arch, merged_model_location)
+
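+The individual model generation step (the placeholder comment in the snippet above) could look like the following sketch. It assumes each model has already been wrapped in a PyBuda module; ``modules`` (a name-to-module mapping) and ``sample_inputs_for`` are hypothetical helpers used only for illustration:
+
+.. code-block:: python
+
+    # Hedged sketch of "Individual Model Generation": compile one TTI per model.
+    for model_name, module in modules.items():
+        tt_dev = pybuda.TTDevice(
+            name=model_name,
+            arch=BackendDevice.Wormhole_B0,
+            devtype=BackendType.Silicon,
+        )
+        tt_dev.place_module(module)
+        tt_dev.compile_to_image(
+            img_path=f"{model_binary_loc}/{model_name}.tti",
+            training=False,
+            sample_inputs=sample_inputs_for(model_name),
+        )
+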
+**Memory Profiler**
+
+During the model fusion process, the API presented above is responsible for performing memory reallocation. Users may be interested in the memory footprint of the fused model (both Device and Host DRAM).
+
+To fulfill this requirement, the tool reports memory utilization after reallocation. An example using a model compiled for Wormhole (with 6 device and up to 4 host DRAM channels) is provided below.
+
+.. code-block:: bash
+
+ Displaying memory footprint per DRAM channel (MB):
+ 0 : 161.17
+ 1 : 511.12
+ 2 : 577.51
+ 3 : 200.27
+ 4 : 204.41
+ 5 : 339.57
+ Displaying memory footprint per Host channel (MB):
+ 0 : 132.88
+ 1 : 0.0
+ 2 : 0.0
+ 3 : 0.0
+
TT-SMI
------
diff --git a/env/CMakeLists.txt b/env/CMakeLists.txt
new file mode 100644
index 000000000..66cef1a2d
--- /dev/null
+++ b/env/CMakeLists.txt
@@ -0,0 +1,38 @@
+cmake_minimum_required(VERSION 3.20.0)
+project(pybuda-toolchain LANGUAGES CXX)
+
+# Get parent directory of current source directory
+get_filename_component(PARENT_DIR ${CMAKE_CURRENT_SOURCE_DIR} DIRECTORY)
+
+# Include Utils
+list(APPEND CMAKE_MODULE_PATH ${PARENT_DIR}/cmake)
+include(Utils)
+
+# Check if the environment variable is set, if not error out
+check_required_env_var(PYBUDA_PYTHON_VERSION)
+check_required_env_var(PYBUDA_TOOLCHAIN_DIR)
+check_required_env_var(PYBUDA_VENV_DIR)
+
+set(PYBUDA_PYTHON_VERSION $ENV{PYBUDA_PYTHON_VERSION})
+set(PYBUDA_TOOLCHAIN_DIR $ENV{PYBUDA_TOOLCHAIN_DIR})
+set(PYBUDA_VENV_DIR $ENV{PYBUDA_VENV_DIR})
+
+set(TTMLIR_TOOLCHAIN_DIR $ENV{TTMLIR_TOOLCHAIN_DIR})
+set(TTMLIR_VENV_DIR $ENV{TTMLIR_VENV_DIR})
+
+if (NOT EXISTS "${PYBUDA_TOOLCHAIN_DIR}")
+ message( FATAL_ERROR "The directory ${PYBUDA_TOOLCHAIN_DIR} does not exist. Please create it before running this script.\n sudo mkdir -p ${PYBUDA_TOOLCHAIN_DIR}\n sudo chown -R $ENV{USER} ${PYBUDA_TOOLCHAIN_DIR}")
+endif()
+
+if (NOT EXISTS "${TTMLIR_TOOLCHAIN_DIR}")
+ message( FATAL_ERROR "The directory ${TTMLIR_TOOLCHAIN_DIR} does not exist. Please create it before running this script.\n sudo mkdir -p ${TTMLIR_TOOLCHAIN_DIR}\n sudo chown -R $ENV{USER} ${TTMLIR_TOOLCHAIN_DIR}")
+endif()
+
+# Install all python dependencies
+add_custom_target(python-venv
+ ALL COMMAND
+ PYBUDA_PYTHON_VERSION=${PYBUDA_PYTHON_VERSION}
+ PYBUDA_TOOLCHAIN_DIR=${PYBUDA_TOOLCHAIN_DIR}
+ CURRENT_SOURCE_DIR=${CMAKE_CURRENT_SOURCE_DIR}
+ PYBUDA_VENV_DIR=${PYBUDA_VENV_DIR}
+ bash ${CMAKE_CURRENT_SOURCE_DIR}/create_venv.sh)
diff --git a/env/README.md b/env/README.md
new file mode 100644
index 000000000..b36f90214
--- /dev/null
+++ b/env/README.md
@@ -0,0 +1,9 @@
+This directory contains all environment dependencies for the project.
+All dependencies are installed to /opt/pybuda-toolchain.
+
+load_env_variables.sh - Script to load the environment variables for the project.
+ - It's used when downloading dependencies for the project.
+ - It's also used when activating the environment for the project. This is important since it contains paths to the various lib/include directories.
+
+Dependencies:
+[Python 3.10](https://www.python.org/downloads/release/python-3100/) - Version of Python that is compatible with the project.
diff --git a/env/activate b/env/activate
new file mode 100644
index 000000000..fc5a44af2
--- /dev/null
+++ b/env/activate
@@ -0,0 +1,38 @@
+#!/usr/bin/env bash
+
+# NOTE
+# These env variables are stored in this file because they are needed by both
+# the toolchain project and the pybuda project. This file is sourced by both projects.
+# For tt-mlir we have similar env variables, but since we build the tt-mlir toolchain and binaries
+# using install.sh from the tt-mlir project, we only define them in the pybuda project's top-level CMake file.
+
+# Set PYBUDA_TOOLCHAIN_DIR to a default value if not already set
+export PYBUDA_TOOLCHAIN_DIR="${PYBUDA_TOOLCHAIN_DIR:-/opt/pybuda-toolchain}"
+
+# Set PYTHON_VERSION
+if [[ "$(uname)" == "Linux" ]]; then
+export PYBUDA_PYTHON_VERSION="${PYBUDA_PYTHON_VERSION:-python3.10}"
+elif [[ "$(uname)" == "Darwin" ]]; then
+export PYBUDA_PYTHON_VERSION="${PYBUDA_PYTHON_VERSION:-/usr/bin/python3}"
+else
+echo "Error: Unsupported OS $(uname)"
+fi
+
+# Set PYBUDA_VENV_DIR to a default value if not already set
+export PYBUDA_VENV_DIR="${PYBUDA_VENV_DIR:-${PYBUDA_TOOLCHAIN_DIR}/venv}"
+
+[ -f $PYBUDA_VENV_DIR/bin/activate ] && source $PYBUDA_VENV_DIR/bin/activate
+
+if [ -n "$PROJECT_ROOT" ]; then
+ export TT_METAL_HOME="$PROJECT_ROOT/third_party/tt-mlir/third_party/tt-metal/src/tt-metal"
+else
+ export TT_METAL_HOME="$(pwd)/third_party/tt-mlir/third_party/tt-metal/src/tt-metal"
+fi
+
+export TTMLIR_TOOLCHAIN_DIR="${TTMLIR_TOOLCHAIN_DIR:-/opt/ttmlir-toolchain}"
+
+export TTMLIR_VENV_DIR="${TTMLIR_VENV_DIR:-${TTMLIR_TOOLCHAIN_DIR}/venv}"
+
+export TTMLIR_ENV_ACTIVATED=1
+
+export ARCH_NAME="${ARCH_NAME:-wormhole_b0}"
diff --git a/python_env/core_requirements.txt b/env/core_requirements.txt
similarity index 85%
rename from python_env/core_requirements.txt
rename to env/core_requirements.txt
index 374743db8..5c64dad27 100644
--- a/python_env/core_requirements.txt
+++ b/env/core_requirements.txt
@@ -23,21 +23,22 @@ matplotlib==3.5.1
# They are both requirements of datasets. Multiprocess is however not fixed.
# TODO: Remove when datasets is upgraded
multiprocess==0.70.13
-mxnet==1.9.1
+# Newer versions of mxnet cause an issue with AnyJSONManager static object destruction (segfault)
+mxnet==1.6.0
networkx==2.8.5
numpy==1.23.1
-onnx==1.14.1
-onnxruntime==1.15.0
+onnx>=1.15.0
+onnxruntime>=1.16.3
opencv-python-headless==4.6.0.66
# This is needed to avoid issue https://yyz-gitlab.local.tenstorrent.com/devops/devops/-/issues/95
pandas==1.5.3
prettytable==3.0.0
protobuf==3.20.3
pybind11==2.6.2
-pyinstrument==4.1.1
-scipy==1.8.0
-tensorflow-cpu==2.13.0
-tensorboard==2.13.0
+pyinstrument>=4.1.1
+scipy>=1.8.0
+tensorflow==2.13
+tensorboard==2.13
tf2onnx==1.15.1
transformers==4.35.2
# To avoid warning during the import
@@ -45,5 +46,5 @@ requests==2.28.2
urllib3==1.26.14
tflite==2.10.0
ultralytics==8.0.145
-keras==2.13.1
-pytorch_forecasting==1.0.0
+keras>=2.13.1
+#pytorch_forecasting>=1.0.0
diff --git a/env/create_venv.sh b/env/create_venv.sh
new file mode 100644
index 000000000..4a592f6b9
--- /dev/null
+++ b/env/create_venv.sh
@@ -0,0 +1,37 @@
+set -e
+
+if [[ -z "$PYBUDA_PYTHON_VERSION" ]]; then
+ echo "PYBUDA_PYTHON_VERSION environment variable is not set"
+ exit 1
+fi
+
+if [[ -z "$PYBUDA_TOOLCHAIN_DIR" ]]; then
+ echo "PYBUDA_TOOLCHAIN_DIR environment variable is not set"
+ exit 1
+fi
+
+if [[ -z "$PYBUDA_VENV_DIR" ]]; then
+ echo "PYBUDA_VENV_DIR environment variable is not set"
+ exit 1
+fi
+
+if [[ -z "$CURRENT_SOURCE_DIR" ]]; then
+ echo "CURRENT_SOURCE_DIR environment variable is not set"
+ exit 1
+fi
+
+# Torch requires a specific version of wheel to be installed
+# which depends on the platform
+if [[ "$(uname)" == "Darwin" ]]; then
+ REQUIREMENTS_FILE="$CURRENT_SOURCE_DIR/mac_requirements.txt"
+else
+ # TODO test on linux
+ REQUIREMENTS_FILE="$CURRENT_SOURCE_DIR/linux_requirements.txt"
+fi
+
+$PYBUDA_PYTHON_VERSION -m venv $PYBUDA_VENV_DIR
+unset LD_PRELOAD
+source $PYBUDA_VENV_DIR/bin/activate
+$PYBUDA_PYTHON_VERSION -m pip install --upgrade pip
+pip3 install wheel==0.37.1
+pip3 install -r $REQUIREMENTS_FILE -f https://download.pytorch.org/whl/cpu/torch_stable.html
diff --git a/python_env/dist_requirements.txt b/env/dist_requirements.txt
similarity index 53%
rename from python_env/dist_requirements.txt
rename to env/dist_requirements.txt
index b60831429..9dc705e6c 100644
--- a/python_env/dist_requirements.txt
+++ b/env/dist_requirements.txt
@@ -1,5 +1,6 @@
-r core_requirements.txt
torch @ https://download.pytorch.org/whl/cpu-cxx11-abi/torch-2.1.0%2Bcpu.cxx11.abi-cp38-cp38-linux_x86_64.whl ; python_version=='3.8'
torch @ https://download.pytorch.org/whl/cpu-cxx11-abi/torch-2.1.0%2Bcpu.cxx11.abi-cp310-cp310-linux_x86_64.whl ; python_version=='3.10'
-torchvision @ https://download.pytorch.org/whl/cpu/torchvision-0.16.0%2Bcpu-cp38-cp38-linux_x86_64.whl ; python_version=='3.8'
-torchvision @ https://download.pytorch.org/whl/cpu/torchvision-0.16.0%2Bcpu-cp310-cp310-linux_x86_64.whl ; python_version=='3.10'
+torch==2.3.1 ; python_version=='3.11'
+# Custom torchvision build with ABI
+torchvision==0.16.0+fbb4cc5
\ No newline at end of file
diff --git a/python_env/requirements.txt b/env/linux_requirements.txt
similarity index 83%
rename from python_env/requirements.txt
rename to env/linux_requirements.txt
index dddbc8592..3fbf8b7cf 100644
--- a/python_env/requirements.txt
+++ b/env/linux_requirements.txt
@@ -3,7 +3,8 @@
bitarray==2.5.1
clang-format==14.0.3
-diffusers==0.14.0
+diffusers==0.27.2
+optimum==1.19.2
hydra-core
IPython==8.8.0
nvidia-ml-py3==7.352.0
@@ -17,7 +18,7 @@ sacremoses==0.0.53
seaborn
scikit-image==0.20.0 # For DenseNet 121 HF XRay model
segmentation_models_pytorch==0.3.2
-sentencepiece==0.1.96
+#sentencepiece==0.1.96
subword-nmt==0.3.8
tensorflow-hub==0.12.0
timm==0.6.12
@@ -25,12 +26,16 @@ yolov5==7.0.9
# The CPU versions of torch and torch visions are used due to their size being
# several GB smaller which made a large impact on the performance of CI
# (through build artifacts and caching)
-torchvision==0.16.0+cpu
torch @ https://download.pytorch.org/whl/cpu-cxx11-abi/torch-2.1.0%2Bcpu.cxx11.abi-cp38-cp38-linux_x86_64.whl ; python_version=='3.8'
torch @ https://download.pytorch.org/whl/cpu-cxx11-abi/torch-2.1.0%2Bcpu.cxx11.abi-cp310-cp310-linux_x86_64.whl ; python_version=='3.10'
+torch==2.3.1 ; python_version=='3.11'
torchxrayvision==0.0.39
vgg_pytorch==0.3.0
librosa==0.10.0.post2
soundfile==0.12.1
accelerate==0.23.0
-python-gitlab==4.2.0
+python-gitlab==4.4.0
+deepdiff==6.7.1
+tabulate==0.9.0
+opencv-contrib-python==4.9.0.80
+yolov6detect==0.4.1
diff --git a/env/mac_requirements.txt b/env/mac_requirements.txt
new file mode 100644
index 000000000..f691e557c
--- /dev/null
+++ b/env/mac_requirements.txt
@@ -0,0 +1,38 @@
+# First include all requirements from the Distribution build
+-r core_requirements.txt
+
+bitarray==2.5.1
+clang-format==14.0.3
+diffusers==0.27.2
+optimum==1.19.2
+hydra-core
+IPython==8.8.0
+nvidia-ml-py3==7.352.0
+omegaconf
+pytest==6.2.4
+pytest-timeout==2.0.1
+pytest-xdist==2.5.0
+pytorchcv==0.0.67
+sacrebleu==2.1.0
+sacremoses==0.0.53
+seaborn
+scikit-image==0.20.0 # For DenseNet 121 HF XRay model
+segmentation_models_pytorch==0.3.2
+#sentencepiece==0.1.96
+subword-nmt==0.3.8
+tensorflow-hub==0.12.0
+timm==0.6.12
+yolov5==7.0.9
+torch @ https://download.pytorch.org/whl/cpu/torch-2.1.0-cp310-none-macosx_11_0_arm64.whl ; python_version=='3.10'
+torch @ https://download.pytorch.org/whl/cpu/torch-2.1.0-cp38-none-macosx_11_0_arm64.whl ; python_version=='3.8'
+torch==2.3.1 ; python_version=='3.11'
+torchxrayvision==0.0.39
+vgg_pytorch==0.3.0
+librosa==0.10.0.post2
+soundfile==0.12.1
+accelerate==0.23.0
+python-gitlab==4.4.0
+deepdiff==6.7.1
+tabulate==0.9.0
+opencv-contrib-python==4.9.0.80
+yolov6detect==0.4.1
diff --git a/python_env/requirements_ext.txt b/env/requirements_ext.txt
similarity index 100%
rename from python_env/requirements_ext.txt
rename to env/requirements_ext.txt
diff --git a/env_for_silicon.sh b/env_for_silicon.sh
deleted file mode 100755
index e6e80c556..000000000
--- a/env_for_silicon.sh
+++ /dev/null
@@ -1,11 +0,0 @@
-export BACKEND_ARCH_NAME=grayskull
-export ARCH_NAME=grayskull
-
-if command -v bear >/dev/null 2>&1; then
- bear make
-else
- make
-fi
-source build/python_env/bin/activate
-source third_party/tvm/enable.sh
-set +e
diff --git a/env_for_wormhole.sh b/env_for_wormhole.sh
deleted file mode 100644
index d45cabd26..000000000
--- a/env_for_wormhole.sh
+++ /dev/null
@@ -1,13 +0,0 @@
-export LD_LIBRARY_PATH=third_party/confidential_tenstorrent_modules/versim/wormhole/lib:third_party/confidential_tenstorrent_modules/versim/wormhole/lib/ext
-export BACKEND_ARCH_NAME=wormhole
-export ARCH_NAME=wormhole
-
-if command -v bear >/dev/null 2>&1; then
- bear make
-else
- make
-fi
-source build/python_env/bin/activate
-source third_party/tvm/install.sh
-source third_party/tvm/enable.sh
-set +e
diff --git a/env_for_wormhole_b0.sh b/env_for_wormhole_b0.sh
deleted file mode 100755
index e91dd8fcd..000000000
--- a/env_for_wormhole_b0.sh
+++ /dev/null
@@ -1,12 +0,0 @@
-export LD_LIBRARY_PATH=versim/wormhole_b0/lib:versim/wormhole_b0/lib/ext
-export BACKEND_ARCH_NAME=wormhole_b0
-export ARCH_NAME=wormhole_b0
-
-if command -v bear >/dev/null 2>&1; then
- bear make
-else
- make
-fi
-source build/python_env/bin/activate
-source third_party/tvm/enable.sh
-set +e
diff --git a/pybuda/CMakeLists.txt b/pybuda/CMakeLists.txt
new file mode 100644
index 000000000..8b0df77fd
--- /dev/null
+++ b/pybuda/CMakeLists.txt
@@ -0,0 +1,2 @@
+add_subdirectory(csrc)
+add_subdirectory(pybuda)
diff --git a/pybuda/csrc/CMakeLists.txt b/pybuda/csrc/CMakeLists.txt
new file mode 100644
index 000000000..81e9490e0
--- /dev/null
+++ b/pybuda/csrc/CMakeLists.txt
@@ -0,0 +1,139 @@
+set(CMAKE_PREFIX_PATH
+ ${Python_SITELIB}/torch
+ ${CMAKE_PREFIX_PATH})
+find_package(Torch REQUIRED)
+
+set(TT_MLIR_ROOT_DIR ${CMAKE_SOURCE_DIR}/third_party/tt-mlir)
+set(TTMLIR_INCLUDE_DIRS
+ ${TT_MLIR_ROOT_DIR}/include
+ ${TT_MLIR_ROOT_DIR}/build/include
+ ${TT_MLIR_ROOT_DIR}/runtime/include)
+
+set(PYBUDA_CSRC_INCLUDES
+ ${CMAKE_CURRENT_SOURCE_DIR}
+ ${CMAKE_SOURCE_DIR}
+ ${CMAKE_SOURCE_DIR}/third_party
+ ${CMAKE_SOURCE_DIR}/third_party/fmt
+ ${CMAKE_SOURCE_DIR}/third_party/pybind11/include
+ ${CMAKE_SOURCE_DIR}/third_party/json
+ ${CMAKE_SOURCE_DIR}/third_party/tt-mlir/build/include
+ ${CMAKE_SOURCE_DIR}/third_party/tt-mlir/runtime/include
+ ${CMAKE_SOURCE_DIR}/third_party/tt-mlir/include
+ ${TTMLIR_TOOLCHAIN_DIR}/include
+ ${Python3_INCLUDE_DIRS}
+ ${TTMLIR_INCLUDE_DIRS}
+)
+
+include_directories(${PYBUDA_CSRC_INCLUDES})
+# This is a workaround for utils/assert.hpp using ##__VA_ARGS__, which is not supported by clang
+include_directories(SYSTEM ${CMAKE_SOURCE_DIR})
+include_directories(SYSTEM ${TORCH_INCLUDE_DIRS})
+
+add_subdirectory(graph_lib)
+add_subdirectory(autograd)
+add_subdirectory(shared_utils)
+add_subdirectory(backend_api)
+add_subdirectory(reportify)
+add_subdirectory(runtime)
+add_subdirectory(tt_torch_device)
+
+### pybuda_csrc_objs ###
+
+file(GLOB CPP_FILES
+ "pybuda_bindings.cpp"
+ "buda_passes.cpp"
+ "passes/*.cpp"
+ "lower_to_buda/common.cpp"
+)
+
+add_library(pybuda_csrc_objs OBJECT ${CPP_FILES})
+target_compile_options(pybuda_csrc_objs PRIVATE ${STATIC_LIB_FLAGS} ${PYBUDA_CSRC_CFLAGS})
+add_dependencies(pybuda_csrc_objs build_tt_mlir)
+
+### End of pybuda_csrc_objs ###
+
+######## pybuda_csrc ########
+
+set(TTMLIR_LIB_DIR "${CMAKE_SOURCE_DIR}/third_party/tt-mlir/build/lib")
+set(TTRUNTIME_LIB_DIR "${CMAKE_SOURCE_DIR}/third_party/tt-mlir/build/runtime/lib")
+set(METAL_LIB_DIR "${CMAKE_SOURCE_DIR}/third_party/tt-mlir/third_party/tt-metal/src/tt-metal-build/lib")
+set(TORCH_LIB_DIR "${PYBUDA_VENV_DIR}/lib/${PYBUDA_PYTHON_VERSION}/site-packages/torch/lib")
+
+add_library(pybuda_csrc SHARED)
+
+set(METAL_LIB_DIR "${CMAKE_SOURCE_DIR}/third_party/tt-mlir/third_party/tt-metal/src/tt-metal-build/lib")
+
+# Because the _ttnn library doesn't have a lib prefix, this is a workaround to make linking work
+add_library(ttnn SHARED IMPORTED)
+set_property(TARGET ttnn PROPERTY IMPORTED_LOCATION "${METAL_LIB_DIR}/_ttnn.so")
+
+target_link_libraries(pybuda_csrc PRIVATE
+ graph
+ autograd
+ shared_utils
+ backend_api
+ reportify
+ tt_torch_device
+ runtime
+ pybuda_csrc_objs
+
+ # NOTE: ordering of the libraries will affect the linking
+ LLVM
+ MLIR
+ TTNNTargetFlatbuffer
+ MLIRTTDialect
+ MLIRTTIRDialect
+ MLIRTTNNDialect
+ MLIRTTKernelDialect
+ MLIRTTMetalDialect
+ MLIRTTIRTransforms
+ MLIRTTNNTransforms
+ MLIRTTIRAnalysis
+ MLIRTTNNPipelines
+ TTMLIRTTNNToEmitC
+ TTRuntime
+ TTRuntimeTTNN
+ tt_metal
+ device
+ tt_eager
+ ttnn
+ tt_metal
+ device
+ tt_eager
+ flatbuffers
+ xml2
+ curses
+ z
+ m
+ torch_python
+ c10
+ ${PYBUDA_PYTHON_VERSION}
+ ${TORCH_LIBRARIES}
+)
+
+target_compile_options(pybuda_csrc PRIVATE
+ ${PYBUDA_CSRC_CFLAGS}
+ ${CXXFLAGS}
+ ${SHARED_LIB_FLAGS}
+)
+
+target_link_directories(pybuda_csrc PRIVATE
+ ${TTMLIR_TOOLCHAIN_DIR}/lib
+ ${TTMLIR_LIB_DIR}
+ ${TTRUNTIME_LIB_DIR}
+ ${METAL_LIB_DIR}
+ ${TORCH_LIB_DIR})
+
+### End of pybuda_csrc ###
+
+#### Copy python module extension to pybuda directory ####
+add_custom_target(run_after_pybuda_csrc ALL
+ COMMAND mkdir -p ${PYBUDA_VENV_DIR}/lib/${PYBUDA_PYTHON_VERSION}/site-packages/pybuda
+ COMMAND cp $<TARGET_FILE:pybuda_csrc> ${PYBUDA_VENV_DIR}/lib/${PYBUDA_PYTHON_VERSION}/site-packages/pybuda/_C.so
+ COMMAND touch -r $<TARGET_FILE:pybuda_csrc> ${PYBUDA_VENV_DIR}/lib/${PYBUDA_PYTHON_VERSION}/site-packages/pybuda/_C.so
+ COMMAND ln -sf ${PYBUDA_VENV_DIR}/lib/${PYBUDA_PYTHON_VERSION}/site-packages/pybuda/_C.so ${CMAKE_SOURCE_DIR}/pybuda/pybuda/_C.so
+ COMMENT "Running run_after_pybuda_csrc to copy the python module extension to pybuda directory"
+ USES_TERMINAL
+)
+
+add_dependencies(run_after_pybuda_csrc pybuda_csrc)
diff --git a/pybuda/csrc/autograd/CMakeLists.txt b/pybuda/csrc/autograd/CMakeLists.txt
new file mode 100644
index 000000000..7f35e9dd9
--- /dev/null
+++ b/pybuda/csrc/autograd/CMakeLists.txt
@@ -0,0 +1,8 @@
+add_library(autograd
+ STATIC
+ autograd.cpp
+ binding.cpp
+ python_bindings.cpp)
+
+target_compile_options(autograd PRIVATE ${STATIC_LIB_FLAGS} ${PYBUDA_CSRC_CFLAGS})
+
diff --git a/pybuda/csrc/autograd/binding.cpp b/pybuda/csrc/autograd/binding.cpp
index 8a203eef9..793c084e5 100644
--- a/pybuda/csrc/autograd/binding.cpp
+++ b/pybuda/csrc/autograd/binding.cpp
@@ -2,7 +2,6 @@
//
// SPDX-License-Identifier: Apache-2.0
#include "autograd/binding.hpp"
-#include "passes/fuse_ops.hpp"
#include
@@ -25,84 +24,10 @@ std::tuple> get_op_shape(OpType type, std::vect
return std::make_tuple(s, ret[1].cast>());
}
-std::tuple> get_fused_op_shape(tt::graphlib::BudaOpNode *op, std::vector &operands)
-{
- std::unordered_map buffers;
- std::vector dim_broadcasts;
- std::optional dest;
- for (auto schedule : op->get_fused_op()->get_schedules())
- {
- for (auto sub_op : schedule.ops)
- {
- std::vector sub_op_inputs;
- for (tt::FusedSubOpInput i : sub_op.inputs)
- {
- if (i.type == tt::FusedSubOpInput::InputType::INPUT) {
- TT_ASSERT(i.index < operands.size(), "Refering to input that doesn't exist for fused op");
- sub_op_inputs.push_back(operands.at(i.index));
- }
- else if (i.type == tt::FusedSubOpInput::InputType::DEST) {
- TT_ASSERT(dest.has_value(), "Reading from dest that has not value");
- sub_op_inputs.push_back(dest.value());
- dest = std::nullopt;
- }
- else {
- auto it = buffers.find(i.index);
- TT_ASSERT(it != buffers.end(), "Referring to intermediate buffer that doesn't exist");
- sub_op_inputs.push_back(it->second);
- }
-
- // All inputs to the fused op are already properly broadcasted.
- // But for the sub-op inputs which are outputs of previously executed sub-ops,
- // we need to apply broadcast.
- // NOTE: We don't need to apply tile broadcasts for shape calculation, since each
- // input is at least the size of a tile.
- if (i.type != tt::FusedSubOpInput::InputType::INPUT
- && i.has_broadcast())
- {
- Shape operand_shape = sub_op_inputs.back();
-
- int broadcast_dim = i.broadcast.first;
- int broadcast_factor = i.broadcast.second;
-
- OpType broadcast_op = OpType("broadcast", {broadcast_dim, broadcast_factor}, {});
- std::vector shapes = {operand_shape};
- std::tuple> shape_data = get_op_shape(broadcast_op, shapes, true);
-
- operand_shape = std::get<0>(shape_data);
-
- sub_op_inputs.pop_back();
- sub_op_inputs.emplace_back(operand_shape);
- }
- }
-
- Shape result;
- std::vector broadcast;
- tie(result, broadcast) = get_op_shape(sub_op.op_type, sub_op_inputs, true);
-
- if (sub_op.output_type == tt::FusedSubOp::OutputType::OUTPUT)
- return std::make_pair(result, dim_broadcasts);
-
- else if (sub_op.output_type == tt::FusedSubOp::OutputType::DEST)
- dest = result;
-
- else {
- // intermed
- if (buffers.count((std::uint32_t)sub_op.output_buffer) == 0)
- buffers.insert(std::make_pair((std::uint32_t)sub_op.output_buffer, result));
- else
- buffers[(std::uint32_t)sub_op.output_buffer] = result;
- }
- }
- }
- TT_THROW("Evaluated the full fused op, but haven't reached the output shape.");
- return std::make_pair(Shape(), std::vector{});
-}
-
NodeContext insert_backward(
autograd_context context,
OpType type,
- int operand,
+ int operand,
const std::vector &inputs,
NodeContext output,
NodeContext gradient)
diff --git a/pybuda/csrc/autograd/binding.hpp b/pybuda/csrc/autograd/binding.hpp
index 29818f7af..a8c517bd0 100644
--- a/pybuda/csrc/autograd/binding.hpp
+++ b/pybuda/csrc/autograd/binding.hpp
@@ -23,7 +23,6 @@ using TileDim = tt::TileDim;
std::tuple> get_op_shape(
OpType type, std::vector &operands, bool is_buda, TileDim tile_dim = TileDim::Dim32x32);
-std::tuple> get_fused_op_shape(tt::graphlib::BudaOpNode *op, std::vector &operands);
inline Shape get_tm_shape(OpType type, Shape operand, bool is_buda)
{
Shape shape;
diff --git a/pybuda/csrc/autograd/module.mk b/pybuda/csrc/autograd/module.mk
index 255de133c..57f24bb81 100644
--- a/pybuda/csrc/autograd/module.mk
+++ b/pybuda/csrc/autograd/module.mk
@@ -13,7 +13,7 @@ PYBUDA_CSRC_AUTOGRAD_INCLUDES = $(PYBUDA_CSRC_INCLUDES)
pybuda/csrc/autograd: $(PYBUDA_CSRC_AUTOGRAD)
-$(PYBUDA_CSRC_AUTOGRAD): $(PYBUDA_CSRC_AUTOGRAD_OBJS) $(PYBUDA_CSRC_GRAPH_LIB)
+$(PYBUDA_CSRC_AUTOGRAD): $(PYBUDA_CSRC_AUTOGRAD_OBJS)
@mkdir -p $(LIBDIR)
ar rcs $@ $^
diff --git a/pybuda/csrc/backend_api/CMakeLists.txt b/pybuda/csrc/backend_api/CMakeLists.txt
new file mode 100644
index 000000000..8fb8f8479
--- /dev/null
+++ b/pybuda/csrc/backend_api/CMakeLists.txt
@@ -0,0 +1,9 @@
+add_library(backend_api
+ STATIC
+ arch_type.cpp
+ backend_api.cpp)
+
+target_link_libraries(backend_api shared_utils)
+add_dependencies(backend_api shared_utils)
+target_compile_options(backend_api PRIVATE ${STATIC_LIB_FLAGS} ${PYBUDA_CSRC_CFLAGS})
+
diff --git a/pybuda/csrc/backend_api/arch_type.cpp b/pybuda/csrc/backend_api/arch_type.cpp
new file mode 100644
index 000000000..3827ea3e8
--- /dev/null
+++ b/pybuda/csrc/backend_api/arch_type.cpp
@@ -0,0 +1,49 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+#include
+#include "arch_type.hpp"
+#include "shared_utils/string_extension.hpp"
+
+namespace tt{
+ std::string to_string_arch(ARCH arch)
+ {
+ switch (arch)
+ {
+ case ARCH::GRAYSKULL:
+ return "GRAYSKULL";
+ case ARCH::WORMHOLE_B0:
+ return "WORMHOLE_B0";
+ case ARCH::BLACKHOLE:
+ return "BLACKHOLE";
+ default:
+ throw std::runtime_error("Unsupported ARCH enum: " + std::to_string(static_cast(arch)));
+ }
+ }
+
+ std::string to_string_arch_lower(ARCH arch)
+ {
+ return tt::utils::to_lower_string(to_string_arch(arch));
+ }
+
+ ARCH to_arch_type(const std::string& arch_string)
+ {
+ std::string arch_string_lower = tt::utils::to_upper_string(arch_string);
+ if (arch_string_lower == "GRAYSKULL")
+ {
+ return ARCH::GRAYSKULL;
+ }
+ else if (arch_string_lower == "WORMHOLE_B0")
+ {
+ return ARCH::WORMHOLE_B0;
+ }
+ else if (arch_string_lower == "BLACKHOLE")
+ {
+ return ARCH::BLACKHOLE;
+ }
+ else
+ {
+ throw std::runtime_error("Unsuported tt::ARCH string: " + arch_string_lower);
+ }
+ }
+}
\ No newline at end of file
diff --git a/pybuda/csrc/backend_api/arch_type.hpp b/pybuda/csrc/backend_api/arch_type.hpp
new file mode 100644
index 000000000..80d9fc085
--- /dev/null
+++ b/pybuda/csrc/backend_api/arch_type.hpp
@@ -0,0 +1,24 @@
+// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
+//
+// SPDX-License-Identifier: Apache-2.0
+#pragma once
+
+#include
+
+namespace tt {
+enum class ARCH
+{
+ JAWBRIDGE = 0,
+ GRAYSKULL = 1,
+ WORMHOLE = 2,
+ WORMHOLE_B0 = 3,
+ BLACKHOLE = 4,
+ Invalid = 0xFF,
+};
+
+std::string to_string_arch(ARCH ar);
+std::string to_string_arch_lower(ARCH arch);
+ARCH to_arch_type(const std::string& arch_string);
+}
+
+
diff --git a/pybuda/csrc/backend_api/backend_api.cpp b/pybuda/csrc/backend_api/backend_api.cpp
index 97e404ed6..773ec4e4d 100644
--- a/pybuda/csrc/backend_api/backend_api.cpp
+++ b/pybuda/csrc/backend_api/backend_api.cpp
@@ -1,169 +1,32 @@
// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
//
// SPDX-License-Identifier: Apache-2.0
+
#include
-#include "yaml-cpp/yaml.h"
-#include "backend_api/backend_api.hpp"
#include "backend_api/device_config.hpp"
#include "utils/assert.hpp"
-#include "netlist/tt_backend.hpp"
-#include "netlist/tt_backend_api.hpp"
-#include "common/env_lib.hpp"
-
namespace tt
{
+template
+constexpr std::false_type false_type_t{};
+
template
T DeviceConfig::get(std::string const &param, const bool system_level_command) const
{
- std::string key_value = (system_level_command) ? ("system-" + param) : (arch_name + "-" + param);
- std::string value;
- if (system_level_command and this->cached_system_level_params.size() > 0)
- {
- value = this->cached_system_level_params.at(key_value);
- }
- else
- {
- value = ::tt::backend::get_backend_param(
- key_value,
- this->device_yaml,
- this->cluster_config_yaml,
- this->runtime_params_yaml,
- this->store_backend_db_to_yaml);
- }
-
- if constexpr (std::is_same_v)
- {
- return value;
- }
- else if constexpr (std::is_same_v)
- {
- return std::stoul(value, 0, 0);
- }
- else if constexpr (std::is_same_v)
+ TT_ASSERT(false);
+ if constexpr (std::is_same_v)
{
- return std::stoi(value, 0, 0);
- }
- else if constexpr (std::is_same_v)
- {
- return std::stoull(value, 0, 0);
- }
- else if constexpr (std::is_same_v)
- {
- return static_cast(std::stoi(value, 0, 0));
- }
- else if constexpr (std::is_same_v)
- {
- auto delimeter = value.find("-");
- auto x_str = value.substr(0, delimeter);
- auto y_str = value.substr(delimeter + 1, std::string::npos);
- return CoreCoord(std::stoi(x_str, 0, 0), std::stoi(y_str, 0, 0));
+ return CoreCoord(1, 1);
}
else if constexpr (std::is_same_v)
{
- auto delimeter = value.find("-");
- auto c_str = value.substr(0, delimeter);
- auto r_str = value.substr(delimeter + 1, std::string::npos);
- return DeviceGrid(std::stoi(r_str, 0, 0), std::stoi(c_str, 0, 0));
- }
- else if constexpr (std::is_same_v)
- {
- return value;
- }
- else if constexpr (std::is_same_v>)
- {
- // Chips with mmio are serialized separated by a dash (eg. '1-2-3')
- std::vector chips_with_mmio;
-
- // Split string and extract chip ids
- size_t delimeter = 0;
- while((delimeter = value.find("-")) != std::string::npos)
- {
- std::string curr_str = value.substr(0, delimeter);
- chips_with_mmio.push_back(std::stoi(curr_str, 0, 0));
- value.erase(0, delimeter + 1);
- }
-
- return chips_with_mmio;
- }
- else if constexpr (std::is_same_v>)
- {
- // Chip locations are serialized separated by a dash (eg. '0,0,0,-1,1,0,-')
- std::unordered_map chip_locations;
- std::vector temporary_buffer;
-
- // Split string into temporary buffer for additional processing
- size_t delimeter = 0;
- while((delimeter = value.find("-")) != std::string::npos)
- {
- std::string curr_str = value.substr(0, delimeter);
- temporary_buffer.push_back(curr_str);
- value.erase(0, delimeter + 1);
- }
-
- // Loop through temporary buffer and extract information
- for (std::string chip_location : temporary_buffer)
- {
- // Split string into chip id and chip location portions
- size_t delimeter = 0;
- std::vector extracted_values;
- while((delimeter = chip_location.find(",")) != std::string::npos)
- {
- std::string curr_str = chip_location.substr(0, delimeter);
- extracted_values.push_back(std::stoi(curr_str, 0, 0));
- chip_location.erase(0, delimeter + 1);
- }
-
- // Add chip location to map
- chip_locations.insert(
- {extracted_values.at(0),
- EthCoord(
- extracted_values.at(1), extracted_values.at(2), extracted_values.at(3), extracted_values.at(4))});
- }
-
- return chip_locations;
- }
- else if constexpr (std::is_same_v>>>)
- {
- // Ethernet connections are serialized separated by a dash (eg. '0,0,1,0,-0,1,1,1,-')
- std::unordered_map>> ethernet_connections;
- std::vector temporary_buffer;
-
- // Split string into temporary buffer for additional processing
- size_t delimeter = 0;
- while((delimeter = value.find("-")) != std::string::npos)
- {
- std::string curr_str = value.substr(0, delimeter);
- temporary_buffer.push_back(curr_str);
- value.erase(0, delimeter + 1);
- }
-
- // Loop through temporary buffer and extract information
- for (std::string eth_connection : temporary_buffer)
- {
- // Split string and collect values
- size_t delimeter = 0;
- std::vector extracted_values;
- while((delimeter = eth_connection.find(",")) != std::string::npos)
- {
- std::string curr_str = eth_connection.substr(0, delimeter);
- extracted_values.push_back(std::stoul(curr_str, 0, 0));
- eth_connection.erase(0, delimeter + 1);
- }
-
- // Add values to map
- if(ethernet_connections.find(extracted_values[0]) == ethernet_connections.end()) {
- ethernet_connections[extracted_values[0]] = {};
- }
- ethernet_connections[extracted_values[0]][extracted_values[1]] = std::tuple(extracted_values[2], extracted_values[3]);
- }
-
- return ethernet_connections;
+ return DeviceGrid(1, 1);
}
else
{
- static_assert(false_type_t, "No specialization for type");
+ return T();
}
}
@@ -183,573 +46,10 @@ template std::unordered_map load_cached_sys_param(std::string yaml_file)
{
std::unordered_map cache;
- YAML::Node reference_param_desc = YAML::LoadFile(yaml_file);
- for (auto it = reference_param_desc["system_level_params"].begin(); it != reference_param_desc["system_level_params"].end(); it++)
- cache[(it -> first).as()] = (it -> second).as();
return cache;
}
-void DeviceConfig::load_system_level_params()
-{
- auto silicon_devices = tt::backend::detect_available_devices();
- if (silicon_devices.size() == 0) // compute machine
- this->cached_system_level_params = load_cached_sys_param(this->runtime_params_yaml);
-}
-
-std::vector DeviceConfig::get_harvested_cfg() const
-{
- auto silicon_devices = tt::backend::detect_available_devices();
- if (silicon_devices.size() == 0 and this->runtime_params_yaml.empty())
- return std::vector(chip_ids.size(), 0); // assume same harvesting-config among all chips for non-silicon backend
-
- std::vector ret;
- for (auto i : chip_ids)
- {
- std::string cmd = "device";
- cmd += std::to_string(i);
- cmd += "-harvesting_mask";
- uint32_t num = get(cmd, true);
- ret.push_back(num);
- }
- return ret;
-}
+void DeviceConfig::load_system_level_params() { TT_ASSERT(false); }
+std::unordered_map DeviceConfig::get_harvested_cfg() const { TT_ASSERT(false); return {}; }
} // namespace tt
-
-namespace tt::backend_api
-{
-using tt_backend_config = tt::tt_backend_config;
-using tt_compile_result = tt::tt_compile_result;
-
-void python_handle_refchange(const void *handle_ptr, bool allocate)
-{
- py::handle handle((PyObject *)handle_ptr);
- if (allocate)
- handle.inc_ref();
- else
- handle.dec_ref();
-}
-
-void BackendModule(py::module &m_backend) {
-
-
- py::class_(m_backend, "BackendConfig")
- .def(py::init([](
- tt::DEVICE backend_type,
- tt::ARCH backend_device,
- tt::DEVICE_MODE device_mode,
- int opt_level,
- const std::string &output_dir,
- const std::string &soc_descriptor_path,
- const std::string &cluster_descriptor_path) {
-
- auto cfg = tt_backend_config{
- .type = backend_type,
- .arch = backend_device,
- .mode = device_mode,
- .output_dir = output_dir,
- .soc_descriptor_path = soc_descriptor_path,
- .cluster_descriptor_path = cluster_descriptor_path};
-
- char *env_opt_level = getenv("TT_BACKEND_OPT_LEVEL");
- if (env_opt_level) {
- cfg.optimization_level = atoi(env_opt_level);
- }
- else {
- cfg.optimization_level = opt_level;
- }
- if (backend_type == tt::DEVICE::Golden) {
- cfg.ignore_data_format_precision = true; // run backend at full precision by default (on Golden)
- }
- return cfg;
- }))
- .def("set_golden_ignore_df_precision", [](tt_backend_config &self, bool ignore_data_format_precision) {
- self.ignore_data_format_precision = ignore_data_format_precision;
- })
- .def("set_performance_trace_args", [](tt_backend_config &self, std::string args) {
- self.perf_desc_args = args;
- })
- .def("set_runtime_args", [](tt_backend_config &self, std::string args) {
- self.runtime_args = args;
- });
-
- m_backend.def("get_golden_config", []() {
- tt_backend_config cfg = {tt::DEVICE::Golden, tt::ARCH::GRAYSKULL};
- return cfg;
- });
-
- py::enum_(m_backend, "BackendType")
- .value("Golden", tt::DEVICE::Golden)
- .value("Model", tt::DEVICE::Model)
- .value("Silicon", tt::DEVICE::Silicon)
- .value("NoBackend", tt::DEVICE::Invalid)
- .def_static("from_string", &tt::get_device_from_string)
- .def("to_json", [](const tt::DEVICE backend_type) {
- switch (backend_type)
- {
- case tt::DEVICE::Golden: return "Golden";
- case tt::DEVICE::Model: return "Model";
- case tt::DEVICE::Silicon: return "Silicon";
- case tt::DEVICE::Invalid: return "Invalid";
- default: break;
- }
- return "Invalid";
- })
- .def("from_json", [](std::string const& encoded) {
- static std::unordered_map decode = {
- {"Golden", tt::DEVICE::Golden},
- {"Model", tt::DEVICE::Model},
- {"Silicon", tt::DEVICE::Silicon},
- {"NoBackend", tt::DEVICE::Invalid},
- };
- return decode.at(encoded);
- });
-
- py::enum_(m_backend, "IOType")
- .value("Queue", tt::IO_TYPE::Queue)
- .value("RandomAccess", tt::IO_TYPE::RandomAccess)
- .value("Invalid", tt::IO_TYPE::Invalid);
-
- py::enum_(m_backend, "IOLayout")
- .value("Tilized", tt::IO_LAYOUT::Tilized)
- .value("Flat", tt::IO_LAYOUT::Flat)
- .value("Invalid", tt::IO_LAYOUT::Invalid);
-
- py::enum_(m_backend, "BackendDevice")
- .value("Grayskull", tt::ARCH::GRAYSKULL)
- .value("Wormhole", tt::ARCH::WORMHOLE)
- .value("Wormhole_B0", tt::ARCH::WORMHOLE_B0)
- .value("Invalid", tt::ARCH::Invalid)
- .def("to_string", &tt::get_string_lowercase)
- .def_static("from_string", &tt::get_arch_from_string)
- .def("to_json", [](const tt::ARCH backend_device) {
- switch (backend_device)
- {
- case tt::ARCH::GRAYSKULL: return "Grayskull";
- case tt::ARCH::WORMHOLE: return "Wormhole";
- case tt::ARCH::WORMHOLE_B0: return "Wormhole_B0";
- case tt::ARCH::Invalid: return "Invalid";
- default: break;
- }
- return "Invalid";
- })
- .def("from_json", [](std::string const& encoded) {
- static std::unordered_map decode = {
- {"Grayskull", tt::ARCH::GRAYSKULL},
- {"Wormhole", tt::ARCH::WORMHOLE},
- {"Wormhole_B0", tt::ARCH::WORMHOLE_B0},
- {"Invalid", tt::ARCH::Invalid},
- };
- return decode.at(encoded);
- });
-
- py::enum_(m_backend, "DeviceMode")
- .value("CompileAndRun", tt::DEVICE_MODE::CompileAndRun)
- .value("CompileOnly", tt::DEVICE_MODE::CompileOnly)
- .value("RunOnly", tt::DEVICE_MODE::RunOnly)
- .def(
- "to_json",
- [](tt::DEVICE_MODE d) {
- switch (d)
- {
- case tt::DEVICE_MODE::CompileAndRun: return "CompileAndRun";
- case tt::DEVICE_MODE::CompileOnly: return "CompileOnly";
- case tt::DEVICE_MODE::RunOnly: return "RunOnly";
- default: break;
- }
- return "Invalid";
- })
- .def("from_json", [](std::string const &encoded) {
- static std::unordered_map decode = {
- {"CompileAndRun", tt::DEVICE_MODE::CompileAndRun},
- {"CompileOnly", tt::DEVICE_MODE::CompileOnly},
- {"RunOnly", tt::DEVICE_MODE::RunOnly},
- };
- return decode.at(encoded);
- });
-
- py::class_(m_backend, "StrideDescriptor")
- .def(py::init<>())
- .def_readwrite("xy_offsets", &tt::Stride::xy_offsets)
- .def_readwrite("stride", &tt::Stride::stride)
- .def(py::pickle(
- [](const tt::Stride &s) { // __getstate__
- return py::make_tuple(
- s.xy_offsets,
- s.stride);
- },
- [](py::tuple t) { // __setstate__
- if (t.size() != 2)
- {
- throw std::runtime_error("Invalid state for tt::Stride");
- }
-
- tt::Stride s;
-
- s.xy_offsets = t[0].cast>>();
- s.stride = t[1].cast();
-
- return s;
- }
- ));
-
- py::class_(m_backend, "DramIODesc")
- .def_property_readonly("name", [](tt::tt_dram_io_desc &self) { return self.queue_name; })
- .def_property_readonly("data_format", [](tt::tt_dram_io_desc &self) { return self.bufq_target_format; })
- .def_readwrite("bufq_grid_dim_r", &tt::tt_dram_io_desc::bufq_grid_dim_r)
- .def_readwrite("bufq_grid_dim_c", &tt::tt_dram_io_desc::bufq_grid_dim_c)
- .def_readwrite("ublock_rt", &tt::tt_dram_io_desc::ublock_rt)
- .def_readwrite("ublock_ct", &tt::tt_dram_io_desc::ublock_ct)
- .def_readwrite("mblock_m", &tt::tt_dram_io_desc::mblock_m)
- .def_readwrite("mblock_n", &tt::tt_dram_io_desc::mblock_n)
- .def_readwrite("tile_height", &tt::tt_dram_io_desc::tile_height)
- .def_readwrite("tile_width", &tt::tt_dram_io_desc::tile_width)
- .def_readwrite("t", &tt::tt_dram_io_desc::t)
- .def_readwrite("hstack_factor", &tt::tt_dram_io_desc::hstack_factor)
- .def_readwrite("vstack_factor", &tt::tt_dram_io_desc::vstack_factor)
- .def_readwrite("stack_row_major", &tt::tt_dram_io_desc::stack_row_major)
- .def_readwrite("s_descriptor", &tt::tt_dram_io_desc::s_descriptor)
- .def_readwrite("input_count", &tt::tt_dram_io_desc::input_count)
- .def_readwrite("netlist_path", &tt::tt_dram_io_desc::netlist_path)
- .def(py::pickle(
- [](const tt::tt_dram_io_desc &p) { // __getstate__
- return py::make_tuple(
- p.netlist_path,
- p.queue_name,
- p.bufq_grid_dim_r,
- p.bufq_grid_dim_c,
- p.bufq_num_slots,
- p.ublock_rt,
- p.ublock_ct,
- p.mblock_m,
- p.mblock_n,
- p.tile_height,
- p.tile_width,
- p.t,
- p.input_count,
- p.hstack_factor,
- p.vstack_factor,
- p.stack_row_major,
- p.bufq_target_format,
- p.bufq_start_addr_channel,
- p.bufq_entry_size,
- p.io_type,
- p.s_descriptor,
- p.backend_type,
- p.layout);
- },
- [](py::tuple t) { // __setstate__
- if (t.size() != 23)
- throw std::runtime_error("tt::tt_dram_io_desc: Invalid state!");
-
- tt::tt_dram_io_desc p;
- p.netlist_path = t[0].cast();
- p.queue_name = t[1].cast();
- p.bufq_grid_dim_r = t[2].cast();
- p.bufq_grid_dim_c = t[3].cast();
- p.bufq_num_slots = t[4].cast();
- p.ublock_rt = t[5].cast();
- p.ublock_ct = t[6].cast();
- p.mblock_m = t[7].cast();
- p.mblock_n = t[8].cast();
- p.tile_height = t[9].cast();
- p.tile_width = t[10].cast();
- p.t = t[11].cast();
- p.input_count = t[12].cast();
- p.hstack_factor = t[13].cast();
- p.vstack_factor = t[14].cast();
- p.stack_row_major = t[15].cast();
- p.bufq_target_format = t[16].cast();
- p.bufq_start_addr_channel = t[17].cast>>();
- p.bufq_entry_size = t[18].cast();
- p.io_type = t[19].cast();
- p.s_descriptor = t[20].cast();
- p.backend_type = t[21].cast();
- p.layout = t[22].cast();
-
- TT_ASSERT(
- tt::backend::translate_addresses(p) == tt::DEVICE_STATUS_CODE::Success,
- "Failed to translate addresses for " + p.queue_name);
- return p;
- }));
-
- py::class_(m_backend, "PytorchTensorDesc", py::buffer_protocol())
- .def(py::init([]() {
- return tt_PytorchTensorDesc();
- }))
- .def(py::init([](py::object pytorch_tensor, std::uint32_t itemsize, tt::DataFormat format,
- std::uint32_t dim,
- std::array shape,
- std::array strides) {
-
- auto ptr = pytorch_tensor.attr("data_ptr")().cast();
- py::handle handle = pytorch_tensor.release();
-
- return tt_PytorchTensorDesc(
- (void *)ptr, itemsize, format, shape, strides, dim, (void*)handle.ptr(), python_handle_refchange);
- }))
- .def(py::init([](void *buffer, std::uint32_t itemsize, tt::DataFormat format,
- std::uint32_t dim,
- std::array shape,
- std::array strides) {
-
- return tt_PytorchTensorDesc(buffer, itemsize, format, shape, strides, dim);
- }))
- .def_readwrite("itemsize", &tt_PytorchTensorDesc::itemsize)
- .def_readwrite("format", &tt_PytorchTensorDesc::format)
- .def_readwrite("shape", &tt_PytorchTensorDesc::shape)
- .def_readwrite("strides", &tt_PytorchTensorDesc::strides)
- .def_readwrite("dim", &tt_PytorchTensorDesc::dim)
- .def("print", [](tt::tt_PytorchTensorDesc &self) {
- std::cout << "Descriptor: ptr=" << (std::uint64_t)self.ptr <<
- ", itemsize=" << self.itemsize <<
- ", format =" << (int)self.format <<
- ", dim =" << self.dim <<
- ", shape =" << self.shape[0] << "," << self.shape[1] << "," << self.shape[2] << "," << self.shape[3] <<
- ", strides =" << self.strides[0] << "," << self.strides[1] << "," << self.strides[2] << "," << self.strides[3] << std::endl;
- })
- .def_buffer([](tt::tt_PytorchTensorDesc &desc) -> py::buffer_info {
-
- // Mostly irrelevant since we'll be turning this into torch tensor with its
- // own format. However, this could cause numpy to interpret the data wrong
- std::string data_format = py::format_descriptor::format();
- return py::buffer_info(
- const_cast(desc.ptr),
- desc.itemsize,
- data_format,
- 4,
- desc.shape,
- desc.strides);
- })
- .def(py::pickle(
- [](const tt::tt_PytorchTensorDesc &t) { // __getstate__
- return py::make_tuple(
- reinterpret_cast(t.ptr),
- t.itemsize,
- t.format,
- t.shape,
- t.strides,
- t.dim);
- },
- [](py::tuple t) { // __setstate__
- if (t.size() != 6)
- throw std::runtime_error("tt::tt_PytorchTensorDesc: Invalid state!");
-
- tt::tt_PytorchTensorDesc p;
- p.ptr = reinterpret_cast(t[0].cast());
- p.itemsize = t[1].cast();
- p.format = t[2].cast();
- p.shape = t[3].cast>();
- p.strides = t[4].cast>();
- p.dim = t[5].cast();
- return p;
- }
- ));
-
- py::class_(m_backend, "TilizedTensorDesc")
- .def(py::init<>())
- .def_readwrite("num_buffers", &tt::tt_TilizedTensorDesc::num_buffers)
- .def_readwrite("buf_size_bytes", &tt::tt_TilizedTensorDesc::buf_size_bytes)
- .def_readwrite("format", &tt::tt_TilizedTensorDesc::format)
- .def("print", [](tt::tt_TilizedTensorDesc &self) {
- std::cout << "Descriptor: ptr=" << (std::uint64_t)self.ptr <<
- ", num_buffers=" << self.num_buffers <<
- ", buf_size_bytes=" << (int)self.buf_size_bytes <<
- ", format =" << self.format;
- })
- .def(py::pickle(
- [](const tt::tt_TilizedTensorDesc &t) { // __getstate__
- return py::make_tuple(
- t.num_buffers,
- t.buf_size_bytes,
- t.format);
- },
- [](py::tuple t) { // __setstate__
- if (t.size() != 3)
- throw std::runtime_error("tt::tt_TilizedTensorDesc: Invalid state!");
-
- return tt::tt_TilizedTensorDesc(
- nullptr,
- t[0].cast(),
- t[1].cast(),
- t[2].cast()
- );
- }
- ));
-
-
- py::class_(m_backend, "BackendDeviceDesc")
- .def(py::init<>())
- .def_readonly("arch", ¶m::DeviceDesc::arch)
- .def_readonly("soc_desc_yaml", ¶m::DeviceDesc::soc_desc_yaml)
- .def_readonly("mmio", ¶m::DeviceDesc::mmio)
- .def_readonly("harvesting_mask", ¶m::DeviceDesc::harvesting_mask);
-
- py::class_(m_backend, "OpModelDesc")
- .def(py::init<>())
- .def_readwrite("type", &tt_op_model_desc::type)
- .def_readwrite("arch", &tt_op_model_desc::arch)
- .def_readwrite("data_format", &tt_op_model_desc::data_format)
- .def_readwrite("math_fidelity", &tt_op_model_desc::math_fidelity)
- .def_readwrite("t", &tt_op_model_desc::t)
- .def_readwrite("mblock_m", &tt_op_model_desc::mblock_m)
- .def_readwrite("mblock_n", &tt_op_model_desc::mblock_n)
- .def_readwrite("ublock_rt", &tt_op_model_desc::ublock_rt)
- .def_readwrite("ublock_ct", &tt_op_model_desc::ublock_ct)
- .def_readwrite("mblock_k", &tt_op_model_desc::mblock_k)
- .def_readwrite("ublock_kt", &tt_op_model_desc::ublock_kt)
- .def_readwrite("sparse_indices", &tt_op_model_desc::sparse_indices)
- .def_readwrite("sparse_nz_ublocks", &tt_op_model_desc::sparse_nz_ublocks)
- .def_readwrite("sparse_nz_strips", &tt_op_model_desc::sparse_nz_strips)
- .def_readwrite("approx_mode", &tt_op_model_desc::approx_mode)
- .def_readwrite("op_attr", &tt_op_model_desc::op_attr)
- .def_readwrite("reduce_z", &tt_op_model_desc::reduce_z);
-
- py::enum_(m_backend, "BackendStatusCode")
- .value("Success", tt::DEVICE_STATUS_CODE::Success)
- .value("RuntimeError", tt::DEVICE_STATUS_CODE::RuntimeError)
- .value("TimeoutError", tt::DEVICE_STATUS_CODE::TimeoutError);
-
- py::enum_(m_backend, "BackendCompileFailure")
- .value("BriscCompile", tt::COMPILE_FAILURE::BriscCompile)
- .value("EriscCompile",tt::COMPILE_FAILURE::EriscCompile)
- .value("NriscCompile",tt::COMPILE_FAILURE::NriscCompile)
- .value("Net2Pipe",tt::COMPILE_FAILURE::Net2Pipe)
- .value("PipeGen",tt::COMPILE_FAILURE::PipeGen)
- .value("BlobGen",tt::COMPILE_FAILURE::BlobGen)
- .value("L1Size",tt::COMPILE_FAILURE::L1Size)
- .value("OverlaySize",tt::COMPILE_FAILURE::OverlaySize)
- .value("Invalid",tt::COMPILE_FAILURE::Invalid);
-
- py::class_(m_backend, "BackendCompileResult")
- .def(py::init<>())
- .def_readwrite("success", &tt_compile_result::success)
- .def_readwrite("failure_type", &tt_compile_result::failure_type)
- .def_readwrite("failure_message", &tt_compile_result::failure_message)
- .def_readwrite("failure_target",&tt_compile_result::failure_target)
- .def_readwrite("device_id",&tt_compile_result::device_id)
- .def_readwrite("temporal_epoch_id", &tt_compile_result::temporal_epoch_id)
- .def_readwrite("logical_core_x",&tt_compile_result::logical_core_x)
- .def_readwrite("logical_core_y", &tt_compile_result::logical_core_y)
- .def_readwrite("extra_size_bytes", &tt_compile_result::extra_size_bytes);
-
-
- py::class_>(m_backend, "BackendApi")
- .def(py::init(py::overload_cast(&tt_backend::create)))
- .def("initialize", py::overload_cast<>(&tt_backend::initialize), py::call_guard())
- .def("initialize", py::overload_cast(&tt_backend::initialize), py::call_guard())
- .def("finish", &tt_backend::finish)
- .def("run_program", &tt_backend::run_program, py::call_guard())
- .def("wait_for_idle", &tt_backend::wait_for_idle, py::call_guard())
-
- .def("get_queue_descriptor", &tt_backend::get_queue_descriptor);
-
- // Explicitly release the backend pointer
- m_backend.def("release_backend_ptr", [](std::shared_ptr backend) {
- backend.reset();
- });
-
- m_backend.def(
- "clear_backend_param_cache",
- &tt::backend::clear_backend_param_cache_v2);
-
- m_backend.def("get_op_model_execution_cycles", &tt::backend::get_op_model_execution_cycles);
- m_backend.def("get_op_model_param", &tt::backend::get_op_model_param);
-
- m_backend.def(
- "push_input",
- py::overload_cast<
- const tt::tt_dram_io_desc&,
- const tt::tt_PytorchTensorDesc&,
- const bool, const int, const int>(&tt::backend::push_input), py::call_guard<py::gil_scoped_release>());
- m_backend.def(
- "push_input",
- py::overload_cast<
- const tt::tt_dram_io_desc&,
- const tt::tt_TilizedTensorDesc&,
- const int, const int>(&tt::backend::push_input), py::call_guard<py::gil_scoped_release>());
- m_backend.def("pop_output", &tt::backend::pop_output, py::call_guard<py::gil_scoped_release>());
- m_backend.def("get_output", &tt::backend::get_output, py::call_guard<py::gil_scoped_release>());
- m_backend.def("free_tensor", &tt::backend::free_tensor);
- m_backend.def("free_tensor", &tt::backend::free_tensor);
- m_backend.def("tilize_tensor", &tt::backend::tilize_tensor);
- m_backend.def("binarize_tensor", &tt::backend::binarize_tensor);
- m_backend.def("binarize_tensor", &tt::backend::binarize_tensor);
- m_backend.def("debinarize_tensor", &tt::backend::debinarize_tensor);
- m_backend.def("debinarize_tensor", &tt::backend::debinarize_tensor);
-
- m_backend.def(
- "get_io_size_in_bytes",
- &tt::backend::get_io_size_in_bytes,
- py::arg("data_formati"),
- py::arg("is_untilizesd"),
- py::arg("ublock_ct"),
- py::arg("ublock_rt"),
- py::arg("mblock_m"),
- py::arg("mblock_n"),
- py::arg("t"),
- py::arg("entries"),
- py::arg("tile_height") = 32,
- py::arg("tile_width") = 32);
- m_backend.def("get_next_aligned_address", &tt::backend::get_next_aligned_address);
-
- m_backend.def("translate_addresses", &tt::backend::translate_addresses, py::call_guard());
-
- m_backend.def(
- "detect_available_silicon_devices", &tt::backend::detect_available_devices, py::arg("only_detect_mmio") = true);
- m_backend.def(
- "get_device_descs_for_available_devices",
- &tt::backend::get_device_descs_for_available_devices,
- py::arg("out_dir") = std::string("./tt_build"));
-
- m_backend.def(
- "get_custom_device_desc",
- &tt::backend::get_custom_device_desc,
- py::arg("arch") = tt::ARCH::Invalid,
- py::arg("mmio") = false,
- py::arg("harvesting_mask") = 0u,
- py::arg("grid_dim") = std::make_pair(0, 0),
- py::arg("out_dir") = std::string("./tt_build"));
- m_backend.def("get_device_cluster_yaml", &tt::backend::get_device_cluster_yaml_v2, py::arg("out_dir"));
- m_backend.def("initialize_child_process", &tt::backend::initialize_child_process);
- m_backend.def("finish_child_process", &tt::backend::finish_child_process);
- m_backend.def("load_cached_sys_param", &tt::load_cached_sys_param);
-
- py::class_<DeviceGrid>(m_backend, "DeviceGrid")
- .def(py::init<std::pair<int, int>>())
- .def_readonly("r", &DeviceGrid::r)
- .def_readonly("c", &DeviceGrid::c);
-
- py::class_<DeviceConfig>(m_backend, "DeviceConfig")
- .def(py::init<
- std::string,
- std::string,
- std::string,
- std::string,
- std::string,
- bool,
- std::vector>())
- .def(py::init<
- std::string,
- std::string,
- std::string,
- std::string,
- std::string,
- bool,
- std::vector>>())
- .def("get_harvested_cfg", &DeviceConfig::get_harvested_cfg)
- .def("get_ethernet_connections", &DeviceConfig::get_ethernet_connections)
- .def("get_dram_backend_reserved_max", &DeviceConfig::get_dram_backend_reserved_max)
- .def("get_host_memory_channel_start_address", &DeviceConfig::get_host_memory_channel_start_address)
- .def("get_host_memory_num_channels", &DeviceConfig::get_host_memory_num_channels)
- .def("get_host_memory_channel_size", &DeviceConfig::get_host_memory_channel_size)
- .def_property_readonly(
- "arch", [](DeviceConfig const &dc) -> tt::ARCH { return get_arch_from_string(dc.arch_name); })
- .def_readonly("arch_name", &DeviceConfig::arch_name)
- .def_readonly("device_yaml", &DeviceConfig::device_yaml)
- .def_readonly("cluster_config_yaml", &DeviceConfig::cluster_config_yaml)
- .def_readonly("backend_type", &DeviceConfig::backend_type)
- .def_readonly("grid_size", &DeviceConfig::grid_size)
- .def_readonly("chip_ids", &DeviceConfig::chip_ids);
-}
-} // namespace tt::backend_api
diff --git a/pybuda/csrc/backend_api/backend_api.hpp b/pybuda/csrc/backend_api/backend_api.hpp
deleted file mode 100644
index 4bf536a33..000000000
--- a/pybuda/csrc/backend_api/backend_api.hpp
+++ /dev/null
@@ -1,16 +0,0 @@
-// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
-//
-// SPDX-License-Identifier: Apache-2.0
-#include
-#include
-#include
-namespace py = pybind11;
-
-namespace tt {
-namespace backend_api {
-
-void BackendModule(py::module &m_backend);
-
-}
-}
-
diff --git a/pybuda/csrc/backend_api/device_config.hpp b/pybuda/csrc/backend_api/device_config.hpp
index ada27b5d6..2ddf39681 100644
--- a/pybuda/csrc/backend_api/device_config.hpp
+++ b/pybuda/csrc/backend_api/device_config.hpp
@@ -12,6 +12,7 @@
#include
#include
+#include "arch_type.hpp"
#include "utils/assert.hpp"
#include "utils/env.hpp"
#include "utils/logger.hpp"
@@ -25,6 +26,7 @@ struct DeviceGrid
DeviceGrid(int r, int c) : r(r), c(c) {}
DeviceGrid(std::pair<int, int> p) : r(p.first), c(p.second) {}
+ int size() const { return r * c; }
};
struct CoreCoord
@@ -54,6 +56,7 @@ struct EthCoord
struct DeviceConfig
{
std::string arch_name;
+ ARCH arch;
std::string device_yaml;
std::string cluster_config_yaml;
std::string runtime_params_yaml;
@@ -92,12 +95,14 @@ struct DeviceConfig
store_backend_db_to_yaml(store_backend_db_to_yaml),
grid_size(get<DeviceGrid>("t6-grid_size", false))
{
+ arch = to_arch_type(arch_name);
+
// Constructor - used only by unittesting.
if (skip_backend_queries)
return;
// Get backend related parameters
- if (this->is_wormhole())
+ if (this->is_wormhole_b0())
{
// Load and cache system-level params if needed
if (this->backend_type == "silicon")
@@ -208,19 +213,25 @@ struct DeviceConfig
}
}
- inline bool is_grayskull() const { return arch_name.find("grayskull") != std::string::npos; }
- inline bool is_wormhole() const { return arch_name.find("wormhole") != std::string::npos; }
- inline bool is_wormhole_b0() const { return arch_name.find("wormhole_b0") != std::string::npos; }
+ // Get if the device is a blackhole
+ inline bool is_blackhole() const { return arch == ARCH::BLACKHOLE; }
+
+ // Get if the device is a wormhole_b0
+ // During the onboarding process of the blackhole architecture,
+ // we temporarily treat it as equivalent to the Wormhole_b0 architecture.
+ inline bool is_wormhole_b0() const { return arch == ARCH::WORMHOLE_B0 || is_blackhole(); }
+
+ // Get if the device is a grayskull
+ inline bool is_grayskull() const { return arch == ARCH::GRAYSKULL; }
template <typename T>
T get(std::string const &param, const bool system_level_command) const;
void load_system_level_params();
- std::vector<std::uint32_t> get_harvested_cfg() const;
+ std::unordered_map<std::uint32_t, std::uint32_t> get_harvested_cfg() const;
- std::size_t get_dst_size() const { return get<std::size_t>("t6-dst_size", false); }
std::size_t get_clock_freq() const
{
- return 1000000000; // tenstorrent/budabackend#1912
+ return 1000000000;
}
std::uint32_t get_host_memory_num_channels() const
{
@@ -246,27 +257,15 @@ struct DeviceConfig
static size_t overlay_blob_extra_size = env_as<size_t>("TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE", 0);
return overlay_blob_extra_size;
}
- std::size_t get_l1_backend_reserved_size() const
- {
- // BBE will account for extra blob size (TT_BACKEND_OVERLAY_MAX_EXTRA_BLOB_SIZE) in the reserved size
- //
- auto reserved_size = get<std::uint32_t>("t6-l1_backend_reserved_size", false);
- static auto extra_l1_margin = env_as<int>("PYBUDA_EXTRA_L1_MARGIN");
- if (reserved_size < (std::uint32_t)extra_l1_margin)
- return 0;
-
- return reserved_size - extra_l1_margin;
- }
- std::size_t get_l1_usable_size() const { return get_l1_size() - get_l1_backend_reserved_size(); }
+ std::size_t get_l1_usable_size() const { return get_l1_size(); }
std::size_t get_l1_dram_io_backend_reserved_size() const
{
// Get this number from DB query:
- // tenstorrent/budabackend#1979
return 100 * 1024;
}
std::size_t get_noc_bandwidth_bytes_per_cycle() const
{
- return 32; // tenstorrent/budabackend#1912
+ return 32;
}
std::uint32_t get_dram_num_channels() const { return get<std::uint32_t>("dram-num_channels", false); }
std::uint32_t get_dram_num_subchannels() const
@@ -297,7 +296,8 @@ struct DeviceConfig
}
CoreCoord get_dram_core_coord(std::uint32_t channel, std::uint32_t subchannel) const
{
- if (is_grayskull())
+ // Emulation device has only one dram channel
+ if (is_grayskull() || this->backend_type == "emulation")
{
return get("dram-core_xy_chan" + std::to_string(channel), false);
}
@@ -440,13 +440,6 @@ inline std::ostream& operator<<(std::ostream& os, DeviceConfig const& device_con
os << indent << ".runtime_params_yaml = " << device_config.runtime_params_yaml << "," << std::endl;
os << indent << ".grid_size = {" << device_config.grid_size.r << ", " << device_config.grid_size.c << "}"
<< "," << std::endl;
- os << indent << ".get_dst_size = " << device_config.get_dst_size() << "," << std::endl;
- os << indent << ".get_l1_size = " << device_config.get_l1_size() << "," << std::endl;
- os << indent << ".get_l1_backend_reserved_size = " << device_config.get_l1_backend_reserved_size() << ","
- << std::endl;
- os << indent << ".get_l1_usable_size = " << device_config.get_l1_usable_size() << "," << std::endl;
- os << indent << ".get_dram_num_channels = " << device_config.get_dram_num_channels() << "," << std::endl;
- os << indent << ".get_dram_channel_capacity = " << device_config.get_dram_channel_capacity() << "," << std::endl;
os << indent << ".supports_fp32_accumulation = " << device_config.supports_fp32_accumulation() << "," << std::endl;
os << indent << ".supports_stochastic_rounding = " << device_config.supports_stochastic_rounding() << ","
<< std::endl;
diff --git a/pybuda/csrc/backend_api/module.mk b/pybuda/csrc/backend_api/module.mk
index b2f79cfcf..4605c1a95 100644
--- a/pybuda/csrc/backend_api/module.mk
+++ b/pybuda/csrc/backend_api/module.mk
@@ -1,21 +1,9 @@
# Every variable in subdir must be prefixed with subdir (emulating a namespace)
-BACKEND_CONFIG ?= release
-BACKEND_ARCH_NAME ?= grayskull
-BACKEND_CC ?= gcc
-BACKEND_CXX ?= g++
-
-BACKEND_INCLUDES = -Ithird_party/budabackend
-
-BUDABACKEND_LIBDIR = third_party/budabackend/build/lib
-BUDABACKEND_LIB = $(BUDABACKEND_LIBDIR)/libtt.so
-BUDABACKEND_DEVICE = $(BUDABACKEND_LIBDIR)/libdevice.so
-BUDABACKEND_NET2PIPE = third_party/budabackend/build/bin/net2pipe
-BUDABACKEND_PIPEGEN = third_party/budabackend/build/bin/pipegen2
-
PYBUDA_CSRC_BACKENDAPI_LIB = $(LIBDIR)/libbackend_api.a
PYBUDA_CSRC_BACKENDAPI_SRCS += \
- pybuda/csrc/backend_api/backend_api.cpp
+ pybuda/csrc/backend_api/backend_api.cpp \
+ pybuda/csrc/backend_api/arch_type.cpp
PYBUDA_CSRC_BACKENDAPI_INCLUDES = $(PYBUDA_CSRC_INCLUDES) $(BACKEND_INCLUDES)
@@ -24,32 +12,10 @@ PYBUDA_CSRC_BACKENDAPI_DEPS = $(addprefix $(OBJDIR)/, $(PYBUDA_CSRC_BACKENDAPI_S
-include $(PYBUDA_CSRC_BACKENDAPI_DEPS)
-third_party/budabackend/clean:
- #ROOT=third_party/budabackend $(MAKE) -C third_party/budabackend clean
- cd third_party/budabackend; rm -rf build
-
-third_party/budabackend: $(SUBMODULESDIR)/third_party/budabackend.build ;
-
-DEVICE_VERSIM_INSTALL_ROOT ?= third_party/budabackend
-$(SUBMODULESDIR)/third_party/budabackend.build: $(SUBMODULESDIR)/third_party/budabackend.checkout
- CC=$(BACKEND_CC) CXX=$(BACKEND_CXX) CONFIG=$(BACKEND_CONFIG) ARCH_NAME=$(BACKEND_ARCH_NAME) DEVICE_VERSIM_INSTALL_ROOT=$(DEVICE_VERSIM_INSTALL_ROOT) ROOT=$(PWD)/third_party/budabackend $(MAKE) -C third_party/budabackend backend build_hw
- touch $@
-
-.PHONY: third_party/budabackend/netlist_analyzer
-third_party/budabackend/netlist_analyzer:
- CONFIG=$(BACKEND_CONFIG) ARCH_NAME=$(BACKEND_ARCH_NAME) DEVICE_VERSIM_INSTALL_ROOT=$(DEVICE_VERSIM_INSTALL_ROOT) ROOT=$(PWD)/third_party/budabackend $(MAKE) -C third_party/budabackend netlist_analyzer/tests
-
-$(BUDABACKEND_DEVICE): third_party/budabackend ;
-$(BUDABACKEND_LIB): third_party/budabackend ;
-$(BUDABACKEND_NET2PIPE): third_party/budabackend ;
-$(BUDABACKEND_PIPEGEN): third_party/budabackend ;
-
-third_party/budabackend/src/net2pipe: $(BUDABACKEND_NET2PIPE) $(BUDABACKEND_PIPEGEN) ;
-
# Each module has a top level target as the entrypoint which must match the subdir name
-pybuda/csrc/backend_api: $(PYBUDA_CSRC_BACKENDAPI_LIB) $(BUDABACKEND_LIB) $(BUDABACKEND_DEVICE) ;
+pybuda/csrc/backend_api: $(PYBUDA_CSRC_BACKENDAPI_LIB) $(PYBUDA_CSRC_SHARED_UTILS_LIB) ;
-$(PYBUDA_CSRC_BACKENDAPI_LIB): $(PYBUDA_CSRC_BACKENDAPI_OBJS) $(BUDABACKEND_LIB) $(BUDABACKEND_DEVICE)
+$(PYBUDA_CSRC_BACKENDAPI_LIB): $(PYBUDA_CSRC_BACKENDAPI_OBJS)
@mkdir -p $(LIBDIR)
ar rcs $@ $^
diff --git a/pybuda/csrc/balancer/balancer.cpp b/pybuda/csrc/balancer/balancer.cpp
deleted file mode 100644
index b58f67d36..000000000
--- a/pybuda/csrc/balancer/balancer.cpp
+++ /dev/null
@@ -1,652 +0,0 @@
-// SPDX-FileCopyrightText: © 2024 Tenstorrent AI ULC
-//
-// SPDX-License-Identifier: Apache-2.0
-#include "balancer/balancer.hpp"
-
-#include
-#include
-#include
-#include
-#include
-
-#include "balancer/balancer_cache_collection.hpp"
-#include "balancer/legalizer/legalizer.hpp"
-#include "balancer/policies/policies.hpp"
-#include "balancer/policies/policy_types.hpp"
-#include "balancer/policies/policy_utils.hpp"
-#include "balancer/python_interface.hpp"
-#include "graph_lib/node_types.hpp"
-#include "passes/passes_utils.hpp"
-#include "placer/epoch_placer.hpp"
-#include "placer/placer.hpp"
-#include "python_bindings_common.hpp"
-
-using NodeType = tt::graphlib::NodeType;
-
-namespace tt::balancer
-{
-
-std::ostream& operator<<(std::ostream& stream, PolicyType policy_type)
-{
- switch (policy_type)
- {
- case PolicyType::MaximizeTMinimizeGrid: stream << "PolicyType::MaximizeTMinimizeGrid"; break;
- case PolicyType::MinimizeGrid: stream << "PolicyType::MinimizeGrid"; break;
- case PolicyType::Random: stream << "PolicyType::Random"; break;
- case PolicyType::NLP: stream << "PolicyType::NLP"; break;
- case PolicyType::CNN: stream << "PolicyType::CNN"; break;
- case PolicyType::Ribbon: stream << "PolicyType::Ribbon"; break;
- default: stream << "PolicyType::Unknown"; break;
- }
- return stream;
-}
-
-std::ostream& operator<<(std::ostream& os, BalancerConfig const& config)
-{
- os << "BalancerConfig{" << std::endl;
- os << " .device_config.arch_name = " << config.device_config.arch_name << std::endl;
- os << " .device_config.device_yaml = " << config.device_config.device_yaml << std::endl;
- os << " .policy_type = " << config.policy_type << std::endl;
- os << " .random_policy_seed = " << config.random_policy_seed << std::endl;
- os << " .num_chips = " << config.chip_ids.size() << std::endl;
- os << " .skip_l1_usage_validation = " << config.skip_l1_usage_validation << std::endl;
- os << " .enable_t_streaming = " << config.enable_t_streaming << std::endl;
- os << "}";
- return os;
-}
-
-static std::vector<graphlib::Edge> cut_edges(BalancerConfig const& config, graphlib::Graph const* graph)
-{
- // Prevent duplicates coming from config.
- //
- std::unordered_set<graphlib::Edge> edges;
-
- for (auto const& epoch : config.op_names_to_epoch_break)
- {
- for (auto const& op_name : epoch)
- {
- if (not graph->has_node_with_name(op_name))
- continue;
- graphlib::Node* consumer = graph->get_node_by_name(op_name);
- for (auto edge : graph->operand_data_edges(consumer))
- {
- if (graph->node_by_id(edge.producer_node_id)->node_type() != NodeType::kBudaOp)
- continue;
-
- if (edges.count(edge) == 0)
- {
- edges.insert(edge);
- }
- }
- }
- }
-
- return std::vector<graphlib::Edge>(edges.begin(), edges.end());
-}
-
-legalizer::GraphSolver get_graph_solver(
- BalancerConfig const& config,
- std::shared_ptr<BalancerCacheCollection> cache_collection,
- graphlib::Graph* graph,
- LegalOpModels const& legal_op_models,
- bool use_op_model_recalculation)
-{
- if (config.device_config.is_grayskull())
- {
- return legalizer::GraphSolver::create(
- graph, legal_op_models, config, cache_collection, use_op_model_recalculation, cut_edges(config, graph));
- }
- else if (config.device_config.is_wormhole())
- {
- return legalizer::GraphSolver::create(
- graph, legal_op_models, config, cache_collection, use_op_model_recalculation, cut_edges(config, graph));
- }
-
- log_fatal("Unknown device arch name: {}", config.device_config.arch_name);
-}
-
-static void add_broadcasts_for_sparse_inputs_0_2(
- graphlib::Graph const* graph, graphlib::Node const* node, OpModel const& op_model)
-{
- // If fracture factor > 1, inputs have already been properly sized to target eltwise pipes scenario
- // The reason they can't be handled here as well is that fractured convs have different inputs 0 and
- // 2 for each c-dim core
- if (op_model.fracture_factor > 1)
- {
- return;
- }
-
- // No need to broadcast in this case
- if (op_model.grid_shape.c == 1)
- {
- return;
- }
-
- std::vector<graphlib::Edge> in_edges = graph->operand_data_edges(node);
- TT_ASSERT(in_edges.size() == 3 || in_edges.size() == 4); // 4 with bias
-
- std::shared_ptr<graphlib::EdgeAttributes> input0_edge_attrs = graph->get_edge_attributes(in_edges[0]);
- std::shared_ptr<graphlib::EdgeAttributes> input2_edge_attrs = graph->get_edge_attributes(in_edges[2]);
-
- std::vector<graphlib::OpType> input0_edge_tms = input0_edge_attrs->get_tms();
- std::vector<graphlib::OpType> input2_edge_tms = input2_edge_attrs->get_tms();
-
- // Append tms
- tt::graphlib::OpType broadcast_to_append = graphlib::OpType("broadcast", {3, op_model.grid_shape.c, true}, {});
-
- input0_edge_tms.push_back(broadcast_to_append);
- input2_edge_tms.push_back(broadcast_to_append);
-
- input0_edge_attrs->set_tms(input0_edge_tms);
- input2_edge_attrs->set_tms(input2_edge_tms);
-}
-
-static void insert_sparse_fracturing_tms(
- graphlib::Graph const* graph, graphlib::Node const* node, OpModel const& op_model)
-{
- // Out
- std::vector<graphlib::Edge> out_edges = graph->user_data_edges(node);
- TT_ASSERT(out_edges.size() == 1);
- std::shared_ptr<graphlib::EdgeAttributes> out_edge_attrs = graph->get_edge_attributes(out_edges[0]);
- std::vector<graphlib::OpType> out_edge_tms = out_edge_attrs->get_tms();
-
- std::vector<graphlib::OpType> prepend_tms = {
- // Instead of adding hslice(f) + vstack(f), we could just add an hslice(f) and divide the existing vslice's
- // split factor by f (where f = fracture_factor), but the optimize_tms() pass does this for us
- graphlib::OpType("hslice", {op_model.fracture_factor}, {}),
- graphlib::OpType("vstack", {op_model.fracture_factor}, {})};
- out_edge_tms.insert(out_edge_tms.begin(), prepend_tms.begin(), prepend_tms.end());
- out_edge_attrs->set_tms(out_edge_tms);
-
- // In1
- std::vector<graphlib::Edge> in_edges = graph->operand_data_edges(node);
- TT_ASSERT(in_edges.size() == 3);
- std::shared_ptr<graphlib::EdgeAttributes> in1_edge_attrs = graph->get_edge_attributes(in_edges[1]);
- std::vector<graphlib::OpType> in1_edge_tms = in1_edge_attrs->get_tms();
-
- std::vector<graphlib::OpType> append_tms = {
- graphlib::OpType("broadcast", {3, op_model.fracture_factor, true}, {})};
- in1_edge_tms.insert(in1_edge_tms.end(), append_tms.begin(), append_tms.end());
- in1_edge_attrs->set_tms(in1_edge_tms);
-}
-
-static void insert_sparse_buffer_op_tms(
- Graph const* graph, graphlib::OpNode* op, balancer::OpModel const& op_model, int fracture_factor)
-{
- TT_ASSERT(op_model.has_sparse_buffer());
- TT_ASSERT(fracture_factor == 1);
-
- if (op_model.grid_shape.r == 1)
- return;
-
- auto users = graph->user_data_edges(op);
- graphlib::BudaOpNode* user = dynamic_cast<graphlib::BudaOpNode*>(graph->node_by_id(users[0].consumer_node_id));
- bool is_reduce_z = user->op_name() == "reduce" and std::get<std::string>(user->buda_attrs().at("dim")) == "z";
- bool is_matmul = user->is_matmul() and not user->is_sparse_matmul();
-
- if (not is_reduce_z and not is_matmul)
- return;
-
- TT_ASSERT(users.size() == 1, "Unsupported, multiple users with different views");
-
- auto edge_attrs = graph->get_edge_attributes(users[0]);
- auto& tms = edge_attrs->get_tms();
- if (tms.empty())
- return;
-
- TT_ASSERT(not is_matmul or tms.size() == 2, op->name(), user->name(), tms.size());
- TT_ASSERT(not is_reduce_z or tms.size() == 1, op->name(), user->name(), tms.size());
- if (is_matmul)
- {
- TT_ASSERT(tms[0].op == "vslice");
- TT_ASSERT(tms[1].op == "hstack");
- int vslice_factor = std::get<int>(tms[0].attr[0]);
- int hstack_factor = std::get<int>(tms[1].attr[0]);
- TT_ASSERT(vslice_factor == hstack_factor);
- TT_ASSERT(vslice_factor > 1);
- int factor = vslice_factor;
- int grid_r = op_model.grid_shape.r;
- tms.clear();
- tms.push_back(graphlib::OpType("vslice", {grid_r * factor}, {}));
- tms.push_back(graphlib::OpType("hstack", {factor}, {}));
- tms.push_back(graphlib::OpType("vstack", {grid_r}, {}));
- }
- else if (is_reduce_z)
- {
- TT_ASSERT(false, "This path results in illegal tms, cannot have slice after stack");
- TT_ASSERT(tms[0].op == "vslice");
- int vslice_factor = std::get<int>(tms[0].attr[0]);
- TT_ASSERT(vslice_factor > 1);
- int factor = vslice_factor;
- int grid_r = op_model.grid_shape.r;
- tms.clear();
- tms.push_back(graphlib::OpType("vslice", {grid_r * factor}, {}));
- tms.push_back(graphlib::OpType("hstack", {factor}, {}));
- tms.push_back(graphlib::OpType("vstack", {grid_r}, {}));
- tms.push_back(graphlib::OpType("hslice", {factor}, {}));
- }
-}
-
-// Layout dataflow reorders the output buffer of sparse matmul in a way
-// such that each row of cores between a sparse/consumer pair has a 1to1
-// mapping of tiles and avoids inefficient gathers. This function erases
-// the existing TMs along this path and replaces them with "per row core"
-// equivalent set of TMs. This often results in more complicated TMs, but
-// much simpler pipes
-static void insert_sparse_dataflow_tms(
- graphlib::Graph const* graph, graphlib::Node const* node, OpModel const& op_model)
-{
- for (Edge user : graph->user_data_edges(node))
- {
- auto& tms = graph->get_edge_attributes(user)->get_tms();
- TT_ASSERT(tms.size() >= 1 and tms.size() <= 3);
-
- bool needs_stack = false;
- int row_slice = op_model.grid_shape.r * op_model.block_shape().rt();
- bool backwards = tms.front().op == "transpose";
- int factor = 0;
-
- if (backwards)
- {
- TT_ASSERT(tms.size() >= 2);
- TT_ASSERT(tms[1].op == "hslice");
- factor = std::get<int>(tms[1].attr[0]);
- if (tms.size() > 2)
- {
- TT_ASSERT(tms[2].op == "vstack");
- TT_ASSERT(factor == std::get<int>(tms[2].attr[0]));
- needs_stack = true;
- }
- }
- else
- {
- TT_ASSERT(tms[0].op == "vslice");
- factor = std::get<int>(tms[0].attr[0]);
- if (tms.size() > 1)
- {
- TT_ASSERT(tms[1].op == "hstack");
- TT_ASSERT(factor == std::get<int>(tms[1].attr[0]));
- needs_stack = true;
- }
- }
- TT_ASSERT(factor > 1);
- TT_ASSERT(row_slice > 1);
-
- tms.clear();
-
- if (backwards)
- {
- tms.push_back(graphlib::OpType("transpose", {}, {}, {{"dim0", 2}, {"dim1", 3}, {"z_dim_slice", -1}}));
- tms.push_back(graphlib::OpType("hslice", {row_slice}, {}));
- tms.push_back(graphlib::OpType("vstack", {factor}, {}));
- tms.push_back(graphlib::OpType("hstack", {row_slice / factor}, {}));
- if (not needs_stack)
- tms.push_back(graphlib::OpType("vslice", {factor}, {}));
- }
- else
- {
- tms.push_back(graphlib::OpType("vslice", {row_slice}, {}));
- tms.push_back(graphlib::OpType("hstack", {factor}, {}));
- tms.push_back(graphlib::OpType("vstack", {row_slice / factor}, {}));
- if (not needs_stack)
- tms.push_back(graphlib::OpType("hslice", {factor}, {}));
- }
- optimize_tms(tms);
- }
-}
-
-void print_perf_input_data(
- tt::sparse::EncodingTiles const& buda_indices_all_rows, int sparse_tile_ptr_bits, balancer::OpModel const& op_model)
-{
- constexpr int TILE_DIM = tt::sparse::TILE_DIM;
- using IndexType = std::remove_extent_t;
- const int ublock_tile_index_bytes = 16 - sparse_tile_ptr_bits;
- const int grid_r = buda_indices_all_rows.size();
-
- fmt::print("~~ Node: {}\n", op_model.buda_op_node->name());
- fmt::print("~~~~ t: {}\n", op_model.t_stream_factor.t());
- fmt::print("~~~~ mblock_m: {}\n", op_model.block_shape().mblock_m);
- fmt::print("~~~~ mblock_n: {}\n", op_model.block_shape().mblock_n);
- fmt::print("~~~~ ublock_rt: {}\n", op_model.ublock_shape().rt);
- fmt::print("~~~~ ublock_ct: {}\n", op_model.ublock_shape().ct);
- fmt::print("~~~~ u_kt: {}\n", op_model.input_buffers[0].block_shape.ublock.ct);
- fmt::print(
- "~~~~ m_k (total strips): {}\n",
- op_model.op_shape.inputs[1].rt / op_model.input_buffers[1].block_shape.ublock.rt);
-
- for (int curr_r = 0; curr_r < grid_r; curr_r++)
- {
- std::vector<std::uint32_t> buda_indices = buda_indices_all_rows[curr_r];
- std::uint8_t const* base_ptr = reinterpret_cast<std::uint8_t const*>(buda_indices.data());
- TT_ASSERT((int)buda_indices.size() % (TILE_DIM * TILE_DIM) == 0);
-
- int cnt_nz_tiles = 0;
- int cnt_nz_ublocks = 0;
- int cnt_nz_strips = 0;
-
- for (int tile_id = 0; tile_id < (int)(buda_indices.size() / (TILE_DIM * TILE_DIM)); ++tile_id)
- {
- tt::sparse::strip_info_struct const* info = reinterpret_cast<tt::sparse::strip_info_struct const*>(
- base_ptr + tile_id * (TILE_DIM * TILE_DIM * sizeof(std::uint32_t)));
-
- bool done = false;
- while (not done)
- {
- if (info->f.nz_ublocks > 0)
- {
- cnt_nz_strips++;
- }
-
- cnt_nz_ublocks += info->f.nz_ublocks;
-
- int i = 0;
- for (int ublock_i = 0; ublock_i < info->f.nz_ublocks; ++ublock_i)
- {
- IndexType encoded = info->f.index_array[i++];
- IndexType nz_tiles_in_ublock = encoded >> sparse_tile_ptr_bits;
- nz_tiles_in_ublock =
- (nz_tiles_in_ublock == 0u) ? (1u << ublock_tile_index_bytes) : nz_tiles_in_ublock;
- cnt_nz_tiles += nz_tiles_in_ublock;
- i += nz_tiles_in_ublock;
- }
-
- done = info->f.last_strip_in_tile;
- info = reinterpret_cast<tt::sparse::strip_info_struct const*>(
- reinterpret_cast<std::uint8_t const*>(info) + sizeof(tt::sparse::strip_info_struct) +
- i * sizeof(IndexType));
- }
- }
-
- fmt::print("~~~~~~ grid_r {}\n", curr_r);
- fmt::print("~~~~~~~~ cnt_nz_tiles: {}\n", cnt_nz_tiles);
- fmt::print("~~~~~~~~ cnt_nz_ublocks: {}\n", cnt_nz_ublocks);
- fmt::print("~~~~~~~~ cnt_nz_strips: {}\n", cnt_nz_strips);
- }
-}
-
-void update_ops_on_selected_op_models(graphlib::Graph const* graph, OpModels const& op_models)
-{
- for (Node* node : graph->nodes())
- {
- if (node->node_type() == NodeType::kBudaOp)
- {
- graphlib::OpNode* op = node->as<graphlib::OpNode>();
- graphlib::OpType type = op->op_type();
- if (op->is_sparse_matmul())
- {
- TT_LOG_ASSERT(op_models.count(node) > 0, "Missing op model for node: {}", node->name());
- balancer::OpModel op_model = op_models.at(node);
-
- int grid_r = op_model.grid_shape.r;
- int u_rt = op_model.output_buffers[0].block_shape.ublock.rt;
- int u_kt = op_model.input_buffers[1].block_shape.ublock.rt;
- int u_ct = op_model.output_buffers[0].block_shape.ublock.ct;
- bool has_buffer_op = op_model.has_sparse_buffer();
- bool force_buffer_op_layout = env_as<bool>("PYBUDA_FORCE_SPARSE_BUFFER_LAYOUT");
- bool buffer_op_layout = has_buffer_op or force_buffer_op_layout;
- const sparse::SparseBUDA& sparse_buda =
- graph->data_operands(node)[0]->as<graphlib::ConstantInputNode>()->get_sparse_buda();
- auto layout = sparse::SparseBUDA::create_layout(
- buffer_op_layout, op_model.t_stream_factor.dir.z_major(), op_model.fracture_factor);
-
- std::string visualize_sparse_path =
- env_as("PYBUDA_VISUALIZE_SPARSE") ? "sparse_" + op->name() + ".png" : "";
- auto [sparse, encodings, sparse_s, encodings_s, num_strips_per_row] =
- sparse_buda.get_sparse_tiles_and_encodings(
- grid_r,
- op_model.t_stream_factor.r,
- op_model.t_stream_factor.c,
- u_rt,
- u_kt,
- op_model.fracture_factor,
- layout,
- visualize_sparse_path);
- int sparse_tile_ptr_bits =
- sparse_buda.get_sparse_tile_ptr_bits(grid_r, op_model.t_stream_factor.r, u_rt);
- int sparse_ublock_idx_bits =
- sparse_buda.get_sparse_ublock_idx_bits(grid_r, op_model.t_stream_factor.r, u_rt);
-
- if (env_as("PYBUDA_SPARSE_PRINT_PERF_INPUT_DATA"))
- {
- print_perf_input_data(encodings, sparse_tile_ptr_bits, op_model);
- }
-
- auto sparse_utils_module = py::module_::import("pybuda.op.eval.sparse_utils");
- py::function shapeify = sparse_utils_module.attr("shapeify_sparse_tiles_and_encodings");
-
- // Overwrite input tensors
- auto [sp, enc] = shapeify(sparse, encodings, grid_r, op_model.fracture_factor)
- .cast<std::pair<py::object, py::object>>();
- graphlib::ConstantInputNode* cin0 = graph->data_operands(node)[0]->as<graphlib::ConstantInputNode>();
- graphlib::ConstantInputNode* cin2 = graph->data_operands(node)[2]->as<graphlib::ConstantInputNode>();
- cin0->set_tensor_handle(make_shared_py_object(sp));
- cin2->set_tensor_handle(make_shared_py_object(enc));
-
- // tenstorrent/budabackend#1234
- // tenstorrent/pybuda#504
- // Due to BBE-imposed constraint, we can't have more that 2 operands multicasting
- // BBE changed behavior so that inputs 0&2 use eltwise style pipes instead of row multicast
- // On FE side, we add a broadcast in c-dim to account for this diff
- add_broadcasts_for_sparse_inputs_0_2(graph, node, op_model);
-
- // Overwrite op attributes
- auto op_attrs = op->op_attrs();
- TT_ASSERT(op_attrs.size() == 15);
- op_attrs[2] = sparse_tile_ptr_bits;
- op_attrs[7] = op_model.fracture_factor;
- op_attrs[8] = u_rt;
- op_attrs[9] = u_kt;
- op_attrs[10] = u_ct;
- op_attrs[11] = op_model.grid_shape.c;
- op_attrs[12] = op_model.t_stream_factor.r;
- op_attrs[13] = op_model.t_stream_factor.c;
- op_attrs[14] = sparse_ublock_idx_bits;
- op->overwrite_op_attrs(op_attrs);
-
- // Overwrite buda attributes
- auto buda_attrs = op->buda_attrs();
- buda_attrs["num_sparse_tiles"] =
- static_cast<int>(sparse_s[3] / sparse::TILE_DIM / op_model.fracture_factor);
- buda_attrs["num_index_tiles"] =
- static_cast<int>(encodings_s[3] / sparse::TILE_DIM / op_model.fracture_factor);
- buda_attrs["sparse_tile_ptr_bits"] = sparse_tile_ptr_bits;
- buda_attrs["sparse_ublock_idx_bits"] = sparse_ublock_idx_bits;
- buda_attrs["fracture_factor"] = op_model.fracture_factor;
- if (has_buffer_op)
- {
- TT_ASSERT((op_model.grid_shape.c % 2) == 0);
- std::vector num_nz_strips;
- int grid_c = op_model.grid_shape.c / 2;
- int grid_volume = op_model.grid_shape.r * grid_c;
- num_nz_strips.resize(grid_volume);
- for (int i = 0; i < grid_volume; ++i) num_nz_strips[i] = num_strips_per_row[i / grid_c];
- buda_attrs["num_nz_strips"] = num_nz_strips;
- buda_attrs["act_buffered"] = true;
- }
- op->overwrite_buda_attrs(buda_attrs);
-
- // Overwrite op attributes
- auto attr = op->op_attrs();
- std::get<int>(attr[2]) = sparse_tile_ptr_bits;
- op->overwrite_op_attrs(attr);
-
- log_trace(LogBalancer, " Sparse node {}:", node->name());
- log_trace(LogBalancer, " {}", op_model.grid_shape);
- log_trace(
- LogBalancer, " Num sparse tiles per core: {}:", std::get(buda_attrs["num_sparse_tiles"]));
- log_trace(
- LogBalancer, " Num index tiles per core: {}:", std::get(buda_attrs["num_index_tiles"]));
-
- graph->data_operands(node)[0]->set_shape(graphlib::Shape::create_buda(sparse_s));
- graph->data_operands(node)[2]->set_shape(graphlib::Shape::create_buda(encodings_s));
-
- if (op_model.fracture_factor > 1)
- {
- // Update node shape to account for fracture factor
- tt::graphlib::Shape shape = node->shape().canonical();
- tt::graphlib::Shape new_shape = graphlib::Shape::create_buda(
- shape.as_vector()[0],
- shape.as_vector()[1],
- shape.as_vector()[2] / op_model.fracture_factor,
- shape.as_vector()[3] * op_model.fracture_factor);
- TT_ASSERT(shape.volume() == new_shape.volume());
- node->set_shape(new_shape);
-
- // Insert tms to account for fracture factor
- insert_sparse_fracturing_tms(graph, node, op_model);
- }
-
- log_trace(LogBalancer, "Sparse layout {}: {}", op->name(), layout);
- switch (layout)
- {
- case sparse::SparseBUDA::Layout::BufferOp:
- {
- insert_sparse_buffer_op_tms(graph, op, op_model, op_model.fracture_factor);
- break;
- }
- case sparse::SparseBUDA::Layout::ZMajorDataflow:
- {
- TT_ASSERT(op_model.fracture_factor == 1);
- insert_sparse_dataflow_tms(graph, op, op_model);
- break;
- }
- default:
- {
- break;
- }
- }
- }
- else if (type.op == "embedding")
- {
- balancer::OpModel const& op_model = op_models.at(node);
-
- auto* embedding_table = graph->data_operands(op)[0]->as();
- embedding_table->set_layout(BudaQueueLayout::Flat);
-
- // Reconfigure shape for raw tilized layout
- auto* embedding_indices = graph->data_operands(op)[1]->as();
- auto indices_shape = embedding_indices->shape();
- TT_ASSERT(indices_shape[-2] == graphlib::Shape::BUDA_TILE_DIM);
- indices_shape[-2] = indices_shape[-2] * op_model.grid_shape.r;
- indices_shape[-1] = graphlib::align_up_tile(
- indices_shape[-1] / (op_model.grid_shape.r * graphlib::Shape::BUDA_TILE_DIM));
- embedding_indices->set_shape(indices_shape);
-
- // Convert num_indices to be per core
- int num_indices = std::get<int>(type.buda_attrs.at("num_indices"));
- num_indices = graphlib::align_up_tile(num_indices);
- TT_ASSERT(num_indices % op_model.grid_shape.r == 0);
- std::get<int>(type.buda_attrs.at("num_indices")) = num_indices / op_model.grid_shape.r;
-
- op->change_op_type(type);
- }
- else if (type.op == "dropout")
- {
- // Overwrite op attributes
- TT_LOG_ASSERT(op_models.count(node) > 0, "Missing op model for node: {}", node->name());
- balancer::OpModel op_model = op_models.at(node);
-
- auto attr = op->op_attrs();
- attr[5] = op_model.t_stream_factor.r;
- attr[6] = op_model.t_stream_factor.c;
- attr[7] = op_model.t_stream_factor.dir.r();
- attr[8] = op_model.t_stream_factor.dir.z_major();
- op->overwrite_op_attrs(attr);
- }
- else if (type.op == "splice")
- {
- // Update op attributes
- TT_LOG_ASSERT(op_models.count(node) > 0, "Missing op model for node: {}", node->name());
- balancer::OpModel op_model = op_models.at(node);
- graphlib::UBlockOrder ublock_order = get_output_ublock_order(graph, node);
- op->py_attr(
- "update_ranges",
- (ublock_order == graphlib::UBlockOrder::R), // ublock_is_row_order
- op_model.ublock_shape().rt,
- op_model.ublock_shape().ct,
- op_model.grid_shape.r,
- op_model.grid_shape.c,
- op_model.t_stream_factor.r,
- op_model.t_stream_factor.c);
- }
- else if (type.op == "tilizer")
- {
- auto* input = graph->data_operands(op)[0]->as();
- input->set_layout(BudaQueueLayout::Flat);
- }
- }
- }
-}
-
-static void insert_input_queues(
- placer::PlacerSolution& placer_solution, const Graph* graph, const OpModelMap& op_models)
-{
- // Add input queues to the placer solution
- for (auto [node_name, op_model] : op_models)
- {
- Node* node = graph->get_node_by_name(node_name);
- switch (node->node_type())
- {
- case NodeType::kInput:
- {
- placer_solution.input_queue_to_grid_shape.insert(
- {node_name,
- tt::placer::GridShape(
- (std::uint32_t)op_model.grid_shape.r, (std::uint32_t)op_model.grid_shape.c)});
- break;
- }
- default: break;
- }
- }
-}
-
-static std::tuple balancer_passes(
- Graph* graph,
- BalancerConfig& config,
- std::shared_ptr<BalancerCacheCollection> cache_collection,
- std::optional<placer::PlacerSolution>& placer_solution)
-{
- log_debug(LogBalancer, "{}", config);
- LegalOpModels valid_op_models = legalizer::get_legal_op_models(graph, config, cache_collection);
-
- auto graph_solver = get_graph_solver(config, cache_collection, graph, valid_op_models);
-
- legalizer::GraphSolverSolution graph_solver_solution = run_policy(graph, config, graph_solver, placer_solution);
-
- update_ops_on_selected_op_models(graph, graph_solver_solution.selected_op_models);
-
- auto ret = legalizer::resolve_block_shapes(graph, config, graph_solver_solution);
-
- if (placer_solution.has_value())
- insert_input_queues(placer_solution.value(), graph, std::get<0>(ret));
-
- return ret;
-}
-
-std::shared_ptr<BalancerSolution> run_balancer_and_placer(
- Graph* graph, BalancerConfig& config, std::shared_ptr<BalancerCacheCollection> cache_collection)
-{
- log_info("Running Balancer with Policy: {}", config.policy_type);
- PROFILE_SCOPE();
-
- // New epoch-by-epoch placement loop
- if (config.epoch_by_epoch)
- return placer::run_epoch_placer(&graph, config, cache_collection);
-
- std::optional<placer::PlacerSolution> opt_placer_solution = std::nullopt;
- auto const& [op_models, block_shape_map, output_host_tms, cut_edges] =
- balancer_passes(graph, config, cache_collection, opt_placer_solution);
-
- TT_ASSERT(
- graph->virtual_node_count() == 0,
- "After balancer passes are complete we should not have virtual nodes in graph anymore.");
-
- auto placer_solution =
- opt_placer_solution.has_value() ? opt_placer_solution.value() : run_placer(graph, config, op_models);
- dump_balancer_placer_data(
- graph, config.chip_ids, placer_solution, op_models, std::cout, config.device_config.arch_name);
-
- return std::make_shared