From 67b1500622fd20e691ace256d77fe1facecec6af Mon Sep 17 00:00:00 2001 From: makslevental Date: Mon, 23 Sep 2024 13:14:33 -0400 Subject: [PATCH 01/19] [WIP] use python bindings for tests --- .github/workflows/ci-linux.yml | 13 +++++ .github/workflows/ci-windows.yml | 18 ++++-- build_tools/build_test_cpp.ps1 | 1 + build_tools/build_test_cpp.sh | 1 + cmake/iree_aie_bootgen.cmake | 10 ++-- cmake/iree_aie_rt.cmake | 21 +++---- iree_compiler_plugin.cmake | 1 + .../iree-amd-aie/aie_runtime/test/utest.cc | 1 - samples/smol_matmul.py | 56 +++++++++++++++++++ 9 files changed, 99 insertions(+), 23 deletions(-) create mode 100644 samples/smol_matmul.py diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index c3beeb267..00b295edd 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -57,6 +57,10 @@ jobs: yum remove -y openssl-devel zlib-devel || true yum install -y protobuf-devel protobuf-compiler tmate + - uses: actions/setup-python@v4 + with: + python-version: '3.10' + - name: Python deps run: | pip install "numpy<2" pyyaml "pybind11[global]==2.10.3" nanobind pytest @@ -136,7 +140,10 @@ jobs: - name: Extract artifact run: | tar -xvf iree-dist-linux.tar + echo "IREE_INSTALL_DIR=$PWD/iree-install" >> $GITHUB_ENV + echo "PYTHONPATH=$PWD/iree-install/python_packages/iree_compiler:$PWD/iree-install/python_packages/iree_runtime" >> $GITHUB_ENV bash build_tools/download_peano.sh + echo "PEANO_INSTALL_DIR=$PWD/llvm-aie" >> $GITHUB_ENV - name: Create venv and install dependencies run: | @@ -193,3 +200,9 @@ jobs: $PWD/llvm-aie \ /opt/xilinx/xrt \ /opt/Xilinx/Vitis/2024.2 + + - name: Python tests + run: | + source .venv/bin/activate + source /opt/xilinx/xrt/setup.sh + python samples/smol_matmul.py diff --git a/.github/workflows/ci-windows.yml b/.github/workflows/ci-windows.yml index 190cc9672..777be4d5b 100644 --- a/.github/workflows/ci-windows.yml +++ b/.github/workflows/ci-windows.yml @@ -73,7 +73,7 @@ jobs: - uses: 
actions/setup-python@v4 with: - python-version: '3.11' + python-version: '3.10' - name: Python deps run: | @@ -142,8 +142,8 @@ jobs: defaults: run: shell: bash - strategy: - fail-fast: true + env: + XILINX_XRT: "/c/Xilinx/XRT" steps: - name: "Checking out repository" # for test scripts uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0 @@ -158,7 +158,10 @@ jobs: - name: Extract artifact run: | tar -xvf iree-dist-windows.tar - bash build_tools/download_peano.sh + echo "IREE_INSTALL_DIR=$PWD/iree-install" >> $GITHUB_ENV + echo "PYTHONPATH=$PWD/iree-install/python_packages/iree_compiler:$PWD/iree-install/python_packages/iree_runtime" >> $GITHUB_ENV + bash build_tools/download_peano.sh + echo "PEANO_INSTALL_DIR=$PWD/llvm-aie" >> $GITHUB_ENV - name: Create venv and install dependencies run: | @@ -169,7 +172,6 @@ jobs: - name: E2E correctness matmul test run: | source .venv/Scripts/activate - export XILINX_XRT=/c/Xilinx/XRT bash build_tools/ci/run_matmul_test.sh \ /c/test_matmuls \ $PWD/iree-install \ @@ -178,8 +180,12 @@ jobs: - name : E2E comparison of AIE to llvm-cpu run: | source .venv/Scripts/activate - export XILINX_XRT=/c/Xilinx/XRT python build_tools/ci/cpu_comparison/run.py \ /c/test_aie_vs_cpu \ $PWD/iree-install \ $PWD/llvm-aie -v + + - name: Python tests + run: | + source .venv/Scripts/activate + python samples/smol_matmul.py diff --git a/build_tools/build_test_cpp.ps1 b/build_tools/build_test_cpp.ps1 index 41bf86d34..5007aff7f 100644 --- a/build_tools/build_test_cpp.ps1 +++ b/build_tools/build_test_cpp.ps1 @@ -122,6 +122,7 @@ echo "------------" echo "Installing" echo "----------" echo "Install to: $install_dir" +& cmake --build $build_dir --target install & cmake --build $build_dir --target iree-install-dist echo "CTest" diff --git a/build_tools/build_test_cpp.sh b/build_tools/build_test_cpp.sh index ddda26a0d..499d9ba80 100644 --- a/build_tools/build_test_cpp.sh +++ b/build_tools/build_test_cpp.sh @@ -116,6 +116,7 @@ cmake --build 
"$build_dir" -- -k 0 echo "Installing" echo "----------" echo "Install to: $install_dir" +cmake --build "$build_dir" --target install cmake --build "$build_dir" --target iree-install-dist echo "CTest" diff --git a/cmake/iree_aie_bootgen.cmake b/cmake/iree_aie_bootgen.cmake index a834c9427..244a4be18 100644 --- a/cmake/iree_aie_bootgen.cmake +++ b/cmake/iree_aie_bootgen.cmake @@ -16,12 +16,10 @@ set(_BOOTGEN_SOURCE_DIR ${IREE_AMD_AIE_SOURCE_DIR}/third_party/bootgen) # https://stackoverflow.com/a/56463133 If you want to use malloc, then include stdlib.h replace_string_in_file(${_BOOTGEN_SOURCE_DIR}/cdo-npi.c "#include " "#include ") replace_string_in_file(${_BOOTGEN_SOURCE_DIR}/cdo-alloc.c "#include " "#include ") -replace_string_in_file("${_BOOTGEN_SOURCE_DIR}/main.cpp" - "#include \"openssl/ms/applink.c\"" "//#include \"openssl/ms/applink.c\"") -replace_string_in_file("${_BOOTGEN_SOURCE_DIR}/main.cpp" - "int main" "int iree_aie_bootgen_main") -replace_string_in_file("${_BOOTGEN_SOURCE_DIR}/main.cpp" - "DisplayBanner();" "//DisplayBanner();") +replace_string_in_file("${_BOOTGEN_SOURCE_DIR}/main.cpp" "#include \"openssl/ms/applink.c\"" "") +replace_string_in_file("${_BOOTGEN_SOURCE_DIR}/main.cpp" "int main" "int iree_aie_bootgen_main") +replace_string_in_file("${_BOOTGEN_SOURCE_DIR}/main.cpp" "DisplayBanner();" "") +replace_string_in_file("${_BOOTGEN_SOURCE_DIR}/main.cpp" "LOG_MSG(\"\\n[INFO] : Bootimage generated successfully\\n\");" "") file(GLOB _bootgen_sources "${_BOOTGEN_SOURCE_DIR}/*.c" "${_BOOTGEN_SOURCE_DIR}/*.cpp") add_library(iree-aie-bootgen STATIC ${_bootgen_sources}) diff --git a/cmake/iree_aie_rt.cmake b/cmake/iree_aie_rt.cmake index dcdb7c204..161bd03b5 100644 --- a/cmake/iree_aie_rt.cmake +++ b/cmake/iree_aie_rt.cmake @@ -14,16 +14,17 @@ endif() # cdo-drver # ############################################################################## -set(_BOOTGEN_SOURCE_DIR ${IREE_AMD_AIE_SOURCE_DIR}/third_party/bootgen) 
-replace_string_in_file(${_BOOTGEN_SOURCE_DIR}/cdo-driver/cdo_driver.c "\"(Write64)" "\"cdo-driver: (Write64)") -replace_string_in_file(${_BOOTGEN_SOURCE_DIR}/cdo-driver/cdo_driver.c "\"(MaskWrite64)" "\"cdo-driver: (MaskWrite64)") -replace_string_in_file(${_BOOTGEN_SOURCE_DIR}/cdo-driver/cdo_driver.c "\"(NOP Command)" "\"cdo-driver: (NOP Command)") -replace_string_in_file(${_BOOTGEN_SOURCE_DIR}/cdo-driver/cdo_driver.c "\"(BlockWrite-DMAWriteCmd)" "\"cdo-driver: (BlockWrite-DMAWriteCmd)") -replace_string_in_file(${_BOOTGEN_SOURCE_DIR}/cdo-driver/cdo_driver.c "Data@ 0x%\" PRIxPTR \"" "Data") -replace_string_in_file(${_BOOTGEN_SOURCE_DIR}/cdo-driver/cdo_driver.c "(uintptr_t)(pData + i)," "") -replace_string_in_file(${_BOOTGEN_SOURCE_DIR}/cdo-driver/cdo_driver.c "\" Address:" "\"cdo-driver: Address:") -replace_string_in_file(${_BOOTGEN_SOURCE_DIR}/cdo-driver/cdo_driver.c "\"(BlockSet-DMAWriteCmd)" "\"cdo-driver: (BlockSet-DMAWriteCmd)") -replace_string_in_file(${_BOOTGEN_SOURCE_DIR}/cdo-driver/cdo_driver.c "\"(MaskPoll64)" "\"cdo-driver: (MaskPoll64)") +set(_BOOTGEN_SOURCE_DIR "${IREE_AMD_AIE_SOURCE_DIR}/third_party/bootgen") +replace_string_in_file("${_BOOTGEN_SOURCE_DIR}/cdo-driver/cdo_driver.c" "\"(Write64)" "\"cdo-driver: (Write64)") +replace_string_in_file("${_BOOTGEN_SOURCE_DIR}/cdo-driver/cdo_driver.c" "\"(MaskWrite64)" "\"cdo-driver: (MaskWrite64)") +replace_string_in_file("${_BOOTGEN_SOURCE_DIR}/cdo-driver/cdo_driver.c" "\"(NOP Command)" "\"cdo-driver: (NOP Command)") +replace_string_in_file("${_BOOTGEN_SOURCE_DIR}/cdo-driver/cdo_driver.c" "\"(BlockWrite-DMAWriteCmd)" "\"cdo-driver: (BlockWrite-DMAWriteCmd)") +replace_string_in_file("${_BOOTGEN_SOURCE_DIR}/cdo-driver/cdo_driver.c" "Data@ 0x%\" PRIxPTR \"" "Data") +replace_string_in_file("${_BOOTGEN_SOURCE_DIR}/cdo-driver/cdo_driver.c" "(uintptr_t)(pData + i)," "") +replace_string_in_file("${_BOOTGEN_SOURCE_DIR}/cdo-driver/cdo_driver.c" "\" Address:" "\"cdo-driver: Address:") 
+replace_string_in_file("${_BOOTGEN_SOURCE_DIR}/cdo-driver/cdo_driver.c" "\"(BlockSet-DMAWriteCmd)" "\"cdo-driver: (BlockSet-DMAWriteCmd)") +replace_string_in_file("${_BOOTGEN_SOURCE_DIR}/cdo-driver/cdo_driver.c" "\"(MaskPoll64)" "\"cdo-driver: (MaskPoll64)") +replace_string_in_file("${_BOOTGEN_SOURCE_DIR}/cdo-driver/cdo_driver.c" "printf(\"Generating: %s\\n\", cdoFileName);" "") add_library(cdo_driver STATIC ${_BOOTGEN_SOURCE_DIR}/cdo-driver/cdo_driver.c) target_include_directories(cdo_driver PUBLIC ${_BOOTGEN_SOURCE_DIR}/cdo-driver) diff --git a/iree_compiler_plugin.cmake b/iree_compiler_plugin.cmake index d49434479..c91066224 100644 --- a/iree_compiler_plugin.cmake +++ b/iree_compiler_plugin.cmake @@ -14,6 +14,7 @@ set(IREE_AMD_AIE_ENABLE_XRT_DRIVER OFF) if("xrt" IN_LIST IREE_EXTERNAL_HAL_DRIVERS) message(STATUS "Enabling XRT build because it is an enabled HAL driver") set(IREE_AMD_AIE_ENABLE_XRT_DRIVER ON) + set(IREE_TARGET_BACKEND_AMD_AIE ON CACHE BOOL "") endif() if(IREE_AMD_AIE_ENABLE_XRT_DRIVER) diff --git a/runtime/src/iree-amd-aie/aie_runtime/test/utest.cc b/runtime/src/iree-amd-aie/aie_runtime/test/utest.cc index d32ef33a4..44a0e9427 100755 --- a/runtime/src/iree-amd-aie/aie_runtime/test/utest.cc +++ b/runtime/src/iree-amd-aie/aie_runtime/test/utest.cc @@ -9,7 +9,6 @@ clang-format off RUN: (aie_runtime_utest %S/pi.elf) | FileCheck %s -CHECK: Generating: pi.cdo CHECK: (BlockWrite-DMAWriteCmd): Start Address: 0x0000000000200404 Size: 9 CHECK: Address: 0x0000000000200404 Data is: 0x00000170 CHECK: Address: 0x0000000000200408 Data is: 0x00000000 diff --git a/samples/smol_matmul.py b/samples/smol_matmul.py new file mode 100644 index 000000000..2a66daf43 --- /dev/null +++ b/samples/smol_matmul.py @@ -0,0 +1,56 @@ +import os +import pathlib + +import numpy as np + +from iree import compiler as ireec +from iree import runtime as ireert +from iree.compiler import ir +from iree.compiler.dialects import arith, tensor, linalg +from iree.compiler.dialects.builtin 
import module +from iree.compiler.dialects.func import func +from iree.compiler.extras import types as T +from iree.runtime import get_driver + +with ir.Context(), ir.Location.unknown(): + + @module(sym_name="arithmetic") + def arithmetic(): + @func(T.tensor(32, 16, T.i8()), T.tensor(16, 32, T.i8())) + def matmul_i8_i32(lhs, rhs): + cst = arith.constant(T.i32(), 0) + v0 = tensor.empty([32, 32], T.i32()) + v1 = linalg.fill(cst, outs=[v0]) + return linalg.matmul(lhs, rhs, outs=[v1]) + + print(arithmetic) + +TARGET_BACKEND = "amd-aie" +WORK_DIR = pathlib.Path(__file__).cwd() / "executable_cache_test" +WORK_DIR = WORK_DIR.absolute() +with ireec.tools.TempFileSaver(str(WORK_DIR)): + compiled_flatbuffer = ireec.tools.compile_str( + str(arithmetic), + target_backends=[TARGET_BACKEND], + extra_args=[ + f"--iree-hal-dump-executable-files-to={WORK_DIR}", + f"--iree-hal-target-backends={TARGET_BACKEND}", + "--iree-amdaie-lower-to-aie-pipeline=air", + f"--iree-amd-aie-peano-install-dir={os.getenv('PEANO_INSTALL_DIR')}", + f"--iree-amd-aie-install-dir={os.getenv('IREE_INSTALL_DIR')}", + ], + ) + +driver = get_driver("xrt") + +config = ireert.Config(device=driver.create_default_device()) +ctx = ireert.SystemContext(config=config) +vm_module = ireert.VmModule.copy_buffer(ctx.instance, compiled_flatbuffer) +ctx.add_vm_module(vm_module) + +# Invoke the function and print the result. 
+arg0 = np.ones((32, 16), dtype=np.int8) +arg1 = np.ones((16, 32), dtype=np.int8) +f = ctx.modules.arithmetic["matmul_i8_i32"] +results = f(arg0, arg1).to_host() +print("Results:", results) From c39c3f997334dfe0c7f5ce33afde02081c0658a6 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Tue, 24 Sep 2024 18:53:06 -0400 Subject: [PATCH 02/19] Update ci-linux.yml --- .github/workflows/ci-linux.yml | 17 +- .github/workflows/ci-windows.yml | 4 +- .../AMD-AIE/iree-amd-aie/Target/AIETarget.cpp | 48 ++--- .../AMD-AIE/iree-amd-aie/Target/AIETarget.h | 91 +++++++++ .../AMDAIELowerExecutableTarget.cpp | 21 ++- .../iree-amd-aie/Transforms/Passes.cpp | 176 +++++++----------- .../AMD-AIE/iree-amd-aie/Transforms/Passes.h | 28 ++- .../AMD-AIE/iree-amd-aie/Transforms/Passes.td | 5 +- .../test/disable_vectorization.mlir | 25 +-- samples/smol_matmul.py | 56 ------ tests/__init__.py | 0 tests/conftest.py | 114 ++++++++++++ {build_tools/ci => tests}/requirements.txt | 1 + tests/test_matmul.py | 81 ++++++++ 14 files changed, 425 insertions(+), 242 deletions(-) delete mode 100644 samples/smol_matmul.py create mode 100644 tests/__init__.py create mode 100644 tests/conftest.py rename {build_tools/ci => tests}/requirements.txt (80%) create mode 100644 tests/test_matmul.py diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index 00b295edd..d097be7db 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -57,10 +57,6 @@ jobs: yum remove -y openssl-devel zlib-devel || true yum install -y protobuf-devel protobuf-compiler tmate - - uses: actions/setup-python@v4 - with: - python-version: '3.10' - - name: Python deps run: | pip install "numpy<2" pyyaml "pybind11[global]==2.10.3" nanobind pytest @@ -115,7 +111,6 @@ jobs: path: ${{ env.CACHE_DIR }} key: ${{ env.CACHE_KEY }} - test_linux: name: E2E Test linux needs: build_and_ctest @@ -131,7 +126,7 @@ jobs: uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0 with: 
submodules: false # not required for testbench - + - name: Download artifacts uses: actions/download-artifact@v4 with: @@ -145,11 +140,15 @@ jobs: bash build_tools/download_peano.sh echo "PEANO_INSTALL_DIR=$PWD/llvm-aie" >> $GITHUB_ENV + - uses: actions/setup-python@v4 + with: + python-version: '3.11' + - name: Create venv and install dependencies run: | - python3 -m venv .venv + python -m venv .venv source .venv/bin/activate - pip install -r build_tools/ci/requirements.txt + pip install -r tests/requirements.txt - name : E2E comparison of AIE to llvm-cpu @@ -205,4 +204,4 @@ jobs: run: | source .venv/bin/activate source /opt/xilinx/xrt/setup.sh - python samples/smol_matmul.py + pytest --capture=tee-sys tests diff --git a/.github/workflows/ci-windows.yml b/.github/workflows/ci-windows.yml index 777be4d5b..002a9aec2 100644 --- a/.github/workflows/ci-windows.yml +++ b/.github/workflows/ci-windows.yml @@ -167,7 +167,7 @@ jobs: run: | python -m venv .venv source .venv/Scripts/activate - pip install -r build_tools/ci/requirements.txt + pip install -r tests/requirements.txt - name: E2E correctness matmul test run: | @@ -188,4 +188,4 @@ jobs: - name: Python tests run: | source .venv/Scripts/activate - python samples/smol_matmul.py + pytest -s tests diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp index d552a9e64..bffd85c59 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp @@ -48,35 +48,6 @@ #define DEBUG_TYPE "aie-target" namespace mlir::iree_compiler::AMDAIE { - -/// Command line option for selecting the target AIE device. 
-static llvm::cl::opt clAMDAIETargetDevice( - "iree-amdaie-target-device", - llvm::cl::desc("Sets the target device architecture."), - llvm::cl::values( - clEnumValN(AMDAIEDevice::xcvc1902, "xcvc1902", "The xcvc1902 device"), - clEnumValN(AMDAIEDevice::xcve2302, "xcve2302", "The xcve2302 device"), - clEnumValN(AMDAIEDevice::xcve2802, "xcve2802", "The xcve2802 device"), - clEnumValN(AMDAIEDevice::npu1, "npu1", "Default Phoenix NPU"), - clEnumValN(AMDAIEDevice::npu1_1col, "npu1_1col", - "Phoenix NPU with a single column"), - clEnumValN(AMDAIEDevice::npu1_2col, "npu1_2col", - "Phoenix NPU with two columns"), - clEnumValN(AMDAIEDevice::npu1_3col, "npu1_3col", - "Phoenix NPU with three columns"), - clEnumValN(AMDAIEDevice::npu1_4col, "npu1_4col", - "Phoenix NPU with four columns"), - clEnumValN(AMDAIEDevice::npu4, "npu4", - "Strix B0 NPU with 8 columns and 6 rows")), - llvm::cl::init(AMDAIEDevice::npu1_4col)); - -static llvm::cl::opt clEnableAMDAIEUkernels( - "iree-amdaie-enable-ukernels", - llvm::cl::desc("Enables microkernels in the amdaie backend. May be " - "`none`, `all`, or a comma-separated list of specific " - "unprefixed microkernels to enable, e.g. `matmul`."), - llvm::cl::init("none")); - static xilinx::AIE::DeviceOp getDeviceOpWithName(ModuleOp moduleOp, StringRef targetName) { xilinx::AIE::DeviceOp deviceOp; @@ -161,11 +132,12 @@ class AIETargetBackend final : public IREE::HAL::TargetBackend { configItems.emplace_back(StringAttr::get(context, name), value); }; // Set target device - addConfig( - "target_device", - StringAttr::get(context, AMDAIE::stringifyEnum(clAMDAIETargetDevice))); + addConfig("target_device", + StringAttr::get( + context, AMDAIE::stringifyEnum(options.AMDAIETargetDevice))); // Set microkernel enabling flag. 
- addConfig("ukernels", StringAttr::get(context, clEnableAMDAIEUkernels)); + addConfig("ukernels", + StringAttr::get(context, options.enableAMDAIEUkernels)); auto configAttr = b.getDictionaryAttr(configItems); return IREE::HAL::ExecutableTargetAttr::get( context, b.getStringAttr("amd-aie"), @@ -198,7 +170,10 @@ class AIETargetBackend final : public IREE::HAL::TargetBackend { void buildTranslationPassPipeline(IREE::HAL::ExecutableTargetAttr, OpPassManager &passManager) override { - buildAMDAIETransformPassPipeline(passManager, clAMDAIETargetDevice); + buildAMDAIETransformPassPipeline( + passManager, options.AMDAIETargetDevice, options.useTilePipeline, + options.useLowerToAIEPipeline, options.matmulElementwiseFusion, + options.enableVectorizationPasses, options.pathToUkernels); } void buildLinkingPassPipeline(OpPassManager &passManager) override { @@ -366,7 +341,7 @@ LogicalResult AIETargetBackend::serializeExecutable( // TODO(max): this should be an enum // TODO(max): this needs to be pulled from PCIE std::string npuVersion; - switch (clAMDAIETargetDevice) { + switch (options.AMDAIETargetDevice) { case AMDAIEDevice::npu1: case AMDAIEDevice::npu1_1col: case AMDAIEDevice::npu1_2col: @@ -403,7 +378,8 @@ LogicalResult AIETargetBackend::serializeExecutable( /*xclBinKernelName=*/entryPointNamesFb[ordinal], /*xclBinInstanceName=*/"IREE", /*amdAIEInstallDir=*/options.amdAieInstallDir, - /*InputXCLBin=*/std::nullopt, /*ukernel=*/clEnableAMDAIEUkernels))) + /*InputXCLBin=*/std::nullopt, + /*ukernel=*/options.enableAMDAIEUkernels))) return failure(); std::ifstream instrFile(static_cast(npuInstPath)); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h index 1e5691c87..a8a6c4168 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h @@ -10,6 +10,7 @@ #include #include "aie/AIEDialect.h" +#include 
"iree-amd-aie/Transforms/KernelDispatch.h" #include "iree/compiler/Dialect/HAL/Target/TargetBackend.h" #include "iree/compiler/Dialect/HAL/Target/TargetDevice.h" #include "iree/compiler/Utils/OptionUtils.h" @@ -43,6 +44,15 @@ struct AMDAIEOptions { // Print MLIR timing summary for the MLIR passes in aie2xclbin. bool aie2xclbinTiming{false}; + LowerToAIEPassPipeline useLowerToAIEPipeline{ + LowerToAIEPassPipeline::ObjectFifo}; + TilePassPipeline useTilePipeline{TilePassPipeline::PackPeelPipeline}; + std::string pathToUkernels{""}; + bool enableVectorizationPasses{true}; + bool matmulElementwiseFusion{false}; + AMDAIEDevice AMDAIETargetDevice{AMDAIEDevice::npu1_4col}; + std::string enableAMDAIEUkernels{"none"}; + void bindOptions(OptionsBinder &binder) { static llvm::cl::OptionCategory category("AMD AIE Options"); binder.opt( @@ -91,6 +101,87 @@ struct AMDAIEOptions { binder.opt("iree-amd-aie-enable-chess", useChess, llvm::cl::cat(category), llvm::cl::desc("Use the legacy chess compiler")); + + binder.opt( + "iree-amdaie-enable-ukernels", enableAMDAIEUkernels, + llvm::cl::cat(category), + llvm::cl::desc("Enables microkernels in the amdaie backend. May be " + "`none`, `all`, or a comma-separated list of specific " + "unprefixed microkernels to enable, e.g. `matmul`.")); + + /// Command line option for selecting the lowering pipeline to use to + /// generate AIE DMA configurations, core code and control code. + binder.opt( + "iree-amdaie-lower-to-aie-pipeline", useLowerToAIEPipeline, + llvm::cl::cat(category), + llvm::cl::desc("Pick the lowering pipeline to use"), + llvm::cl::values( + clEnumValN(LowerToAIEPassPipeline::AIR, "air", + "Use the IREE lowering through AIR"), + clEnumValN(LowerToAIEPassPipeline::ObjectFifo, "objectFifo", + "Use the IREE lowering to objectFifos"))); + + /// Command line option for selecting the lowering pipeline to use tiling + /// computations and packing data. 
+ binder.opt( + "iree-amdaie-tile-pipeline", useTilePipeline, llvm::cl::cat(category), + llvm::cl::desc("Pick the lowering pipeline to use"), + llvm::cl::values( + clEnumValN(TilePassPipeline::PackPeelPipeline, "pack-peel", + "Use the pack-peel based lowering strategy for " + "matmul-like ops"), + clEnumValN( + TilePassPipeline::PadPackPipeline, "pad-pack", + "Use the pad-pack based lowering strategy for matmul-like ops"), + clEnumValN(TilePassPipeline::ConvDecomposePipeline, + "conv-decompose", + "Use the conv-decompose based lowering strategy for " + "convolution interface ops"))); + + binder.opt("iree-amdaie-path-to-ukernels", pathToUkernels, + llvm::cl::cat(category), + llvm::cl::desc("Path to microkernels' directory")); + + binder.opt( + "iree-amdaie-enable-vectorization-passes", enableVectorizationPasses, + llvm::cl::cat(category), + llvm::cl::desc( + "Some pipelines (see iree-amdaie-tile-pipeline) may include " + "vectorization passes. This option enables or disables " + "these vectorization passes. It is intended for development " + "purposes only.")); + + binder.opt( + "iree-amdaie-matmul-elementwise-fusion", matmulElementwiseFusion, + llvm::cl::cat(category), + llvm::cl::desc( + "This option enables/disables special passes in MLIR-AIR " + "for matmul-elementwise fusion. It is currently added for " + "development purpose and should be removed in the future.")); + + /// Command line option for selecting the target AIE device. 
+ binder.opt( + "iree-amdaie-target-device", AMDAIETargetDevice, + llvm::cl::cat(category), + llvm::cl::desc("Sets the target device architecture."), + llvm::cl::values( + clEnumValN(AMDAIEDevice::xcvc1902, "xcvc1902", + "The xcvc1902 device"), + clEnumValN(AMDAIEDevice::xcve2302, "xcve2302", + "The xcve2302 device"), + clEnumValN(AMDAIEDevice::xcve2802, "xcve2802", + "The xcve2802 device"), + clEnumValN(AMDAIEDevice::npu1, "npu1", "Default Phoenix NPU"), + clEnumValN(AMDAIEDevice::npu1_1col, "npu1_1col", + "Phoenix NPU with a single column"), + clEnumValN(AMDAIEDevice::npu1_2col, "npu1_2col", + "Phoenix NPU with two columns"), + clEnumValN(AMDAIEDevice::npu1_3col, "npu1_3col", + "Phoenix NPU with three columns"), + clEnumValN(AMDAIEDevice::npu1_4col, "npu1_4col", + "Phoenix NPU with four columns"), + clEnumValN(AMDAIEDevice::npu4, "npu4", + "Strix B0 NPU with 8 columns and 6 rows"))); } }; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerExecutableTarget.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerExecutableTarget.cpp index 90449f67a..f6b69faa1 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerExecutableTarget.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/AMDAIELowerExecutableTarget.cpp @@ -53,8 +53,8 @@ class AMDAIELowerExecutableTargetPass } AMDAIELowerExecutableTargetPass() = default; - AMDAIELowerExecutableTargetPass( - const AMDAIELowerExecutableTargetPass &pass){}; + AMDAIELowerExecutableTargetPass(const AMDAIELowerExecutableTargetPass &pass) { + }; AMDAIELowerExecutableTargetPass( const AMDAIELowerExecutableTargetOptions &options) : AMDAIELowerExecutableTargetBase(options) {} @@ -85,7 +85,7 @@ static TilingConfig getTilingConfigForPipeline(FunctionOpInterface funcOp) { auto maybeLoweringConfig = getRootLoweringConfig(funcOp); assert(succeeded(maybeLoweringConfig) && "Pipeline requires a lowering config"); - return TilingConfig(*maybeLoweringConfig); 
+ return {*maybeLoweringConfig}; } void AMDAIELowerExecutableTargetPass::runOnOperation() { @@ -108,13 +108,20 @@ void AMDAIELowerExecutableTargetPass::runOnOperation() { case IREE::Codegen::DispatchLoweringPassPipeline::Custom: { TilingConfig tilingConfig = getTilingConfigForPipeline(funcOp); if (usePassPipeline == TilePassPipeline::PackPeelPipeline) { - addPackPeelBasedPassPipeline(executableLoweringPipeline, tilingConfig); + addPackPeelBasedPassPipeline(executableLoweringPipeline, tilingConfig, + pathToUkernels, enableVectorizationPasses, + TilePassPipeline::PackPeelPipeline); } else if (usePassPipeline == TilePassPipeline::PadPackPipeline) { - addPadPackBasedPassPipeline(executableLoweringPipeline, tilingConfig); + addPadPackBasedPassPipeline(executableLoweringPipeline, tilingConfig, + pathToUkernels, enableVectorizationPasses, + TilePassPipeline::PadPackPipeline); } else if (usePassPipeline == TilePassPipeline::ConvDecomposePipeline) { - addConvDecomposePassPipeline(executableLoweringPipeline, tilingConfig); + addConvDecomposePassPipeline(executableLoweringPipeline, tilingConfig, + enableVectorizationPasses, + TilePassPipeline::ConvDecomposePipeline); } - } break; + break; + } default: funcOp.emitOpError("unhandled pass pipeline value set"); return signalPassFailure(); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index 0fc050ffc..c83fb8b5c 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -42,61 +42,9 @@ namespace mlir::iree_compiler::AMDAIE { -static llvm::cl::opt clEnablePacketFlow( - "iree-amdaie-enable-packet-flow", - llvm::cl::desc("Enable packet routing data movement."), - llvm::cl::init(false)); - -/// Command line option for selecting the lowering pipeline to use to generate -/// AIE DMA configurations, core code and control code. 
-static llvm::cl::opt clUseLowerToAIEPipeline( - "iree-amdaie-lower-to-aie-pipeline", - llvm::cl::desc("Pick the lowering pipeline to use"), - llvm::cl::values(clEnumValN(LowerToAIEPassPipeline::AIR, "air", - "Use the IREE lowering through AIR"), - clEnumValN(LowerToAIEPassPipeline::ObjectFifo, - "objectFifo", - "Use the IREE lowering to objectFifos")), - llvm::cl::init(LowerToAIEPassPipeline::ObjectFifo)); - -/// Command line option for selecting the lowering pipeline to use tiling -/// computations and packing data. -static llvm::cl::opt clUseTilePipeline( - "iree-amdaie-tile-pipeline", - llvm::cl::desc("Pick the lowering pipeline to use"), - llvm::cl::values( - clEnumValN( - TilePassPipeline::PackPeelPipeline, "pack-peel", - "Use the pack-peel based lowering strategy for matmul-like ops"), - clEnumValN( - TilePassPipeline::PadPackPipeline, "pad-pack", - "Use the pad-pack based lowering strategy for matmul-like ops"), - clEnumValN(TilePassPipeline::ConvDecomposePipeline, "conv-decompose", - "Use the conv-decompose based lowering strategy for " - "convolution interface ops")), - llvm::cl::init(TilePassPipeline::PackPeelPipeline)); - -static llvm::cl::opt clPathToUkernels( - "iree-amdaie-path-to-ukernels", - llvm::cl::desc("Path to microkernels' directory"), llvm::cl::init("")); - -static llvm::cl::opt clEnableVectorizationPasses( - "iree-amdaie-enable-vectorization-passes", - llvm::cl::desc("Some pipelines (see iree-amdaie-tile-pipeline) may include " - "vectorization passes. This option enables or disables " - "these vectorization passes. It is intended for development " - "purposes only."), - llvm::cl::init(true)); - -static llvm::cl::opt clMatmulElementwiseFusion( - "iree-amdaie-matmul-elementwise-fusion", - llvm::cl::desc("This option enables/disables special passes in MLIR-AIR " - "for matmul-elementwise fusion. 
It is currently added for " - "development purpose and should be removed in the future."), - llvm::cl::init(false)); - -void appendVectorizationToPipeline(OpPassManager &funcPassManager) { - if (!clEnableVectorizationPasses) return; +void appendVectorizationToPipeline(OpPassManager &funcPassManager, + bool enableVectorizationPasses) { + if (!enableVectorizationPasses) return; funcPassManager.addPass(createAMDAIECleanupPass()); funcPassManager.addPass(createAMDAIEInsertLoopsForVectorizationPass()); funcPassManager.addPass(createAMDAIEVectorizationPass()); @@ -105,25 +53,6 @@ void appendVectorizationToPipeline(OpPassManager &funcPassManager) { //===---------------------------------------------------------------------===// // Default allocation functions for AIE backend //===---------------------------------------------------------------------===// -// Allocation callbacks to use with upstream comprehensive bufferization -static FailureOr aieComprehensiveBufferizeAllocationFn( - OpBuilder &builder, Location loc, MemRefType memRefType, - ValueRange dynamicSizes, unsigned alignment) { - int64_t numDims = memRefType.getShape().size(); - AMDAIEMemSpace memSpace = AMDAIEMemSpace::Local; - if (clUseTilePipeline == TilePassPipeline::PackPeelPipeline && numDims == 4) { - memSpace = AMDAIEMemSpace::Shared; - } - - OpBuilder::InsertionGuard g(builder); - auto memorySpaceAttr = - AMDAIEMemSpaceAttr::get(builder.getContext(), memSpace); - MemRefType allocType = - MemRefType::get(memRefType.getShape(), memRefType.getElementType(), - AffineMap(), memorySpaceAttr); - return builder.create(loc, allocType, dynamicSizes) - .getResult(); -} static LogicalResult aieComprehensiveBufferizeCopyFn(OpBuilder &builder, Location loc, Value from, @@ -137,9 +66,32 @@ static LogicalResult aieComprehensiveBufferizeCopyFn(OpBuilder &builder, return success(); } -static void addAMDAIEBufferizePasses(OpPassManager &pm) { +static void addAMDAIEBufferizePasses(OpPassManager &pm, + TilePassPipeline 
useTilePipeline) { // Bufferize the dispatch. using mlir::bufferization::BufferizationOptions; + + // Allocation callbacks to use with upstream comprehensive bufferization + auto aieComprehensiveBufferizeAllocationFn = + [useTilePipeline](OpBuilder &builder, Location loc, MemRefType memRefType, + ValueRange dynamicSizes, unsigned _alignment) { + int64_t numDims = memRefType.getShape().size(); + AMDAIEMemSpace memSpace = AMDAIEMemSpace::Local; + if (useTilePipeline == TilePassPipeline::PackPeelPipeline && + numDims == 4) { + memSpace = AMDAIEMemSpace::Shared; + } + + OpBuilder::InsertionGuard g(builder); + auto memorySpaceAttr = + AMDAIEMemSpaceAttr::get(builder.getContext(), memSpace); + MemRefType allocType = + MemRefType::get(memRefType.getShape(), memRefType.getElementType(), + AffineMap(), memorySpaceAttr); + return builder.create(loc, allocType, dynamicSizes) + .getResult(); + }; + BufferizationOptions::AllocationFn allocationFn = aieComprehensiveBufferizeAllocationFn; BufferizationOptions::MemCpyFn memCpyFn = aieComprehensiveBufferizeCopyFn; @@ -158,7 +110,10 @@ void addAMDAIEToAIEPasses(OpPassManager &passManager) { } void addPackPeelBasedPassPipeline(OpPassManager &funcPassManager, - TilingConfig &tilingConfig) { + TilingConfig &tilingConfig, + const std::string &pathToUkernels, + bool enableVectorizationPasses, + TilePassPipeline useTilePipeline) { // First level tiling using scf.forall { AMDAIETileAndFuseOptions tileFuseOptions; @@ -327,20 +282,23 @@ void addPackPeelBasedPassPipeline(OpPassManager &funcPassManager, { AMDAIELowerToUKernelsOptions options; // windows - options.pathToUkernels = escapeCommandLineComponent(clPathToUkernels); + options.pathToUkernels = escapeCommandLineComponent(pathToUkernels); funcPassManager.addPass(createAMDAIELowerToUKernelsPass(options)); } // Vectorization passes - appendVectorizationToPipeline(funcPassManager); + appendVectorizationToPipeline(funcPassManager, enableVectorizationPasses); // Comprehensive bufferization - 
addAMDAIEBufferizePasses(funcPassManager); + addAMDAIEBufferizePasses(funcPassManager, useTilePipeline); funcPassManager.addPass(createHoistStaticallyBoundAllocationsPass()); } void addPadPackBasedPassPipeline(OpPassManager &funcPassManager, - TilingConfig &tilingConfig) { + TilingConfig &tilingConfig, + const std::string &pathToUkernels, + bool enableVectorizationPasses, + TilePassPipeline useTilePipeline) { // First level tiling using scf.forall { AMDAIETileAndFuseOptions tileFuseOptions; @@ -435,19 +393,21 @@ void addPadPackBasedPassPipeline(OpPassManager &funcPassManager, { AMDAIELowerToUKernelsOptions options; // windows - options.pathToUkernels = escapeCommandLineComponent(clPathToUkernels); + options.pathToUkernels = escapeCommandLineComponent(pathToUkernels); funcPassManager.addPass(createAMDAIELowerToUKernelsPass(options)); } // Vectorization passes - appendVectorizationToPipeline(funcPassManager); + appendVectorizationToPipeline(funcPassManager, enableVectorizationPasses); funcPassManager.addPass(createCanonicalizerPass()); // Comprehensive bufferization - addAMDAIEBufferizePasses(funcPassManager); + addAMDAIEBufferizePasses(funcPassManager, useTilePipeline); } void addConvDecomposePassPipeline(OpPassManager &funcPassManager, - TilingConfig &tilingConfig) { + TilingConfig &tilingConfig, + bool enableVectorizationPasses, + TilePassPipeline useTilePipeline) { // First level tiling using scf.forall { AMDAIETileAndFuseOptions tileFuseOptions; @@ -537,15 +497,18 @@ void addConvDecomposePassPipeline(OpPassManager &funcPassManager, funcPassManager.addPass(createDecomposeConvolutionToLowerDimOpsPass()); // Vectorization passes - appendVectorizationToPipeline(funcPassManager); + appendVectorizationToPipeline(funcPassManager, enableVectorizationPasses); funcPassManager.addPass(createCanonicalizerPass()); // Comprehensive bufferization - addAMDAIEBufferizePasses(funcPassManager); + addAMDAIEBufferizePasses(funcPassManager, useTilePipeline); } -void 
buildAMDAIETransformPassPipeline(OpPassManager &variantPassManager, - AMDAIEDevice device) { +void buildAMDAIETransformPassPipeline( + OpPassManager &variantPassManager, AMDAIEDevice device, + TilePassPipeline useTilePipeline, + LowerToAIEPassPipeline useLowerToAIEPipeline, bool matmulElementwiseFusion, + bool enableVectorizationPasses, const std::string &pathToUkernels) { OpPassManager &modulePassManager = variantPassManager.nest(); { FunctionLikeNest funcPassManager(modulePassManager); @@ -554,23 +517,26 @@ void buildAMDAIETransformPassPipeline(OpPassManager &variantPassManager, modulePassManager.addPass(createMaterializeUserConfigsPass()); { AMDAIELoweringStrategyOptions options; - options.usePassPipeline = clUseTilePipeline; - options.useLowerToAIEPipeline = clUseLowerToAIEPipeline; + options.usePassPipeline = useTilePipeline; + options.useLowerToAIEPipeline = useLowerToAIEPipeline; modulePassManager.addPass(createAMDAIELoweringStrategyPass(options)); } modulePassManager.addPass(createLowerExecutableUsingTransformDialectPass()); { FunctionLikeNest funcPassManager(modulePassManager); AMDAIELowerExecutableTargetOptions options; - options.usePassPipeline = clUseTilePipeline; + options.usePassPipeline = useTilePipeline; + options.enableVectorizationPasses = enableVectorizationPasses; + options.pathToUkernels = pathToUkernels; funcPassManager.addPass( [&]() { return createAMDAIELowerExecutableTargetPass(options); }); } modulePassManager.addPass(createLowerUKernelOpsToCallsPass()); - if (clUseLowerToAIEPipeline == LowerToAIEPassPipeline::ObjectFifo) { + if (useLowerToAIEPipeline == LowerToAIEPassPipeline::ObjectFifo) { addAMDAIEObjectFifoLoweringPasses(modulePassManager); - } else if (clUseLowerToAIEPipeline == LowerToAIEPassPipeline::AIR) { - addMLIRAIRLoweringPasses(modulePassManager, device); + } else if (useLowerToAIEPipeline == LowerToAIEPassPipeline::AIR) { + addMLIRAIRLoweringPasses(modulePassManager, device, useTilePipeline, + matmulElementwiseFusion); } else 
{ assert( false && @@ -653,7 +619,9 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager) { // TODO (Erwei): The "packPeel" temporary argument should be removed once // pack-peel and pack-pad share the same pass pipeline. See TODOs inlined below // for details. -void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device) { +void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device, + TilePassPipeline useTilePipeline, + bool matmulElementwiseFusion) { // Add passes for preparing for lowering to MLIR-AIR passManager.addPass(createEraseHALDescriptorTypeFromMemRefPass()); passManager.addPass(memref::createFoldMemRefAliasOpsPass()); @@ -672,7 +640,7 @@ void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device) { // 1) make canonicalization work for scf.forall // 2) pass to collapse rank-4 scf.foralls to rank-2 scf.foralls. // 3) resolve above 'unsupproted operation' error. - if (clUseTilePipeline == TilePassPipeline::ConvDecomposePipeline) { + if (useTilePipeline == TilePassPipeline::ConvDecomposePipeline) { passManager.addPass(createCanonicalizerPass()); } @@ -698,8 +666,8 @@ void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device) { passManager.addPass(createCSEPass()); passManager.addPass(xilinx::air::createAIRDependencyPass()); - if (!(clUseTilePipeline == TilePassPipeline::PackPeelPipeline && - clMatmulElementwiseFusion)) { + if (!(useTilePipeline == TilePassPipeline::PackPeelPipeline && + matmulElementwiseFusion)) { passManager.addPass(xilinx::air::createAIRDependencyScheduleOptPass()); passManager.addPass(xilinx::air::createAIRSpecializeDmaBroadcast()); } @@ -711,7 +679,7 @@ void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device) { passManager.addPass(createCSEPass()); // TODO (Erwei): This pass currently doesn't support pack-peel pipeline. This // pass needs to work in order to get multiple AIE columns to work. 
- if (clUseTilePipeline != TilePassPipeline::PackPeelPipeline) + if (useTilePipeline != TilePassPipeline::PackPeelPipeline) passManager.addNestedPass( xilinx::air::createAIRSplitL2MemrefForBufferConstraintPass()); passManager.addPass(xilinx::air::createAIRIsolateAsyncDmaLoopNests()); @@ -720,8 +688,8 @@ void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device) { { xilinx::air::AIRFuseChannelsOptions options; std::vector mode; - if (clUseTilePipeline == TilePassPipeline::PackPeelPipeline && - clMatmulElementwiseFusion) { + if (useTilePipeline == TilePassPipeline::PackPeelPipeline && + matmulElementwiseFusion) { mode.push_back("L1"); } options.clAggressiveMode = ArrayRef(mode); @@ -791,9 +759,9 @@ void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device) { // AIRUnrollOuterPerfectlyNestedLoopsPass, to enforce SHIM DMA BD count // within the hardware limit. std::vector tile_sizes; - if (clUseTilePipeline == TilePassPipeline::PackPeelPipeline) { + if (useTilePipeline == TilePassPipeline::PackPeelPipeline) { tile_sizes = {2, 2}; - } else if (clUseTilePipeline == TilePassPipeline::PadPackPipeline) { + } else if (useTilePipeline == TilePassPipeline::PadPackPipeline) { tile_sizes = {4, 4}; } else tile_sizes = {}; @@ -807,7 +775,7 @@ void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device) { // nests that were left untiled by the previous AffineLoopOptPass, // generating NPU sequence representing the SHIM DMA BDs. 
xilinx::air::AIRUnrollOuterPerfectlyNestedLoopsPassOptions options; - if (clUseTilePipeline == TilePassPipeline::ConvDecomposePipeline) + if (useTilePipeline == TilePassPipeline::ConvDecomposePipeline) options.clDepth = 4; else options.clDepth = 2; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index 2bc11268b..79a20280e 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -19,7 +19,9 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager); /// Add passes to lower from MLIR-AIR through AIE. This is /// currently the default passes used for lowering after IREEs tiling. -void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device); +void addMLIRAIRLoweringPasses(OpPassManager &passManager, AMDAIEDevice device, + TilePassPipeline useTilePipeline, + bool matmulElementwiseFusion); /// Add lowering passes from MLIR-AIE. This is /// currently the default passes used for lowering from AIE dialect. @@ -28,23 +30,33 @@ void addMLIRAIELoweringPasses(OpPassManager &passManager); /// Populates passes needed to lower linalg/arith/math ops to LLVM dialect via /// the structured ops path. The pass manager `pm` here operate on the module /// within the IREE::HAL::ExecutableOp. -void buildAMDAIETransformPassPipeline(OpPassManager &pm, AMDAIEDevice device); - -void buildAMDAIELowerObjectFIFO(OpPassManager &variantPassManager); +void buildAMDAIETransformPassPipeline( + OpPassManager &variantPassManager, AMDAIEDevice device, + TilePassPipeline useTilePipeline, + LowerToAIEPassPipeline useLowerToAIEPipeline, bool matmulElementwiseFusion, + bool enableVectorizationPasses, const std::string &pathToUkernels); void addLowerToLLVMPasses(OpPassManager &pm); /// Populates passes needed to lower the IR via a Pack-Peel based approach. 
-void addPackPeelBasedPassPipeline(OpPassManager &passManager, - TilingConfig &tilingConfig); +void addPackPeelBasedPassPipeline(OpPassManager &oassManager, + TilingConfig &tilingConfig, + const std::string &pathToUkernels, + bool enableVectorizationPasses, + TilePassPipeline useTilePipeline); /// Populates passes needed to lower the IR via a Pad-Pack based approach. void addPadPackBasedPassPipeline(OpPassManager &passManager, - TilingConfig &tilingConfig); + TilingConfig &tilingConfig, + const std::string &pathToUkernels, + bool enableVectorizationPasses, + TilePassPipeline useTilePipeline); /// Populates passes needed to lower the IR via a Conv-Decompose based approach. void addConvDecomposePassPipeline(OpPassManager &passManager, - TilingConfig &tilingConfig); + TilingConfig &tilingConfig, + bool enableVectorizationPasses, + TilePassPipeline useTilePipeline); /// Populates passes needed to link HAL executables across AIE targets. void buildAMDAIELinkingPassPipeline(OpPassManager &passManager); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td index 74084c175..575decddb 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.td @@ -313,7 +313,10 @@ def AMDAIELowerExecutableTarget : "Use the pad-pack based lowering strategy for matmul-like ops."), clEnumValN(mlir::iree_compiler::AMDAIE::TilePassPipeline::ConvDecomposePipeline, "conv-decompose", "Use the conv-decompose based lowering strategy for convolution interface ops.") - )}]> + )}]>, + Option<"pathToUkernels", "path-to-ukernels", "std::string", /*default=*/"", "Path to microkernels' directory">, + Option<"enableVectorizationPasses", "enable-vectorization-passes", "bool", /*default=*/"true", + "Enable/disable vectorization."> ]; } diff --git 
a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/disable_vectorization.mlir b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/disable_vectorization.mlir index 48f5a5f3d..98feb949c 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/disable_vectorization.mlir +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/test/disable_vectorization.mlir @@ -9,31 +9,18 @@ // --iree-amdaie-enable-vectorization-passes=1 // // 3) Not specifying the flag at all, which should use the default value (1). -// -// We first perform the step which is common to all 3 paths: we compile the -// free function to an 'executable sources' file. -// RUN: iree-compile --iree-hal-target-backends=amd-aie \ -// RUN: --compile-to=executable-sources %s > exe-sources.mlir // 1) Explicitly disabled: -// RUN: iree-opt \ -// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(\ -// RUN: iree-hal-translate-target-executable-variants{target=amd-aie})))" \ -// RUN: --iree-amdaie-enable-vectorization-passes=0 exe-sources.mlir \ -// RUN: | FileCheck %s -check-prefix=CHECK-DISABLED +// RUN: iree-compile --iree-hal-target-backends=amd-aie \ +// RUN: --compile-to=executable-targets --iree-amdaie-enable-vectorization-passes=0 %s | FileCheck %s -check-prefix=CHECK-DISABLED // 2) Explicitly enabled: -// RUN: iree-opt \ -// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(\ -// RUN: iree-hal-translate-target-executable-variants{target=amd-aie})))" \ -// RUN: --iree-amdaie-enable-vectorization-passes=1 exe-sources.mlir \ -// RUN: | FileCheck %s -check-prefix=CHECK-ENABLED +// RUN: iree-compile --iree-hal-target-backends=amd-aie \ +// RUN: --compile-to=executable-targets --iree-amdaie-enable-vectorization-passes=1 %s | FileCheck %s -check-prefix=CHECK-ENABLED // 3) Default value: -// RUN: iree-opt \ -// RUN: --pass-pipeline="builtin.module(hal.executable(hal.executable.variant(\ -// RUN: 
iree-hal-translate-target-executable-variants{target=amd-aie})))" \ -// RUN: exe-sources.mlir | FileCheck %s -check-prefix=CHECK-DEFAULT +// RUN: iree-compile --iree-hal-target-backends=amd-aie \ +// RUN: --compile-to=executable-targets %s | FileCheck %s -check-prefix=CHECK-DEFAULT func.func @mm_in_bf16_out_f32(%lhs: tensor<64x64xbf16>, diff --git a/samples/smol_matmul.py b/samples/smol_matmul.py deleted file mode 100644 index 2a66daf43..000000000 --- a/samples/smol_matmul.py +++ /dev/null @@ -1,56 +0,0 @@ -import os -import pathlib - -import numpy as np - -from iree import compiler as ireec -from iree import runtime as ireert -from iree.compiler import ir -from iree.compiler.dialects import arith, tensor, linalg -from iree.compiler.dialects.builtin import module -from iree.compiler.dialects.func import func -from iree.compiler.extras import types as T -from iree.runtime import get_driver - -with ir.Context(), ir.Location.unknown(): - - @module(sym_name="arithmetic") - def arithmetic(): - @func(T.tensor(32, 16, T.i8()), T.tensor(16, 32, T.i8())) - def matmul_i8_i32(lhs, rhs): - cst = arith.constant(T.i32(), 0) - v0 = tensor.empty([32, 32], T.i32()) - v1 = linalg.fill(cst, outs=[v0]) - return linalg.matmul(lhs, rhs, outs=[v1]) - - print(arithmetic) - -TARGET_BACKEND = "amd-aie" -WORK_DIR = pathlib.Path(__file__).cwd() / "executable_cache_test" -WORK_DIR = WORK_DIR.absolute() -with ireec.tools.TempFileSaver(str(WORK_DIR)): - compiled_flatbuffer = ireec.tools.compile_str( - str(arithmetic), - target_backends=[TARGET_BACKEND], - extra_args=[ - f"--iree-hal-dump-executable-files-to={WORK_DIR}", - f"--iree-hal-target-backends={TARGET_BACKEND}", - "--iree-amdaie-lower-to-aie-pipeline=air", - f"--iree-amd-aie-peano-install-dir={os.getenv('PEANO_INSTALL_DIR')}", - f"--iree-amd-aie-install-dir={os.getenv('IREE_INSTALL_DIR')}", - ], - ) - -driver = get_driver("xrt") - -config = ireert.Config(device=driver.create_default_device()) -ctx = ireert.SystemContext(config=config) 
-vm_module = ireert.VmModule.copy_buffer(ctx.instance, compiled_flatbuffer) -ctx.add_vm_module(vm_module) - -# Invoke the function and print the result. -arg0 = np.ones((32, 16), dtype=np.int8) -arg1 = np.ones((16, 32), dtype=np.int8) -f = ctx.modules.arithmetic["matmul_i8_i32"] -results = f(arg0, arg1).to_host() -print("Results:", results) diff --git a/tests/__init__.py b/tests/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/tests/conftest.py b/tests/conftest.py new file mode 100644 index 000000000..b25c89516 --- /dev/null +++ b/tests/conftest.py @@ -0,0 +1,114 @@ +import os +from contextlib import contextmanager + +import numpy as np +import pytest +from iree._runtime_libs._runtime import VmModule + +from iree.compiler import ir +from iree.compiler._mlir_libs import get_dialect_registry +from iree.compiler.api import Session, Output, Source +from iree.compiler.extras import types as T +from iree.runtime import get_driver, Config, SystemContext + + +for t in [ + "i8", + "i16", + "i32", + "i64", + "si8", + "si16", + "si32", + "si64", + "ui8", + "ui16", + "ui32", + "ui64", + "f16", + "f32", + "f64", + "bf16", +]: + tf = getattr(T, t) + tf.__name__ = t + + +@pytest.fixture +def iree_session(request) -> Session: + s = Session() + s.context.append_dialect_registry(get_dialect_registry()) + s.context.load_all_available_dialects() + target_backend = getattr(request, "target_backend", "amd-aie") + pipeline = getattr(request, "pipeline", "air") + s.set_flags( + f"--iree-hal-target-backends={target_backend}", + # TODO(max): normalize iree-amdaie/iree-amd-aie in pass strings + f"--iree-amdaie-lower-to-aie-pipeline={pipeline}", + f"--iree-amd-aie-peano-install-dir={os.getenv('PEANO_INSTALL_DIR')}", + f"--iree-amd-aie-install-dir={os.getenv('IREE_INSTALL_DIR')}", + ) + yield s + + +@pytest.fixture +def session_module(iree_session, tmpdir) -> ir.Module: + iree_session.set_flags( + f"--iree-hal-dump-executable-files-to={tmpdir}", + ) + with 
ir.Location.unknown(iree_session.context): + module_op = ir.Module.create() + with ir.InsertionPoint(module_op.body): + yield iree_session, module_op + + +@contextmanager +def invokable_module(session, module, device="xrt") -> VmModule: + source = Source.wrap_buffer(session, str(module).encode()) + inv = session.invocation() + inv.parse_source(source) + inv.execute() + compiled_flatbuffer = Output.open_membuffer() + inv.output_vm_bytecode(compiled_flatbuffer) + + driver = get_driver(device) + config = Config(device=driver.create_default_device()) + ctx = SystemContext(config=config) + vm_module = VmModule.copy_buffer(ctx.instance, compiled_flatbuffer.map_memory()) + ctx.add_vm_module(vm_module) + + try: + yield ctx.modules.module + finally: + inv.close() + + +_np_dtype_to_mlir_type_ctor = { + np.int8: T.i8, + np.int16: T.i16, + np.int32: T.i32, + # windows + np.intc: T.i32, + np.int64: T.i64, + # is technically wrong i guess but numpy by default casts python scalars to this + # so to support passing lists of ints we map to index type + np.longlong: T.index, + np.uintp: T.index, + np.float16: T.f16, + np.float32: T.f32, + np.float64: T.f64, +} + +_mlir_type_ctor_to_np_dtype = lambda: { + v: k for k, v in _np_dtype_to_mlir_type_ctor.items() +} + + +def np_dtype_to_mlir_type(np_dtype): + if typ := _np_dtype_to_mlir_type_ctor.get(np_dtype): + return typ() + + +def mlir_type_to_np_dtype(mlir_type): + _mlir_type_to_np_dtype = {v(): k for k, v in _np_dtype_to_mlir_type_ctor.items()} + return _mlir_type_to_np_dtype.get(mlir_type) diff --git a/build_tools/ci/requirements.txt b/tests/requirements.txt similarity index 80% rename from build_tools/ci/requirements.txt rename to tests/requirements.txt index 3f48cf4a4..d92fd92a9 100644 --- a/build_tools/ci/requirements.txt +++ b/tests/requirements.txt @@ -2,3 +2,4 @@ PyYAML>=5.4.1 requests>=2.28.0 enum_tools==0.6.4 numpy<2 +pytest==8.2.2 diff --git a/tests/test_matmul.py b/tests/test_matmul.py new file mode 100644 index 
000000000..7659a2d33 --- /dev/null +++ b/tests/test_matmul.py @@ -0,0 +1,81 @@ +import numpy as np +import pytest + +from iree.compiler.dialects import arith, tensor, linalg +from iree.compiler.dialects.arith import _is_float_type +from iree.compiler.dialects.func import func +from iree.compiler.extras import types as T +from .conftest import invokable_module, mlir_type_to_np_dtype + + +def test_smol_matmul(session_module): + session, module = session_module + + @func(T.tensor(32, 16, T.i8()), T.tensor(16, 32, T.i8())) + def matmul_i8_i32(lhs, rhs): + cst = arith.constant(T.i32(), 0) + v0 = tensor.empty([32, 32], T.i32()) + v1 = linalg.fill(cst, outs=[v0]) + return linalg.matmul(lhs, rhs, outs=[v1]) + + arg0 = np.ones((32, 16), dtype=np.int8) + arg1 = np.ones((16, 32), dtype=np.int8) + with invokable_module(session, module) as module: + results = module[matmul_i8_i32.__name__](arg0, arg1).to_host() + assert np.array_equal(results, arg0 @ arg1) + + +def emit_matmul(M, K, N, lhs_rhs_type, acc_type): + matmul_name = f"{M}x{K}x{N}x{lhs_rhs_type}x{acc_type}" + + init_value = 0 + if _is_float_type(acc_type): + init_value = 0.0 + + @func(T.tensor(M, K, lhs_rhs_type), T.tensor(K, N, lhs_rhs_type), name=matmul_name) + def matmul(lhs, rhs): + cst = arith.constant(acc_type, init_value) + v0 = tensor.empty([M, N], acc_type) + v1 = linalg.fill(cst, outs=[v0]) + return linalg.matmul(lhs, rhs, outs=[v1]) + + return matmul_name + + +testdata = [ + (32, 16, 32, T.i8, T.i32), + (32, 32, 32, T.i8, T.i32), + (64, 32, 64, T.i8, T.i32), + (64, 64, 64, T.i8, T.i32), + (128, 64, 128, T.i8, T.i32), + (128, 128, 128, T.i8, T.i32), + (128, 256, 128, T.i8, T.i32), + (32, 16, 32, T.f32, T.f32), + (32, 32, 32, T.f32, T.f32), + (64, 32, 64, T.f32, T.f32), + (64, 64, 64, T.f32, T.f32), + (128, 128, 128, T.f32, T.f32), + (128, 256, 128, T.f32, T.f32), +] + + +def ids(datum): + if callable(datum): + return datum.__name__ + return datum + + +@pytest.mark.parametrize("M, K, N, lhs_rhs_type, 
acc_type", testdata, ids=ids) +def test_matmul(session_module, M, K, N, lhs_rhs_type, acc_type): + session, module = session_module + + lhs_rhs_type, acc_type = lhs_rhs_type(), acc_type() + matmul_name = emit_matmul(M, K, N, lhs_rhs_type, acc_type) + + lhs_rhs_type = mlir_type_to_np_dtype(lhs_rhs_type) + acc_type = mlir_type_to_np_dtype(acc_type) + arg0 = np.ones((M, K), dtype=lhs_rhs_type) + arg1 = np.ones((K, N), dtype=lhs_rhs_type) + with invokable_module(session, module) as module: + results = module[matmul_name](arg0, arg1).to_host() + assert np.array_equal(results, (arg0.astype(acc_type) @ arg1.astype(acc_type))) From e4710c6da4d1cabde108c5d7bf41fa229526f163 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Thu, 26 Sep 2024 04:01:21 -0400 Subject: [PATCH 03/19] re-enable python on windows --- .github/workflows/ci-windows.yml | 3 +-- build_tools/build_llvm.ps1 | 6 +----- build_tools/build_test_cpp.ps1 | 17 +++-------------- build_tools/download_peano.ps1 | 11 +++++++++++ cmake/iree_aie_xrt.cmake | 1 - iree_compiler_plugin.cmake | 2 +- tests/conftest.py | 2 +- 7 files changed, 18 insertions(+), 24 deletions(-) create mode 100644 build_tools/download_peano.ps1 diff --git a/.github/workflows/ci-windows.yml b/.github/workflows/ci-windows.yml index 002a9aec2..d98c1a624 100644 --- a/.github/workflows/ci-windows.yml +++ b/.github/workflows/ci-windows.yml @@ -73,7 +73,7 @@ jobs: - uses: actions/setup-python@v4 with: - python-version: '3.10' + python-version: '3.11' - name: Python deps run: | @@ -91,7 +91,6 @@ jobs: $env:cache_dir = "${{ env.CACHE_DIR }}" $env:CCACHE_COMPILERCHECK = "string:$(clang-cl.exe --version)" .\build_tools\build_llvm.ps1 - ls # Remove-Item -Path "$pwd\llvm-build" -Force $env:llvm_install_dir = "$pwd\llvm-install" echo $env:llvm_install_dir diff --git a/build_tools/build_llvm.ps1 b/build_tools/build_llvm.ps1 index 12fa7ec51..0e9edfc3d 100644 --- a/build_tools/build_llvm.ps1 +++ b/build_tools/build_llvm.ps1 @@ -55,10 +55,6 @@ 
$env:CCACHE_SLOPPINESS = 'include_file_ctime,include_file_mtime,time_macros' & ccache -z -# on windows python bindings don't for split build because -# i can't figure out MLIR_CAPI_EXPORTED and MLIR_CAPI_BUILDING_LIBRARY -# which somehow disables exceptions (blocking bootgen and xrt) - $CMAKE_ARGS = @( "-GNinja" "-DCMAKE_BUILD_TYPE=Release" @@ -85,7 +81,7 @@ $CMAKE_ARGS = @( "-DLLVM_ENABLE_ZSTD=OFF" "-DLLVM_FORCE_ENABLE_STATS=ON" "-DLLVM_INSTALL_UTILS=ON" - "-DMLIR_ENABLE_BINDINGS_PYTHON=OFF" + "-DMLIR_ENABLE_BINDINGS_PYTHON=ON" "-DLLVM_ENABLE_PROJECTS=mlir;clang;lld" "-DLLVM_TARGET_ARCH=X86" "-DLLVM_TARGETS_TO_BUILD=X86" diff --git a/build_tools/build_test_cpp.ps1 b/build_tools/build_test_cpp.ps1 index 5007aff7f..f7c3b7db2 100644 --- a/build_tools/build_test_cpp.ps1 +++ b/build_tools/build_test_cpp.ps1 @@ -61,9 +61,6 @@ $CMAKE_ARGS = @( "-DCMAKE_BUILD_TYPE=Release" "-DCMAKE_INSTALL_PREFIX=$install_dir" "-DCMAKE_INSTALL_LIBDIR=lib" - "-DCMAKE_EXE_LINKER_FLAGS_INIT=-fuse-ld=lld" - "-DCMAKE_SHARED_LINKER_FLAGS_INIT=-fuse-ld=lld" - "-DCMAKE_MODULE_LINKER_FLAGS_INIT=-fuse-ld=lld" "-DCMAKE_C_COMPILER=$env:CC" "-DCMAKE_CXX_COMPILER=$env:CXX" "-DLLVM_TARGET_ARCH=X86" @@ -84,6 +81,7 @@ $CMAKE_ARGS = @( "-DIREE_TARGET_BACKEND_LLVM_CPU=ON" "-DIREE_CMAKE_PLUGIN_PATHS=$repo_root" "-DIREE_EXTERNAL_HAL_DRIVERS=xrt" + "-DIREE_BUILD_PYTHON_BINDINGS=ON" ) if ($llvm_install_dir -and (Test-Path "$llvm_install_dir")) @@ -92,7 +90,7 @@ if ($llvm_install_dir -and (Test-Path "$llvm_install_dir")) # TODO(max): send IREE a fix for this # target_compile_definitions may only set INTERFACE properties on IMPORTED $cmake_file = Resolve-Path -Path "$iree_dir/compiler/src/iree/compiler/API/CMakeLists.txt" - (Get-Content $cmake_file).Replace("`$`{_object_lib} PRIVATE", "`$`{_object_lib} INTERFACE") ` + (Get-Content $cmake_file).Replace("if(MSVC)", "get_target_property(_imported `$`{_object_lib} IMPORTED)`n if(MSVC AND NOT `$`{_imported})") ` | Out-File -encoding ASCII $cmake_file $CMAKE_ARGS += @( 
"-DIREE_BUILD_BUNDLED_LLVM=OFF" @@ -100,18 +98,8 @@ if ($llvm_install_dir -and (Test-Path "$llvm_install_dir")) "-DLLD_DIR=$llvm_install_dir/lib/cmake/lld" "-DMLIR_DIR=$llvm_install_dir/lib/cmake/mlir" "-DLLVM_DIR=$llvm_install_dir/lib/cmake/llvm" - # TODO(max) - # on windows python bindings don't for split build because - # i can't figure out MLIR_CAPI_EXPORTED and MLIR_CAPI_BUILDING_LIBRARY - # which somehow disables exceptions - "-DIREE_BUILD_PYTHON_BINDINGS=OFF" ) } -else -{ - echo "building bundled llvm" - $CMAKE_ARGS += @("-DIREE_BUILD_PYTHON_BINDINGS=ON") -} & cmake $CMAKE_ARGS -S $iree_dir -B $build_dir @@ -144,3 +132,4 @@ if ($llvm_install_dir -and (Test-Path "$llvm_install_dir")) } Copy-Item -Path "$build_dir/tools/testing/e2e/iree-e2e-matmul-test.exe" -Destination "$install_dir/bin" -Force +Copy-Item -Path "$build_dir/tools/xrt_coreutil.dll" -Destination "$install_dir/python_packages/iree_runtime/iree/_runtime_libs" -Force diff --git a/build_tools/download_peano.ps1 b/build_tools/download_peano.ps1 new file mode 100644 index 000000000..16f147e25 --- /dev/null +++ b/build_tools/download_peano.ps1 @@ -0,0 +1,11 @@ +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +$ErrorActionPreference = 'Stop' + +$RELEASE = "19.0.0.2024082221+90abe71b" +pip download llvm_aie==$RELEASE -f https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly +Expand-Archive (Get-ChildItem -Filter llvm*.whl).FullName -DestinationPath $PWD.Path \ No newline at end of file diff --git a/cmake/iree_aie_xrt.cmake b/cmake/iree_aie_xrt.cmake index c289a30f7..78658c81d 100644 --- a/cmake/iree_aie_xrt.cmake +++ b/cmake/iree_aie_xrt.cmake @@ -34,7 +34,6 @@ set(IREE_AIE_BOOST_LIBS any algorithm asio - exception format functional lexical_cast diff --git a/iree_compiler_plugin.cmake b/iree_compiler_plugin.cmake index c91066224..958d6de46 100644 --- a/iree_compiler_plugin.cmake +++ b/iree_compiler_plugin.cmake @@ -9,12 +9,12 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake") set(IREE_AMD_AIE_SOURCE_DIR "${CMAKE_CURRENT_LIST_DIR}") set(IREE_AMD_AIE_RUNTIME_SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/runtime/src) set(IREE_MLIR_AIR_SOURCE_DIR "${IREE_AMD_AIE_SOURCE_DIR}/third_party/mlir-air/mlir") +set(IREE_TARGET_BACKEND_AMD_AIE ON CACHE BOOL "") set(IREE_AMD_AIE_ENABLE_XRT_DRIVER OFF) if("xrt" IN_LIST IREE_EXTERNAL_HAL_DRIVERS) message(STATUS "Enabling XRT build because it is an enabled HAL driver") set(IREE_AMD_AIE_ENABLE_XRT_DRIVER ON) - set(IREE_TARGET_BACKEND_AMD_AIE ON CACHE BOOL "") endif() if(IREE_AMD_AIE_ENABLE_XRT_DRIVER) diff --git a/tests/conftest.py b/tests/conftest.py index b25c89516..82b7a4c98 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -3,7 +3,7 @@ import numpy as np import pytest -from iree._runtime_libs._runtime import VmModule +from iree.runtime import VmModule from iree.compiler import ir from iree.compiler._mlir_libs import get_dialect_registry From c159ea199e8693068c3589801c59fe7e975c7a93 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Thu, 26 Sep 2024 14:58:12 -0400 Subject: [PATCH 04/19] fix after rebase --- 
.../target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp | 3 ++- .../target/AMD-AIE/iree-amd-aie/Target/AIETarget.h | 5 +++++ .../target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp | 11 +++++++---- .../target/AMD-AIE/iree-amd-aie/Transforms/Passes.h | 6 ++++-- 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp index bffd85c59..2c643d39b 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.cpp @@ -173,7 +173,8 @@ class AIETargetBackend final : public IREE::HAL::TargetBackend { buildAMDAIETransformPassPipeline( passManager, options.AMDAIETargetDevice, options.useTilePipeline, options.useLowerToAIEPipeline, options.matmulElementwiseFusion, - options.enableVectorizationPasses, options.pathToUkernels); + options.enableVectorizationPasses, options.pathToUkernels, + options.enablePacketFlow); } void buildLinkingPassPipeline(OpPassManager &passManager) override { diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h index a8a6c4168..1f6518909 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Target/AIETarget.h @@ -52,6 +52,7 @@ struct AMDAIEOptions { bool matmulElementwiseFusion{false}; AMDAIEDevice AMDAIETargetDevice{AMDAIEDevice::npu1_4col}; std::string enableAMDAIEUkernels{"none"}; + bool enablePacketFlow{false}; void bindOptions(OptionsBinder &binder) { static llvm::cl::OptionCategory category("AMD AIE Options"); @@ -182,6 +183,10 @@ struct AMDAIEOptions { "Phoenix NPU with four columns"), clEnumValN(AMDAIEDevice::npu4, "npu4", "Strix B0 NPU with 8 columns and 6 rows"))); + + binder.opt("iree-amdaie-enable-packet-flow", enablePacketFlow, + llvm::cl::cat(category), + 
llvm::cl::desc("Enable packet routing data movement.")); } }; diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp index c83fb8b5c..dfd7a02cb 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.cpp @@ -508,7 +508,8 @@ void buildAMDAIETransformPassPipeline( OpPassManager &variantPassManager, AMDAIEDevice device, TilePassPipeline useTilePipeline, LowerToAIEPassPipeline useLowerToAIEPipeline, bool matmulElementwiseFusion, - bool enableVectorizationPasses, const std::string &pathToUkernels) { + bool enableVectorizationPasses, const std::string &pathToUkernels, + bool enablePacketFlow) { OpPassManager &modulePassManager = variantPassManager.nest(); { FunctionLikeNest funcPassManager(modulePassManager); @@ -533,7 +534,7 @@ void buildAMDAIETransformPassPipeline( } modulePassManager.addPass(createLowerUKernelOpsToCallsPass()); if (useLowerToAIEPipeline == LowerToAIEPassPipeline::ObjectFifo) { - addAMDAIEObjectFifoLoweringPasses(modulePassManager); + addAMDAIEObjectFifoLoweringPasses(modulePassManager, enablePacketFlow); } else if (useLowerToAIEPipeline == LowerToAIEPassPipeline::AIR) { addMLIRAIRLoweringPasses(modulePassManager, device, useTilePipeline, matmulElementwiseFusion); @@ -552,7 +553,8 @@ void buildAMDAIETransformPassPipeline( }); } -void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager) { +void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager, + bool enablePacketFlow) { passManager.addPass(createEraseHALDescriptorTypeFromMemRefPass()); passManager.addPass(memref::createFoldMemRefAliasOpsPass()); passManager.addPass(createAMDAIEConvertToDmaPass()); @@ -580,7 +582,8 @@ void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager) { passManager.addPass(createAMDAIEAssignLogicalObjectFifoDepthPass()); 
passManager.addPass(createAMDAIEAccessToAcquireReleasePass()); passManager.addPass(createAMDAIENoneAccessToTemporaryBufferPass()); - passManager.addPass(createAMDAIEAssignConnectionTypesPass({clEnablePacketFlow})); + passManager.addPass( + createAMDAIEAssignConnectionTypesPass({enablePacketFlow})); passManager.addPass(createCSEPass()); passManager.addPass(createCanonicalizerPass()); diff --git a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h index 79a20280e..1fbc24ca1 100644 --- a/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h +++ b/compiler/plugins/target/AMD-AIE/iree-amd-aie/Transforms/Passes.h @@ -15,7 +15,8 @@ namespace mlir::iree_compiler::AMDAIE { /// Add passes to lower to AIE objectFifos. -void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager); +void addAMDAIEObjectFifoLoweringPasses(OpPassManager &passManager, + bool enablePacketFlow); /// Add passes to lower from MLIR-AIR through AIE. This is /// currently the default passes used for lowering after IREEs tiling. 
@@ -34,7 +35,8 @@ void buildAMDAIETransformPassPipeline( OpPassManager &variantPassManager, AMDAIEDevice device, TilePassPipeline useTilePipeline, LowerToAIEPassPipeline useLowerToAIEPipeline, bool matmulElementwiseFusion, - bool enableVectorizationPasses, const std::string &pathToUkernels); + bool enableVectorizationPasses, const std::string &pathToUkernels, + bool enablePacketFlow); void addLowerToLLVMPasses(OpPassManager &pm); From 57f665abd745dea8f4e790e4ac75deafd698e5fa Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Thu, 26 Sep 2024 16:55:54 -0400 Subject: [PATCH 05/19] try to fix xrt bugs --- .github/workflows/ci-windows.yml | 51 ++++++++++--------- cmake/iree_aie_xrt.cmake | 5 +- .../driver/xrt/native_executable.cc | 11 ++-- .../driver/xrt/native_executable.h | 1 + tests/conftest.py | 4 +- tests/test_matmul.py | 32 ++++++++++++ 6 files changed, 72 insertions(+), 32 deletions(-) diff --git a/.github/workflows/ci-windows.yml b/.github/workflows/ci-windows.yml index d98c1a624..727154b43 100644 --- a/.github/workflows/ci-windows.yml +++ b/.github/workflows/ci-windows.yml @@ -138,9 +138,9 @@ jobs: name: E2E Test windows runs-on: windows-phoenix needs: build_and_ctest - defaults: - run: - shell: bash +# defaults: +# run: +# shell: bash env: XILINX_XRT: "/c/Xilinx/XRT" steps: @@ -157,34 +157,35 @@ jobs: - name: Extract artifact run: | tar -xvf iree-dist-windows.tar - echo "IREE_INSTALL_DIR=$PWD/iree-install" >> $GITHUB_ENV - echo "PYTHONPATH=$PWD/iree-install/python_packages/iree_compiler:$PWD/iree-install/python_packages/iree_runtime" >> $GITHUB_ENV - bash build_tools/download_peano.sh - echo "PEANO_INSTALL_DIR=$PWD/llvm-aie" >> $GITHUB_ENV + Add-Content -Path $env:GITHUB_ENV -Value "IREE_INSTALL_DIR=$PWD\iree-install" + Add-Content -Path $env:GITHUB_ENV -Value "PYTHONPATH=$PWD\iree-install\python_packages\iree_compiler;$PWD\iree-install\python_packages\iree_runtime" + .\build_tools\download_peano.ps1 + Add-Content -Path $env:GITHUB_ENV -Value 
"PEANO_INSTALL_DIR=$PWD\llvm-aie" - name: Create venv and install dependencies run: | python -m venv .venv - source .venv/Scripts/activate + .\.venv\Scripts\Activate.ps1 pip install -r tests/requirements.txt - - name: E2E correctness matmul test - run: | - source .venv/Scripts/activate - bash build_tools/ci/run_matmul_test.sh \ - /c/test_matmuls \ - $PWD/iree-install \ - $PWD/llvm-aie - - - name : E2E comparison of AIE to llvm-cpu - run: | - source .venv/Scripts/activate - python build_tools/ci/cpu_comparison/run.py \ - /c/test_aie_vs_cpu \ - $PWD/iree-install \ - $PWD/llvm-aie -v +# - name: E2E correctness matmul test +# run: | +# source .venv/Scripts/activate +# bash build_tools/ci/run_matmul_test.sh \ +# /c/test_matmuls \ +# $PWD/iree-install \ +# $PWD/llvm-aie +# +# - name : E2E comparison of AIE to llvm-cpu +# run: | +# source .venv/Scripts/activate +# python build_tools/ci/cpu_comparison/run.py \ +# /c/test_aie_vs_cpu \ +# $PWD/iree-install \ +# $PWD/llvm-aie -v - name: Python tests run: | - source .venv/Scripts/activate - pytest -s tests + echo "PYTHONPATH: $env:PYTHONPATH" + .\.venv\Scripts\Activate.ps1 + pytest -s --basetemp=$env:TEMP tests diff --git a/cmake/iree_aie_xrt.cmake b/cmake/iree_aie_xrt.cmake index 78658c81d..c6315c389 100644 --- a/cmake/iree_aie_xrt.cmake +++ b/cmake/iree_aie_xrt.cmake @@ -175,7 +175,10 @@ foreach(_core_lib IN LISTS _core_libs) ${IREE_XRT_SOURCE_DIR}/runtime_src) target_include_directories(${_core_lib} SYSTEM PUBLIC ${IREE_XRT_SOURCE_DIR}/runtime_src/core/common/elf) - target_compile_definitions(${_core_lib} PUBLIC -DBOOST_BIND_GLOBAL_PLACEHOLDERS) + target_compile_definitions(${_core_lib} + PUBLIC + -DBOOST_BIND_GLOBAL_PLACEHOLDERS + $<$:-DXRT_VERBOSE>) target_compile_options(${_core_lib} PRIVATE $<$:-fexceptions -frtti -w> diff --git a/runtime/src/iree-amd-aie/driver/xrt/native_executable.cc b/runtime/src/iree-amd-aie/driver/xrt/native_executable.cc index 09caa5978..106f89cb3 100644 --- 
a/runtime/src/iree-amd-aie/driver/xrt/native_executable.cc +++ b/runtime/src/iree-amd-aie/driver/xrt/native_executable.cc @@ -190,9 +190,10 @@ iree_status_t iree_hal_xrt_native_executable_create( return iree_make_status(IREE_STATUS_INTERNAL, "XCLBIN register error: %s", e.what()); } - xrt::hw_context context; + std::unique_ptr context; try { - context = {*device, xclbin->get_uuid()}; + context = std::make_unique( + *device, xclbin->get_uuid(), xrt::hw_context::access_mode::exclusive); } catch (std::exception& e) { return iree_make_status(IREE_STATUS_INTERNAL, "xrt::hw_context context: %s", e.what()); @@ -207,7 +208,7 @@ iree_status_t iree_hal_xrt_native_executable_create( std::unique_ptr kernel; std::unique_ptr instr; try { - kernel = std::make_unique(context, entry_name); + kernel = std::make_unique(*context, entry_name); // XCL_BO_FLAGS_CACHEABLE is used to indicate that this is an instruction // buffer that resides in instr_memory. This buffer is always passed as // the second argument to the kernel and we can use group id 1. @@ -230,6 +231,7 @@ iree_status_t iree_hal_xrt_native_executable_create( instr->sync(XCL_BO_SYNC_BO_TO_DEVICE); iree_hal_xrt_kernel_params_t* params = &executable->entry_points[entry_ordinal]; + params->context = context.release(); params->xclbin = xclbin.release(); params->kernel = kernel.release(); params->instr = instr.release(); @@ -277,15 +279,16 @@ static void iree_hal_xrt_native_executable_destroy( for (iree_host_size_t i = 0; i < executable->entry_point_count; ++i) { try { + delete executable->entry_points[i].kernel; #ifndef _WIN32 // causes segmentation fault on windows - delete executable->entry_points[i].kernel; delete executable->entry_points[i].instr; #endif // TODO(jornt): deleting the xclbin here will result in a corrupted size // error in XRT. It looks like the xclbin needs to stay alive while the // device is alive if it has been registered. 
// delete executable->entry_points[i].xclbin; + // delete executable->entry_points[i].context; } catch (...) { (void)iree_status_from_code(IREE_STATUS_DATA_LOSS); } diff --git a/runtime/src/iree-amd-aie/driver/xrt/native_executable.h b/runtime/src/iree-amd-aie/driver/xrt/native_executable.h index 03544e62b..aab760faf 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/native_executable.h +++ b/runtime/src/iree-amd-aie/driver/xrt/native_executable.h @@ -21,6 +21,7 @@ extern "C" { // Object and launch parameters for a compute kernel. typedef struct iree_hal_xrt_kernel_params_t { + xrt::hw_context* context; xrt::xclbin* xclbin; // The kernel code object. xrt::kernel* kernel; diff --git a/tests/conftest.py b/tests/conftest.py index 82b7a4c98..ec0dca1ae 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -52,9 +52,9 @@ def iree_session(request) -> Session: @pytest.fixture -def session_module(iree_session, tmpdir) -> ir.Module: +def session_module(iree_session, tmp_path) -> ir.Module: iree_session.set_flags( - f"--iree-hal-dump-executable-files-to={tmpdir}", + f"--iree-hal-dump-executable-files-to={tmp_path}", ) with ir.Location.unknown(iree_session.context): module_op = ir.Module.create() diff --git a/tests/test_matmul.py b/tests/test_matmul.py index 7659a2d33..8e667416b 100644 --- a/tests/test_matmul.py +++ b/tests/test_matmul.py @@ -1,3 +1,5 @@ +import random + import numpy as np import pytest @@ -56,9 +58,39 @@ def matmul(lhs, rhs): (64, 64, 64, T.f32, T.f32), (128, 128, 128, T.f32, T.f32), (128, 256, 128, T.f32, T.f32), + + (32, 16, 32, T.i8, T.i32), + (32, 32, 32, T.i8, T.i32), + (64, 32, 64, T.i8, T.i32), + (64, 64, 64, T.i8, T.i32), + (128, 64, 128, T.i8, T.i32), + (128, 128, 128, T.i8, T.i32), + (128, 256, 128, T.i8, T.i32), + (32, 16, 32, T.f32, T.f32), + (32, 32, 32, T.f32, T.f32), + (64, 32, 64, T.f32, T.f32), + (64, 64, 64, T.f32, T.f32), + (128, 128, 128, T.f32, T.f32), + (128, 256, 128, T.f32, T.f32), + + (32, 16, 32, T.i8, T.i32), + (32, 32, 32, T.i8, 
T.i32), + (64, 32, 64, T.i8, T.i32), + (64, 64, 64, T.i8, T.i32), + (128, 64, 128, T.i8, T.i32), + (128, 128, 128, T.i8, T.i32), + (128, 256, 128, T.i8, T.i32), + (32, 16, 32, T.f32, T.f32), + (32, 32, 32, T.f32, T.f32), + (64, 32, 64, T.f32, T.f32), + (64, 64, 64, T.f32, T.f32), + (128, 128, 128, T.f32, T.f32), + (128, 256, 128, T.f32, T.f32), ] +random.shuffle(testdata) + def ids(datum): if callable(datum): return datum.__name__ From 46ef58eb954a79aa07e864d8f4d26e0a888d6788 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Thu, 26 Sep 2024 20:24:43 -0400 Subject: [PATCH 06/19] Update ci-windows.yml --- .github/workflows/ci-windows.yml | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci-windows.yml b/.github/workflows/ci-windows.yml index 727154b43..98b6b8e64 100644 --- a/.github/workflows/ci-windows.yml +++ b/.github/workflows/ci-windows.yml @@ -188,4 +188,5 @@ jobs: run: | echo "PYTHONPATH: $env:PYTHONPATH" .\.venv\Scripts\Activate.ps1 - pytest -s --basetemp=$env:TEMP tests + mkdir temp + pytest -s --basetemp=$PWD\temp tests From 5a0d800b794673508f1af759495a45b4f267679d Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Thu, 26 Sep 2024 21:09:40 -0400 Subject: [PATCH 07/19] Update ci-windows.yml --- .github/workflows/ci-windows.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci-windows.yml b/.github/workflows/ci-windows.yml index 98b6b8e64..6e73cdd1c 100644 --- a/.github/workflows/ci-windows.yml +++ b/.github/workflows/ci-windows.yml @@ -186,7 +186,7 @@ jobs: - name: Python tests run: | - echo "PYTHONPATH: $env:PYTHONPATH" + ls $env:XILINX_XRT .\.venv\Scripts\Activate.ps1 mkdir temp pytest -s --basetemp=$PWD\temp tests From 9a9f3e9974172d07a00a5183d29ec9ea682ca494 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Thu, 26 Sep 2024 22:02:59 -0400 Subject: [PATCH 08/19] Update ci-windows.yml --- .github/workflows/ci-windows.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) 
diff --git a/.github/workflows/ci-windows.yml b/.github/workflows/ci-windows.yml index 6e73cdd1c..cdad45f03 100644 --- a/.github/workflows/ci-windows.yml +++ b/.github/workflows/ci-windows.yml @@ -142,7 +142,7 @@ jobs: # run: # shell: bash env: - XILINX_XRT: "/c/Xilinx/XRT" + XILINX_XRT: "C:\\Xilinx\\XRT" steps: - name: "Checking out repository" # for test scripts uses: actions/checkout@8f4b7f84864484a7bf31766abe9204da3cbe65b3 # v3.5.0 From d18f6167402613371cba195cc033b479df4e4b76 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Fri, 27 Sep 2024 17:36:37 -0400 Subject: [PATCH 09/19] try to fix xrt bugs --- .github/workflows/ci-windows.yml | 36 +++++----- build_tools/build_test_cpp.sh | 2 + .../driver/xrt/direct_allocator.cc | 14 ++-- .../driver/xrt/direct_allocator.h | 4 +- .../driver/xrt/direct_command_buffer.cc | 32 ++++----- .../driver/xrt/native_executable.cc | 66 +++++++------------ .../driver/xrt/native_executable.h | 12 ++-- .../driver/xrt/nop_executable_cache.cc | 4 +- .../driver/xrt/nop_executable_cache.h | 4 +- .../src/iree-amd-aie/driver/xrt/xrt_device.cc | 6 +- .../src/iree-amd-aie/driver/xrt/xrt_device.h | 10 +-- .../src/iree-amd-aie/driver/xrt/xrt_driver.cc | 23 ++----- 12 files changed, 90 insertions(+), 123 deletions(-) diff --git a/.github/workflows/ci-windows.yml b/.github/workflows/ci-windows.yml index cdad45f03..5a13344be 100644 --- a/.github/workflows/ci-windows.yml +++ b/.github/workflows/ci-windows.yml @@ -166,23 +166,25 @@ jobs: run: | python -m venv .venv .\.venv\Scripts\Activate.ps1 - pip install -r tests/requirements.txt - -# - name: E2E correctness matmul test -# run: | -# source .venv/Scripts/activate -# bash build_tools/ci/run_matmul_test.sh \ -# /c/test_matmuls \ -# $PWD/iree-install \ -# $PWD/llvm-aie -# -# - name : E2E comparison of AIE to llvm-cpu -# run: | -# source .venv/Scripts/activate -# python build_tools/ci/cpu_comparison/run.py \ -# /c/test_aie_vs_cpu \ -# $PWD/iree-install \ -# $PWD/llvm-aie -v + pip install -r 
tests\requirements.txt + + - name: E2E correctness matmul test + shell: bash + run: | + source .venv/Scripts/activate + bash build_tools/ci/run_matmul_test.sh \ + /c/test_matmuls \ + $PWD/iree-install \ + $PWD/llvm-aie + + - name : E2E comparison of AIE to llvm-cpu + shell: bash + run: | + source .venv/Scripts/activate + python build_tools/ci/cpu_comparison/run.py \ + /c/test_aie_vs_cpu \ + $PWD/iree-install \ + $PWD/llvm-aie -v - name: Python tests run: | diff --git a/build_tools/build_test_cpp.sh b/build_tools/build_test_cpp.sh index 499d9ba80..d04bf8bb4 100644 --- a/build_tools/build_test_cpp.sh +++ b/build_tools/build_test_cpp.sh @@ -117,6 +117,8 @@ echo "Installing" echo "----------" echo "Install to: $install_dir" cmake --build "$build_dir" --target install +# TODO(max): there's no way to install the python runtime bindings +# ninja iree-install-dist install-IREECompilerPythonModules install-IREEDialectsPythonModules cmake --build "$build_dir" --target iree-install-dist echo "CTest" diff --git a/runtime/src/iree-amd-aie/driver/xrt/direct_allocator.cc b/runtime/src/iree-amd-aie/driver/xrt/direct_allocator.cc index 7b1b884ac..641f08c9c 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/direct_allocator.cc +++ b/runtime/src/iree-amd-aie/driver/xrt/direct_allocator.cc @@ -28,7 +28,7 @@ typedef struct iree_hal_xrt_allocator_t { // The device that this allocator is attached to. 
iree_hal_device_t* base_device; - xrt::device* device; + xrt::device device; iree_allocator_t host_allocator; @@ -46,7 +46,7 @@ static iree_hal_xrt_allocator_t* iree_hal_xrt_allocator_cast( } iree_status_t iree_hal_xrt_allocator_create( - iree_hal_device_t* base_device, xrt::device* device, + iree_hal_device_t* base_device, xrt::device device, iree_allocator_t host_allocator, iree_hal_allocator_t** out_allocator) { IREE_ASSERT_ARGUMENT(base_device); IREE_ASSERT_ARGUMENT(out_allocator); @@ -171,17 +171,18 @@ static iree_status_t iree_hal_xrt_allocator_allocate_buffer( std::unique_ptr xrt_buffer; try { - xrt_buffer = std::make_unique(*allocator->device, allocation_size, + xrt_buffer = std::make_unique(allocator->device, allocation_size, XRT_BO_FLAGS_HOST_ONLY, group_id); } catch (...) { IREE_TRACE_ZONE_END(z0); return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED, "could not allocate memory for buffer"); } - IREE_TRACE_ZONE_END(z0); + if (!xrt_buffer) { - status = iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED, - "unable to allocate buffer"); + IREE_TRACE_ZONE_END(z0); + return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED, + "unable to allocate buffer"); } iree_hal_buffer_t* buffer = nullptr; @@ -203,6 +204,7 @@ static iree_status_t iree_hal_xrt_allocator_allocate_buffer( } else { if (buffer) iree_hal_buffer_release(buffer); } + IREE_TRACE_ZONE_END(z0); return status; } diff --git a/runtime/src/iree-amd-aie/driver/xrt/direct_allocator.h b/runtime/src/iree-amd-aie/driver/xrt/direct_allocator.h index a6272a6eb..39a0f3e10 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/direct_allocator.h +++ b/runtime/src/iree-amd-aie/driver/xrt/direct_allocator.h @@ -17,11 +17,11 @@ extern "C" { // Creates an XRT memory allocator. 
iree_status_t iree_hal_xrt_allocator_create( - iree_hal_device_t* base_device, xrt::device* device, + iree_hal_device_t* base_device, xrt::device device, iree_allocator_t host_allocator, iree_hal_allocator_t** out_allocator); #ifdef __cplusplus -} // extern "C" +} // extern "C" #endif // __cplusplus #endif // IREE_AMD_AIE_DRIVER_XRT_DIRECT_ALLOCATOR_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt/direct_command_buffer.cc b/runtime/src/iree-amd-aie/driver/xrt/direct_command_buffer.cc index fe47cdf6a..5785f3484 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/direct_command_buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt/direct_command_buffer.cc @@ -323,51 +323,43 @@ static iree_status_t iree_hal_xrt_direct_command_buffer_dispatch( z0, iree_hal_resource_set_insert(command_buffer->resource_set, 1, &executable)); - xrt::kernel kernel = *kernel_params.kernel; - xrt::bo instr = *kernel_params.instr; - uint32_t num_instr = kernel_params.num_instr; - xrt::run run = xrt::run(kernel); - - // set opcode for transaction binary execution - unsigned int opcode = 3; - + xrt::run run = xrt::run(kernel_params.kernel); // Index to push arguments on the kernel. iree_host_size_t arg_index = 0; - // First argument is the opcode. + unsigned int opcode = 3; run.set_arg(arg_index++, opcode); - // Second argument is the LX6 instructions. - run.set_arg(arg_index++, instr); - + run.set_arg(arg_index++, kernel_params.instr); // Third argument is the number of LX6 instructions. - run.set_arg(arg_index++, num_instr); + run.set_arg(arg_index++, kernel_params.num_instr); // Copy descriptors from all sets to the end of the current segment for later // access. // TODO(jornt): hack to ensure that the output buffer is synced by syncing all // buffers after the run. - xrt::bo arg_buffer; std::vector bos; // TODO(max): do we need multiple descriptor sets ever for AIE? 
uint32_t set = 0; iree_hal_xrt_direct_command_buffer_push_descriptor_set( base_command_buffer, set, bindings.count, bindings.values); for (iree_host_size_t j = 0; j < bindings.count; ++j) { - arg_buffer = xrt::bo(*command_buffer->descriptor_sets[set].bindings[j], - command_buffer->descriptor_sets[set].lengths[j], - command_buffer->descriptor_sets[set].offsets[j]); + xrt::bo arg_buffer = + xrt::bo(*command_buffer->descriptor_sets[set].bindings[j], + command_buffer->descriptor_sets[set].lengths[j], + command_buffer->descriptor_sets[set].offsets[j]); bos.push_back(arg_buffer); run.set_arg(arg_index + j, arg_buffer); } + run.start(); try { run.wait2(); - } catch (...) { + } catch (const std::exception& e) { IREE_TRACE_ZONE_END(z0); - return iree_make_status(IREE_STATUS_UNKNOWN, - "failed to wait for xrt kernel run to finish"); + return iree_make_status(IREE_STATUS_UNKNOWN, e.what()); } + for (xrt::bo& bo : bos) bo.sync(XCL_BO_SYNC_BO_FROM_DEVICE); IREE_TRACE_ZONE_END(z0); diff --git a/runtime/src/iree-amd-aie/driver/xrt/native_executable.cc b/runtime/src/iree-amd-aie/driver/xrt/native_executable.cc index 106f89cb3..c3f8c7aa4 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/native_executable.cc +++ b/runtime/src/iree-amd-aie/driver/xrt/native_executable.cc @@ -21,7 +21,7 @@ typedef struct iree_hal_xrt_native_executable_t { iree_allocator_t host_allocator; iree_host_size_t entry_point_count; - iree_hal_xrt_kernel_params_t entry_points[]; + iree_hal_xrt_kernel_params_t entry_points[16]; } iree_hal_xrt_native_executable_t; namespace { @@ -99,7 +99,7 @@ static iree_status_t iree_amd_aie_hal_xrt_native_executable_flatbuffer_verify( } iree_status_t iree_hal_xrt_native_executable_create( - xrt::device* device, const iree_hal_executable_params_t* executable_params, + xrt::device device, const iree_hal_executable_params_t* executable_params, iree_allocator_t host_allocator, iree_hal_executable_t** out_executable) { IREE_ASSERT_ARGUMENT(executable_params); 
IREE_ASSERT_ARGUMENT(out_executable); @@ -174,30 +174,35 @@ iree_status_t iree_hal_xrt_native_executable_create( flatbuffers_string_t xclbin_fb = iree_amd_aie_hal_xrt_XclbinDef_xclbin_get(xclbin_def); + iree_hal_xrt_kernel_params_t* params = + &executable->entry_points[entry_ordinal]; + // XRT API needs this vector and cant actually read a void*. std::vector xclbinVector( xclbin_fb, xclbin_fb + flatbuffers_string_len(xclbin_fb)); - std::unique_ptr xclbin; try { - xclbin = std::make_unique(xclbinVector); + params->xclbin = xrt::xclbin(xclbinVector); } catch (std::exception& e) { return iree_make_status(IREE_STATUS_INTERNAL, "XCLBIN load error: %s", e.what()); } + try { - device->register_xclbin(*xclbin); + device.register_xclbin(params->xclbin); } catch (std::exception& e) { return iree_make_status(IREE_STATUS_INTERNAL, "XCLBIN register error: %s", e.what()); } - std::unique_ptr context; + try { - context = std::make_unique( - *device, xclbin->get_uuid(), xrt::hw_context::access_mode::exclusive); + params->context = + xrt::hw_context(device, params->xclbin.get_uuid(), + xrt::hw_context::access_mode::exclusive); } catch (std::exception& e) { return iree_make_status(IREE_STATUS_INTERNAL, "xrt::hw_context context: %s", e.what()); } + uint32_t asm_instr_index = flatbuffers_uint32_vec_at(asm_instr_indices_vec, entry_ordinal); iree_amd_aie_hal_xrt_AsmInstDef_table_t asminst_def = @@ -205,17 +210,16 @@ iree_status_t iree_hal_xrt_native_executable_create( flatbuffers_uint32_vec_t asm_inst = iree_amd_aie_hal_xrt_AsmInstDef_asm_inst_get(asminst_def); uint32_t num_instr = flatbuffers_uint32_vec_len(asm_inst); - std::unique_ptr kernel; - std::unique_ptr instr; + try { - kernel = std::make_unique(*context, entry_name); + params->kernel = xrt::kernel(params->context, entry_name); // XCL_BO_FLAGS_CACHEABLE is used to indicate that this is an instruction // buffer that resides in instr_memory. 
This buffer is always passed as // the second argument to the kernel and we can use group id 1. int group_id = 1; - instr = std::make_unique(*device, num_instr * sizeof(uint32_t), - XCL_BO_FLAGS_CACHEABLE, - kernel->group_id(group_id)); + params->instr = + xrt::bo(device, num_instr * sizeof(uint32_t), XCL_BO_FLAGS_CACHEABLE, + params->kernel.group_id(group_id)); } catch (...) { iree_hal_executable_destroy((iree_hal_executable_t*)executable); IREE_TRACE_ZONE_END(z0); @@ -223,18 +227,12 @@ iree_status_t iree_hal_xrt_native_executable_create( IREE_STATUS_RESOURCE_EXHAUSTED, "could not allocate memory for kernel or instr buffer"); } - uint32_t* instr_buffer = instr->map(); - for (int j = 0; j < num_instr; j++) { - instr_buffer[j] = flatbuffers_uint32_vec_at(asm_inst, j); - } + + uint32_t* instr_buffer = params->instr.map(); + memcpy(instr_buffer, asm_inst, num_instr * sizeof(uint32_t)); + // The Ryzen AI device is not cache coherent, so it is important to sync - instr->sync(XCL_BO_SYNC_BO_TO_DEVICE); - iree_hal_xrt_kernel_params_t* params = - &executable->entry_points[entry_ordinal]; - params->context = context.release(); - params->xclbin = xclbin.release(); - params->kernel = kernel.release(); - params->instr = instr.release(); + params->instr.sync(XCL_BO_SYNC_BO_TO_DEVICE); params->num_instr = num_instr; // Stash the entry point name in the string table for use when tracing. 
@@ -265,6 +263,7 @@ iree_status_t iree_hal_xrt_native_executable_create( } }); } + *out_executable = (iree_hal_executable_t*)executable; IREE_TRACE_ZONE_END(z0); return iree_ok_status(); @@ -277,22 +276,6 @@ static void iree_hal_xrt_native_executable_destroy( iree_allocator_t host_allocator = executable->host_allocator; IREE_TRACE_ZONE_BEGIN(z0); - for (iree_host_size_t i = 0; i < executable->entry_point_count; ++i) { - try { - delete executable->entry_points[i].kernel; -#ifndef _WIN32 - // causes segmentation fault on windows - delete executable->entry_points[i].instr; -#endif - // TODO(jornt): deleting the xclbin here will result in a corrupted size - // error in XRT. It looks like the xclbin needs to stay alive while the - // device is alive if it has been registered. - // delete executable->entry_points[i].xclbin; - // delete executable->entry_points[i].context; - } catch (...) { - (void)iree_status_from_code(IREE_STATUS_DATA_LOSS); - } - } iree_allocator_free(host_allocator, executable); IREE_TRACE_ZONE_END(z0); @@ -309,6 +292,7 @@ iree_status_t iree_hal_xrt_native_executable_entry_point_kernel_params( "only contains %" PRIhsz " entry points", entry_point, executable->entry_point_count); } + memcpy(out_params, &executable->entry_points[entry_point], sizeof(*out_params)); return iree_ok_status(); diff --git a/runtime/src/iree-amd-aie/driver/xrt/native_executable.h b/runtime/src/iree-amd-aie/driver/xrt/native_executable.h index aab760faf..141bbebca 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/native_executable.h +++ b/runtime/src/iree-amd-aie/driver/xrt/native_executable.h @@ -21,12 +21,12 @@ extern "C" { // Object and launch parameters for a compute kernel. typedef struct iree_hal_xrt_kernel_params_t { - xrt::hw_context* context; - xrt::xclbin* xclbin; + xrt::hw_context context; + xrt::xclbin xclbin; // The kernel code object. - xrt::kernel* kernel; + xrt::kernel kernel; // Instruction buffer argument to the kernel. 
- xrt::bo* instr; + xrt::bo instr; // Number of assembly instructions argument to the kernel uint32_t num_instr; // number of instructions IREE_TRACE(iree_string_view_t kernel_name;) @@ -37,7 +37,7 @@ typedef struct iree_hal_xrt_kernel_params_t { // |out_executable| must be released by the caller (see // iree_hal_executable_release). iree_status_t iree_hal_xrt_native_executable_create( - xrt::device* device, const iree_hal_executable_params_t* executable_params, + xrt::device device, const iree_hal_executable_params_t* executable_params, iree_allocator_t host_allocator, iree_hal_executable_t** out_executable); // Returns the kernel launch parameters for the given |entry_point|. @@ -46,7 +46,7 @@ iree_status_t iree_hal_xrt_native_executable_entry_point_kernel_params( iree_hal_xrt_kernel_params_t* out_params); #ifdef __cplusplus -} // extern "C" +} // extern "C" #endif // __cplusplus #endif // IREE_AMD_AIE_DRIVER_XRT_NATIVE_EXECUTABLE_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt/nop_executable_cache.cc b/runtime/src/iree-amd-aie/driver/xrt/nop_executable_cache.cc index 50e67d78e..655133e61 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/nop_executable_cache.cc +++ b/runtime/src/iree-amd-aie/driver/xrt/nop_executable_cache.cc @@ -17,7 +17,7 @@ typedef struct iree_hal_xrt_nop_executable_cache_t { // at offset 0. 
iree_hal_resource_t resource; - xrt::device* device; + xrt::device device; iree_allocator_t host_allocator; } iree_hal_xrt_nop_executable_cache_t; @@ -35,7 +35,7 @@ iree_hal_xrt_nop_executable_cache_cast( } iree_status_t iree_hal_xrt_nop_executable_cache_create( - xrt::device* device, iree_string_view_t identifier, + xrt::device device, iree_string_view_t identifier, iree_allocator_t host_allocator, iree_hal_executable_cache_t** out_executable_cache) { IREE_ASSERT_ARGUMENT(out_executable_cache); diff --git a/runtime/src/iree-amd-aie/driver/xrt/nop_executable_cache.h b/runtime/src/iree-amd-aie/driver/xrt/nop_executable_cache.h index dd1c902bc..9a84f9e6d 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/nop_executable_cache.h +++ b/runtime/src/iree-amd-aie/driver/xrt/nop_executable_cache.h @@ -22,12 +22,12 @@ extern "C" { // |out_executable_cache| must be released by the caller (see // iree_hal_executable_cache_release). iree_status_t iree_hal_xrt_nop_executable_cache_create( - xrt::device* device, iree_string_view_t identifier, + xrt::device device, iree_string_view_t identifier, iree_allocator_t host_allocator, iree_hal_executable_cache_t** out_executable_cache); #ifdef __cplusplus -} // extern "C" +} // extern "C" #endif // __cplusplus #endif // IREE_AMD_AIE_DRIVER_XRT_NOP_EXECUTABLE_CACHE_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt/xrt_device.cc b/runtime/src/iree-amd-aie/driver/xrt/xrt_device.cc index 031955c2d..7b9a36f78 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/xrt_device.cc +++ b/runtime/src/iree-amd-aie/driver/xrt/xrt_device.cc @@ -32,7 +32,7 @@ typedef struct iree_hal_xrt_device_t { iree_allocator_t host_allocator; iree_hal_allocator_t* device_allocator; - xrt::device* device; + xrt::device device; } iree_hal_xrt_device_t; namespace { @@ -52,7 +52,7 @@ void iree_hal_xrt_device_params_initialize( } static iree_status_t iree_hal_xrt_device_create_internal( - iree_string_view_t identifier, xrt::device* xrt_device, + iree_string_view_t identifier, 
xrt::device xrt_device, const iree_hal_xrt_device_params_t* params, iree_allocator_t host_allocator, iree_hal_device_t** out_device) { iree_hal_xrt_device_t* device = nullptr; @@ -85,7 +85,7 @@ static iree_status_t iree_hal_xrt_device_create_internal( iree_status_t iree_hal_xrt_device_create( iree_string_view_t identifier, const iree_hal_xrt_device_params_t* params, - xrt::device* device, iree_allocator_t host_allocator, + xrt::device device, iree_allocator_t host_allocator, iree_hal_device_t** out_device) { IREE_ASSERT_ARGUMENT(out_device); IREE_TRACE_ZONE_BEGIN(z0); diff --git a/runtime/src/iree-amd-aie/driver/xrt/xrt_device.h b/runtime/src/iree-amd-aie/driver/xrt/xrt_device.h index f6e79bfb9..f23610dbd 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/xrt_device.h +++ b/runtime/src/iree-amd-aie/driver/xrt/xrt_device.h @@ -16,22 +16,16 @@ extern "C" { #endif // __cplusplus -/// P0 TODO(jornt): get rid of the global variable. -/// Using a global variable is currently the only 'reliable' approach that will -/// not result in occasional hangs. Creating a unique pointer and releasing it -/// doesn't work for some reason. Needs further debugging. -static xrt::device global_device; - // Creates a XRT device by wrapping |device| from the given |driver| with the // specific |params|. // |out_device| must be released by the caller (see iree_hal_device_release). 
iree_status_t iree_hal_xrt_device_create( iree_string_view_t identifier, const iree_hal_xrt_device_params_t* params, - xrt::device* device, iree_allocator_t host_allocator, + xrt::device device, iree_allocator_t host_allocator, iree_hal_device_t** out_device); #ifdef __cplusplus -} // extern "C" +} // extern "C" #endif // __cplusplus #endif // IREE_AMD_AIE_DRIVER_XRT_XRT_DEVICE_H_ diff --git a/runtime/src/iree-amd-aie/driver/xrt/xrt_driver.cc b/runtime/src/iree-amd-aie/driver/xrt/xrt_driver.cc index fe13ce0e3..2f8a1134f 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/xrt_driver.cc +++ b/runtime/src/iree-amd-aie/driver/xrt/xrt_driver.cc @@ -40,7 +40,7 @@ typedef struct iree_hal_xrt_driver_t { // Parameters used to control device behavior. iree_hal_xrt_device_params_t device_params; - xrt::device* device; + xrt::device device; } iree_hal_xrt_driver_t; @@ -91,8 +91,7 @@ iree_status_t iree_hal_xrt_driver_create_internal( } // Get handle to xrt device try { - global_device = xrt::device(0); - driver->device = &global_device; + driver->device = xrt::device(0); } catch (std::exception& e) { return iree_make_status(IREE_STATUS_INTERNAL, "xrt::device(0) failed: %s", e.what()); @@ -131,11 +130,11 @@ static iree_status_t iree_hal_xrt_driver_dump_device_info( iree_hal_driver_t* base_driver, iree_hal_device_id_t device_id, iree_string_builder_t* builder) { iree_hal_xrt_driver_t* driver = iree_hal_xrt_driver_cast(base_driver); - xrt::device* device = driver->device; + xrt::device device = driver->device; IREE_RETURN_IF_ERROR( iree_string_builder_append_cstring(builder, "\n- Platform:")); - std::string platform_info = device->get_info(); + std::string platform_info = device.get_info(); const char* platform_info_str = platform_info.c_str(); if (platform_info_str) { IREE_RETURN_IF_ERROR(iree_string_builder_append_cstring(builder, " ")); @@ -150,7 +149,7 @@ static iree_status_t iree_hal_xrt_driver_dump_device_info( // |out_device_info| must point to valid memory and additional data 
will be // appended to |buffer_ptr| and the new pointer is returned. static iree_status_t iree_hal_xrt_populate_device_info( - xrt::device* device, uint8_t* buffer_ptr, uint8_t** out_buffer_ptr, + xrt::device device, uint8_t* buffer_ptr, uint8_t** out_buffer_ptr, iree_hal_device_info_t* out_device_info) { *out_buffer_ptr = buffer_ptr; @@ -158,15 +157,7 @@ static iree_status_t iree_hal_xrt_populate_device_info( // We currenly only work with one XRT device and its device id is 0. out_device_info->device_id = 0; - // TODO (nirvedhmeshram) : Add device path, initial attempt below to use the - // info api for this gave an error. - /*std::string device_path = - device.get_info().to_string(); - const size_t path_len = strlen(device_path.c_str()); - buffer_ptr += iree_string_view_append_to_buffer( - iree_make_string_view(device_path.c_str(), path_len), - &out_device_info->path, (char*)buffer_ptr);*/ - std::string device_name = device->get_info(); + std::string device_name = device.get_info(); const size_t name_len = strlen(device_name.c_str()); if (name_len >= IREE_HAL_XRT_MAX_DEVICE_NAME_LENGTH) { return iree_make_status(IREE_STATUS_OUT_OF_RANGE, @@ -186,7 +177,7 @@ static iree_status_t iree_hal_xrt_driver_query_available_devices( iree_host_size_t* out_device_info_count, iree_hal_device_info_t** out_device_infos) { iree_hal_xrt_driver_t* driver = iree_hal_xrt_driver_cast(base_driver); - xrt::device* device = driver->device; + xrt::device device = driver->device; // Allocate the return infos and populate with the devices. 
iree_hal_device_info_t* device_infos = nullptr; iree_host_size_t single_info_size = From 400b3aaa66efab9d84f01ae7549e6ade200000d6 Mon Sep 17 00:00:00 2001 From: makslevental Date: Sat, 28 Sep 2024 15:23:52 -0400 Subject: [PATCH 10/19] add args to conftest --- .github/workflows/ci-linux.yml | 5 +++- .github/workflows/ci-windows.yml | 2 +- build_tools/ci/run_matmul_test.sh | 11 ++------ tests/conftest.py | 47 +++++++++++++++++++++++-------- 4 files changed, 44 insertions(+), 21 deletions(-) diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index d097be7db..383de570c 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -204,4 +204,7 @@ jobs: run: | source .venv/bin/activate source /opt/xilinx/xrt/setup.sh - pytest --capture=tee-sys tests + pytest tests \ + --capture=tee-sys \ + --iree-install-dir=$PWD/iree-install \ + --peano-install-dir=$PWD/llvm-aie diff --git a/.github/workflows/ci-windows.yml b/.github/workflows/ci-windows.yml index 5a13344be..cfe20f43e 100644 --- a/.github/workflows/ci-windows.yml +++ b/.github/workflows/ci-windows.yml @@ -191,4 +191,4 @@ jobs: ls $env:XILINX_XRT .\.venv\Scripts\Activate.ps1 mkdir temp - pytest -s --basetemp=$PWD\temp tests + pytest tests -s --basetemp=$PWD\temp --iree-install-dir="$PWD/iree-install" --peano-install-dir="$PWD/llvm-aie" diff --git a/build_tools/ci/run_matmul_test.sh b/build_tools/ci/run_matmul_test.sh index 2c29007f0..b0ec2c550 100755 --- a/build_tools/ci/run_matmul_test.sh +++ b/build_tools/ci/run_matmul_test.sh @@ -634,11 +634,6 @@ i32_shapes_small=( '128x256x128' ) -i32_shapes_medium=( - '1024x1024x1024' - '1536x2048x1536' -) - run_matmul_test_on_shapes ${i32_shapes_small[@]} \ --name_prefix "small_i32" \ --lower_to_aie_pipeline "objectFifo" \ @@ -659,9 +654,9 @@ run_matmul_test_on_shapes ${i32_shapes_small[@]} \ i32_shapes_medium=( '1024x1024x1024' ) -if [ "$OSTYPE" != "msys" ]; then - i32_shapes_medium+=('1536x2048x1536') -fi +#if [ "$OSTYPE" != "msys" 
]; then +# i32_shapes_medium+=('1536x2048x1536') +#fi run_matmul_test_on_shapes ${i32_shapes_medium[@]} \ --name_prefix "medium_i32" \ diff --git a/tests/conftest.py b/tests/conftest.py index ec0dca1ae..cfd151938 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,5 +1,6 @@ import os from contextlib import contextmanager +from pathlib import Path import numpy as np import pytest @@ -34,28 +35,52 @@ tf.__name__ = t +def pytest_addoption(parser): + abs_path = lambda x: Path(x).absolute() + parser.addoption("--iree-install-dir", type=abs_path, required=True) + parser.addoption("--peano-install-dir", type=abs_path, required=True) + parser.addoption("--output-dir", type=abs_path) + parser.addoption("--vitis-dir", type=abs_path) + parser.addoption("--iree-aie-debug", action="store_true") + + @pytest.fixture -def iree_session(request) -> Session: +def iree_session(request, pytestconfig) -> Session: s = Session() s.context.append_dialect_registry(get_dialect_registry()) s.context.load_all_available_dialects() target_backend = getattr(request, "target_backend", "amd-aie") - pipeline = getattr(request, "pipeline", "air") - s.set_flags( + target_device = getattr(request, "target_device", "npu1_4col") + lower_to_aie_pipeline = getattr(request, "lower_to_aie_pipeline", "air") + tile_pipeline = getattr(request, "tile_pipeline", "pad-pack") + use_chess = getattr(request, "use_chess", False) + enable_packet_flow = getattr(request, "enable_packet_flow", False) + # TODO(max): normalize iree-amdaie/iree-amd-aie in pass strings + flags = [ f"--iree-hal-target-backends={target_backend}", - # TODO(max): normalize iree-amdaie/iree-amd-aie in pass strings - f"--iree-amdaie-lower-to-aie-pipeline={pipeline}", - f"--iree-amd-aie-peano-install-dir={os.getenv('PEANO_INSTALL_DIR')}", - f"--iree-amd-aie-install-dir={os.getenv('IREE_INSTALL_DIR')}", - ) + f"--iree-amdaie-target-device={target_device}", + f"--iree-amdaie-lower-to-aie-pipeline={lower_to_aie_pipeline}", + 
f"--iree-amdaie-tile-pipeline={tile_pipeline}", + f"--iree-amd-aie-peano-install-dir={pytestconfig.option.peano_install_dir}", + f"--iree-amd-aie-install-dir={pytestconfig.option.iree_install_dir}", + f"--iree-amd-aie-enable-chess={use_chess}", + f"--iree-amdaie-enable-packet-flow={enable_packet_flow}", + ] + if pytestconfig.option.vitis_dir: + flags += [f"--iree-amd-aie-vitis-install-dir={pytestconfig.option.vitis_dir}"] + if pytestconfig.option.iree_aie_debug: + flags += ["--iree-amd-aie-show-invoked-commands"] + if pytestconfig.option.output_dir: + flags += [ + f"--iree-hal-dump-executable-files-to={pytestconfig.option.output_dir}" + ] + + s.set_flags(*flags) yield s @pytest.fixture def session_module(iree_session, tmp_path) -> ir.Module: - iree_session.set_flags( - f"--iree-hal-dump-executable-files-to={tmp_path}", - ) with ir.Location.unknown(iree_session.context): module_op = ir.Module.create() with ir.InsertionPoint(module_op.body): From 55b096d08765cc5388e87cee0facbba79ae5f51c Mon Sep 17 00:00:00 2001 From: makslevental Date: Sat, 28 Sep 2024 16:12:30 -0400 Subject: [PATCH 11/19] add tests --- .github/workflows/ci-windows.yml | 3 - build_tools/ci/run_matmul_test.sh | 31 --------- build_tools/download_peano.ps1 | 2 +- tests/conftest.py | 13 +++- tests/requirements.txt | 1 + tests/test_matmul.py | 103 ++++++++++++++---------------- 6 files changed, 61 insertions(+), 92 deletions(-) diff --git a/.github/workflows/ci-windows.yml b/.github/workflows/ci-windows.yml index cfe20f43e..e979c37c8 100644 --- a/.github/workflows/ci-windows.yml +++ b/.github/workflows/ci-windows.yml @@ -138,9 +138,6 @@ jobs: name: E2E Test windows runs-on: windows-phoenix needs: build_and_ctest -# defaults: -# run: -# shell: bash env: XILINX_XRT: "C:\\Xilinx\\XRT" steps: diff --git a/build_tools/ci/run_matmul_test.sh b/build_tools/ci/run_matmul_test.sh index b0ec2c550..213bfed4f 100755 --- a/build_tools/ci/run_matmul_test.sh +++ b/build_tools/ci/run_matmul_test.sh @@ -558,37 +558,6 @@ if 
[ -d "$VITIS" ]; then --use_ukernel "1" fi -# Example of a run with a group of 2+ matmuls. Currently this test is passed -# the flag '--num_repeat_runs 0" as there is currently an issue with the runtime if -# multiple matmuls are run in the same test. TODO(newling/nmeshram): Document -# this issue. -run_matmul_test \ - --name_prefix "multiple_matmuls" \ - --lower_to_aie_pipeline "air" \ - --tile_pipeline "pad-pack" \ - --lhs_rhs_type "i32" \ - --acc_type "i32" \ - --m "512,8,16" \ - --n "512,32,16" \ - --k "256,16,8" \ - --num_repeat_runs "0" - -run_matmul_test \ - --name_prefix "transpose_i8_i32" \ - --lower_to_aie_pipeline "air" \ - --tile_pipeline "pad-pack" \ - --lhs_rhs_type "i8" \ - --acc_type "i32" \ - --m "16" --n "32" --k "64" \ - --do_transpose_rhs "1" - -run_matmul_test \ - --name_prefix "packPeel_i32" \ - --tile_pipeline "pack-peel" \ - --lhs_rhs_type "i32" \ - --acc_type "i32" \ - --m "64" --n "64" --k "128" - run_matmul_test \ --name_prefix "packPeel_bf16" \ --tile_pipeline "pack-peel" \ diff --git a/build_tools/download_peano.ps1 b/build_tools/download_peano.ps1 index 16f147e25..98b8cf982 100644 --- a/build_tools/download_peano.ps1 +++ b/build_tools/download_peano.ps1 @@ -8,4 +8,4 @@ $ErrorActionPreference = 'Stop' $RELEASE = "19.0.0.2024082221+90abe71b" pip download llvm_aie==$RELEASE -f https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly -Expand-Archive (Get-ChildItem -Filter llvm*.whl).FullName -DestinationPath $PWD.Path \ No newline at end of file +Expand-Archive (Get-ChildItem -Filter llvm*.whl).FullName -DestinationPath $PWD.Path diff --git a/tests/conftest.py b/tests/conftest.py index cfd151938..508c33eaa 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -10,6 +10,7 @@ from iree.compiler._mlir_libs import get_dialect_registry from iree.compiler.api import Session, Output, Source from iree.compiler.extras import types as T +from ml_dtypes import bfloat16 from iree.runtime import get_driver, Config, SystemContext @@ -35,6 
+36,12 @@ tf.__name__ = t +def ids(datum): + if callable(datum): + return datum.__name__ + return datum + + def pytest_addoption(parser): abs_path = lambda x: Path(x).absolute() parser.addoption("--iree-install-dir", type=abs_path, required=True) @@ -69,7 +76,10 @@ def iree_session(request, pytestconfig) -> Session: if pytestconfig.option.vitis_dir: flags += [f"--iree-amd-aie-vitis-install-dir={pytestconfig.option.vitis_dir}"] if pytestconfig.option.iree_aie_debug: - flags += ["--iree-amd-aie-show-invoked-commands"] + flags += [ + "--iree-amd-aie-show-invoked-commands", + "--aie2xclbin-print-ir-after-all", + ] if pytestconfig.option.output_dir: flags += [ f"--iree-hal-dump-executable-files-to={pytestconfig.option.output_dir}" @@ -119,6 +129,7 @@ def invokable_module(session, module, device="xrt") -> VmModule: # so to support passing lists of ints we map to index type np.longlong: T.index, np.uintp: T.index, + bfloat16: T.bf16, np.float16: T.f16, np.float32: T.f32, np.float64: T.f64, diff --git a/tests/requirements.txt b/tests/requirements.txt index d92fd92a9..4d7a9c934 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -3,3 +3,4 @@ requests>=2.28.0 enum_tools==0.6.4 numpy<2 pytest==8.2.2 +ml_dtypes==0.5.0 diff --git a/tests/test_matmul.py b/tests/test_matmul.py index 8e667416b..529f6d93f 100644 --- a/tests/test_matmul.py +++ b/tests/test_matmul.py @@ -1,4 +1,4 @@ -import random +import itertools import numpy as np import pytest @@ -7,7 +7,7 @@ from iree.compiler.dialects.arith import _is_float_type from iree.compiler.dialects.func import func from iree.compiler.extras import types as T -from .conftest import invokable_module, mlir_type_to_np_dtype +from .conftest import invokable_module, mlir_type_to_np_dtype, ids def test_smol_matmul(session_module): @@ -44,61 +44,48 @@ def matmul(lhs, rhs): return matmul_name -testdata = [ - (32, 16, 32, T.i8, T.i32), - (32, 32, 32, T.i8, T.i32), - (64, 32, 64, T.i8, T.i32), - (64, 64, 64, T.i8, T.i32), - (128, 
64, 128, T.i8, T.i32), - (128, 128, 128, T.i8, T.i32), - (128, 256, 128, T.i8, T.i32), - (32, 16, 32, T.f32, T.f32), - (32, 32, 32, T.f32, T.f32), - (64, 32, 64, T.f32, T.f32), - (64, 64, 64, T.f32, T.f32), - (128, 128, 128, T.f32, T.f32), - (128, 256, 128, T.f32, T.f32), - - (32, 16, 32, T.i8, T.i32), - (32, 32, 32, T.i8, T.i32), - (64, 32, 64, T.i8, T.i32), - (64, 64, 64, T.i8, T.i32), - (128, 64, 128, T.i8, T.i32), - (128, 128, 128, T.i8, T.i32), - (128, 256, 128, T.i8, T.i32), - (32, 16, 32, T.f32, T.f32), - (32, 32, 32, T.f32, T.f32), - (64, 32, 64, T.f32, T.f32), - (64, 64, 64, T.f32, T.f32), - (128, 128, 128, T.f32, T.f32), - (128, 256, 128, T.f32, T.f32), - - (32, 16, 32, T.i8, T.i32), - (32, 32, 32, T.i8, T.i32), - (64, 32, 64, T.i8, T.i32), - (64, 64, 64, T.i8, T.i32), - (128, 64, 128, T.i8, T.i32), - (128, 128, 128, T.i8, T.i32), - (128, 256, 128, T.i8, T.i32), - (32, 16, 32, T.f32, T.f32), - (32, 32, 32, T.f32, T.f32), - (64, 32, 64, T.f32, T.f32), - (64, 64, 64, T.f32, T.f32), - (128, 128, 128, T.f32, T.f32), - (128, 256, 128, T.f32, T.f32), +# "multiple_matmuls" +test_params = list( + sorted( + itertools.product( + [512, 8, 16], + [512, 32, 16], + [256, 16, 8], + [T.i32], + [T.f32], + ["air"], + ["pad-pack"], + [1], + ) + ) +) + +test_params += [ + # transpose_i8_i32 + (16, 32, 64, T.i8, T.i32, "air", "pad-pack", 1), + # packPeel_i32 + (64, 128, 64, T.i32, T.i32, "air", "pack-peel", 1), + # small objectfifo + (32, 32, 32, T.i32, T.i32, "air", "pad-pack", 1000), ] -random.shuffle(testdata) - -def ids(datum): - if callable(datum): - return datum.__name__ - return datum - - -@pytest.mark.parametrize("M, K, N, lhs_rhs_type, acc_type", testdata, ids=ids) -def test_matmul(session_module, M, K, N, lhs_rhs_type, acc_type): +@pytest.mark.parametrize( + "M, K, N, lhs_rhs_type, acc_type, lower_to_aie_pipeline, tile_pipeline, num_repeat_runs", + test_params, + ids=ids, +) +def test_matmul( + session_module, + M, + K, + N, + lhs_rhs_type, + acc_type, + 
lower_to_aie_pipeline, + tile_pipeline, + num_repeat_runs, +): session, module = session_module lhs_rhs_type, acc_type = lhs_rhs_type(), acc_type() @@ -109,5 +96,9 @@ def test_matmul(session_module, M, K, N, lhs_rhs_type, acc_type): arg0 = np.ones((M, K), dtype=lhs_rhs_type) arg1 = np.ones((K, N), dtype=lhs_rhs_type) with invokable_module(session, module) as module: - results = module[matmul_name](arg0, arg1).to_host() - assert np.array_equal(results, (arg0.astype(acc_type) @ arg1.astype(acc_type))) + for i in range(num_repeat_runs): + print(f"run {i}") + results = module[matmul_name](arg0, arg1).to_host() + assert np.array_equal( + results, (arg0.astype(acc_type) @ arg1.astype(acc_type)) + ) From 5829b9e80c70a84b4b610cafc1b031116b1887ec Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Sat, 28 Sep 2024 17:52:21 -0400 Subject: [PATCH 12/19] Update test_matmul.py --- .github/workflows/ci-linux.yml | 2 +- .github/workflows/ci-windows.yml | 5 ++- build_tools/ci/run_matmul_test.sh | 31 +++++++++++++++ tests/conftest.py | 16 ++++---- tests/test_matmul.py | 64 +++++++++++++++++++++++++++---- 5 files changed, 102 insertions(+), 16 deletions(-) diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index 383de570c..5a4413690 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -204,7 +204,7 @@ jobs: run: | source .venv/bin/activate source /opt/xilinx/xrt/setup.sh - pytest tests \ + pytest -v tests \ --capture=tee-sys \ --iree-install-dir=$PWD/iree-install \ --peano-install-dir=$PWD/llvm-aie diff --git a/.github/workflows/ci-windows.yml b/.github/workflows/ci-windows.yml index e979c37c8..2fcc702e7 100644 --- a/.github/workflows/ci-windows.yml +++ b/.github/workflows/ci-windows.yml @@ -188,4 +188,7 @@ jobs: ls $env:XILINX_XRT .\.venv\Scripts\Activate.ps1 mkdir temp - pytest tests -s --basetemp=$PWD\temp --iree-install-dir="$PWD/iree-install" --peano-install-dir="$PWD/llvm-aie" + pytest tests -sv ` + --basetemp=$PWD\temp 
` + --iree-install-dir="$PWD/iree-install" ` + --peano-install-dir="$PWD/llvm-aie" diff --git a/build_tools/ci/run_matmul_test.sh b/build_tools/ci/run_matmul_test.sh index 213bfed4f..b0ec2c550 100755 --- a/build_tools/ci/run_matmul_test.sh +++ b/build_tools/ci/run_matmul_test.sh @@ -558,6 +558,37 @@ if [ -d "$VITIS" ]; then --use_ukernel "1" fi +# Example of a run with a group of 2+ matmuls. Currently this test is passed +# the flag '--num_repeat_runs 0" as there is currently an issue with the runtime if +# multiple matmuls are run in the same test. TODO(newling/nmeshram): Document +# this issue. +run_matmul_test \ + --name_prefix "multiple_matmuls" \ + --lower_to_aie_pipeline "air" \ + --tile_pipeline "pad-pack" \ + --lhs_rhs_type "i32" \ + --acc_type "i32" \ + --m "512,8,16" \ + --n "512,32,16" \ + --k "256,16,8" \ + --num_repeat_runs "0" + +run_matmul_test \ + --name_prefix "transpose_i8_i32" \ + --lower_to_aie_pipeline "air" \ + --tile_pipeline "pad-pack" \ + --lhs_rhs_type "i8" \ + --acc_type "i32" \ + --m "16" --n "32" --k "64" \ + --do_transpose_rhs "1" + +run_matmul_test \ + --name_prefix "packPeel_i32" \ + --tile_pipeline "pack-peel" \ + --lhs_rhs_type "i32" \ + --acc_type "i32" \ + --m "64" --n "64" --k "128" + run_matmul_test \ --name_prefix "packPeel_bf16" \ --tile_pipeline "pack-peel" \ diff --git a/tests/conftest.py b/tests/conftest.py index 508c33eaa..e1b3f17b4 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -45,7 +45,7 @@ def ids(datum): def pytest_addoption(parser): abs_path = lambda x: Path(x).absolute() parser.addoption("--iree-install-dir", type=abs_path, required=True) - parser.addoption("--peano-install-dir", type=abs_path, required=True) + parser.addoption("--peano-install-dir", type=abs_path) parser.addoption("--output-dir", type=abs_path) parser.addoption("--vitis-dir", type=abs_path) parser.addoption("--iree-aie-debug", action="store_true") @@ -56,12 +56,14 @@ def iree_session(request, pytestconfig) -> Session: s = Session() 
s.context.append_dialect_registry(get_dialect_registry()) s.context.load_all_available_dialects() - target_backend = getattr(request, "target_backend", "amd-aie") - target_device = getattr(request, "target_device", "npu1_4col") - lower_to_aie_pipeline = getattr(request, "lower_to_aie_pipeline", "air") - tile_pipeline = getattr(request, "tile_pipeline", "pad-pack") - use_chess = getattr(request, "use_chess", False) - enable_packet_flow = getattr(request, "enable_packet_flow", False) + target_backend = request.node.callspec.params.get("target_backend", "amd-aie") + target_device = request.node.callspec.params.get("target_device", "npu1_4col") + lower_to_aie_pipeline = request.node.callspec.params.get( + "lower_to_aie_pipeline", "air" + ) + tile_pipeline = request.node.callspec.params.get("tile_pipeline", "pad-pack") + use_chess = request.node.callspec.params.get("use_chess", False) + enable_packet_flow = request.node.callspec.params.get("enable_packet_flow", False) # TODO(max): normalize iree-amdaie/iree-amd-aie in pass strings flags = [ f"--iree-hal-target-backends={target_backend}", diff --git a/tests/test_matmul.py b/tests/test_matmul.py index 529f6d93f..9cd2f0d7c 100644 --- a/tests/test_matmul.py +++ b/tests/test_matmul.py @@ -10,7 +10,8 @@ from .conftest import invokable_module, mlir_type_to_np_dtype, ids -def test_smol_matmul(session_module): +@pytest.mark.parametrize("target_backend", ["amd-aie"]) +def test_smol_matmul(session_module, target_backend): session, module = session_module @func(T.tensor(32, 16, T.i8()), T.tensor(16, 32, T.i8())) @@ -45,14 +46,14 @@ def matmul(lhs, rhs): # "multiple_matmuls" -test_params = list( +simple_matmul_test_params = list( sorted( itertools.product( [512, 8, 16], [512, 32, 16], [256, 16, 8], [T.i32], - [T.f32], + [T.i32], ["air"], ["pad-pack"], [1], @@ -60,19 +61,69 @@ def matmul(lhs, rhs): ) ) -test_params += [ +xfails = [ + (8, 512, 8, T.i32, T.i32, "air", "pad-pack", 1), + (8, 512, 16, T.i32, T.i32, "air", "pad-pack", 1), 
+ (16, 512, 8, T.i32, T.i32, "air", "pad-pack", 1), + (16, 512, 16, T.i32, T.i32, "air", "pad-pack", 1), +] + +for x in xfails: + simple_matmul_test_params.remove(x) + +simple_matmul_test_params += [ # transpose_i8_i32 (16, 32, 64, T.i8, T.i32, "air", "pad-pack", 1), # packPeel_i32 (64, 128, 64, T.i32, T.i32, "air", "pack-peel", 1), # small objectfifo - (32, 32, 32, T.i32, T.i32, "air", "pad-pack", 1000), + # segfault + # (32, 32, 32, T.i32, T.i32, "air", "pad-pack", 1000), +] + [ + # from multiple_matmuls + pytest.param(*x, marks=pytest.mark.xfail(reason="compile failure")) + for x in xfails +] + +# small_i32 +small_i32_shapes = [ + (32, 32, 32), + (64, 32, 128), + (128, 32, 64), + (128, 32, 64), + (128, 32, 128), + (256, 32, 256), + (32, 64, 32), + (64, 64, 64), + (128, 256, 128), +] + + +small_i8_shapes_small = [ + (64, 64, 64), + (128, 256, 128), +] + +small_i8_shapes_medium = [ + (512, 512, 512), + (1024, 1024, 1024), + # (1536, 2048, 1536), + # (4096, 2048, 4096), +] + +simple_matmul_test_params += [ + (*s, T.i32, T.i32, "objectFifo", "pack-peel", 1) for s in small_i32_shapes +] + +simple_matmul_test_params += [ + (*s, T.i8, T.i32, "objectFifo", "pack-peel", 1) + for s in small_i8_shapes_small + small_i8_shapes_medium ] @pytest.mark.parametrize( "M, K, N, lhs_rhs_type, acc_type, lower_to_aie_pipeline, tile_pipeline, num_repeat_runs", - test_params, + simple_matmul_test_params, ids=ids, ) def test_matmul( @@ -97,7 +148,6 @@ def test_matmul( arg1 = np.ones((K, N), dtype=lhs_rhs_type) with invokable_module(session, module) as module: for i in range(num_repeat_runs): - print(f"run {i}") results = module[matmul_name](arg0, arg1).to_host() assert np.array_equal( results, (arg0.astype(acc_type) @ arg1.astype(acc_type)) From edd181553fa0c069df27c45141cc0b4cff0a4b85 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Sun, 29 Sep 2024 18:21:04 -0400 Subject: [PATCH 13/19] run 100 times --- .github/workflows/ci-linux.yml | 43 +++++++++++++++++++------------- 
.github/workflows/ci-windows.yml | 33 +++++++++++++++--------- 2 files changed, 47 insertions(+), 29 deletions(-) diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index 5a4413690..8248d2e90 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -155,13 +155,16 @@ jobs: run: | source .venv/bin/activate source /opt/xilinx/xrt/setup.sh - python3 build_tools/ci/cpu_comparison/run.py \ - test_aie_vs_cpu \ - $PWD/iree-install \ - $PWD/llvm-aie \ - --xrt-dir /opt/xilinx/xrt \ - --vitis-dir /opt/Xilinx/Vitis/2024.2 \ - --reset-npu-between-runs -v + for i in {1..100}; do + echo "run $i" + python build_tools/ci/cpu_comparison/run.py \ + test_aie_vs_cpu \ + $PWD/iree-install \ + $PWD/llvm-aie \ + --xrt-dir /opt/xilinx/xrt \ + --vitis-dir /opt/Xilinx/Vitis/2024.2 \ + --reset-npu-between-runs -v + done - name: E2E correctness matmul test run: | @@ -193,18 +196,24 @@ jobs: sudo prlimit -lunlimited --pid $$ source .venv/bin/activate source /opt/xilinx/xrt/setup.sh - bash build_tools/ci/run_matmul_test.sh \ - test_matmuls \ - iree-install \ - $PWD/llvm-aie \ - /opt/xilinx/xrt \ - /opt/Xilinx/Vitis/2024.2 + for i in {1..100}; do + echo "run $i" + bash build_tools/ci/run_matmul_test.sh \ + test_matmuls \ + iree-install \ + $PWD/llvm-aie \ + /opt/xilinx/xrt \ + /opt/Xilinx/Vitis/2024.2 + done - name: Python tests run: | source .venv/bin/activate source /opt/xilinx/xrt/setup.sh - pytest -v tests \ - --capture=tee-sys \ - --iree-install-dir=$PWD/iree-install \ - --peano-install-dir=$PWD/llvm-aie + for i in {1..100}; do + echo "run $i" + pytest -v tests \ + --capture=tee-sys \ + --iree-install-dir=$PWD/iree-install \ + --peano-install-dir=$PWD/llvm-aie + done diff --git a/.github/workflows/ci-windows.yml b/.github/workflows/ci-windows.yml index 2fcc702e7..34908dab8 100644 --- a/.github/workflows/ci-windows.yml +++ b/.github/workflows/ci-windows.yml @@ -169,26 +169,35 @@ jobs: shell: bash run: | source .venv/Scripts/activate - bash 
build_tools/ci/run_matmul_test.sh \ - /c/test_matmuls \ - $PWD/iree-install \ - $PWD/llvm-aie + for i in {1..100}; do + echo "run $i" + bash build_tools/ci/run_matmul_test.sh \ + /c/test_matmuls \ + $PWD/iree-install \ + $PWD/llvm-aie + done - name : E2E comparison of AIE to llvm-cpu shell: bash run: | source .venv/Scripts/activate - python build_tools/ci/cpu_comparison/run.py \ - /c/test_aie_vs_cpu \ - $PWD/iree-install \ - $PWD/llvm-aie -v + for i in {1..100}; do + echo "run $i" + python build_tools/ci/cpu_comparison/run.py \ + /c/test_aie_vs_cpu \ + $PWD/iree-install \ + $PWD/llvm-aie -v + done - name: Python tests run: | ls $env:XILINX_XRT .\.venv\Scripts\Activate.ps1 mkdir temp - pytest tests -sv ` - --basetemp=$PWD\temp ` - --iree-install-dir="$PWD/iree-install" ` - --peano-install-dir="$PWD/llvm-aie" + for ($i = 1; $i -le 100; $i++) { + echo "run $i" + pytest tests -sv ` + --basetemp=$PWD\temp ` + --iree-install-dir="$PWD/iree-install" ` + --peano-install-dir="$PWD/llvm-aie" + } From 6737a6fc2c82c54cbaa8ebc471c1908b169cd0c0 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Sun, 29 Sep 2024 20:37:39 -0400 Subject: [PATCH 14/19] run 100 times --- .github/workflows/ci-linux.yml | 6 +++--- .github/workflows/ci-windows.yml | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index 8248d2e90..1ac6b7940 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -163,7 +163,7 @@ jobs: $PWD/llvm-aie \ --xrt-dir /opt/xilinx/xrt \ --vitis-dir /opt/Xilinx/Vitis/2024.2 \ - --reset-npu-between-runs -v + --reset-npu-between-runs -v > /dev/null done - name: E2E correctness matmul test @@ -203,7 +203,7 @@ jobs: iree-install \ $PWD/llvm-aie \ /opt/xilinx/xrt \ - /opt/Xilinx/Vitis/2024.2 + /opt/Xilinx/Vitis/2024.2 > /dev/null done - name: Python tests @@ -215,5 +215,5 @@ jobs: pytest -v tests \ --capture=tee-sys \ --iree-install-dir=$PWD/iree-install \ - 
--peano-install-dir=$PWD/llvm-aie + --peano-install-dir=$PWD/llvm-aie > /dev/null done diff --git a/.github/workflows/ci-windows.yml b/.github/workflows/ci-windows.yml index 34908dab8..b041351b7 100644 --- a/.github/workflows/ci-windows.yml +++ b/.github/workflows/ci-windows.yml @@ -174,7 +174,7 @@ jobs: bash build_tools/ci/run_matmul_test.sh \ /c/test_matmuls \ $PWD/iree-install \ - $PWD/llvm-aie + $PWD/llvm-aie > /dev/null done - name : E2E comparison of AIE to llvm-cpu @@ -186,7 +186,7 @@ jobs: python build_tools/ci/cpu_comparison/run.py \ /c/test_aie_vs_cpu \ $PWD/iree-install \ - $PWD/llvm-aie -v + $PWD/llvm-aie -v > /dev/null done - name: Python tests From c32a3163b62162d1b9b643528bba46debca566f6 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Sun, 29 Sep 2024 20:53:02 -0400 Subject: [PATCH 15/19] run 100 times --- build_tools/ci/cpu_comparison/run.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/build_tools/ci/cpu_comparison/run.py b/build_tools/ci/cpu_comparison/run.py index de7756798..3b07d4512 100755 --- a/build_tools/ci/cpu_comparison/run.py +++ b/build_tools/ci/cpu_comparison/run.py @@ -2,6 +2,7 @@ # Copyright 2024 The IREE Authors +import sys import argparse import os import platform @@ -97,15 +98,17 @@ def shell_out(cmd: list, workdir=None, verbose: int = 0, raise_on_error=True, en print("Standard output from script:") print(stdout_decode) if stderr_decode: - print("Standard error from script:") - print(stderr_decode) + print("Standard error from script:", file=sys.stderr) + print(stderr_decode, file=sys.stderr) if not raise_on_error and handle.returncode != 0: print( - f"Error executing script, error code was {handle.returncode}. Not raising an error." + f"Error executing script, error code was {handle.returncode}. 
Not raising an error.", + file=sys.stderr ) if raise_on_error and handle.returncode != 0: raise RuntimeError( - f"Error executing script, error code was {handle.returncode}" + # Exceptions take no file= keyword; an uncaught traceback already goes to stderr. + f"Error executing script, error code was {handle.returncode}" ) return stdout_decode, stderr_decode From 499de7edd1f80616102d1e10c9bb594211605a28 Mon Sep 17 00:00:00 2001 From: makslevental Date: Mon, 30 Sep 2024 21:21:22 -0400 Subject: [PATCH 16/19] use xrtDeviceHandle --- .github/workflows/ci-linux.yml | 7 ++- .github/workflows/ci-windows.yml | 6 +-- .../driver/xrt/direct_allocator.cc | 9 ++-- .../driver/xrt/direct_allocator.h | 2 +- .../driver/xrt/direct_command_buffer.cc | 1 - .../driver/xrt/native_executable.cc | 16 ++++--- .../driver/xrt/native_executable.h | 6 +-- .../driver/xrt/nop_executable_cache.cc | 8 ++-- .../driver/xrt/nop_executable_cache.h | 2 +- .../src/iree-amd-aie/driver/xrt/xrt_device.cc | 36 +++++++++++----- .../src/iree-amd-aie/driver/xrt/xrt_device.h | 3 +- .../src/iree-amd-aie/driver/xrt/xrt_driver.cc | 43 +++++-------------- tests/conftest.py | 16 ++++--- tests/test_matmul.py | 9 ++-- 14 files changed, 82 insertions(+), 82 deletions(-) diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index 1ac6b7940..f9f69c11b 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -150,12 +150,11 @@ jobs: source .venv/bin/activate pip install -r tests/requirements.txt - - name : E2E comparison of AIE to llvm-cpu run: | source .venv/bin/activate source /opt/xilinx/xrt/setup.sh - for i in {1..100}; do + for i in {1..50}; do echo "run $i" python build_tools/ci/cpu_comparison/run.py \ test_aie_vs_cpu \ @@ -196,7 +195,7 @@ jobs: sudo prlimit -lunlimited --pid $$ source .venv/bin/activate source /opt/xilinx/xrt/setup.sh - for i in {1..100}; do + for i in {1..50}; do echo "run $i" bash build_tools/ci/run_matmul_test.sh \ test_matmuls \ @@ -210,7 +209,7 @@ jobs: run: | source .venv/bin/activate source
/opt/xilinx/xrt/setup.sh - for i in {1..100}; do + for i in {1..50}; do echo "run $i" pytest -v tests \ --capture=tee-sys \ diff --git a/.github/workflows/ci-windows.yml b/.github/workflows/ci-windows.yml index b041351b7..ef31c478a 100644 --- a/.github/workflows/ci-windows.yml +++ b/.github/workflows/ci-windows.yml @@ -169,7 +169,7 @@ jobs: shell: bash run: | source .venv/Scripts/activate - for i in {1..100}; do + for i in {1..50}; do echo "run $i" bash build_tools/ci/run_matmul_test.sh \ /c/test_matmuls \ @@ -181,7 +181,7 @@ jobs: shell: bash run: | source .venv/Scripts/activate - for i in {1..100}; do + for i in {1..50}; do echo "run $i" python build_tools/ci/cpu_comparison/run.py \ /c/test_aie_vs_cpu \ @@ -194,7 +194,7 @@ jobs: ls $env:XILINX_XRT .\.venv\Scripts\Activate.ps1 mkdir temp - for ($i = 1; $i -le 100; $i++) { + for ($i = 1; $i -le 50; $i++) { echo "run $i" pytest tests -sv ` --basetemp=$PWD\temp ` diff --git a/runtime/src/iree-amd-aie/driver/xrt/direct_allocator.cc b/runtime/src/iree-amd-aie/driver/xrt/direct_allocator.cc index 641f08c9c..be2370d2f 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/direct_allocator.cc +++ b/runtime/src/iree-amd-aie/driver/xrt/direct_allocator.cc @@ -28,7 +28,7 @@ typedef struct iree_hal_xrt_allocator_t { // The device that this allocator is attached to. 
iree_hal_device_t* base_device; - xrt::device device; + xrtDeviceHandle device_hdl; iree_allocator_t host_allocator; @@ -46,7 +46,7 @@ static iree_hal_xrt_allocator_t* iree_hal_xrt_allocator_cast( } iree_status_t iree_hal_xrt_allocator_create( - iree_hal_device_t* base_device, xrt::device device, + iree_hal_device_t* base_device, xrtDeviceHandle device_hdl, iree_allocator_t host_allocator, iree_hal_allocator_t** out_allocator) { IREE_ASSERT_ARGUMENT(base_device); IREE_ASSERT_ARGUMENT(out_allocator); @@ -61,7 +61,7 @@ iree_status_t iree_hal_xrt_allocator_create( &allocator->resource); allocator->base_device = base_device; iree_hal_device_retain(base_device); - allocator->device = device; + allocator->device_hdl = device_hdl; allocator->host_allocator = host_allocator; *out_allocator = (iree_hal_allocator_t*)allocator; @@ -171,7 +171,8 @@ static iree_status_t iree_hal_xrt_allocator_allocate_buffer( std::unique_ptr xrt_buffer; try { - xrt_buffer = std::make_unique(allocator->device, allocation_size, + xrt::device device(xrtDeviceToXclDevice(allocator->device_hdl)); + xrt_buffer = std::make_unique(device, allocation_size, XRT_BO_FLAGS_HOST_ONLY, group_id); } catch (...) { IREE_TRACE_ZONE_END(z0); diff --git a/runtime/src/iree-amd-aie/driver/xrt/direct_allocator.h b/runtime/src/iree-amd-aie/driver/xrt/direct_allocator.h index 39a0f3e10..104bb2e2b 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/direct_allocator.h +++ b/runtime/src/iree-amd-aie/driver/xrt/direct_allocator.h @@ -17,7 +17,7 @@ extern "C" { // Creates an XRT memory allocator. 
iree_status_t iree_hal_xrt_allocator_create( - iree_hal_device_t* base_device, xrt::device device, + iree_hal_device_t* base_device, xrtDeviceHandle device_hdl, iree_allocator_t host_allocator, iree_hal_allocator_t** out_allocator); #ifdef __cplusplus diff --git a/runtime/src/iree-amd-aie/driver/xrt/direct_command_buffer.cc b/runtime/src/iree-amd-aie/driver/xrt/direct_command_buffer.cc index 5785f3484..770527e93 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/direct_command_buffer.cc +++ b/runtime/src/iree-amd-aie/driver/xrt/direct_command_buffer.cc @@ -292,7 +292,6 @@ static iree_status_t iree_hal_xrt_direct_command_buffer_push_descriptor_set( IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_hal_resource_set_insert(command_buffer->resource_set, 1, &binding->buffer)); - std::unique_ptr sub_buffer; current_bindings[i] = iree_hal_xrt_buffer_handle( iree_hal_buffer_allocated_buffer(binding->buffer)); current_offsets[i] = diff --git a/runtime/src/iree-amd-aie/driver/xrt/native_executable.cc b/runtime/src/iree-amd-aie/driver/xrt/native_executable.cc index c3f8c7aa4..6d37d9e53 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/native_executable.cc +++ b/runtime/src/iree-amd-aie/driver/xrt/native_executable.cc @@ -99,7 +99,8 @@ static iree_status_t iree_amd_aie_hal_xrt_native_executable_flatbuffer_verify( } iree_status_t iree_hal_xrt_native_executable_create( - xrt::device device, const iree_hal_executable_params_t* executable_params, + xrtDeviceHandle device_hdl, + const iree_hal_executable_params_t* executable_params, iree_allocator_t host_allocator, iree_hal_executable_t** out_executable) { IREE_ASSERT_ARGUMENT(executable_params); IREE_ASSERT_ARGUMENT(out_executable); @@ -180,24 +181,27 @@ iree_status_t iree_hal_xrt_native_executable_create( // XRT API needs this vector and cant actually read a void*. 
std::vector xclbinVector( xclbin_fb, xclbin_fb + flatbuffers_string_len(xclbin_fb)); + xrt::xclbin xclbin; try { - params->xclbin = xrt::xclbin(xclbinVector); + xclbin = xrt::xclbin(xclbinVector); } catch (std::exception& e) { return iree_make_status(IREE_STATUS_INTERNAL, "XCLBIN load error: %s", e.what()); } + xrt::device device(xrtDeviceToXclDevice(device_hdl)); + IREE_ASSERT(device, "failed to find device"); + try { - device.register_xclbin(params->xclbin); + device.register_xclbin(xclbin); } catch (std::exception& e) { return iree_make_status(IREE_STATUS_INTERNAL, "XCLBIN register error: %s", e.what()); } try { - params->context = - xrt::hw_context(device, params->xclbin.get_uuid(), - xrt::hw_context::access_mode::exclusive); + params->context = xrt::hw_context( + device, xclbin.get_uuid(), xrt::hw_context::access_mode::exclusive); } catch (std::exception& e) { return iree_make_status(IREE_STATUS_INTERNAL, "xrt::hw_context context: %s", e.what()); diff --git a/runtime/src/iree-amd-aie/driver/xrt/native_executable.h b/runtime/src/iree-amd-aie/driver/xrt/native_executable.h index 141bbebca..bc01b9d23 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/native_executable.h +++ b/runtime/src/iree-amd-aie/driver/xrt/native_executable.h @@ -7,7 +7,7 @@ #ifndef IREE_AMD_AIE_DRIVER_XRT_NATIVE_EXECUTABLE_H_ #define IREE_AMD_AIE_DRIVER_XRT_NATIVE_EXECUTABLE_H_ -#include +#include #include "iree/base/api.h" #include "iree/base/tracing.h" @@ -22,7 +22,6 @@ extern "C" { // Object and launch parameters for a compute kernel. typedef struct iree_hal_xrt_kernel_params_t { xrt::hw_context context; - xrt::xclbin xclbin; // The kernel code object. xrt::kernel kernel; // Instruction buffer argument to the kernel. @@ -37,7 +36,8 @@ typedef struct iree_hal_xrt_kernel_params_t { // |out_executable| must be released by the caller (see // iree_hal_executable_release). 
iree_status_t iree_hal_xrt_native_executable_create( - xrt::device device, const iree_hal_executable_params_t* executable_params, + xrtDeviceHandle device_hdl, + const iree_hal_executable_params_t* executable_params, iree_allocator_t host_allocator, iree_hal_executable_t** out_executable); // Returns the kernel launch parameters for the given |entry_point|. diff --git a/runtime/src/iree-amd-aie/driver/xrt/nop_executable_cache.cc b/runtime/src/iree-amd-aie/driver/xrt/nop_executable_cache.cc index 655133e61..3120e5a49 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/nop_executable_cache.cc +++ b/runtime/src/iree-amd-aie/driver/xrt/nop_executable_cache.cc @@ -17,7 +17,7 @@ typedef struct iree_hal_xrt_nop_executable_cache_t { // at offset 0. iree_hal_resource_t resource; - xrt::device device; + xrtDeviceHandle device_hdl; iree_allocator_t host_allocator; } iree_hal_xrt_nop_executable_cache_t; @@ -35,7 +35,7 @@ iree_hal_xrt_nop_executable_cache_cast( } iree_status_t iree_hal_xrt_nop_executable_cache_create( - xrt::device device, iree_string_view_t identifier, + xrtDeviceHandle device_hdl, iree_string_view_t identifier, iree_allocator_t host_allocator, iree_hal_executable_cache_t** out_executable_cache) { IREE_ASSERT_ARGUMENT(out_executable_cache); @@ -49,7 +49,7 @@ iree_status_t iree_hal_xrt_nop_executable_cache_create( iree_hal_resource_initialize(&iree_hal_xrt_nop_executable_cache_vtable, &executable_cache->resource); executable_cache->host_allocator = host_allocator; - executable_cache->device = device; + executable_cache->device_hdl = device_hdl; *out_executable_cache = (iree_hal_executable_cache_t*)executable_cache; IREE_TRACE_ZONE_END(z0); @@ -82,7 +82,7 @@ static iree_status_t iree_hal_xrt_nop_executable_cache_prepare_executable( iree_hal_xrt_nop_executable_cache_t* executable_cache = iree_hal_xrt_nop_executable_cache_cast(base_executable_cache); return iree_hal_xrt_native_executable_create( - executable_cache->device, executable_params, + 
executable_cache->device_hdl, executable_params, executable_cache->host_allocator, out_executable); } diff --git a/runtime/src/iree-amd-aie/driver/xrt/nop_executable_cache.h b/runtime/src/iree-amd-aie/driver/xrt/nop_executable_cache.h index 9a84f9e6d..5362f98af 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/nop_executable_cache.h +++ b/runtime/src/iree-amd-aie/driver/xrt/nop_executable_cache.h @@ -22,7 +22,7 @@ extern "C" { // |out_executable_cache| must be released by the caller (see // iree_hal_executable_cache_release). iree_status_t iree_hal_xrt_nop_executable_cache_create( - xrt::device device, iree_string_view_t identifier, + xrtDeviceHandle device_hdl, iree_string_view_t identifier, iree_allocator_t host_allocator, iree_hal_executable_cache_t** out_executable_cache); diff --git a/runtime/src/iree-amd-aie/driver/xrt/xrt_device.cc b/runtime/src/iree-amd-aie/driver/xrt/xrt_device.cc index 7b9a36f78..03aa86c9f 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/xrt_device.cc +++ b/runtime/src/iree-amd-aie/driver/xrt/xrt_device.cc @@ -6,6 +6,7 @@ #include "iree-amd-aie/driver/xrt/xrt_device.h" +#include "experimental/xrt_system.h" #include "iree-amd-aie/driver/xrt/direct_allocator.h" #include "iree-amd-aie/driver/xrt/direct_command_buffer.h" #include "iree-amd-aie/driver/xrt/nop_executable_cache.h" @@ -32,7 +33,7 @@ typedef struct iree_hal_xrt_device_t { iree_allocator_t host_allocator; iree_hal_allocator_t* device_allocator; - xrt::device device; + xrtDeviceHandle device_hdl; } iree_hal_xrt_device_t; namespace { @@ -52,17 +53,30 @@ void iree_hal_xrt_device_params_initialize( } static iree_status_t iree_hal_xrt_device_create_internal( - iree_string_view_t identifier, xrt::device xrt_device, - const iree_hal_xrt_device_params_t* params, iree_allocator_t host_allocator, - iree_hal_device_t** out_device) { + iree_string_view_t identifier, const iree_hal_xrt_device_params_t* params, + iree_allocator_t host_allocator, iree_hal_device_t** out_device) { 
iree_hal_xrt_device_t* device = nullptr; iree_host_size_t total_size = iree_sizeof_struct(*device) + identifier.size; IREE_RETURN_IF_ERROR( iree_allocator_malloc(host_allocator, total_size, (void**)&device)); + try { + if (IREE_UNLIKELY(xrt::system::enumerate_devices() == 0)) { + return iree_make_status(IREE_STATUS_FAILED_PRECONDITION, + "No XRT devices found"); + } + } catch (std::exception& e) { + return iree_make_status(IREE_STATUS_INTERNAL, + "xrt::system::enumerate_devices failed: %s", + e.what()); + } + + xrtDeviceHandle device_hdl = xrtDeviceOpen(0); + IREE_ASSERT(device_hdl, "failed to open xrt device"); + iree_status_t status = - iree_hal_xrt_allocator_create((iree_hal_device_t*)device, xrt_device, + iree_hal_xrt_allocator_create((iree_hal_device_t*)device, device_hdl, host_allocator, &device->device_allocator); if (iree_status_is_ok(status)) { iree_hal_resource_initialize(&iree_hal_xrt_device_vtable, @@ -74,7 +88,7 @@ static iree_status_t iree_hal_xrt_device_create_internal( &device->block_pool); device->host_allocator = host_allocator; - device->device = xrt_device; + device->device_hdl = device_hdl; device->params = *params; *out_device = (iree_hal_device_t*)device; } else { @@ -85,13 +99,12 @@ static iree_status_t iree_hal_xrt_device_create_internal( iree_status_t iree_hal_xrt_device_create( iree_string_view_t identifier, const iree_hal_xrt_device_params_t* params, - xrt::device device, iree_allocator_t host_allocator, - iree_hal_device_t** out_device) { + iree_allocator_t host_allocator, iree_hal_device_t** out_device) { IREE_ASSERT_ARGUMENT(out_device); IREE_TRACE_ZONE_BEGIN(z0); iree_status_t status = iree_hal_xrt_device_create_internal( - identifier, device, params, host_allocator, out_device); + identifier, params, host_allocator, out_device); IREE_TRACE_ZONE_END(z0); return status; @@ -104,7 +117,9 @@ static void iree_hal_xrt_device_destroy(iree_hal_device_t* base_device) { iree_hal_allocator_release(device->device_allocator); 
iree_arena_block_pool_deinitialize(&device->block_pool); + xrtDeviceHandle device_hdl = device->device_hdl; iree_allocator_free(host_allocator, device); + (void)xrtDeviceClose(device_hdl); IREE_TRACE_ZONE_END(z0); } @@ -201,7 +216,8 @@ static iree_status_t iree_hal_xrt_device_create_executable_cache( iree_loop_t loop, iree_hal_executable_cache_t** out_executable_cache) { iree_hal_xrt_device_t* device = iree_hal_xrt_device_cast(base_device); return iree_hal_xrt_nop_executable_cache_create( - device->device, identifier, device->host_allocator, out_executable_cache); + device->device_hdl, identifier, device->host_allocator, + out_executable_cache); } static iree_status_t iree_hal_xrt_device_import_file( diff --git a/runtime/src/iree-amd-aie/driver/xrt/xrt_device.h b/runtime/src/iree-amd-aie/driver/xrt/xrt_device.h index f23610dbd..aa77fdeb7 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/xrt_device.h +++ b/runtime/src/iree-amd-aie/driver/xrt/xrt_device.h @@ -21,8 +21,7 @@ extern "C" { // |out_device| must be released by the caller (see iree_hal_device_release). 
iree_status_t iree_hal_xrt_device_create( iree_string_view_t identifier, const iree_hal_xrt_device_params_t* params, - xrt::device device, iree_allocator_t host_allocator, - iree_hal_device_t** out_device); + iree_allocator_t host_allocator, iree_hal_device_t** out_device); #ifdef __cplusplus } // extern "C" diff --git a/runtime/src/iree-amd-aie/driver/xrt/xrt_driver.cc b/runtime/src/iree-amd-aie/driver/xrt/xrt_driver.cc index 2f8a1134f..ca868860a 100644 --- a/runtime/src/iree-amd-aie/driver/xrt/xrt_driver.cc +++ b/runtime/src/iree-amd-aie/driver/xrt/xrt_driver.cc @@ -6,13 +6,10 @@ #include "iree-amd-aie/driver/xrt/xrt_device.h" #include "iree/base/api.h" -#include "iree/base/target_platform.h" #include "iree/base/tracing.h" #include "iree/hal/api.h" // XRT includes -#include "experimental/xrt_system.h" -#include "xrt.h" #include "xrt/xrt_device.h" #include "xrt/xrt_kernel.h" @@ -40,7 +37,7 @@ typedef struct iree_hal_xrt_driver_t { // Parameters used to control device behavior. iree_hal_xrt_device_params_t device_params; - xrt::device device; + xrtDeviceHandle device_hdl; } iree_hal_xrt_driver_t; @@ -79,23 +76,6 @@ iree_status_t iree_hal_xrt_driver_create_internal( (char*)driver + iree_sizeof_struct(*driver)); driver->device_params = *device_params; - try { - if (IREE_UNLIKELY(xrt::system::enumerate_devices() == 0)) { - return iree_make_status(IREE_STATUS_FAILED_PRECONDITION, - "No XRT devices found"); - } - } catch (std::exception& e) { - return iree_make_status(IREE_STATUS_INTERNAL, - "xrt::system::enumerate_devices failed: %s", - e.what()); - } - // Get handle to xrt device - try { - driver->device = xrt::device(0); - } catch (std::exception& e) { - return iree_make_status(IREE_STATUS_INTERNAL, "xrt::device(0) failed: %s", - e.what()); - } *out_driver = reinterpret_cast(driver); return iree_ok_status(); } @@ -130,10 +110,10 @@ static iree_status_t iree_hal_xrt_driver_dump_device_info( iree_hal_driver_t* base_driver, iree_hal_device_id_t device_id, 
iree_string_builder_t* builder) { iree_hal_xrt_driver_t* driver = iree_hal_xrt_driver_cast(base_driver); - xrt::device device = driver->device; + xrtDeviceHandle device_hdl = driver->device_hdl; IREE_RETURN_IF_ERROR( iree_string_builder_append_cstring(builder, "\n- Platform:")); - + xrt::device device(xrtDeviceToXclDevice(device_hdl)); std::string platform_info = device.get_info(); const char* platform_info_str = platform_info.c_str(); if (platform_info_str) { @@ -149,7 +129,7 @@ static iree_status_t iree_hal_xrt_driver_dump_device_info( // |out_device_info| must point to valid memory and additional data will be // appended to |buffer_ptr| and the new pointer is returned. static iree_status_t iree_hal_xrt_populate_device_info( - xrt::device device, uint8_t* buffer_ptr, uint8_t** out_buffer_ptr, + xrtDeviceHandle device_hdl, uint8_t* buffer_ptr, uint8_t** out_buffer_ptr, iree_hal_device_info_t* out_device_info) { *out_buffer_ptr = buffer_ptr; @@ -157,6 +137,7 @@ static iree_status_t iree_hal_xrt_populate_device_info( // We currenly only work with one XRT device and its device id is 0. out_device_info->device_id = 0; + xrt::device device(xrtDeviceToXclDevice(device_hdl)); std::string device_name = device.get_info(); const size_t name_len = strlen(device_name.c_str()); if (name_len >= IREE_HAL_XRT_MAX_DEVICE_NAME_LENGTH) { @@ -177,7 +158,7 @@ static iree_status_t iree_hal_xrt_driver_query_available_devices( iree_host_size_t* out_device_info_count, iree_hal_device_info_t** out_device_infos) { iree_hal_xrt_driver_t* driver = iree_hal_xrt_driver_cast(base_driver); - xrt::device device = driver->device; + xrtDeviceHandle device_hdl = driver->device_hdl; // Allocate the return infos and populate with the devices. iree_hal_device_info_t* device_infos = nullptr; iree_host_size_t single_info_size = @@ -190,7 +171,7 @@ static iree_status_t iree_hal_xrt_driver_query_available_devices( // Append all path and name strings at the end of the struct. 
uint8_t* buffer_ptr = (uint8_t*)device_infos + sizeof(iree_hal_device_info_t); iree_status_t status = iree_hal_xrt_populate_device_info( - device, buffer_ptr, &buffer_ptr, device_infos); + device_hdl, buffer_ptr, &buffer_ptr, device_infos); if (iree_status_is_ok(status)) { // We currenly only work with one XRT device. *out_device_info_count = 1; @@ -209,9 +190,8 @@ static iree_status_t iree_hal_xrt_driver_create_device_by_id( iree_hal_xrt_driver_t* driver = iree_hal_xrt_driver_cast(base_driver); iree_string_view_t device_name = iree_make_cstring_view("xrt"); - iree_status_t status = - iree_hal_xrt_device_create(device_name, &driver->device_params, - driver->device, host_allocator, out_device); + iree_status_t status = iree_hal_xrt_device_create( + device_name, &driver->device_params, host_allocator, out_device); IREE_TRACE_ZONE_END(z0); return status; @@ -226,9 +206,8 @@ static iree_status_t iree_hal_xrt_driver_create_device_by_path( iree_hal_xrt_driver_t* driver = iree_hal_xrt_driver_cast(base_driver); iree_string_view_t device_name = iree_make_cstring_view("xrt"); - iree_status_t status = - iree_hal_xrt_device_create(device_name, &driver->device_params, - driver->device, host_allocator, out_device); + iree_status_t status = iree_hal_xrt_device_create( + device_name, &driver->device_params, host_allocator, out_device); IREE_TRACE_ZONE_END(z0); return status; diff --git a/tests/conftest.py b/tests/conftest.py index e1b3f17b4..3a2a5f76b 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,19 +1,17 @@ -import os from contextlib import contextmanager from pathlib import Path import numpy as np import pytest -from iree.runtime import VmModule +from ml_dtypes import bfloat16 from iree.compiler import ir from iree.compiler._mlir_libs import get_dialect_registry from iree.compiler.api import Session, Output, Source from iree.compiler.extras import types as T -from ml_dtypes import bfloat16 +from iree.runtime import VmModule from iree.runtime import get_driver, 
Config, SystemContext - for t in [ "i8", "i16", @@ -99,8 +97,13 @@ def session_module(iree_session, tmp_path) -> ir.Module: yield iree_session, module_op +@pytest.fixture(scope="session") +def device(device="xrt") -> ir.Module: + yield get_driver(device).create_default_device() + + @contextmanager -def invokable_module(session, module, device="xrt") -> VmModule: +def invokable_module(session, module, device) -> VmModule: source = Source.wrap_buffer(session, str(module).encode()) inv = session.invocation() inv.parse_source(source) @@ -108,8 +111,7 @@ def invokable_module(session, module, device="xrt") -> VmModule: compiled_flatbuffer = Output.open_membuffer() inv.output_vm_bytecode(compiled_flatbuffer) - driver = get_driver(device) - config = Config(device=driver.create_default_device()) + config = Config(device=device) ctx = SystemContext(config=config) vm_module = VmModule.copy_buffer(ctx.instance, compiled_flatbuffer.map_memory()) ctx.add_vm_module(vm_module) diff --git a/tests/test_matmul.py b/tests/test_matmul.py index 9cd2f0d7c..658572e19 100644 --- a/tests/test_matmul.py +++ b/tests/test_matmul.py @@ -11,7 +11,7 @@ @pytest.mark.parametrize("target_backend", ["amd-aie"]) -def test_smol_matmul(session_module, target_backend): +def test_smol_matmul(session_module, target_backend, device): session, module = session_module @func(T.tensor(32, 16, T.i8()), T.tensor(16, 32, T.i8())) @@ -23,7 +23,7 @@ def matmul_i8_i32(lhs, rhs): arg0 = np.ones((32, 16), dtype=np.int8) arg1 = np.ones((16, 32), dtype=np.int8) - with invokable_module(session, module) as module: + with invokable_module(session, module, device) as module: results = module[matmul_i8_i32.__name__](arg0, arg1).to_host() assert np.array_equal(results, arg0 @ arg1) @@ -98,7 +98,6 @@ def matmul(lhs, rhs): (128, 256, 128), ] - small_i8_shapes_small = [ (64, 64, 64), (128, 256, 128), @@ -136,6 +135,7 @@ def test_matmul( lower_to_aie_pipeline, tile_pipeline, num_repeat_runs, + device, ): session, module = 
session_module @@ -146,8 +146,9 @@ def test_matmul( acc_type = mlir_type_to_np_dtype(acc_type) arg0 = np.ones((M, K), dtype=lhs_rhs_type) arg1 = np.ones((K, N), dtype=lhs_rhs_type) - with invokable_module(session, module) as module: + with invokable_module(session, module, device) as module: for i in range(num_repeat_runs): + print(f"{matmul_name} run {i}") results = module[matmul_name](arg0, arg1).to_host() assert np.array_equal( results, (arg0.astype(acc_type) @ arg1.astype(acc_type)) From 12787ce9ab82bd7117e93d0abf3d5c0555e83e3b Mon Sep 17 00:00:00 2001 From: makslevental Date: Tue, 1 Oct 2024 07:19:33 -0400 Subject: [PATCH 17/19] restore one run tests --- .github/workflows/ci-linux.yml | 43 +++++++++++++------------------- .github/workflows/ci-windows.yml | 33 +++++++++--------------- tests/test_matmul.py | 5 ++-- 3 files changed, 31 insertions(+), 50 deletions(-) diff --git a/.github/workflows/ci-linux.yml b/.github/workflows/ci-linux.yml index f9f69c11b..e7dcb810e 100644 --- a/.github/workflows/ci-linux.yml +++ b/.github/workflows/ci-linux.yml @@ -154,16 +154,13 @@ jobs: run: | source .venv/bin/activate source /opt/xilinx/xrt/setup.sh - for i in {1..50}; do - echo "run $i" - python build_tools/ci/cpu_comparison/run.py \ - test_aie_vs_cpu \ - $PWD/iree-install \ - $PWD/llvm-aie \ - --xrt-dir /opt/xilinx/xrt \ - --vitis-dir /opt/Xilinx/Vitis/2024.2 \ - --reset-npu-between-runs -v > /dev/null - done + python build_tools/ci/cpu_comparison/run.py \ + test_aie_vs_cpu \ + $PWD/iree-install \ + $PWD/llvm-aie \ + --xrt-dir /opt/xilinx/xrt \ + --vitis-dir /opt/Xilinx/Vitis/2024.2 \ + --reset-npu-between-runs -v - name: E2E correctness matmul test run: | @@ -195,24 +192,18 @@ jobs: sudo prlimit -lunlimited --pid $$ source .venv/bin/activate source /opt/xilinx/xrt/setup.sh - for i in {1..50}; do - echo "run $i" - bash build_tools/ci/run_matmul_test.sh \ - test_matmuls \ - iree-install \ - $PWD/llvm-aie \ - /opt/xilinx/xrt \ - /opt/Xilinx/Vitis/2024.2 > /dev/null - done + 
bash build_tools/ci/run_matmul_test.sh \ + test_matmuls \ + iree-install \ + $PWD/llvm-aie \ + /opt/xilinx/xrt \ + /opt/Xilinx/Vitis/2024.2 - name: Python tests run: | source .venv/bin/activate source /opt/xilinx/xrt/setup.sh - for i in {1..50}; do - echo "run $i" - pytest -v tests \ - --capture=tee-sys \ - --iree-install-dir=$PWD/iree-install \ - --peano-install-dir=$PWD/llvm-aie > /dev/null - done + pytest -v tests \ + --capture=tee-sys \ + --iree-install-dir=$PWD/iree-install \ + --peano-install-dir=$PWD/llvm-aie diff --git a/.github/workflows/ci-windows.yml b/.github/workflows/ci-windows.yml index ef31c478a..2fcc702e7 100644 --- a/.github/workflows/ci-windows.yml +++ b/.github/workflows/ci-windows.yml @@ -169,35 +169,26 @@ jobs: shell: bash run: | source .venv/Scripts/activate - for i in {1..50}; do - echo "run $i" - bash build_tools/ci/run_matmul_test.sh \ - /c/test_matmuls \ - $PWD/iree-install \ - $PWD/llvm-aie > /dev/null - done + bash build_tools/ci/run_matmul_test.sh \ + /c/test_matmuls \ + $PWD/iree-install \ + $PWD/llvm-aie - name : E2E comparison of AIE to llvm-cpu shell: bash run: | source .venv/Scripts/activate - for i in {1..50}; do - echo "run $i" - python build_tools/ci/cpu_comparison/run.py \ - /c/test_aie_vs_cpu \ - $PWD/iree-install \ - $PWD/llvm-aie -v > /dev/null - done + python build_tools/ci/cpu_comparison/run.py \ + /c/test_aie_vs_cpu \ + $PWD/iree-install \ + $PWD/llvm-aie -v - name: Python tests run: | ls $env:XILINX_XRT .\.venv\Scripts\Activate.ps1 mkdir temp - for ($i = 1; $i -le 50; $i++) { - echo "run $i" - pytest tests -sv ` - --basetemp=$PWD\temp ` - --iree-install-dir="$PWD/iree-install" ` - --peano-install-dir="$PWD/llvm-aie" - } + pytest tests -sv ` + --basetemp=$PWD\temp ` + --iree-install-dir="$PWD/iree-install" ` + --peano-install-dir="$PWD/llvm-aie" diff --git a/tests/test_matmul.py b/tests/test_matmul.py index 658572e19..b58fc42e8 100644 --- a/tests/test_matmul.py +++ b/tests/test_matmul.py @@ -144,11 +144,10 @@ def 
test_matmul( lhs_rhs_type = mlir_type_to_np_dtype(lhs_rhs_type) acc_type = mlir_type_to_np_dtype(acc_type) - arg0 = np.ones((M, K), dtype=lhs_rhs_type) - arg1 = np.ones((K, N), dtype=lhs_rhs_type) + arg0 = np.random.randint(-1, 1, (M, K), dtype=lhs_rhs_type) + arg1 = np.random.randint(-1, 1, (K, N), dtype=lhs_rhs_type) with invokable_module(session, module, device) as module: for i in range(num_repeat_runs): - print(f"{matmul_name} run {i}") results = module[matmul_name](arg0, arg1).to_host() assert np.array_equal( results, (arg0.astype(acc_type) @ arg1.astype(acc_type)) From 3db0a06e27c6edba6bd6bbcf017fff66b035c651 Mon Sep 17 00:00:00 2001 From: Maksim Levental Date: Tue, 1 Oct 2024 07:50:28 -0400 Subject: [PATCH 18/19] Update build_tools/ci/run_matmul_test.sh Co-authored-by: Jorn Tuyls --- build_tools/ci/run_matmul_test.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/build_tools/ci/run_matmul_test.sh b/build_tools/ci/run_matmul_test.sh index b0ec2c550..fb4b01a77 100755 --- a/build_tools/ci/run_matmul_test.sh +++ b/build_tools/ci/run_matmul_test.sh @@ -654,6 +654,7 @@ run_matmul_test_on_shapes ${i32_shapes_small[@]} \ i32_shapes_medium=( '1024x1024x1024' ) +# TODO(jornt): re-enable `1536x2048x1536` #if [ "$OSTYPE" != "msys" ]; then # i32_shapes_medium+=('1536x2048x1536') #fi From fc4808768ff887a0f5a2b9a8d46011810dec1935 Mon Sep 17 00:00:00 2001 From: makslevental Date: Tue, 1 Oct 2024 08:20:47 -0400 Subject: [PATCH 19/19] comments --- build_tools/download_peano.ps1 | 3 ++- build_tools/download_peano.sh | 3 ++- build_tools/peano_commit.txt | 1 + tests/test_matmul.py | 8 ++++---- 4 files changed, 9 insertions(+), 6 deletions(-) create mode 100644 build_tools/peano_commit.txt diff --git a/build_tools/download_peano.ps1 b/build_tools/download_peano.ps1 index 98b8cf982..89bd6808f 100644 --- a/build_tools/download_peano.ps1 +++ b/build_tools/download_peano.ps1 @@ -6,6 +6,7 @@ $ErrorActionPreference = 'Stop' -$RELEASE = "19.0.0.2024082221+90abe71b" +$this_dir = 
Split-Path -Path $MyInvocation.MyCommand.Path -Parent +$RELEASE = (Get-Content -Path "$this_dir/peano_commit.txt") pip download llvm_aie==$RELEASE -f https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly Expand-Archive (Get-ChildItem -Filter llvm*.whl).FullName -DestinationPath $PWD.Path diff --git a/build_tools/download_peano.sh b/build_tools/download_peano.sh index d42e1777b..43b3c3cf2 100644 --- a/build_tools/download_peano.sh +++ b/build_tools/download_peano.sh @@ -6,6 +6,7 @@ # See https://llvm.org/LICENSE.txt for license information. # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -RELEASE=19.0.0.2024092601+562ccea2 +this_dir="$(cd $(dirname $0) && pwd)" +RELEASE=$(cat $this_dir/peano_commit.txt) pip download llvm_aie==$RELEASE -f https://github.com/Xilinx/llvm-aie/releases/expanded_assets/nightly unzip llvm_aie*whl diff --git a/build_tools/peano_commit.txt b/build_tools/peano_commit.txt new file mode 100644 index 000000000..d6d5007c1 --- /dev/null +++ b/build_tools/peano_commit.txt @@ -0,0 +1 @@ +19.0.0.2024092601+562ccea2 \ No newline at end of file diff --git a/tests/test_matmul.py b/tests/test_matmul.py index b58fc42e8..598bfdcab 100644 --- a/tests/test_matmul.py +++ b/tests/test_matmul.py @@ -21,8 +21,8 @@ def matmul_i8_i32(lhs, rhs): v1 = linalg.fill(cst, outs=[v0]) return linalg.matmul(lhs, rhs, outs=[v1]) - arg0 = np.ones((32, 16), dtype=np.int8) - arg1 = np.ones((16, 32), dtype=np.int8) + arg0 = np.random.randint(-1, 3, (32, 16), dtype=np.int8) + arg1 = np.random.randint(-1, 3, (16, 32), dtype=np.int8) with invokable_module(session, module, device) as module: results = module[matmul_i8_i32.__name__](arg0, arg1).to_host() assert np.array_equal(results, arg0 @ arg1) @@ -144,8 +144,8 @@ def test_matmul( lhs_rhs_type = mlir_type_to_np_dtype(lhs_rhs_type) acc_type = mlir_type_to_np_dtype(acc_type) - arg0 = np.random.randint(-1, 1, (M, K), dtype=lhs_rhs_type) - arg1 = np.random.randint(-1, 1, (K, N), dtype=lhs_rhs_type) + arg0 = 
np.random.randint(-1, 3, (M, K), dtype=lhs_rhs_type) + arg1 = np.random.randint(-1, 3, (K, N), dtype=lhs_rhs_type) with invokable_module(session, module, device) as module: for i in range(num_repeat_runs): results = module[matmul_name](arg0, arg1).to_host()