diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml index b58b7e03..b37b85a1 100644 --- a/.github/workflows/cmake.yml +++ b/.github/workflows/cmake.yml @@ -2,16 +2,29 @@ name: Build on: [push, pull_request] +# We define multiple jobs that test compatibility of the FIRESTARTER code against different compilers. +# On linux: +# FIRESTARTER_HIP version 6.3 against the hipcc compiler on ubuntu24.04 +# FIRESTARTER_ONEAPI versions 2023.2.0 and 2024.0 against icx and icpx compiler on ubuntu24.04 +# FIRESTARTER_CUDA with cuda versions 8.0, 11.0 and NVHPC-22.5 against the default compiler on ubuntu24.04 +# FIRESTARTER against +# ubuntu 20.04 gcc-7 and clang-9 +# ubuntu 24.04 gcc-14 and clang-18 +# default compilers on ubuntu-20.04, ubuntu-22.04 and ubuntu-24.04 +# On Windows: +# FIRESTARTER against windows-2019 MSVC and mingw +# FIRESTARTER_CUDA against windows-2019 MSVC +# On macOS: +# FIRESTARTER against Xcode on macOS 13 + jobs: build-linux-hip-rocm: strategy: -# max-parallel: 1 # Sets the limit of jobs to run concurrently fail-fast: true matrix: - os: [ubuntu-22.04] - compiler: [g++-9, g++-10, g++-11, g++-12, clang++-11, clang++-12, clang++-13, clang++-14, clang++-15] - HIP: ['6.2'] + os: [ubuntu-24.04] + HIP: ['6.3'] runs-on: ${{ matrix.os }} env: @@ -24,42 +37,6 @@ jobs: sudo rm -rf /opt/ghc sudo rm -rf /usr/local/.ghcup - - name: Install g++-9 (if needed) - if: matrix.compiler == 'g++-9' - run: | - sudo apt install g++-9 - - name: Install g++-10 (if needed) - if: matrix.compiler == 'g++-10' - run: | - sudo apt install g++-10 - - name: Install g++-11 (if needed) - if: matrix.compiler == 'g++-11' - run: | - sudo apt install g++-11 - - name: Install g++-12 (if needed) - if: matrix.compiler == 'g++-12' - run: | - sudo apt install g++-12 - - name: Install clang++-11 (if needed) - if: matrix.compiler == 'clang++-11' - run: | - sudo apt install clang-11 - - name: Install clang++-12 (if needed) - if: matrix.compiler == 'clang++-12' - run: | - sudo apt
install clang-12 - - name: Install clang++-13 (if needed) - if: matrix.compiler == 'clang++-13' - run: | - sudo apt install clang-13 - - name: Install clang++-14 (if needed) - if: matrix.compiler == 'clang++-14' - run: | - sudo apt install clang-14 - - name: Install clang++-15 (if needed) - if: matrix.compiler == 'clang++-15' - run: | - sudo apt install clang-15 - name: Setup Node.js uses: actions/setup-node@v4 with: @@ -69,12 +46,11 @@ jobs: with: fetch-depth: '0' - name: Install HIP runtime (if needed) - if: matrix.HIP != '0' && matrix.ONEAPI == 0 && matrix.CUDA == 0 run: | case ${{ matrix.HIP }} in - 6.2) - wget https://repo.radeon.com/amdgpu-install/6.2/ubuntu/jammy/amdgpu-install_6.2.60200-1_all.deb - sudo apt install ./amdgpu-install_6.2.60200-1_all.deb + 6.3) + wget https://repo.radeon.com/amdgpu-install/6.3/ubuntu/noble/amdgpu-install_6.3.60300-1_all.deb + sudo apt install ./amdgpu-install_6.3.60300-1_all.deb sudo amdgpu-install -y --usecase=hiplibsdk,rocm --no-dkms ;; esac @@ -83,9 +59,9 @@ jobs: run: | mkdir build - name: Run CMake configure (HIP) - if: matrix.HIP != '0' && matrix.CUDA == '0' && matrix.ONEAPI == '0' env: - CXX: ${{ matrix.compiler }} + CC: '/opt/rocm/bin/hipcc' + CXX: '/opt/rocm/bin/hipcc' run: | export CPATH=${HIP_ROOT}/include:${HIP_ROOT} export LD_LIBRARY_PATH=${HIP_ROOT}/lib64:${HIP_ROOT}/lib64/stubs:${LD_LIBRARY_PATH} @@ -98,62 +74,106 @@ jobs: cd build cmake -DFIRESTARTER_BUILD_TYPE="FIRESTARTER_HIP" -DCMAKE_EXE_LINKER_FLAGS="-L${HIP_ROOT}/lib64/stubs/" .. 
- - name: Build (default, CUDA, HIP) - if: matrix.ONEAPI =='0' + - name: Build (HIP) run: | cd build - make -j2 + make -j4 - name: Strip binary (HIP) - if: matrix.CUDA == '0' && matrix.HIP != '0' && matrix.ONEAPI == '0' run: | cd build strip src/FIRESTARTER_HIP - uses: actions/upload-artifact@v4 - if: matrix.compiler == 'g++-9' && matrix.HIP != '0' && matrix.CUDA == '0' && matrix.ONEAPI == '0' && ( github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') || github.event_name == 'pull_request' ) + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') || github.event_name == 'pull_request' with: name: FIRESTARTER_HIP_${{ matrix.HIP }}-linux retention-days: 1 path: build/src/FIRESTARTER_HIP - build-linux: + + + build-linux-oneapi: strategy: -# max-parallel: 1 # Sets the limit of jobs to run concurrently fail-fast: true matrix: - os: [ubuntu-20.04] - compiler: [g++-7, g++-8, g++-9, g++-10, clang++-8, clang++-9, clang++-10] - CUDA: ['0', '8.0', '11.0', 'NVHPC-22.5'] - ONEAPI: ['0', '2023.2.0', '2024.0'] - + os: [ubuntu-24.04] + ONEAPI: ['2023.2.0', '2024.0'] + runs-on: ${{ matrix.os }} - env: - CUDA_ROOT: '/usr/local/cuda' -#kudos to https://github.com/easimon/maximize-build-space/blob/master/action.yml steps: - name: Try to clean up some things run: | sudo rm -rf /usr/local/lib/android sudo rm -rf /usr/share/dotnet - - name: Install g++-7 (if needed) - if: matrix.compiler == 'g++-7' + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: '20' + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: '0' + - name: Install OneAPI Base-Toolkit run: | - sudo apt install g++-7 - - name: Install g++-8 (if needed) - if: matrix.compiler == 'g++-8' + wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null + echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] 
https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list + sudo apt update + sudo apt install intel-basekit-${{ matrix.ONEAPI }} + - name: Create build directory run: | - sudo apt install g++-8 - - name: Install clang++-8 (if needed) - if: matrix.compiler == 'clang++-8' + mkdir build + - name: Run CMake configure (OneAPI 2023.2.0) + if: matrix.ONEAPI == '2023.2.0' + run: | + . /opt/intel/oneapi/setvars.sh + cd build + cmake -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DFIRESTARTER_BUILD_TYPE="FIRESTARTER_ONEAPI" .. + - name: Run CMake configure (OneAPI 2024.0) + if: matrix.ONEAPI == '2024.0' + run: | + . /opt/intel/oneapi/${{ matrix.ONEAPI }}/oneapi-vars.sh + cd build + cmake -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DFIRESTARTER_BUILD_TYPE="FIRESTARTER_ONEAPI" .. + - name: Build (OneAPI 2023.2.0) + if: matrix.ONEAPI == '2023.2.0' + run: | + . /opt/intel/oneapi/setvars.sh + cd build + make -j4 + - name: Build (OneAPI 2024.0) + if: matrix.ONEAPI == '2024.0' run: | - sudo apt install clang-8 - - name: Install clang++-9 (if needed) - if: matrix.compiler == 'clang++-9' + . 
/opt/intel/oneapi/${{ matrix.ONEAPI }}/oneapi-vars.sh + cd build + make -j4 + - name: Strip binary (OneAPI) run: | - sudo apt install clang-9 - - name: Install clang++-10 (if needed) - if: matrix.compiler == 'clang++-10' + cd build + strip src/FIRESTARTER_ONEAPI + - uses: actions/upload-artifact@v4 + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') || github.event_name == 'pull_request' + with: + name: FIRESTARTER_ONEAPI_${{ matrix.ONEAPI }}-linux + retention-days: 1 + path: build/src/FIRESTARTER_ONEAPI + + - name: UnInstall OneAPI Base-Toolkit (if needed) run: | - sudo apt install clang-10 + sudo apt remove intel-basekit-${{ matrix.ONEAPI }} + sudo apt autoremove + + + build-linux-cuda: + strategy: + fail-fast: true + matrix: + os: [ubuntu-24.04] + CUDA: ['8.0', '11.0', 'NVHPC-22.5'] + + runs-on: ${{ matrix.os }} + + env: + CUDA_ROOT: '/usr/local/cuda' + steps: - name: Setup Node.js uses: actions/setup-node@v4 with: @@ -162,8 +182,7 @@ jobs: uses: actions/checkout@v4 with: fetch-depth: '0' - - name: Install CUDA runtime (if needed) - if: matrix.CUDA != '0' && matrix.ONEAPI == 0 + - name: Install CUDA runtime run: | case ${{ matrix.CUDA }} in 8.0) @@ -186,162 +205,142 @@ jobs: rm nvhpc_2022_225_Linux_x86_64_cuda_11.7.tar.gz sudo NVHPC_SILENT="true" NVHPC_INSTALL_DIR="$CUDA_ROOT" NVHPC_INSTALL_TYPE="single" ./nvhpc_2022_225_Linux_x86_64_cuda_11.7/install rm -rf nvhpc_2022_225_Linux_x86_64_cuda_11.7 - + ;; esac - - name: Install OneAPI Base-Toolkit (if needed) - if: matrix.ONEAPI != '0' && matrix.CUDA == '0' - run: | - wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | sudo tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null - echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" | sudo tee /etc/apt/sources.list.d/oneAPI.list - sudo apt update - sudo apt install intel-basekit-${{ matrix.ONEAPI }} - name: Create build 
directory run: | mkdir build - - name: Run CMake configure (default) - if: matrix.CUDA == '0' && matrix.ONEAPI == '0' - env: - CXX: ${{ matrix.compiler }} - run: | - cd build - cmake .. - name: Run CMake configure (CUDA) - if: matrix.CUDA != '0' && matrix.CUDA != 'NVHPC-22.5' && matrix.ONEAPI == '0' - env: - CXX: ${{ matrix.compiler }} + if: matrix.CUDA != 'NVHPC-22.5' run: | export CPATH=${CUDA_ROOT}/include:${CPATH} export LD_LIBRARY_PATH=${CUDA_ROOT}/lib64:${CUDA_ROOT}/lib64/stubs:${LD_LIBRARY_PATH} export LIBRARY_PATH=${CUDA_ROOT}/lib64:${CUDA_ROOT}/lib64/stubs:${LIBRARY_PATH} export CUDA_LIB_PATH=${CUDA_ROOT}/lib64:${CUDA_ROOT}/lib64/stubs - export PATH=${CUDA_ROOT}:${PATH} + export PATH=${CUDA_ROOT}/bin:${PATH} export CUDA_HOME=${CUDA_ROOT} export CUDA_PATH=${CUDA_ROOT} export CUDAToolkit_ROOT=${CUDA_ROOT} - + cd build cmake -DFIRESTARTER_BUILD_TYPE="FIRESTARTER_CUDA" -DCMAKE_EXE_LINKER_FLAGS="-L${CUDA_ROOT}/lib64/stubs/" .. - name: Run CMake configure (CUDA with NVHPC) - if: matrix.CUDA == 'NVHPC-22.5' && matrix.ONEAPI == '0' - env: - CXX: ${{ matrix.compiler }} + if: matrix.CUDA == 'NVHPC-22.5' run: | NVARCH=`uname -s`_`uname -m`; export NVARCH PATH=$CUDA_ROOT/$NVARCH/22.5/compilers/bin:$PATH; export PATH LD_LIBRARY_PATH=$CUDA_ROOT/$NVARCH/22.5/compilers/lib:$LD_LIBRARY_PATH; export LD_LIBRARY_PATH LD_LIBRARY_PATH=$CUDA_ROOT/$NVARCH/22.5/cuda/11.7/lib64:$LD_LIBRARY_PATH; export LD_LIBRARY_PATH LD_LIBRARY_PATH=$CUDA_ROOT/$NVARCH/22.5/cuda/11.7/lib64/stubs:$LD_LIBRARY_PATH; export LD_LIBRARY_PATH - + cd build cmake -DFIRESTARTER_BUILD_TYPE="FIRESTARTER_CUDA" -DCMAKE_EXE_LINKER_FLAGS=-L"$CUDA_ROOT/$NVARCH/22.5/cuda/11.7/lib64/stubs" -LA .. - - name: Run CMake configure (OneAPI 2023.2.0) - if: matrix.CUDA == '0' && matrix.ONEAPI =='2023.2.0' + - name: Build (CUDA) run: | - . /opt/intel/oneapi/setvars.sh cd build - cmake -DFIRESTARTER_BUILD_TYPE="FIRESTARTER_ONEAPI" .. 
- - name: Run CMake configure (OneAPI 2024.0) - if: matrix.CUDA == '0' && matrix.ONEAPI =='2024.0' + make -j4 + - name: Strip binary (CUDA) run: | - . /opt/intel/oneapi/${{ matrix.ONEAPI }}/oneapi-vars.sh cd build - cmake -DFIRESTARTER_BUILD_TYPE="FIRESTARTER_ONEAPI" .. - - name: Build (default, CUDA) - if: matrix.ONEAPI =='0' + strip src/FIRESTARTER_CUDA + - uses: actions/upload-artifact@v4 + if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') || github.event_name == 'pull_request' + with: + name: FIRESTARTER_CUDA_${{ matrix.CUDA }}-linux + retention-days: 1 + path: build/src/FIRESTARTER_CUDA + - name: UnInstall CUDA runtime (if needed) run: | - cd build - make -j2 - - name: Build (OneAPI 2023.2.0) - if: matrix.CUDA == '0' && matrix.ONEAPI =='2023.2.0' + sudo rm -rf ${CUDA_ROOT} + + + build-linux: + strategy: + fail-fast: true + matrix: + include: + # Oldest supported compiler on older Ubuntu + - { os: ubuntu-20.04, compiler: gcc-7, cxxcompiler: g++-7 } + - { os: ubuntu-20.04, compiler: clang-9, cxxcompiler: clang++-9 } + # Latest compiler on latest Ubuntu + - { os: ubuntu-24.04, compiler: gcc-14, cxxcompiler: g++-14 } + - { os: ubuntu-24.04, compiler: clang-18, cxxcompiler: clang++-18 } + # Default compilers on all ubuntu + - { os: ubuntu-20.04, compiler: default } + - { os: ubuntu-22.04, compiler: default } + - { os: ubuntu-24.04, compiler: default } + + runs-on: ${{ matrix.os }} + + steps: + - name: Install compiler (if needed) + if: startsWith(matrix.compiler, 'gcc') run: | - . 
/opt/intel/oneapi/setvars.sh - cd build - make -j2 - - name: Build (OneAPI 2024.0) - if: matrix.CUDA == '0' && matrix.ONEAPI =='2024.0' + sudo apt install ${{ matrix.cxxcompiler }} + - name: Install compiler (if needed) + if: startsWith(matrix.compiler, 'clang') + run: | + sudo apt install ${{ matrix.compiler }} + - name: Setup Node.js + uses: actions/setup-node@v4 + with: + node-version: '20' + - name: Checkout repository + uses: actions/checkout@v4 + with: + fetch-depth: '0' + - name: Create build directory + run: | + mkdir build + - name: Run CMake configure + if: matrix.compiler != 'default' + env: + CC: ${{ matrix.compiler }} + CXX: ${{ matrix.cxxcompiler }} run: | - . /opt/intel/oneapi/${{ matrix.ONEAPI }}/oneapi-vars.sh cd build - make -j2 - - name: Strip binary (default) - if: matrix.CUDA == '0' && matrix.ONEAPI == '0' + cmake .. + - name: Run CMake configure + if: matrix.compiler == 'default' run: | cd build - strip src/FIRESTARTER - - name: Strip binary (CUDA) - if: matrix.CUDA != '0' && matrix.ONEAPI == '0' + cmake .. 
+ - name: Build (default) run: | cd build - strip src/FIRESTARTER_CUDA - - name: Strip binary (OneAPI) - if: matrix.ONEAPI != '0' && matrix.CUDA == '0' + make -j4 + - name: Strip binary (default) run: | cd build - strip src/FIRESTARTER_ONEAPI + strip src/FIRESTARTER - name: Test FIRESTARTER (default) - if: matrix.CUDA == '0' && matrix.ONEAPI == '0' run: ./build/src/FIRESTARTER -t 1 - uses: actions/upload-artifact@v4 - if: matrix.compiler == 'clang++-10' && matrix.CUDA == '0' && matrix.ONEAPI == '0' && ( github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') || github.event_name == 'pull_request' ) + if: matrix.compiler == 'clang-18' && ( github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') || github.event_name == 'pull_request' ) with: name: FIRESTARTER-linux retention-days: 1 path: build/src/FIRESTARTER - - uses: actions/upload-artifact@v4 - if: matrix.compiler == 'clang++-10' && matrix.CUDA != '0' && matrix.ONEAPI == '0' && ( github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') || github.event_name == 'pull_request' ) - with: - name: FIRESTARTER_CUDA_${{ matrix.CUDA }}-linux - retention-days: 1 - path: build/src/FIRESTARTER_CUDA - - uses: actions/upload-artifact@v4 - if: matrix.compiler == 'clang++-10' && matrix.CUDA == '0' && matrix.ONEAPI != '0' && ( github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') || github.event_name == 'pull_request' ) - with: - name: FIRESTARTER_ONEAPI_${{ matrix.ONEAPI }}-linux - retention-days: 1 - path: build/src/FIRESTARTER_ONEAPI - - - name: UnInstall g++-7 (if needed) - if: matrix.compiler == 'g++-7' - run: | - sudo apt remove g++-7 - sudo apt autoremove - - name: UnInstall g++-8 (if needed) - if: matrix.compiler == 'g++-8' + - name: Uninstall compiler (if needed) + if: startsWith(matrix.compiler, 'gcc') run: | - sudo apt remove g++-8 + sudo apt remove ${{ matrix.cxxcompiler }} sudo apt autoremove - - name: UnInstall clang++-8 (if needed) - if: matrix.compiler == 
'clang++-8' + - name: Uninstall compiler (if needed) + if: startsWith(matrix.compiler, 'clang') run: | - sudo apt remove clang-8 - sudo apt autoremove - - name: UnInstall clang++-9 (if needed) - if: matrix.compiler == 'clang++-9' - run: | - sudo apt remove clang-9 - sudo apt autoremove - - name: UnInstall clang++-10 (if needed) - if: matrix.compiler == 'clang++-10' - run: | - sudo apt remove clang-10 - sudo apt autoremove - - name: UnInstall CUDA runtime (if needed) - if: matrix.CUDA != '0' && matrix.ONEAPI == '0' - run: | - sudo rm -rf ${CUDA_ROOT} - - name: UnInstall OneAPI Base-Toolkit (if needed) - if: matrix.ONEAPI != '0' && matrix.CUDA == '0' - run: | - sudo apt remove intel-basekit-${{ matrix.ONEAPI }} + sudo apt remove ${{ matrix.compiler }} sudo apt autoremove + + build-windows: strategy: - fail-fast: false + fail-fast: true matrix: os: [windows-2019] cfg: - - { CUDA: '0', ONEAPI: '0', MSVC: true } - - { CUDA: '0', ONEAPI: '0', MSVC: false } + - { CUDA: '0', MSVC: true } + - { CUDA: '0', MSVC: false } - { CUDA: '11.0', MSVC: true } runs-on: ${{ matrix.os }} @@ -395,7 +394,7 @@ jobs: shell: pwsh run: | cd build - cmake --build . -j2 + cmake --build . 
-j4 - name: Copy Hwloc DLL shell: pwsh if: matrix.cfg.MSVC == true @@ -427,11 +426,12 @@ jobs: path: | build\src\FIRESTARTER_CUDA.exe build\src\libhwloc-15.dll + build-macos: strategy: - fail-fast: false + fail-fast: true matrix: - os: [macos-12, macos-13] + os: [macos-13] runs-on: ${{ matrix.os }} @@ -452,7 +452,7 @@ jobs: - name: Build run: | cd build - make -j2 + make -j4 - name: Strip binary run: | cd build @@ -461,23 +461,18 @@ jobs: run: | cd build ./src/FIRESTARTER -t 1 - - uses: actions/upload-artifact@v4 - if: matrix.os == 'macos-12' && ( github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') || github.event_name == 'pull_request' ) - with: - retention-days: 1 - name: FIRESTARTER-macOS_12 - path: build/src/FIRESTARTER - uses: actions/upload-artifact@v4 if: matrix.os == 'macos-13' && ( github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') || github.event_name == 'pull_request' ) with: retention-days: 1 name: FIRESTARTER-macOS_13 path: build/src/FIRESTARTER + create-download: name: Create download for Website runs-on: ubuntu-latest if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags/') || github.event_name == 'pull_request' - needs: [build-linux-hip-rocm, build-linux, build-macos, build-windows] + needs: [ build-linux-hip-rocm, build-linux-oneapi, build-linux-cuda, build-linux, build-macos, build-windows ] steps: - uses: actions/checkout@v4 with: @@ -555,27 +550,17 @@ jobs: mv FIRESTARTER/FIRESTARTER_ONEAPI FIRESTARTER_ONEAPI_2023.2.0 rm -rf FIRESTARTER chmod +x FIRESTARTER_ONEAPI_2023.2.0 - # Linux HIP 6.2 - - name: Retrieve FIRESTARTER_HIP_6.2-linux - uses: actions/download-artifact@v4 - with: - name: FIRESTARTER_HIP_6.2-linux - path: FIRESTARTER - - name: Move binary to right directory - run: | - mv FIRESTARTER/FIRESTARTER_HIP FIRESTARTER_HIP_6.2 - rm -rf FIRESTARTER - chmod +x FIRESTARTER_HIP_6.2 - - name: Retrieve FIRESTARTER-macOS_12 + # Linux HIP 6.3 + - name: Retrieve FIRESTARTER_HIP_6.3-linux uses: 
actions/download-artifact@v4 with: - name: FIRESTARTER-macOS_12 + name: FIRESTARTER_HIP_6.3-linux path: FIRESTARTER - name: Move binary to right directory run: | - mv FIRESTARTER/FIRESTARTER FIRESTARTER-macOS_12 + mv FIRESTARTER/FIRESTARTER_HIP FIRESTARTER_HIP_6.3 rm -rf FIRESTARTER - chmod +x FIRESTARTER-macOS_12 + chmod +x FIRESTARTER_HIP_6.3 - name: Retrieve FIRESTARTER-macOS_13 uses: actions/download-artifact@v4 with: diff --git a/.github/workflows/ctest.yml b/.github/workflows/ctest.yml new file mode 100644 index 00000000..96655b97 --- /dev/null +++ b/.github/workflows/ctest.yml @@ -0,0 +1,39 @@ +name: ctest + +on: [push, pull_request] + +env: + PYTHONUNBUFFERED: 1 + +jobs: + build: + runs-on: ubuntu-latest + + steps: + - uses: actions/checkout@v4 + with: + submodules: 'true' + + - name: Install python3 + run: | + sudo apt update + sudo apt install python3 + + - name: Create build directory + run: | + mkdir build + + - name: Run CMake configure (default) + run: | + cd build + cmake .. 
+ + - name: Build + run: | + cd build + make -j4 + + - name: Ctest + run: | + cd build + ctest \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt index c8f580e4..b86bec79 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,7 +1,9 @@ cmake_minimum_required(VERSION 3.22) project(FIRESTARTER) +enable_testing() set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +set(CMAKE_CXX_STANDARD 17) include(cmake/GitSubmoduleUpdate.cmake) @@ -55,41 +57,10 @@ endif() include(cmake/GitSubmoduleUpdate.cmake) git_submodule_update() -if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") -else() -SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -O2 -fdata-sections -ffunction-sections") -endif() +include(cmake/BuildOptions.cmake) +include(cmake/BuildSettings.cmake) -if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") - SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-dead_strip") -elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") -else() - SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--gc-sections") -endif() - -# enable debug features on linux -if(CMAKE_SYSTEM_NAME STREQUAL "Linux") - option(FIRESTARTER_DEBUG_FEATURES "Enable debug features" ON) - if (FIRESTARTER_DEBUG_FEATURES) - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DFIRESTARTER_DEBUG_FEATURES") - endif() -endif() - -set(FIRESTARTER_BUILD_TYPE "FIRESTARTER" CACHE STRING "FIRESTARTER_BUILD_TYPE can be any of FIRESTARTER, FIRESTARTER_CUDA, FIRESTARTER_ONEAPI, or FIRESTARTER_HIP.") -set_property(CACHE FIRESTARTER_BUILD_TYPE PROPERTY STRINGS FIRESTARTER FIRESTARTER_CUDA FIRESTARTER_ONEAPI) -if (${FIRESTARTER_BUILD_TYPE} STREQUAL "FIRESTARTER") - option(FIRESTARTER_LINK_STATIC "Link FIRESTARTER as a static binary. Note, dlopen is not supported in static binaries. This option is not available on macOS or with CUDA or OneAPI enabled." ON) -endif() -if (${FIRESTARTER_BUILD_TYPE} STREQUAL "FIRESTARTER") - option(FIRESTARTER_BUILD_HWLOC "Build hwloc dependency." 
ON) -elseif(${FIRESTARTER_BUILD_TYPE} STREQUAL "FIRESTARTER_CUDA") - option(FIRESTARTER_BUILD_HWLOC "Build hwloc dependency." ON) -elseif(${FIRESTARTER_BUILD_TYPE} STREQUAL "FIRESTARTER_ONEAPI") - option(FIRESTARTER_BUILD_HWLOC "Build hwloc dependency." ON) -elseif(${FIRESTARTER_BUILD_TYPE} STREQUAL "FIRESTARTER_HIP") - option(FIRESTARTER_BUILD_HWLOC "Build hwloc dependency." ON) -endif() -option(FIRESTARTER_THREAD_AFFINITY "Enable FIRESTARTER to set affinity to hardware threads." ON) +include(cmake/DarwinBuild.cmake) if(NOT DEFINED ASMJIT_STATIC) set(ASMJIT_STATIC TRUE) @@ -111,3 +82,5 @@ find_package(Threads REQUIRED) include(cmake/InstallHwloc.cmake) add_subdirectory(src) + +add_subdirectory(test) \ No newline at end of file diff --git a/README.md b/README.md index efda1299..058af152 100644 --- a/README.md +++ b/README.md @@ -152,7 +152,12 @@ CMake option | Description `FIRESTARTER_BUILD_HWLOC` | Build hwloc dependency. Default `ON` `FIRESTARTER_THREAD_AFFINITY` | Enable FIRESTARTER to set affinity to hardware threads. Default `ON` -When building `FIRESTARTER_ONEAPI` make sure that the Intel Math Kernel Library (MKL) and the complier `icx` can be found. These will be used to build `FIRESTARTER`, while dependencies will be build with `$CC` and `$CXX` respectively. +When building `FIRESTARTER_ONEAPI` make sure that the Intel Math Kernel +Library (MKL) and the compiler `icx` and `icpx` can be found. Please provide +them through the `CC` and `CXX` environment variables. + +When building `FIRESTARTER_HIP` make sure that the compiler `hipcc` can be +found. Please provide it through the `CC` and `CXX` environment variables. ## Metrics diff --git a/cmake/BuildOptions.cmake b/cmake/BuildOptions.cmake new file mode 100644 index 00000000..660373ea --- /dev/null +++ b/cmake/BuildOptions.cmake @@ -0,0 +1,26 @@ +include(CMakeDependentOption) + +# Set the different available FIRESTARTER builds. 
+set(FIRESTARTER_BUILD_TYPE "FIRESTARTER" CACHE STRING "FIRESTARTER_BUILD_TYPE can be any of FIRESTARTER, FIRESTARTER_CUDA, FIRESTARTER_ONEAPI, or FIRESTARTER_HIP.") +set_property(CACHE FIRESTARTER_BUILD_TYPE PROPERTY STRINGS FIRESTARTER FIRESTARTER_CUDA FIRESTARTER_ONEAPI FIRESTARTER_HIP) + +# Static linking is not supported with GPU devices or macOS. Declared with option() so the value is cached and -DFIRESTARTER_LINK_STATIC=OFF is honored. +if(${FIRESTARTER_BUILD_TYPE} STREQUAL "FIRESTARTER" AND NOT ${CMAKE_SYSTEM_NAME} STREQUAL "Darwin") + option(FIRESTARTER_LINK_STATIC "Link FIRESTARTER as a static binary. Note, dlopen is not supported in static binaries. This option is not available on macOS or with CUDA, OneAPI or HIP enabled." ON) +endif() + + +# We vendor hwloc per default. +option(FIRESTARTER_BUILD_HWLOC "Build hwloc dependency." ON) + + +# Thread affinity is offered as an option on Linux only and defaults to ON. +if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") + option(FIRESTARTER_THREAD_AFFINITY "Enable FIRESTARTER to set affinity to hardware threads." ON) +endif() + + +# Debug features are offered as an option on Linux only and default to ON.
+if(${CMAKE_SYSTEM_NAME} STREQUAL "Linux") + option(FIRESTARTER_DEBUG_FEATURES "Enable debug features" ON) +endif() \ No newline at end of file diff --git a/cmake/BuildSettings.cmake b/cmake/BuildSettings.cmake new file mode 100644 index 00000000..d16949ff --- /dev/null +++ b/cmake/BuildSettings.cmake @@ -0,0 +1,69 @@ +# Dependent Linux features +if(FIRESTARTER_LINK_STATIC) + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DFIRESTARTER_LINK_STATIC") +endif() + +if (FIRESTARTER_DEBUG_FEATURES) + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DFIRESTARTER_DEBUG_FEATURES") +endif() + +if (FIRESTARTER_THREAD_AFFINITY) + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DFIRESTARTER_THREAD_AFFINITY") +endif() + + +# Not MSVC +if(NOT "${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Wall -Wextra -O2 -fdata-sections -ffunction-sections") +endif() + + +# Darwin +if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-dead_strip") +endif() + + +# Not (Darwin or MSVC) +# equivalent to Linux and Windows with mingw +if(NOT (CMAKE_SYSTEM_NAME STREQUAL "Darwin" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC")) + SET(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,--gc-sections") +endif() + + +# Linux +if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + # enable position independent code on linux + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fPIC") +endif() + + +# Find packages, set the compiler and compile flags specific to the selected FIRESTARTER build.
+if(${FIRESTARTER_BUILD_TYPE} STREQUAL "FIRESTARTER") + # No specific compiler selected +elseif ("${FIRESTARTER_BUILD_TYPE}" STREQUAL "FIRESTARTER_CUDA") + find_package(CUDAToolkit REQUIRED) + include_directories(${CUDAToolkit_INCLUDE_DIRS}) + + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DFIRESTARTER_BUILD_CUDA") +elseif ("${FIRESTARTER_BUILD_TYPE}" STREQUAL "FIRESTARTER_ONEAPI") + find_program(ICX_PATH icx REQUIRED) + + message(STATUS "Path of icx executable is: ${ICX_PATH}") + + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -DFIRESTARTER_BUILD_ONEAPI") +elseif("${FIRESTARTER_BUILD_TYPE}" STREQUAL "FIRESTARTER_HIP") + if (NOT DEFINED ROCM_PATH ) + set ( ROCM_PATH "/opt/rocm" CACHE STRING "Default ROCM installation directory." ) + endif () + + # Search for rocm in common locations + list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}/hip ${ROCM_PATH}/lib ${ROCM_PATH}) + find_package(HIP REQUIRED) + find_package(rocblas REQUIRED) + find_package(rocrand REQUIRED) + find_package(hiprand REQUIRED) + find_package(hipblas REQUIRED) + + SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DFIRESTARTER_BUILD_HIP") +endif() \ No newline at end of file diff --git a/cmake/DarwinBuild.cmake b/cmake/DarwinBuild.cmake new file mode 100644 index 00000000..9519a6de --- /dev/null +++ b/cmake/DarwinBuild.cmake @@ -0,0 +1,18 @@ +if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + find_library( COREFOUNDATION_LIBRARY CoreFoundation ) + find_library( IOKIT_LIBRARY IOKit ) +endif() + +# Function to link against the correct libraries on darwin +function(target_link_libraries_darwin) + set(oneValueArgs NAME) + cmake_parse_arguments(TARGET "" "${oneValueArgs}" + "" ${ARGN} ) + + if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") + target_link_libraries(${TARGET_NAME} + ${COREFOUNDATION_LIBRARY} + ${IOKIT_LIBRARY} + ) + endif() +endfunction() diff --git a/cmake/InstallHwloc.cmake b/cmake/InstallHwloc.cmake index ef50c736..a292d8fb 100644 --- a/cmake/InstallHwloc.cmake +++ b/cmake/InstallHwloc.cmake @@ -17,6 +17,7 @@ if 
(FIRESTARTER_BUILD_HWLOC) BUILD_IN_SOURCE 1 BUILD_COMMAND make -j INSTALL_COMMAND make install + BUILD_BYPRODUCTS /lib/libhwloc.a ) SET(HWLOC_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/lib/Hwloc/install") @@ -36,6 +37,7 @@ if (FIRESTARTER_BUILD_HWLOC) CONFIGURE_COMMAND "" BUILD_COMMAND cd \\contrib\\windows && MSBuild /p:Configuration=Release /p:Platform=x64 hwloc.sln INSTALL_COMMAND "" + BUILD_BYPRODUCTS /contrib/windows/x64/Release/libhwloc.lib ) SET(HWLOC_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/lib/Hwloc/sources") @@ -55,6 +57,7 @@ if (FIRESTARTER_BUILD_HWLOC) CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" + BUILD_BYPRODUCTS /lib/libhwloc.a ) SET(HWLOC_INCLUDE_DIR "${PROJECT_SOURCE_DIR}/lib/Hwloc/sources") @@ -66,4 +69,5 @@ if (FIRESTARTER_BUILD_HWLOC) endif() include_directories(${HWLOC_INCLUDE_DIR}/include) + add_dependencies(hwloc HwlocInstall) endif() diff --git a/include/firestarter/Cuda/CudaHipCompat.hpp b/include/firestarter/Cuda/CudaHipCompat.hpp index f0543f4d..12723d0f 100644 --- a/include/firestarter/Cuda/CudaHipCompat.hpp +++ b/include/firestarter/Cuda/CudaHipCompat.hpp @@ -346,13 +346,6 @@ auto getErrorString(CUresult Result) -> const char* { accellSafeCall(cuGetErrorName(Result, &ErrorString), __FILE__, __LINE__); return ErrorString; } -#else -// define types to not run into compile errors with if constexpr - -enum class CUresult {}; -// NOLINTBEGIN(readability-identifier-naming) -constexpr const int CUDA_SUCCESS = 0; -// NOLINTEND(readability-identifier-naming) #endif template void accellSafeCall(T TVal, const char* File, const int Line, std::optional DeviceIndex) { @@ -368,14 +361,15 @@ template void accellSafeCall(T TVal, const char* File, const int Li if (TVal == RandStatusT::RAND_STATUS_SUCCESS) { return; } - } else if constexpr (std::is_same_v) { -#ifndef FIRESTARTER_BUILD_CUDA - static_assert(false, "Tried to call accellSafeCall with CUresult, but not building for CUDA."); -#endif + } +#ifdef FIRESTARTER_BUILD_CUDA + else if constexpr 
(std::is_same_v) { if (TVal == CUDA_SUCCESS) { return; } - } else { + } +#endif + else { assert(false && "Tried to call accellSafeCall with an unknown type."); } diff --git a/include/firestarter/Environment/Environment.hpp b/include/firestarter/Environment/Environment.hpp index 41446bde..eff470d2 100644 --- a/include/firestarter/Environment/Environment.hpp +++ b/include/firestarter/Environment/Environment.hpp @@ -85,7 +85,8 @@ class Environment { virtual void printSelectedCodePathSummary() = 0; /// Print a list of available high-load function and if they are available on the current system. - virtual void printFunctionSummary() = 0; + /// \arg ForceYes Force all functions to be shown as available + virtual void printFunctionSummary(bool ForceYes) = 0; /// Get the number of threads FIRESTARTER will run with. [[nodiscard]] auto requestedNumThreads() const -> uint64_t { return RequestedNumThreads; } diff --git a/include/firestarter/Environment/Payload/Payload.hpp b/include/firestarter/Environment/Payload/Payload.hpp index b5b17199..df7d08a7 100644 --- a/include/firestarter/Environment/Payload/Payload.hpp +++ b/include/firestarter/Environment/Payload/Payload.hpp @@ -94,9 +94,10 @@ class Payload { /// compiled payload. /// \arg ErrorDetection Should the code to support error detection between thread be baked into the high load routine /// of the compiled payload. + /// \arg PrintAssembler Should the generated assembler code be logged. /// \returns The compiled payload that provides access to the init and load functions. - [[nodiscard]] virtual auto compilePayload(const PayloadSettings& Settings, bool DumpRegisters, - bool ErrorDetection) const -> CompiledPayload::UniquePtr = 0; + [[nodiscard]] virtual auto compilePayload(const PayloadSettings& Settings, bool DumpRegisters, bool ErrorDetection, + bool PrintAssembler) const -> CompiledPayload::UniquePtr = 0; /// Get the available instruction items that are supported by this payload.
/// \returns The available instruction items that are supported by this payload. diff --git a/include/firestarter/Environment/X86/Payload/AVX512Payload.hpp b/include/firestarter/Environment/X86/Payload/AVX512Payload.hpp index 20bfc491..d783b984 100644 --- a/include/firestarter/Environment/X86/Payload/AVX512Payload.hpp +++ b/include/firestarter/Environment/X86/Payload/AVX512Payload.hpp @@ -56,9 +56,10 @@ class AVX512Payload final : public X86Payload { /// compiled payload. /// \arg ErrorDetection Should the code to support error detection between thread be baked into the high load routine /// of the compiled payload. + /// \arg PrintAssembler Should the generated assembler code be logged. /// \returns The compiled payload that provides access to the init and load functions. [[nodiscard]] auto compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters, - bool ErrorDetection) const + bool ErrorDetection, bool PrintAssembler) const -> environment::payload::CompiledPayload::UniquePtr override; private: diff --git a/include/firestarter/Environment/X86/Payload/AVXPayload.hpp b/include/firestarter/Environment/X86/Payload/AVXPayload.hpp index 24ef7a15..db9c1a42 100644 --- a/include/firestarter/Environment/X86/Payload/AVXPayload.hpp +++ b/include/firestarter/Environment/X86/Payload/AVXPayload.hpp @@ -55,9 +55,10 @@ class AVXPayload final : public X86Payload { /// compiled payload. /// \arg ErrorDetection Should the code to support error detection between thread be baked into the high load routine /// of the compiled payload. + /// \arg PrintAssembler Should the generated assembler code be logged. /// \returns The compiled payload that provides access to the init and load functions. 
[[nodiscard]] auto compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters, - bool ErrorDetection) const + bool ErrorDetection, bool PrintAssembler) const -> environment::payload::CompiledPayload::UniquePtr override; private: diff --git a/include/firestarter/Environment/X86/Payload/FMA4Payload.hpp b/include/firestarter/Environment/X86/Payload/FMA4Payload.hpp index f0e711f6..13e66550 100644 --- a/include/firestarter/Environment/X86/Payload/FMA4Payload.hpp +++ b/include/firestarter/Environment/X86/Payload/FMA4Payload.hpp @@ -55,9 +55,10 @@ class FMA4Payload final : public X86Payload { /// compiled payload. /// \arg ErrorDetection Should the code to support error detection between thread be baked into the high load routine /// of the compiled payload. + /// \arg PrintAssembler Should the generated assembler code be logged. /// \returns The compiled payload that provides access to the init and load functions. [[nodiscard]] auto compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters, - bool ErrorDetection) const + bool ErrorDetection, bool PrintAssembler) const -> environment::payload::CompiledPayload::UniquePtr override; private: diff --git a/include/firestarter/Environment/X86/Payload/FMAPayload.hpp b/include/firestarter/Environment/X86/Payload/FMAPayload.hpp index 8280a5b2..1bfe361b 100644 --- a/include/firestarter/Environment/X86/Payload/FMAPayload.hpp +++ b/include/firestarter/Environment/X86/Payload/FMAPayload.hpp @@ -45,9 +45,10 @@ class FMAPayload final : public X86Payload { /// compiled payload. /// \arg ErrorDetection Should the code to support error detection between thread be baked into the high load routine /// of the compiled payload. + /// \arg PrintAssembler Should the generated assembler code be logged. /// \returns The compiled payload that provides access to the init and load functions. 
[[nodiscard]] auto compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters, - bool ErrorDetection) const + bool ErrorDetection, bool PrintAssembler) const -> environment::payload::CompiledPayload::UniquePtr override; private: diff --git a/include/firestarter/Environment/X86/Payload/SSE2Payload.hpp b/include/firestarter/Environment/X86/Payload/SSE2Payload.hpp index 557af0d4..f6d667be 100644 --- a/include/firestarter/Environment/X86/Payload/SSE2Payload.hpp +++ b/include/firestarter/Environment/X86/Payload/SSE2Payload.hpp @@ -55,9 +55,10 @@ class SSE2Payload final : public X86Payload { /// compiled payload. /// \arg ErrorDetection Should the code to support error detection between thread be baked into the high load routine /// of the compiled payload. + /// \arg PrintAssembler Should the generated assembler code be logged. /// \returns The compiled payload that provides access to the init and load functions. [[nodiscard]] auto compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters, - bool ErrorDetection) const + bool ErrorDetection, bool PrintAssembler) const -> environment::payload::CompiledPayload::UniquePtr override; private: diff --git a/include/firestarter/Environment/X86/Payload/X86Payload.hpp b/include/firestarter/Environment/X86/Payload/X86Payload.hpp index 44d5bd4f..2608c8f4 100644 --- a/include/firestarter/Environment/X86/Payload/X86Payload.hpp +++ b/include/firestarter/Environment/X86/Payload/X86Payload.hpp @@ -90,6 +90,16 @@ class X86Payload : public environment::payload::Payload { }; protected: + /// Print the generated assembler Code of asmjit + /// \arg Builder The builder that contains the assembler code. 
+ static void printAssembler(asmjit::BaseBuilder& Builder) { + asmjit::String Sb; + asmjit::FormatOptions FormatOptions{}; + + asmjit::Formatter::formatNodeList(Sb, FormatOptions, &Builder); + log::info() << Sb.data(); + } + /// Emit the code to dump the xmm, ymm or zmm registers into memory for the dump registers feature. /// \tparam Vec the type of the vector register used. /// \arg Cb The asmjit code builder that is used to emit the assembler code. @@ -548,6 +558,7 @@ class X86Payload : public environment::payload::Payload { /// LoadVar changed. void lowLoadFunction(volatile LoadThreadWorkType& LoadVar, std::chrono::microseconds Period) const final; +public: /// Get the available instruction items that are supported by this payload. /// \returns The available instruction items that are supported by this payload. [[nodiscard]] auto getAvailableInstructions() const -> std::list final; diff --git a/include/firestarter/Environment/X86/Payload/ZENFMAPayload.hpp b/include/firestarter/Environment/X86/Payload/ZENFMAPayload.hpp index 5d624725..777e6587 100644 --- a/include/firestarter/Environment/X86/Payload/ZENFMAPayload.hpp +++ b/include/firestarter/Environment/X86/Payload/ZENFMAPayload.hpp @@ -40,9 +40,10 @@ class ZENFMAPayload final : public X86Payload { /// compiled payload. /// \arg ErrorDetection Should the code to support error detection between thread be baked into the high load routine /// of the compiled payload. + /// \arg PrintAssembler Should the generated assembler code be logged. /// \returns The compiled payload that provides access to the init and load functions. 
[[nodiscard]] auto compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters, - bool ErrorDetection) const + bool ErrorDetection, bool PrintAssembler) const -> environment::payload::CompiledPayload::UniquePtr override; private: diff --git a/include/firestarter/Environment/X86/X86Environment.hpp b/include/firestarter/Environment/X86/X86Environment.hpp index f4760f7e..40d1b6e0 100644 --- a/include/firestarter/Environment/X86/X86Environment.hpp +++ b/include/firestarter/Environment/X86/X86Environment.hpp @@ -90,7 +90,8 @@ class X86Environment final : public Environment { /// Print a list of available high-load function and if they are available on the current system. This includes all /// PlatformConfigs in combination with all thread per core counts. - void printFunctionSummary() override; + /// \arg ForceYes Force all functions to be shown as avaialable + void printFunctionSummary(bool ForceYes) override; private: /// The list of availabe platform configs that is printed when supplying the --avail command line argument. 
The IDs diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c0355fa0..96de8119 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -1,8 +1,7 @@ -SET(FIRESTARTER_FILES +# Create the core firestarter library that is used in all builds and tests +add_library(firestartercore STATIC firestarter/Config.cpp - firestarter/Main.cpp firestarter/Firestarter.cpp - firestarter/LoadWorker.cpp firestarter/SafeExit.cpp firestarter/WatchdogWorker.cpp firestarter/DumpRegisterWorker.cpp @@ -27,10 +26,16 @@ SET(FIRESTARTER_FILES firestarter/Environment/X86/Payload/SSE2Payload.cpp ) -if(CMAKE_SYSTEM_NAME STREQUAL "Linux") - SET(FIRESTARTER_FILES - ${FIRESTARTER_FILES} +target_link_libraries(firestartercore + hwloc + AsmJit::AsmJit + Nitro::log + nlohmann_json::nlohmann_json + ) +# Create the linux firestarter library that is used for specific linux only features +if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + add_library(firestarterlinux STATIC # measurement stuff firestarter/Measurement/MeasurementWorker.cpp firestarter/Measurement/Summary.cpp @@ -44,39 +49,39 @@ if(CMAKE_SYSTEM_NAME STREQUAL "Linux") firestarter/Optimizer/Util/MultiObjective.cpp firestarter/Optimizer/Algorithm/NSGA2.cpp ) -endif() - -SET(FIRESTARTER_LINK_LIBRARIES - ) -if (FIRESTARTER_THREAD_AFFINITY) - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DFIRESTARTER_THREAD_AFFINITY") + target_link_libraries(firestarterlinux + Nitro::log + nlohmann_json::nlohmann_json + ) endif() -if ("${FIRESTARTER_BUILD_TYPE}" STREQUAL "FIRESTARTER_CUDA") - find_package(CUDAToolkit REQUIRED) - include_directories(${CUDAToolkit_INCLUDE_DIRS}) - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DFIRESTARTER_BUILD_CUDA") +SET(FIRESTARTER_FILES + firestarter/Main.cpp + + # IpcEstimateMetricData::insertValue is accesses which is part of the firestarterlinux library. + # This reference should be removed there and the file moved back to the firestartercore library. 
+ firestarter/LoadWorker.cpp + ) +if ("${FIRESTARTER_BUILD_TYPE}" STREQUAL "FIRESTARTER_CUDA") add_executable(FIRESTARTER_CUDA ${FIRESTARTER_FILES} firestarter/Cuda/Cuda.cpp ) - target_compile_features(FIRESTARTER_CUDA PRIVATE cxx_std_17) - if(FIRESTARTER_BUILD_HWLOC) - add_dependencies(FIRESTARTER_CUDA - HwlocInstall - ) + target_link_libraries(FIRESTARTER_CUDA + firestartercore + ) + + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + target_link_libraries(FIRESTARTER_CUDA + firestarterlinux + ) endif() target_link_libraries(FIRESTARTER_CUDA - hwloc - AsmJit::AsmJit - Nitro::log - nlohmann_json::nlohmann_json - Threads::Threads CUDA::cuda_driver CUDA::cudart CUDA::curand @@ -95,38 +100,21 @@ if ("${FIRESTARTER_BUILD_TYPE}" STREQUAL "FIRESTARTER_CUDA") endif() elseif ("${FIRESTARTER_BUILD_TYPE}" STREQUAL "FIRESTARTER_ONEAPI") - - find_program(ICX_PRESENT icx) - - if(ICX_PRESENT) - message(STATUS "Executable found: ${ICX_PRESENT}") - else() - message(FATAL_ERROR "OneAPI Intel Compiler icx not found") - endif() - SET(CMAKE_CXX_COMPILER "icx") - SET(CMAKE_C_COMPILER "icx") - - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsycl -DFIRESTARTER_BUILD_ONEAPI") - add_executable(FIRESTARTER_ONEAPI ${FIRESTARTER_FILES} firestarter/OneAPI/OneAPI.cpp ) - target_compile_features(FIRESTARTER_ONEAPI PRIVATE cxx_std_17) - - if(FIRESTARTER_BUILD_HWLOC) - add_dependencies(FIRESTARTER_ONEAPI - HwlocInstall - ) - endif() target_link_libraries(FIRESTARTER_ONEAPI - hwloc - AsmJit::AsmJit - Nitro::log - nlohmann_json::nlohmann_json - Threads::Threads + firestartercore + ) + + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + target_link_libraries(FIRESTARTER_ONEAPI + firestarterlinux ) + endif() + target_link_libraries(FIRESTARTER_ONEAPI mkl_sycl mkl_intel_ilp64 @@ -134,44 +122,26 @@ elseif ("${FIRESTARTER_BUILD_TYPE}" STREQUAL "FIRESTARTER_ONEAPI") mkl_core sycl stdc++ + Threads::Threads ) elseif("${FIRESTARTER_BUILD_TYPE}" STREQUAL "FIRESTARTER_HIP") - if (NOT DEFINED ROCM_PATH ) - set ( ROCM_PATH 
"/opt/rocm" CACHE STRING "Default ROCM installation directory." ) - endif () - # Search for rocm in common locations - list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH}/hip ${ROCM_PATH}/lib ${ROCM_PATH}) - find_package(HIP REQUIRED) - find_package(rocblas REQUIRED) - find_package(rocrand REQUIRED) - find_package(hiprand REQUIRED) - find_package(hipblas REQUIRED) - - SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DFIRESTARTER_BUILD_HIP") - - set(CMAKE_CXX_COMPILER ${HIP_HIPCC_EXECUTABLE}) - set(CMAKE_CXX_LINKER ${HIP_HIPCC_EXECUTABLE}) - set( CMAKE_CXX_STANDARD 17 ) - add_executable(FIRESTARTER_HIP ${FIRESTARTER_FILES} firestarter/Cuda/Cuda.cpp ) - target_compile_features(FIRESTARTER_HIP PRIVATE cxx_std_17) - if(FIRESTARTER_BUILD_HWLOC) - add_dependencies(FIRESTARTER_HIP - HwlocInstall - ) + target_link_libraries(FIRESTARTER_HIP + firestartercore + ) + + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + target_link_libraries(FIRESTARTER_HIP + firestarterlinux + ) endif() target_link_libraries(FIRESTARTER_HIP - hwloc - AsmJit::AsmJit - Nitro::log - nlohmann_json::nlohmann_json - Threads::Threads hip::host hip::hiprand roc::hipblas @@ -188,51 +158,44 @@ elseif(${FIRESTARTER_BUILD_TYPE} STREQUAL "FIRESTARTER") add_executable(FIRESTARTER ${FIRESTARTER_FILES} ) - target_compile_features(FIRESTARTER PRIVATE cxx_std_17) + + target_link_libraries(FIRESTARTER + firestartercore + ) + + if(CMAKE_SYSTEM_NAME STREQUAL "Linux") + target_link_libraries(FIRESTARTER + firestarterlinux + ) + endif() + + target_link_libraries_darwin(NAME FIRESTARTER) # static linking is not supported on Darwin, see Apple Technical QA1118 - if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") - find_library( COREFOUNDATION_LIBRARY CoreFoundation ) - find_library( IOKIT_LIBRARY IOKit ) + if((NOT CMAKE_SYSTEM_NAME STREQUAL "Darwin") AND FIRESTARTER_LINK_STATIC) target_link_libraries(FIRESTARTER - Threads::Threads - ${COREFOUNDATION_LIBRARY} - ${IOKIT_LIBRARY} + -static ) - else() - if (FIRESTARTER_LINK_STATIC) - SET(CMAKE_CXX_FLAGS 
"${CMAKE_CXX_FLAGS} -static -DFIRESTARTER_LINK_STATIC") - # we are using pthreads - # static linking with pthreads and std::condition_variabale is evil and will cause segfaults - # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58909#c1 - if(CMAKE_THREAD_LIBS_INIT MATCHES "^.*pthread.*$") - target_link_libraries(FIRESTARTER - "-Wl,--whole-archive -lpthread -Wl,--no-whole-archive" - ) - endif() - else() - if(CMAKE_SYSTEM_NAME STREQUAL "Linux") - target_link_libraries(FIRESTARTER - -ldl - ) - endif() + # we are using pthreads + # static linking with pthreads and std::condition_variabale is evil and will cause segfaults + # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58909#c1 + if(CMAKE_THREAD_LIBS_INIT MATCHES "^.*pthread.*$") target_link_libraries(FIRESTARTER - Threads::Threads + "-Wl,--whole-archive -lpthread -Wl,--no-whole-archive" ) endif() endif() - if(FIRESTARTER_BUILD_HWLOC) - add_dependencies(FIRESTARTER - HwlocInstall + if(CMAKE_SYSTEM_NAME STREQUAL "Linux" AND NOT FIRESTARTER_LINK_STATIC) + target_link_libraries(FIRESTARTER + -ldl ) endif() - target_link_libraries(FIRESTARTER - hwloc - AsmJit::AsmJit - Nitro::log - nlohmann_json::nlohmann_json - ) -endif() + if (NOT FIRESTARTER_LINK_STATIC) + target_link_libraries(FIRESTARTER + Threads::Threads + ) + endif() +endif() \ No newline at end of file diff --git a/src/firestarter/Environment/X86/Payload/AVX512Payload.cpp b/src/firestarter/Environment/X86/Payload/AVX512Payload.cpp index f52a5410..88d02440 100644 --- a/src/firestarter/Environment/X86/Payload/AVX512Payload.cpp +++ b/src/firestarter/Environment/X86/Payload/AVX512Payload.cpp @@ -25,7 +25,8 @@ namespace firestarter::environment::x86::payload { auto AVX512Payload::compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters, - bool ErrorDetection) const -> environment::payload::CompiledPayload::UniquePtr { + bool ErrorDetection, bool PrintAssembler) const + -> environment::payload::CompiledPayload::UniquePtr { using Imm = 
asmjit::Imm; using Zmm = asmjit::x86::Zmm; // NOLINTBEGIN(readability-identifier-naming) @@ -364,6 +365,10 @@ auto AVX512Payload::compilePayload(const environment::payload::PayloadSettings& Cb.finalize(); + if (PrintAssembler) { + printAssembler(Cb); + } + auto CompiledPayloadPtr = CompiledX86Payload::create(Stats, Code); // skip if we could not determine cache size diff --git a/src/firestarter/Environment/X86/Payload/AVXPayload.cpp b/src/firestarter/Environment/X86/Payload/AVXPayload.cpp index b20a85f7..82ded951 100644 --- a/src/firestarter/Environment/X86/Payload/AVXPayload.cpp +++ b/src/firestarter/Environment/X86/Payload/AVXPayload.cpp @@ -25,7 +25,8 @@ namespace firestarter::environment::x86::payload { auto AVXPayload::compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters, - bool ErrorDetection) const -> environment::payload::CompiledPayload::UniquePtr { + bool ErrorDetection, bool PrintAssembler) const + -> environment::payload::CompiledPayload::UniquePtr { using Imm = asmjit::Imm; using Mm = asmjit::x86::Mm; using Xmm = asmjit::x86::Xmm; @@ -394,6 +395,10 @@ auto AVXPayload::compilePayload(const environment::payload::PayloadSettings& Set Cb.finalize(); + if (PrintAssembler) { + printAssembler(Cb); + } + auto CompiledPayloadPtr = CompiledX86Payload::create(Stats, Code); // skip if we could not determine cache size diff --git a/src/firestarter/Environment/X86/Payload/FMA4Payload.cpp b/src/firestarter/Environment/X86/Payload/FMA4Payload.cpp index 202d34c7..91f3479a 100644 --- a/src/firestarter/Environment/X86/Payload/FMA4Payload.cpp +++ b/src/firestarter/Environment/X86/Payload/FMA4Payload.cpp @@ -25,7 +25,8 @@ namespace firestarter::environment::x86::payload { auto FMA4Payload::compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters, - bool ErrorDetection) const -> environment::payload::CompiledPayload::UniquePtr { + bool ErrorDetection, bool PrintAssembler) const + -> 
environment::payload::CompiledPayload::UniquePtr { using Imm = asmjit::Imm; using Xmm = asmjit::x86::Xmm; // NOLINTBEGIN(readability-identifier-naming) @@ -367,6 +368,10 @@ auto FMA4Payload::compilePayload(const environment::payload::PayloadSettings& Se Cb.finalize(); + if (PrintAssembler) { + printAssembler(Cb); + } + auto CompiledPayloadPtr = CompiledX86Payload::create(Stats, Code); // skip if we could not determine cache size diff --git a/src/firestarter/Environment/X86/Payload/FMAPayload.cpp b/src/firestarter/Environment/X86/Payload/FMAPayload.cpp index cec0021a..03b8995c 100644 --- a/src/firestarter/Environment/X86/Payload/FMAPayload.cpp +++ b/src/firestarter/Environment/X86/Payload/FMAPayload.cpp @@ -25,7 +25,8 @@ namespace firestarter::environment::x86::payload { auto FMAPayload::compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters, - bool ErrorDetection) const -> environment::payload::CompiledPayload::UniquePtr { + bool ErrorDetection, bool PrintAssembler) const + -> environment::payload::CompiledPayload::UniquePtr { using Imm = asmjit::Imm; using Xmm = asmjit::x86::Xmm; using Ymm = asmjit::x86::Ymm; @@ -402,6 +403,10 @@ auto FMAPayload::compilePayload(const environment::payload::PayloadSettings& Set Cb.finalize(); + if (PrintAssembler) { + printAssembler(Cb); + } + auto CompiledPayloadPtr = CompiledX86Payload::create(Stats, Code); // skip if we could not determine cache size diff --git a/src/firestarter/Environment/X86/Payload/SSE2Payload.cpp b/src/firestarter/Environment/X86/Payload/SSE2Payload.cpp index fc77c8e1..8f443781 100644 --- a/src/firestarter/Environment/X86/Payload/SSE2Payload.cpp +++ b/src/firestarter/Environment/X86/Payload/SSE2Payload.cpp @@ -25,7 +25,8 @@ namespace firestarter::environment::x86::payload { auto SSE2Payload::compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters, - bool ErrorDetection) const -> environment::payload::CompiledPayload::UniquePtr { + bool 
ErrorDetection, bool PrintAssembler) const + -> environment::payload::CompiledPayload::UniquePtr { using Imm = asmjit::Imm; using Mm = asmjit::x86::Mm; using Xmm = asmjit::x86::Xmm; @@ -385,6 +386,10 @@ auto SSE2Payload::compilePayload(const environment::payload::PayloadSettings& Se Cb.finalize(); + if (PrintAssembler) { + printAssembler(Cb); + } + auto CompiledPayloadPtr = CompiledX86Payload::create(Stats, Code); // skip if we could not determine cache size diff --git a/src/firestarter/Environment/X86/Payload/ZENFMAPayload.cpp b/src/firestarter/Environment/X86/Payload/ZENFMAPayload.cpp index 4857f82d..f12dca1d 100644 --- a/src/firestarter/Environment/X86/Payload/ZENFMAPayload.cpp +++ b/src/firestarter/Environment/X86/Payload/ZENFMAPayload.cpp @@ -25,7 +25,8 @@ namespace firestarter::environment::x86::payload { auto ZENFMAPayload::compilePayload(const environment::payload::PayloadSettings& Settings, bool DumpRegisters, - bool ErrorDetection) const -> environment::payload::CompiledPayload::UniquePtr { + bool ErrorDetection, bool PrintAssembler) const + -> environment::payload::CompiledPayload::UniquePtr { using Imm = asmjit::Imm; using Xmm = asmjit::x86::Xmm; using Ymm = asmjit::x86::Ymm; @@ -352,6 +353,10 @@ auto ZENFMAPayload::compilePayload(const environment::payload::PayloadSettings& Cb.finalize(); + if (PrintAssembler) { + printAssembler(Cb); + } + auto CompiledPayloadPtr = CompiledX86Payload::create(Stats, Code); // skip if we could not determine cache size diff --git a/src/firestarter/Environment/X86/X86Environment.cpp b/src/firestarter/Environment/X86/X86Environment.cpp index 3ecd89c1..4511f514 100644 --- a/src/firestarter/Environment/X86/X86Environment.cpp +++ b/src/firestarter/Environment/X86/X86Environment.cpp @@ -169,7 +169,7 @@ void X86Environment::setLineCount(unsigned LineCount) { config().settings().setL void X86Environment::printSelectedCodePathSummary() { config().printCodePathSummary(); } -void X86Environment::printFunctionSummary() { +void 
X86Environment::printFunctionSummary(bool ForceYes) { log::info() << " available load-functions:\n" << " ID | NAME | available on this " "system | payload default setting\n" @@ -182,7 +182,7 @@ void X86Environment::printFunctionSummary() { for (auto const& Config : PlatformConfigs) { for (auto const& ThreadsPerCore : Config->settings().threads()) { - const char* Available = Config->isAvailable(topology()) ? "yes" : "no"; + const char* Available = (Config->isAvailable(topology()) || ForceYes) ? "yes" : "no"; const auto& FunctionName = Config->functionName(ThreadsPerCore); const auto& InstructionGroupsString = Config->settings().getInstructionGroupsString(); diff --git a/src/firestarter/Firestarter.cpp b/src/firestarter/Firestarter.cpp index 379e2039..a62fc2f8 100644 --- a/src/firestarter/Firestarter.cpp +++ b/src/firestarter/Firestarter.cpp @@ -59,7 +59,7 @@ Firestarter::Firestarter(Config&& ProvidedConfig) } if (Cfg.PrintFunctionSummary) { - Environment->printFunctionSummary(); + Environment->printFunctionSummary(/*ForceYes=*/false); safeExit(EXIT_SUCCESS); } diff --git a/src/firestarter/LoadWorker.cpp b/src/firestarter/LoadWorker.cpp index 4d473832..238dace8 100644 --- a/src/firestarter/LoadWorker.cpp +++ b/src/firestarter/LoadWorker.cpp @@ -267,8 +267,8 @@ void Firestarter::loadThreadWorker(const std::shared_ptr& Td) { Td->environment().setCpuAffinity(Td->id()); // compile payload - Td->CompiledPayloadPtr = - Td->config().payload()->compilePayload(Td->config().settings(), Td->DumpRegisters, Td->ErrorDetection); + Td->CompiledPayloadPtr = Td->config().payload()->compilePayload(Td->config().settings(), Td->DumpRegisters, + Td->ErrorDetection, /*PrintAssembler=*/false); // allocate memory // if we should dump some registers, we use the first part of the memory @@ -354,8 +354,8 @@ void Firestarter::loadThreadWorker(const std::shared_ptr& Td) { break; case LoadThreadState::ThreadSwitch: // compile payload - Td->CompiledPayloadPtr = - 
Td->config().payload()->compilePayload(Td->config().settings(), Td->DumpRegisters, Td->ErrorDetection); + Td->CompiledPayloadPtr = Td->config().payload()->compilePayload(Td->config().settings(), Td->DumpRegisters, + Td->ErrorDetection, /*PrintAssembler=*/false); // call init function Td->CompiledPayloadPtr->init(Td->Memory->getMemoryAddress(), Td->BuffersizeMem); diff --git a/src/firestarter/SafeExit.cpp b/src/firestarter/SafeExit.cpp index 4aed7a50..61c6a6f0 100644 --- a/src/firestarter/SafeExit.cpp +++ b/src/firestarter/SafeExit.cpp @@ -21,6 +21,7 @@ #include "firestarter/SafeExit.hpp" +#include #include [[noreturn]] void firestarter::safeExit(const int Status) { diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt new file mode 100644 index 00000000..785f6f53 --- /dev/null +++ b/test/CMakeLists.txt @@ -0,0 +1,15 @@ +find_package(Python REQUIRED) + +# Function to add reference tests +function(add_ref_test) + set(oneValueArgs NAME) + cmake_parse_arguments(TEST "" "${oneValueArgs}" + "" ${ARGN} ) + + add_test(NAME ${TEST_NAME} + COMMAND ${Python_EXECUTABLE} ${PROJECT_SOURCE_DIR}/tooling/ref-test.py $ ${PROJECT_SOURCE_DIR}/test/refs/${TEST_NAME}.log + ) +endfunction() + +add_subdirectory(DumpPayloads) +add_subdirectory(X86Functions) \ No newline at end of file diff --git a/test/DumpPayloads/CMakeLists.txt b/test/DumpPayloads/CMakeLists.txt new file mode 100644 index 00000000..6ebd81f8 --- /dev/null +++ b/test/DumpPayloads/CMakeLists.txt @@ -0,0 +1,5 @@ +add_executable(DumpPayloads Main.cpp) +target_link_libraries(DumpPayloads firestartercore) +target_link_libraries_darwin(NAME DumpPayloads) + +add_ref_test(NAME DumpPayloads) \ No newline at end of file diff --git a/test/DumpPayloads/Main.cpp b/test/DumpPayloads/Main.cpp new file mode 100644 index 00000000..7f7986f3 --- /dev/null +++ b/test/DumpPayloads/Main.cpp @@ -0,0 +1,74 @@ +/****************************************************************************** + * FIRESTARTER - A Processor Stress Test Utility + * 
Copyright (C) 2024 TU Dresden, Center for Information Services and High + * Performance Computing + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Contact: daniel.hackenberg@tu-dresden.de + *****************************************************************************/ + +#include "firestarter/Environment/Payload/Payload.hpp" +#include "firestarter/Environment/X86/Payload/AVX512Payload.hpp" +#include "firestarter/Environment/X86/Payload/AVXPayload.hpp" +#include "firestarter/Environment/X86/Payload/FMA4Payload.hpp" +#include "firestarter/Environment/X86/Payload/FMAPayload.hpp" +#include "firestarter/Environment/X86/Payload/SSE2Payload.hpp" +#include "firestarter/Environment/X86/Payload/ZENFMAPayload.hpp" + +namespace { + +/// Take a list of instructions and return a list with a pair containing the each instruction in the first element of +/// the pair and a one in the second. +auto oneEach(const std::list& Instructions) + -> std::vector { + std::vector OneEach; + for (const auto& Instruction : Instructions) { + OneEach.emplace_back(Instruction, 1); + } + return OneEach; +} + +/// Dump the generated assembler code of the payload with some given settings. Each item is printed once. 
+void dumpPayload(firestarter::environment::payload::Payload& PayloadPtr) { + const auto& Instuctions = PayloadPtr.getAvailableInstructions(); + + firestarter::environment::payload::PayloadSettings Settings(/*Threads=*/{1}, + /*DataCacheBufferSize=*/{32768, 1048576, 1441792}, + /*RamBufferSize=*/1048576000, + /*Lines=*/3 * Instuctions.size(), + /*InstructionGroups=*/oneEach(Instuctions)); + + (void)PayloadPtr.compilePayload(Settings, /*DumpRegisters=*/false, /*ErrorDetection=*/false, + /*PrintAssembler=*/true); +} + +} // namespace + +auto main(int /*argc*/, const char** /*argv*/) -> int { + const std::vector> PayloadPtrs = { + std::make_unique(), + std::make_unique(), + std::make_unique(), + std::make_unique(), + std::make_unique(), + std::make_unique()}; + + for (const auto& PayloadPtr : PayloadPtrs) { + firestarter::log::info() << "Payload " << PayloadPtr->name(); + dumpPayload(*PayloadPtr); + } + + return EXIT_SUCCESS; +} \ No newline at end of file diff --git a/test/X86Functions/CMakeLists.txt b/test/X86Functions/CMakeLists.txt new file mode 100644 index 00000000..c2a2649b --- /dev/null +++ b/test/X86Functions/CMakeLists.txt @@ -0,0 +1,5 @@ +add_executable(X86Functions Main.cpp) +target_link_libraries(X86Functions firestartercore) +target_link_libraries_darwin(NAME X86Functions) + +add_ref_test(NAME X86Functions) \ No newline at end of file diff --git a/test/X86Functions/Main.cpp b/test/X86Functions/Main.cpp new file mode 100644 index 00000000..4b35389a --- /dev/null +++ b/test/X86Functions/Main.cpp @@ -0,0 +1,32 @@ +/****************************************************************************** + * FIRESTARTER - A Processor Stress Test Utility + * Copyright (C) 2024 TU Dresden, Center for Information Services and High + * Performance Computing + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or 
+ * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + * + * Contact: daniel.hackenberg@tu-dresden.de + *****************************************************************************/ + +#include "firestarter/Environment/X86/X86Environment.hpp" + +auto main(int /*argc*/, const char** /*argv*/) -> int { + firestarter::logging::Filter::set_severity(nitro::log::severity_level::info); + + firestarter::environment::x86::X86Environment Env; + + Env.printFunctionSummary(/*ForceYes=*/true); + + return EXIT_SUCCESS; +} \ No newline at end of file diff --git a/test/refs/DumpPayloads.log b/test/refs/DumpPayloads.log new file mode 100644 index 00000000..64d7faaf --- /dev/null +++ b/test/refs/DumpPayloads.log @@ -0,0 +1,1476 @@ +Payload AVX512 +.section .text +push rbx +push rbp +push r12 +push r13 +push r14 +push r15 +mov rax, rdi +mov r15, rsi +mov r13, rdx +movq mm0, r13 +mov r13, qword ptr [r15] +test r13, r13 +jz L0 +mov r14, 64 +mov edi, 2863311530 +mov esi, 2863311530 +mov edx, 2863311530 +vmovapd zmm0, zmmword ptr [rax] +vmovapd zmm1, zmmword ptr [rax+64] +vmovapd zmm2, zmmword ptr [rax+128] +vmovapd zmm3, zmmword ptr [rax+448] +vmovapd zmm4, zmmword ptr [rax+512] +vmovapd zmm5, zmmword ptr [rax+576] +vmovapd zmm6, zmmword ptr [rax+640] +vmovapd zmm7, zmmword ptr [rax+704] +vmovapd zmm8, zmmword ptr [rax+768] +vmovapd zmm9, zmmword ptr [rax+832] +vmovapd zmm10, zmmword ptr [rax+896] +vmovapd zmm11, zmmword ptr [rax+960] +vmovapd zmm12, zmmword ptr [rax+1024] +vmovapd zmm13, zmmword ptr [rax+1088] +vmovapd zmm14, zmmword ptr [rax+1152] +vmovapd zmm15, zmmword ptr [rax+1216] +vmovapd zmm16, zmmword ptr [rax+1280] +vmovapd 
zmm17, zmmword ptr [rax+1344] +vmovapd zmm18, zmmword ptr [rax+1408] +vmovapd zmm19, zmmword ptr [rax+1472] +vmovapd zmm20, zmmword ptr [rax+1536] +vmovapd zmm21, zmmword ptr [rax+1600] +vmovapd zmm22, zmmword ptr [rax+1664] +vmovapd zmm23, zmmword ptr [rax+1728] +vmovapd zmm24, zmmword ptr [rax+1792] +vmovapd zmm25, zmmword ptr [rax+1856] +vmovapd zmm26, zmmword ptr [rax+1920] +vmovapd zmm27, zmmword ptr [rax+1984] +vmovapd zmm28, zmmword ptr [rax+2048] +vmovapd zmm29, zmmword ptr [rax+2112] +mov rbx, rax +mov rcx, rax +add rcx, 32768 +mov r8, rax +add r8, 1048576 +mov r9, rax +add r9, 1441792 +mov r10, 1456 +mov r11, 1501 +mov r12, 1365333 +.align 64 (code) +L1: +vfmadd231pd zmm4, zmm0, zmm2 +vbroadcastsd zmm4, qword ptr [rbx+64] +add rbx, r14 +shl edi, 1 +vfmadd231pd zmm5, zmm0, zmm2 +vfmadd231pd zmm25, zmm2, zmm1 +xor rdi, r13 +shl esi, 1 +vmovapd zmmword ptr [r9+64], zmm6 +vfmadd231pd zmm6, zmm0, zmm2 +add r9, r14 +shl edx, 1 +vfmadd231pd zmm7, zmm0, zmmword ptr [rbx+64] +prefetcht2 [r9] +add r9, r14 +shr edi, 1 +vmovapd zmmword ptr [r9+64], zmm8 +vfmadd231pd zmm8, zmm0, zmmword ptr [r9+128] +add r9, r14 +shr esi, 1 +vfmadd231pd zmm9, zmm0, zmm2 +vfmadd231pd zmm30, zmm1, zmmword ptr [r9+64] +add r9, r14 +shr edx, 1 +vmovapd zmmword ptr [r8+64], zmm10 +vfmadd231pd zmm10, zmm0, zmm2 +add r8, r14 +shl edi, 1 +vfmadd231pd zmm11, zmm0, zmmword ptr [rbx+64] +prefetcht2 [r8] +add r8, r14 +shl esi, 1 +vmovapd zmmword ptr [r8+64], zmm12 +vfmadd231pd zmm12, zmm0, zmmword ptr [r8+128] +add r8, r14 +shl edx, 1 +vfmadd231pd zmm13, zmm0, zmm2 +vfmadd231pd zmm13, zmm1, zmmword ptr [r8+64] +add r8, r14 +shr edi, 1 +vmovapd zmmword ptr [rcx+64], zmm14 +vfmadd231pd zmm14, zmm0, zmm2 +add rcx, r14 +shr esi, 1 +vmovapd zmmword ptr [rcx+64], zmm15 +vfmadd231pd zmm15, zmm0, zmmword ptr [rcx+128] +add rcx, r14 +shr edx, 1 +vfmadd231pd zmm16, zmm0, zmm2 +vfmadd231pd zmm16, zmm1, zmmword ptr [rcx+64] +add rcx, r14 +shl edi, 1 +vmovapd zmmword ptr [rbx+64], zmm17 +vfmadd231pd zmm17, 
zmm0, zmm2 +add rbx, r14 +shl esi, 1 +vmovapd zmmword ptr [rbx+64], zmm18 +vfmadd231pd zmm18, zmm0, zmmword ptr [rbx+128] +add rbx, r14 +shl edx, 1 +vfmadd231pd zmm19, zmm0, zmm2 +vfmadd231pd zmm19, zmm1, zmmword ptr [rbx+64] +add rbx, r14 +shr edi, 1 +vfmadd231pd zmm20, zmm0, zmm2 +vbroadcastsd zmm20, qword ptr [rbx+64] +add rbx, r14 +shr esi, 1 +vfmadd231pd zmm21, zmm0, zmm2 +vfmadd231pd zmm26, zmm2, zmm1 +xor rsi, r13 +shr edx, 1 +vmovapd zmmword ptr [r9+64], zmm22 +vfmadd231pd zmm22, zmm0, zmm2 +add r9, r14 +shl edi, 1 +vfmadd231pd zmm23, zmm0, zmmword ptr [rbx+64] +prefetcht2 [r9] +add r9, r14 +shl esi, 1 +vmovapd zmmword ptr [r9+64], zmm24 +vfmadd231pd zmm24, zmm0, zmmword ptr [r9+128] +add r9, r14 +shl edx, 1 +vfmadd231pd zmm3, zmm0, zmm2 +vfmadd231pd zmm30, zmm1, zmmword ptr [r9+64] +add r9, r14 +shr edi, 1 +vmovapd zmmword ptr [r8+64], zmm4 +vfmadd231pd zmm4, zmm0, zmm2 +add r8, r14 +shr esi, 1 +vfmadd231pd zmm5, zmm0, zmmword ptr [rbx+64] +prefetcht2 [r8] +add r8, r14 +shr edx, 1 +vmovapd zmmword ptr [r8+64], zmm6 +vfmadd231pd zmm6, zmm0, zmmword ptr [r8+128] +add r8, r14 +shl edi, 1 +vfmadd231pd zmm7, zmm0, zmm2 +vfmadd231pd zmm7, zmm1, zmmword ptr [r8+64] +add r8, r14 +shl esi, 1 +vmovapd zmmword ptr [rcx+64], zmm8 +vfmadd231pd zmm8, zmm0, zmm2 +add rcx, r14 +shl edx, 1 +vmovapd zmmword ptr [rcx+64], zmm9 +vfmadd231pd zmm9, zmm0, zmmword ptr [rcx+128] +add rcx, r14 +shr edi, 1 +vfmadd231pd zmm10, zmm0, zmm2 +vfmadd231pd zmm10, zmm1, zmmword ptr [rcx+64] +add rcx, r14 +shr esi, 1 +vmovapd zmmword ptr [rbx+64], zmm11 +vfmadd231pd zmm11, zmm0, zmm2 +add rbx, r14 +shr edx, 1 +vmovapd zmmword ptr [rbx+64], zmm12 +vfmadd231pd zmm12, zmm0, zmmword ptr [rbx+128] +add rbx, r14 +shl edi, 1 +vfmadd231pd zmm13, zmm0, zmm2 +vfmadd231pd zmm13, zmm1, zmmword ptr [rbx+64] +add rbx, r14 +shl esi, 1 +vfmadd231pd zmm14, zmm0, zmm2 +vbroadcastsd zmm14, qword ptr [rbx+64] +add rbx, r14 +shl edx, 1 +vfmadd231pd zmm15, zmm0, zmm2 +vfmadd231pd zmm27, zmm2, zmm1 +xor rdx, r13 
+shr edi, 1 +vmovapd zmmword ptr [r9+64], zmm16 +vfmadd231pd zmm16, zmm0, zmm2 +add r9, r14 +shr esi, 1 +vfmadd231pd zmm17, zmm0, zmmword ptr [rbx+64] +prefetcht2 [r9] +add r9, r14 +shr edx, 1 +vmovapd zmmword ptr [r9+64], zmm18 +vfmadd231pd zmm18, zmm0, zmmword ptr [r9+128] +add r9, r14 +shl edi, 1 +vfmadd231pd zmm19, zmm0, zmm2 +vfmadd231pd zmm30, zmm1, zmmword ptr [r9+64] +add r9, r14 +shl esi, 1 +vmovapd zmmword ptr [r8+64], zmm20 +vfmadd231pd zmm20, zmm0, zmm2 +add r8, r14 +shl edx, 1 +vfmadd231pd zmm21, zmm0, zmmword ptr [rbx+64] +prefetcht2 [r8] +add r8, r14 +shr edi, 1 +vmovapd zmmword ptr [r8+64], zmm22 +vfmadd231pd zmm22, zmm0, zmmword ptr [r8+128] +add r8, r14 +shr esi, 1 +vfmadd231pd zmm23, zmm0, zmm2 +vfmadd231pd zmm23, zmm1, zmmword ptr [r8+64] +add r8, r14 +shr edx, 1 +vmovapd zmmword ptr [rcx+64], zmm24 +vfmadd231pd zmm24, zmm0, zmm2 +add rcx, r14 +shl edi, 1 +vmovapd zmmword ptr [rcx+64], zmm3 +vfmadd231pd zmm3, zmm0, zmmword ptr [rcx+128] +add rcx, r14 +shl esi, 1 +vfmadd231pd zmm4, zmm0, zmm2 +vfmadd231pd zmm4, zmm1, zmmword ptr [rcx+64] +add rcx, r14 +shl edx, 1 +vmovapd zmmword ptr [rbx+64], zmm5 +vfmadd231pd zmm5, zmm0, zmm2 +add rbx, r14 +shr edi, 1 +vmovapd zmmword ptr [rbx+64], zmm6 +vfmadd231pd zmm6, zmm0, zmmword ptr [rbx+128] +add rbx, r14 +shr esi, 1 +vfmadd231pd zmm7, zmm0, zmm2 +vfmadd231pd zmm7, zmm1, zmmword ptr [rbx+64] +add rbx, r14 +shr edx, 1 +movq r13, mm0 +sub r12, 1 +jnz L2 +mov r12, 1365333 +mov r9, rax +add r9, 1441792 +L2: +inc r13 +sub r10, 1 +jnz L3 +mov r10, 1456 +mov rcx, rax +add rcx, 32768 +L3: +movq mm0, r13 +sub r11, 1 +jnz L4 +mov r11, 1501 +mov r8, rax +add r8, 1048576 +L4: +mov rbx, rax +test qword ptr [r15], 1 +jnz L1 +L0: +movq rax, mm0 +pop r15 +pop r14 +pop r13 +pop r12 +pop rbp +pop rbx +ret + +Payload FMA +.section .text +push rbx +push rbp +push r12 +push r13 +push r14 +push r15 +mov rax, rdi +mov r15, rsi +mov r13, rdx +movq mm0, r13 +mov r13, qword ptr [r15] +test r13, r13 +jz L0 +mov r14, 64 +mov edi, 
2863311530 +mov esi, 2863311530 +mov edx, 2863311530 +vmovapd ymm0, ymmword ptr [rax] +vmovapd ymm1, ymmword ptr [rax+32] +vmovapd ymm2, ymmword ptr [rax+64] +vmovapd ymm3, ymmword ptr [rax+352] +vmovapd ymm4, ymmword ptr [rax+384] +vmovapd ymm5, ymmword ptr [rax+416] +vmovapd ymm6, ymmword ptr [rax+448] +vmovapd ymm7, ymmword ptr [rax+480] +vmovapd ymm8, ymmword ptr [rax+512] +vmovapd ymm9, ymmword ptr [rax+544] +vmovapd ymm10, ymmword ptr [rax+576] +vmovapd ymm11, ymmword ptr [rax+608] +vmovapd ymm12, ymmword ptr [rax+640] +vmovapd ymm13, ymmword ptr [rax+672] +vmovapd ymm14, ymmword ptr [rax+704] +mov rbx, rax +mov rcx, rax +add rcx, 32768 +mov r8, rax +add r8, 1048576 +mov r9, rax +add r9, 1441792 +mov r10, 873 +mov r11, 1201 +mov r12, 1365333 +.align 64 (code) +L1: +vfmadd231pd ymm4, ymm0, ymmword ptr [rbx+32] +vfmadd231pd ymm12, ymm1, ymmword ptr [rbx+64] +add rbx, r14 +shl edi, 1 +vfmadd231pd ymm5, ymm0, ymm2 +vfmadd231pd ymm12, ymm2, ymm1 +xor rdi, r13 +shl esi, 1 +vmovapd xmmword ptr [r9+64], xmm6 +vfmadd231pd ymm6, ymm0, ymm2 +add r9, r14 +shl edx, 1 +vfmadd231pd ymm7, ymm0, ymmword ptr [rbx+32] +prefetcht2 [r9] +add r9, r14 +shr edi, 1 +vmovapd xmmword ptr [r9+64], xmm8 +vfmadd231pd ymm8, ymm0, ymmword ptr [r9+32] +add r9, r14 +shr esi, 1 +vfmadd231pd ymm9, ymm0, ymm2 +vfmadd231pd ymm15, ymm1, ymmword ptr [r9+64] +add r9, r14 +shr edx, 1 +vmovapd xmmword ptr [r8+96], xmm10 +vfmadd231pd ymm10, ymm0, ymm2 +add r8, r14 +shl edi, 1 +vfmadd231pd ymm11, ymm0, ymmword ptr [rbx+32] +prefetcht2 [r8] +add r8, r14 +shl esi, 1 +vmovapd ymmword ptr [r8+96], ymm3 +vfmadd231pd ymm3, ymm0, ymmword ptr [r8+64] +add r8, r14 +shl edx, 1 +vmovapd xmmword ptr [r8+96], xmm4 +vfmadd231pd ymm4, ymm0, ymmword ptr [r8+64] +add r8, r14 +shr edi, 1 +vfmadd231pd ymm5, ymm0, ymm2 +vfmadd231pd ymm5, ymm1, ymmword ptr [r8+64] +add r8, r14 +shr esi, 1 +vmovapd xmmword ptr [rcx+64], xmm6 +vfmadd231pd ymm6, ymm0, ymm2 +add rcx, r14 +shr edx, 1 +vmovapd ymmword ptr [rcx+96], ymm7 
+vfmadd231pd ymm7, ymm0, [rcx+64] +add rcx, r14 +shl edi, 1 +vmovapd xmmword ptr [rcx+96], xmm8 +vfmadd231pd ymm8, ymm0, ymmword ptr [rcx+64] +add rcx, r14 +shl esi, 1 +vfmadd231pd ymm9, ymm0, ymm2 +vfmadd231pd ymm9, ymm1, ymmword ptr [rcx+64] +add rcx, r14 +shl edx, 1 +vfmadd231pd ymm10, ymm0, [rcx+64] +vfmadd231pd ymm13, ymm1, [rcx+96] +vmovapd ymmword ptr [rcx+32], ymm10 +add rcx, 128 +vmovapd xmmword ptr [rbx+32], xmm11 +vfmadd231pd ymm11, ymm0, ymm2 +add rbx, r14 +shr esi, 1 +vfmadd231pd ymm3, ymm0, ymmword ptr [rbx+64] +vmovapd ymmword ptr [rbx+32], ymm3 +add rbx, r14 +shr edx, 1 +vmovapd xmmword ptr [rbx+64], xmm4 +vfmadd231pd ymm4, ymm0, ymmword ptr [rbx+32] +add rbx, r14 +shl edi, 1 +vfmadd231pd ymm5, ymm0, ymm2 +vfmadd231pd ymm5, ymm1, ymmword ptr [rbx+32] +add rbx, r14 +shl esi, 1 +vfmadd231pd ymm6, ymm0, ymmword ptr [rbx+64] +vfmadd231pd ymm13, ymm1, ymmword ptr [rbx+96] +vmovapd ymmword ptr [rbx+32], ymm6 +add rbx, r14 +vfmadd231pd ymm7, ymm0, ymmword ptr [rbx+32] +vfmadd231pd ymm13, ymm1, ymmword ptr [rbx+64] +add rbx, r14 +shr edi, 1 +vfmadd231pd ymm8, ymm0, ymm2 +vfmadd231pd ymm13, ymm2, ymm1 +xor rdi, r13 +shr esi, 1 +vmovapd xmmword ptr [r9+64], xmm9 +vfmadd231pd ymm9, ymm0, ymm2 +add r9, r14 +shr edx, 1 +vfmadd231pd ymm10, ymm0, ymmword ptr [rbx+32] +prefetcht2 [r9] +add r9, r14 +shl edi, 1 +vmovapd xmmword ptr [r9+64], xmm11 +vfmadd231pd ymm11, ymm0, ymmword ptr [r9+32] +add r9, r14 +shl esi, 1 +vfmadd231pd ymm3, ymm0, ymm2 +vfmadd231pd ymm15, ymm1, ymmword ptr [r9+64] +add r9, r14 +shl edx, 1 +vmovapd xmmword ptr [r8+96], xmm4 +vfmadd231pd ymm4, ymm0, ymm2 +add r8, r14 +shr edi, 1 +vfmadd231pd ymm5, ymm0, ymmword ptr [rbx+32] +prefetcht2 [r8] +add r8, r14 +shr esi, 1 +vmovapd ymmword ptr [r8+96], ymm6 +vfmadd231pd ymm6, ymm0, ymmword ptr [r8+64] +add r8, r14 +shr edx, 1 +vmovapd xmmword ptr [r8+96], xmm7 +vfmadd231pd ymm7, ymm0, ymmword ptr [r8+64] +add r8, r14 +shl edi, 1 +vfmadd231pd ymm8, ymm0, ymm2 +vfmadd231pd ymm8, ymm1, ymmword ptr 
[r8+64] +add r8, r14 +shl esi, 1 +vmovapd xmmword ptr [rcx+64], xmm9 +vfmadd231pd ymm9, ymm0, ymm2 +add rcx, r14 +shl edx, 1 +vmovapd ymmword ptr [rcx+96], ymm10 +vfmadd231pd ymm10, ymm0, [rcx+64] +add rcx, r14 +shr edi, 1 +vmovapd xmmword ptr [rcx+96], xmm11 +vfmadd231pd ymm11, ymm0, ymmword ptr [rcx+64] +add rcx, r14 +shr esi, 1 +vfmadd231pd ymm3, ymm0, ymm2 +vfmadd231pd ymm3, ymm1, ymmword ptr [rcx+64] +add rcx, r14 +shr edx, 1 +vfmadd231pd ymm4, ymm0, [rcx+64] +vfmadd231pd ymm14, ymm1, [rcx+96] +vmovapd ymmword ptr [rcx+32], ymm4 +add rcx, 128 +vmovapd xmmword ptr [rbx+32], xmm5 +vfmadd231pd ymm5, ymm0, ymm2 +add rbx, r14 +shl esi, 1 +vfmadd231pd ymm6, ymm0, ymmword ptr [rbx+64] +vmovapd ymmword ptr [rbx+32], ymm6 +add rbx, r14 +shl edx, 1 +vmovapd xmmword ptr [rbx+64], xmm7 +vfmadd231pd ymm7, ymm0, ymmword ptr [rbx+32] +add rbx, r14 +shr edi, 1 +vfmadd231pd ymm8, ymm0, ymm2 +vfmadd231pd ymm8, ymm1, ymmword ptr [rbx+32] +add rbx, r14 +shr esi, 1 +vfmadd231pd ymm9, ymm0, ymmword ptr [rbx+64] +vfmadd231pd ymm14, ymm1, ymmword ptr [rbx+96] +vmovapd ymmword ptr [rbx+32], ymm9 +add rbx, r14 +vfmadd231pd ymm10, ymm0, ymmword ptr [rbx+32] +vfmadd231pd ymm14, ymm1, ymmword ptr [rbx+64] +add rbx, r14 +shl edi, 1 +vfmadd231pd ymm11, ymm0, ymm2 +vfmadd231pd ymm14, ymm2, ymm1 +xor rdi, r13 +shl esi, 1 +vmovapd xmmword ptr [r9+64], xmm3 +vfmadd231pd ymm3, ymm0, ymm2 +add r9, r14 +shl edx, 1 +vfmadd231pd ymm4, ymm0, ymmword ptr [rbx+32] +prefetcht2 [r9] +add r9, r14 +shr edi, 1 +vmovapd xmmword ptr [r9+64], xmm5 +vfmadd231pd ymm5, ymm0, ymmword ptr [r9+32] +add r9, r14 +shr esi, 1 +vfmadd231pd ymm6, ymm0, ymm2 +vfmadd231pd ymm15, ymm1, ymmword ptr [r9+64] +add r9, r14 +shr edx, 1 +vmovapd xmmword ptr [r8+96], xmm7 +vfmadd231pd ymm7, ymm0, ymm2 +add r8, r14 +shl edi, 1 +vfmadd231pd ymm8, ymm0, ymmword ptr [rbx+32] +prefetcht2 [r8] +add r8, r14 +shl esi, 1 +vmovapd ymmword ptr [r8+96], ymm9 +vfmadd231pd ymm9, ymm0, ymmword ptr [r8+64] +add r8, r14 +shl edx, 1 +vmovapd xmmword 
ptr [r8+96], xmm10 +vfmadd231pd ymm10, ymm0, ymmword ptr [r8+64] +add r8, r14 +shr edi, 1 +vfmadd231pd ymm11, ymm0, ymm2 +vfmadd231pd ymm11, ymm1, ymmword ptr [r8+64] +add r8, r14 +shr esi, 1 +vmovapd xmmword ptr [rcx+64], xmm3 +vfmadd231pd ymm3, ymm0, ymm2 +add rcx, r14 +shr edx, 1 +vmovapd ymmword ptr [rcx+96], ymm4 +vfmadd231pd ymm4, ymm0, [rcx+64] +add rcx, r14 +shl edi, 1 +vmovapd xmmword ptr [rcx+96], xmm5 +vfmadd231pd ymm5, ymm0, ymmword ptr [rcx+64] +add rcx, r14 +shl esi, 1 +vfmadd231pd ymm6, ymm0, ymm2 +vfmadd231pd ymm6, ymm1, ymmword ptr [rcx+64] +add rcx, r14 +shl edx, 1 +vfmadd231pd ymm7, ymm0, [rcx+64] +vfmadd231pd ymm12, ymm1, [rcx+96] +vmovapd ymmword ptr [rcx+32], ymm7 +add rcx, 128 +vmovapd xmmword ptr [rbx+32], xmm8 +vfmadd231pd ymm8, ymm0, ymm2 +add rbx, r14 +shr esi, 1 +vfmadd231pd ymm9, ymm0, ymmword ptr [rbx+64] +vmovapd ymmword ptr [rbx+32], ymm9 +add rbx, r14 +shr edx, 1 +vmovapd xmmword ptr [rbx+64], xmm10 +vfmadd231pd ymm10, ymm0, ymmword ptr [rbx+32] +add rbx, r14 +shl edi, 1 +vfmadd231pd ymm11, ymm0, ymm2 +vfmadd231pd ymm11, ymm1, ymmword ptr [rbx+32] +add rbx, r14 +shl esi, 1 +vfmadd231pd ymm3, ymm0, ymmword ptr [rbx+64] +vfmadd231pd ymm12, ymm1, ymmword ptr [rbx+96] +vmovapd ymmword ptr [rbx+32], ymm3 +add rbx, r14 +movq r13, mm0 +sub r12, 1 +jnz L2 +mov r12, 1365333 +mov r9, rax +add r9, 1441792 +L2: +inc r13 +sub r10, 1 +jnz L3 +mov r10, 873 +mov rcx, rax +add rcx, 32768 +L3: +movq mm0, r13 +sub r11, 1 +jnz L4 +mov r11, 1201 +mov r8, rax +add r8, 1048576 +L4: +mov rbx, rax +test qword ptr [r15], 1 +jnz L1 +L0: +movq rax, mm0 +pop r15 +pop r14 +pop r13 +pop r12 +pop rbp +pop rbx +ret + +Payload ZENFMA +.section .text +push rbx +push rbp +push r12 +push r13 +push r14 +push r15 +mov rax, rdi +mov r15, rsi +mov r13, rdx +movq mm0, r13 +mov r13, qword ptr [r15] +test r13, r13 +jz L0 +mov r14, 64 +mov rdi, -6148914691236517206 +mov rsi, -6148914691236517206 +mov rdx, -6148914691236517206 +vmovapd ymm0, ymmword ptr [rax] +vmovapd ymm1, 
ymmword ptr [rax+32] +vmovapd ymm2, ymmword ptr [rax+320] +vmovapd ymm3, ymmword ptr [rax+352] +vmovapd ymm4, ymmword ptr [rax+384] +vmovapd ymm5, ymmword ptr [rax+416] +vmovapd ymm6, ymmword ptr [rax+448] +vmovapd ymm7, ymmword ptr [rax+480] +vmovapd ymm8, ymmword ptr [rax+512] +vmovapd ymm9, ymmword ptr [rax+544] +vmovapd ymm10, ymmword ptr [rax+576] +vmovapd ymm11, ymmword ptr [rax+608] +vmovapd ymm12, ymmword ptr [rax+640] +vbroadcastss xmm13, xmm13 +vmovapd xmm14, xmm13 +vpsrlq xmm14, xmm14, 1 +mov rbx, rax +mov rcx, rax +add rcx, 32768 +mov r8, rax +add r8, 1048576 +mov r9, rax +add r9, 1441792 +mov r10, 4369 +mov r11, 6007 +mov r12, 5461333 +.align 64 (code) +L1: +vfmadd231pd ymm2, ymm0, ymmword ptr [rbx+32] +vmovapd xmmword ptr [rbx+64], xmm2 +add rbx, r14 +vpsrlq xmm13, xmm13, 1 +vfmadd231pd ymm3, ymm1, ymm0 +xor r13, rdi +shl rsi, 1 +vpsllq xmm14, xmm14, 1 +vfmadd231pd ymm15, ymm0, ymmword ptr [r9+32] +xor r13, rsi +add r9, r14 +vpsllq xmm13, xmm13, 1 +vfmadd231pd ymm5, ymm1, ymmword ptr [r8+64] +xor r13, rdx +add r8, r14 +vpsrlq xmm14, xmm14, 1 +vfmadd231pd ymm6, ymm0, ymmword ptr [rcx+64] +xor r13, rdi +add rcx, r14 +vpsrlq xmm13, xmm13, 1 +vfmadd231pd ymm7, ymm1, ymmword ptr [rbx+32] +vmovapd xmmword ptr [rbx+64], xmm7 +add rbx, r14 +vpsllq xmm14, xmm14, 1 +vfmadd231pd ymm8, ymm0, ymm1 +xor r13, rdx +shl rdi, 1 +vpsllq xmm13, xmm13, 1 +vfmadd231pd ymm15, ymm1, ymmword ptr [r9+32] +xor r13, rdi +add r9, r14 +vpsrlq xmm14, xmm14, 1 +vfmadd231pd ymm10, ymm0, ymmword ptr [r8+64] +xor r13, rsi +add r8, r14 +vpsrlq xmm13, xmm13, 1 +vfmadd231pd ymm11, ymm1, ymmword ptr [rcx+64] +xor r13, rdx +add rcx, r14 +vpsllq xmm14, xmm14, 1 +vfmadd231pd ymm12, ymm0, ymmword ptr [rbx+32] +vmovapd xmmword ptr [rbx+64], xmm12 +add rbx, r14 +vpsllq xmm13, xmm13, 1 +vfmadd231pd ymm2, ymm1, ymm0 +xor r13, rsi +shr rdx, 1 +vpsrlq xmm14, xmm14, 1 +vfmadd231pd ymm15, ymm0, ymmword ptr [r9+32] +xor r13, rdx +add r9, r14 +vfmadd231pd ymm4, ymm1, ymmword ptr [r8+64] +xor r13, rdi 
+add r8, r14 +vfmadd231pd ymm5, ymm0, ymmword ptr [rcx+64] +xor r13, rsi +add rcx, r14 +movq r13, mm0 +sub r12, 1 +jnz L2 +mov r12, 5461333 +mov r9, rax +add r9, 1441792 +L2: +inc r13 +sub r10, 1 +jnz L3 +mov r10, 4369 +mov rcx, rax +add rcx, 32768 +L3: +movq mm0, r13 +sub r11, 1 +jnz L4 +mov r11, 6007 +mov r8, rax +add r8, 1048576 +L4: +mov rbx, rax +test qword ptr [r15], 1 +jnz L1 +L0: +movq rax, mm0 +pop r15 +pop r14 +pop r13 +pop r12 +pop rbp +pop rbx +ret + +Payload FMA4 +.section .text +push rbx +push rbp +push r12 +push r13 +push r14 +push r15 +mov rax, rdi +mov r15, rsi +mov r13, rdx +movq mm0, r13 +mov r13, qword ptr [r15] +test r13, r13 +jz L0 +mov r14, 64 +mov edi, 2863311530 +mov esi, 2863311530 +mov edx, 2863311530 +vmovapd ymm0, ymmword ptr [rax] +vmovapd ymm1, ymmword ptr [rax] +vmovapd ymm2, ymmword ptr [rax+320] +vmovapd ymm3, ymmword ptr [rax+352] +vmovapd ymm4, ymmword ptr [rax+384] +vmovapd ymm5, ymmword ptr [rax+416] +vmovapd ymm6, ymmword ptr [rax+448] +vmovapd ymm7, ymmword ptr [rax+480] +vmovapd ymm8, ymmword ptr [rax+512] +vmovapd ymm9, ymmword ptr [rax+544] +vmovapd ymm10, ymmword ptr [rax+576] +vmovapd ymm11, ymmword ptr [rax+608] +vmovapd ymm12, ymmword ptr [rax+640] +vmovapd ymm13, ymmword ptr [rax+672] +mov rbx, rax +mov rcx, rax +add rcx, 32768 +mov r8, rax +add r8, 1048576 +mov r9, rax +add r9, 1441792 +mov r10, 1456 +mov r11, 1501 +mov r12, 1365333 +.align 64 (code) +L1: +vfmaddpd xmm3, xmm3, xmm0, xmm4 +vfmaddpd ymm3, ymm3, ymm1, ymmword ptr [rbx+32] +add rbx, r14 +shl edi, 1 +vfmaddpd xmm4, xmm4, xmm0, xmm5 +vfmaddpd xmm11, xmm11, xmm1, xmm6 +xor rdi, r13 +shl esi, 1 +vmovapd xmmword ptr [r9+64], xmm5 +vfmaddpd xmm5, xmm5, xmm0, xmm6 +add r9, r14 +shl edx, 1 +vfmaddpd xmm6, xmm6, xmm0, xmmword ptr [rbx+32] +prefetcht2 [r9] +add r9, r14 +shr edi, 1 +vmovapd xmmword ptr [r9+64], xmm7 +vfmaddpd xmm7, xmm7, xmm0, xmmword ptr [r9+32] +add r9, r14 +shr esi, 1 +vfmaddpd xmm8, xmm8, xmm0, xmm9 +vfmaddpd xmm15, xmm15, xmm1, xmmword ptr 
[r9+64] +add r9, r14 +shr edx, 1 +vmovapd xmmword ptr [r8+96], xmm9 +vfmaddpd xmm9, xmm9, xmm0, xmm10 +add r8, r14 +shl edi, 1 +vfmaddpd xmm10, xmm10, xmm0, xmmword ptr [rbx+32] +prefetcht2 [r8] +add r8, r14 +shl esi, 1 +vmovapd xmmword ptr [r8+96], xmm2 +vfmaddpd xmm2, xmm2, xmm0, xmmword ptr [r8+64] +add r8, r14 +shl edx, 1 +vfmaddpd xmm3, xmm3, xmm0, xmm4 +vfmaddpd xmm3, xmm3, xmm1, xmmword ptr [r8+64] +add r8, r14 +shr edi, 1 +vmovapd xmmword ptr [rcx+64], xmm4 +vfmaddpd xmm4, xmm4, xmm0, xmm5 +add rcx, r14 +shr esi, 1 +vmovapd xmmword ptr [rcx+96], xmm5 +vfmaddpd xmm5, xmm5, xmm0, xmmword ptr [rcx+64] +add rcx, r14 +shr edx, 1 +vfmaddpd xmm6, xmm6, xmm0, xmm7 +vfmaddpd xmm6, xmm6, xmm1, xmmword ptr [rcx+64] +add rcx, r14 +shl edi, 1 +vmovapd xmmword ptr [rbx+32], xmm7 +vfmaddpd ymm7, ymm7, ymm0, ymm8 +add rbx, r14 +shl esi, 1 +vmovapd xmmword ptr [rbx+64], xmm8 +vfmaddpd ymm8, ymm8, ymm0, ymmword ptr [rbx+32] +add rbx, r14 +shl edx, 1 +vfmaddpd xmm9, xmm9, xmm0, xmm10 +vfmaddpd ymm9, ymm9, ymm1, ymmword ptr [rbx+32] +add rbx, r14 +shr edi, 1 +vfmaddpd xmm10, xmm10, xmm0, xmm2 +vfmaddpd xmm12, xmm12, xmm1, xmm3 +xor rdi, r13 +shr esi, 1 +vmovapd xmmword ptr [r9+64], xmm2 +vfmaddpd xmm2, xmm2, xmm0, xmm3 +add r9, r14 +shr edx, 1 +vfmaddpd xmm3, xmm3, xmm0, xmmword ptr [rbx+32] +prefetcht2 [r9] +add r9, r14 +shl edi, 1 +vmovapd xmmword ptr [r9+64], xmm4 +vfmaddpd xmm4, xmm4, xmm0, xmmword ptr [r9+32] +add r9, r14 +shl esi, 1 +vfmaddpd xmm5, xmm5, xmm0, xmm6 +vfmaddpd xmm15, xmm15, xmm1, xmmword ptr [r9+64] +add r9, r14 +shl edx, 1 +vmovapd xmmword ptr [r8+96], xmm6 +vfmaddpd xmm6, xmm6, xmm0, xmm7 +add r8, r14 +shr edi, 1 +vfmaddpd xmm7, xmm7, xmm0, xmmword ptr [rbx+32] +prefetcht2 [r8] +add r8, r14 +shr esi, 1 +vmovapd xmmword ptr [r8+96], xmm8 +vfmaddpd xmm8, xmm8, xmm0, xmmword ptr [r8+64] +add r8, r14 +shr edx, 1 +vfmaddpd xmm9, xmm9, xmm0, xmm10 +vfmaddpd xmm9, xmm9, xmm1, xmmword ptr [r8+64] +add r8, r14 +shl edi, 1 +vmovapd xmmword ptr [rcx+64], xmm10 
+vfmaddpd xmm10, xmm10, xmm0, xmm2 +add rcx, r14 +shl esi, 1 +vmovapd xmmword ptr [rcx+96], xmm2 +vfmaddpd xmm2, xmm2, xmm0, xmmword ptr [rcx+64] +add rcx, r14 +shl edx, 1 +vfmaddpd xmm3, xmm3, xmm0, xmm4 +vfmaddpd xmm3, xmm3, xmm1, xmmword ptr [rcx+64] +add rcx, r14 +shr edi, 1 +vmovapd xmmword ptr [rbx+32], xmm4 +vfmaddpd ymm4, ymm4, ymm0, ymm5 +add rbx, r14 +shr esi, 1 +vmovapd xmmword ptr [rbx+64], xmm5 +vfmaddpd ymm5, ymm5, ymm0, ymmword ptr [rbx+32] +add rbx, r14 +shr edx, 1 +vfmaddpd xmm6, xmm6, xmm0, xmm7 +vfmaddpd ymm6, ymm6, ymm1, ymmword ptr [rbx+32] +add rbx, r14 +shl edi, 1 +vfmaddpd xmm7, xmm7, xmm0, xmm8 +vfmaddpd xmm13, xmm13, xmm1, xmm9 +xor rdi, r13 +shl esi, 1 +vmovapd xmmword ptr [r9+64], xmm8 +vfmaddpd xmm8, xmm8, xmm0, xmm9 +add r9, r14 +shl edx, 1 +vfmaddpd xmm9, xmm9, xmm0, xmmword ptr [rbx+32] +prefetcht2 [r9] +add r9, r14 +shr edi, 1 +vmovapd xmmword ptr [r9+64], xmm10 +vfmaddpd xmm10, xmm10, xmm0, xmmword ptr [r9+32] +add r9, r14 +shr esi, 1 +vfmaddpd xmm2, xmm2, xmm0, xmm3 +vfmaddpd xmm15, xmm15, xmm1, xmmword ptr [r9+64] +add r9, r14 +shr edx, 1 +vmovapd xmmword ptr [r8+96], xmm3 +vfmaddpd xmm3, xmm3, xmm0, xmm4 +add r8, r14 +shl edi, 1 +vfmaddpd xmm4, xmm4, xmm0, xmmword ptr [rbx+32] +prefetcht2 [r8] +add r8, r14 +shl esi, 1 +vmovapd xmmword ptr [r8+96], xmm5 +vfmaddpd xmm5, xmm5, xmm0, xmmword ptr [r8+64] +add r8, r14 +shl edx, 1 +vfmaddpd xmm6, xmm6, xmm0, xmm7 +vfmaddpd xmm6, xmm6, xmm1, xmmword ptr [r8+64] +add r8, r14 +shr edi, 1 +vmovapd xmmword ptr [rcx+64], xmm7 +vfmaddpd xmm7, xmm7, xmm0, xmm8 +add rcx, r14 +shr esi, 1 +vmovapd xmmword ptr [rcx+96], xmm8 +vfmaddpd xmm8, xmm8, xmm0, xmmword ptr [rcx+64] +add rcx, r14 +shr edx, 1 +vfmaddpd xmm9, xmm9, xmm0, xmm10 +vfmaddpd xmm9, xmm9, xmm1, xmmword ptr [rcx+64] +add rcx, r14 +shl edi, 1 +vmovapd xmmword ptr [rbx+32], xmm10 +vfmaddpd ymm10, ymm10, ymm0, ymm2 +add rbx, r14 +shl esi, 1 +vmovapd xmmword ptr [rbx+64], xmm2 +vfmaddpd ymm2, ymm2, ymm0, ymmword ptr [rbx+32] +add rbx, 
r14 +shl edx, 1 +movq r13, mm0 +sub r12, 1 +jnz L2 +mov r12, 1365333 +mov r9, rax +add r9, 1441792 +L2: +inc r13 +sub r10, 1 +jnz L3 +mov r10, 1456 +mov rcx, rax +add rcx, 32768 +L3: +movq mm0, r13 +sub r11, 1 +jnz L4 +mov r11, 1501 +mov r8, rax +add r8, 1048576 +L4: +mov rbx, rax +test qword ptr [r15], 1 +jnz L1 +L0: +movq rax, mm0 +pop r15 +pop r14 +pop r13 +pop r12 +pop rbp +pop rbx +ret + +Payload AVX +.section .text +push rbx +push rbp +push r12 +push r13 +push r14 +mov rax, rdi +mov r13, rsi +mov r14, rdx +mov r11, qword ptr [r13] +test r11, r11 +jz L0 +mov r12, 64 +vmovapd ymm0, ymmword ptr [rax] +vmovapd ymm1, ymmword ptr [rax+32] +vmovapd ymm2, ymmword ptr [rax+64] +vmovapd ymm3, ymmword ptr [rax+96] +vmovapd ymm4, ymmword ptr [rax+128] +vmovapd ymm5, ymmword ptr [rax+160] +vmovapd ymm6, ymmword ptr [rax+192] +vmovapd ymm7, ymmword ptr [rax+224] +vmovapd ymm8, ymmword ptr [rax+256] +vmovapd ymm9, ymmword ptr [rax+288] +mov r11, 6148914691236517205 +movq mm0, r11 +movq mm1, mm0 +movq mm2, mm0 +movq mm3, mm0 +movq mm4, mm0 +movq mm5, mm0 +mov r11, 1085102592571150095 +pinsrq xmm10, r11, 0 +pinsrq xmm10, r11, 1 +vinsertf128 ymm10, ymm10, xmm10, 1 +shl r11, 4 +pinsrq xmm11, r11, 0 +pinsrq xmm11, r11, 1 +vinsertf128 ymm11, ymm11, xmm11, 1 +shr r11, 4 +pinsrq xmm12, r11, 0 +pinsrq xmm12, r11, 1 +vinsertf128 ymm12, ymm12, xmm12, 1 +shl r11, 4 +pinsrq xmm13, r11, 0 +pinsrq xmm13, r11, 1 +vinsertf128 ymm13, ymm13, xmm13, 1 +shr r11, 4 +pinsrq xmm14, r11, 0 +pinsrq xmm14, r11, 1 +vinsertf128 ymm14, ymm14, xmm14, 1 +shl r11, 4 +pinsrq xmm15, r11, 0 +pinsrq xmm15, r11, 1 +vinsertf128 ymm15, ymm15, xmm15, 1 +mov rbx, rax +mov rcx, rax +add rcx, 32768 +mov rdx, rax +add rdx, 1048576 +mov rdi, rax +add rdi, 1441792 +mov r8, 1456 +mov r9, 1501 +mov r10, 1365333 +.align 64 (code) +L1: +vaddpd ymm1, ymm1, ymmword ptr [rbx+32] +add rbx, r12 +psllw mm3, mm0 +vaddpd ymm2, ymm2, ymm3 +vmovdqa ymm11, ymm12 +psllw mm4, mm1 +vaddpd ymm3, ymm3, ymm2 +vmovapd xmmword ptr [rdi+64], 
xmm3 +add rdi, r12 +psllw mm5, mm2 +vaddpd ymm4, ymm4, ymmword ptr [rbx+32] +prefetcht2 [rdi] +add rdi, r12 +psllw mm0, mm3 +vaddpd ymm5, ymm5, ymmword ptr [rdx+64] +vmovapd xmmword ptr [rdi+64], xmm5 +add rdi, r12 +psllw mm1, mm4 +vaddpd ymm6, ymm6, ymmword ptr [rdi+64] +add rdi, r12 +psllw mm2, mm5 +vaddpd ymm7, ymm7, ymm6 +vmovapd xmmword ptr [rdx+96], xmm7 +add rdx, r12 +psrlw mm3, mm0 +vaddpd ymm8, ymm8, ymmword ptr [rbx+32] +prefetcht0 [rdx] +add rdx, r12 +psrlw mm4, mm1 +vaddpd ymm9, ymm9, ymmword ptr [rdx+64] +vmovapd xmmword ptr [rdx+96], xmm9 +add rdx, r12 +psrlw mm5, mm2 +vaddpd ymm1, ymm1, ymmword ptr [rdx+64] +add rdx, r12 +psrlw mm0, mm3 +vaddpd ymm2, ymm2, ymm1 +vmovapd xmmword ptr [rcx+64], xmm2 +add rcx, r12 +psrlw mm1, mm4 +vaddpd ymm3, ymm3, ymmword ptr [rcx+64] +vmovapd xmmword ptr [rcx+96], xmm3 +add rcx, r12 +psrlw mm2, mm5 +vaddpd ymm4, ymm4, ymmword ptr [rcx+64] +add rcx, r12 +psllw mm3, mm0 +vaddpd ymm5, ymm5, ymm4 +vmovapd xmmword ptr [rbx+32], xmm5 +add rbx, r12 +psllw mm4, mm1 +vaddpd ymm6, ymm6, ymmword ptr [rbx+32] +vmovapd xmmword ptr [rbx+64], xmm6 +add rbx, r12 +psllw mm5, mm2 +vaddpd ymm7, ymm7, ymmword ptr [rbx+32] +add rbx, r12 +psllw mm0, mm3 +vaddpd ymm8, ymm8, ymm9 +vmovdqa ymm14, ymm15 +psllw mm1, mm4 +vaddpd ymm9, ymm9, ymm8 +vmovapd xmmword ptr [rdi+64], xmm9 +add rdi, r12 +psllw mm2, mm5 +vaddpd ymm1, ymm1, ymmword ptr [rbx+32] +prefetcht2 [rdi] +add rdi, r12 +psrlw mm3, mm0 +vaddpd ymm2, ymm2, ymmword ptr [rdx+64] +vmovapd xmmword ptr [rdi+64], xmm2 +add rdi, r12 +psrlw mm4, mm1 +vaddpd ymm3, ymm3, ymmword ptr [rdi+64] +add rdi, r12 +psrlw mm5, mm2 +vaddpd ymm4, ymm4, ymm3 +vmovapd xmmword ptr [rdx+96], xmm4 +add rdx, r12 +psrlw mm0, mm3 +vaddpd ymm5, ymm5, ymmword ptr [rbx+32] +prefetcht0 [rdx] +add rdx, r12 +psrlw mm1, mm4 +vaddpd ymm6, ymm6, ymmword ptr [rdx+64] +vmovapd xmmword ptr [rdx+96], xmm6 +add rdx, r12 +psrlw mm2, mm5 +vaddpd ymm7, ymm7, ymmword ptr [rdx+64] +add rdx, r12 +psllw mm3, mm0 +vaddpd ymm8, ymm8, 
ymm7 +vmovapd xmmword ptr [rcx+64], xmm8 +add rcx, r12 +psllw mm4, mm1 +vaddpd ymm9, ymm9, ymmword ptr [rcx+64] +vmovapd xmmword ptr [rcx+96], xmm9 +add rcx, r12 +psllw mm5, mm2 +vaddpd ymm1, ymm1, ymmword ptr [rcx+64] +add rcx, r12 +psllw mm0, mm3 +vaddpd ymm2, ymm2, ymm1 +vmovapd xmmword ptr [rbx+32], xmm2 +add rbx, r12 +psllw mm1, mm4 +vaddpd ymm3, ymm3, ymmword ptr [rbx+32] +vmovapd xmmword ptr [rbx+64], xmm3 +add rbx, r12 +psllw mm2, mm5 +vaddpd ymm4, ymm4, ymmword ptr [rbx+32] +add rbx, r12 +psrlw mm3, mm0 +vaddpd ymm5, ymm5, ymm6 +vmovdqa ymm11, ymm12 +psrlw mm4, mm1 +vaddpd ymm6, ymm6, ymm5 +vmovapd xmmword ptr [rdi+64], xmm6 +add rdi, r12 +psrlw mm5, mm2 +vaddpd ymm7, ymm7, ymmword ptr [rbx+32] +prefetcht2 [rdi] +add rdi, r12 +psrlw mm0, mm3 +vaddpd ymm8, ymm8, ymmword ptr [rdx+64] +vmovapd xmmword ptr [rdi+64], xmm8 +add rdi, r12 +psrlw mm1, mm4 +vaddpd ymm9, ymm9, ymmword ptr [rdi+64] +add rdi, r12 +psrlw mm2, mm5 +vaddpd ymm1, ymm1, ymm0 +vmovapd xmmword ptr [rdx+96], xmm1 +add rdx, r12 +psllw mm3, mm0 +vaddpd ymm2, ymm2, ymmword ptr [rbx+32] +prefetcht0 [rdx] +add rdx, r12 +psllw mm4, mm1 +vaddpd ymm3, ymm3, ymmword ptr [rdx+64] +vmovapd xmmword ptr [rdx+96], xmm3 +add rdx, r12 +psllw mm5, mm2 +vaddpd ymm4, ymm4, ymmword ptr [rdx+64] +add rdx, r12 +psllw mm0, mm3 +vaddpd ymm5, ymm5, ymm4 +vmovapd xmmword ptr [rcx+64], xmm5 +add rcx, r12 +psllw mm1, mm4 +vaddpd ymm6, ymm6, ymmword ptr [rcx+64] +vmovapd xmmword ptr [rcx+96], xmm6 +add rcx, r12 +psllw mm2, mm5 +vaddpd ymm7, ymm7, ymmword ptr [rcx+64] +add rcx, r12 +psrlw mm3, mm0 +vaddpd ymm8, ymm8, ymm7 +vmovapd xmmword ptr [rbx+32], xmm8 +add rbx, r12 +psrlw mm4, mm1 +vaddpd ymm9, ymm9, ymmword ptr [rbx+32] +vmovapd xmmword ptr [rbx+64], xmm9 +add rbx, r12 +psrlw mm5, mm2 +sub r10, 1 +jnz L2 +mov r10, 1365333 +mov rdi, rax +add rdi, 1441792 +L2: +sub r8, 1 +jnz L3 +mov r8, 1456 +mov rcx, rax +add rcx, 32768 +L3: +sub r9, 1 +jnz L4 +mov r9, 1501 +mov rdx, rax +add rdx, 1048576 +L4: +inc r14 +mov rbx, rax 
+test qword ptr [r13], 1 +jnz L1 +L0: +mov rax, r14 +pop r14 +pop r13 +pop r12 +pop rbp +pop rbx +ret + +Payload SSE2 +.section .text +push rbx +push rbp +push r12 +push r13 +push r14 +mov rax, rdi +mov r13, rsi +mov r14, rdx +mov r11, qword ptr [r13] +test r11, r11 +jz L0 +mov r12, 64 +movapd xmm0, xmmword ptr [rax] +movapd xmm1, xmmword ptr [rax+32] +movapd xmm2, xmmword ptr [rax+64] +movapd xmm3, xmmword ptr [rax+96] +movapd xmm4, xmmword ptr [rax+128] +movapd xmm5, xmmword ptr [rax+160] +movapd xmm6, xmmword ptr [rax+192] +movapd xmm7, xmmword ptr [rax+224] +movapd xmm8, xmmword ptr [rax+256] +movapd xmm9, xmmword ptr [rax+288] +movapd xmm10, xmmword ptr [rax+320] +movapd xmm11, xmmword ptr [rax+352] +movapd xmm12, xmmword ptr [rax+384] +movapd xmm13, xmmword ptr [rax+416] +mov r11, 1085102592571150095 +pinsrq xmm14, r11, 0 +pinsrq xmm14, r11, 1 +shl r11, 4 +pinsrq xmm15, r11, 0 +pinsrq xmm15, r11, 1 +mov rbx, rax +mov rcx, rax +add rcx, 32768 +mov rdx, rax +add rdx, 1048576 +mov rdi, rax +add rdi, 1441792 +mov r8, 1456 +mov r9, 1501 +mov r10, 1365333 +.align 64 (code) +L1: +addpd xmm1, xmmword ptr [rbx+32] +add rbx, r12 +addpd xmm2, xmm3 +movdqa xmm15, xmm14 +addpd xmm3, xmm2 +movapd xmmword ptr [rdi+64], xmm3 +add rdi, r12 +addpd xmm4, xmmword ptr [rbx+32] +prefetcht2 [rdi] +add rdi, r12 +addpd xmm5, xmmword ptr [rdx+64] +movapd xmmword ptr [rdi+64], xmm5 +add rdi, r12 +addpd xmm6, xmmword ptr [rdi+64] +add rdi, r12 +addpd xmm7, xmm6 +movapd xmmword ptr [rdx+96], xmm7 +add rdx, r12 +addpd xmm8, xmmword ptr [rbx+32] +prefetcht0 [rdx] +add rdx, r12 +addpd xmm9, xmmword ptr [rdx+64] +movapd xmmword ptr [rdx+96], xmm9 +add rdx, r12 +addpd xmm10, xmmword ptr [rdx+64] +add rdx, r12 +addpd xmm11, xmm10 +movapd xmmword ptr [rcx+64], xmm11 +add rcx, r12 +addpd xmm12, xmmword ptr [rcx+64] +movapd xmmword ptr [rcx+96], xmm12 +add rcx, r12 +addpd xmm13, xmmword ptr [rcx+64] +add rcx, r12 +addpd xmm1, xmm0 +movapd xmmword ptr [rbx+32], xmm1 +add rbx, r12 +addpd xmm2, 
xmmword ptr [rbx+32] +movapd xmmword ptr [rbx+64], xmm2 +add rbx, r12 +addpd xmm3, xmmword ptr [rbx+32] +add rbx, r12 +addpd xmm4, xmm5 +movdqa xmm14, xmm15 +addpd xmm5, xmm4 +movapd xmmword ptr [rdi+64], xmm5 +add rdi, r12 +addpd xmm6, xmmword ptr [rbx+32] +prefetcht2 [rdi] +add rdi, r12 +addpd xmm7, xmmword ptr [rdx+64] +movapd xmmword ptr [rdi+64], xmm7 +add rdi, r12 +addpd xmm8, xmmword ptr [rdi+64] +add rdi, r12 +addpd xmm9, xmm8 +movapd xmmword ptr [rdx+96], xmm9 +add rdx, r12 +addpd xmm10, xmmword ptr [rbx+32] +prefetcht0 [rdx] +add rdx, r12 +addpd xmm11, xmmword ptr [rdx+64] +movapd xmmword ptr [rdx+96], xmm11 +add rdx, r12 +addpd xmm12, xmmword ptr [rdx+64] +add rdx, r12 +addpd xmm13, xmm12 +movapd xmmword ptr [rcx+64], xmm13 +add rcx, r12 +addpd xmm1, xmmword ptr [rcx+64] +movapd xmmword ptr [rcx+96], xmm1 +add rcx, r12 +addpd xmm2, xmmword ptr [rcx+64] +add rcx, r12 +addpd xmm3, xmm2 +movapd xmmword ptr [rbx+32], xmm3 +add rbx, r12 +addpd xmm4, xmmword ptr [rbx+32] +movapd xmmword ptr [rbx+64], xmm4 +add rbx, r12 +addpd xmm5, xmmword ptr [rbx+32] +add rbx, r12 +addpd xmm6, xmm7 +movdqa xmm15, xmm14 +addpd xmm7, xmm6 +movapd xmmword ptr [rdi+64], xmm7 +add rdi, r12 +addpd xmm8, xmmword ptr [rbx+32] +prefetcht2 [rdi] +add rdi, r12 +addpd xmm9, xmmword ptr [rdx+64] +movapd xmmword ptr [rdi+64], xmm9 +add rdi, r12 +addpd xmm10, xmmword ptr [rdi+64] +add rdi, r12 +addpd xmm11, xmm10 +movapd xmmword ptr [rdx+96], xmm11 +add rdx, r12 +addpd xmm12, xmmword ptr [rbx+32] +prefetcht0 [rdx] +add rdx, r12 +addpd xmm13, xmmword ptr [rdx+64] +movapd xmmword ptr [rdx+96], xmm13 +add rdx, r12 +addpd xmm1, xmmword ptr [rdx+64] +add rdx, r12 +addpd xmm2, xmm1 +movapd xmmword ptr [rcx+64], xmm2 +add rcx, r12 +addpd xmm3, xmmword ptr [rcx+64] +movapd xmmword ptr [rcx+96], xmm3 +add rcx, r12 +addpd xmm4, xmmword ptr [rcx+64] +add rcx, r12 +addpd xmm5, xmm4 +movapd xmmword ptr [rbx+32], xmm5 +add rbx, r12 +addpd xmm6, xmmword ptr [rbx+32] +movapd xmmword ptr [rbx+64], xmm6 
+add rbx, r12 +sub r10, 1 +jnz L2 +mov r10, 1365333 +mov rdi, rax +add rdi, 1441792 +L2: +sub r8, 1 +jnz L3 +mov r8, 1456 +mov rcx, rax +add rcx, 32768 +L3: +sub r9, 1 +jnz L4 +mov r9, 1501 +mov rdx, rax +add rdx, 1048576 +L4: +inc r14 +mov rbx, rax +test qword ptr [r13], 1 +jnz L1 +L0: +mov rax, r14 +pop r14 +pop r13 +pop r12 +pop rbp +pop rbx +ret + diff --git a/test/refs/X86Functions.log b/test/refs/X86Functions.log new file mode 100644 index 00000000..d0640ee2 --- /dev/null +++ b/test/refs/X86Functions.log @@ -0,0 +1,25 @@ + available load-functions: + ID | NAME | available on this system | payload default setting + ------------------------------------------------------------------------------------------------------------------------------------------------------- + 1 | FUNC_KNL_XEONPHI_AVX512_4T | yes | RAM_P:3,L2_S:8,L1_L:40,REG:10 + 2 | FUNC_SKL_COREI_FMA_1T | yes | RAM_L:3,L3_LS_256:5,L2_LS_256:18,L1_2LS_256:78,REG:40 + 3 | FUNC_SKL_COREI_FMA_2T | yes | RAM_L:3,L3_LS_256:5,L2_LS_256:18,L1_2LS_256:78,REG:40 + 4 | FUNC_SKL_XEONEP_AVX512_1T | yes | RAM_S:3,RAM_P:1,L3_S:1,L3_P:1,L2_S:4,L2_L:70,L1_S:0,L1_L:40,REG:140 + 5 | FUNC_SKL_XEONEP_AVX512_2T | yes | RAM_S:3,RAM_P:1,L3_S:1,L3_P:1,L2_S:4,L2_L:70,L1_S:0,L1_L:40,REG:140 + 6 | FUNC_HSW_COREI_FMA_1T | yes | RAM_L:2,L3_LS:3,L2_LS:9,L1_LS:90,REG:40 + 7 | FUNC_HSW_COREI_FMA_2T | yes | RAM_L:2,L3_LS:3,L2_LS:9,L1_LS:90,REG:40 + 8 | FUNC_HSW_XEONEP_FMA_1T | yes | RAM_L:8,L3_LS:1,L2_LS:29,L1_LS:100,REG:100 + 9 | FUNC_HSW_XEONEP_FMA_2T | yes | RAM_L:8,L3_LS:1,L2_LS:29,L1_LS:100,REG:100 + 10 | FUNC_SNB_COREI_AVX_1T | yes | RAM_L:2,L3_LS:4,L2_LS:10,L1_LS:90,REG:45 + 11 | FUNC_SNB_COREI_AVX_2T | yes | RAM_L:2,L3_LS:4,L2_LS:10,L1_LS:90,REG:45 + 12 | FUNC_SNB_XEONEP_AVX_1T | yes | RAM_L:3,L3_LS:2,L2_LS:10,L1_LS:90,REG:30 + 13 | FUNC_SNB_XEONEP_AVX_2T | yes | RAM_L:3,L3_LS:2,L2_LS:10,L1_LS:90,REG:30 + 14 | FUNC_NHM_COREI_SSE2_1T | yes | RAM_P:1,L1_LS:70,REG:2 + 15 | FUNC_NHM_COREI_SSE2_2T | yes | RAM_P:1,L1_LS:70,REG:2 + 16 
| FUNC_NHM_XEONEP_SSE2_1T | yes | RAM_P:1,L1_LS:60,REG:2
+ 17 | FUNC_NHM_XEONEP_SSE2_2T | yes | RAM_P:1,L1_LS:60,REG:2
+ 18 | FUNC_BLD_OPTERON_FMA4_1T | yes | RAM_L:1,L3_L:1,L2_LS:5,L1_L:90,REG:45
+ 19 | FUNC_ZEN_EPYC_ZENFMA_1T | yes | RAM_L:3,L3_L:14,L2_L:75,L1_LS:81,REG:100
+ 20 | FUNC_ZEN_EPYC_ZENFMA_2T | yes | RAM_L:3,L3_L:14,L2_L:75,L1_LS:81,REG:100
+ 21 | FUNC_ZEN_2_EPYC_FMA_1T | yes | RAM_L:10,L3_L:25,L2_L:91,L1_2LS_256:72,L1_LS_256:82,REG:75
+ 22 | FUNC_ZEN_2_EPYC_FMA_2T | yes | RAM_L:10,L3_L:25,L2_L:91,L1_2LS_256:72,L1_LS_256:82,REG:75
diff --git a/tooling/ref-test.py b/tooling/ref-test.py
new file mode 100644
index 00000000..aab40526
--- /dev/null
+++ b/tooling/ref-test.py
@@ -0,0 +1,60 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""Compare the stdout of an executable against a reference file."""
+
+import sys
+import os
+import subprocess
+
+
+def run_and_update(executable, ref_file, update_refs):
+    """Run `executable` and compare its stdout with the bytes in `ref_file`.
+
+    executable:  path of the program to run (started without arguments).
+    ref_file:    path of the file holding the expected stdout.
+    update_refs: if true, a differing or missing reference is (re)written
+                 with the captured output instead of failing.
+
+    Exits the interpreter with status 1 if the output differs and
+    update_refs is false. The exit status of the executable is not checked.
+    """
+    # communicate() drains the pipe while waiting for the child to finish.
+    # Calling wait() first can deadlock once the child fills the pipe buffer.
+    p = subprocess.Popen([executable], stdout=subprocess.PIPE)
+    stdout, _ = p.communicate()
+
+    try:
+        with open(ref_file, 'rb') as f:
+            reference = f.read()
+    except FileNotFoundError:
+        # Without a reference there is nothing to compare against; only
+        # update mode may create the file.
+        if not update_refs:
+            raise
+        reference = None
+
+    if stdout == reference:
+        return
+
+    # Update the reference if applicable
+    if update_refs:
+        with open(ref_file, 'wb') as f:
+            f.write(stdout)
+        return
+
+    print(f"Output of {executable} differs from {ref_file}", file=sys.stderr)
+    sys.exit(1)
+
+
+# Run the first argument and compare it to the file provided in the second argument
+if __name__ == "__main__":
+    if len(sys.argv) != 3:
+        print(f"Usage: {sys.argv[0]} EXECUTABLE REFERENCE_FILE")
+        print("Run with env variable UPDATE_REFERENCES set to update the reference files.")
+        sys.exit(1)
+
+    executable = sys.argv[1]
+    ref_file = sys.argv[2]
+    update_refs = "UPDATE_REFERENCES" in os.environ
+
+    run_and_update(executable, ref_file, update_refs)