From 2cbef25ac2c3d0f22fcde983096d371748498a7c Mon Sep 17 00:00:00 2001 From: Rickard Date: Tue, 2 Jan 2024 14:48:54 +0100 Subject: [PATCH] Make native code portable and add GitHub workflow for building --- .github/workflows/python-package.yml | 201 ++++++++++++++++++ .gitignore | 22 +- CMakeLists.txt | 121 +++++++++++ Makefile | 141 ------------ csrc/common.cpp | 24 +-- csrc/common.h | 2 +- csrc/cpu_ops.cpp | 42 ++-- csrc/kernels.cu | 12 +- csrc/mps_kernels.metal | 117 ++++++++++ csrc/ops.cuh | 2 + ...{pythonInterface.c => pythonInterface.cpp} | 4 + include/Algo-Direct2.h | 2 + include/Portable.h | 35 ++- include/SIMD.h | 67 ++++-- include/Type.h | 2 +- requirements.txt | 8 +- setup.py | 8 +- 17 files changed, 593 insertions(+), 217 deletions(-) create mode 100644 .github/workflows/python-package.yml create mode 100644 CMakeLists.txt delete mode 100644 Makefile create mode 100644 csrc/mps_kernels.metal rename csrc/{pythonInterface.c => pythonInterface.cpp} (99%) diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml new file mode 100644 index 000000000..c2bebf1d8 --- /dev/null +++ b/.github/workflows/python-package.yml @@ -0,0 +1,201 @@ +name: Python package + +on: + push: + branches: [ "*" ] + pull_request: + branches: [ master ] + release: + types: [ published ] + +jobs: + + ## + # This job matrix builds the non-CUDA versions of the libraries for all supported platforms. + ## + build-shared-libs: + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + arch: [x86_64, aarch64] + exclude: + - os: windows-latest # This probably requres arm64 Windows agents + arch: aarch64 + runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents + steps: + # Check out code + - uses: actions/checkout@v3 + # On Linux we use CMake within Docker + - name: Setup cmake + uses: jwlawson/actions-setup-cmake@v1.13 + with: + cmake-version: '3.26.x' + - name: Add msbuild to PATH + uses: microsoft/setup-msbuild@v1.1 + if: ${{ startsWith(matrix.os, 'windows') }} + # Compile C++ code + - name: Build C++ + shell: bash + run: | + set -ex + build_os=${{ matrix.os }} + build_arch=${{ matrix.arch }} + ( git clone https://github.com/NVlabs/cub ./dependencies/cub; cd dependencies/cub; git checkout 1.11.0 ) + if [ ${build_os:0:6} == ubuntu -a ${build_arch} == aarch64 ]; then + # Allow cross-compile om aarch64 + sudo apt-get install -y gcc-aarch64-linux-gnu binutils-aarch64-linux-gnu + fi + if [ ${build_os:0:5} == macos -a ${build_arch} == aarch64 ]; then + cmake -DCMAKE_OSX_ARCHITECTURES=arm64 -DENABLE_CUDA=OFF -DENABLE_MPS=ON . + else + cmake -DENABLE_CUDA=OFF . + fi + if [ ${build_os:0:7} == windows ]; then + pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release" + else + make + fi + mkdir -p output/${{ matrix.os }}/${{ matrix.arch }} + ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ ) + - name: Upload build artifact + uses: actions/upload-artifact@v3 + with: + name: shared_library + path: output/* + retention-days: 7 + ## + # This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64 + Windows x64) + ## + build-shared-libs-cuda: + strategy: + matrix: + os: [ubuntu-latest, windows-latest] + arch: [x86_64, aarch64] + cuda_version: ['12.1.0'] + exclude: + - os: windows-latest # This probably requres arm64 Windows agents + arch: aarch64 + runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents + steps: + # Check out code + - uses: actions/checkout@v3 + # Linux: We use Docker to build cross platform Cuda (aarch64 is built in emulation) + - name: Set up Docker multiarch + if: startsWith(matrix.os, 'ubuntu') + uses: docker/setup-qemu-action@v2 + # On Linux we use CMake within Docker + - name: Setup cmake + if: ${{ !startsWith(matrix.os, 'linux') }} + uses: jwlawson/actions-setup-cmake@v1.13 + with: + cmake-version: '3.26.x' + # Windows: We install Cuda on the agent (slow) + - uses: Jimver/cuda-toolkit@v0.2.10 + if: startsWith(matrix.os, 'windows') + id: cuda-toolkit + with: + cuda: ${{ matrix.cuda_version }} + method: 'local' + #sub-packages: '["nvcc","cudart","nvrtc_dev","cublas_dev","cusparse_dev","visual_studio_integration"]' + - name: Add msbuild to PATH + uses: microsoft/setup-msbuild@v1.1 + if: ${{ startsWith(matrix.os, 'windows') }} + # Compile C++ code + - name: Build C++ + shell: bash + run: | + set -ex + build_os=${{ matrix.os }} + build_arch=${{ matrix.arch }} + ( git clone https://github.com/NVlabs/cub ./dependencies/cub; cd dependencies/cub; git checkout 1.11.0 ) + if [ ${build_os:0:6} == ubuntu ]; then + image=nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu22.04 + echo "Using image $image" + docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \ + "apt-get update \ + && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \ + && cmake -DENABLE_CUDA=ON . \ + && make" + else + cmake -DENABLE_CUDA=ON . + pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release" + fi + mkdir -p output/${{ matrix.os }}/${{ matrix.arch }} + ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ ) + - name: Upload build artifact + uses: actions/upload-artifact@v3 + with: + name: shared_library + path: output/* + retention-days: 7 + build-wheels: + needs: + - build-shared-libs + - build-shared-libs-cuda + strategy: + matrix: + os: [ubuntu-latest, macos-latest, windows-latest] + python-version: ["3.8", "3.9", "3.10", "3.11"] + arch: [x86_64, aarch64] + exclude: + - os: windows-latest # This probably requres arm64 Windows agents + arch: aarch64 + runs-on: ${{ matrix.os }} + steps: + # Check out code + - uses: actions/checkout@v3 + # Download shared libraries + - name: Download build artifact + uses: actions/download-artifact@v3 + with: + name: shared_library + path: output/ + - name: Copy correct platform shared library + shell: bash + run: | + cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/ + # Compile C++ code + - name: Set up Python ${{ matrix.python-version }} + uses: actions/setup-python@v4 + with: + python-version: ${{ matrix.python-version }} + # + - name: Install Python dependencies + shell: bash + run: | + pip install -r requirements.txt + # TODO: How to run CUDA tests on GitHub actions? + #- name: Run unit tests + # if: ${{ matrix.arch == 'x86_64' }} # Tests are too slow to run in emulation. Wait for real aarch64 agents + # run: | + # PYTHONPATH=. pytest --log-cli-level=DEBUG tests + - name: Build wheel + shell: bash + run: | + python setup.py bdist_wheel + - name: Upload build artifact + uses: actions/upload-artifact@v3 + with: + name: bdist_wheel + path: dist/bitsandbytes-*.whl + retention-days: 7 + publish: + needs: build-wheels + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v2 + - name: Build dist + run: | + python setup.py sdist + - name: Download build artifact + uses: actions/download-artifact@v3 + with: + name: bdist_wheel + path: dist/ + - run: | + ls -lR dist/ + - name: Publish to PyPi + if: startsWith(github.ref, 'refs/tags') + uses: pypa/gh-action-pypi-publish@release/v1 + with: + password: ${{ secrets.pypi }} diff --git a/.gitignore b/.gitignore index 2f929968b..202dcb13d 100644 --- a/.gitignore +++ b/.gitignore @@ -2,9 +2,26 @@ __pycache__/ *.py[cod] *$py.class - -# C extensions *.so +*.dll +*.dylib +*.o +*.obj +*.air +*.metallib + +# CMake generated files +CMakeCache.txt +CMakeScripts/ +cmake_install.cmake +Makefile +CMakeFiles/ +*.sln +*.vcxproj* +*.xcodeproj/ +bitsandbytes.dir/ +Debug/ +Release/ # Distribution / packaging .Python @@ -133,4 +150,5 @@ dmypy.json dependencies cuda_build +output/ .vscode/* diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 000000000..d6e269d15 --- /dev/null +++ b/CMakeLists.txt @@ -0,0 +1,121 @@ +cmake_minimum_required(VERSION 3.22.1) + +option(ENABLE_CUDA "Build for CUDA (Nvidia)" OFF) +option(ENABLE_MPS "Build for Metal Performance Shaders (Apple)" OFF) + +if(ENABLE_CUDA) + if(APPLE) + message(FATAL_ERROR "CUDA is not supported on macOS" ) + endif() + option(NO_CUBLASLT "Don't use CUBLAST" OFF) + if(NO_CUBLASLT) + set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72) + else() + set(CMAKE_CUDA_ARCHITECTURES 75 80 86 89 90) + endif() +endif() + +if(ENABLE_CUDA) + message("Building CUDA support for ${CMAKE_CUDA_ARCHITECTURES}") + # Find CUDA tools if we are compiling with CUDA + find_package(CUDAToolkit REQUIRED) + if(NO_CUBLASLT) + set(LIBSUFFIX "cuda${CUDAToolkit_VERSION_MAJOR}${CUDAToolkit_VERSION_MINOR}_nocublaslt") + else() + set(LIBSUFFIX "cuda${CUDAToolkit_VERSION_MAJOR}${CUDAToolkit_VERSION_MINOR}") + endif() + + project(bitsandbytes LANGUAGES CXX CUDA) + add_compile_definitions(BUILD_CUDA) + set(CMAKE_CUDA_STANDARD 14) + set(CMAKE_CUDA_STANDARD_REQUIRED ON) + set(GPU_SOURCES csrc/ops.cu csrc/kernels.cu) +elseif(ENABLE_MPS) + if(NOT APPLE) + message(FATAL_ERROR "MPS is only supported on macOS" ) + endif() + message("Building MPS support") + set(LIBSUFFIX "mps") + project(bitsandbytes LANGUAGES CXX OBJCXX) + add_compile_definitions(BUILD_MPS) + set(METAL_SOURCES csrc/mps_kernels.metal) + file(MAKE_DIRECTORY "build") + add_custom_command(OUTPUT "bitsandbytes/bitsandbytes.metallib" + COMMAND xcrun metal -c -o "build/bitsandbytes.air" ${METAL_SOURCES} + COMMAND xcrun metallib "build/bitsandbytes.air" -o "bitsandbytes/bitsandbytes.metallib" + DEPENDS "${METAL_SOURCES}" + COMMENT "Compiling Metal kernels" + VERBATIM) + add_custom_target(metallib DEPENDS "bitsandbytes/bitsandbytes.metallib") + set(GPU_SOURCES csrc/mps_ops.mm) +else() + message("Building with CPU only") + set(LIBSUFFIX "cpu") + + project(bitsandbytes LANGUAGES CXX) + set(GPU_SOURCES) +endif() + +if(APPLE) + set(CMAKE_OSX_DEPLOYMENT_TARGET 13.1) +endif() +set(CMAKE_CXX_STANDARD 14) +set(CXX_STANDARD_REQUIRED C++14) + +if(WIN32) + # Mute warnings + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -diag-suppress=177") + + # Enable fast math on VC++ + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast") + + # Export all symbols + set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON) +endif() + +# Weird MSVC hacks +if(MSVC) + set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /NODEFAULTLIB:msvcprtd /NODEFAULTLIB:MSVCRTD /NODEFAULTLIB:LIBCMT") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2") + set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2") +endif() + +# Add csrc files +add_library(bitsandbytes SHARED + ${GPU_SOURCES} + csrc/common.cpp + csrc/cpu_ops.cpp + csrc/pythonInterface.cpp) + +target_include_directories(bitsandbytes PUBLIC + ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES} + ${CMAKE_CURRENT_SOURCE_DIR}/csrc + ${CMAKE_CURRENT_SOURCE_DIR}/include) + +if(ENABLE_CUDA) + target_include_directories(bitsandbytes PUBLIC ${CUDA_TOOLKIT_ROOT_DIR}/include) + + set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --use_fast_math") + + set_target_properties( + bitsandbytes + PROPERTIES + CUDA_SEPARABLE_COMPILATION ON) + + target_link_libraries(bitsandbytes CUDA::cudart CUDA::cublas CUDA::cublasLt CUDA::cusparse) +endif() +if(ENABLE_MPS) + add_dependencies(bitsandbytes metallib) + target_link_libraries(bitsandbytes objc "-framework Foundation" "-framework Metal" "-framework MetalPerformanceShaders" "-framework MetalPerformanceShadersGraph") +endif() + +set_target_properties(bitsandbytes PROPERTIES OUTPUT_NAME "bitsandbytes_${LIBSUFFIX}") +# Set the output name of the CUDA library +if(MSVC) +set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE bitsandbytes) +set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG bitsandbytes) +set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE bitsandbytes) +set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG bitsandbytes) +endif() + +set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY bitsandbytes) diff --git a/Makefile b/Makefile deleted file mode 100644 index 5f997a122..000000000 --- a/Makefile +++ /dev/null @@ -1,141 +0,0 @@ -MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST))) -ROOT_DIR := $(patsubst %/,%,$(dir $(MKFILE_PATH))) - -GPP:= /usr/bin/g++ -#GPP:= /sw/gcc/11.2.0/bin/g++ -ifeq ($(CUDA_HOME),) - CUDA_HOME:= $(shell which nvcc | rev | cut -d'/' -f3- | rev) -endif - -ifndef CUDA_VERSION -ifneq ($(MAKECMDGOALS),clean) -$(warning WARNING: CUDA_VERSION not set. Call make with CUDA string, for example: make cuda11x CUDA_VERSION=115 or make cpuonly CUDA_VERSION=CPU) -CUDA_VERSION:= -endif -endif - - - -NVCC := $(CUDA_HOME)/bin/nvcc - -########################################### - -CSRC := $(ROOT_DIR)/csrc -BUILD_DIR:= $(ROOT_DIR)/build - -FILES_CUDA := $(CSRC)/ops.cu $(CSRC)/kernels.cu -FILES_CPP := $(CSRC)/common.cpp $(CSRC)/cpu_ops.cpp $(CSRC)/pythonInterface.c - -INCLUDE := -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/include -LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcublas -lcublasLt -lcusparse -L $(CONDA_PREFIX)/lib - -# NVIDIA NVCC compilation flags -COMPUTE_CAPABILITY += -gencode arch=compute_50,code=sm_50 # Maxwell -COMPUTE_CAPABILITY += -gencode arch=compute_52,code=sm_52 # Maxwell -COMPUTE_CAPABILITY += -gencode arch=compute_60,code=sm_60 # Pascal -COMPUTE_CAPABILITY += -gencode arch=compute_61,code=sm_61 # Pascal -COMPUTE_CAPABILITY += -gencode arch=compute_70,code=sm_70 # Volta - -CC_KEPLER := -gencode arch=compute_35,code=sm_35 # Kepler -CC_KEPLER += -gencode arch=compute_37,code=sm_37 # Kepler - -# Later versions of CUDA support the new architectures -CC_CUDA11x := -gencode arch=compute_75,code=sm_75 -CC_CUDA11x += -gencode arch=compute_80,code=sm_80 -CC_CUDA11x += -gencode arch=compute_86,code=sm_86 - - -CC_cublasLt110 := -gencode arch=compute_75,code=sm_75 -CC_cublasLt110 += -gencode arch=compute_80,code=sm_80 - -CC_cublasLt111 := -gencode arch=compute_75,code=sm_75 -CC_cublasLt111 += -gencode arch=compute_80,code=sm_80 -CC_cublasLt111 += -gencode arch=compute_86,code=sm_86 - -CC_ADA_HOPPER := -gencode arch=compute_89,code=sm_89 -CC_ADA_HOPPER += -gencode arch=compute_90,code=sm_90 - - -all: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -cuda110_nomatmul_kepler: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda11x_nomatmul_kepler: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - - -cuda110_nomatmul: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda11x_nomatmul: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda118_nomatmul: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda12x_nomatmul: $(BUILD_DIR) env - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT - $(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB) - -cuda110: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -cuda11x: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -cuda118: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -cuda12x: $(BUILD_DIR) env - $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) - $(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o - $(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB) - -cpuonly: $(BUILD_DIR) env - $(GPP) -std=c++14 -shared -fPIC -I $(ROOT_DIR)/csrc -I $(ROOT_DIR)/include $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cpu.so - -env: - @echo "ENVIRONMENT" - @echo "============================" - @echo "CUDA_VERSION: $(CUDA_VERSION)" - @echo "============================" - @echo "NVCC path: $(NVCC)" - @echo "GPP path: $(GPP) VERSION: `$(GPP) --version | head -n 1`" - @echo "CUDA_HOME: $(CUDA_HOME)" - @echo "CONDA_PREFIX: $(CONDA_PREFIX)" - @echo "PATH: $(PATH)" - @echo "LD_LIBRARY_PATH: $(LD_LIBRARY_PATH)" - @echo "============================" - -$(BUILD_DIR): - mkdir -p build - mkdir -p dependencies - -$(ROOT_DIR)/dependencies/cub: - git clone https://github.com/NVlabs/cub $(ROOT_DIR)/dependencies/cub - cd dependencies/cub; git checkout 1.11.0 - -clean: - rm -rf build/* *.egg* - rm -f bitsandbytes/libbitsandbytes*.so diff --git a/csrc/common.cpp b/csrc/common.cpp index 52f029917..0a9601689 100644 --- a/csrc/common.cpp +++ b/csrc/common.cpp @@ -1,39 +1,35 @@ #include #include -void *quantize_block(void *arguments) { +void quantize_block(const quantize_block_args& args) { // 1. find absmax in block // 2. divide input value by absmax to normalize into [-1.0, 1.0] // 3. do binary search to find the closest value // 4. check minimal distance // 5. store index - struct quantize_block_args *args = (quantize_block_args *) arguments; - // 1. find absmax in block float absmax_block = -FLT_MAX; - for (long long i = args->block_idx; i < args->block_end; i++) - absmax_block = fmax(absmax_block, fabs(args->A[i])); + for (long long i = args.block_idx; i < args.block_end; i++) + absmax_block = fmax(absmax_block, fabs(args.A[i])); - args->absmax[args->block_idx / args->blocksize] = absmax_block; + args.absmax[args.block_idx / args.blocksize] = absmax_block; - for (long long i = args->block_idx; i < args->block_end; i++) { + for (long long i = args.block_idx; i < args.block_end; i++) { // 2. divide input value by absmax to normalize into [-1.0, 1.0] // 3. do binary search to find the closest value - float normed_value = args->A[i] / absmax_block; - long long idx = args->bin_searcher->scalar(normed_value); + float normed_value = args.A[i] / absmax_block; + long long idx = args.bin_searcher->scalar(normed_value); // 4. check minimal distance // The binary search returns always the value to the left, which might not be the closest value if (idx < 255) { - float dist_left = fabs(normed_value - (args->code[idx])); - float dist_right = fabs(normed_value - (args->code[idx + 1])); + float dist_left = fabs(normed_value - (args.code[idx])); + float dist_right = fabs(normed_value - (args.code[idx + 1])); if (dist_right < dist_left) { idx += 1; } } // 5. store index - args->out[i] = (unsigned char) idx; + args.out[i] = (unsigned char) idx; } - - return NULL; } diff --git a/csrc/common.h b/csrc/common.h index c99034e78..e513f2875 100644 --- a/csrc/common.h +++ b/csrc/common.h @@ -20,6 +20,6 @@ struct quantize_block_args { }; -void *quantize_block(void *arguments); +void quantize_block(const quantize_block_args& args); #endif diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp index e28e7b2c2..478c1f4ff 100644 --- a/csrc/cpu_ops.cpp +++ b/csrc/cpu_ops.cpp @@ -1,6 +1,6 @@ #include -#include #include +#include using namespace BinSearch; @@ -31,12 +31,8 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long for(long long offset = 0; offset < num_blocks; offset+=thread_wave_size) { long long valid_chunks = num_blocks - offset >= thread_wave_size ? thread_wave_size : num_blocks - offset; - pthread_t *threads = (pthread_t *) malloc(sizeof(pthread_t) * valid_chunks); - - struct quantize_block_args **args = (quantize_block_args **) malloc(valid_chunks * sizeof(quantize_block_args *)); - - for(long long i = 0; i < valid_chunks; i++) - args[i] = (quantize_block_args *) malloc(sizeof(quantize_block_args)); + std::vector threads(valid_chunks); + std::vector args(valid_chunks); int chunks_processed = 0; for(long long block_idx = offset*blocksize; block_idx < n; block_idx += blocksize) @@ -44,30 +40,24 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long long long valid_items = n - block_idx >= blocksize ? blocksize : n - block_idx; long long block_end = block_idx + valid_items; - struct quantize_block_args *arg = args[chunks_processed]; - arg->bin_searcher = &bin_searcher; - arg->code = code; - arg->A = A; - arg->absmax = absmax; - arg->out = out; - arg->block_end = block_end; - arg->block_idx = block_idx; - arg->threadidx = block_idx / blocksize; - arg->blocksize = blocksize; - - pthread_create(&threads[chunks_processed], NULL, &quantize_block, (void *) arg); + struct quantize_block_args& arg = args[chunks_processed]; + arg.bin_searcher = &bin_searcher; + arg.code = code; + arg.A = A; + arg.absmax = absmax; + arg.out = out; + arg.block_end = block_end; + arg.block_idx = block_idx; + arg.threadidx = block_idx / blocksize; + arg.blocksize = blocksize; + + threads[chunks_processed] = std::thread([arg] { quantize_block(arg); }); chunks_processed += 1; if(chunks_processed == valid_chunks){ break; } } for (int i = 0; i < valid_chunks; i++) - int err = pthread_join(threads[i], NULL); - - free(threads); - for (int i = 0; i < valid_chunks; i++) - free(args[i]); - free(args); - + threads[i].join(); } } diff --git a/csrc/kernels.cu b/csrc/kernels.cu index 1ab8aa242..c2e2d7da7 100644 --- a/csrc/kernels.cu +++ b/csrc/kernels.cu @@ -3816,12 +3816,12 @@ template __global__ void kgemm_4bit_inference_naive(int M, int N template __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA); template __global__ void kExtractOutliers(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); -template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); +template __global__ void kspmm_coo_very_sparse_naive(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB); template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 0, COL32>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols); template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 1, COL32>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols); diff --git a/csrc/mps_kernels.metal b/csrc/mps_kernels.metal new file mode 100644 index 000000000..a5c8e35b2 --- /dev/null +++ b/csrc/mps_kernels.metal @@ -0,0 +1,117 @@ +#include +using namespace metal; + +#define HLF_MAX 65504 +#define TH 1024 +#define NUM 4 +#define NUM_BLOCK 4096 + +template +static unsigned char quantize_scalar( + float rand, + device float* code, + float x) +{ + int pivot = 127; + int upper_pivot = 255; + int lower_pivot = 0; + + float lower = -1.0f; + float upper = 1.0f; + + float val = code[pivot]; + // i>>=1 = {32, 16, 8, 4, 2, 1} + for(int i = 64; i > 0; i>>=1) + { + if(x > val) + { + lower_pivot = pivot; + lower = val; + pivot+=i; + } + else + { + upper_pivot = pivot; + upper = val; + pivot-=i; + } + val = code[pivot]; + } + + if(upper_pivot == 255) + upper = code[upper_pivot]; + if(lower_pivot == 0) + lower = code[lower_pivot]; + + if(!STOCHASTIC) + { + if(x > val) + { + float midpoint = (upper+val)*0.5f; + if(x > midpoint) + { + return upper_pivot; + } + else + return pivot; + } + else + { + float midpoint = (lower+val)*0.5f; + if(x < midpoint) + return lower_pivot; + else + return pivot; + } + } + else + { + if(x > val) + { + float dist_to_upper = fabs(upper-x); + float dist_full = upper-val; + if(rand >= dist_to_upper/dist_full) return upper_pivot; + else return pivot; + } + else + { + float dist_to_lower = fabs(lower-x); + float dist_full = val-lower; + if(rand >= dist_to_lower/dist_full) return lower_pivot; + else return pivot; + } + } +} + +kernel void quantize(device float* code [[buffer(0)]], + device float* A [[buffer(1)]], + device uchar* out [[buffer(2)]], + constant uint& n [[buffer(3)]], + uint id [[thread_position_in_grid]]) { + const uint n_full = (NUM_BLOCK * (n / NUM_BLOCK)) + (n % NUM_BLOCK == 0 ? 0 : NUM_BLOCK); + uint valid_items = (id / NUM_BLOCK + 1 == (n + NUM_BLOCK - 1) / NUM_BLOCK) ? n - (id / NUM_BLOCK * NUM_BLOCK) : NUM_BLOCK; + const uint base_idx = (id / NUM_BLOCK * NUM_BLOCK); + + float vals[NUM]; + uchar qvals[NUM]; + + for (uint i = base_idx; i < n_full; i += ((n + NUM_BLOCK - 1) / NUM_BLOCK) * NUM_BLOCK) { + valid_items = n - i > NUM_BLOCK ? NUM_BLOCK : n - i; + + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (uint j = 0; j < valid_items; j++) { + vals[j] = A[i + j]; + } + + for (uint j = 0; j < valid_items; j++) { + qvals[j] = quantize_scalar(0.0f, code, vals[j]); + } + + threadgroup_barrier(mem_flags::mem_threadgroup); + + for (uint j = 0; j < valid_items; j++) { + out[i + j] = qvals[j]; + } + } +} diff --git a/csrc/ops.cuh b/csrc/ops.cuh index f37b3b3af..cc7b59505 100644 --- a/csrc/ops.cuh +++ b/csrc/ops.cuh @@ -9,7 +9,9 @@ #include #include +#ifndef _MSC_VER #include +#endif #include #include diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.cpp similarity index 99% rename from csrc/pythonInterface.c rename to csrc/pythonInterface.cpp index 865e4b6d5..a6b348ca6 100644 --- a/csrc/pythonInterface.c +++ b/csrc/pythonInterface.cpp @@ -6,6 +6,9 @@ #if BUILD_CUDA #include #endif +#if BUILD_MPS +// #include +#endif #include // We cannot call templated code from C, so we wrap the template in a C compatible call here if necessary. @@ -412,6 +415,7 @@ extern "C" { gemm_4bit_inference_naive_fp32(m, n, k, A, B, absmax, datatype, out, lda, ldb, ldc, blocksize); } #endif + void cquantize_blockwise_cpu_fp32(float *code, float *A, float *absmax, unsigned char *out, long long blocksize, long long n){ quantize_cpu(code, A, absmax, out, blocksize, n); } void cdequantize_blockwise_cpu_fp32(float *code, unsigned char *A, float *absmax, float *out, long long blocksize, long long n){ dequantize_cpu(code, A, absmax, out, blocksize, n); } } diff --git a/include/Algo-Direct2.h b/include/Algo-Direct2.h index d5fa58d12..7f52fce14 100644 --- a/include/Algo-Direct2.h +++ b/include/Algo-Direct2.h @@ -52,6 +52,7 @@ struct AlgoVecBase::val private: typedef AlgoScalarBase base_t; +#ifdef USE_SSE2 FORCE_INLINE //NO_INLINE void resolve(const FVec& vz, const IVec& bidx, uint32 *pr) const @@ -135,6 +136,7 @@ struct AlgoVecBase::val pr[0] = u.ui32[0]; pr[1] = u.ui32[2]; } +#endif // USE_SSE2 #ifdef USE_AVX diff --git a/include/Portable.h b/include/Portable.h index 1710b0502..78599944e 100644 --- a/include/Portable.h +++ b/include/Portable.h @@ -4,10 +4,40 @@ #include #include +#if defined(__aarch64__) +#ifdef __CUDACC__ +#undef USE_NEON // Doesn't work with nvcc, undefined symbols +#else +#include +#undef USE_NEON // Not yet implemented +#endif +#undef USE_AVX // x86_64 only +#undef USE_AVX2 // x86_64 only +#undef USE_SSE2 // x86_64 only +#undef USE_SSE41 // x86_64 only +#undef USE_SSE42 // x86_64 only +#undef USE_FMA // x86_64 only +#ifdef USE_NEON +typedef float32x4_t __m128; +typedef int32x4_t __m128i; +typedef float64x2_t __m128d; +#else +typedef struct {float a; float b; float c; float d;} __m128; +typedef struct {int a; int b; int c; int d;} __m128i; +typedef struct {double a; double b;} __m128d; +#endif +#else +#undef USE_NEON // ARM64 only #ifdef __FMA__ #define USE_FMA #endif +#if !defined(__SSE2__) && !defined(_MSC_VER) +#error Compiler must support SSE2 +#endif +#define USE_SSE2 +#if defined(__aarch64__) +#else #ifdef __AVX2__ #define USE_AVX2 #endif @@ -24,7 +54,8 @@ #ifdef __SSE4_2__ #define USE_SSE42 #endif - +#endif +#endif #ifndef _MSC_VER #include @@ -50,7 +81,7 @@ typedef unsigned __int64 uint64; namespace Details { -#define myassert(cond, msg) if (!cond){ std::ostringstream os; os << "\nassertion failed: " << #cond << ", " << msg << "\n"; throw std::invalid_argument(os.str()); } +#define myassert(cond, msg) if (!(cond)){ std::ostringstream os; os << "\nassertion failed: " << #cond << ", " << msg << "\n"; throw std::invalid_argument(os.str()); } // log2 is not defined in VS2008 #if defined(_MSC_VER) diff --git a/include/SIMD.h b/include/SIMD.h index a2ac1a9ae..18a38dbfd 100644 --- a/include/SIMD.h +++ b/include/SIMD.h @@ -2,6 +2,46 @@ #include "Portable.h" +#ifdef USE_SSE2 +#include +#if defined(USE_AVX) || defined(USE_AVX2) +#include +#else +#ifdef USE_SSE41 +#include +#endif +#endif +#endif + +namespace BinSearch { +namespace Details { + +template +struct FTOITraits{}; + +template +struct FVec; + +template +struct IVec; + +template +struct FVec1; + +template <> struct InstrFloatTraits +{ + typedef __m128 vec_t; +}; + +template <> struct InstrFloatTraits +{ + typedef __m128d vec_t; +}; + +} +} + +#if !defined(__aarch64__) #ifdef USE_SSE42 #ifndef _MSC_VER #include @@ -26,29 +66,11 @@ FORCE_INLINE int popcnt32(int x32) } // namespace #endif -#if defined(USE_AVX) || defined(USE_AVX2) -#include -#else -#include -#ifdef USE_SSE41 -#include -#endif -#endif - #include "Type.h" namespace BinSearch { namespace Details { -template -struct FVec; - -template -struct IVec; - -template -struct FVec1; - template <> struct InstrIntTraits { typedef __m128i vec_t; @@ -64,8 +86,8 @@ template <> struct InstrFloatTraits typedef __m128d vec_t; }; -template -struct FTOITraits +template <> +struct FTOITraits { typedef IVec vec_t; }; @@ -285,9 +307,11 @@ FORCE_INLINE FVec operator- (const FVec& a, const FVec< FORCE_INLINE FVec operator* (const FVec& a, const FVec& b) { return _mm_mul_ps( a, b ); } FORCE_INLINE FVec operator/ (const FVec& a, const FVec& b) { return _mm_div_ps( a, b ); } FORCE_INLINE IVec ftoi (const FVec& a) { return _mm_cvttps_epi32(a); } +#ifndef __clang__ // Conflicts with builtin operator FORCE_INLINE IVec operator<= (const FVec& a, const FVec& b) { return _mm_castps_si128( _mm_cmple_ps( a, b ) ); } FORCE_INLINE IVec operator>= (const FVec& a, const FVec& b) { return _mm_castps_si128( _mm_cmpge_ps( a, b ) ); } FORCE_INLINE IVec operator< (const FVec& a, const FVec& b) { return _mm_castps_si128(_mm_cmplt_ps(a, b)); } +#endif #ifdef USE_FMA FORCE_INLINE FVec mulSub(const FVec& a, const FVec& b, const FVec& c) { return _mm_fmsub_ps(a, b, c); } #endif @@ -339,9 +363,11 @@ FORCE_INLINE FVec operator- (const FVec& a, const FVec FORCE_INLINE FVec operator* (const FVec& a, const FVec& b) { return _mm_mul_pd( a, b ); } FORCE_INLINE FVec operator/ (const FVec& a, const FVec& b) { return _mm_div_pd( a, b ); } FORCE_INLINE IVec ftoi (const FVec& a) { return _mm_cvttpd_epi32(a); } +#ifndef __clang__ // Conflicts with builtin operator FORCE_INLINE IVec operator<= (const FVec& a, const FVec& b) { return _mm_castpd_si128( _mm_cmple_pd( a, b ) ); } FORCE_INLINE IVec operator< (const FVec& a, const FVec& b) { return _mm_castpd_si128(_mm_cmplt_pd(a, b)); } FORCE_INLINE IVec operator>= (const FVec& a, const FVec& b) { return _mm_castpd_si128( _mm_cmpge_pd( a, b ) ); } +#endif #ifdef USE_FMA FORCE_INLINE FVec mulSub(const FVec& a, const FVec& b, const FVec& c ) { return _mm_fmsub_pd(a, b, c); } #endif @@ -560,3 +586,4 @@ FORCE_INLINE FVec mulSub(const FVec& a, const FVec