From 2cbef25ac2c3d0f22fcde983096d371748498a7c Mon Sep 17 00:00:00 2001
From: Rickard <rly@combination.se>
Date: Tue, 2 Jan 2024 14:48:54 +0100
Subject: [PATCH] Make native code portable and add GitHub workflow for
 building

---
 .github/workflows/python-package.yml          | 201 ++++++++++++++++++
 .gitignore                                    |  22 +-
 CMakeLists.txt                                | 121 +++++++++++
 Makefile                                      | 141 ------------
 csrc/common.cpp                               |  24 +--
 csrc/common.h                                 |   2 +-
 csrc/cpu_ops.cpp                              |  42 ++--
 csrc/kernels.cu                               |  12 +-
 csrc/mps_kernels.metal                        | 117 ++++++++++
 csrc/ops.cuh                                  |   2 +
 ...{pythonInterface.c => pythonInterface.cpp} |   4 +
 include/Algo-Direct2.h                        |   2 +
 include/Portable.h                            |  35 ++-
 include/SIMD.h                                |  67 ++++--
 include/Type.h                                |   2 +-
 requirements.txt                              |   8 +-
 setup.py                                      |   8 +-
 17 files changed, 593 insertions(+), 217 deletions(-)
 create mode 100644 .github/workflows/python-package.yml
 create mode 100644 CMakeLists.txt
 delete mode 100644 Makefile
 create mode 100644 csrc/mps_kernels.metal
 rename csrc/{pythonInterface.c => pythonInterface.cpp} (99%)

diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
new file mode 100644
index 000000000..c2bebf1d8
--- /dev/null
+++ b/.github/workflows/python-package.yml
@@ -0,0 +1,201 @@
+name: Python package
+
+on:
+  push:
+    branches: [ "*" ]
+  pull_request:
+    branches: [ master ]
+  release:
+    types: [ published ]
+
+jobs:
+  
+  ##
+  # This job matrix builds the non-CUDA versions of the libraries for all supported platforms.
+  ##
+  build-shared-libs:
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+        arch: [x86_64, aarch64]
+        exclude:
+          - os: windows-latest # This probably requres arm64 Windows agents
+            arch: aarch64
+    runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
+    steps:
+      # Check out code
+    - uses: actions/checkout@v3
+      # On Linux we use CMake within Docker
+    - name: Setup cmake
+      uses: jwlawson/actions-setup-cmake@v1.13
+      with:
+        cmake-version: '3.26.x'
+    - name: Add msbuild to PATH
+      uses: microsoft/setup-msbuild@v1.1
+      if: ${{ startsWith(matrix.os, 'windows') }}
+      # Compile C++ code
+    - name: Build C++
+      shell: bash
+      run: |
+        set -ex
+        build_os=${{ matrix.os }}
+        build_arch=${{ matrix.arch }}
+        ( git clone https://github.com/NVlabs/cub ./dependencies/cub; cd dependencies/cub; git checkout 1.11.0 )
+        if [ ${build_os:0:6} == ubuntu -a ${build_arch} == aarch64 ]; then
+          # Allow cross-compile om aarch64
+          sudo apt-get install -y gcc-aarch64-linux-gnu binutils-aarch64-linux-gnu
+        fi
+        if [ ${build_os:0:5} == macos -a ${build_arch} == aarch64 ]; then
+          cmake -DCMAKE_OSX_ARCHITECTURES=arm64 -DENABLE_CUDA=OFF -DENABLE_MPS=ON .
+        else
+          cmake -DENABLE_CUDA=OFF .
+        fi
+        if [ ${build_os:0:7} == windows ]; then
+          pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release"
+        else
+          make
+        fi
+        mkdir -p output/${{ matrix.os }}/${{ matrix.arch }}
+        ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ )
+    - name: Upload build artifact
+      uses: actions/upload-artifact@v3
+      with:
+        name: shared_library
+        path: output/*
+        retention-days: 7
+  ##
+  # This job matrix builds the CUDA versions of the libraries for platforms that support CUDA (Linux x64/aarch64 + Windows x64)
+  ##
+  build-shared-libs-cuda:
+    strategy:
+      matrix:
+        os: [ubuntu-latest, windows-latest]
+        arch: [x86_64, aarch64]
+        cuda_version: ['12.1.0']
+        exclude:
+          - os: windows-latest # This probably requres arm64 Windows agents
+            arch: aarch64
+    runs-on: ${{ matrix.os }} # One day, we could run them on native agents. Azure supports this now but it's planned only for Q3 2023 for hosted agents
+    steps:
+      # Check out code
+    - uses: actions/checkout@v3
+      # Linux: We use Docker to build cross platform Cuda (aarch64 is built in emulation)
+    - name: Set up Docker multiarch
+      if: startsWith(matrix.os, 'ubuntu')
+      uses: docker/setup-qemu-action@v2
+      # On Linux we use CMake within Docker
+    - name: Setup cmake
+      if: ${{ !startsWith(matrix.os, 'linux') }}
+      uses: jwlawson/actions-setup-cmake@v1.13
+      with:
+        cmake-version: '3.26.x'
+      # Windows: We install Cuda on the agent (slow)
+    - uses: Jimver/cuda-toolkit@v0.2.10
+      if: startsWith(matrix.os, 'windows')
+      id: cuda-toolkit
+      with:
+        cuda: ${{ matrix.cuda_version }}
+        method: 'local'
+        #sub-packages: '["nvcc","cudart","nvrtc_dev","cublas_dev","cusparse_dev","visual_studio_integration"]'
+    - name: Add msbuild to PATH
+      uses: microsoft/setup-msbuild@v1.1
+      if: ${{ startsWith(matrix.os, 'windows') }}
+      # Compile C++ code
+    - name: Build C++
+      shell: bash
+      run: |
+        set -ex
+        build_os=${{ matrix.os }}
+        build_arch=${{ matrix.arch }}
+        ( git clone https://github.com/NVlabs/cub ./dependencies/cub; cd dependencies/cub; git checkout 1.11.0 )
+        if [ ${build_os:0:6} == ubuntu ]; then 
+          image=nvidia/cuda:${{ matrix.cuda_version }}-devel-ubuntu22.04
+          echo "Using image $image"
+          docker run --platform linux/$build_arch -i -w /src -v $PWD:/src $image sh -c \
+            "apt-get update \
+            && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends cmake \
+            && cmake -DENABLE_CUDA=ON . \
+            && make"
+        else
+          cmake -DENABLE_CUDA=ON .
+          pwsh -Command "msbuild bitsandbytes.vcxproj /property:Configuration=Release"
+        fi
+        mkdir -p output/${{ matrix.os }}/${{ matrix.arch }}
+        ( shopt -s nullglob && cp bitsandbytes/*.{so,dylib,dll} output/${{ matrix.os }}/${{ matrix.arch }}/ )
+    - name: Upload build artifact
+      uses: actions/upload-artifact@v3
+      with:
+        name: shared_library
+        path: output/*
+        retention-days: 7
+  build-wheels:
+    needs:
+    - build-shared-libs
+    - build-shared-libs-cuda
+    strategy:
+      matrix:
+        os: [ubuntu-latest, macos-latest, windows-latest]
+        python-version: ["3.8", "3.9", "3.10", "3.11"]
+        arch: [x86_64, aarch64]
+        exclude:
+          - os: windows-latest # This probably requres arm64 Windows agents
+            arch: aarch64
+    runs-on: ${{ matrix.os }}
+    steps:
+      # Check out code
+    - uses: actions/checkout@v3
+      # Download shared libraries
+    - name: Download build artifact
+      uses: actions/download-artifact@v3
+      with:
+        name: shared_library
+        path: output/
+    - name: Copy correct platform shared library
+      shell: bash
+      run: |
+        cp output/${{ matrix.os }}/${{ matrix.arch }}/* bitsandbytes/
+      # Compile C++ code
+    - name: Set up Python ${{ matrix.python-version }}
+      uses: actions/setup-python@v4
+      with:
+        python-version: ${{ matrix.python-version }}
+      # 
+    - name: Install Python dependencies
+      shell: bash
+      run: |
+        pip install -r requirements.txt
+    # TODO: How to run CUDA tests on GitHub actions?
+    #- name: Run unit tests
+    #  if: ${{ matrix.arch == 'x86_64' }} # Tests are too slow to run in emulation. Wait for real aarch64 agents
+    #  run: |
+    #    PYTHONPATH=. pytest --log-cli-level=DEBUG tests
+    - name: Build wheel
+      shell: bash
+      run: |
+        python setup.py bdist_wheel
+    - name: Upload build artifact
+      uses: actions/upload-artifact@v3
+      with:
+        name: bdist_wheel
+        path: dist/bitsandbytes-*.whl
+        retention-days: 7
+  publish:
+    needs: build-wheels
+    runs-on: ubuntu-latest
+    steps:
+    - uses: actions/checkout@v2
+    - name: Build dist
+      run: |
+        python setup.py sdist
+    - name: Download build artifact
+      uses: actions/download-artifact@v3
+      with:
+        name: bdist_wheel
+        path: dist/
+    - run: |
+        ls -lR dist/
+    - name: Publish to PyPi
+      if: startsWith(github.ref, 'refs/tags')
+      uses: pypa/gh-action-pypi-publish@release/v1
+      with:
+        password: ${{ secrets.pypi }}
diff --git a/.gitignore b/.gitignore
index 2f929968b..202dcb13d 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,9 +2,26 @@
 __pycache__/
 *.py[cod]
 *$py.class
-
-# C extensions
 *.so
+*.dll
+*.dylib
+*.o
+*.obj
+*.air
+*.metallib
+
+# CMake generated files
+CMakeCache.txt
+CMakeScripts/
+cmake_install.cmake
+Makefile
+CMakeFiles/
+*.sln
+*.vcxproj*
+*.xcodeproj/
+bitsandbytes.dir/
+Debug/
+Release/
 
 # Distribution / packaging
 .Python
@@ -133,4 +150,5 @@ dmypy.json
 
 dependencies
 cuda_build
+output/
 .vscode/*
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 000000000..d6e269d15
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,121 @@
+cmake_minimum_required(VERSION 3.22.1)
+
+option(ENABLE_CUDA "Build for CUDA (Nvidia)" OFF)
+option(ENABLE_MPS "Build for Metal Performance Shaders (Apple)" OFF)
+
+if(ENABLE_CUDA)
+    if(APPLE)
+        message(FATAL_ERROR "CUDA is not supported on macOS" )
+    endif()
+    option(NO_CUBLASLT "Don't use CUBLAST" OFF)
+    if(NO_CUBLASLT)
+        set(CMAKE_CUDA_ARCHITECTURES 50 52 60 61 70 72)
+    else()
+        set(CMAKE_CUDA_ARCHITECTURES 75 80 86 89 90)
+    endif()
+endif()
+
+if(ENABLE_CUDA)
+    message("Building CUDA support for ${CMAKE_CUDA_ARCHITECTURES}")
+    # Find CUDA tools if we are compiling with CUDA
+    find_package(CUDAToolkit REQUIRED)
+    if(NO_CUBLASLT)
+        set(LIBSUFFIX "cuda${CUDAToolkit_VERSION_MAJOR}${CUDAToolkit_VERSION_MINOR}_nocublaslt")
+    else()
+        set(LIBSUFFIX "cuda${CUDAToolkit_VERSION_MAJOR}${CUDAToolkit_VERSION_MINOR}")
+    endif()
+
+    project(bitsandbytes LANGUAGES CXX CUDA)
+    add_compile_definitions(BUILD_CUDA)
+    set(CMAKE_CUDA_STANDARD 14)
+    set(CMAKE_CUDA_STANDARD_REQUIRED ON)
+    set(GPU_SOURCES csrc/ops.cu csrc/kernels.cu)
+elseif(ENABLE_MPS)
+    if(NOT APPLE)
+        message(FATAL_ERROR "MPS is only supported on macOS" )
+    endif()
+    message("Building MPS support")
+    set(LIBSUFFIX "mps")
+    project(bitsandbytes LANGUAGES CXX OBJCXX)
+    add_compile_definitions(BUILD_MPS)
+    set(METAL_SOURCES csrc/mps_kernels.metal)
+    file(MAKE_DIRECTORY "build")
+    add_custom_command(OUTPUT "bitsandbytes/bitsandbytes.metallib"  
+                   COMMAND xcrun metal -c -o "build/bitsandbytes.air" ${METAL_SOURCES}
+                   COMMAND xcrun metallib "build/bitsandbytes.air" -o "bitsandbytes/bitsandbytes.metallib"
+                   DEPENDS "${METAL_SOURCES}"  
+                   COMMENT "Compiling Metal kernels"  
+                   VERBATIM)
+    add_custom_target(metallib DEPENDS "bitsandbytes/bitsandbytes.metallib")
+    set(GPU_SOURCES csrc/mps_ops.mm)
+else()
+    message("Building with CPU only")
+    set(LIBSUFFIX "cpu")
+
+    project(bitsandbytes LANGUAGES CXX)
+    set(GPU_SOURCES)
+endif()
+
+if(APPLE)
+  set(CMAKE_OSX_DEPLOYMENT_TARGET 13.1)
+endif()
+set(CMAKE_CXX_STANDARD 14)
+set(CXX_STANDARD_REQUIRED C++14)
+
+if(WIN32)
+    # Mute warnings
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -diag-suppress=177")
+
+    # Enable fast math on VC++
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /fp:fast")
+
+    # Export all symbols
+    set(CMAKE_WINDOWS_EXPORT_ALL_SYMBOLS ON)
+endif()
+
+# Weird MSVC hacks
+if(MSVC)
+    set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}  /NODEFAULTLIB:msvcprtd /NODEFAULTLIB:MSVCRTD /NODEFAULTLIB:LIBCMT")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /arch:AVX2")
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /arch:AVX2")
+endif()
+
+# Add csrc files
+add_library(bitsandbytes SHARED
+	${GPU_SOURCES}
+	csrc/common.cpp
+	csrc/cpu_ops.cpp
+	csrc/pythonInterface.cpp)
+
+target_include_directories(bitsandbytes PUBLIC
+    ${CMAKE_CUDA_TOOLKIT_INCLUDE_DIRECTORIES}
+    ${CMAKE_CURRENT_SOURCE_DIR}/csrc
+    ${CMAKE_CURRENT_SOURCE_DIR}/include)
+
+if(ENABLE_CUDA)
+    target_include_directories(bitsandbytes PUBLIC ${CUDA_TOOLKIT_ROOT_DIR}/include)
+
+    set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --use_fast_math")
+
+    set_target_properties(
+        bitsandbytes
+        PROPERTIES
+        CUDA_SEPARABLE_COMPILATION ON)
+
+    target_link_libraries(bitsandbytes CUDA::cudart CUDA::cublas CUDA::cublasLt CUDA::cusparse)
+endif()
+if(ENABLE_MPS)
+    add_dependencies(bitsandbytes metallib)
+    target_link_libraries(bitsandbytes objc "-framework Foundation" "-framework Metal" "-framework MetalPerformanceShaders" "-framework MetalPerformanceShadersGraph")
+endif()
+
+set_target_properties(bitsandbytes PROPERTIES OUTPUT_NAME "bitsandbytes_${LIBSUFFIX}")
+# Set the output name of the CUDA library
+if(MSVC)
+set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_RELEASE bitsandbytes)
+set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY_DEBUG bitsandbytes)
+set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_RELEASE bitsandbytes)
+set_target_properties(bitsandbytes PROPERTIES RUNTIME_OUTPUT_DIRECTORY_DEBUG bitsandbytes)
+endif()
+
+set_target_properties(bitsandbytes PROPERTIES LIBRARY_OUTPUT_DIRECTORY bitsandbytes)
diff --git a/Makefile b/Makefile
deleted file mode 100644
index 5f997a122..000000000
--- a/Makefile
+++ /dev/null
@@ -1,141 +0,0 @@
-MKFILE_PATH := $(abspath $(lastword $(MAKEFILE_LIST)))
-ROOT_DIR := $(patsubst %/,%,$(dir $(MKFILE_PATH)))
-
-GPP:= /usr/bin/g++
-#GPP:= /sw/gcc/11.2.0/bin/g++
-ifeq ($(CUDA_HOME),)
-	CUDA_HOME:= $(shell which nvcc | rev | cut -d'/' -f3- | rev)
-endif
-
-ifndef CUDA_VERSION
-ifneq ($(MAKECMDGOALS),clean)
-$(warning WARNING: CUDA_VERSION not set. Call make with CUDA string, for example: make cuda11x CUDA_VERSION=115 or make cpuonly CUDA_VERSION=CPU)
-CUDA_VERSION:=
-endif
-endif
-
-
-
-NVCC := $(CUDA_HOME)/bin/nvcc
-
-###########################################
-
-CSRC := $(ROOT_DIR)/csrc
-BUILD_DIR:= $(ROOT_DIR)/build
-
-FILES_CUDA := $(CSRC)/ops.cu $(CSRC)/kernels.cu
-FILES_CPP := $(CSRC)/common.cpp $(CSRC)/cpu_ops.cpp $(CSRC)/pythonInterface.c
-
-INCLUDE :=  -I $(CUDA_HOME)/include -I $(ROOT_DIR)/csrc -I $(CONDA_PREFIX)/include -I $(ROOT_DIR)/include
-LIB := -L $(CUDA_HOME)/lib64 -lcudart -lcublas -lcublasLt -lcusparse -L $(CONDA_PREFIX)/lib
-
-# NVIDIA NVCC compilation flags
-COMPUTE_CAPABILITY += -gencode arch=compute_50,code=sm_50 # Maxwell
-COMPUTE_CAPABILITY += -gencode arch=compute_52,code=sm_52 # Maxwell
-COMPUTE_CAPABILITY += -gencode arch=compute_60,code=sm_60 # Pascal
-COMPUTE_CAPABILITY += -gencode arch=compute_61,code=sm_61 # Pascal
-COMPUTE_CAPABILITY += -gencode arch=compute_70,code=sm_70 # Volta
-
-CC_KEPLER := -gencode arch=compute_35,code=sm_35 # Kepler
-CC_KEPLER += -gencode arch=compute_37,code=sm_37 # Kepler
-
-# Later versions of CUDA support the new architectures
-CC_CUDA11x := -gencode arch=compute_75,code=sm_75
-CC_CUDA11x += -gencode arch=compute_80,code=sm_80
-CC_CUDA11x += -gencode arch=compute_86,code=sm_86
-
-
-CC_cublasLt110 := -gencode arch=compute_75,code=sm_75
-CC_cublasLt110 += -gencode arch=compute_80,code=sm_80
-
-CC_cublasLt111 := -gencode arch=compute_75,code=sm_75
-CC_cublasLt111 += -gencode arch=compute_80,code=sm_80
-CC_cublasLt111 += -gencode arch=compute_86,code=sm_86
-
-CC_ADA_HOPPER := -gencode arch=compute_89,code=sm_89
-CC_ADA_HOPPER += -gencode arch=compute_90,code=sm_90
-
-
-all: $(BUILD_DIR) env
-	$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
-	$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
-
-cuda110_nomatmul_kepler: $(BUILD_DIR) env
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
-
-cuda11x_nomatmul_kepler: $(BUILD_DIR) env
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_KEPLER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_KEPLER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
-
-
-cuda110_nomatmul: $(BUILD_DIR) env
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
-
-cuda11x_nomatmul: $(BUILD_DIR) env
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
-
-cuda118_nomatmul: $(BUILD_DIR) env
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER)  -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
-
-cuda12x_nomatmul: $(BUILD_DIR) env
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR) -D NO_CUBLASLT
-	$(NVCC) $(COMPUTE_CAPABILITY) $(CC_CUDA11x) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION)_nocublaslt.so $(LIB)
-
-cuda110: $(BUILD_DIR) env
-	$(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
-	$(NVCC) $(CC_cublasLt110) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
-
-cuda11x: $(BUILD_DIR) env
-	$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
-	$(NVCC) $(CC_cublasLt111) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
-
-cuda118: $(BUILD_DIR) env
-	$(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
-	$(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
-
-cuda12x: $(BUILD_DIR) env
-	$(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' --use_fast_math -Xptxas=-v -dc $(FILES_CUDA) $(INCLUDE) $(LIB) --output-directory $(BUILD_DIR)
-	$(NVCC) $(CC_cublasLt111) $(CC_ADA_HOPPER) -Xcompiler '-fPIC' -dlink $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o -o $(BUILD_DIR)/link.o
-	$(GPP) -std=c++14 -DBUILD_CUDA -shared -fPIC $(INCLUDE) $(BUILD_DIR)/ops.o $(BUILD_DIR)/kernels.o $(BUILD_DIR)/link.o $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cuda$(CUDA_VERSION).so $(LIB)
-
-cpuonly: $(BUILD_DIR) env
-	$(GPP) -std=c++14 -shared -fPIC -I $(ROOT_DIR)/csrc -I $(ROOT_DIR)/include $(FILES_CPP) -o ./bitsandbytes/libbitsandbytes_cpu.so
-
-env:
-	@echo "ENVIRONMENT"
-	@echo "============================"
-	@echo "CUDA_VERSION: $(CUDA_VERSION)"
-	@echo "============================"
-	@echo "NVCC path: $(NVCC)"
-	@echo "GPP path: $(GPP) VERSION: `$(GPP) --version | head -n 1`"
-	@echo "CUDA_HOME: $(CUDA_HOME)"
-	@echo "CONDA_PREFIX: $(CONDA_PREFIX)"
-	@echo "PATH: $(PATH)"
-	@echo "LD_LIBRARY_PATH: $(LD_LIBRARY_PATH)"
-	@echo "============================"
-
-$(BUILD_DIR):
-	mkdir -p build
-	mkdir -p dependencies
-
-$(ROOT_DIR)/dependencies/cub:
-	git clone https://github.com/NVlabs/cub $(ROOT_DIR)/dependencies/cub
-	cd dependencies/cub; git checkout 1.11.0
-
-clean:
-	rm -rf build/* *.egg*
-	rm -f bitsandbytes/libbitsandbytes*.so
diff --git a/csrc/common.cpp b/csrc/common.cpp
index 52f029917..0a9601689 100644
--- a/csrc/common.cpp
+++ b/csrc/common.cpp
@@ -1,39 +1,35 @@
 #include <common.h>
 #include <float.h>
 
-void *quantize_block(void *arguments) {
+void quantize_block(const quantize_block_args& args) {
     // 1. find absmax in block
     // 2. divide input value by absmax to normalize into [-1.0, 1.0]
     // 3. do binary search to find the closest value
     // 4. check minimal distance
     // 5. store index
 
-    struct quantize_block_args *args = (quantize_block_args *) arguments;
-
     // 1. find absmax in block
     float absmax_block = -FLT_MAX;
-    for (long long i = args->block_idx; i < args->block_end; i++)
-        absmax_block = fmax(absmax_block, fabs(args->A[i]));
+    for (long long i = args.block_idx; i < args.block_end; i++)
+        absmax_block = fmax(absmax_block, fabs(args.A[i]));
 
-    args->absmax[args->block_idx / args->blocksize] = absmax_block;
+    args.absmax[args.block_idx / args.blocksize] = absmax_block;
 
-    for (long long i = args->block_idx; i < args->block_end; i++) {
+    for (long long i = args.block_idx; i < args.block_end; i++) {
         // 2. divide input value by absmax to normalize into [-1.0, 1.0]
         // 3. do binary search to find the closest value
-        float normed_value = args->A[i] / absmax_block;
-        long long idx = args->bin_searcher->scalar(normed_value);
+        float normed_value = args.A[i] / absmax_block;
+        long long idx = args.bin_searcher->scalar(normed_value);
 
         // 4. check minimal distance
         // The binary search returns always the value to the left, which might not be the closest value
         if (idx < 255) {
-            float dist_left = fabs(normed_value - (args->code[idx]));
-            float dist_right = fabs(normed_value - (args->code[idx + 1]));
+            float dist_left = fabs(normed_value - (args.code[idx]));
+            float dist_right = fabs(normed_value - (args.code[idx + 1]));
             if (dist_right < dist_left) { idx += 1; }
         }
 
         // 5. store index
-        args->out[i] = (unsigned char) idx;
+        args.out[i] = (unsigned char) idx;
     }
-
-    return NULL;
 }
diff --git a/csrc/common.h b/csrc/common.h
index c99034e78..e513f2875 100644
--- a/csrc/common.h
+++ b/csrc/common.h
@@ -20,6 +20,6 @@ struct quantize_block_args {
 };
 
 
-void *quantize_block(void *arguments);
+void quantize_block(const quantize_block_args& args);
 
 #endif
diff --git a/csrc/cpu_ops.cpp b/csrc/cpu_ops.cpp
index e28e7b2c2..478c1f4ff 100644
--- a/csrc/cpu_ops.cpp
+++ b/csrc/cpu_ops.cpp
@@ -1,6 +1,6 @@
 #include <BinSearch.h>
-#include <pthread.h>
 #include <common.h>
+#include <thread>
 
 using namespace BinSearch;
 
@@ -31,12 +31,8 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long
     for(long long offset = 0; offset < num_blocks; offset+=thread_wave_size)
     {
       long long valid_chunks = num_blocks - offset >= thread_wave_size ? thread_wave_size : num_blocks - offset;
-      pthread_t *threads = (pthread_t *) malloc(sizeof(pthread_t) * valid_chunks);
-
-      struct quantize_block_args **args = (quantize_block_args **) malloc(valid_chunks * sizeof(quantize_block_args *));
-
-      for(long long i = 0; i < valid_chunks; i++)
-          args[i] = (quantize_block_args *) malloc(sizeof(quantize_block_args));
+      std::vector<std::thread> threads(valid_chunks);
+      std::vector<quantize_block_args> args(valid_chunks);
 
       int chunks_processed = 0;
       for(long long block_idx = offset*blocksize; block_idx < n; block_idx += blocksize)
@@ -44,30 +40,24 @@ void quantize_cpu(float *code, float *A, float *absmax, unsigned char *out, long
           long long valid_items = n - block_idx >= blocksize ? blocksize : n - block_idx;
           long long block_end = block_idx + valid_items;
 
-          struct quantize_block_args *arg = args[chunks_processed];
-          arg->bin_searcher = &bin_searcher;
-          arg->code = code;
-          arg->A = A;
-          arg->absmax = absmax;
-          arg->out = out;
-          arg->block_end = block_end;
-          arg->block_idx = block_idx;
-          arg->threadidx = block_idx / blocksize;
-          arg->blocksize = blocksize;
-
-          pthread_create(&threads[chunks_processed], NULL, &quantize_block, (void *) arg);
+          struct quantize_block_args& arg = args[chunks_processed];
+          arg.bin_searcher = &bin_searcher;
+          arg.code = code;
+          arg.A = A;
+          arg.absmax = absmax;
+          arg.out = out;
+          arg.block_end = block_end;
+          arg.block_idx = block_idx;
+          arg.threadidx = block_idx / blocksize;
+          arg.blocksize = blocksize;
+
+          threads[chunks_processed] = std::thread([arg] { quantize_block(arg); });
           chunks_processed += 1;
           if(chunks_processed == valid_chunks){ break; }
       }
 
       for (int i = 0; i < valid_chunks; i++)
-          int err = pthread_join(threads[i], NULL);
-
-      free(threads);
-      for (int i = 0; i < valid_chunks; i++)
-          free(args[i]);
-      free(args);
-
+          threads[i].join();
     }
 
 }
diff --git a/csrc/kernels.cu b/csrc/kernels.cu
index 1ab8aa242..c2e2d7da7 100644
--- a/csrc/kernels.cu
+++ b/csrc/kernels.cu
@@ -3816,12 +3816,12 @@ template __global__ void kgemm_4bit_inference_naive<float, 128, 32>(int M, int N
 template __global__ void kExtractOutliers<COL_TURING>(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA);
 template __global__ void kExtractOutliers<COL_AMPERE>(char *A, int *idx, char *out, int idx_size, int rowsA, int colsA, int tiledRowsA, int tiledColsA);
 
-template __global__ void kspmm_coo_very_sparse_naive<half, 8, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
-template __global__ void kspmm_coo_very_sparse_naive<half, 16, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
-template __global__ void kspmm_coo_very_sparse_naive<half, 32, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
-template __global__ void kspmm_coo_very_sparse_naive<signed char, 8, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
-template __global__ void kspmm_coo_very_sparse_naive<signed char, 16, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
-template __global__ void kspmm_coo_very_sparse_naive<signed char, 32, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float *dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template __global__ void kspmm_coo_very_sparse_naive<half, 8, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template __global__ void kspmm_coo_very_sparse_naive<half, 16, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template __global__ void kspmm_coo_very_sparse_naive<half, 32, 16>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, half *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template __global__ void kspmm_coo_very_sparse_naive<signed char, 8, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template __global__ void kspmm_coo_very_sparse_naive<signed char, 16, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
+template __global__ void kspmm_coo_very_sparse_naive<signed char, 32, 8>(int *max_count, int *max_idx, int *offset_rowidx, int *rowidx, int *colidx, half *values, signed char *B, half *out, float * __restrict__ const dequant_stats, int nnz, int rowsA, int rowsB, int colsB);
 
 template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 0, COL32>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols);
 template __global__ void kTransformRowToFormat<256, 8, 32, 32*8, 1, COL32>(char *__restrict__ const A, char *out, int rows, int cols, int tiledCols, int outRows, int outCols);
diff --git a/csrc/mps_kernels.metal b/csrc/mps_kernels.metal
new file mode 100644
index 000000000..a5c8e35b2
--- /dev/null
+++ b/csrc/mps_kernels.metal
@@ -0,0 +1,117 @@
+#include <metal_stdlib>
+using namespace metal;
+
+#define HLF_MAX 65504
+#define TH 1024
+#define NUM 4
+#define NUM_BLOCK 4096
+
+template<bool STOCHASTIC>
+static unsigned char quantize_scalar(
+  float rand,
+  device float* code,
+  float x)
+{
+    int pivot = 127;
+    int upper_pivot = 255;
+    int lower_pivot = 0;
+
+    float lower = -1.0f;
+    float upper = 1.0f;
+
+    float val = code[pivot];
+    // i>>=1 = {32, 16, 8, 4, 2, 1}
+    for(int i = 64; i > 0; i>>=1)
+    {
+        if(x > val)
+        {
+            lower_pivot = pivot;
+            lower = val;
+            pivot+=i;
+        }
+        else
+        {
+            upper_pivot = pivot;
+            upper = val;
+            pivot-=i;
+        }
+        val = code[pivot];
+    }
+
+    if(upper_pivot == 255)
+        upper = code[upper_pivot];
+    if(lower_pivot == 0)
+        lower = code[lower_pivot];
+
+    if(!STOCHASTIC)
+    {
+      if(x > val)
+      {
+        float midpoint = (upper+val)*0.5f;
+        if(x > midpoint)
+        {
+          return upper_pivot;
+        }
+        else
+          return pivot;
+      }
+      else
+      {
+        float midpoint = (lower+val)*0.5f;
+        if(x < midpoint)
+          return lower_pivot;
+        else
+          return pivot;
+      }
+    }
+    else
+    {
+      if(x > val)
+      {
+        float dist_to_upper = fabs(upper-x);
+        float dist_full = upper-val;
+        if(rand >= dist_to_upper/dist_full) return upper_pivot;
+        else return pivot;
+      }
+      else
+      {
+        float dist_to_lower = fabs(lower-x);
+        float dist_full = val-lower;
+        if(rand >= dist_to_lower/dist_full) return lower_pivot;
+        else return pivot;
+      }
+    }
+}
+
+kernel void quantize(device float* code [[buffer(0)]],  
+                      device float* A [[buffer(1)]],  
+                      device uchar* out [[buffer(2)]],  
+                      constant uint& n [[buffer(3)]],  
+                      uint id [[thread_position_in_grid]]) {  
+  const uint n_full = (NUM_BLOCK * (n / NUM_BLOCK)) + (n % NUM_BLOCK == 0 ? 0 : NUM_BLOCK);  
+  uint valid_items = (id / NUM_BLOCK + 1 == (n + NUM_BLOCK - 1) / NUM_BLOCK) ? n - (id / NUM_BLOCK * NUM_BLOCK) : NUM_BLOCK;  
+  const uint base_idx = (id / NUM_BLOCK * NUM_BLOCK);  
+  
+  float vals[NUM];  
+  uchar qvals[NUM];  
+  
+  for (uint i = base_idx; i < n_full; i += ((n + NUM_BLOCK - 1) / NUM_BLOCK) * NUM_BLOCK) {  
+    valid_items = n - i > NUM_BLOCK ? NUM_BLOCK : n - i;  
+  
+    threadgroup_barrier(mem_flags::mem_threadgroup);  
+  
+    for (uint j = 0; j < valid_items; j++) {  
+      vals[j] = A[i + j];  
+    }  
+  
+    for (uint j = 0; j < valid_items; j++) {  
+      qvals[j] = quantize_scalar<false>(0.0f, code, vals[j]);
+    }  
+  
+    threadgroup_barrier(mem_flags::mem_threadgroup);  
+  
+    for (uint j = 0; j < valid_items; j++) {  
+      out[i + j] = qvals[j];  
+    }  
+  }  
+}
diff --git a/csrc/ops.cuh b/csrc/ops.cuh
index f37b3b3af..cc7b59505 100644
--- a/csrc/ops.cuh
+++ b/csrc/ops.cuh
@@ -9,7 +9,9 @@
 
 #include <stdio.h>
 #include <iostream>
+#ifndef _MSC_VER
 #include <unistd.h>
+#endif
 #include <assert.h>
 
 #include <cuda_runtime_api.h>
diff --git a/csrc/pythonInterface.c b/csrc/pythonInterface.cpp
similarity index 99%
rename from csrc/pythonInterface.c
rename to csrc/pythonInterface.cpp
index 865e4b6d5..a6b348ca6 100644
--- a/csrc/pythonInterface.c
+++ b/csrc/pythonInterface.cpp
@@ -6,6 +6,9 @@
 #if BUILD_CUDA
 #include <ops.cuh>
 #endif
+#if BUILD_MPS
+// #include <mps_ops.h>
+#endif
 #include <cpu_ops.h>
 
 // We cannot call templated code from C, so we wrap the template in a C compatible call here if necessary.
@@ -412,6 +415,7 @@ extern "C"
 	{ gemm_4bit_inference_naive_fp32(m, n, k, A, B, absmax,  datatype, out, lda, ldb, ldc, blocksize); }
 
 #endif
+
 	void cquantize_blockwise_cpu_fp32(float *code, float *A, float *absmax, unsigned char *out, long long blocksize, long long n){ quantize_cpu(code, A, absmax, out, blocksize, n); }
 	void cdequantize_blockwise_cpu_fp32(float *code, unsigned char *A, float *absmax, float *out, long long blocksize, long long n){ dequantize_cpu(code, A, absmax, out, blocksize, n); }
 }
diff --git a/include/Algo-Direct2.h b/include/Algo-Direct2.h
index d5fa58d12..7f52fce14 100644
--- a/include/Algo-Direct2.h
+++ b/include/Algo-Direct2.h
@@ -52,6 +52,7 @@ struct AlgoVecBase<I, T, A, typename std::enable_if<DirectAux::IsDirect2<A>::val
 private:
     typedef AlgoScalarBase<T, A> base_t;
 
+#ifdef USE_SSE2
     FORCE_INLINE
         //NO_INLINE
         void resolve(const FVec<SSE, float>& vz, const IVec<SSE, float>& bidx, uint32 *pr) const
@@ -135,6 +136,7 @@ struct AlgoVecBase<I, T, A, typename std::enable_if<DirectAux::IsDirect2<A>::val
         pr[0] = u.ui32[0];
         pr[1] = u.ui32[2];
     }
+#endif // USE_SSE2
 
 #ifdef USE_AVX
 
diff --git a/include/Portable.h b/include/Portable.h
index 1710b0502..78599944e 100644
--- a/include/Portable.h
+++ b/include/Portable.h
@@ -4,10 +4,40 @@
 #include <stdexcept>
 #include <sstream>
 
+#if defined(__aarch64__)
+#ifdef __CUDACC__
+#undef USE_NEON // Doesn't work with nvcc, undefined symbols
+#else
+#include <arm_neon.h>
+#undef USE_NEON // Not yet implemented
+#endif
+#undef USE_AVX // x86_64 only
+#undef USE_AVX2 // x86_64 only
+#undef USE_SSE2 // x86_64 only
+#undef USE_SSE41 // x86_64 only
+#undef USE_SSE42 // x86_64 only
+#undef USE_FMA // x86_64 only
+#ifdef USE_NEON
+typedef float32x4_t __m128;
+typedef int32x4_t __m128i;
+typedef float64x2_t __m128d;
+#else
+typedef struct {float a; float b; float c; float d;} __m128;
+typedef struct {int a; int b; int c; int d;} __m128i;
+typedef struct {double a; double b;} __m128d;
+#endif
+#else
+#undef USE_NEON // ARM64 only
 #ifdef __FMA__
 #define USE_FMA
 #endif
+#if !defined(__SSE2__) && !defined(_MSC_VER)
+#error Compiler must support SSE2
+#endif
+#define USE_SSE2
 
+#if defined(__aarch64__)
+#else
 #ifdef __AVX2__
 #define USE_AVX2
 #endif
@@ -24,7 +54,8 @@
 #ifdef __SSE4_2__
 #define USE_SSE42
 #endif
-
+#endif
+#endif
 
 #ifndef _MSC_VER
 #include <stdint.h>
@@ -50,7 +81,7 @@ typedef unsigned __int64  uint64;
 
 namespace Details {
 
-#define myassert(cond, msg) if (!cond){ std::ostringstream os; os << "\nassertion failed: " << #cond << ", " << msg << "\n"; throw std::invalid_argument(os.str()); }
+#define myassert(cond, msg) if (!(cond)){ std::ostringstream os; os << "\nassertion failed: " << #cond << ", " << msg << "\n"; throw std::invalid_argument(os.str()); }
 
 // log2 is not defined in VS2008
 #if defined(_MSC_VER)
diff --git a/include/SIMD.h b/include/SIMD.h
index a2ac1a9ae..18a38dbfd 100644
--- a/include/SIMD.h
+++ b/include/SIMD.h
@@ -2,6 +2,46 @@
 
 #include "Portable.h"
 
+#ifdef USE_SSE2
+#include <emmintrin.h>
+#if defined(USE_AVX) || defined(USE_AVX2)
+#include <immintrin.h>
+#else
+#ifdef USE_SSE41
+#include <smmintrin.h>
+#endif
+#endif
+#endif
+
+namespace BinSearch {
+namespace Details {
+
+template <InstrSet I, typename T>
+struct FTOITraits{};
+
+template <InstrSet I, class T>
+struct FVec;
+
+template <InstrSet I, class T>
+struct IVec;
+
+template <InstrSet I, class T>
+struct FVec1;
+
+template <> struct InstrFloatTraits<Scalar, float>
+{
+    typedef __m128  vec_t;
+};
+
+template <> struct InstrFloatTraits<Scalar, double>
+{
+    typedef __m128d vec_t;
+};
+
+}
+}
+
+#if !defined(__aarch64__)
 #ifdef USE_SSE42
 #ifndef _MSC_VER
 #include <popcntintrin.h>
@@ -26,29 +66,11 @@ FORCE_INLINE int popcnt32(int x32)
 } // namespace
 #endif
 
-#if defined(USE_AVX) || defined(USE_AVX2)
-#include <immintrin.h>
-#else
-#include <emmintrin.h>
-#ifdef USE_SSE41
-#include <smmintrin.h>
-#endif
-#endif
-
 #include "Type.h"
 
 namespace BinSearch {
 namespace Details {
 
-template <InstrSet I, class T>
-struct FVec;
-
-template <InstrSet I, class T>
-struct IVec;
-
-template <InstrSet I, class T>
-struct FVec1;
-
 template <> struct InstrIntTraits<SSE>
 {
     typedef __m128i vec_t;
@@ -64,8 +86,8 @@ template <> struct InstrFloatTraits<SSE, double>
     typedef __m128d vec_t;
 };
 
-template <InstrSet I, typename T>
-struct FTOITraits
+template <>
+struct FTOITraits<SSE, float>
 {
     typedef IVec<SSE, float> vec_t;
 };
@@ -285,9 +307,11 @@ FORCE_INLINE FVec<SSE,float> operator-   (const FVec<SSE,float>& a,  const FVec<
 FORCE_INLINE FVec<SSE,float> operator*   (const FVec<SSE,float>& a,  const FVec<SSE,float>& b)  { return _mm_mul_ps( a, b ); }
 FORCE_INLINE FVec<SSE,float> operator/   (const FVec<SSE,float>& a,  const FVec<SSE,float>& b)  { return _mm_div_ps( a, b ); }
 FORCE_INLINE IVec<SSE,float> ftoi        (const FVec<SSE,float>& a)                             { return _mm_cvttps_epi32(a); }
+#ifndef __clang__ // Conflicts with builtin operator
 FORCE_INLINE IVec<SSE,float> operator<=  (const FVec<SSE,float>& a,  const FVec<SSE,float>& b)  { return _mm_castps_si128( _mm_cmple_ps( a, b ) ); }
 FORCE_INLINE IVec<SSE,float> operator>=  (const FVec<SSE,float>& a,  const FVec<SSE,float>& b)  { return _mm_castps_si128( _mm_cmpge_ps( a, b ) ); }
 FORCE_INLINE IVec<SSE,float> operator<   (const FVec<SSE,float>& a,  const FVec<SSE,float>& b)  { return _mm_castps_si128(_mm_cmplt_ps(a, b)); }
+#endif
 #ifdef USE_FMA
 FORCE_INLINE FVec<SSE, float> mulSub(const FVec<SSE, float>& a, const FVec<SSE, float>& b, const FVec<SSE, float>& c) { return _mm_fmsub_ps(a, b, c); }
 #endif
@@ -339,9 +363,11 @@ FORCE_INLINE FVec<SSE,double> operator-   (const FVec<SSE,double>& a, const FVec
 FORCE_INLINE FVec<SSE,double> operator*   (const FVec<SSE,double>& a, const FVec<SSE,double>& b)    { return _mm_mul_pd( a, b ); }
 FORCE_INLINE FVec<SSE,double> operator/   (const FVec<SSE,double>& a, const FVec<SSE,double>& b)    { return _mm_div_pd( a, b ); }
 FORCE_INLINE IVec<SSE,float>  ftoi        (const FVec<SSE,double>& a)                               { return _mm_cvttpd_epi32(a); }
+#ifndef __clang__ // Conflicts with builtin operator
 FORCE_INLINE IVec<SSE,double> operator<=  (const FVec<SSE,double>& a, const FVec<SSE,double>& b)    { return _mm_castpd_si128( _mm_cmple_pd( a, b ) ); }
 FORCE_INLINE IVec<SSE,double> operator<   (const FVec<SSE,double>& a, const FVec<SSE,double>& b)    { return _mm_castpd_si128(_mm_cmplt_pd(a, b)); }
 FORCE_INLINE IVec<SSE,double> operator>=  (const FVec<SSE,double>& a, const FVec<SSE,double>& b)    { return _mm_castpd_si128( _mm_cmpge_pd( a, b ) ); }
+#endif
 #ifdef USE_FMA
 FORCE_INLINE FVec<SSE, double> mulSub(const FVec<SSE, double>& a, const FVec<SSE, double>& b, const FVec<SSE, double>& c ) { return _mm_fmsub_pd(a, b, c); }
 #endif
@@ -560,3 +586,4 @@ FORCE_INLINE FVec<AVX, double> mulSub(const FVec<AVX, double>& a, const FVec<AVX
 
 } // namepsace Details
 } // namespace BinSearch
+#endif // !defined(__aarch64__)
\ No newline at end of file
diff --git a/include/Type.h b/include/Type.h
index 720bfb86f..16bf3e3ae 100644
--- a/include/Type.h
+++ b/include/Type.h
@@ -10,7 +10,7 @@ using std::size_t;
 
 namespace BinSearch {
 
-enum InstrSet { Scalar, SSE, AVX };
+enum InstrSet { Scalar, SSE, AVX, Neon };
 
 #define ALGOENUM(x, b) x,
 enum Algos
diff --git a/requirements.txt b/requirements.txt
index 3bde2dc6a..ad64af67a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,3 +1,5 @@
-lion-pytorch
-pytest
-scipy
+pytest==7.2.2
+einops==0.6.0
+wheel==0.40.0
+lion-pytorch==0.0.6
+scipy==1.11.4
diff --git a/setup.py b/setup.py
index 2068c5fd8..72c809820 100644
--- a/setup.py
+++ b/setup.py
@@ -6,11 +6,12 @@
 import os
 
 from setuptools import find_packages, setup
+from setuptools.dist import Distribution
 import bitsandbytes as bnb
 
 VERSION = bnb.__version__
 
-libs = list(glob.glob("./bitsandbytes/libbitsandbytes*.so"))
+libs = list(glob.glob("./bitsandbytes/libbitsandbytes*.*"))
 libs = [os.path.basename(p) for p in libs]
 print("libs:", libs)
 
@@ -18,6 +19,10 @@
 def read(fname):
     return open(os.path.join(os.path.dirname(__file__), fname)).read()
 
+# Tested with wheel v0.29.0
+class BinaryDistribution(Distribution):
+    def has_ext_modules(foo):
+        return True
 
 setup(
     name=f"bitsandbytes",
@@ -37,4 +42,5 @@ def read(fname):
         "Development Status :: 4 - Beta",
         "Topic :: Scientific/Engineering :: Artificial Intelligence",
     ],
+    distclass=BinaryDistribution
 )