From dae7041a401c48984863db622919d84603469848 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Alejandro=20Rodr=C3=ADguez=20Salamanca?=
Date: Mon, 15 May 2023 22:33:16 +0200
Subject: [PATCH 01/18] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 727a86cb5..5cf1e6ee0 100644
--- a/README.md
+++ b/README.md
@@ -102,7 +102,7 @@ For straight Int8 matrix multiplication with mixed precision decomposition you c
 bnb.matmul(..., threshold=6.0)
 ```
 
-For instructions how to use LLM.int8() inference layers in your own code, see the TL;DR above or for extended instruction see [this blog post](https://github.com/huggingface/transformers).
+For instructions how to use LLM.int8() inference layers in your own code, see the TL;DR above or for extended instruction see [this blog post](https://huggingface.co/blog/hf-bitsandbytes-integration).
 
 ### Using the 8-bit Optimizers

From 6b26402b793525ec317fadff260548f5b36d936f Mon Sep 17 00:00:00 2001
From: Aarni Koskela
Date: Thu, 25 May 2023 15:25:58 +0300
Subject: [PATCH 02/18] Fix typo "quanitze"

---
 benchmarking/switchback/speed_benchmark.py            |  6 +++---
 bitsandbytes/nn/triton_based_modules.py               | 12 ++++++------
 ...dequanitze.py => int8_matmul_mixed_dequantize.py}  |  4 ++--
 3 files changed, 11 insertions(+), 11 deletions(-)
 rename bitsandbytes/triton/{int8_matmul_mixed_dequanitze.py => int8_matmul_mixed_dequantize.py} (98%)

diff --git a/benchmarking/switchback/speed_benchmark.py b/benchmarking/switchback/speed_benchmark.py
index 9ad991194..b0983d0b8 100644
--- a/benchmarking/switchback/speed_benchmark.py
+++ b/benchmarking/switchback/speed_benchmark.py
@@ -8,7 +8,7 @@
 from bitsandbytes.triton.quantize_columnwise_and_transpose import quantize_columnwise_and_transpose
 from bitsandbytes.triton.int8_matmul_rowwise_dequantize import int8_matmul_rowwise_dequantize
 from bitsandbytes.triton.quantize_global import quantize_global, quantize_global_transpose
-from bitsandbytes.triton.int8_matmul_mixed_dequanitze import int8_matmul_mixed_dequanitze
+from bitsandbytes.triton.int8_matmul_mixed_dequantize import int8_matmul_mixed_dequantize
 
 
 # KNOW ISSUE: need to optimize "w_quantize_colwise_transpose" when embeddim is too large.
@@ -72,8 +72,8 @@ def get_time(k, fn, info_dict):
     get_time('standard_gx', lambda : g.matmul(w), info)
     get_time('rowwise_fwd', lambda : int8_matmul_rowwise_dequantize(x_int8, w_int8.t(), state_x_rowwise, state_w_columnwise, None), info)
     get_time('rowwise_bwd', lambda : int8_matmul_rowwise_dequantize(g_int8, wt_int8.t(), state_x_rowwise, state_w_rowwise, None), info)
-    get_time('global_fwd', lambda : int8_matmul_mixed_dequanitze(x_int8, w_int8.t(), state_x_rowwise, state_w_global, None), info)
-    get_time('global_bwd', lambda : int8_matmul_mixed_dequanitze(g_int8, wt_int8.t(), state_x_rowwise, state_w_global, None), info)
+    get_time('global_fwd', lambda : int8_matmul_mixed_dequantize(x_int8, w_int8.t(), state_x_rowwise, state_w_global, None), info)
+    get_time('global_bwd', lambda : int8_matmul_mixed_dequantize(g_int8, wt_int8.t(), state_x_rowwise, state_w_global, None), info)
     get_time('x_quantize_rowwise', lambda : quantize_rowwise(x), info)
     get_time('g_quantize_rowwise', lambda : quantize_rowwise(g), info)
     get_time('w_quantize_rowwise', lambda : quantize_rowwise(w), info)
diff --git a/bitsandbytes/nn/triton_based_modules.py b/bitsandbytes/nn/triton_based_modules.py
index 6fbf583b9..de07ac647 100644
--- a/bitsandbytes/nn/triton_based_modules.py
+++ b/bitsandbytes/nn/triton_based_modules.py
@@ -10,7 +10,7 @@
 from bitsandbytes.triton.quantize_columnwise_and_transpose import quantize_columnwise_and_transpose
 from bitsandbytes.triton.int8_matmul_rowwise_dequantize import int8_matmul_rowwise_dequantize
 from bitsandbytes.triton.quantize_global import quantize_global, quantize_global_transpose
-from bitsandbytes.triton.int8_matmul_mixed_dequanitze import int8_matmul_mixed_dequanitze
+from bitsandbytes.triton.int8_matmul_mixed_dequantize import int8_matmul_mixed_dequantize
 
 
 class _switchback_global(torch.autograd.Function):
@@ -29,7 +29,7 @@ def forward(ctx, X_3D, W, bias):
 
         # matmult, fused dequant and add bias
         # call "mixed" because we are mixing rowwise quantized and global quantized
-        return int8_matmul_mixed_dequanitze(
+        return int8_matmul_mixed_dequantize(
             X_int8, W_int8.t(), state_X, state_W, bias
         ).view(*X_3D.size()[:-1], -1)
@@ -47,7 +47,7 @@ def backward(ctx, G_3D):
             # so we transpose once then call .t() in the matmul
             G_int8, state_G = quantize_rowwise(G)
             W_int8, state_W = quantize_global_transpose(W)
-            grad_X = int8_matmul_mixed_dequanitze(G_int8, W_int8.t(), state_G, state_W, None).view(
+            grad_X = int8_matmul_mixed_dequantize(G_int8, W_int8.t(), state_G, state_W, None).view(
                 *G_3D.size()[:-1], -1
             )
         if ctx.needs_input_grad[1]:
@@ -119,7 +119,7 @@ def forward(ctx, X_3D, W, bias):
 
         # matmult, fused dequant and add bias
         # call "mixed" because we are mixing rowwise quantized and global quantized
-        return int8_matmul_mixed_dequanitze(
+        return int8_matmul_mixed_dequantize(
             X_int8, W_int8.t(), state_X, state_W, bias
         ).view(*X_3D_sz[:-1], -1)
@@ -143,7 +143,7 @@ def backward(ctx, G_3D):
             G_int8, state_G = quantize_rowwise(G)
             del G
             W_int8 = W_int8.t().contiguous()
-            grad_X = int8_matmul_mixed_dequanitze(G_int8, W_int8.t(), state_G, state_W, None).view(
+            grad_X = int8_matmul_mixed_dequantize(G_int8, W_int8.t(), state_G, state_W, None).view(
                 *G_3D_sz[:-1], -1
             )
@@ -215,7 +215,7 @@ def forward(self, x):
                 X_int8, self.W_int8.t(), state_X, self.state_W, self.bias
             ).view(*x.size()[:-1], -1)
         else:
-            return int8_matmul_mixed_dequanitze(
+            return int8_matmul_mixed_dequantize(
                 X_int8, self.W_int8.t(), state_X, self.state_W, self.bias
             ).view(*x.size()[:-1], -1)
diff --git a/bitsandbytes/triton/int8_matmul_mixed_dequanitze.py b/bitsandbytes/triton/int8_matmul_mixed_dequantize.py
similarity index 98%
rename from bitsandbytes/triton/int8_matmul_mixed_dequanitze.py
rename to bitsandbytes/triton/int8_matmul_mixed_dequantize.py
index 60a56e698..b0961f558 100644
--- a/bitsandbytes/triton/int8_matmul_mixed_dequanitze.py
+++ b/bitsandbytes/triton/int8_matmul_mixed_dequantize.py
@@ -2,7 +2,7 @@
 from bitsandbytes.triton.triton_utils import is_triton_available
 
 if not is_triton_available():
-    def int8_matmul_mixed_dequanitze(a, b, state_x, state_w, bias): return None
+    def int8_matmul_mixed_dequantize(a, b, state_x, state_w, bias): return None
 else:
 
     import triton
@@ -136,7 +136,7 @@ def _int8_matmul_mixed_dequantize(A, B, C, bias, state_x_ptr, state_w_ptr, M, N,
             tl.atomic_add(C, acc, mask=mask)
 
 
-    def int8_matmul_mixed_dequanitze(a, b, state_x, state_w, bias):
+    def int8_matmul_mixed_dequantize(a, b, state_x, state_w, bias):
         device = a.device
         divfactor = 1. / (127. * 127.)
         has_bias = 0 if bias is None else 1

From ea0f7935717aa33aee56c4e5c7f149d35e08724a Mon Sep 17 00:00:00 2001
From: Pranav
Date: Wed, 7 Jun 2023 12:51:31 -0700
Subject: [PATCH 03/18] Update README.md

Changed misleading Hardware requirements from "2018 or older" to "2018 or newer"
---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 727a86cb5..0488007b5 100644
--- a/README.md
+++ b/README.md
@@ -80,7 +80,7 @@ out = linear(x.to(torch.float16))
 Requirements: anaconda, cudatoolkit, pytorch
 
 Hardware requirements:
- - LLM.int8(): NVIDIA Turing (RTX 20xx; T4) or Ampere GPU (RTX 30xx; A4-A100); (a GPU from 2018 or older).
+ - LLM.int8(): NVIDIA Turing (RTX 20xx; T4) or Ampere GPU (RTX 30xx; A4-A100); (a GPU from 2018 or newer).
  - 8-bit optimizers and quantization: NVIDIA Kepler GPU or newer (>=GTX 78X).
 
 Supported CUDA versions: 10.2 - 12.0

From 237ad4920552dc04cbb4e6eac27aa57ad2c4456e Mon Sep 17 00:00:00 2001
From: dulalbert <113252426+dulalbert@users.noreply.github.com>
Date: Tue, 20 Jun 2023 10:53:30 +0800
Subject: [PATCH 04/18] Added scipy to requirements.txt

Added scipy to requirements.txt as it is used but not added to requirements
---
 requirements.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/requirements.txt b/requirements.txt
index 883b2e42e..3bde2dc6a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,2 +1,3 @@
 lion-pytorch
 pytest
+scipy

From c2494a61a3810b307873e6bab08402c4d463c42a Mon Sep 17 00:00:00 2001
From: Neel Gupta
Date: Tue, 4 Jul 2023 14:54:15 +0100
Subject: [PATCH 05/18] Fixed missing `Embedding` export

---
 bitsandbytes/nn/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bitsandbytes/nn/__init__.py b/bitsandbytes/nn/__init__.py
index 49d7b5ced..6fa6d1183 100644
--- a/bitsandbytes/nn/__init__.py
+++ b/bitsandbytes/nn/__init__.py
@@ -2,5 +2,5 @@
 #
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
-from .modules import Int8Params, Linear8bitLt, StableEmbedding, Linear4bit, LinearNF4, LinearFP4, Params4bit, OutlierAwareLinear, SwitchBackLinearBnb
+from .modules import Int8Params, Linear8bitLt, StableEmbedding, Linear4bit, LinearNF4, LinearFP4, Params4bit, OutlierAwareLinear, SwitchBackLinearBnb, Embedding
 from .triton_based_modules import SwitchBackLinear, SwitchBackLinearGlobal, SwitchBackLinearVectorwise, StandardLinear

From 5e266532a97f69b799c72a596d0ee4333aaae0c8 Mon Sep 17 00:00:00 2001
From: dulalbert <113252426+dulalbert@users.noreply.github.com>
Date: Wed, 12 Jul 2023 10:22:02 +0800
Subject: [PATCH 06/18] Added scipy to install_requires

---
 setup.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/setup.py b/setup.py
index 51e747c65..91a63fddf 100644
--- a/setup.py
+++ b/setup.py
@@ -25,6 +25,7 @@ def read(fname):
     license="MIT",
     keywords="gpu optimizers optimization 8-bit quantization compression",
     url="https://github.com/TimDettmers/bitsandbytes",
+    install_requires=['scipy'],
     packages=find_packages(),
     package_data={"": libs},
     long_description=read("README.md"),

From 87816e4e9c90cfd559d42ad8bdcfe3b5fde18407 Mon Sep 17 00:00:00 2001
From: Ikko Eltociear Ashimine
Date: Tue, 18 Jul 2023 00:44:17 +0900
Subject: [PATCH 07/18] Fix typo in test_optim.py

paramters -> parameters
---
 tests/test_optim.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_optim.py b/tests/test_optim.py
index 9e90083a9..49d4f442a 100644
--- a/tests/test_optim.py
+++ b/tests/test_optim.py
@@ -169,7 +169,7 @@ def test_optimizer32bit(dim1, dim2, gtype, optim_name):
 
         if gtype != torch.float32:
             # the adam buffers should also be close because they are 32-bit
-            # but the paramters can diverge because they are 16-bit
+            # but the parameters can diverge because they are 16-bit
             # the difference grow larger and larger with each update
             # --> copy the state to keep weights close
             p1.data = p1.data.to(p2.dtype).float()

From 4b0e401c6575dd521374d1da4dcfc977f4d056e8 Mon Sep 17 00:00:00 2001
From: Cebtenzzre
Date: Thu, 20 Jul 2023 21:33:51 -0400
Subject: [PATCH 08/18] improve `make clean` target

Make `make clean` remove all build artifacts, and do not warn about
CUDA_VERSION when simply running 'clean'.

Fixes #532
---
 Makefile | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/Makefile b/Makefile
index 7ccbcb191..5f997a122 100644
--- a/Makefile
+++ b/Makefile
@@ -8,9 +8,11 @@ ifeq ($(CUDA_HOME),)
 endif
 
 ifndef CUDA_VERSION
+ifneq ($(MAKECMDGOALS),clean)
 $(warning WARNING: CUDA_VERSION not set. Call make with CUDA string, for example: make cuda11x CUDA_VERSION=115 or make cpuonly CUDA_VERSION=CPU)
 CUDA_VERSION:=
 endif
+endif
 
 
 
@@ -135,10 +137,5 @@ $(ROOT_DIR)/dependencies/cub:
 	cd dependencies/cub; git checkout 1.11.0
 
 clean:
-	rm build/*
-
-cleaneggs:
-	rm -rf *.egg*
-
-cleanlibs:
-	rm ./bitsandbytes/libbitsandbytes*.so
+	rm -rf build/* *.egg*
+	rm -f bitsandbytes/libbitsandbytes*.so

From b4bc3369a0831269e82051ecbefec6b9262bba66 Mon Sep 17 00:00:00 2001
From: Keith Stevens
Date: Fri, 21 Jul 2023 14:30:03 +0900
Subject: [PATCH 09/18] Make sure bitsandbytes handles permission errors in the right order

---
 bitsandbytes/cuda_setup/main.py | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py
index 3b0097100..3dd16ff9f 100644
--- a/bitsandbytes/cuda_setup/main.py
+++ b/bitsandbytes/cuda_setup/main.py
@@ -196,11 +196,13 @@ def remove_non_existent_dirs(candidate_paths: Set[Path]) -> Set[Path]:
         try:
            if path.exists():
                existent_directories.add(path)
+        except PermissionError as pex:
+            # Handle the PermissionError first as it is a subtype of OSError
+            # https://docs.python.org/3/library/exceptions.html#exception-hierarchy
+            pass
         except OSError as exc:
             if exc.errno != errno.ENAMETOOLONG:
                 raise exc
-        except PermissionError as pex:
-            pass
 
     non_existent_directories: Set[Path] = candidate_paths - existent_directories
     if non_existent_directories:
@@ -361,4 +363,4 @@ def evaluate_cuda_setup():
         "if not has_cublaslt (CC < 7.5), then we have to choose _nocublaslt.so"
         binary_name = f"libbitsandbytes_cuda{cuda_version_string}_nocublaslt.so"
 
-    return binary_name, cudart_path, cc, cuda_version_string
\ No newline at end of file
+    return binary_name, cudart_path, cc, cuda_version_string

From d76b6ca91b827b5c522bb794d96628d290ee29f6 Mon Sep 17 00:00:00 2001
From: Apoorva Kulkarni
Date: Fri, 21 Jul 2023 23:20:44 -0700
Subject: [PATCH 10/18] doc: Fix typo in how_to_use_nonpytorch_cuda.md

---
 how_to_use_nonpytorch_cuda.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/how_to_use_nonpytorch_cuda.md b/how_to_use_nonpytorch_cuda.md
index 7b717f41c..09dcac7c7 100644
--- a/how_to_use_nonpytorch_cuda.md
+++ b/how_to_use_nonpytorch_cuda.md
@@ -21,7 +21,7 @@ wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/cuda_instal
 # EXPORT_TO_BASH in {0, 1} with 0=False and 1=True
 
 # For example, the following installs CUDA 11.7 to ~/local/cuda-11.7 and exports the path to your .bashrc
-bash cuda install 117 ~/local 1
+bash cuda_install.sh 117 ~/local 1
 ```
 
 ## Setting the environmental variables BNB_CUDA_VERSION, and LD_LIBRARY_PATH

From a51840faa4c305951eb83c088f3e9c484eff6e87 Mon Sep 17 00:00:00 2001
From: Osma Suominen
Date: Fri, 4 Aug 2023 10:32:05 +0300
Subject: [PATCH 11/18] Robustness fix: don't break in case of directories without read permission

---
 bitsandbytes/cuda_setup/main.py | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py
index f3edf4c73..0a0a6defa 100644
--- a/bitsandbytes/cuda_setup/main.py
+++ b/bitsandbytes/cuda_setup/main.py
@@ -214,8 +214,11 @@ def get_cuda_runtime_lib_paths(candidate_paths: Set[Path]) -> Set[Path]:
     paths = set()
     for libname in CUDA_RUNTIME_LIBS:
         for path in candidate_paths:
-            if (path / libname).is_file():
-                paths.add(path / libname)
+            try:
+                if (path / libname).is_file():
+                    paths.add(path / libname)
+            except PermissionError:
+                pass
     return paths
 

From 91c7518dd8a18572dbb8cc471784c24ac1c0e351 Mon Sep 17 00:00:00 2001
From: rasbt
Date: Tue, 15 Aug 2023 19:14:09 -0500
Subject: [PATCH 12/18] add version attribute as per Python convention

---
 bitsandbytes/__init__.py | 2 ++
 setup.py                 | 5 ++++-
 2 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index f35a3b582..3608de1ec 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -24,4 +24,6 @@
     "optim.optimizer.MockArgs": False,
 }
 
+__version__ = "0.41.1"
+
 PACKAGE_GITHUB_URL = "https://github.com/TimDettmers/bitsandbytes"
diff --git a/setup.py b/setup.py
index d6267088e..405c161a7 100644
--- a/setup.py
+++ b/setup.py
@@ -6,6 +6,9 @@ import os
 
 from setuptools import find_packages, setup
 
+import bitsandbytes as bnb
+
+VERSION = bnb.__version__
 
 libs = list(glob.glob("./bitsandbytes/libbitsandbytes*.so"))
 libs = [os.path.basename(p) for p in libs]
@@ -18,7 +21,7 @@ def read(fname):
 
 setup(
     name=f"bitsandbytes",
-    version=f"0.41.1",
+    version=VERSION,
     author="Tim Dettmers",
     author_email="dettmers@cs.washington.edu",
     description="k-bit optimizers and matrix multiplication routines.",

From 09cd03d70032f9c0b8319d9eaa79b45602d81912 Mon Sep 17 00:00:00 2001
From: Alex Pilon
Date: Thu, 17 Aug 2023 12:51:23 -0400
Subject: [PATCH 13/18] Add env var related to google systems to ignored list

---
 bitsandbytes/cuda_setup/env_vars.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/bitsandbytes/cuda_setup/env_vars.py b/bitsandbytes/cuda_setup/env_vars.py
index 4fcb643ee..214ae9c58 100644
--- a/bitsandbytes/cuda_setup/env_vars.py
+++ b/bitsandbytes/cuda_setup/env_vars.py
@@ -19,6 +19,7 @@ def to_be_ignored(env_var: str, value: str) -> bool:
         "PATH",  # this is for finding binaries, not libraries
         "LESSOPEN",  # related to the `less` command
         "LESSCLOSE",
+        "GOOGLE_VM_CONFIG_LOCK_FILE",  # Google Cloud stuff, contains root only paths
         "_",  # current Python interpreter
     }
     return env_var in ignorable

From 9f293ffc92400fffc6da38b84328cbaca2ebc40a Mon Sep 17 00:00:00 2001
From: Merve Noyan
Date: Thu, 24 Aug 2023 13:10:45 +0300
Subject: [PATCH 14/18] Small fix to README

---
 README.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 727a86cb5..63cf3c1f4 100644
--- a/README.md
+++ b/README.md
@@ -38,7 +38,7 @@ python setup.py install
 ```python
 from transformers import AutoModelForCausalLM
 model = AutoModelForCausalLM.from_pretrained(
-  'decapoda-research/llama-7b-hf,
+  'decapoda-research/llama-7b-hf',
   device_map='auto',
   load_in_8bit=True,
   max_memory=f'{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB')
@@ -119,7 +119,7 @@ torch.nn.Embedding(...) -> bnb.nn.StableEmbedding(...) # recommended for NLP mo
 ```
 Note that by default all parameter tensors with less than 4096 elements are kept at 32-bit even if you initialize those parameters with 8-bit optimizers. This is done since such small tensors do not save much memory and often contain highly variable parameters (biases) or parameters that require high precision (batch norm, layer norm). You can change this behavior like so:
-```
+```python
 # parameter tensors with less than 16384 values are optimized in 32-bit
 # it is recommended to use multiplies of 4096
 adam = bnb.optim.Adam8bit(model.parameters(), min_8bit_size=16384)

From fea5bc7b83deb971f9aa520d0fcc93fbaa588110 Mon Sep 17 00:00:00 2001
From: Donato Riccio <71320919+reese3222@users.noreply.github.com>
Date: Sat, 26 Aug 2023 09:42:19 +0200
Subject: [PATCH 15/18] Fixed wget link for installing cuda

Wget now downloads the correct raw file from github
---
 bitsandbytes/cuda_setup/main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bitsandbytes/cuda_setup/main.py b/bitsandbytes/cuda_setup/main.py
index f3edf4c73..1ffcde352 100644
--- a/bitsandbytes/cuda_setup/main.py
+++ b/bitsandbytes/cuda_setup/main.py
@@ -64,7 +64,7 @@ def generate_instructions(self):
         self.add_log_entry('CUDA SETUP: Solution 1b): Once the library is found add it to the LD_LIBRARY_PATH: export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:FOUND_PATH_FROM_1a')
         self.add_log_entry('CUDA SETUP: Solution 1c): For a permanent solution add the export from 1b into your .bashrc file, located at ~/.bashrc')
         self.add_log_entry('CUDA SETUP: Solution 2: If no library was found in step 1a) you need to install CUDA.')
-        self.add_log_entry('CUDA SETUP: Solution 2a): Download CUDA install script: wget https://github.com/TimDettmers/bitsandbytes/blob/main/cuda_install.sh')
+        self.add_log_entry('CUDA SETUP: Solution 2a): Download CUDA install script: wget https://raw.githubusercontent.com/TimDettmers/bitsandbytes/main/cuda_install.sh')
         self.add_log_entry('CUDA SETUP: Solution 2b): Install desired CUDA version to desired location. The syntax is bash cuda_install.sh CUDA_VERSION PATH_TO_INSTALL_INTO.')
         self.add_log_entry('CUDA SETUP: Solution 2b): For example, "bash cuda_install.sh 113 ~/local/" will download CUDA 11.3 and install into the folder ~/local')
         return

From 57cfbcb710aa752c08c7617f873cc5ba89005ca7 Mon Sep 17 00:00:00 2001
From: John B Nelson
Date: Sun, 3 Sep 2023 12:09:26 -0700
Subject: [PATCH 16/18] FIX missing closing quote in README example

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 727a86cb5..a2b1b53d6 100644
--- a/README.md
+++ b/README.md
@@ -38,7 +38,7 @@ python setup.py install
 ```python
 from transformers import AutoModelForCausalLM
 model = AutoModelForCausalLM.from_pretrained(
-  'decapoda-research/llama-7b-hf,
+  'decapoda-research/llama-7b-hf',
   device_map='auto',
   load_in_8bit=True,
   max_memory=f'{int(torch.cuda.mem_get_info()[0]/1024**3)-2}GB')

From 04e20899a9cf4cc10915d89f651ed9f5df976ccc Mon Sep 17 00:00:00 2001
From: Michael Mior
Date: Fri, 20 Oct 2023 14:48:40 -0400
Subject: [PATCH 17/18] Fix parameter name in error message

---
 bitsandbytes/nn/modules.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bitsandbytes/nn/modules.py b/bitsandbytes/nn/modules.py
index 3d34bb45f..0e0e816be 100644
--- a/bitsandbytes/nn/modules.py
+++ b/bitsandbytes/nn/modules.py
@@ -218,10 +218,10 @@ def set_compute_type(self, x):
             if self.compute_dtype == torch.float32 and (x.numel() == x.shape[-1]):
                 # single batch inference with input torch.float16 and compute_dtype float32 -> slow inference when it could be fast
                 # warn the user about this
-                warnings.warn(f'Input type into Linear4bit is torch.float16, but bnb_4bit_compute_type=torch.float32 (default). This will lead to slow inference.')
+                warnings.warn(f'Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference.')
                 warnings.filterwarnings('ignore', message='.*inference.')
             if self.compute_dtype == torch.float32 and (x.numel() != x.shape[-1]):
-                warnings.warn(f'Input type into Linear4bit is torch.float16, but bnb_4bit_compute_type=torch.float32 (default). This will lead to slow inference or training speed.')
+                warnings.warn(f'Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference or training speed.')
                 warnings.filterwarnings('ignore', message='.*inference or training')

From b5800d35775eed3696326b5307b257fd5066873e Mon Sep 17 00:00:00 2001
From: Sebastian Raschka
Date: Mon, 11 Dec 2023 16:39:33 -0600
Subject: [PATCH 18/18] Update __init__.py

---
 bitsandbytes/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bitsandbytes/__init__.py b/bitsandbytes/__init__.py
index 3608de1ec..d77116849 100644
--- a/bitsandbytes/__init__.py
+++ b/bitsandbytes/__init__.py
@@ -24,6 +24,6 @@
     "optim.optimizer.MockArgs": False,
 }
 
-__version__ = "0.41.1"
+__version__ = "0.41.3.post1"
 
 PACKAGE_GITHUB_URL = "https://github.com/TimDettmers/bitsandbytes"
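
Note on PATCH 09/18 (exception-handler ordering): Python matches `except` clauses top to bottom, and PermissionError is a subclass of OSError, so the subclass handler must be listed first or it can never run. A minimal runnable sketch of the behavior the patch relies on; it is not part of the patch series, and the function names are illustrative only:

```python
# Sketch only: why PATCH 09/18 moves the PermissionError handler above the
# OSError handler. Python dispatches to the first matching except clause.

def wrong_order() -> str:
    # Mirrors the pre-patch layout: the OSError handler is listed first.
    try:
        raise PermissionError("no read access")
    except OSError:
        # A PermissionError is also an OSError, so it is caught here and
        # the dedicated handler below is unreachable.
        return "handled as OSError"
    except PermissionError:
        return "handled as PermissionError"


def right_order() -> str:
    # Mirrors the patched layout: the subclass handler comes first.
    try:
        raise PermissionError("no read access")
    except PermissionError:
        return "handled as PermissionError"
    except OSError:
        return "handled as OSError"


assert issubclass(PermissionError, OSError)
assert wrong_order() == "handled as OSError"
assert right_order() == "handled as PermissionError"
```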
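
Note on PATCH 12/18 and PATCH 18/18 (`__version__` attribute): once these patches are applied, the installed package exposes its version at runtime in the conventional Python way. A small usage sketch, assuming bitsandbytes is importable in the current environment:

```python
# Usage sketch only: reading the version attribute added by PATCH 12/18
# and bumped by PATCH 18/18. Assumes bitsandbytes is installed.
import bitsandbytes as bnb

print(bnb.__version__)  # "0.41.3.post1" after PATCH 18/18
```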