diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4f719007c..27ebad62e 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -1,6 +1,7 @@ name: Build and test on: + create: workflow_dispatch: push: branches: @@ -138,6 +139,33 @@ jobs: call "C:\\Program Files\\Microsoft Visual Studio\\2022\\Enterprise\\VC\\Auxiliary\\Build\\vcvars64.bat" make-4.4.1\dist\make -j WIN_CI_BUILD=1 train_gpt2fp32cu test_gpt2fp32cu test_gpt2cu train_gpt2cu profile_gpt2cu + build-ubuntu20-04: + runs-on: ubuntu-20.04 + container: + image: nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: System Info + run: | + nvcc --version + g++ --version + + - name: Install cudnn frontend + run: | + apt-get update && apt-get install -y git + git clone https://github.com/NVIDIA/cudnn-frontend.git + + - name: Build FP32 checkpoint + run: make train_gpt2fp32cu test_gpt2fp32cu + + - name: Build FP32 precision + run: PRECISION=FP32 make train_gpt2cu test_gpt2cu profile_gpt2cu + + - name: Build with CUDNN + run: PRECISION=BF16 USE_CUDNN=1 make train_gpt2cu test_gpt2cu profile_gpt2cu + build-cuda-fp32: runs-on: ubuntu-latest container: diff --git a/.github/workflows/ci_gpu.yml b/.github/workflows/ci_gpu.yml new file mode 100644 index 000000000..f4a9dfb4b --- /dev/null +++ b/.github/workflows/ci_gpu.yml @@ -0,0 +1,128 @@ +name: GPU Builds and Tests + +on: + create: + workflow_dispatch: + push: + branches: + - master + pull_request: + branches: + - master + +jobs: + build-and-test-gpu: + runs-on: ubicloud-gpu-standard-1-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install OpenMP + run: sudo apt-get update && sudo apt-get install -y libomp-dev + + - name: Install dependencies + run: pip install -r requirements.txt + + - name: Run preprocessing + run: python dev/data/tinyshakespeare.py + + - name: Train model + run: python train_gpt2.py + + - name: Compile training and testing program + run: make test_gpt2cu train_gpt2cu test_gpt2fp32cu train_gpt2fp32cu + + - name: Train model (With OpenMP) + run: OMP_NUM_THREADS=8 ./train_gpt2cu + + - name: Train model (FP32) with gpt2_124M.bin + run: | + PRECISION=FP32 make train_gpt2cu + ./train_gpt2cu -b 1 -t 64 -d 256 -l 0.0001 -v 200 -s 200 -a 1 -x 10 -r 0 -f 0 -e "gpt2_124M.bin" + + - name: Test for percent loss differential for FP32 + run: | + PRECISION=FP32 make train_gpt2cu + ./train_gpt2cu -b 1 -t 64 -d 256 -l 0.0001 -v 200 -s 200 -a 1 -x 10 -r 0 -f 0 -e "gpt2_124M.bin" > train_gpt2cu_fp32_precision.txt + python dev/loss_checker_ci.py -f train_gpt2cu_fp32_precision.txt -s 20 -e 28 -a 5.0 + + - name: Build FP32 precision + run: PRECISION=FP32 make test_gpt2cu profile_gpt2cu + + - name: Run default + run: ./test_gpt2cu + + - name: Run no recompute GeLU + run: ./test_gpt2cu -r 0 + + - name: Run recompute LN + run: ./test_gpt2cu -r 2 + + - name: Build BF16 precision + run: PRECISION=BF16 make train_gpt2cu test_gpt2cu profile_gpt2cu + + - name: Run default + run: ./test_gpt2cu + + - name: Run no recompute GeLU + run: ./test_gpt2cu -r 0 + + - name: Run no master weights + run: ./test_gpt2cu -w 0 + + - name: Run recompute LN + run: ./test_gpt2cu -r 2 + + - name: Train model fp32 (With OpenMP) + run: OMP_NUM_THREADS=8 ./train_gpt2fp32cu + + - name: Execute testing program (With OpenMP) + run: OMP_NUM_THREADS=8 ./test_gpt2cu + + - name: Execute testing program fp32 (With OpenMP) + run: OMP_NUM_THREADS=8 ./test_gpt2fp32cu + + - name: Compile training and testing 
program without OpenMP + run: NO_OMP=1 make test_gpt2cu train_gpt2cu test_gpt2fp32cu train_gpt2fp32cu + + - name: Train model (No OpenMP) + run: NO_OMP=1 ./train_gpt2cu + + - name: Train model fp32 (No OpenMP) + run: NO_OMP=1 ./train_gpt2fp32cu + + - name: Execute testing program (No OpenMP) + run: ./test_gpt2cu -b 32 + + - name: Execute testing program fp32 (No OpenMP) + run: ./test_gpt2fp32cu + + - name: Install cuDNN-frontend + run: + git clone https://github.com/NVIDIA/cudnn-frontend.git + + - name: Build with cuDNN + run: USE_CUDNN=1 make test_gpt2cu train_gpt2cu test_gpt2fp32cu train_gpt2fp32cu + + - name: Train model with cuDNN + run: ./train_gpt2cu + + - name: Train model fp32 with cuDNN + run: ./train_gpt2fp32cu + + - name: Execute testing program with cuDNN + run: ./test_gpt2cu + + - name: Execute testing program fp32 with cuDNN + run: ./test_gpt2fp32cu + + unit-tests-gpu: + runs-on: ubicloud-gpu-standard-1-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Test Device<->File IO + run: cd dev/test && nvcc -o device_file_io device_file_io.cu && ./device_file_io diff --git a/.github/workflows/ci_tests.yml b/.github/workflows/ci_tests.yml new file mode 100644 index 000000000..81aaace1c --- /dev/null +++ b/.github/workflows/ci_tests.yml @@ -0,0 +1,100 @@ +name: Unit, Static and other Tests + +on: + create: + workflow_dispatch: + push: + branches: + - master + pull_request: + branches: + - master + +jobs: + dataloader_test: + runs-on: ubuntu-latest + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: test the dataloader without / with sanitize address + run: | + cd dev/test + make PRECISION=BF16 test_dataloader + ./test_dataloader + make clean + make PRECISION=BF16 TEST_CFLAGS="-fsanitize=address -fno-omit-frame-pointer" test_dataloader + ./test_dataloader + + ptx_and_sass_files: + runs-on: ubuntu-latest + container: + image: nvidia/cuda:12.4.1-devel-ubuntu22.04 + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Install OpenMP and OpenMPI + run: apt-get update && apt-get install -y libomp-dev libopenmpi-dev + + - name: Generate ptx/sass files and upload them to persistent storage + run: | + mkdir -p dev/cuda/ptx_sass_logs + make train_gpt2cu + cuobjdump --dump-ptx train_gpt2cu > dev/cuda/train_gpt2cu.ptx + cuobjdump --dump-sass train_gpt2cu > dev/cuda/train_gpt2cu.sass + cd dev/cuda + make -j all_ptx + make -j all_sass + cp *.ptx ptx_sass_logs/ + cp *.sass ptx_sass_logs/ + ls ptx_sass_logs/ + + - name: Generate ptx/sass files for A100 and upload them to persistent storage + run: | + mkdir -p dev/cuda/ptx_sass_logs_A100 + make train_gpt2cu GPU_COMPUTE_CAPABILITY=80 + cuobjdump --dump-ptx train_gpt2cu > dev/cuda/train_gpt2cu.ptx + cuobjdump --dump-sass train_gpt2cu > dev/cuda/train_gpt2cu.sass + cd dev/cuda + make -j GPU_COMPUTE_CAPABILITY=80 all_ptx + make -j GPU_COMPUTE_CAPABILITY=80 all_sass + cp *.ptx ptx_sass_logs_A100/ + cp *.sass ptx_sass_logs_A100/ + ls ptx_sass_logs_A100/ + + - name: Generate ptx/sass files for H100 and upload them to persistent storage + run: | + mkdir -p dev/cuda/ptx_sass_logs_H100 + make train_gpt2cu GPU_COMPUTE_CAPABILITY=90 + cuobjdump --dump-ptx train_gpt2cu > dev/cuda/train_gpt2cu.ptx + cuobjdump --dump-sass train_gpt2cu > dev/cuda/train_gpt2cu.sass + cd dev/cuda + make -j GPU_COMPUTE_CAPABILITY=90 all_ptx + make -j GPU_COMPUTE_CAPABILITY=90 all_sass + cp *.ptx ptx_sass_logs_H100/ + cp *.sass ptx_sass_logs_H100/ + ls ptx_sass_logs_H100/ + + - name: Upload ptx/sass files + uses: 
actions/upload-artifact@v4 + with: + name: ptx_sass_files + path: dev/cuda/ptx_sass_logs/ + retention-days: 30 # days to retain + + - name: Upload ptx/sass files for A100 + uses: actions/upload-artifact@v4 + with: + name: ptx_sass_files_A100 + path: dev/cuda/ptx_sass_logs_A100/ + retention-days: 30 # days to retain + + - name: Upload ptx/sass files for H100 + uses: actions/upload-artifact@v4 + with: + name: ptx_sass_files_H100 + path: dev/cuda/ptx_sass_logs_H100/ + retention-days: 30 # days to retain \ No newline at end of file diff --git a/Makefile b/Makefile index c8b555ac2..f2cce2d16 100644 --- a/Makefile +++ b/Makefile @@ -13,7 +13,7 @@ CUDA_OUTPUT_FILE = -o $@ # NVCC flags # -t=0 is short for --threads, 0 = number of CPUs on the machine -NVCC_FLAGS = -O3 -t=0 --use_fast_math +NVCC_FLAGS = -O3 -t=0 --use_fast_math -std=c++17 NVCC_LDFLAGS = -lcublas -lcublasLt NVCC_INCLUDES = NVCC_LDLIBS = @@ -22,6 +22,16 @@ NVCC_CUDNN = # By default we don't build with cudnn because it blows up compile time from a few seconds to ~minute USE_CUDNN ?= 0 +# We will place .o files in the `build` directory (create it if it doesn't exist) +BUILD_DIR = build +ifeq ($(OS), Windows_NT) + $(shell if not exist $(BUILD_DIR) mkdir $(BUILD_DIR)) + REMOVE_BUILD_OBJECT_FILES := del $(BUILD_DIR)\*.obj +else + $(shell mkdir -p $(BUILD_DIR)) + REMOVE_BUILD_OBJECT_FILES := rm -f $(BUILD_DIR)/*.o +endif + # Function to check if a file exists in the PATH ifneq ($(OS), Windows_NT) define file_exists_in_path @@ -34,16 +44,16 @@ endef endif ifneq ($(CI),true) # if not in CI, then use the GPU query - ifndef GPU_COMPUTE_CAPABILITY # set to defaults if: make GPU_COMPUTE_CAPABILITY= + ifndef GPU_COMPUTE_CAPABILITY # set to defaults if: make GPU_COMPUTE_CAPABILITY= ifneq ($(call file_exists_in_path, __nvcc_device_query),) - GPU_COMPUTE_CAPABILITY = $(shell __nvcc_device_query) + GPU_COMPUTE_CAPABILITY = $(shell __nvcc_device_query) GPU_COMPUTE_CAPABILITY := $(strip $(GPU_COMPUTE_CAPABILITY)) endif endif endif # set to defaults if - make GPU_COMPUTE_CAPABILITY= otherwise use the compute capability detected above -ifneq ($(GPU_COMPUTE_CAPABILITY),) +ifneq ($(GPU_COMPUTE_CAPABILITY),) NVCC_FLAGS += --generate-code arch=compute_$(GPU_COMPUTE_CAPABILITY),code=[compute_$(GPU_COMPUTE_CAPABILITY),sm_$(GPU_COMPUTE_CAPABILITY)] endif @@ -52,6 +62,7 @@ $(info ---------------------------------------------) ifneq ($(OS), Windows_NT) NVCC := $(shell which nvcc 2>/dev/null) + NVCC_LDFLAGS += -lnvidia-ml # Function to test if the compiler accepts a given flag. 
define check_and_add_flag @@ -73,7 +84,7 @@ else NVCC := endif CC := cl - CFLAGS = /Idev /Zi /nologo /Wall /WX- /diagnostics:column /sdl /O2 /Oi /Ot /GL /D _DEBUG /D _CONSOLE /D _UNICODE /D UNICODE /Gm- /EHsc /MD /GS /Gy /fp:fast /Zc:wchar_t /Zc:forScope /Zc:inline /permissive- \ + CFLAGS = /Idev /Zi /nologo /W4 /WX- /diagnostics:column /sdl /O2 /Oi /Ot /GL /D _DEBUG /D _CONSOLE /D _UNICODE /D UNICODE /Gm- /EHsc /MD /GS /Gy /fp:fast /Zc:wchar_t /Zc:forScope /Zc:inline /permissive- \ /external:W3 /Gd /TP /wd4996 /Fd$@.pdb /FC /openmp:llvm LDFLAGS := LDLIBS := @@ -108,8 +119,8 @@ ifeq ($(USE_CUDNN), 1) NVCC_INCLUDES += -I$(CUDNN_FRONTEND_PATH) NVCC_LDFLAGS += -lcudnn NVCC_FLAGS += -DENABLE_CUDNN - NVCC_CUDNN = cudnn_att.o - else + NVCC_CUDNN = $(BUILD_DIR)/cudnn_att.o + else ifneq ($(OS), Windows_NT) $(info → cuDNN is not supported on MAC OS right now) else @@ -119,14 +130,14 @@ ifeq ($(USE_CUDNN), 1) else ifeq ($(shell if exist "cudnn-frontend\include" (echo exists)),exists) CUDNN_FRONTEND_PATH ?= cudnn-frontend\include #override on command line if different location else - $(error ✗ cuDNN not found. See the README for install instructions and the Makefile for hard-coded paths) + $(error ✗ cuDNN not found. See the README for install instructions and the Makefile for hard-coded paths) endif CUDNN_INCLUDE_PATH ?= -I"C:\Program Files\NVIDIA\CUDNN\v9.1\include\12.4" CUDNN_FRONTEND_PATH += $(CUDNN_INCLUDE_PATH) NVCC_FLAGS += --std c++20 -Xcompiler "/std:c++20" -Xcompiler "/EHsc /W0 /nologo /Ox /FS" -maxrregcount=0 --machine 64 - NVCC_CUDNN = cudnn_att.obj + NVCC_CUDNN = $(BUILD_DIR)\cudnn_att.obj NVCC_INCLUDES += -I$(CUDNN_FRONTEND_PATH) - NVCC_LDFLAGS += -L"C:\Program Files\NVIDIA\CUDNN\v9.1\lib\12.4\x64" -lcudnn + NVCC_LDFLAGS += -L"C:\Program Files\NVIDIA\CUDNN\v9.1\lib\12.4\x64" -lcudnn NVCC_FLAGS += -DENABLE_CUDNN endif endif @@ -178,27 +189,41 @@ else endif endif -# Check if OpenMPI and NCCL are available, include them if so, for multi-GPU training +# Check if NCCL is available, include if so, for multi-GPU training ifeq ($(NO_MULTI_GPU), 1) - $(info → Multi-GPU (OpenMPI + NCCL) is manually disabled) + $(info → Multi-GPU (NCCL) is manually disabled) else ifneq ($(OS), Windows_NT) # Detect if running on macOS or Linux ifeq ($(SHELL_UNAME), Darwin) - $(info ✗ Multi-GPU on CUDA on Darwin is not supported, skipping OpenMPI + NCCL support) - else ifeq ($(shell [ -d /usr/lib/x86_64-linux-gnu/openmpi/lib/ ] && [ -d /usr/lib/x86_64-linux-gnu/openmpi/include/ ] && echo "exists"), exists) - $(info ✓ OpenMPI found, OK to train with multiple GPUs) - NVCC_INCLUDES += -I/usr/lib/x86_64-linux-gnu/openmpi/include - NVCC_LDFLAGS += -L/usr/lib/x86_64-linux-gnu/openmpi/lib/ - NVCC_LDLIBS += -lmpi -lnccl + $(info ✗ Multi-GPU on CUDA on Darwin is not supported, skipping NCCL support) + else ifeq ($(shell dpkg -l | grep -q nccl && echo "exists"), exists) + $(info ✓ NCCL found, OK to train with multiple GPUs) NVCC_FLAGS += -DMULTI_GPU + NVCC_LDLIBS += -lnccl else - $(info ✗ OpenMPI is not found, disabling multi-GPU support) - $(info ---> On Linux you can try install OpenMPI with `sudo apt install openmpi-bin openmpi-doc libopenmpi-dev`) + $(info ✗ NCCL is not found, disabling multi-GPU support) + $(info ---> On Linux you can try install NCCL with `sudo apt install libnccl2 libnccl-dev`) endif endif endif +# Attempt to find and include OpenMPI on the system +OPENMPI_DIR ?= /usr/lib/x86_64-linux-gnu/openmpi +OPENMPI_LIB_PATH = $(OPENMPI_DIR)/lib/ +OPENMPI_INCLUDE_PATH = $(OPENMPI_DIR)/include/ +ifeq 
($(NO_USE_MPI), 1) + $(info → MPI is manually disabled) +else ifeq ($(shell [ -d $(OPENMPI_LIB_PATH) ] && [ -d $(OPENMPI_INCLUDE_PATH) ] && echo "exists"), exists) + $(info ✓ MPI enabled) + NVCC_INCLUDES += -I$(OPENMPI_INCLUDE_PATH) + NVCC_LDFLAGS += -L$(OPENMPI_LIB_PATH) + NVCC_LDLIBS += -lmpi + NVCC_FLAGS += -DUSE_MPI +else + $(info ✗ MPI not found) +endif + # Precision settings, default to bf16 but ability to override PRECISION ?= BF16 VALID_PRECISIONS := FP32 FP16 BF16 @@ -237,23 +262,24 @@ train_gpt2: train_gpt2.c test_gpt2: test_gpt2.c $(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) $^ $(LDLIBS) $(OUTPUT_FILE) -$(NVCC_CUDNN): cudnn_att.cpp - $(NVCC) -c $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_INCLUDES) +$(NVCC_CUDNN): llmc/cudnn_att.cpp + $(NVCC) -c $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_INCLUDES) -o $@ train_gpt2cu: train_gpt2.cu $(NVCC_CUDNN) - $(NVCC) $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) + $(NVCC) $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) train_gpt2fp32cu: train_gpt2_fp32.cu $(NVCC) $(NVCC_FLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) test_gpt2cu: test_gpt2.cu $(NVCC_CUDNN) - $(NVCC) $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) + $(NVCC) $(NVCC_FLAGS) $(PFLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) test_gpt2fp32cu: test_gpt2_fp32.cu $(NVCC) $(NVCC_FLAGS) $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) profile_gpt2cu: profile_gpt2.cu $(NVCC_CUDNN) - $(NVCC) $(NVCC_FLAGS) $(PFLAGS) -lineinfo $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) + $(NVCC) $(NVCC_FLAGS) $(PFLAGS) -lineinfo $^ $(NVCC_LDFLAGS) $(NVCC_INCLUDES) $(NVCC_LDLIBS) $(CUDA_OUTPUT_FILE) clean: - $(REMOVE_FILES) $(TARGETS) $(NVCC_CUDNN) + $(REMOVE_FILES) $(TARGETS) + $(REMOVE_BUILD_OBJECT_FILES) diff --git a/README.md b/README.md index b6c6128dd..a3536e0bf 100644 --- a/README.md +++ b/README.md @@ -6,33 +6,41 @@ LLMs in simple, pure C/CUDA with no need for 245MB of PyTorch or 107MB of cPytho The best introduction to the llm.c repo today is reproducing the GPT-2 (124M) model. [Discussion #481](https://github.com/karpathy/llm.c/discussions/481) steps through this in detail. We can reproduce other models from the GPT-2 and GPT-3 series in both llm.c and in the parallel implementation of PyTorch. Have a look at the [scripts README](scripts/README.md). +debugging tip: when you run the `make` command to build the binary, modify it by replacing `-O3` with `-g` so you can step through the code in your favorite IDE (e.g. vscode). + ## quick start (1 GPU, fp32 only) If you won't be training on multiple nodes, aren't interested in mixed precision, and are interested in learning CUDA, the fp32 (legacy) files might be of interest to you. These are files that were "checkpointed" early in the history of llm.c and frozen in time. They are simpler, more portable, and possibly easier to understand. 
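All of the CUDA binaries in this repo are built through the Makefile modified above; as a minimal sketch (illustrative values, assuming a Linux box with `nvcc` on the PATH), its main switches combine like this:

```bash
make train_gpt2cu                                # defaults: PRECISION=BF16, USE_CUDNN=0, NCCL/MPI auto-detected
make train_gpt2cu PRECISION=FP32                 # force fp32 kernels
make train_gpt2cu USE_CUDNN=1 CUDNN_FRONTEND_PATH=~/cudnn-frontend/include
make train_gpt2cu GPU_COMPUTE_CAPABILITY=80      # skip the device query and target a specific arch (80 = A100)
make train_gpt2cu NO_MULTI_GPU=1 NO_USE_MPI=1    # build without NCCL and without MPI
make clean                                       # also clears the new build/ object directory
```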
Run the 1 GPU, fp32 code like this: ```bash -pip install -r requirements.txt -python dev/data/tinyshakespeare.py -python train_gpt2.py +chmod u+x ./dev/download_starter_pack.sh +./dev/download_starter_pack.sh make train_gpt2fp32cu ./train_gpt2fp32cu ``` -The above lines (1) download the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset, tokenize it with the GPT-2 Tokenizer, (2) download and save the GPT-2 (124M) weights, (3) init from them in C/CUDA and train for one epoch on tineshakespeare with AdamW (using batch size 4, context length 1024, total of 74 steps), evaluate validation loss, and sample some text. +The download_starter_pack.sh script is a quick & easy way to get started and it downloads a bunch of .bin files that help get you off the ground. These contain: 1) the GPT-2 124M model saved in fp32 and in bfloat16, 2) a "debug state" used in unit testing (a small batch of data, and target activations and gradients), 3) the GPT-2 tokenizer, and 4) the tokenized [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset. Alternatively, instead of running the .sh script, you can re-create these artifacts manually as follows: + +```bash +pip install -r requirements.txt +python dev/data/tinyshakespeare.py +python train_gpt2.py +``` ## quick start (CPU) The "I am so GPU poor that I don't even have one GPU" section. You can still enjoy seeing llm.c train! But you won't go too far. Just like the fp32 version above, the CPU version is an even earlier checkpoint in the history of llm.c, back when it was just a simple reference implementation in C. For example, instead of training from scratch, you can finetune a GPT-2 small (124M) to output Shakespeare-like text, as an example: ```bash -pip install -r requirements.txt -python dev/data/tinyshakespeare.py -python train_gpt2.py +chmod u+x ./dev/download_starter_pack.sh +./dev/download_starter_pack.sh make train_gpt2 OMP_NUM_THREADS=8 ./train_gpt2 ``` -The above lines (1) download the [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset, tokenize it with the GPT-2 Tokenizer, (2) download and save the GPT-2 (124M) weights, (3) init from them in C and train for 40 steps on tineshakespeare with AdamW (using batch size 4, context length only 64), evaluate validation loss, and sample some text. Honestly, unless you have a beefy CPU (and can crank up the number of OMP threads in the launch command), you're not going to get that far on CPU training LLMs, but it might be a good demo/reference. +If you'd prefer to avoid running the starter pack script, then as mentioned in the previous section you can reproduce the exact same .bin files and artifacts by running `python dev/data/tinyshakespeare.py` and then `python train_gpt2.py`. + +The above lines (1) download an already tokenized [tinyshakespeare](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt) dataset, (2) download the GPT-2 (124M) weights, and (3) init from them in C and train for 40 steps on tinyshakespeare with AdamW (using batch size 4, context length only 64), evaluate validation loss, and sample some text. Honestly, unless you have a beefy CPU (and can crank up the number of OMP threads in the launch command), you're not going to get that far on CPU training LLMs, but it might be a good demo/reference. 
The output looks like this on my MacBook Pro (Apple Silicon M3 Max): ``` [GPT-2] @@ -126,12 +134,16 @@ sudo apt-get -y install libcudnn9-dev-cuda-12 On top of this you need the [cuDNN frontend](https://github.com/NVIDIA/cudnn-frontend/tree/main), but this is just header files. Simply clone the repo to your disk. The Makefile currently looks for it in either your home directory or the current directory. If you have put it elsewhere, add `CUDNN_FRONTEND_PATH=/path/to/your/cudnn-frontend/include` to the `make` command-line. -**multi-GPU training**. As of April 26, 2024 there is now also support for multi-GPU training using MPI and NCCL. Make sure you install MPI, e.g. on Linux: +## multi-GPU training + +Make sure you install MPI and NCCL, e.g. on Linux: ```bash sudo apt install openmpi-bin openmpi-doc libopenmpi-dev ``` +For NCCL follow the instructions from the [official website](https://developer.nvidia.com/nccl/nccl-download) (e.g. network installer) + and then: ```bash @@ -139,6 +151,23 @@ make train_gpt2cu mpirun -np ./train_gpt2cu ``` +or simply run one of our scripts under `./scripts/`. + +## multi-node training + +Make sure you've installed `NCCL` following instructions from [multi-GPU](#multi-gpu-training) section. + +There are 3 ways we currently support that allow you to run multi-node training: +1) Use OpenMPI to exchange nccl id and initialize NCCL. See e.g. `./scripts/multi_node/run_gpt2_124M_mpi.sh` script for details. +2) Use shared file system to init NCCL. See `./scripts/multi_node/run_gpt2_124M_fs.sbatch` script for details. +3) Use TCP sockets to init NCCL. See `./scripts/multi_node/run_gpt2_124M_tcp.sbatch` script for details. + +Note: +* If you're running in a slurm environment and your slurm doesn't support PMIx (which we assume will be a common situation given that `slurm-wlm` dropped PMIx support) you will have to use FS (2) or TCP (3) approach. To test whether your slurm supports PMIx run: `srun --mpi=list` and see whether you get `pmix` in the output. +* If you don't have slurm set up, you can kick off a multi-node run using `mpirun` - MPI (1). + +None of these 3 methods is superior, we just offer you options so that you can run in your specific environment. + ## experiments / sweeps Just as an example process to sweep learning rates on a machine with 4 GPUs on TinyStories. Run a shell script `sweep.sh` (after you of course `chmod u+x sweep.sh`): @@ -184,6 +213,9 @@ Lastly, I will be a lot more sensitive to complexity in the root folder of the p - [llm.cpp](https://github.com/gevtushenko/llm.c) by @[gevtushenko](https://github.com/gevtushenko): a port of this project using the [CUDA C++ Core Libraries](https://github.com/NVIDIA/cccl) - A presentation this fork was covered in [this lecture](https://www.youtube.com/watch?v=WiB_3Csfj_Q) in the [CUDA MODE Discord Server](https://discord.gg/cudamode) +- WebGPU C++ + - [gpu.cpp](https://github.com/AnswerDotAI/gpu.cpp) by @[austinvhuang](https://github.com/austinvhuang): a library for portable GPU compute in C++ using native WebGPU. Aims to be a general-purpose library, but also porting llm.c kernels to WGSL. 
+ - Go - [llm.go](https://github.com/joshcarp/llm.go) by @[joshcarp](https://github.com/joshcarp): a Go port of this project @@ -196,6 +228,9 @@ Lastly, I will be a lot more sensitive to complexity in the root folder of the p - Mojo - [llm.🔥](https://github.com/dorjeduck/llm.mojo) by @[dorjeduck](https://github.com/dorjeduck): a Mojo port of this project +- OpenCL + - [llm.c](https://github.com/krrishnarraj/llm.c) by @[krrishnarraj](https://github.com/krrishnarraj): an OpenCL port of this project + - Rust - [llm.rs](https://github.com/yijunyu/llm.rs) by @[Yijun Yu](https://github.com/yijunyu): a Rust rewrite with the aim to have same performance - [llm.rs](https://github.com/ToJen/llm.rs) by @[ToJen](https://github.com/ToJen): a Rust port of this project @@ -205,6 +240,12 @@ Lastly, I will be a lot more sensitive to complexity in the root folder of the p - Zig - [llm.zig](https://github.com/Saimirbaci/llm.zig) by @[saimirbaci](https://github.com/Saimirbaci): a Zig port of this project + +- Habana Gaudi2 + - [llm.tpc](https://github.com/abhilash1910/llm.tpc) by @[abhilash1910](https://github.com/abhilash1910): a Habana Gaudi2 port of this project + +- Nim + - [llm.nim](https://github.com/Vindaar/llm.nim) by @[Vindaar](https://github.com/Vindaar): a Nim port of this project ## discussions diff --git a/dev/cuda/Makefile b/dev/cuda/Makefile index 14eae201e..6a7584f8d 100644 --- a/dev/cuda/Makefile +++ b/dev/cuda/Makefile @@ -8,9 +8,21 @@ ifeq ($(NVCC),) $(error nvcc not found.) endif +ifneq ($(CI),true) # if not in CI, then use the GPU query + ifndef GPU_COMPUTE_CAPABILITY # set to defaults if: make GPU_COMPUTE_CAPABILITY= + GPU_COMPUTE_CAPABILITY = $(shell __nvcc_device_query) # assume if NVCC is present, then this likely is too + GPU_COMPUTE_CAPABILITY := $(strip $(GPU_COMPUTE_CAPABILITY)) + endif +endif + # Compiler flags -CFLAGS = -O3 --use_fast_math -NVCCFLAGS = -lcublas -lcublasLt +ifeq ($(GPU_COMPUTE_CAPABILITY),) # set to defaults if: make GPU_COMPUTE_CAPABILITY= + CFLAGS = -O3 --use_fast_math +else + CFLAGS = -O3 --use_fast_math --generate-code arch=compute_$(GPU_COMPUTE_CAPABILITY),code=[compute_$(GPU_COMPUTE_CAPABILITY),sm_$(GPU_COMPUTE_CAPABILITY)] +endif + +NVCCFLAGS = -lcublas -lcublasLt -std=c++17 MPI_PATHS = -I/usr/lib/x86_64-linux-gnu/openmpi/include -L/usr/lib/x86_64-linux-gnu/openmpi/lib/ # Default rule for our CUDA files @@ -18,8 +30,11 @@ MPI_PATHS = -I/usr/lib/x86_64-linux-gnu/openmpi/include -L/usr/lib/x86_64-linux- $(NVCC) $(CFLAGS) $(NVCCFLAGS) $< -o $@ # Build all targets -TARGETS = adamw attention_backward attention_forward classifier_fused crossentropy_forward crossentropy_softmax_backward encoder_backward encoder_forward gelu_backward gelu_forward layernorm_backward layernorm_forward matmul_backward matmul_backward_bias matmul_forward nccl_all_reduce residual_forward softmax_forward trimat_forward fused_residual_forward global_norm +TARGETS = adamw attention_backward attention_forward classifier_fused crossentropy_forward crossentropy_softmax_backward encoder_backward encoder_forward gelu_backward gelu_forward layernorm_backward layernorm_forward matmul_backward matmul_backward_bias matmul_forward nccl_all_reduce residual_forward softmax_forward trimat_forward fused_residual_forward global_norm permute + all: $(TARGETS) +all_ptx: $(TARGETS:%=%.ptx) +all_sass: $(TARGETS:%=%.sass) # Individual targets: forward pass attention_forward: attention_forward.cu @@ -50,10 +65,20 @@ matmul_backward: matmul_backward.cu adamw: adamw.cu global_norm: global_norm.cu +permute: 
permute.cu + # NCCL communication kernels nccl_all_reduce: nccl_all_reduce.cu $(NVCC) -lmpi -lnccl $(NVCCFLAGS) $(MPI_PATHS) nccl_all_reduce.cu -o nccl_all_reduce +# Generate PTX using cuobjdump +%.ptx: % + cuobjdump --dump-ptx $< > $@ + +# Generate SASS using cuobjdump +%.sass: % + cuobjdump --dump-sass $< > $@ + # Run all targets run_all: all @for target in $(TARGETS); do \ @@ -65,4 +90,4 @@ run_all: all # Clean up clean: - rm -f $(TARGETS) + rm -f $(TARGETS) *.ptx *.sass diff --git a/dev/cuda/README.md b/dev/cuda/README.md index d020cf606..22ad4d081 100644 --- a/dev/cuda/README.md +++ b/dev/cuda/README.md @@ -7,7 +7,7 @@ See the top of each file for how to compile and run the kernel. Alternatively, t For example, we can look at the top of `layernorm_forward.cu` to build the forward pass kernels for the LayerNorm: ```bash -nvcc -O3 --use_fast_math layernorm_forward.cu -o layernorm_forward +nvcc -O3 --use_fast_math -lcublas -lcublasLt layernorm_forward.cu -o layernorm_forward ``` or simply diff --git a/dev/cuda/adamw.cu b/dev/cuda/adamw.cu index 20a6560dd..74dfc2ee2 100644 --- a/dev/cuda/adamw.cu +++ b/dev/cuda/adamw.cu @@ -159,7 +159,7 @@ int main(int argc, char **argv) { // create random data on host (to be used for the CPU reference implementation) float* params_memory = make_random_float(num_parameters); float* grads_memory = make_random_float(num_parameters); - float* m_memory = make_random_float_01(num_parameters); + float* m_memory = make_random_float(num_parameters); float* v_memory = make_random_float_01(num_parameters); // move to GPU diff --git a/dev/cuda/attention_backward.cu b/dev/cuda/attention_backward.cu index c97dbeee8..936014281 100644 --- a/dev/cuda/attention_backward.cu +++ b/dev/cuda/attention_backward.cu @@ -68,7 +68,7 @@ void attention_forward_cpu(float* out, float* preatt, float* att, float* att_bth = att + b*NH*T*T + h*T*T + t*T; // pass 1: calculate query dot key and maxval - float maxval = -10000.0f; // TODO something better + float maxval = -FLT_MAX; for (int t2 = 0; t2 < T; t2++) { // used to be t2 <= t float* key_t2 = inp + b * T * C3 + t2 * C3 + h * hs + C; // +C because it's key @@ -1137,6 +1137,7 @@ int main(int argc, char **argv) { free(dinp); free(dpreatt); free(datt); + free(h_dinp); cudaCheck(cudaFree(d_inp)); cudaCheck(cudaFree(d_qkvr)); cudaCheck(cudaFree(d_preatt)); diff --git a/dev/cuda/attention_forward.cu b/dev/cuda/attention_forward.cu index b632b4a66..ca75762be 100644 --- a/dev/cuda/attention_forward.cu +++ b/dev/cuda/attention_forward.cu @@ -98,7 +98,7 @@ void attention_forward_cpu(float* out, float* preatt, float* att, float* att_bth = att + b*NH*T*T + h*T*T + t*T; // pass 1: calculate query dot key and maxval - float maxval = -10000.0f; // TODO something better + float maxval = -FLT_MAX; for (int t2 = 0; t2 <= t; t2++) { const float* key_t2 = inp + b * T * C3 + t2 * C3 + h * hs + C; // +C because it's key @@ -203,7 +203,7 @@ __global__ void attention_softmax_kernel1(float* att, const float* preatt, float* att_bth = att + b*NH*T*T + h*T*T + t*T; // find maxval - float maxval = -10000.0f; // TODO something better + float maxval = -FLT_MAX; for (int t2 = 0; t2 <= t; t2++) { if (preatt_bth[t2] > maxval) { maxval = preatt_bth[t2]; @@ -1377,6 +1377,7 @@ int main(int argc, char **argv) { cudaCheck(cudaFree(d_preatt)); cudaCheck(cudaFree(d_att)); cudaCheck(cudaFree(d_inp)); + cudaCheck(cudaFree(d_stats)); cublasDestroy(cublas_handle); #ifdef ENABLE_CUDNN diff --git a/dev/cuda/benchmark_on_modal.py b/dev/cuda/benchmark_on_modal.py index 
7a055ec9f..907a831ad 100644 --- a/dev/cuda/benchmark_on_modal.py +++ b/dev/cuda/benchmark_on_modal.py @@ -1,46 +1,53 @@ """ Script for running benchmarks on the Modal platform. This is useful for folks who do not have access to expensive GPUs locally. - -Example usage: +Example usage for cuda kernels: GPU_MEM=80 modal run benchmark_on_modal.py \ --compile-command "nvcc -O3 --use_fast_math attention_forward.cu -o attention_forward -lcublas" \ --run-command "./attention_forward 1" +OR if you want to use cuDNN etc. -This will mount the contents of the current directory to the remote container on modal, -compile the `attention_forward.cu` file with `nvcc`, and run the resulting binary on a A100 GPU with 80GB of memory. -""" +For training the gpt2 model with cuDNN use: +GPU_MEM=80 modal run dev/cuda/benchmark_on_modal.py \ + --compile-command "make train_gpt2cu USE_CUDNN=1" + --run-command "./train_gpt2cu -i dev/data/tinyshakespeare/tiny_shakespeare_train.bin -j dev/data/tinyshakespeare/tiny_shakespeare_val.bin -v 250 -s 250 -g 144 -f shakespeare.log -b 4" + + +For profiling using nsight system: +GPU_MEM=80 modal run dev/cuda/benchmark_on_modal.py \ + --compile-command "make train_gpt2cu USE_CUDNN=1" \ + --run-command "nsys profile --cuda-graph-trace=graph --python-backtrace=cuda --cuda-memory-usage=true \ + ./train_gpt2cu -i dev/data/tinyshakespeare/tiny_shakespeare_train.bin \ + -j dev/data/tinyshakespeare/tiny_shakespeare_val.bin -v 250 -s 250 -g 144 -f shakespeare.log -b 4" + +For more nsys profiling specifics and command options, take a look at: https://docs.nvidia.com/nsight-systems/2024.2/UserGuide/ +-> To profile the report using a GUI, download NVIDIA NSight System GUI version (this software can run on all OS, so you download it locally) + +NOTE: Currently there is a bug in the profiling using nsight system which produces a unrecognized GPU UUId error on the command line but it +does not actually interfere with the model training and validation. 
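A complete profiling round trip, combining the commands above with the volume handling described in the comments further down in this script, might look roughly like this (the volume name matches the one hard-coded below; the report folder name is a placeholder):

```bash
modal volume create cuda-env
GPU_MEM=80 modal run dev/cuda/benchmark_on_modal.py \
    --compile-command "make train_gpt2cu USE_CUDNN=1" \
    --run-command "nsys profile --cuda-graph-trace=graph --python-backtrace=cuda --cuda-memory-usage=true \
    ./train_gpt2cu -i dev/data/tinyshakespeare/tiny_shakespeare_train.bin \
    -j dev/data/tinyshakespeare/tiny_shakespeare_val.bin -v 250 -s 250 -g 144 -f shakespeare.log -b 4"
modal volume get cuda-env report1_<timestamp>   # exact folder name is generated at run time
```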
The report (that you download) is still generated and can be viewed from Nsight Systems +""" import subprocess import os import sys +import datetime import modal from modal import Image, Stub - GPU_NAME_TO_MODAL_CLASS_MAP = { "H100": modal.gpu.H100, "A100": modal.gpu.A100, "A10G": modal.gpu.A10G, } - N_GPUS = int(os.environ.get("N_GPUS", 1)) GPU_MEM = int(os.environ.get("GPU_MEM", 40)) GPU_NAME = os.environ.get("GPU_NAME", "A100") -GPU_CONFIG = GPU_NAME_TO_MODAL_CLASS_MAP[GPU_NAME](count=N_GPUS, size=str(GPU_MEM)+'GB') +GPU_CONFIG = GPU_NAME_TO_MODAL_CLASS_MAP[GPU_NAME](count=N_GPUS, size=str(GPU_MEM) + 'GB') APP_NAME = "llm.c benchmark run" -# We don't actually need to use the Axolotl image here, but it's reliable -AXOLOTL_REGISTRY_SHA = ( - "d5b941ba2293534c01c23202c8fc459fd2a169871fa5e6c45cb00f363d474b6a" -) -axolotl_image = ( - Image.from_registry(f"winglian/axolotl@sha256:{AXOLOTL_REGISTRY_SHA}") - .run_commands( - "git clone https://github.com/OpenAccess-AI-Collective/axolotl /root/axolotl", - "cd /root/axolotl && git checkout v0.4.0", - ) +image = ( + Image.from_registry("totallyvyom/cuda-env:latest-2") .pip_install("huggingface_hub==0.20.3", "hf-transfer==0.1.5") .env( dict( @@ -49,34 +56,66 @@ TQDM_DISABLE="true", ) ) + .run_commands( + "wget -q https://github.com/Kitware/CMake/releases/download/v3.28.1/cmake-3.28.1-Linux-x86_64.sh", + "bash cmake-3.28.1-Linux-x86_64.sh --skip-license --prefix=/usr/local", + "rm cmake-3.28.1-Linux-x86_64.sh", + "ln -s /usr/local/bin/cmake /usr/bin/cmake",) + .run_commands( + "apt-get install -y --allow-change-held-packages libcudnn8 libcudnn8-dev", + "apt-get install -y openmpi-bin openmpi-doc libopenmpi-dev kmod sudo", + "git clone https://github.com/NVIDIA/cudnn-frontend.git /root/cudnn-frontend", + "cd /root/cudnn-frontend && mkdir build && cd build && cmake .. && make" + ) + .run_commands( + "wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-ubuntu2204.pin && \ + mv cuda-ubuntu2204.pin /etc/apt/preferences.d/cuda-repository-pin-600 && \ + apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/3bf863cc.pub && \ + add-apt-repository \"deb https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/ /\" && \ + apt-get update" + ).run_commands( + "apt-get install -y nsight-systems-2023.3.3" + ) ) stub = modal.App(APP_NAME) - def execute_command(command: str): command_args = command.split(" ") print(f"{command_args = }") subprocess.run(command_args, stdout=sys.stdout, stderr=subprocess.STDOUT) - @stub.function( gpu=GPU_CONFIG, - image=axolotl_image, + image=image, allow_concurrent_inputs=4, container_idle_timeout=900, - # This copies everything in this folder to the remote root folder - mounts=[modal.Mount.from_local_dir("./", remote_path="/root/")] + mounts=[modal.Mount.from_local_dir("./", remote_path="/root/")], + # Instead of 'cuda-env' put your volume name that you create from 'modal volume create {volume-name}' + # This enables the profiling reports to be saved on the volume that you can download by using: + # 'modal volume get {volume-name} {/output_file_name} + # For example right now, when profiling using this command "nsys profile --trace=cuda,nvtx --cuda-graph-trace=graph --python-backtrace=cuda --cuda-memory-usage=true" you would get your report + # using in a directory in your volume, where the name contains the timestamp unique id. 
+ # This script will generate a "report1_{timestamp} folder in volume" + # and you can download it with 'modal volume get {volume-name} report1_{timestamp} + volumes={"/cuda-env": modal.Volume.from_name("cuda-env")}, ) def run_benchmark(compile_command: str, run_command: str): execute_command("pwd") execute_command("ls") execute_command(compile_command) execute_command(run_command) - return None + # Use this section if you want to profile using nsight system and install the reports on your volume to be locally downloaded + timestamp = datetime.datetime.now().strftime("%Y-%m-%d-%H-%M-%S") + execute_command("mkdir report1_" + timestamp) + execute_command("mv /root/report1.nsys-rep /root/report1_" + timestamp + "/") + execute_command("mv /root/report1.qdstrm /root/report1_" + timestamp + "/") + execute_command("mv /root/report1_" + timestamp + "/" + " /cuda-env/") + + return None @stub.local_entrypoint() def inference_main(compile_command: str, run_command: str): results = run_benchmark.remote(compile_command, run_command) - return results + return results \ No newline at end of file diff --git a/dev/cuda/classifier_fused.cu b/dev/cuda/classifier_fused.cu index 2125b874d..9556d5139 100644 --- a/dev/cuda/classifier_fused.cu +++ b/dev/cuda/classifier_fused.cu @@ -114,7 +114,7 @@ __device__ SoftmaxParams prepare_softmax(cg::thread_block_tile<32>& warp, int64_t idx, const float* inp, int V, int P) { // this warp (of 32) threads processes one row of inp, i.e. inp[idx, :] of shape (V,) // note that inp is actually (B * T, P) but we only use the first V elements - // this function tehen calculates: + // this function then calculates: // 1) the max value to subtract for numerical stability and // 2) the sum normalization factor const float* x = inp + idx * P; @@ -481,33 +481,6 @@ __global__ void fused_classifier_kernel4(floatX* dlogits, floatX* losses, floatX } } -// todo - move to common.h - or ideally somewhere it's not duplicated between train & common? -// requires all 32 threads in the warp to be active, but should work for any block size -// uses non-dynamic shared memory so every call increases shared memory requirements by 128 bytes -// the fact it's unique shared memory allows us to avoid an extra __syncthreads() call at the end -// but if called inside a loop, the shared memory will be implicitly reused, so set final_sync to 1 -using reduction_func_t = float (*) (float); -template -__device__ float blockReduce(float val, bool final_sync=false, float out_of_bounds=0.0f) { - // two reductions of up to 1024 threads: - // 1) inside warp (shuffle), 2) cross-warp (shared memory), 3) inside warp (shuffle) - __shared__ float shared_val[32]; - const int lane_id = threadIdx.x % 32; - const int warp_id = threadIdx.x / 32; - const int num_warps = blockDim.x / 32; - - float warp_val = warp_reduction(val); - if (lane_id == 0) { shared_val[warp_id] = warp_val; } - __syncthreads(); - warp_val = (lane_id < num_warps) ? shared_val[lane_id] : out_of_bounds; - float block_val = warp_reduction(warp_val); - - if (final_sync) { - __syncthreads(); // only needed in loops when effectively reusing shared memory etc. - } - return block_val; -} - __device__ SoftmaxParams prepare_softmax_blockwide3(int64_t idx, const floatX* inp, int V, int P) { // same but not float4 // one row of inp, i.e. 
inp[idx, :] of shape (V,) @@ -664,7 +637,7 @@ void fused_classifier5(float* dlogits, float* losses, int B, int T, int V, int P, int block_size) { const int N = B * T; const int grid_size = N; - fused_classifier_kernel5<<>>((floatX*)dlogits, (floatX*)losses, NULL, (floatX*)logits, (floatX*)dlosses, targets, B, T, V, P); + fused_classifier_kernel5<<>>((floatX*)dlogits, (floatX*)losses, NULL, (floatX*)logits, (floatX*)dlosses, targets, B, T, V, P); cudaCheck(cudaGetLastError()); } @@ -707,8 +680,8 @@ int main(int argc, char **argv) { cudaCheck(cudaSetDevice(deviceIdx)); // create host memory of random numbers - float* logits = make_random_float_01(B * T * V); - float* probs = (float*)malloc(B * T * V * sizeof(float)); + float* logits = make_random_float(B * T * V); + float* probs = make_random_float_01(B * T * V); float* dlogits = (float*)malloc(B * T * V * sizeof(float)); float* losses = (float*)malloc(B * T * sizeof(float)); float* dlosses = make_random_float(B * T); @@ -787,11 +760,13 @@ int main(int argc, char **argv) { free(losses); free(dlosses); free(targets); + free(outliers); cudaCheck(cudaFree(d_dlogits)); cudaCheck(cudaFree(d_losses)); cudaCheck(cudaFree(d_logits)); cudaCheck(cudaFree(d_dlosses)); cudaCheck(cudaFree(d_targets)); + cudaCheck(cudaFree(d_dlogits_no_pad)); return 0; } \ No newline at end of file diff --git a/dev/cuda/common.h b/dev/cuda/common.h index 6502baa20..61a783a60 100644 --- a/dev/cuda/common.h +++ b/dev/cuda/common.h @@ -5,6 +5,8 @@ #include #include +#define WARP_SIZE 32U +extern cudaDeviceProp deviceProp; template __host__ __device__ T ceil_div(T dividend, T divisor) { @@ -18,6 +20,39 @@ __device__ float warpReduceSum(float val) { return val; } +// requires all 32 threads in the warp to be active, but should work for any block size +// uses non-dynamic shared memory so every call increases shared memory requirements by 128 bytes +// the fact it's unique shared memory allows us to avoid an extra __syncthreads() call at the end +// but if called inside a loop, the shared memory will be implicitly reused, so set final_sync to 1 +using reduction_func_t = float (*) (float); + +template +__device__ inline float blockReduce(float val, bool final_sync, float out_of_bounds) { + // two reductions of up to 1024 threads: + // 1) inside warp (shuffle), 2) cross-warp (shared memory), 3) inside warp (shuffle) + __shared__ float shared_val[WARP_SIZE]; + const int lane_id = threadIdx.x % WARP_SIZE; + const int warp_id = threadIdx.x / WARP_SIZE; + const int num_warps = blockDim.x / WARP_SIZE; + + float warp_val = warp_reduction(val); + if (lane_id == 0) { shared_val[warp_id] = warp_val; } + __syncthreads(); + warp_val = (lane_id < num_warps) ? shared_val[lane_id] : out_of_bounds; + float block_val = warp_reduction(warp_val); + + if (final_sync) { + __syncthreads(); // only needed in loops when effectively reusing shared memory etc. 
+ } + return block_val; +} + +// Helper function to call blockReduce with default arguments +template +__device__ inline float blockReduce(float val) { + return blockReduce(val, false, 0.0f); +} + // ---------------------------------------------------------------------------- // checking utils diff --git a/dev/cuda/crossentropy_softmax_backward.cu b/dev/cuda/crossentropy_softmax_backward.cu index 27521bf60..65c72a2f1 100644 --- a/dev/cuda/crossentropy_softmax_backward.cu +++ b/dev/cuda/crossentropy_softmax_backward.cu @@ -99,7 +99,7 @@ int main(int argc, char **argv) { cudaCheck(cudaSetDevice(deviceIdx)); // create host memory of random numbers - float* probs = make_random_float(B * T * V); + float* probs = make_random_float_01(B * T * V); int* targets = make_random_int(B * T, V); float* dlosses = make_random_float(B * T); float* dlogits = make_zeros_float(B * T * V); diff --git a/dev/cuda/encoder_backward.cu b/dev/cuda/encoder_backward.cu index 53221878e..7c14d0eba 100644 --- a/dev/cuda/encoder_backward.cu +++ b/dev/cuda/encoder_backward.cu @@ -163,14 +163,17 @@ int main(int argc, char **argv) { } printf("Using kernel %d\n", kernel_num); - // set up block sizes + // first check the correctness of the kernel + encoder_backward_cpu(dwte, dwpe, dout, inp, B, T, C); + + // time the kernel at different block sizes int block_sizes[] = {32, 64, 128, 256, 512, 1024}; - // first check the correctness of the kernel for (int j = 0; j < sizeof(block_sizes) / sizeof(int); j++) { int block_size = block_sizes[j]; + cudaCheck(cudaMemset(d_dwte, 0, V * C * sizeof(float))); + cudaCheck(cudaMemset(d_dwpe, 0, T * C * sizeof(float))); printf("Checking block size %d.\n", block_size); - encoder_backward_cpu(dwte, dwpe, dout, inp, B, T, C); encoder_backward(kernel_num, d_dwte, d_dwpe, d_dout, d_inp, B, T, C, block_size); validate_result(d_dwte, dwte, "dwte", V * C, 1e-5f); validate_result(d_dwpe, dwpe, "dwpe", T * C, 1e-5f); diff --git a/dev/cuda/fused_residual_forward.cu b/dev/cuda/fused_residual_forward.cu index b98a67c4b..9752873db 100644 --- a/dev/cuda/fused_residual_forward.cu +++ b/dev/cuda/fused_residual_forward.cu @@ -133,7 +133,7 @@ __global__ void fused_residual_forward2(floatX* residual, floatX* normed, floatX for(int c = 0; c < C; ++c) { float out = (float)inp1[c] + (float)inp2[c]; m += out; - residual[c] = out; + residual[c] = (floatX)out; } m = m / C; @@ -149,11 +149,11 @@ __global__ void fused_residual_forward2(floatX* residual, floatX* normed, floatX for (int c = 0; c < C; c++) { float n = (s * ((float)residual[c] - m)); // normalized output float o = n * (float)weight[c] + (float)bias[c]; // scale and shift it - normed[c] = o; // write + normed[c] = (floatX)o; // write } // cache the mean and rstd for the backward pass later - mean[idx] = m; - rstd[idx] = s; + mean[idx] = (floatX)m; + rstd[idx] = (floatX)s; } // handle one token per warp for coalesced access @@ -232,7 +232,7 @@ __global__ void fused_residual_forward_kernel4(floatX* residual, floatX* normed, const x128 in2 = load128cs(inp2 + c); x128 out; for(int k = 0; k < x128::size; ++k) { - out[k] = (float)in1[k] + (float)in2[k]; + out[k] = (floatX)((float)in1[k] + (float)in2[k]); sum += (float)out[k]; sum_sq += (float)out[k] * (float)out[k]; } @@ -309,7 +309,7 @@ __global__ void fused_residual_forward_kernel5(floatX* residual, floatX* normed, const x128 in2 = load128cs(inp2 + c); x128 out; for(int k = 0; k < x128::size; ++k) { - out[k] = (float)in1[k] + (float)in2[k]; + out[k] = (floatX)((float)in1[k] + (float)in2[k]); sum += (float)out[k]; 
} store128cs(residual + c, out); @@ -372,8 +372,8 @@ __global__ void fused_residual_forward_kernel6(floatX* residual, floatX* normed, // weights and biases are shared among all tokens x128* s_weight = reinterpret_cast(params); x128* s_bias = reinterpret_cast(params + C * sizeof(floatX)); - // residual output (input to layernorm) is indpendent for each sub-block indicates by threadIdx.z - x128* s_res = reinterpret_cast(params + (2 + threadIdx.z) * C * sizeof(floatX) ); + // residual output (input to layernorm) is independent for each sub-block indicates by threadIdx.z + x128* s_res = reinterpret_cast(params + (2 + threadIdx.z) * C * sizeof(floatX)); // similarly, each sub-block needs its own reduction buffers float* s_mean = reinterpret_cast(params + (2 + blockDim.z) * C * sizeof(floatX) + threadIdx.z * 32 * sizeof(float)); float* s_var = reinterpret_cast(params + (2 + blockDim.z) * C * sizeof(floatX) + 32 * sizeof(float) * (blockDim.z + threadIdx.z)); @@ -385,10 +385,10 @@ __global__ void fused_residual_forward_kernel6(floatX* residual, floatX* normed, s_weight[c / x128::size] = load128(weight + c); s_bias[c / x128::size] = load128(bias + c); } + // the block-level reductions will cause sync before the first time we read these // => no syncthreads needed here - // loop over all tokens for(int tidx = blockIdx.x * blockDim.z + threadIdx.z; tidx < N; tidx += gridDim.x * blockDim.z) { // adjust pointers to current token diff --git a/dev/cuda/global_norm.cu b/dev/cuda/global_norm.cu index 6c2ed0389..f54a35a42 100644 --- a/dev/cuda/global_norm.cu +++ b/dev/cuda/global_norm.cu @@ -16,6 +16,7 @@ nvcc -O3 --use_fast_math global_norm.cu -o global_norm #define ENABLE_BF16 #include "common.h" +cudaDeviceProp deviceProp; float global_norm_cpu(const float* data, size_t count) { // accumulate in double so we have an accurate numerical reference @@ -89,6 +90,54 @@ __global__ void norm_kernel2(float* out, const T* data, size_t count) { } } +template +__global__ void norm_kernel3(float* out, const T* data, size_t count) { + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + size_t grid_width = blockDim.x * gridDim.x; + float accumulator = 0.f; + for(size_t i = index; i < count; i += grid_width) { + accumulator += (float)data[i] * (float)data[i]; + } + // block-level reduce + float block_sum = blockReduce(accumulator); + if(threadIdx.x == 0) { + atomicAdd(out, block_sum); + } +} + +// Same as kernel3 but without atomic adds -> this allows us to have determinism due to the +// non associativity of floating point operations. Roughly same performance as kernel3. +template +__global__ void norm_kernel4(float* out, const T* data, size_t count) { + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + size_t grid_width = blockDim.x * gridDim.x; + float accumulator = 0.f; + for(size_t i = index; i < count; i += grid_width) { + accumulator += (float)data[i] * (float)data[i]; + } + // block-level reduce + float block_sum = blockReduce(accumulator); + // each block accumulates its partial sum to out[blockIdx.x] + // we want to avoid using atomic add here so we combine this kernel with the aggregate kernel call + // that sums up the partial block sums + if(threadIdx.x == 0) { + out[blockIdx.x] = block_sum; + } +} + +__global__ void global_norm_aggregate_kernel(float* out, size_t count) { + size_t index = threadIdx.x; + // grab block sums from the previous kernel, use 0. as the neutral sum element + float block_sum = (index < count) ? 
out[index] : 0.f; + float sum = blockReduce(block_sum); + if(threadIdx.x == 0) { + out[0] = sum; // out[0] ends up with the final norm squared + } +} + +// ---------------------------------------------------------------------------- +// kernel launchers + template void global_norm1(float* out, const T* values, size_t count, int block_size) { // launch just enough blocks to fill the grid. deliberately no DIV_CEIL. @@ -111,17 +160,54 @@ void global_norm2(float* out, const T* values, size_t count, int block_size) { cudaCheck(cudaGetLastError()); } +template +void global_norm3(float* out, const T* values, size_t count, int block_size) { + // launch just enough blocks to fill the grid. deliberately no DIV_CEIL. + // having one block less than possible is a tiny performance hit, having + // one block too many is catastrophic, since it only can start once all the other + // blocks finish. anyway, I think cuda_threads_per_SM should be a multiple of 512 + // on all gpus, so the division really is going to be exact. + const int grid_size = deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount / block_size; + assert(grid_size > 0); // gives a better error than letting the call below fail + norm_kernel3<<>>(out, values, count); + cudaCheck(cudaGetLastError()); +} + +template +void global_norm4(float* out, const T* values, size_t count, int block_size) { + if (block_size <= 64) { + block_size = 128; // to avoid triggering the assert below + } + // launch just enough blocks to fill the grid. deliberately no DIV_CEIL. + // having one block less than possible is a tiny performance hit, having + // one block too many is catastrophic, since it only can start once all the other + // blocks finish. anyway, I think cuda_threads_per_SM should be a multiple of 512 + // on all gpus, so the division really is going to be exact. 
+ const int grid_size = deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount / block_size; + assert(grid_size > 0); // gives a better error than letting the call below fail + assert(grid_size < 1024); // we want to later accumulate the block sums in a single block + norm_kernel4<<>>(out, values, count); + cudaCheck(cudaGetLastError()); + global_norm_aggregate_kernel<<<1, 1024>>>(out, grid_size); + cudaCheck(cudaGetLastError()); +} + void global_norm(int kernel_num, float* out, const floatX* values, size_t count, int block_size) { switch (kernel_num) { case 1: return global_norm1(out, values, count, block_size); case 2: return global_norm2(out, values, count, block_size); + case 3: + return global_norm3(out, values, count, block_size); + case 4: + return global_norm4(out, values, count, block_size); } } int main(int argc, const char **argv) { setup_main(); + cudaGetDeviceProperties(&deviceProp, 0); int C = 768; int L = 12; @@ -148,7 +234,7 @@ int main(int argc, const char **argv) { // move to GPU float* d_out; floatX* d_inp; - cudaCheck(cudaMalloc(&d_out, sizeof(float))); + cudaCheck(cudaMalloc(&d_out, 1024 * sizeof(float))); // 1024 needed for kernel 4 cudaCheck(cudaMalloc(&d_inp, num_params * sizeof(floatX))); cudaCheck(memcpy_convert(d_inp, inp, num_params)); diff --git a/dev/cuda/layernorm_backward.cu b/dev/cuda/layernorm_backward.cu index dc9d7e982..3930cecdd 100644 --- a/dev/cuda/layernorm_backward.cu +++ b/dev/cuda/layernorm_backward.cu @@ -874,7 +874,6 @@ __global__ void layernorm_backward_kernel9(floatX* dinp, floatX* dweight, floatX } __trap(); // prefer to crash here than run into a deadlock later on } - constexpr int WARP_SIZE = 32; int BLOCK_SIZE = blockDim.x; int warpsInBlock = BLOCK_SIZE / WARP_SIZE; //number of warps in block extern __shared__ float shared[]; // size = 2 * C + 1 @@ -1059,7 +1058,6 @@ layernorm_backward_kernel10(floatX* dinp, floatX* dweight, floatX* dbias, float* const floatX* dout, const floatX* inp, const floatX* weight, const floatX* mean, const floatX* rstd, int B, int T, int C) { - constexpr int WARP_SIZE = 32; int BLOCK_SIZE = blockDim.x; int warpsInBlock = BLOCK_SIZE / WARP_SIZE; //number of warps in block extern __shared__ float shared[]; // size = 2 * C + 1 diff --git a/dev/cuda/layernorm_forward.cu b/dev/cuda/layernorm_forward.cu index 3e948289a..0c4675162 100644 --- a/dev/cuda/layernorm_forward.cu +++ b/dev/cuda/layernorm_forward.cu @@ -28,7 +28,6 @@ verstion 5 allocates blocks per row instead of warps per row, same alg as 4 othe #include #include #include "common.h" - // ---------------------------------------------------------------------------- // CPU code reference @@ -290,7 +289,7 @@ __global__ void layernorm_forward_kernel5(float* __restrict__ out, float* __rest int num_warps = blockDim.x / 32; int warp_id = threadIdx.x / 32; int lane_id = threadIdx.x % 32; - int idx = blockIdx.x; // simpoy one block per row + int idx = blockIdx.x; // simply one block per row // the row of input that this group of threads is responsible for const float* x = inp + idx * C; // thread coarsening through the row, reduce the sum in series @@ -337,6 +336,82 @@ __global__ void layernorm_forward_kernel5(float* __restrict__ out, float* __rest } } +// Inspired by `fused_residual_forward_kernel5` in fused_residual_forward.cu +__global__ void layernorm_forward_kernel6(float* __restrict__ out, float* __restrict__ mean, float* __restrict__ rstd, + const float* __restrict__ inp, const float* __restrict__ weight, + const float* __restrict__ bias, int N, 
int C) { + assert(blockDim.x == WARP_SIZE); + + // load weights and biases into shared memory + // do this before we allow any threads to exit! + extern __shared__ char params[]; + // load128/store128 sometimes generated multiple instructions when the types here were floatX*, so + // let's keep everything as x128 + x128* s_weight = reinterpret_cast(params); + x128* s_bias = reinterpret_cast(params) + (C / x128::size); + x128* s_in = reinterpret_cast(params) + ((2 + threadIdx.y) * C / x128::size); + + int sidx = (threadIdx.x + WARP_SIZE * threadIdx.y) * x128::size; + for(int i = sidx; i < C; i += blockDim.y * WARP_SIZE * x128::size) { + s_weight[i/x128::size] = load128(weight + i); + s_bias[i/x128::size] = load128(bias + i); + } + __syncthreads(); + + int idx = blockIdx.x * blockDim.y + threadIdx.y; + if(idx >= N) { return; } // guard + + // adjust pointers to current token + inp += idx * C; + out += idx * C; + + const float eps = 1e-5f; + float sum = 0.0f; + for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) { + const x128 in_data = load128cs(inp + c); + for(int k = 0; k < x128::size; ++k) { + sum += (float)in_data[k]; + } + s_in[c / x128::size] = in_data; + } + + sum = warpReduceSum(sum); + float m = sum / C; + float v = 0.f; + + for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) { + const x128 in_data = s_in[c / x128::size]; + for(int k = 0; k < x128::size; ++k) { + v += ((float)in_data[k] - m) * ((float)in_data[k] - m); + } + } + + v = warpReduceSum(v) / C; + float s = rsqrtf(v + eps); + + for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) { + const x128 in_data = s_in[c / x128::size]; + const x128 w = s_weight[c / x128::size]; + const x128 b = s_bias[c / x128::size]; + x128 out_data; + for(int k = 0; k < x128::size; ++k) { + float n = s * ((float)in_data[k] - m); // normalized output + float o = n * (float)w[k] + (float)b[k]; // scale and shift it + out_data[k] = o; + } + + store128cs(out + c, out_data); + } + // cache the mean and rstd for the backward pass later + if(threadIdx.x == 0 && mean != nullptr) { + __stcs(mean + idx, m); + } + // store the rstd, no need to cache it + if(threadIdx.x == 0 && rstd != nullptr) { + __stcs(rstd + idx, s); + } +} + // ---------------------------------------------------------------------------- // kernel launcher @@ -356,9 +431,9 @@ void layernorm_forward2(float* out, float* mean, float* rstd, const int block_size) { int N = B * T; // in mean and rstd, threads cooperate within blocks via reductions - mean_kernel<<>>(mean, inp, N, C, block_size); + mean_kernel<<>>(mean, inp, N, C, block_size); cudaCheck(cudaGetLastError()); - rstd_kernel<<>>(rstd, inp, mean, N, C, block_size); + rstd_kernel<<>>(rstd, inp, mean, N, C, block_size); cudaCheck(cudaGetLastError()); // in the normalization, everything just gets flattened out const int block_size2 = 256; @@ -394,12 +469,38 @@ void layernorm_forward5(float* out, float* mean, float* rstd, int B, int T, int C, const int block_size) { assert(block_size % 32 == 0); + assert(block_size <= 1024); const int N = B * T; const int grid_size = N; layernorm_forward_kernel5<<>>(out, mean, rstd, inp, weight, bias, N, C); cudaCheck(cudaGetLastError()); } +void layernorm_forward6(float* out, float* mean, float* rstd, + const float* inp, const float* weight, const float* bias, + int B, int T, int C, + int block_size) { + assert(block_size % 32 == 0); + const int N = B * T; + int block_y = block_size / WARP_SIZE; + const int grid_size = ceil_div(N, block_y); 
+ size_t smem = (2 + block_y) * C * sizeof(float); + + // in order to use more than 48 KiB of smem, need to call cudaFuncSetAttribute + // this may fail, in which case we fall back to the smem free implementation. + cudaCheck(cudaGetLastError()); + auto status = cudaFuncSetAttribute(layernorm_forward_kernel6, cudaFuncAttributeMaxDynamicSharedMemorySize, smem); + cudaGetLastError(); + if (status == cudaSuccess) { + layernorm_forward_kernel6<<>>(out, mean, rstd, inp, weight, bias, N, C); + } else { + const int grid_size = N; + // fall back to the version without shared memory + layernorm_forward_kernel5<<>>(out, mean, rstd, inp, weight, bias, N, C); + } + cudaCheck(cudaGetLastError()); +} + // kernel version dispatch void layernorm_forward(int kernel_num, float* out, float* mean, float* rstd, @@ -422,6 +523,9 @@ void layernorm_forward(int kernel_num, case 5: layernorm_forward5(out, mean, rstd, inp, weight, bias, B, T, C, block_size); break; + case 6: + layernorm_forward6(out, mean, rstd, inp, weight, bias, B, T, C, block_size); + break; default: printf("Invalid kernel number\n"); exit(1); @@ -473,9 +577,6 @@ int main(int argc, char **argv) { printf("Using kernel %d\n", kernel_num); int block_sizes[] = {32, 64, 128, 256, 512, 1024}; - float* out_gpu = (float*)malloc(B * T * C * sizeof(float)); - float* mean_gpu = (float*)malloc(B * T * sizeof(float)); - float* rstd_gpu = (float*)malloc(B * T * sizeof(float)); layernorm_forward_cpu(out, mean, rstd, inp, weight, bias, B, T, C); diff --git a/dev/cuda/matmul_backward.cu b/dev/cuda/matmul_backward.cu index dece1f6dc..6f4222b11 100644 --- a/dev/cuda/matmul_backward.cu +++ b/dev/cuda/matmul_backward.cu @@ -268,12 +268,14 @@ int main(int argc, char **argv) { free(dout); free(inp); free(weight); + free(ones); cudaCheck(cudaFree(d_dinp)); cudaCheck(cudaFree(d_dweight)); cudaCheck(cudaFree(d_dbias)); cudaCheck(cudaFree(d_dout)); cudaCheck(cudaFree(d_inp)); cudaCheck(cudaFree(d_weight)); + cudaCheck(cudaFree(d_ones)); cublasCheck(cublasDestroy(cublas_handle)); return 0; diff --git a/dev/cuda/matmul_backward_bias.cu b/dev/cuda/matmul_backward_bias.cu index 16172bcf2..86fd37379 100644 --- a/dev/cuda/matmul_backward_bias.cu +++ b/dev/cuda/matmul_backward_bias.cu @@ -2,7 +2,7 @@ Kernels for matmul backward pass bias only. 
Compile example: -nvcc -O3 -lcublas -lcublasLt matmul_backward_bias.cu -lineinfo -o matmul_backward_bias +nvcc -O3 -lcublas -lcublasLt -std=c++17 matmul_backward_bias.cu -lineinfo -o matmul_backward_bias ./matmul_backward_bias 1 ./matmul_backward_bias 2 @@ -92,7 +92,7 @@ __global__ void matmul_backward_bias_kernel1(floatX* dbias, const floatX* dout, } // write the final result (at thread 0) to global memory if (tid == 0) { - dbias[o] = (float)dbias[o] + shared[0]; + dbias[o] = (floatX)((float)dbias[o] + shared[0]); } } @@ -116,7 +116,7 @@ __global__ void matmul_backward_bias_kernel2(floatX* dbias, const floatX* dout, sum = cg::reduce(warp, sum, cg::plus{}); // write the result to output (global memory) if(warp.thread_rank() == 0) { - dbias[idx] += sum; + dbias[idx] = (float)dbias[idx] + sum; } } @@ -132,12 +132,13 @@ __global__ void matmul_backward_bias_kernel3(floatX* dbias, const floatX* dout, int warp_id = threadIdx.x / 32; int lane_id = threadIdx.x % 32; int idx = blockIdx.x; // simply one block per row - // round 1: thread coarsening to reduce the problem size from B*T to 32 + // round 1: thread coarsening to reduce the problem size from B*T to block_size float thread_sum = 0.0f; for(int i = threadIdx.x; i < BT; i += blockDim.x) { thread_sum += (float)dout[i * OC + idx]; } // now do a warp-level reduce to get the sum across the 32 threads in each warp + // reduce the problem size from block_size to block_size/32 i.e. `num_warps` float warp_sum = cg::reduce(warp, thread_sum, cg::plus{}); // store the warp sum in shared memory (we could have lane_id == 0 guard but not needed) shared_sum[warp_id] = warp_sum; @@ -148,7 +149,7 @@ __global__ void matmul_backward_bias_kernel3(floatX* dbias, const floatX* dout, float block_sum = cg::reduce(warp, warp_sum, cg::plus{}); // sum(x) // write the result to output (global memory) if(threadIdx.x == 0) { - dbias[idx] += block_sum; + dbias[idx] = (float)dbias[idx] + block_sum; } } @@ -167,7 +168,7 @@ __global__ void matmul_backward_bias_kernel4(floatX* dbias, const floatX* dout, const int vstep = blockDim.x / warpSize; // number of warps in a block, e.g. 4 // pointer to the start of the column for one lane of threads - // so e.g. 4 threads (of the same lane_id) will reduce this one column + // so e.g. 
4 (`vstep`) threads (of the same lane_id) will reduce this one column const floatX* dout_col = dout + tl + lane_id; // column reductions by looping through the rows @@ -188,7 +189,7 @@ __global__ void matmul_backward_bias_kernel4(floatX* dbias, const floatX* dout, for (int j = 0; j < vstep; j++) { dout_sum += smem[lane_id + j * warpSize]; } - dbias[tl + lane_id] += dout_sum; + dbias[tl + lane_id] = (float)dbias[tl + lane_id] + dout_sum; } } @@ -503,7 +504,7 @@ void matmul_backward_bias7(floatX* dbias, const floatX* dout, assert(block_size_y >= x128::size); // part of the kernel assumes this is large enough to avoid loops - cudaCheck(cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float))); + cudaCheck(cudaMemset(dbias_buffer, 0, OC * sizeof(float))); matmul_backward_bias_kernel7<<>>(dbias_buffer, dout, B, T, OC, block_size); cudaCheck(cudaGetLastError()); @@ -524,7 +525,7 @@ void matmul_backward_bias8(floatX* dbias, const floatX* dout, matmul_backward_bias_kernel8<<>>(dbias, dout, B, T, OC, std::bool_constant{}); cudaCheck(cudaGetLastError()); } else { - cudaCheck(cudaMemsetAsync(dbias_buffer, 0, OC * sizeof(float))); + cudaCheck(cudaMemset(dbias_buffer, 0, OC * sizeof(float))); matmul_backward_bias_kernel8<<>>(dbias_buffer, dout, B, T, OC, std::bool_constant{}); cudaCheck(cudaGetLastError()); cast_and_add_kernel<<>>(dbias, dbias_buffer, OC); @@ -661,6 +662,7 @@ int main(int argc, char **argv) { // cleanups free(dbias); free(dout); + cudaCheck(cudaFree(dbias_buffer)); cudaCheck(cudaFree(d_dbias)); cudaCheck(cudaFree(d_dout)); diff --git a/dev/cuda/matmul_forward.cu b/dev/cuda/matmul_forward.cu index fe22729e7..1a03e9a2c 100644 --- a/dev/cuda/matmul_forward.cu +++ b/dev/cuda/matmul_forward.cu @@ -84,6 +84,88 @@ __global__ void add_bias(float* out, const float* bias, int B, int T, int OC) { } } +// kernel 4: semi-efficient handwritten kernel +// see trimat_forward.cu for some intermediate development steps +__device__ float4 ld_vec(const float* address) { + return *reinterpret_cast(address); +} + +__device__ void st_vec(float* address, float4 val) { + *reinterpret_cast(address) = val; +} + +__global__ void __launch_bounds__(16*16) matmul_forward_kernel4(float* out, + const float* inp, const float* weight, const float* bias, + int C, int OC) { + // out is (B,T,OC). OC is short for "output channels", e.g. OC = 4 * C + // inp is (B,T,C), weight is (OC, C), bias is (OC) + // each thread handles 8x8 elements; each block 128 by 128 elements. 
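As a quick sanity check of the tiling described in the comment above (an illustrative sketch, not part of the kernel): a 16x16 thread block where each thread produces an 8x8 patch covers one 128x128 output tile, and the launcher below sizes the grid accordingly. The concrete sizes match the defaults set in this file's main() after this change.

```python
# Tiling arithmetic for matmul_forward_kernel4 (illustrative).
def ceil_div(a, b):
    return (a + b - 1) // b

B, T, C = 32, 1024, 768
OC = 4 * C
sqrt_block_size = 16                       # 16 x 16 = 256 threads per block
tile = 8 * sqrt_block_size                 # 8x8 outputs per thread -> 128 x 128 tile
grid = (ceil_div(B * T, tile), ceil_div(OC, tile))
print(tile, grid)                          # 128 (256, 24)
```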
+ int oc = 8*(blockIdx.y * blockDim.y + threadIdx.y); + + // buffers to cache chunks of the input matrices + __shared__ float lhs_s[128][32]; + __shared__ float rhs_s[128][32]; + + // adjust our pointers for the current block + inp += 128 * blockIdx.x * C; + weight += 128 * blockIdx.y * C; + out += 128 * blockIdx.x * OC + 128 * blockIdx.y; + + float vals[8][8] = {}; + if(bias != NULL) { + for (int i = 0; i < 8; i++) { + for (int j = 0; j < 8; j += 4) { + float4 b = ld_vec(bias + oc + j); + vals[i][j+0] = b.x; + vals[i][j+1] = b.y; + vals[i][j+2] = b.z; + vals[i][j+3] = b.w; + } + } + } + + int si_start = 4*(16 * threadIdx.y + threadIdx.x); + for (int so = 0; so < C; so += 32) { + __syncthreads(); + int xmod8 = threadIdx.x % 8; + int xby8 = threadIdx.x / 8; + int xo = 4 * xmod8; + for(int y = 2 * threadIdx.y + xby8; y < 128; y += 32) { + st_vec(&lhs_s[y][xo], ld_vec(inp + y * C + so + xo)); + st_vec(&rhs_s[y][xo], ld_vec(weight + y * C + so + xo)); + } + __syncthreads(); + + for (int si = si_start; si < si_start + 32; si += 4) { + float4 rhs[8]; + for (int u = 0; u < 8; ++u) { + rhs[u] = ld_vec(&rhs_s[u + 8 * threadIdx.y][si % 32]); + } + + for (int ii = 0; ii < 8; ++ii) { + float4 lhs = ld_vec(&lhs_s[ii + 8 * threadIdx.x][si % 32]); + for (int ji = 0; ji < 8; ++ji) { + vals[ii][ji] += lhs.x * rhs[ji].x; + vals[ii][ji] += lhs.y * rhs[ji].y; + vals[ii][ji] += lhs.z * rhs[ji].z; + vals[ii][ji] += lhs.w * rhs[ji].w; + } + } + } + } + + for (int i = 0; i < 8; ++i) { + for (int j = 0; j < 8; j += 4) { + float4 result; + result.x = vals[i][j + 0]; + result.y = vals[i][j + 1]; + result.z = vals[i][j + 2]; + result.w = vals[i][j + 3]; + st_vec(out + (8*threadIdx.x+i) * OC + 8*threadIdx.y + j, result); + } + } +} + // ---------------------------------------------------------------------------- // kernel launcher @@ -218,6 +300,21 @@ void matmul_forward3(float* out, cublasCheck(cublasLtMatrixLayoutDestroy(biasLayout)); } +// handwritten, relatively efficient non-tensorcore matmul kernel +void matmul_forward4(float* out, + const float* inp, const float* weight, const float* bias, + int B, int T, int C, int OC, + int sqrt_block_size) { + // out is (B,T,OC). OC is short for "output channels", e.g. OC = 4 * C + // inp is (B,T,C), weight is (OC, C), bias is (OC) + sqrt_block_size = 16; + + dim3 gridDim(ceil_div(B * T, 8*sqrt_block_size), ceil_div(OC, 8*sqrt_block_size)); + dim3 blockDim(sqrt_block_size, sqrt_block_size); + matmul_forward_kernel4<<>>(out, inp, weight, bias, C, OC); + cudaCheck(cudaGetLastError()); +} + // kernel version dispatch void matmul_forward(int kernel_num, float* out, @@ -234,6 +331,9 @@ void matmul_forward(int kernel_num, case 3: matmul_forward3(out, inp, weight, bias, B, T, C, OC); break; + case 4: + matmul_forward4(out, inp, weight, bias, B, T, C, OC, sqrt_block_size); + break; default: printf("Invalid kernel number\n"); exit(1); @@ -245,7 +345,7 @@ void matmul_forward(int kernel_num, int main(int argc, char **argv) { srand(0); - int B = 8; + int B = 32; int T = 1024; int C = 768; int OC = 768 * 4; // expansion of 4, e.g. 
in the MLP diff --git a/dev/cuda/nccl_all_reduce.cu b/dev/cuda/nccl_all_reduce.cu index 260ba02ba..d6d36b1f3 100644 --- a/dev/cuda/nccl_all_reduce.cu +++ b/dev/cuda/nccl_all_reduce.cu @@ -193,5 +193,6 @@ int main(int argc, char **argv) { free(all_reduce_buffer_host); cudaCheck(cudaFree(all_reduce_buffer)); + cudaCheck(cudaFree(all_reduce_buffer_recv)); multi_gpu_config_free(&multi_gpu_config); } diff --git a/dev/cuda/permute.cu b/dev/cuda/permute.cu new file mode 100644 index 000000000..b5c65b88a --- /dev/null +++ b/dev/cuda/permute.cu @@ -0,0 +1,181 @@ +/* +Kernels to demonstrate permute operation. + +Compile example: +nvcc -O3 permute.cu -o permute + +The goal is to permute a 4D matrix from its original shape (dim1, dim2, dim3, dim4) to a new shape (dim4, dim3, dim1, dim2). + +Before permutation, we need to understand how to access elements in a flattened (linear) form of the matrix. + +Given: + +dim1 = size of the 1st dimension +dim2 = size of the 2nd dimension +dim3 = size of the 3rd dimension +dim4 = size of the 4th dimension + +For any element in a 4D matrix at position (i1, i2, i3, i4), where: + +i1 is the index in dimension 1 +i2 is the index in dimension 2 +i3 is the index in dimension 3 +i4 is the index in dimension 4 + +If you find it challenging to calculate the indices i1, i2, i3, and i4, observe the pattern in the index calculations. +Initially, it might take some time to grasp, but with practice, you'll develop a mental model for it. + +To calculate the indices, use the following formulas: + +i1 = (idx / (dim2 * dim3 * dim4)) % dim1; +i2 = (idx / (dim3 * dim4)) % dim2; +i3 = (idx / dim4) % dim3; +i4 = idx % dim4; + +Pattern Explanation: +To find the index for any dimension, divide the thread ID (idx) by the product of all subsequent dimensions. +Then, perform modulo operation with the current dimension. + + + +The linear index in a flattened 1D array is calculated as: +linear_idx = i1 × ( dim2 × dim3 × dim4 ) + i2 × ( dim3 × dim4 ) + i3 × dim4 + i4 +This linear index uniquely identifies the position of the element in the 1D array. + +To permute the matrix, we need to rearrange the indices according to the new shape. +In this case, we are permuting from (dim1, dim2, dim3, dim4) to (dim4, dim3, dim1, dim2). + +The new dimension post permutation will be as follows: + +dim1 becomes the new 3rd dimension. +dim2 becomes the new 4th dimension. +dim3 becomes the new 2nd dimension. +dim4 becomes the new 1st dimension. + +permuted_idx = i4 * (dim3 * dim1 * dim2) + i3 * (dim1 * dim2) + i1 * dim2 + i2; + +Here's how this works: + +i4 * (dim3 * dim1 * dim2): This accounts for how many complete dim3 × dim1 × dim2 blocks fit before the current i4 block. +i3 * (dim1 * dim2): This accounts for the offset within the current i4 block, specifying which i3 block we are in. +i1 * dim2: This accounts for the offset within the current i3 block, specifying which i1 block we are in. +i2: This gives the offset within the current i1 block. + +Lastly at the end we store the current value at idx index of the original value to the permuted index in the permuted_matrix. + + +-------------------------------------------------------------------------------------------------------------------------------------------------------- + +Similarly we can follow the above approach to permute matrices of any dimensions. 
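The index formulas above are easy to check numerically. The following short sketch (illustrative only, with arbitrary small dimensions; it is not part of permute.cu) reproduces numpy's transpose using exactly these two formulas:

```python
import numpy as np

d1, d2, d3, d4 = 2, 3, 4, 5
a = np.arange(d1 * d2 * d3 * d4, dtype=np.float32)            # flattened source matrix
ref = np.transpose(a.reshape(d1, d2, d3, d4), (3, 2, 0, 1))   # target shape (d4, d3, d1, d2)

out = np.empty_like(a)
for idx in range(a.size):
    # recover (i1, i2, i3, i4) from the linear index
    i1 = (idx // (d2 * d3 * d4)) % d1
    i2 = (idx // (d3 * d4)) % d2
    i3 = (idx // d4) % d3
    i4 = idx % d4
    # linear index of (i4, i3, i1, i2) in the permuted layout
    permuted_idx = i4 * (d3 * d1 * d2) + i3 * (d1 * d2) + i1 * d2 + i2
    out[permuted_idx] = a[idx]

assert np.array_equal(out.reshape(d4, d3, d1, d2), ref)       # formulas match np.transpose
```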
+ +*/ + + +#include +#include +#include +#include + +#include "common.h" + +// CPU function to permute a 4D matrix +void permute_cpu(const float* matrix, float* out_matrix, int dim1, int dim2, int dim3, int dim4) { + int total_threads = dim1 * dim2 * dim3 * dim4; + + for (int idx = 0; idx < total_threads; idx++) { + // Calculate the 4D indices from the linear index + int i1 = (idx / (dim2 * dim3 * dim4)) % dim1; + int i2 = (idx / (dim3 * dim4)) % dim2; + int i3 = (idx / dim4) % dim3; + int i4 = idx % dim4; + + // Compute the new index for the permuted matrix + // Transpose from (dim1, dim2, dim3, dim4) to (dim4, dim3, dim1, dim2) + int permuted_idx = i4 * (dim3 * dim1 * dim2) + i3 * (dim1 * dim2) + i1 * dim2 + i2; + out_matrix[permuted_idx] = matrix[idx]; + } +} + +// CUDA kernel to permute a 4D matrix +__global__ void permute_kernel(const float* matrix, float* out_matrix, int dim1, int dim2, int dim3, int dim4) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + + // Ensure index is within bounds + if (idx < dim1 * dim2 * dim3 * dim4) { + // Calculate the 4D indices from the linear index + int i1 = (idx / (dim2 * dim3 * dim4)) % dim1; + int i2 = (idx / (dim3 * dim4)) % dim2; + int i3 = (idx / dim4) % dim3; + int i4 = idx % dim4; + + // Compute the new index for the permuted matrix + // Transpose from (dim1, dim2, dim3, dim4) to (dim4, dim3, dim1, dim2) + int permuted_idx = i4 * (dim3 * dim1 * dim2) + i3 * (dim1 * dim2) + i1 * dim2 + i2; + out_matrix[permuted_idx] = matrix[idx]; + } +} + + +int main() { + int dim_1 = 24; + int dim_2 = 42; + int dim_3 = 20; + int dim_4 = 32; + + // Set up the device + int deviceIdx = 0; + cudaSetDevice(deviceIdx); + cudaDeviceProp deviceProp; + cudaGetDeviceProperties(&deviceProp, deviceIdx); + printf("Device %d: %s\n", deviceIdx, deviceProp.name); + + // Allocate host memory + float* matrix = make_random_float(dim_1 * dim_2 * dim_3 * dim_4); + float* permuted_matrix = (float*)malloc(dim_1 * dim_2 * dim_3 * dim_4 * sizeof(float)); + + // Initialize the matrix with random values + + // Allocate device memory + float *d_matrix, *d_permuted_matrix; + cudaMalloc(&d_matrix, dim_1 * dim_2 * dim_3 * dim_4 * sizeof(float)); + cudaMalloc(&d_permuted_matrix, dim_1 * dim_2 * dim_3 * dim_4 * sizeof(float)); + + // Copy matrix from host to device + cudaMemcpy(d_matrix, matrix, dim_1 * dim_2 * dim_3 * dim_4 * sizeof(float), cudaMemcpyHostToDevice); + + // Perform permutation on CPU + clock_t start = clock(); + permute_cpu(matrix, permuted_matrix, dim_1, dim_2, dim_3, dim_4); + clock_t end = clock(); + double elapsed_time_cpu = (double)(end - start) / CLOCKS_PER_SEC; + + // Define block and grid sizes + dim3 blockSize(256); + int totalThreads = dim_1 * dim_2 * dim_3 * dim_4; + int gridSize = (totalThreads + blockSize.x - 1) / blockSize.x; // Compute grid size + + // Launch CUDA kernel to perform permutation + permute_kernel<<>>(d_matrix, d_permuted_matrix, dim_1, dim_2, dim_3, dim_4); + cudaDeviceSynchronize(); // Ensure kernel execution is complete + + // Verify results + printf("Checking correctness...\n"); + validate_result(d_permuted_matrix, permuted_matrix, "permuted_matrix", dim_1 * dim_2 * dim_3 * dim_4, 1e-5f); + + printf("All results match.\n\n"); + // benchmark kernel + int repeat_times = 1000; + float elapsed_time = benchmark_kernel(repeat_times, permute_kernel, + d_matrix, d_permuted_matrix, dim_1, dim_2, dim_3, dim_4 + ); + printf("time gpu %.4f ms\n", elapsed_time); + printf("time cpu %.4f ms\n", elapsed_time_cpu); + + // Free allocated memory + 
free(matrix); + free(permuted_matrix); + cudaFree(d_matrix); + cudaFree(d_permuted_matrix); + + return 0; +} diff --git a/dev/cuda/residual_forward.cu b/dev/cuda/residual_forward.cu index fd7d1fb8e..0f3fd7e28 100644 --- a/dev/cuda/residual_forward.cu +++ b/dev/cuda/residual_forward.cu @@ -99,7 +99,7 @@ int main(int argc, char **argv) { float* out = (float*)malloc(B * T * C * sizeof(float)); float* inp1 = make_random_float(B * T * C); float* inp2 = make_random_float(B * T * C); - + // move to GPU floatX* d_out; floatX* d_inp1; diff --git a/dev/cuda/softmax_forward.cu b/dev/cuda/softmax_forward.cu index e6b46167e..3fb57fbf2 100644 --- a/dev/cuda/softmax_forward.cu +++ b/dev/cuda/softmax_forward.cu @@ -135,7 +135,6 @@ __global__ void softmax_forward_kernel2(float* out, const float* inp, int N, int maxval = fmaxf(maxval, x[i]); } shared[tid] = maxval; - __syncthreads(); // reductions for (int stride = block_size / 2; stride >= 1; stride /= 2) { __syncthreads(); @@ -157,7 +156,6 @@ __global__ void softmax_forward_kernel2(float* out, const float* inp, int N, int sumval += x[i]; } shared[tid] = sumval; - __syncthreads(); // reductions for (int stride = block_size / 2; stride >= 1; stride /= 2) { __syncthreads(); @@ -210,14 +208,13 @@ __global__ void softmax_forward_kernel3(float* out, const float* inp, int N, int for (int i = tid; i < C; i += blockDim.x) { sumval += x[i]; } + // No need to broadcast sumval since all threads in the warp will have the same value + // (due to the fact that we're using __shfl_xor_sync) sumval = warpReduceSum(sumval); - // Broadcast sumval within the warp - float sum = __shfl_sync(0xFFFFFFFF, sumval, 0); - // Divide the input values by the sum for (int i = tid; i < C; i += blockDim.x) { - out[idx * C + i] = x[i] / sum; + out[idx * C + i] = x[i] / sumval; } } @@ -238,10 +235,9 @@ __global__ void softmax_forward_kernel4(float* out, const float* inp, int N, int // the number of warps per block. recall that blockDim.x is block_size int warpsPerBlock = blockDim.x / 32; - // shared[] must be allocated to have 2 * warpsPerBlock elements - // first half for max values, the second half for sum values - float* maxvals = shared; - float* sumvals = &shared[warpsPerBlock]; + // shared[] must be allocated to have warpsPerBlock elements + // those will be used for max and sum values + float* max_or_sum_storage = shared; // one row of inp, i.e. inp[idx, :] of shape (C,) const float* x = inp + idx * C; @@ -255,21 +251,21 @@ __global__ void softmax_forward_kernel4(float* out, const float* inp, int N, int maxval = warpReduceMax(maxval); // the 0th thread of each warp writes the maxval of that warp to shared memory - if (laneId == 0) maxvals[warpId] = maxval; + if (laneId == 0) max_or_sum_storage[warpId] = maxval; __syncthreads(); - // now the 0th thread reduces the maxvals in shared memory, i.e. across warps + // now the 0th thread of the block reduces the max values in shared memory, i.e. 
across warps if (tid == 0) { - float val = maxvals[tid]; + float val = max_or_sum_storage[tid]; for (int i = 1; i < warpsPerBlock; i++) { - val = fmaxf(val, maxvals[i]); + val = fmaxf(val, max_or_sum_storage[i]); } // store the final max in the first position - maxvals[0] = val; + max_or_sum_storage[0] = val; } __syncthreads(); // broadcast the max to all threads - float offset = maxvals[0]; + float offset = max_or_sum_storage[0]; // compute expf and write the result to global memory for (int i = tid; i < C; i += blockDim.x) { @@ -289,20 +285,20 @@ __global__ void softmax_forward_kernel4(float* out, const float* inp, int N, int sumval = warpReduceSum(sumval); // write sumval to shared memory - if (laneId == 0) sumvals[warpId] = sumval; + if (laneId == 0) max_or_sum_storage[warpId] = sumval; __syncthreads(); // inter-thread reduction of sum if (tid == 0) { - float val = sumvals[tid]; + float val = max_or_sum_storage[tid]; for (int i = 1; i < warpsPerBlock; ++i) { - val += sumvals[i]; + val += max_or_sum_storage[i]; } - sumvals[0] = val; + max_or_sum_storage[0] = val; } __syncthreads(); // broadcast the sum to all threads - float sum = sumvals[0]; + float sum = max_or_sum_storage[0]; // divide the whole row by the sum for (int i = tid; i < C; i += blockDim.x) { @@ -322,12 +318,13 @@ __global__ void softmax_forward_online_kernel1(float* out, const float* inp, int double sum = 0.0; for (int j = 0; j < C; j++) { float maxval_prev = maxval; - if (inp_row[j] > maxval) { - maxval = inp_row[j]; - sum = sum * expf(maxval_prev - maxval) + expf(inp_row[j] - maxval); + float current_val = inp_row[j]; + if (current_val > maxval) { + maxval = current_val; + sum = sum * expf(maxval_prev - maxval) + expf(current_val - maxval); } else { - sum += expf(inp_row[j] - maxval); + sum += expf(current_val - maxval); } } @@ -590,7 +587,8 @@ void softmax_forward3(float* out, const float* inp, int N, int C, int block_size void softmax_forward4(float* out, const float* inp, int N, int C, int block_size) { int grid_size = N; - size_t shared_mem_size = 2 * block_size / 32 * sizeof(float); + // for each warp in the block we need a float that will be used for both maxval and sumval + size_t shared_mem_size = block_size / 32 * sizeof(float); softmax_forward_kernel4<<>>(out, inp, N, C); } @@ -672,11 +670,10 @@ int main(int argc, char **argv) { const int* outliers = make_random_int(B * T * 3, V); for(int k = 0; k < 3; ++k) { for(int j = 0; j < B * T; ++j) { - inp[j * V + outliers[j*3 + k]] *= 20; + inp[j * V + outliers[j*3 + k]] *= 20; } } - // move to GPU float* d_out; float* d_inp; @@ -728,6 +725,7 @@ int main(int argc, char **argv) { // free memory free(out); free(inp); + free((void*)outliers); cudaCheck(cudaFree(d_out)); cudaCheck(cudaFree(d_inp)); diff --git a/dev/cuda/trimat_forward.cu b/dev/cuda/trimat_forward.cu index 1c093e2a1..d85286b93 100644 --- a/dev/cuda/trimat_forward.cu +++ b/dev/cuda/trimat_forward.cu @@ -65,30 +65,38 @@ static float* d_qkvr; // scratch for the cublas kernel // taken from then attention forward pass void trimul_cpu(float* out, const float* inp, int B, int T, int C, int NH) { + // inp shape: (B, T, 3, NH, HS) + // out shape: (B, NH, T, T) int C3 = C*3; - int hs = C / NH; // head size - float scale = 1.0 / sqrtf(hs); + int HS = C / NH; // head size + float scale = 1.0 / sqrtf(HS); for (int b = 0; b < B; b++) { for (int t = 0; t < T; t++) { - for (int h = 0; h < NH; h++) { - const float* query_t = inp + b * T * C3 + t * C3 + h * hs; - float* out_bth = out + b * NH * T * T + h * T * T + t * T; + 
for (int nh = 0; nh < NH; nh++) { + // Q[b][nh][t][:] = inp[b][t][0][nh][:] (where : is the slice operator for hs) + const float* query_t = inp + b * T * C3 + t * C3 + nh * HS; + // out[b][nh][t][:] + float* out_bth = out + b * NH * T * T + nh * T * T + t * T; // pass 1: calculate query dot key and maxval for (int t2 = 0; t2 <= t; t2++) { - const float* key_t2 = inp + b * T * C3 + t2 * C3 + h * hs + C; // +C because it's key + // K[b][nh][t2][:] = inp[b][t2][1][nh][:] + const float* key_t2 = inp + b * T * C3 + t2 * C3 + nh * HS + C; // +C because it's key - // (query_t) dot (key_t2) + // Q[b][nh][t][:] dot K[b][nh][t2][:] float val = 0.0f; - for (int i = 0; i < hs; i++) { + for (int i = 0; i < HS; i++) { val += query_t[i] * key_t2[i]; } val *= scale; + // out[b][nh][t][t2] = val out_bth[t2] = val; } for(int t2 = t + 1; t2 < T; ++t2) { + // causal mask, using NAN to supress warnings -> it could be -inf + // but it doesn't matter because in validate_result we ignore infinities/NANs out_bth[t2] = NAN; } } @@ -98,31 +106,31 @@ void trimul_cpu(float* out, const float* inp, __global__ void permute_kernel(float* q, float* k, float* v, const float* inp, - int B, int N, int NH, int d) { - // okay so now, this kernel wants Q,K,V to all be of shape (B, NH, N, d) - // but instead, we have a single tensor QKV (inp) of shape (B, N, 3, NH, d) + int B, int T, int NH, int HS) { + // okay so now, this kernel wants Q,K,V to all be of shape (B, NH, T, HS) + // but instead, we have a single tensor QKV (inp) of shape (B, T, 3, NH, HS) int idx = blockIdx.x * blockDim.x + threadIdx.x; - // Q[b][nh_][n][d_] = inp[b][n][0][nh_][d_] + // Q[b][nh][t][hs] = inp[b][t][0][nh][hs] - if (idx < B * NH * N * d) { - int b = idx / (NH * N * d); - int rest = idx % (NH * N * d); - int nh_ = rest / (N * d); - rest = rest % (N * d); - int n = rest / d; - int d_ = rest % d; + if (idx < B * NH * T * HS) { + int b = idx / (NH * T * HS); + int rest = idx % (NH * T * HS); + int nh = rest / (T * HS); + rest = rest % (T * HS); + int t = rest / HS; + int hs = rest % HS; int inp_idx = \ - (b * N * 3 * NH * d) - + (n * 3 * NH * d) - + (0 * NH * d) - + (nh_ * d) - + d_; + (b * T * 3 * NH * HS) + + (t * 3 * NH * HS) + + (0 * NH * HS) + + (nh * HS) + + hs; q[idx] = inp[inp_idx]; - k[idx] = inp[inp_idx + NH * d]; - v[idx] = inp[inp_idx + 2 * (NH * d)]; + k[idx] = inp[inp_idx + NH * HS]; + v[idx] = inp[inp_idx + 2 * (NH * HS)]; } } @@ -145,6 +153,35 @@ void trimul_cublas(float* preatt, // batched matrix multiply with cuBLAS const float alpha = 1.0f / sqrtf(HS); const float beta = 0.0f; + // This schedules in parallel B*NH matmuls of shape q@k^t = (T, HS) @ (HS, T) = (T, T). + // IMPORTANT NOTE: Cublas uses a column-major (and we use row-major in our codebase) representation, + // so this call might look confusing to you if you look at the `cublasSgemmStridedBatched` signature. + // + // In order to avoid having to do an additional transpose operation after this func call, + // we need to pass in K as the first argument and Q as the second argument, which might make you think we're computing K^T @ Q. + // That combined with the shapes we got after the permute kernel - (B, NH, T, HS) (I'll omit B, NH for brevity going forward) + // and you might think we end up with (HS, T) @ (T, HS) = (HS, HS). + // This is not the case. :) + // + // Cublas sees our row-major matrix (T, HS) as (HS, T), hence we set the lead dimensions to HS (see function signature). + // We transpose K and end up computing K^T @ Q = (T, HS) @ (HS, T) = (T, T). 
+ // If you were to interpret the above formula K^T @ Q you might think we end up with: + // ----------------------------------- + // k1.dot(q1) k1.dot(q2) ... k1.dot(qT) + // k2.dot(q1) k2.dot(q2) ... k2.dot(qT) + // ... + // kT.dot(q1) kT.dot(q2) ... kT.dot(qT) + // ----------------------------------- + // But as I mentioned, Cublas is column-major! + // So given that the dot product is symmetric we can write k1.dot(q1) as q1.dot(k1) and transposing the above + // representation we can see what we actually end up with in the row-major format: + // ----------------------------------- + // q1.dot(k1) q1.dot(k2) ... q1.dot(kT) + // q2.dot(k1) q2.dot(k2) ... q2.dot(kT) + // ... + // qT.dot(k1) qT.dot(k2) ... qT.dot(kT) + // ----------------------------------- + // which is exactly what we wanted! :) cublasCheck(cublasSgemmStridedBatched(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, T, T, HS, @@ -173,7 +210,7 @@ void trimul_cublas(float* preatt, */ // using creates an alias for a function pointer -using matmul_fn_ptr = void(*)(float* p, int ps, const float* k, int ks, const float* q, int qs, int T, int hs, float alpha); +using matmul_fn_ptr = void(*)(float* p, int PS, const float* k, int KS, const float* q, int QS, int T, int HS, float alpha); template __global__ void __launch_bounds__(256, 2) trimul_global(float* out, const float* inp, int T, int C, int NH) { @@ -183,20 +220,21 @@ __global__ void __launch_bounds__(256, 2) trimul_global(float* out, const float* // set up indices int C3 = C*3; - int hs = C / NH; // head size - float scale = 1.0 / sqrtf(hs); + int HS = C / NH; // head size + float scale = 1.0 / sqrtf(HS); // we put the "batch x head" dimension into the z block index. - int h = blockIdx.z % NH; int b = blockIdx.z / NH; + int nh = blockIdx.z % NH; // Get the base address for the current batch and head - const float* q = inp + b * T * C3 + h * hs; - const float* k = inp + b * T * C3 + h * hs + C; - float* r = out + (b*NH + h)*T*T; + // shapes -> inp (B, T, 3, NH, HS), Q (B, NH, T, HS), K (B, NH, T, HS) + const float* q = inp + b * T * C3 + nh * HS; // Q[b][nh][:][:] = inp[b][:][0][nh][:] + const float* k = inp + b * T * C3 + nh * HS + C; // K[b][nh][:][:] = inp[b][:][1][nh][:] + float* r = out + (b*NH + nh)*T*T; // out[b][nh][:][:] // start the multiplication - matmul_tri(r, T, q, C3, k, C3, T, hs, scale); + matmul_tri(r, T, k, C3, q, C3, T, HS, scale); } template @@ -239,12 +277,22 @@ void trimul_launcher(float* out, const float* inp, int B, int T, int C, int NH) */ // baseline implementation: 20 ms -__device__ void matmul_tri_naive(float* p, int ps, const float* k, int ks, const float* q, int qs, int T, int hs, float alpha) { - // get coordinates of our block +__device__ void matmul_tri_naive(float* p, int PS, const float* k, int KS, const float* q, int QS, int T, int HS, float alpha) { + // coordinate system: + // | - - - - - > j + // | + // | + // v + // i + // get coordinates of our block - each thread is responsible for a single 8x8 block. int i_base = 128 * blockIdx.x + 8 * threadIdx.x; int j_base = 128 * blockIdx.y + 8 * threadIdx.y; - // one more check to skip the upper diagonal in blocks that are on the diagonal. + // One more check to skip the upper diagonal in blocks that are on the diagonal. + // Note: we deliberately waste some compute on the jagged diagonal i.e. elements that belong + // to the upper triangle that should be masked out. This will be ignored due to the causal mask + // in the reference CPU implementation when used in the `validate_result` function. 
+ // Alternatively this check should be done in the nested for loop below -> if (i > j) return. if(j_base > i_base) return; @@ -254,17 +302,17 @@ __device__ void matmul_tri_naive(float* p, int ps, const float* k, int ks, const for(int jo = 0; jo < 8; ++jo) { int j = j_base + jo; float val = 0; - for (int s = 0; s < hs; ++s) { - val += k[i * ks + s] * q[j * qs + s]; + for (int s = 0; s < HS; ++s) { + val += q[i * QS + s] * k[j * KS + s]; } - p[i * ps + j] = val * alpha; + p[i * PS + j] = val * alpha; } } } /* ** Chapter IV - ... ** * - * Each worker is producing 64 combined cookies from 8 animals and 8 landscapes. They send there runners of 64 times + * Each worker is producing 64 combined cookies from 8 animals and 8 landscapes. They send their runners 64 times * to fetch the corresponding shapes. This is terribly inefficient; The runners need a minute or so for each trip, * but making a cookie can be done in just a second. * @@ -292,7 +340,7 @@ __device__ void matmul_tri_naive(float* p, int ps, const float* k, int ks, const */ // reorganize loops to enable data reuse: 3.5 ms -__device__ void matmul_tri_registers(float* p, int ps, const float* k, int ks, const float* q, int qs, int T, int hs, float alpha) { +__device__ void matmul_tri_registers(float* p, int PS, const float* k, int KS, const float* q, int QS, int T, int HS, float alpha) { int i_base = 128 * blockIdx.x + 8 * threadIdx.x; int j_base = 128 * blockIdx.y + 8 * threadIdx.y; @@ -300,17 +348,17 @@ __device__ void matmul_tri_registers(float* p, int ps, const float* k, int ks, c return; // shift our pointers to the sub-block this thread is responsible for - k += i_base * ks; - q += j_base * qs; - p += i_base * ps + j_base; + q += i_base * QS; + k += j_base * KS; + p += i_base * PS + j_base; float vals[8][8] = {}; - for (int s = 0; s < hs; ++s) { + for (int hs = 0; hs < HS; ++hs) { float lhs[8]; float rhs[8]; for (int u = 0; u < 8; ++u) { - lhs[u] = k[u * ks + s]; - rhs[u] = q[u * qs + s]; + lhs[u] = q[u * QS + hs]; + rhs[u] = k[u * KS + hs]; } for (int i = 0; i < 8; ++i) { @@ -322,7 +370,7 @@ __device__ void matmul_tri_registers(float* p, int ps, const float* k, int ks, c for (int i = 0; i < 8; ++i) { for (int j = 0; j < 8; ++j) { - p[i * ps + j] = vals[i][j] * alpha; + p[i * PS + j] = vals[i][j] * alpha; } } } @@ -334,7 +382,7 @@ __device__ void matmul_tri_registers(float* p, int ps, const float* k, int ks, c * "Of course", the runner answers, "but they've asked me for an elephant, a lion, a zebra, and a goldfish. These * are all over the place, I can't just pick them up at one spot (_strided acccess_). * "But the lion is right next to the palm tree. You could bring those two together?", you confirm. - * "Yes", he says, "if the just asked for the different categories at the same time, that would make things + * "Yes", he says, "if they just asked for the different categories at the same time, that would make things * so much easier. See, I have this bucket, I could carry lots of things in one go if I could just scoop them up * from the same place (_coalesced access_). 
* @@ -364,7 +412,8 @@ __device__ void st_vec(float* address, float4 val) { } // vector instructions for coalesced memory access: 1.7 ms -__device__ void matmul_tri3(float* p, int ps, const float* k, int ks, const float* q, int qs, int T, int hs, float alpha) { +__device__ void matmul_tri3(float* p, int PS, const float* k, int KS, const float* q, int QS, int T, int HS, float alpha) { + // Same logic as previous kernel we just load in float4 to improve coalescing int i_base = 128 * blockIdx.x + 8 * threadIdx.x; int j_base = 128 * blockIdx.y + 8 * threadIdx.y; @@ -372,21 +421,21 @@ __device__ void matmul_tri3(float* p, int ps, const float* k, int ks, const floa return; // shift our pointers to the sub-block this thread is responsible for - k += i_base * ks; - q += j_base * qs; - p += i_base * ps + j_base; + q += i_base * QS; + k += j_base * KS; + p += i_base * PS + j_base; float vals[8][8] = {}; - for (int s = 0; s < hs; s += 4) { + for (int hs = 0; hs < HS; hs += 4) { // load in float4 to improve coalescing float4 rhs[8]; for (int u = 0; u < 8; ++u) { - rhs[u] = ld_vec(q + u * qs + s); + rhs[u] = ld_vec(k + u * KS + hs); } for (int i = 0; i < 8; ++i) { - // no need to keep lhs around for the i loop, its only reused in the j loop anyway. - float4 lhs = ld_vec(k + i * ks + s); + // no need to keep lhs around for the i loop, it's only reused in the j loop anyway. + float4 lhs = ld_vec(q + i * QS + hs); for (int j = 0; j < 8; ++j) { vals[i][j] += lhs.x * rhs[j].x; vals[i][j] += lhs.y * rhs[j].y; @@ -403,7 +452,7 @@ __device__ void matmul_tri3(float* p, int ps, const float* k, int ks, const floa result.y = vals[i][j + 1] * alpha; result.z = vals[i][j + 2] * alpha; result.w = vals[i][j + 3] * alpha; - st_vec(p + i * ps + j, result); + st_vec(p + i * PS + j, result); } } } @@ -424,7 +473,7 @@ __device__ void matmul_tri3(float* p, int ps, const float* k, int ks, const floa * details.] * */ -__device__ void matmul_tri4(float* p, int ps, const float* k, int ks, const float* q, int qs, int T, int hs, float alpha) { +__device__ void matmul_tri4(float* p, int PS, const float* k, int KS, const float* q, int QS, int T, int HS, float alpha) { int i_base = 128 * blockIdx.x + 8 * threadIdx.x; int j_base = 128 * blockIdx.y + 8 * threadIdx.y; @@ -433,14 +482,14 @@ __device__ void matmul_tri4(float* p, int ps, const float* k, int ks, const floa if (blockIdx.y > blockIdx.x) return; - k += 128 * blockIdx.x * ks; - q += 128 * blockIdx.y * qs; + q += 128 * blockIdx.x * QS; + k += 128 * blockIdx.y * KS; __shared__ float lhs_s[128][32]; __shared__ float rhs_s[128][32]; float vals[8][8] = {}; - for (int so = 0; so < hs; so += 32) { + for (int so = 0; so < HS; so += 32) { // Read a large slice of the input, worked on together by all threads. // They are organized differently for this part. We want to ensure // fully coalesced loads, so we let a single warp handle consecutive @@ -448,14 +497,23 @@ __device__ void matmul_tri4(float* p, int ps, const float* k, int ks, const floa // in one read operation. // note: threads may read data here that they don't need themselves. // this really is a block-level operation. + // note2: 16x16 threads (i.e. the block) will, through this for loop, fetch 32 dims from 128 keys and 128 queries + // i.e. 
from Q/K, of shape (T, HS) take q[:128, so*32:(so+1)*32] and k[:128, so*32:(so+1)*32] __syncthreads(); for(int y = threadIdx.y / 2; y < 128; y += 8) { int xo = (threadIdx.y % 2) * 16; - lhs_s[y][threadIdx.x + xo] = k[y * ks + so + threadIdx.x + xo]; - rhs_s[y][threadIdx.x + xo] = q[y * qs + so + threadIdx.x + xo]; + lhs_s[y][threadIdx.x + xo] = q[y * QS + so + threadIdx.x + xo]; + rhs_s[y][threadIdx.x + xo] = k[y * KS + so + threadIdx.x + xo]; } __syncthreads(); + // Now we compute a partial dot product (only 32 dims) for all combinations of keys and queries (128x128). + // Each thread does 8x8 of these partial dot products. + // E.g. thread (0,0) covers queries 0-7 and keys 0-7. More generally first row of threads + // (0,:) covers queries 0-7 with keys 0-127 and so on. + // In the next iterations of the outer (`so`) loop we'll be accumulating values to `vals` until we + // get the full dot product. We then later deposit it into the output matrix for all 8x8 blocks + // that are below the diagonal. for (int si = 0; si < 32; ++si) { float rhs[8]; for (int u = 0; u < 8; ++u) { @@ -484,7 +542,7 @@ __device__ void matmul_tri4(float* p, int ps, const float* k, int ks, const floa result.y = vals[ii][ji + 1] * alpha; result.z = vals[ii][ji + 2] * alpha; result.w = vals[ii][ji + 3] * alpha; - st_vec(p + i * ps + j, result); + st_vec(p + i * PS + j, result); } } } @@ -585,6 +643,7 @@ int main(int argc, char **argv) { free(inp); cudaCheck(cudaFree(d_out)); cudaCheck(cudaFree(d_inp)); + cudaCheck(cudaFree(d_qkvr)); cublasDestroy(cublas_handle); return 0; diff --git a/dev/data/edu_fineweb.sh b/dev/data/edu_fineweb.sh new file mode 100755 index 000000000..53c82df78 --- /dev/null +++ b/dev/data/edu_fineweb.sh @@ -0,0 +1,73 @@ +#!/bin/bash + +# Downloads the FineWeb-Edu 100B dataset, but in an already tokenized format in .bin files +# Example: ./edu_fineweb.sh 100 +# would download 100 shards +# Default is all shards +# Make sure to run this from current directory, i.e. inside ./dev/data! + +# Check if MAX_SHARDS is provided as positional first arg, otherwise default to 1024 +if [ $# -eq 0 ]; then + MAX_SHARDS=1001 +else + MAX_SHARDS=$1 +fi + +if [ $MAX_SHARDS -gt 1001 ]; then + MAX_SHARDS=1001 +fi + +# Base URLs +TRAIN_BASE_URL="https://huggingface.co/datasets/karpathy/fineweb-edu-100B-gpt2-token-shards/resolve/main/edu_fineweb_train_" +VAL_URL="https://huggingface.co/datasets/karpathy/fineweb-edu-100B-gpt2-token-shards/resolve/main/edu_fineweb_val_000000.bin" + +# Directory to save files +SAVE_DIR="edu_fineweb100B" + +# Create the directory if it doesn't exist +mkdir -p "$SAVE_DIR" + +download() { + local FILE_URL=$1 + local FILE_NAME=$(basename $FILE_URL | cut -d'?' 
-f1) + local FILE_PATH="${SAVE_DIR}/${FILE_NAME}" + curl -s -L -o "$FILE_PATH" "$FILE_URL" + echo "Downloaded $FILE_NAME to $SAVE_DIR" +} + +# Function to manage parallel jobs +run_in_parallel() { + local max_jobs=$1 + shift + local commands=("$@") + local job_count=0 + + for cmd in "${commands[@]}"; do + eval "$cmd" & + ((job_count++)) + if (( job_count >= max_jobs )); then + wait -n + ((job_count--)) + fi + done + + # Wait for any remaining jobs to finish + wait +} + +# Export the function so it's available in subshells +export -f download + +# Download the validation shard +download "$VAL_URL" & + +# Generate train file shard download commands +train_commands=() +for i in $(seq -f "%06g" 1 $MAX_SHARDS); do + FILE_URL="${TRAIN_BASE_URL}${i}.bin?download=true" + train_commands+=("download \"$FILE_URL\"") +done + +# Run the train file commands in parallel +run_in_parallel 40 "${train_commands[@]}" +echo "The val shard and first $MAX_SHARDS train shards of FineWebEdu100B files downloaded in $SAVE_DIR" diff --git a/dev/data/fineweb.py b/dev/data/fineweb.py index 1d4184c2a..72c312966 100644 --- a/dev/data/fineweb.py +++ b/dev/data/fineweb.py @@ -14,13 +14,16 @@ "language_score": 0.9185474514961243, "token_count": 594 } + +Example of downloading the 100B dataset of FineWebEDU, from root directory: +python dev/data/fineweb.py -t edu -v 100B +100B runs for small few hours, depending on your internet and computer. """ import os import argparse import multiprocessing as mp import numpy as np import tiktoken -# from huggingface_hub import snapshot_download from datasets import load_dataset from tqdm import tqdm import argparse @@ -28,26 +31,34 @@ from data_common import write_datafile # ------------------------------------------ -parser = argparse.ArgumentParser(description="FineWeb dataset preprocessing") -parser.add_argument("-v", "--version", type=str, default="10B", help="Which version of fineweb to use 10B|100B") -parser.add_argument("-s", "--shard_size", type=int, default=10**8, help="Size of each shard in tokens") +parser = argparse.ArgumentParser(description="FineWeb and Edu-FineWeb dataset preprocessing") +parser.add_argument("-t", "--type", type=str, default="classic", help="Fineweb type, edu|classic") +parser.add_argument("-v", "--version", type=str, default="10B", help="Fineweb data sample size, 10B|100B") +parser.add_argument("-s", "--shard_size", type=int, default=10**8, help="Size of each data shard in the output .bin files, in tokens") args = parser.parse_args() # FineWeb has a few possible subsamples available -assert args.version in ["10B", "100B"], "version must be one of 10B, 100B" -if args.version == "10B": - local_dir = "fineweb10B" - remote_name = "sample-10BT" -elif args.version == "100B": - local_dir = "fineweb100B" - remote_name = "sample-100BT" +assert args.version in {"10B", "100B"}, "version must be one of: 10B, 100B" +assert args.type in {"edu", "classic"}, "type must be one of: edu, classic" +directories = { + ("classic", "10B"): ("fineweb10B", "sample-10BT"), + ("classic", "100B"): ("fineweb100B", "sample-100BT"), + ("edu", "10B"): ("edu_fineweb10B", "sample-10BT"), + ("edu", "100B"): ("edu_fineweb100B", "sample-100BT") +} +local_dir, remote_name = directories[(args.type, args.version)] # create the cache the local directory if it doesn't exist yet DATA_CACHE_DIR = os.path.join(os.path.dirname(__file__), local_dir) os.makedirs(DATA_CACHE_DIR, exist_ok=True) # download the dataset -fw = load_dataset("HuggingFaceFW/fineweb", name=remote_name, split="train") +if 
args.type == "classic": + fw = load_dataset("HuggingFaceFW/fineweb", name=remote_name, split="train") + name = "fineweb" +elif args.type =="edu": + fw = load_dataset("HuggingFaceFW/fineweb-edu", name=remote_name, split="train") + name = "edu_fineweb" # init the tokenizer enc = tiktoken.get_encoding("gpt2") @@ -83,7 +94,7 @@ def tokenize(doc): else: # write the current shard and start a new one split = "val" if shard_index == 0 else "train" - filename = os.path.join(DATA_CACHE_DIR, f"fineweb_{split}_{shard_index:06d}.bin") + filename = os.path.join(DATA_CACHE_DIR, f"{name}_{split}_{shard_index:06d}.bin") # split the document into whatever fits in this shard; the remainder goes to next one remainder = args.shard_size - token_count progress_bar.update(remainder) @@ -98,5 +109,5 @@ def tokenize(doc): # write any remaining tokens as the last shard if token_count != 0: split = "val" if shard_index == 0 else "train" - filename = os.path.join(DATA_CACHE_DIR, f"fineweb_{split}_{shard_index:06d}.bin") + filename = os.path.join(DATA_CACHE_DIR, f"{name}_{split}_{shard_index:06d}.bin") write_datafile(filename, all_tokens_np[:token_count]) diff --git a/dev/data/fineweb.sh b/dev/data/fineweb.sh new file mode 100755 index 000000000..33e94792f --- /dev/null +++ b/dev/data/fineweb.sh @@ -0,0 +1,77 @@ +#!/bin/bash + +# Downloads the FineWeb100B dataset, but in an already tokenized format in .bin files +# Example: ./fineweb.sh 100 +# would download 100 shards +# Default is all shards + +# Check if MAX_SHARDS is provided as positional first arg, otherwise default to 1024 +if [ $# -eq 0 ]; then + MAX_SHARDS=1028 +else + MAX_SHARDS=$1 +fi + +# Ensure MAX_SHARDS is not greater than 1028 +if [ $MAX_SHARDS -gt 1028 ]; then + MAX_SHARDS=1028 +fi + +# Base URLs +TRAIN_BASE_URL="https://huggingface.co/datasets/chrisdryden/FineWebTokenizedGPT2/resolve/main/fineweb_train_" +VAL_URL="https://huggingface.co/datasets/chrisdryden/FineWebTokenizedGPT2/resolve/main/fineweb_val_000000.bin?download=true" + +# Directory to save files +SAVE_DIR="fineweb100B" + +# Create the directory if it doesn't exist +mkdir -p "$SAVE_DIR" + +# Function to download, decompress, and delete files +download() { + local FILE_URL=$1 + local FILE_NAME=$(basename $FILE_URL | cut -d'?' 
-f1) + local FILE_PATH="${SAVE_DIR}/${FILE_NAME}" + + # Download the file + curl -s -L -o "$FILE_PATH" "$FILE_URL" + echo "Downloaded $FILE_NAME to $SAVE_DIR" +} + +# Function to manage parallel jobs +run_in_parallel() { + local max_jobs=$1 + shift + local commands=("$@") + local job_count=0 + + for cmd in "${commands[@]}"; do + eval "$cmd" & + ((job_count++)) + if (( job_count >= max_jobs )); then + wait -n + ((job_count--)) + fi + done + + # Wait for any remaining jobs to finish + wait +} + +# Export the function so it's available in subshells +export -f download + +# Download +download "$VAL_URL" & + +# Generate train file commands +train_commands=() +for i in $(seq -f "%06g" 1 $MAX_SHARDS); do + FILE_URL="${TRAIN_BASE_URL}${i}.bin?download=true" + train_commands+=("download \"$FILE_URL\"") +done + +# Run the train file commands in parallel +run_in_parallel 40 "${train_commands[@]}" + +echo "The val shard and first $MAX_SHARDS train shards of FineWeb100B files downloaded in $SAVE_DIR" diff --git a/dev/download_starter_pack.sh b/dev/download_starter_pack.sh new file mode 100755 index 000000000..1ddd328d9 --- /dev/null +++ b/dev/download_starter_pack.sh @@ -0,0 +1,80 @@ +#!/bin/bash + +# Get the directory of the script +SCRIPT_DIR=$(dirname "$(realpath "$0")") + +# Base URL +BASE_URL="https://huggingface.co/datasets/karpathy/llmc-starter-pack/resolve/main/" + +# Directory paths based on script location +SAVE_DIR_PARENT="$SCRIPT_DIR/.." +SAVE_DIR_TINY="$SCRIPT_DIR/data/tinyshakespeare" +SAVE_DIR_HELLA="$SCRIPT_DIR/data/hellaswag" + +# Create the directories if they don't exist +mkdir -p "$SAVE_DIR_TINY" +mkdir -p "$SAVE_DIR_HELLA" + +# Files to download +FILES=( + "gpt2_124M.bin" + "gpt2_124M_bf16.bin" + "gpt2_124M_debug_state.bin" + "gpt2_tokenizer.bin" + "tiny_shakespeare_train.bin" + "tiny_shakespeare_val.bin" + "hellaswag_val.bin" +) + +# Function to download files to the appropriate directory +download_file() { + local FILE_NAME=$1 + local FILE_URL="${BASE_URL}${FILE_NAME}?download=true" + local FILE_PATH + + # Determine the save directory based on the file name + if [[ "$FILE_NAME" == tiny_shakespeare* ]]; then + FILE_PATH="${SAVE_DIR_TINY}/${FILE_NAME}" + elif [[ "$FILE_NAME" == hellaswag* ]]; then + FILE_PATH="${SAVE_DIR_HELLA}/${FILE_NAME}" + else + FILE_PATH="${SAVE_DIR_PARENT}/${FILE_NAME}" + fi + + # Download the file + curl -s -L -o "$FILE_PATH" "$FILE_URL" + echo "Downloaded $FILE_NAME to $FILE_PATH" +} + +# Export the function so it's available in subshells +export -f download_file + +# Generate download commands +download_commands=() +for FILE in "${FILES[@]}"; do + download_commands+=("download_file \"$FILE\"") +done + +# Function to manage parallel jobs in increments of a given size +run_in_parallel() { + local batch_size=$1 + shift + local i=0 + local command + + for command; do + eval "$command" & + ((i = (i + 1) % batch_size)) + if [ "$i" -eq 0 ]; then + wait + fi + done + + # Wait for any remaining jobs to finish + wait +} + +# Run the download commands in parallel in batches of 2 +run_in_parallel 6 "${download_commands[@]}" + +echo "All files downloaded and saved in their respective directories" \ No newline at end of file diff --git a/dev/eval/README.md b/dev/eval/README.md new file mode 100644 index 000000000..f44c36327 --- /dev/null +++ b/dev/eval/README.md @@ -0,0 +1,59 @@ +# eleuther eval readme + +The goal here is to run the Eleuther Eval harness exactly in the same way as that used in the [huggingface LLM 
Leaderboard](https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard).
+
+The starting point is a `.bin` file trained by llm.c. We now have to export it to a huggingface model and then evaluate it.
+
+To export the model, use [export_hf.py](export_hf.py). See its documentation up top. Example usage, from this directory:
+
+```bash
+cd dev/eval
+python export_hf.py --input model.bin --output output_dir
+```
+
+Where you point to your model .bin file, and huggingface files get written to output_dir. The script can optionally also upload to huggingface hub. One more post-processing step that is advisable is to go into the `output_dir`, open up the `config.json` there and add one more entry to the json object:
+
+```
+"_attn_implementation": "flash_attention_2"
+```
+
+This enables FlashAttention 2. We had trouble evaluating in bfloat16 without using FlashAttention 2 (the scores are much lower, and this was never fully resolved). This is a temporary hack/workaround.
+
+Now that we have the model in huggingface format, we download the Eleuther Eval Harness repo and run it. Head over to the parent/root directory of the llm.c repo and:
+
+```bash
+git clone https://github.com/EleutherAI/lm-evaluation-harness/
+cd lm-evaluation-harness
+git checkout b281b0921b636bc36ad05c0b0b0763bd6dd43463
+pip install -e .
+```
+
+And then run the run_eval.sh script:
+
+```bash
+./dev/eval/run_eval.sh output_dir result_dir
+```
+
+Where output_dir can either be the local output dir (above) or a huggingface repo name. This will write eval json objects to `./lm-evaluation-harness/results/result_dir`. It will print the results to the console, e.g. for a 774M model we see:
+
+```
+----------------------------------------
+arc_challenge_25shot.json : 30.4608
+gsm8k_5shot.json : 0.1516
+hellaswag_10shot.json : 57.8072
+mmlu_5shot.json : 25.8682
+truthfulqa_0shot.json : 35.7830
+winogrande_5shot.json : 59.3528
+----------------------------------------
+Average Score : 34.9039
+```
+
+But you can additionally get these results later by running `summarize_eval.py`:
+
+```bash
+python dev/eval/summarize_eval.py lm-evaluation-harness/results/result_dir
+```
+
+The same information will be printed again.
+
+For some reason, the evaluation is quite expensive and runs for somewhere around 1-3 hours, even though it should be a few minutes at most. This has not been satisfyingly resolved so far.
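A small convenience that is not part of the repo: the `config.json` tweak described above can also be scripted rather than edited by hand. This is only an illustrative snippet and assumes the export was written to `output_dir`:

```python
# Add the FlashAttention-2 hint to an exported model's config.json (illustrative).
import json

path = "output_dir/config.json"            # wherever export_hf.py wrote the model
with open(path) as f:
    config = json.load(f)
config["_attn_implementation"] = "flash_attention_2"
with open(path, "w") as f:
    json.dump(config, f, indent=2)
```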
\ No newline at end of file diff --git a/dev/eval/export_hf.py b/dev/eval/export_hf.py new file mode 100644 index 000000000..b52cc28ea --- /dev/null +++ b/dev/eval/export_hf.py @@ -0,0 +1,173 @@ +""" +Script to convert GPT2 models from llm.c binary format to Hugging Face + +It can optinally upload to your account on Hugging Face if you have the CLI: + pip install -U "huggingface_hub[cli]" + huggingface-cli login + +Export to a local HF model: + python export_hf.py --input input_file.bin --output output_dir + +Export to a local HF model and also push to your account on Hugging Face: + python export_hf.py --input input_file.bin --output output_dir --push true +""" + +import numpy as np +import torch +import argparse, sys +from transformers import GPT2Config, GPT2Tokenizer, GPT2LMHeadModel + +# ----------------------------------------------------------------------------- +# Tensor functions for both bfloat16 (from int16) and normal float32 +# Both return float32 tensors + +def tensor_bf16(data_int16, transpose=False): + if transpose: + data_int16 = data_int16.transpose(1,0) + return torch.tensor(data_int16).view(torch.bfloat16).to(torch.float32) + +def tensor_fp32(data_float32, transpose=False): + if transpose: + data_float32 = data_float32.transpose(1,0) + return torch.tensor(data_float32).view(torch.float32) + +# ----------------------------------------------------------------------------- +# Main conversion function + +def convert(filepath, output, push_to_hub=False, out_dtype="bfloat16"): + print(f"Converting model {filepath} to {output} in {out_dtype} format and pushing to Hugging Face: {push_to_hub}") + + f = open(filepath, 'rb') + # Read in our header, checking the magic number and version + # version 3 = fp32, padded vocab + # version 5 = bf16, padded vocab + model_header = np.frombuffer(f.read(256*4), dtype=np.int32) + if model_header[0] != 20240326: + print("ERROR: magic number mismatch in the data .bin file!") + exit(1) + version = model_header[1] + if not version in [3, 5]: + print("Bad version in model file") + exit(1) + + # Load in our model parameters + maxT = model_header[2].item() # max sequence length + V = model_header[3].item() # vocab size + L = model_header[4].item() # num layers + H = model_header[5].item() # num heads + C = model_header[6].item() # channels + Vp = model_header[7].item() # padded vocab size + + print(f"{version=}, {maxT=}, {V=}, {Vp=}, {L=}, {H=}, {C=}") + + # Define the shapes of our parameters + shapes = { + 'wte': (Vp, C), + 'wpe': (maxT, C), + 'ln1w': (L, C), + 'ln1b': (L, C), + 'qkvw': (L, 3 * C, C), + 'qkvb': (L, 3 * C), + 'attprojw': (L, C, C), + 'attprojb': (L, C), + 'ln2w': (L, C), + 'ln2b': (L, C), + 'fcw': (L, 4 * C, C), + 'fcb': (L, 4 * C), + 'fcprojw': (L, C, 4 * C), + 'fcprojb': (L, C), + 'lnfw': (C,), + 'lnfb': (C,), + } + + # Load in our weights given our parameter shapes + dtype = np.float32 if version == 3 else np.int16 + w = {} + for key, shape in shapes.items(): + num_elements = np.prod(shape) + data = np.frombuffer(f.read(num_elements * np.dtype(dtype).itemsize), dtype=dtype) + w[key] = data.reshape(shape) + # The binary file saves the padded vocab - drop the padding back to GPT2 size + if shape[0] == Vp: + w[key] = w[key].reshape(shape)[:(V-Vp), :] + # Ensure the file is fully read and then close + assert f.read() == b'' + f.close() + + # Map to our model dict, the tensors at this stage are always fp32 + mk_tensor = { + 3 : tensor_fp32, + 5 : tensor_bf16, + }[version] + model_dict = {} + model_dict['transformer.wte.weight'] = 
mk_tensor(w['wte']) + model_dict['transformer.wpe.weight'] = mk_tensor(w['wpe']) + model_dict['lm_head.weight'] = model_dict['transformer.wte.weight'] # Tie weights + for i in range(L): + model_dict[f'transformer.h.{i}.ln_1.weight'] = mk_tensor(w['ln1w'][i]) + model_dict[f'transformer.h.{i}.ln_1.bias'] = mk_tensor(w['ln1b'][i]) + model_dict[f'transformer.h.{i}.attn.c_attn.weight'] = mk_tensor(w['qkvw'][i], True) + model_dict[f'transformer.h.{i}.attn.c_attn.bias'] = mk_tensor(w['qkvb'][i]) + model_dict[f'transformer.h.{i}.attn.c_proj.weight'] = mk_tensor(w['attprojw'][i], True) + model_dict[f'transformer.h.{i}.attn.c_proj.bias'] = mk_tensor(w['attprojb'][i]) + model_dict[f'transformer.h.{i}.ln_2.weight'] = mk_tensor(w['ln2w'][i]) + model_dict[f'transformer.h.{i}.ln_2.bias'] = mk_tensor(w['ln2b'][i]) + model_dict[f'transformer.h.{i}.mlp.c_fc.weight'] = mk_tensor(w['fcw'][i], True) + model_dict[f'transformer.h.{i}.mlp.c_fc.bias'] = mk_tensor(w['fcb'][i]) + model_dict[f'transformer.h.{i}.mlp.c_proj.weight'] = mk_tensor(w['fcprojw'][i], True) + model_dict[f'transformer.h.{i}.mlp.c_proj.bias'] = mk_tensor(w['fcprojb'][i]) + model_dict['transformer.ln_f.weight'] = mk_tensor(w['lnfw']) + model_dict['transformer.ln_f.bias'] = mk_tensor(w['lnfb']) + + # Create a GPT-2 model instance, in the requested dtype + config = GPT2Config(vocab_size = V, + n_positions = maxT, + n_ctx = maxT, + n_embd = C, + n_layer = L, + n_head = H) + model = GPT2LMHeadModel(config) + if out_dtype == "bfloat16": + model = model.to(torch.bfloat16) + + # Set the model dict and save + model.load_state_dict(model_dict) + model.save_pretrained(output, max_shard_size="5GB", safe_serialization=True) + + # Copy over a standard gpt2 tokenizer + tokenizer = GPT2Tokenizer.from_pretrained("gpt2") + tokenizer.save_pretrained(output) + + if push_to_hub: + print(f"Uploading {output} to Hugging Face") + model.push_to_hub(output) + tokenizer.push_to_hub(output) + +def spin(output): + print("Taking the exported model for a spin...") + print('-'*80) + from transformers import AutoModelForCausalLM, AutoTokenizer + tokenizer = AutoTokenizer.from_pretrained(output) + model = AutoModelForCausalLM.from_pretrained(output, attn_implementation="flash_attention_2", torch_dtype=torch.bfloat16, device_map='cuda') + model.eval() + tokens = tokenizer.encode("During photosynthesis in green plants", return_tensors="pt") + tokens = tokens.to('cuda') + output = model.generate(tokens, max_new_tokens=64, repetition_penalty=1.3) + samples = tokenizer.batch_decode(output) + for sample in samples: + print('-'*30) + print(sample) + +# ----------------------------------------------------------------------------- + +if __name__== '__main__': + parser=argparse.ArgumentParser() + parser.add_argument("--input", "-i", help="The name of the llm.c model.bin file", type=str, required=True) + parser.add_argument("--output","-o", help="The Hugging Face output model directory", type=str, required=True) + parser.add_argument("--dtype", "-d", help="Output as either float32 or bfloat16 (default)", type=str, default="bfloat16") + parser.add_argument("--push", "-p", help="Push the model to your Hugging Face account", type=bool, default=False) + parser.add_argument("--spin", "-s", help="Take the model for a spin at the end?", type=bool, default=True) + args = parser.parse_args() + convert(args.input, args.output, args.push, args.dtype) + if args.spin: + spin(args.output) diff --git a/dev/eval/run_eval.sh b/dev/eval/run_eval.sh new file mode 100755 index 000000000..d1e28f612 --- 
/dev/null +++ b/dev/eval/run_eval.sh @@ -0,0 +1,52 @@ +# https://huggingface.co/spaces/open-llm-leaderboard/open_llm_leaderboard +# (See About tab -> REPRODUCIBILITY) + +# This script is intended to be run from the parent/root directory of llm.c repo. + +# Clone the evaluation harness: + +# git clone https://github.com/EleutherAI/lm-evaluation-harness/ +# cd lm-evaluation-harness +# git checkout b281b0921b636bc36ad05c0b0b0763bd6dd43463 +# pip install -e . + +# Then return to the parent directory and run this script + +# cd .. +# ./dev/eval/run_eval.sh [model_name] [result_name] + +# where model_name is either a HF model such as openai-community/gpt2 or a local path such as ./gpt2-124M-run1 +# and result_name is the name of the folder under lm-evaluation-harness/results to store the evaluations + +# Since the evals can take a couple of hours to run, depending on the model size, you may wish to +# run within a "screen" session or by using nohup to run the script: + +# nohup ./dev/eval/run_eval.sh [model_name] [result_name] > run.txt 2> err.txt & + +if [ -z "$1" ]; then + echo "Error: missing HuggingFace model name or path to local model" + echo "./run_eval.sh hf_account/model_name my_result" + exit 1 +fi +if [ -z "$2" ]; then + echo "Error: missing output name for results" + echo "./run_eval.sh hf_account/model_name my_result" + exit 1 +fi + +export MODEL="$(realpath -s "$1")" +export RESULT="$2" +echo "Evaluating model $MODEL" +echo "Saving results to ./lm-evaluation-harness/results/$RESULT" + +cd lm-evaluation-harness + +python main.py --model hf-causal-experimental --model_args pretrained=$MODEL,use_accelerate=True,trust_remote_code=True --tasks truthfulqa_mc --batch_size 1 --no_cache --write_out --output_path results/$RESULT/truthfulqa_0shot.json --device cuda +python main.py --model hf-causal-experimental --model_args pretrained=$MODEL,use_accelerate=True,trust_remote_code=True --tasks winogrande --batch_size 1 --no_cache --write_out --output_path results/$RESULT/winogrande_5shot.json --device cuda --num_fewshot 5 +python main.py --model hf-causal-experimental --model_args pretrained=$MODEL,use_accelerate=True,trust_remote_code=True --tasks arc_challenge --batch_size 1 --no_cache --write_out --output_path results/$RESULT/arc_challenge_25shot.json --device cuda --num_fewshot 25 +python main.py --model hf-causal-experimental --model_args pretrained=$MODEL,use_accelerate=True,trust_remote_code=True --tasks hellaswag --batch_size 1 --no_cache --write_out --output_path results/$RESULT/hellaswag_10shot.json --device cuda --num_fewshot 10 +python main.py --model hf-causal-experimental --model_args pretrained=$MODEL,use_accelerate=True,trust_remote_code=True --tasks gsm8k --batch_size 1 --no_cache --write_out --output_path results/$RESULT/gsm8k_5shot.json --device cuda --num_fewshot 5 +python main.py --model hf-causal-experimental --model_args pretrained=$MODEL,use_accelerate=True,trust_remote_code=True --tasks 
hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions --batch_size 1 --no_cache --write_out --output_path results/$RESULT/mmlu_5shot.json --device cuda --num_fewshot 5 + +cd .. +python dev/eval/summarize_eval.py lm-evaluation-harness/results/$RESULT diff --git a/dev/eval/summarize_eval.py b/dev/eval/summarize_eval.py new file mode 100644 index 000000000..82425264e --- /dev/null +++ b/dev/eval/summarize_eval.py @@ -0,0 +1,32 @@ +# example run command +# python dev/eval/summarize_eval.py lm-evaluation-harness/results/result774M +# this script is optional, the run_eval.sh should already print these +# but this script can be used to re-print them + +import json, sys + +RESULT = sys.argv[1] +print("-"*40) + +key = {"arc_challenge_25shot.json": "acc_norm", + "gsm8k_5shot.json": "acc", + "hellaswag_10shot.json": "acc_norm", + "mmlu_5shot.json": "acc", + "truthfulqa_0shot.json": "mc2", + "winogrande_5shot.json": "acc" + } + +total = 0 +for test in ["arc_challenge_25shot.json", "gsm8k_5shot.json", "hellaswag_10shot.json", "mmlu_5shot.json", "truthfulqa_0shot.json", "winogrande_5shot.json"]: + data = json.loads(open("./%s/%s"%(RESULT, test)).read()) + r_count = 0 + r_total = 0 + for test_name in data['results']: + r_count += 1 + r_total += data['results'][test_name][key[test]] + score = (r_total*100)/r_count + print(f"{test:<30} : {score:.4f}") + total += score +average = total / 6.0 +print("-"*40) +print(f"Average Score : {average:.4f}") diff --git a/dev/loss_checker_ci.py b/dev/loss_checker_ci.py new file mode 100644 index 000000000..897266715 --- /dev/null +++ b/dev/loss_checker_ci.py @@ -0,0 +1,66 @@ +# Description: A script to compare numbers in a file with fixed values and check for accuracy within a specified percent difference. 
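+# Flag meaning (matching the argparse options below): -f is the training log to check, -s and -e
+# are the character columns sliced out of each "step i/10" line, and -a is the allowed percent
+# difference. The script locates "step 1/10", parses the slice of that line and the next nine as
+# floats, and compares them against hard-coded reference losses taken from test_gpt2.cu, failing
+# if any value deviates by more than the allowance.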
+# Usage: python loss_checker_ci.py -f -s -e -a +# Example: python dev/loss_checker_ci.py -f train_gpt2cu_fp32_precision.txt -s 20 -e 28 -a 10.0 +import sys +import argparse + +def read_numbers_from_file(file_path, col_start, col_end): + try: + numbers = [] + with open(file_path, 'r') as file: + lines = file.readlines() + start_index = None + for i, line in enumerate(lines): + if "step 1/10" in line: + start_index = i + break + + if start_index is None: + print("Error: Could not find the string 'step 1/10' in the file.") + return None + + # Read 10 rows starting from the identified start row + for line in lines[start_index:start_index + 10]: + # Extracting the specified columns + number = float(line[col_start:col_end].strip()) + numbers.append(number) + return numbers + except Exception as e: + print(f"Error reading the file: {e}") + return None + +def compare_numbers(read_values, fixed_values, percent_accuracy): + for i in range(len(read_values)): + read_value = read_values[i] + fixed_value = fixed_values[i] + percent_difference = ((read_value - fixed_value) / fixed_value) * 100 + print(f"Fixed Value: {fixed_value}, Read Value: {read_value}, Percent Difference: {percent_difference:.2f}%") + if abs(percent_difference) > percent_accuracy: + print(f"Error: Percent difference {percent_difference:.2f}% exceeds the allowed accuracy of {percent_accuracy}%") + return 1 + print("Success: All values are within the allowed accuracy.") + return 0 + +def main(): + parser = argparse.ArgumentParser(description='Compare numbers in a file with fixed values.') + parser.add_argument('-f', '--file', required=True, help='Path to the input file') + parser.add_argument('-s', '--col_start', type=int, required=True, help='Starting column index (0-based)') + parser.add_argument('-e', '--col_end', type=int, required=True, help='Ending column index (0-based)') + parser.add_argument('-a', '--percent_accuracy', type=float, required=True, help='Allowed percent accuracy for comparison') + + args = parser.parse_args() + + # Read numbers from file + read_values = read_numbers_from_file(args.file, args.col_start, args.col_end) + if read_values is None: + return 1 + + # Use values from test_gpt2.cu for fp32 precision + fixed_values = [5.270009,4.060681,3.320085,2.717550,2.181066,1.653923,1.168050,0.736873,0.401021,0.187493]; + + # Compare the numbers and check accuracy + result = compare_numbers(read_values, fixed_values, args.percent_accuracy) + return result + +if __name__ == "__main__": + sys.exit(main()) diff --git a/dev/test/Makefile b/dev/test/Makefile new file mode 100644 index 000000000..dfc1d250f --- /dev/null +++ b/dev/test/Makefile @@ -0,0 +1,166 @@ +CC ?= gcc +# example: make test_dataloader TEST_CFLAGS=-fsanitize=address -fno-omit-frame-pointer +CFLAGS = -Ofast -Wno-unused-result -Wno-ignored-pragmas -Wno-unknown-attributes -g +CFLAGS += $(TEST_CFLAGS) +LDFLAGS = +LDLIBS = -lm +INCLUDES = +CFLAGS_COND = -march=native + +# Find nvcc +SHELL_UNAME = $(shell uname) +REMOVE_FILES = rm -f +OUTPUT_FILE = -o $@ +CUDA_OUTPUT_FILE = -o $@ + +# NVCC flags +# -t=0 is short for --threads, 0 = number of CPUs on the machine +NVCC_FLAGS = -O3 -t=0 --use_fast_math -std=c++17 +NVCC_LDFLAGS = -lcublas -lcublasLt +NVCC_INCLUDES = +NVCC_LDLIBS = +NVCC_CUDNN = +# By default we don't build with cudnn because it blows up compile time from a few seconds to ~minute +USE_CUDNN ?= 0 + +# We will place .o files in the `build` directory (create it if it doesn't exist) +BUILD_DIR = build +$(shell mkdir -p $(BUILD_DIR)) 
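+# The build directory is only used for CUDA object files (e.g. $(BUILD_DIR)/cudnn_att.o when
+# USE_CUDNN=1); the plain C test targets below compile straight to executables in dev/test/.
+# Illustrative invocations: `make test_dataloader`, optionally with the sanitizer flags shown
+# above via TEST_CFLAGS, or with GPU_COMPUTE_CAPABILITY=80 to skip the __nvcc_device_query
+# autodetection further down.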
+REMOVE_BUILD_OBJECT_FILES := rm -f $(BUILD_DIR)/*.o + +# Function to check if a file exists in the PATH +define file_exists_in_path + $(which $(1) 2>/dev/null) +endef + +ifneq ($(CI),true) # if not in CI, then use the GPU query + ifndef GPU_COMPUTE_CAPABILITY # set to defaults if: make GPU_COMPUTE_CAPABILITY= + ifneq ($(call file_exists_in_path, __nvcc_device_query),) + GPU_COMPUTE_CAPABILITY = $(shell __nvcc_device_query) + GPU_COMPUTE_CAPABILITY := $(strip $(GPU_COMPUTE_CAPABILITY)) + endif + endif +endif + +# set to defaults if - make GPU_COMPUTE_CAPABILITY= otherwise use the compute capability detected above +ifneq ($(GPU_COMPUTE_CAPABILITY),) + NVCC_FLAGS += --generate-code arch=compute_$(GPU_COMPUTE_CAPABILITY),code=[compute_$(GPU_COMPUTE_CAPABILITY),sm_$(GPU_COMPUTE_CAPABILITY)] +endif + +# autodect a lot of various supports on current platform +$(info ---------------------------------------------) + +NVCC := $(shell which nvcc 2>/dev/null) + +# Check and include cudnn if available +# You can override the path to cudnn frontend by setting CUDNN_FRONTEND_PATH on the make command line +# By default, we look for it in HOME/cudnn-frontend/include and ./cudnn-frontend/include +# Refer to the README for cuDNN install instructions +ifeq ($(USE_CUDNN), 1) + ifeq ($(shell [ -d $(HOME)/cudnn-frontend/include ] && echo "exists"), exists) + $(info ✓ cuDNN found, will run with flash-attention) + CUDNN_FRONTEND_PATH ?= $(HOME)/cudnn-frontend/include + else ifeq ($(shell [ -d cudnn-frontend/include ] && echo "exists"), exists) + $(info ✓ cuDNN found, will run with flash-attention) + CUDNN_FRONTEND_PATH ?= cudnn-frontend/include + else + $(error ✗ cuDNN not found. See the README for install instructions and the Makefile for hard-coded paths) + endif + NVCC_INCLUDES += -I$(CUDNN_FRONTEND_PATH) + NVCC_LDFLAGS += -lcudnn + NVCC_FLAGS += -DENABLE_CUDNN + NVCC_CUDNN = $(BUILD_DIR)/cudnn_att.o +else + $(info → cuDNN is manually disabled by default, run make with `USE_CUDNN=1` to try to enable) +endif + +# Check if OpenMP is available +# This is done by attempting to compile an empty file with OpenMP flags +# OpenMP makes the code a lot faster so I advise installing it +# e.g. on MacOS: brew install libomp +# e.g. 
on Ubuntu: sudo apt-get install libomp-dev +# later, run the program by prepending the number of threads, e.g.: OMP_NUM_THREADS=8 ./gpt2 +# First, check if NO_OMP is set to 1, if not, proceed with the OpenMP checks +ifeq ($(NO_OMP), 1) + $(info OpenMP is manually disabled) +else + ifneq ($(OS), Windows_NT) + # Check for OpenMP support in GCC or Clang on Linux + ifeq ($(shell echo | $(CC) -fopenmp -x c -E - > /dev/null 2>&1; echo $$?), 0) + CFLAGS += -fopenmp -DOMP + LDLIBS += -lgomp + $(info ✓ OpenMP found) + else + $(info ✗ OpenMP not found) + endif + endif +endif + +# Check if OpenMPI and NCCL are available, include them if so, for multi-GPU training +ifeq ($(NO_MULTI_GPU), 1) + $(info → Multi-GPU (OpenMPI + NCCL) is manually disabled) +else + ifeq ($(shell [ -d /usr/lib/x86_64-linux-gnu/openmpi/lib/ ] && [ -d /usr/lib/x86_64-linux-gnu/openmpi/include/ ] && echo "exists"), exists) + $(info ✓ OpenMPI found, OK to train with multiple GPUs) + NVCC_INCLUDES += -I/usr/lib/x86_64-linux-gnu/openmpi/include + NVCC_LDFLAGS += -L/usr/lib/x86_64-linux-gnu/openmpi/lib/ + NVCC_LDLIBS += -lmpi -lnccl + NVCC_FLAGS += -DMULTI_GPU + else + $(info ✗ OpenMPI is not found, disabling multi-GPU support) + $(info ---> On Linux you can try install OpenMPI with `sudo apt install openmpi-bin openmpi-doc libopenmpi-dev`) + endif +endif + +# Precision settings, default to bf16 but ability to override +ifeq ($(MAKECMDGOALS), clean) + PRECISION=BF16 +endif + +VALID_PRECISIONS := FP32 FP16 BF16 +ifeq ($(filter $(PRECISION),$(VALID_PRECISIONS)),) + $(error Invalid precision $(PRECISION), valid precisions are $(VALID_PRECISIONS)) +endif +ifeq ($(PRECISION), FP32) + PFLAGS = -DENABLE_FP32 +else ifeq ($(PRECISION), FP16) + PFLAGS = -DENABLE_FP16 +else + PFLAGS = -DENABLE_BF16 +endif + +# PHONY means these targets will always be executed +.PHONY: all clean + +# Add targets +TARGETS = test_dataloader + +# Dependency files +test_dataloader_dependencies = test_dataloader.d +HEADER_DEPENDENCIES = $(test_dataloader_dependencies) + +# Conditional inclusion of CUDA targets +ifeq ($(NVCC),) + $(info ✗ nvcc not found, skipping GPU/CUDA builds) +else + $(info ✓ nvcc found, including GPU/CUDA support) + TARGETS += +endif + +$(info ---------Build Configuration Complete - Build Targets -------------------------) + +all: $(TARGETS) + +# Generate dependency files +%.d: %.c + $(CC) $(CFLAGS) -MMD -MP -MF $@ -c $< + +# Include the dependency files +-include test_dataloader.d + +test_dataloader: test_dataloader.c + $(CC) $(CFLAGS) $(INCLUDES) $(LDFLAGS) -MMD -MP $^ $(LDLIBS) $(OUTPUT_FILE) + +clean: + $(REMOVE_FILES) $(TARGETS) *.d *.o + $(REMOVE_BUILD_OBJECT_FILES) diff --git a/dev/test/device_file_io.cu b/dev/test/device_file_io.cu new file mode 100644 index 000000000..71fb1ce7e --- /dev/null +++ b/dev/test/device_file_io.cu @@ -0,0 +1,64 @@ +/* +Tests device <-> file IO functions + +compile and run as (from dev/test directory) +nvcc -o device_file_io device_file_io.cu && ./device_file_io +*/ + + +#include "../../llmc/cuda_common.h" +#include +#include +#include +#include + +void test(size_t nelem, size_t wt_buf_size, size_t rd_buf_size) { + + float* data; + cudaCheck(cudaMalloc(&data, nelem*sizeof(float))); + + // generate random array + std::vector random_data(nelem); + std::mt19937 rng(42); + std::uniform_real_distribution dist(-100.f, 100.f); + std::generate(random_data.begin(), random_data.end(), [&](){ return dist(rng); }); + + cudaCheck(cudaMemcpy(data, random_data.data(), random_data.size()*sizeof(float), cudaMemcpyHostToDevice)); 
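+    // The rest of this function is the actual round trip: stream the device buffer out to
+    // tmp.bin with a write-side buffer of wt_buf_size bytes, read it back into a fresh device
+    // allocation with a read-side buffer of rd_buf_size bytes, and compare element by element
+    // on the host. The mismatched buffer sizes deliberately exercise the double-buffering logic
+    // in device_to_file / file_to_device.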
+ + cudaStream_t stream; + cudaStreamCreate(&stream); + + FILE* tmp = fopenCheck("tmp.bin", "w"); + device_to_file(tmp, data, nelem * sizeof(float), wt_buf_size, stream); + fcloseCheck(tmp); + + + float* reload; + cudaCheck(cudaMalloc(&reload, nelem*sizeof(float))); + + tmp = fopenCheck("tmp.bin", "r"); + file_to_device(reload, tmp, nelem * sizeof(float), rd_buf_size, stream); + fcloseCheck(tmp); + + std::vector cmp(nelem); + cudaCheck(cudaMemcpy(cmp.data(), reload, nelem * sizeof(float), cudaMemcpyDeviceToHost)); + for(int i = 0; i < nelem; ++i) { + if(random_data[i] != cmp[i]) { + fprintf(stderr, "FAIL: Mismatch at position %d: %f vs %f\n", i, random_data[i], cmp[i]); + remove("tmp.bin"); + exit(EXIT_FAILURE); + } + } + + cudaCheck(cudaFree(reload)); + cudaCheck(cudaFree(data)); + remove("tmp.bin"); +} + +int main() { + test(1025, 10000, 10000); // buffers larger than data + test(1025, 1024, 513); // different and smaller + test(500, 500*sizeof(float), + 500*sizeof(float)); // exact match + test(125'000, 10000, 10000); // large array +} \ No newline at end of file diff --git a/dev/test/test_dataloader.c b/dev/test/test_dataloader.c new file mode 100644 index 000000000..2803da022 --- /dev/null +++ b/dev/test/test_dataloader.c @@ -0,0 +1,304 @@ +/* +Tests our DataLoader + +compile and run as (from dev/test directory) +gcc -O3 -I../../llmc -o test_dataloader test_dataloader.c -lm && ./test_dataloader + +TODOs: +- test load/save state of DataLoader +*/ +#include +#include "../../llmc/dataloader.h" + +#define SHARD_NAME_LEN 64 +char shard_name[SHARD_NAME_LEN]; +const int num_tokens = 140; +int num_shards = 4; + +void check_range(const int *tokens, const int start, const int end, const char *file, int line) { + // checks that the tokens[0, ... end-start] are the range [start, end) + int n = end - start; + for (int i = 0; i < n; i++) { + int token = tokens[i]; + if (token != start + i) { + fprintf(stderr, "Error: tokens[%d] = %d, expected %d\n", i, token, start + i); + fprintf(stderr, "Error details:\n"); + fprintf(stderr, " File: %s\n", file); + fprintf(stderr, " Line: %d\n", line); + exit(EXIT_FAILURE); + } + } + // printf("tokens in range [%d, %d) OK\n", start, end); +} +#define checkRange(tokens, start, end) check_range(tokens, start, end, __FILE__, __LINE__) + +void check_equals(const int *tokens, const int n, const int expected, const char *file, int line) { + // checks that the tokens[0, ... n] are all equal to expected + for (int i = 0; i < n; i++) { + int token = tokens[i]; + if (token != expected) { + fprintf(stderr, "Error: tokens[%d] = %d, expected %d\n", i, token, expected); + fprintf(stderr, "Error details:\n"); + fprintf(stderr, " File: %s\n", file); + fprintf(stderr, " Line: %d\n", line); + exit(EXIT_FAILURE); + } + } + // printf("tokens all equal to %d OK\n", expected); +} +#define checkEquals(tokens, n, expected) check_equals(tokens, n, expected, __FILE__, __LINE__) + +void test_simple(void) { + /* + Tests the simplest DataLoader functionality: + - multi-shard + - single-process + - not shuffled + DataLoader should just return all the tokens in order + */ + printf("test_simple... 
"); + int B = 4; + int T = 8; + int process_rank = 0; + int num_processes = 1; + int should_shuffle = 0; + snprintf(shard_name, SHARD_NAME_LEN, "shard_????.bin"); + DataLoader loader; + dataloader_init(&loader, shard_name, B, T, process_rank, num_processes, should_shuffle); + + int batches_fit = num_tokens / (B * T); // number of batches that fit per shard + int BT = B * T; + int num_epochs = 4; + for (int e = 0; e < num_epochs; e++) { // epoch + for (int s = 0; s < num_shards; s++) { // shard + int start = s * num_tokens; + for (int b = 0; b < batches_fit; b++) { // batch + dataloader_next_batch(&loader); + checkRange(loader.inputs, start, start + BT); + checkRange(loader.targets, start + 1, start + BT + 1); + start += BT; + } + } + } + dataloader_free(&loader); + printf("OK\n"); +} + +void test_multiprocess_simple(void) { + /* + Same as simple above, but using 2 processes. + (which we of course use in a serial, single process way here) + The DataLoaders simply pull chunks of consecutive tokens, so + we expect them to alternate in the "token space". + */ + printf("test_multiprocess_simple... "); + int B = 4; + int T = 8; + int num_processes = 2; + int should_shuffle = 0; + snprintf(shard_name, SHARD_NAME_LEN, "shard_????.bin"); + DataLoader loader0, loader1; + dataloader_init(&loader0, shard_name, B, T, 0, num_processes, should_shuffle); + dataloader_init(&loader1, shard_name, B, T, 1, num_processes, should_shuffle); + + int batches_fit = num_tokens / (B * T * num_processes); // number of batches that fit per shard + int BT = B * T; + int num_epochs = 4; + for (int e = 0; e < num_epochs; e++) { // epoch + for (int s = 0; s < num_shards; s++) { // shard + int start = s * num_tokens; + for (int b = 0; b < batches_fit; b++) { // batch + dataloader_next_batch(&loader0); + dataloader_next_batch(&loader1); + checkRange(loader0.inputs, start, start + BT); + checkRange(loader1.inputs, start + BT, start + 2*BT); + checkRange(loader0.targets, start + 1, start + BT + 1); + checkRange(loader1.targets, start + BT + 1, start + 2*BT + 1); + start += 2*BT; + } + } + } + + dataloader_free(&loader0); + dataloader_free(&loader1); + printf("OK\n"); +} + +void test_shuffled(void) { + /* + Tests the DataLoader when using shuffled: + - multi-shard + - single-process + - shuffled! + DataLoader should return all the tokens, but in randperm order. + So all we check is that we see all the tokens we expect to see, + the correct number of times. + */ + printf("test_shuffled... 
"); + int B = 4; + int T = 8; + int process_rank = 0; + int num_processes = 1; + int should_shuffle = 1; // should shuffle bit turn on + snprintf(shard_name, 64, "shard_????.bin"); + DataLoader loader; + dataloader_init(&loader, shard_name, B, T, process_rank, num_processes, should_shuffle); + + // get batches from the dataloader and keep stats on what tokens we see + int total_tokens = num_shards * num_tokens; + int *num_seen_inputs = (int *)calloc(total_tokens, sizeof(int)); + int *num_seen_targets = (int *)calloc(total_tokens, sizeof(int)); + int batches_fit = num_tokens / (B * T); // number of batches that fit per shard + int BT = B * T; + int num_epochs = 4; + for (int e = 0; e < num_epochs; e ++) { // epoch + for (int s = 0; s < num_shards; s++) { // shard + int start = s * num_tokens; + for (int b = 0; b < batches_fit; b++) { // batch + dataloader_next_batch(&loader); + // count up the tokens we see + for (int i = 0; i < BT; i++) { + int input_token = loader.inputs[i]; + int target_token = loader.targets[i]; + assert(input_token >= 0 && input_token < total_tokens); + assert(target_token >= 0 && target_token < total_tokens); + num_seen_inputs[input_token]++; + num_seen_targets[target_token]++; + } + start += BT; + } + } + } + + // verify that we saw all the tokens the correct number of times + int tokens_fit = batches_fit * BT; // number of tokens that fit per shard + for (int s = 0; s < num_shards; s++) { + int start = s * num_tokens; + // verify the inputs counts for this shard: + // - the first tokens_fit should have been seen num_epochs times + // - the rest of the tokens in that should should have been seen zero times + checkEquals(num_seen_inputs + start, tokens_fit, num_epochs); + checkEquals(num_seen_inputs + start + tokens_fit, num_tokens - tokens_fit, 0); + // verify the target counts. same thing but offset by 1 + checkEquals(num_seen_targets + start + 1, tokens_fit, num_epochs); + checkEquals(num_seen_targets + start + 1 + tokens_fit, + (s == (num_shards - 1)) ? num_tokens - tokens_fit - 1 : num_tokens - tokens_fit,0); + } + + dataloader_free(&loader); + free(num_seen_inputs); + free(num_seen_targets); + printf("OK\n"); +} + +void test_multiprocess_shuffled(void) { + /* + Tests the DataLoader when using both multiprocess and shuffled: + - multi-shard + - multi-process + - shuffled! + DataLoaders should return all the tokens, but in randperm order. + So all we check is that we see all the tokens we expect to see, + the correct number of times, over multiple epochs. + */ + + printf("test_multiprocess_shuffled... 
"); + int B = 4; + int T = 8; + const int num_processes = 2; + int should_shuffle = 0; + snprintf(shard_name, SHARD_NAME_LEN, "shard_????.bin"); + DataLoader loaders[num_processes]; + for (int i = 0; i < num_processes; i++) { + dataloader_init(&loaders[i], shard_name, B, T, i, num_processes, should_shuffle); + } + + // get batches from the dataloader and keep stats on what tokens we see + int total_tokens = num_shards * num_tokens; + int *num_seen_inputs = (int *)calloc(total_tokens, sizeof(int)); + int *num_seen_targets = (int *)calloc(total_tokens, sizeof(int)); + int batches_fit = num_tokens / (B * T * num_processes); // number of batches that fit per shard + int BT = B * T; + int num_epochs = 4; + for (int e = 0; e < num_epochs; e ++) { // epoch + for (int s = 0; s < num_shards; s++) { // shard + int start = s * num_tokens; + for (int b = 0; b < batches_fit; b++) { // batch + for (int n = 0; n < num_processes; n++) { // dataloader + DataLoader *loader = &loaders[n]; + dataloader_next_batch(loader); + // count up the tokens we see + for (int i = 0; i < BT; i++) { + int input_token = loader->inputs[i]; + int target_token = loader->targets[i]; + assert(input_token >= 0 && input_token < total_tokens); + assert(target_token >= 0 && target_token < total_tokens); + num_seen_inputs[input_token]++; + num_seen_targets[target_token]++; + } + start += BT; + } + } + } + } + + // verify that we saw all the tokens the correct number of times + int tokens_fit = batches_fit * (B * T * num_processes); // number of tokens that fit per shard + for (int s = 0; s < num_shards; s++) { + int start = s * num_tokens; // token id that starts this shard + // verify the inputs counts for this shard: + // - the first tokens_fit should have been seen num_epochs times + // - the rest of the tokens in that should should have been seen zero times + checkEquals(num_seen_inputs + start, tokens_fit, num_epochs); + checkEquals(num_seen_inputs + start + tokens_fit, num_tokens - tokens_fit, 0); + // verify the target counts. same thing but offset by 1 + checkEquals(num_seen_targets + start + 1, tokens_fit, num_epochs); + checkEquals(num_seen_targets + start + 1 + tokens_fit, + (s == (num_shards - 1)) ? 
num_tokens - tokens_fit - 1 : num_tokens - tokens_fit,0); + } + + // cleanup + for (int i = 0; i < num_processes; i++) { + dataloader_free(&loaders[i]); + } + free(num_seen_inputs); + free(num_seen_targets); + printf("OK\n"); +} + +int main(void) { + + // generate a few dummy shards of data with incrementing tokens + int header[HEADER_SIZE]; + uint16_t tokens[num_tokens]; + for (int shard_id = 0; shard_id < num_shards; shard_id++) { + // ensure unique tokens across the shards for ez accounting below + int token_offset = shard_id * num_tokens; + for (int i = 0; i < num_tokens; i++) { + tokens[i] = token_offset + i; + } + // write the shard + snprintf(shard_name, SHARD_NAME_LEN, "shard_%04d.bin", shard_id); + header[0] = 20240520; // magic + header[1] = 1; // version + header[2] = num_tokens; // number of tokens within + FILE* shard_file = fopenCheck(shard_name, "wb"); + fwrite(header, sizeof(int), HEADER_SIZE, shard_file); + fwrite(tokens, sizeof(uint16_t), num_tokens, shard_file); + fcloseCheck(shard_file); + printf("Wrote shard %s\n", shard_name); + } + + test_simple(); + test_multiprocess_simple(); + test_shuffled(); + test_multiprocess_shuffled(); + + // clean up the shards + for (int shard_id = 0; shard_id < num_shards; shard_id++) { + snprintf(shard_name, SHARD_NAME_LEN, "shard_%04d.bin", shard_id); + remove(shard_name); + } + + return EXIT_SUCCESS; +} \ No newline at end of file diff --git a/dev/test/test_outlier_detector.c b/dev/test/test_outlier_detector.c new file mode 100644 index 000000000..75b9ca354 --- /dev/null +++ b/dev/test/test_outlier_detector.c @@ -0,0 +1,52 @@ +/* +Tests our OutlierDetector + +compile and run as (from dev/test directory) +gcc -O3 -I../../llmc -o test_outlier_detector test_outlier_detector.c -lm && ./test_outlier_detector +*/ + +#include +#include "../../llmc/outlier_detector.h" + +int main(void) { + OutlierDetector detector; + init_detector(&detector); + + srand(1337); // init rng + + // generate OUTLIER_DETECTOR_WINDOW_SIZE * 2 random numbers between -1 and 1 + for (int i = 0; i < OUTLIER_DETECTOR_WINDOW_SIZE * 2; i++) { + double val = (double)rand() / RAND_MAX * 2 - 1; // Random number between -1 and 1 + double zscore = update_detector(&detector, val); + + printf("Step %d: Value = %.4f, zscore = %.4f\n", i, val, zscore); + + // check that the first OUTLIER_DETECTOR_WINDOW_SIZE values return nan + if (i < OUTLIER_DETECTOR_WINDOW_SIZE) { + if (!isnan(zscore)) { + printf("Error: Expected nan, got %.4f\n", zscore); + return EXIT_FAILURE; + } + } else { + // check that the zscore is within reasonable bounds + if (zscore < -3.0 || zscore > 3.0) { + printf("Error: Z-score %.4f is outside of expected range\n", zscore); + return EXIT_FAILURE; + } + } + } + + // simulate an outlier + double outlier = 10.0; // <--- loss spike + double zscore = update_detector(&detector, outlier); + printf("Outlier Step: Value = %.4f, zscore = %.4f\n", outlier, zscore); + + // check that the z-score here is large + if (zscore < 5.0) { + printf("Error: Z-score %.4f is not large enough for an outlier\n", zscore); + return EXIT_FAILURE; + } + + printf("OK\n"); + return EXIT_SUCCESS; +} diff --git a/dev/unistd.h b/dev/unistd.h index 337f29ad2..fa407fc61 100644 --- a/dev/unistd.h +++ b/dev/unistd.h @@ -4,11 +4,17 @@ #define _CRT_SECURE_NO_WARNINGS #define _USE_MATH_DEFINES +#define WIN32_LEAN_AND_MEAN #include #include -//#define gen_max_length 64 // compile as C++ to skip this VLA issue #include +#include // for malloc and free +#include +#include // for _mkdir and _stat +#include // 
needed for _access below and _findfirst, _findnext, _findclose +#pragma comment(lib, "Ws2_32.lib") // Link Ws2_32.lib for socket functions +#include #define CLOCK_MONOTONIC 0 static inline int clock_gettime(int ignore_variable, struct timespec* tv) @@ -17,14 +23,12 @@ static inline int clock_gettime(int ignore_variable, struct timespec* tv) } #define OMP /* turn it on */ -#include /* needed for access below */ #define F_OK 0 #define access _access #define TURN_OFF_FP_FAST __pragma(float_control( precise, on, push )) // Save current setting and turn on /fp:precise #define TURN_ON_FP_FAST __pragma(float_control(pop)) // Restore file's default settings -#include /* for _mkdir and _stat */ #define mkdir(path, mode) _mkdir(path) /* sketchy way to get mkdir to work on windows */ #define stat _stat @@ -59,7 +63,7 @@ static inline int glob(const char* pattern, int ignored_flags, int (*ignored_err replace_forward_slashes (pattern_copy); // Replace forward slashes with backslashes - if (strchr(pattern_copy, '\\') != NULL) { + if (strchr(pattern_copy, '\\') != (void*) NULL) { strncpy_s(directory_path, sizeof(directory_path) - 1, pattern_copy, strrchr(pattern_copy, '\\') - pattern_copy + 1); directory_path[strrchr(pattern_copy, '\\') - pattern_copy + 1] = '\0'; } diff --git a/dev/vislog.ipynb b/dev/vislog.ipynb index 6cd9d51e1..96dbe4877 100644 --- a/dev/vislog.ipynb +++ b/dev/vislog.ipynb @@ -35,7 +35,6 @@ " with open(logfile, \"r\") as f:\n", " for line in f:\n", " parts = line.split()\n", - " assert len(parts) == 2\n", " step = int(parts[0].split(\":\")[1])\n", " stream = parts[1].split(\":\")[0]\n", " val = float(parts[1].split(\":\")[1])\n", @@ -53,7 +52,7 @@ " # return the xs, ys lists\n", " return streams_xy\n", "\n", - "# parse_logfile(\"../log124M/main.log\")" + "parse_logfile(\"../log124M/main.log\")" ] }, { @@ -62,55 +61,85 @@ "metadata": {}, "outputs": [], "source": [ - "sz = \"350M\"\n", + "import numpy as np\n", + "\n", + "sz = \"124M\"\n", "loss_baseline = {\n", " \"124M\": 3.424958,\n", " \"350M\": 3.083089,\n", " \"774M\": 3.000580,\n", " \"1558M\": 2.831273,\n", "}[sz]\n", - "hella_baseline = {\n", + "hella2_baseline = { # for GPT-2\n", " \"124M\": 0.294463,\n", " \"350M\": 0.375224,\n", " \"774M\": 0.431986,\n", " \"1558M\": 0.488946,\n", "}[sz]\n", - "\n", + "hella3_baseline = { # for GPT-3\n", + " \"124M\": 0.337,\n", + " \"350M\": 0.436,\n", + " \"774M\": 0.510,\n", + " \"1558M\": 0.547,\n", + "}[sz]\n", "# assumes each model run is stored in this way\n", - "logfile = f\"../log{sz}/main.log\"\n", + "logfile = f\"../log_gpt2_{sz}/main.log\"\n", "streams = parse_logfile(logfile)\n", "\n", + "# optional function that smooths out the loss some\n", + "def smooth_moving_average(signal, window_size):\n", + " if signal.ndim != 1:\n", + " raise ValueError(\"smooth_moving_average only accepts 1D arrays.\")\n", + " if signal.size < window_size:\n", + " raise ValueError(\"Input vector needs to be bigger than window size.\")\n", + " if window_size < 3:\n", + " return signal\n", + "\n", + " s = np.pad(signal, (window_size//2, window_size-1-window_size//2), mode='edge')\n", + " w = np.ones(window_size) / window_size\n", + " smoothed_signal = np.convolve(s, w, mode='valid')\n", + " return smoothed_signal\n", + "\n", "plt.figure(figsize=(16, 6))\n", "\n", "# Panel 1: losses: both train and val\n", "plt.subplot(121)\n", "xs, ys = streams[\"trl\"] # training loss\n", + "ys = np.array(ys)\n", + "# smooth out ys using a rolling window\n", + "# ys = smooth_moving_average(ys, 21) # optional\n", 
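+    "# (a window of 21 averages each step with its ten neighbours on either side, edge-padded)\n",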
"plt.plot(xs, ys, label=f'llm.c ({sz}) train loss')\n", "print(\"Min Train Loss:\", min(ys))\n", "xs, ys = streams[\"tel\"] # validation loss\n", "plt.plot(xs, ys, label=f'llm.c ({sz}) val loss')\n", "# horizontal line at GPT-2 baseline\n", + "# we don't have GPT-3 loss on this dataset because the weights were never released\n", "if loss_baseline is not None:\n", " plt.axhline(y=loss_baseline, color='r', linestyle='--', label=f\"OpenAI GPT-2 ({sz}) checkpoint val loss\")\n", "plt.xlabel(\"steps\")\n", "plt.ylabel(\"loss\")\n", "plt.yscale('log')\n", + "plt.ylim(top=4.0)\n", "plt.legend()\n", "plt.title(\"Loss\")\n", "print(\"Min Validation Loss:\", min(ys))\n", "\n", "# Panel 2: HellaSwag eval\n", "plt.subplot(122)\n", - "xs, ys = streams[\"eval\"] # HellaSwag eval\n", - "plt.plot(xs, ys, label=f\"llm.c ({sz})\")\n", - "# horizontal line at GPT-2 baseline\n", - "if hella_baseline:\n", - " plt.axhline(y=hella_baseline, color='r', linestyle='--', label=f\"OpenAI GPT-2 ({sz}) checkpoint\")\n", - "plt.xlabel(\"steps\")\n", - "plt.ylabel(\"accuracy\")\n", - "plt.legend()\n", - "plt.title(\"HellaSwag eval\")\n", - "print(\"Max Hellaswag eval:\", max(ys))" + "if \"eval\" in streams:\n", + " xs, ys = streams[\"eval\"] # HellaSwag eval\n", + " ys = np.array(ys)\n", + " plt.plot(xs, ys, label=f\"llm.c ({sz})\")\n", + " # horizontal line at GPT-2/3 baselines\n", + " if hella2_baseline:\n", + " plt.axhline(y=hella2_baseline, color='r', linestyle='--', label=f\"OpenAI GPT-2 ({sz}) checkpoint\")\n", + " if hella3_baseline:\n", + " plt.axhline(y=hella3_baseline, color='g', linestyle='--', label=f\"OpenAI GPT-3 ({sz}) checkpoint\")\n", + " plt.xlabel(\"steps\")\n", + " plt.ylabel(\"accuracy\")\n", + " plt.legend()\n", + " plt.title(\"HellaSwag eval\")\n", + " print(\"Max Hellaswag eval:\", max(ys))\n" ] } ], diff --git a/llmc/adamw.cuh b/llmc/adamw.cuh new file mode 100644 index 000000000..4453576ee --- /dev/null +++ b/llmc/adamw.cuh @@ -0,0 +1,98 @@ +/* +AdamW kernel +*/ + +// llmc internal imports +#include "cuda_common.h" +#include "cuda_utils.cuh" + +// ---------------------------------------------------------------------------- +// CUDA kernels + +// Implements linear interpolation using only two floating-point operations (as opposed to three in a naive implementation). +// Reference: https://developer.nvidia.com/blog/lerp-faster-cuda +__device__ float lerp(float start, float end, float weight) { + return fma(weight, end, fma(-weight, start, start)); +} + +template +__device__ void adamw_update(Tp* params_memory, float* master_params_memory, Tg* grads_memory, float* m_memory, float* v_memory, size_t num_parameters, + float learning_rate, float beta1, float beta2, float beta1_correction, float beta2_correction, float eps, float weight_decay, + float grad_scale, unsigned int seed) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= num_parameters) { return; } // guard + + // get the gradient, m, and v for this parameter + float grad = grad_scale * (float)grads_memory[idx]; + float m = m_memory[idx]; + float v = v_memory[idx]; + // update the first moment (momentum) + m = lerp(grad, m, beta1); + m_memory[idx] = m; + // update the second moment (RMSprop) + v = lerp(grad * grad, v, beta2); + v_memory[idx] = v; + m /= beta1_correction; // m_hat + v /= beta2_correction; // v_hat + // fetch the old value of this parameter as a float, from either source + float old_param = (master_params_memory != NULL) ? 
master_params_memory[idx] : (float)params_memory[idx]; + // update this parameter + float param = old_param - (learning_rate * (m / (sqrtf(v) + eps) + weight_decay * old_param)); + // update our low precision version of the parameters using stochastic rounding + // this will be used in the next forward pass + stochastic_rounding(param, ¶ms_memory[idx], seed); + // write the full, float version of the param into our master copy, if we maintain one + // this will be used in the next update + if (master_params_memory != NULL) { master_params_memory[idx] = param; } +} + +template +__global__ void adamw_kernel3(Tp* params_memory, float* master_params_memory, Tg* grads_memory, float* m_memory, float* v_memory, size_t num_parameters, + ptrdiff_t w_stride, ptrdiff_t g_stride, ptrdiff_t s_stride, + float learning_rate, float beta1, float beta2, float beta1_correction, float beta2_correction, float eps, float weight_decay, + float grad_scale, unsigned int seed) { + adamw_update(params_memory + blockIdx.y * w_stride, + master_params_memory ? master_params_memory + blockIdx.y * s_stride : NULL, + grads_memory + blockIdx.y * g_stride, + m_memory + blockIdx.y * s_stride, + v_memory + blockIdx.y * s_stride, + num_parameters, learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, grad_scale, + seed + ); +} + +template +__global__ void init_from_master_kernel(Tp* params_memory, float* master_params_memory, size_t num_parameters, + ptrdiff_t w_stride, ptrdiff_t s_stride, unsigned int seed) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= num_parameters) { return; } + params_memory += blockIdx.y * w_stride; // adjust for layer offset + master_params_memory += blockIdx.y * s_stride; + stochastic_rounding(master_params_memory[idx], ¶ms_memory[idx], seed); +} + +template +void adamw_update(Tp* params_memory, float* master_params_memory, Tg* grads_memory, float* m_memory, float* v_memory, size_t num_parameters, + ptrdiff_t w_stride, ptrdiff_t g_stride, ptrdiff_t s_stride, int num_slices, float learning_rate, float beta1, float beta2, int t, float eps, float weight_decay, + float grad_scale, unsigned int seed, cudaStream_t stream) { + // AdamW update + int block_size = 512; + int num_blocks = CEIL_DIV(num_parameters, block_size); + float beta1_correction = 1.0f - powf(beta1, t); + float beta2_correction = 1.0f - powf(beta2, t); + adamw_kernel3<<>>(params_memory, master_params_memory, grads_memory, + m_memory, v_memory, num_parameters, w_stride, g_stride, s_stride, + learning_rate, beta1, beta2, beta1_correction, beta2_correction, eps, weight_decay, + grad_scale, seed); + cudaCheck(cudaGetLastError()); +} + +template +void init_from_master(Tp* params_memory, float* master_params_memory, size_t num_parameters, + ptrdiff_t w_stride, ptrdiff_t s_stride, int num_slices, unsigned int seed, cudaStream_t stream) { + int block_size = 512; // must match block size of adamw_update so that RNG also matches + int num_blocks = CEIL_DIV(num_parameters, block_size); + init_from_master_kernel<<>> + (params_memory, master_params_memory, num_parameters, w_stride, s_stride, seed); + cudaCheck(cudaGetLastError()); +} diff --git a/llmc/attention.cuh b/llmc/attention.cuh new file mode 100644 index 000000000..f6294a213 --- /dev/null +++ b/llmc/attention.cuh @@ -0,0 +1,276 @@ +/* +Attention, as a fallback when we do not use the Flash Attention from cuDNN +*/ +#include +// llmc internal imports +#include "cuda_common.h" +#include "cuda_utils.cuh" +#include "cublas_common.h" + +// 
---------------------------------------------------------------------------- +// CUDA kernels + +// inputs floatX, outputs FP32 (for current FP32-only activation path for this WIP) +__global__ void permute_kernel(floatX* q, floatX* k, floatX* v, + const floatX* inp, + int B, int N, int NH, int d) { + // okay so now, this kernel wants Q,K,V to all be of shape (B, NH, N, d) + // but instead, we have a single tensor QKV (inp) of shape (B, N, 3, NH, d) + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= B * NH * N * d) { return; } + + // Q[b][nh_][n][d_] = inp[b][n][0][nh_][d_] + int b = idx / (NH * N * d); + int rest = idx % (NH * N * d); + int nh_ = rest / (N * d); + rest = rest % (N * d); + int n = rest / d; + int d_ = rest % d; + int inp_idx = (b * N * 3 * NH * d) + (n * 3 * NH * d) + (0 * NH * d) + (nh_ * d) + d_; + q[idx] = __ldcs(&inp[inp_idx]); + k[idx] = __ldcs(&inp[inp_idx + NH * d]); + v[idx] = __ldcs(&inp[inp_idx + 2 * (NH * d)]); +} + +__global__ void permute_kernel_backward(floatX* dinp, + const floatX* dq, const floatX* dk, const floatX* dv, + int B, int N, int NH, int d) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= B * NH * N * d) { return; } + + int b = idx / (NH * N * d); + int rest = idx % (NH * N * d); + int nh_ = rest / (N * d); + rest = rest % (N * d); + int n = rest / d; + int d_ = rest % d; + + int inp_idx = (b * N * 3 * NH * d) + (n * 3 * NH * d) + (0 * NH * d) + (nh_ * d) + d_; + dinp[inp_idx] = dq[idx]; + dinp[inp_idx + NH * d] = dk[idx]; + dinp[inp_idx + 2 * (NH * d)] = dv[idx]; +} + +__global__ void unpermute_kernel(floatX* inp, floatX *out, int B, int N, int NH, int d) { + // out has shape (B, nh, N, d) but we need to unpermute it to (B, N, nh, d) + + int idx = (blockIdx.x * blockDim.x + threadIdx.x); + // out[b][n][nh_][d_] <- inp[b][nh_][n][d_] + if (idx >= B * NH * N * d) { return; } + + int b = idx / (NH * N * d); + int rest = idx % (NH * N * d); + int nh_ = rest / (N * d); + rest = rest % (N * d); + int n = rest / d; + int d_ = rest % d; + int other_idx = (b * NH * N * d) + (n * NH * d) + (nh_ * d) + d_; + out[other_idx] = __ldcs(&inp[idx]); +} + +__global__ void unpermute_kernel_backward(floatX* dinp, const floatX *dout, int B, int N, int NH, int d) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= B * NH * N * d) { return; } + + int b = idx / (NH * N * d); + int rest = idx % (NH * N * d); + int nh_ = rest / (N * d); + rest = rest % (N * d); + int n = rest / d; + int d_ = rest % d; + int other_idx = (b * NH * N * d) + (n * NH * d) + (nh_ * d) + d_; + dinp[idx] = (floatX)dout[other_idx]; +} + +__global__ void softmax_forward_kernel5(floatX* out, float inv_temperature, const floatX* inp, int N, int T) { + // inp, out shape: (N, T, T), where N = B * NH + // fuses the multiplication by scale inside attention + // directly autoregressive, so we only compute the lower triangular part + // uses the online softmax algorithm + assert(T % 4 == 0); + int lane_id = threadIdx.x % WARP_SIZE; + int warp_id = threadIdx.x / WARP_SIZE; + int num_warps = blockDim.x / WARP_SIZE; + + // micro-optimization: we iterate backwards so that + // after the softmax backward operation completes, the cache retains the + // part of the matrix close to the upper left corner, which benefits the + // matmul operation that immediately follows. 
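+    //
+    // Online softmax recap: we keep a running maximum m and a running sum s of
+    // exp(inv_temperature*(x_i - m)) while scanning the row once; whenever a larger maximum m'
+    // appears, the old sum is rescaled by exp(inv_temperature*(m - m')) before accumulating,
+    // which matches the usual two-pass max-then-normalize softmax up to floating point error.
+    //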
+ // int idx = blockIdx.x * warp.meta_group_size() + warp.meta_group_rank(); // forward order + int idx = (gridDim.x - blockIdx.x - 1) * num_warps + warp_id; // backward order + if(idx >= N * T) { + return; + } + int own_pos = idx % T; + int pos_by_4 = own_pos / 4; + + // one row of inp, i.e. inp[idx, :] of shape (T,) + const floatX* x = inp + idx * T; + + // not INF, so we don't get NaNs accidentally when subtracting two values. + const float flt_max = 340282346638528859811704183484516925440.0f; // to avoid including float.h + float maxval = -flt_max; + float sumval = 0.0f; + + const floatX* x_aligned = reinterpret_cast(__builtin_assume_aligned(x, 16)); + for (int i = lane_id; i < pos_by_4; i += WARP_SIZE) { + float regarray[4]; + for (int k = 0; k < 4; ++k) { + regarray[k] = (float)x_aligned[4*i + k]; + } + float old_maxval = maxval; + for(int k = 0; k < 4; ++k) { + maxval = fmaxf(maxval, regarray[k]); + } + sumval *= expf(inv_temperature * (old_maxval - maxval)); + for(int k = 0; k < 4; ++k) { + sumval += expf(inv_temperature * (regarray[k] - maxval)); + } + } + + if(4*pos_by_4 + lane_id <= own_pos) { + float old_maxval = maxval; + maxval = fmaxf(maxval, (float)x[4*pos_by_4 + lane_id]); + sumval *= expf(inv_temperature * (old_maxval - maxval)); + sumval += expf(inv_temperature * ((float)x[4*pos_by_4 + lane_id] - maxval)); + } + + float global_maxval = warpReduceMax(maxval); + sumval *= expf(inv_temperature * (maxval - global_maxval)); + + float sum = warpReduceSum(sumval); + float norm = 1.f / sum; + + // divide the whole row by the sum + for (int i = lane_id; i <= own_pos; i += WARP_SIZE) { + // recalculation is faster than doing the round-trip through memory. + float ev = expf(inv_temperature * ((float)__ldcs(x + i) - global_maxval)); + __stcs(out + idx * T + i, (floatX)(ev * norm)); + } +} + +__global__ void softmax_autoregressive_backward_inplace_kernel(floatX* datt, const floatX* att, + int B, int T, int C, float scale) { + constexpr const int BlockSize = 256; + constexpr int T_per_block = 4; + + // go through blocks in reverse order, so the slowest block starts first + int t0 = T - 1 - T_per_block*blockIdx.x; + int idx = blockIdx.y; + + att += idx * T * T; + datt += idx * T * T; + + for(int to = 0; to < T_per_block; ++to) { + int t = t0 - to; + if(t < 0) return; + const floatX* att_bth = att + t * T; + const floatX* datt_bth = datt + t * T; + floatX* dpreatt_bth = datt + t * T; + + float local_sum = 0; + for (int t2 = threadIdx.x; t2 <= t; t2 += BlockSize) { + local_sum += (float)att_bth[t2] * (float)datt_bth[t2]; + } + + local_sum = blockReduce(local_sum); + + for (int t3 = threadIdx.x; t3 < T; t3 += BlockSize) { + // don't touch the cache. Some parts will still be here from the previous loop, and + // we want to exploit those. + if(t3 <= t) { + float acc = (float) __ldcs(att_bth + t3) * ((float) __ldcs(datt_bth + t3) - local_sum); + __stcs(dpreatt_bth + t3, (floatX) (scale * acc)); + } else { + // explicitly set non-causal elements to zero + __stcs(dpreatt_bth + t3, (floatX)0.f); + } + } + } +} + +// ---------------------------------------------------------------------------- +// kernel launchers + +void attention_forward(floatX* out, floatX* qkvr, floatX* att, + floatX* inp, + int B, int T, int C, int NH, cudaStream_t stream) { + NVTX_RANGE_FN(); + // Note: `inp` is not needed for backward pass, so we re-use it as a scratch buffer. + // Its contents will be overwritten by this function. 
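+    // Shape walkthrough for this non-cuDNN path: inp is (B, T, 3C) holding QKV; permute_kernel
+    // splits it into q, k, v of shape (B, NH, T, HS); a batched cuBLASLt matmul forms the
+    // pre-attention scores (B, NH, T, T); softmax_forward_kernel5 applies the causal softmax
+    // with the 1/sqrt(HS) scale fused in; a second batched matmul multiplies by v; and
+    // unpermute_kernel re-assembles the heads into the (B, T, C) output.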
+ const int block_size = 256; + + // inp is (B, T, 3C) QKV + // preatt, att are (B, NH, T, T) + // output is (B, T, C) + const int HS = C / NH; // head size + + // permute and separate inp from (B, T, 3, NH, HS) to 3X (B, NH, T, HS) + floatX *q, *k, *v; + q = qkvr + 0 * B * T * C; + k = qkvr + 1 * B * T * C; + v = qkvr + 2 * B * T * C; + int total_threads = B * NH * T * HS; + int num_blocks = CEIL_DIV(total_threads, block_size); + permute_kernel<<>>(q, k, v, inp, B, T, NH, HS); + + floatX* preatt = inp; // reuse inp as scratch buffer + matmul_cublaslt(preatt, k, q, nullptr, T, T, HS, stream, true, false, B * NH, T * HS, T * HS, T * T); + + // multiply all elements of preatt elementwise by scale + float scale = 1.f / sqrtf(HS); + int grid_size = CEIL_DIV(B * NH * T * WARP_SIZE, block_size); + softmax_forward_kernel5<<>>(att, scale, preatt, B * NH, T); + + // new approach: first cuBLAS another batched matmul + floatX* vaccum = inp; + // y = att @ v # (B, nh, T, T) @ (B, nh, T, hs) -> (B, nh, T, hs) + matmul_cublaslt(vaccum, v, att, nullptr, HS, T, T, stream, false, false, B * NH, T * HS, T * T, T * HS); + + // now unpermute + // y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side + num_blocks = CEIL_DIV(B * T * C, block_size); + unpermute_kernel<<>>(vaccum, out, B, T, NH, HS); + cudaCheck(cudaGetLastError()); +} + +// the sequence of transformations in this compound op is: +// inp (B,T,3C) -> qkvr (B,T,3C) -> preatt (B,NH,T,T) -> att (B,NH,T,T) -> vaccum (B,T,C) -> out (B,T,C) +void attention_backward(floatX* dinp, floatX* dqkvr, floatX* datt, floatX* scratch, + const floatX* dout, + const floatX* qkvr, const floatX* att, + int B, int T, int C, int NH, cudaStream_t stream) { + NVTX_RANGE_FN(); + const int block_size = 256; + const int HS = C / NH; // head size + + // unpack convenience pointers into q, k, v + const floatX *q, *k, *v; + q = qkvr + 0 * B * T * C; + k = qkvr + 1 * B * T * C; + v = qkvr + 2 * B * T * C; + floatX *dq, *dk, *dv; + dq = dqkvr + 0 * B * T * C; + dk = dqkvr + 1 * B * T * C; + dv = dqkvr + 2 * B * T * C; + + // backward through the unpermute operation + int num_blocks = CEIL_DIV(B * T * C, block_size); + unpermute_kernel_backward<<>>(scratch, dout, B, T, NH, HS); + // backward into datt + matmul_cublaslt(datt, v, scratch, nullptr, T, T, HS, stream, true, false, B * NH, T * HS, T * HS, T * T); + // backward into dv + matmul_cublaslt(dv, scratch, att, nullptr, HS, T, T, stream, false, true, B * NH, T * HS, T * T, T * HS); + const float scale = 1.0f / sqrtf((float)HS); + // backward into preatt. 
this is an in-place operation; datt turns into dpreatt here + softmax_autoregressive_backward_inplace_kernel<<>>(datt, att, B, T, C, scale); + const floatX* dpreatt = datt; + // backward into q + matmul_cublaslt(dq, k, dpreatt, nullptr, HS, T, T, stream, false, false, B * NH, T * HS, T * T, T * HS); + // backward into k + matmul_cublaslt(dk, q, dpreatt, nullptr, HS, T, T, stream, false, true, B * NH, T * HS, T * T, T * HS); + // backward into inp + num_blocks = CEIL_DIV(B * NH * T * HS, block_size); + permute_kernel_backward<<>>(dinp, dq, dk, dv, B, T, NH, HS); + cudaCheck(cudaGetLastError()); +} diff --git a/llmc/cublas_common.h b/llmc/cublas_common.h new file mode 100644 index 000000000..0496a2ac6 --- /dev/null +++ b/llmc/cublas_common.h @@ -0,0 +1,46 @@ +/* +cuBLAS related utils +*/ +#ifndef CUBLAS_COMMON_H +#define CUBLAS_COMMON_H + +#include +#include +#include +#include +#include + +// ---------------------------------------------------------------------------- +// cuBLAS Precision settings + +#if defined(ENABLE_FP32) +#define CUBLAS_LOWP CUDA_R_32F +#elif defined(ENABLE_FP16) +#define CUBLAS_LOWP CUDA_R_16F +#else // default to bfloat16 +#define CUBLAS_LOWP CUDA_R_16BF +#endif + +// ---------------------------------------------------------------------------- +// cuBLAS globals for workspace, handle, settings + +// Hardcoding workspace to 32MiB but only Hopper needs 32 (for others 4 is OK) +const size_t cublaslt_workspace_size = 32 * 1024 * 1024; +void* cublaslt_workspace = NULL; +cublasComputeType_t cublas_compute = CUBLAS_COMPUTE_32F; +cublasLtHandle_t cublaslt_handle; + +// ---------------------------------------------------------------------------- +// Error checking + +// cuBLAS error checking +void cublasCheck(cublasStatus_t status, const char *file, int line) +{ + if (status != CUBLAS_STATUS_SUCCESS) { + printf("[cuBLAS ERROR]: %d %s %d\n", status, file, line); + exit(EXIT_FAILURE); + } +} +#define cublasCheck(status) { cublasCheck((status), __FILE__, __LINE__); } + +#endif // CUBLAS_COMMON_H \ No newline at end of file diff --git a/llmc/cuda_common.h b/llmc/cuda_common.h new file mode 100644 index 000000000..006ad3010 --- /dev/null +++ b/llmc/cuda_common.h @@ -0,0 +1,209 @@ +/* +Common utilities for CUDA code. 
+*/ +#ifndef CUDA_COMMON_H +#define CUDA_COMMON_H + +#include +#include +#include +#include +#include // std::bool_constant +#include +#include +#include +#include +#include +#include + +#include "utils.h" + +// ---------------------------------------------------------------------------- +// Global defines and settings + +// Device properties of the CUDA device used in this process +// defined as extern here because the individual kernels wish to use it +// but it is actually created and instantiated in the main program file +extern cudaDeviceProp deviceProp; + +// WarpSize is not a compile time constant +// Defining here like this possibly allows the compiler to optimize better +#define WARP_SIZE 32U + +// try to make sure that 2 blocks fit on A100/H100 to maximise latency tolerance +// this needs to be defines rather than queried to be used for __launch_bounds__ +#if __CUDA_ARCH__ == 800 || __CUDA_ARCH__ >= 900 +#define MAX_1024_THREADS_BLOCKS 2 +#else +#define MAX_1024_THREADS_BLOCKS 1 +#endif + +// convenience macro for calculating grid/block dimensions for kernels +#define CEIL_DIV(M, N) (((M) + (N)-1) / (N)) + +// short-cuts for compile-time boolean values that can be used as function arguments +constexpr std::bool_constant True; +constexpr std::bool_constant False; + +// ---------------------------------------------------------------------------- +// Error checking + +// CUDA error checking +inline void cudaCheck(cudaError_t error, const char *file, int line) { + if (error != cudaSuccess) { + printf("[CUDA ERROR] at file %s:%d:\n%s\n", file, line, cudaGetErrorString(error)); + exit(EXIT_FAILURE); + } +}; +#define cudaCheck(err) (cudaCheck(err, __FILE__, __LINE__)) + +// like cudaFree, but checks for errors _and_ resets the pointer. +template +inline void cudaFreeCheck(T** ptr, const char *file, int line) { + cudaError_t error = cudaFree(*ptr); + if (error != cudaSuccess) { + printf("[CUDA ERROR] at file %s:%d:\n%s\n", file, line, cudaGetErrorString(error)); + exit(EXIT_FAILURE); + } + *ptr = nullptr; +} +#define cudaFreeCheck(ptr) (cudaFreeCheck(ptr, __FILE__, __LINE__)) + +// ---------------------------------------------------------------------------- +// CUDA Precision settings and defines + +enum PrecisionMode { + PRECISION_FP32, + PRECISION_FP16, + PRECISION_BF16 +}; + +// Specific configurations based on the enabled precision +#if defined(ENABLE_FP32) +typedef float floatX; +#define PRECISION_MODE PRECISION_FP32 +// use fp16 (note: this may require gradient scaler, currently not implemented!) +#elif defined(ENABLE_FP16) +typedef half floatX; +#define PRECISION_MODE PRECISION_FP16 +#else // Default to bfloat16 +typedef __nv_bfloat16 floatX; +#define PRECISION_MODE PRECISION_BF16 +#endif + +// ---------------------------------------------------------------------------- +// Load and store with streaming cache hints +// Older nvcc does not provide __ldcs and __stcs for bfloat16, despite these +// actually just being unsigned shorts. We need to be careful here to only define +// our own versions if none already exist, otherwise the compiler will complain. 
+// If not, you easily get "no viable overload" (for sm52) and "function already exists" (sm_80) + +#if defined(ENABLE_BF16) && (__CUDACC_VER_MAJOR__ < 12) && !((__CUDA_ARCH__ >= 800) || !defined(__CUDA_ARCH__)) +__device__ floatX __ldcs(const floatX* address) { + unsigned short bf = __ldcs(reinterpret_cast(address)); + return __nv_bfloat16_raw{bf}; +} + +__device__ void __stcs(floatX* address, floatX value) { + __stcs(reinterpret_cast(address), ((__nv_bfloat16_raw)value).x); +} +#endif + +// ---------------------------------------------------------------------------- +// Profiler utils + +class NvtxRange { + public: + NvtxRange(const char* s) { nvtxRangePush(s); } + NvtxRange(const std::string& base_str, int number) { + std::string range_string = base_str + " " + std::to_string(number); + nvtxRangePush(range_string.c_str()); + } + ~NvtxRange() { nvtxRangePop(); } +}; +#define NVTX_RANGE_FN() NvtxRange nvtx_range(__FUNCTION__) + +// ---------------------------------------------------------------------------- +// Utilities to Read & Write between CUDA memory <-> files + +// copy num_bytes from device pointer src into file dest, using double buffering running on the given stream. +inline void device_to_file(FILE* dest, void* src, size_t num_bytes, size_t buffer_size, cudaStream_t stream) { + // allocate pinned buffer for faster, async transfer + char* buffer_space; + cudaCheck(cudaMallocHost(&buffer_space, 2*buffer_size)); + // split allocation in two + void* read_buffer = buffer_space; + void* write_buffer = buffer_space + buffer_size; + + // prime the read buffer; first copy means we have to wait + char* gpu_read_ptr = (char*)src; + size_t copy_amount = std::min(buffer_size, num_bytes); + cudaCheck(cudaMemcpyAsync(read_buffer, gpu_read_ptr, copy_amount, cudaMemcpyDeviceToHost, stream)); + cudaCheck(cudaStreamSynchronize(stream)); + size_t rest_bytes = num_bytes - copy_amount; + size_t write_buffer_size = copy_amount; + gpu_read_ptr += copy_amount; + + std::swap(read_buffer, write_buffer); + // now the main loop; as long as there are bytes left + while(rest_bytes > 0) { + // initiate next read + copy_amount = std::min(buffer_size, rest_bytes); + cudaCheck(cudaMemcpyAsync(read_buffer, gpu_read_ptr, copy_amount, cudaMemcpyDeviceToHost, stream)); + // while this is going on, transfer the write buffer to disk + fwriteCheck(write_buffer, 1, write_buffer_size, dest); + cudaCheck(cudaStreamSynchronize(stream)); // wait for both buffers to be ready. + + std::swap(read_buffer, write_buffer); + rest_bytes -= copy_amount; + write_buffer_size = copy_amount; + gpu_read_ptr += copy_amount; + } + + // make sure to write the last remaining write buffer + fwriteCheck(write_buffer, 1, write_buffer_size, dest); + cudaCheck(cudaFreeHost(buffer_space)); +} + +// copy num_bytes from file src into device pointer dest, using double buffering running on the given stream. +inline void file_to_device(void* dest, FILE* src, size_t num_bytes, size_t buffer_size, cudaStream_t stream) { + // allocate pinned buffer for faster, async transfer + // from the docs (https://developer.download.nvidia.com/compute/DevZone/docs/html/C/doc/html/group__CUDART__HIGHLEVEL_ge439496de696b166ba457dab5dd4f356.html) + // WC memory is a good option for buffers that will be written by the CPU and read by the device via mapped pinned memory or host->device transfers. 
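+    // Same double-buffering idea as device_to_file above, in the other direction: while one
+    // pinned buffer is being copied host->device with cudaMemcpyAsync, the next chunk of the
+    // file is read from disk into the other buffer, so file I/O and PCIe transfers overlap
+    // instead of running back to back.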
+ char* buffer_space; + cudaCheck(cudaMallocHost(&buffer_space, 2*buffer_size, cudaHostAllocWriteCombined)); + // split allocation in two + void* read_buffer = buffer_space; + void* write_buffer = buffer_space + buffer_size; + + // prime the read buffer; + char* gpu_write_ptr = (char*)dest; + size_t copy_amount = std::min(buffer_size, num_bytes); + freadCheck(read_buffer, 1, copy_amount, src); + + size_t rest_bytes = num_bytes - copy_amount; + size_t write_buffer_size = copy_amount; + std::swap(read_buffer, write_buffer); + + // now the main loop; as long as there are bytes left + while(rest_bytes > 0) { + // initiate next read + copy_amount = std::min(buffer_size, rest_bytes); + cudaCheck(cudaMemcpyAsync(gpu_write_ptr, write_buffer, write_buffer_size, cudaMemcpyHostToDevice, stream)); + gpu_write_ptr += write_buffer_size; + // while this is going on, read from disk + freadCheck(read_buffer, 1, copy_amount, src); + cudaCheck(cudaStreamSynchronize(stream)); // wait for both buffers to be ready. + + std::swap(read_buffer, write_buffer); + rest_bytes -= copy_amount; + write_buffer_size = copy_amount; + } + + // copy the last remaining write buffer to gpu + cudaCheck(cudaMemcpyAsync(gpu_write_ptr, write_buffer, write_buffer_size, cudaMemcpyHostToDevice, stream)); + cudaCheck(cudaStreamSynchronize(stream)); + cudaCheck(cudaFreeHost(buffer_space)); +} + +#endif // CUDA_COMMON_H \ No newline at end of file diff --git a/llmc/cuda_utils.cuh b/llmc/cuda_utils.cuh new file mode 100644 index 000000000..0ce728ee1 --- /dev/null +++ b/llmc/cuda_utils.cuh @@ -0,0 +1,263 @@ +// Utilities for use in __device__ code + +#ifndef CUDA_UTILS_CUH +#define CUDA_UTILS_CUH + +#include "cuda_common.h" + +// ---------------------------------------------------------------------------- +// Packed128 data structure that forces the compiler to use 128-bit loads/stores +// in GPUs that support (the LDG.128 and STS.128 instructions) +// This is a bit similar to the use of float4 in the case of 32-bit floats, but +// supports arbitrary precision. 
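Before the struct definition that follows, a sketch of the access pattern this packed type enables: each thread moves 16 bytes at a time and then iterates over the packed elements in registers. The kernel below is a hypothetical illustration (not part of this diff), assuming the x128, load128cs and store128 helpers defined just below:

// hypothetical: scale src by 2 into dst using 128-bit loads/stores, one x128 per thread
__global__ void scale2x_kernel(floatX* dst, const floatX* src, size_t N) {
    size_t idx = ((size_t)blockIdx.x * blockDim.x + threadIdx.x) * x128::size;
    if (idx >= N) { return; }                  // assumes N is a multiple of x128::size
    x128 packed = load128cs(src + idx);        // one 16-byte streaming load
    for (int k = 0; k < x128::size; ++k) {
        packed[k] = (floatX)((float)packed[k] * 2.0f);   // element-wise work in registers
    }
    store128(dst + idx, packed);               // one 16-byte store
}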
+
+template<class ElementType>
+struct alignas(16) Packed128 {
+    Packed128() = default;
+    __device__ explicit Packed128(int4 bits) {
+        static_assert(sizeof(bits) == sizeof(payload), "Size mismatch.");
+        memcpy(&payload, &bits, sizeof(bits));
+    }
+
+    __device__ static Packed128 constant(ElementType value) {
+        Packed128 result;
+        for(int k = 0; k < size; ++k) {
+            result.payload[k] = value;
+        }
+        return result;
+    }
+    __device__ static Packed128 zeros() {
+        return constant(0.f);
+    }
+    __device__ static Packed128 ones() {
+        return constant(1.f);
+    }
+
+    __device__ ElementType& operator[](int index) {
+        return payload[index];
+    }
+    __device__ const ElementType& operator[](int index) const {
+        return payload[index];
+    }
+    __device__ int4 get_bits() const {
+        int4 bits;
+        static_assert(sizeof(bits) == sizeof(payload), "Size mismatch.");
+        memcpy(&bits, &payload, sizeof(bits));
+        return bits;
+    }
+    static constexpr const size_t size = sizeof(int4) / sizeof(ElementType);
+    ElementType payload[size];
+};
+
+// load a Packed128 from an aligned memory address
+template<class ElementType>
+__device__ Packed128<ElementType> load128(const ElementType* address) {
+    return Packed128<ElementType>{*reinterpret_cast<const int4*>(address)};
+}
+// load a Packed128 from an aligned memory address with streaming cache hint
+template<class ElementType>
+__device__ Packed128<ElementType> load128cs(const ElementType* address) {
+    return Packed128<ElementType>{__ldcs(reinterpret_cast<const int4*>(address))};
+}
+// store a Packed128 to an aligned memory address
+template<class ElementType>
+__device__ void store128(ElementType* target, Packed128<ElementType> value) {
+    *reinterpret_cast<int4*>(target) = value.get_bits();
+}
+// store a Packed128 to an aligned memory address with streaming cache hint
+template<class ElementType>
+__device__ void store128cs(ElementType* target, Packed128<ElementType> value) {
+    __stcs(reinterpret_cast<int4*>(target), value.get_bits());
+}
+// store a Packed128 to an aligned memory address while caching in L2 but bypassing L1
+template<class ElementType>
+__device__ void store128cg(ElementType* target, Packed128<ElementType> value) {
+    __stcg(reinterpret_cast<int4*>(target), value.get_bits());
+}
+
+// short-form typedefs
+typedef Packed128<float> f128;
+typedef Packed128<floatX> x128;
+
+// ----------------------------------------------------------------------------
+// DType support
+
+// enumerator to identify the datatype of a tensor.
+enum class DType : uint8_t { + FP32, FP16, BF16 +}; + +// Given a datatype enum, returns the underlying number of bytes +// for a scalar of that type +size_t sizeof_dtype(DType type) { + switch (type) { + case DType::FP32: + return sizeof(float); + case DType::FP16: + return sizeof(half); + case DType::BF16: + return sizeof(nv_bfloat16); + default: // handle or get compiler warning + fprintf(stderr, "Unknown datatype\n"); + exit(EXIT_FAILURE); + } +} + +DType dtype_of(float* f) { return DType::FP32; } +DType dtype_of(nv_bfloat16 * f) { return DType::BF16; } +DType dtype_of(half * f) { return DType::FP16; } + + + +// ---------------------------------------------------------------------------- +// Copy, cast functions + +// device functions and the kernel to cast data between types +template +__device__ Td cast_value(Ts val); + +template<> +__device__ float cast_value(float val) { + return val; +} + +template<> +__device__ float cast_value(half val) { + return __half2float(val); +} + +template<> +__device__ float cast_value(__nv_bfloat16 val) { + return __bfloat162float(val); +} + +template +__global__ void copy_and_cast_kernel(Td* dst, const Ts* src, size_t n, ptrdiff_t stride_dst, ptrdiff_t stride_src) { + int idx = blockIdx.x * blockDim.x + threadIdx.x; + // need to try grid stride looping for more perf later + if (idx < n) { + dst[idx + stride_dst * blockIdx.y] = cast_value(src[idx + stride_src * blockIdx.y]); + } +} + +// ---------------------------------------------------------------------------- +// Warp/Block communication primitives + +// warp-level reduction for summing values +__device__ inline float warpReduceSum(float val) { + for (int offset = 16; offset > 0; offset /= 2) { + val += __shfl_xor_sync(0xFFFFFFFF, val, offset); + } + return val; +} +// warp-level reduction for finding the maximum value +__device__ inline float warpReduceMax(float val) { + for (int offset = 16; offset > 0; offset /= 2) { + val = fmaxf(val, __shfl_xor_sync(0xFFFFFFFF, val, offset)); + } + return val; +} +// requires all 32 threads in the warp to be active, but should work for any block size +// uses non-dynamic shared memory so every call increases shared memory requirements by 128 bytes +// the fact it's unique shared memory allows us to avoid an extra __syncthreads() call at the end +// but if called inside a loop, the shared memory will be implicitly reused, so set final_sync to 1 +using reduction_func_t = float (*) (float); +template +__device__ inline float blockReduce(float val, bool final_sync=false, float out_of_bounds=0.0f) { + // two reductions of up to 1024 threads: + // 1) inside warp (shuffle), 2) cross-warp (shared memory), 3) inside warp (shuffle) + __shared__ float shared_val[WARP_SIZE]; + const int lane_id = threadIdx.x % WARP_SIZE; + const int warp_id = threadIdx.x / WARP_SIZE; + const int num_warps = blockDim.x / WARP_SIZE; + + float warp_val = warp_reduction(val); + if (lane_id == 0) { shared_val[warp_id] = warp_val; } + __syncthreads(); + warp_val = (lane_id < num_warps) ? shared_val[lane_id] : out_of_bounds; + float block_val = warp_reduction(warp_val); + + if (final_sync) { + __syncthreads(); // only needed in loops when effectively reusing shared memory etc. + } + return block_val; +} + +// Performs a _deterministic_ sum reduction. determinism is achieved by requiring that only +// a single block be used. +template +__global__ void global_sum_single_block_kernel(float* result, const Float* values, size_t count) { + assert(gridDim.x == 1); // only a single block! 
+    float thread_sum = 0;
+    for(size_t index = threadIdx.x; index < count; index += blockDim.x) {
+        thread_sum += (float)values[index];
+    }
+
+    float reduction = blockReduce<warpReduceSum>(thread_sum, true);
+    if(threadIdx.x == 0) {
+        *result = reduction;
+    }
+}
+
+template<class Float>
+void global_sum_deterministic(float* result, const Float* values, int count, cudaStream_t stream) {
+    global_sum_single_block_kernel<<<1, 1024, 0, stream>>>(result, values, count);
+    cudaCheck(cudaGetLastError());
+}
+
+// ----------------------------------------------------------------------------
+// Random Number Generation used in Stochastic Rounding
+
+// SquirrelNoise5 - Squirrel's Raw Noise utilities (version 5)
+// This gives us a random number from threadIdx/blockIdx + a single seed for the entire GPU
+// todo - possibly overkill and we don't need such high quality random numbers? (tbd)
+// http://eiserloh.net/noise/SquirrelNoise5.hpp
+__device__ __host__ constexpr unsigned int SquirrelNoise5(unsigned int positionX, unsigned int seed)
+{
+    constexpr unsigned int SQ5_BIT_NOISE1 = 0xd2a80a3f; // 11010010101010000000101000111111
+    constexpr unsigned int SQ5_BIT_NOISE2 = 0xa884f197; // 10101000100001001111000110010111
+    constexpr unsigned int SQ5_BIT_NOISE3 = 0x6C736F4B; // 01101100011100110110111101001011
+    constexpr unsigned int SQ5_BIT_NOISE4 = 0xB79F3ABB; // 10110111100111110011101010111011
+    constexpr unsigned int SQ5_BIT_NOISE5 = 0x1b56c4f5; // 00011011010101101100010011110101
+    unsigned int mangledBits = positionX;
+    mangledBits *= SQ5_BIT_NOISE1;
+    mangledBits += seed;
+    mangledBits ^= (mangledBits >> 9);
+    mangledBits += SQ5_BIT_NOISE2;
+    mangledBits ^= (mangledBits >> 11);
+    mangledBits *= SQ5_BIT_NOISE3;
+    mangledBits ^= (mangledBits >> 13);
+    mangledBits += SQ5_BIT_NOISE4;
+    mangledBits ^= (mangledBits >> 15);
+    mangledBits *= SQ5_BIT_NOISE5;
+    mangledBits ^= (mangledBits >> 17);
+    return mangledBits;
+}
+__device__ __host__ constexpr unsigned int Get2dNoiseUint(int indexX, int indexY, unsigned int seed)
+{
+    constexpr unsigned int PRIME_NUMBER = 198491317u; // Large prime number with non-boring bits
+    unsigned int x = static_cast<unsigned int>(indexX);
+    unsigned int y = static_cast<unsigned int>(indexY);
+
+    return SquirrelNoise5(x + (PRIME_NUMBER * y), seed);
+}
+
+// stochastic rounding built on top of Squirrel Noise above (with seed updated per step via xorshift)
+__device__ __forceinline__ void stochastic_rounding(float in, __nv_bfloat16 *out, unsigned int seed) {
+    // todo - is this stochastic rounding *too good*? can we cut any corners?
+    // makes sure each thread gets a different random number
+    unsigned int random = Get2dNoiseUint(threadIdx.x, blockIdx.x * blockDim.x + blockIdx.y, seed);
+    unsigned int threshold = random & 0xFFFF;
+    unsigned int float_bits = __float_as_uint(in);
+    unsigned int rounded_bits = float_bits & 0x0000FFFF;
+    float_bits = (rounded_bits > threshold) ? (float_bits | 0xFFFF) : (float_bits & ~0xFFFF);
+    *out = __float2bfloat16_rn(__uint_as_float(float_bits));
+}
+__device__ __forceinline__ void stochastic_rounding(float in, half *out, unsigned int random) {
+    *out = (float)in; // todo - implement this...
+} +__device__ __forceinline__ void stochastic_rounding(float in, float *out, unsigned int random) { + *out = in; // dummy function for when floatX is float (FP32 mode) +} + +#endif \ No newline at end of file diff --git a/cudnn_att.cpp b/llmc/cudnn_att.cpp similarity index 90% rename from cudnn_att.cpp rename to llmc/cudnn_att.cpp index 04b1a92ec..0330abe20 100644 --- a/cudnn_att.cpp +++ b/llmc/cudnn_att.cpp @@ -2,57 +2,36 @@ // we change some unrelated piece of the code. // TODO this currently duplicates some of the utilities from the main file +#define NOMINMAX +#include +#include "cudnn_att.h" #include -#include -#include + namespace fe = cudnn_frontend; // Specific configurations based on the enabled precision #if defined(ENABLE_FP32) -typedef float floatX; static_assert(false, "cuDNN is not supported in FP32 mode.") - // use fp16 (note: this may require gradient scaler, currently not implemented!) #elif defined(ENABLE_FP16) -typedef half floatX; -#define CUBLAS_LOWP CUDA_R_16F #define CUDNN_16BIT fe::DataType_t::HALF - #else // Default to bfloat16 -typedef __nv_bfloat16 floatX; #define CUDNN_16BIT fe::DataType_t::BFLOAT16 #endif -// CUDA error checking -static void cudaCheck(cudaError_t error, const char *file, int line) { - if (error != cudaSuccess) { - printf("[CUDA ERROR] at file %s:%d:\n%s\n", file, line, - cudaGetErrorString(error)); - exit(EXIT_FAILURE); - } -}; -#define cudaCheck(err) (cudaCheck(err, __FILE__, __LINE__)) - -// Profiler utils -namespace { - class NvtxRange { - public: - NvtxRange(const char* s) { nvtxRangePush(s); } - NvtxRange(const std::string& base_str, int number) { - std::string range_string = base_str + " " + std::to_string(number); - nvtxRangePush(range_string.c_str()); - } - ~NvtxRange() { nvtxRangePop(); } - }; -} -#define NVTX_RANGE_FN() NvtxRange nvtx_range(__FUNCTION__) - static cudnnHandle_t cudnn_handle; static size_t cudnn_workspace_size = 0; // dynamically allocated as needed (up to 256MiB!) 
static void* cudnn_workspace = NULL; -#define checkCudnnErr(err) assert((int)err == 0); -static void checkCudnnFE(fe::error_object e, const char *file, int line) { +static void cuDNNCheck(cudnnStatus_t error, const char *file, int line) { + if (error != CUDNN_STATUS_SUCCESS) { + printf("[CUDNN ERROR] at file %s:%d:\n%s\n", file, line, cudnnGetErrorString(error)); + exit(EXIT_FAILURE); + } +}; +#define cuDNNCheck(err) (cuDNNCheck(err, __FILE__, __LINE__)) + +static void checkCudnnFE(const fe::error_object& e, const char *file, int line) { if(!e.is_good()) { printf("[CUDNN ERROR] at file %s:%d:\n%s\n", file, line, e.err_msg.c_str()); exit(EXIT_FAILURE); @@ -88,7 +67,7 @@ auto lookup_cache_or_build_graph_fwd(int B,int H,int T,int HS, int is_inference_ if (it != user_maintained_cache_fwd.end()) { return it->second; } - + auto graph = std::make_shared(); graph->set_io_data_type(CUDNN_16BIT) .set_intermediate_data_type(fe::DataType_t::FLOAT) @@ -205,6 +184,9 @@ auto lookup_cache_or_build_graph_bwd(int B, int NH, int T, int HS) { .set_uid(Attn_scale_UID) .set_data_type(fe::DataType_t::FLOAT)); auto sdpa_backward_options = fe::graph::SDPA_backward_attributes().set_name("flash_attention_backward") +#if CUDNN_FRONTEND_MAJOR_VERSION > 1 || CUDNN_FRONTEND_MINOR_VERSION >= 5 + .set_deterministic_algorithm(true) // 1.5+ needs this for determinism +#endif .set_causal_mask(true) .set_attn_scale(attn_scale); @@ -240,11 +222,13 @@ auto lookup_cache_or_build_graph_bwd(int B, int NH, int T, int HS) { void attention_forward_cudnn(floatX* out, // output: (B, T, NH, HS) float* stats, // output for backward pass: (B, NH, T) floatX* inp, // input: (B, T, 3, NH, HS) QKV - int B, int T, int NH, int C) { + int B, int T, int NH, int C, cudaStream_t stream) { NVTX_RANGE_FN(); int HS = C / NH; // number of features per head bool is_inference_only = (stats == nullptr); + cuDNNCheck(cudnnSetStream(cudnn_handle, stream)); + // Get graph and tensors from cache (or generate it on first use) auto graph = lookup_cache_or_build_graph_fwd(B, NH, T, HS, is_inference_only); @@ -271,7 +255,7 @@ void attention_forward_cudnn(floatX* out, // output: (B, T, NH, HS) void attention_backward_cudnn(floatX* dqkvr, // output floatX* dout, floatX* qkvr, floatX* o, float* stats, // inputs - int B, int T, int NH, int C) { + int B, int T, int NH, int C, cudaStream_t stream) { NVTX_RANGE_FN(); int HS = C / NH; // number of features per head @@ -298,15 +282,16 @@ void attention_backward_cudnn(floatX* dqkvr, {Attn_scale_UID, &attn_scale_cpu}}; // Execute graph + cuDNNCheck(cudnnSetStream(cudnn_handle, stream)); checkCudnnFE(graph->execute(cudnn_handle, variant_pack, cudnn_workspace)); cudaCheck(cudaGetLastError()); } void create_cudnn() { - checkCudnnErr(cudnnCreate(&cudnn_handle)); + cuDNNCheck(cudnnCreate(&cudnn_handle)); } void destroy_cudnn() { if (cudnn_workspace != NULL) { cudaCheck(cudaFree(cudnn_workspace)); } - checkCudnnErr(cudnnDestroy(cudnn_handle)); + cuDNNCheck(cudnnDestroy(cudnn_handle)); } \ No newline at end of file diff --git a/llmc/cudnn_att.h b/llmc/cudnn_att.h new file mode 100644 index 000000000..318413007 --- /dev/null +++ b/llmc/cudnn_att.h @@ -0,0 +1,21 @@ +/* +cuDNN (flash) attention +*/ +#ifndef CUDNN_ATT_H +#define CUDNN_ATT_H + +#include "cuda_common.h" + +// forward declarations of functions defined in cudnn_att.cpp +void create_cudnn(); +void destroy_cudnn(); +void attention_forward_cudnn(floatX* out, // output: (B, T, NH, HS) + float* stats, // output for backward pass: (B, NH, T) + floatX* inp, // input: (B, T, 3, 
NH, HS) QKV + int B, int T, int NH, int C, cudaStream_t stream); + +void attention_backward_cudnn(floatX* dqkvr, // output + floatX* dout, floatX* qkvr, floatX* o, float* stats, // inputs + int B, int T, int NH, int C, cudaStream_t stream); + +#endif // CUDNN_ATT_H \ No newline at end of file diff --git a/llmc/dataloader.h b/llmc/dataloader.h index 5da78b05d..ad5829d06 100644 --- a/llmc/dataloader.h +++ b/llmc/dataloader.h @@ -15,6 +15,7 @@ // defines: fopenCheck, freadCheck, fcloseCheck, fseekCheck // defines: mallocCheck #include "utils.h" +#include "rand.h" // ---------------------------------------------------------------------------- // implementation of glob for Windows is in dev/unistd.h @@ -30,23 +31,37 @@ typedef struct { // each process/worker has to access different parts of the data int process_rank; int num_processes; - // hyperparameters. use size_t to prevent overflow + // batch and token information size_t B; size_t T; - // input handling and its state + size_t num_tokens; // total number of tokens + size_t shard_num_samples; // total number of samples in the current shard per process + // shards and current position glob_t glob_result; // stores the result of glob, for all shards we want to iterate - int current_shard; // the current shard we are reading from + size_t current_shard_idx; // the current shard we are reading from + size_t current_sample_idx; // the current sample we are reading from + // file handle FILE* tokens_file; - int64_t file_size; - int64_t current_position; + // data buffers uint16_t* buffer; // we fread data from file into this buffer - // public variables that could be accessed from outside - size_t num_tokens; // total number of tokens int* inputs; // input tokens into transformer int* targets; // target tokens for the transformer + // random shuffle related variables + mt19937_state shuffle_rng; + int should_shuffle; + int* shard_indices; + int* intra_shard_indices; + // sizes in bytes + size_t total_batch_size_bytes; // total across all processes + size_t local_batch_offset_bytes; // inner-sample offset for this process + size_t header_bytes; // header size in bytes + int64_t file_size_bytes; } DataLoader; int64_t dataloader_load_shard_(DataLoader *loader, int shard_index) { + if (loader->should_shuffle) { + shard_index = loader->shard_indices[shard_index]; + } // use the first glob match as the filename for now const char* filename = loader->glob_result.gl_pathv[shard_index]; // open the input file for reading. also only a single file can be opened at a time @@ -68,37 +83,60 @@ int64_t dataloader_load_shard_(DataLoader *loader, int shard_index) { assert(ntok > 0); // we expect some tokens in the file. this should never trip, right? // determine the file size and make sure it is consistent with the number of tokens fseekCheck(loader->tokens_file, 0, SEEK_END); // seek to end of file - loader->file_size = ftell(loader->tokens_file); // read the offset, i.e. file size + loader->file_size_bytes = ftell(loader->tokens_file); // read the offset, i.e. 
file size fseekCheck(loader->tokens_file, 0, SEEK_SET); // seek back to the beginning // we expect ntok in the file to be consistent with filesize, assert that is the case int64_t expected_file_size = HEADER_SIZE * sizeof(int) + ntok * sizeof(uint16_t); - if (loader->file_size != expected_file_size) { + if (loader->file_size_bytes != expected_file_size) { printf("Error: file size is not as expected\n"); exit(EXIT_FAILURE); } + // -1 uint16_t due to us taking B*T+1 tokens but moving by B*T tokens + loader->shard_num_samples = (ntok * sizeof(uint16_t) - sizeof(uint16_t)) / loader->total_batch_size_bytes; return ntok; } +void prepare_intra_shard_indices_(DataLoader *loader) { + // shuffle the examples inside the shards + if (loader->intra_shard_indices != NULL) { + // in case shards have different number of samples / sizes + free(loader->intra_shard_indices); + } + loader->intra_shard_indices = (int*)mallocCheck(loader->shard_num_samples * sizeof(int)); + init_identity_permutation(loader->intra_shard_indices, (int) loader->shard_num_samples); + random_permutation(loader->intra_shard_indices, (int) loader->shard_num_samples, &loader->shuffle_rng); +} + void dataloader_reset(DataLoader *loader) { - // fully resets the DataLoader object to init configuration - // each process starts at a different offset in the file - int64_t header_bytes = HEADER_SIZE * sizeof(int); - int64_t token_bytes_offset = loader->process_rank * loader->B * loader->T * sizeof(uint16_t); - loader->current_shard = 0; - loader->current_position = header_bytes + token_bytes_offset; - dataloader_load_shard_(loader, loader->current_shard); + loader->current_shard_idx = 0; + loader->current_sample_idx = 0; + + if (loader->should_shuffle) { // shuffle the shards + random_permutation(loader->shard_indices, (int) loader->glob_result.gl_pathc, &loader->shuffle_rng); + } + + dataloader_load_shard_(loader, (int) loader->current_shard_idx); + + if (loader->should_shuffle) { + prepare_intra_shard_indices_(loader); + } } void dataloader_advance_(DataLoader *loader) { + if (loader->current_shard_idx == loader->glob_result.gl_pathc - 1) { + // if we are at the last shard, we reset the loader and start a new epoch + dataloader_reset(loader); + return; + } + // advance the loader by loading the next data shard and resetting the position - if (loader->glob_result.gl_pathc > 1) { - // if we have more than one shard, advance to the next one - loader->current_shard = (loader->current_shard + 1) % loader->glob_result.gl_pathc; - dataloader_load_shard_(loader, loader->current_shard); + loader->current_shard_idx = (loader->current_shard_idx + 1) % loader->glob_result.gl_pathc; + loader->current_sample_idx = 0; + dataloader_load_shard_(loader, (int) loader->current_shard_idx); + + if (loader->should_shuffle) { + prepare_intra_shard_indices_(loader); } - int64_t header_bytes = HEADER_SIZE * sizeof(int); - int64_t token_bytes_offset = loader->process_rank * loader->B * loader->T * sizeof(uint16_t); - loader->current_position = header_bytes + token_bytes_offset; } void dataloader_init(DataLoader *loader, @@ -106,12 +144,17 @@ void dataloader_init(DataLoader *loader, size_t B, size_t T, int process_rank, - int num_processes) { + int num_processes, + int should_shuffle) { loader->process_rank = process_rank; loader->num_processes = num_processes; loader->B = B; loader->T = T; loader->tokens_file = NULL; + loader->should_shuffle = should_shuffle; + loader->header_bytes = HEADER_SIZE * sizeof(int); + loader->total_batch_size_bytes = 
((loader->num_processes * (loader->B * loader->T)) * sizeof(uint16_t)); + loader->local_batch_offset_bytes = loader->process_rank * loader->B * loader->T * sizeof(uint16_t); // glob to get the list of files matching the pattern, these are our data shards int glob_status = glob(filename_pattern, 0, NULL, &loader->glob_result); @@ -124,6 +167,15 @@ void dataloader_init(DataLoader *loader, exit(EXIT_FAILURE); } + if (should_shuffle) { + mt19937_state shuffle_rng; + manual_seed(&shuffle_rng, 42 + process_rank); + loader->shuffle_rng = shuffle_rng; + loader->shard_indices = (int*)mallocCheck(loader->glob_result.gl_pathc * sizeof(int)); + init_identity_permutation(loader->shard_indices, (int) loader->glob_result.gl_pathc); + loader->intra_shard_indices = NULL; // dynamically allocated allowing different shard sizes + } + // inspect and validate all shards so we don't get any runtime errors later // if too slow / too many shards, may wish to revisit later int64_t ntok_total = 0; @@ -131,7 +183,7 @@ void dataloader_init(DataLoader *loader, int64_t shard_ntok = dataloader_load_shard_(loader, shard_index); // we need at least one batch/shard, the way things are written right now. // can be relaxed a lot later. - assert(shard_ntok >= num_processes * B * T + 1); + assert(shard_ntok >= (int64_t) (num_processes * B * T + 1)); ntok_total += shard_ntok; } // debugging prints @@ -139,40 +191,59 @@ void dataloader_init(DataLoader *loader, // printf("DataLoader: Found %ld tokens across %zu shards\n", ntok_total, loader->glob_result.gl_pathc); // allocate all the space we'll need - loader->buffer = (uint16_t*)malloc((B * T + 1) * sizeof(uint16_t)); - loader->inputs = (int*)malloc(B * T * sizeof(int)); - loader->targets = (int*)malloc(B * T * sizeof(int)); + loader->buffer = (uint16_t*)mallocCheck((B * T + 1) * sizeof(uint16_t)); + loader->inputs = (int*)mallocCheck(B * T * sizeof(int)); + loader->targets = (int*)mallocCheck(B * T * sizeof(int)); loader->num_tokens = ntok_total; // reset the loader, to initialize it dataloader_reset(loader); } -void dataloader_next_batch(DataLoader *loader) { +void dataloader_load_batch(DataLoader* loader) { + assert(!loader->should_shuffle || (loader->should_shuffle && loader->intra_shard_indices != NULL)); + assert(loader->current_sample_idx < loader->shard_num_samples); + size_t idx = loader->should_shuffle ? 
loader->intra_shard_indices[loader->current_sample_idx] : loader->current_sample_idx; + size_t global_batch_offset_bytes = idx * loader->total_batch_size_bytes; + int64_t current_offset = loader->header_bytes + global_batch_offset_bytes + loader->local_batch_offset_bytes; + size_t B = loader->B; size_t T = loader->T; // read B*T+1 uint16_t tokens from the file into buffer - fseekCheck(loader->tokens_file, loader->current_position, SEEK_SET); + fseekCheck(loader->tokens_file, (int) current_offset, SEEK_SET); freadCheck(loader->buffer, sizeof(uint16_t), B*T+1, loader->tokens_file); // decode the buffer into inputs and targets (cast to int) for (int i = 0; i < B*T; i++) { loader->inputs[i] = (int)loader->buffer[i]; loader->targets[i] = (int)loader->buffer[i+1]; } - // advance the current position by B*T*num_processes integers - // note: the "stride" of tokens by which we move each time is definitely B * T - // we only load B * T + 1 tokens at each iteration because the targets are offset by 1 - loader->current_position += loader->num_processes * B * T * sizeof(uint16_t); +} + +void dataloader_next_batch(DataLoader *loader) { // if the next batch would go past the end of the file, advance the loader - if (loader->current_position + (loader->num_processes * B * T + 1) * sizeof(uint16_t) > loader->file_size) { + if (loader->current_sample_idx >= loader->shard_num_samples) { dataloader_advance_(loader); } + dataloader_load_batch(loader); + loader->current_sample_idx += 1; +} + + +void dataloader_resume(DataLoader *loader, size_t current_shard_idx, size_t current_sample_idx) { + // used during model resumption (-y 1) flag + loader->current_shard_idx = current_shard_idx; + loader->current_sample_idx = current_sample_idx; + dataloader_load_shard_(loader, (int) loader->current_shard_idx); } void dataloader_free(DataLoader *loader) { free(loader->buffer); free(loader->inputs); free(loader->targets); + if (loader->should_shuffle) { + free(loader->shard_indices); + free(loader->intra_shard_indices); + } fcloseCheck(loader->tokens_file); globfree(&loader->glob_result); } @@ -210,7 +281,6 @@ typedef struct { size_t T; // maximum context length of the model // input handling and its state FILE* eval_file; - int64_t file_size; uint16_t* buffer; // we fread data from file into this buffer // public variables that could be accessed from outside int num_examples; // in total across all processes @@ -231,7 +301,13 @@ void evalloader_reset(EvalLoader *loader) { // then process 0 should start at 0, process 1 at N/4, process 2 at N/2, etc. 
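To make the offset arithmetic in dataloader_load_batch above concrete, a small worked example with hypothetical sizes (the numbers are illustrative only, and HEADER_SIZE is assumed to be 256 ints here):

#include <stdint.h>
#include <stdio.h>

int main(void) {
    // hypothetical setup: 4 processes, rank 1, B=8, T=64, uint16_t tokens
    size_t num_processes = 4, rank = 1, B = 8, T = 64;
    size_t header_bytes = 256 * sizeof(int);                                    // 1024
    size_t total_batch_size_bytes = num_processes * B * T * sizeof(uint16_t);   // 4096
    size_t local_batch_offset_bytes = rank * B * T * sizeof(uint16_t);          // 1024
    size_t idx = 3;  // fourth sample in the shard (possibly permuted when shuffling)
    size_t offset = header_bytes + idx * total_batch_size_bytes + local_batch_offset_bytes;
    // = 1024 + 3*4096 + 1024 = 14336; each process then reads its own B*T+1 tokens from there
    printf("byte offset for this process/sample: %zu\n", offset);
    return 0;
}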
// determine how much work there is for all processes int examples_per_process = CEIL_DIV(loader->num_examples, loader->num_processes); - int can_fit_examples = loader->B / ASSUMED_NUM_COMPLETIONS; + int can_fit_examples = (int) (loader->B / ASSUMED_NUM_COMPLETIONS); + if (can_fit_examples == 0) { + // this could be fixed in the future, but for now keeping it simple and throw error when B too low + printf("HellaSwag EvalLoader: batch size %zu is < %d\n", loader->B, ASSUMED_NUM_COMPLETIONS); + printf("---> HINT: Disable HellaSwag eval with -h 0, or increase batch size with -b\n"); + exit(EXIT_FAILURE); + } loader->num_batches = CEIL_DIV(examples_per_process, can_fit_examples); // determine the start and end example indices for this process loader->start_example_index = examples_per_process * loader->process_rank; @@ -243,7 +319,7 @@ void evalloader_reset(EvalLoader *loader) { // now seek through the file to the start of that example // utilize for efficiency int64_t header_bytes = HEADER_SIZE * sizeof(int); - fseekCheck(loader->eval_file, header_bytes, SEEK_SET); + fseekCheck(loader->eval_file, (int) header_bytes, SEEK_SET); for (int i = 0; i < loader->start_example_index; i++) { uint16_t example_header[3]; // read 3 uint16_t values: , , @@ -255,7 +331,7 @@ void evalloader_reset(EvalLoader *loader) { // skip to the next example, keeping in mind that we already read the header size_t remaining_bytes = example_header[1] - sizeof(uint16_t) * 3; assert(remaining_bytes > 0); // we expect some bytes in the example - fseekCheck(loader->eval_file, remaining_bytes, SEEK_CUR); + fseekCheck(loader->eval_file, (int) remaining_bytes, SEEK_CUR); } // now we are at the start of the example we want to start at, pointing at loader->current_example_index = loader->start_example_index; @@ -279,7 +355,7 @@ void evalloader_init(EvalLoader *loader, freadCheck(header, sizeof(int), HEADER_SIZE, loader->eval_file); if (header[0] != 20240522) { printf("Bad magic in eval file\n"); exit(EXIT_FAILURE); } if (header[1] != 1) { printf("Bad version in data file\n"); exit(EXIT_FAILURE); } - loader->num_examples = header[2]; // number of tokens in the file + loader->num_examples = header[2]; // number of examples in the file assert(loader->num_examples >= num_processes); // avoid headaches for now size_t longest_example_bytes = header[3]; // longest example in the file // basic sensibility check we could relax later. 
but roughly each example @@ -290,12 +366,12 @@ void evalloader_init(EvalLoader *loader, assert(longest_example_bytes > 0 && longest_example_bytes < (1+ASSUMED_NUM_COMPLETIONS)*T*2); // allocate all the space we'll need - int can_fit_examples = B / ASSUMED_NUM_COMPLETIONS; - loader->buffer = (uint16_t*)malloc(longest_example_bytes); - loader->inputs = (int*)malloc(B * T * sizeof(int)); - loader->targets = (int*)malloc(B * T * sizeof(int)); - loader->mask = (char*)malloc(B * T * sizeof(char)); - loader->label = (int*)malloc(can_fit_examples * sizeof(int)); + int can_fit_examples = (int) (B / ASSUMED_NUM_COMPLETIONS); + loader->buffer = (uint16_t*)mallocCheck(longest_example_bytes); + loader->inputs = (int*)calloc(B * T, sizeof(int)); + loader->targets = (int*)calloc(B * T, sizeof(int)); + loader->mask = (char*)mallocCheck(B * T * sizeof(char)); + loader->label = (int*)mallocCheck(can_fit_examples * sizeof(int)); // reset the loader, to initialize it evalloader_reset(loader); @@ -323,7 +399,7 @@ void evalloader_next_example_(EvalLoader *loader, int example_batch_index) { freadCheck(loader->buffer, sizeof(char), example_bytes, loader->eval_file); // process the example label int label = (int)loader->buffer[0]; - int can_fit_examples = loader->B / ASSUMED_NUM_COMPLETIONS; + int can_fit_examples = (int) (loader->B / ASSUMED_NUM_COMPLETIONS); assert(label >= 0 && label < ASSUMED_NUM_COMPLETIONS); // we expect the label to be in [0, 4) for right now assert(example_batch_index >= 0 && example_batch_index < can_fit_examples); loader->label[example_batch_index] = label; // store for output @@ -373,16 +449,14 @@ void evalloader_next_example_(EvalLoader *loader, int example_batch_index) { void evalloader_next_batch(EvalLoader *loader) { size_t B = loader->B; size_t T = loader->T; - // init all inputs, targets, mask to zeros - // TODO: I think only mask is necessary to reset? - memset(loader->inputs, 0, B * T * sizeof(int)); - memset(loader->targets, 0, B * T * sizeof(int)); + // init mask to zeros, no need to do it for inputs & targets, the values where the mask + // is set will be correctly overwritten every time. memset(loader->mask, 0, B * T * sizeof(char)); // ok here is the problem we are solving // we have a batch dimension of B, which we want to take full advantage of // each example has some number of completions (usually 4) // so we want to pack as many examples into rows of B as we can fit - int can_fit_examples = B / ASSUMED_NUM_COMPLETIONS; // how many examples can we fit in the batch? + int can_fit_examples = (int) (B / ASSUMED_NUM_COMPLETIONS); // how many examples can we fit in the batch? 
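As a concrete illustration of the packing logic used above, where each HellaSwag example occupies ASSUMED_NUM_COMPLETIONS rows of the batch, a small sketch with hypothetical values:

#include <stdio.h>
#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
#define ASSUMED_NUM_COMPLETIONS 4   // each example has 4 candidate endings

int main(void) {
    int B = 32, num_examples = 10042, num_processes = 8;                  // hypothetical values
    int can_fit_examples = B / ASSUMED_NUM_COMPLETIONS;                   // 8 examples per batch
    int examples_per_process = CEIL_DIV(num_examples, num_processes);     // 1256
    int num_batches = CEIL_DIV(examples_per_process, can_fit_examples);   // 157
    printf("%d examples per batch, %d batches per process\n", can_fit_examples, num_batches);
    return 0;
}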
for (int i = 0; i < can_fit_examples; i++) { if (loader->current_example_index >= loader->end_example_index) { break; // this process has exhausted its work, noop from here on @@ -401,7 +475,7 @@ int evalloader_stat_losses(EvalLoader *loader, float* losses) { size_t B = loader->B; size_t T = loader->T; // iterate the examples in this batch - int can_fit_examples = B / ASSUMED_NUM_COMPLETIONS; + int can_fit_examples = (int) (B / ASSUMED_NUM_COMPLETIONS); for (int i = 0; i < can_fit_examples; i++) { float min_loss = 0.0f; int min_loss_index = -1; diff --git a/llmc/encoder.cuh b/llmc/encoder.cuh new file mode 100644 index 000000000..3aa63e175 --- /dev/null +++ b/llmc/encoder.cuh @@ -0,0 +1,234 @@ +/* +The GPT-2 Encoder, which combines two encodings: token and position +In the forward pass, both encodings are added together +In the backward pass, the gradients flow to both, handled by different kernels +*/ +#include +#include +#include // std::pair +#include +#include +#include +// llmc internal imports +#include "cuda_common.h" +#include "cuda_utils.cuh" + +// ---------------------------------------------------------------------------- +// CUDA kernels + +__global__ void encoder_forward_kernel3(floatX* out, + const int* inp, const floatX* wte, const floatX* wpe, + int B, int T, int C) { + int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; + int N = B * T * C; + if (idx >= N) { return; } + + int bt = idx / C; + int b = bt / T; + int t = bt % T; + int c = idx % C; + + int ix = inp[b * T + t]; + + floatX* out_btc = out + b * T * C + t * C + c; + const floatX* wte_ix = wte + ix * C + c; + const floatX* wpe_tc = wpe + t * C + c; + + x128 packed_out; + x128 wte128 = load128cs(wte_ix); + x128 wpe128 = load128cs(wpe_tc); + for (int k = 0; k < x128::size; k++) { + packed_out[k] = (floatX)((float)wte128[k] + (float)wpe128[k]); + } + store128(out_btc, packed_out); +} + +template +__global__ void wte_backward_kernel(floatX* dwte, + const int4* bucket_info, const int* workload_indices, const floatX* dout, const int* inp, + unsigned int seed, int B, int T, int C) { + // In order to be deterministic, we preprocess the inputs on the cpu into "buckets" + // Each bucket corresponds to (WARP_SIZE * x128::size) channels for a single vocabulary token + // Each thread handles x128::size channels, e.g. 256 per warp for BF16 + // Each block handles (BLOCK_SIZE / WARP_SIZE) elements in a single bucket in parallel + // If a bucket has less than 8 elements, some warps will return immediately + // If a bucket has more than 8 elements, we will loop over all of them + // The buckets are sorted on the CPU so the largest buckets start 1st + int bucket = blockIdx.x; + int warp_id = threadIdx.x / WARP_SIZE; + int lane_id = threadIdx.x % WARP_SIZE; + int c_per_warp = WARP_SIZE * x128::size; + + int bucket_start_idx = bucket_info[bucket].x; + int bucket_size = bucket_info[bucket].y; + int bucket_ix = bucket_info[bucket].z; + int c = bucket_info[bucket].w * c_per_warp + (lane_id * x128::size); + + // Each thread handles "x128::size" channels, so at fp8, each warp would handle 512 channels + // If C is not a multiple of this (e.g. 
768), some buckets/c_groups cannot use the entire warp + if (c >= C) { return; } + // Exit early if this is a small bucket and this warp doesn't have any items to process + if (warp_id >= bucket_size) { return; } + + float accum[x128::size] = {0.0f}; + __shared__ float accum_shared[x128::size * BLOCK_SIZE]; + + for(int item = warp_id; item < bucket_size; item += BLOCK_SIZE/WARP_SIZE) { + int bt = workload_indices[bucket_start_idx + item]; + + const floatX* dout_btc = dout + bt * C + c; + x128 packed_inp1 = load128cs(dout_btc); + for (int k = 0; k < packed_inp1.size; k++) { + accum[k] += (float)packed_inp1[k]; + } + } + + if (warp_id != 0) { + // we accumulate into warp 0, so only the other warps need to write to shared memory + for (int k = 0; k < x128::size; k++) { + accum_shared[threadIdx.x + k * BLOCK_SIZE] = accum[k]; + } + return; // only warp 0 is needed after writing to shared memory + } + + // Read dwte for warp 0 even if other warps are not finished yet to maximise latency tolerance + floatX* dwte_ix = dwte + bucket_ix * C + c; + x128 packed_in_out = load128(dwte_ix); + + // note: threads which have returned are considered synchronised by CUDA so no risk of deadlock + __syncthreads(); + + // Accumulate into warp 0's registers by reading the values of the other warps in shared memory + for (int i = threadIdx.x+WARP_SIZE; i < min(BLOCK_SIZE, bucket_size*WARP_SIZE); i += WARP_SIZE) { + for (int k = 0; k < x128::size; k++) { + accum[k] += accum_shared[i + k * BLOCK_SIZE]; + } + } + + // Add the result to dwte and write back to global memory (read-modify-write) + for (unsigned int k = 0; k < x128::size; k++) { + // We use stochastic rounding to go from FP32 to BF16 + // The seed is deterministic and unique for each parameter to guarantee we have determinism AND + // to avoid **potential** issues with positionX int SquirrelNoise5 argument overflowing which is UB + // and that somehow messing the quality of random numbers + stochastic_rounding(accum[k] + (float)packed_in_out[k], &packed_in_out[k], seed + bucket * WARP_SIZE + threadIdx.x + k); + } + store128(dwte_ix, packed_in_out); +} + +__global__ void wpe_backward_kernel(floatX* dwpe, + const floatX* dout, const int* inp, + int B, int T, int C, unsigned int seed) { + // Each thread handles x128::size "channel positions", e.g. 256 per warp for BF16 + // For gpt2-124M BF16, C=768 and T=1024, so 3 warps per channel and 3072 warps in total + // For each "channel position" we sum the gradients for every batch at that C/T element + // This way each dwte element is only updated once, and the kernel is fully deterministic! 
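Both backward kernels above finish with the stochastic_rounding helper from cuda_utils.cuh. A host-side sketch of the same idea, under the assumption that bf16 keeps only the top 16 bits of an fp32 value, so the 16-bit random threshold decides whether the discarded low bits round the value up or down:

#include <stdint.h>
#include <string.h>

// returns the fp32 value with its low 16 bits forced so that a subsequent
// round-to-nearest bf16 conversion lands on the stochastically chosen neighbour
// (the device helper finishes with __float2bfloat16_rn on exactly this value)
static float stochastic_round_prepare(float in, uint32_t random) {
    uint32_t float_bits;
    memcpy(&float_bits, &in, sizeof(float_bits));
    uint32_t threshold    = random & 0xFFFFu;       // 16-bit random threshold
    uint32_t rounded_bits = float_bits & 0xFFFFu;   // bits that bf16 cannot represent
    float_bits = (rounded_bits > threshold) ? (float_bits | 0xFFFFu)    // will round up
                                            : (float_bits & ~0xFFFFu);  // will round down
    float out;
    memcpy(&out, &float_bits, sizeof(out));
    return out;
}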
+ // The previous kernel was not deterministic, as batches were aggregated with atomicAdd + int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; + if (idx >= T * C) { return; } + + // if C is not a multiple of WARP_SIZE*x128::size, it's OK for some warps to handle multiple t + int t = idx / C; + int c = idx % C; + float accum[x128::size] = {0.0f}; + + for (int b = 0; b < B; b++) { + x128 packed_dout = load128cs(dout + (b * T * C) + (t * C) + c); // will never be read again + for (int k = 0; k < x128::size; k++) { + accum[k] += (float)packed_dout[k]; + } + } + + floatX* dwpe_tc = dwpe + (t * C) + c; + x128 packed_dwpe = load128(dwpe_tc); + for (unsigned int k = 0; k < x128::size; k++) { + // We use stochastic rounding to go from FP32 to BF16 + // The seed is deterministic and unique for each parameter to guarantee we have determinism AND + // to avoid **potential** issues with positionX int SquirrelNoise5 argument overflowing which is UB + // and that somehow messing the quality of random numbers + stochastic_rounding(accum[k] + (float)packed_dwpe[k], &packed_dwpe[k], seed + idx + k); + } + store128(dwpe_tc, packed_dwpe); +} + +// ---------------------------------------------------------------------------- +// kernel launchers + +void encoder_forward(floatX* out, + const int* inp, const floatX* wte, const floatX* wpe, + int B, int T, int C, cudaStream_t stream) { + NVTX_RANGE_FN(); + const int block_size = 256; + const int N = B * T * C; + const int grid_size = CEIL_DIV(N, (int)(block_size * x128::size)); + encoder_forward_kernel3<<>>(out, inp, wte, wpe, B, T, C); + cudaCheck(cudaGetLastError()); +} + +// Fully deterministic (see comments in wte_backward_kernel and wpe_backward_kernel for more details) +void encoder_backward(floatX* dwte, floatX* dwpe, floatX* scratch, // gpu outputs & scratch + int* workload_indices, int4* bucket_info, // cpu scratch buffers + const floatX* dout, const int* inp, const int* inputs_cpu, // cpu/gpu inputs + int B, int T, int C, unsigned int seed, cudaStream_t stream) { + NVTX_RANGE_FN(); + + // Launch wpe kernel first (so it runs on the GPU in parallel with the CPU pre-processing for wte) + const int block_size = 256; + const int N = T * C / x128::size; + const int grid_size = CEIL_DIV(N, block_size); + wpe_backward_kernel<<>>(dwpe, dout, inp, B, T, C, seed); + cudaCheck(cudaGetLastError()); + + // check the GPU scratch buffer is large enough to hold the bucket info and workload indices + // todo - this is trivially true given hardcoded scratch buffer size here, is this useful? 
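To make the grid sizing in the encoder_forward launcher above concrete, a small worked example with hypothetical GPT-2 124M shapes, assuming BF16 so that x128::size is 8 elements per thread:

#include <stdio.h>
#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))

int main(void) {
    int B = 4, T = 1024, C = 768;                 // hypothetical shapes
    int N = B * T * C;                            // 3,145,728 output elements
    int block_size = 256;
    int x128_size = 8;                            // BF16: 16 bytes / 2 bytes per element
    int grid_size = CEIL_DIV(N, block_size * x128_size);   // 1536 blocks
    printf("grid=%d blocks, %d threads each, %d elements per thread\n", grid_size, block_size, x128_size);
    return 0;
}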
+ int num_c_groups = CEIL_DIV(C, x128::size * WARP_SIZE); + assert(B*T*num_c_groups * (sizeof(int4)+sizeof(int)) <= B*T*3*C * sizeof(floatX)); + + // Step 1: Sort inputs into buckets + int total_items = 0; + std::unordered_map> buckets; + for (uint64_t bt = 0; bt < B * T; bt++) { + for (uint64_t c_group = 0; c_group < num_c_groups; c_group++) { + // todo - passing c_group/inputs_cpu[bt] in data to avoid a second hash lookup is a bit hacky + uint64_t data = bt + (c_group<<32ULL) + ((uint64_t)inputs_cpu[bt]<<42ULL); + buckets[c_group + num_c_groups * inputs_cpu[bt]].push_back(data); + total_items++; + } + } + + // Step 2: Sort buckets by size in descending order + // this is so the largest buckets are processed first by the GPU + // otherwise, if they started late, they would still be running with the rest of the GPU idle + std::vector>> sortedBuckets(buckets.begin(), buckets.end()); + std::sort(sortedBuckets.begin(), sortedBuckets.end(), // ugly because we don't have a typedef for the std::pair + [](const std::pair>& a, const std::pair>& b) { + return a.second.size() > b.second.size(); + }); + + int num_buckets = buckets.size(); + int bucket_index = 0; + int workload_index = 0; + for (const auto& bucket : sortedBuckets) { + bucket_info[bucket_index].x = workload_index; // bucket start + bucket_info[bucket_index].y = bucket.second.size(); // bucket size + bucket_info[bucket_index].z = (bucket.second[0] >> 42ULL) & ((1ULL<<20ULL)-1); // bucket ix + bucket_info[bucket_index].w = (bucket.second[0] >> 32ULL) & ((1ULL<<10ULL)-1); // bucket c + + for (uint64_t idx : bucket.second) { + workload_indices[workload_index++] = (int)(idx & ((1ULL<<31ULL)-1ULL)); + } + bucket_index++; + } + + // Step 3: Copy data from host to device (async until the last one to avoid synchronising CPU/GPU twice) + // todo - could use CUDA events (even without streams) to avoid CPU/GPU synchronisation completely + int4* d_bucket_info = (int4*)scratch; + int* d_workload_indices = (int*)(scratch + B*T*num_c_groups * sizeof(int4)); + cudaCheck(cudaMemcpyAsync(d_bucket_info, bucket_info, num_buckets * sizeof(int4), cudaMemcpyHostToDevice, stream)); + cudaCheck(cudaMemcpyAsync(d_workload_indices, workload_indices, total_items * sizeof(int), cudaMemcpyHostToDevice, stream)); + + // Launch wte kernel + // todo - profile block sizes on more content (depends on number of buckets and on GPU?) + wte_backward_kernel<256><<>>(dwte, d_bucket_info, d_workload_indices, dout, inp, seed, B, T, C); + cudaCheck(cudaGetLastError()); +} diff --git a/llmc/fused_classifier.cuh b/llmc/fused_classifier.cuh new file mode 100644 index 000000000..4837d4cb0 --- /dev/null +++ b/llmc/fused_classifier.cuh @@ -0,0 +1,149 @@ +/* +Fused Classifier: +- Forwards the Cross Entropy Loss +- Never materializes the full normalized logits, only at the target label +- (fusion) Also kicks off the backward pass, because everything is already loaded +*/ +// llmc internal imports +#include "cuda_common.h" +#include "cuda_utils.cuh" + +// ---------------------------------------------------------------------------- +// CUDA kernels + +struct SoftmaxParams { + float Scale; + float Offset; +}; + +__device__ SoftmaxParams prepare_softmax_blockwide3(int64_t idx, const floatX* inp, int V, int P) { + // same but not float4 + // one row of inp, i.e. 
inp[idx, :] of shape (V,) + + const floatX* x = inp + idx * P; + float thread_maxval = -INFINITY; + float thread_sumval = 0.0f; + int i = (V+x128::size-1)/x128::size + threadIdx.x - blockDim.x; + + // special-case loop to handle the unaligned elements at the end of the array + // this lets us skip the bounds check in the main loop below, which improves performance + while ((i+1)*x128::size > V) { + for(int k = 0; k < x128::size; ++k) { + if (i*x128::size+k >= V) { + break; // bounds checking against real V (rather than padded P) + } + float v = (float)x[i*x128::size+k]; + float old_maxval = thread_maxval; + thread_maxval = fmaxf(thread_maxval, v); + thread_sumval *= expf((old_maxval - thread_maxval)); + thread_sumval += expf(v - thread_maxval); + } + i -= blockDim.x; + } + + // main loop for the bulk of the iterations (no bounds checking required!) + for (; i >= 0; i -= blockDim.x) { + x128 packed_x = load128(x + i * x128::size); // load and keep in cache until fused_classifier loop + for(int k = 0; k < x128::size; ++k) { + float v = (float)packed_x[k]; + float old_maxval = thread_maxval; + thread_maxval = fmaxf(thread_maxval, v); + thread_sumval *= expf((old_maxval - thread_maxval)); + thread_sumval += expf(v - thread_maxval); + } + } + + // Block Max Reduction -> Maths -> Block Sum Reduction + float block_maxval = blockReduce(thread_maxval, false, -INFINITY); + thread_sumval *= expf(thread_maxval - block_maxval); + float block_sumval = blockReduce(thread_sumval); + + // return the softmax parameters + return SoftmaxParams{1.f / block_sumval, block_maxval}; +} + +// will _update_ logits to logit gradients +// uses template to decide whether to write logits and probs +// split both loops in "multiple-of-x128-size" and "bounds-checked remainder" parts +template +__global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS) + fused_classifier_kernel5(floatX* logits, float* losses, floatX* probs, + const float dloss, const int* targets, + int B, int T, int V, int P, std::bool_constant) { + // note: idx is small enough that it easily fits into 32 bit; + // by making it a long here, we ensure that any offsets calculated with it (e.g., idx * P) + // are done is 64 bit + int64_t idx = gridDim.x - (blockIdx.x+1); // reverse order for cache hits on matmul data + int ix = targets[idx]; + + // softmax (reading B * T * V, same logits read again below, hopefully still in cache) + SoftmaxParams sp = prepare_softmax_blockwide3(idx, logits, V, P); + + // calculate the probability needed for the loss and update (single-threaded) + if(threadIdx.x == 0) { + float prob = expf((float)logits[idx * P + ix] - sp.Offset) * sp.Scale; + losses[idx] -= logf(prob); + } + + // without this synchronization point we have a race condition: + // the logits used above to compute the loss are concurrently (race) modified to carry backward pass grads. + // since the "logits" are overwritten to be in the [-1, 1] range and sp.Offset is sometimes smaller than -90 + // we errouneously end up computing exp^(90+) which gives us infinities in the loss! this is the fix. 
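The max/sum bookkeeping in prepare_softmax_blockwide3 above is the usual online-softmax recurrence: whenever a new maximum appears, the running sum is rescaled by exp(old_max - new_max) so that it always refers to the current maximum. A scalar CPU sketch of that recurrence:

#include <math.h>

// single-pass softmax normalizer over n logits, CPU reference sketch
static void online_softmax_stats(const float* x, int n, float* out_max, float* out_sum) {
    float maxval = -INFINITY;
    float sumval = 0.0f;
    for (int i = 0; i < n; i++) {
        float old_max = maxval;
        maxval = fmaxf(maxval, x[i]);
        sumval *= expf(old_max - maxval);   // re-base the running sum onto the new max
        sumval += expf(x[i] - maxval);
    }
    *out_max = maxval;   // plays the role of sp.Offset
    *out_sum = sumval;   // 1.0f / sumval plays the role of sp.Scale
}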
+ __syncthreads(); + + // calculate the gradients directly, saves bandwidth from probs during training + // but also supports writing probs for inference-only and debugging + const floatX* logits_vec = logits + idx * P; + for (int i = threadIdx.x; i < V/x128::size; i += blockDim.x) { + // this is the 2nd read of logits after the one in prepare_softmax2 + // it will be overwritten by the logits gradients which is when we reduce cache persistence + x128 packed_logits_vec = load128(logits_vec + i * x128::size); // rely on cs of store128cs + x128 packed_probs; + for(int k = 0; k < x128::size; ++k) { + int element = i*x128::size + k; + float prob = expf((float)packed_logits_vec[k] - sp.Offset) * sp.Scale; + packed_probs[k] = (floatX)prob; + float indicator = (element == ix) ? 1.0f : 0.0f; + packed_logits_vec[k] = (floatX)((prob - indicator) * dloss); + } + if (WriteDLogits){ + // reduce cache persistence for the overwritten logits + // to maximise probability that logits remain in cache between prepare_softmax and here + store128cs(logits + idx * P + i * x128::size, packed_logits_vec); + } + if (WriteProbs) { + store128(probs + idx * P + i * x128::size, packed_probs); + } + } + + // handle remaining elements after the last multiple of x128::size + // e.g. if V = 8003, and x128::size = 8, we need to handle the last 3 elements + int unaligned_start = V & ~(x128::size - 1); // round down to multiple of x128::size + for (int i = threadIdx.x + unaligned_start; i < V; i++) { + float prob = expf((float)logits_vec[i] - sp.Offset) * sp.Scale; + float indicator = (i == ix) ? 1.0f : 0.0f; + float dlogit = (prob - indicator) * dloss; + if (WriteDLogits){ + __stcs(logits + idx * P + i, (floatX)dlogit); + } + if (WriteProbs) { + probs[idx * P + i] = (floatX)prob; + } + } +} + +// ---------------------------------------------------------------------------- +// kernel launchers + +// replaces logits with logit gradients +template +void fused_classifier(Type* logits, float* losses, + const float dloss, const int* targets, + int B, int T, int V, int P, std::bool_constant write_dlogits, cudaStream_t stream) { + NVTX_RANGE_FN(); + const int block_size = 1024; + const int N = B * T; + const int grid_size = N; + fused_classifier_kernel5<<>>(logits, losses, (floatX*)NULL, dloss, targets, B, T, V, P, write_dlogits); + cudaCheck(cudaGetLastError()); +} diff --git a/llmc/gelu.cuh b/llmc/gelu.cuh new file mode 100644 index 000000000..cd5c297b6 --- /dev/null +++ b/llmc/gelu.cuh @@ -0,0 +1,66 @@ +/* +(Approximate) GeLU non-linearity layer +*/ +#include +// llmc internal imports +#include "cuda_common.h" +#include "cuda_utils.cuh" + +// ---------------------------------------------------------------------------- +// CUDA kernels + +#define GELU_SCALING_FACTOR sqrtf(2.0f / M_PI) +__global__ void gelu_forward_kernel2(floatX* out, const floatX* inp) { + int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; + + x128 packed_out; + x128 packed_inp = load128cs(inp + idx); // load and do not keep in cache + for(int k = 0; k < packed_inp.size; ++k) { + float xi = (float)packed_inp[k]; + float cube = 0.044715f * xi * xi * xi; + packed_out[k] = (floatX)(0.5f * xi * (1.0f + tanhf(GELU_SCALING_FACTOR * (xi + cube)))); + } + // store instead of storecs (without cache streaming) in case it is useful for the + // data to be in the cache for the next operation after this GeLU + store128(out + idx, packed_out); +} + +__global__ void gelu_backward_inplace_kernel(floatX* d_in_out, const floatX* inp) { + int idx = (blockIdx.x * 
blockDim.x + threadIdx.x) * x128::size; + + x128 packed_dinp; + x128 packed_inp = load128cs(inp + idx); + x128 packed_dout = load128(d_in_out + idx); + for (int k = 0; k < packed_inp.size; ++k) { + float x = (float)packed_inp[k]; + float cube = 0.044715f * x * x * x; + float tanh_arg = GELU_SCALING_FACTOR * (x + cube); + float tanh_out = tanhf(tanh_arg); + float coshf_out = coshf(tanh_arg); + float sech_out = 1.0f / (coshf_out * coshf_out); + float local_grad = 0.5f * (1.0f + tanh_out) + x * 0.5f * sech_out * GELU_SCALING_FACTOR * (1.0f + 3.0f * 0.044715f * x * x); + packed_dinp[k] = (floatX)(local_grad * (float)packed_dout[k]); + } + store128(d_in_out + idx, packed_dinp); +} + +// ---------------------------------------------------------------------------- +// kernel launchers + +void gelu_forward(floatX* out, const floatX* inp, int N, cudaStream_t stream) { + NVTX_RANGE_FN(); + const int block_size = 512; + assert(N % (block_size * x128::size) == 0); + const int grid_size = CEIL_DIV(N, block_size * x128::size); + gelu_forward_kernel2<<>>(out, inp); + cudaCheck(cudaGetLastError()); +} + +void gelu_backward_inplace(floatX* d_in_out, const floatX* inp, const int N, cudaStream_t stream) { + NVTX_RANGE_FN(); + const int block_size = 128; + assert(N % (block_size * x128::size) == 0); + const int grid_size = CEIL_DIV(N, block_size * x128::size); + gelu_backward_inplace_kernel<<>>(d_in_out, inp); + cudaCheck(cudaGetLastError()); +} diff --git a/llmc/global_norm.cuh b/llmc/global_norm.cuh new file mode 100644 index 000000000..e0e23b08a --- /dev/null +++ b/llmc/global_norm.cuh @@ -0,0 +1,89 @@ +/* +Global norm, used in gradient clipping +*/ +#include +#include +#include +// llmc internal imports +#include "cuda_common.h" +#include "cuda_utils.cuh" + +// ---------------------------------------------------------------------------- +// CUDA kernels + +template +__device__ float global_norm_squared_for_range(const T* data, size_t count) { + size_t index = blockIdx.x * blockDim.x + threadIdx.x; + size_t grid_width = blockDim.x * gridDim.x; + float accumulator = 0.f; + for(size_t i = index; i < count; i += grid_width) { + accumulator += (float)data[i] * (float)data[i]; + } + // block-level reduce + return blockReduce(accumulator); +} + +template +__global__ void global_norm_squared_kernel(float* out, const T* data, size_t count, ptrdiff_t stride) { + float block_sum = global_norm_squared_for_range(data + blockIdx.y * stride, count); + // each block accumulates its partial sum to out[out_index] + // we want to avoid using atomic add here so we combine this kernel with another kernel call + // that sums up the partial block sums + if(threadIdx.x == 0) { + size_t out_index = blockIdx.y * gridDim.x + blockIdx.x; + out[out_index] = out[out_index] + block_sum; + } +} + +__global__ void global_norm_aggregate_kernel(float* out, size_t grid_size) { + size_t index = threadIdx.x; + // grab block sums from the previous kernel, use 0. as the neutral sum element + float block_sum = (index < grid_size) ? out[index] : 0.f; + float sum = blockReduce(block_sum); + if(threadIdx.x == 0) { + out[0] = sum; // out[0] ends up with the final norm squared + } +} + +// ---------------------------------------------------------------------------- +// kernel launcher + +// Helper function determines the maximum number of block sums +int get_max_num_block_sums(int* num_slices_all, int numel) { + // NOTE: this needs to be kept in sync with `global_norm_squared` below. 
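For reference, the value the two kernels above cooperate to produce is simply the squared L2 norm over all gradient tensors, computed in two stages (per-block partial sums, then a single-block aggregation); gradient clipping then uses its square root. A CPU sketch of the end result:

#include <math.h>
#include <stddef.h>

// CPU reference for the value that ends up in out[0] (before the final sqrt)
static float global_norm_squared_cpu(const float* const* tensors, const size_t* counts, int num_tensors) {
    float total = 0.0f;
    for (int t = 0; t < num_tensors; t++) {
        for (size_t i = 0; i < counts[t]; i++) {
            total += tensors[t][i] * tensors[t][i];
        }
    }
    return total;   // the gradient norm is sqrtf(total)
}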
+ const int block_size = 512; + const int grid_size = deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount / block_size; + assert(grid_size > 0); + int max_num_block_sums = 0; + for (int i = 0; i < numel; i++) { + int num_slices = num_slices_all[i]; + const int gx = CEIL_DIV(grid_size, num_slices); + const int gy = num_slices; + max_num_block_sums = max(max_num_block_sums, gx * gy); + } + + return max_num_block_sums; +} + +template +void global_norm_squared(float* out, const T* values, size_t count, ptrdiff_t stride, int num_slices, int max_num_block_sums, bool reset, cudaStream_t stream) { + const int block_size = 512; + // launch just enough blocks to fill the grid. deliberately no DIV_CEIL. + // having one block less than possible is a tiny performance hit, having + // one block too many is catastrophic, since it only can start once all the other + // blocks finish. anyway, I think cuda_threads_per_SM should be a multiple of 512 + // on all gpus, so the division really is going to be exact. + const int grid_size = deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount / block_size; + assert(grid_size > 0); // gives a better error than letting the call below fail + + const int gx = CEIL_DIV(grid_size, num_slices); + const int gy = num_slices; + + assert(gx * gy < 1024); // we want to later accumulate the block sums in a single block + + if (reset) { + cudaCheck(cudaMemsetAsync(out, 0, max_num_block_sums * sizeof(float), stream)); + } + global_norm_squared_kernel<<>>(out, values, count, stride); + cudaCheck(cudaGetLastError()); +} diff --git a/llmc/layernorm.cuh b/llmc/layernorm.cuh new file mode 100644 index 000000000..9777d0658 --- /dev/null +++ b/llmc/layernorm.cuh @@ -0,0 +1,505 @@ +/* +LayerNorm CUDA kernel, and also Residual, because sometimes they are fused + +Note in llm.c we try to be clever in the backward pass to conserve memory. +All parameters use a += in the backward pass, so we can do gradient accumulation. +But all activations have = instead of += because these are faster (just read, no write). +This is okay for all activations except for those in the residual stream, where the +gradients have to add. We make sure that we do a += as necessary. +E.g., the layernorms are connected to the residuals so we += in layernorm backward. 
+*/ + +#include +// llmc internal imports +#include "cuda_common.h" +#include "cuda_utils.cuh" + +// ---------------------------------------------------------------------------- +// CUDA kernels + +__global__ void layernorm_forward_kernel3(floatX* __restrict__ out, float* __restrict__ mean, float* __restrict__ rstd, + const floatX* __restrict__ inp, const floatX* __restrict__ weight, + const floatX* __restrict__ bias, int N, int C) { + int lane_id = threadIdx.x % WARP_SIZE; + int warp_id = threadIdx.x / WARP_SIZE; + int num_warps = blockDim.x / WARP_SIZE; + + int idx = blockIdx.x * num_warps + warp_id; + if(idx >= N) { return; } // guard + + // the row of input that this group of threads is responsible for + const floatX* x = inp + idx * C; + + // mean + float sum = 0.0f; + for (int i = lane_id; i < C; i += WARP_SIZE) { + sum += (float)x[i]; + } + sum = warpReduceSum(sum); + float m = sum / C; + if(lane_id == 0 && mean != nullptr) { + __stcs(mean + idx, m); + } + + // rstd + sum = 0.0f; + for (int i = lane_id; i < C; i += WARP_SIZE) { + float diff = (float)x[i] - m; + sum += diff * diff; + } + sum = warpReduceSum(sum); + float s = rsqrtf(sum / C + 1e-5f); + if(lane_id == 0 && rstd != nullptr) { + __stcs(rstd + idx, s); + } + + // final normalization and scaling by weight/bias + floatX* o = out + idx * C; + for (int c = lane_id; c < C; c += WARP_SIZE) { + // load and store using the .cs "streaming" hint to the compiler, + // indicating that this data will not be reused soon, and can be streamed through the caches + // this allows the threads to get more cache-hits for the (shared) weight and bias parameters + float n = s * ((float)__ldcs(x+c) - m); + __stcs(o+c, (floatX)(n * (float)weight[c] + (float)bias[c])); + } +} + +__global__ void layernorm_forward_kernel6(floatX* __restrict__ out, float* __restrict__ mean, float* __restrict__ rstd, + const floatX* __restrict__ inp, const floatX* __restrict__ weight, + const floatX* __restrict__ bias, int N, int C) { + assert(blockDim.x == WARP_SIZE); + + // load weights and biases into shared memory + // do this before we allow any threads to exit! 
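+// For reference, the per-row math that the forward kernels in this file implement
+// (scalar form; eps = 1e-5f as used below):
+//     m      = (1/C) * sum_i x[i]
+//     s      = rsqrtf((1/C) * sum_i (x[i] - m)^2 + eps)
+//     out[i] = (x[i] - m) * s * weight[i] + bias[i]
+// kernel3 above assigns one warp per row and keeps everything in registers; this kernel
+// additionally caches weight, bias and the current input row in shared memory.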
+ extern __shared__ char* params[]; + // load128/store128 sometimes generated multiple instructions when the types here were floatX*, so + // let's keep everything as x128 + x128* s_weight = reinterpret_cast(params); + x128* s_bias = reinterpret_cast(params) + (C / x128::size); + x128* s_in = reinterpret_cast(params) + ((2 + threadIdx.y) * C / x128::size); + + int sidx = (threadIdx.x + WARP_SIZE * threadIdx.y) * x128::size; + for(int i = sidx; i < C; i += blockDim.y * WARP_SIZE * x128::size) { + s_weight[i/x128::size] = load128(weight + i); + s_bias[i/x128::size] = load128(bias + i); + } + __syncthreads(); + + int idx = blockIdx.x * blockDim.y + threadIdx.y; + if(idx >= N) { return; } // guard + + // adjust pointers to current token + inp += idx * C; + out += idx * C; + + const float eps = 1e-5f; + float sum = 0.0f; + for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) { + const x128 in_data = load128cs(inp + c); + for(int k = 0; k < x128::size; ++k) { + sum += (float)in_data[k]; + } + s_in[c / x128::size] = in_data; + } + + sum = warpReduceSum(sum); + float m = sum / C; + float v = 0.f; + + for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) { + const x128 in_data = s_in[c / x128::size]; + for(int k = 0; k < x128::size; ++k) { + v += ((float)in_data[k] - m) * ((float)in_data[k] - m); + } + } + + v = warpReduceSum(v) / C; + float s = rsqrtf(v + eps); + + for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) { + const x128 in_data = s_in[c / x128::size]; + const x128 w = s_weight[c / x128::size]; + const x128 b = s_bias[c / x128::size]; + x128 out_data; + for(int k = 0; k < x128::size; ++k) { + float n = s * ((float)in_data[k] - m); // normalized output + float o = n * (float)w[k] + (float)b[k]; // scale and shift it + out_data[k] = (floatX)o; + } + + store128cs(out + c, out_data); + } + // cache the mean and rstd for the backward pass later + if(threadIdx.x == 0 && mean != nullptr) { + __stcs(mean + idx, m); + } + // store the rstd, no need to cache it + if(threadIdx.x == 0 && rstd != nullptr) { + __stcs(rstd + idx, s); + } +} + +__global__ void fused_residual_forward_kernel5(floatX* residual, floatX* normed, float* mean, float* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C) { + assert(blockDim.x == WARP_SIZE); + + // load weights and biases into shared memory + // do this before we allow any threads to exit! 
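+// Shared-memory footprint, for a sense of scale (illustrative arithmetic, assuming BF16 so
+// sizeof(floatX) == 2): with C = 768 and a 256-thread block the launchers below use
+// block_y = 256/32 = 8, so the dynamic allocation is
+//     (2 + block_y) * C * sizeof(floatX) = (2 + 8) * 768 * 2 = 15,360 bytes
+// For GPT-2 XL (C = 1600) the same formula gives 32,000 bytes; once the request grows past
+// the default 48 KiB limit, the cudaFuncSetAttribute opt-in in the launchers below is what
+// keeps this shared-memory path usable (with a fallback if the opt-in fails).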
+ extern __shared__ char* params[]; + // load128/store128 sometimes generated multiple instructions when the types here were floatX*, so + // let's keep everything as x128 + x128* s_weight = reinterpret_cast(params); + x128* s_bias = reinterpret_cast(params) + (C / x128::size); + x128* s_res = reinterpret_cast(params) + ((2 + threadIdx.y) * C / x128::size); + + int sidx = (threadIdx.x + WARP_SIZE * threadIdx.y) * x128::size; + for(int i = sidx; i < C; i += blockDim.y * WARP_SIZE * x128::size) { + s_weight[i/x128::size] = load128(weight + i); + s_bias[i/x128::size] = load128(bias + i); + } + __syncthreads(); + + int idx = blockIdx.x * blockDim.y + threadIdx.y; + if(idx > N) return; + + // adjust pointers to current token + residual += C * idx; + normed += C * idx; + inp1 += C * idx; + inp2 += C * idx; + + const float eps = 1e-5f; + float sum = 0.0f; + for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) { + const x128 in1 = load128cs(inp1 + c); + const x128 in2 = load128cs(inp2 + c); + x128 out; + for(int k = 0; k < x128::size; ++k) { + out[k] = (float)in1[k] + (float)in2[k]; + sum += (float)out[k]; + } + store128cs(residual + c, out); + s_res[c / x128::size] = out; + } + + sum = warpReduceSum(sum); + float m = sum / C; + float v = 0.f; + + for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) { + const x128 res = s_res[c / x128::size]; + for(int k = 0; k < x128::size; ++k) { + v += ((float)res[k] - m) * ((float)res[k] - m); + } + } + + v = warpReduceSum(v) / C; + float s = rsqrtf(v + eps); + + for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) { + const x128 res = s_res[c / x128::size]; + const x128 w = s_weight[c / x128::size]; + const x128 b = s_bias[c / x128::size]; + x128 out; + for(int k = 0; k < x128::size; ++k) { + float n = s * ((float)res[k] - m); // normalized output + float o = n * (float)w[k] + (float)b[k]; // scale and shift it + out[k] = o; + } + + store128cs(normed + c, out); + } + // cache the mean and rstd for the backward pass later + if(threadIdx.x == 0) { + mean[idx] = m; + rstd[idx] = s; + } +} + +__global__ void residual_forward_kernel(floatX* out, const floatX* inp1, const floatX* inp2) { + int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; + + x128 packed_out; + x128 packed_inp1 = load128cs(inp1 + idx); + x128 packed_inp2 = load128cs(inp2 + idx); + for (int k = 0; k < packed_inp1.size; k++) { + packed_out[k] = (floatX)((float)packed_inp1[k] + (float)packed_inp2[k]); + } + store128(out + idx, packed_out); +} + +__global__ void __launch_bounds__(512, 2) // todo - any warnings on Turing with only 1024 threads? 
+ layernorm_backward_kernel10(floatX* dinp, floatX* dweight, floatX* dbias, float* scratch, + const floatX* dout, const floatX* inp, const floatX* weight, + const float* mean, const float* rstd, + int B, int T, int C) { + int BLOCK_SIZE = blockDim.x; + int warpsInBlock = BLOCK_SIZE / WARP_SIZE; //number of warps in block + extern __shared__ float shared[]; + + int warpId = threadIdx.x / WARP_SIZE; // warp index within a block + int baseIdx = blockIdx.x * warpsInBlock + warpId; + int warpThreadIdx = threadIdx.x % WARP_SIZE; // Thread index within the warp + int warpsInGrid = gridDim.x * warpsInBlock; + int C_per_iteration = WARP_SIZE * x128::size; + int iterations_C = CEIL_DIV(C, C_per_iteration); // + 2; + + // the first half of shared memory is bias, second is weight + size_t rounded_C = CEIL_DIV(C, (32 * x128::size)) * (32 * x128::size); + float* dbias_shared = shared; + float* dweight_shared = shared + rounded_C; + // warp zero doesn't actually write to the _tmp_shared memory locations, so we don't need to reserve memory + // the obvious solution is to change the addressing below to use (threadId.x-32) as offset, but that causes + // register spills, so instead we mess with the base pointer here, which doesn't increase register usage. + float* dbias_tmp_shared = shared + 2 * rounded_C - WARP_SIZE * f128::size; + float* dweight_tmp_shared = shared + 2 * rounded_C + f128::size * BLOCK_SIZE - 2 * WARP_SIZE * f128::size; + + // init shared memory to zero + for(int i = threadIdx.x * f128::size; i < rounded_C; i += BLOCK_SIZE * f128::size) { + store128(dbias_shared + i, f128::zeros()); + store128(dweight_shared + i, f128::zeros()); + } + __syncthreads(); + + for (int bt = baseIdx; bt < B * T; bt += warpsInGrid) { + const floatX* dout_bt = dout + bt * C; + const floatX* inp_bt = inp +bt * C; + floatX* dinp_bt = dinp + bt * C; + + // first: two reduce operations + float dnorm_mean = 0.0f; + float dnorm_norm_mean = 0.0f; + for (int i = warpThreadIdx * x128::size; i < C; i += WARP_SIZE * x128::size) { + x128 dout128_i = load128(dout_bt + i); + x128 inp128_i = load128(inp_bt + i); + x128 weight128_i = load128(weight + i); + for (int k = 0; k < x128::size; k++) { + float dnorm_i = (float)weight128_i[k] * (float)dout128_i[k]; + dnorm_mean += dnorm_i; + dnorm_norm_mean += dnorm_i * (float)inp128_i[k]; + } + } + + const float mean_bt = mean[bt]; + const float rstd_bt = rstd[bt]; + dnorm_mean = warpReduceSum(dnorm_mean) / C; + dnorm_norm_mean = warpReduceSum(dnorm_norm_mean) / C * rstd_bt - dnorm_mean * mean_bt * rstd_bt; + + for (int c = 0; c < iterations_C; c++) { + int global_index = (warpThreadIdx * x128::size) + (c * C_per_iteration); + + x128 dout128 = x128::zeros(); + x128 inp128 = x128::zeros(); + x128 dinp128 = x128::zeros(); + x128 weight128 = x128::zeros(); + + if(global_index < C) { + dout128 = load128cs(dout_bt + global_index); + inp128 = load128cs(inp_bt + global_index); + dinp128 = load128(dinp_bt + global_index); + weight128 = load128(weight + global_index); + } + + for(int o = 0; o < x128::size / f128::size; ++o) { + f128 dbias_f; + f128 dweight_f; + for(int i = 0; i < f128::size; ++i) { + int x = o * f128::size + i; + float dout_i = (float)dout128[x]; + float norm_bti = ((float)inp128[x] - mean_bt) * rstd_bt; + dbias_f[i] = dout_i; + dweight_f[i] = norm_bti * dout_i; + + float dval = 0.0f; + dval += (float) weight128[x] * (float)dout128[x]; // term 1 + dval -= dnorm_mean; // term 2 + dval -= norm_bti * dnorm_norm_mean; // term 3 + dval *= rstd_bt; // final scale + dinp128[x] = 
(floatX) ((float) dinp128[x] + dval); + } + + if (warpId != 0) { + store128(dbias_tmp_shared + threadIdx.x * f128::size, dbias_f); + // this seems to generate a 64-bit store, instead of 128-bit. + // however, forcing 128-bit (e.g., using inline ptx), results in register + // spilling and much worse performance, so we'll keep it like this for now + // but ideally, we could reduce the register pressure a little. + store128(dweight_tmp_shared + threadIdx.x * f128::size, dweight_f); + } + __syncthreads(); + if (warpId == 0) { + for (int j = 1; j < warpsInBlock; j++) { + f128 dbias_tmp = load128(dbias_tmp_shared + f128::size * (threadIdx.x + j * WARP_SIZE)); + f128 dweight_tmp = load128(dweight_tmp_shared + f128::size * (threadIdx.x + j * WARP_SIZE)); + for(int i = 0; i < f128::size; ++i) { + dbias_f[i] += dbias_tmp[i]; + dweight_f[i] += dweight_tmp[i]; + } + } + } + __syncthreads(); + if (warpId == 0) { + f128 db_old = load128(dbias_shared + global_index + f128::size * o); + f128 dw_old = load128(dweight_shared + global_index + f128::size * o); + for(int i = 0; i < f128::size; ++i) { + dbias_f[i] += db_old[i]; + dweight_f[i] += dw_old[i]; + } + store128(dbias_shared + global_index + f128::size * o, dbias_f); + store128(dweight_shared + global_index + f128::size * o, dweight_f); + } + } + if(global_index < C) { + // cache in L2 as this is read by the next kernel, but bypass L1 to minimise thrashing + store128cg(dinp_bt + global_index, dinp128); + } + } + } + __syncthreads(); + // Each block writes its partial sum to global memory + // The last block to finish becomes responsible for summing up all the partial sums + // This is done by atomically incrementing a flag (cleared to 0 before launching the kernel) + unsigned int* scratchFlag = (unsigned int*)(scratch); + // Increment scratch pointer by a full cacheline so that everything remains cacheline aligned + scratch += 32; + float* scratch_dbias = scratch; + float* scratch_dweight = scratch + C; + for(int i = threadIdx.x * f128::size; i < C; i += BLOCK_SIZE * f128::size) { + // Write to global memory in the same "shared memory banking friendly" order + store128(scratch_dbias + i + 2*C*blockIdx.x, load128(dbias_shared + i)); + store128(scratch_dweight + i + 2*C*blockIdx.x, load128(dweight_shared + i)); + } + __syncthreads(); + // that portion of shared memory is no longer used, so we can repurpose it for the scratch flag. + unsigned int *tmp_flag = (unsigned int*)(shared + 2*rounded_C); + if (threadIdx.x == 0) { + *tmp_flag = atomicInc(scratchFlag, gridDim.x); + } + __syncthreads(); + if (*tmp_flag == gridDim.x-1) { + // Reduction of the partial sums by the final block + // todo - there isn't enough parallelism even inside that single SM... + // ==> so could maybe split into another kernel with YET ANOTHER level of reduction?! 
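+// Completion-flag idiom (descriptive note): atomicInc returns the counter's *old* value, so
+// exactly one block -- the last one to arrive -- reads gridDim.x - 1 above and knows that every
+// other block has already written its partial dbias/dweight sums to the scratch buffer.
+// The launcher clears the flag to 0 with a cudaMemsetAsync before each launch, so no
+// wrap-around handling is needed inside the kernel.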
+ for(int i = threadIdx.x * f128::size; i < C; i += BLOCK_SIZE * f128::size) { + f128 dbias_accum = f128::zeros(); + f128 dweight_accum = f128::zeros(); + + for (int read_block_idx = 0; read_block_idx < gridDim.x; read_block_idx++) { + int offset = i + 2*C*read_block_idx; + f128 dbias128 = load128(scratch_dbias + offset); + f128 dweight128 = load128(scratch_dweight + offset); + for(int k = 0; k < f128::size; k++) { + dbias_accum[k] += dbias128[k]; + dweight_accum[k] += dweight128[k]; + } + } + store128(dbias_shared + i, dbias_accum); + store128(dweight_shared + i, dweight_accum); + } + __syncthreads(); + + // convert from float/FP32 to floatX/BF16 for the final write + // this is separate because it cannot use as many warps as the above (f128 vs x128) + // todo - if we split this code into another kernel, we could maybe do it at the same time? + for (int c = warpId; c < iterations_C; c += warpsInBlock) { + int global_index = (warpThreadIdx * x128::size) + (c * C_per_iteration); + if (global_index >= C) { + break; + } + + x128 dbias128 = load128(dbias + global_index); + x128 dweight128 = load128(dweight + global_index); + for(int o = 0; o < x128::size / f128::size; ++o) { + f128 s_db = load128(dbias_shared + global_index + o * f128::size); + f128 s_dw = load128(dweight_shared + global_index + o * f128::size); + for(int i = 0; i < f128::size; ++i) { + int x = o * f128::size + i; + dbias128[x] = (floatX)(s_db[i] + (float)dbias128[x]); + dweight128[x] = (floatX)(s_dw[i] + (float)dweight128[x]); + } + } + store128(dbias + global_index, dbias128); + store128(dweight + global_index, dweight128); + } + } +} + +// ---------------------------------------------------------------------------- +// kernel launchers + +// similar to `fused_residual_forward5` +void layernorm_forward(floatX* out, float* mean, float* rstd, + floatX* inp, const floatX* weight, const floatX* bias, + int B, int T, int C, cudaStream_t stream) { + NVTX_RANGE_FN(); + const int block_size = 256; + int block_y = block_size / WARP_SIZE; + const int N = B * T; + const int grid_size = CEIL_DIV(N, block_y); + size_t smem = (2 + block_y) * C * sizeof(floatX); + + // in order to use more than 48 KiB of smem, need to call cudaFuncSetAttribute + // this may fail, in which case we fall back to the smem free implementation. 
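+// Note on the shared-memory opt-in below (descriptive; standard CUDA runtime behaviour): by
+// default a kernel may use at most 48 KiB of dynamic shared memory. cudaFuncSetAttribute with
+// cudaFuncAttributeMaxDynamicSharedMemorySize raises that per-kernel limit, up to the device's
+// sharedMemPerBlockOptin. It returns an error status instead of aborting when the request
+// cannot be satisfied, and that status is what selects between kernel6 and the
+// no-shared-memory fallback (layernorm_forward_kernel3).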
+ cudaCheck(cudaGetLastError()); + auto status = cudaFuncSetAttribute(layernorm_forward_kernel6, cudaFuncAttributeMaxDynamicSharedMemorySize, smem); + cudaCheck(cudaGetLastError()); + if (status == cudaSuccess) { + layernorm_forward_kernel6<<>>(out, mean, rstd, inp, weight, bias, N, C); + } else { + // fall back to the version without shared memory + const int grid_size_fb = CEIL_DIV(N * WARP_SIZE, block_size); + layernorm_forward_kernel3<<>>(out, mean, rstd, inp, weight, bias, N, C); + } + cudaCheck(cudaGetLastError()); +} + +void residual_forward(floatX* out, const floatX* inp1, const floatX* inp2, int N, cudaStream_t stream) { + NVTX_RANGE_FN(); + const int block_size = 256; + assert(N % (block_size * x128::size) == 0); + const int grid_size = CEIL_DIV(N, block_size * x128::size); + residual_forward_kernel<<>>(out, inp1, inp2); + cudaCheck(cudaGetLastError()); +} + +void fused_residual_forward5(floatX* residual, floatX* normed, float* mean, float* rstd, + const floatX* inp1, const floatX* inp2, + const floatX* weight, const floatX* bias, + int N, int C, cudaStream_t stream) { + const int block_size = 256; + int block_y = block_size / WARP_SIZE; + const int grid_size = CEIL_DIV(N, block_y); + size_t smem = (2 + block_y) * C * sizeof(floatX); + + // in order to use more than 48 KiB of smem, need to call cudaFuncSetAttribute + // this may fail, in which case we fall back to the smem free implementation. + cudaCheck(cudaGetLastError()); + auto status = cudaFuncSetAttribute(fused_residual_forward_kernel5, cudaFuncAttributeMaxDynamicSharedMemorySize, smem); + cudaCheck(cudaGetLastError()); + if(status == cudaSuccess) { + fused_residual_forward_kernel5<<>>(residual, normed, + mean, rstd, inp1, inp2, + weight, bias, N, C); + } else { + residual_forward(residual, inp1, inp2, N*C, stream); + layernorm_forward(normed, mean, rstd, residual, weight, bias, N, 1, C, stream); + } + cudaCheck(cudaGetLastError()); +} + +void layernorm_backward(floatX* dinp, floatX* dweight, floatX* dbias, float* scratch, + const floatX* dout, const floatX* inp, const floatX* weight, const float* mean, const float* rstd, + int B, int T, int C, cudaStream_t stream) { + NVTX_RANGE_FN(); + const int block_size = 512; + const int blocks_per_sm = 2; // supported on every architecture and less cache thrashing than 3 + const int grid_size = blocks_per_sm * deviceProp.multiProcessorCount; + size_t rounded_C = CEIL_DIV(C, (32 * x128::size)) * (32 * x128::size); + size_t shared_mem_size = (2 * rounded_C + 2 * (block_size - 32) * f128::size) * sizeof(float); + + cudaCheck(cudaMemsetAsync(scratch, 0, 1 * sizeof(float), stream)); // only need to reset the flag to 0 + layernorm_backward_kernel10<<>>(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C); + cudaCheck(cudaGetLastError()); +} diff --git a/llmc/logger.h b/llmc/logger.h index 7ffb9d6d3..f9d414c34 100644 --- a/llmc/logger.h +++ b/llmc/logger.h @@ -6,6 +6,7 @@ The Logger object is stateless and uses append mode to write to log files. 
#define LOGGER_H #include +#include #include // defines: fopenCheck, freadCheck, fcloseCheck, fseekCheck, mallocCheck #include "utils.h" @@ -46,10 +47,10 @@ void logger_log_val(Logger *logger, int step, float val_loss) { } } -void logger_log_train(Logger *logger, int step, float train_loss) { +void logger_log_train(Logger *logger, int step, float train_loss, float learning_rate, float grad_norm) { if (logger->active == 1) { FILE *logfile = fopenCheck(logger->output_log_file, "a"); - fprintf(logfile, "s:%d trl:%.4f\n", step, train_loss); + fprintf(logfile, "s:%d trl:%.4f lr:%.6f norm:%.2f\n", step, train_loss, learning_rate, grad_norm); fclose(logfile); } } diff --git a/llmc/matmul.cuh b/llmc/matmul.cuh new file mode 100644 index 000000000..becc372c6 --- /dev/null +++ b/llmc/matmul.cuh @@ -0,0 +1,290 @@ +/* +Matrix Multiplication, with help from cuBLASLt +*/ +#include +#include // std::bool_constant +// llmc internal imports +#include "cuda_common.h" +#include "cuda_utils.cuh" +#include "cublas_common.h" +// GELU can be either fused (cublasLt) or non-fused (gelu.h) +#include "gelu.cuh" + +// ---------------------------------------------------------------------------- +// CUDA kernels + +template +__global__ void matmul_backward_bias_kernel9(OutFloat* dbias, const floatX* dout, int B, int T, int OC, + std::bool_constant) { + constexpr const int bdx = 4; + constexpr const int bdy = WARP_SIZE / bdx; + assert(blockDim.x == bdx); + assert(blockDim.y == bdy); + + int warp_d = (int)threadIdx.x; + int warp_c = (int)threadIdx.y; + int block_d = (int)threadIdx.z; + + const int OC_per_warp = bdy * x128::size; // 64 at BF16 + + int local_oc = warp_c * x128::size; + int global_oc = blockIdx.x * OC_per_warp + local_oc; + + int local_bt = warp_d + bdx * block_d; + int bt_per_block = bdx * blockDim.z; + + float accumulators[x128::size]; + for (int k = 0; k < x128::size; k++) { + accumulators[k] = 0.0f; + } + + if(global_oc < OC) { + // sum up over all bt within registers + for (int idx = blockIdx.y * bt_per_block + local_bt; idx < B * T; idx += gridDim.y * bt_per_block) { + x128 packed_dout = load128(dout + global_oc + idx*OC); + for (int k = 0; k < x128::size; k++) { + accumulators[k] += (float)packed_dout[k]; + } + } + } + + __shared__ float sub_results[x128::size][WARP_SIZE][bdy]; + + // reduce within-warp results + for (int k = 0; k < x128::size; k++) { + float v = accumulators[k]; + v += __shfl_down_sync(0xffffffff, v, 1, 4); + v += __shfl_down_sync(0xffffffff, v, 2, 4); + if(warp_d == 0) { + sub_results[k][block_d][warp_c] = v; + } + } + __syncthreads(); + + // block-wide reductions + for (int k = block_d; k < x128::size; k += blockDim.z) { + float a = 0.f; + for (int r = warp_d; r < blockDim.z; r += bdx) { + float v = sub_results[k][r][warp_c]; + v += __shfl_down_sync(0xffffffff, v, 1, 4); + v += __shfl_down_sync(0xffffffff, v, 2, 4); + a += v; + } + if(warp_d == 0 && global_oc < OC) { + if constexpr (!UseAuxBuffer) { + dbias[global_oc + k] = (OutFloat)(a + (float)dbias[global_oc + k]); + } else { + dbias[global_oc + k + blockIdx.y * OC] = a; + } + } + } +} + +__global__ void reduce_add_sum_kernel(floatX* dst, const float* src, size_t n, size_t m) { + const size_t idx = (blockIdx.x * blockDim.x + threadIdx.x) * f128::size; + assert(n % x128::size == 0); + if (idx < n) { + f128 acc; + for(int k = 0; k < f128::size; ++k) { + acc[k] = 0.f; + } + + for(int l = 0; l < m; ++l) { + f128 s = load128(src + idx + n * l); + for(int k = 0; k < f128::size; ++k) { + acc[k] += s[k]; + } + } + for(int k = 0; k < 
f128::size; ++k) { + dst[idx + k] = (floatX) ((float)dst[idx + k] + acc[k]); + } + } +} + +// ---------------------------------------------------------------------------- +// kernel launchers + +// Wrapper around cublasLtMatmul that is meant to support everything we need in llm.c +// https://docs.nvidia.com/cuda/cublas/#cublasltmatmul +void matmul_cublaslt(floatX* d, const floatX* a, const floatX* b, const floatX* bias, + int m, int n, int k, cudaStream_t stream=0, bool transA=true, bool transB=false, + int batch_count=0, size_t strideA=0, size_t strideB=0, size_t strideOut=0, + bool accumulate=false, floatX* pre_gelu=NULL, bool backward=false) +{ + NVTX_RANGE_FN(); + bool has_bias = (bias != NULL); + bool has_gelu = (pre_gelu != NULL); + + // check alignment (some modes work unaligned but it always best to be aligned for performance) + if(((uintptr_t)a % 16) != 0 || ((uintptr_t)b % 16) != 0 || ((uintptr_t)d % 16) != 0 || ((uintptr_t)bias % 16) != 0) { + printf("All cuBLASLt pointers must be aligned!\n"); + exit(EXIT_FAILURE); + } + + // create the operation descriptor + cublasLtMatmulDesc_t operationDesc; + cublasCheck(cublasLtMatmulDescCreate(&operationDesc, cublas_compute, CUDA_R_32F)); + + int returnedResults = 0; + cublasLtMatmulPreference_t preference; + cublasLtMatmulHeuristicResult_t heuristic; + + cublasOperation_t opNoTranspose = CUBLAS_OP_N; + cublasOperation_t opTranspose = CUBLAS_OP_T; + cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, (transA) ? &opTranspose : &opNoTranspose, sizeof(opTranspose))); + cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, (transB) ? &opTranspose : &opNoTranspose, sizeof(opNoTranspose))); + + // define matrix layouts + cublasLtMatrixLayout_t ALayout; + cublasLtMatrixLayout_t BLayout; + cublasLtMatrixLayout_t DLayout; + cublasLtMatrixLayout_t CLayout; + if (transA) { + cublasCheck(cublasLtMatrixLayoutCreate(&ALayout, CUBLAS_LOWP, k, m, k)); + } else { + cublasCheck(cublasLtMatrixLayoutCreate(&ALayout, CUBLAS_LOWP, m, k, m)); + } + if (transB) { + cublasCheck(cublasLtMatrixLayoutCreate(&BLayout, CUBLAS_LOWP, n, k, n)); + } else { + cublasCheck(cublasLtMatrixLayoutCreate(&BLayout, CUBLAS_LOWP, k, n, k)); + } + // cuBLASLt requires C in FP8 mode to be BF16 or FP32... (sigh) + cublasCheck(cublasLtMatrixLayoutCreate(&CLayout, (sizeof(floatX) == 1) ? 
CUDA_R_16BF : CUBLAS_LOWP, m, n, m)); + cublasCheck(cublasLtMatrixLayoutCreate(&DLayout, CUBLAS_LOWP, m, n, m)); + + // Strided Batched GEMM (used for non-flash attention, equivalent to cublasGemmStridedBatchedEx) + if (batch_count) { + cublasCheck(cublasLtMatrixLayoutSetAttribute(ALayout, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, sizeof(batch_count))); + cublasCheck(cublasLtMatrixLayoutSetAttribute(BLayout, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, sizeof(batch_count))); + cublasCheck(cublasLtMatrixLayoutSetAttribute(CLayout, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, sizeof(batch_count))); + cublasCheck(cublasLtMatrixLayoutSetAttribute(DLayout, CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch_count, sizeof(batch_count))); + + cublasCheck(cublasLtMatrixLayoutSetAttribute(ALayout, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideA, sizeof(strideA))); + cublasCheck(cublasLtMatrixLayoutSetAttribute(BLayout, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideB, sizeof(strideB))); + cublasCheck(cublasLtMatrixLayoutSetAttribute(CLayout, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideOut, sizeof(strideOut))); + cublasCheck(cublasLtMatrixLayoutSetAttribute(DLayout, CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &strideOut, sizeof(strideOut))); + } + + // create a preference handle with specified max workspace + cublasCheck(cublasLtMatmulPreferenceCreate(&preference)); + cublasCheck(cublasLtMatmulPreferenceSetAttribute(preference, CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, + &cublaslt_workspace_size, sizeof(cublaslt_workspace_size))); + + // setup epilogue and associated pointers for bias & gelu + cublasLtEpilogue_t epilogue; + if (has_gelu) { + int64_t gelu_ld = m; // todo - is this affected by anything else? + cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_LD, &gelu_ld, sizeof(gelu_ld))); + cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE_AUX_POINTER, &pre_gelu, sizeof(pre_gelu))); + if (backward) { + assert(!has_bias); // we shouldn't have any backward matmuls that use both GELU and bias + epilogue = CUBLASLT_EPILOGUE_DGELU; + } else { + epilogue = has_bias ? CUBLASLT_EPILOGUE_GELU_AUX_BIAS : CUBLASLT_EPILOGUE_GELU_AUX; + } + } else if(has_bias){ + epilogue = backward ? CUBLASLT_EPILOGUE_BGRADB : CUBLASLT_EPILOGUE_BIAS; + } else { + epilogue = CUBLASLT_EPILOGUE_DEFAULT; + } + cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogue, sizeof(epilogue))); + + if (has_bias) { + // cuBLASLt requires bias in FP8 mode to be BF16... (sigh) + cublasDataType_t bias_data_type = (sizeof(floatX) == 1) ? CUDA_R_16BF : CUBLAS_LOWP; // force BF16 bias for FP8 mode + cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_BIAS_DATA_TYPE, &bias_data_type, sizeof(bias_data_type))); + cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias))); + } + + // set scale type to FP32 (needs to be FP16 if and only if using CUBLAS_COMPUTE_16F, so it's FP32 even for FP8!) 
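+// Dimension bookkeeping, made concrete with the forward call from matmul_forward_cublaslt
+// below (m = OC, n = B*T, k = C, transA = true, transB = false; illustrative walk-through):
+// llm.c stores tensors row-major, which cuBLASLt sees as column-major with the axes swapped.
+// The (OC, C) row-major weight is therefore described as a (k=C, m=OC) column-major matrix
+// with ld = k, and CUBLAS_OP_T turns it back into (OC, C); the (B*T, C) row-major input is
+// seen as (k=C, n=B*T) with ld = k. Their product is (m=OC, n=B*T) column-major, which is
+// exactly the (B*T, OC) row-major output tensor we want.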
+ cublasDataType_t scale_type = CUDA_R_32F; + cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_SCALE_TYPE, &scale_type, sizeof(scale_type))); + + // find a suitable algorithm (cached internally so shouldn't take much CPU time in practice) + cublasLtMatmulAlgoGetHeuristic(cublaslt_handle, operationDesc, ALayout, BLayout, CLayout, DLayout, + preference, 1, &heuristic, &returnedResults); + if (returnedResults == 0) { + printf("No cuBLASLt algorithm: m: %d, n: %d, k: %d, bias: %d\n", n, m, k, has_bias); + exit(EXIT_FAILURE); + } + + // set whether to accumulate (i.e. D += C) or not - note this isn't considered in algorithm selection (?!) + const float alpha = 1.0f, beta = accumulate ? 1.0f : 0.0f; + + // call the matmul + cublasCheck(cublasLtMatmul(cublaslt_handle, operationDesc, + &alpha, a, ALayout, b, BLayout, &beta, d, CLayout, d, DLayout, + &heuristic.algo, cublaslt_workspace, cublaslt_workspace_size, stream)); + + // cleanups + cublasCheck(cublasLtMatmulPreferenceDestroy(preference)); + cublasCheck(cublasLtMatmulDescDestroy(operationDesc)); + cublasCheck(cublasLtMatrixLayoutDestroy(ALayout)); + cublasCheck(cublasLtMatrixLayoutDestroy(BLayout)); + cublasCheck(cublasLtMatrixLayoutDestroy(CLayout)); + cublasCheck(cublasLtMatrixLayoutDestroy(DLayout)); + cudaCheck(cudaGetLastError()); +} + +// small wrapper around matmul_cublaslt for the forward pass (keeping historical order of arguments) +void matmul_forward_cublaslt(floatX* out, + floatX* inp, floatX* weight, floatX* bias, + int B, int T, int C, int OC, cudaStream_t stream, + floatX* pre_gelu=NULL, int gelu_fusion=1) { + // By default only fuse GELU for H100+ as cuBLAS seems to be inefficient for fused GELU on Ada/Ampere (?) + if (gelu_fusion < 1 && pre_gelu) { + matmul_cublaslt(pre_gelu, weight, inp, bias, OC, B*T, C, stream, true, false, 0, 0, 0, 0, false, NULL, false); + gelu_forward(out, pre_gelu, B*T*OC, stream); + } else { + matmul_cublaslt(out, weight, inp, bias, OC, B*T, C, stream, true, false, 0, 0, 0, 0, false, pre_gelu, false); + } +} + +void matmul_backward(floatX* dinp, floatX* dweight, floatX* dbias, + floatX* dout, floatX* inp, floatX* weight, + float* dbias_buffer, + int B, int T, int C, int OC, cudaStream_t stream, + floatX* pre_gelu=NULL, int gelu_fusion=1) { + NVTX_RANGE_FN(); + + // backward to bias, if given, does a += + if (dbias != NULL) { + // Each warp is responsible for 8 * "x128::size" = 64 OCs at BF16 (OC must be a multiple of 64!) + // Block size is 1024 | 768 threads (32|24 warps) and we reduce those values into 1 at the end + + const int block_size = deviceProp.maxThreadsPerMultiProcessor == 1536 ? 768 : 1024; + + dim3 block_dim = {4, 8, (unsigned)block_size/WARP_SIZE}; + const int OC_per_warp = block_dim.y * x128::size; // 64 at BF16 + const int grid_size_x = CEIL_DIV(OC, OC_per_warp); // e.g. 12 horizontal blocks for 768 OCs at BF16 + const int grid_size_y = max(1, deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount / (block_size * grid_size_x)); // full GPU! + + // If we have enough OC that we don't need cross-block reductions, we can skip the bias_buffer accumulation + // and write results directly to the output. 
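+// Illustrative numbers for the geometry above (assuming BF16 so x128::size == 8, and an
+// A100-class GPU with 108 SMs and 2048 max resident threads per SM, hence block_size == 1024):
+//     OC_per_warp = 8 * 8 = 64
+//     grid_size_x = CEIL_DIV(768, 64) = 12              (for OC = 768)
+//     grid_size_y = (2048 * 108) / (1024 * 12) = 18
+// so 12 x 18 = 216 blocks run; because grid_size_y > 1, the partial sums go through
+// dbias_buffer and reduce_add_sum_kernel below instead of being written directly to dbias.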
+ if(grid_size_y == 1) { + matmul_backward_bias_kernel9<<>>(dbias, dout, B, T, OC, False); + cudaCheck(cudaGetLastError()); + } else { + // kernel 9 overwrites temp buffer, so no need to memset + matmul_backward_bias_kernel9<<>>(dbias_buffer, dout, B, T, OC, True); + cudaCheck(cudaGetLastError()); + reduce_add_sum_kernel<<>>(dbias, dbias_buffer, OC, grid_size_y); + cudaCheck(cudaGetLastError()); + } + dbias = NULL; // prevent dbias calculation from also being fused in matmul_cublaslt below (if we enabled fusion) + } + + // backward to input, uses = in the backward pass (set the gradient) + matmul_cublaslt(dinp, weight, dout, NULL, C, B*T, OC, stream, false, false, 0, 0, 0, 0, false, + gelu_fusion >= 2 ? pre_gelu : NULL, true); + + // backward GELU (if it wasn't fused into the matmul above) + if (gelu_fusion < 2 && pre_gelu) { + gelu_backward_inplace(dinp, pre_gelu, B*T*C, stream); + } + + // backward to weight, uses += in the backward pass (accumulate the gradient) by setting alpha=one + matmul_cublaslt(dweight, inp, dout, NULL /*dbias*/, C, OC, B*T, stream, false, true, 0, 0, 0, 0, + true /* accumulate */, NULL, true); +} diff --git a/llmc/mfu.h b/llmc/mfu.h new file mode 100644 index 000000000..1c40b7b7f --- /dev/null +++ b/llmc/mfu.h @@ -0,0 +1,244 @@ +#ifndef MFU_H +#define MFU_H + +#include +#include +#include +#if __has_include() +#define USE_NVML 1 +#include +#else +#define USE_NVML 0 +#endif + +// tied to enum PrecisionMode, in a future refactor make them the same +#define MFUH_PRECISION_FP32 0 +#define MFUH_PRECISION_FP16 1 +#define MFUH_PRECISION_BF16 2 + +#if USE_NVML +inline void nvml_check(nvmlReturn_t status, const char *file, int line) { + if (status != NVML_SUCCESS) { + printf("[NVML ERROR] at file %s:%d:\n%s\n", file, line, nvmlErrorString(status)); + exit(EXIT_FAILURE); + } +}; +#define nvmlCheck(err) (nvml_check(err, __FILE__, __LINE__)) +#endif + + +typedef struct { + float TF_32; // tensor-core performance 32 bit + float BF_16_32; // bf16 with 32 bit accumulate + float FP_16_32; // fp16 with 32 bit accumulate + float FP_16_16; // fp16 with 16 bit accumulate + float FP_8_32; // and so on + float FP_8_16; + float CLOCK; // clock frequency from the spec sheet + float CORES; // #TCs from the spec sheet +} PerfData; + +// basic default data from the nvidia whitepapers +static const PerfData VOLTA = {125.0f, -1.f, 125.f, -1.f, -1.f, -1.f, 1530.f, 640.f}; +static const PerfData AMPERE_DATACENTER = {156.f, 312.f, 312.f, 312.f, -1.f, -1.f, 1410.f, 432.f}; +static const PerfData AMPERE_CONSUMER = {40.f, 80.f, 80.f, 160.f, -1.f, -1.f, 1860.f, 336.f}; +static const PerfData HOPPER = {378.f, 756.f, 756.f, 756.f, 1513.f, 1513.f, 1620.f, 456.f}; +static const PerfData ADA = {82.6f, 165.2f, 165.2f, 330.3f, 330.3f, 660.6f, 2520.f, 512.f}; + +typedef struct { + const char* name; + const PerfData* perf_data; + float new_cores; + float new_mhz; +} GPUEntry; + +// the overrides for each specific GPU +static GPUEntry gpu_db[] = { + {"Tesla V100-SXM2-16GB", &VOLTA, 640, 1530}, + {"Tesla V100-PCIE-32GB", &VOLTA, 640, 1530}, + {"NVIDIA A100-PCIE-40GB", &ERE_DATACENTER, 432, 1410}, + {"NVIDIA A100-PCIE-80GB", &ERE_DATACENTER, 432, 1410}, + {"NVIDIA A100-SXM4-40GB", &ERE_DATACENTER, 432, 1410}, + {"NVIDIA A100-SXM4-80GB", &ERE_DATACENTER, 432, 1410}, + {"NVIDIA RTX A2000", &ERE_CONSUMER, 104, 1200}, + {"NVIDIA RTX A4000", &ERE_CONSUMER, 192, 1560}, + {"NVIDIA RTX A4500", &ERE_CONSUMER, 224, 1650}, + {"NVIDIA RTX A5000", &ERE_CONSUMER, 256, 1695}, + {"NVIDIA RTX A5500", &ERE_CONSUMER, 320, 1770}, 
+ {"NVIDIA RTX A6000", &ERE_CONSUMER, 336, 1800}, + {"NVIDIA GeForce RTX 3090 Ti", &ERE_CONSUMER, 336, 1860}, + {"NVIDIA GeForce RTX 3090", &ERE_CONSUMER, 328, 1695}, + {"NVIDIA GeForce RTX 3080 Ti", &ERE_CONSUMER, 320, 1665}, + {"NVIDIA GeForce RTX 3080", &ERE_CONSUMER, 272, 1710}, + {"NVIDIA GeForce RTX 3070 Ti", &ERE_CONSUMER, 192, 1770}, + {"NVIDIA GeForce RTX 3070", &ERE_CONSUMER, 184, 1725}, + {"NVIDIA GeForce RTX 3060 Ti", &ERE_CONSUMER, 152, 1665}, + {"NVIDIA GeForce RTX 3060", &ERE_CONSUMER, 112, 1777}, + {"NVIDIA RTX A2000 ADA", &ADA, 88, 2130}, + {"NVIDIA RTX A4000 ADA", &ADA, 192, 2175}, + {"NVIDIA RTX A4500 ADA", &ADA, 224, 2580}, + {"NVIDIA RTX A5000 ADA", &ADA, 400, 2550}, + {"NVIDIA RTX A5880 ADA", &ADA, 440, 2460}, + {"NVIDIA RTX A6000 ADA", &ADA, 568, 2505}, + {"NVIDIA GeForce RTX 4090", &ADA, 512, 2520}, + {"NVIDIA GeForce RTX 4080 SUPER", &ADA, 320, 2550}, + {"NVIDIA GeForce RTX 4080", &ADA, 304, 2505}, + {"NVIDIA GeForce RTX 4070 Ti SUPER", &ADA, 264, 2610}, + {"NVIDIA GeForce RTX 4070 Ti", &ADA, 240, 2610}, + {"NVIDIA GeForce RTX 4070 SUPER", &ADA, 224, 2475}, + {"NVIDIA GeForce RTX 4070", &ADA, 184, 2475}, + {"NVIDIA GeForce RTX 4070", &ADA, 184, 2475}, + {"NVIDIA GeForce RTX 4060 Ti", &ADA, 136, 2535}, + {"NVIDIA GeForce RTX 4060", &ADA, 96, 2460}, + {"NVIDIA H100 PCIe", &HOPPER, 456, 1620}, + {"NVIDIA H100 80GB HBM3", &HOPPER, 528, 1830}, // HBM3 = SXM5 +}; + +float get_flops_promised(const char* device, int precision_mode) { + /* + This function is used to estimate the Model Flops Utilization (MFU) + basically we have to figure out how many flops the GPU can do per second. + Note that this is not a simple endeavor and may well go wrong! The details are tricky. + The returned value is in units of 1e12. + + For the non-top models, actual performance numbers aren't that easy to find, e.g., + here https://www.techpowerup.com/gpu-specs/rtx-a4000.c3756, does "Theoretical Performance" + seems to be without tensor cores. + + So, instead we use that all these cards just use the same types of tensor cores in different + numbers and at different frequencies. Then we just need to look up these two easily accesible + numbers for all the other GPUs. + linear scaling seems to work: comparing spec sheet and calculation: + 4080: 304TCs, 2505 GHz; 97.5TFlops = 165.2/512*304 /2520 * 2505 + + Original numbers for the top GPUS are from. + https://resources.nvidia.com/en-us-tensor-core + https://images.nvidia.com/aem-dam/Solutions/geforce/ada/nvidia-ada-gpu-architecture.pdf + */ + + // validate the precision mode as one of the three possible values + if (!(precision_mode == MFUH_PRECISION_FP32 || precision_mode == MFUH_PRECISION_FP16 || precision_mode == MFUH_PRECISION_BF16)) { + fprintf(stderr, "Invalid precision mode: %d\n", precision_mode); + return -1.0f; + } + + // do a linear search until you find our GPU, then calculate the flops promised + int num_gpu_entries = sizeof(gpu_db) / sizeof(gpu_db[0]); + for (int i = 0; i < num_gpu_entries; i++) { + if (strcmp(gpu_db[i].name, device) == 0) { + const PerfData* perf_data = gpu_db[i].perf_data; + + // look up the default flops value for the given precision mode + float value = -1.0f; + if (precision_mode == MFUH_PRECISION_BF16) { value = perf_data->BF_16_32; } + if (precision_mode == MFUH_PRECISION_FP32) { value = perf_data->TF_32; } + if (precision_mode == MFUH_PRECISION_FP16) { value = perf_data->FP_16_32; } + + // we'd get here if we're e.g. trying to use BF16 on Volta GPU or something... 
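+// Worked example of the adjustment performed just below (illustrative): for the RTX 3080
+// entry in the table above, BF16-with-FP32-accumulate starts from the Ampere consumer
+// baseline of 80 TFLOPS at 336 tensor cores and 1860 MHz, so
+//     adjusted = 80 * (272 / 336) * (1710 / 1860) ~= 59.5 TFLOPS
+// which is what this function returns for MFUH_PRECISION_BF16 on that card.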
+ if (value < 0.0f) { + fprintf(stderr, "No data for GPU %s and precision mode %d\n", device, precision_mode); + return -1.0f; + } + + // adjust flops based on the specific core count and clock frequency of this GPU + float new_cores = gpu_db[i].new_cores; + float new_mhz = gpu_db[i].new_mhz; + float adjusted = value * (new_cores / perf_data->CORES) * (new_mhz / perf_data->CLOCK); + return adjusted; + } + } + + return -1.0f; // ¯\_(ツ)_/¯ +} + +struct GPUUtilInfo { + unsigned int clock; + unsigned int max_clock; + unsigned int power; + unsigned int power_limit; + unsigned int fan; + unsigned int temperature; + unsigned int temp_slowdown; + + float gpu_utilization; + float mem_utilization; + const char* throttle_reason; +}; + +// lazily initialize nvml and generate a handle to the GPU +#if USE_NVML +nvmlDevice_t nvml_get_device() { + static bool needs_init = true; + static nvmlDevice_t device; + if(needs_init) { + needs_init = false; + nvmlCheck(nvmlInit()); + nvmlCheck(nvmlDeviceGetHandleByIndex_v2(0, &device)); + } + return device; +} + +// convert throttle reason bitfield into a text reason. +// this is a lossy conversion; we just want to give some idea of what is happening +const char* get_throttle_reason(unsigned long long bits) { + if(bits & (nvmlClocksThrottleReasonSwPowerCap | nvmlClocksThrottleReasonHwPowerBrakeSlowdown)) { + return "power cap"; + } else if (bits & (nvmlClocksThrottleReasonSwThermalSlowdown | nvmlClocksThrottleReasonHwThermalSlowdown)) { + return "thermal cap"; + } else if (bits & (nvmlClocksThrottleReasonAll)) { + return "other cap"; + } else { + return "no cap"; + } +} + +// gather data for a GPUUtilInfo object +GPUUtilInfo get_gpu_utilization_info() { + GPUUtilInfo info; + nvmlDevice_t device = nvml_get_device(); + // query different infos directly + nvmlCheck(nvmlDeviceGetClockInfo(device, NVML_CLOCK_SM, &info.clock)); + nvmlCheck(nvmlDeviceGetMaxClockInfo(device, NVML_CLOCK_SM, &info.max_clock)); + nvmlCheck(nvmlDeviceGetPowerManagementLimit(device, &info.power_limit)); + nvmlCheck(nvmlDeviceGetPowerUsage(device, &info.power)); + nvmlCheck(nvmlDeviceGetTemperature(device, NVML_TEMPERATURE_GPU, &info.temperature)); + nvmlCheck(nvmlDeviceGetTemperatureThreshold(device, NVML_TEMPERATURE_THRESHOLD_SLOWDOWN, &info.temp_slowdown)); + unsigned long long throttle; + nvmlCheck(nvmlDeviceGetCurrentClocksThrottleReasons(device, &throttle)); + info.throttle_reason = get_throttle_reason(throttle); + nvmlCheck(nvmlDeviceGetFanSpeed(device, &info.fan)); + + // for "utilization", we look at recorded samples. In principle, we could query the driver for how many samples + // to request, but then we'd need to dynamically allocate sufficient space. 
Let's just hard-code a limit of 128, + // and have no memory management required + constexpr const int BUFFER_LIMIT = 128; + nvmlSample_t buffer[BUFFER_LIMIT]; + nvmlValueType_t v_type; + unsigned int sample_count = BUFFER_LIMIT; + nvmlCheck(nvmlDeviceGetSamples(device, NVML_GPU_UTILIZATION_SAMPLES, 0, &v_type, &sample_count, buffer)); + float gpu_utilization = 0.f; + for(unsigned i = 0; i < sample_count; ++i) { + gpu_utilization += (float)buffer[i].sampleValue.uiVal; + } + gpu_utilization /= (float)sample_count; + + // sample count may have been modified by the query above; reset back to buffer size + sample_count = BUFFER_LIMIT; + nvmlCheck(nvmlDeviceGetSamples(device, NVML_MEMORY_UTILIZATION_SAMPLES, 0, &v_type, &sample_count, buffer)); + float mem_utilization = 0.f; + for(unsigned i = 0; i < sample_count; ++i) { + mem_utilization += (float)buffer[i].sampleValue.uiVal; + } + mem_utilization /= (float)sample_count; + + info.gpu_utilization = gpu_utilization; + info.mem_utilization = mem_utilization; + return info; +} +#else +GPUUtilInfo get_gpu_utilization_info() { + fprintf(stderr, "Error: Compiled without nvml support. Cannot perform additional GPU state tracking."); + exit(EXIT_FAILURE); +} +#endif +#endif // MFU_H diff --git a/llmc/outlier_detector.h b/llmc/outlier_detector.h new file mode 100644 index 000000000..fb4ded23e --- /dev/null +++ b/llmc/outlier_detector.h @@ -0,0 +1,70 @@ +/* +Simple OutlierDetector that we can use to monitor the loss and grad norm +Internally, it keeps track of a window of measurements and each time we +add a measurement, it returns the z-score of the new value with respect to +the window of measurements. This can be used to detect outliers in the data. + +We use double so that the detector doesn't drift too much, because we +update the mean and variance with += on each step for efficiency. We could +reconsider this choice in the future, as the compute cost here is minimal. 
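+
+A usage sketch (illustrative; the variable names and the 6-sigma threshold are examples,
+not something this header prescribes):
+
+    OutlierDetector loss_detector;
+    init_detector(&loss_detector);
+    // once per training step:
+    double zloss = update_detector(&loss_detector, (double)mean_loss);
+    if (!isnan(zloss) && fabs(zloss) > 6.0) {
+        // the loss is a >6 sigma outlier w.r.t. the last 128 steps; e.g. log it or skip the update
+    }
+
+update_detector returns NaN until the 128-entry window has filled up.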
+*/ + +#include +#include + +// use compile-time constant for window size to avoid dynamic memory allocations +#define OUTLIER_DETECTOR_WINDOW_SIZE 128 + +typedef struct { + double buffer[OUTLIER_DETECTOR_WINDOW_SIZE]; + int count; + int index; + double sum; + double sum_sq; +} OutlierDetector; + +void init_detector(OutlierDetector *detector) { + for (int i = 0; i < OUTLIER_DETECTOR_WINDOW_SIZE; i++) { + detector->buffer[i] = 0.0; + } + detector->count = 0; + detector->index = 0; + detector->sum = 0.0; + detector->sum_sq = 0.0; +} + +double update_detector(OutlierDetector *detector, double new_value) { + + if (detector->count < OUTLIER_DETECTOR_WINDOW_SIZE) { + // here we are still building up a window of observations + detector->buffer[detector->count] = new_value; + detector->sum += new_value; + detector->sum_sq += new_value * new_value; + detector->count++; + return nan(""); // not enough data yet + + } else { + // we've filled the window, so now we can start detecting outliers + + // pop the oldest value from the window + double old_value = detector->buffer[detector->index]; + detector->sum -= old_value; + detector->sum_sq -= old_value * old_value; + // push the new value into the window + detector->buffer[detector->index] = new_value; + detector->sum += new_value; + detector->sum_sq += new_value * new_value; + // move the index to the next position + detector->index = (detector->index + 1) % OUTLIER_DETECTOR_WINDOW_SIZE; + // calculate the z-score of the new value + double mean = detector->sum / OUTLIER_DETECTOR_WINDOW_SIZE; + double variance = (detector->sum_sq / OUTLIER_DETECTOR_WINDOW_SIZE) - (mean * mean); + double std_dev = sqrt(variance); + if (std_dev == 0.0) { + return 0.0; + } + double z = (new_value - mean) / std_dev; + + return z; + } +} diff --git a/llmc/rand.h b/llmc/rand.h index ba13de9e4..b66aa04b7 100644 --- a/llmc/rand.h +++ b/llmc/rand.h @@ -163,15 +163,15 @@ void uniform_(float* data, unsigned int numel, float from, float to, mt19937_sta } } -// Box�Muller transform - -void normal_fill_16(float* data, float mean, float std, mt19937_state* state) { - #define EPSILONE 1e-12 +// Box-Muller transform: maps uniform random numbers to Gaussian distributed numbers +// https://en.wikipedia.org/wiki/Box%E2%80%93Muller_transform +void normal_fill_16(float* data, float mean, float std) { + #define EPSILONE 1e-12f for (unsigned int t = 0; t < 8; t++) { float u1 = 1 - data[t]; float u2 = data[t + 8]; float radius = sqrtf(-2 * logf(u1 + EPSILONE)); - float theta = 2.0 * M_PI * u2; + float theta = (float) (2.0 * M_PI * u2); data[t] = (radius * cosf(theta) * std + mean); data[t + 8] = (radius * sinf(theta) * std + mean); } @@ -182,7 +182,7 @@ void normal_fill(float* data, unsigned int numel, float mean, float std, mt19937 data[t] = randfloat32(state); } for (unsigned int i = 0; i < numel - 15; i += 16) { - normal_fill_16(data + i, mean, std, state); + normal_fill_16(data + i, mean, std); } if (numel % 16 != 0) { // recompute the last 16 values @@ -190,12 +190,12 @@ void normal_fill(float* data, unsigned int numel, float mean, float std, mt19937 for (unsigned int i = 0; i < 16; i++) { data[i] = randfloat32(state); } - normal_fill_16(data, mean, std, state); + normal_fill_16(data, mean, std); } } void normal_(float* data, unsigned int numel, float mean, float std, mt19937_state* state) { - #define EPSILONE 1e-12 + #define EPSILONE 1e-12f if (numel >= 16) { normal_fill(data, numel, mean, std, state); } @@ -209,10 +209,10 @@ void normal_(float* data, unsigned int numel, float mean, 
float std, mt19937_sta continue; } // for numel < 16 we draw a double (float64) - float u1 = randfloat64(state); - float u2 = randfloat64(state); + float u1 = (float) randfloat64(state); + float u2 = (float) randfloat64(state); float radius = sqrtf(-2 * logf(1 - u2 + EPSILONE)); - float theta = 2.0 * M_PI * u1; + float theta = (float) (2.0 * M_PI * u1); next_double_normal_sample = radius * sinf(theta); has_next_double_normal_sample = 1; data[t] = (radius * cosf(theta) * std + mean); @@ -220,4 +220,21 @@ void normal_(float* data, unsigned int numel, float mean, float std, mt19937_sta } } +void init_identity_permutation(int *data, int numel) { + for (int i = 0; i < numel; i++) { + data[i] = i; + } +} + +void random_permutation(int* data, int numel, mt19937_state* state) { + for (int i = numel - 1; i > 0; i--) { + // pick an index j in [0, i] with equal probability + int j = randint32(state) % (i + 1); + // swap i <-> j + int tmp = data[i]; + data[i] = data[j]; + data[j] = tmp; + } +} + #endif \ No newline at end of file diff --git a/llmc/sampler.h b/llmc/sampler.h index 49451baa5..b4c8d6434 100644 --- a/llmc/sampler.h +++ b/llmc/sampler.h @@ -6,6 +6,19 @@ Implements a simple Sampler, used during model inference to sample tokens. #include +// Simple xorshift RNG +unsigned int random_u32(unsigned long long *state) { + // xorshift rng: https://en.wikipedia.org/wiki/Xorshift#xorshift.2A + *state ^= *state >> 12; + *state ^= *state << 25; + *state ^= *state >> 27; + return (*state * 0x2545F4914F6CDD1Dull) >> 32; +} + +float random_f32(unsigned long long *state) { // random float32 in [0,1) + return (random_u32(state) >> 8) / 16777216.0f; +} + int sample_softmax(const float* logits, int n, float coin) { // sample index from logits (converted to probabilities using softmax) // coin is a random number in [0, 1), usually from random_f32() diff --git a/llmc/schedulers.h b/llmc/schedulers.h new file mode 100644 index 000000000..9ddc570d1 --- /dev/null +++ b/llmc/schedulers.h @@ -0,0 +1,100 @@ +/* +Implements various learning rate schedulers. 
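+
+A usage sketch (illustrative values; the API is defined below):
+
+    LearningRateScheduler scheduler;
+    lr_scheduler_init(&scheduler, "cosine", 3e-4f, 700, train_num_batches, 0.1f);
+    for (int step = 0; step < train_num_batches; step++) {
+        float lr = get_learning_rate(&scheduler, step);
+        // ... pass lr to the optimizer update for this step ...
+    }
+
+With these numbers the learning rate ramps linearly up to 3e-4 over the first 700 steps and
+then follows a cosine decay down to 3e-5 (= 3e-4 * 0.1) at the final step.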
+*/ +#ifndef SCHEDULERS_H +#define SCHEDULERS_H + +#include +#include +#include + +typedef struct { + const char* type; + float learning_rate; + int warmup_iterations; + int train_num_batches; + float final_learning_rate_frac; +} LearningRateScheduler; + +void lr_scheduler_init(LearningRateScheduler *scheduler, const char* scheduler_type, float learning_rate, int warmup_iterations, int train_num_batches, float final_learning_rate_frac) { + scheduler->type = scheduler_type; + scheduler->learning_rate = learning_rate; + scheduler->warmup_iterations = warmup_iterations; + scheduler->train_num_batches = train_num_batches; + scheduler->final_learning_rate_frac = final_learning_rate_frac; +} + +// cosine: warmup linearly to max LR, then cosine decay to LR * final_learning_rate_frac +float get_learning_rate_cosine(LearningRateScheduler *scheduler, int step) { + float lr = scheduler->learning_rate; + if (step < scheduler->warmup_iterations) { + lr = scheduler->learning_rate * ((float)(step + 1)) / scheduler->warmup_iterations; + } else { + float decay_ratio = ((float)(step - scheduler->warmup_iterations)) / (scheduler->train_num_batches - scheduler->warmup_iterations); + assert(0.0f <= decay_ratio && decay_ratio <= 1.0f); + float coeff = 0.5f * (1.0f + cosf(M_PI * decay_ratio)); // coeff starts at 1 and goes to 0 + assert(0.0f <= coeff && coeff <= 1.0f); + float min_lr = scheduler->learning_rate * scheduler->final_learning_rate_frac; + lr = min_lr + coeff * (scheduler->learning_rate - min_lr); + } + return lr; +} + +// linear: warmup linearly to max LR, then decay linearly to LR * final_learning_rate_frac +float get_learning_rate_linear(LearningRateScheduler *scheduler, int step) { + float lr = scheduler->learning_rate; + if (step < scheduler->warmup_iterations) { + lr = scheduler->learning_rate * ((float)(step + 1)) / scheduler->warmup_iterations; + } else { + float decay_ratio = ((float)(step - scheduler->warmup_iterations)) / (scheduler->train_num_batches - scheduler->warmup_iterations); + assert(0.0f <= decay_ratio && decay_ratio <= 1.0f); + float min_lr = scheduler->learning_rate * scheduler->final_learning_rate_frac; + lr = scheduler->learning_rate - decay_ratio * (scheduler->learning_rate - min_lr); + } + return lr; +} + +// constant +float get_learning_rate_constant(LearningRateScheduler *scheduler, int step) { + return scheduler->learning_rate; +} + +// wsd schedule: warmup linearly, keep constant, last 20% decay using 1 - sqrt decay to final_frac (should be 0.0) +// https://arxiv.org/abs/2405.18392 +float get_learning_rate_wsd(LearningRateScheduler *scheduler, int step) { + int decay_point = (int)(0.8f * scheduler->train_num_batches); + float max_lr = scheduler->learning_rate; + float lr = max_lr; + if (step < scheduler->warmup_iterations) { + float decay_ratio = ((float)(step + 1)) / scheduler->warmup_iterations; + lr = max_lr * decay_ratio; + } else if (step < decay_point) { + // noop, keep lr constant + } else { + float decay_ratio = ((float)(step - decay_point)) / (scheduler->train_num_batches - decay_point); + assert(0.0f <= decay_ratio && decay_ratio <= 1.0f); + float min_lr = max_lr * scheduler->final_learning_rate_frac; + return min_lr + (1.0f - sqrtf(decay_ratio)) * (max_lr - min_lr); + } + return lr; +} + +// return the learning rate at a given step +float get_learning_rate(LearningRateScheduler *scheduler, int step) { + float step_learning_rate; + if (strcmp(scheduler->type, "cosine") == 0) { + step_learning_rate = get_learning_rate_cosine(scheduler, step); + } else if 
(strcmp(scheduler->type, "linear") == 0) { + step_learning_rate = get_learning_rate_linear(scheduler, step); + } else if (strcmp(scheduler->type, "constant") == 0) { + step_learning_rate = get_learning_rate_constant(scheduler, step); + } else if (strcmp(scheduler->type, "wsd") == 0) { + step_learning_rate = get_learning_rate_wsd(scheduler, step); + } else { + fprintf(stderr, "Unknown learning rate scheduler type: %s\n", scheduler->type); + exit(EXIT_FAILURE); + } + return step_learning_rate; +} + +#endif // SCHEDULERS_H diff --git a/llmc/tokenizer.h b/llmc/tokenizer.h index 21e55901e..3c3f1a12d 100644 --- a/llmc/tokenizer.h +++ b/llmc/tokenizer.h @@ -90,7 +90,7 @@ const char *tokenizer_decode(Tokenizer *tokenizer, uint32_t token_id) { if (token_id < tokenizer->vocab_size) { return tokenizer->token_table[token_id]; } else { - printf("invalid token id %d!\n", token_id); + printf("invalid token id %u!\n", token_id); return NULL; } } diff --git a/llmc/utils.h b/llmc/utils.h index f89c1d25a..775534ccd 100644 --- a/llmc/utils.h +++ b/llmc/utils.h @@ -7,12 +7,15 @@ #ifndef UTILS_H #define UTILS_H +#include +#include #include #include #include // implementation of dirent for Windows is in dev/unistd.h #ifndef _WIN32 #include +#include #endif // ---------------------------------------------------------------------------- @@ -20,7 +23,7 @@ // simple replace fopen, fread, fclose, fseek // with fopenCheck, freadCheck, fcloseCheck, fseekCheck -FILE *fopen_check(const char *path, const char *mode, const char *file, int line) { +extern inline FILE *fopen_check(const char *path, const char *mode, const char *file, int line) { FILE *fp = fopen(path, mode); if (fp == NULL) { fprintf(stderr, "Error: Failed to open file '%s' at %s:%d\n", path, file, line); @@ -38,7 +41,7 @@ FILE *fopen_check(const char *path, const char *mode, const char *file, int line #define fopenCheck(path, mode) fopen_check(path, mode, __FILE__, __LINE__) -void fread_check(void *ptr, size_t size, size_t nmemb, FILE *stream, const char *file, int line) { +extern inline void fread_check(void *ptr, size_t size, size_t nmemb, FILE *stream, const char *file, int line) { size_t result = fread(ptr, size, nmemb, stream); if (result != nmemb) { if (feof(stream)) { @@ -60,7 +63,7 @@ void fread_check(void *ptr, size_t size, size_t nmemb, FILE *stream, const char #define freadCheck(ptr, size, nmemb, stream) fread_check(ptr, size, nmemb, stream, __FILE__, __LINE__) -void fclose_check(FILE *fp, const char *file, int line) { +extern inline void fclose_check(FILE *fp, const char *file, int line) { if (fclose(fp) != 0) { fprintf(stderr, "Error: Failed to close file at %s:%d\n", file, line); fprintf(stderr, "Error details:\n"); @@ -72,7 +75,33 @@ void fclose_check(FILE *fp, const char *file, int line) { #define fcloseCheck(fp) fclose_check(fp, __FILE__, __LINE__) -void fseek_check(FILE *fp, long off, int whence, const char *file, int line) { +extern inline void sclose_check(int sockfd, const char *file, int line) { + if (close(sockfd) != 0) { + fprintf(stderr, "Error: Failed to close socket at %s:%d\n", file, line); + fprintf(stderr, "Error details:\n"); + fprintf(stderr, " File: %s\n", file); + fprintf(stderr, " Line: %d\n", line); + exit(EXIT_FAILURE); + } +} + +#define scloseCheck(sockfd) sclose_check(sockfd, __FILE__, __LINE__) + +#ifdef _WIN32 +extern inline void closesocket_check(int sockfd, const char *file, int line) { + if (closesocket(sockfd) != 0) { + fprintf(stderr, "Error: Failed to close socket at %s:%d\n", file, line); + fprintf(stderr, 
"Error details:\n"); + fprintf(stderr, " File: %s\n", file); + fprintf(stderr, " Line: %d\n", line); + exit(EXIT_FAILURE); + } +} + +#define closesocketCheck(sockfd) closesocket_check(sockfd, __FILE__, __LINE__) +#endif + +extern inline void fseek_check(FILE *fp, long off, int whence, const char *file, int line) { if (fseek(fp, off, whence) != 0) { fprintf(stderr, "Error: Failed to seek in file at %s:%d\n", file, line); fprintf(stderr, "Error details:\n"); @@ -86,10 +115,32 @@ void fseek_check(FILE *fp, long off, int whence, const char *file, int line) { #define fseekCheck(fp, off, whence) fseek_check(fp, off, whence, __FILE__, __LINE__) +extern inline void fwrite_check(void *ptr, size_t size, size_t nmemb, FILE *stream, const char *file, int line) { + size_t result = fwrite(ptr, size, nmemb, stream); + if (result != nmemb) { + if (feof(stream)) { + fprintf(stderr, "Error: Unexpected end of file at %s:%d\n", file, line); + } else if (ferror(stream)) { + fprintf(stderr, "Error: File write error at %s:%d\n", file, line); + } else { + fprintf(stderr, "Error: Partial write at %s:%d. Expected %zu elements, wrote %zu\n", + file, line, nmemb, result); + } + fprintf(stderr, "Error details:\n"); + fprintf(stderr, " File: %s\n", file); + fprintf(stderr, " Line: %d\n", line); + fprintf(stderr, " Expected elements: %zu\n", nmemb); + fprintf(stderr, " Written elements: %zu\n", result); + exit(EXIT_FAILURE); + } +} + +#define fwriteCheck(ptr, size, nmemb, stream) fwrite_check(ptr, size, nmemb, stream, __FILE__, __LINE__) + // ---------------------------------------------------------------------------- // malloc error-handling wrapper util -void *malloc_check(size_t size, const char *file, int line) { +extern inline void *malloc_check(size_t size, const char *file, int line) { void *ptr = malloc(size); if (ptr == NULL) { fprintf(stderr, "Error: Memory allocation failed at %s:%d\n", file, line); @@ -104,10 +155,29 @@ void *malloc_check(size_t size, const char *file, int line) { #define mallocCheck(size) malloc_check(size, __FILE__, __LINE__) + +// ---------------------------------------------------------------------------- +// check that all tokens are within range +extern inline void token_check(const int* tokens, int token_count, int vocab_size, const char *file, int line) { + for(int i = 0; i < token_count; i++) { + if(!(0 <= tokens[i] && tokens[i] < vocab_size)) { + fprintf(stderr, "Error: Token out of vocabulary at %s:%d\n", file, line); + fprintf(stderr, "Error details:\n"); + fprintf(stderr, " File: %s\n", file); + fprintf(stderr, " Line: %d\n", line); + fprintf(stderr, " Token: %d\n", tokens[i]); + fprintf(stderr, " Position: %d\n", i); + fprintf(stderr, " Vocab: %d\n", vocab_size); + exit(EXIT_FAILURE); + } + } +} +#define tokenCheck(tokens, count, vocab) token_check(tokens, count, vocab, __FILE__, __LINE__) + // ---------------------------------------------------------------------------- // I/O ops -void create_dir_if_not_exists(const char *dir) { +extern inline void create_dir_if_not_exists(const char *dir) { if (dir == NULL) { return; } struct stat st = {0}; if (stat(dir, &st) == -1) { @@ -119,7 +189,7 @@ void create_dir_if_not_exists(const char *dir) { } } -int find_max_step(const char* output_log_dir) { +extern inline int find_max_step(const char* output_log_dir) { // find the DONE file in the log dir with highest step count if (output_log_dir == NULL) { return -1; } DIR* dir; @@ -139,4 +209,15 @@ int find_max_step(const char* output_log_dir) { return max_step; } +extern inline int 
ends_with_bin(const char* str) { + // checks if str ends with ".bin". could be generalized in the future. + if (str == NULL) { return 0; } + size_t len = strlen(str); + const char* suffix = ".bin"; + size_t suffix_len = strlen(suffix); + if (len < suffix_len) { return 0; } + int suffix_matches = strncmp(str + len - suffix_len, suffix, suffix_len) == 0; + return suffix_matches; +} + #endif \ No newline at end of file diff --git a/llmc/zero.cuh b/llmc/zero.cuh new file mode 100644 index 000000000..e6c5b6e7c --- /dev/null +++ b/llmc/zero.cuh @@ -0,0 +1,597 @@ +/* +Utilities for ZeRO sharding +*/ + +#ifndef LLMC_ZERO_CUH +#define LLMC_ZERO_CUH + +#include +#include +#include +#include +#include + +#ifdef MULTI_GPU +#include +#ifdef USE_MPI +#include +#endif +#endif + +// defines: fcloseCheck, fwriteCheck, scloseCheck, sclosesocketCheck +#include "utils.h" + +// ---------------------------------------------------------------------------- +// Multi-GPU related +#ifdef MULTI_GPU + +#if defined(ENABLE_FP32) +const ncclDataType_t ncclFloatX = ncclFloat; +#elif defined(ENABLE_FP16) +const ncclDataType_t ncclFloatX = ncclHalf; +#else // Default to bfloat16 +const ncclDataType_t ncclFloatX = ncclBfloat16; +#endif + +void nccl_check(ncclResult_t status, const char *file, int line) { + if (status != ncclSuccess) { + printf("[NCCL ERROR] at file %s:%d:\n%s\n", file, line, ncclGetErrorString(status)); + exit(EXIT_FAILURE); + } +} +#define ncclCheck(err) (nccl_check(err, __FILE__, __LINE__)) + +#ifdef USE_MPI +void mpi_check(int status, const char *file, int line) { + if (status != MPI_SUCCESS) { + char mpi_error[4096]; + int mpi_error_len = 0; + assert(MPI_Error_string(status, &mpi_error[0], &mpi_error_len) == MPI_SUCCESS); + printf("[MPI ERROR] at file %s:%d:\n%.*s\n", file, line, mpi_error_len, mpi_error); + exit(EXIT_FAILURE); + } +} +#define mpiCheck(err) (mpi_check(err, __FILE__, __LINE__)) +#endif + +#endif // MULTI_GPU + +// ---------------------------------------------------------------------------- +// Parameters specific to training on multiple GPUs. +typedef struct { + int process_rank; // Rank of this process among all processes. 0 if no multi-GPU. + int num_processes; // Total number of processes. 1 if no multi-GPU. + int local_device_idx; // This process GPU index on current machine. 0 if no multi-GPU. + + // Zero Redundancy Optimizer stage - https://fairscale.readthedocs.io/en/stable/deep_dive/oss_sdp_fsdp.html + // 0-Disabled + // 1-Optimizer State Sharding (OSS) + // 2-Optimizer + Gradient State Sharding (SDP) + // 3-Optimizer + Gradient + Horizontal Model Sharding (FSDP) + int zero_stage; + size_t shard_num_parameters; +#ifdef MULTI_GPU + ncclComm_t nccl_comm; // NCCL communication primitive, used for collective multi-GPU work. + cudaStream_t nccl_stream; // CUDA Stream to perform NCCL operations. 
+ cudaEvent_t compute_nccl_sync; // Event used to synchronize NCCL with the compute + float* unified_buffer; +#endif +} MultiGpuConfig; + +// one global variable to hold the multi-GPU configuration for this process +// inline, so we can include this header multiple times without getting multiple definitions +inline MultiGpuConfig multi_gpu_config; + +#ifdef MULTI_GPU + +#ifdef _WIN32 +void send_nccl_id_to_clients_windows(ncclUniqueId *nccl_id, SOCKET client_sockets[], int num_clients) { + for (int i = 0; i < num_clients; ++i) { + if (send(client_sockets[i], (const char *)nccl_id, sizeof(*nccl_id), 0) == SOCKET_ERROR) { + printf("Failed to send nccl_id"); + WSACleanup(); + exit(EXIT_FAILURE); + } + closesocketCheck(client_sockets[i]); + } +} +#else +void send_nccl_id_to_clients(ncclUniqueId *nccl_id, int client_sockets[], int num_clients) { + for (int i = 0; i < num_clients; ++i) { + if (send(client_sockets[i], nccl_id, sizeof(*nccl_id), 0) == -1) { + printf("Failed to send nccl_id"); + exit(EXIT_FAILURE); + } + scloseCheck(client_sockets[i]); + } +} +#endif + +#ifdef _WIN32 +// Same as get_nccl_id_via_tcp but for Windows +ncclUniqueId get_nccl_id_via_tcp_windows(MultiGpuConfig* result, const char* server_ip) { + ncclUniqueId nccl_id; + + int SERVER_PORT = 12345; // hardcoded an arbitrary port number between 1024 and 49151 (registered ports) + WSADATA wsaData; + if (WSAStartup(MAKEWORD(2, 2), &wsaData) != 0) { + printf("WSAStartup failed"); + exit(EXIT_FAILURE); + } + + if (result->process_rank == 0) { + ncclCheck(ncclGetUniqueId(&nccl_id)); + + int MAX_CLIENTS = result->num_processes - 1; + SOCKET client_sockets[MAX_CLIENTS]; + int num_clients = 0; + SOCKET server_socket, new_socket; + struct sockaddr_in address; + int addrlen = sizeof(address); + + // Step 1) create a server TCP socket + if ((server_socket = socket(AF_INET, SOCK_STREAM, 0)) == INVALID_SOCKET) { + printf("Socket failed"); + WSACleanup(); + exit(EXIT_FAILURE); + } + + // Step 2) set the server address and port + address.sin_family = AF_INET; // IPv4 + address.sin_addr.s_addr = inet_addr(server_ip); + address.sin_port = htons(SERVER_PORT); + + // Step 3) bind the socket to the address and port + if (bind(server_socket, (struct sockaddr *)&address, sizeof(address)) == SOCKET_ERROR) { + printf("Bind failed"); + closesocketCheck(server_socket); + WSACleanup(); + exit(EXIT_FAILURE); + } + + // Step 4) MAX_CLIENTS specifies the maximum number of clients that can be queued for this server + if (listen(server_socket, MAX_CLIENTS) == SOCKET_ERROR) { + printf("Listen failed"); + closesocketCheck(server_socket); + WSACleanup(); + exit(EXIT_FAILURE); + } + + // Step 5) accept connections from clients + printf("Waiting for clients to connect...\n"); + while (num_clients < MAX_CLIENTS) { + if ((new_socket = accept(server_socket, (struct sockaddr *)&address, &addrlen)) == INVALID_SOCKET) { + printf("Accept failed"); + closesocketCheck(server_socket); + WSACleanup(); + exit(EXIT_FAILURE); + } + client_sockets[num_clients++] = new_socket; + printf("Client %d connected\n", num_clients); + } + + // Step 6) send the NCCL ID to all clients + send_nccl_id_to_clients_windows(&nccl_id, client_sockets, num_clients); + printf("NCCL ID sent to all clients\n"); + + closesocketCheck(server_socket); + } else { + int num_connection_attempts = 5; + int time_to_sleep = 2; + SOCKET client_socket; + struct sockaddr_in serv_addr; + + // Step 1) create a client TCP socket + if ((client_socket = socket(AF_INET, SOCK_STREAM, 0)) == INVALID_SOCKET) { + 
printf("Socket creation error"); + WSACleanup(); + exit(EXIT_FAILURE); + } + + // Step 2) set the server address and port + serv_addr.sin_family = AF_INET; + serv_addr.sin_port = htons(SERVER_PORT); + if (inet_pton(AF_INET, server_ip, &serv_addr.sin_addr) <= 0) { + printf("Invalid address or address not supported"); + closesocketCheck(client_socket); + WSACleanup(); + exit(EXIT_FAILURE); + } + + // Step 3) Try to connect to the server - retry up to `num_connection_attempts` times if the connection fails + while (connect(client_socket, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) == SOCKET_ERROR) { + printf("%d Connection failed, retrying in %d seconds\n", result->process_rank, time_to_sleep); + if (--num_connection_attempts == 0) { + printf("Failed to connect to the server\n"); + closesocketCheck(client_socket); + WSACleanup(); + exit(EXIT_FAILURE); + } + Sleep(time_to_sleep * 1000); + } + + // Step 4) receive the NCCL ID from the server + if (recv(client_socket, (char *)&nccl_id, sizeof(nccl_id), 0) <= 0) { + printf("Failed to receive nccl_id"); + closesocketCheck(client_socket); + WSACleanup(); + exit(EXIT_FAILURE); + } + + printf("Received NCCL ID\n"); + closesocketCheck(client_socket); + } + + WSACleanup(); + return nccl_id; +} +#else +ncclUniqueId get_nccl_id_via_tcp(MultiGpuConfig* result, const char* server_ip) { + ncclUniqueId nccl_id; + + int SERVER_PORT = 12345; // hardcoded an arbitrary port number between 1024 and 49151 (registered ports) + if (result->process_rank == 0) { + ncclCheck(ncclGetUniqueId(&nccl_id)); + + int MAX_CLIENTS = result->num_processes - 1; + int client_sockets[MAX_CLIENTS]; + int num_clients = 0; + int server_socket, new_socket; + struct sockaddr_in address; + int addrlen = sizeof(address); + int opt = 1; + + // Step 1) create a server TCP socket + if ((server_socket = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + printf("Socket failed"); + exit(EXIT_FAILURE); + } + + // Step 2) set socket options + // SOL_SOCKET - means that option is configured at socket level + // SO_REUSEADDR - allows to bind to an address which is in a TIME_WAIT state (already used by another socket) - useful when restarting the server + // SO_REUSEPORT - allows to bind to the same port multiple times + if (setsockopt(server_socket, SOL_SOCKET, SO_REUSEADDR | SO_REUSEPORT, &opt, sizeof(opt)) < 0) { + printf("Setsockopt failed"); + exit(EXIT_FAILURE); + } + + // Step 3) set the server address and port + address.sin_family = AF_INET; // IPv4 + address.sin_addr.s_addr = inet_addr(server_ip); // alternatively use INADDR_ANY to bind to all interfaces, currently we only allow ethernet + address.sin_port = htons(SERVER_PORT); + + // Step 4) bind the socket to the address and port + if (bind(server_socket, (struct sockaddr *)&address, sizeof(address)) < 0) { + printf("Bind failed"); + exit(EXIT_FAILURE); + } + + // Step 5) MAX_CLIENTS specifies the maximum number of clients that can be queued for this server + if (listen(server_socket, MAX_CLIENTS) < 0) { + printf("Listen failed"); + exit(EXIT_FAILURE); + } + + // Step 6) accept connections from clients + printf("Waiting for clients to connect...\n"); + while (num_clients < MAX_CLIENTS) { + if ((new_socket = accept(server_socket, (struct sockaddr *)&address, (socklen_t*)&addrlen)) < 0) { + printf("Accept failed"); + exit(EXIT_FAILURE); + } + client_sockets[num_clients++] = new_socket; + printf("Client %d connected\n", num_clients); + } + + // Step 7) send the NCCL ID to all clients + send_nccl_id_to_clients(&nccl_id, client_sockets, 
num_clients); + printf("NCCL ID sent to all clients\n"); + + scloseCheck(server_socket); + } else { + int num_connection_attempts = 5; + int time_to_sleep = 2; + int client_socket; + struct sockaddr_in serv_addr; + + // Step 1) create a client TCP socket + if ((client_socket = socket(AF_INET, SOCK_STREAM, 0)) < 0) { + printf("Socket creation error"); + exit(EXIT_FAILURE); + } + + // Step 2) set the server address and port + serv_addr.sin_family = AF_INET; + serv_addr.sin_port = htons(SERVER_PORT); + if (inet_pton(AF_INET, server_ip, &serv_addr.sin_addr) <= 0) { + printf("Invalid address or address not supported"); + exit(EXIT_FAILURE); + } + + // Step 3) Try to connect to the server - retry up to `num_connection_attempts` times if the connection fails + while (connect(client_socket, (struct sockaddr *)&serv_addr, sizeof(serv_addr)) < 0) { + printf("%d Connection failed, retrying in %d seconds\n", result->process_rank, time_to_sleep); + if (--num_connection_attempts == 0) { + printf("Failed to connect to the server\n"); + exit(EXIT_FAILURE); + } + sleep(time_to_sleep); + } + + // Step 4) receive the NCCL ID from the server + if (recv(client_socket, &nccl_id, sizeof(nccl_id), 0) <= 0) { + printf("Failed to receive nccl_id"); + exit(EXIT_FAILURE); + } + + printf("Received NCCL ID\n"); + scloseCheck(client_socket); + } + + return nccl_id; +} +#endif + +ncclUniqueId get_nccl_id_via_fs(MultiGpuConfig* result, char* fs_path) { + // Works assuming that the filesystem is shared among all processes + ncclUniqueId nccl_id; + FILE* idFile; + static char filename[1024]; + snprintf(filename, sizeof(filename), "%s/ncclUniqueId.sync", fs_path); + + if (result->process_rank != 0) { // client processse should wait for the server to write to the file + // This is a naive and not 100% robust way to synchronize the processes but it should work almost always + sleep(2); + } + + if (result->process_rank == 0) { + ncclCheck(ncclGetUniqueId(&nccl_id)); + idFile = fopen(filename, "wb"); + assert(idFile != NULL); + fwriteCheck(&nccl_id, sizeof(nccl_id), 1, idFile); + fcloseCheck(idFile); + } else { + // Other ranks wait until the file is available and read the unique ID + do { + sleep(1); // 1 second + idFile = fopen(filename, "rb"); + if (idFile != NULL) break; + } while (idFile == NULL); + freadCheck(&nccl_id, sizeof(nccl_id), 1, idFile); + fcloseCheck(idFile); + } + + return nccl_id; +} + +#ifdef USE_MPI +// Determine which GPU this process should use. +// Processes on the same machines use different GPU indicies. Processes on other machines don't. +// Copied from NCCL examples: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/examples.html#example-2-one-device-per-process-or-thread +int multi_gpu_get_local_device_idx(int process_rank, int num_processes) { + char hostname[1024]; + hostname[1023] = '\0'; + // All processes on the same machine will share the same hostname. + gethostname(hostname, 1023); + for (int i=0; i < 1024; i++) { + if (hostname[i] == '.') { + hostname[i] = '\0'; + break; + } + } + uint64_t hostname_hash = 5381u; + for (int c = 0; hostname[c] != '\0'; c++){ hostname_hash = ((hostname_hash << 5u) + hostname_hash) ^ hostname[c]; } + + // Distribute all hostname hashes to all processes. 
+ uint64_t* all_hostsname_hashes = (uint64_t*)malloc(num_processes * sizeof(uint64_t)); + all_hostsname_hashes[process_rank] = hostname_hash; + mpiCheck(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, all_hostsname_hashes, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD)); + + // Identify which GPU we need to use. + int local_device_idx = 0; + for (int current_process = 0; current_process < num_processes; ++current_process) { + if (current_process == process_rank) { + // Found my gpu, local_device_idx now has my target GPU index. + break; + } + if (all_hostsname_hashes[current_process] == all_hostsname_hashes[process_rank]) { + // This process ID runs on the same machine, but it's not me, skip this GPU + local_device_idx++; + } + } + + free(all_hostsname_hashes); + return local_device_idx; +} +#endif + +#endif + +MultiGpuConfig multi_gpu_config_init(int num_processes, int process_rank, int gpus_per_node, char* server_ip, char* fs_path, char* init_method) { +#ifdef MULTI_GPU + MultiGpuConfig result; + ncclUniqueId nccl_id; + // Get nccl_id using MPI, TCP, or FS (file system synchronization) methods + // On newer slurm versions (slurm-wlm package) PMIx is disabled so we can not use MPI for NCCL init in multi node setup + if (strcmp(init_method, "mpi") == 0) { + #ifdef USE_MPI + mpiCheck(MPI_Init(NULL, NULL)); + mpiCheck(MPI_Comm_rank(MPI_COMM_WORLD, &result.process_rank)); + mpiCheck(MPI_Comm_size(MPI_COMM_WORLD, &result.num_processes)); + result.local_device_idx = multi_gpu_get_local_device_idx(result.process_rank, result.num_processes); + if (result.process_rank == 0) { + ncclCheck(ncclGetUniqueId(&nccl_id)); + } + mpiCheck(MPI_Bcast(&nccl_id, sizeof(nccl_id), MPI_BYTE, 0, MPI_COMM_WORLD)); + #else + printf("MPI support is disabled. Please enable MPI support to use MPI-based NCCL-init method.\n"); + exit(EXIT_FAILURE); + #endif + } else { + result.process_rank = process_rank; + result.num_processes = num_processes; + result.local_device_idx = process_rank % gpus_per_node; + if (strcmp(init_method, "tcp") == 0) { + #ifdef _WIN32 + nccl_id = get_nccl_id_via_tcp_windows(&result, server_ip); + #else + nccl_id = get_nccl_id_via_tcp(&result, server_ip); + #endif + } else if (strcmp(init_method, "fs") == 0) { + nccl_id = get_nccl_id_via_fs(&result, fs_path); + } else { + printf("Invalid NCCL-init method\n"); + exit(EXIT_FAILURE); + } + } + cudaCheck(cudaSetDevice(result.local_device_idx)); + ncclCheck(ncclCommInitRank(&result.nccl_comm, result.num_processes, nccl_id, result.process_rank)); + cudaCheck(cudaStreamCreate(&result.nccl_stream)); + // event without timing for maximum performance + cudaCheck(cudaEventCreate(&result.compute_nccl_sync, cudaEventDisableTiming)); + nvtxNameCudaStreamA(result.nccl_stream, "nccl stream"); + nvtxNameCudaEventA(result.compute_nccl_sync, "nccl compute sync"); + cudaCheck(cudaMallocManaged(&result.unified_buffer, sizeof(float))); + return result; +#else + printf("Multi-GPU support is disabled. 
Using a single GPU.\n"); + cudaCheck(cudaSetDevice(0)); + MultiGpuConfig result; + result.process_rank = 0; + result.num_processes = 1; + result.local_device_idx = 0; + return result; +#endif +} + +void multi_gpu_config_free(MultiGpuConfig* config) { +#ifdef MULTI_GPU + ncclCheck(ncclCommDestroy(config->nccl_comm)); + cudaCheck(cudaStreamDestroy(config->nccl_stream)); + cudaCheck(cudaEventDestroy(config->compute_nccl_sync)); + cudaCheck(cudaFree(config->unified_buffer)); + #ifdef USE_MPI + mpiCheck(MPI_Finalize()); + #endif +#endif +} + +void multi_gpu_barrier(const MultiGpuConfig* config) { +#ifdef MULTI_GPU + if (config->num_processes > 1) { + ncclCheck(ncclAllReduce(config->unified_buffer, config->unified_buffer, sizeof(float), ncclFloat, ncclSum, config->nccl_comm, config->nccl_stream)); + } + cudaCheck(cudaDeviceSynchronize()); +#endif +} + +// Offset and size of a tensor shard +typedef struct { + ptrdiff_t offset; + size_t size; +} ShardInfo; + +// Get info about sharding for a tensor of elements many numbers +ShardInfo multi_gpu_get_shard_offset(size_t elements, const MultiGpuConfig* config, int shard_at_stage) { + const int nproc = config->num_processes; + if(config->zero_stage >= shard_at_stage) { + if (elements % nproc != 0) { + fprintf(stderr, "Number of elements %zu must be a multiple of the number of processes %d\n", elements, nproc); + exit(EXIT_FAILURE); + } + return {(ptrdiff_t) (config->process_rank * (elements / nproc)), elements / nproc}; + } else { + return {0, elements}; + } +} + +// Block NCCL stream until computations on compute_stream are done, then aggregate multiple pointers in an NCCL group. +// This can work either as an all-reduce (i.e., no ZeRo), or a reduce-scatter (ZeRO 1). +// The awkward `(&pointers)[N]` syntax ensures we are capturing the parameters as sized arrays, so that it becomes impossible +// to call this function if pointers and pointers_sizes do not match. +template +void multi_gpu_async_reduce_gradient( + floatX* const (&pointers)[N], const size_t (&pointers_sizes)[N], + MultiGpuConfig* config, cudaStream_t compute_stream) { + if (config->num_processes == 1) { + return; // no multi-GPU, just exit. + } + +#ifdef MULTI_GPU + NVTX_RANGE_FN(); + // mark an event on the compute stream, and immediately wait on this in the nccl stream + // this means that the nccl stream won't start executing before all compute kernels that + // have been submitted before this point have finished. + // by using an event instead of cudaSyncStream, we avoid having to synchronize the host, and + // can enqueue new work to the GPU right away. + cudaCheck(cudaEventRecord(config->compute_nccl_sync, compute_stream)); + cudaCheck(cudaStreamWaitEvent(config->nccl_stream, config->compute_nccl_sync)); + ncclCheck(ncclGroupStart()); // NCCL group: aggregate all pointers in a single NCCL GPU kernel. 
+ for (int i = 0; i < N; ++i) { + if(config->zero_stage == 0) { + ncclCheck(ncclAllReduce( + pointers[i], pointers[i], + pointers_sizes[i], + ncclFloatX, ncclAvg, + config->nccl_comm, config->nccl_stream + )); + } else if(config->zero_stage == 1) { + assert(pointers_sizes[i] % config->num_processes == 0); + size_t shard_size = pointers_sizes[i] / config->num_processes; + ptrdiff_t shard_offset = (ptrdiff_t)shard_size * config->process_rank; + ncclCheck(ncclReduceScatter( + pointers[i], pointers[i] + shard_offset, + shard_size, + ncclFloatX, ncclAvg, + config->nccl_comm, config->nccl_stream + )); + } + } + ncclCheck(ncclGroupEnd()); +#endif +} + +// convenience macro that only prints if the rank of process is zero +#define printf0(...) if (::multi_gpu_config.process_rank == 0) { printf(__VA_ARGS__); } + +void set_zero_configs(MultiGpuConfig* config, int zero_stage, size_t total_parameters) { + config->zero_stage = 0; + config->shard_num_parameters = total_parameters; + // Check the Zero Stage and define sharding parameters + if (zero_stage == 0) { + printf0("| Zero Optimization is disabled |\n"); + } + else if (zero_stage == 1) { + if (total_parameters % config->num_processes != 0) { + printf0("| Zero Optimization is disabled, Can't equally partition parameters |\n"); + config->zero_stage = 0; + } + else { + config->zero_stage = 1; + config->shard_num_parameters = total_parameters / config->num_processes; + } + } + else{ + printf0("| Disabling Zero Optimization, Zero Stage2 and Stage3 are not yet supported |\n"); + config->zero_stage = 0; + } +} + +// Compute sum of a single CPU value across all GPU processes. No-op when multi-GPU is disabled. +float multi_gpu_cpu_float_sum(float value, MultiGpuConfig* config) { +#ifdef MULTI_GPU + if (config->num_processes == 1) return value; + + float* unified_buffer = config->unified_buffer; + *unified_buffer = value; + ncclCheck(ncclAllReduce(unified_buffer, unified_buffer, sizeof(float), ncclFloat, ncclSum, config->nccl_comm, config->nccl_stream)); + cudaCheck(cudaDeviceSynchronize()); + return *unified_buffer; +#else + return value; +#endif +} + +#endif + diff --git a/profile_gpt2.cu b/profile_gpt2.cu index f79e9ada4..fa5e528d7 100644 --- a/profile_gpt2.cu +++ b/profile_gpt2.cu @@ -28,11 +28,18 @@ the profile.ncu-rep from a cloud box to local to pretty view. #include "train_gpt2.cu" int main(int argc, char *argv[]) { - multi_gpu_config = multi_gpu_config_init(&argc, &argv); + char nccl_init_method[256] = "mpi"; // "tcp" or "fs" or "mpi" + int num_processes = -1; // doesn't matter when using MPI + int process_rank = -1; // doesn't matter when using MPI + int gpus_per_node = -1; // doesn't matter when using MPI + char server_ip[256] = ""; // doesn't matter when using MPI + char fs_path[256] = ""; // doesn't matter when using MPI + multi_gpu_config = multi_gpu_config_init(num_processes, process_rank, gpus_per_node, server_ip, fs_path, nccl_init_method); common_start(true, true); // build the GPT-2 model from a checkpoint GPT2 model; + gpt2_init_common(&model); gpt2_build_from_checkpoint(&model, "gpt2_124M_bf16.bin"); int B = 24; // if program OOMs decrease this number, e.g. 
all the way down to 4 or etc @@ -51,14 +58,17 @@ int main(int argc, char *argv[]) { model.config.num_layers = 1; set_zero_configs(&multi_gpu_config, 0, model.num_parameters); + gpt2_allocate_state(&model, B, T); // do a training step - gpt2_forward(&model, x, y, B, T); - gpt2_zero_grad(&model); - gpt2_backward(&model, x); - gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, 1.f, 1, &multi_gpu_config); + gpt2_forward(&model, x, B, T); + gpt2_backward_and_reduce(&model, x, y, 1, 0); + float grad_norm = gpt2_calculate_grad_norm(&model, &multi_gpu_config); + float grad_scale = (grad_norm > 1.0f) ? 1.0f / grad_norm : 1.0f; + gpt2_update(&model, 1e-4f, 0.9f, 0.999f, 1e-8f, 0.0f, grad_scale, 1, &multi_gpu_config); cudaCheck(cudaDeviceSynchronize()); // finish all CUDA work to get correct precise timings // free + gpt2_free(&model); common_free(model); return 0; } diff --git a/profile_gpt2cu.py b/profile_gpt2cu.py index de2edfda9..bcf097c9b 100644 --- a/profile_gpt2cu.py +++ b/profile_gpt2cu.py @@ -130,7 +130,7 @@ # the classifier part, counts only once pass_name = "cls" phase = "bwd" - elif "adamw" in kernel or "global_norm" in kernel: + elif "adamw" in kernel or "global_norm" in kernel or "copy_and_cast" in kernel: # encoder layer or adam pass_name = "opt" # before the first optimizer run, we create weight copies. diff --git a/requirements.txt b/requirements.txt index 80471a8be..ea4bc768d 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ tqdm -numpy +numpy<2 torch tiktoken transformers diff --git a/scripts/multi_node/run_gpt2_124M_fs.sbatch b/scripts/multi_node/run_gpt2_124M_fs.sbatch new file mode 100755 index 000000000..9bef9aaca --- /dev/null +++ b/scripts/multi_node/run_gpt2_124M_fs.sbatch @@ -0,0 +1,85 @@ +#!/bin/bash +#SBATCH --job-name=llmc-multinode # job name +#SBATCH --output=/home/ubuntu/llm.c/scripts/multi_node/%x_%j_%t.log # output file +#SBATCH --error=/home/ubuntu/llm.c/scripts/multi_node/%x_%j_%t.err # error file +#SBATCH --partition=llmc # Specify the GPU partition +#SBATCH --ntasks=16 # total number of processes to launch on all nodes +#SBATCH --nodes=2 # total number of nodes +#SBATCH --ntasks-per-node=8 # assuming each node has 8 gpus +#SBATCH --gres=gpu:8 # request 8 gpus from each node + +# NOTE: change the above slurm arguments to match your system! +# Run with `sbatch ` + +make train_gpt2cu USE_CUDNN=1 NO_USE_MPI=1 + +# NOTE: change the following to match your system +binary_path="/home/ubuntu/llm.c/train_gpt2cu" +out_dir="/ephemeral/data/fineweb/log_gpt2_124M_multi" +train_data_path='/ephemeral/data/fineweb/bin_10B/fineweb_train_*.bin' +val_data_path='/ephemeral/data/fineweb/bin_10B/fineweb_val_*.bin' +sync_fs_path=$out_dir # needs to be a shared filesystem path that all nodes can access + +# In case the file system is shared this is a no-op. +# Otherwise, we need to copy the binary to all nodes. 
+current_user=$USER +hosts=$(scontrol show hostnames $SLURM_JOB_NODELIST) # get the hostnames of the allocated nodes +current_host=$(hostname) +for host in $hosts; do + if [ $host == $current_host ]; then + continue + fi + echo "copying $binary_path to $current_user@$host" + scp -r $binary_path $current_user@$host:$binary_path +done + +# Use this for NCCL debugging if you run into issues +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=ALL +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + +# Optimization flags +export NCCL_NET_GDR_LEVEL=2 # use GPUDirect RDMA - allows for direct memory access between GPUs across different nodes by bypassing the CPU +export NCCL_IB_DISABLE=0 # use InfiniBand if available + +# NOTE: change the following environment variables to match your system - or comment them out if you don't need them +export NCCL_SOCKET_IFNAME=ens17 +export OMPI_MCA_btl_tcp_if_include=ens17 +export NCCL_P2P_LEVEL=PXB + +if [ -z "$SLURM_JOB_ID" ]; then + echo "Make sure you're running in a SLURM environment. Did you forget to run with sbatch? Aborting." + exit 1 +else + DATESTRING=`date "+%Y-%m-%dT%H:%M:%S"` + echo "Running in a SLURM environment (job ID: $SLURM_JOB_ID, user: $current_user)" + echo "Running on hosts: $(echo $(scontrol show hostname))" + echo "$DATESTRING" +fi + +srun -l -u bash -c " + $binary_path \ + -i '$train_data_path' \ + -j '$val_data_path' \ + -o $out_dir \ + -v 250 -s 20000 -g 144 \ + -h 1 \ + -b 64 -t 1024 \ + -d 2097152 \ + -r 0 \ + -z 1 \ + -c 0.1 \ + -l 0.0006 \ + -q 0.0 \ + -u 700 \ + -n 5000 \ + -y 1 \ + -e d12 \ + -pn \$SLURM_NTASKS \ + -pr \$SLURM_PROCID \ + -pg \$SLURM_NTASKS_PER_NODE \ + -pf $sync_fs_path \ + -pi "fs" \ +" + +echo "$DATESTRING" \ No newline at end of file diff --git a/scripts/multi_node/run_gpt2_124M_mpi.sh b/scripts/multi_node/run_gpt2_124M_mpi.sh new file mode 100755 index 000000000..e09b027ce --- /dev/null +++ b/scripts/multi_node/run_gpt2_124M_mpi.sh @@ -0,0 +1,49 @@ + +make train_gpt2cu USE_CUDNN=1 + +# NOTE: change the following to match your system +binary_path="/home/ubuntu/llm.c/train_gpt2cu" +out_dir="/ephemeral/data/fineweb/log_gpt2_124M_multi" +train_data_path='/ephemeral/data/fineweb/bin_10B/fineweb_train_*.bin' +val_data_path='/ephemeral/data/fineweb/bin_10B/fineweb_val_*.bin' +# You can find these names either in `/etc/hosts`` file or in the terminal (user@host:~$). +host1="h100-node-1-0" # master and worker node +host2="h100-node-1-1" # worker node + +# In case the file system is shared this is a no-op. +# Otherwise, we need to copy the binary to all nodes. 
+scp -r $binary_path $USER@$host2:$binary_path + +# Use this for NCCL debugging if you run into issues +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=ALL +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + +# Optimization flags +export NCCL_NET_GDR_LEVEL=2 # use GPUDirect RDMA - allows for direct memory access between GPUs across different nodes by bypassing the CPU +export NCCL_IB_DISABLE=0 # use InfiniBand if available + +# NOTE: change the following environment variables to match your system - or comment them out if you don't need them +export NCCL_SOCKET_IFNAME=ens17 +export OMPI_MCA_btl_tcp_if_include=ens17 +export NCCL_P2P_LEVEL=PXB + +mpirun -np 16 --host $host1:8,$host2:8 \ + $binary_path \ + -i "$train_data_path" \ + -j "$val_data_path" \ + -o $out_dir \ + -v 250 -s 20000 -g 144 \ + -h 1 \ + -b 64 -t 1024 \ + -d 2097152 \ + -r 0 \ + -z 1 \ + -c 0.1 \ + -l 0.0006 \ + -q 0.1 \ + -u 700 \ + -n 1000 \ + -y 0 \ + -e d12 \ + -pi "mpi" \ diff --git a/scripts/multi_node/run_gpt2_124M_tcp.sbatch b/scripts/multi_node/run_gpt2_124M_tcp.sbatch new file mode 100755 index 000000000..f6cd3a7fa --- /dev/null +++ b/scripts/multi_node/run_gpt2_124M_tcp.sbatch @@ -0,0 +1,86 @@ +#!/bin/bash +#SBATCH --job-name=llmc-multinode # job name +#SBATCH --output=/home/ubuntu/llm.c/scripts/multi_node/%x_%j_%t.log # output file +#SBATCH --error=/home/ubuntu/llm.c/scripts/multi_node/%x_%j_%t.err # error file +#SBATCH --partition=llmc # Specify the GPU partition +#SBATCH --ntasks=16 # total number of processes to launch on all nodes +#SBATCH --nodes=2 # total number of nodes +#SBATCH --ntasks-per-node=8 # assuming each node has 8 gpus +#SBATCH --gres=gpu:8 # request 8 gpus from each node + +# NOTE: change the above slurm arguments to match your system! +# Run with `sbatch ` + +make train_gpt2cu USE_CUDNN=1 NO_USE_MPI=1 + +# NOTE: change the following to match your system +binary_path="/home/ubuntu/llm.c/train_gpt2cu" +out_dir="/ephemeral/data/fineweb/log_gpt2_124M_multi" +train_data_path='/ephemeral/data/fineweb/bin_10B/fineweb_train_*.bin' +val_data_path='/ephemeral/data/fineweb/bin_10B/fineweb_val_*.bin' +# NOTE: change the server_ip to the IP address of the machine that is running process zero +server_ip="10.0.1.220" + +# In case the file system is shared this is a no-op. +# Otherwise, we need to copy the binary to all nodes. +current_user=$USER +hosts=$(scontrol show hostnames $SLURM_JOB_NODELIST) # get the hostnames of the allocated nodes +current_host=$(hostname) +for host in $hosts; do + if [ $host == $current_host ]; then + continue + fi + echo "copying $binary_path to $current_user@$host" + scp -r $binary_path $current_user@$host:$binary_path +done + +# Use this for NCCL debugging if you run into issues +# export NCCL_DEBUG=INFO +# export NCCL_DEBUG_SUBSYS=ALL +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 + +# Optimization flags +export NCCL_NET_GDR_LEVEL=2 # use GPUDirect RDMA - allows for direct memory access between GPUs across different nodes by bypassing the CPU +export NCCL_IB_DISABLE=0 # use InfiniBand if available + +# NOTE: change the following environment variables to match your system - or comment them out if you don't need them +export NCCL_SOCKET_IFNAME=ens17 +export OMPI_MCA_btl_tcp_if_include=ens17 +export NCCL_P2P_LEVEL=PXB + +if [ -z "$SLURM_JOB_ID" ]; then + echo "Make sure you're running in a SLURM environment. Did you forget to run with sbatch? Aborting." 
+ exit 1 +else + DATESTRING=`date "+%Y-%m-%dT%H:%M:%S"` + echo "Running in a SLURM environment (job ID: $SLURM_JOB_ID, user: $current_user)" + echo "Running on hosts: $(echo $(scontrol show hostname))" + echo "$DATESTRING" +fi + +srun -l -u bash -c " + $binary_path \ + -i '$train_data_path' \ + -j '$val_data_path' \ + -o $out_dir \ + -v 250 -s 20000 -g 144 \ + -h 1 \ + -b 64 -t 1024 \ + -d 2097152 \ + -r 0 \ + -z 1 \ + -c 0.1 \ + -l 0.0006 \ + -q 0.0 \ + -u 700 \ + -n 5000 \ + -y 1 \ + -e d12 \ + -pn \$SLURM_NTASKS \ + -pr \$SLURM_PROCID \ + -pg \$SLURM_NTASKS_PER_NODE \ + -ps $server_ip \ + -pi "tcp" \ +" + +echo "$DATESTRING" diff --git a/scripts/run_gpt2_1558M.sh b/scripts/run_gpt2_1558M.sh new file mode 100644 index 000000000..929041228 --- /dev/null +++ b/scripts/run_gpt2_1558M.sh @@ -0,0 +1,43 @@ +# GPT-2 (1558M) repro on FineWeb-EDU +# 1558M parameter model on 32B tokens +# => 6 * 1558e6 * 32e9 = 6.966e20 ~= 3e20 capability model +# 32,000 steps on ~1M tokens/step (1,048,576 to be precise) +# on 8X H100 80GB SXM ($28/hr) steps in 2.80s/iter +# => training time 32,000 steps * 2.7s => 24 hours ~= 1 day ~= $672 + +make train_gpt2cu USE_CUDNN=1 +out_dir="log_gpt2_1558M" +done_file="$out_dir/DONE_00032000" + +# in case the training stalls or crashes, loop to resume (-y 1) +while true; do + + # exit condition is that optimization has finished + if [ -f "$done_file" ]; then + echo "File $done_file exists. Exiting the loop." + break + fi + + mpirun -np 8 ./train_gpt2cu \ + -i "dev/data/edu_fineweb100B/edu_fineweb_train_*.bin" \ + -j "dev/data/edu_fineweb100B/edu_fineweb_val_*.bin" \ + -o $out_dir \ + -v 250 -s 300000 -g 384 \ + -h 1 \ + -b 16 -t 1024 \ + -d 1048576 \ + -r 0 \ + -z 1 \ + -c 0.1 \ + -k "cosine" \ + -l 0.0006 \ + -q 0.1 \ + -u 700 \ + -n 2000 \ + -x 32000 \ + -ge 1 \ + -y 1 \ + -e "d48" + + sleep 1 +done diff --git a/scripts/run_gpt2_774M.sh b/scripts/run_gpt2_774M.sh new file mode 100755 index 000000000..dfb80b1b5 --- /dev/null +++ b/scripts/run_gpt2_774M.sh @@ -0,0 +1,43 @@ +# GPT-2 (774M) repro on FineWeb +# 774M parameter model on ~150B tokens +# => 6 * 774e6 * 150e9 = 6.966e20 ~= 7e20 capability model (10X 350M) +# => 286,102 steps on 524,288 tokens/step +# on 8X A100 80GB SXM ($14/hr) steps in ~1.7s/iter +# => training time 286,102 steps * 1.7s = 135 hours ~= 5.6 days ~= $2000 (10X 124M) + +make train_gpt2cu USE_CUDNN=1 +out_dir="log_gpt2_774M" +done_file="$out_dir/DONE_00286102" + +# in case the training stalls or crashes, loop to resume (-y 1) +while true; do + + # exit condition is that optimization has finished + if [ -f "$done_file" ]; then + echo "File $done_file exists. Exiting the loop." 
+ break + fi + + # run python dev/data/fineweb.py --version 100B to prepro data + # run python dev/data/hellaswag.py to prepro hellaswag eval + mpirun -np 8 ./train_gpt2cu \ + -i "dev/data/fineweb100B/fineweb_train_*.bin" \ + -j "dev/data/fineweb100B/fineweb_val_*.bin" \ + -o $out_dir \ + -v 250 -s 300000 -g 144 \ + -h 1 \ + -b 32 -t 1024 \ + -d 524288 \ + -r 0 \ + -z 1 \ + -c 0.1 \ + -l 0.00025 \ + -q 0.0 \ + -u 700 \ + -n 4000 \ + -x 286102 \ + -y 1 \ + -e "d36" + + sleep 1 +done diff --git a/scripts/run_gpt3_124M.sh b/scripts/run_gpt3_125M.sh similarity index 57% rename from scripts/run_gpt3_124M.sh rename to scripts/run_gpt3_125M.sh index bde1e6859..ca88a1889 100644 --- a/scripts/run_gpt3_124M.sh +++ b/scripts/run_gpt3_125M.sh @@ -1,14 +1,14 @@ -# GPT-3 (124M) repro on FineWeb -# 124M parameter model on 300B tokens +# GPT-3 (125M) repro, but using FineWeb +# 125M parameter model on 300B tokens # note context length: 1024 -> 2048 for GPT-3 -# => 6 * 124e6 * 300e9 = 7.44e18 ~= 2.2e20 capability model -# 565,950 steps of 524,288 tokens/step -# on 8X A100 80GB SXM ($14/hr) steps in ~300ms/iter -# => training time 565,950 * 300ms ~= 47 hours ~= $658 +# => 6 * 125e6 * 300e9 = ~= 2.25e20 capability model +# 572,204 steps of 524,288 tokens/step => 300B +# on 8X A100 80GB SXM ($14/hr) steps in ~150ms/iter +# => training time 572,204 * 150ms ~= 24 hours ~= $336 make train_gpt2cu USE_CUDNN=1 -out_dir="log_gpt3_124M" -done_file="$out_dir/DONE_00565950" +out_dir="log_gpt3_125M" +done_file="$out_dir/DONE_00572204" while true; do @@ -18,8 +18,6 @@ while true; do break fi - # run python dev/data/fineweb.py --version 10B to prepro data - # run python dev/data/hellaswag.py to prepro hellaswag eval mpirun -np 8 ./train_gpt2cu \ -i "dev/data/fineweb100B/fineweb_train_*.bin" \ -j "dev/data/fineweb100B/fineweb_val_*.bin" \ @@ -32,12 +30,17 @@ while true; do -z 1 \ -c 0.1 \ -l 0.0006 \ - -q 0.0 \ + -q 0.1 \ -u 700 \ -n 10000 \ + -nk 5 \ + -nm 50000 \ + -ge 1 \ + -sl 7.0 \ + -sg 7.0 \ -y 1 \ - -x 565950 \ - -e "d12" + -x 572204 \ + -e "gpt3:c768" sleep 1 done diff --git a/test_gpt2.c b/test_gpt2.c index e49b73fad..3db8dff7a 100644 --- a/test_gpt2.c +++ b/test_gpt2.c @@ -6,7 +6,7 @@ int check_tensor(float *a, float *b, int n, const char* label) { int print_upto = 5; int ok = 1; float maxdiff = 0.0f; - float tol = 2e-2; + float tol = 2e-2f; printf("%s\n", label); for (int i = 0; i < n; i++) { // look at the diffence at position i of these two tensors @@ -52,7 +52,7 @@ int main(int argc, char *argv[]) { FILE *state_file = fopen("gpt2_124M_debug_state.bin", "rb"); if (state_file == NULL) { printf("Error opening state file\n"); return 1; } int state_header[256]; - fread(state_header, sizeof(int), 256, state_file); + freadCheck(state_header, sizeof(int), 256, state_file); if (state_header[0] != 20240327) { printf("Bad magic state file\n"); return 1; } if (state_header[1] != 2) { printf("Bad version in state file\n"); @@ -75,28 +75,28 @@ int main(int argc, char *argv[]) { float* expected_loss = (float*) malloc(1 * sizeof(float)); // read reference information from Python - fread(x, sizeof(int), B*T, state_file); - fread(y, sizeof(int), B*T, state_file); - fread(expected_logits, sizeof(float), B*T*V, state_file); - fread(expected_loss, sizeof(float), 1, state_file); - fread(expected_grads_memory, sizeof(float), model.num_parameters, state_file); - fclose(state_file); + freadCheck(x, sizeof(int), B*T, state_file); + freadCheck(y, sizeof(int), B*T, state_file); + freadCheck(expected_logits, sizeof(float), B*T*V, 
state_file); + freadCheck(expected_loss, sizeof(float), 1, state_file); + freadCheck(expected_grads_memory, sizeof(float), model.num_parameters, state_file); + fcloseCheck(state_file); // overall OK signal for the test int allok = 1; // let's do 10 training iterations, following the pytorch code float expected_losses[10] = { - 5.270007133483887, - 4.059706687927246, - 3.3751230239868164, - 2.8007826805114746, - 2.315382242202759, - 1.8490285873413086, - 1.3946564197540283, - 0.9991465210914612, - 0.6240804195404053, - 0.37651097774505615 + 5.270007133483887f, + 4.059706687927246f, + 3.3751230239868164f, + 2.8007826805114746f, + 2.315382242202759f, + 1.8490285873413086f, + 1.3946564197540283f, + 0.9991465210914612f, + 0.6240804195404053f, + 0.37651097774505615f }; for (int step = 0; step < 10; step++) { diff --git a/test_gpt2.cu b/test_gpt2.cu index bde357f32..e608ce229 100644 --- a/test_gpt2.cu +++ b/test_gpt2.cu @@ -89,7 +89,13 @@ float* float_cpu_malloc_and_point_parameters(FloatParameterTensors* params, size } int main(int argc, char *argv[]) { - multi_gpu_config = multi_gpu_config_init(&argc, &argv); + char nccl_init_method[256] = "mpi"; // "tcp" or "fs" or "mpi" + int num_processes = -1; // doesn't matter when using MPI + int process_rank = -1; // doesn't matter when using MPI + int gpus_per_node = -1; // doesn't matter when using MPI + char server_ip[256] = ""; // doesn't matter when using MPI + char fs_path[256] = ""; // doesn't matter when using MPI + multi_gpu_config = multi_gpu_config_init(num_processes, process_rank, gpus_per_node, server_ip, fs_path, nccl_init_method); common_start(false, true); // set the right paths @@ -101,6 +107,7 @@ int main(int argc, char *argv[]) { // build the GPT-2 model from a checkpoint GPT2 model; + gpt2_init_common(&model); gpt2_build_from_checkpoint(&model, load_filename); size_t V = model.config.vocab_size; @@ -109,6 +116,15 @@ int main(int argc, char *argv[]) { size_t L = model.config.num_layers; size_t C = model.config.channels; + for (int i = 1; i < argc; i+=2) { + if (i + 1 >= argc) { exit(EXIT_FAILURE); } // must have arg after flag + if (!(strlen(argv[i]) == 2 || strlen(argv[i]) == 3)) { exit(EXIT_FAILURE); } // must be -x[y] (one dash, one or two letters) + if (argv[i][0] != '-') { exit(EXIT_FAILURE); } // must start with dash + if (argv[i][1] == 'w') { model.use_master_weights = atoi(argv[i+1]); } + else if (argv[i][1] == 'r') { model.recompute = atoi(argv[i+1]); } + else if (argv[i][1] == 'g' && argv[i][2] == 'e') { model.gelu_fusion = atoi(argv[i+1]); } + } + // load additional information that we will use for debugging and error checking FILE *state_file = fopenCheck("gpt2_124M_debug_state.bin", "rb"); int state_header[256]; @@ -152,22 +168,25 @@ int main(int argc, char *argv[]) { // overall OK signal for the test int allok = 1; + gpt2_allocate_state(&model, B, T); + // First, do target-free forward pass to validate logits - gpt2_forward(&model, x, NULL, B, T); + gpt2_forward(&model, x, B, T); // at this point, target should be equal to expected_logits, let's compare // copy logits to CPU so we can compare them floatX* logits_cpu_raw = (floatX*)mallocCheck(B * T * Vp * sizeof(floatX)); float* logits_cpu = (float*)mallocCheck(B * T * Vp * sizeof(float)); - cudaMemcpy(logits_cpu_raw, model.acts.output, B * T * Vp * sizeof(floatX), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(logits_cpu_raw, model.acts.output, B * T * Vp * sizeof(floatX), cudaMemcpyDeviceToHost)); for (int i = 0; i < B * T * Vp; i++) { logits_cpu[i] = 
(float)logits_cpu_raw[i]; } + float logit_accuracy_threshold = 1e-3f; + float loss_diff_threshold = 1e-5f; // FP16 and lower require very high tolerances unfortunately. TODO look into more - float logit_accuracy_threshold = 1e-2f; - float loss_diff_threshold = 0.05f; #if defined(ENABLE_BF16) || defined(ENABLE_F16) logit_accuracy_threshold = 25.0f; // 15.0f was too low even without cuDNN?! :( + loss_diff_threshold = 0.05f; #endif // compare the output logits from the forward pass @@ -201,25 +220,16 @@ int main(int argc, char *argv[]) { for (int step = 0; step < 10; step++) { struct timespec start, end; clock_gettime(CLOCK_MONOTONIC, &start); - gpt2_forward(&model, x, y, B, T); - gpt2_zero_grad(&model); - gpt2_backward(&model, x); + gpt2_forward(&model, x, B, T); + gpt2_backward_and_reduce(&model, x, y, 1, 0); clock_gettime(CLOCK_MONOTONIC, &end); double time_elapsed_s = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec) / 1e9; if (step == 0) { // error checking at step 0 for reference activations - // compare the achieved loss - if (fabsf(model.mean_loss - *expected_loss) >= loss_diff_threshold) { - printf("LOSS MISMATCH: %f %f\n", model.mean_loss, *expected_loss); - allok = 0; - } else { - printf("LOSS OK: %f %f\n", model.mean_loss, *expected_loss); - } - // move the (mixed precision) grads from GPU to CPU - cudaMemcpy(grads_memory_cpu, model.grads_memory, model.num_parameters_bytes, cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(grads_memory_cpu, model.grads_memory, model.num_parameters_bytes, cudaMemcpyDeviceToHost)); // convert all gradients to float on the CPU char* src_iterator = (char*)grads_memory_cpu; // can be lower precision, so we use char* @@ -256,43 +266,56 @@ int main(int argc, char *argv[]) { // In that case it's ok to extend the tolerance by a bit, after a manual review. // Also, different GPUs may use different matrix multiplication algorithms, so the // actual errors can be hardware specific. 
- allok = allok & check_tensor(tensors1[0], tensors2[0], V * C, "wte", 6e-1f); // hmm a bit high - allok = allok & check_tensor(tensors1[1], tensors2[1], maxT * C, "wpe", 4e-3f); - allok = allok & check_tensor(tensors1[2], tensors2[2], L * 3*C * C, "qkvw", 1e-1); // hmm a bit high - allok = allok & check_tensor(tensors1[3], tensors2[3], L * 3*C, "qkvb", 3.5e-2f); - allok = allok & check_tensor(tensors1[4], tensors2[4], L * C * C, "attprojw", 2e-2f); - allok = allok & check_tensor(tensors1[5], tensors2[5], L * C, "attprojb", 3e-2f); - allok = allok & check_tensor(tensors1[6], tensors2[6], L * 4*C * C, "fcw", 5e-2f); // hmm a bit high - allok = allok & check_tensor(tensors1[7], tensors2[7], L * 4*C, "fcb", 5e-2f); // hmm a bit high - allok = allok & check_tensor(tensors1[8], tensors2[8], L * C * 4*C, "fcprojw", 5e-2f); // hmm a bit high - allok = allok & check_tensor(tensors1[9], tensors2[9], L * C, "fcprojb", 1.5e-2f); - allok = allok & check_tensor(tensors1[10], tensors2[10], L * C, "ln1w", 6e-4f); - allok = allok & check_tensor(tensors1[11], tensors2[11], L * C, "ln1b", 9e-3f); - allok = allok & check_tensor(tensors1[12], tensors2[12], L * C, "ln2w", 2e-3f); - allok = allok & check_tensor(tensors1[13], tensors2[13], L * C, "ln2b", 2.5e-3f); - allok = allok & check_tensor(tensors1[14], tensors2[14], C, "lnfw", 0.12f); // hmm bit higher - allok = allok & check_tensor(tensors1[15], tensors2[15], C, "lnfb", 2e-2f); + + float grad_thresholds[NUM_PARAMETER_TENSORS] = {5e-1f, 4e-3f, 1e-1f, 3.5e-2f, 2e-2f, 3e-2f, 5e-2f, 5e-2f, 5e-2f, 1.5e-2f, 5e-4f, 8e-3f, 1.5e-3f, 2.5e-3f, 1e-1f, 2e-2f}; + #if defined(ENABLE_FP32) + for (int i = 0; i < NUM_PARAMETER_TENSORS; i++) { + grad_thresholds[i] = 1e-6f; // we can be much more precise in FP32 + } + #endif + + allok = allok & check_tensor(tensors1[0], tensors2[0], V * C, "wte", grad_thresholds[0]); + allok = allok & check_tensor(tensors1[1], tensors2[1], maxT * C, "wpe", grad_thresholds[1]); + allok = allok & check_tensor(tensors1[2], tensors2[2], L * 3*C * C, "qkvw", grad_thresholds[2]); + allok = allok & check_tensor(tensors1[3], tensors2[3], L * 3*C, "qkvb", grad_thresholds[3]); + allok = allok & check_tensor(tensors1[4], tensors2[4], L * C * C, "attprojw", grad_thresholds[4]); + allok = allok & check_tensor(tensors1[5], tensors2[5], L * C, "attprojb", grad_thresholds[5]); + allok = allok & check_tensor(tensors1[6], tensors2[6], L * 4*C * C, "fcw", grad_thresholds[6]); + allok = allok & check_tensor(tensors1[7], tensors2[7], L * 4*C, "fcb", grad_thresholds[7]); + allok = allok & check_tensor(tensors1[8], tensors2[8], L * C * 4*C, "fcprojw", grad_thresholds[8]); + allok = allok & check_tensor(tensors1[9], tensors2[9], L * C, "fcprojb", grad_thresholds[9]); + allok = allok & check_tensor(tensors1[10], tensors2[10], L * C, "ln1w", grad_thresholds[10]); + allok = allok & check_tensor(tensors1[11], tensors2[11], L * C, "ln1b", grad_thresholds[11]); + allok = allok & check_tensor(tensors1[12], tensors2[12], L * C, "ln2w", grad_thresholds[12]); + allok = allok & check_tensor(tensors1[13], tensors2[13], L * C, "ln2b", grad_thresholds[13]); + allok = allok & check_tensor(tensors1[14], tensors2[14], C, "lnfw", grad_thresholds[14]); + allok = allok & check_tensor(tensors1[15], tensors2[15], C, "lnfb", grad_thresholds[15]); } - gpt2_update(&model, 1e-4f, 0.9f, 0.95f, 1e-8f, 0.0f, 1.0f, step+1, &multi_gpu_config); + float grad_norm = gpt2_calculate_grad_norm(&model, &multi_gpu_config); + float grad_scale = (grad_norm > 1.0f) ? 
1.0f / grad_norm : 1.0f; + gpt2_update(&model, 1e-4f, 0.9f, 0.95f, 1e-8f, 0.0f, grad_scale, step+1, &multi_gpu_config); // print the timing information at the end printf("step %d: loss %f (took %f ms)\n", step+1, model.mean_loss, time_elapsed_s * 1000); - losses[step] = model.mean_loss; + // the expected losses from PyTorch were copied over after the print formatting rounded + // them to 6 decimal places, so we do the same here + float rounded_loss = roundf(model.mean_loss * 1000000) / 1000000; + losses[step] = rounded_loss; } // expected losses are as follows, from Python float expected_losses[10] = { - 5.2700, - 4.0607, - 3.3202, - 2.7176, - 2.1811, - 1.6538, - 1.1680, - 0.7367, - 0.4008, - 0.1874 + 5.270009f, + 4.060681f, + 3.320085f, + 2.717550f, + 2.181066f, + 1.653923f, + 1.168050f, + 0.736873f, + 0.401021f, + 0.187493f }; // compare @@ -305,10 +328,59 @@ int main(int argc, char *argv[]) { } } + // Finally, let's check determinism + gpt2_write_to_checkpoint(&model, "test_gpt2cu_model.ckpt"); + + DataLoader loader; + dataloader_init(&loader, "dev/data/tinyshakespeare/tiny_shakespeare_val.bin", B, T, multi_gpu_config.process_rank, multi_gpu_config.num_processes, 1); + save_state("test_gpt2cu_state.ckpt", 10, &model, &loader); + int tokens[10]; + for (int step = 0; step < 10; step++) { + dataloader_next_batch(&loader); + gpt2_forward(&model, loader.inputs, B, T); + gpt2_backward_and_reduce(&model, loader.inputs, loader.targets, 1, 0); + gpt2_update(&model, 1e-4f, 0.9f, 0.95f, 1e-8f, 0.0f, 1.0f, step+11, &multi_gpu_config); + losses[step] = model.mean_loss; + tokens[step] = loader.inputs[0]; + } + + // reload + gpt2_free(&model); + gpt2_build_from_checkpoint(&model, "test_gpt2cu_model.ckpt"); + int ld_step; + gpt2_allocate_state(&model, B, T); + load_state(&ld_step, &model, &loader, "test_gpt2cu_state.ckpt"); + for (int step = 0; step < 10; step++) { + dataloader_next_batch(&loader); + gpt2_forward(&model, loader.inputs, B, T); + gpt2_backward_and_reduce(&model, loader.inputs, loader.targets, 1, 0); + gpt2_update(&model, 1e-4f, 0.9f, 0.95f, 1e-8f, 0.0f, 1.0f, step+11, &multi_gpu_config); + + if(loader.inputs[0] != tokens[step]) { + printf("Nondeterminism! Token mismatch at step %d: %d vs %d\n", step, tokens[step], loader.inputs[0]); + allok = false; + break; + } + + if(losses[step] != model.mean_loss) { + printf("Nondeterminism! Loss mismatch at step %d: %.15f vs %.15f\n", step, losses[step], model.mean_loss); + allok = false; + break; + } else { + printf("loss ok at step %d: %f %f\n", step, losses[step], model.mean_loss); + } + } + // final approval printf("overall okay: %d\n", allok); + // delete intermediate test files + remove("test_gpt2cu_model.ckpt"); + remove("test_gpt2cu_state.ckpt"); + // free everything + dataloader_free(&loader); + gpt2_free(&model); common_free(model); free(x); free(y); @@ -319,5 +391,5 @@ int main(int argc, char *argv[]) { free(expected_grads_memory); free(grads_memory_cpu); free(grads_memory_cpu_float); - return 0; + return allok ? EXIT_SUCCESS : EXIT_FAILURE; } diff --git a/test_gpt2_fp32.cu b/test_gpt2_fp32.cu index 01440072a..356b001c1 100644 --- a/test_gpt2_fp32.cu +++ b/test_gpt2_fp32.cu @@ -36,7 +36,6 @@ int main(int argc, char *argv[]) { // setup cuBLAS and cuBLASLt cublasCheck(cublasCreate(&cublas_handle)); - cublasCheck(cublasLtCreate(&cublaslt_handle)); // TF32 precision is equivalent to torch.set_float32_matmul_precision('high') int enable_tf32 = deviceProp.major >= 8 ? 1 : 0; enable_tf32 = 0; // NOTE: disable TF32 for testing!!! 
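
Aside on the grad_scale expression introduced in the profile_gpt2.cu and test_gpt2.cu hunks above: the callers now compute the global gradient norm with gpt2_calculate_grad_norm and clip by it before gpt2_update, i.e. the update is scaled by 1/norm only when the norm exceeds 1.0. The following is a CPU-only sketch of that rule, not code from the diff: the four-element array stands in for the real gradient buffer, the loop stands in for gpt2_calculate_grad_norm (which runs on the GPU), and the scale is applied in place here purely for illustration, whereas the real code passes grad_scale into gpt2_update.

/* Illustration only: global-norm clipping as used by the updated callers.
   A tiny CPU array plays the role of the gradient buffer. */
#include <math.h>
#include <stdio.h>

int main(void) {
    float grads[4] = {0.6f, -0.8f, 1.2f, 0.1f}; // stand-in gradients
    const int n = 4;
    float sumsq = 0.0f;
    for (int i = 0; i < n; i++) { sumsq += grads[i] * grads[i]; }
    float grad_norm = sqrtf(sumsq);
    // shrink only when the norm exceeds 1.0, otherwise leave gradients untouched
    float grad_scale = (grad_norm > 1.0f) ? 1.0f / grad_norm : 1.0f;
    for (int i = 0; i < n; i++) { grads[i] *= grad_scale; }
    printf("grad_norm=%f grad_scale=%f\n", grad_norm, grad_scale);
    return 0;
}
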
@@ -44,7 +43,6 @@ int main(int argc, char *argv[]) { cublas_compute_type = enable_tf32 ? CUBLAS_COMPUTE_32F_FAST_TF32 : CUBLAS_COMPUTE_32F; cublasMath_t cublas_math_mode = enable_tf32 ? CUBLAS_TF32_TENSOR_OP_MATH : CUBLAS_DEFAULT_MATH; cublasCheck(cublasSetMathMode(cublas_handle, cublas_math_mode)); - cudaCheck(cudaMalloc(&cublaslt_workspace, cublaslt_workspace_size)); // build the GPT-2 model from a checkpoint GPT2 model; @@ -100,7 +98,7 @@ int main(int argc, char *argv[]) { // at this point, target should be equal to expected_logits, let's compare // copy logits to CPU so we can compare them float* logits_cpu = (float*)mallocCheck(B * T * Vp * sizeof(float)); - cudaMemcpy(logits_cpu, model.acts.output, B * T * Vp * sizeof(float), cudaMemcpyDeviceToHost); + cudaCheck(cudaMemcpy(logits_cpu, model.acts.output, B * T * Vp * sizeof(float), cudaMemcpyDeviceToHost)); // compare the output logits from the forward pass // also careful that we don't access and compare the padded columns of logits @@ -198,16 +196,16 @@ int main(int argc, char *argv[]) { // expected losses are as follows, from Python float expected_losses[10] = { - 5.270007133483887, - 4.059706687927246, - 3.3751230239868164, - 2.8007826805114746, - 2.315382242202759, - 1.8490285873413086, - 1.3946564197540283, - 0.9991465210914612, - 0.6240804195404053, - 0.37651097774505615 + 5.270007133483887f, + 4.059706687927246f, + 3.3751230239868164f, + 2.8007826805114746f, + 2.315382242202759f, + 1.8490285873413086f, + 1.3946564197540283f, + 0.9991465210914612f, + 0.6240804195404053f, + 0.37651097774505615f }; // compare @@ -231,9 +229,7 @@ int main(int argc, char *argv[]) { free(expected_grads_memory); free(calculated_grads_memory); gpt2_free(&model); - cudaCheck(cudaFree(cublaslt_workspace)); cublasCheck(cublasDestroy(cublas_handle)); - cublasCheck(cublasLtDestroy(cublaslt_handle)); return 0; } \ No newline at end of file diff --git a/train_gpt2.c b/train_gpt2.c index 6240f67f9..029cd0842 100644 --- a/train_gpt2.c +++ b/train_gpt2.c @@ -352,7 +352,7 @@ void attention_backward(float* dinp, float* dpreatt, float* datt, // dout is (B, T, C) int C3 = C*3; int hs = C / NH; // head size - float scale = 1.0 / sqrtf(hs); + float scale = 1.f / sqrtf(hs); for (int b = 0; b < B; b++) { for (int t = 0; t < T; t++) { @@ -625,6 +625,36 @@ typedef struct { float* losses; // (B, T) } ActivationTensors; +void fill_in_activation_sizes(size_t* act_sizes, GPT2Config config, int B, int T) { + size_t C = config.channels; + size_t NH = config.num_heads; + size_t L = config.num_layers; + size_t Vp = config.padded_vocab_size; + act_sizes[0] = B * T * C; // encoded + act_sizes[1] = L * B * T * C; // ln1 + act_sizes[2] = L * B * T; // ln1_mean + act_sizes[3] = L * B * T; // ln1_rstd + act_sizes[4] = L * B * T * 3 * C; // qkv + act_sizes[5] = L * B * T * C; // atty + act_sizes[6] = L * B * NH * T * T; // preatt + act_sizes[7] = L * B * NH * T * T; // att + act_sizes[8] = L * B * T * C; // attproj + act_sizes[9] = L * B * T * C; // residual2 + act_sizes[10] = L * B * T * C; // ln2 + act_sizes[11] = L * B * T; // ln2_mean + act_sizes[12] = L * B * T; // ln2_rstd + act_sizes[13] = L * B * T * 4 * C; // fch + act_sizes[14] = L * B * T * 4 * C; // fch_gelu + act_sizes[15] = L * B * T * C; // fcproj + act_sizes[16] = L * B * T * C; // residual3 + act_sizes[17] = B * T * C; // lnf + act_sizes[18] = B * T; // lnf_mean + act_sizes[19] = B * T; // lnf_rstd + act_sizes[20] = B * T * Vp; // logits + act_sizes[21] = B * T * Vp; // probs + act_sizes[22] = B * T; // losses +} + 
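
Aside on the new fill_in_activation_sizes helper above: it centralizes the 23 per-tensor activation sizes that gpt2_forward previously filled in inline (the next hunk below replaces that inline block with a single call). The standalone sketch below only illustrates the arithmetic and is not part of the diff; the shape values (L=12, NH=12, C=768, padded vocab 50304) are assumed to be the GPT-2 124M configuration, and B=4, T=64 match the defaults in train_gpt2.c's main further down.

/* Illustration only: sum the 23 activation sizes exactly as
   fill_in_activation_sizes defines them and report the float32 footprint.
   Shapes assume GPT-2 124M with B=4, T=64. */
#include <stdio.h>
#include <stddef.h>

int main(void) {
    size_t L = 12, NH = 12, C = 768, Vp = 50304; // assumed GPT-2 124M config (padded vocab)
    size_t B = 4, T = 64;                        // batch size and sequence length
    size_t act_sizes[23] = {
        B * T * C,          // encoded
        L * B * T * C,      // ln1
        L * B * T,          // ln1_mean
        L * B * T,          // ln1_rstd
        L * B * T * 3 * C,  // qkv
        L * B * T * C,      // atty
        L * B * NH * T * T, // preatt
        L * B * NH * T * T, // att
        L * B * T * C,      // attproj
        L * B * T * C,      // residual2
        L * B * T * C,      // ln2
        L * B * T,          // ln2_mean
        L * B * T,          // ln2_rstd
        L * B * T * 4 * C,  // fch
        L * B * T * 4 * C,  // fch_gelu
        L * B * T * C,      // fcproj
        L * B * T * C,      // residual3
        B * T * C,          // lnf
        B * T,              // lnf_mean
        B * T,              // lnf_rstd
        B * T * Vp,         // logits
        B * T * Vp,         // probs
        B * T               // losses
    };
    size_t num_activations = 0;
    for (int i = 0; i < 23; i++) { num_activations += act_sizes[i]; }
    printf("num_activations: %zu (~%.1f MiB as float32)\n",
           num_activations, (double)(num_activations * sizeof(float)) / (1024.0 * 1024.0));
    return 0;
}
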
float* malloc_and_point_activations(ActivationTensors* acts, size_t* act_sizes) { size_t num_activations = 0; for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++) { @@ -678,7 +708,6 @@ void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) { // read in model from a checkpoint file FILE *model_file = fopenCheck(checkpoint_path, "rb"); - if (model_file == NULL) { printf("Error opening model file\n"); exit(1); } int model_header[256]; freadCheck(model_header, sizeof(int), 256, model_file); if (model_header[0] != 20240326) { printf("Bad magic model file\n"); exit(1); } @@ -763,29 +792,7 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T) { model->batch_size = B; model->seq_len = T; // and now allocate the space - model->act_sizes[0] = B * T * C; // encoded - model->act_sizes[1] = L * B * T * C; // ln1 - model->act_sizes[2] = L * B * T; // ln1_mean - model->act_sizes[3] = L * B * T; // ln1_rstd - model->act_sizes[4] = L * B * T * 3*C; // qkv - model->act_sizes[5] = L * B * T * C; // atty - model->act_sizes[6] = L * B * NH * T * T; // preatt - model->act_sizes[7] = L * B * NH * T * T; // att - model->act_sizes[8] = L * B * T * C; // attproj - model->act_sizes[9] = L * B * T * C; // residual2 - model->act_sizes[10] = L * B * T * C; // ln2 - model->act_sizes[11] = L * B * T; // ln2_mean - model->act_sizes[12] = L * B * T; // ln2_rstd - model->act_sizes[13] = L * B * T * 4*C; // fch - model->act_sizes[14] = L * B * T * 4*C; // fch_gelu - model->act_sizes[15] = L * B * T * C; // fcproj - model->act_sizes[16] = L * B * T * C; // residual3 - model->act_sizes[17] = B * T * C; // lnf - model->act_sizes[18] = B * T; // lnf_mean - model->act_sizes[19] = B * T; // lnf_rstd - model->act_sizes[20] = B * T * Vp; // logits - model->act_sizes[21] = B * T * Vp; // probs - model->act_sizes[22] = B * T; // losses + fill_in_activation_sizes(model->act_sizes, model->config, B, T); size_t num_activations = 0; for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++) { num_activations += model->act_sizes[i]; @@ -1041,14 +1048,14 @@ void gpt2_free(GPT2 *model) { // ---------------------------------------------------------------------------- // sampler -unsigned int random_u32(unsigned long long *state) { +unsigned int random_u32(uint64_t *state) { // xorshift rng: https://en.wikipedia.org/wiki/Xorshift#xorshift.2A *state ^= *state >> 12; *state ^= *state << 25; *state ^= *state >> 27; return (*state * 0x2545F4914F6CDD1Dull) >> 32; } -float random_f32(unsigned long long *state) { // random float32 in [0,1) +float random_f32(uint64_t *state) { // random float32 in [0,1) return (random_u32(state) >> 8) / 16777216.0f; } @@ -1083,8 +1090,8 @@ int main() { int B = 4; // batch size 4 (i.e. 4 independent token sequences will be trained on) int T = 64; // sequence length 64 (i.e. each sequence is 64 tokens long). 
must be <= maxT, which is 1024 for GPT-2 DataLoader train_loader, val_loader; - dataloader_init(&train_loader, train_tokens, B, T, 0, 1); - dataloader_init(&val_loader, val_tokens, B, T, 0, 1); + dataloader_init(&train_loader, train_tokens, B, T, 0, 1, 1); + dataloader_init(&val_loader, val_tokens, B, T, 0, 1, 0); printf("train dataset num_batches: %zu\n", train_loader.num_tokens / (B*T)); printf("val dataset num_batches: %zu\n", val_loader.num_tokens / (B*T)); int val_num_batches = 5; @@ -1094,7 +1101,7 @@ int main() { tokenizer_init(&tokenizer, "gpt2_tokenizer.bin"); // some memory for generating samples from the model - unsigned long long rng_state = 1337; + uint64_t rng_state = 1337; int* gen_tokens = (int*)mallocCheck(B * T * sizeof(int)); const int genT = 64; // number of steps of inference we will do diff --git a/train_gpt2.cu b/train_gpt2.cu index 38aa986e8..16f801387 100644 --- a/train_gpt2.cu +++ b/train_gpt2.cu @@ -1,31 +1,17 @@ /* GPT-2 Transformer Neural Net training loop. See README.md for usage. */ - #include #include #include #include #include +#include +#include #include -#include -#include -#include -#include -// GPU / CUDA related -#include -#include -#include -#include -#include -// Multi-GPU related -#ifdef MULTI_GPU -#include -#include -#endif -// our own utilities +// ----------- CPU utilities ----------- // defines: fopenCheck, freadCheck, fcloseCheck, fseekCheck, mallocCheck -// defines: create_dir_if_not_exists, find_max_step +// defines: create_dir_if_not_exists, find_max_step, ends_with_bin #include "llmc/utils.h" // defines: tokenizer_init, tokenizer_decode, tokenizer_free #include "llmc/tokenizer.h" @@ -34,1865 +20,66 @@ GPT-2 Transformer Neural Net training loop. See README.md for usage. #include "llmc/dataloader.h" // defines: manual_seed, normal_ (same as torch.manual_seed and torch.normal) #include "llmc/rand.h" -// defines: sample_softmax +// defines: lr_scheduler_init, get_learning_rate +#include "llmc/schedulers.h" +// defines: sample_softmax, random_f32 #include "llmc/sampler.h" // defines: logger_init, logger_log_eval, logger_log_val, logger_log_train #include "llmc/logger.h" -// ---------------------------------------------------------------------------- -// CUDA precision settings - -enum PrecisionMode { - PRECISION_FP32, - PRECISION_FP16, - PRECISION_BF16 -}; - -// Specific configurations based on the enabled precision -#if defined(ENABLE_FP32) -typedef float floatX; -#define CUBLAS_LOWP CUDA_R_32F -#define PRECISION_MODE PRECISION_FP32 -#ifdef MULTI_GPU -const ncclDataType_t ncclFloatX = ncclFloat; -#endif - -// use fp16 (note: this may require gradient scaler, currently not implemented!) 
-#elif defined(ENABLE_FP16) -typedef half floatX; -#define CUBLAS_LOWP CUDA_R_16F -#define PRECISION_MODE PRECISION_FP16 -#ifdef MULTI_GPU -const ncclDataType_t ncclFloatX = ncclHalf; -#endif - -#else // Default to bfloat16 -typedef __nv_bfloat16 floatX; -#define CUBLAS_LOWP CUDA_R_16BF -#define PRECISION_MODE PRECISION_BF16 -#ifdef MULTI_GPU -const ncclDataType_t ncclFloatX = ncclBfloat16; -#endif -#endif - -// ---------------------------------------------------------------------------- -// CUDA utils - -// Profiler utils -class NvtxRange { - public: - NvtxRange(const char* s) { nvtxRangePush(s); } - NvtxRange(const std::string& base_str, int number) { - std::string range_string = base_str + " " + std::to_string(number); - nvtxRangePush(range_string.c_str()); - } - ~NvtxRange() { nvtxRangePop(); } -}; -#define NVTX_RANGE_FN() NvtxRange nvtx_range(__FUNCTION__) - -// try to make sure that 2 blocks fit on A100/H100 to maximise latency tolerance -// this needs to be defines rather than queried to be used for __launch_bounds__ -#if __CUDA_ARCH__ == 800 || __CUDA_ARCH__ >= 900 -#define MAX_1024_THREADS_BLOCKS 2 +// defines: get_flops_promised +#include "llmc/mfu.h" +// defines: OutlierDetector, init_detector, update_detector +#include "llmc/outlier_detector.h" +// ----------- GPU utilities ----------- +// defines: +// WARP_SIZE, MAX_1024_THREADS_BLOCKS, CEIL_DIV, cudaCheck, PRECISION_MODE +// NVTX_RANGE_FN +#include "llmc/cuda_common.h" +// defines: +// Packed128, f128, x128 +// warpReduceSum, warpReduceMax, blockReduce, copy_and_cast_kernel +#include "llmc/cuda_utils.cuh" +// defines: CUBLAS_LOWP, cublasCheck, cublaslt_workspace_size, cublaslt_workspace +// defines: cublas_compute, cublaslt_handle, cublas_handle +#include "llmc/cublas_common.h" +// ----------- Layer implementations in CUDA ----------- +// defines: encoder_forward, encoder_backward +#include "llmc/encoder.cuh" +// defines: layernorm_forward, residual_forward, fused_residual_forward5, layernorm_backward +#include "llmc/layernorm.cuh" +// defines: matmul_cublaslt, matmul_forward, matmul_backward, gelu_forward, gelu_backward_inplace +#include "llmc/matmul.cuh" +#ifdef ENABLE_CUDNN +// defines: create_cudnn, destroy_cudnn, attention_forward_cudnn, attention_backward_cudnn +#include "llmc/cudnn_att.h" #else -#define MAX_1024_THREADS_BLOCKS 1 -#endif - -// WarpSize is not a compile time constant, this allows the compiler to optimize -#define WARP_SIZE 32U - -// cuBLAS workspace. 
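// Standalone illustration (not part of the patch): the NvtxRange class above is an RAII
// push/pop wrapper around nvtxRangePush/nvtxRangePop. The hypothetical ScopedRange below
// sketches the same pattern with std::chrono so it builds without the NVTX headers.
#include <chrono>
#include <cstdio>
#include <string>

class ScopedRange {
  public:
    explicit ScopedRange(const std::string& name)
        : name_(name), start_(std::chrono::steady_clock::now()) {}
    ~ScopedRange() {
        auto us = std::chrono::duration_cast<std::chrono::microseconds>(
            std::chrono::steady_clock::now() - start_).count();
        std::printf("[range] %s: %lld us\n", name_.c_str(), (long long)us);
    }
  private:
    std::string name_;
    std::chrono::steady_clock::time_point start_;
};
// analogous to NVTX_RANGE_FN(): the range covers the enclosing function's scope
#define SCOPED_RANGE_FN() ScopedRange scoped_range_fn(__FUNCTION__)

int main() {
    SCOPED_RANGE_FN(); // prints the elapsed time of main() when the object is destroyed
    return 0;
}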
Hardcoding to 32MiB but only Hopper needs 32, for others 4 is OK -const size_t cublaslt_workspace_size = 32 * 1024 * 1024; -void* cublaslt_workspace = NULL; -cublasComputeType_t cublas_compute = CUBLAS_COMPUTE_32F; -cublasLtHandle_t cublaslt_handle; -cublasHandle_t cublas_handle; -cudaDeviceProp deviceProp; - -// convenience macro for calculating grid/block dimensions for kernels -#define CEIL_DIV(M, N) (((M) + (N)-1) / (N)) - -// CUDA error checking -void cudaCheck(cudaError_t error, const char *file, int line) { - if (error != cudaSuccess) { - printf("[CUDA ERROR] at file %s:%d:\n%s\n", file, line, cudaGetErrorString(error)); - exit(EXIT_FAILURE); - } -}; -#define cudaCheck(err) (cudaCheck(err, __FILE__, __LINE__)) - -// cuBLAS error checking -void cublasCheck(cublasStatus_t status, const char *file, int line) -{ - if (status != CUBLAS_STATUS_SUCCESS) { - printf("[cuBLAS ERROR]: %d %s %d\n", status, file, line); - exit(EXIT_FAILURE); - } -} -#define cublasCheck(status) { cublasCheck((status), __FILE__, __LINE__); } - -#ifdef MULTI_GPU -void nccl_check(ncclResult_t status, const char *file, int line) { - if (status != ncclSuccess) { - printf("[NCCL ERROR] at file %s:%d:\n%s\n", file, line, ncclGetErrorString(status)); - exit(EXIT_FAILURE); - } -} -#define ncclCheck(err) (nccl_check(err, __FILE__, __LINE__)) - -void mpi_check(int status, const char *file, int line) { - if (status != MPI_SUCCESS) { - char mpi_error[4096]; - int mpi_error_len = 0; - assert(MPI_Error_string(status, &mpi_error[0], &mpi_error_len) == MPI_SUCCESS); - printf("[MPI ERROR] at file %s:%d:\n%.*s\n", file, line, mpi_error_len, mpi_error); - exit(EXIT_FAILURE); - } -} -#define mpiCheck(err) (mpi_check(err, __FILE__, __LINE__)) +// defines: attention_forward, attention_backward +#include "llmc/attention.cuh" #endif - -// older nvcc does not provide __ldcs and __stcs for bfloat16, despite these actually just being unsigned shorts. -// we need to be careful here to only define our own versions if none already exist, otherwise the compiler will -// complain. 
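// Standalone illustration (not part of the patch): how the CEIL_DIV macro above is used to
// size a grid when each thread handles a 128-bit pack of elements. The concrete numbers
// (B=4, T=64, C=768, 256 threads, 8 BF16 elements per thread) are example values.
#include <stdio.h>
#define CEIL_DIV(M, N) (((M) + (N)-1) / (N))

int main(void) {
    int B = 4, T = 64, C = 768;
    int block_size = 256;
    int elements_per_thread = 8;            // 16 bytes per 128-bit load / 2 bytes per BF16
    int N = B * T * C;                      // total elements to process
    int grid_size = CEIL_DIV(N, block_size * elements_per_thread);
    printf("N=%d -> grid of %d blocks x %d threads\n", N, grid_size, block_size);
    return 0;
}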
-// If not, you easily get "no viable overload" (for sm52) and "function already exists" (sm_80) -#if defined(ENABLE_BF16) && (__CUDACC_VER_MAJOR__ < 12) && !((__CUDA_ARCH__ >= 800) || !defined(__CUDA_ARCH__)) -__device__ floatX __ldcs(const floatX* address) { - unsigned short bf = __ldcs(reinterpret_cast(address)); - return __nv_bfloat16_raw{bf}; -} - -__device__ void __stcs(floatX* address, floatX value) { - __stcs(reinterpret_cast(address), ((__nv_bfloat16_raw)value).x); -} -#endif - -// warp-level reduction for summing values -__device__ float warpReduceSum(float val) { - for (int offset = 16; offset > 0; offset /= 2) { - val += __shfl_xor_sync(0xFFFFFFFF, val, offset); - } - return val; -} -// warp-level reduction for finding the maximum value -__device__ float warpReduceMax(float val) { - for (int offset = 16; offset > 0; offset /= 2) { - val = fmaxf(val, __shfl_xor_sync(0xFFFFFFFF, val, offset)); - } - return val; -} -// requires all 32 threads in the warp to be active, but should work for any block size -// uses non-dynamic shared memory so every call increases shared memory requirements by 128 bytes -// the fact it's unique shared memory allows us to avoid an extra __syncthreads() call at the end -// but if called inside a loop, the shared memory will be implicitly reused, so set final_sync to 1 -using reduction_func_t = float (*) (float); -template -__device__ float blockReduce(float val, bool final_sync=false, float out_of_bounds=0.0f) { - // two reductions of up to 1024 threads: - // 1) inside warp (shuffle), 2) cross-warp (shared memory), 3) inside warp (shuffle) - __shared__ float shared_val[WARP_SIZE]; - const int lane_id = threadIdx.x % WARP_SIZE; - const int warp_id = threadIdx.x / WARP_SIZE; - const int num_warps = blockDim.x / WARP_SIZE; - - float warp_val = warp_reduction(val); - if (lane_id == 0) { shared_val[warp_id] = warp_val; } - __syncthreads(); - warp_val = (lane_id < num_warps) ? shared_val[lane_id] : out_of_bounds; - float block_val = warp_reduction(warp_val); - - if (final_sync) { - __syncthreads(); // only needed in loops when effectively reusing shared memory etc. - } - return block_val; -} - -// ---------------------------------------------------------------------------- -// Packed128 data structure, which forces the compiler to use 128-bit loads/stores -// in GPUs that support (the LDG.128 and STS.128 instructions) -// This is a bit similar to the use of float4 in the case of 32-bit floats, but -// supports arbitrary precision. 
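// Standalone illustration (not part of the patch): a minimal nvcc-compilable check of the
// __shfl_xor_sync butterfly pattern used by warpReduceSum above; error checking omitted,
// and it assumes a full warp of 32 active threads.
#include <cstdio>
#include <cuda_runtime.h>

__device__ float warp_reduce_sum(float val) {
    // butterfly reduction: after log2(32) = 5 steps every lane holds the full sum
    for (int offset = 16; offset > 0; offset /= 2) {
        val += __shfl_xor_sync(0xFFFFFFFF, val, offset);
    }
    return val;
}

__global__ void reduce_demo(float* out) {
    float sum = warp_reduce_sum((float)threadIdx.x); // sum of 0..31 = 496
    if (threadIdx.x == 0) { *out = sum; }
}

int main() {
    float *d_out, h_out;
    cudaMalloc(&d_out, sizeof(float));
    reduce_demo<<<1, 32>>>(d_out);
    cudaMemcpy(&h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost);
    printf("warp sum = %.0f (expected 496)\n", h_out);
    cudaFree(d_out);
    return 0;
}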
- -template -struct alignas(16) Packed128 { - Packed128() = default; - __device__ explicit Packed128(int4 bits) { - static_assert(sizeof(bits) == sizeof(payload), "Size mismatch."); - memcpy(&payload, &bits, sizeof(bits)); - } - - __device__ static Packed128 constant(ElementType value) { - Packed128 result; - for(int k = 0; k < size; ++k) { - result.payload[k] = value; - } - return result; - } - __device__ static Packed128 zeros() { - return constant(0); - } - __device__ static Packed128 ones() { - return constant(1); - } - - __device__ ElementType& operator[](int index) { - return payload[index]; - } - __device__ const ElementType& operator[](int index) const { - return payload[index]; - } - __device__ int4 get_bits() const { - int4 bits; - static_assert(sizeof(bits) == sizeof(payload), "Size mismatch."); - memcpy(&bits, &payload, sizeof(bits)); - return bits; - } - static constexpr const size_t size = sizeof(int4) / sizeof(ElementType); - ElementType payload[size]; -}; - -// load a Packed128 from an aligned memory address -template -__device__ Packed128 load128(const ElementType* address) { - return Packed128{*reinterpret_cast(address)}; -} -// load a Packed128 from an aligned memory address with streaming cache hint -template -__device__ Packed128 load128cs(const ElementType* address) { - return Packed128{__ldcs(reinterpret_cast(address))}; -} -// store a Packed128 to an aligned memory address -template -__device__ void store128(ElementType* target, Packed128 value) { - *reinterpret_cast(target) = value.get_bits(); -} -// store a Packed128 to an aligned memory address with streaming cache hint -template -__device__ void store128cs(ElementType* target, Packed128 value) { - __stcs(reinterpret_cast(target), value.get_bits()); -} -// store a Packed128 to an aligned memory address while caching in L2 but bypassing L1 -template -__device__ void store128cg(ElementType* target, Packed128 value) { - __stcg(reinterpret_cast(target), value.get_bits()); -} - -// short-form typedefs -typedef Packed128 f128; -typedef Packed128 x128; - -// ---------------------------------------------------------------------------- -// Random Number Generatiom - -// Simple xorshift RNG -__device__ __host__ unsigned int random_u32(unsigned long long *state) { - // xorshift rng: https://en.wikipedia.org/wiki/Xorshift#xorshift.2A - *state ^= *state >> 12; - *state ^= *state << 25; - *state ^= *state >> 27; - return (*state * 0x2545F4914F6CDD1Dull) >> 32; -} -__device__ __host__ float random_f32(unsigned long long *state) { // random float32 in [0,1) - return (random_u32(state) >> 8) / 16777216.0f; -} - -// SquirrelNoise5 - Squirrel's Raw Noise utilities (version 5) -// This gives us a random number from threadIdx/blockIdx + a single seed for the entire GPU -// todo - possibly overkill and we don't need such high quality random numbers? 
(tbd) -// http://eiserloh.net/noise/SquirrelNoise5.hpp -__device__ __host__ constexpr unsigned int SquirrelNoise5(int positionX, unsigned int seed) -{ - constexpr unsigned int SQ5_BIT_NOISE1 = 0xd2a80a3f; // 11010010101010000000101000111111 - constexpr unsigned int SQ5_BIT_NOISE2 = 0xa884f197; // 10101000100001001111000110010111 - constexpr unsigned int SQ5_BIT_NOISE3 = 0x6C736F4B; // 01101100011100110110111101001011 - constexpr unsigned int SQ5_BIT_NOISE4 = 0xB79F3ABB; // 10110111100111110011101010111011 - constexpr unsigned int SQ5_BIT_NOISE5 = 0x1b56c4f5; // 00011011010101101100010011110101 - unsigned int mangledBits = (unsigned int) positionX; - mangledBits *= SQ5_BIT_NOISE1; - mangledBits += seed; - mangledBits ^= (mangledBits >> 9); - mangledBits += SQ5_BIT_NOISE2; - mangledBits ^= (mangledBits >> 11); - mangledBits *= SQ5_BIT_NOISE3; - mangledBits ^= (mangledBits >> 13); - mangledBits += SQ5_BIT_NOISE4; - mangledBits ^= (mangledBits >> 15); - mangledBits *= SQ5_BIT_NOISE5; - mangledBits ^= (mangledBits >> 17); - return mangledBits; -} -__device__ __host__ constexpr unsigned int Get2dNoiseUint(int indexX, int indexY, unsigned int seed) -{ - constexpr int PRIME_NUMBER = 198491317; // Large prime number with non-boring bits - return SquirrelNoise5(indexX + (PRIME_NUMBER * indexY), seed); -} - -// stochastic rounding built on top of Squirel Noise above (with seed updated per step via xorshift) -__device__ __forceinline__ void stochastic_rounding(float in, __nv_bfloat16 *out, unsigned int seed) { - // todo - is this stochastic rounding *too good*? can we cut any corners? - unsigned int random = Get2dNoiseUint(threadIdx.x, blockIdx.x, seed); - unsigned int threshold = random & 0xFFFF; - unsigned int float_bits = __float_as_uint(in); - unsigned int rounded_bits = float_bits & 0x0000FFFF; - float_bits = (rounded_bits > threshold) ? (float_bits | 0xFFFF) : (float_bits & ~0xFFFF); - *out = __float2bfloat16_rn(__uint_as_float(float_bits)); -} -__device__ __forceinline__ void stochastic_rounding(float in, half *out, unsigned int random) { - *out = (float)in; // todo - implement this... -} -__device__ __forceinline__ void stochastic_rounding(float in, float *out, unsigned int random) { - *out = in; // dummy function for when floatX is float (FP32 mode) -} +// defines: fused_classifier +#include "llmc/fused_classifier.cuh" +// defines: adamw_kernel3 +#include "llmc/adamw.cuh" +// defines: global_norm_squared +#include "llmc/global_norm.cuh" +// ----------- Multi-GPU support ----------- +// defines: ncclFloatX, ncclCheck, MultiGpuConfig, ShardInfo +// defines: printf0, multi_gpu_config +// defines: multi_gpu_config_init, multi_gpu_config_free +// defines: set_zero_configs, multi_gpu_cpu_float_sum, multi_gpu_barrier +// defines: multi_gpu_get_shard_offset, multi_gpu_async_reduce_gradient +#include "llmc/zero.cuh" // ---------------------------------------------------------------------------- -// MPI / multi-processing setup - -// Parameters specific to training on multiple GPUs. -typedef struct { - int process_rank; // Rank of this process among all MPI processes. 0 if no multi-GPU. - int num_processes; // Total number of processes. 1 if no multi-GPU. - int local_device_idx; // This process GPU index on current machine. 0 if no multi-GPU. 
- - // Zero Redundancy Optimizer stage - https://fairscale.readthedocs.io/en/stable/deep_dive/oss_sdp_fsdp.html - // 0-Disabled - // 1-Optimizer State Sharding (OSS) - // 2-Optimizer + Gradient State Sharding (SDP) - // 3-Optimizer + Gradient + Horizontal Model Sharding (FSDP) - int zero_stage; - size_t shard_num_parameters; - size_t shard_offset; -#ifdef MULTI_GPU - ncclComm_t nccl_comm; // NCCL communication primitive, used for collective multi-GPU work. -#endif -} MultiGpuConfig; - -// one global variable to hold the multi-GPU configuration for this process -MultiGpuConfig multi_gpu_config; - -#ifdef MULTI_GPU -// Determine which GPU this process should use. -// Processes on the same machines use different GPU indicies. Processes on other machines don't. -// Copied from NCCL examples: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/examples.html#example-2-one-device-per-process-or-thread -int multi_gpu_get_local_device_idx(int process_rank, int num_processes) { - char hostname[1024]; - hostname[1023] = '\0'; - // All processes on the same machine will share the same hostname. - gethostname(hostname, 1023); - for (int i=0; i < 1024; i++) { - if (hostname[i] == '.') { - hostname[i] = '\0'; - break; - } - } - uint64_t hostname_hash = 5381; - for (int c = 0; hostname[c] != '\0'; c++){ hostname_hash = ((hostname_hash << 5) + hostname_hash) ^ hostname[c]; } - - // Distribute all hostname hashes to all processes. - uint64_t* all_hostsname_hashes = (uint64_t*)malloc(num_processes * sizeof(uint64_t)); - all_hostsname_hashes[process_rank] = hostname_hash; - mpiCheck(MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, all_hostsname_hashes, sizeof(uint64_t), MPI_BYTE, MPI_COMM_WORLD)); - - // Identify which GPU we need to use. - int local_device_idx = 0; - for (int current_process = 0; current_process < num_processes; ++current_process) { - if (current_process == process_rank) { - // Found my gpu, local_device_idx now has my target GPU index. - break; - } - if (all_hostsname_hashes[current_process] == all_hostsname_hashes[process_rank]) { - // This process ID runs on the same machine, but it's not me, skip this GPU - local_device_idx++; - } - } - - free(all_hostsname_hashes); - return local_device_idx; -} -#endif - -MultiGpuConfig multi_gpu_config_init(int *argc, char ***argv) { -#ifdef MULTI_GPU - // Initialize MPI. - MultiGpuConfig result; - mpiCheck(MPI_Init(argc, argv)); - mpiCheck(MPI_Comm_rank(MPI_COMM_WORLD, &result.process_rank)); - mpiCheck(MPI_Comm_size(MPI_COMM_WORLD, &result.num_processes)); - result.local_device_idx = multi_gpu_get_local_device_idx(result.process_rank, result.num_processes); - cudaCheck(cudaSetDevice(result.local_device_idx)); - ncclUniqueId nccl_id; - if (result.process_rank == 0) { - ncclCheck(ncclGetUniqueId(&nccl_id)); - } - mpiCheck(MPI_Bcast((void *)&nccl_id, sizeof(nccl_id), MPI_BYTE, 0, MPI_COMM_WORLD)); - ncclCheck(ncclCommInitRank(&result.nccl_comm, result.num_processes, nccl_id, result.process_rank)); - return result; -#else - printf("Multi-GPU support is disabled. 
Using a single GPU.\n"); - cudaCheck(cudaSetDevice(0)); - MultiGpuConfig result; - result.process_rank = 0; - result.num_processes = 1; - result.local_device_idx = 0; - return result; -#endif -} - -void multi_gpu_config_free(const MultiGpuConfig* multi_gpu_config) { -#ifdef MULTI_GPU - ncclCheck(ncclCommDestroy(multi_gpu_config->nccl_comm)); - mpiCheck(MPI_Finalize()); -#endif -} - -void multi_gpu_barrier(const MultiGpuConfig* multi_gpu_config) { -#ifdef MULTI_GPU - if (multi_gpu_config->num_processes > 1) { - mpiCheck(MPI_Barrier(MPI_COMM_WORLD)); - } -#endif -} - -// convenience function that only prints if the rank of process is zero -void printf0(const char *format, ...) { - if (multi_gpu_config.process_rank == 0) { - va_list args; - va_start(args, format); - vprintf(format, args); - va_end(args); - } -} - -void set_zero_configs(MultiGpuConfig* multi_gpu_config, int zero_stage, size_t total_parameters) { - - multi_gpu_config->zero_stage = 0; - multi_gpu_config->shard_num_parameters = total_parameters; - multi_gpu_config->shard_offset = 0; - - // Check the Zero Stage and define sharding parameters - if (zero_stage == 0) { - printf0("| Zero Optimization is disabled |\n"); - } - else if (zero_stage == 1) { - if (total_parameters % multi_gpu_config->num_processes != 0) { - printf0("| Zero Optimization is disabled, Can't equally partition parameters |\n"); - multi_gpu_config->zero_stage = 0; - } - else { - printf0("| Zero Stage1 is enabled |\n"); - multi_gpu_config->zero_stage = 1; - multi_gpu_config->shard_num_parameters = total_parameters / multi_gpu_config->num_processes; - multi_gpu_config->shard_offset = multi_gpu_config->process_rank * multi_gpu_config->shard_num_parameters; - } - } - else{ - printf0("| Disabling Zero Optimization, Zero Stage2 and Stage3 are not yet supported |\n"); - multi_gpu_config->zero_stage = 0; - } -} +// global vars for I/O +char filename_buffer[512]; // ---------------------------------------------------------------------------- -// cuDNN path -#ifdef ENABLE_CUDNN -// functions defined in cudnn_att.cu -void create_cudnn(); -void destroy_cudnn(); -void attention_forward_cudnn(floatX* out, // output: (B, T, NH, HS) - float* stats, // output for backward pass: (B, NH, T) - floatX* inp, // input: (B, T, 3, NH, HS) QKV - int B, int T, int NH, int C); - -void attention_backward_cudnn(floatX* dqkvr, // output - floatX* dout, floatX* qkvr, floatX* o, float* stats, // inputs - int B, int T, int NH, int C); -#else -void create_cudnn() {} -void destroy_cudnn() {} -#endif // ENABLE_CUDNN - -// ---------------------------------------------------------------------------- -// all the kernels -/* -Note that we are being clever in the backward pass to conserve memory. -All parameters use a += in the backward pass, so we can do gradient accumulation. -But all activations have = instead of += because these are faster (just read, no write). -This is okay for all activations except for those in the residual stream, where the -gradients have to add. We make sure that we do a += as necessary. -E.g., the layernorms are connected to the residuals so we += in layernorm backward. 
-*/ - -__global__ void encoder_forward_kernel3(floatX* out, - const int* inp, const floatX* wte, const floatX* wpe, - int B, int T, int C) { - int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; - int N = B * T * C; - if (idx >= N) { return; } - - int bt = idx / C; - int b = bt / T; - int t = bt % T; - int c = idx % C; - - int ix = inp[b * T + t]; - - floatX* out_btc = out + b * T * C + t * C + c; - const floatX* wte_ix = wte + ix * C + c; - const floatX* wpe_tc = wpe + t * C + c; - - x128 packed_out; - x128 wte128 = load128cs(wte_ix); - x128 wpe128 = load128cs(wpe_tc); - for (int k = 0; k < x128::size; k++) { - packed_out[k] = (floatX)((float)wte128[k] + (float)wpe128[k]); - } - store128(out_btc, packed_out); -} - -template -__global__ void wte_backward_kernel(floatX* dwte, - const int4* bucket_info, const int* workload_indices, const floatX* dout, const int* inp, - unsigned int seed, int B, int T, int C) { - // In order to be deterministic, we preprocess the inputs on the cpu into "buckets" - // Each bucket corresponds to (WARP_SIZE * x128::size) channels for a single vocabulary token - // Each thread handles x128::size channels, e.g. 256 per warp for BF16 - // Each block handles (BLOCK_SIZE / WARP_SIZE) elements in a single bucket in parallel - // If a bucket has less than 8 elements, some warps will return immediately - // If a bucket has more than 8 elements, we will loop over all of them - // The buckets are sorted on the CPU so the largest buckets start 1st - int bucket = blockIdx.x; - int warp_id = threadIdx.x / WARP_SIZE; - int lane_id = threadIdx.x % WARP_SIZE; - int c_per_warp = WARP_SIZE * x128::size; - - int bucket_start_idx = bucket_info[bucket].x; - int bucket_size = bucket_info[bucket].y; - int bucket_ix = bucket_info[bucket].z; - int c = bucket_info[bucket].w * c_per_warp + (lane_id * x128::size); - - // Each thread handles "x128::size" channels, so at fp8, each warp would handle 512 channels - // If C is not a multiple of this (e.g. 
768), some buckets/c_groups cannot use the entire warp - if (c >= C) { return; } - // Exit early if this is a small bucket and this warp doesn't have any items to process - if (warp_id >= bucket_size) { return; } - - float accum[x128::size] = {0.0f}; - __shared__ float accum_shared[x128::size * BLOCK_SIZE]; - - for(int item = warp_id; item < bucket_size; item += BLOCK_SIZE/WARP_SIZE) { - int bt = workload_indices[bucket_start_idx + item]; - - const floatX* dout_btc = dout + bt * C + c; - x128 packed_inp1 = load128cs(dout_btc); - for (int k = 0; k < packed_inp1.size; k++) { - accum[k] += (float)packed_inp1[k]; - } - } - - if (warp_id != 0) { - // we accumulate into warp 0, so only the other warps need to write to shared memory - for (int k = 0; k < x128::size; k++) { - accum_shared[threadIdx.x + k * BLOCK_SIZE] = accum[k]; - } - return; // only warp 0 is needed after writing to shared memory - } - - // Read dwte for warp 0 even if other warps are not finished yet to maximise latency tolerance - floatX* dwte_ix = dwte + bucket_ix * C + c; - x128 packed_in_out = load128(dwte_ix); - - // note: threads which have returned are considered synchronised by CUDA so no risk of deadlock - __syncthreads(); - - // Accumulate into warp 0's registers by reading the values of the other warps in shared memory - for (int i = threadIdx.x+WARP_SIZE; i < min(BLOCK_SIZE, bucket_size*WARP_SIZE); i += WARP_SIZE) { - for (int k = 0; k < x128::size; k++) { - accum[k] += accum_shared[i + k * BLOCK_SIZE]; - } - } - - // Add the result to dwte and write back to global memory (read-modify-write) - for (unsigned int k = 0; k < x128::size; k++) { - // We use stochastic rounding to go from FP32 to BF16 but the seed should be deterministic - stochastic_rounding(accum[k] + (float)packed_in_out[k], &packed_in_out[k], seed + k); - } - store128(dwte_ix, packed_in_out); -} - -__global__ void wpe_backward_kernel(floatX* dwpe, - const floatX* dout, const int* inp, - int B, int T, int C, unsigned int seed) { - // Each thread handles x128::size "channel positions", e.g. 256 per warp for BF16 - // For gpt2-124M BF16, C=768 and T=1024, so 3 warps per channel and 3072 warps in total - // For each "channel position" we sum the gradients for every batch at that C/T element - // This way each dwte element is only updated once, and the kernel is fully deterministic! 
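// Standalone illustration (not part of the patch): why the deterministic bucketing above
// matters. Float addition is not associative, so an atomicAdd-based accumulation whose order
// varies from run to run can change the result bit-for-bit. Tiny CPU demo:
#include <stdio.h>

int main(void) {
    float big = 1e8f, small = 3.0f;
    float a = (big + small) + small;  // small contributions are rounded away one at a time
    float b = big + (small + small);  // small contributions combined first survive rounding
    printf("a=%.1f b=%.1f equal=%d\n", a, b, a == b);
    return 0;
}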
- // The previous kernel was not deterministic, as batches were aggregated with atomicAdd - int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; - if (idx >= T * C) { return; } - - // if C is not a multiple of WARP_SIZE*x128::size, it's OK for some warps to handle multiple t - int t = idx / C; - int c = idx % C; - float accum[x128::size] = {0.0f}; - - for (int b = 0; b < B; b++) { - x128 packed_dout = load128cs(dout + (b * T * C) + (t * C) + c); // will never be read again - for (int k = 0; k < x128::size; k++) { - accum[k] += (float)packed_dout[k]; - } - } - - floatX* dwpe_tc = dwpe + (t * C) + c; - x128 packed_dwpe = load128(dwpe_tc); - for (unsigned int k = 0; k < x128::size; k++) { - // We use stochastic rounding to go from FP32 to BF16 but the seed should be deterministic - stochastic_rounding(accum[k] + (float)packed_dwpe[k], &packed_dwpe[k], seed + k); - } - store128(dwpe_tc, packed_dwpe); -} - -__global__ void layernorm_forward_kernel3(floatX* __restrict__ out, floatX* __restrict__ mean, floatX* __restrict__ rstd, - const floatX* __restrict__ inp, const floatX* __restrict__ weight, - const floatX* __restrict__ bias, int N, int C) { - int lane_id = threadIdx.x % WARP_SIZE; - int warp_id = threadIdx.x / WARP_SIZE; - int num_warps = blockDim.x / WARP_SIZE; - - int idx = blockIdx.x * num_warps + warp_id; - if(idx >= N) { return; } // guard - - // the row of input that this group of threads is responsible for - const floatX* x = inp + idx * C; - - // mean - float sum = 0.0f; - for (int i = lane_id; i < C; i += WARP_SIZE) { - sum += (float)x[i]; - } - sum = warpReduceSum(sum); - float m = sum / C; - if(lane_id == 0 && mean != nullptr) { - __stcs(mean + idx, (floatX)m); - } - - // rstd - sum = 0.0f; - for (int i = lane_id; i < C; i += WARP_SIZE) { - float diff = (float)x[i] - m; - sum += diff * diff; - } - sum = warpReduceSum(sum); - float s = rsqrtf(sum / C + 1e-5f); - if(lane_id == 0 && rstd != nullptr) { - __stcs(rstd + idx, (floatX)s); - } - - // final normalization and scaling by weight/bias - floatX* o = out + idx * C; - for (int c = lane_id; c < C; c += WARP_SIZE) { - // load and store using the .cs "streaming" hint to the compiler, - // indicating that this data will not be reused soon, and can be streamed through the caches - // this allows the threads to get more cache-hits for the (shared) weight and bias parameters - float n = s * ((float)__ldcs(x+c) - m); - __stcs(o+c, (floatX)(n * (float)weight[c] + (float)bias[c])); - } -} - -__global__ void fused_residual_forward_kernel5(floatX* residual, floatX* normed, floatX* mean, floatX* rstd, - const floatX* inp1, const floatX* inp2, - const floatX* weight, const floatX* bias, - int N, int C) { - assert(blockDim.x == WARP_SIZE); - - // load weights and biases into shared memory - // do this before we allow any threads to exit! 
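// Standalone illustration (not part of the patch): a plain-C reference for the per-row math in
// layernorm_forward_kernel3 above (mean, rstd with eps = 1e-5, then scale and shift). Useful
// as a mental model for the fused kernels, but not code from the repo.
#include <math.h>
#include <stdio.h>

void layernorm_forward_cpu(float* out, float* mean, float* rstd,
                           const float* inp, const float* weight, const float* bias,
                           int N, int C) {
    for (int i = 0; i < N; i++) {
        const float* x = inp + i * C;
        float m = 0.0f;
        for (int c = 0; c < C; c++) { m += x[c]; }
        m /= C;
        float v = 0.0f;
        for (int c = 0; c < C; c++) { float d = x[c] - m; v += d * d; }
        float s = 1.0f / sqrtf(v / C + 1e-5f);
        for (int c = 0; c < C; c++) {
            out[i * C + c] = s * (x[c] - m) * weight[c] + bias[c];
        }
        mean[i] = m; rstd[i] = s;
    }
}

int main(void) {
    float x[4] = {1.0f, 2.0f, 3.0f, 4.0f}, w[4] = {1, 1, 1, 1}, b[4] = {0, 0, 0, 0};
    float out[4], mean, rstd;
    layernorm_forward_cpu(out, &mean, &rstd, x, w, b, 1, 4);
    printf("mean=%.3f rstd=%.3f out[0]=%.3f\n", mean, rstd, out[0]);
    return 0;
}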
- extern __shared__ char* params[]; - // load128/store128 sometimes generated multiple instructions when the types here were floatX*, so - // let's keep everything as x128 - x128* s_weight = reinterpret_cast(params); - x128* s_bias = reinterpret_cast(params) + (C / x128::size); - x128* s_res = reinterpret_cast(params) + ((2 + threadIdx.y) * C / x128::size); - - int sidx = (threadIdx.x + WARP_SIZE * threadIdx.y) * x128::size; - for(int i = sidx; i < C; i += blockDim.y * WARP_SIZE * x128::size) { - s_weight[i/x128::size] = load128(weight + i); - s_bias[i/x128::size] = load128(bias + i); - } - __syncthreads(); - - int idx = blockIdx.x * blockDim.y + threadIdx.y; - if(idx > N) return; - - // adjust pointers to current token - residual += C * idx; - normed += C * idx; - inp1 += C * idx; - inp2 += C * idx; - - const float eps = 1e-5f; - float sum = 0.0f; - for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) { - const x128 in1 = load128cs(inp1 + c); - const x128 in2 = load128cs(inp2 + c); - x128 out; - for(int k = 0; k < x128::size; ++k) { - out[k] = (float)in1[k] + (float)in2[k]; - sum += (float)out[k]; - } - store128cs(residual + c, out); - s_res[c / x128::size] = out; - } - - sum = warpReduceSum(sum); - float m = sum / C; - float v = 0.f; - - for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) { - const x128 res = s_res[c / x128::size]; - for(int k = 0; k < x128::size; ++k) { - v += ((float)res[k] - m) * ((float)res[k] - m); - } - } - - v = warpReduceSum(v) / C; - float s = rsqrtf(v + eps); - - for(int c = threadIdx.x * x128::size; c < C; c += WARP_SIZE * x128::size) { - const x128 res = s_res[c / x128::size]; - const x128 w = s_weight[c / x128::size]; - const x128 b = s_bias[c / x128::size]; - x128 out; - for(int k = 0; k < x128::size; ++k) { - float n = s * ((float)res[k] - m); // normalized output - float o = n * (float)w[k] + (float)b[k]; // scale and shift it - out[k] = o; - } - - store128cs(normed + c, out); - } - // cache the mean and rstd for the backward pass later - if(threadIdx.x == 0) { - mean[idx] = m; - rstd[idx] = s; - } -} - - -// inputs floatX, outputs FP32 (for current FP32-only activation path for this WIP) -__global__ void permute_kernel(floatX* q, floatX* k, floatX* v, - const floatX* inp, - int B, int N, int NH, int d) { - // okay so now, this kernel wants Q,K,V to all be of shape (B, NH, N, d) - // but instead, we have a single tensor QKV (inp) of shape (B, N, 3, NH, d) - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= B * NH * N * d) { return; } - - // Q[b][nh_][n][d_] = inp[b][n][0][nh_][d_] - int b = idx / (NH * N * d); - int rest = idx % (NH * N * d); - int nh_ = rest / (N * d); - rest = rest % (N * d); - int n = rest / d; - int d_ = rest % d; - int inp_idx = (b * N * 3 * NH * d) + (n * 3 * NH * d) + (0 * NH * d) + (nh_ * d) + d_; - q[idx] = __ldcs(&inp[inp_idx]); - k[idx] = __ldcs(&inp[inp_idx + NH * d]); - v[idx] = __ldcs(&inp[inp_idx + 2 * (NH * d)]); -} - -__global__ void permute_kernel_backward(floatX* dinp, - const floatX* dq, const floatX* dk, const floatX* dv, - int B, int N, int NH, int d) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= B * NH * N * d) { return; } - - int b = idx / (NH * N * d); - int rest = idx % (NH * N * d); - int nh_ = rest / (N * d); - rest = rest % (N * d); - int n = rest / d; - int d_ = rest % d; - - int inp_idx = (b * N * 3 * NH * d) + (n * 3 * NH * d) + (0 * NH * d) + (nh_ * d) + d_; - dinp[inp_idx] = dq[idx]; - dinp[inp_idx + NH * d] = dk[idx]; - 
dinp[inp_idx + 2 * (NH * d)] = dv[idx]; -} - -__global__ void unpermute_kernel(floatX* inp, floatX *out, int B, int N, int NH, int d) { - // out has shape (B, nh, N, d) but we need to unpermute it to (B, N, nh, d) - - int idx = (blockIdx.x * blockDim.x + threadIdx.x); - // out[b][n][nh_][d_] <- inp[b][nh_][n][d_] - if (idx >= B * NH * N * d) { return; } - - int b = idx / (NH * N * d); - int rest = idx % (NH * N * d); - int nh_ = rest / (N * d); - rest = rest % (N * d); - int n = rest / d; - int d_ = rest % d; - int other_idx = (b * NH * N * d) + (n * NH * d) + (nh_ * d) + d_; - out[other_idx] = __ldcs(&inp[idx]); -} - -__global__ void unpermute_kernel_backward(floatX* dinp, const floatX *dout, int B, int N, int NH, int d) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= B * NH * N * d) { return; } - - int b = idx / (NH * N * d); - int rest = idx % (NH * N * d); - int nh_ = rest / (N * d); - rest = rest % (N * d); - int n = rest / d; - int d_ = rest % d; - int other_idx = (b * NH * N * d) + (n * NH * d) + (nh_ * d) + d_; - dinp[idx] = (floatX)dout[other_idx]; -} - -__global__ void softmax_forward_kernel5(floatX* out, float inv_temperature, const floatX* inp, int N, int T) { - // inp, out shape: (N, T, T), where N = B * NH - // fuses the multiplication by scale inside attention - // directly autoregressive, so we only compute the lower triangular part - // uses the online softmax algorithm - assert(T % 4 == 0); - int lane_id = threadIdx.x % WARP_SIZE; - int warp_id = threadIdx.x / WARP_SIZE; - int num_warps = blockDim.x / WARP_SIZE; - - // micro-optimization: we iterate backwards so that - // after the softmax backward operation completes, the cache retains the - // part of the matrix close to the upper left corner, which benefits the - // matmul operation that immediately follows. - // int idx = blockIdx.x * warp.meta_group_size() + warp.meta_group_rank(); // forward order - int idx = (gridDim.x - blockIdx.x - 1) * num_warps + warp_id; // backward order - if(idx >= N * T) { - return; - } - int own_pos = idx % T; - int pos_by_4 = own_pos / 4; - - // one row of inp, i.e. inp[idx, :] of shape (T,) - const floatX* x = inp + idx * T; - - // not INF, so we don't get NaNs accidentally when subtracting two values. - const float flt_max = 340282346638528859811704183484516925440.0f; // to avoid including float.h - float maxval = -flt_max; - float sumval = 0.0f; - - const floatX* x_aligned = reinterpret_cast(__builtin_assume_aligned(x, 16)); - for (int i = lane_id; i < pos_by_4; i += WARP_SIZE) { - float regarray[4]; - for (int k = 0; k < 4; ++k) { - regarray[k] = (float)x_aligned[4*i + k]; - } - float old_maxval = maxval; - for(int k = 0; k < 4; ++k) { - maxval = fmaxf(maxval, regarray[k]); - } - sumval *= expf(inv_temperature * (old_maxval - maxval)); - for(int k = 0; k < 4; ++k) { - sumval += expf(inv_temperature * (regarray[k] - maxval)); - } - } - - if(4*pos_by_4 + lane_id <= own_pos) { - float old_maxval = maxval; - maxval = fmaxf(maxval, (float)x[4*pos_by_4 + lane_id]); - sumval *= expf(inv_temperature * (old_maxval - maxval)); - sumval += expf(inv_temperature * ((float)x[4*pos_by_4 + lane_id] - maxval)); - } - - float global_maxval = warpReduceMax(maxval); - sumval *= expf(inv_temperature * (maxval - global_maxval)); - - float sum = warpReduceSum(sumval); - float norm = 1.f / sum; - - // divide the whole row by the sum - for (int i = lane_id; i <= own_pos; i += WARP_SIZE) { - // recalculation is faster than doing the round-trip through memory. 
- float ev = expf(inv_temperature * ((float)__ldcs(x + i) - global_maxval)); - __stcs(out + idx * T + i, (floatX)(ev * norm)); - } -} - -__global__ void residual_forward_kernel(floatX* out, const floatX* inp1, const floatX* inp2) { - int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; - - x128 packed_out; - x128 packed_inp1 = load128cs(inp1 + idx); - x128 packed_inp2 = load128cs(inp2 + idx); - for (int k = 0; k < packed_inp1.size; k++) { - packed_out[k] = (floatX)((float)packed_inp1[k] + (float)packed_inp2[k]); - } - store128(out + idx, packed_out); -} - -#define GELU_SCALING_FACTOR sqrtf(2.0f / M_PI) -__global__ void gelu_forward_kernel2(floatX* out, const floatX* inp) { - int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; - - x128 packed_out; - x128 packed_inp = load128cs(inp + idx); // load and do not keep in cache - for(int k = 0; k < packed_inp.size; ++k) { - float xi = (float)packed_inp[k]; - float cube = 0.044715f * xi * xi * xi; - packed_out[k] = (floatX)(0.5f * xi * (1.0f + tanhf(GELU_SCALING_FACTOR * (xi + cube)))); - } - // store instead of storecs (without cache streaming) in case it is useful for the - // data to be in the cache for the next operation after this GeLU - store128(out + idx, packed_out); -} - -__global__ void gelu_backward_kernel(floatX* dinp, const floatX* inp, const floatX* dout) { - int idx = (blockIdx.x * blockDim.x + threadIdx.x) * x128::size; - - x128 packed_dinp; - x128 packed_inp = load128cs(inp + idx); - x128 packed_dout = load128cs(dout + idx); - for (int k = 0; k < packed_inp.size; ++k) { - float x = (float)packed_inp[k]; - float cube = 0.044715f * x * x * x; - float tanh_arg = GELU_SCALING_FACTOR * (x + cube); - float tanh_out = tanhf(tanh_arg); - float coshf_out = coshf(tanh_arg); - float sech_out = 1.0f / (coshf_out * coshf_out); - float local_grad = 0.5f * (1.0f + tanh_out) + x * 0.5f * sech_out * GELU_SCALING_FACTOR * (1.0f + 3.0f * 0.044715f * x * x); - packed_dinp[k] = (floatX)(local_grad * (float)packed_dout[k]); - } - store128(dinp + idx, packed_dinp); -} - -template -__global__ void matmul_backward_bias_kernel9(OutFloat* dbias, const floatX* dout, int B, int T, int OC, - std::bool_constant) { - constexpr const int bdx = 4; - constexpr const int bdy = WARP_SIZE / bdx; - assert(blockDim.x == bdx); - assert(blockDim.y == bdy); - - int warp_d = (int)threadIdx.x; - int warp_c = (int)threadIdx.y; - int block_d = (int)threadIdx.z; - - const int OC_per_warp = bdy * x128::size; // 64 at BF16 - - int local_oc = warp_c * x128::size; - int global_oc = blockIdx.x * OC_per_warp + local_oc; - - int local_bt = warp_d + bdx * block_d; - int bt_per_block = bdx * blockDim.z; - - float accumulators[x128::size]; - for (int k = 0; k < x128::size; k++) { - accumulators[k] = 0.0f; - } - - if(global_oc < OC) { - // sum up over all bt within registers - for (int idx = blockIdx.y * bt_per_block + local_bt; idx < B * T; idx += gridDim.y * bt_per_block) { - x128 packed_dout = load128(dout + global_oc + idx*OC); - for (int k = 0; k < x128::size; k++) { - accumulators[k] += (float)packed_dout[k]; - } - } - } - - __shared__ float sub_results[x128::size][WARP_SIZE][bdy]; - - // reduce within-warp results - for (int k = 0; k < x128::size; k++) { - float v = accumulators[k]; - v += __shfl_down_sync(0xffffffff, v, 1, 4); - v += __shfl_down_sync(0xffffffff, v, 2, 4); - if(warp_d == 0) { - sub_results[k][block_d][warp_c] = v; - } - } - __syncthreads(); - - // block-wide reductions - for (int k = block_d; k < x128::size; k += blockDim.z) { - float a = 0.f; 
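// Standalone illustration (not part of the patch): CPU reference for the tanh-approximation
// GeLU and its gradient, matching the formulas in gelu_forward_kernel2 / gelu_backward_kernel
// above (the kernel computes sech^2 via 1/cosh^2; 1 - tanh^2 is the same quantity).
#include <math.h>
#include <stdio.h>
#ifndef M_PI
#define M_PI 3.14159265358979323846
#endif

static float gelu_forward_cpu(float x) {
    float cube = 0.044715f * x * x * x;
    return 0.5f * x * (1.0f + tanhf(sqrtf(2.0f / (float)M_PI) * (x + cube)));
}

static float gelu_grad_cpu(float x) {
    float k = sqrtf(2.0f / (float)M_PI);
    float cube = 0.044715f * x * x * x;
    float tanh_out = tanhf(k * (x + cube));
    float sech2 = 1.0f - tanh_out * tanh_out;
    return 0.5f * (1.0f + tanh_out) + x * 0.5f * sech2 * k * (1.0f + 3.0f * 0.044715f * x * x);
}

int main(void) {
    for (float x = -2.0f; x <= 2.01f; x += 1.0f) {
        printf("x=% .1f  gelu=% .4f  dgelu=% .4f\n", x, gelu_forward_cpu(x), gelu_grad_cpu(x));
    }
    return 0;
}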
- for (int r = warp_d; r < blockDim.z; r += bdx) { - float v = sub_results[k][r][warp_c]; - v += __shfl_down_sync(0xffffffff, v, 1, 4); - v += __shfl_down_sync(0xffffffff, v, 2, 4); - a += v; - } - if(warp_d == 0 && global_oc < OC) { - if constexpr (!UseAuxBuffer) { - dbias[global_oc + k] = (OutFloat)(a + (float)dbias[global_oc + k]); - } else { - dbias[global_oc + k + blockIdx.y * OC] = a; - } - } - } -} - -__global__ void reduce_add_sum_kernel(floatX* dst, const float* src, size_t n, size_t m) { - const size_t idx = (blockIdx.x * blockDim.x + threadIdx.x) * f128::size; - assert(n % x128::size == 0); - if (idx < n) { - f128 acc; - for(int k = 0; k < f128::size; ++k) { - acc[k] = 0.f; - } - - for(int l = 0; l < m; ++l) { - f128 s = load128(src + idx + n * l); - for(int k = 0; k < f128::size; ++k) { - acc[k] += s[k]; - } - } - for(int k = 0; k < f128::size; ++k) { - dst[idx + k] = (floatX) ((float)dst[idx + k] + acc[k]); - } - } -} - -__global__ void __launch_bounds__(512, 2) // todo - any warnings on Turing with only 1024 threads? - layernorm_backward_kernel10(floatX* dinp, floatX* dweight, floatX* dbias, float* scratch, - const floatX* dout, const floatX* inp, const floatX* weight, - const floatX* mean, const floatX* rstd, - int B, int T, int C) { - int BLOCK_SIZE = blockDim.x; - int warpsInBlock = BLOCK_SIZE / WARP_SIZE; //number of warps in block - extern __shared__ float shared[]; - - int warpId = threadIdx.x / WARP_SIZE; // warp index within a block - int baseIdx = blockIdx.x * warpsInBlock + warpId; - int warpThreadIdx = threadIdx.x % WARP_SIZE; // Thread index within the warp - int warpsInGrid = gridDim.x * warpsInBlock; - int C_per_iteration = WARP_SIZE * x128::size; - int iterations_C = CEIL_DIV(C, C_per_iteration); // + 2; - - // the first half of shared memory is bias, second is weight - size_t rounded_C = CEIL_DIV(C, (32 * x128::size)) * (32 * x128::size); - float* dbias_shared = shared; - float* dweight_shared = shared + rounded_C; - // warp zero doesn't actually write to the _tmp_shared memory locations, so we don't need to reserve memory - // the obvious solution is to change the addressing below to use (threadId.x-32) as offset, but that causes - // register spills, so instead we mess with the base pointer here, which doesn't increase register usage. 
- float* dbias_tmp_shared = shared + 2 * rounded_C - WARP_SIZE * f128::size; - float* dweight_tmp_shared = shared + 2 * rounded_C + f128::size * BLOCK_SIZE - 2 * WARP_SIZE * f128::size; - - // init shared memory to zero - for(int i = threadIdx.x * f128::size; i < rounded_C; i += BLOCK_SIZE * f128::size) { - store128(dbias_shared + i, f128::zeros()); - store128(dweight_shared + i, f128::zeros()); - } - __syncthreads(); - - for (int bt = baseIdx; bt < B * T; bt += warpsInGrid) { - const floatX* dout_bt = dout + bt * C; - const floatX* inp_bt = inp +bt * C; - floatX* dinp_bt = dinp + bt * C; - - // first: two reduce operations - float dnorm_mean = 0.0f; - float dnorm_norm_mean = 0.0f; - for (int i = warpThreadIdx * x128::size; i < C; i += WARP_SIZE * x128::size) { - x128 dout128_i = load128(dout_bt + i); - x128 inp128_i = load128(inp_bt + i); - x128 weight128_i = load128(weight + i); - for (int k = 0; k < x128::size; k++) { - float dnorm_i = (float)weight128_i[k] * (float)dout128_i[k]; - dnorm_mean += dnorm_i; - dnorm_norm_mean += dnorm_i * (float)inp128_i[k]; - } - } - - const float mean_bt = (float)mean[bt]; - const float rstd_bt = (float)rstd[bt]; - dnorm_mean = warpReduceSum(dnorm_mean) / C; - dnorm_norm_mean = warpReduceSum(dnorm_norm_mean) / C * rstd_bt - dnorm_mean * mean_bt * rstd_bt; - - for (int c = 0; c < iterations_C; c++) { - int global_index = (warpThreadIdx * x128::size) + (c * C_per_iteration); - - x128 dout128 = x128::zeros(); - x128 inp128 = x128::zeros(); - x128 dinp128 = x128::zeros(); - x128 weight128 = x128::zeros(); - - if(global_index < C) { - dout128 = load128cs(dout_bt + global_index); - inp128 = load128cs(inp_bt + global_index); - dinp128 = load128(dinp_bt + global_index); - weight128 = load128(weight + global_index); - } - - for(int o = 0; o < x128::size / f128::size; ++o) { - f128 dbias_f; - f128 dweight_f; - for(int i = 0; i < f128::size; ++i) { - int x = o * f128::size + i; - float dout_i = (float)dout128[x]; - float norm_bti = ((float)inp128[x] - mean_bt) * rstd_bt; - dbias_f[i] = dout_i; - dweight_f[i] = norm_bti * dout_i; - - float dval = 0.0f; - dval += (float) weight128[x] * (float)dout128[x]; // term 1 - dval -= dnorm_mean; // term 2 - dval -= norm_bti * dnorm_norm_mean; // term 3 - dval *= rstd_bt; // final scale - dinp128[x] = (floatX) ((float) dinp128[x] + dval); - } - - if (warpId != 0) { - store128(dbias_tmp_shared + threadIdx.x * f128::size, dbias_f); - // this seems to generate a 64-bit store, instead of 128-bit. - // however, forcing 128-bit (e.g., using inline ptx), results in register - // spilling and much worse performance, so we'll keep it like this for now - // but ideally, we could reduce the register pressure a little. 
- store128(dweight_tmp_shared + threadIdx.x * f128::size, dweight_f); - } - __syncthreads(); - if (warpId == 0) { - for (int j = 1; j < warpsInBlock; j++) { - f128 dbias_tmp = load128(dbias_tmp_shared + f128::size * (threadIdx.x + j * WARP_SIZE)); - f128 dweight_tmp = load128(dweight_tmp_shared + f128::size * (threadIdx.x + j * WARP_SIZE)); - for(int i = 0; i < f128::size; ++i) { - dbias_f[i] += dbias_tmp[i]; - dweight_f[i] += dweight_tmp[i]; - } - } - } - __syncthreads(); - if (warpId == 0) { - f128 db_old = load128(dbias_shared + global_index + f128::size * o); - f128 dw_old = load128(dweight_shared + global_index + f128::size * o); - for(int i = 0; i < f128::size; ++i) { - dbias_f[i] += db_old[i]; - dweight_f[i] += dw_old[i]; - } - store128(dbias_shared + global_index + f128::size * o, dbias_f); - store128(dweight_shared + global_index + f128::size * o, dweight_f); - } - } - if(global_index < C) { - // cache in L2 as this is read by the next kernel, but bypass L1 to minimise thrashing - store128cg(dinp_bt + global_index, dinp128); - } - } - } - __syncthreads(); - // Each block writes its partial sum to global memory - // The last block to finish becomes responsible for summing up all the partial sums - // This is done by atomically incrementing a flag (cleared to 0 before launching the kernel) - unsigned int* scratchFlag = (unsigned int*)(scratch); - // Increment scratch pointer by a full cacheline so that everything remains cacheline aligned - scratch += 32; - float* scratch_dbias = scratch; - float* scratch_dweight = scratch + C; - for(int i = threadIdx.x * f128::size; i < C; i += BLOCK_SIZE * f128::size) { - // Write to global memory in the same "shared memory banking friendly" order - store128(scratch_dbias + i + 2*C*blockIdx.x, load128(dbias_shared + i)); - store128(scratch_dweight + i + 2*C*blockIdx.x, load128(dweight_shared + i)); - } - __syncthreads(); - // that portion of shared memory is no longer used, so we can repurpose it for the scratch flag. - unsigned int *tmp_flag = (unsigned int*)(shared + 2*rounded_C); - if (threadIdx.x == 0) { - *tmp_flag = atomicInc(scratchFlag, gridDim.x); - } - __syncthreads(); - if (*tmp_flag == gridDim.x-1) { - // Reduction of the partial sums by the final block - // todo - there isn't enough parallelism even inside that single SM... - // ==> so could maybe split into another kernel with YET ANOTHER level of reduction?! - for(int i = threadIdx.x * f128::size; i < C; i += BLOCK_SIZE * f128::size) { - f128 dbias_accum = f128::zeros(); - f128 dweight_accum = f128::zeros(); - - for (int read_block_idx = 0; read_block_idx < gridDim.x; read_block_idx++) { - int offset = i + 2*C*read_block_idx; - f128 dbias128 = load128(scratch_dbias + offset); - f128 dweight128 = load128(scratch_dweight + offset); - for(int k = 0; k < f128::size; k++) { - dbias_accum[k] += dbias128[k]; - dweight_accum[k] += dweight128[k]; - } - } - store128(dbias_shared + i, dbias_accum); - store128(dweight_shared + i, dweight_accum); - } - __syncthreads(); - - // convert from float/FP32 to floatX/BF16 for the final write - // this is separate because it cannot use as many warps as the above (f128 vs x128) - // todo - if we split this code into another kernel, we could maybe do it at the same time? 
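// Standalone illustration (not part of the patch): the per-element formula that
// layernorm_backward_kernel10 above implements, written out for a single (b,t) row in plain C
// with FP32 accumulation. Function name and the tiny main() are only for the demo.
#include <math.h>
#include <stdio.h>

void layernorm_backward_row_cpu(float* dinp, float* dweight, float* dbias,
                                const float* dout, const float* inp,
                                const float* weight, float mean, float rstd, int C) {
    // two reductions first, matching the kernel
    float dnorm_mean = 0.0f, dnorm_norm_mean = 0.0f;
    for (int c = 0; c < C; c++) {
        float dnorm = weight[c] * dout[c];
        dnorm_mean += dnorm;
        dnorm_norm_mean += dnorm * inp[c];
    }
    dnorm_mean /= C;
    dnorm_norm_mean = dnorm_norm_mean / C * rstd - dnorm_mean * mean * rstd;
    // then the three-term gradient per channel, all scaled by rstd
    for (int c = 0; c < C; c++) {
        float norm = (inp[c] - mean) * rstd;
        dbias[c]   += dout[c];
        dweight[c] += norm * dout[c];
        dinp[c]    += rstd * (weight[c] * dout[c] - dnorm_mean - norm * dnorm_norm_mean);
    }
}

int main(void) {
    float inp[2] = {1.0f, 3.0f}, dout[2] = {0.5f, -0.5f}, w[2] = {1.0f, 1.0f};
    float mean = 2.0f, rstd = 1.0f / sqrtf(1.0f + 1e-5f);
    float dinp[2] = {0}, dw[2] = {0}, db[2] = {0};
    layernorm_backward_row_cpu(dinp, dw, db, dout, inp, w, mean, rstd, 2);
    printf("dinp = %f %f\n", dinp[0], dinp[1]);
    return 0;
}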
- for (int c = warpId; c < iterations_C; c += warpsInBlock) { - int global_index = (warpThreadIdx * x128::size) + (c * C_per_iteration); - if (global_index >= C) { - break; - } - - x128 dbias128 = load128(dbias + global_index); - x128 dweight128 = load128(dweight + global_index); - for(int o = 0; o < x128::size / f128::size; ++o) { - f128 s_db = load128(dbias_shared + global_index + o * f128::size); - f128 s_dw = load128(dweight_shared + global_index + o * f128::size); - for(int i = 0; i < f128::size; ++i) { - int x = o * f128::size + i; - dbias128[x] = (floatX)(s_db[i] + (float)dbias128[x]); - dweight128[x] = (floatX)(s_dw[i] + (float)dweight128[x]); - } - } - store128(dbias + global_index, dbias128); - store128(dweight + global_index, dweight128); - } - } -} - -__global__ void softmax_autoregressive_backward_kernel(floatX* dpreatt, const floatX* datt, const floatX* att, - int B, int T, int C, float scale) { - constexpr const int BlockSize = 256; - constexpr int T_per_block = 4; - - // go through blocks in reverse order, so the slowest block starts first - int t0 = T - 1 - T_per_block*blockIdx.x; - int idx = blockIdx.y; - - att += idx * T * T; - datt += idx * T * T; - dpreatt += idx * T * T; - - for(int to = 0; to < T_per_block; ++to) { - int t = t0 - to; - if(t < 0) return; - const floatX* att_bth = att + t * T; - const floatX* datt_bth = datt + t * T; - floatX* dpreatt_bth = dpreatt + t * T; - - float local_sum = 0; - for (int t2 = threadIdx.x; t2 <= t; t2 += BlockSize) { - local_sum += (float)att_bth[t2] * (float)datt_bth[t2]; - } - - local_sum = blockReduce(local_sum); - - for (int t3 = threadIdx.x; t3 <= t; t3 += BlockSize) { - // don't touch the cache. Some parts will still be here from the previous loop, and - // we want to exploit those. - float acc = (float)__ldcs(att_bth + t3) * ((float)__ldcs(datt_bth + t3) - local_sum); - __stcs(dpreatt_bth + t3, (floatX)(scale * acc)); - } - } -} - -// Implements linear interpolation using only two floating-point operations (as opposed to three in a naive implementation). -// Reference: https://developer.nvidia.com/blog/lerp-faster-cuda -__device__ float lerp(float start, float end, float weight) { - return fma(weight, end, fma(-weight, start, start)); -} - -template -__global__ void adamw_kernel3(Tp* params_memory, float* master_params_memory, Tg* grads_memory, float* m_memory, float* v_memory, size_t num_parameters, - float learning_rate, float beta1, float beta2, float beta1_correction, float beta2_correction, float eps, float weight_decay, - float grad_scale, unsigned int seed) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - if (idx >= num_parameters) { return; } // guard - - // get the gradient, m, and v for this parameter - float grad = grad_scale * (float)grads_memory[idx]; - float m = m_memory[idx]; - float v = v_memory[idx]; - // update the first moment (momentum) - m = lerp(grad, m, beta1); - m_memory[idx] = m; - // update the second moment (RMSprop) - v = lerp(grad * grad, v, beta2); - v_memory[idx] = v; - m /= beta1_correction; // m_hat - v /= beta2_correction; // v_hat - // fetch the old value of this parameter as a float, from either source - float old_param = (master_params_memory != NULL) ? 
master_params_memory[idx] : (float)params_memory[idx]; - // update this parameter - float param = old_param - (learning_rate * (m / (sqrtf(v) + eps) + weight_decay * old_param)); - // update our low precision version of the parameters using stochastic rounding - // this will be used in the next forward pass - // TODO: simply doing `params_memory[i] = (floatX)param;` breaks everything (why?) - unsigned int random = Get2dNoiseUint(threadIdx.x, blockIdx.x, seed); - stochastic_rounding(param, ¶ms_memory[idx], random); - // write the full, float version of the param into our master copy, if we maintain one - // this will be used in the next update - if (master_params_memory != NULL) { master_params_memory[idx] = param; } -} - -template -__global__ void global_norm_squared_kernel(float* out, const T* data, size_t count) { - // we want as few atomics as possible, so each block tries to do - // the maximum amount of work (so no fixed chunk, but instead iterating - // until we run out of data), and then we reduce inside the block - // and finally have just one atomic per block. - // out will be updated atomically from all thread blocks. It is a float, so the - // atomic op is unproblematic - size_t index = threadIdx.x + blockDim.x * blockIdx.x; - size_t grid_width = blockDim.x * gridDim.x; - float accumulator = 0.f; - for(size_t i = index; i < count; i += grid_width) { - accumulator += (float)data[i] * (float)data[i]; - } - // warp-level reduce - float block_sum = blockReduce(accumulator); - if(threadIdx.x == 0) { - atomicAdd(out, block_sum); - } -} - -struct SoftmaxParams { - float Scale; - float Offset; -}; - -__device__ SoftmaxParams prepare_softmax_blockwide3(int64_t idx, const floatX* inp, int V, int P) { - // same but not float4 - // one row of inp, i.e. inp[idx, :] of shape (V,) - - const floatX* x = inp + idx * P; - float thread_maxval = -INFINITY; - float thread_sumval = 0.0f; - int i = (V+x128::size-1)/x128::size + threadIdx.x - blockDim.x; - - // special-case loop to handle the unaligned elements at the end of the array - // this lets us skip the bounds check in the main loop below, which improves performance - while ((i+1)*x128::size > V) { - for(int k = 0; k < x128::size; ++k) { - if (i*x128::size+k >= V) { - break; // bounds checking against real V (rather than padded P) - } - float v = (float)x[i*x128::size+k]; - float old_maxval = thread_maxval; - thread_maxval = fmaxf(thread_maxval, v); - thread_sumval *= expf((old_maxval - thread_maxval)); - thread_sumval += expf(v - thread_maxval); - } - i -= blockDim.x; - } - - // main loop for the bulk of the iterations (no bounds checking required!) 
- for (; i >= 0; i -= blockDim.x) { - x128 packed_x = load128(x + i * x128::size); // load and keep in cache until fused_classifier loop - for(int k = 0; k < x128::size; ++k) { - float v = (float)packed_x[k]; - float old_maxval = thread_maxval; - thread_maxval = fmaxf(thread_maxval, v); - thread_sumval *= expf((old_maxval - thread_maxval)); - thread_sumval += expf(v - thread_maxval); - } - } - - // Block Max Reduction -> Maths -> Block Sum Reduction - float block_maxval = blockReduce(thread_maxval, false, -INFINITY); - thread_sumval *= expf(thread_maxval - block_maxval); - float block_sumval = blockReduce(thread_sumval); - - // return the softmax parameters - return SoftmaxParams{1.f / block_sumval, block_maxval}; -} - -// will _update_ logits to logit gradients -// uses template to decide whether to write logits and probs -// split both loops in "multiple-of-x128-size" and "bounds-checked remainder" parts -template -__global__ void __launch_bounds__(1024, MAX_1024_THREADS_BLOCKS) - fused_classifier_kernel5(floatX* logits, floatX* losses, floatX* probs, - const float dloss, const int* targets, - int B, int T, int V, int P) { - // note: idx is small enough that it easily fits into 32 bit; - // by making it a long here, we ensure that any offsets calculated with it (e.g., idx * P) - // are done is 64 bit - int64_t idx = gridDim.x - (blockIdx.x+1); // reverse order for cache hits on matmul data - int ix = targets[idx]; - - // softmax (reading B * T * V, same logits read again below, hopefully still in cache) - SoftmaxParams sp = prepare_softmax_blockwide3(idx, logits, V, P); - - // calculate the probability needed for the loss and update (single-threaded) - if(threadIdx.x == 0) { - float prob = expf((float)logits[idx * P + ix] - sp.Offset) * sp.Scale; - losses[idx] = (floatX)(-logf(prob)); - } - - // calculate the gradients directly, saves bandwidth from probs during training - // but also supports writing probs for inference-only and debugging - const floatX* logits_vec = logits + idx * P; - for (int i = threadIdx.x; i < V/x128::size; i += blockDim.x) { - // this is the 2nd read of logits after the one in prepare_softmax2 - // it will be overwritten by the logits gradients which is when we reduce cache persistence - x128 packed_logits_vec = load128(logits_vec + i * x128::size); // rely on cs of store128cs - x128 packed_probs; - for(int k = 0; k < x128::size; ++k) { - int element = i*x128::size + k; - float prob = expf((float)packed_logits_vec[k] - sp.Offset) * sp.Scale; - packed_probs[k] = (floatX)prob; - float indicator = (element == ix) ? 1.0f : 0.0f; - packed_logits_vec[k] = (floatX)((prob - indicator) * dloss); - } - if (WriteLogits){ - // reduce cache persistence for the overwritten logits - // to maximise probability that logits remain in cache between prepare_softmax and here - store128cs(logits + idx * P + i * x128::size, packed_logits_vec); - } - if (WriteProbs) { - store128(probs + idx * P + i * x128::size, packed_probs); - } - } - - // handle remaining elements after the last multiple of x128::size - // e.g. if V = 8003, and x128::size = 8, we need to handle the last 3 elements - int unaligned_start = V & ~(x128::size - 1); // round down to multiple of x128::size - for (int i = threadIdx.x + unaligned_start; i < V; i++) { - float prob = expf((float)logits_vec[i] - sp.Offset) * sp.Scale; - float indicator = (i == ix) ? 
1.0f : 0.0f; - float dlogit = (prob - indicator) * dloss; - if (WriteLogits){ - __stcs(logits + idx * P + i, (floatX)dlogit); - } - if (WriteProbs) { - probs[idx * P + i] = (floatX)prob; - } - } -} - -// device functions and the kernel to cast data between types -template -__device__ Td cast_value(Ts val); - -template<> -__device__ float cast_value(float val) { - return val; -} - -template<> -__device__ float cast_value(half val) { - return __half2float(val); -} - -template<> -__device__ float cast_value(__nv_bfloat16 val) { - return __bfloat162float(val); -} - -template -__global__ void copy_and_cast_kernel(Td* dst, const Ts* src, size_t n) { - int idx = blockIdx.x * blockDim.x + threadIdx.x; - // need to try grid stride looping for more perf later - if (idx < n) { - dst[idx] = cast_value(src[idx]); - } -} - -// ---------------------------------------------------------------------------- -// kernel launchers - -void encoder_forward(floatX* out, - const int* inp, const floatX* wte, const floatX* wpe, - int B, int T, int C) { - NVTX_RANGE_FN(); - const int block_size = 256; - const int N = B * T * C; - const int grid_size = CEIL_DIV(N, (int)(block_size * x128::size)); - encoder_forward_kernel3<<>>(out, inp, wte, wpe, B, T, C); - cudaCheck(cudaGetLastError()); -} - -// Fully deterministic (see comments in wte_backward_kernel and wpe_backward_kernel for more details) -void encoder_backward(floatX* dwte, floatX* dwpe, floatX* scratch, // gpu outputs & scratch - int* workload_indices, int4* bucket_info, // cpu scratch buffers - const floatX* dout, const int* inp, const int* inputs_cpu, // cpu/gpu inputs - int B, int T, int C, unsigned int seed) { - NVTX_RANGE_FN(); - - // Launch wpe kernel first (so it runs on the GPU in parallel with the CPU pre-processing for wte) - const int block_size = 256; - const int N = T * C / x128::size; - const int grid_size = CEIL_DIV(N, block_size); - wpe_backward_kernel<<>>(dwpe, dout, inp, B, T, C, seed); - cudaCheck(cudaGetLastError()); - - // check the GPU scratch buffer is large enough to hold the bucket info and workload indices - // todo - this is trivially true given hardcoded scratch buffer size here, is this useful? 
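// Standalone illustration (not part of the patch): the "online softmax" trick used by
// softmax_forward_kernel5 and prepare_softmax_blockwide3 above. A single pass keeps a running
// max and rescales the running sum whenever the max grows, so no separate max pass is needed.
#include <math.h>
#include <stdio.h>

void online_softmax(float* out, const float* x, int n) {
    float maxval = -INFINITY, sum = 0.0f;
    for (int i = 0; i < n; i++) {
        if (x[i] > maxval) {
            sum *= expf(maxval - x[i]);  // re-normalize the partial sum to the new max
            maxval = x[i];
        }
        sum += expf(x[i] - maxval);
    }
    for (int i = 0; i < n; i++) { out[i] = expf(x[i] - maxval) / sum; }
}

int main(void) {
    float x[4] = {1.0f, 2.0f, 3.0f, 4.0f}, p[4];
    online_softmax(p, x, 4);
    printf("%f %f %f %f (sums to 1)\n", p[0], p[1], p[2], p[3]);
    return 0;
}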
- int num_c_groups = CEIL_DIV(C, x128::size * WARP_SIZE); - assert(B*T*num_c_groups * (sizeof(int4)+sizeof(int)) <= B*T*3*C * sizeof(floatX)); - - // Step 1: Sort inputs into buckets - int total_items = 0; - std::unordered_map> buckets; - for (uint64_t bt = 0; bt < B * T; bt++) { - for (uint64_t c_group = 0; c_group < num_c_groups; c_group++) { - // todo - passing c_group/inputs_cpu[bt] in data to avoid a second hash lookup is a bit hacky - uint64_t data = bt + (c_group<<32ULL) + ((uint64_t)inputs_cpu[bt]<<42ULL); - buckets[c_group + num_c_groups * inputs_cpu[bt]].push_back(data); - total_items++; - } - } - - // Step 2: Sort buckets by size in descending order - // this is so the largest buckets are processed first by the GPU - // otherwise, if they started late, they would still be running with the rest of the GPU idle - std::vector>> sortedBuckets(buckets.begin(), buckets.end()); - std::sort(sortedBuckets.begin(), sortedBuckets.end(), // ugly because we don't have a typedef for the std::pair - [](const std::pair>& a, const std::pair>& b) { - return a.second.size() > b.second.size(); - }); - - int num_buckets = buckets.size(); - int bucket_index = 0; - int workload_index = 0; - for (const auto& bucket : sortedBuckets) { - bucket_info[bucket_index].x = workload_index; // bucket start - bucket_info[bucket_index].y = bucket.second.size(); // bucket size - bucket_info[bucket_index].z = (bucket.second[0] >> 42ULL) & ((1ULL<<20ULL)-1); // bucket ix - bucket_info[bucket_index].w = (bucket.second[0] >> 32ULL) & ((1ULL<<10ULL)-1); // bucket c - - for (uint64_t idx : bucket.second) { - workload_indices[workload_index++] = (int)(idx & ((1ULL<<31ULL)-1ULL)); - } - bucket_index++; - } - - // Step 3: Copy data from host to device (async until the last one to avoid synchronising CPU/GPU twice) - // todo - could use CUDA events (even without streams) to avoid CPU/GPU synchronisation completely - int4* d_bucket_info = (int4*)scratch; - int* d_workload_indices = (int*)(scratch + B*T*num_c_groups * sizeof(int4)); - cudaCheck(cudaMemcpyAsync(d_bucket_info, bucket_info, num_buckets * sizeof(int4), cudaMemcpyHostToDevice)); - cudaCheck(cudaMemcpy(d_workload_indices, workload_indices, total_items * sizeof(int), cudaMemcpyHostToDevice)); - - // Launch wte kernel - // todo - profile block sizes on more content (depends on number of buckets and on GPU?) 
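The 64-bit bucket entries built in Step 1 above pack three fields into one word: the bt position in the low bits (masked to 31 bits when later stored into workload_indices), the channel group in 10 bits starting at bit 32, and the token id in 20 bits starting at bit 42. A small sketch of the packing and unpacking, mirroring the shifts and masks used above (illustrative only):

#include <cstdint>

uint64_t pack_bucket_item(uint64_t bt, uint64_t c_group, uint64_t token) {
    return bt + (c_group << 32ULL) + (token << 42ULL);
}

void unpack_bucket_item(uint64_t data, uint64_t* bt, uint64_t* c_group, uint64_t* token) {
    *bt      = data & ((1ULL << 31ULL) - 1ULL);            // what goes into workload_indices
    *c_group = (data >> 32ULL) & ((1ULL << 10ULL) - 1ULL); // bucket_info.w
    *token   = (data >> 42ULL) & ((1ULL << 20ULL) - 1ULL); // bucket_info.z
}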
- wte_backward_kernel<256><<>>(dwte, d_bucket_info, d_workload_indices, dout, inp, seed, B, T, C); - cudaCheck(cudaGetLastError()); -} - -void layernorm_forward(floatX* out, floatX* mean, floatX* rstd, - floatX* inp, const floatX* weight, const floatX* bias, - int B, int T, int C) { - NVTX_RANGE_FN(); - const int block_size = 512; - const int N = B * T; - const int grid_size = CEIL_DIV(N * WARP_SIZE, block_size); - layernorm_forward_kernel3<<>>(out, mean, rstd, inp, weight, bias, N, C); - cudaCheck(cudaGetLastError()); -} - -// https://docs.nvidia.com/cuda/cublas/#cublasltmatmul -void matmul_forward_cublaslt(floatX* out, - floatX* inp, floatX* weight, floatX* bias, - int B, int T, int C, int OC) { - NVTX_RANGE_FN(); - int has_bias = (bias != NULL); - - // check bias alignment - if(((uintptr_t)bias % 16) != 0) { - printf("Bias pointer is not aligned (cuBLASLt requirement)!\n"); - exit(EXIT_FAILURE); - } - - // these need to be in FP16 if and only if alpha/beta are CUBLAS_COMPUTE_16F - const float alpha = 1.0f, beta = 0.0f; - - int returnedResults = 0; - cublasLtMatmulDesc_t operationDesc; - cublasLtMatmulPreference_t preference; - cublasLtMatrixLayout_t weightLayout; - cublasLtMatrixLayout_t inputLayout; - cublasLtMatrixLayout_t outputLayout; - cublasLtMatrixLayout_t biasLayout; - cublasLtMatmulHeuristicResult_t heuristic; - - // create the operation descriptor - cublasOperation_t opNoTranspose = CUBLAS_OP_N; - cublasOperation_t opTranspose = CUBLAS_OP_T; - cublasLtEpilogue_t epilogueBias = has_bias ? CUBLASLT_EPILOGUE_BIAS : CUBLASLT_EPILOGUE_DEFAULT; - - cublasCheck(cublasLtMatmulDescCreate(&operationDesc, cublas_compute, CUDA_R_32F)); // FP16 if CUBLAS_COMPUTE_16F - cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &opTranspose, sizeof(opTranspose))); - cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opNoTranspose, sizeof(opNoTranspose))); - cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogueBias, sizeof(epilogueBias))); - cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias))); - - // define matrix layouts - cublasCheck(cublasLtMatrixLayoutCreate(&weightLayout, CUBLAS_LOWP, C, OC, C)); - cublasCheck(cublasLtMatrixLayoutCreate(&inputLayout, CUBLAS_LOWP, C, B*T, C)); - cublasCheck(cublasLtMatrixLayoutCreate(&outputLayout, CUBLAS_LOWP, OC, B*T, OC)); - cublasCheck(cublasLtMatrixLayoutCreate(&biasLayout, CUBLAS_LOWP, OC, 1, OC)); - - // create a preference handle with specified max workspace - cublasCheck(cublasLtMatmulPreferenceCreate(&preference)); - cublasCheck(cublasLtMatmulPreferenceSetAttribute(preference, - CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, &cublaslt_workspace_size, sizeof(cublaslt_workspace_size))); - - // find a suitable algorithm - cublasCheck(cublasLtMatmulAlgoGetHeuristic(cublaslt_handle, operationDesc, - weightLayout, inputLayout, outputLayout, outputLayout, - preference, 1, &heuristic, &returnedResults)); - if (returnedResults == 0) { - printf("No cuBLASLt algorithm: B: %d, T: %d, C: %d, OC: %d, bias: %d\n", B, T, C, OC, has_bias); - exit(EXIT_FAILURE); - } - - // call the matmul - cublasCheck(cublasLtMatmul(cublaslt_handle, operationDesc, - &alpha, weight, weightLayout, inp, inputLayout, &beta, - out, outputLayout, out, outputLayout, &heuristic.algo, - cublaslt_workspace, cublaslt_workspace_size, 0)); - - // cleanups - cublasCheck(cublasLtMatmulPreferenceDestroy(preference)); - 
cublasCheck(cublasLtMatmulDescDestroy(operationDesc)); - cublasCheck(cublasLtMatrixLayoutDestroy(weightLayout)); - cublasCheck(cublasLtMatrixLayoutDestroy(inputLayout)); - cublasCheck(cublasLtMatrixLayoutDestroy(outputLayout)); - cublasCheck(cublasLtMatrixLayoutDestroy(biasLayout)); -} - -void attention_forward(floatX* out, floatX* qkvr, floatX* att, - floatX* inp, - int B, int T, int C, int NH) { - NVTX_RANGE_FN(); - // Note: `inp` is not needed for backward pass, so we re-use it as a scratch buffer. - // Its contents will be overwritten by this function. - const int block_size = 256; - const float alpha = 1.0f, beta = 0.0f; - - // inp is (B, T, 3C) QKV - // preatt, att are (B, NH, T, T) - // output is (B, T, C) - int HS = C / NH; // head size - - // permute and separate inp from (B, T, 3, NH, HS) to 3X (B, NH, T, HS) - floatX *q, *k, *v; - q = qkvr + 0 * B * T * C; - k = qkvr + 1 * B * T * C; - v = qkvr + 2 * B * T * C; - int total_threads = B * NH * T * HS; - int num_blocks = CEIL_DIV(total_threads, block_size); - permute_kernel<<>>(q, k, v, inp, B, T, NH, HS); - - - floatX* preatt = inp; - cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, - CUBLAS_OP_T, CUBLAS_OP_N, - T, T, HS, &alpha, - k, CUBLAS_LOWP, HS, T * HS, - q, CUBLAS_LOWP, HS, T * HS, - &beta, preatt, CUBLAS_LOWP, T, T * T, - B * NH, cublas_compute, CUBLAS_GEMM_DEFAULT)); - - // multiply all elements of preatt elementwise by scale - float scale = 1.0 / sqrtf(HS); - int grid_size = CEIL_DIV(B * NH * T * 32, block_size); - softmax_forward_kernel5<<>>(att, scale, preatt, B * NH, T); - - // new approach: first cuBLAS another batched matmul - floatX* vaccum = inp; - // y = att @ v # (B, nh, T, T) @ (B, nh, T, hs) -> (B, nh, T, hs) - cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, - CUBLAS_OP_N, CUBLAS_OP_N, - HS, T, T, &alpha, - v, CUBLAS_LOWP, HS, T * HS, - att, CUBLAS_LOWP, T, T * T, - &beta, vaccum, CUBLAS_LOWP, HS, T * HS, - B * NH, cublas_compute, CUBLAS_GEMM_DEFAULT)); - - // now unpermute - // y = y.transpose(1, 2).contiguous().view(B, T, C) # re-assemble all head outputs side by side - num_blocks = CEIL_DIV(B * T * C, block_size); - unpermute_kernel<<>>(vaccum, out, B, T, NH, HS); - cudaCheck(cudaGetLastError()); -} - -void residual_forward(floatX* out, const floatX* inp1, const floatX* inp2, int N) { - NVTX_RANGE_FN(); - const int block_size = 256; - assert(N % block_size == 0); - const int grid_size = CEIL_DIV(N, block_size * x128::size); - residual_forward_kernel<<>>(out, inp1, inp2); - cudaCheck(cudaGetLastError()); -} - -void fused_residual_forward5(floatX* residual, floatX* normed, floatX* mean, floatX* rstd, - const floatX* inp1, const floatX* inp2, - const floatX* weight, const floatX* bias, - int N, int C) { - const int block_size = 256; - int block_y = block_size / WARP_SIZE; - const int grid_size = CEIL_DIV(N, block_y); - size_t smem = (2 + block_y) * C * sizeof(floatX); - - // in order to use more than 48 KiB of smem, need to call cudaFuncSetAttribute - // this may fail, in which case we fall back to the smem free implementation. 
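What the fused kernel launched by this function computes, written out as a scalar CPU reference: the residual add of the two input streams followed by a layernorm over the channel dimension, with per-row mean and rstd saved for the backward pass. This matches the unfused fallback path (residual_forward + layernorm_forward) used when the shared-memory opt-in fails; the epsilon value here is an assumption (1e-5f is typical):

#include <cmath>

void fused_residual_layernorm_ref(float* residual, float* normed, float* mean, float* rstd,
                                  const float* inp1, const float* inp2,
                                  const float* weight, const float* bias, int N, int C) {
    const float eps = 1e-5f; // assumed epsilon
    for (int n = 0; n < N; n++) {
        // residual_forward: elementwise sum of the two input streams
        for (int c = 0; c < C; c++) { residual[n*C + c] = inp1[n*C + c] + inp2[n*C + c]; }
        // layernorm_forward over the channel dimension, saving mean/rstd for backward
        float m = 0.0f, v = 0.0f;
        for (int c = 0; c < C; c++) { m += residual[n*C + c]; }
        m /= C;
        for (int c = 0; c < C; c++) { float d = residual[n*C + c] - m; v += d * d; }
        v /= C;
        float s = 1.0f / sqrtf(v + eps);
        for (int c = 0; c < C; c++) {
            normed[n*C + c] = (residual[n*C + c] - m) * s * weight[c] + bias[c];
        }
        mean[n] = m; rstd[n] = s;
    }
}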
- cudaCheck(cudaGetLastError()); - auto status = cudaFuncSetAttribute(fused_residual_forward_kernel5, cudaFuncAttributeMaxDynamicSharedMemorySize, smem); - cudaGetLastError(); - if(status == cudaSuccess) { - fused_residual_forward_kernel5<<>>(residual, normed, mean, rstd, inp1, inp2, - weight, bias, N, C); - } else { - residual_forward(residual, inp1, inp2, N*C); - layernorm_forward(normed, mean, rstd, residual, weight, bias, N, 1, C); - } - cudaCheck(cudaGetLastError()); -} - - -void gelu_forward(floatX* out, const floatX* inp, int N) { - NVTX_RANGE_FN(); - const int block_size = 512; - assert(N % block_size == 0); - const int grid_size = CEIL_DIV(N, block_size * x128::size); - gelu_forward_kernel2<<>>(out, inp); - cudaCheck(cudaGetLastError()); -} - -void gelu_backward(floatX* dinp, const floatX* inp, const floatX* dout, const int N) { - NVTX_RANGE_FN(); - const int block_size = 128; - assert(N % block_size == 0); - const int grid_size = CEIL_DIV(N, block_size * x128::size); - gelu_backward_kernel<<>>(dinp, inp, dout); - cudaCheck(cudaGetLastError()); -} - -void matmul_backward(floatX* dinp, floatX* dweight, floatX* dbias, - floatX* dout, floatX* inp, floatX* weight, - float* dbias_buffer, - int B, int T, int C, int OC) { - NVTX_RANGE_FN(); - float one = 1.0f, zero = 0.0f; - - // backward to bias, if given, does a += - if (dbias != NULL) { - // Each warp is responsible for 8 * "x128::size" = 64 OCs at BF16 (OC must be a multiple of 64!) - // Block size is 1024 | 768 threads (32|24 warps) and we reduce those values into 1 at the end - - const int block_size = deviceProp.maxThreadsPerMultiProcessor == 1536 ? 768 : 1024; - - dim3 block_dim = {4, 8, (unsigned)block_size/WARP_SIZE}; - const int OC_per_warp = block_dim.y * x128::size; // 64 at BF16 - const int grid_size_x = CEIL_DIV(OC, OC_per_warp); // e.g. 12 horizontal blocks for 768 OCs at BF16 - const int grid_size_y = max(1, deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount / (block_size * grid_size_x)); // full GPU! - - // If we have enough OC that we don't need cross-block reductions, we can skip the bias_buffer accumulation - // and write results directly to the output. 
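A worked instance of the launch arithmetic described in the comment above, using assumed example values (BF16 so x128::size == 8, OC = 768, and an A100-like GPU with 2048 threads per SM across 108 SMs); illustrative only:

#include <cstdio>

int main() {
    const int x128_size = 8, OC = 768;                     // assumed: BF16, OC = 768
    const int max_threads_per_sm = 2048, num_sms = 108;    // assumed: A100-like GPU
    const int block_size = (max_threads_per_sm == 1536) ? 768 : 1024;
    const int OC_per_warp = 8 * x128_size;                              // block_dim.y == 8 -> 64
    const int grid_size_x = (OC + OC_per_warp - 1) / OC_per_warp;       // CEIL_DIV -> 12
    int grid_size_y = max_threads_per_sm * num_sms / (block_size * grid_size_x); // -> 18
    if (grid_size_y < 1) grid_size_y = 1;
    // grid_size_y > 1 means each y-slice writes partial sums into dbias_buffer,
    // and reduce_add_sum_kernel folds those partials into dbias afterwards
    printf("OC_per_warp=%d grid=(%d,%d)\n", OC_per_warp, grid_size_x, grid_size_y);
    return 0;
}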
- if(grid_size_y == 1) { - matmul_backward_bias_kernel9<<>>(dbias, dout, B, T, OC, std::bool_constant{}); - cudaCheck(cudaGetLastError()); - } else { - // kernel 9 overwrites temp buffer, so no need to memset - matmul_backward_bias_kernel9<<>>(dbias_buffer, dout, B, T, OC, std::bool_constant{}); - cudaCheck(cudaGetLastError()); - reduce_add_sum_kernel<<>>(dbias, dbias_buffer, OC, grid_size_y); - cudaCheck(cudaGetLastError()); - } - } - - // backward to input, uses = in the backward pass (set the gradient) - cublasCheck(cublasGemmEx(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, C, B*T, OC, &one, - weight, CUBLAS_LOWP, C, dout, CUBLAS_LOWP, OC, &zero, - dinp, CUBLAS_LOWP, C, cublas_compute, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - // backward to weight, uses += in the backward pass (accumulate the gradient) by setting alpha=one - cublasCheck(cublasGemmEx(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_T, C, OC, B*T, &one, - inp, CUBLAS_LOWP, C, dout, CUBLAS_LOWP, OC, &one, - dweight, CUBLAS_LOWP, C, cublas_compute, CUBLAS_GEMM_DEFAULT_TENSOR_OP)); - cudaCheck(cudaGetLastError()); -} - -void layernorm_backward(floatX* dinp, floatX* dweight, floatX* dbias, float* scratch, - const floatX* dout, const floatX* inp, const floatX* weight, const floatX* mean, const floatX* rstd, - int B, int T, int C) { - NVTX_RANGE_FN(); - const int block_size = 512; - const int blocks_per_sm = 2; // supported on every architecture and less cache thrashing than 3 - const int grid_size = blocks_per_sm * deviceProp.multiProcessorCount; - size_t rounded_C = CEIL_DIV(C, (32 * x128::size)) * (32 * x128::size); - size_t shared_mem_size = (2 * rounded_C + 2 * (block_size - 32) * f128::size) * sizeof(float); - - cudaCheck(cudaMemset(scratch, 0, 1 * sizeof(float))); // only need to reset the flag to 0 - layernorm_backward_kernel10<<>>(dinp, dweight, dbias, scratch, dout, inp, weight, mean, rstd, B, T, C); - cudaCheck(cudaGetLastError()); -} - -// the sequence of transformations in this compound op is: -// inp (B,T,3C) -> qkvr (B,T,3C) -> preatt (B,NH,T,T) -> att (B,NH,T,T) -> vaccum (B,T,C) -> out (B,T,C) -void attention_backward(floatX* dinp, floatX* dqkvr, floatX* dpreatt, floatX* datt, floatX* scratch, - const floatX* dout, - const floatX* qkvr, const floatX* att, - int B, int T, int C, int NH) { - NVTX_RANGE_FN(); - const int block_size = 256; - int HS = C / NH; // head size - const float alpha = 1.0f, beta = 0.0f; - - // unpack convenience pointers into q, k, v - const floatX *q, *k, *v; - q = qkvr + 0 * B * T * C; - k = qkvr + 1 * B * T * C; - v = qkvr + 2 * B * T * C; - floatX *dq, *dk, *dv; - dq = dqkvr + 0 * B * T * C; - dk = dqkvr + 1 * B * T * C; - dv = dqkvr + 2 * B * T * C; - - // backward through the unpermute operation - int num_blocks = CEIL_DIV(B * T * C, block_size); - unpermute_kernel_backward<<>>(scratch, dout, B, T, NH, HS); - // backward into datt - cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, CUBLAS_OP_T, CUBLAS_OP_N, T, T, HS, &alpha, - v, CUBLAS_LOWP, HS, T * HS, scratch, CUBLAS_LOWP, HS, T * HS, &beta, - datt, CUBLAS_LOWP, T, T * T, B * NH, cublas_compute, CUBLAS_GEMM_DEFAULT)); - // backward into dv - cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_T, HS, T, T, &alpha, - scratch, CUBLAS_LOWP, HS, T * HS, att, CUBLAS_LOWP, T, T * T, &beta, - dv, CUBLAS_LOWP, HS, T * HS, B * NH, cublas_compute, CUBLAS_GEMM_DEFAULT)); - // backward into preatt - int hs = C / NH; // head size - float scale = 1.0f / sqrtf(hs); - softmax_autoregressive_backward_kernel<<>>(dpreatt, datt, att, B, T, C, 
scale); - // backward into q - cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_N, HS, T, T, &alpha, - k, CUBLAS_LOWP, HS, T * HS, dpreatt, CUBLAS_LOWP, T, T * T, &beta, - dq, CUBLAS_LOWP, HS, T * HS, B * NH, cublas_compute, CUBLAS_GEMM_DEFAULT)); - // backward into k - cublasCheck(cublasGemmStridedBatchedEx(cublas_handle, CUBLAS_OP_N, CUBLAS_OP_T, HS, T, T, &alpha, - q, CUBLAS_LOWP, HS, T * HS, dpreatt, CUBLAS_LOWP, T, T * T, &beta, - dk, CUBLAS_LOWP, HS, T * HS, B * NH, cublas_compute, CUBLAS_GEMM_DEFAULT)); - // backward into inp - num_blocks = CEIL_DIV(B * NH * T * HS, block_size); - permute_kernel_backward<<>>(dinp, dq, dk, dv, B, T, NH, HS); - cudaCheck(cudaGetLastError()); -} - -// replaces logits with logit gradients -template -void fused_classifier(Type* logits, Type* losses, - const float dloss, const int* targets, - int B, int T, int V, int P) { - NVTX_RANGE_FN(); - const int block_size = 1024; - const int N = B * T; - const int grid_size = N; - fused_classifier_kernel5<<>>(logits, losses, (floatX*)NULL, dloss, targets, B, T, V, P); - cudaCheck(cudaGetLastError()); -} - -template -void global_norm_squared(float* out, const T* values, size_t count) { - const int block_size = 512; - // launch just enough blocks to fill the grid. deliberately no DIV_CEIL. - // having one block less than possible is a tiny performance hit, having - // one block too many is catastrophic, since it only can start once all the other - // blocks finish. anyway, I think cuda_threads_per_SM should be a multiple of 512 - // on all gpus, so the division really is going to be exact. - const int grid_size = deviceProp.maxThreadsPerMultiProcessor * deviceProp.multiProcessorCount / block_size; - assert(grid_size > 0); // gives a better error than letting the call below fail - // initialize out with zero - cudaCheck(cudaMemset(out, 0, sizeof(float))); - global_norm_squared_kernel<<>>(out, values, count); - cudaCheck(cudaGetLastError()); -} +// global vars containing information about the GPU this process is running on +cudaDeviceProp deviceProp; // fills in common_start() +cudaStream_t main_stream; +// buffer size to use for device <-> disk io +constexpr const size_t IO_BUF_SIZE = 32 * 1024 * 1024; // ---------------------------------------------------------------------------- // GPT-2 model definition @@ -1959,10 +146,8 @@ void fill_in_parameter_sizes(size_t* param_sizes, size_t* param_sizeof, GPT2Conf // allocate memory for the parameters and point the individual tensors to the right places void* malloc_and_point_parameters(ParameterTensors* params, size_t* param_elements, size_t *param_sizeof) { // calculate the total number of parameters and bytes across all tensors - size_t num_parameters = 0; size_t num_parameters_bytes = 0; for (int i = 0; i < NUM_PARAMETER_TENSORS; i++) { - num_parameters += param_elements[i]; num_parameters_bytes += param_elements[i] * param_sizeof[i]; } // malloc all parameters all at once on the device @@ -1982,27 +167,31 @@ void* malloc_and_point_parameters(ParameterTensors* params, size_t* param_elemen return params_memory; } -#define NUM_ACTIVATION_TENSORS 21 +constexpr int NUM_ACTIVATION_TENSORS = 21; typedef struct { floatX* encoded; // (B, T, C) floatX* ln1; // (L, B, T, C) - floatX* ln1_mean; // (L, B, T) - floatX* ln1_rstd; // (L, B, T) + float* ln1_mean; // (L, B, T) + float* ln1_rstd; // (L, B, T) floatX* atty; // (L, B, T, C) - floatX* att; // (L, B, NH, T, T) (smaller with cuDNN) - floatX* attproj; // (L, B, T, C) + // cuDNN saves only some 
statistics information +#if ENABLE_CUDNN + float* att; // (L, B, NH, T) +#else + floatX* att; // (L, B, NH, T, T) +#endif + floatX* residual2; // (L, B, T, C) floatX* ln2; // (L, B, T, C) - floatX* ln2_mean; // (L, B, T) - floatX* ln2_rstd; // (L, B, T) + float* ln2_mean; // (L, B, T) + float* ln2_rstd; // (L, B, T) floatX* fch; // (L, B, T, 4*C) floatX* fch_gelu; // (L, B, T, 4*C) - floatX* fcproj; // (L, B, T, C) floatX* residual3; // (L, B, T, C) - floatX* lnf; // (B, T, C) - floatX* lnf_mean; // (B, T) - floatX* lnf_rstd; // (B, T) - floatX* losses; // (B, T) + floatX* lnf; // (B, T, C); if LN recomputation is enabled (-r 2 and above), will be used for _all_ layernorms + float* lnf_mean; // (B, T) + float* lnf_rstd; // (B, T) + float* losses; // (B, T), will be accumulated in micro-steps // adding these two compared to the CPU .c code, needed for attention kernel as buffers floatX* qkvr; // (L, B, T, 3*C) // in inference mode, this buffer will store the logits @@ -2011,105 +200,88 @@ typedef struct { // general scratchpad buffer. Allocation is made large enough to hold (B, T, 3C), // (B, NH, T, T), and (B, T, V) shaped tensors. floatX* output; + + // some additional scratch buffers + floatX* scratch_bt4c; // (B, T, 4*C) + floatX* scratch_btc; // (B, T, C) } ActivationTensors; -void fill_in_activation_sizes(size_t* act_sizes, size_t B, size_t T, GPT2Config config, int recompute) { - size_t Vp = config.padded_vocab_size; - size_t L = config.num_layers; - size_t NH = config.num_heads; - size_t C = config.channels; - act_sizes[0] = B * T * C; // encoded - act_sizes[1] = L * B * T * C; // ln1 - act_sizes[2] = L * B * T; // ln1_mean - act_sizes[3] = L * B * T; // ln1_rstd - act_sizes[4] = L * B * T * C; // atty - #ifdef ENABLE_CUDNN - // FP32 stats tensor for cuDNN to be passed to backward pass - act_sizes[5] = L * B * NH * T * (sizeof(float) / sizeof(floatX)); - #else - act_sizes[5] = L * B * NH * T * T; // att - #endif - act_sizes[6] = L * B * T * C; // attproj - act_sizes[7] = L * B * T * C; // residual2 - act_sizes[8] = L * B * T * C; // ln2 - act_sizes[9] = L * B * T; // ln2_mean - act_sizes[10] = L * B * T; // ln2_rstd - act_sizes[11] = L * B * T * 4*C; // fch - // if recompute >= 1 then we will recompute gelu_forward during backward and use this as scratch buffer - act_sizes[12] = (recompute == 0) ? L * B * T * 4*C : B * T * 4*C; - act_sizes[13] = L * B * T * C; // fcproj - act_sizes[14] = L * B * T * C; // residual3 - act_sizes[15] = B * T * C; // lnf - act_sizes[16] = B * T; // lnf_mean - act_sizes[17] = B * T; // lnf_rstd - act_sizes[18] = B * T; // losses - act_sizes[19] = L * B * T * 3*C; // qkvr - act_sizes[20] = B * T * max(3*C, max(NH*T, Vp)); // output / scratch -} -// Backward pass is conceptually quite different from forward, because we can discard -// the activations of a layer as soon as we're done with it. This lets us aggressively -// reuse memory, so that we need far fewer tensors for backward state. 
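The new ActivationTensors above shrinks the attention activation on the cuDNN path from a full (L, B, NH, T, T) floatX matrix to a (L, B, NH, T) float statistics tensor. A quick sense of scale, with assumed example dimensions (GPT-2 124M: L = 12, NH = 12; B = 4, T = 1024; BF16 elements of 2 bytes):

#include <cstddef>
#include <cstdio>

int main() {
    const size_t L = 12, B = 4, NH = 12, T = 1024;   // assumed example dimensions
    size_t att_full  = L * B * NH * T * T * 2;       // floatX (BF16) attention matrix
    size_t att_stats = L * B * NH * T * 4;           // float stats tensor kept for cuDNN backward
    printf("non-cuDNN att: %zu MiB, cuDNN stats: %.2f MiB\n",
           att_full >> 20, att_stats / (1024.0 * 1024.0));
    return 0;
}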
-#ifdef ENABLE_CUDNN -#define NUM_BACKWARD_TENSORS 2 -#else -#define NUM_BACKWARD_TENSORS 3 -#endif +struct TensorSpec { + void** ptr; + size_t size; + DType type; +}; -typedef struct { - floatX* bt4c; // (B, T, 4*C) - floatX* residual3; // (B, T, C) - #ifndef ENABLE_CUDNN - floatX* preatt; // (B, NH, T, T) - #endif -} GradActTensors; -void fill_in_grad_act_sizes(size_t* act_sizes, size_t B, size_t T, GPT2Config config) { - size_t C = config.channels; - act_sizes[0] = B * T * 4 * C; // bt4c - act_sizes[1] = B * T * C; // residual3 +#define TENSOR_SPEC(pointer, size) TensorSpec{(void**)(&pointer), (size), dtype_of(pointer)}; - #ifndef ENABLE_CUDNN +void fill_in_activation_sizes(const ActivationTensors* data, TensorSpec (&tensors)[NUM_ACTIVATION_TENSORS], size_t B, size_t T, GPT2Config config, int recompute) { + size_t Vp = config.padded_vocab_size; + size_t L = config.num_layers; size_t NH = config.num_heads; - act_sizes[2] = B * NH * T * T; // preatt + size_t C = config.channels; + tensors[0] = TENSOR_SPEC(data->encoded, B * T * C); + // if recompute >= 1 then we will recompute the layernorm forward activation during backward pass + tensors[1] = TENSOR_SPEC(data->ln1, (recompute < 2) ? L * B * T * C : 0); + tensors[2] = TENSOR_SPEC(data->ln1_mean, L * B * T); + tensors[3] = TENSOR_SPEC(data->ln1_rstd, L * B * T); + tensors[4] = TENSOR_SPEC(data->atty, L * B * T * C); + #ifdef ENABLE_CUDNN + // FP32 stats tensor for cuDNN to be passed to backward pass + tensors[5] = TENSOR_SPEC(data->att, L * B * NH * T); + #else + tensors[5] = TENSOR_SPEC(data->att, L * B * NH * T * T); #endif + tensors[6] = TENSOR_SPEC(data->residual2, L * B * T * C); + // if recompute >= 1 then we will recompute the layernorm forward activation during backward pass + tensors[7] = TENSOR_SPEC(data->ln2, (recompute < 2) ? L * B * T * C : 0); + tensors[8] = TENSOR_SPEC(data->ln2_mean, L * B * T); + tensors[9] = TENSOR_SPEC(data->ln2_rstd, L * B * T); + tensors[10] = TENSOR_SPEC(data->fch, L * B * T * 4*C); + // if recompute >= 1 then we will recompute gelu_forward during backward and use this as scratch buffer + tensors[11] = TENSOR_SPEC(data->fch_gelu, (recompute < 1) ? L * B * T * 4*C : B * T * 4*C); + tensors[12] = TENSOR_SPEC(data->residual3, L * B * T * C); + tensors[13] = TENSOR_SPEC(data->lnf, B * T * C); + tensors[14] = TENSOR_SPEC(data->lnf_mean, B * T); + tensors[15] = TENSOR_SPEC(data->lnf_rstd, B * T); + tensors[16] = TENSOR_SPEC(data->losses, B * T); + tensors[17] = TENSOR_SPEC(data->qkvr, L * B * T * 3*C); + tensors[18] = TENSOR_SPEC(data->output, B * T * max(3*C, max(NH*T, Vp))); + + tensors[19] = TENSOR_SPEC(data->scratch_bt4c, B * T * 4 * C); + tensors[20] = TENSOR_SPEC(data->scratch_btc, B * T * C); } -void* malloc_and_point(floatX** targets[], const size_t* act_sizes, size_t n) { - size_t num_activations = 0; - for (size_t i = 0; i < n; i++) { - num_activations += act_sizes[i]; +void* malloc_and_point_activations(TensorSpec (&tensors)[NUM_ACTIVATION_TENSORS]) { + size_t bytes = 0; + for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++) { + bytes += tensors[i].size * sizeof_dtype(tensors[i].type); } + + printf0("allocating %d MiB for activations\n", (int)round(bytes / (1024 * 1024))); + void* acts_memory; - cudaCheck(cudaMalloc((void**)&acts_memory, num_activations * sizeof(floatX))); + cudaCheck(cudaMalloc((void**)&acts_memory, bytes)); + + // cudaMalloc does not guarantee initial memory values so we memset the allocation here + // this matters because e.g. 
non-cuDNN attention assumes the attention buffer is zeroed + // todo - up to ~100ms on slow GPUs, could theoretically be more selective, but this is safer + cudaCheck(cudaMemset(acts_memory, 0, bytes)); + char* acts_memory_iterator = (char*)acts_memory; - for (size_t i = 0; i < n; i++) { - *(targets[i]) = (floatX*)acts_memory_iterator; - acts_memory_iterator += act_sizes[i] * sizeof(floatX); + for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++) { + // extra protection so we don't accidentally use an empty buffer + if(tensors[i].size == 0) { + *(tensors[i].ptr) = NULL; + }else { + *(tensors[i].ptr) = acts_memory_iterator; + acts_memory_iterator += tensors[i].size * sizeof_dtype(tensors[i].type); + } } return acts_memory; } -void* malloc_and_point_activations(ActivationTensors* acts, const size_t* act_sizes) { - floatX** ptrs[] = { - &acts->encoded, &acts->ln1, &acts->ln1_mean, &acts->ln1_rstd, &acts->atty, - &acts->att, &acts->attproj, &acts->residual2, &acts->ln2, &acts->ln2_mean, - &acts->ln2_rstd, &acts->fch, &acts->fch_gelu, &acts->fcproj, &acts->residual3, &acts->lnf, - &acts->lnf_mean, &acts->lnf_rstd, &acts->losses, &acts->qkvr, &acts->output - }; - return malloc_and_point(ptrs, act_sizes, NUM_ACTIVATION_TENSORS); -} - -void* malloc_and_point_backward(GradActTensors* acts, const size_t* act_sizes) { - floatX** ptrs[] = { - &acts->bt4c, &acts->residual3, - #ifndef ENABLE_CUDNN - &acts->preatt, - #endif - }; - return malloc_and_point(ptrs, act_sizes, NUM_BACKWARD_TENSORS); -} - typedef struct { GPT2Config config; // the weights of the model, and their sizes @@ -2128,30 +300,121 @@ typedef struct { float* master_weights; // is NULL unless fp32 weights is enabled. // the activations of the model, and their sizes ActivationTensors acts; - size_t act_sizes[NUM_ACTIVATION_TENSORS]; + TensorSpec acts_specs[NUM_ACTIVATION_TENSORS]; void* acts_memory; - size_t num_activations; - // gradients of the activations - GradActTensors grads_acts; - size_t num_grad_acts; - void* grads_acts_memory; // other run state configuration int batch_size; // the batch size (B) of current forward pass int seq_len; // the sequence length (T) of current forward pass int* inputs; // the input tokens for the current forward pass int* targets; // the target tokens for the current forward pass - float mean_loss; // after a forward pass with targets, will be populated with the mean loss - float accumulated_mean_loss; // Mean loss after aggregating it on all GPUs - floatX* cpu_losses; // CPU buffer to copy the losses to, allocated with cudaMallocHost - float* cpu_losses_fp32; // same but fp32 + float mean_loss; // after the last backward micro-batch, will be populated with mean loss across all GPUs and micro-steps + float* accumulated_mean_loss; // GPU buffer used to accumulate loss across micro-steps + float* cpu_losses; // CPU buffer to copy the losses to, allocated with cudaMallocHost unsigned long long rng_state; // the RNG state for seeding stochastic rounding etc. - int use_master_weights; - int recompute; + unsigned long long rng_state_last_update; // RNG before last gpt2_update() to re-round identically from master weights + int use_master_weights; // keep master weights copy in float for optim update? 0|1 + bool init_state; // set to true if master weights need to be initialized + int gelu_fusion; // fuse gelu via cuBLASLt (0=none, 1=forward, 2=forward+backward) + int recompute; // recompute gelu | layernorm forward during model backward? 
0|1|2 // todo - if other functions need cpu scratch buffers in the future, reuse as generic scratch? int* workload_indices; // encoder_backward, B*T*num_c_groups (int) int4* bucket_info; // encoder_backward, B*T*num_c_groups (int4) - size for worst case } GPT2; +void gpt2_init_common(GPT2 *model) { + // common inits outside of the model weights + // memory lazily initialized in forward() + model->acts_memory = NULL; + model->inputs = NULL; + model->targets = NULL; + model->accumulated_mean_loss = NULL; + model->cpu_losses = NULL; + // the B,T params are determined and set, fixed on first batch in forward() + model->batch_size = 0; + model->seq_len = 0; + model->mean_loss = -1.0f; // -1.0f designates no loss, set at end of forward() + model->params_memory = NULL; + // memory lazily initialized in backward() + model->grads_memory = NULL; + model->workload_indices = NULL; // on cpu, for encoder_backward + model->bucket_info = NULL; // on cpu, for encoder_backward + // memory lazily initialized in update() + model->m_memory = NULL; + model->v_memory = NULL; + model->master_weights = NULL; + // other default settings + model->rng_state = 13371337 + multi_gpu_config.process_rank; // used in stochastic rounding + model->use_master_weights = 1; // safe default: do keep master weights in fp32 + model->init_state = true; + model->recompute = 1; // good default: recompute gelu but not layernorm + model->gelu_fusion = 0; //deviceProp.major >= 9 ? 2 : 0; // default: off for now (default must match main()) +} + +void gpt2_allocate_weights(GPT2 *model) { + // fill in all the parameter tensor dimensions and types + fill_in_parameter_sizes(model->param_elements, model->param_sizeof, model->config); + model->num_parameters = 0; + model->num_parameters_bytes = 0; + for (int i = 0; i < NUM_PARAMETER_TENSORS; i++) { + model->num_parameters += model->param_elements[i]; + model->num_parameters_bytes += model->param_elements[i] * model->param_sizeof[i]; + } + // create memory for model parameters on the device + assert(model->params_memory == nullptr); + model->params_memory = malloc_and_point_parameters(&model->params, model->param_elements, model->param_sizeof); +} + +void gpt2_allocate_state(GPT2 *model, int B, int T) { + printf0("allocating %d MiB for parameter gradients\n", (int)round(model->num_parameters * sizeof(floatX) / (1024 * 1024))); + assert(model->grads_memory == nullptr); + model->grads_memory = malloc_and_point_parameters(&model->grads, model->param_elements, model->param_sizeof); + + // record the current B,T as well + model->batch_size = B; + model->seq_len = T; + + // allocate the space + fill_in_activation_sizes(&model->acts, model->acts_specs, B, T, model->config, model->recompute); + model->acts_memory = malloc_and_point_activations(model->acts_specs); + // also create memory for caching inputs and targets + cudaCheck(cudaMalloc((void**)&model->inputs, B * T * sizeof(int))); + cudaCheck(cudaMalloc((void**)&model->targets, B * T * sizeof(int))); + cudaCheck(cudaMalloc(((void**)&model->accumulated_mean_loss), sizeof(float))); + cudaCheck(cudaMallocHost((void**)&model->cpu_losses, B * T * sizeof(float))); + + // initialise cpu scratch buffers for encoder backward + size_t num_c_groups = CEIL_DIV(model->config.channels, (WARP_SIZE * x128::size)); + assert((size_t)(model->batch_size * model->seq_len) * num_c_groups < (1ULL<<31ULL)); // todo - maybe an issue for llama3-400B(?) 
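Sizing of the two CPU scratch buffers allocated just below, checked with assumed GPT-2 124M settings (C = 768, WARP_SIZE = 32, x128::size = 8 for BF16, B = 4, T = 1024); the compile-time asserts are only an illustrative back-of-the-envelope check:

#include <cstddef>

constexpr size_t C = 768, WARP = 32, X128 = 8, B = 4, T = 1024;        // assumed example values
constexpr size_t num_c_groups = (C + WARP * X128 - 1) / (WARP * X128); // CEIL_DIV -> 3
static_assert(num_c_groups == 3, "ceil(768 / 256)");
static_assert(B * T * num_c_groups * sizeof(int)     ==  48 * 1024, "workload_indices bytes");
static_assert(B * T * num_c_groups * 4 * sizeof(int) == 192 * 1024, "bucket_info (int4) bytes");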
+ model->workload_indices = (int*)mallocCheck(sizeof(int) * model->batch_size * model->seq_len * num_c_groups); + model->bucket_info = (int4*)mallocCheck(sizeof(int4) * model->batch_size * model->seq_len * num_c_groups); + + size_t shard_num_parameters = multi_gpu_config.shard_num_parameters; // num parameters we are responsible for + printf0("allocating %zu MiB for AdamW optimizer state m\n", (shard_num_parameters * sizeof(float)) >> 20); + printf0("allocating %zu MiB for AdamW optimizer state v\n", (shard_num_parameters * sizeof(float)) >> 20); + assert(model->m_memory == nullptr); + assert(model->v_memory == nullptr); + cudaCheck(cudaMalloc((void**)&model->m_memory, shard_num_parameters * sizeof(float))); + cudaCheck(cudaMalloc((void**)&model->v_memory, shard_num_parameters * sizeof(float))); + + if (model->use_master_weights == 1) { + assert(model->master_weights == nullptr); + printf0("allocating %zu MiB for master copy of params\n", (shard_num_parameters * sizeof(float)) >> 20); + cudaCheck(cudaMalloc((void**) &model->master_weights, shard_num_parameters * sizeof(float))); + } + + size_t free, total; + cudaCheck(cudaMemGetInfo(&free, &total)); + printf0("device memory usage: %zd MiB / %zd MiB\n", (total-free) / 1024 / 1024, total / 1024 / 1024); + + // give an estimate of the maximum batch size + size_t bytes_per_sequence = 0; + for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++) { + bytes_per_sequence += model->acts_specs[i].size * sizeof_dtype(model->acts_specs[i].type) / B; + } + printf0("memory per sequence: %zu MiB\n", bytes_per_sequence / 1024 / 1024); + printf0(" -> estimated maximum batch size: %zu\n", B + free / bytes_per_sequence); +} + void gpt2_write_to_checkpoint(GPT2 *model, const char* checkpoint_path) { // write the model to a checkpoint file printf0("Writing model to %s\n", checkpoint_path); @@ -2168,17 +431,19 @@ void gpt2_write_to_checkpoint(GPT2 *model, const char* checkpoint_path) { model_header[5] = model->config.num_heads; model_header[6] = model->config.channels; model_header[7] = model->config.padded_vocab_size; - fwrite(model_header, sizeof(int), 256, model_file); + fwriteCheck(model_header, sizeof(int), 256, model_file); // write the parameters - void* params_memory_cpu = (void*)mallocCheck(model->num_parameters_bytes); - cudaCheck(cudaMemcpy(params_memory_cpu, model->params_memory, model->num_parameters_bytes, cudaMemcpyDeviceToHost)); - fwrite(params_memory_cpu, 1, model->num_parameters_bytes, model_file); - free(params_memory_cpu); + device_to_file(model_file, model->params_memory, model->num_parameters_bytes, + IO_BUF_SIZE, main_stream); // close file, we're done fcloseCheck(model_file); } -void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) { +void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path, bool weight_init=true) { + // If weight_init is true, we will load the weights from this checkpoint .bin file + // We sometimes want this to be false, if we are going to initialize these weights from + // the master weights that are instead stored in the state .bin file. + // In that case, this function mostly loads the model hyperparameters from the header. 
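gpt2_write_to_checkpoint above streams parameters from the device to the file through a bounded staging buffer of IO_BUF_SIZE bytes, and gpt2_build_from_checkpoint below does the reverse with file_to_device. The real helpers are defined elsewhere in the repo and may overlap transfers; the following is only a simplified, synchronous sketch of the idea, with error checking omitted:

#include <cstddef>
#include <cstdio>
#include <cuda_runtime.h>

// simplified stand-in for the repo's device_to_file helper (assumed behaviour, no error checks)
void device_to_file_sketch(FILE* file, const void* src_device, size_t num_bytes, size_t buf_size) {
    void* staging;
    cudaMallocHost(&staging, buf_size);                    // pinned host staging buffer
    const char* src = (const char*)src_device;
    for (size_t done = 0; done < num_bytes; ) {
        size_t chunk = (num_bytes - done < buf_size) ? num_bytes - done : buf_size;
        cudaMemcpy(staging, src + done, chunk, cudaMemcpyDeviceToHost);
        fwrite(staging, 1, chunk, file);
        done += chunk;
    }
    cudaFreeHost(staging);
}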
if (PRECISION_MODE == PRECISION_FP16) { // TODO for later perhaps, would require us dynamically converting the @@ -2201,16 +466,20 @@ void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) { fprintf(stderr, "---> HINT: try to re-run `python train_gpt2.py`\n"); exit(EXIT_FAILURE); } - if (PRECISION_MODE == PRECISION_BF16 && version != 5) { - fprintf(stderr, "Precision is configured as BF16 but model at %s is not.\n", checkpoint_path); - fprintf(stderr, "---> HINT: are you sure you're loading a _bf16.bin file?\n"); - exit(EXIT_FAILURE); - } - if (PRECISION_MODE == PRECISION_FP32 && version != 3) { - fprintf(stderr, "Precision is configured as FP32 but model at %s is not.\n", checkpoint_path); - fprintf(stderr, "---> HINT: to turn on FP32 you have to compile like: `make train_gpt2cu PRECISION=FP32`\n"); - fprintf(stderr, "---> HINT: are you sure you're loading a .bin file without any _bf16 in the name?\n"); - exit(EXIT_FAILURE); + + // check if the precision mode of the checkpoing matches the model precision + if (weight_init) { + if (PRECISION_MODE == PRECISION_BF16 && version != 5) { + fprintf(stderr, "Precision is configured as BF16 but model at %s is not.\n", checkpoint_path); + fprintf(stderr, "---> HINT: are you sure you're loading a _bf16.bin file?\n"); + exit(EXIT_FAILURE); + } + if (PRECISION_MODE == PRECISION_FP32 && version != 3) { + fprintf(stderr, "Precision is configured as FP32 but model at %s is not.\n", checkpoint_path); + fprintf(stderr, "---> HINT: to turn on FP32 you have to compile like: `make train_gpt2cu PRECISION=FP32`\n"); + fprintf(stderr, "---> HINT: are you sure you're loading a .bin file without any _bf16 in the name?\n"); + exit(EXIT_FAILURE); + } } // read in hyperparameters @@ -2221,76 +490,87 @@ void gpt2_build_from_checkpoint(GPT2 *model, const char* checkpoint_path) { model->config.channels = model_header[6]; model->config.padded_vocab_size = model_header[7]; - // allocate space for all the parameters and read them in - fill_in_parameter_sizes(model->param_elements, model->param_sizeof, model->config); + // allocate memory for the model parameters + gpt2_allocate_weights(model); - model->num_parameters = 0; - model->num_parameters_bytes = 0; - for (int i = 0; i < NUM_PARAMETER_TENSORS; i++) { - model->num_parameters += model->param_elements[i]; - model->num_parameters_bytes += model->param_elements[i] * model->param_sizeof[i]; + // read in the parameters if weight_init is true + if (weight_init) { + assert(model->params_memory != NULL); + file_to_device(model->params_memory, model_file, model->num_parameters_bytes, IO_BUF_SIZE, main_stream); } - - // create memory for model parameters on the device - model->params_memory = malloc_and_point_parameters(&model->params, model->param_elements, model->param_sizeof); - - // read in all the parameters from file and copy them to device - void* params_memory_cpu = (void*)mallocCheck(model->num_parameters_bytes); - freadCheck(params_memory_cpu, 1, model->num_parameters_bytes, model_file); - cudaCheck(cudaMemcpy(model->params_memory, params_memory_cpu, model->num_parameters_bytes, cudaMemcpyHostToDevice)); - free(params_memory_cpu); fcloseCheck(model_file); - // other inits - model->acts_memory = NULL; - model->grads_memory = NULL; - model->m_memory = NULL; - model->v_memory = NULL; - model->master_weights = NULL; - model->grads_acts_memory = NULL; - model->inputs = NULL; - model->targets = NULL; - model->cpu_losses = NULL; - model->cpu_losses_fp32 = NULL; - model->workload_indices = NULL; - 
model->bucket_info = NULL; - model->batch_size = 0; - model->seq_len = 0; - model->mean_loss = -1.0f; // -1.0f will designate no loss - model->rng_state = 13371337; - model->use_master_weights = 1; // keep master weights copy in float for optim update? - model->recompute = 1; // default to recompute gelu during backward + // only return from this function once we are certain the params are ready on the GPU + cudaCheck(cudaDeviceSynchronize()); } -void gpt2_build_from_random(GPT2 *model, int depth) { - // init random (training from scratch) - - // parameterize the size of gpt2 based only on the depth of the model (num_layers) - model->config.num_layers = depth; - // follows GPT-2 sizes +void gpt2_set_hyperparameters(GPT2Config* config, const char* depth_str) { + int depth = atoi(depth_str); + assert(depth > 0); // atoi returns 0 if not a number int channels, num_heads; - if (depth == 6) { channels = 384; num_heads = 6; } // gpt2-tiny (30M) - else if (depth == 12) { channels = 768; num_heads = 12; } // gpt2 (124M) + if (depth == 6) { channels = 384; num_heads = 6; } // (unofficial) gpt2-tiny (30M) + else if (depth == 12) { channels = 768; num_heads = 12; } // gpt2 (124M) else if (depth == 24) { channels = 1024; num_heads = 16; } // gpt2-medium (350M) else if (depth == 36) { channels = 1280; num_heads = 20; } // gpt2-large (774M) else if (depth == 48) { channels = 1600; num_heads = 25; } // gpt2-xl (1558M) - else { fprintf(stderr, "Unsupported depth for now\n"); exit(EXIT_FAILURE); } - model->config.channels = channels; - model->config.num_heads = num_heads; - model->config.max_seq_len = 1024; + else if (depth == 60) { channels = 1920; num_heads = 30; } // (unofficial) 2.7B + else if (depth == 72) { channels = 2880; num_heads = 30; } // (unofficial) 7.3B + else if (depth == 84) { channels = 3456; num_heads = 36; } // (unofficial) 12.2B + else { fprintf(stderr, "Unsupported GPT-2 depth: %d\n", depth); exit(EXIT_FAILURE); } + config->num_layers = depth; + config->channels = channels; + config->num_heads = num_heads; + config->max_seq_len = 1024; +} + +void gpt3_set_hyperparameters(GPT2Config* config, const char* channels_str) { + // we use channels instead of depth for GPT-3 because GPT-3 model depths are not one-to-one + // note that our models are not necessarily identical to GPT-3 because + // we use dense attention, not the alternating dense/banded attention of GPT-3 + int channels = atoi(channels_str); + assert(channels > 0); // atoi returns 0 if not a number + int depth, head_size; + if (channels == 384) { depth = 6; head_size = 64; } // (unofficial) gpt3-tiny (31M) + else if (channels == 768) { depth = 12; head_size = 64; } // gpt3-small (125M) + else if (channels == 1024) { depth = 24; head_size = 64; } // gpt3-medium (350M) + else if (channels == 1536) { depth = 24; head_size = 96; } // gpt3-large (760M) + else if (channels == 2048) { depth = 24; head_size = 128; } // gpt3-xl (1.3B) [heads fixed] + else if (channels == 2560) { depth = 32; head_size = 80; } // gpt3-2.7B + else if (channels == 4096) { depth = 32; head_size = 128; } // gpt3-6.7B + else if (channels == 5140) { depth = 40; head_size = 128; } // gpt3-13B + else if (channels == 12288) { depth = 96; head_size = 128; } // gpt3 (175B) + else { fprintf(stderr, "Unsupported GPT-3 channels: %d\n", channels); exit(EXIT_FAILURE); } + assert(channels % head_size == 0); + config->num_layers = depth; + config->channels = channels; + config->num_heads = channels / head_size; + config->max_seq_len = 2048; // NOTE: GPT-3 uses context 
length of 2048 tokens, up from 1024 in GPT-2 +} + +void gpt_build_from_descriptor(GPT2 *model, const char* descriptor) { + // The model descriptor can be: + // - legacy format "dX", where X is number, e.g. "d12". This creates GPT-2 model with 12 layers. + // - new explicit format "gpt2:dX", same as above, e.g. "gpt2:d48" for GPT-2 with 48 layers. + // - "gpt3:cX", where X is now the channel count, e.g. "gpt3:c768" is the smallest GPT-3 model. + + // check the valid prexies and dispatch to the right setup function + assert(descriptor != NULL); + size_t len = strlen(descriptor); + if (len > 1 && descriptor[0] == 'd') { + gpt2_set_hyperparameters(&model->config, descriptor + 1); // pass along the depth str without the 'd' + } else if (len > 6 && strncmp(descriptor, "gpt2:d", 6) == 0) { + gpt2_set_hyperparameters(&model->config, descriptor + 6); // pass along the depth str without the 'gpt2:d' + } else if (len > 6 && strncmp(descriptor, "gpt3:c", 6) == 0) { + gpt3_set_hyperparameters(&model->config, descriptor + 6); // pass along the channels str without the 'gpt3:c' + } else { + fprintf(stderr, "Unsupported model descriptor: %s\n", descriptor); exit(EXIT_FAILURE); + } + + // both GPT-2 and GPT-3 use the same tokenizer with 50257 tokens model->config.vocab_size = 50257; - model->config.padded_vocab_size = 50304; // padded to 128 + model->config.padded_vocab_size = 50304; // padded to 128 for CUDA kernel efficiency - // fill in all the parameter tensor dimensions and types - fill_in_parameter_sizes(model->param_elements, model->param_sizeof, model->config); - model->num_parameters = 0; - model->num_parameters_bytes = 0; - for (int i = 0; i < NUM_PARAMETER_TENSORS; i++) { - model->num_parameters += model->param_elements[i]; - model->num_parameters_bytes += model->param_elements[i] * model->param_sizeof[i]; - } - // create memory for model parameters on the device - model->params_memory = malloc_and_point_parameters(&model->params, model->param_elements, model->param_sizeof); + gpt2_allocate_weights(model); // allocate and random init the memory for all the parameters with GPT-2 schema // weights ~N(0, 0.02), biases 0, c_proj weights ~N(0, 0.02/(2*L)**0.5) @@ -2317,7 +597,7 @@ void gpt2_build_from_random(GPT2 *model, int depth) { // weights tensors are handled here if ((l == 0 && (i == 0 || i == 1)) // only at l = 0, init the wte and wpe tensors || i == 4 || i == 6 || i == 10 || i == 12) { - int n = model->param_elements[i]; + size_t n = model->param_elements[i]; size_t layer_offset = 0; if (i == 0) { // for wte tensor (padded vocab) override to init V instead of Vp rows @@ -2347,30 +627,13 @@ void gpt2_build_from_random(GPT2 *model, int depth) { // copy them to GPU cudaCheck(cudaMemcpy(model->params_memory, params_memory_cpu, model->num_parameters_bytes, cudaMemcpyHostToDevice)); free(params_memory_cpu); - - // other inits and defaults - model->acts_memory = NULL; - model->grads_memory = NULL; - model->m_memory = NULL; - model->v_memory = NULL; - model->master_weights = NULL; - model->grads_acts_memory = NULL; - model->inputs = NULL; - model->targets = NULL; - model->cpu_losses = NULL; - model->cpu_losses_fp32 = NULL; - model->batch_size = 0; - model->seq_len = 0; - model->mean_loss = -1.0f; // -1.0f designates no loss - model->rng_state = 13371337; - model->use_master_weights = 1; // keep master weights copy in float for optim update? 
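Two bits of arithmetic behind the tables above, checked at compile time with illustrative asserts: padded_vocab_size is 50257 rounded up to the next multiple of 128, and in the GPT-3 table the head count is simply channels / head_size:

#include <cstddef>

constexpr size_t ceil_div(size_t a, size_t b) { return (a + b - 1) / b; }
static_assert(ceil_div(50257, 128) * 128 == 50304, "padded_vocab_size");
static_assert(768   / 64  == 12, "gpt3-small: 12 heads of size 64");
static_assert(2560  / 80  == 32, "gpt3-2.7B: 32 heads of size 80");
static_assert(12288 / 128 == 96, "gpt3-175B: 96 heads of size 128");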
- model->recompute = 1; // default to recompute gelu during backward } -void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, int grad_accum_steps=1) { +// propagate inputs through the network to produce logits. +// right now, this function is fully synchronous with the host +void gpt2_forward(GPT2 *model, const int* inputs, size_t B, size_t T) { NVTX_RANGE_FN(); - // targets are optional and could be NULL - // in this function we must be careful and use size_t instead of int, otherwise + // we must be careful and use size_t instead of int, otherwise // we could overflow int. E.g. l * B * NH * T * T overflows int at B 16. // ensure the model was initialized or error out @@ -2380,62 +643,32 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, in } // convenience parameters - size_t V = model->config.vocab_size; - size_t Vp = model->config.padded_vocab_size; - size_t L = model->config.num_layers; - size_t NH = model->config.num_heads; - size_t C = model->config.channels; - - // validate inputs, all indices must be in the range [0, V) - for(int i = 0; i < B * T; i++) { - assert(0 <= inputs[i] && inputs[i] < V); - if (targets != NULL) { - assert(0 <= targets[i] && targets[i] < V); - } - } - - // allocate space for all the activations if needed (done here, lazily) - if(model->acts_memory == NULL) { - // record the current B,T as well - model->batch_size = B; - model->seq_len = T; - // allocate the space - fill_in_activation_sizes(model->act_sizes, B, T, model->config, model->recompute); - size_t num_activations = 0; - for (size_t i = 0; i < NUM_ACTIVATION_TENSORS; i++) { - num_activations += model->act_sizes[i]; - } - model->num_activations = num_activations; - printf0("allocating %d MiB for activations\n", (int)round(num_activations * sizeof(floatX) / (1024 * 1024))); - model->acts_memory = malloc_and_point_activations(&model->acts, model->act_sizes); - // also create memory for caching inputs and targets - cudaCheck(cudaMalloc((void**)&model->inputs, B * T * sizeof(int))); - cudaCheck(cudaMalloc((void**)&model->targets, B * T * sizeof(int))); - cudaCheck(cudaMallocHost((void**)&model->cpu_losses, B * T * sizeof(floatX))); - cudaCheck(cudaMallocHost((void**)&model->cpu_losses_fp32, B * T * sizeof(float))); - } else { - // validate B,T is consistent with how we've allocated the memory before - // in principle we could get more clever here in the future, for now this is safest - if (B != model->batch_size || T != model->seq_len) { - printf("Model: B=%d T=%d, Desired: B=%d T=%d\n", model->batch_size, model->seq_len, (int)B, (int)T); - exit(EXIT_FAILURE); - } + const size_t V = model->config.vocab_size; + const size_t Vp = model->config.padded_vocab_size; + const size_t L = model->config.num_layers; + const size_t NH = model->config.num_heads; + const size_t C = model->config.channels; + + // validate B,T are not larger than the values used at initialisation + // (smaller B,T are okay for inference only) + if (B > model->batch_size || T > model->seq_len) { + printf("Model: B=%d T=%d, Desired: B=%d T=%d\n", model->batch_size, model->seq_len, (int)B, (int)T); + exit(EXIT_FAILURE); } // copy inputs/targets to the model - // todo - inputs is copied on default stream so this synchronises CPU/GPU for now cudaCheck(cudaMemcpy(model->inputs, inputs, B * T * sizeof(int), cudaMemcpyHostToDevice)); - if (targets != NULL) { - cudaCheck(cudaMemcpy(model->targets, targets, B * T * sizeof(int), cudaMemcpyHostToDevice)); - } + // validate inputs, all indices 
must be in the range [0, V) + // we can do this while the copies are already underway + tokenCheck(inputs, B*T, V); // forward pass ParameterTensors params = model->params; // for brevity ActivationTensors acts = model->acts; - encoder_forward(acts.encoded, model->inputs, params.wte, params.wpe, B, T, C); // encoding goes into residual[0] + encoder_forward(acts.encoded, model->inputs, params.wte, params.wpe, B, T, C, main_stream); // encoding goes into residual[0] // first layernorm isn't fused - layernorm_forward(acts.ln1, acts.ln1_mean, acts.ln1_rstd, acts.encoded, params.ln1w, params.ln1b, B, T, C); + layernorm_forward((model->recompute < 2) ? acts.ln1 : acts.lnf, acts.ln1_mean, acts.ln1_rstd, acts.encoded, params.ln1w, params.ln1b, B, T, C, main_stream); for (int l = 0; l < L; l++) { NvtxRange layer_range("Layer", l); @@ -2455,137 +688,132 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, size_t B, size_t T, in floatX* l_fcprojb = params.fcprojb + l * C; // get the pointers of the activations for this layer - floatX* l_ln1 = acts.ln1 + l * B * T * C; + floatX* l_ln1 = (model->recompute < 2) ? acts.ln1 + l * B * T * C : acts.lnf; floatX* l_qkvr = acts.qkvr + l * B * T * 3*C; floatX* l_atty = acts.atty + l * B * T * C; - floatX* l_attproj = acts.attproj + l * B * T * C; floatX* l_residual2 = acts.residual2 + l * B * T * C; - floatX* l_ln2 = acts.ln2 + l * B * T * C; - floatX* l_ln2_mean = acts.ln2_mean + l * B * T; - floatX* l_ln2_rstd = acts.ln2_rstd + l * B * T; + floatX* l_ln2 = (model->recompute < 2) ? acts.ln2 + l * B * T * C : acts.lnf; + float* l_ln2_mean = acts.ln2_mean + l * B * T; + float* l_ln2_rstd = acts.ln2_rstd + l * B * T; floatX* l_fch = acts.fch + l * B * T * 4*C; // reuse the same activation buffer at each layer, as we'll re-compute the gelu during backward // very useful because we dramatically reduce VRAM usage, and may be able to fit larger batch size - floatX* l_fch_gelu = (model->recompute == 0) ? acts.fch_gelu + l * B * T * 4*C : acts.fch_gelu; - floatX* l_fcproj = acts.fcproj + l * B * T * C; + floatX* l_fch_gelu = (model->recompute < 1) ? acts.fch_gelu + l * B * T * 4*C : acts.fch_gelu; floatX* l_residual3 = acts.residual3 + l * B * T * C; + floatX* scratch = (floatX*)acts.output; // used for non-cudnn attention, fcproj, attproj, etc. 
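How much the recompute-dependent pointer selection above saves, with assumed GPT-2 124M dimensions (L = 12, C = 768, B = 4, T = 1024, BF16 elements of 2 bytes); the asserts are only an illustrative back-of-the-envelope check:

#include <cstddef>

constexpr size_t L = 12, B = 4, T = 1024, C = 768, BF16 = 2;  // assumed example sizes
static_assert(L * B * T * 4*C * BF16   == 288u << 20, "fch_gelu kept per layer (-r 0): 288 MiB");
static_assert(    B * T * 4*C * BF16   ==  24u << 20, "fch_gelu shared, gelu recomputed (-r 1): 24 MiB");
static_assert(2 * L * B * T * C * BF16 == 144u << 20, "ln1+ln2 dropped, aliased to lnf (-r 2): 144 MiB");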
// now do the forward pass #ifdef ENABLE_CUDNN float* l_att = (float*)acts.att + l * B * NH * T; // cuDNN needs a smaller FP32 tensor - matmul_forward_cublaslt(l_qkvr, l_ln1, l_qkvw, l_qkvb, B, T, C, 3*C); - attention_forward_cudnn(l_atty, (float*)l_att, l_qkvr, B, T, NH, C); + matmul_forward_cublaslt(l_qkvr, l_ln1, l_qkvw, l_qkvb, B, T, C, 3*C, main_stream); + attention_forward_cudnn(l_atty, (float*)l_att, l_qkvr, B, T, NH, C, main_stream); #else floatX* l_att = acts.att + l * B * NH * T * T; + if (T != model->seq_len) { // unused parts of attention buffer must be zeroed (T-dependent) + cudaCheck(cudaMemset(l_att, 0, B * NH * T * T * sizeof(floatX))); + } // these are only needed as scratchpads for the forward pass, but // need not be stored for backward - floatX* scratch = (floatX*)acts.output; - matmul_forward_cublaslt(scratch, l_ln1, l_qkvw, l_qkvb, B, T, C, 3*C); - attention_forward(l_atty, l_qkvr, l_att, scratch, B, T, C, NH); + matmul_forward_cublaslt(scratch, l_ln1, l_qkvw, l_qkvb, B, T, C, 3*C, main_stream); + attention_forward(l_atty, l_qkvr, l_att, scratch, B, T, C, NH, main_stream); #endif - matmul_forward_cublaslt(l_attproj, l_atty, l_attprojw, l_attprojb, B, T, C, C); - fused_residual_forward5(l_residual2, l_ln2, l_ln2_mean, l_ln2_rstd, residual, l_attproj, l_ln2w, l_ln2b, B*T, C); - matmul_forward_cublaslt(l_fch, l_ln2, l_fcw, l_fcb, B, T, C, 4*C); - gelu_forward(l_fch_gelu, l_fch, B*T*4*C); - matmul_forward_cublaslt(l_fcproj, l_fch_gelu, l_fcprojw, l_fcprojb, B, T, 4*C, C); - + matmul_forward_cublaslt(scratch, l_atty, l_attprojw, l_attprojb, B, T, C, C, main_stream); + fused_residual_forward5(l_residual2, l_ln2, l_ln2_mean, l_ln2_rstd, residual, scratch, l_ln2w, l_ln2b, B*T, C, main_stream); + matmul_forward_cublaslt(l_fch_gelu, l_ln2, l_fcw, l_fcb, B, T, C, 4*C, main_stream, l_fch, model->gelu_fusion); + matmul_forward_cublaslt(scratch, l_fch_gelu, l_fcprojw, l_fcprojb, B, T, 4*C, C, main_stream); // OK, fusion across blocks. if(l+1 != L) { - floatX* l_ln1 = acts.ln1 + (l + 1) * B * T * C; - floatX* l_ln1_mean = acts.ln1_mean + (l + 1) * B * T; - floatX* l_ln1_rstd = acts.ln1_rstd + (l + 1) * B * T; + floatX* l_ln1 = (model->recompute < 2) ? 
acts.ln1 + (l + 1) * B * T * C : acts.lnf; + float* l_ln1_mean = acts.ln1_mean + (l + 1) * B * T; + float* l_ln1_rstd = acts.ln1_rstd + (l + 1) * B * T; const floatX* l_ln1w = params.ln1w + (l + 1) * C; const floatX* l_ln1b = params.ln1b + (l + 1) * C; - fused_residual_forward5(l_residual3, l_ln1, l_ln1_mean, l_ln1_rstd, l_residual2, l_fcproj, l_ln1w, l_ln1b, - B * T, C); + fused_residual_forward5(l_residual3, l_ln1, l_ln1_mean, l_ln1_rstd, l_residual2, scratch, l_ln1w, l_ln1b, + B * T, C, main_stream); } else { - fused_residual_forward5(l_residual3, acts.lnf, acts.lnf_mean, acts.lnf_rstd, l_residual2, l_fcproj, + fused_residual_forward5(l_residual3, acts.lnf, acts.lnf_mean, acts.lnf_rstd, l_residual2, scratch, params.lnfw, params.lnfb, - B * T, C); + B * T, C, main_stream); } } - matmul_forward_cublaslt(acts.output, acts.lnf, params.wte, NULL, B, T, C, Vp); - - // also forward the cross-entropy loss function if we have the targets - if (targets != NULL) { - NvtxRange classifier_and_loss_range("classifier_and_loss"); - // fused classifier: does the forward pass and first part of the backward pass - const float dloss = 1.0f / (B * T * grad_accum_steps); // results in the uniform average loss over all elements - fused_classifier(acts.output, acts.losses, dloss, model->targets, B, T, V, Vp); - // for convenience also evaluate the mean loss (TODO re-think this compute+sync point) - cudaCheck(cudaMemcpy(model->cpu_losses, acts.losses, B * T * sizeof(floatX), cudaMemcpyDeviceToHost)); - float mean_loss = 0.0f; - for (int i = 0; i < B*T; i++) { - float loss = (float)(model->cpu_losses[i]); - model->cpu_losses_fp32[i] = loss; - mean_loss += loss; - } - mean_loss /= B*T*grad_accum_steps; - model->mean_loss = mean_loss; - } else { - // if we don't have targets, we don't have loss - model->mean_loss = -1.0f; - } + matmul_forward_cublaslt(acts.output, acts.lnf, params.wte, NULL, B, T, C, Vp, main_stream); + cudaCheck(cudaDeviceSynchronize()); } -void gpt2_zero_grad(GPT2 *model) { - NVTX_RANGE_FN(); - if (model->grads_memory != NULL) { - cudaCheck(cudaMemset(model->grads_memory, 0, model->num_parameters * sizeof(floatX))); - } -} -void gpt2_backward(GPT2 *model, int* inputs) { - NVTX_RANGE_FN(); - // double check we forwarded previously, with targets - if (model->mean_loss == -1.0f) { - printf("Error: must forward with targets before backward\n"); +// Forwards both the model and the loss and is used for validation splits and evals. +// In particular it populates cpu_losses with loss at each token. +// Some of the evals (e.g. HellaSwag) require the per-token losses, which are produced here. 
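For reference, what the fused classifier leaves behind at each (b, t) position when gpt2_validate (below) and the backward pass call it: the per-token loss is -log softmax(logits)[target], and when logit gradients are requested the logits are overwritten in place with (prob - one_hot(target)) * dloss. A scalar sketch, illustrative only and without the Vp padding:

#include <cmath>
#include <vector>

float cross_entropy_and_grad_ref(std::vector<float>& logits, int target, float dloss) {
    float maxval = -INFINITY, sumval = 0.0f;
    for (float v : logits) maxval = fmaxf(maxval, v);
    for (float v : logits) sumval += expf(v - maxval);
    float loss = -(logits[target] - maxval - logf(sumval));  // -log p_target
    for (size_t i = 0; i < logits.size(); i++) {
        float prob = expf(logits[i] - maxval) / sumval;
        float indicator = (i == (size_t)target) ? 1.0f : 0.0f;
        logits[i] = (prob - indicator) * dloss;              // becomes the logit gradient
    }
    return loss;
}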
+float gpt2_validate(GPT2 *model, const int* inputs, const int* targets, size_t B, size_t T) { + assert(targets != NULL); + // forward the model itself + gpt2_forward(model, inputs, B, T); + // convenience shortcuts, size_t instead of int so that pointer arithmetics don't overflow + const size_t V = model->config.vocab_size; + const size_t Vp = model->config.padded_vocab_size; + + NvtxRange classifier_and_loss_range("classifier_and_loss"); + ActivationTensors acts = model->acts; + float mean_loss = 0.0f; + // fused classifier: does the forward pass and first part of the backward pass + const float dloss = 1.0f / (B * T); // results in the uniform average loss over all elements + // note: we don't need to generate dlogits here + cudaCheck(cudaMemset(acts.losses, 0, B*T*sizeof(float))); + cudaCheck(cudaMemcpy(model->targets, targets, B * T * sizeof(int), cudaMemcpyHostToDevice)); + tokenCheck(targets, B*T, V); // while the memcpy is underway, validate the targets + fused_classifier(acts.output, acts.losses, dloss, model->targets, B, T, V, Vp, False, main_stream); + cudaCheck(cudaMemcpy(model->cpu_losses, acts.losses, B * T * sizeof(float), cudaMemcpyDeviceToHost)); + for (int i = 0; i < B*T; i++) { + mean_loss += model->cpu_losses[i]; + } + mean_loss /= B*T; + cudaCheck(cudaDeviceSynchronize()); + return mean_loss; +} + +void gpt2_backward_and_reduce(GPT2 *model, int* inputs, const int* targets, int grad_accum_steps, int micro_step) { + if(model->grads_memory == nullptr) { + fprintf(stderr, "Need to allocate gradients before backward"); exit(EXIT_FAILURE); } - - // lazily allocate the memory for gradients of the weights and activations, if needed - if (model->grads_memory == NULL) { - // allocate buffers for weight gradients - printf0("allocating %d MiB for parameter gradients\n", (int)round(model->num_parameters * sizeof(floatX) / (1024 * 1024))); - model->grads_memory = malloc_and_point_parameters(&model->grads, model->param_elements, model->param_sizeof); - // we're going to be clever for the activations backward pass. we don't need to exactly - // mirror the forward pass activations and we will save memory. - size_t bw_act_sizes[NUM_ACTIVATION_TENSORS]; - fill_in_grad_act_sizes(bw_act_sizes, model->batch_size, model->seq_len, model->config); - // count up and allocate the space - model->num_grad_acts = 0; - for (size_t i = 0; i < NUM_BACKWARD_TENSORS; i++) { - model->num_grad_acts += bw_act_sizes[i]; - } - printf0("allocating %d MiB for activation gradients\n", (int)round(model->num_grad_acts * sizeof(floatX) / (1024 * 1024))); - model->grads_acts_memory = malloc_and_point_backward(&model->grads_acts, bw_act_sizes); - // init gradients of parameters and activations to zero - gpt2_zero_grad(model); - // initialise cpu scratch buffers for encoder backward - size_t num_c_groups = CEIL_DIV(model->config.channels, (WARP_SIZE * x128::size)); - assert((size_t)(model->batch_size * model->seq_len) * num_c_groups < (1ULL<<31ULL)); // todo - maybe an issue for llama3-400B(?) 
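Why the backward path below scales dloss by 1 / (B*T*grad_accum_steps): accumulating grad_accum_steps micro-batches of B*T tokens, each weighted by that factor, reproduces a plain mean over the whole batch, so the accumulated losses and gradients need no further normalisation. A tiny numeric check with assumed sizes:

#include <cstdio>

int main() {
    const int B = 4, T = 1024, grad_accum_steps = 8;         // assumed example values
    const float dloss = 1.0f / (B * T * grad_accum_steps);
    double accumulated = 0.0;                                // stands in for the += accumulation
    for (int micro = 0; micro < grad_accum_steps; micro++) {
        for (int i = 0; i < B * T; i++) {
            float per_token_value = 1.0f;                    // pretend every token contributes 1.0
            accumulated += per_token_value * dloss;
        }
    }
    printf("accumulated = %f (the mean of %d values of 1.0)\n", accumulated, B * T * grad_accum_steps);
    return 0;
}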
- model->workload_indices = (int*)mallocCheck(sizeof(int) * model->batch_size * model->seq_len * num_c_groups); - model->bucket_info = (int4*)mallocCheck(sizeof(int4) * model->batch_size * model->seq_len * num_c_groups); + NVTX_RANGE_FN(); + bool last_step = micro_step == grad_accum_steps - 1; + // on the first micro-step zero the gradients, as we're about to += accumulate into them + if (micro_step == 0) { + // there are currently two state vars during the gradient accumulation inner loop: + // 1) the losses accumulate += into acts.losses, reset here + // 2) the gradients accumulate += into grads_memory, reset here + cudaCheck(cudaMemsetAsync(model->acts.losses, 0, model->batch_size * model->seq_len * sizeof(float), main_stream)); + cudaCheck(cudaMemsetAsync(model->grads_memory, 0, model->num_parameters * sizeof(floatX), main_stream)); } // convenience shortcuts, size_t instead of int so that pointer arithmetics don't overflow - size_t B = model->batch_size; - size_t T = model->seq_len; - size_t Vp = model->config.padded_vocab_size; - size_t L = model->config.num_layers; - size_t NH = model->config.num_heads; - size_t C = model->config.channels; + const size_t B = model->batch_size; + const size_t T = model->seq_len; + const size_t V = model->config.vocab_size; + const size_t Vp = model->config.padded_vocab_size; + const size_t L = model->config.num_layers; + const size_t NH = model->config.num_heads; + const size_t C = model->config.channels; - // backward pass: go in the reverse order of the forward pass, and call backward() functions ParameterTensors params = model->params; // for brevity ParameterTensors grads = model->grads; ActivationTensors acts = model->acts; - GradActTensors grads_acts = model->grads_acts; + + // accumulate the losses inside acts.losses, and kick off the backward pass inside the fused classifier + NvtxRange classifier_and_loss_range("classifier_and_loss"); + const float dloss = 1.0f / (float)(B * T * grad_accum_steps); // results in the uniform average loss over all elements + cudaCheck(cudaMemcpy(model->targets, targets, B * T * sizeof(int), cudaMemcpyHostToDevice)); + tokenCheck(targets, B*T, V); + fused_classifier(acts.output, acts.losses, dloss, model->targets, B, T, V, Vp, True, main_stream); + + // backward pass: go in the reverse order of the forward pass, and call backward() functions // reset residual stream gradients (put here to work with gradient accumulation) - cudaCheck(cudaMemset(model->grads_acts.residual3, 0, B * T * C * sizeof(floatX))); + floatX* dresidual = (floatX*)model->acts.scratch_btc; // the main buffer holding the gradient in the backward pass + cudaCheck(cudaMemset(dresidual, 0, B * T * C * sizeof(floatX))); // re-use the output buffer of the forward pass as a scratchpad during backward pass float* scratchF = (float*)acts.output; @@ -2596,11 +824,14 @@ void gpt2_backward(GPT2 *model, int* inputs) { // technically that is a small, inline backward() pass of calculating // total, final loss as the mean over all losses over all (B,T) positions in the batch // next: backward the classifier matmul - matmul_backward(grads_acts.bt4c, grads.wte, NULL, acts.output, acts.lnf, params.wte, NULL, B, T, C, Vp); + matmul_backward(model->acts.scratch_bt4c, grads.wte, NULL, acts.output, acts.lnf, params.wte, NULL, B, T, C, Vp, main_stream); // backward the final layernorm floatX* residual = acts.residual3 + (L-1) * B * T * C; // last residual is in residual3 - floatX* dresidual = (floatX*)grads_acts.residual3; // the main buffer holding the gradient in 
the backward pass - layernorm_backward(dresidual, grads.lnfw, grads.lnfb, scratchF, grads_acts.bt4c, residual, params.lnfw, acts.lnf_mean, acts.lnf_rstd, B, T, C); + layernorm_backward(dresidual, grads.lnfw, grads.lnfb, scratchF, model->acts.scratch_bt4c, residual, params.lnfw, acts.lnf_mean, acts.lnf_rstd, B, T, C, main_stream); + + // from this point on, we no longer need the values stored in the last residual, so we can reuse that memory as generic + // scratch for backward computations + floatX* dl_btc = residual; // now backward all the layers for (int l = L-1; l >= 0; l--) { @@ -2610,9 +841,11 @@ void gpt2_backward(GPT2 *model, int* inputs) { // get the pointers of the weights for this layer floatX* l_ln1w = params.ln1w + l * C; + floatX* l_ln1b = params.ln1b + l * C; floatX* l_qkvw = params.qkvw + l * 3*C * C; floatX* l_attprojw = params.attprojw + l * C * C; floatX* l_ln2w = params.ln2w + l * C; + floatX* l_ln2b = params.ln2b + l * C; floatX* l_fcw = params.fcw + l * 4*C * C; floatX* l_fcprojw = params.fcprojw + l * C * 4*C; // get the pointers of the gradients of the weights for this layer @@ -2629,98 +862,166 @@ void gpt2_backward(GPT2 *model, int* inputs) { floatX* dl_fcprojw = grads.fcprojw + l * C * 4*C; floatX* dl_fcprojb = grads.fcprojb + l * C; // get the pointers of the activations for this layer - floatX* l_ln1 = acts.ln1 + l * B * T * C; - floatX* l_ln1_mean = acts.ln1_mean + l * B * T; - floatX* l_ln1_rstd = acts.ln1_rstd + l * B * T; + floatX* l_ln1 = (model->recompute < 2) ? acts.ln1 + l * B * T * C : acts.lnf; + float* l_ln1_mean = acts.ln1_mean + l * B * T; + float* l_ln1_rstd = acts.ln1_rstd + l * B * T; floatX* l_qkvr = acts.qkvr + l * B * T * 3*C; floatX* l_atty = acts.atty + l * B * T * C; floatX* l_residual2 = acts.residual2 + l * B * T * C; - floatX* l_ln2 = acts.ln2 + l * B * T * C; - floatX* l_ln2_mean = acts.ln2_mean + l * B * T; - floatX* l_ln2_rstd = acts.ln2_rstd + l * B * T; - floatX* l_fch = acts.fch + l * B * T * 4*C; - floatX* l_fch_gelu = (model->recompute == 0) ? acts.fch_gelu + l * B * T * 4*C : acts.fch_gelu; + floatX* l_ln2 = (model->recompute < 2) ? acts.ln2 + l * B * T * C : acts.lnf; + float* l_ln2_mean = acts.ln2_mean + l * B * T; + float* l_ln2_rstd = acts.ln2_rstd + l * B * T; + floatX* l_fch_pre_gelu = acts.fch + l * B * T * 4*C; + floatX* l_fch_gelu = (model->recompute < 1) ? acts.fch_gelu + l * B * T * 4*C : acts.fch_gelu; // get the pointers of the gradients of the activations for this layer // notice that there is no l *, because we just have a single copy, and keep // re-using this memory in every Transformer block as we calculate backward pass - // we need a B x T x C buffer; thankfully, the forward activation for lnf isn't needed anymore, - // so we can co-opt it here. - floatX* dl_btc = (floatX*)acts.lnf; - floatX* dl_bt4c = (floatX*)grads_acts.bt4c; + floatX* dl_bt4c = (floatX*)model->acts.scratch_bt4c; // start the backward pass for this layer if(model->recompute >= 1) { // recompute >= 1 means we recompute gelu. 
in this case, // l_fch_gelu is just a buffer, so re-compute the gelu from l_fch here - gelu_forward(l_fch_gelu, l_fch, B*T*4*C); + gelu_forward(l_fch_gelu, l_fch_pre_gelu, B*T*4*C, main_stream); + } + matmul_backward(dl_bt4c, dl_fcprojw, dl_fcprojb, dresidual, l_fch_gelu, l_fcprojw, scratchF, B, T, 4*C, C, main_stream, l_fch_pre_gelu, model->gelu_fusion); + if(model->recompute >= 2) { + // same as gelu above, l_ln1 and l_ln2 are just buffers if recompute >= 2, recompute them here on demand + layernorm_forward(l_ln2, l_ln2_mean, l_ln2_rstd, l_residual2, l_ln2w, l_ln2b, B, T, C, main_stream); } - matmul_backward(dl_bt4c, dl_fcprojw, dl_fcprojb, dresidual, l_fch_gelu, l_fcprojw, scratchF, B, T, 4*C, C); - gelu_backward(dl_bt4c, l_fch, dl_bt4c, B*T*4*C); - matmul_backward(dl_btc, dl_fcw, dl_fcb, dl_bt4c, l_ln2, l_fcw, scratchF, B, T, C, 4 * C); + matmul_backward(dl_btc, dl_fcw, dl_fcb, dl_bt4c, l_ln2, l_fcw, scratchF, B, T, C, 4 * C, main_stream); // layernorm backward does += to the dresidual, so it correctly accumulates grad from the MLP block above - layernorm_backward(dresidual, dl_ln2w, dl_ln2b, scratchF, dl_btc, l_residual2, l_ln2w, l_ln2_mean, l_ln2_rstd, B, T, C); - matmul_backward(dl_btc, dl_attprojw, dl_attprojb, dresidual, l_atty, l_attprojw, scratchF, B, T, C, C); + layernorm_backward(dresidual, dl_ln2w, dl_ln2b, scratchF, dl_btc, l_residual2, l_ln2w, l_ln2_mean, l_ln2_rstd, B, T, C, main_stream); + matmul_backward(dl_btc, dl_attprojw, dl_attprojb, dresidual, l_atty, l_attprojw, scratchF, B, T, C, C, main_stream); #ifdef ENABLE_CUDNN float* l_att = (float*)acts.att + l * B * NH * T; // cuDNN needs a smaller FP32 tensor - attention_backward_cudnn(dl_bt4c, dl_btc, l_qkvr, l_atty, (float*)l_att, B, T, NH, C); + attention_backward_cudnn(dl_bt4c, dl_btc, l_qkvr, l_atty, (float*)l_att, B, T, NH, C, main_stream); #else floatX* l_att = acts.att + l * B * NH * T * T; // we need B x T x (4)C buffers. l_atty and l_fch aren't needed anymore at this point, so reuse their memory floatX* buffer_a = l_atty; - floatX* buffer_b = l_fch; // this is B x T x 4C, so even larger than what we need - floatX* dl_preatt = (floatX*)grads_acts.preatt; // dedicated scratchpad allocation - attention_backward(dl_bt4c, buffer_b, dl_preatt, scratchX, buffer_a, dl_btc, l_qkvr, l_att, B, T, C, NH); + floatX* buffer_b = l_fch_pre_gelu; // this is B x T x 4C, so even larger than what we need + attention_backward(dl_bt4c, buffer_b, scratchX, buffer_a, dl_btc, l_qkvr, l_att, B, T, C, NH, main_stream); #endif - + if(model->recompute >= 2) { + layernorm_forward(l_ln1, l_ln1_mean, l_ln1_rstd, residual, l_ln1w, l_ln1b, B, T, C, main_stream); + } // QKV parameter gradients - matmul_backward(dl_btc, dl_qkvw, dl_qkvb, dl_bt4c, l_ln1, l_qkvw, scratchF, B, T, C, 3 * C); + matmul_backward(dl_btc, dl_qkvw, dl_qkvb, dl_bt4c, l_ln1, l_qkvw, scratchF, B, T, C, 3 * C, main_stream); // layernorm backward does += to dresidual, so it correctly accumulates gradient for the Attention block above - layernorm_backward(dresidual, dl_ln1w, dl_ln1b, scratchF, dl_btc, residual, l_ln1w, l_ln1_mean, l_ln1_rstd, B, T, C); + layernorm_backward(dresidual, dl_ln1w, dl_ln1b, scratchF, dl_btc, residual, l_ln1w, l_ln1_mean, l_ln1_rstd, B, T, C, main_stream); + + // Accumulate gradients from this layer in a background stream. 
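// Why the reduction below happens per layer, and only on the last micro-step: once a
// transformer block's backward is done, its weight gradients are final, so their NCCL
// reduction can be launched right away and overlap with the backward compute of the
// remaining (earlier) layers, instead of one large all-reduce after the whole backward.
// A minimal sketch of the kind of grouped NCCL call involved (hypothetical helper name;
// the real multi_gpu_async_reduce_gradient may also handle ZeRO-1 reduce-scatter/sharding):
#if MULTI_GPU
static void reduce_layer_grads_sketch(floatX* const* ptrs, const size_t* nelem, int n,
                                      ncclComm_t comm, cudaStream_t stream) {
    ncclCheck(ncclGroupStart());
    for (int i = 0; i < n; i++) {
        // in-place average of one gradient tensor across all ranks
        ncclCheck(ncclAllReduce(ptrs[i], ptrs[i], nelem[i], ncclFloatX, ncclAvg, comm, stream));
    }
    ncclCheck(ncclGroupEnd());
}
#endif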
+ if(last_step) { + floatX* const pointers[] = { + dl_ln1w, dl_ln1b, + dl_qkvw, dl_qkvb, + dl_attprojw, dl_attprojb, + dl_ln2w, dl_ln2b, + dl_fcw, dl_fcb, + dl_fcprojw, dl_fcprojb + }; + const size_t nelem[] = { + C, C, + 3 * C * C, 3 * C, + C * C, C, + C, C, + 4 * C * C, 4 * C, + C * 4 * C, C + }; + multi_gpu_async_reduce_gradient(pointers, nelem, &multi_gpu_config, main_stream); + } } encoder_backward(grads.wte, grads.wpe, scratchX, model->workload_indices, model->bucket_info, - dresidual, model->inputs, inputs, B, T, C, random_u32(&model->rng_state)); + dresidual, model->inputs, inputs, B, T, C, random_u32(&model->rng_state), main_stream); + + // Aggregate all gradients that are not part of the transformer blocks + if(last_step) { + // reduce all the losses within the current GPU (across all microsteps) + global_sum_deterministic(model->accumulated_mean_loss, acts.losses, B*T, main_stream); + // reduce loss across GPUs to a single, final float across all microsteps and GPUs + #if MULTI_GPU + ncclCheck(ncclAllReduce(model->accumulated_mean_loss, model->accumulated_mean_loss, sizeof(float), ncclFloat, ncclAvg, multi_gpu_config.nccl_comm, main_stream)); + #endif + cudaCheck(cudaMemcpyAsync(&model->mean_loss, model->accumulated_mean_loss, sizeof(float), cudaMemcpyDeviceToHost, main_stream)); + // reduce the gradients for non-transformer block parameters + floatX* const pointers[] = {grads.wte, grads.wpe, grads.lnfw, grads.lnfb}; + const size_t nelem[] = {Vp * C, T * C, C, C}; + multi_gpu_async_reduce_gradient(pointers, nelem, &multi_gpu_config, main_stream); + } + + cudaCheck(cudaDeviceSynchronize()); + if(last_step) { + model->mean_loss /= B*T*grad_accum_steps; + } else { + model->mean_loss = -1.f; // no loss available yet + } } -// Compute sum of a single CPU value across all GPU processes. No-op when multi-GPU is disabled. -float multi_gpu_cpu_float_sum(float value) { -#ifdef MULTI_GPU - // note MPI doesn't support all reduce with mean, only sum - float result; - mpiCheck(MPI_Allreduce(&value, &result, 1, MPI_FLOAT, MPI_SUM, MPI_COMM_WORLD)); - return result; -#else - return value; -#endif +// Gets the offset of a specific tensor for a specific layer in the GPT2 model +// layer_id is ignored for weights that are not part of a transformer block +ShardInfo gpt2_get_tensor_at_layer(const GPT2 *model, int layer_id, int param_tensor_id) { + // first offset our way to the parameter tensor start + ptrdiff_t offset = 0; + for (int i = 0; i < param_tensor_id; i++) { + offset += (ptrdiff_t)model->param_elements[i]; + } + size_t size = model->param_elements[param_tensor_id] ; + // if we are in the transformer block, we need to additionally offset by the layer id + if(2 <= param_tensor_id && param_tensor_id <= 13) { + size /= model->config.num_layers; + offset += (ptrdiff_t)(layer_id * size); + } + return {offset, size}; } -// Averages out the loss and gradients across all GPUs. No-op when multi-GPU is disabled. -// todo - this version only works if all the parameters are the same size (floatX) -void gpt2_multi_gpu_accumulate(GPT2* model, MultiGpuConfig* multi_gpu_config) { -#ifdef MULTI_GPU +float gpt2_calculate_grad_norm(GPT2 *model, MultiGpuConfig* multi_gpu_config) { NVTX_RANGE_FN(); - if (multi_gpu_config->num_processes == 1) { return; } - // Average all losses. - model->accumulated_mean_loss = multi_gpu_cpu_float_sum(model->mean_loss) / multi_gpu_config->num_processes; - if(multi_gpu_config->zero_stage == 0) { - // no ZERO == standard DDP: Average all gradients. 
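// Note for gpt2_calculate_grad_norm (whose body continues below): the L2 norm composes
// additively in the square across disjoint shards. If the full gradient is
// g = [g_0 | g_1 | ... | g_{k-1}], then
//     ||g||^2 = ||g_0||^2 + ||g_1||^2 + ... + ||g_{k-1}||^2,
// so under ZeRO-1 each rank can square-reduce only its local shard, the partial sums can be
// combined across GPUs with ncclSum, and a single sqrtf at the end recovers the global norm.
// (Summing or averaging the per-shard norms themselves would be incorrect.)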
- ncclCheck(ncclAllReduce(model->grads_memory, model->grads_memory, - model->num_parameters, - ncclFloatX, ncclAvg, - multi_gpu_config->nccl_comm, 0)); - } else if (multi_gpu_config->zero_stage == 1) { - // ZERO-1: Get average gradient for local shard - floatX* local_grads_memory = (floatX*) model->grads_memory + multi_gpu_config->shard_offset; - ncclCheck(ncclReduceScatter(model->grads_memory, local_grads_memory, - multi_gpu_config->shard_num_parameters, - ncclFloatX, ncclAvg, - multi_gpu_config->nccl_comm, 0)); - } + floatX* grads_memory = (floatX*)model->grads_memory; + + // repurposing this buffer (which isn't needed now) to write grad norm into it + float* grad_norm_squared = (float*)model->acts.output; + float grad_norm_squared_cpu = 0.0f; + + int num_slices[2] = {1, model->config.num_layers}; + int max_num_block_sums = get_max_num_block_sums(num_slices, 2); + if (multi_gpu_config->zero_stage == 1) { + // because of the ncclReduceScatter() in backward, + // grads_memory only contains the averaged gradients at the local shards, + // so we only calculate the grad norm at the grads_memory belonging to the local shards + for (int i = 0; i < NUM_PARAMETER_TENSORS; i++) { + ShardInfo tensor = gpt2_get_tensor_at_layer(model, 0, i); + ShardInfo shard = multi_gpu_get_shard_offset(tensor.size, multi_gpu_config, 1); + ptrdiff_t offset = tensor.offset + shard.offset; + bool is_first_pass = (i == 0); + if((i < 2 || i > 13)) { + global_norm_squared(grad_norm_squared, grads_memory + offset, shard.size, 0, 1, + max_num_block_sums, is_first_pass, main_stream); + } else { + global_norm_squared(grad_norm_squared, grads_memory + offset, shard.size, tensor.size, model->config.num_layers, + max_num_block_sums, is_first_pass, main_stream); + } + } + global_sum_deterministic(grad_norm_squared, grad_norm_squared, max_num_block_sums, main_stream); +#if MULTI_GPU + // further sum the (partial) squared norm across all GPUs + ncclCheck(ncclAllReduce(grad_norm_squared, grad_norm_squared, sizeof(float), ncclFloat, ncclSum, multi_gpu_config->nccl_comm, main_stream)); #endif + } else { + // in regular DDP, backward has averaged the gradients across all GPUs + // so each GPU can compute the squared norm over the whole grad vector, with no added comms needed + global_norm_squared(grad_norm_squared, grads_memory, model->num_parameters, 0, 1, max_num_block_sums, true, main_stream); + global_sum_deterministic(grad_norm_squared, grad_norm_squared, max_num_block_sums, main_stream); + } + cudaCheck(cudaMemcpy(&grad_norm_squared_cpu, grad_norm_squared, sizeof(float), cudaMemcpyDeviceToHost)); + float grad_norm_cpu = sqrtf(grad_norm_squared_cpu); + return grad_norm_cpu; } -float gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, float grad_clip, int t, MultiGpuConfig* multi_gpu_config) { +void gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, float eps, float weight_decay, float grad_scale, int t, + MultiGpuConfig* multi_gpu_config, bool init_from_master_only=false) { // update the model parameters using the AdamW optimizer // keep in mind that optimizer sharding (ZeRO-1) assigns different parameters to different GPUs // so we may not be responsible for the entire parameter tensor @@ -2728,196 +1029,170 @@ float gpt2_update(GPT2 *model, float learning_rate, float beta1, float beta2, fl // selectively weight decay some, but not all tensors :( // TODO: revisit and probably refactor this entire function NVTX_RANGE_FN(); - size_t shard_num_parameters = 
multi_gpu_config->shard_num_parameters; // num parameters we are responsible for - size_t shard_offset = multi_gpu_config->shard_offset; // offset into the full parameter tensor - floatX* params_memory = (floatX*)model->params_memory; - floatX* grads_memory = (floatX*)model->grads_memory; - - // lazily allocate m,v memory and master weights (usually on the first iteration) - if (model->m_memory == NULL) { - printf0("allocating %zu MiB for AdamW optimizer state m\n", (shard_num_parameters * sizeof(float)) >> 20); - printf0("allocating %zu MiB for AdamW optimizer state v\n", (shard_num_parameters * sizeof(float)) >> 20); - cudaCheck(cudaMalloc((void**)&model->m_memory, shard_num_parameters * sizeof(float))); - cudaCheck(cudaMalloc((void**)&model->v_memory, shard_num_parameters * sizeof(float))); - cudaCheck(cudaMemset(model->m_memory, 0, shard_num_parameters * sizeof(float))); - cudaCheck(cudaMemset(model->v_memory, 0, shard_num_parameters * sizeof(float))); - } - if (model->use_master_weights == 1 && model->master_weights == NULL) { - printf0("allocating %zu MiB for master copy of params\n", (shard_num_parameters * sizeof(float)) >> 20); - cudaCheck(cudaMalloc((void**)&model->master_weights, shard_num_parameters * sizeof(float))); - size_t grid_size = CEIL_DIV(shard_num_parameters, 512); - copy_and_cast_kernel<<>>(model->master_weights, params_memory + shard_offset, shard_num_parameters); - cudaCheck(cudaGetLastError()); + if(model->grads_memory == nullptr || model->m_memory == nullptr || model->v_memory == nullptr) { + fprintf(stderr, "Need to allocate optimizer state before update"); + exit(EXIT_FAILURE); } - // gradient clipping - // repurposing this buffer (which isn't needed now) to write grad norm into it - float* grad_norm_squared = (float*)model->acts.output; - global_norm_squared(grad_norm_squared, (floatX*)model->grads_memory, model->num_parameters); - // transfer the gradient norm to CPU - float grad_norm_squared_cpu = 0.0f; - cudaCheck(cudaMemcpy(&grad_norm_squared_cpu, grad_norm_squared, sizeof(float), cudaMemcpyDeviceToHost)); - if(!isfinite(grad_norm_squared_cpu)) { - // may happen due to some issue (e.g. overflow?) - // TODO: later may want to keep a global counter of instabilities like this - printf0("[WARNING]: grad norm is not finite, skipping AdamW update\n"); - return -1.0f; + bool init_state = model->init_state; + if(init_state) { + model->init_state = false; + NvtxRange rng("InitOpt"); + cudaCheck(cudaMemset(model->m_memory, 0, multi_gpu_config->shard_num_parameters * sizeof(float))); + cudaCheck(cudaMemset(model->v_memory, 0, multi_gpu_config->shard_num_parameters * sizeof(float))); } - float grad_norm_cpu = sqrtf(grad_norm_squared_cpu); - float grad_scale = (grad_norm_cpu > grad_clip) ? 
grad_clip / grad_norm_cpu : 1.0f; + + // save RNG state at this point so we can round from master weights identically when restoring from a checkpoint + model->rng_state_last_update = model->rng_state; // AdamW update - int block_size = 512; - float beta1_correction = 1.0f - powf(beta1, t); - float beta2_correction = 1.0f - powf(beta2, t); - unsigned int seed = random_u32(&model->rng_state); - // individually call the adamw_kernel3 on all parameter tensors separately - size_t offset = 0; + // handle adamw for all the transformer blocks for (int i = 0; i < NUM_PARAMETER_TENSORS; i++) { - size_t num_parameters = model->param_elements[i]; - // the scope of this GPU's work is the range: [shard_offset, shard_offset + shard_num_parameters) - // this parameter's values are in the range: [offset, offset + num_parameters) - // so we are responsible for some of its parameters if: - // 1) this parameter ends after we begin (i.e. offset + num_parameters > shard_offset) - // 2) this parameter begins before we end (i.e. offset < shard_offset + shard_num_parameters) - if(offset + num_parameters > shard_offset && offset < shard_offset + shard_num_parameters) { - - // ok this tensor has at least one element inside the range of responsibility of this GPU - // let's figure out the exact span we wish to call the AdamW kernel on - floatX* params_ptr = NULL; - floatX* grad_ptr = NULL; - float* m_ptr = NULL; - float* v_ptr = NULL; - float* master_ptr = NULL; - size_t local_params = 0; - // does the tensor begin before our responsibility? - if(offset <= shard_offset) { - // if so, our start point is exactly that of our responsibility, i.e. shard_offset - params_ptr = params_memory + shard_offset; - grad_ptr = grads_memory + shard_offset; - // note that (master_weights, m, v) are already only the "local slice" for this GPU, - // and are of size shard_num_parameters, instead of the total number of parameters - // so they do not get offset, i.e. we just start at their index 0 - if (model->master_weights != NULL) { master_ptr = model->master_weights; } - m_ptr = model->m_memory; - v_ptr = model->v_memory; - // the number of parameters we have to update is the minimum of two ranges - local_params = min(shard_num_parameters, (offset + num_parameters) - shard_offset); - } else { - // our start point is the location of this tensor, i.e. 
offset - params_ptr = params_memory + offset; - grad_ptr = grads_memory + offset; - // this arithmetic gave me a headache but my little doodle example says it's right - size_t delta = offset - shard_offset; - if (model->master_weights != NULL) { master_ptr = model->master_weights + delta; } - m_ptr = model->m_memory + delta; - v_ptr = model->v_memory + delta; - local_params = min(num_parameters, shard_num_parameters - delta); + // generate a unique seed for each tensor + unsigned int seed = random_u32(&model->rng_state); + + int num_layers = model->config.num_layers; + if((i < 2 || i > 13)) { + num_layers = 1; + } + + ShardInfo tensor = gpt2_get_tensor_at_layer(model, 0, i); + ShardInfo shard = multi_gpu_get_shard_offset(tensor.size, multi_gpu_config, 1); + ptrdiff_t local_offset_full = tensor.offset + shard.offset; + ptrdiff_t local_offset_partial = tensor.offset / multi_gpu_config->num_processes; + + // we only want to weight decay the 2D tensors and leave all 1D tensors alone + // in particular this also decays the embedding weights, but this is ok: + // - the token embeddings are weight shared and participate in the final projection to logits + // - the position embeddings actively participate at every forward/backward pass + float wd = (i == 0 || i == 1 || i == 4 || i == 6 || i == 10 || i == 12) ? weight_decay : 0.0f; + floatX* param_ptr = (floatX*)model->params_memory + local_offset_full; + floatX* grad_ptr = (floatX*)model->grads_memory + local_offset_full; + + ptrdiff_t opt_state_offset = multi_gpu_config->zero_stage < 1 ? local_offset_full : local_offset_partial; + float* m_ptr = model->m_memory + opt_state_offset; + float* v_ptr = model->v_memory + opt_state_offset; + float* master_ptr = nullptr; + if (model->master_weights != nullptr) { master_ptr = model->master_weights + opt_state_offset; } + if(init_state && model->master_weights != nullptr ) { + size_t grid_size = CEIL_DIV(shard.size, 512); + copy_and_cast_kernel<<>>(master_ptr, param_ptr, shard.size, + shard.size, tensor.size); + cudaCheck(cudaGetLastError()); + } + + if (init_from_master_only) { + // when resuming training from a checkpoint with master weights (allows changing precision) + init_from_master(param_ptr, master_ptr, shard.size, tensor.size, shard.size, num_layers, seed, main_stream); + } else { + // ok finally call the kernel to update the weights with AdamW + adamw_update(param_ptr, master_ptr, grad_ptr, + m_ptr, v_ptr, + shard.size, tensor.size, tensor.size, shard.size, num_layers, + learning_rate, + beta1, beta2, t, eps, wd, grad_scale, seed, main_stream); + } + + if (multi_gpu_config->zero_stage == 1) { +#if MULTI_GPU + ncclCheck(ncclGroupStart()); + for(int l = 0; l < num_layers; ++l) { + // gather updated shards of model->params_memory from each process + ncclCheck(ncclAllGather(param_ptr + l * tensor.size, + (floatX*) model->params_memory + tensor.offset + l * tensor.size, + shard.size, ncclFloatX, + multi_gpu_config->nccl_comm, multi_gpu_config->nccl_stream)); } - // we only want to weight decay the 2D tensors and leave all 1D tensors alone - // in particular this also decays the embedding weights, but this is ok: - // - the token embeddings are weight shared and participate in the final projection to logits - // - the position embeddings actively participate at every forward/backward pass - float wd = (i == 0 || i == 1 || i == 4 || i == 6 || i == 10 || i == 12) ? 
weight_decay : 0.0f; - // ok finally call the kernel - size_t num_blocks = CEIL_DIV(num_parameters, block_size); - adamw_kernel3<<>>(params_ptr, master_ptr, grad_ptr, - m_ptr, v_ptr, local_params, learning_rate, - beta1, beta2, beta1_correction, beta2_correction, - eps, wd, grad_scale, seed); + ncclCheck(ncclGroupEnd()); +#endif } - // advance the offset pointer to the next parameter tensor - offset += num_parameters; } - cudaCheck(cudaGetLastError()); - return grad_norm_cpu; -} -void gpt2_multi_gpu_gather(GPT2 *model, MultiGpuConfig* multi_gpu_config) -{ -#ifdef MULTI_GPU - if (multi_gpu_config->num_processes == 1) { return; } // 1 process => noop - if (multi_gpu_config->zero_stage == 1) { - // gather updated shards of model->params_memory from each process - ncclCheck(ncclAllGather((floatX*)model->params_memory + multi_gpu_config->shard_offset, (floatX*)model->params_memory, - multi_gpu_config->shard_num_parameters, ncclFloatX, - multi_gpu_config->nccl_comm, 0)); - } - cudaCheck(cudaGetLastError()); -#endif + cudaCheck(cudaDeviceSynchronize()); } float gpt2_estimate_mfu(GPT2 *model, int num_tokens, float dt) { - // estimate model flops utilization (MFU) in units of A100 bfloat16 peak FLOPS - // see PaLM paper Appendix B as ref: https://arxiv.org/abs/2204.02311 - // TODO this calculation is only valid for an A100: generalize it? - int N = model->num_parameters; + /* + Estimate model flops utilization (MFU) + ref: Section 2.1 of https://arxiv.org/pdf/2001.08361 + Note: Ideally, the N here would be only the parameters that actually + participate in matrix multiplications. In this N, we are over-estimating by + including LayerNorm params, biases, and the position embedding weights, + but these are very small terms. Also keep in mind that we would want to exclude + the token embedding weights, but in GPT-2 these are weight shared, so they + participate in the classifier matmul, so they are correct to be included in N. + Note 2: The first term (6 * N) in flops_per_token is all weight matmuls, the + second is the attention matmul, which is also usually a small contribution. 
+ */ + size_t N = model->num_parameters; int L = model->config.num_layers; - int H = model->config.num_heads; - int Q = model->config.channels / model->config.num_heads; + int C = model->config.channels; int T = model->seq_len; - size_t flops_per_token = (size_t)6 * N + (size_t)12 * L * H * Q * T; + size_t flops_per_token = 6 * N + (size_t)6 * L * C * T; size_t flops_per_step = flops_per_token * num_tokens; // express our flops throughput as ratio of A100 bfloat16 peak flops float flops_achieved = (float)flops_per_step * (1.0f / dt); // per second - float flops_promised = 312e12f; // A100 GPU bfloat16 peak flops is 312 TFLOPS + float flops_promised = get_flops_promised(deviceProp.name, PRECISION_MODE) * 1e12f; + if(flops_promised < 0) { + return -1.f; // don't know + } float mfu = flops_achieved / flops_promised; return mfu; } void gpt2_free(GPT2 *model) { - cudaCheck(cudaFree(model->params_memory)); - cudaCheck(cudaFree(model->grads_memory)); - cudaCheck(cudaFree(model->m_memory)); - cudaCheck(cudaFree(model->v_memory)); - cudaCheck(cudaFree(model->master_weights)); - cudaCheck(cudaFree(model->acts_memory)); - cudaCheck(cudaFree(model->grads_acts_memory)); - cudaCheck(cudaFree(model->inputs)); - cudaCheck(cudaFree(model->targets)); - cudaFreeHost(model->cpu_losses); - cudaFreeHost(model->cpu_losses_fp32); + cudaFreeCheck(&model->params_memory); + cudaFreeCheck(&model->grads_memory); + cudaFreeCheck(&model->m_memory); + cudaFreeCheck(&model->v_memory); + cudaFreeCheck(&model->master_weights); + cudaFreeCheck(&model->acts_memory); + cudaFreeCheck(&model->inputs); + cudaFreeCheck(&model->targets); + cudaFreeCheck(&model->accumulated_mean_loss); + cudaCheck(cudaFreeHost(model->cpu_losses)); free(model->workload_indices); free(model->bucket_info); } // ---------------------------------------------------------------------------- -// common init & free code for train/test/profile +// common init & free code for all of train/test/profile + void common_start(bool override_enable_tf32 = true, bool print_device_info = true) { - cudaGetDeviceProperties(&deviceProp, multi_gpu_config.local_device_idx); + + // get CUDA device infos + cudaCheck(cudaGetDeviceProperties(&deviceProp, multi_gpu_config.local_device_idx)); if (print_device_info) { printf("[System]\n"); printf("Device %d: %s\n", multi_gpu_config.local_device_idx, deviceProp.name); } - // set up cuBLAS and cuBLASLt (and cuDNN if enabled) - cublasCheck(cublasCreate(&cublas_handle)); + // set up the cuda streams. atm everything is on the single main stream + cudaCheck(cudaStreamCreate(&main_stream)); + nvtxNameCudaStreamA(main_stream, "main stream"); + + // set up cuBLAS and cuBLASLt cublasCheck(cublasLtCreate(&cublaslt_handle)); cudaCheck(cudaMalloc(&cublaslt_workspace, cublaslt_workspace_size)); // TF32 precision is equivalent to torch.set_float32_matmul_precision('high') bool enable_tf32 = PRECISION_MODE == PRECISION_FP32 && deviceProp.major >= 8 && override_enable_tf32; - cublasCheck(cublasSetMathMode(cublas_handle, enable_tf32 ? CUBLAS_TF32_TENSOR_OP_MATH : CUBLAS_DEFAULT_MATH)); cublas_compute = enable_tf32 ? 
CUBLAS_COMPUTE_32F_FAST_TF32 : CUBLAS_COMPUTE_32F; + #ifdef ENABLE_CUDNN create_cudnn(); + #endif } void common_free(GPT2 &model) { - gpt2_free(&model); + cudaCheck(cudaStreamDestroy(main_stream)); cudaCheck(cudaFree(cublaslt_workspace)); - cublasCheck(cublasDestroy(cublas_handle)); cublasCheck(cublasLtDestroy(cublaslt_handle)); + #ifdef ENABLE_CUDNN destroy_cudnn(); + #endif } -#ifndef TESTING -// if we are TESTING (see test_gpt2.cu), we'll skip everything below this point - -// ---------------------------------------------------------------------------- -// training resumption logic, very useful when jobs crash once in a while -// the goal is that we can resume optimization from any checkpoint, bit-perfect -// note that "state" refers to things not already saved in the model checkpoint file void save_state(const char* filename, int step, GPT2* model, DataLoader* loader) { printf("Writing state to %s\n", filename); @@ -2929,23 +1204,35 @@ void save_state(const char* filename, int step, GPT2* model, DataLoader* loader) state_header[1] = 1; // version number state_header[2] = multi_gpu_config.num_processes; // number of processes state_header[3] = multi_gpu_config.process_rank; // rank of this process + state_header[4] = model->use_master_weights; // whether we're using fp32 master weights + state_header[5] = loader->should_shuffle; // shuffle state of the dataloader // int main state, start at 10 to leave some padding state_header[10] = step; // step of the optimization - // model state, state, start at 20 to leave some padding + // model rng state, start at 20 to leave some padding *((unsigned long long*)&state_header[20]) = model->rng_state; // random number generator state + *((unsigned long long*)&state_header[22]) = model->rng_state_last_update; // last gpt2_update // dataloader state, start at 30 to leave some padding - state_header[30] = loader->current_shard; // shard of the dataset - *((int64_t*)&state_header[31]) = loader->current_position; // position in shard - fwrite(state_header, sizeof(int), 256, state_file); + *((size_t*)&state_header[30]) = loader->current_shard_idx; // shard of the dataset + *((size_t*)&state_header[32]) = loader->current_sample_idx; // position in shard + fwriteCheck(state_header, sizeof(int), 256, state_file); + // write AdamW m, v, and master_weights here (they are all float) size_t shard_num_parameters = multi_gpu_config.shard_num_parameters; - float* cpu_buffer = (float*)mallocCheck(shard_num_parameters * sizeof(float)); - cudaCheck(cudaMemcpy(cpu_buffer, model->m_memory, shard_num_parameters * sizeof(float), cudaMemcpyDeviceToHost)); - fwrite(cpu_buffer, sizeof(float), shard_num_parameters, state_file); - cudaCheck(cudaMemcpy(cpu_buffer, model->v_memory, shard_num_parameters * sizeof(float), cudaMemcpyDeviceToHost)); - fwrite(cpu_buffer, sizeof(float), shard_num_parameters, state_file); - free(cpu_buffer); - fclose(state_file); + device_to_file(state_file, model->m_memory, shard_num_parameters * sizeof(float), IO_BUF_SIZE, main_stream); + device_to_file(state_file, model->v_memory, shard_num_parameters * sizeof(float), IO_BUF_SIZE, main_stream); + if(model->use_master_weights) { + device_to_file(state_file, model->master_weights, shard_num_parameters * sizeof(float), IO_BUF_SIZE, main_stream); + } + + // write dataloader state if we are using the Permuted version of it + if (loader->should_shuffle) { + fwriteCheck(&loader->glob_result.gl_pathc, sizeof(size_t), 1, state_file); // number of shards + fwriteCheck(loader->shard_indices, sizeof(int), 
loader->glob_result.gl_pathc, state_file); + fwriteCheck(&loader->shard_num_samples, sizeof(size_t), 1, state_file); + fwriteCheck(loader->intra_shard_indices, sizeof(int), loader->shard_num_samples, state_file); + fwriteCheck(&loader->shuffle_rng, sizeof(mt19937_state), 1, state_file); + } + fcloseCheck(state_file); } void load_state(int* step, GPT2* model, DataLoader* loader, const char* filename) { @@ -2956,29 +1243,112 @@ void load_state(int* step, GPT2* model, DataLoader* loader, const char* filename assert(state_header[1] == 1); // version number assert(state_header[2] == multi_gpu_config.num_processes); // number of processes assert(state_header[3] == multi_gpu_config.process_rank); // rank of this process + int use_master_weights = state_header[4]; // whether we're using fp32 master weights + int should_shuffle = state_header[5]; // shuffle state of the dataloader *step = state_header[10]; // step of the optimization model->rng_state = *((unsigned long long*)&state_header[20]); // random number generator state - loader->current_shard = state_header[30]; // shard of the dataset - loader->current_position = *((int64_t*)&state_header[31]); // position in shard - // read AdamW m, v (they are all float) - // also allocate the m, v memory in the model, if it does not yet exist + model->rng_state_last_update = *((unsigned long long*)&state_header[22]); // last gpt2_update + size_t current_shard_idx = *((size_t*)&state_header[30]); // shard index + size_t current_sample_idx = *((size_t*)&state_header[32]); // position in shard + + // read AdamW m, v, master_weights (they are all float) + // allocate all the needed memory as necessary size_t shard_num_parameters = multi_gpu_config.shard_num_parameters; - if (model->m_memory == NULL) { - printf0("allocating %zu MiB for AdamW optimizer state m\n", (shard_num_parameters * sizeof(float)) >> 20); - printf0("allocating %zu MiB for AdamW optimizer state v\n", (shard_num_parameters * sizeof(float)) >> 20); - cudaCheck(cudaMalloc((void**)&model->m_memory, shard_num_parameters * sizeof(float))); - cudaCheck(cudaMalloc((void**)&model->v_memory, shard_num_parameters * sizeof(float))); + if(use_master_weights == 1 && !model->use_master_weights) { + printf0("Warning: Master weights are present in state, but not enabled for current run."); + } else if (use_master_weights == 0 && model->use_master_weights) { + printf0("Error: Master weights requested, but not present in state file."); + exit(EXIT_FAILURE); + } + + model->init_state = false; // we just got the state from file, no need to do first-touch init + assert(model->m_memory != nullptr); + assert(model->v_memory != nullptr); + file_to_device(model->m_memory, state_file, shard_num_parameters * sizeof(float), IO_BUF_SIZE, main_stream); + file_to_device(model->v_memory, state_file, shard_num_parameters * sizeof(float), IO_BUF_SIZE, main_stream); + if(model->use_master_weights) { + assert(model->master_weights != nullptr); + file_to_device(model->master_weights, state_file, shard_num_parameters * sizeof(float), IO_BUF_SIZE, main_stream); + // restore weights from the master weights using the RNG state before last weight update + model->rng_state = model->rng_state_last_update; + gpt2_update(model, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0, &multi_gpu_config, /* init_from_master_only*/ true); + model->rng_state = *((unsigned long long*)&state_header[20]); // use final RNG state from checkpoint after this + } + + // revive the DataLoader object and its state + loader->should_shuffle = should_shuffle; + if 
(should_shuffle == 1) { + // ensure the number of shards matches + size_t glob_result_gl_pathc; + freadCheck(&glob_result_gl_pathc, sizeof(size_t), 1, state_file); + assert(glob_result_gl_pathc == loader->glob_result.gl_pathc); + // read the shard indices + loader->shard_indices = (int*)mallocCheck(loader->glob_result.gl_pathc * sizeof(int)); + freadCheck(loader->shard_indices, sizeof(int), loader->glob_result.gl_pathc, state_file); + // ensure the number of samples matches + size_t shard_num_samples; + freadCheck(&shard_num_samples, sizeof(size_t), 1, state_file); + assert(shard_num_samples == loader->shard_num_samples); + // read the intra-shard indices + loader->intra_shard_indices = (int*)mallocCheck(loader->shard_num_samples * sizeof(int)); + freadCheck(loader->intra_shard_indices, sizeof(int), loader->shard_num_samples, state_file); + // read the shuffle rng state + freadCheck(&loader->shuffle_rng, sizeof(mt19937_state), 1, state_file); + } + dataloader_resume(loader, current_shard_idx, current_sample_idx); + + // all done, close state file + fcloseCheck(state_file); +} + +void write_checkpoint(const char* output_log_dir, int step, GPT2* model, DataLoader* train_loader, MultiGpuConfig* multi_gpu_config) { + // a checkpoint contains: model weights, optimizer/dataloader state, and a DONE file + printf0("Writing checkpoint at step %d\n", step); + int rank = multi_gpu_config->process_rank; + // only rank 0 writes the model file because it is the same across all ranks + if (rank == 0) { + snprintf(filename_buffer, sizeof(filename_buffer), "%s/model_%08d.bin", output_log_dir, step); + gpt2_write_to_checkpoint(model, filename_buffer); + } + // all ranks write their state file + snprintf(filename_buffer, sizeof(filename_buffer), "%s/state_%08d_%05d.bin", output_log_dir, step, rank); + save_state(filename_buffer, step, model, train_loader); + // DONE file is a signal that this checkpoint as a whole is complete + multi_gpu_barrier(multi_gpu_config); + if (rank == 0) { + snprintf(filename_buffer, sizeof(filename_buffer), "%s/DONE_%08d", output_log_dir, step); + FILE* done_file = fopenCheck(filename_buffer, "w"); + fcloseCheck(done_file); + } +} + +void delete_checkpoint(const char* output_log_dir, int step, MultiGpuConfig* multi_gpu_config) { + // mirrors write_checkpoint function, cleans up checkpoint from disk + printf0("Deleting checkpoint at step %d\n", step); + int rank = multi_gpu_config->process_rank; + if (rank == 0) { + snprintf(filename_buffer, sizeof(filename_buffer), "%s/model_%08d.bin", output_log_dir, step); + remove(filename_buffer); + } + snprintf(filename_buffer, sizeof(filename_buffer), "%s/state_%08d_%05d.bin", output_log_dir, step, rank); + remove(filename_buffer); + if (rank == 0) { + snprintf(filename_buffer, sizeof(filename_buffer), "%s/DONE_%08d", output_log_dir, step); + remove(filename_buffer); } - float* cpu_buffer = (float*)mallocCheck(shard_num_parameters * sizeof(float)); - freadCheck(cpu_buffer, sizeof(float), shard_num_parameters, state_file); - cudaCheck(cudaMemcpy(model->m_memory, cpu_buffer, shard_num_parameters * sizeof(float), cudaMemcpyHostToDevice)); - freadCheck(cpu_buffer, sizeof(float), shard_num_parameters, state_file); - cudaCheck(cudaMemcpy(model->v_memory, cpu_buffer, shard_num_parameters * sizeof(float), cudaMemcpyHostToDevice)); } +#ifndef TESTING +// if we are TESTING (see test_gpt2.cu), we'll skip everything below this point + +// ---------------------------------------------------------------------------- +// training resumption logic, very 
useful when jobs crash once in a while +// the goal is that we can resume optimization from any checkpoint, bit-perfect +// note that "state" refers to things not already saved in the model checkpoint file + // ---------------------------------------------------------------------------- // CLI, poor man's argparse -// unclaimed flags lol: k,p +// (all single letters have been claimed now) void error_usage() { fprintf(stderr, "Usage: ./train_gpt2cu [options]\n"); @@ -2986,9 +1356,12 @@ void error_usage() { // file system input / output fprintf(stderr, " -i train data filename pattern (default = dev/data/tinyshakespeare/tiny_shakespeare_train.bin)\n"); fprintf(stderr, " -j val data filename pattern (default = dev/data/tinyshakespeare/tiny_shakespeare_val.bin)\n"); - fprintf(stderr, " -e input from model at this filename (default = gpt2_124M_bf16.bin)\n"); + fprintf(stderr, " -e input .bin filename or descriptor, see code comments as docs. (default = gpt2_124M_bf16.bin)\n"); fprintf(stderr, " -o output log dir (default = NULL, no logging)\n"); + fprintf(stderr, " -lg log gpu info every x steps (default = -1; disabled)\n"); fprintf(stderr, " -n write optimization checkpoints every how many steps? (default 0, don't)\n"); + fprintf(stderr, " -nk max number of checkpoints to keep in the directory, removing old ones (0 = disable, default)\n"); + fprintf(stderr, " -nm every how many step checkpoints are considered major? major checkpoints never get deleted.\n"); fprintf(stderr, " -y resume optimization found inside output log dir? (0=restart/overwrite, 1=resume/append)\n"); // token layout for each step of the optimization fprintf(stderr, " -b (per-GPU, micro) batch size B (default = 4)\n"); @@ -2997,10 +1370,13 @@ void error_usage() { // workload (number of steps) fprintf(stderr, " -x max_steps of optimization to run (-1 (default) = disable, run 1 epoch)\n"); // optimization + fprintf(stderr, " -k learning rate scheduler (default = cosine)\n"); fprintf(stderr, " -l learning rate (default = 3e-4f)\n"); fprintf(stderr, " -u learning rate warmup iterations (default = 0, no warmup)\n"); fprintf(stderr, " -q learning rate decay: final fraction, at end of training (default = 1.0 (no decay))\n"); fprintf(stderr, " -c weight decay (default = 0.0f)\n"); + fprintf(stderr, " -sl outlier stability: skip update if loss goes above this in zscore (0.0f=off)\n"); + fprintf(stderr, " -sg outlier stability: skip update if grad_norm goes above this in zscore (0.0f=off)\n"); // evaluation fprintf(stderr, " -v val_loss_every, how often we evaluate val loss (default = 20)\n"); fprintf(stderr, " -m val_max_steps, up to how many val batches to estimate val loss? (default = 20)\n"); @@ -3012,31 +1388,43 @@ void error_usage() { // numerics fprintf(stderr, " -f enable_tf32 override (default: 1, set to 0 to disable tf32)\n"); fprintf(stderr, " -w keep f32 copy of weights for the optimizer? (default: 1)\n"); + fprintf(stderr, " -ge gelu fusion: 0=none, 1=forward, 2=forward+backward (default: 2 for >=SM90, 0 for older GPUs)\n"); // memory management fprintf(stderr, " -z zero_stage, Zero Optimization Stage, 0,1,2,3 (default = 0)\n"); - fprintf(stderr, " -r recompute: saves memory at cost of speed. (default = 1), 0 = none. 1 = recompute gelu\n"); + fprintf(stderr, " -r recompute: less memory but less speed. 
(default = 1), 0|1|2 = none,gelu,gelu+ln\n"); + // multi-node settings + fprintf(stderr, " -pn num_processes (default = 1)\n"); + fprintf(stderr, " -pr process_rank (default = 0)\n"); + fprintf(stderr, " -pg gpus_per_node (default = 8)\n"); + fprintf(stderr, " -pm nccl_init_method: tcp,fs,mpi (default = mpi)\n"); + fprintf(stderr, " -ps server_ip - used only when nccl_init_method is tcp (default = -1)\n"); + fprintf(stderr, " -pp fs_path - used only when nccl_init_method is fs (default = /tmp)\n"); exit(EXIT_FAILURE); } // ---------------------------------------------------------------------------- // main training loop int main(int argc, char *argv[]) { - multi_gpu_config = multi_gpu_config_init(&argc, &argv); - // read in the (optional) command line arguments const char* train_data_pattern = "dev/data/tinyshakespeare/tiny_shakespeare_train.bin"; const char* val_data_pattern = "dev/data/tinyshakespeare/tiny_shakespeare_val.bin"; const char* load_filename = "gpt2_124M_bf16.bin"; // bf16 weights of the model + const char* lr_scheduler_type = "cosine"; const char* output_log_dir = NULL; - int checkpoint_every = 0; // write optimization checkpoints every how many steps? + int checkpoint_every = 0; // write checkpoints every how many steps? + int checkpoints_keep = 0; // how long checkpoint history do we keep? (in units of checkpoints) + int major_checkpoint_every = 0; // major checkpoints never get deleted when maintaining history int resume = 0; // resume the optimization, if one is found inside output_log_dir? int B = 4; // batch size int T = 1024; // sequence length max int total_batch_size = -1; // will be calculated down below later, if not provided float learning_rate = 3e-4f; + int log_gpu_every = -1; int warmup_iterations = 0; float final_learning_rate_frac = 1.0f; // final fraction of learning rate, at end of training float weight_decay = 0.0f; + float skip_update_lossz = 0.0f; // skip update if loss goes above this in zscore + float skip_update_gradz = 0.0f; // skip update if grad_norm goes above this in zscore int val_loss_every = 20; // every how many steps do we eval validation loss? int val_max_steps = 20; // how many batches max do we eval for validation loss? int sample_every = 20; // every how many steps to do inference? 
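// Minimal sketch of the z-score test behind -sl / -sg above: keep a rolling window of recent
// loss (or grad-norm) values and skip the optimizer update when the newest value sits more
// than `threshold` standard deviations above the window mean. All names below (ZWindow,
// zscore_update, ZWIN) are hypothetical; the OutlierDetector used later in this file may
// be implemented differently.
#include <math.h>
#define ZWIN 128
typedef struct { float buf[ZWIN]; int n; int idx; } ZWindow;

float zscore_update(ZWindow* w, float x) {
    // returns the z-score of x w.r.t. the values currently in the window (0 until 2 samples)
    float z = 0.0f;
    if (w->n >= 2) {
        float mean = 0.0f, var = 0.0f;
        for (int i = 0; i < w->n; i++) { mean += w->buf[i]; }
        mean /= w->n;
        for (int i = 0; i < w->n; i++) { float d = w->buf[i] - mean; var += d * d; }
        float std = sqrtf(var / w->n) + 1e-8f;
        z = (x - mean) / std;
    }
    // insert x into the ring buffer
    w->buf[w->idx] = x;
    w->idx = (w->idx + 1) % ZWIN;
    if (w->n < ZWIN) { w->n++; }
    return z;
}
// usage idea: if (skip_update_lossz != 0.0f && zscore_update(&loss_win, loss) > skip_update_lossz) { /* skip this update */ }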
@@ -3045,31 +1433,41 @@ int main(int argc, char *argv[]) { int max_steps = -1; int override_enable_tf32 = 1; int use_master_weights = 1; + int gelu_fusion = -1; // 0 = none, 1 = forward, 2 = forward+backward (-1 => per-GPU default) int recompute = 1; // recompute during backward setting, 0 = none, 1 = recompute gelu int zero_stage = 0; // Zero Optimization Stage for Multi-GPU training int hellaswag_eval = 0; + // multi-node settings + int num_processes = 1; // this should be set by the slurm environment + int process_rank = 0; // this should be set by the slurm environment + int gpus_per_node = 8; // this should be set by the slurm environment + char nccl_init_method[256] = "mpi"; // "tcp" or "fs" or "mpi" + char server_ip[256] = ""; // used if init_method set to "tcp" -> set to your server ip address + char fs_path[256] = ""; // used if init_method set to "fs" -> set to a shared filesystem path for (int i = 1; i < argc; i+=2) { if (i + 1 >= argc) { error_usage(); } // must have arg after flag if (argv[i][0] != '-') { error_usage(); } // must start with dash - if (strlen(argv[i]) != 2) { error_usage(); } // must be -x (one dash, one letter) + if (!(strlen(argv[i]) == 2 || strlen(argv[i]) == 3)) { error_usage(); } // must be -x[y] (one dash, one or two letters) // read in the args if (argv[i][1] == 'i') { train_data_pattern = argv[i+1]; } else if (argv[i][1] == 'j') { val_data_pattern = argv[i+1]; } else if (argv[i][1] == 'e') { load_filename = argv[i+1]; } else if (argv[i][1] == 'o') { output_log_dir = argv[i+1]; } - else if (argv[i][1] == 'n') { checkpoint_every = atoi(argv[i+1]); } + else if (argv[i][1] == 'n' && argv[i][2] == '\0') { checkpoint_every = atoi(argv[i+1]); } else if (argv[i][1] == 'y') { resume = atoi(argv[i+1]); } else if (argv[i][1] == 'b') { B = atoi(argv[i+1]); } // Per-GPU (micro) batch size else if (argv[i][1] == 't') { T = atoi(argv[i+1]); } else if (argv[i][1] == 'd') { total_batch_size = atoi(argv[i+1]); } - else if (argv[i][1] == 'l') { learning_rate = atof(argv[i+1]); } + else if (argv[i][1] == 'l' && argv[i][2] == '\0') { learning_rate = atof(argv[i+1]); } + else if (argv[i][1] == 'l' && argv[i][2] == 'g') { log_gpu_every = atoi(argv[i+1]); } else if (argv[i][1] == 'u') { warmup_iterations = atoi(argv[i+1]); } else if (argv[i][1] == 'q') { final_learning_rate_frac = atof(argv[i+1]); } else if (argv[i][1] == 'c') { weight_decay = atof(argv[i+1]); } else if (argv[i][1] == 'x') { max_steps = atoi(argv[i+1]); } else if (argv[i][1] == 'v') { val_loss_every = atoi(argv[i+1]); } else if (argv[i][1] == 'm') { val_max_steps = atoi(argv[i+1]); } - else if (argv[i][1] == 's') { sample_every = atoi(argv[i+1]); } + else if (argv[i][1] == 's' && argv[i][2] == '\0') { sample_every = atoi(argv[i+1]); } + else if (argv[i][1] == 'g' && argv[i][2] == 'e') { gelu_fusion = atoi(argv[i+1]); } else if (argv[i][1] == 'g') { genT = atoi(argv[i+1]); } else if (argv[i][1] == 'a') { overfit_single_batch = atoi(argv[i+1]); } else if (argv[i][1] == 'f') { override_enable_tf32 = atoi(argv[i+1]); } @@ -3077,22 +1475,33 @@ int main(int argc, char *argv[]) { else if (argv[i][1] == 'z') { zero_stage = atoi(argv[i+1]); } else if (argv[i][1] == 'r') { recompute = atoi(argv[i+1]); } else if (argv[i][1] == 'h') { hellaswag_eval = atoi(argv[i+1]); } + else if (argv[i][1] == 'k') { lr_scheduler_type = argv[i+1]; } + else if (argv[i][1] == 'p' && argv[i][2] == 'i') { strcpy(nccl_init_method, argv[i+1]); } + else if (argv[i][1] == 'p' && argv[i][2] == 'f') { strcpy(fs_path, argv[i+1]); } + else if 
(argv[i][1] == 'p' && argv[i][2] == 's') { strcpy(server_ip, argv[i+1]); } + else if (argv[i][1] == 'p' && argv[i][2] == 'n') { num_processes = atoi(argv[i+1]); } + else if (argv[i][1] == 'p' && argv[i][2] == 'r') { process_rank = atoi(argv[i+1]); } + else if (argv[i][1] == 'p' && argv[i][2] == 'g') { gpus_per_node = atoi(argv[i+1]); } + else if (argv[i][1] == 's' && argv[i][2] == 'l') { skip_update_lossz = atof(argv[i+1]); } + else if (argv[i][1] == 's' && argv[i][2] == 'g') { skip_update_gradz = atof(argv[i+1]); } + else if (argv[i][1] == 'n' && argv[i][2] == 'k') { checkpoints_keep = atoi(argv[i+1]); } + else if (argv[i][1] == 'n' && argv[i][2] == 'm') { major_checkpoint_every = atoi(argv[i+1]); } else { error_usage(); } } + + multi_gpu_config = multi_gpu_config_init(num_processes, process_rank, gpus_per_node, server_ip, fs_path, nccl_init_method); + common_start(override_enable_tf32, false); // common init code for train/test/profile + // should do a bit more error checking here assert(warmup_iterations >= 0); if (output_log_dir != NULL) { assert(strlen(output_log_dir) < 400); // careful bunch of hardcoded snprintf around this } - // check if output_log_dir has a "." in it, because this behavior changed May 24, 2024. take out later - if (output_log_dir != NULL && strstr(output_log_dir, ".") != NULL) { - fprintf(stderr, "-o (output_log_dir) has a '.', are you specifying a file instead of dir?\n"); - fprintf(stderr, "(note that this option changed recently, -o used to be file, became dir.)\n"); - exit(EXIT_FAILURE); - } int tokens_per_fwdbwd = B * T * multi_gpu_config.num_processes; // one micro-batch processes this many tokens // calculate sensible default for total batch size as assuming no gradient accumulation if (total_batch_size == -1) { total_batch_size = tokens_per_fwdbwd; } + // in the future, we might want to set gelu fusion to 2 for SM90+ and 0 for other GPUs + if (gelu_fusion == -1) { gelu_fusion = 0; } // (deviceProp.major >= 9) ? 2 : 0; } // in gpt2_init_common for test_gpt2cu... // calculate the number of gradient accumulation steps from the desired total batch size assert(total_batch_size % tokens_per_fwdbwd == 0); int grad_accum_steps = total_batch_size / tokens_per_fwdbwd; @@ -3110,10 +1519,13 @@ int main(int argc, char *argv[]) { printf0("| micro batch size B | %-50d |\n", B); printf0("| sequence length T | %-50d |\n", T); printf0("| total batch size | %-50d |\n", total_batch_size); + printf0("| LR scheduler | %-50s |\n", lr_scheduler_type); printf0("| learning rate (LR) | %-50e |\n", learning_rate); printf0("| warmup iterations | %-50d |\n", warmup_iterations); printf0("| final LR fraction | %-50e |\n", final_learning_rate_frac); printf0("| weight decay | %-50e |\n", weight_decay); + printf0("| skip update lossz | %-50f |\n", skip_update_lossz); + printf0("| skip update gradz | %-50f |\n", skip_update_gradz); printf0("| max_steps | %-50d |\n", max_steps); printf0("| val_loss_every | %-50d |\n", val_loss_every); printf0("| val_max_steps | %-50d |\n", val_max_steps); @@ -3121,53 +1533,50 @@ int main(int argc, char *argv[]) { printf0("| genT | %-50d |\n", genT); printf0("| overfit_single_batch | %-50d |\n", overfit_single_batch); printf0("| use_master_weights | %-50s |\n", use_master_weights ? 
"enabled" : "disabled"); + printf0("| gelu_fusion | %-50d |\n", gelu_fusion); printf0("| recompute | %-50d |\n", recompute); printf0("+-----------------------+----------------------------------------------------+\n"); - - common_start(override_enable_tf32, false); // common init code for train/test/profile const char* precision_str = (PRECISION_MODE == PRECISION_FP32) ? (cublas_compute == CUBLAS_COMPUTE_32F_FAST_TF32 ? "TF32" : "FP32") : (PRECISION_MODE == PRECISION_FP16 ? "FP16" : "BF16"); printf0("| device | %-50s |\n", deviceProp.name); + printf0("| peak TFlops | %-50.1f |\n", get_flops_promised(deviceProp.name, PRECISION_MODE)); printf0("| precision | %-50s |\n", precision_str); printf0("+-----------------------+----------------------------------------------------+\n"); // figure out if we are going to be resuming the optimization - char filename_buffer[512]; int resuming = 0; + // find the DONE file with the highest step count int resume_max_step = find_max_step(output_log_dir); - if (resume == 1) { - // find the DONE file with the highest step count + if (resume == 1) { // is -y 1 resume flag set? assert(output_log_dir != NULL); - if (resume_max_step == -1) { - } else { - resuming = 1; - snprintf(filename_buffer, 512, "%s/model_%08d.bin", output_log_dir, resume_max_step); + if (resume_max_step != -1) { + resuming = 1; // -y 1 is set, and we found a checkpoint we can resume from + snprintf(filename_buffer, sizeof(filename_buffer), "%s/model_%08d.bin", output_log_dir, resume_max_step); } } // build the GPT-2 model GPT2 model; - // if load_filename is of the form "dX" where X is an integer (e.g. d12), then we build - // a random model with the depth of the model specified by X (e.g. 12). otherwise interpret - // this variable as a checkpoint filename, and load that checkpoint - assert(strlen(load_filename) >= 2); + gpt2_init_common(&model); if (resuming == 1) { - gpt2_build_from_checkpoint(&model, filename_buffer); - } else if (load_filename[0] == 'd') { - int depth = atoi(load_filename + 1); - if (depth > 1 && depth <= 1000) { // we're not going to train models this big right? heh - gpt2_build_from_random(&model, depth); - } else { - exit(EXIT_FAILURE); - } - } else { + // if `-y 1` was set, then we are resuming from the latest checkpoint + // if we are using master weights, we'll init them later inside load_state() + bool weight_init = !use_master_weights; + gpt2_build_from_checkpoint(&model, filename_buffer, weight_init); + } else if (ends_with_bin(load_filename)) { + // otherwise, if this is a .bin file, we assume it's a model, let's init from it gpt2_build_from_checkpoint(&model, load_filename); + } else { + // if it's not .bin, it could be a "special descriptor". This descriptor is used to + // construct GPT-2 / GPT-3 models in a convenient format. See the function for docs. + gpt_build_from_descriptor(&model, load_filename); } model.use_master_weights = use_master_weights; + model.gelu_fusion = gelu_fusion; model.recompute = recompute; - printf0("| load_filename | %-50s |\n", load_filename); + printf0("| weight init method | %-50s |\n", resuming == 1 ? 
"intermediate checkpoint" : load_filename); printf0("| max_sequence_length T | %-50d |\n", model.config.max_seq_len); printf0("| vocab_size V | %-50d |\n", model.config.vocab_size); printf0("| padded_vocab_size Vp | %-50d |\n", model.config.padded_vocab_size); @@ -3178,9 +1587,10 @@ int main(int argc, char *argv[]) { printf0("+-----------------------+----------------------------------------------------+\n"); // build DataLoaders for both train and val + int permute_train_loader = (overfit_single_batch == 1) ? 0 : 1; DataLoader train_loader, val_loader; - dataloader_init(&train_loader, train_data_pattern, B, T, multi_gpu_config.process_rank, multi_gpu_config.num_processes); - dataloader_init(&val_loader, val_data_pattern, B, T, multi_gpu_config.process_rank, multi_gpu_config.num_processes); + dataloader_init(&train_loader, train_data_pattern, B, T, multi_gpu_config.process_rank, multi_gpu_config.num_processes, permute_train_loader); + dataloader_init(&val_loader, val_data_pattern, B, T, multi_gpu_config.process_rank, multi_gpu_config.num_processes, 0); // figure out the number of training steps we will run for int train_num_batches = max_steps; // passed in from command line if (train_num_batches == -1) { @@ -3204,8 +1614,8 @@ int main(int argc, char *argv[]) { // build an EvalLoader for HellaSwag EvalLoader eval_loader; const char* hellaswag_path = "dev/data/hellaswag/hellaswag_val.bin"; - const char hellaswag_available = access(hellaswag_path, F_OK) == 0; - const char run_hellaswag = hellaswag_eval && hellaswag_available; + const bool hellaswag_available = access(hellaswag_path, F_OK) == 0; + const bool run_hellaswag = hellaswag_eval && hellaswag_available; if (run_hellaswag) { evalloader_init(&eval_loader, hellaswag_path, B, T, multi_gpu_config.process_rank, multi_gpu_config.num_processes); } @@ -3232,7 +1642,7 @@ int main(int argc, char *argv[]) { printf0("=> setting grad_accum_steps=%d\n", grad_accum_steps); // set up logging - create_dir_if_not_exists(output_log_dir); + if (multi_gpu_config.process_rank == 0) { create_dir_if_not_exists(output_log_dir); } Logger logger; logger_init(&logger, output_log_dir, multi_gpu_config.process_rank, resume); @@ -3240,6 +1650,11 @@ int main(int argc, char *argv[]) { Tokenizer tokenizer; tokenizer_init(&tokenizer, "gpt2_tokenizer.bin"); + // set up learning rate scheduler + LearningRateScheduler lr_scheduler; + lr_scheduler_init(&lr_scheduler, lr_scheduler_type, learning_rate, + warmup_iterations, train_num_batches, final_learning_rate_frac); + // some memory for generating samples from the model int* gen_tokens = (int*)mallocCheck(B * T * sizeof(int)); floatX* cpu_logits_raw = (floatX*)mallocCheck(model.config.vocab_size * sizeof(floatX)); @@ -3247,11 +1662,34 @@ int main(int argc, char *argv[]) { // if we found a checkpoint to resume from, load the optimization state int step = 0; + gpt2_allocate_state(&model, B, T); if (resuming == 1) { - snprintf(filename_buffer, 512, "%s/state_%08d_%05d.bin", output_log_dir, resume_max_step, multi_gpu_config.process_rank); + snprintf(filename_buffer, sizeof(filename_buffer), "%s/state_%08d_%05d.bin", output_log_dir, resume_max_step, multi_gpu_config.process_rank); load_state(&step, &model, &train_loader, filename_buffer); } + // init an OutlierDetector the training loss + OutlierDetector loss_outlier_detector, grad_norm_outlier_detector; + init_detector(&loss_outlier_detector); + init_detector(&grad_norm_outlier_detector); + + // do some checks here before we kick off training + // cross-check the desired 
sequence length T with the model's max sequence length + if (T < model.config.max_seq_len) { + printf0("!!!!!!!!\n"); + printf0("WARNING:\n"); + printf0("- The training sequence length is: T=%d (set with -t)\n", T); + printf0("- The model's max sequence length is: max_seq_len=%d\n", model.config.max_seq_len); + printf0("You are attempting to train with a sequence length shorter than the model's max.\n"); + printf0("This will lead to unused parameters in the wpe position embedding weights.\n"); + printf0("If you know what you're doing you can ignore this warning.\n"); + printf0("If you're like ???, you are most likely misconfiguring your training run.\n"); + printf0("---> HINT: If you're training GPT-2 use -t 1024. If GPT-3, use -t 2048.\n"); + printf0("!!!!!!!!\n"); + } + // in any case, this must be true or we'd index beyond the model's wpe (position embedding table) + assert(T <= model.config.max_seq_len); + // train cudaEvent_t start, end; cudaCheck(cudaEventCreate(&start)); @@ -3271,11 +1709,10 @@ int main(int argc, char *argv[]) { dataloader_reset(&val_loader); for (int i = 0; i < val_num_batches; i++) { dataloader_next_batch(&val_loader); - gpt2_forward(&model, val_loader.inputs, val_loader.targets, B, T); - val_loss += model.mean_loss; + val_loss += gpt2_validate(&model, val_loader.inputs, val_loader.targets, B, T); } val_loss /= val_num_batches; - val_loss = multi_gpu_cpu_float_sum(val_loss) / multi_gpu_config.num_processes; + val_loss = multi_gpu_cpu_float_sum(val_loss, &multi_gpu_config) / multi_gpu_config.num_processes; printf0("val loss %f\n", val_loss); logger_log_val(&logger, step, val_loss); } @@ -3289,12 +1726,12 @@ int main(int argc, char *argv[]) { for (int i = 0; i < eval_loader.num_batches; i++) { if (i % 10 == 0) { printf("evaluating HellaSwag: %d/%d\r", i, eval_loader.num_batches); } evalloader_next_batch(&eval_loader); - gpt2_forward(&model, eval_loader.inputs, eval_loader.targets, B, T); - int correct = evalloader_stat_losses(&eval_loader, model.cpu_losses_fp32); + gpt2_validate(&model, eval_loader.inputs, eval_loader.targets, B, T); + int correct = evalloader_stat_losses(&eval_loader, model.cpu_losses); eval_acc_norm += (float)correct; } // careful because not all ranks may have the exact same allocation of number of examples - eval_acc_norm = multi_gpu_cpu_float_sum(eval_acc_norm); + eval_acc_norm = multi_gpu_cpu_float_sum(eval_acc_norm, &multi_gpu_config); printf0("HellaSwag: %d/%d = %f\n", (int)eval_acc_norm, eval_loader.num_examples, eval_acc_norm / eval_loader.num_examples); logger_log_eval(&logger, step, eval_acc_norm / eval_loader.num_examples); } @@ -3313,14 +1750,14 @@ int main(int argc, char *argv[]) { printf("generating:\n---\n"); for (int t = 1; t < genT; t++) { NvtxRange generation_range("Generation step", t); - // note that inference is very wasteful here because for each token - // we re-calculate the forward pass for all of (B,T) positions from scratch - // but the inference here is just for sanity checking anyway - // and we can maybe optimize a bit more later, with careful tests - gpt2_forward(&model, gen_tokens, NULL, B, T); - // furthermore, below we're only using b=0 (i.e. 
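Both the validation loss and the HellaSwag accuracy above are computed per rank and then combined across processes with multi_gpu_cpu_float_sum before dividing (by num_processes and num_examples respectively). The same pattern, expressed with torch.distributed and assuming the process group is already initialized (and, for the NCCL backend, that the tensor lives on the right device), is roughly:

    import torch
    import torch.distributed as dist

    def rank_averaged(value: float) -> float:
        # sum a per-rank scalar across all ranks, then average
        # (mirrors multi_gpu_cpu_float_sum(...) / num_processes for the val loss)
        t = torch.tensor([value], dtype=torch.float32)
        dist.all_reduce(t, op=dist.ReduceOp.SUM)
        return (t / dist.get_world_size()).item()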
the first row) of all B rows - // we're in principle running B "inference streams" in parallel here - // only using position 0 because it's a bit faster (copy less probs from GPU -> CPU) + // we try not to be too wasteful for inference by not calculating all of B,T + // Using a smaller B is always bit-for-bit identical, but T is more tricky + // for non-CUDNN, we need to make sure the attention buffer is memset to 0 + // for cuDNN, it might suddenly decide to use a slightly different algorithm... + // on cuDNN 9.2.1 with cuDNN FrontEnd 1.5.2, T >= 256 seems bit-for-bit identical + // (but even if it wasn't fully identical that's probably not the end of the world) + // note this is still somewhat wasteful because we don't have a KV cache! + gpt2_forward(&model, gen_tokens, 1, CEIL_DIV(t, min(T,256)) * min(T,256)); // get the V-dimensional vector probs[0, t-1, :] floatX* logits = model.acts.output + (t - 1) * model.config.padded_vocab_size; // move probs back to CPU and sample (note we only move the first vocab_size logits, ignoring the padding) @@ -3349,23 +1786,17 @@ int main(int argc, char *argv[]) { // once in a while checkpoint the optimization state (all ranks) if ((checkpoint_every > 0 && output_log_dir != NULL && resuming == 0) && ((step > 0 && step % checkpoint_every == 0) || last_step)) { - assert(strlen(output_log_dir) < 400); // being a bit lazy here - // only rank 0 writes the model file because it is the same across all ranks - if (multi_gpu_config.process_rank == 0) { - snprintf(filename_buffer, 512, "%s/model_%08d.bin", output_log_dir, step); - gpt2_write_to_checkpoint(&model, filename_buffer); - } - // all ranks write their state file - snprintf(filename_buffer, 512, "%s/state_%08d_%05d.bin", output_log_dir, step, multi_gpu_config.process_rank); - save_state(filename_buffer, step, &model, &train_loader); - // DONE file is a signal that this checkpoint as a whole is complete - multi_gpu_barrier(&multi_gpu_config); - if (multi_gpu_config.process_rank == 0) { - snprintf(filename_buffer, 512, "%s/DONE_%08d", output_log_dir, step); - FILE* done_file = fopenCheck(filename_buffer, "w"); - fclose(done_file); + // writes model .bin file, state .bin files, and DONE file for step + write_checkpoint(output_log_dir, step, &model, &train_loader, &multi_gpu_config); + // we only keep checkpoints_keep checkpoints on disk to save space + // so now that we wrote a new checkpoint, delete one old one (unless it is a "major" checkpoint) + // we only do this is checkpoint keeping is turned on (checkpoints_keep > 0) + int step_delete = step - checkpoints_keep * checkpoint_every; + if (checkpoints_keep > 0 && step_delete > 0 && + (major_checkpoint_every == 0 || step_delete % major_checkpoint_every != 0) + ) { + delete_checkpoint(output_log_dir, step_delete, &multi_gpu_config); } - multi_gpu_barrier(&multi_gpu_config); } resuming = 0; @@ -3376,45 +1807,38 @@ int main(int argc, char *argv[]) { if (last_step) { break; } // --------------- TRAINING SECTION BEGIN ----------------- + if (overfit_single_batch == 1) { + // if we are trying to overfit a single batch, we reset the loader here + dataloader_reset(&train_loader); + } // do one training step, doing forward/backward/update on total_batch_size tokens - cudaEventRecord(start); - // gradient accumulation loop over micro-batches - float lossf = 0.0f; // for getting the mean loss over the accumulation steps + cudaCheck(cudaEventRecord(start)); + // gradient and loss accumulation loop over micro-batches for (int micro_step = 0; micro_step < 
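The retention rule above deletes the checkpoint that is checkpoints_keep writes behind the current one, unless that older step happens to be a "major" checkpoint. Restated in Python with one worked example (flag names as in the diff; the numbers are made up):

    def step_to_delete(step, checkpoint_every, checkpoints_keep, major_checkpoint_every):
        step_delete = step - checkpoints_keep * checkpoint_every
        if checkpoints_keep <= 0 or step_delete <= 0:
            return None                                   # keeping everything, or nothing is old enough yet
        if major_checkpoint_every > 0 and step_delete % major_checkpoint_every == 0:
            return None                                   # a "major" checkpoint, never deleted
        return step_delete

    # e.g. checkpoint_every=2000, checkpoints_keep=3, major_checkpoint_every=20000:
    print(step_to_delete(26000, 2000, 3, 20000))          # None  (26000 - 6000 = 20000 is a major checkpoint)
    print(step_to_delete(28000, 2000, 3, 20000))          # 22000 (this one gets deleted)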
grad_accum_steps; micro_step++) { // fetch the next data batch - // and if we're overfitting a single batch, we'll only call this a single time - if (overfit_single_batch == 0 || - (overfit_single_batch == 1 && step == 0 && micro_step == 0)) { - dataloader_next_batch(&train_loader); - } + dataloader_next_batch(&train_loader); // forward pass. note that we pass in grad_accum_steps, which scales down the loss - gpt2_forward(&model, train_loader.inputs, train_loader.targets, B, T, grad_accum_steps); - lossf += model.mean_loss; // the mean_loss was normalized by grad_accum_steps inside gpt2_forward + gpt2_forward(&model, train_loader.inputs, B, T); // backward pass. all model params accumulate gradients with += inside this inner loop - gpt2_backward(&model, train_loader.inputs); + gpt2_backward_and_reduce(&model, train_loader.inputs, train_loader.targets, grad_accum_steps, micro_step); } - // override the mean loss, accounting for the gradient accumulation loop - // this is esp important to do here in multigpu update below, where model.mean_loss gets allreduced - model.mean_loss = lossf; - // update the parameters - gpt2_multi_gpu_accumulate(&model, &multi_gpu_config); - // learning rate schedule: warmup linearly to max LR, then cosine decay to LR * final_learning_rate_frac - float step_learning_rate = learning_rate; - if (step < warmup_iterations) { - step_learning_rate = learning_rate * ((float)(step + 1)) / warmup_iterations; + float zloss = (float)(update_detector(&loss_outlier_detector, (double)model.mean_loss)); // loss z-score + // fetch the next learning rate + float step_learning_rate = get_learning_rate(&lr_scheduler, step); + // calculate the gradient norm and how much we wish to scale the gradient + float grad_norm = gpt2_calculate_grad_norm(&model, &multi_gpu_config); + float zgrad = (float)(update_detector(&grad_norm_outlier_detector, (double)grad_norm)); // grad z-score + // update the model parameters + if (isfinite(zloss) && skip_update_lossz != 0.0f && zloss > skip_update_lossz) { + printf0("skipping update due to loss z-score of %f\n", zloss); + } else if (isfinite(zgrad) && skip_update_gradz != 0.0f && zgrad > skip_update_gradz) { + printf0("skipping update due to grad z-score of %f\n", zgrad); } else { - float decay_ratio = ((float)(step - warmup_iterations)) / (train_num_batches - warmup_iterations); - assert(0.0f <= decay_ratio && decay_ratio <= 1.0f); - float coeff = 0.5f * (1.0f + cosf(M_PI * decay_ratio)); // coeff starts at 1 and goes to 0 - assert(0.0f <= coeff && coeff <= 1.0f); - float min_lr = learning_rate * final_learning_rate_frac; - step_learning_rate = min_lr + coeff * (learning_rate - min_lr); + // clip the gradient norm to a maximum value + float grad_clip = 1.0f; + float grad_scale = (grad_norm > grad_clip) ? 
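The learning-rate logic that was previously inlined here (linear warmup to the max LR, then cosine decay down to learning_rate * final_learning_rate_frac) now sits behind lr_scheduler_init / get_learning_rate, with the chosen lr_scheduler_type printed in the settings table above. Whether the new scheduler supports other types is not visible in this hunk, but the removed cosine schedule, restated in Python, is:

    import math

    def get_lr_cosine(step, max_lr, warmup_iters, total_iters, final_frac):
        # linear warmup to max_lr, then cosine decay to max_lr * final_frac
        if step < warmup_iters:
            return max_lr * (step + 1) / warmup_iters
        decay_ratio = (step - warmup_iters) / (total_iters - warmup_iters)
        assert 0.0 <= decay_ratio <= 1.0
        coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))   # starts at 1, ends at 0
        min_lr = max_lr * final_frac
        return min_lr + coeff * (max_lr - min_lr)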
grad_clip / grad_norm : 1.0f; + gpt2_update(&model, step_learning_rate, 0.9f, 0.95f, 1e-8f, weight_decay, grad_scale, step+1, &multi_gpu_config); } - // update the model parameters - float grad_norm = gpt2_update(&model, step_learning_rate, 0.9f, 0.95f, 1e-8f, weight_decay, 1.0f, step+1, &multi_gpu_config); - gpt2_multi_gpu_gather(&model, &multi_gpu_config); - // zero out the gradients for the next iteration - gpt2_zero_grad(&model); cudaCheck(cudaEventRecord(end)); cudaCheck(cudaEventSynchronize(end)); // wait for the end event to finish to get correct timings // --------------- TRAINING SECTION END ------------------- @@ -3432,12 +1856,17 @@ int main(int argc, char *argv[]) { ema_tokens_per_second = 0.95f * ema_tokens_per_second + 0.05f * tokens_per_second; bias_corrected_ema_tokens_per_second = ema_tokens_per_second / (1.0f - powf(0.95f, step)); } - float accumulated_loss = multi_gpu_config.num_processes == 1 ? model.mean_loss : model.accumulated_mean_loss; float mfu = gpt2_estimate_mfu(&model, B * T * grad_accum_steps, time_elapsed_ms / 1000.0f); - printf0("step %4d/%d | train loss %7.6f | norm %6.4f | lr %.2e | %.2f ms | %.1f%% A100 fp16 MFU | %.0f tok/s\n", - step + 1, train_num_batches, accumulated_loss, grad_norm, step_learning_rate, + printf0("step %4d/%d | loss %7.6f (%+.2fz)| norm %6.4f (%+.2fz)| lr %.2e | %.2f ms | %.1f%% bf16 MFU | %.0f tok/s\n", + step + 1, train_num_batches, model.mean_loss, zloss, grad_norm, zgrad, step_learning_rate, time_elapsed_ms, 100*mfu, bias_corrected_ema_tokens_per_second); - logger_log_train(&logger, step, model.mean_loss); + if(log_gpu_every > 0 && (step + 1) % log_gpu_every == 0) { + GPUUtilInfo gpu_info = get_gpu_utilization_info(); + printf0(" compute %2.1f%% | memory: %2.1f%% | fan: %2d%% | %4d MHz / %4d MHz | %3d W / %3d W | %d°C / %d°C | %s\n", + gpu_info.gpu_utilization, gpu_info.mem_utilization, gpu_info.fan, gpu_info.clock, gpu_info.max_clock, gpu_info.power / 1000, gpu_info.power_limit / 1000, + gpu_info.temperature, gpu_info.temp_slowdown, gpu_info.throttle_reason); + } + logger_log_train(&logger, step, model.mean_loss, step_learning_rate, grad_norm); // disable the profiler after 3 steps of optimization if (step == 3) { cudaProfilerStop(); } @@ -3456,6 +1885,7 @@ int main(int argc, char *argv[]) { free(cpu_logits); free(gen_tokens); multi_gpu_config_free(&multi_gpu_config); + gpt2_free(&model); common_free(model); return 0; } diff --git a/train_gpt2.py b/train_gpt2.py index c93675c1e..b9dee8701 100644 --- a/train_gpt2.py +++ b/train_gpt2.py @@ -32,6 +32,7 @@ from torch.nn.parallel import DistributedDataParallel as DDP from torch.distributed import init_process_group, destroy_process_group from torch.distributed.optim import ZeroRedundancyOptimizer +import torch.distributed as dist # ----------------------------------------------------------------------------- # PyTorch nn.Module definitions for the GPT-2 model @@ -767,21 +768,18 @@ def get_lr(it): if (args.sample_every > 0 \ and (step % args.sample_every == 0 or last_step)) \ and master_process: - # TODO I'm not sure why this sampling code (which worked fine) - # doesn't work anymore when placed here debug later - if False: - model.eval() - # before we end, let's also do one round of inference - # we'll kick off the generation with "<|endoftext|>", which designates the start of a new sequence - start_ids = [enc.eot_token] - x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...]) - max_new_tokens = 32 - temperature = 1.0 - top_k = 40 - y = raw_model.generate(x, 
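The tokens-per-second figure printed below is smoothed with an exponential moving average (decay 0.95) plus the standard bias correction for the zero initialization, ema / (1 - 0.95^step). A tiny numeric sketch of that correction (the throughput numbers are invented):

    ema, beta = 0.0, 0.95
    for step, tok_per_sec in enumerate([10_000.0, 10_500.0, 9_800.0], start=1):
        ema = beta * ema + (1 - beta) * tok_per_sec
        corrected = ema / (1 - beta ** step)      # undoes the bias from starting the EMA at 0
        print(step, round(ema, 1), round(corrected, 1))
    # step 1: ema is only 500.0, but the corrected value is 10000.0, i.e. the first measurement is reported as-is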
max_new_tokens, temperature=temperature, top_k=top_k) - print0('---------------') - print0(enc.decode(y[0].tolist())) - print0('---------------') + model.eval() + # before we end, let's also do one round of inference + # we'll kick off the generation with "<|endoftext|>", which designates the start of a new sequence + start_ids = [enc.eot_token] + xg = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...]) + max_new_tokens = 32 + temperature = 1.0 + top_k = 40 + yg = raw_model.generate(xg, max_new_tokens, temperature=temperature, top_k=top_k) + print0('---------------') + print0(enc.decode(yg[0].tolist())) + print0('---------------') # bit confusing: we want to make sure to eval and sample on 0th iteration # but also after the very last iteration. so we loop for step <= num_iterations @@ -792,14 +790,21 @@ def get_lr(it): # --------------- TRAINING SECTION BEGIN ----------------- model.train() + optimizer.zero_grad(set_to_none=True) + # if we are trying to overfit a single batch, we reset the loader here + if args.overfit_single_batch: + train_loader.reset() # micro-batch loop where we do gradient accumulation to reach desired total batch size lossf = 0.0 # for getting the mean loss (as simple float) over the accumulation steps for micro_step in range(grad_accum_steps): # fetch a batch - if not args.overfit_single_batch \ - or (args.overfit_single_batch and step == 0 and micro_step == 0): - x, y = train_loader.next_batch() - x, y = x.to(device), y.to(device) + x, y = train_loader.next_batch() + x, y = x.to(device), y.to(device) + if ddp: + # we want only the last micro-step to sync grads in a DDP model + # the official way to do this is with model.no_sync(), but that is a + # context manager that bloats the code, so we just toggle this variable + model.require_backward_grad_sync = (micro_step == grad_accum_steps - 1) # forward pass with ctx: _, loss = model(x, y, return_logits=False) @@ -808,15 +813,13 @@ def get_lr(it): # addition of gradients corresponds to a SUM in the objective, but # instead of a SUM we want MEAN, so we scale the loss here loss = loss / grad_accum_steps - lossf += loss.item() # keep track of the mean loss + lossf += loss.detach() # keep track of the mean loss # backward pass - if ddp: - # we want only the last micro-step to sync grads in a DDP model - # the official way to do this is with model.no_sync(), but that is a - # context manager that bloats the code, so we just toggle this variable - model.require_backward_grad_sync = (micro_step == grad_accum_steps - 1) if not args.inference_only: loss.backward() + if ddp: + dist.all_reduce(lossf, op=dist.ReduceOp.AVG) + lossf = lossf.item() norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip) # determine and set the learning rate for this iteration lr = get_lr(step) @@ -824,7 +827,6 @@ def get_lr(it): param_group['lr'] = lr # step the optimizer optimizer.step() - optimizer.zero_grad(set_to_none=True) # --------------- TRAINING SECTION END ------------------- # everything that follows now is just diagnostics, prints, logging, etc. diff --git a/train_gpt2_fp32.cu b/train_gpt2_fp32.cu index 6553ca009..df412ea5e 100644 --- a/train_gpt2_fp32.cu +++ b/train_gpt2_fp32.cu @@ -23,7 +23,6 @@ the layernorms are connected to the residuals so we += in layernorm backward. 
// GPU / CUDA related #include #include -#include #include #include // our own utilities @@ -60,12 +59,8 @@ void cublasCheck(cublasStatus_t status, const char *file, int line) } #define cublasCheck(status) { cublasCheck((status), __FILE__, __LINE__); } -// cuBLAS workspace. Hardcoding to 32MiB but only Hopper needs 32, for others 4 is OK -static size_t cublaslt_workspace_size = 32 * 1024 * 1024; -static void* cublaslt_workspace = NULL; static cublasComputeType_t cublas_compute_type; cublasHandle_t cublas_handle; -cublasLtHandle_t cublaslt_handle; namespace cg = cooperative_groups; @@ -611,6 +606,87 @@ __global__ void fused_classifier_kernel3(float* logits, float* losses, float* pr } } +__device__ float4 ld_vec(const float* address) { + return *reinterpret_cast(address); +} + +__device__ void st_vec(float* address, float4 val) { + *reinterpret_cast(address) = val; +} + +__global__ void __launch_bounds__(16*16, 2) matmul_forward_kernel4(float* out, + const float* inp, const float* weight, const float* bias, + int C, int OC) { + // out is (B,T,OC). OC is short for "output channels", e.g. OC = 4 * C + // inp is (B,T,C), weight is (OC, C), bias is (OC) + // each thread handles 8x8 elements; each block 128 by 128 elements. + int oc = 8*(blockIdx.y * blockDim.y + threadIdx.y); + + // buffers to cache chunks of the input matrices + __shared__ float lhs_s[128][32]; + __shared__ float rhs_s[128][32]; + + // adjust our pointers for the current block + inp += 128 * blockIdx.x * C; + weight += 128 * blockIdx.y * C; + out += 128 * blockIdx.x * OC + 128 * blockIdx.y; + + float vals[8][8] = {}; + if(bias != NULL) { + for (int i = 0; i < 8; i++) { + for (int j = 0; j < 8; j += 4) { + float4 b = ld_vec(bias + oc + j); + vals[i][j+0] = b.x; + vals[i][j+1] = b.y; + vals[i][j+2] = b.z; + vals[i][j+3] = b.w; + } + } + } + + int si_start = 4*(16 * threadIdx.y + threadIdx.x); + for (int so = 0; so < C; so += 32) { + __syncthreads(); + int xmod8 = threadIdx.x % 8; + int xby8 = threadIdx.x / 8; + int xo = 4 * xmod8; + for(int y = 2 * threadIdx.y + xby8; y < 128; y += 32) { + st_vec(&lhs_s[y][xo], ld_vec(inp + y * C + so + xo)); + st_vec(&rhs_s[y][xo], ld_vec(weight + y * C + so + xo)); + } + __syncthreads(); + + for (int si = si_start; si < si_start + 32; si += 4) { + float4 rhs[8]; + for (int u = 0; u < 8; ++u) { + rhs[u] = ld_vec(&rhs_s[u + 8 * threadIdx.y][si % 32]); + } + + for (int ii = 0; ii < 8; ++ii) { + float4 lhs = ld_vec(&lhs_s[ii + 8 * threadIdx.x][si % 32]); + for (int ji = 0; ji < 8; ++ji) { + vals[ii][ji] += lhs.x * rhs[ji].x; + vals[ii][ji] += lhs.y * rhs[ji].y; + vals[ii][ji] += lhs.z * rhs[ji].z; + vals[ii][ji] += lhs.w * rhs[ji].w; + } + } + } + } + + for (int i = 0; i < 8; ++i) { + for (int j = 0; j < 8; j += 4) { + float4 result; + result.x = vals[i][j + 0]; + result.y = vals[i][j + 1]; + result.z = vals[i][j + 2]; + result.w = vals[i][j + 3]; + st_vec(out + (8*threadIdx.x+i) * OC + 8*threadIdx.y + j, result); + } + } +} + + // ---------------------------------------------------------------------------- // kernel launchers @@ -645,77 +721,18 @@ void layernorm_forward(float* out, float* mean, float* rstd, cudaCheck(cudaGetLastError()); } -// uses cuBLASLt to fuse the bias and gelu. 
does not work with OC = 50257 (last layer) -// https://docs.nvidia.com/cuda/cublas/#cublasltmatmul -// https://github.com/NVIDIA/CUDALibrarySamples/blob/master/cuBLASLt/LtSgemm/sample_cublasLt_LtSgemm.cu -void matmul_forward_cublaslt(float* out, - float* inp, float* weight, float* bias, - int B, int T, int C, int OC) { - int has_bias = (bias != NULL); - - // check bias alignment - if(((uintptr_t)bias % 16) != 0) { - printf("Bias pointer is not aligned (cuBLASLt requirement)!\n"); - exit(EXIT_FAILURE); - } - - int returnedResults = 0; - cublasLtMatmulDesc_t operationDesc; - cublasLtMatmulPreference_t preference; - cublasLtMatrixLayout_t weightLayout; - cublasLtMatrixLayout_t inputLayout; - cublasLtMatrixLayout_t outputLayout; - cublasLtMatrixLayout_t biasLayout; - cublasLtMatmulHeuristicResult_t heuristic; - - // create the operation descriptor - cublasOperation_t opNoTranspose = CUBLAS_OP_N; - cublasOperation_t opTranspose = CUBLAS_OP_T; - cublasLtEpilogue_t epilogueBias = CUBLASLT_EPILOGUE_BIAS; - cublasCheck(cublasLtMatmulDescCreate(&operationDesc, cublas_compute_type, CUDA_R_32F)); - cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSA, &opTranspose, sizeof(opTranspose))); - cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opNoTranspose, sizeof(opNoTranspose))); - if(has_bias) { - cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_EPILOGUE, &epilogueBias, - sizeof(epilogueBias))); - } - cublasCheck(cublasLtMatmulDescSetAttribute(operationDesc, CUBLASLT_MATMUL_DESC_BIAS_POINTER, &bias, sizeof(bias))); - - // define matrix layouts - cublasCheck(cublasLtMatrixLayoutCreate(&weightLayout, CUDA_R_32F, C, OC, C)); - cublasCheck(cublasLtMatrixLayoutCreate(&inputLayout, CUDA_R_32F, C, B*T, C)); - cublasCheck(cublasLtMatrixLayoutCreate(&outputLayout, CUDA_R_32F, OC, B*T, OC)); - cublasCheck(cublasLtMatrixLayoutCreate(&biasLayout, CUDA_R_32F, OC, 1, OC)); - - // create a preference handle with specified max workspace - cublasCheck(cublasLtMatmulPreferenceCreate(&preference)); - cublasCheck(cublasLtMatmulPreferenceSetAttribute(preference, - CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, - &cublaslt_workspace_size, sizeof(cublaslt_workspace_size))); - - // find a suitable algorithm - cublasCheck(cublasLtMatmulAlgoGetHeuristic(cublaslt_handle, operationDesc, - weightLayout, inputLayout, outputLayout, outputLayout, - preference, 1, &heuristic, &returnedResults)); - if (returnedResults == 0) { - printf("No cuBLASLt algorithm: B: %d, T: %d, C: %d, OC: %d, bias: %d\n", B, T, C, OC, has_bias); - exit(EXIT_FAILURE); - } - - // call the matmul - const float alpha = 1.0f, beta = 0.0f; - cublasCheck(cublasLtMatmul(cublaslt_handle, operationDesc, - &alpha, weight, weightLayout, inp, inputLayout, &beta, - out, outputLayout, out, outputLayout, &heuristic.algo, - cublaslt_workspace, cublaslt_workspace_size, 0)); - - // cleanups - cublasCheck(cublasLtMatmulPreferenceDestroy(preference)); - cublasCheck(cublasLtMatmulDescDestroy(operationDesc)); - cublasCheck(cublasLtMatrixLayoutDestroy(weightLayout)); - cublasCheck(cublasLtMatrixLayoutDestroy(inputLayout)); - cublasCheck(cublasLtMatrixLayoutDestroy(outputLayout)); - cublasCheck(cublasLtMatrixLayoutDestroy(biasLayout)); +// kernel 1 is the most naive matmul kernel +void matmul_forward(float* out, + const float* inp, const float* weight, const float* bias, + int B, int T, int C, int OC) { + // out is (B,T,OC). OC is short for "output channels", e.g. 
OC = 4 * C + // inp is (B,T,C), weight is (OC, C), bias is (OC) + int sqrt_block_size = 16; + + dim3 gridDim(CEIL_DIV(B * T, 8*sqrt_block_size), CEIL_DIV(OC, 8*sqrt_block_size)); + dim3 blockDim(sqrt_block_size, sqrt_block_size); + matmul_forward_kernel4<<>>(out, inp, weight, bias, C, OC); + cudaCheck(cudaGetLastError()); } void attention_forward(float* out, float* qkvr, float* att, @@ -1255,20 +1272,20 @@ void gpt2_forward(GPT2 *model, int* inputs, int* targets, int B, int T) { // now do the forward pass layernorm_forward(l_ln1, l_ln1_mean, l_ln1_rstd, residual, l_ln1w, l_ln1b, B, T, C); - matmul_forward_cublaslt(scratch, l_ln1, l_qkvw, l_qkvb, B, T, C, 3*C); + matmul_forward(scratch, l_ln1, l_qkvw, l_qkvb, B, T, C, 3*C); attention_forward(l_atty, l_qkvr, l_att, scratch, B, T, C, NH); - matmul_forward_cublaslt(l_attproj, l_atty, l_attprojw, l_attprojb, B, T, C, C); + matmul_forward(l_attproj, l_atty, l_attprojw, l_attprojb, B, T, C, C); residual_forward(l_residual2, residual, l_attproj, B*T*C); layernorm_forward(l_ln2, l_ln2_mean, l_ln2_rstd, l_residual2, l_ln2w, l_ln2b, B, T, C); - matmul_forward_cublaslt(l_fch, l_ln2, l_fcw, l_fcb, B, T, C, 4*C); + matmul_forward(l_fch, l_ln2, l_fcw, l_fcb, B, T, C, 4*C); gelu_forward(l_fch_gelu, l_fch, B*T*4*C); - matmul_forward_cublaslt(l_fcproj, l_fch_gelu, l_fcprojw, l_fcprojb, B, T, 4*C, C); + matmul_forward(l_fcproj, l_fch_gelu, l_fcprojw, l_fcprojb, B, T, 4*C, C); residual_forward(l_residual3, l_residual2, l_fcproj, B*T*C); } residual = acts.residual3 + (L-1) * B * T * C; // last residual is in residual3 layernorm_forward(acts.lnf, acts.lnf_mean, acts.lnf_rstd, residual, params.lnfw, params.lnfb, B, T, C); - matmul_forward_cublaslt(acts.output, acts.lnf, params.wte, NULL, B, T, C, Vp); + matmul_forward(acts.output, acts.lnf, params.wte, NULL, B, T, C, Vp); // also forward the cross-entropy loss function if we have the targets if (targets != NULL) { @@ -1594,13 +1611,11 @@ int main(int argc, char *argv[]) { cudaGetDeviceProperties(&deviceProp, deviceIdx); // setup cuBLAS and cuBLASLt cublasCheck(cublasCreate(&cublas_handle)); - cublasCheck(cublasLtCreate(&cublaslt_handle)); // TF32 precision is equivalent to torch.set_float32_matmul_precision('high') int enable_tf32 = deviceProp.major >= 8 ? 1 : 0; cublas_compute_type = enable_tf32 ? CUBLAS_COMPUTE_32F_FAST_TF32 : CUBLAS_COMPUTE_32F; cublasMath_t cublas_math_mode = enable_tf32 ? CUBLAS_TF32_TENSOR_OP_MATH : CUBLAS_DEFAULT_MATH; cublasCheck(cublasSetMathMode(cublas_handle, cublas_math_mode)); - cudaCheck(cudaMalloc(&cublaslt_workspace, cublaslt_workspace_size)); printf("| device | %-50s |\n", deviceProp.name); printf("| TF32 | %-50s |\n", enable_tf32 ? 
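For reference, the hand-written kernel that replaces the cuBLASLt path in this legacy fp32 file computes a plain row-major matmul, out = inp @ weight^T + bias, with each block producing a 128x128 output tile (8x8 outputs per thread in a 16x16 block). A NumPy restatement of the operation and of the launch geometry, with example sizes only:

    import numpy as np

    B, T, C, OC = 4, 64, 256, 1024                     # example sizes, not from the diff
    inp = np.random.randn(B * T, C).astype(np.float32)
    weight = np.random.randn(OC, C).astype(np.float32)
    bias = np.random.randn(OC).astype(np.float32)
    out = inp @ weight.T + bias                        # (B*T, OC), what matmul_forward produces

    ceil_div = lambda a, b: -(-a // b)
    grid = (ceil_div(B * T, 8 * 16), ceil_div(OC, 8 * 16))   # mirrors gridDim in the launcher
    print(out.shape, grid)                             # (256, 1024) (2, 8)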
"enabled" : "disabled"); printf("+-----------------------+----------------------------------------------------+\n"); @@ -1619,8 +1634,8 @@ int main(int argc, char *argv[]) { // build DataLoaders for both train and val DataLoader train_loader, val_loader; - dataloader_init(&train_loader, train_data_pattern, B, T, 0, 1); - dataloader_init(&val_loader, val_data_pattern, B, T, 0, 1); + dataloader_init(&train_loader, train_data_pattern, B, T, 0, 1, 1); + dataloader_init(&val_loader, val_data_pattern, B, T, 0, 1, 0); int train_num_batches = train_loader.num_tokens / (B*T); // let's do 1 epoch by default for now int val_num_batches = val_loader.num_tokens / (B*T); if (val_num_batches > val_max_steps) { val_num_batches = val_max_steps; } @@ -1732,9 +1747,7 @@ int main(int argc, char *argv[]) { gpt2_free(&model); free(cpu_logits); free(gen_tokens); - cudaCheck(cudaFree(cublaslt_workspace)); cublasCheck(cublasDestroy(cublas_handle)); - cublasCheck(cublasLtDestroy(cublaslt_handle)); logger_free(&logger); return 0; diff --git a/train_llama3.py b/train_llama3.py new file mode 100644 index 000000000..31596c306 --- /dev/null +++ b/train_llama3.py @@ -0,0 +1,1284 @@ +""" +Reference code for LLaMA-3.1 training and inference. +Will save the model weights into files, to be read from C as initialization. + +This code differs from GPT-2 very slightly, there are three main differences: +1) RoPE: LLaMA uses a different positional encoding scheme called Relative Positional Encoding (RoPE). +2) GQA: Grouped Query Attention (GQA) is used to reduce the number of attention heads. +3) SwiGLU: Swish-Gated Linear Unit (SwiGLU) is used as the activation function in the MLP. + +References: +# 1) https://github.com/meta-llama/llama-models/blob/main/models/llama3_1/api/tokenizer.py +# 2) https://github.com/meta-llama/llama-models/blob/main/models/llama3_1/api/model.py +# 3) https://github.com/meta-llama/llama3/blob/11817d47e1ba7a4959b025eb1ca308572e0e3963/llama/generation.py + +Example launches to only benchmark the speed of bfloat16 compiled GPU training: +TODO: add the actual commands +""" + +import os +import math +import glob +import inspect +from contextlib import nullcontext +from dataclasses import dataclass +import json +from pathlib import Path +from typing import ( + AbstractSet, + Callable, + Collection, + Dict, + Iterator, + List, + Literal, + Optional, + Sequence, + Tuple, + Union, + cast, +) + +import numpy as np +import torch +import torch.nn as nn +from torch.nn import functional as F +import torch._inductor.config as config +from torch.nn.parallel import DistributedDataParallel as DDP +from torch.distributed import init_process_group, destroy_process_group +from torch.distributed.optim import ZeroRedundancyOptimizer +import torch.distributed as dist + +import tiktoken +from tiktoken.load import load_tiktoken_bpe + +# ----------------------------------------------------------------------------- +# PyTorch nn.Module definitions for the LLaMA 3.x model + +# using a global to toggle flash-attention +FLASH = 0 + +# Used in Grouped Query Attention (GQA), broadcasts the key and value tensors +def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor: + """torch.repeat_interleave(x, dim=2, repeats=n_rep)""" + bs, slen, n_kv_heads, head_dim = x.shape + if n_rep == 1: + return x + return ( + x[:, :, :, None, :] + .expand(bs, slen, n_kv_heads, n_rep, head_dim) + .reshape(bs, slen, n_kv_heads * n_rep, head_dim) + ) + +# ----------------------------------------------------------------------------- +# RoPE related + 
+def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor): + ndim = x.ndim + assert 0 <= 1 < ndim + assert freqs_cis.shape == (x.shape[1], x.shape[-1]) + shape = [d if i == 1 or i == ndim - 1 else 1 for i, d in enumerate(x.shape)] + return freqs_cis.view(*shape) + +def apply_scaling(freqs: torch.Tensor): + # Values obtained from grid search + scale_factor = 8 + low_freq_factor = 1 + high_freq_factor = 4 + old_context_len = 8192 # original llama3 length + + low_freq_wavelen = old_context_len / low_freq_factor + high_freq_wavelen = old_context_len / high_freq_factor + new_freqs = [] + for freq in freqs: + wavelen = 2 * math.pi / freq + if wavelen < high_freq_wavelen: + new_freqs.append(freq) + elif wavelen > low_freq_wavelen: + new_freqs.append(freq / scale_factor) + else: + assert low_freq_wavelen != high_freq_wavelen + smooth = (old_context_len / wavelen - low_freq_factor) / ( + high_freq_factor - low_freq_factor + ) + new_freqs.append((1 - smooth) * freq / scale_factor + smooth * freq) + return torch.tensor(new_freqs, dtype=freqs.dtype, device=freqs.device) + +def apply_rotary_emb( + xq: torch.Tensor, + xk: torch.Tensor, + freqs_cis: torch.Tensor, +) -> Tuple[torch.Tensor, torch.Tensor]: + xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) + xk_ = torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) + freqs_cis = reshape_for_broadcast(freqs_cis, xq_) + xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3) + xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3) + return xq_out.type_as(xq), xk_out.type_as(xk) + +def precompute_freqs_cis( + dim: int, end: int, theta: float = 10000.0, use_scaled: bool = False +): + freqs = 1.0 / (theta ** (torch.arange(0, dim, 2)[: (dim // 2)].float() / dim)) + t = torch.arange(end, device=freqs.device, dtype=torch.float32) + if use_scaled: + freqs = apply_scaling(freqs) + freqs = torch.outer(t, freqs) + freqs_cis = torch.polar(torch.ones_like(freqs), freqs) # complex64 + return freqs_cis + +# ----------------------------------------------------------------------------- +# LLaMA building blocks + +# LLaMA reference code explicitly implemented RMSNorm so we copy pasted it +# (https://github.com/meta-llama/llama-models/blob/main/models/llama3_1/api/model.py) +# we could also use nn.RMSNorm, it has slightly different numeric properties, but equivalent +class RMSNorm(torch.nn.Module): + def __init__(self, dim: int, eps: float = 1e-6): + super().__init__() + self.eps = eps + self.weight = nn.Parameter(torch.ones(dim)) + + def _norm(self, x): + return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps) + + def forward(self, x): + output = self._norm(x.float()).type_as(x) + return output * self.weight + +class CausalSelfAttention(nn.Module): + + def __init__(self, config): + super().__init__() + assert config.n_embd % config.n_head == 0 + + self.n_head = config.n_head + self.n_kv_head = config.n_kv_head + self.n_rep = self.n_head // self.n_kv_head + self.hd = config.n_embd // config.n_head + self.use_kv = config.use_kv + + self.c_attn = nn.Linear(config.n_embd, (config.n_head + 2 * config.n_kv_head) * self.hd, bias=False) # key, query, value projections + self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=False) # output projection + + # static KV cache - we could alternatively allocate it outside of the model and just pass it in when needed + if self.use_kv: + self.cache_k = torch.zeros((config.max_gen_batch_size, config.block_size, config.n_kv_head, self.hd)) + self.cache_v = 
torch.zeros((config.max_gen_batch_size, config.block_size, config.n_kv_head, self.hd)) + + def forward(self, x, freqs_cis=None, start_pos=None, mask=None): + B, T, C = x.size() # batch size, sequence length, embedding dimensionality (n_embd) + # calculate query, key, values for all heads in batch and move head forward to be the batch dim + qkv = self.c_attn(x) + q, k, v = qkv.split([self.n_head * self.hd, self.n_kv_head * self.hd, self.n_kv_head * self.hd], dim=-1) + q, k, v = map(lambda t: t.view(B, T, -1, self.hd), (q, k, v)) # (B, T, NH, HD) + + q, k = apply_rotary_emb(q, k, freqs_cis=freqs_cis) # rotate QK (rope) <-- 1. difference compared to GPT-2 + + if self.use_kv and not self.training and start_pos >= 0: # use kv-caching during inference + self.cache_k[:B, start_pos : start_pos + T] = k + self.cache_v[:B, start_pos : start_pos + T] = v + k = self.cache_k[:B, : start_pos + T] + v = self.cache_v[:B, : start_pos + T] + + k = repeat_kv(k, self.n_rep) # GQA <-- 2. difference compared to GPT-2 + v = repeat_kv(v, self.n_rep) + + q, k, v = map(lambda t: t.transpose(1, 2), (q, k, v)) # (B, NH, T, HD) + + if FLASH: + # flashattention + y = F.scaled_dot_product_attention(q, k, v, mask) + else: + # manual implementation of attention + # this materializes the large (T,T) matrix for all the queries and keys + scores = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(self.hd)) + if mask is not None: + scores.masked_fill_(mask, torch.finfo(scores.dtype).min) + att = F.softmax(scores.float(), dim=-1).type_as(q) + y = att @ v # (B, NH, T, T) x (B, NH, T, HD) -> (B, NH, T, HD) + y = y.transpose(1, 2).contiguous().view(B, T, C) + y = self.c_proj(y) + return y + +class MLP(nn.Module): + + def __init__(self, config): + super().__init__() + hidden_dim = 4 * config.n_embd + hidden_dim = int(2 * hidden_dim / 3) + # custom dim factor multiplier + if config.ffn_dim_multiplier is not None: + hidden_dim = int(config.ffn_dim_multiplier * hidden_dim) + hidden_dim = config.multiple_of * ((hidden_dim + config.multiple_of - 1) // config.multiple_of) + self.c_fc = nn.Linear(config.n_embd, hidden_dim, bias=False) + self.c_fc2 = nn.Linear(config.n_embd, hidden_dim, bias=False) + self.c_proj = nn.Linear(hidden_dim, config.n_embd, bias=False) + + def forward(self, x): + # SwiGLU self.c_proj(F.silu(self.c_fc2(x)) * self.c_fc(x)) <-- 3. 
difference compared to GPT-2 + x1 = self.c_fc(x) + x2 = self.c_fc2(x) + x2 = F.silu(x2) + x = x1 * x2 + x = self.c_proj(x) + return x + +class Block(nn.Module): + + def __init__(self, config): + super().__init__() + self.ln_1 = RMSNorm(config.n_embd, config.norm_eps) + self.attn = CausalSelfAttention(config) + self.ln_2 = RMSNorm(config.n_embd, config.norm_eps) + self.mlp = MLP(config) + + def forward(self, x, freqs_cis=None, start_pos=None, mask=None): + x = x + self.attn(self.ln_1(x), freqs_cis, start_pos, mask) + x = x + self.mlp(self.ln_2(x)) + return x + +# ----------------------------------------------------------------------------- +# The main LLaMA 3.1 model + +@dataclass +class LlamaConfig: + version: str = "3.1" + block_size: int = 8192 + vocab_size: int = 128256 + n_layer: int = 32 + n_head: int = 32 + n_kv_head: int = 8 + n_embd: int = 4096 + ffn_dim_multiplier: float = 1.3 + multiple_of: int = 1024 + norm_eps: float = 1e-5 + rope_theta: float = 500000.0 + use_scaled_rope: bool = True + max_gen_batch_size: int = 4 + use_kv: bool = True + + def __init__(self, **kwargs): + for k, v in kwargs.items(): + if hasattr(self, k): + setattr(self, k, v) + assert self.n_kv_head <= self.n_head + assert self.n_head % self.n_kv_head == 0 + assert self.n_embd % self.n_head == 0 + +class LLaMA(nn.Module): + + def __init__(self, config): + super().__init__() + self.config = config + + self.transformer = nn.ModuleDict(dict( + wte = nn.Embedding(config.vocab_size, config.n_embd), + h = nn.ModuleList([Block(config) for _ in range(config.n_layer)]), + ln_f = RMSNorm(config.n_embd, config.norm_eps), + )) + self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=False) + + # init all weights, use a torch rng object to be very careful + self.init_rng = torch.Generator() + self.init_rng.manual_seed(42) + + self.freqs_cis = precompute_freqs_cis( + config.n_embd // config.n_head, + config.block_size * 2, + config.rope_theta, + config.use_scaled_rope, + ) + + def forward(self, idx, targets=None, return_logits=True, start_pos=0): + _, t = idx.size() + assert t <= self.config.block_size, f"Cannot forward sequence of length {t}, block size is only {self.config.block_size}" + + # forward the LLaMA model itself + x = self.transformer.wte(idx) # token embeddings of shape (b, t, n_embd) + freqs_cis = self.freqs_cis[start_pos:start_pos+t] + + mask = torch.triu(torch.ones((t, t), device=next(self.parameters()).device, dtype=torch.bool), diagonal=1) + + for i, block in enumerate(self.transformer.h): + x = block(x, freqs_cis, start_pos, mask) + x = self.transformer.ln_f(x) + + if targets is not None: + # if we are given some desired targets also calculate the loss + logits = self.lm_head(x).float() + loss = F.cross_entropy(logits.view(-1, logits.size(-1)), targets.view(-1), ignore_index=-1) + else: + # inference-time mini-optimization: only forward the lm_head on the very last position + logits = self.lm_head(x[:, [-1], :]).float() # note: using list [-1] to preserve the time dim + loss = None + + # there are performance reasons why not returning logits is prudent, if not needed + if not return_logits: + logits = None + + return logits, loss + + @staticmethod + def adapt_llama_state_dict_keys(checkpoint, config: LlamaConfig): + # Modify key names from Meta's LLaMA to our LLaMA + # our key names are derived from GPT-2's key names + checkpoint['transformer.wte.weight'] = checkpoint.pop('tok_embeddings.weight') + + for i in range(config.n_layer): + for name in ['attention_norm', 'ffn_norm']: + old_key = 
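As the SwiGLU comment just above notes, the MLP does not use a plain 4*C hidden size: it shrinks 4*C by 2/3, applies ffn_dim_multiplier, and rounds up to a multiple of multiple_of. Plugging in the LlamaConfig defaults shown below (n_embd=4096, ffn_dim_multiplier=1.3, multiple_of=1024) reproduces the familiar FFN width of the 8B model:

    n_embd, ffn_dim_multiplier, multiple_of = 4096, 1.3, 1024
    hidden = 4 * n_embd                                       # 16384
    hidden = int(2 * hidden / 3)                              # 10922
    hidden = int(ffn_dim_multiplier * hidden)                 # 14198
    hidden = multiple_of * ((hidden + multiple_of - 1) // multiple_of)
    print(hidden)                                             # 14336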
f'layers.{i}.{name}.weight' # e.g. layers.x.attention_norm.weight -> transformer.h.x.ln_1.weight + new_key = f'transformer.h.{i}.ln_{1 if name == "attention_norm" else 2}.weight' + checkpoint[new_key] = checkpoint.pop(old_key) + + for i in range(config.n_layer): + for name in ['attention.wq', 'attention.wk', 'attention.wv']: + old_key = f'layers.{i}.{name}.weight' + new_key = f'transformer.h.{i}.attn.c_attn.weight' + if name == 'attention.wq': + checkpoint[new_key] = checkpoint.pop(old_key) + else: # merge 3 weights into transformer.h.x.attn.c_attn.weight + checkpoint[new_key] = torch.cat((checkpoint[new_key], checkpoint.pop(old_key)), dim=0) + old_key = f'layers.{i}.attention.wo.weight' + new_key = f'transformer.h.{i}.attn.c_proj.weight' + checkpoint[new_key] = checkpoint.pop(old_key) + + ffn_map = {'w1': 'c_fc2', 'w2': 'c_proj', 'w3': 'c_fc'} + for i in range(config.n_layer): + for name in ['feed_forward.w1', 'feed_forward.w2', 'feed_forward.w3']: + old_key = f'layers.{i}.{name}.weight' + new_key = f'transformer.h.{i}.mlp.{ffn_map[name.split(".")[-1]]}.weight' + checkpoint[new_key] = checkpoint.pop(old_key) + + checkpoint['transformer.ln_f.weight'] = checkpoint.pop('norm.weight') + checkpoint['lm_head.weight'] = checkpoint.pop('output.weight') + + return checkpoint + + @staticmethod + def adapt_llama_state_dict_keys_hf(checkpoint, config: LlamaConfig): + # Modify key names from HuggingFace's LLaMA to our LLaMA + # our key names are derived from GPT-2's key names + checkpoint['transformer.wte.weight'] = checkpoint.pop('model.embed_tokens.weight') + + # We need to unpermute K and V because HF script permuted the original Meta-LLaMA weights + # see: https://github.com/huggingface/transformers/blob/main/src/transformers/models/llama/convert_llama_weights_to_hf.py + def unpermute(w, n_heads, dim1, dim2): + return w.view(n_heads, 2, dim1 // n_heads // 2, dim2).transpose(1, 2).reshape(dim1, dim2) + + for i in range(config.n_layer): + for name in ['input_layernorm', 'post_attention_layernorm']: + old_key = f'model.layers.{i}.{name}.weight' # e.g. 
layers.x.attention_norm.weight -> transformer.h.x.ln_1.weight + new_key = f'transformer.h.{i}.ln_{1 if name == "input_layernorm" else 2}.weight' + checkpoint[new_key] = checkpoint.pop(old_key) + + for i in range(config.n_layer): + for name in ['self_attn.q_proj', 'self_attn.k_proj', 'self_attn.v_proj']: + old_key = f'model.layers.{i}.{name}.weight' + new_key = f'transformer.h.{i}.attn.c_attn.weight' + if name == 'self_attn.q_proj': + checkpoint[new_key] = unpermute(checkpoint.pop(old_key), config.n_head, config.n_embd, config.n_embd) + else: # merge 3 weights into transformer.h.x.attn.c_attn.weight + tensor = checkpoint.pop(old_key) + if name == 'self_attn.k_proj': + tensor = unpermute(tensor, config.n_kv_head, config.n_kv_head * (config.n_embd // config.n_head), config.n_embd) + checkpoint[new_key] = torch.cat((checkpoint[new_key], tensor), dim=0) + old_key = f'model.layers.{i}.self_attn.o_proj.weight' + new_key = f'transformer.h.{i}.attn.c_proj.weight' + checkpoint[new_key] = checkpoint.pop(old_key) + + ffn_map = {'gate_proj': 'c_fc2', 'down_proj': 'c_proj', 'up_proj': 'c_fc'} + for i in range(config.n_layer): + for name in ['gate_proj', 'down_proj', 'up_proj']: + old_key = f'model.layers.{i}.mlp.{name}.weight' + new_key = f'transformer.h.{i}.mlp.{ffn_map[name]}.weight' + checkpoint[new_key] = checkpoint.pop(old_key) + + checkpoint['transformer.ln_f.weight'] = checkpoint.pop('model.norm.weight') + + return checkpoint + + @classmethod + def from_pretrained_llama3_hf(cls, model_id): + """Loads pretrained LLaMA model weights from HuggingFace""" + from transformers import AutoModelForCausalLM, AutoTokenizer + assert model_id == "meta-llama/Meta-Llama-3.1-8B", "Only the 8B-bae model is supported for now" + model_args = LlamaConfig() + + model = AutoModelForCausalLM.from_pretrained(model_id) + checkpoint = LLaMA.adapt_llama_state_dict_keys_hf(model.state_dict(), model_args) + + original_default_type = torch.get_default_dtype() # save the default type + torch.set_default_tensor_type(torch.cuda.BFloat16Tensor) # much faster loading + model = LLaMA(model_args) + model.load_state_dict(checkpoint, strict=False) + torch.set_default_tensor_type(torch.tensor([], dtype=original_default_type, device="cpu").type()) # restore default type + + tokenizer = AutoTokenizer.from_pretrained(model_id) + tokenizer.pad_id = 128004 # this is the pad token id for LLaMA 3.1 base, we need to set this explicitly as our generate func expects it + tokenizer.stop_tokens = [tokenizer.eos_token_id] + model.tokenizer = tokenizer + return model + + @classmethod + def from_pretrained_llama3_meta(cls, ckpt_dir, tokenizer_path): + """Loads pretrained LLaMA model weights from a checkpoint directory""" + model_args = LlamaConfig() + + ckpt_path = sorted(Path(ckpt_dir).glob("*.pth"))[0] + checkpoint = torch.load(ckpt_path, map_location="cpu", weights_only=True) + checkpoint = LLaMA.adapt_llama_state_dict_keys(checkpoint, model_args) + + original_default_type = torch.get_default_dtype() # save the default type + torch.set_default_tensor_type(torch.cuda.BFloat16Tensor) # much faster loading + model = LLaMA(model_args) + model.load_state_dict(checkpoint, strict=False) + torch.set_default_tensor_type(torch.tensor([], dtype=original_default_type, device="cpu").type()) # restore default type + + tokenizer = Tokenizer(model_path=tokenizer_path) + model.tokenizer = tokenizer + return model + + def configure_optimizers(self, weight_decay, learning_rate, betas, device_type, zero_stage): + # start with all of the candidate parameters + 
param_dict = {pn: p for pn, p in self.named_parameters()} + # filter out those that do not require grad + param_dict = {pn: p for pn, p in param_dict.items() if p.requires_grad} + # create optim groups. Any parameters that is 2D will be weight decayed, otherwise no. + # i.e. all weight tensors in matmuls + embeddings decay, all biases and layernorms don't. + decay_params = [p for n, p in param_dict.items() if p.dim() >= 2] + nodecay_params = [p for n, p in param_dict.items() if p.dim() < 2] + optim_groups = [ + {'params': decay_params, 'weight_decay': weight_decay}, + {'params': nodecay_params, 'weight_decay': 0.0} + ] + num_decay_params = sum(p.numel() for p in decay_params) + num_nodecay_params = sum(p.numel() for p in nodecay_params) + print0(f"num decayed parameter tensors: {len(decay_params)}, with {num_decay_params:,} parameters") + print0(f"num non-decayed parameter tensors: {len(nodecay_params)}, with {num_nodecay_params:,} parameters") + # Create AdamW optimizer and use the fused version if it is available + fused_available = 'fused' in inspect.signature(torch.optim.AdamW).parameters + use_fused = fused_available and device_type == 'cuda' + print0(f"using fused AdamW: {use_fused}") + if zero_stage == 1: + print0("using ZeroRedundancyOptimizer") + optimizer = ZeroRedundancyOptimizer(**optim_groups[0], optimizer_class=torch.optim.AdamW, + lr=learning_rate, betas=betas, fused=use_fused) + optimizer.add_param_group(optim_groups[1]) + else: + print0("using regular AdamW") + optimizer = torch.optim.AdamW(optim_groups, lr=learning_rate, betas=betas, fused=use_fused) + return optimizer + + @torch.inference_mode() + def generate( + self, + prompt_tokens: List[List[int]], + max_gen_len: int, + temperature: float = 0.6, + top_p: float = 0.9, + logprobs: bool = False, + echo: bool = False, + ) -> Tuple[List[List[int]], Optional[List[List[float]]]]: + """ + Generate text sequences based on provided prompts using the language generation model. + + Args: + prompt_tokens (List[List[int]]): List of tokenized prompts, where each prompt is represented as a list of integers. + max_gen_len (int): Maximum length of the generated text sequence. + temperature (float, optional): Temperature value for controlling randomness in sampling. Defaults to 0.6. + top_p (float, optional): Top-p probability threshold for nucleus sampling. Defaults to 0.9. + logprobs (bool, optional): Flag indicating whether to compute token log probabilities. Defaults to False. + echo (bool, optional): Flag indicating whether to include prompt tokens in the generated output. Defaults to False. + + Returns: + Tuple[List[List[int]], Optional[List[List[float]]]]: A tuple containing generated token sequences and, if logprobs is True, corresponding token log probabilities. + + Note: + This method uses the provided prompts as a basis for generating text. It employs nucleus sampling to produce text with controlled randomness. + If logprobs is True, token log probabilities are computed for each generated token. 
+ + """ + bsz = len(prompt_tokens) + assert bsz <= self.config.max_gen_batch_size, (bsz, self.config.max_gen_batch_size) + device = next(self.parameters()).device + + min_prompt_len = min(len(t) for t in prompt_tokens) + max_prompt_len = max(len(t) for t in prompt_tokens) + assert max_prompt_len <= self.config.block_size + total_len = min(self.config.block_size, max_gen_len + max_prompt_len) + + pad_id = self.tokenizer.pad_id + tokens = torch.full((bsz, total_len), pad_id, dtype=torch.long, device=device) + for k, t in enumerate(prompt_tokens): + tokens[k, : len(t)] = torch.tensor(t, dtype=torch.long, device=device) + if logprobs: + token_logprobs = torch.zeros_like(tokens, dtype=torch.float) + + prev_pos = 0 + eos_reached = torch.tensor([False] * bsz, device=device) + input_text_mask = tokens != pad_id + + if min_prompt_len == total_len: + logits, _ = self.forward(tokens, start_pos=prev_pos) + token_logprobs = -F.cross_entropy( + input=logits.transpose(1, 2), + target=tokens, + reduction="none", + ignore_index=pad_id, + ) + + stop_tokens = torch.tensor(list(self.tokenizer.stop_tokens)).to(device) + + for cur_pos in range(min_prompt_len, total_len): + logits, _ = self.forward(tokens[:, prev_pos:cur_pos], start_pos=prev_pos) + if temperature > 0: + probs = torch.softmax(logits[:, -1] / temperature, dim=-1) + next_token = sample_top_p(probs, top_p) + else: + next_token = torch.argmax(logits[:, -1], dim=-1) + + next_token = next_token.reshape(-1) + # only replace token if prompt has already been generated + next_token = torch.where(input_text_mask[:, cur_pos], tokens[:, cur_pos], next_token) + tokens[:, cur_pos] = next_token + if logprobs: + token_logprobs[:, prev_pos + 1 : cur_pos + 1] = -F.cross_entropy( + input=logits.transpose(1, 2), + target=tokens[:, prev_pos + 1 : cur_pos + 1], + reduction="none", + ignore_index=pad_id, + ) + eos_reached |= (~input_text_mask[:, cur_pos]) & ( + torch.isin(next_token, stop_tokens) + ) + prev_pos = cur_pos + if all(eos_reached): + break + + if logprobs: + token_logprobs = token_logprobs.tolist() + out_tokens, out_logprobs = [], [] + for i, toks in enumerate(tokens.tolist()): + # cut to max gen len + start = 0 if echo else len(prompt_tokens[i]) + toks = toks[start : len(prompt_tokens[i]) + max_gen_len] + probs = None + if logprobs: + probs = token_logprobs[i][start : len(prompt_tokens[i]) + max_gen_len] + # cut to after eos tok if any + for stop_token in self.tokenizer.stop_tokens: + try: + eos_idx = toks.index(stop_token) + toks = toks[:eos_idx] + probs = probs[:eos_idx] if logprobs else None + except ValueError: + pass + out_tokens.append(toks) + out_logprobs.append(probs) + return (out_tokens, out_logprobs if logprobs else None) + +# ----------------------------------------------------------------------------- +# sampling utils + +def sample_top_p(probs, p): + """ + Perform top-p (nucleus) sampling on a probability distribution. + + Args: + probs (torch.Tensor): Probability distribution tensor. + p (float): Probability threshold for top-p sampling. + + Returns: + torch.Tensor: Sampled token indices. + + Note: + Top-p sampling selects the smallest set of tokens whose cumulative probability mass + exceeds the threshold p. The distribution is renormalized based on the selected tokens. 
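The nucleus sampling described here keeps the smallest prefix of the probability-sorted tokens whose preceding cumulative mass does not exceed p, renormalizes that prefix, and samples from it. A tiny usage sketch of the sample_top_p helper whose implementation follows, with a hand-checkable distribution:

    import torch

    probs = torch.tensor([[0.5, 0.3, 0.15, 0.05]])
    # with p = 0.7 the nucleus is {0.5, 0.3}, renormalized to [0.625, 0.375],
    # so only token indices 0 and 1 can ever be drawn
    for _ in range(5):
        print(sample_top_p(probs, 0.7).item())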
+ """ + probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True) + probs_sum = torch.cumsum(probs_sort, dim=-1) + mask = probs_sum - probs_sort > p + probs_sort[mask] = 0.0 + probs_sort.div_(probs_sort.sum(dim=-1, keepdim=True)) + next_token = torch.multinomial(probs_sort, num_samples=1) + next_token = torch.gather(probs_idx, -1, next_token) + return next_token + +# ----------------------------------------------------------------------------- +# Llama 3.1 Tokenizer + +# The tiktoken tokenizer can handle <=400k chars without +# pyo3_runtime.PanicException. +TIKTOKEN_MAX_ENCODE_CHARS = 400_000 + +# https://github.com/openai/tiktoken/issues/195 +# Here we iterate over subsequences and split if we exceed the limit +# of max consecutive non-whitespace or whitespace characters. +MAX_NO_WHITESPACES_CHARS = 25_000 + + +class Tokenizer: + """ + Tokenizing and encoding/decoding text using the Tiktoken tokenizer. + """ + + special_tokens: Dict[str, int] + + num_reserved_special_tokens = 256 + + pat_str = r"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\r\n\p{L}\p{N}]?\p{L}+|\p{N}{1,3}| ?[^\s\p{L}\p{N}]+[\r\n]*|\s*[\r\n]+|\s+(?!\S)|\s+" # noqa: E501 + + def __init__(self, model_path: str): + """ + Initializes the Tokenizer with a Tiktoken model. + + Args: + model_path (str): The path to the Tiktoken model file. + """ + assert os.path.isfile(model_path), model_path + + mergeable_ranks = load_tiktoken_bpe(model_path) + num_base_tokens = len(mergeable_ranks) + special_tokens = [ + "<|begin_of_text|>", + "<|end_of_text|>", + "<|reserved_special_token_0|>", + "<|reserved_special_token_1|>", + "<|finetune_right_pad_id|>", + "<|step_id|>", + "<|start_header_id|>", + "<|end_header_id|>", + "<|eom_id|>", # end of message + "<|eot_id|>", # end of turn + "<|python_tag|>", + ] + reserved_tokens = [ + f"<|reserved_special_token_{2 + i}|>" + for i in range(self.num_reserved_special_tokens - len(special_tokens)) + ] + special_tokens = special_tokens + reserved_tokens + + self.special_tokens = { + token: num_base_tokens + i for i, token in enumerate(special_tokens) + } + self.model = tiktoken.Encoding( + name=Path(model_path).name, + pat_str=self.pat_str, + mergeable_ranks=mergeable_ranks, + special_tokens=self.special_tokens, + ) + + self.n_words: int = num_base_tokens + len(special_tokens) + # BOS / EOS token IDs + self.bos_id: int = self.special_tokens["<|begin_of_text|>"] + self.eos_id: int = self.special_tokens["<|end_of_text|>"] + self.eot_id: int = self.special_tokens["<|eot_id|>"] + self.eom_id: int = self.special_tokens["<|eom_id|>"] + self.python_tag_id = self.special_tokens["<|python_tag|>"] + self.pad_id: int = self.special_tokens["<|finetune_right_pad_id|>"] + # hardcoded stop tokens for the base model + self.stop_tokens = [ + self.special_tokens["<|begin_of_text|>"], + self.special_tokens["<|end_of_text|>"], + ] + + def encode( + self, + s: str, + *, + bos: bool, + eos: bool, + allowed_special: Optional[Union[Literal["all"], AbstractSet[str]]] = None, + disallowed_special: Union[Literal["all"], Collection[str]] = (), + ) -> List[int]: + """ + Encodes a string into a list of token IDs. + + Args: + s (str): The input string to be encoded. + bos (bool): Whether to prepend the beginning-of-sequence token. + eos (bool): Whether to append the end-of-sequence token. + allowed_tokens ("all"|set[str]): allowed special tokens in string + disallowed_tokens ("all"|set[str]): special tokens that raise an error when in string + + Returns: + list[int]: A list of token IDs. 
+
+        By default, setting disallowed_special=() encodes a string by ignoring
+        special tokens. Specifically:
+        - Setting `disallowed_special` to () will cause all text corresponding
+          to special tokens to be encoded as natural text (instead of raising
+          an error).
+        - Setting `allowed_special` to "all" will cause all text corresponding
+          to special tokens to be encoded as special tokens.
+        """
+        if allowed_special is None:
+            allowed_special = set()
+        assert type(s) is str
+
+        substrs = (
+            substr
+            for i in range(0, len(s), TIKTOKEN_MAX_ENCODE_CHARS)
+            for substr in self._split_whitespaces_or_nonwhitespaces(
+                s[i : i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
+            )
+        )
+        t: List[int] = []
+        for substr in substrs:
+            t.extend(
+                self.model.encode(
+                    substr,
+                    allowed_special=allowed_special,
+                    disallowed_special=disallowed_special,
+                )
+            )
+        if bos:
+            t.insert(0, self.bos_id)
+        if eos:
+            t.append(self.eos_id)
+        return t
+
+    def decode(self, t: Sequence[int]) -> str:
+        # Typecast is safe here. Tiktoken doesn't do anything list-related with the sequence.
+        return self.model.decode(cast(List[int], t))
+
+    @staticmethod
+    def _split_whitespaces_or_nonwhitespaces(
+        s: str, max_consecutive_slice_len: int
+    ) -> Iterator[str]:
+        """
+        Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
+        consecutive whitespaces or consecutive non-whitespaces.
+        """
+        current_slice_len = 0
+        current_slice_is_space = s[0].isspace() if len(s) > 0 else False
+        slice_start = 0
+
+        for i in range(len(s)):
+            is_now_space = s[i].isspace()
+
+            if current_slice_is_space ^ is_now_space:
+                current_slice_len = 1
+                current_slice_is_space = is_now_space
+            else:
+                current_slice_len += 1
+                if current_slice_len > max_consecutive_slice_len:
+                    yield s[slice_start:i]
+                    slice_start = i
+                    current_slice_len = 1
+        yield s[slice_start:]
+
+# -----------------------------------------------------------------------------
+# Our own simple Distributed Data Loader
+
+def _peek_data_shard(filename):
+    raise NotImplementedError("_peek_data_shard not yet implemented for llama 3")
+    # only reads the header, returns header data
+    with open(filename, "rb") as f:
+        # first read the header, which is 256 int32 integers (4 bytes each)
+        header = np.frombuffer(f.read(256*4), dtype=np.int32)
+    if header[0] != 20240520:
+        print("ERROR: magic number mismatch in the data .bin file!")
+        print("---> HINT: Are you passing in a correct file with --input_bin?")
+        print("---> HINT: Dataset encoding changed recently, re-run data prepro or refer again to README")
+        print("---> HINT: For example re-run: `python dev/data/tinyshakespeare.py`, then re-try")
+        exit(1)
+    assert header[1] == 1, "unsupported version"
+    ntok = header[2] # number of tokens (claimed)
+    return ntok # for now just return the number of tokens
+
+def _load_data_shard(filename):
+    raise NotImplementedError("_load_data_shard not yet implemented for llama 3")
+    with open(filename, "rb") as f:
+        # first read the header, which is 256 int32 integers (4 bytes each)
+        header = np.frombuffer(f.read(256*4), dtype=np.int32)
+        assert header[0] == 20240520, "magic number mismatch in the data .bin file"
+        assert header[1] == 1, "unsupported version"
+        ntok = header[2] # number of tokens (claimed)
+        # the rest of it are tokens, stored as uint16
+        tokens = np.frombuffer(f.read(), dtype=np.uint16)
+    assert len(tokens) == ntok, "number of tokens read does not match header?"
+ return tokens + +class DistributedDataLoader: + def __init__(self, filename_pattern, B, T, process_rank, num_processes): + self.process_rank = process_rank + self.num_processes = num_processes + self.B = B + self.T = T + + # glob files that match the pattern + self.files = sorted(glob.glob(filename_pattern)) + assert len(self.files) > 0, f"did not find any files that match the pattern {filename_pattern}" + + # load and validate all data shards, count number of tokens in total + ntok_total = 0 + for fname in self.files: + shard_ntok = _peek_data_shard(fname) + assert shard_ntok >= num_processes * B * T + 1 + ntok_total += shard_ntok + self.ntok_total = ntok_total + print0(f"DataLoader: total number of tokens: {ntok_total:,} across {len(self.files)} files") + + # kick things off + self.current_shard = None + self.reset() + + def reset(self): + # we're being a bit clever here: if we already had shard 0 loaded, + # then don't do the work to reload it, just reset the pointer + if self.current_shard != 0: + self.current_shard = 0 + self.tokens = _load_data_shard(self.files[self.current_shard]) + self.current_position = self.process_rank * self.B * self.T + + def advance(self): # advance to next data shard + self.current_shard = (self.current_shard + 1) % len(self.files) + self.current_position = self.process_rank * self.B * self.T + self.tokens = _load_data_shard(self.files[self.current_shard]) + + def next_batch(self): + B = self.B + T = self.T + buf = self.tokens[self.current_position : self.current_position+B*T+1] + buf = torch.tensor(buf.astype(np.int32), dtype=torch.long) + x = (buf[:-1]).view(B, T) # inputs + y = (buf[1:]).view(B, T) # targets + # advance the start pointer in current shard + self.current_position += B * T * self.num_processes + # if loading the next batch would be out of bounds advance the shard + if self.current_position + (B * T * self.num_processes + 1) > len(self.tokens): + self.advance() + return x, y + +# ----------------------------------------------------------------------------- +# Python -> C bridge utilities for saving params/grads/activations to .bin files + +def write_fp32(tensor, file): + t = tensor.detach().cpu().to(torch.float32) + b = t.numpy().tobytes() + file.write(b) + +def write_bf16(tensor, file): + t = tensor.detach().cpu().to(torch.bfloat16) + # numpy doesn't have bf16 datatype so we have to trick it + t = t.view(torch.int16) # trick: reinterpret as int16 + b = t.numpy().tobytes() + file.write(b) + +def write_tensors(model_tensors, L, file, dtype): + # writes LLaMA 3 model's weights to a binary file + assert dtype in {"float32", "bfloat16"} + write_fun = write_fp32 if dtype == "float32" else write_bf16 + write_fun(model_tensors["transformer.wte.weight"], file) # (V, C) + for i in range(L): # (L, C) + write_fun(model_tensors[f"transformer.h.{i}.ln_1.weight"], file) + for i in range(L): # (L, 3C, C) + write_fun(model_tensors[f"transformer.h.{i}.attn.c_attn.weight"], file) + for i in range(L): # (L, C, C) + write_fun(model_tensors[f"transformer.h.{i}.attn.c_proj.weight"], file) + for i in range(L): # (L, C) + write_fun(model_tensors[f"transformer.h.{i}.ln_2.weight"], file) + for i in range(L): # (L, 4C, C) + write_fun(model_tensors[f"transformer.h.{i}.mlp.c_fc.weight"], file) + for i in range(L): # (L, 4C, C) + write_fun(model_tensors[f"transformer.h.{i}.mlp.c_fc2.weight"], file) + for i in range(L): # (L, C, 4C) + write_fun(model_tensors[f"transformer.h.{i}.mlp.c_proj.weight"], file) + write_fun(model_tensors["transformer.ln_f.weight"], file) # (C, 
) + write_fun(model_tensors["lm_head.weight"], file) # (V, C) + +def write_model(model, filename, dtype): + # everything we need to instantiate the model + # 1) header is: version int, LLaMAConfig ints, padding to 1024 bytes + assert dtype in {"float32", "bfloat16"} + version = { + "float32": 3, # 3: all tensors are fp32 + "bfloat16": 5, # 5: all tensors are bf16 + }[dtype] + header = torch.zeros(256, dtype=torch.int32) + header[0] = 20240803 # magic + header[1] = version # checkpoint version + header[2] = model.config.block_size + header[3] = model.config.vocab_size + header[4] = model.config.n_layer + header[5] = model.config.n_head + header[6] = model.config.n_kv_head + header[7] = model.config.n_embd + header[8] = model.config.ffn_dim_multiplier + header[9] = model.config.multiple_of + header[10] = model.config.norm_eps + header[11] = model.config.rope_theta + header[12] = model.config.use_scaled_rope + header[13] = model.config.max_gen_batch_size + header[14] = int(model.config.version.split('.')[0]) # major version + header[15] = int(model.config.version.split('.')[1]) # minor version + # 2) the parameters follow the header + params = {name: param.cpu() for name, param in model.named_parameters()} + # now write to file + with open(filename, "wb") as file: + file.write(header.numpy().tobytes()) # header + write_tensors(params, model.config.n_layer, file, dtype) # params + print(f"wrote {filename}") + +def write_state(model, x, y, logits, loss, filename): + # the state is used for debugging. + # it contains information about the input, logits, loss, and the parameter gradients + # this can be used for checking the computation correctness in C + header = torch.zeros(256, dtype=torch.int32) + header[0] = 20240803 # magic + header[1] = x.size(0) # batch size of the batch, B + header[2] = x.size(1) # temporal extent of the batch, T + grads = {name: param.grad.cpu() for name, param in model.named_parameters()} + with open(filename, "wb") as file: + # header + file.write(header.numpy().tobytes()) + # input x + file.write(x.cpu().numpy().astype("int32").tobytes()) # (B, T) + # targets y + file.write(y.cpu().numpy().astype("int32").tobytes()) # (B, T) + # logits (result of the model forward pass) + write_fp32(logits.cpu(), file) + # loss (single float, result of the cross entropy loss) + write_fp32(loss.cpu(), file) + # gradients + write_tensors(grads, model.config.n_layer, file, "float32") + print(f"wrote {filename}") + +# ----------------------------------------------------------------------------- +# int main + +def print0(*args, **kwargs): + # modified print that only prints from the master process + # if this is not a distributed run, it's just a print + if int(os.environ.get("RANK", 0)) == 0: + print(*args, **kwargs) + +if __name__ == "__main__": + import time + import argparse + print0(f"Running pytorch {torch.version.__version__}") + + # default settings will overfit a tiny batch of data + # and save model weights and debug state to disk on the first iteration + parser = argparse.ArgumentParser() + parser.add_argument("--use_hf", type=int, default=1, help="use HuggingFace (default) or use Meta's model") + parser.add_argument("--ckpt_dir", type=str, default=None, help="path to llama3 model checkpoint") + parser.add_argument("--tokenizer_path", type=str, default=None, help="path to llama3 tokenizer") + # file system input / output + parser.add_argument("--input_bin", type=str, default="dev/data/tinyshakespeare/tiny_shakespeare_val.bin", help="input .bin to train on") + 
+    parser.add_argument("--input_val_bin", type=str, default="", help="input .bin to eval validation loss on")
+    parser.add_argument("--output_dir", type=str, default="", help="output directory to which to write logs and checkpoints")
+    parser.add_argument("--model", type=str, default="meta-llama/Meta-Llama-3.1-8B", help="choose the llama model")
+    # token layout for each step of the optimization
+    parser.add_argument("--batch_size", type=int, default=4, help="batch size, in units of #batch dimensions")
+    parser.add_argument("--sequence_length", type=int, default=64, help="sequence length")
+    parser.add_argument("--total_batch_size", type=int, default=256, help="total desired batch size, in units of #tokens")
+    # workload (number of steps)
+    parser.add_argument("--num_iterations", type=int, default=10, help="number of iterations to run")
+    parser.add_argument("--inference_only", type=int, default=0, help="only run inference")
+    # optimization
+    parser.add_argument("--learning_rate", type=float, default=1e-4, help="learning rate")
+    parser.add_argument("--warmup_iters", type=int, default=0, help="learning rate warmup iterations")
+    parser.add_argument("--learning_rate_decay_frac", type=float, default=1.0, help="fraction of the learning rate to decay to at the end of training")
+    parser.add_argument("--weight_decay", type=float, default=0.0, help="weight decay")
+    parser.add_argument("--grad_clip", type=float, default=1.0, help="maximum gradient magnitude")
+    # evaluation
+    parser.add_argument("--val_loss_every", type=int, default=0, help="every how many steps to evaluate val loss?")
+    parser.add_argument("--val_max_steps", type=int, default=20, help="how many batches of val to average?")
+    parser.add_argument("--sample_every", type=int, default=0, help="how often to sample from the model?")
+    # debugging
+    parser.add_argument("--overfit_single_batch", type=int, default=1, help="overfit just one batch of data")
+    # numerics
+    parser.add_argument("--tensorcores", type=int, default=0, help="use tensorcores")
+    # memory management
+    parser.add_argument("--device", type=str, default="", help="by default we autodetect, or set it here")
+    parser.add_argument("--compile", type=int, default=0, help="torch.compile the model")
+    parser.add_argument("--flash", type=int, default=0, help="use flash attention")
+    parser.add_argument("--dtype", type=str, default="bfloat16", help="float32|float16|bfloat16")
+    parser.add_argument("--zero_stage", type=int, default=0, help="zero redundancy optimizer stage (0/1/2/3)")
+    # python -> C bridge
+    parser.add_argument("--write_tensors", type=int, default=0, help="write tensors to disk")
+    args = parser.parse_args()
+
+    # args error checking and convenience variables
+    B, T = args.batch_size, args.sequence_length
+    assert 1 <= T <= 8192, "sequence length must be between 1 and 8192"
+    assert args.dtype in {"float32", "float16", "bfloat16"}
+    assert args.model in {"meta-llama/Meta-Llama-3.1-8B"} # only 8B base model supported for now
+
+    # create the logging directory if it does not exist
+    logfile = None
+    if args.output_dir:
+        os.makedirs(args.output_dir, exist_ok=True)
+        logfile = os.path.join(args.output_dir, "main.log")
+        # create the log file "main.log" inside it, and wipe it clean
+        with open(logfile, "w") as f:
+            pass
+
+    # set up DDP (distributed data parallel). torchrun sets this env variable
+    ddp = int(os.environ.get('RANK', -1)) != -1 # is this a ddp run?
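+    # Worked example of the batch-size bookkeeping computed further below (a sketch using
+    # the argparse defaults above): with batch_size=4, sequence_length=64 and
+    # total_batch_size=256 on a single process, tokens_per_fwdbwd = 4 * 64 * 1 = 256 and
+    # grad_accum_steps = 256 // 256 = 1, i.e. no gradient accumulation. Launching with
+    # torchrun on 2 GPUs would instead require total_batch_size to be a multiple of
+    # 4 * 64 * 2 = 512.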
+ if ddp: + # use of DDP atm demands CUDA, we set the device appropriately according to rank + assert torch.cuda.is_available(), "for now i think we need CUDA for DDP" + init_process_group(backend='nccl') + ddp_rank = int(os.environ['RANK']) + ddp_local_rank = int(os.environ['LOCAL_RANK']) + ddp_world_size = int(os.environ['WORLD_SIZE']) + device = f'cuda:{ddp_local_rank}' + torch.cuda.set_device(device) + master_process = ddp_rank == 0 # this process will do logging, checkpointing etc. + seed_offset = 0 # each process gets the exact same seed + zero_stage = args.zero_stage + else: + ddp_rank = 0 + ddp_local_rank = 0 + zero_stage = 0 + ddp_world_size = 1 + master_process = True + seed_offset = 0 + # select the device + if args.device: + # provided explicitly by the user + device = args.device + else: + # attempt to autodetect the device + device = "cpu" + if torch.cuda.is_available(): + device = "cuda" + elif hasattr(torch.backends, "mps") and torch.backends.mps.is_available(): + device = "mps" + print(f"using device: {device}") + device_type = 'cuda' if 'cuda' in device else 'cpu' + assert device_type in {'cuda'} # we need to load LLaMA as bf16 on CUDA + + # calculate gradient accumulation from the desired total batch size and the current run configuration + tokens_per_fwdbwd = B * T * ddp_world_size + assert args.total_batch_size % tokens_per_fwdbwd == 0 + grad_accum_steps = args.total_batch_size // tokens_per_fwdbwd + print0(f"total desired batch size: {args.total_batch_size}") + print0(f"=> calculated gradient accumulation steps: {grad_accum_steps}") + + # set up a context manager following the desired dtype and device + ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[args.dtype] + ctx = torch.amp.autocast(device_type=device_type, dtype=ptdtype) if (device_type == "cuda") else nullcontext() + + # rng / reproducibility + torch.manual_seed(42) + if torch.cuda.is_available(): + torch.cuda.manual_seed(42) + + # set the torch precision mode to use TensorFloat32 (TF32) for matmuls + # docs https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html + if args.tensorcores: + torch.set_float32_matmul_precision('high') + + # turn on/off flash attention + assert args.flash in {0, 1} + FLASH = args.flash + + # init the model + assert args.ckpt_dir is not None and os.path.exists(args.ckpt_dir), f"llama3 ckpt dir {args.ckpt_dir} does not exist" + assert args.tokenizer_path is not None and os.path.exists(args.tokenizer_path), f"llama3 tokenizer path {args.tokenizer_path} does not exist" + if args.use_hf: + model = LLaMA.from_pretrained_llama3_hf(args.model) + else: # use Meta's checkpoint + model = LLaMA.from_pretrained_llama3_meta(args.ckpt_dir, args.tokenizer_path) + + model.train() + if args.compile: + if hasattr(config, "coordinate_descent_tuning"): + config.coordinate_descent_tuning = True # suggested by @Chillee + print0("compiling the model...") + model = torch.compile(model) + + # ------------------------------------------------------------------------- + # Our own version of a simple DistributedDataLoader + + # load tokens + train_loader = DistributedDataLoader(args.input_bin, B, T, ddp_rank, ddp_world_size) + val_loader = None + if args.input_val_bin: + val_loader = DistributedDataLoader(args.input_val_bin, B, T, ddp_rank, ddp_world_size) + + # ------------------------------------------------------------------------- + # PyTorch -> C bridge: save some weights and state for C to load later as reference + + # do one forward pass 
+    # to generate ground truth for our C tests
+    if master_process and args.write_tensors and (not args.inference_only):
+        x, y = train_loader.next_batch()
+        x, y = x.to(device), y.to(device)
+        logits, loss = model(x, y)
+        loss.backward()
+        # save model params, in bfloat16
+        model_to_size = {"meta-llama/Meta-Llama-3.1-8B": "8B"}
+        model_size_str = model_to_size[args.model] # e.g. "8B"
+        write_model(model, os.path.join(args.output_dir, f"llama3.1_{model_size_str}_bf16.bin"), dtype="bfloat16")
+        # save x, y, logits, loss, and parameter gradients, for debugging C
+        # always store these in fp32 to have an accurate reference (?)
+        write_state(model, x, y, logits, loss, os.path.join(args.output_dir, f"llama3_{model_size_str}_debug_state.bin"))
+        # reset the train_loader for the optimization below
+        train_loader.reset()
+
+    # -------------------------------------------------------------------------
+    # main training loop
+
+    # here we wrap model into DDP container
+    if ddp:
+        model = DDP(model, device_ids=[ddp_local_rank])
+    raw_model = model.module if ddp else model # always contains the "raw" unwrapped model
+
+    # init the optimizer
+    optimizer = raw_model.configure_optimizers(weight_decay=args.weight_decay,
+                                               learning_rate=args.learning_rate, betas=(0.9, 0.95),
+                                               device_type=device, zero_stage=zero_stage)
+
+    # learning rate decay scheduler (cosine with warmup)
+    def get_lr(it):
+        min_lr = args.learning_rate * args.learning_rate_decay_frac
+        # 1) linear warmup for warmup_iters steps
+        if it < args.warmup_iters:
+            return args.learning_rate * (it+1) / args.warmup_iters
+        # 2) if it > lr_decay_iters, return min learning rate
+        if it > args.num_iterations:
+            return min_lr
+        # 3) in between, use cosine decay down to min learning rate
+        decay_ratio = (it - args.warmup_iters) / (args.num_iterations - args.warmup_iters)
+        assert 0 <= decay_ratio <= 1
+        coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio)) # coeff starts at 1 and goes to 0
+        return min_lr + coeff * (args.learning_rate - min_lr)
+
+    if device == "cuda":
+        torch.cuda.reset_peak_memory_stats()
+    timings = []
+    norm = -1.0 # dummy value to print in inference-only mode
+    for step in range(args.num_iterations + 1):
+        t0 = time.time()
+        last_step = (step == args.num_iterations)
+
+        # once in a while evaluate the validation dataset
+        if (args.val_loss_every > 0 \
+            and (step % args.val_loss_every == 0 or last_step)) \
+            and (val_loader is not None):
+            model.eval()
+            val_loader.reset()
+            with torch.no_grad():
+                val_loss = 0.0
+                for _ in range(args.val_max_steps):
+                    x, y = val_loader.next_batch()
+                    x, y = x.to(device), y.to(device)
+                    _, loss = model(x, y, return_logits=False)
+                    val_loss += loss.item()
+                val_loss /= args.val_max_steps
+            # log to console and to file
+            print0(f"val loss {val_loss}")
+            if master_process and logfile is not None:
+                with open(logfile, "a") as f:
+                    f.write("s:%d tel:%f\n" % (step, val_loss))
+
+        # once in a while perform model inference on the master process
+        if (args.sample_every > 0 \
+            and (step % args.sample_every == 0 or last_step)) \
+            and master_process:
+            model.eval()
+            prompts: List[str] = [
+                "Clearly, the meaning of life is",
+                "Simply put, the theory of relativity states that",
+                """The repo llm.c on GitHub is""",
+                """Translate English to French:
+
+                sea otter => loutre de mer
+                peppermint => menthe poivrée
+                plush girafe => girafe peluche
+                cheese =>""",
+            ]
+            if args.use_hf:
+                prompt_tokens = [model.tokenizer(x).input_ids for x in prompts]
+            else: # Meta
+                prompt_tokens = [model.tokenizer.encode(x, bos=True, eos=False) for x in prompts]
+
+            generation_tokens, _ = model.generate(prompt_tokens, max_gen_len=64, temperature=0.6, top_p=0.9, logprobs=False, echo=False)
+            results = [{"generation": model.tokenizer.decode(t)} for t in generation_tokens]
+            for prompt, result in zip(prompts, results):
+                print(prompt, end="")
+                print(f"{result['generation']}")
+                print("\n==================================\n")
+
+        # bit confusing: we want to make sure to eval and sample on 0th iteration
+        # but also after the very last iteration. so we loop for step <= num_iterations
+        # instead of just < num_iterations (one extra due to <=), only to do
+        # the validation/sampling one last time, and then we break right here as we're done.
+        if last_step:
+            break
+
+        # --------------- TRAINING SECTION BEGIN -----------------
+        model.train()
+        optimizer.zero_grad(set_to_none=True)
+        # if we are trying to overfit a single batch, we reset the loader here
+        if args.overfit_single_batch:
+            train_loader.reset()
+        # micro-batch loop where we do gradient accumulation to reach desired total batch size
+        lossf = 0.0 # for getting the mean loss (as simple float) over the accumulation steps
+        for micro_step in range(grad_accum_steps):
+            # fetch a batch
+            x, y = train_loader.next_batch()
+            x, y = x.to(device), y.to(device)
+            if ddp:
+                # we want only the last micro-step to sync grads in a DDP model
+                # the official way to do this is with model.no_sync(), but that is a
+                # context manager that bloats the code, so we just toggle this variable
+                model.require_backward_grad_sync = (micro_step == grad_accum_steps - 1)
+            # forward pass
+            with ctx:
+                _, loss = model(x, y, return_logits=False)
+                # we have to scale the loss to account for gradient accumulation,
+                # because the gradients just add on each successive backward().
+                # addition of gradients corresponds to a SUM in the objective, but
+                # instead of a SUM we want MEAN, so we scale the loss here
+                loss = loss / grad_accum_steps
+                lossf += loss.detach() # keep track of the mean loss
+            # backward pass
+            if not args.inference_only:
+                loss.backward()
+        if ddp:
+            dist.all_reduce(lossf, op=dist.ReduceOp.AVG)
+        lossf = lossf.item()
+        norm = torch.nn.utils.clip_grad_norm_(model.parameters(), args.grad_clip)
+        # determine and set the learning rate for this iteration
+        lr = get_lr(step)
+        for param_group in optimizer.param_groups:
+            param_group['lr'] = lr
+        # step the optimizer
+        optimizer.step()
+        # --------------- TRAINING SECTION END -------------------
+        # everything that follows now is just diagnostics, prints, logging, etc.
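+        # Illustrative arithmetic for the tok/s reported below (a sketch using the defaults
+        # above): with grad_accum_steps=1, ddp_world_size=1, B=4, T=64, one step processes
+        # 256 tokens, so a 100 ms step corresponds to roughly 2,560 tok/s.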
+
+        # wait on the CPU for all device work to end so we get accurate per-iteration timings below
+        if device == "mps":
+            torch.mps.synchronize()
+        elif device == "cuda":
+            torch.cuda.synchronize()
+        # time and print
+        t1 = time.time()
+        # the 0th iteration is often an outlier (much slower); it is excluded from the smoothed timings below
+        tokens_per_second = grad_accum_steps * ddp_world_size * B * T / (t1-t0)
+        print0(f"step {step+1:4d}/{args.num_iterations} | train loss {lossf:.6f} | norm {norm:.4f} | lr {lr:.2e} | ({(t1-t0)*1000:.2f} ms | {tokens_per_second:.0f} tok/s)")
+        # log to logfile
+        if master_process and logfile is not None:
+            with open(logfile, "a") as f:
+                f.write("s:%d trl:%f\n" % (step, lossf))
+
+        # keep track of smooth timings, last 20 iterations
+        if step > 0 and step > args.num_iterations - 20:
+            timings.append(t1-t0)
+
+    # print the average of the last 20 timings, to get something smooth-ish
+    timings = timings[-20:]
+    print0(f"final {len(timings)} iters avg: {np.mean(timings)*1000:.3f}ms")
+    print0(f"peak memory consumption: {torch.cuda.max_memory_allocated() // 1024 // 1024} MiB")
+
+    # -------------------------------------------------------------------------
+    # clean up nice
+    if ddp:
+        destroy_process_group()
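+
+# -----------------------------------------------------------------------------
+# Example invocations (a sketch; the script filename "train_llama3.py" and the local
+# checkpoint/tokenizer paths below are placeholders, not part of this file):
+#
+#   # single GPU, HuggingFace weights, also dump reference tensors for the C code:
+#   python train_llama3.py --use_hf 1 \
+#       --ckpt_dir /path/to/Meta-Llama-3.1-8B --tokenizer_path /path/to/tokenizer.model \
+#       --write_tensors 1 --output_dir out/
+#
+#   # 8-GPU DDP run from Meta's checkpoint (torchrun sets RANK/LOCAL_RANK/WORLD_SIZE):
+#   torchrun --standalone --nproc_per_node=8 train_llama3.py --use_hf 0 \
+#       --ckpt_dir /path/to/Meta-Llama-3.1-8B --tokenizer_path /path/to/tokenizer.model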