Skip to content

Commit

Permalink
Merge branch 'karpathy:master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
zhangpiu authored Aug 10, 2024
2 parents 6266f54 + 6e6a528 commit 836ec64
Show file tree
Hide file tree
Showing 82 changed files with 8,885 additions and 3,162 deletions.
28 changes: 28 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
name: Build and test

on:
create:
workflow_dispatch:
push:
branches:
Expand Down Expand Up @@ -138,6 +139,33 @@ jobs:
call "C:\\Program Files\\Microsoft Visual Studio\\2022\\Enterprise\\VC\\Auxiliary\\Build\\vcvars64.bat"
make-4.4.1\dist\make -j WIN_CI_BUILD=1 train_gpt2fp32cu test_gpt2fp32cu test_gpt2cu train_gpt2cu profile_gpt2cu
# Compile-only job: builds all CUDA targets inside an nvidia/cuda devel
# container on a CPU runner (ubuntu-20.04). Nothing is executed on a GPU
# here; the job only verifies that every build configuration compiles
# against CUDA 11.8 / cuDNN 8.
build-ubuntu20-04:
  runs-on: ubuntu-20.04
  container:
    image: nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    # Record toolchain versions in the log to make build failures diagnosable.
    - name: System Info
      run: |
        nvcc --version
        g++ --version

    # git is not preinstalled in the nvidia/cuda container; the Makefile
    # expects cudnn-frontend checked out next to the repository root.
    - name: Install cudnn frontend
      run: |
        apt-get update && apt-get install -y git
        git clone https://github.com/NVIDIA/cudnn-frontend.git

    - name: Build FP32 checkpoint
      run: make train_gpt2fp32cu test_gpt2fp32cu

    - name: Build FP32 precision
      run: PRECISION=FP32 make train_gpt2cu test_gpt2cu profile_gpt2cu

    - name: Build with CUDNN
      run: PRECISION=BF16 USE_CUDNN=1 make train_gpt2cu test_gpt2cu profile_gpt2cu

build-cuda-fp32:
runs-on: ubuntu-latest
container:
Expand Down
128 changes: 128 additions & 0 deletions .github/workflows/ci_gpu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
# GPU CI workflow: builds the train/test binaries in every precision mode
# and actually executes them on a GPU runner (ubicloud-gpu-standard-1).
name: GPU Builds and Tests

on:
  create:
  workflow_dispatch:
  push:
    branches:
      - master
  pull_request:
    branches:
      - master

jobs:
  build-and-test-gpu:
    runs-on: ubicloud-gpu-standard-1-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install OpenMP
        run: sudo apt-get update && sudo apt-get install -y libomp-dev

      - name: Install dependencies
        run: pip install -r requirements.txt

      # Produces the tiny dataset the smoke-training runs below consume.
      - name: Run preprocessing
        run: python dev/data/tinyshakespeare.py

      # Reference PyTorch run; also writes gpt2_124M.bin used further down.
      - name: Train model
        run: python train_gpt2.py

      - name: Compile training and testing program
        run: make test_gpt2cu train_gpt2cu test_gpt2fp32cu train_gpt2fp32cu

      - name: Train model (With OpenMP)
        run: OMP_NUM_THREADS=8 ./train_gpt2cu

      - name: Train model (FP32) with gpt2_124M.bin
        run: |
          PRECISION=FP32 make train_gpt2cu
          ./train_gpt2cu -b 1 -t 64 -d 256 -l 0.0001 -v 200 -s 200 -a 1 -x 10 -r 0 -f 0 -e "gpt2_124M.bin"

      # Captures training output and checks the loss trajectory stays within
      # a 5% tolerance band (steps 20-28) against the expected curve.
      - name: Test for percent loss differential for FP32
        run: |
          PRECISION=FP32 make train_gpt2cu
          ./train_gpt2cu -b 1 -t 64 -d 256 -l 0.0001 -v 200 -s 200 -a 1 -x 10 -r 0 -f 0 -e "gpt2_124M.bin" > train_gpt2cu_fp32_precision.txt
          python dev/loss_checker_ci.py -f train_gpt2cu_fp32_precision.txt -s 20 -e 28 -a 5.0

      - name: Build FP32 precision
        run: PRECISION=FP32 make test_gpt2cu profile_gpt2cu

      # -r selects activation recompute mode; -w toggles master weights.
      # Step names carry the precision so the two test sweeps below are
      # distinguishable in the Actions log.
      - name: Run default (FP32)
        run: ./test_gpt2cu

      - name: Run no recompute GeLU (FP32)
        run: ./test_gpt2cu -r 0

      - name: Run recompute LN (FP32)
        run: ./test_gpt2cu -r 2

      - name: Build BF16 precision
        run: PRECISION=BF16 make train_gpt2cu test_gpt2cu profile_gpt2cu

      - name: Run default (BF16)
        run: ./test_gpt2cu

      - name: Run no recompute GeLU (BF16)
        run: ./test_gpt2cu -r 0

      - name: Run no master weights (BF16)
        run: ./test_gpt2cu -w 0

      - name: Run recompute LN (BF16)
        run: ./test_gpt2cu -r 2

      - name: Train model fp32 (With OpenMP)
        run: OMP_NUM_THREADS=8 ./train_gpt2fp32cu

      - name: Execute testing program (With OpenMP)
        run: OMP_NUM_THREADS=8 ./test_gpt2cu

      - name: Execute testing program fp32 (With OpenMP)
        run: OMP_NUM_THREADS=8 ./test_gpt2fp32cu

      # NO_OMP=1 is a make-time switch that compiles without -fopenmp.
      - name: Compile training and testing program without OpenMP
        run: NO_OMP=1 make test_gpt2cu train_gpt2cu test_gpt2fp32cu train_gpt2fp32cu

      - name: Train model (No OpenMP)
        run: NO_OMP=1 ./train_gpt2cu

      - name: Train model fp32 (No OpenMP)
        run: NO_OMP=1 ./train_gpt2fp32cu

      - name: Execute testing program (No OpenMP)
        run: ./test_gpt2cu -b 32

      - name: Execute testing program fp32 (No OpenMP)
        run: ./test_gpt2fp32cu

      - name: Install cuDNN-frontend
        run: git clone https://github.com/NVIDIA/cudnn-frontend.git

      - name: Build with cuDNN
        run: USE_CUDNN=1 make test_gpt2cu train_gpt2cu test_gpt2fp32cu train_gpt2fp32cu

      - name: Train model with cuDNN
        run: ./train_gpt2cu

      - name: Train model fp32 with cuDNN
        run: ./train_gpt2fp32cu

      - name: Execute testing program with cuDNN
        run: ./test_gpt2cu

      - name: Execute testing program fp32 with cuDNN
        run: ./test_gpt2fp32cu

  # Small standalone unit tests that need a GPU but not the full build.
  unit-tests-gpu:
    runs-on: ubicloud-gpu-standard-1-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Test Device<->File IO
        run: cd dev/test && nvcc -o device_file_io device_file_io.cu && ./device_file_io
100 changes: 100 additions & 0 deletions .github/workflows/ci_tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# CPU-only test workflow: dataloader unit tests (with and without
# AddressSanitizer) and ptx/sass artifact generation for several GPU
# compute capabilities.
name: Unit, Static and other Tests

on:
  create:
  workflow_dispatch:
  push:
    branches:
      - master
  pull_request:
    branches:
      - master

jobs:
  dataloader_test:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # Second build repeats the test under ASan to catch memory errors
      # the plain run would miss.
      - name: test the dataloader without / with sanitize address
        run: |
          cd dev/test
          make PRECISION=BF16 test_dataloader
          ./test_dataloader
          make clean
          make PRECISION=BF16 TEST_CFLAGS="-fsanitize=address -fno-omit-frame-pointer" test_dataloader
          ./test_dataloader

  ptx_and_sass_files:
    runs-on: ubuntu-latest
    container:
      image: nvidia/cuda:12.4.1-devel-ubuntu22.04

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install OpenMP and OpenMPI
        run: apt-get update && apt-get install -y libomp-dev libopenmpi-dev

      # Three near-identical stanzas follow: default arch, A100 (sm_80) and
      # H100 (sm_90). Each dumps ptx/sass for the main binary plus every
      # dev/cuda kernel into its own log directory for the upload steps.
      - name: Generate ptx/sass files and upload them to persistent storage
        run: |
          mkdir -p dev/cuda/ptx_sass_logs
          make train_gpt2cu
          cuobjdump --dump-ptx train_gpt2cu > dev/cuda/train_gpt2cu.ptx
          cuobjdump --dump-sass train_gpt2cu > dev/cuda/train_gpt2cu.sass
          cd dev/cuda
          make -j all_ptx
          make -j all_sass
          cp *.ptx ptx_sass_logs/
          cp *.sass ptx_sass_logs/
          ls ptx_sass_logs/

      - name: Generate ptx/sass files for A100 and upload them to persistent storage
        run: |
          mkdir -p dev/cuda/ptx_sass_logs_A100
          make train_gpt2cu GPU_COMPUTE_CAPABILITY=80
          cuobjdump --dump-ptx train_gpt2cu > dev/cuda/train_gpt2cu.ptx
          cuobjdump --dump-sass train_gpt2cu > dev/cuda/train_gpt2cu.sass
          cd dev/cuda
          make -j GPU_COMPUTE_CAPABILITY=80 all_ptx
          make -j GPU_COMPUTE_CAPABILITY=80 all_sass
          cp *.ptx ptx_sass_logs_A100/
          cp *.sass ptx_sass_logs_A100/
          ls ptx_sass_logs_A100/

      - name: Generate ptx/sass files for H100 and upload them to persistent storage
        run: |
          mkdir -p dev/cuda/ptx_sass_logs_H100
          make train_gpt2cu GPU_COMPUTE_CAPABILITY=90
          cuobjdump --dump-ptx train_gpt2cu > dev/cuda/train_gpt2cu.ptx
          cuobjdump --dump-sass train_gpt2cu > dev/cuda/train_gpt2cu.sass
          cd dev/cuda
          make -j GPU_COMPUTE_CAPABILITY=90 all_ptx
          make -j GPU_COMPUTE_CAPABILITY=90 all_sass
          cp *.ptx ptx_sass_logs_H100/
          cp *.sass ptx_sass_logs_H100/
          ls ptx_sass_logs_H100/

      - name: Upload ptx/sass files
        uses: actions/upload-artifact@v4
        with:
          name: ptx_sass_files
          path: dev/cuda/ptx_sass_logs/
          retention-days: 30  # days to retain

      - name: Upload ptx/sass files for A100
        uses: actions/upload-artifact@v4
        with:
          name: ptx_sass_files_A100
          path: dev/cuda/ptx_sass_logs_A100/
          retention-days: 30  # days to retain

      - name: Upload ptx/sass files for H100
        uses: actions/upload-artifact@v4
        with:
          name: ptx_sass_files_H100
          path: dev/cuda/ptx_sass_logs_H100/
          retention-days: 30  # days to retain
Loading

0 comments on commit 836ec64

Please sign in to comment.