Skip to content

Commit

Permalink
Merge branch 'karpathy:master' into master
Browse files Browse the repository at this point in the history
  • Loading branch information
zhangpiu authored Aug 10, 2024
2 parents 6266f54 + 6e6a528 commit 836ec64
Show file tree
Hide file tree
Showing 82 changed files with 8,885 additions and 3,162 deletions.
28 changes: 28 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
name: Build and test

on:
create:
workflow_dispatch:
push:
branches:
Expand Down Expand Up @@ -138,6 +139,33 @@ jobs:
call "C:\\Program Files\\Microsoft Visual Studio\\2022\\Enterprise\\VC\\Auxiliary\\Build\\vcvars64.bat"
make-4.4.1\dist\make -j WIN_CI_BUILD=1 train_gpt2fp32cu test_gpt2fp32cu test_gpt2cu train_gpt2cu profile_gpt2cu
# Compile-only job: builds all CUDA targets inside an nvidia/cuda devel
# container on a CPU runner (ubuntu-20.04). Nothing is executed on a GPU
# here; the job only verifies that every build configuration compiles
# against CUDA 11.8 / cuDNN 8.
build-ubuntu20-04:
  runs-on: ubuntu-20.04
  container:
    image: nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04
  steps:
    - name: Checkout code
      uses: actions/checkout@v4

    # Record toolchain versions in the log to make build failures diagnosable.
    - name: System Info
      run: |
        nvcc --version
        g++ --version

    # git is not preinstalled in the nvidia/cuda container; the Makefile
    # expects cudnn-frontend checked out next to the repository root.
    - name: Install cudnn frontend
      run: |
        apt-get update && apt-get install -y git
        git clone https://github.com/NVIDIA/cudnn-frontend.git

    - name: Build FP32 checkpoint
      run: make train_gpt2fp32cu test_gpt2fp32cu

    - name: Build FP32 precision
      run: PRECISION=FP32 make train_gpt2cu test_gpt2cu profile_gpt2cu

    - name: Build with CUDNN
      run: PRECISION=BF16 USE_CUDNN=1 make train_gpt2cu test_gpt2cu profile_gpt2cu

build-cuda-fp32:
runs-on: ubuntu-latest
container:
Expand Down
128 changes: 128 additions & 0 deletions .github/workflows/ci_gpu.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,128 @@
# GPU CI workflow: builds the train/test binaries in every precision mode
# and actually executes them on a GPU runner (ubicloud-gpu-standard-1).
name: GPU Builds and Tests

on:
  create:
  workflow_dispatch:
  push:
    branches:
      - master
  pull_request:
    branches:
      - master

jobs:
  build-and-test-gpu:
    runs-on: ubicloud-gpu-standard-1-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install OpenMP
        run: sudo apt-get update && sudo apt-get install -y libomp-dev

      - name: Install dependencies
        run: pip install -r requirements.txt

      # Produces the tiny dataset the smoke-training runs below consume.
      - name: Run preprocessing
        run: python dev/data/tinyshakespeare.py

      # Reference PyTorch run; also writes gpt2_124M.bin used further down.
      - name: Train model
        run: python train_gpt2.py

      - name: Compile training and testing program
        run: make test_gpt2cu train_gpt2cu test_gpt2fp32cu train_gpt2fp32cu

      - name: Train model (With OpenMP)
        run: OMP_NUM_THREADS=8 ./train_gpt2cu

      - name: Train model (FP32) with gpt2_124M.bin
        run: |
          PRECISION=FP32 make train_gpt2cu
          ./train_gpt2cu -b 1 -t 64 -d 256 -l 0.0001 -v 200 -s 200 -a 1 -x 10 -r 0 -f 0 -e "gpt2_124M.bin"

      # Captures training output and checks the loss trajectory stays within
      # a 5% tolerance band (steps 20-28) against the expected curve.
      - name: Test for percent loss differential for FP32
        run: |
          PRECISION=FP32 make train_gpt2cu
          ./train_gpt2cu -b 1 -t 64 -d 256 -l 0.0001 -v 200 -s 200 -a 1 -x 10 -r 0 -f 0 -e "gpt2_124M.bin" > train_gpt2cu_fp32_precision.txt
          python dev/loss_checker_ci.py -f train_gpt2cu_fp32_precision.txt -s 20 -e 28 -a 5.0

      - name: Build FP32 precision
        run: PRECISION=FP32 make test_gpt2cu profile_gpt2cu

      # -r selects activation recompute mode; -w toggles master weights.
      # Step names carry the precision so the two test sweeps below are
      # distinguishable in the Actions log.
      - name: Run default (FP32)
        run: ./test_gpt2cu

      - name: Run no recompute GeLU (FP32)
        run: ./test_gpt2cu -r 0

      - name: Run recompute LN (FP32)
        run: ./test_gpt2cu -r 2

      - name: Build BF16 precision
        run: PRECISION=BF16 make train_gpt2cu test_gpt2cu profile_gpt2cu

      - name: Run default (BF16)
        run: ./test_gpt2cu

      - name: Run no recompute GeLU (BF16)
        run: ./test_gpt2cu -r 0

      - name: Run no master weights (BF16)
        run: ./test_gpt2cu -w 0

      - name: Run recompute LN (BF16)
        run: ./test_gpt2cu -r 2

      - name: Train model fp32 (With OpenMP)
        run: OMP_NUM_THREADS=8 ./train_gpt2fp32cu

      - name: Execute testing program (With OpenMP)
        run: OMP_NUM_THREADS=8 ./test_gpt2cu

      - name: Execute testing program fp32 (With OpenMP)
        run: OMP_NUM_THREADS=8 ./test_gpt2fp32cu

      # NO_OMP=1 is a make-time switch that compiles without -fopenmp.
      - name: Compile training and testing program without OpenMP
        run: NO_OMP=1 make test_gpt2cu train_gpt2cu test_gpt2fp32cu train_gpt2fp32cu

      - name: Train model (No OpenMP)
        run: NO_OMP=1 ./train_gpt2cu

      - name: Train model fp32 (No OpenMP)
        run: NO_OMP=1 ./train_gpt2fp32cu

      - name: Execute testing program (No OpenMP)
        run: ./test_gpt2cu -b 32

      - name: Execute testing program fp32 (No OpenMP)
        run: ./test_gpt2fp32cu

      - name: Install cuDNN-frontend
        run: git clone https://github.com/NVIDIA/cudnn-frontend.git

      - name: Build with cuDNN
        run: USE_CUDNN=1 make test_gpt2cu train_gpt2cu test_gpt2fp32cu train_gpt2fp32cu

      - name: Train model with cuDNN
        run: ./train_gpt2cu

      - name: Train model fp32 with cuDNN
        run: ./train_gpt2fp32cu

      - name: Execute testing program with cuDNN
        run: ./test_gpt2cu

      - name: Execute testing program fp32 with cuDNN
        run: ./test_gpt2fp32cu

  # Small standalone unit tests that need a GPU but not the full build.
  unit-tests-gpu:
    runs-on: ubicloud-gpu-standard-1-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Test Device<->File IO
        run: cd dev/test && nvcc -o device_file_io device_file_io.cu && ./device_file_io
100 changes: 100 additions & 0 deletions .github/workflows/ci_tests.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# CPU-only test workflow: dataloader unit tests (with and without
# AddressSanitizer) and ptx/sass artifact generation for several GPU
# compute capabilities.
name: Unit, Static and other Tests

on:
  create:
  workflow_dispatch:
  push:
    branches:
      - master
  pull_request:
    branches:
      - master

jobs:
  dataloader_test:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # Second build repeats the test under ASan to catch memory errors
      # the plain run would miss.
      - name: test the dataloader without / with sanitize address
        run: |
          cd dev/test
          make PRECISION=BF16 test_dataloader
          ./test_dataloader
          make clean
          make PRECISION=BF16 TEST_CFLAGS="-fsanitize=address -fno-omit-frame-pointer" test_dataloader
          ./test_dataloader

  ptx_and_sass_files:
    runs-on: ubuntu-latest
    container:
      image: nvidia/cuda:12.4.1-devel-ubuntu22.04

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install OpenMP and OpenMPI
        run: apt-get update && apt-get install -y libomp-dev libopenmpi-dev

      # Three near-identical stanzas follow: default arch, A100 (sm_80) and
      # H100 (sm_90). Each dumps ptx/sass for the main binary plus every
      # dev/cuda kernel into its own log directory for the upload steps.
      - name: Generate ptx/sass files and upload them to persistent storage
        run: |
          mkdir -p dev/cuda/ptx_sass_logs
          make train_gpt2cu
          cuobjdump --dump-ptx train_gpt2cu > dev/cuda/train_gpt2cu.ptx
          cuobjdump --dump-sass train_gpt2cu > dev/cuda/train_gpt2cu.sass
          cd dev/cuda
          make -j all_ptx
          make -j all_sass
          cp *.ptx ptx_sass_logs/
          cp *.sass ptx_sass_logs/
          ls ptx_sass_logs/

      - name: Generate ptx/sass files for A100 and upload them to persistent storage
        run: |
          mkdir -p dev/cuda/ptx_sass_logs_A100
          make train_gpt2cu GPU_COMPUTE_CAPABILITY=80
          cuobjdump --dump-ptx train_gpt2cu > dev/cuda/train_gpt2cu.ptx
          cuobjdump --dump-sass train_gpt2cu > dev/cuda/train_gpt2cu.sass
          cd dev/cuda
          make -j GPU_COMPUTE_CAPABILITY=80 all_ptx
          make -j GPU_COMPUTE_CAPABILITY=80 all_sass
          cp *.ptx ptx_sass_logs_A100/
          cp *.sass ptx_sass_logs_A100/
          ls ptx_sass_logs_A100/

      - name: Generate ptx/sass files for H100 and upload them to persistent storage
        run: |
          mkdir -p dev/cuda/ptx_sass_logs_H100
          make train_gpt2cu GPU_COMPUTE_CAPABILITY=90
          cuobjdump --dump-ptx train_gpt2cu > dev/cuda/train_gpt2cu.ptx
          cuobjdump --dump-sass train_gpt2cu > dev/cuda/train_gpt2cu.sass
          cd dev/cuda
          make -j GPU_COMPUTE_CAPABILITY=90 all_ptx
          make -j GPU_COMPUTE_CAPABILITY=90 all_sass
          cp *.ptx ptx_sass_logs_H100/
          cp *.sass ptx_sass_logs_H100/
          ls ptx_sass_logs_H100/

      - name: Upload ptx/sass files
        uses: actions/upload-artifact@v4
        with:
          name: ptx_sass_files
          path: dev/cuda/ptx_sass_logs/
          retention-days: 30  # days to retain

      - name: Upload ptx/sass files for A100
        uses: actions/upload-artifact@v4
        with:
          name: ptx_sass_files_A100
          path: dev/cuda/ptx_sass_logs_A100/
          retention-days: 30  # days to retain

      - name: Upload ptx/sass files for H100
        uses: actions/upload-artifact@v4
        with:
          name: ptx_sass_files_H100
          path: dev/cuda/ptx_sass_logs_H100/
          retention-days: 30  # days to retain
Loading

0 comments on commit 836ec64

Please sign in to comment.