forked from karpathy/llm.c
-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'karpathy:master' into master
- Loading branch information
Showing
82 changed files
with
8,885 additions
and
3,162 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
# CI workflow: builds and runs the CUDA training/test binaries on a GPU runner.
# Triggered on branch/tag creation, manual dispatch, and pushes/PRs to master.
name: GPU Builds and Tests

on:
  create:
  workflow_dispatch:
  push:
    branches:
      - master
  pull_request:
    branches:
      - master

jobs:
  build-and-test-gpu:
    # Ubicloud runner with an attached GPU (required by the *cu binaries below).
    runs-on: ubicloud-gpu-standard-1-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install OpenMP
        run: sudo apt-get update && sudo apt-get install -y libomp-dev

      - name: Install dependencies
        run: pip install -r requirements.txt

      # Downloads/tokenizes the tinyshakespeare dataset used by the training runs.
      - name: Run preprocessing
        run: python dev/data/tinyshakespeare.py

      # Produces the gpt2_124M.bin checkpoint consumed by the -e flags below.
      - name: Train model
        run: python train_gpt2.py

      - name: Compile training and testing program
        run: make test_gpt2cu train_gpt2cu test_gpt2fp32cu train_gpt2fp32cu

      - name: Train model (With OpenMP)
        run: OMP_NUM_THREADS=8 ./train_gpt2cu

      - name: Train model (FP32) with gpt2_124M.bin
        run: |
          PRECISION=FP32 make train_gpt2cu
          ./train_gpt2cu -b 1 -t 64 -d 256 -l 0.0001 -v 200 -s 200 -a 1 -x 10 -r 0 -f 0 -e "gpt2_124M.bin"

      # Re-runs the FP32 training, captures stdout, and checks the logged loss
      # stays within the tolerance enforced by dev/loss_checker_ci.py.
      - name: Test for percent loss differential for FP32
        run: |
          PRECISION=FP32 make train_gpt2cu
          ./train_gpt2cu -b 1 -t 64 -d 256 -l 0.0001 -v 200 -s 200 -a 1 -x 10 -r 0 -f 0 -e "gpt2_124M.bin" > train_gpt2cu_fp32_precision.txt
          python dev/loss_checker_ci.py -f train_gpt2cu_fp32_precision.txt -s 20 -e 28 -a 5.0

      - name: Build FP32 precision
        run: PRECISION=FP32 make test_gpt2cu profile_gpt2cu

      # Step names below carry a precision suffix so the two otherwise-identical
      # FP32/BF16 test phases are distinguishable in the CI log.
      - name: Run default (FP32)
        run: ./test_gpt2cu

      - name: Run no recompute GeLU (FP32)
        run: ./test_gpt2cu -r 0

      - name: Run recompute LN (FP32)
        run: ./test_gpt2cu -r 2

      - name: Build BF16 precision
        run: PRECISION=BF16 make train_gpt2cu test_gpt2cu profile_gpt2cu

      - name: Run default (BF16)
        run: ./test_gpt2cu

      - name: Run no recompute GeLU (BF16)
        run: ./test_gpt2cu -r 0

      - name: Run no master weights (BF16)
        run: ./test_gpt2cu -w 0

      - name: Run recompute LN (BF16)
        run: ./test_gpt2cu -r 2

      - name: Train model fp32 (With OpenMP)
        run: OMP_NUM_THREADS=8 ./train_gpt2fp32cu

      - name: Execute testing program (With OpenMP)
        run: OMP_NUM_THREADS=8 ./test_gpt2cu

      - name: Execute testing program fp32 (With OpenMP)
        run: OMP_NUM_THREADS=8 ./test_gpt2fp32cu

      - name: Compile training and testing program without OpenMP
        run: NO_OMP=1 make test_gpt2cu train_gpt2cu test_gpt2fp32cu train_gpt2fp32cu

      # NOTE(review): NO_OMP=1 is consumed by make at build time; as a runtime
      # env var on the two steps below it is presumably a no-op — confirm.
      - name: Train model (No OpenMP)
        run: NO_OMP=1 ./train_gpt2cu

      - name: Train model fp32 (No OpenMP)
        run: NO_OMP=1 ./train_gpt2fp32cu

      - name: Execute testing program (No OpenMP)
        run: ./test_gpt2cu -b 32

      - name: Execute testing program fp32 (No OpenMP)
        run: ./test_gpt2fp32cu

      # The Makefile picks up the cloned cudnn-frontend headers for USE_CUDNN=1.
      - name: Install cuDNN-frontend
        run: |
          git clone https://github.com/NVIDIA/cudnn-frontend.git

      - name: Build with cuDNN
        run: USE_CUDNN=1 make test_gpt2cu train_gpt2cu test_gpt2fp32cu train_gpt2fp32cu

      - name: Train model with cuDNN
        run: ./train_gpt2cu

      - name: Train model fp32 with cuDNN
        run: ./train_gpt2fp32cu

      - name: Execute testing program with cuDNN
        run: ./test_gpt2cu

      - name: Execute testing program fp32 with cuDNN
        run: ./test_gpt2fp32cu

  unit-tests-gpu:
    runs-on: ubicloud-gpu-standard-1-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # Compiles and runs the standalone device<->file IO unit test with nvcc.
      - name: Test Device<->File IO
        run: cd dev/test && nvcc -o device_file_io device_file_io.cu && ./device_file_io
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,100 @@ | ||
# CI workflow: CPU-only unit tests (dataloader, with and without ASan) and
# PTX/SASS artifact generation inside a CUDA devel container.
# Triggered on branch/tag creation, manual dispatch, and pushes/PRs to master.
name: Unit, Static and other Tests

on:
  create:
  workflow_dispatch:
  push:
    branches:
      - master
  pull_request:
    branches:
      - master

jobs:
  dataloader_test:
    runs-on: ubuntu-latest

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # Builds and runs the dataloader test twice: plain, then rebuilt with
      # AddressSanitizer enabled to catch memory errors.
      - name: test the dataloader without / with sanitize address
        run: |
          cd dev/test
          make PRECISION=BF16 test_dataloader
          ./test_dataloader
          make clean
          make PRECISION=BF16 TEST_CFLAGS="-fsanitize=address -fno-omit-frame-pointer" test_dataloader
          ./test_dataloader

  ptx_and_sass_files:
    runs-on: ubuntu-latest
    # CUDA devel image provides nvcc/cuobjdump without a physical GPU; the job
    # only compiles and dumps PTX/SASS, it never launches kernels.
    container:
      image: nvidia/cuda:12.4.1-devel-ubuntu22.04

    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      - name: Install OpenMP and OpenMPI
        run: apt-get update && apt-get install -y libomp-dev libopenmpi-dev

      # Default target architecture (whatever the Makefile selects).
      - name: Generate ptx/sass files and upload them to persistent storage
        run: |
          mkdir -p dev/cuda/ptx_sass_logs
          make train_gpt2cu
          cuobjdump --dump-ptx train_gpt2cu > dev/cuda/train_gpt2cu.ptx
          cuobjdump --dump-sass train_gpt2cu > dev/cuda/train_gpt2cu.sass
          cd dev/cuda
          make -j all_ptx
          make -j all_sass
          cp *.ptx ptx_sass_logs/
          cp *.sass ptx_sass_logs/
          ls ptx_sass_logs/

      # Compute capability 80 = A100.
      - name: Generate ptx/sass files for A100 and upload them to persistent storage
        run: |
          mkdir -p dev/cuda/ptx_sass_logs_A100
          make train_gpt2cu GPU_COMPUTE_CAPABILITY=80
          cuobjdump --dump-ptx train_gpt2cu > dev/cuda/train_gpt2cu.ptx
          cuobjdump --dump-sass train_gpt2cu > dev/cuda/train_gpt2cu.sass
          cd dev/cuda
          make -j GPU_COMPUTE_CAPABILITY=80 all_ptx
          make -j GPU_COMPUTE_CAPABILITY=80 all_sass
          cp *.ptx ptx_sass_logs_A100/
          cp *.sass ptx_sass_logs_A100/
          ls ptx_sass_logs_A100/

      # Compute capability 90 = H100.
      - name: Generate ptx/sass files for H100 and upload them to persistent storage
        run: |
          mkdir -p dev/cuda/ptx_sass_logs_H100
          make train_gpt2cu GPU_COMPUTE_CAPABILITY=90
          cuobjdump --dump-ptx train_gpt2cu > dev/cuda/train_gpt2cu.ptx
          cuobjdump --dump-sass train_gpt2cu > dev/cuda/train_gpt2cu.sass
          cd dev/cuda
          make -j GPU_COMPUTE_CAPABILITY=90 all_ptx
          make -j GPU_COMPUTE_CAPABILITY=90 all_sass
          cp *.ptx ptx_sass_logs_H100/
          cp *.sass ptx_sass_logs_H100/
          ls ptx_sass_logs_H100/

      - name: Upload ptx/sass files
        uses: actions/upload-artifact@v4
        with:
          name: ptx_sass_files
          path: dev/cuda/ptx_sass_logs/
          retention-days: 30  # days to retain

      - name: Upload ptx/sass files for A100
        uses: actions/upload-artifact@v4
        with:
          name: ptx_sass_files_A100
          path: dev/cuda/ptx_sass_logs_A100/
          retention-days: 30  # days to retain

      - name: Upload ptx/sass files for H100
        uses: actions/upload-artifact@v4
        with:
          name: ptx_sass_files_H100
          path: dev/cuda/ptx_sass_logs_H100/
          retention-days: 30  # days to retain
Oops, something went wrong.