Revert "try dirty hack to use default streams" #147
Workflow file for this run
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Workflow header: display name, trigger, and run-concurrency policy.
name: Rust GPU Tests

# Triggered on every push (no branch or path filters).
on:
  push:

# Runs are grouped by workflow name plus the first non-empty of: PR head
# label, head ref, or pushed ref. Starting a new run in a group cancels the
# run already in progress for that group.
concurrency:
  group: "${{ github.workflow }} @ ${{ github.event.pull_request.head.label || github.head_ref || github.ref }}"
  cancel-in-progress: true
# Two jobs, both on runners labelled `gpu`:
#   e2e            - runs the e2e tests directly through `cargo test`.
#   e2e-sanitizer  - builds the e2e test binary and runs it under
#                    compute-sanitizer (memcheck).
jobs:
  e2e:
    runs-on: gpu
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # Diagnostics only: print GPU and filesystem information to the log.
      - name: Validate presence of GPU devices
        run: nvidia-smi

      - name: Check shared memory size
        run: df -h

      - name: Install OpenSSL && pkg-config
        run: sudo apt-get update && sudo apt-get install -y pkg-config libssl-dev

      # Runs unconditionally: this job has no CUDA/NCCL cache step whose
      # `cache-hit` output could be used to skip the installation.
      - name: Install CUDA and NCCL dependencies
        env:
          DEBIAN_FRONTEND: noninteractive
        run: |
          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb
          sudo dpkg -i cuda-keyring_1.1-1_all.deb
          sudo apt-get update
          sudo apt-get install -y cuda-toolkit-12-2 libnccl2 libnccl-dev

      # Diagnostics only: log where the NVRTC and CUDA driver libraries live.
      - name: Find libs
        run: find /usr -name "libnvrtc*" && find /usr -name libcuda.so

      # Cache cargo's registry/git checkouts and the build directory, keyed
      # on the lockfile hash with a prefix fallback for partial hits.
      - name: Cache Rust build
        uses: actions/cache@v4
        id: cache-rust
        with:
          path: |
            ~/.cargo/registry
            ~/.cargo/git
            target
          key: rust-build-${{ runner.os }}-${{ hashFiles('**/Cargo.lock') }}
          restore-keys: |
            rust-build-${{ runner.os }}-

      - name: Install Rust nightly
        uses: dtolnay/rust-toolchain@master
        with:
          toolchain: nightly

      - name: E2E Tests
        run: cargo test --release e2e
        shell: bash
        # NCCL settings for the test process; quoted so every value is an
        # explicit string.
        env:
          NCCL_DEBUG: "info"
          NCCL_P2P_LEVEL: "LOC"
          NCCL_NET: "Socket"
          NCCL_P2P_DIRECT_DISABLE: "1"
          NCCL_SHM_DISABLE: "1"

  e2e-sanitizer:
    runs-on: gpu
    steps:
      - name: Checkout code
        uses: actions/checkout@v4

      # Diagnostics only: print GPU and filesystem information to the log.
      - name: Validate presence of GPU devices
        run: nvidia-smi

      - name: Check shared memory size
        run: df -h

      - name: Install OpenSSL && pkg-config
        run: sudo apt-get update && sudo apt-get install -y pkg-config libssl-dev

      # Runs unconditionally: this job has no CUDA/NCCL cache step whose
      # `cache-hit` output could be used to skip the installation.
      # Unlike the `e2e` job, this also installs the CUDA command-line tools
      # package alongside the toolkit.
      - name: Install CUDA and NCCL dependencies
        env:
          DEBIAN_FRONTEND: noninteractive
        run: |
          wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/cuda-keyring_1.1-1_all.deb
          sudo dpkg -i cuda-keyring_1.1-1_all.deb
          sudo apt-get update
          sudo apt-get install -y cuda-toolkit-12-2 cuda-command-line-tools-12-2 libnccl2 libnccl-dev

      # Diagnostics only: log where the NVRTC and CUDA driver libraries live.
      - name: Find libs
        run: find /usr -name "libnvrtc*" && find /usr -name libcuda.so

      # Cache cargo's registry/git checkouts and the build directory, keyed
      # on the lockfile hash with a prefix fallback for partial hits.
      - name: Cache Rust build
        uses: actions/cache@v4
        id: cache-rust
        with:
          path: |
            ~/.cargo/registry
            ~/.cargo/git
            target
          key: rust-build-${{ runner.os }}-${{ hashFiles('**/Cargo.lock') }}
          restore-keys: |
            rust-build-${{ runner.os }}-

      # Diagnostics only: log where compute-sanitizer was installed.
      - name: Find compute-sanitizer
        run: find /usr -name "compute-sanitizer"

      - name: Install Rust nightly
        uses: dtolnay/rust-toolchain@master
        with:
          toolchain: nightly

      # Build first so compile errors surface in their own step.
      - name: Build e2e test
        run: cargo test --release e2e --no-run

      # Re-run the (already built) `--no-run` invocation and pull the test
      # binary path out of cargo's output: take the line containing
      # `Executable tests/e2e.rs` and keep the text inside the parentheses.
      # The path is exposed as the step output `TEST_NAME`.
      - name: Build e2e test and grab executable name
        id: build-e2e
        shell: bash
        run: |
          set -euo pipefail
          test_name=$(cargo --color=never test --release e2e --no-run 2>&1 \
            | grep "Executable tests/e2e.rs" \
            | sed "s/.*(\(.*\))/\1/") || true
          # Fail here, with a clear message, instead of handing an empty
          # path to compute-sanitizer in the next step.
          if [ -z "$test_name" ]; then
            echo "::error::Could not determine the e2e test executable from cargo output"
            exit 1
          fi
          echo "TEST_NAME=$test_name" >> "$GITHUB_OUTPUT"

      # Run the test binary under memcheck. `--error-exitcode=1` makes the
      # step fail when the sanitizer reports errors even though the test
      # binary itself exited successfully. The binary path is passed via the
      # environment rather than interpolated into the script text.
      - name: E2E Tests w/ compute-sanitizer
        run: /usr/local/cuda-12.2/bin/compute-sanitizer --tool=memcheck --error-exitcode=1 "$TEST_BINARY" --nocapture
        # NCCL settings for the test process; quoted so every value is an
        # explicit string.
        env:
          TEST_BINARY: ${{ steps.build-e2e.outputs.TEST_NAME }}
          NCCL_DEBUG: "info"
          NCCL_P2P_LEVEL: "LOC"
          NCCL_NET: "Socket"
          NCCL_P2P_DIRECT_DISABLE: "1"
          NCCL_SHM_DISABLE: "1"