Skip to content

Commit

Permalink
[ci] migrate multi-node + correctness tests to nightly pipeline (#2662)
Browse files Browse the repository at this point in the history
  • Loading branch information
siddvenk authored Jan 15, 2025
1 parent 2ae0aab commit 7315729
Show file tree
Hide file tree
Showing 3 changed files with 28 additions and 218 deletions.
125 changes: 8 additions & 117 deletions .github/workflows/correctness.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,123 +10,14 @@ on:
schedule:
- cron: '0 9 * * *'

# TODO: port this to integration tests in 0.31.0 and then delete this file
jobs:
create-runners:
runs-on: [self-hosted, scheduler]
fast-fail:
runs-on: ubuntu-latest
steps:
- name: Create new G6 instance
id: create_gpu1
- name: Fail if run on master branch
id: fast_fail
if: github.ref == 'refs/heads/master'
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
--fail \
| jq '.token' | tr -d '"' )
./start_instance.sh action_g6 $token djl-serving
- name: Create new G6 instance
id: create_gpu2
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
--fail \
| jq '.token' | tr -d '"' )
./start_instance.sh action_g6 $token djl-serving
- name: Create new Inf2.24xl instance
id: create_inf2
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
--fail \
| jq '.token' | tr -d '"' )
./start_instance.sh action_inf2 $token djl-serving
outputs:
gpu_instance_id_1: ${{ steps.create_gpu1.outputs.action_g6_instance_id }}
gpu_instance_id_2: ${{ steps.create_gpu2.outputs.action_g6_instance_id }}
inf2_instance_id: ${{ steps.create_inf2.outputs.action_inf2_instance_id }}

# Runs each correctness test class with pytest on the self-hosted runner whose
# label equals the matrix entry's `instance` value.
test:
  runs-on: ["${{ matrix.test.instance }}"]
  timeout-minutes: 90
  needs: create-runners
  strategy:
    # Let the other matrix entries finish even when one of them fails.
    fail-fast: false
    matrix:
      test:
        - test: TestCorrectnessTrtLlm
          instance: g6
        - test: TestCorrectnessLmiDist
          instance: g6
        - test: TestCorrectnessNeuronx
          instance: inf2
  steps:
    - uses: actions/checkout@v4
    - name: Clean env
      run: |
        yes | docker system prune -a --volumes
        sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
        echo "wait dpkg lock..."
        while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
    - name: Set up JDK 17
      uses: actions/setup-java@v4
      with:
        distribution: 'corretto'
        java-version: 17
    - name: Set up Python3
      uses: actions/setup-python@v5
      with:
        python-version: '3.10.x'
    - name: Install pip dependencies
      run: pip3 install pytest requests "numpy<2" pillow huggingface_hub
    - name: Install torch
      # Use torch to get cuda capability of current device to selectively run tests
      # Torch version doesn't really matter that much
      run: |
        pip3 install torch==2.3.0
    - name: Install awscurl
      working-directory: tests/integration
      run: |
        curl -OL https://publish.djl.ai/awscurl/awscurl
        chmod +x awscurl
        mkdir outputs
    - name: Test
      working-directory: tests/integration
      env:
        TEST_DJL_VERSION: ${{ inputs.djl-version }}
      run: |
        python -m pytest -k ${{ matrix.test.test }} tests.py
    # Only reached when every previous step succeeded.
    - name: Cleanup
      working-directory: tests/integration
      run: |
        rm -rf outputs
        rm awscurl
    # Dumps whatever is in outputs/, then removes test artifacts and containers.
    - name: On Failure
      if: ${{ failure() }}
      working-directory: tests/integration
      run: |
        for file in outputs/*; do if [ -f "$file" ]; then echo "Contents of $file:"; cat "$file"; echo; fi; done
        sudo rm -rf outputs && sudo rm -rf models
        # -f: awscurl does not exist when a step before "Install awscurl"
        # failed; a plain rm would abort this script (the default shell runs
        # with -e) before the container cleanup below.
        rm -f awscurl
        docker rm -f $(docker ps -aq) || true
    - name: Upload test logs
      if: ${{ always() }}
      uses: actions/upload-artifact@v4
      with:
        name: test-${{ matrix.test.test }}-logs
        path: tests/integration/all_logs/

# Stops the three instances whose ids were published as outputs of the
# create-runners job. `if: always()` makes this run even when earlier jobs
# failed or were cancelled.
stop-runners:
  if: always()
  runs-on: [self-hosted, scheduler]
  needs: [create-runners, test]
  steps:
    - name: Stop all instances
      run: |
        cd /home/ubuntu/djl_benchmark_script/scripts
        # Try every instance even when stopping one fails; the default shell
        # runs with -e, so sequential calls would stop at the first error and
        # leave the remaining instances running.
        status=0
        for instance_id in \
          "${{ needs.create-runners.outputs.gpu_instance_id_1 }}" \
          "${{ needs.create-runners.outputs.gpu_instance_id_2 }}" \
          "${{ needs.create-runners.outputs.inf2_instance_id }}"; do
          # An empty id means the matching create step produced no output,
          # so there is nothing to stop by id.
          if [ -z "$instance_id" ]; then
            echo "Skipping empty instance id"
            continue
          fi
          ./stop_instance.sh "$instance_id" || status=1
        done
        # Still report the job as failed if any stop failed.
        exit "$status"
echo "Fast fail"
exit 1
12 changes: 12 additions & 0 deletions .github/workflows/integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,18 @@ jobs:
- test: TestLmiDistPipelineParallel
instance: g6
failure-prefix: lmi
- test: TestLmiDistMultiNode
instance: g6
failure-prefix: lmi
- test: TestCorrectnessTrtLlm
instance: g6
failure-prefix: trtllm
- test: TestCorrectnessLmiDist
instance: g6
failure-prefix: lmi
- test: TestCorrectnessNeuronx
instance: inf2
failure-prefix: neuron
outputs:
failure_cpu: ${{ steps.test-failure.outputs.failure_cpu }}
failure_gpu: ${{ steps.test-failure.outputs.failure_gpu }}
Expand Down
109 changes: 8 additions & 101 deletions .github/workflows/multi_node_integration.yml
Original file line number Diff line number Diff line change
Expand Up @@ -7,108 +7,15 @@ on:
description: 'The released version of DJL'
required: false
default: ''
schedule:
- cron: '0 13 * * *'


# TODO: port this to integration tests in 0.31.0 and then delete this file
jobs:
# Requests a runner registration token from the GitHub API and passes it to
# start_instance.sh on the scheduler host to bring up one G6 runner.
create-runners:
  runs-on: [self-hosted, scheduler]
  steps:
    - name: Create new G6 instance
      id: create_gpu
      run: |
        # Without pipefail the assignment below only sees the exit status of
        # the last pipeline stage, so a failed curl (--fail) would silently
        # produce an empty token.
        set -o pipefail
        cd /home/ubuntu/djl_benchmark_script/scripts
        # jq -r prints the raw string, replacing the former `| tr -d '"'`.
        token=$( curl -X POST -H "Authorization: token ${{ secrets.ACTION_RUNNER_PERSONAL_TOKEN }}" \
          https://api.github.com/repos/deepjavalibrary/djl-serving/actions/runners/registration-token \
          --fail \
          | jq -r '.token' )
        ./start_instance.sh action_g6 "$token" djl-serving
  outputs:
    # NOTE(review): action_g6_instance_id is not set by anything visible in
    # this step; presumably start_instance.sh writes it as a step output.
    # Confirm against the script.
    gpu_instance_id_1: ${{ steps.create_gpu.outputs.action_g6_instance_id }}

# Runs the multi-node LMI-dist test class with pytest on the runner created by
# the create-runners job.
multi-node-test:
  # When matrix.test.gh-runner is truthy every label collapses to
  # matrix.test.instance; otherwise the job targets a self-hosted runner
  # labelled with this run's id, run number, commit SHA and the instance type.
  # No matrix entry below sets gh-runner, so the self-hosted labels apply.
  runs-on:
    - ${{ matrix.test.gh-runner && matrix.test.instance || 'self-hosted' }}
    - ${{ matrix.test.gh-runner && matrix.test.instance || format('RUN_ID-{0}', github.run_id) }}
    - ${{ matrix.test.gh-runner && matrix.test.instance || format('RUN_NUMBER-{0}', github.run_number) }}
    - ${{ matrix.test.gh-runner && matrix.test.instance || format('SHA-{0}', github.sha) }}
    - ${{ matrix.test.instance }}
  timeout-minutes: 60
  needs: create-runners
  strategy:
    fail-fast: false
    matrix:
      test:
        - test: TestLmiDistMultiNode
          instance: g6
  steps:
    - uses: actions/checkout@v4
    - name: Clean env
      run: |
        yes | docker system prune -a --volumes
        sudo rm -rf /home/ubuntu/actions-runner/_work/_tool/Java_Corretto_jdk/
        echo "wait dpkg lock..."
        while sudo fuser /var/{lib/{dpkg,apt/lists},cache/apt/archives}/lock >/dev/null 2>&1; do sleep 5; done
    - name: Set up Python3
      if: ${{ matrix.test.instance != 'aarch64' }}
      uses: actions/setup-python@v5
      with:
        python-version: '3.10.x'
    - name: Set up Python3 (aarch64)
      if: ${{ matrix.test.instance == 'aarch64' }}
      run: |
        # Using an alternate installation because of an incompatible combination
        # of aarch64 with ubuntu-20.04 not supported by the actions/setup-python
        sudo apt-get install python3 python-is-python3 python3-pip -y
    - name: Install pip dependencies
      run: pip3 install pytest requests "numpy<2" pillow huggingface_hub
    - name: Install torch
      # Use torch to get cuda capability of current device to selectively run tests
      # Torch version doesn't really matter that much
      run: |
        pip3 install torch==2.3.0
    - name: Install awscurl
      working-directory: tests/integration
      run: |
        wget https://publish.djl.ai/awscurl/awscurl
        chmod +x awscurl
        mkdir outputs
    - name: Test
      working-directory: tests/integration
      env:
        TEST_DJL_VERSION: ${{ inputs.djl-version }}
      run: |
        python -m pytest -k ${{ matrix.test.test }} tests.py
    # Only reached when every previous step succeeded.
    - name: Cleanup
      working-directory: tests/integration
      run: |
        rm -rf outputs
        rm awscurl
    # Prints collected outputs and the worker log, then removes test artifacts
    # and calls remove_container.sh.
    - name: On Failure
      if: ${{ failure() }}
      working-directory: tests/integration
      run: |
        for file in outputs/*; do if [ -f "$file" ]; then echo "Contents of $file:"; cat "$file"; echo; fi; done
        echo "Printing lmi worker log"
        # Best-effort: the log is missing when the failure happened before the
        # worker started, and the default shell's -e would otherwise abort
        # here and skip the cleanup below.
        cat all_logs/llama3-8b/lmi-worker.log || true
        sudo rm -rf outputs && sudo rm -rf models
        # -f for the same reason: awscurl may never have been downloaded.
        rm -f awscurl
        ./remove_container.sh
    - name: Upload test logs
      if: ${{ always() }}
      uses: actions/upload-artifact@v4
      with:
        name: test-${{ matrix.test.test }}-logs
        path: tests/integration/all_logs/

stop-runners:
if: always()
runs-on: [ self-hosted, scheduler ]
needs: [ create-runners, multi-node-test]
fast-fail:
runs-on: ubuntu-latest
steps:
- name: Stop all instances
- name: Fail if run on master branch
id: fast_fail
if: github.ref == 'refs/heads/master'
run: |
cd /home/ubuntu/djl_benchmark_script/scripts
instance_id=${{ needs.create-runners.outputs.gpu_instance_id_1 }}
./stop_instance.sh $instance_id
echo "Fast fail"
exit 1

0 comments on commit 7315729

Please sign in to comment.