Skip to content

Commit

Permalink
add distributed ut in CI
Browse files Browse the repository at this point in the history
  • Loading branch information
zxd1997066 committed Jan 23, 2025
1 parent c04452f commit 082963e
Show file tree
Hide file tree
Showing 3 changed files with 52 additions and 2 deletions.
23 changes: 22 additions & 1 deletion .github/scripts/ut_result_check.sh
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/bin/bash
# UT suite whose logs this script checks. Taken from the first positional
# argument; falls back to op_regression when that argument is absent or empty.
# Suite names handled in this script and its caller:
# op_regression / op_regression_dev1 / op_extended / op_ut / torch_xpu / xpu_distributed
ut_suite="${1-}"
ut_suite="${ut_suite:-op_regression}"

if [[ "${ut_suite}" == 'op_regression' || "${ut_suite}" == 'op_regression_dev1' || "${ut_suite}" == 'op_extended' ]]; then
if [[ "${ut_suite}" == 'op_regression' || "${ut_suite}" == 'op_regression_dev1' || "${ut_suite}" == 'op_extended' || "${ut_suite}" == 'xpu_distributed' ]]; then
grep "^FAILED" "${ut_suite}"_test.log | awk '{print $2}' > ./"${ut_suite}"_failed.log
grep "PASSED" "${ut_suite}"_test.log | awk '{print $1}' > ./"${ut_suite}"_passed.log
num_failed=$(wc -l < "./${ut_suite}_failed.log")
Expand Down Expand Up @@ -72,3 +72,24 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then
echo -e "[PASS] UT ${ut_suite} test Pass"
fi
fi
#######################################
# Extract the names of failed cases from one test log.
# Writes the second whitespace-separated field of every line that starts with
# "FAILED" in the test log to the failed-case log (one name per line).
# Arguments: $1 - test log to scan
#            $2 - file that receives the failed case names (always created)
# Outputs:   an error message on stderr when the test log cannot be read
# Returns:   0 on success, 1 when the test log is missing or unreadable
#######################################
collect_failed_cases() {
    local test_log=$1
    local failed_log=$2
    # Always (re)create the output so later wc/cat calls never hit a stale or
    # missing file.
    : > "${failed_log}"
    if [[ ! -r "${test_log}" ]]; then
        echo -e "[ERROR] Test log ${test_log} is missing or unreadable" >&2
        return 1
    fi
    awk '/^FAILED/ {print $2}' "${test_log}" > "${failed_log}"
}

#######################################
# Check the two distributed test logs in the current directory
# (c10d_ops_xccl_test.log and c10d_xccl_test.log), print the failed cases of
# each, and report an overall verdict for the suite.
# Arguments: $1 - suite name, used as prefix of the *_failed.log files and in
#                 the printed messages
# Outputs:   banners, failed case names and the [PASS]/[ERROR] verdict
# Returns:   0 when both logs are readable and contain no FAILED line,
#            1 otherwise
#######################################
check_xpu_distributed() {
    local suite=$1
    local ops_failed_log="./${suite}_ops_xccl_test_failed.log"
    local c10d_failed_log="./${suite}_c10d_xccl_test_failed.log"
    local banner="========================================================================="
    local missing_logs=0
    local num_failed_ops_xccl num_failed_c10d_xccl num_failed

    # A log that does not exist must not be counted as "zero failures":
    # remember it and fail the suite at the end.
    collect_failed_cases c10d_ops_xccl_test.log "${ops_failed_log}" || missing_logs=1
    collect_failed_cases c10d_xccl_test.log "${c10d_failed_log}" || missing_logs=1

    num_failed_ops_xccl=$(wc -l < "${ops_failed_log}")
    num_failed_c10d_xccl=$(wc -l < "${c10d_failed_log}")

    echo -e "${banner}"
    echo -e "Show Failed cases in ${suite} c10d ops xccl"
    echo -e "${banner}"
    cat "${ops_failed_log}"
    echo -e "${banner}"
    echo -e "Show Failed cases in ${suite} c10d xccl"
    echo -e "${banner}"
    cat "${c10d_failed_log}"

    # Use $(( )) assignment: a bare (( x = 0 )) command returns status 1 when
    # the result is 0, which would abort the script under `set -e`.
    num_failed=$(( num_failed_ops_xccl + num_failed_c10d_xccl ))
    if [[ ${num_failed} -gt 0 || ${missing_logs} -ne 0 ]]; then
        echo -e "[ERROR] UT ${suite} test Fail"
        return 1
    fi
    echo -e "[PASS] UT ${suite} test Pass"
}

if [[ "${ut_suite}" == 'xpu_distributed' ]]; then
    check_xpu_distributed "${ut_suite}" || exit 1
fi
19 changes: 18 additions & 1 deletion .github/workflows/_linux_ut.yml
Original file line number Diff line number Diff line change
Expand Up @@ -234,6 +234,23 @@ jobs:
test_cmd="${test_cmd} test_xpu.py"
fi
eval $test_cmd 2>${{ github.workspace }}/ut_log/torch_xpu/torch_xpu_test_error.log | tee ${{ github.workspace }}/ut_log/torch_xpu/torch_xpu_test.log
# Runs the two XCCL distributed test files and stores their stdout/stderr under
# ut_log/xpu_distributed for the later result-check step. Only executed when
# the `ut` input contains 'xpu_distributed'.
- name: Run Torch XPU Distributed UT
  if: contains(inputs.ut, 'xpu_distributed')
  run: |
    source .github/scripts/env.sh ${{ inputs.pytorch }}
    source activate xpu_op_${ZE_AFFINITY_MASK}
    pip install pytest
    cd ${{ github.workspace }}
    # Back up the current Yama ptrace policy and restore it on every exit path
    # of this script, so an aborted step does not leave the runner at scope 0.
    sudo cp /proc/sys/kernel/yama/ptrace_scope ptrace_scope.bk
    trap 'sudo cp "${{ github.workspace }}/ptrace_scope.bk" /proc/sys/kernel/yama/ptrace_scope' EXIT
    # Only `tee` needs root to write the procfs entry; `echo` does not.
    echo "0" | sudo tee /proc/sys/kernel/yama/ptrace_scope
    mkdir -p ut_log/xpu_distributed
    cd ../pytorch/third_party/torch-xpu-ops/test/xpu
    # Informational only: prints whether the XCCL backend is available.
    python -c "import torch;print(torch.distributed.is_xccl_available())"
    # timeout 10000 python run_distributed.py 2>${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log
    timeout 10000 python distributed/test_c10d_ops_xccl.py 2>${{ github.workspace }}/ut_log/xpu_distributed/c10d_ops_xccl_test_error.log | tee ${{ github.workspace }}/ut_log/xpu_distributed/c10d_ops_xccl_test.log
    timeout 10000 python distributed/test_c10d_xccl.py 2>${{ github.workspace }}/ut_log/xpu_distributed/c10d_xccl_test_error.log | tee ${{ github.workspace }}/ut_log/xpu_distributed/c10d_xccl_test.log
- name: UT Test Results Check
shell: bash
run: |
Expand All @@ -249,7 +266,7 @@ jobs:
set -xe
for ut_suite in $(echo ${{ inputs.ut }} |sed 's/,/ /g')
do
contains "op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu" $ut_suite
contains "op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu,xpu_distributed" $ut_suite
$contains_status
cd ${{ github.workspace }}/ut_log/${ut_suite}
cp ${{ github.workspace }}/.github/scripts/ut_result_check.sh ./
Expand Down
12 changes: 12 additions & 0 deletions .github/workflows/pull.yml
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,18 @@ jobs:
ut: op_regression,op_regression_dev1,op_extended,op_ut
runner: linux.idc.xpu

# Calls the reusable _linux_ut.yml workflow with ut=xpu_distributed, using the
# torch commit id exported by the preci-linux-build job.
preci-ut-distributed:
  name: preci-linux
  needs: preci-linux-build
  # Skip forked repositories and draft pull requests.
  if: ${{ (github.repository_owner == 'intel') && (github.event.pull_request.draft == false) }}
  uses: ./.github/workflows/_linux_ut.yml
  secrets: inherit
  with:
    runner: pvc_e2e
    ut: xpu_distributed
    pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }}

Inductor-XPU-E2E-CI-Tests:
name: preci-linux / e2e_test
needs: preci-linux-build
Expand Down

0 comments on commit 082963e

Please sign in to comment.