Skip to content

Commit

Permalink
Switch the xpu_distributed UT workflow to use run_distributed.py
Browse files: browse the repository at this point in the history
  • Loading branch information
zxd1997066 committed Jan 24, 2025
1 parent 09f4a38 commit f9bda8f
Show file tree
Hide file tree
Showing 4 changed files with 11 additions and 17 deletions.
16 changes: 5 additions & 11 deletions .github/scripts/ut_result_check.sh
Original file line number Diff line number Diff line change
Expand Up @@ -73,19 +73,13 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then
fi
fi
if [[ "${ut_suite}" == 'xpu_distributed' ]]; then
grep "^FAILED" c10d_ops_xccl_test.log | awk '{print $2}' > ./"${ut_suite}"_ops_xccl_test_failed.log
grep "^FAILED" c10d_xccl_test.log | awk '{print $2}' > ./"${ut_suite}"_c10d_xccl_test_failed.log
num_failed_ops_xccl=$(wc -l < "./${ut_suite}_ops_xccl_test_failed.log")
num_failed_c10d_xccl=$(wc -l < "./${ut_suite}_c10d_xccl_test_failed.log")
grep "^FAILED" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log
num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_xpu_distributed_test_failed.log")
echo -e "========================================================================="
echo -e "Show Failed cases in ${ut_suite} c10d ops xccl"
echo -e "Show Failed cases in ${ut_suite} xpu distributed"
echo -e "========================================================================="
cat "./${ut_suite}_ops_xccl_test_failed.log"
echo -e "========================================================================="
echo -e "Show Failed cases in ${ut_suite} c10d xccl"
echo -e "========================================================================="
cat "./${ut_suite}_c10d_xccl_test_failed.log"
((num_failed=num_failed_ops_xccl+num_failed_c10d_xccl))
cat "./${ut_suite}_xpu_distributed_test_failed.log"
((num_failed=num_failed_xpu_distributed))
if [[ $num_failed -gt 0 ]]; then
echo -e "[ERROR] UT ${ut_suite} test Fail"
exit 1
Expand Down
6 changes: 2 additions & 4 deletions .github/workflows/_linux_ut.yml
Original file line number Diff line number Diff line change
Expand Up @@ -246,9 +246,7 @@ jobs:
mkdir -p ut_log/xpu_distributed
cd ../pytorch/third_party/torch-xpu-ops/test/xpu
python -c "import torch;print(torch.distributed.is_xccl_available())"
# timeout 10000 python run_distributed.py 2>${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log
timeout 10000 python distributed/test_c10d_ops_xccl.py 2>${{ github.workspace }}/ut_log/xpu_distributed/c10d_ops_xccl_test_error.log | tee ${{ github.workspace }}/ut_log/xpu_distributed/c10d_ops_xccl_test.log
timeout 10000 python distributed/test_c10d_xccl.py 2>${{ github.workspace }}/ut_log/xpu_distributed/c10d_xccl_test_error.log | tee ${{ github.workspace }}/ut_log/xpu_distributed/c10d_xccl_test.log
timeout 10000 python run_distributed.py 2>${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log
cd ${{ github.workspace }}
sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope
- name: UT Test Results Check
Expand All @@ -264,7 +262,7 @@ jobs:
}
}
set -xe
for ut_suite in $(echo ${{ inputs.ut }} |sed 's/,/ /g')
for ut_suite in $(echo ${{ inputs.ut }} |sed 's/-/ /g')
do
contains "op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu,xpu_distributed" $ut_suite
$contains_status
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/pull.yml
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ jobs:
uses: ./.github/workflows/_linux_ut.yml
with:
pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }}
ut: op_regression,op_regression_dev1,op_extended,op_ut
ut: op_regression-op_regression_dev1-op_extended-op_ut
runner: linux.idc.xpu

preci-ut-distributed:
Expand Down
4 changes: 3 additions & 1 deletion test/xpu/run_distributed.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,9 @@


def run(test_command):
result = subprocess.run(test_command, capture_output=True)
result = subprocess.run(test_command, capture_output=True, text=True)
print(result.stdout)
print(result.stderr)
if "FAILED" in result.stdout or "FAILED" in result.stderr:
return 0
else:
Expand Down

0 comments on commit f9bda8f

Please sign in to comment.