From f9bda8f0a0417574012f87f0e0347ea74465274f Mon Sep 17 00:00:00 2001 From: zengxian Date: Thu, 23 Jan 2025 21:06:56 -0500 Subject: [PATCH] to use run_distributed.py --- .github/scripts/ut_result_check.sh | 16 +++++----------- .github/workflows/_linux_ut.yml | 6 ++---- .github/workflows/pull.yml | 2 +- test/xpu/run_distributed.py | 4 +++- 4 files changed, 11 insertions(+), 17 deletions(-) diff --git a/.github/scripts/ut_result_check.sh b/.github/scripts/ut_result_check.sh index ed4cb52a8..d0bf80f8e 100644 --- a/.github/scripts/ut_result_check.sh +++ b/.github/scripts/ut_result_check.sh @@ -73,19 +73,13 @@ if [[ "${ut_suite}" == 'torch_xpu' ]]; then fi fi if [[ "${ut_suite}" == 'xpu_distributed' ]]; then - grep "^FAILED" c10d_ops_xccl_test.log | awk '{print $2}' > ./"${ut_suite}"_ops_xccl_test_failed.log - grep "^FAILED" c10d_xccl_test.log | awk '{print $2}' > ./"${ut_suite}"_c10d_xccl_test_failed.log - num_failed_ops_xccl=$(wc -l < "./${ut_suite}_ops_xccl_test_failed.log") - num_failed_c10d_xccl=$(wc -l < "./${ut_suite}_c10d_xccl_test_failed.log") + grep "^FAILED" xpu_distributed_test.log | awk '{print $2}' > ./"${ut_suite}"_xpu_distributed_test_failed.log + num_failed_xpu_distributed=$(wc -l < "./${ut_suite}_xpu_distributed_test_failed.log") echo -e "=========================================================================" - echo -e "Show Failed cases in ${ut_suite} c10d ops xccl" + echo -e "Show Failed cases in ${ut_suite} xpu distributed" echo -e "=========================================================================" - cat "./${ut_suite}_ops_xccl_test_failed.log" - echo -e "=========================================================================" - echo -e "Show Failed cases in ${ut_suite} c10d xccl" - echo -e "=========================================================================" - cat "./${ut_suite}_c10d_xccl_test_failed.log" - ((num_failed=num_failed_ops_xccl+num_failed_c10d_xccl)) + cat "./${ut_suite}_xpu_distributed_test_failed.log" + ((num_failed=num_failed_xpu_distributed)) if [[ $num_failed -gt 0 ]]; then echo -e "[ERROR] UT ${ut_suite} test Fail" exit 1 diff --git a/.github/workflows/_linux_ut.yml b/.github/workflows/_linux_ut.yml index 541501644..cd1f14ebd 100644 --- a/.github/workflows/_linux_ut.yml +++ b/.github/workflows/_linux_ut.yml @@ -246,9 +246,7 @@ jobs: mkdir -p ut_log/xpu_distributed cd ../pytorch/third_party/torch-xpu-ops/test/xpu python -c "import torch;print(torch.distributed.is_xccl_available())" - # timeout 10000 python run_distributed.py 2>${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log - timeout 10000 python distributed/test_c10d_ops_xccl.py 2>${{ github.workspace }}/ut_log/xpu_distributed/c10d_ops_xccl_test_error.log | tee ${{ github.workspace }}/ut_log/xpu_distributed/c10d_ops_xccl_test.log - timeout 10000 python distributed/test_c10d_xccl.py 2>${{ github.workspace }}/ut_log/xpu_distributed/c10d_xccl_test_error.log | tee ${{ github.workspace }}/ut_log/xpu_distributed/c10d_xccl_test.log + timeout 10000 python run_distributed.py 2>${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test_error.log | tee ${{ github.workspace }}/ut_log/xpu_distributed/xpu_distributed_test.log cd ${{ github.workspace }} sudo cp ptrace_scope.bk /proc/sys/kernel/yama/ptrace_scope - name: UT Test Results Check @@ -264,7 +262,7 @@ jobs: } } set -xe - for ut_suite in $(echo ${{ inputs.ut }} |sed 's/,/ /g') + for ut_suite in $(echo ${{ inputs.ut }} |sed 's/-/ /g') do contains "op_regression,op_regression_dev1,op_extended,op_ut,torch_xpu,xpu_distributed" $ut_suite $contains_status diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index a25df15ec..9379699b5 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -55,7 +55,7 @@ jobs: uses: ./.github/workflows/_linux_ut.yml with: pytorch: ${{ needs.preci-linux-build.outputs.torch_commit_id }} - ut: op_regression,op_regression_dev1,op_extended,op_ut + ut: op_regression-op_regression_dev1-op_extended-op_ut runner: linux.idc.xpu preci-ut-distributed: diff --git a/test/xpu/run_distributed.py b/test/xpu/run_distributed.py index b242d469d..bed342d74 100644 --- a/test/xpu/run_distributed.py +++ b/test/xpu/run_distributed.py @@ -4,7 +4,9 @@ def run(test_command): - result = subprocess.run(test_command, capture_output=True) + result = subprocess.run(test_command, capture_output=True, text=True) + print(result.stdout) + print(result.stderr) if "FAILED" in result.stdout or "FAILED" in result.stderr: return 0 else: