Skip to content

Commit

Permalink
fix check CUDA_DEVICE_MAX_CONNECTIONS
Browse files Browse the repository at this point in the history
  • Loading branch information
sallyjunjun committed Dec 3, 2024
1 parent 6b7df0b commit 5825926
Show file tree
Hide file tree
Showing 2 changed files with 22 additions and 2 deletions.
8 changes: 8 additions & 0 deletions .github/workflows/demo_in_readme.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -45,25 +45,33 @@ jobs:
id: basic_train
run: |
source activate ${evo_env_torch21_flash2}
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
# Step load_preset_ckpt: its `if:` expression requires failure() and a 'failure' conclusion on the
# basic_train step. It activates the conda env, puts $PWD on PYTHONPATH, exports the two
# CUDA/NCCL variables, then calls load_ckpt.sh with the 7B_load_preset_ckpt argument.
- name: load_preset_ckpt
if: ${{ failure() && steps.basic_train.conclusion == 'failure' }}
run: |
source activate ${evo_env_torch21_flash2}
export PYTHONPATH=$PWD:$PYTHONPATH
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
sh ./ci_scripts/train/load_ckpt.sh 7B_load_preset_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
# Step load_new_ckpt: activates the conda env, puts $PWD on PYTHONPATH, exports the two
# CUDA/NCCL variables, calls load_ckpt.sh with the 7B_load_new_ckpt argument, and finally
# deletes $GITHUB_WORKSPACE/llm_ckpts.
- name: load_new_ckpt
run: |
source activate ${evo_env_torch21_flash2}
export PYTHONPATH=$PWD:$PYTHONPATH
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
rm -rf $GITHUB_WORKSPACE/llm_ckpts
# Step torchrun-train: activates the conda env, exports the two CUDA/NCCL variables, calls
# torchrun.sh with the run-id/job identifier, and finally deletes $GITHUB_WORKSPACE/llm_ckpts.
- name: torchrun-train
run: |
source activate ${evo_env_torch21_flash2}
export TORCH_NCCL_AVOID_RECORD_STREAMS=1
export CUDA_DEVICE_MAX_CONNECTIONS=1
sh ./ci_scripts/train/torchrun.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
rm -rf $GITHUB_WORKSPACE/llm_ckpts
Expand Down
16 changes: 14 additions & 2 deletions internlm/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -250,8 +250,20 @@ def enable_pytorch_expandable_segments():


def check_cuda_env():
# Checks the CUDA_DEVICE_MAX_CONNECTIONS and TORCH_NCCL_AVOID_RECORD_STREAMS environment
# variables; takes no arguments and contains no return statement.
# NOTE(review): this text is a rendered diff with indentation stripped, so the nesting of the
# statements below cannot be confirmed from here. Given the "14 additions & 2 deletions" header
# for this file, the two-line warning branch directly below is presumably the removed code and
# everything from the backend check onward the added replacement — confirm against the repository.
# Logs a warning (does not raise) when CUDA_DEVICE_MAX_CONNECTIONS is absent from the environment.
if os.getenv("CUDA_DEVICE_MAX_CONNECTIONS") is None:
logger.warning("Env var CUDA_DEVICE_MAX_CONNECTIONS has not be set, please note this!")
# Compares the reported accelerator backend with AcceleratorType.GPU; presumably this gates the
# assertions that follow (nesting not visible here — verify).
if internlm_accelerator.get_accelerator_backend() == AcceleratorType.GPU:
# CUDA_DEVICE_MAX_CONNECTIONS must be present and equal to the string "1" (os.getenv returns
# a str, so this is a string comparison, not a numeric one).
# NOTE(review): these are `assert` statements, which Python skips when run with -O; consider an
# explicit raise if the check must always be enforced.
max_connections = os.getenv("CUDA_DEVICE_MAX_CONNECTIONS")
assert max_connections is not None, "Env var CUDA_DEVICE_MAX_CONNECTIONS has not been set, please set it to 1!"
assert (
max_connections == "1"
), "Env var CUDA_DEVICE_MAX_CONNECTIONS is set to {}, it should be set to 1!".format(max_connections)

# Same pair of checks for TORCH_NCCL_AVOID_RECORD_STREAMS: present, and exactly the string "1".
avoid_record_streams = os.getenv("TORCH_NCCL_AVOID_RECORD_STREAMS")
assert (
avoid_record_streams is not None
), "Env var TORCH_NCCL_AVOID_RECORD_STREAMS has not been set, please set it to 1!"
assert (
avoid_record_streams == "1"
), "Env var TORCH_NCCL_AVOID_RECORD_STREAMS is set to {}, it should be set to 1!".format(avoid_record_streams)


class DummyProfile:
Expand Down

0 comments on commit 5825926

Please sign in to comment.