diff --git a/config/buildspec.yml b/config/buildspec.yml index 2b50599dc..665880a4e 100755 --- a/config/buildspec.yml +++ b/config/buildspec.yml @@ -30,7 +30,7 @@ phases: - sudo apt-get install unzip -qq -o=Dpkg::Use-Pty=0 - cd $CODEBUILD_SRC_DIR && chmod +x config/protoc_downloader.sh && ./config/protoc_downloader.sh - pip install --upgrade pip==20.3.3 - - pip install -q matplotlib==3.3.1 seaborn==0.10.1 nbconvert==5.6.1 papermill==2.1.2 jupyter==1.0.0 scipy==1.5.2 scikit-learn==0.23.2 bokeh==2.2.3 + - pip install -q matplotlib==3.3.1 seaborn==0.10.1 nbconvert==5.6.1 papermill==2.1.2 jupyter==1.0.0 scipy==1.5.2 scikit-learn==0.23.2 bokeh==2.2.3 simplejson==3.17.2 - if [ "$run_pytest_xgboost" = "enable" ]; then pip install --upgrade pyYaml==5.1; else pip install -q pyYaml; fi - pip install -q pytest wheel pytest-html pre-commit awscli pytest-cov diff --git a/config/buildspec_tensorflow_2_3.yml b/config/buildspec_tensorflow_2_3.yml index 36b9fe164..be7032a0b 100644 --- a/config/buildspec_tensorflow_2_3.yml +++ b/config/buildspec_tensorflow_2_3.yml @@ -35,7 +35,7 @@ phases: - sudo apt-get install unzip -qq -o=Dpkg::Use-Pty=0 - cd $CODEBUILD_SRC_DIR && chmod +x config/protoc_downloader.sh && ./config/protoc_downloader.sh - pip install --upgrade pip==19.3.1 - - pip install -q matplotlib==3.3.1 seaborn==0.10.1 nbconvert==5.6.1 papermill==2.1.2 jupyter==1.0.0 scipy==1.5.2 scikit-learn==0.23.2 bokeh==2.2.3 + - pip install -q matplotlib==3.3.1 seaborn==0.10.1 nbconvert==5.6.1 papermill==2.1.2 jupyter==1.0.0 scipy==1.5.2 scikit-learn==0.23.2 bokeh==2.2.3 simplejson==3.17.2 - pip install -q pytest wheel pytest-html pre-commit awscli pytest-cov pre_build: diff --git a/config/buildspec_tensorflow_2_4.yml b/config/buildspec_tensorflow_2_4.yml index 5bac6588e..d165f4d40 100644 --- a/config/buildspec_tensorflow_2_4.yml +++ b/config/buildspec_tensorflow_2_4.yml @@ -30,7 +30,7 @@ phases: - sudo apt-get install unzip -qq -o=Dpkg::Use-Pty=0 - cd $CODEBUILD_SRC_DIR && chmod +x 
config/protoc_downloader.sh && ./config/protoc_downloader.sh - pip install --upgrade pip==19.3.1 - - pip install -q matplotlib==3.3.1 seaborn==0.10.1 nbconvert==5.6.1 papermill==2.1.2 jupyter==1.0.0 scipy==1.5.2 scikit-learn==0.23.2 bokeh==2.2.3 + - pip install -q matplotlib==3.3.1 seaborn==0.10.1 nbconvert==5.6.1 papermill==2.1.2 jupyter==1.0.0 scipy==1.5.2 scikit-learn==0.23.2 bokeh==2.2.3 simplejson==3.17.2 - if [ "$run_pytest_xgboost" = "enable" ]; then pip install --upgrade pyYaml==5.1; else pip install -q pyYaml; fi - pip install -q pytest wheel pytest-html pre-commit awscli pytest-cov diff --git a/config/profiler/buildspec_profiler_sagemaker_tensorflow_2_3_1_integration_tests.yml b/config/profiler/buildspec_profiler_sagemaker_tensorflow_2_3_1_integration_tests.yml index 01662cdb6..c461792a5 100644 --- a/config/profiler/buildspec_profiler_sagemaker_tensorflow_2_3_1_integration_tests.yml +++ b/config/profiler/buildspec_profiler_sagemaker_tensorflow_2_3_1_integration_tests.yml @@ -4,7 +4,7 @@ version: 0.2 env: variables: use_current_branch: "true" - enable_smdataparallel_tests: "false" + enable_smdataparallel_tests: "true" force_run_tests: "false" framework: "tensorflow" phases: diff --git a/smdebug/core/utils.py b/smdebug/core/utils.py index 5645fd72c..5c9d17da7 100644 --- a/smdebug/core/utils.py +++ b/smdebug/core/utils.py @@ -23,7 +23,8 @@ from smdebug.exceptions import IndexReaderException _is_invoked_via_smddp = None -_smdataparallel_imported = None +_smddp_tf_imported = None +_smddp_pt_imported = None try: import smdistributed.modelparallel.tensorflow as smp @@ -369,11 +370,10 @@ def get_distributed_worker(): elif check_smdataparallel_env(): # smdistributed.dataparallel should be invoked via `mpirun`. # It supports EC2 machines with 8 GPUs per machine. 
- try: - if _smdataparallel_imported.get_world_size(): - return _smdataparallel_imported.get_rank() - except ValueError: - pass + if _smddp_pt_imported is not None and _smddp_pt_imported.get_world_size(): + return _smddp_pt_imported.get_rank() + elif _smddp_tf_imported is not None and _smddp_tf_imported.size(): + return _smddp_tf_imported.rank() elif _hvd_imported: try: if _hvd_imported.size(): @@ -495,7 +495,8 @@ def __exit__(self, *args): def check_smdataparallel_env(): # Check to ensure it is invoked by mpi and the SM distribution is `dataparallel` global _is_invoked_via_smddp - global _smdataparallel_imported + global _smddp_tf_imported + global _smddp_pt_imported if _is_invoked_via_smddp is None: _is_invoked_via_mpi = ( os.getenv("OMPI_COMM_WORLD_SIZE") is not None @@ -521,12 +522,12 @@ def check_smdataparallel_env(): try: import smdistributed.dataparallel.torch.distributed as smdataparallel - _smdataparallel_imported = smdataparallel + _smddp_pt_imported = smdataparallel except (ModuleNotFoundError, ImportError): try: import smdistributed.dataparallel.tensorflow as smdataparallel - _smdataparallel_imported = smdataparallel + _smddp_tf_imported = smdataparallel except (ModuleNotFoundError, ImportError): - _smdataparallel_imported = None + _smddp_tf_imported = None else: