SMDDP should use size() and rank() for TF jobs (#451)
ndodda-amazon authored Feb 23, 2021
1 parent 0131559 commit c5296cf
Showing 5 changed files with 14 additions and 13 deletions.
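For context (not part of the diff): smdistributed.dataparallel ships two bindings with different naming conventions, and this commit makes smdebug dispatch on which one is present. The PyTorch binding (smdistributed.dataparallel.torch.distributed) follows torch.distributed naming with get_world_size()/get_rank(), while the TensorFlow binding (smdistributed.dataparallel.tensorflow) follows Horovod-style naming with size()/rank(), as used in the utils.py change below. A minimal sketch of the import-time dispatch, assuming it runs inside a SageMaker container where one binding is installed and the training script has already initialized SMDDP (the helper name is illustrative, not from the repo):

def query_smddp_world():
    # Sketch only: return (world_size, rank) from whichever
    # smdistributed.dataparallel binding can be imported, else None.
    # Assumes SMDDP was already initialized by the training script.
    try:
        import smdistributed.dataparallel.torch.distributed as smddp_pt

        # PyTorch binding: torch.distributed-style names.
        return smddp_pt.get_world_size(), smddp_pt.get_rank()
    except (ModuleNotFoundError, ImportError):
        pass
    try:
        import smdistributed.dataparallel.tensorflow as smddp_tf

        # TensorFlow binding: Horovod-style names.
        return smddp_tf.size(), smddp_tf.rank()
    except (ModuleNotFoundError, ImportError):
        return None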
2 changes: 1 addition & 1 deletion config/buildspec.yml
@@ -30,7 +30,7 @@ phases:
  - sudo apt-get install unzip -qq -o=Dpkg::Use-Pty=0
  - cd $CODEBUILD_SRC_DIR && chmod +x config/protoc_downloader.sh && ./config/protoc_downloader.sh
  - pip install --upgrade pip==20.3.3
- - pip install -q matplotlib==3.3.1 seaborn==0.10.1 nbconvert==5.6.1 papermill==2.1.2 jupyter==1.0.0 scipy==1.5.2 scikit-learn==0.23.2 bokeh==2.2.3
+ - pip install -q matplotlib==3.3.1 seaborn==0.10.1 nbconvert==5.6.1 papermill==2.1.2 jupyter==1.0.0 scipy==1.5.2 scikit-learn==0.23.2 bokeh==2.2.3 simplejson==3.17.2
  - if [ "$run_pytest_xgboost" = "enable" ]; then pip install --upgrade pyYaml==5.1; else pip install -q pyYaml; fi
  - pip install -q pytest wheel pytest-html pre-commit awscli pytest-cov

2 changes: 1 addition & 1 deletion config/buildspec_tensorflow_2_3.yml
@@ -35,7 +35,7 @@ phases:
  - sudo apt-get install unzip -qq -o=Dpkg::Use-Pty=0
  - cd $CODEBUILD_SRC_DIR && chmod +x config/protoc_downloader.sh && ./config/protoc_downloader.sh
  - pip install --upgrade pip==19.3.1
- - pip install -q matplotlib==3.3.1 seaborn==0.10.1 nbconvert==5.6.1 papermill==2.1.2 jupyter==1.0.0 scipy==1.5.2 scikit-learn==0.23.2 bokeh==2.2.3
+ - pip install -q matplotlib==3.3.1 seaborn==0.10.1 nbconvert==5.6.1 papermill==2.1.2 jupyter==1.0.0 scipy==1.5.2 scikit-learn==0.23.2 bokeh==2.2.3 simplejson==3.17.2
  - pip install -q pytest wheel pytest-html pre-commit awscli pytest-cov

  pre_build:

2 changes: 1 addition & 1 deletion config/buildspec_tensorflow_2_4.yml
@@ -30,7 +30,7 @@ phases:
  - sudo apt-get install unzip -qq -o=Dpkg::Use-Pty=0
  - cd $CODEBUILD_SRC_DIR && chmod +x config/protoc_downloader.sh && ./config/protoc_downloader.sh
  - pip install --upgrade pip==19.3.1
- - pip install -q matplotlib==3.3.1 seaborn==0.10.1 nbconvert==5.6.1 papermill==2.1.2 jupyter==1.0.0 scipy==1.5.2 scikit-learn==0.23.2 bokeh==2.2.3
+ - pip install -q matplotlib==3.3.1 seaborn==0.10.1 nbconvert==5.6.1 papermill==2.1.2 jupyter==1.0.0 scipy==1.5.2 scikit-learn==0.23.2 bokeh==2.2.3 simplejson==3.17.2
  - if [ "$run_pytest_xgboost" = "enable" ]; then pip install --upgrade pyYaml==5.1; else pip install -q pyYaml; fi
  - pip install -q pytest wheel pytest-html pre-commit awscli pytest-cov

2 changes: 1 addition & 1 deletion
@@ -4,7 +4,7 @@ version: 0.2
  env:
  variables:
  use_current_branch: "true"
- enable_smdataparallel_tests: "false"
+ enable_smdataparallel_tests: "true"
  force_run_tests: "false"
  framework: "tensorflow"
  phases:

19 changes: 10 additions & 9 deletions smdebug/core/utils.py
@@ -23,7 +23,8 @@
  from smdebug.exceptions import IndexReaderException

  _is_invoked_via_smddp = None
- _smdataparallel_imported = None
+ _smddp_tf_imported = None
+ _smddp_pt_imported = None

  try:
      import smdistributed.modelparallel.tensorflow as smp
@@ -369,11 +370,10 @@ def get_distributed_worker():
      elif check_smdataparallel_env():
          # smdistributed.dataparallel should be invoked via `mpirun`.
          # It supports EC2 machines with 8 GPUs per machine.
-         try:
-             if _smdataparallel_imported.get_world_size():
-                 return _smdataparallel_imported.get_rank()
-         except ValueError:
-             pass
+         if _smddp_pt_imported is not None and _smddp_pt_imported.get_world_size():
+             return _smddp_pt_imported.get_rank()
+         elif _smddp_tf_imported is not None and _smddp_tf_imported.size():
+             return _smddp_tf_imported.rank()
      elif _hvd_imported:
          try:
              if _hvd_imported.size():
@@ -495,7 +495,8 @@ def __exit__(self, *args):
  def check_smdataparallel_env():
      # Check to ensure it is invoked by mpi and the SM distribution is `dataparallel`
      global _is_invoked_via_smddp
-     global _smdataparallel_imported
+     global _smddp_tf_imported
+     global _smddp_pt_imported
      if _is_invoked_via_smddp is None:
          _is_invoked_via_mpi = (
              os.getenv("OMPI_COMM_WORLD_SIZE") is not None
@@ -521,12 +522,12 @@ def check_smdataparallel_env():
  try:
      import smdistributed.dataparallel.torch.distributed as smdataparallel

-     _smdataparallel_imported = smdataparallel
+     _smddp_pt_imported = smdataparallel
  except (ModuleNotFoundError, ImportError):
      try:
          import smdistributed.dataparallel.tensorflow as smdataparallel

-         _smdataparallel_imported = smdataparallel
+         _smddp_tf_imported = smdataparallel
      except (ModuleNotFoundError, ImportError):
          _smdataparallel_imported = None
  else:
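Read back out of the hunk above, the new branch in get_distributed_worker() amounts to the following standalone helper (a sketch, not repo code; it assumes the two module references are populated by check_smdataparallel_env() and that the training script already initialized SMDDP):

def smddp_worker_rank(smddp_pt=None, smddp_tf=None):
    # Sketch: dispatch on whichever smdistributed.dataparallel binding was imported.
    if smddp_pt is not None and smddp_pt.get_world_size():
        # PyTorch binding exposes get_world_size()/get_rank().
        return smddp_pt.get_rank()
    if smddp_tf is not None and smddp_tf.size():
        # TensorFlow binding exposes size()/rank(), hence the commit title.
        return smddp_tf.rank()
    return None

The old code called get_world_size()/get_rank() on a single cached module regardless of framework, which only works for the PyTorch binding; splitting the cache into _smddp_pt_imported and _smddp_tf_imported lets each branch use the matching API, which is what the commit title describes.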
