
Add troubleshooting section in README and improvements to training_example.py #192

Merged: 29 commits, Feb 12, 2024
Commits
1f74126
Add distributed-ml-training blueprint
Dec 12, 2023
3ebc805
Fix docs
Dec 12, 2023
d15e674
Fix docs
Dec 12, 2023
20f7587
Fix format
Dec 12, 2023
3148419
Add solution diagram and reference links
Dec 12, 2023
2d7bdbe
Add solution diagram and reference links
Dec 12, 2023
29a20e6
Fix typo in docs
Dec 13, 2023
fbadf11
Merge branch 'main' into main
sfloresk Dec 13, 2023
e372d83
Add training example file and simplify docs
Dec 14, 2023
6ccd034
Add training example script file
Dec 14, 2023
35cf696
Update task to use read only root fs
Dec 14, 2023
ecc44f3
Remove EFS - Add S3
sfloresk Jan 9, 2024
9c148b0
Merge branch 'main' of https://github.com/sfloresk/ecs-blueprints
sfloresk Jan 9, 2024
a9b4436
Fix bugs and remove deployment of supporting resources
sfloresk Jan 11, 2024
4001305
Change aws provider version to >= 5.0
sfloresk Jan 11, 2024
49f5c07
Improve docs format
sfloresk Jan 11, 2024
486dd47
Include region in test commands, change output bucket arn to id
sfloresk Jan 11, 2024
70500d1
Merge branch 'main' into main
sfloresk Jan 11, 2024
a32c91d
Change bucket ARN for name in the docs
sfloresk Jan 11, 2024
16daf47
Merge branch 'main' of https://github.com/sfloresk/ecs-blueprints
sfloresk Jan 11, 2024
da408a8
Add result of training script
sfloresk Jan 11, 2024
78c5fca
fix README type and enforce metadata service to v2
sfloresk Jan 12, 2024
7d13157
Merge branch 'main' into main
sfloresk Jan 12, 2024
e7f9bb4
Add troubleshooting section to README. Add checkpoint and remove shar…
sfloresk Feb 8, 2024
058a79c
Merge branch 'main' into main
joozero Feb 8, 2024
457dbe1
Fix capacity provider bug and trailing spaces
sfloresk Feb 11, 2024
da1da1b
Merge branch 'main' of https://github.com/sfloresk/ecs-blueprints
sfloresk Feb 11, 2024
aed800c
Add additional troubleshooting step
sfloresk Feb 12, 2024
338026f
Merge branch 'main' into main
joozero Feb 12, 2024
10 changes: 7 additions & 3 deletions terraform/ec2-examples/distributed-ml-training/README.md
@@ -144,9 +144,7 @@ Wrapping provided model in DistributedDataParallel.

Result(
metrics={'loss': 0.4192830347106792, 'accuracy': 0.8852},
path='dt-results-EXAMPLE/ecs_dt_results/TorchTrainer_d1824_00000_0_(...)',
filesystem='s3',
checkpoint=None
(...)
)
```

@@ -166,6 +164,12 @@ terraform destroy

```

## Troubleshooting

* Error: creating ECS Service (...): InvalidParameterException: The specified capacity provider (...) was not found: In some cases the capacity provider is still being created and is not yet ready to be used by a service. Execute "terraform apply" again to solve the issue (see the status-check sketch after this list).

* Error: waiting for ECS Service (...) delete: timeout while waiting for state to become 'INACTIVE' (last state: 'DRAINING', timeout: 20m0s): It can take several minutes for the service to finish draining. Wait 30 minutes and execute "terraform destroy" again to solve the issue.
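
Both errors come down to ECS state lagging behind Terraform. One way to confirm the state before retrying is to query ECS directly. The sketch below is illustrative only: it assumes boto3 is available and uses placeholder cluster, service, and region names that you would replace with the ones this blueprint creates.

```python
import boto3

# Placeholder names and region; substitute the values created by this blueprint.
CLUSTER = "distributed-ml-training"
SERVICES = ["distributed_ml_training_head", "distributed_ml_training_workers"]

ecs = boto3.client("ecs", region_name="us-west-2")

# Before re-running "terraform apply": are the capacity providers ready?
cluster = ecs.describe_clusters(clusters=[CLUSTER])["clusters"][0]
providers = cluster.get("capacityProviders", [])
if providers:
    for cp in ecs.describe_capacity_providers(capacityProviders=providers)["capacityProviders"]:
        print(cp["name"], cp["status"])  # ACTIVE means services can use it

# Before re-running "terraform destroy": have the services finished draining?
for svc in ecs.describe_services(cluster=CLUSTER, services=SERVICES)["services"]:
    print(svc["serviceName"], svc["status"])  # wait for DRAINING to become INACTIVE
```

Once the capacity providers report ACTIVE (for apply) or the services report INACTIVE (for destroy), re-running the corresponding Terraform command should succeed.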

## Support

Please open an issue for questions or unexpected behavior
14 changes: 12 additions & 2 deletions terraform/ec2-examples/distributed-ml-training/main.tf
@@ -128,6 +128,11 @@ module "autoscaling_head" {
}

tags = local.tags

metadata_options = {
http_endpoint = "enabled"
http_tokens = "required"
}
}

module "autoscaling_workers" {
@@ -177,6 +182,11 @@ module "autoscaling_workers" {
]

tags = local.tags

metadata_options = {
http_endpoint = "enabled"
http_tokens = "required"
}
}
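
The two metadata_options blocks above enforce IMDSv2 on the head and worker instances (http_tokens = "required"), so any process on those hosts has to use the session-token flow to read instance metadata. As a rough illustration of what that flow looks like (standard-library Python against the documented EC2 metadata endpoint; not part of this blueprint):

```python
import urllib.request

IMDS = "http://169.254.169.254"

# With http_tokens = "required", token-less IMDSv1 requests are rejected,
# so first obtain a session token (valid here for 6 hours).
token_request = urllib.request.Request(
    f"{IMDS}/latest/api/token",
    method="PUT",
    headers={"X-aws-ec2-metadata-token-ttl-seconds": "21600"},
)
token = urllib.request.urlopen(token_request).read().decode()

# Send the token with every metadata call, e.g. to read the instance ID.
metadata_request = urllib.request.Request(
    f"{IMDS}/latest/meta-data/instance-id",
    headers={"X-aws-ec2-metadata-token": token},
)
print(urllib.request.urlopen(metadata_request).read().decode())
```

Recent AWS SDKs and the ECS agent perform this token exchange on their own, so the setting is transparent to the tasks and mainly blocks legacy IMDSv1 callers.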

module "autoscaling_sg" {
@@ -218,7 +228,7 @@ module "ecs_service_head" {
requires_compatibilities = ["EC2"]
capacity_provider_strategy = {
default = {
capacity_provider = "distributed_ml_training_head" # needs to match name of capacity provider
capacity_provider = module.ecs_cluster.autoscaling_capacity_providers["distributed_ml_training_head"].name # needs to match name of capacity provider
weight = 1
base = 1
}
@@ -314,7 +324,7 @@ module "ecs_service_workers" {
requires_compatibilities = ["EC2"]
capacity_provider_strategy = {
default = {
capacity_provider = "distributed_ml_training_workers" # needs to match name of capacity provider
capacity_provider = module.ecs_cluster.autoscaling_capacity_providers["distributed_ml_training_workers"].name # needs to match name of capacity provider
weight = 1
base = 1
}
training_example.py
@@ -13,7 +13,7 @@
import ray
import time
import argparse

import tempfile
# Get arguments

parser = argparse.ArgumentParser()
Expand All @@ -24,12 +24,6 @@
# Connect to the Ray cluster
ray.init()

# Download the data in the shared storage
transform = Compose([ToTensor(), Normalize((0.5,), (0.5,))])
train_data = FashionMNIST(root='./data',
train=True, download=True,
transform=transform)

# Define the training function that the distributed processes will run
def train_func(config):
import os
@@ -50,11 +44,11 @@ def train_func(config):
# Setup loss and optimizer
criterion = CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=0.001)
# Retrieve the data from the shared storage.

# Prepare data
transform = Compose([ToTensor(), Normalize((0.5,), (0.5,))])
with FileLock(os.path.expanduser("./data.lock")):
train_data = FashionMNIST(root='./data', train=True, download=True, transform=transform)
# Download test data from open datasets
test_data = FashionMNIST(root="./data",train=False,download=True,transform=transform)
batch_size=128
train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
@@ -89,8 +83,20 @@ def train_func(config):

test_loss /= len(test_loader)
accuracy = num_correct / num_total
# Report metrics and checkpoint to Ray.
ray.train.report(metrics={"loss": test_loss, "accuracy": accuracy})

    # Save the checkpoint
    with tempfile.TemporaryDirectory() as temp_checkpoint_dir:
        checkpoint = None
        # Only the global rank 0 worker saves the checkpoint
        if ray.train.get_context().get_world_rank() == 0:
            torch.save(
                model.module.state_dict(),
                os.path.join(temp_checkpoint_dir, "model.pt"),
            )
            # Build the checkpoint from the directory that holds model.pt
            checkpoint = ray.train.Checkpoint.from_directory(temp_checkpoint_dir)

        # Report metrics and checkpoint to Ray
        ray.train.report(metrics={"loss": test_loss, "accuracy": accuracy}, checkpoint=checkpoint)

# The scaling config defines how many worker processes to use for the training. This usually equals the number of GPUs
scaling_config = ScalingConfig(num_workers=2, use_gpu=True)
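
Because train_func now reports a checkpoint, the Result returned by the trainer carries the rank-0 model weights instead of checkpoint=None. The snippet below is a driver-side sketch of loading them back, assuming the TorchTrainer built later in this script is named trainer and that a hypothetical build_model() helper recreates the same architecture used in train_func:

```python
import os
import torch

# Run training as configured above; "trainer" is assumed to be the TorchTrainer
# this script builds from train_func, scaling_config, and the run configuration.
result = trainer.fit()
print(result.metrics)  # e.g. {'loss': ..., 'accuracy': ...}

if result.checkpoint is not None:
    # Materialize the checkpoint locally and load the saved state dict
    with result.checkpoint.as_directory() as checkpoint_dir:
        state_dict = torch.load(os.path.join(checkpoint_dir, "model.pt"), map_location="cpu")

    model = build_model()  # hypothetical helper mirroring the model in train_func
    model.load_state_dict(state_dict)
    model.eval()
```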