Commit f4e096d: fix overlap

li126com committed Feb 1, 2024
1 parent d28d204
Showing 2 changed files with 1 addition and 7 deletions.
internlm/solver/optimizer/hybrid_zero_optim.py: 1 addition & 6 deletions
@@ -100,8 +100,6 @@ def __init__(
         # self._overlap_communication = overlap_communication
         self._reduce_bucket_size = reduce_bucket_size
 
-        self._comm_bcast_stream = torch.cuda.Stream()
-
         # gradient scaler
         self.grad_scaler = DynamicGradScaler(
             initial_scale=initial_scale,
@@ -837,8 +835,7 @@ def _step(self, closure=None, norms=None):
             fp16_param.data.copy_(fp32_param)
 
         torch.cuda.synchronize()
-        with torch.cuda.stream(self._comm_bcast_stream):
-            self.broadcast_params()
+        self.broadcast_params()
 
         timer("step").stop()

@@ -875,8 +872,6 @@ def broadcast_params(self):
         for handle in handles:
             handle.wait()
 
-        torch.cuda.synchronize()
-
     ##################
     # FP16 Utilities #
     ##################
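Why this fixes the overlap: kernels queued on a custom CUDA stream are not ordered with later default-stream work unless the two streams are explicitly fenced, so the old side-stream broadcast could still be in flight while the next step read the parameters. A minimal sketch of the two options, with hypothetical helper names (not the InternLM code) and src=0 chosen purely for illustration:

import torch
import torch.distributed as dist

def broadcast_on_side_stream(params, bcast_stream: torch.cuda.Stream):
    # If a side stream is kept, it must be fenced on both ends.
    bcast_stream.wait_stream(torch.cuda.current_stream())   # don't start too early
    with torch.cuda.stream(bcast_stream):
        handles = [dist.broadcast(p.data, src=0, async_op=True) for p in params]
        for handle in handles:
            handle.wait()
    torch.cuda.current_stream().wait_stream(bcast_stream)   # don't read too early

def broadcast_on_default_stream(params):
    # The commit's approach: the default stream orders the broadcast
    # before any subsequent compute, so no extra fence is needed.
    handles = [dist.broadcast(p.data, src=0, async_op=True) for p in params]
    for handle in handles:
        handle.wait()

With the broadcast back on the default stream, the trailing torch.cuda.synchronize() in broadcast_params becomes redundant, which appears to be why it is deleted as well.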
internlm/utils/gputest.py: 0 additions & 1 deletion
@@ -301,7 +301,6 @@ def warmup_process_group():

 def cuda_memory_analyze(step=0, print_mm_suage=False):
     global n_caching_allocator_flushes
-    torch.cuda.synchronize()
 
     g_rank = gpc.get_global_rank()
     tp_rank = gpc.get_local_rank(ParallelMode.TENSOR)
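The gputest.py change removes a device-wide barrier from a monitoring helper. Reading the caching allocator's counters is a host-side operation, so the measurement itself does not need torch.cuda.synchronize(); a minimal sketch of such a sync-free report (hypothetical function, not the InternLM helper):

import torch

def report_cuda_memory(step: int = 0) -> None:
    # The caching allocator maintains these counters on the host,
    # so no device synchronization is required just to read them.
    mib = 1024 ** 2
    allocated = torch.cuda.memory_allocated() / mib
    reserved = torch.cuda.memory_reserved() / mib
    peak = torch.cuda.max_memory_allocated() / mib
    print(f"step {step}: allocated={allocated:.1f} MiB, "
          f"reserved={reserved:.1f} MiB, peak={peak:.1f} MiB")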
