Commit f4e096d: fix overlap

li126com committed Feb 1, 2024
1 parent d28d204
Showing 2 changed files with 1 addition and 7 deletions.
internlm/solver/optimizer/hybrid_zero_optim.py: 1 addition & 6 deletions
@@ -100,8 +100,6 @@ def __init__(
         # self._overlap_communication = overlap_communication
         self._reduce_bucket_size = reduce_bucket_size
 
-        self._comm_bcast_stream = torch.cuda.Stream()
-
         # gradient scaler
         self.grad_scaler = DynamicGradScaler(
             initial_scale=initial_scale,
@@ -837,8 +835,7 @@ def _step(self, closure=None, norms=None):
             fp16_param.data.copy_(fp32_param)
 
         torch.cuda.synchronize()
-        with torch.cuda.stream(self._comm_bcast_stream):
-            self.broadcast_params()
+        self.broadcast_params()
 
         timer("step").stop()

@@ -875,8 +872,6 @@ def broadcast_params(self):
         for handle in handles:
             handle.wait()
 
-        torch.cuda.synchronize()
-
     ##################
     # FP16 Utilities #
     ##################
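Why this fixes the overlap: kernels queued on a custom CUDA stream are not ordered with later default-stream work unless the two streams are explicitly fenced, so the old side-stream broadcast could still be in flight while the next step read the parameters. A minimal sketch of the two options, with hypothetical helper names (not the InternLM code) and src=0 chosen purely for illustration:

import torch
import torch.distributed as dist

def broadcast_on_side_stream(params, bcast_stream: torch.cuda.Stream):
    # If a side stream is kept, it must be fenced on both ends.
    bcast_stream.wait_stream(torch.cuda.current_stream())   # don't start too early
    with torch.cuda.stream(bcast_stream):
        handles = [dist.broadcast(p.data, src=0, async_op=True) for p in params]
        for handle in handles:
            handle.wait()
    torch.cuda.current_stream().wait_stream(bcast_stream)   # don't read too early

def broadcast_on_default_stream(params):
    # The commit's approach: the default stream orders the broadcast
    # before any subsequent compute, so no extra fence is needed.
    handles = [dist.broadcast(p.data, src=0, async_op=True) for p in params]
    for handle in handles:
        handle.wait()

With the broadcast back on the default stream, the trailing torch.cuda.synchronize() in broadcast_params becomes redundant, which appears to be why it is deleted as well.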
internlm/utils/gputest.py: 0 additions & 1 deletion
@@ -301,7 +301,6 @@ def warmup_process_group():

 def cuda_memory_analyze(step=0, print_mm_suage=False):
     global n_caching_allocator_flushes
-    torch.cuda.synchronize()
 
     g_rank = gpc.get_global_rank()
     tp_rank = gpc.get_local_rank(ParallelMode.TENSOR)
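The gputest.py change removes a device-wide barrier from a monitoring helper. Reading the caching allocator's counters is a host-side operation, so the measurement itself does not need torch.cuda.synchronize(); a minimal sketch of such a sync-free report (hypothetical function, not the InternLM helper):

import torch

def report_cuda_memory(step: int = 0) -> None:
    # The caching allocator maintains these counters on the host,
    # so no device synchronization is required just to read them.
    mib = 1024 ** 2
    allocated = torch.cuda.memory_allocated() / mib
    reserved = torch.cuda.memory_reserved() / mib
    peak = torch.cuda.max_memory_allocated() / mib
    print(f"step {step}: allocated={allocated:.1f} MiB, "
          f"reserved={reserved:.1f} MiB, peak={peak:.1f} MiB")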
