Commit 708260f
fix hf internlm nan bug (#295)
sallyjunjun authored Aug 9, 2024
1 parent 137deb3 commit 708260f
Showing 2 changed files with 2 additions and 3 deletions.
4 changes: 2 additions & 2 deletions internlm/core/parallel/comm/isp.py
@@ -795,7 +795,7 @@ class DistributedAttention(nn.Module):

     def __init__(
         self,
-        local_attention: nn.Module,
+        local_attention: Union[nn.Module, Callable],
         sequence_process_group: dist.ProcessGroup,
     ) -> None:
         super().__init__()
@@ -914,7 +914,7 @@ def _attetion_constructor(
     return partial(_attetion_constructor, local_attn_cls=cls)


-def auto_wrap_func_distributed_attention(func: Callable) -> Callable[[bool, Any, float], nn.Module]:
+def auto_wrap_func_distributed_attention(func: Callable) -> Callable[..., Callable]:
     """
     Wrap a local attention function to a distributed one, which will be used in the ISP parallelism.
     """
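Why the annotation widened: DistributedAttention previously assumed its local_attention was an nn.Module, but the function-wrapping path built by auto_wrap_func_distributed_attention hands it a bare callable. Below is a self-contained toy sketch of that pattern; ToyDistributedAttention and scaled_dot are hypothetical stand-ins, not the InternLM implementation.

# Toy sketch only -- ToyDistributedAttention and scaled_dot are hypothetical
# stand-ins, not the InternLM classes. It shows why local_attention is now
# annotated Union[nn.Module, Callable]: both are callable, so forward() can
# dispatch to either uniformly.
from typing import Callable, Union

import torch
from torch import nn


class ToyDistributedAttention(nn.Module):
    def __init__(self, local_attention: Union[nn.Module, Callable]) -> None:
        super().__init__()
        # In the real DistributedAttention, calls to local_attention sit
        # between sequence-parallel communication over sequence_process_group.
        self.local_attn = local_attention

    def forward(self, *args, **kwargs) -> torch.Tensor:
        return self.local_attn(*args, **kwargs)


def scaled_dot(q, k, v):
    # A bare attention function, with no nn.Module wrapper around it.
    scores = q @ k.transpose(-2, -1) / q.shape[-1] ** 0.5
    return torch.softmax(scores, dim=-1) @ v


attn = ToyDistributedAttention(scaled_dot)  # a plain Callable is now valid
q = k = v = torch.randn(2, 4, 8)
print(attn(q, k, v).shape)  # torch.Size([2, 4, 8])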
1 change: 0 additions & 1 deletion internlm/model/ops/attention.py
@@ -1032,5 +1032,4 @@ def hf_q_k_v_with_cu_seqlens(
         return_attn_probs=False,
         causal=causal,
     )
-    attn_output = attn_output.unsqueeze(0)
     return attn_output
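On the NaN fix itself: the commit message gives no detail beyond the title, but the removed line added a leading batch dimension to the output of a varlen (packed, cu_seqlens-based) attention call, whose result is laid out over total tokens rather than batches. A toy shape illustration of that mismatch follows, with random tensors standing in for the attention output (an assumption, not a trace of the actual failure).

# Toy illustration with random tensors; the shape convention
# (total_tokens, num_heads, head_dim) is an assumption here,
# not taken from the InternLM code.
import torch

total_tokens, num_heads, head_dim = 10, 4, 8
attn_output = torch.randn(total_tokens, num_heads, head_dim)

batched = attn_output.unsqueeze(0)  # the removed line: adds a leading dim
print(attn_output.shape)  # torch.Size([10, 4, 8]) -- packed layout
print(batched.shape)      # torch.Size([1, 10, 4, 8]) -- spurious batch dim
# Downstream code expecting the packed layout would misinterpret the extra
# dim, which is one plausible route to the NaNs the commit title mentions.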
