Commit f8c40ad
fix bias[None, :] in tp's functional
xrsrke committed Jan 14, 2025
1 parent 21b2408 · commit f8c40ad
Showing 1 changed file with 8 additions and 3 deletions.
src/nanotron/parallel/tensor_parallel/functional.py (11 changes: 8 additions & 3 deletions)
@@ -204,7 +204,10 @@ def forward(ctx, tensor, weight, bias, group, tp_mode, tp_recompute_allgather):
             )
         else:
             torch.addmm(
-                input=bias[None, :],
+                # NOTE(xrsrke): if we keep bias[None, :], then we get
+                # RuntimeError: Attempted to make a tensor into a differentiable view,
+                # but the tensor already had autograd metadata associated with it
+                input=bias.view(1, -1),
                 mat1=tensor.view(first_dims, hidden_size),
                 mat2=weight.t(),
                 out=same_device_shard.view(first_dims, output_size),

@@ -236,7 +239,8 @@ def forward(ctx, tensor, weight, bias, group, tp_mode, tp_recompute_allgather):
             )
         else:
             torch.addmm(
-                input=bias[None, :],
+                # input=bias[None, :],
+                input=bias.view(1, -1),
                 mat1=gathered_tensor[: sharded_batch_size * current_rank].view(first_dims, hidden_size),
                 mat2=weight.t(),
                 out=before_shard.view(first_dims, output_size),

@@ -253,7 +257,8 @@ def forward(ctx, tensor, weight, bias, group, tp_mode, tp_recompute_allgather):
             )
         else:
             torch.addmm(
-                input=bias[None, :],
+                # input=bias[None, :],
+                input=bias.view(1, -1),
                 mat1=gathered_tensor[sharded_batch_size * (current_rank + 1) :].view(
                     first_dims, hidden_size
                 ),
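
For context, here is a minimal standalone sketch of the addmm-into-a-preallocated-buffer pattern the patch touches. The shapes and the out_buffer name are made up for illustration; only the torch.addmm call mirrors the code above. bias.view(1, -1) yields a (1, output_size) row that addmm broadcasts over the batch dimension, so the buffer ends up holding tensor @ weight.t() + bias.

import torch

# Hypothetical shapes for illustration; the variable names follow the diff above.
first_dims, hidden_size, output_size = 8, 16, 32

bias = torch.randn(output_size)
weight = torch.randn(output_size, hidden_size)
tensor = torch.randn(first_dims, hidden_size)
out_buffer = torch.empty(first_dims, output_size)  # stand-in for the preallocated shard

# No grad tracking here, so writing into a preallocated out= buffer is allowed.
with torch.no_grad():
    torch.addmm(
        input=bias.view(1, -1),                     # (1, output_size), broadcast over rows
        mat1=tensor.view(first_dims, hidden_size),  # (first_dims, hidden_size)
        mat2=weight.t(),                            # (hidden_size, output_size)
        out=out_buffer,
    )

# The buffer now holds the biased matmul result.
assert torch.allclose(out_buffer, tensor @ weight.t() + bias)

In plain eager code both bias[None, :] and bias.view(1, -1) produce the same (1, output_size) view; the commit switches to .view(1, -1) because, per the NOTE above, the indexed form triggered the autograd-metadata RuntimeError inside this forward.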
