
Commit

try to solve fsdp bug
zzhhjjj committed May 2, 2024
1 parent 280cb6c commit 673237b
Showing 3 changed files with 7 additions and 4 deletions.
6 changes: 3 additions & 3 deletions examples/config_tiny_llama.yaml
@@ -73,7 +73,7 @@ optimizer:
   learning_rate_scheduler:
     learning_rate: 0.0003
     lr_decay_starting_step: null
-    lr_decay_steps: 8
+    lr_decay_steps: 13
     lr_decay_style: cosine
     lr_warmup_steps: 2
     lr_warmup_style: linear
@@ -104,6 +104,6 @@ tokens:
   limit_test_batches: 0
   limit_val_batches: 0
   micro_batch_size: 2
-  sequence_length: 32
-  train_steps: 10
+  sequence_length: 256
+  train_steps: 15
   val_check_interval: -1
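
The new values keep lr_warmup_steps + lr_decay_steps equal to train_steps (2 + 13 = 15, as 2 + 8 = 10 before). Below is a minimal sketch of the resulting schedule, assuming a standard linear-warmup plus cosine-decay formula with a zero LR floor; nanotron's actual scheduler may differ in details such as the minimum LR and step indexing.

    import math

    def lr_at_step(step, peak_lr=3e-4, warmup_steps=2, decay_steps=13, min_lr=0.0):
        # Assumed schedule, for illustration only: linear warmup to peak_lr,
        # then cosine decay toward min_lr over decay_steps.
        if step < warmup_steps:
            return peak_lr * (step + 1) / warmup_steps
        progress = min((step - warmup_steps) / decay_steps, 1.0)
        return min_lr + 0.5 * (peak_lr - min_lr) * (1.0 + math.cos(math.pi * progress))

    # With train_steps = 15, warmup (2) plus decay (13) covers the full run.
    print([round(lr_at_step(s), 6) for s in range(15)])
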
3 changes: 3 additions & 0 deletions src/nanotron/models/llama.py
@@ -52,6 +52,9 @@

 if DISABLE_FLASH_ATTENTION:
     print("Warning: Flash attention was disabled!")
+    # FSDP
+    torch.backends.cuda.enable_mem_efficient_sdp(False)
+    torch.backends.cuda.enable_flash_sdp(False)


 RMSNorm = RMSNorm if DISABLE_FLASH_ATTENTION else TritonRMSNorm
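
Disabling the memory-efficient and flash SDP backends forces torch.nn.functional.scaled_dot_product_attention onto the math kernel, which is the workaround this commit tries for the FSDP issue. Below is a standalone sketch of the same toggle, assuming PyTorch 2.x with CUDA available; the tensor shapes are illustrative, not taken from the model.

    import torch
    import torch.nn.functional as F

    # Turn off the fused SDP backends; the math backend stays enabled by default.
    torch.backends.cuda.enable_mem_efficient_sdp(False)
    torch.backends.cuda.enable_flash_sdp(False)

    print(torch.backends.cuda.flash_sdp_enabled())          # False
    print(torch.backends.cuda.mem_efficient_sdp_enabled())  # False
    print(torch.backends.cuda.math_sdp_enabled())           # True

    # Illustrative shapes: (batch, heads, seq_len, head_dim).
    q = k = v = torch.randn(1, 8, 256, 64, device="cuda", dtype=torch.bfloat16)
    out = F.scaled_dot_product_attention(q, k, v, is_causal=True)  # math kernel
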
2 changes: 1 addition & 1 deletion tests/test_llama.py
@@ -1,5 +1,5 @@
 # Script to test correctness of training script by comparing loss value after 100th iteration with expected loss value
-# pytest -sv tests/test_train_llama.py or python tests/test_train_llama.py
+# pytest -sv tests/test_llama.py or python tests/test_train_llama.py

 import atexit
 import os
