From 9c84cb99c41b403b25b15e4ebf8893c587e32e71 Mon Sep 17 00:00:00 2001 From: hanqing Date: Thu, 2 Jan 2025 20:01:09 -0600 Subject: [PATCH] apollo-mini: add scale front for 60m and 130m --- README.md | 2 +- scripts/pretrain_c4/llama_130m_apollo.sh | 1 - scripts/pretrain_c4/llama_60m_apollo.sh | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index e554a2a..bfd5a10 100644 --- a/README.md +++ b/README.md @@ -121,7 +121,7 @@ To stabilize training, we adopt the **Norm-Growth Limiter (NL)** from [Fira](htt There are two ways to apply the Norm-Growth Limiter based on when it's used relative to the heuristical (`scale`): 1. **After Scaling**: NL is applied after the gradient is multiplied by the `scale`. - - Recommended for smaller models or when training involves fewer warmup steps. + - Recommended when training involves fewer warmup steps, e.g., LLaMA 60M and 130M with APOLLO-Mini. - Enable this by setting `--scale_front`. 2. **Before Scaling**: NL is applied before the gradient is scaled. - With sufficient warmup steps, both methods yield similar performance for large models. 
diff --git a/scripts/pretrain_c4/llama_130m_apollo.sh b/scripts/pretrain_c4/llama_130m_apollo.sh index c697d34..b549c87 100755 --- a/scripts/pretrain_c4/llama_130m_apollo.sh +++ b/scripts/pretrain_c4/llama_130m_apollo.sh @@ -14,7 +14,6 @@ torchrun --standalone --nproc_per_node 4 main_pretrain.py \ --warmup_steps 2000 \ --num_training_steps 20000 \ --optimizer apollo_adamw \ - --scale_front \ --apollo_scale ${apollo_scale} \ --rank ${num_rank} \ --scale_type ${scale_type} \ diff --git a/scripts/pretrain_c4/llama_60m_apollo.sh b/scripts/pretrain_c4/llama_60m_apollo.sh index c8be21b..81e104d 100755 --- a/scripts/pretrain_c4/llama_60m_apollo.sh +++ b/scripts/pretrain_c4/llama_60m_apollo.sh @@ -14,7 +14,6 @@ torchrun --standalone --nproc_per_node 1 main_pretrain.py \ --warmup_steps 1000 \ --num_training_steps 10000 \ --optimizer apollo_adamw \ - --scale_front \ --apollo_scale ${apollo_scale} \ --rank ${num_rank} \ --scale_type ${scale_type} \