From 9c84cb99c41b403b25b15e4ebf8893c587e32e71 Mon Sep 17 00:00:00 2001 From: hanqing Date: Thu, 2 Jan 2025 20:01:09 -0600 Subject: [PATCH] apollo-mini: add scale front for 60m and 130m --- README.md | 2 +- scripts/pretrain_c4/llama_130m_apollo.sh | 1 - scripts/pretrain_c4/llama_60m_apollo.sh | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/README.md b/README.md index e554a2a..bfd5a10 100644 --- a/README.md +++ b/README.md @@ -121,7 +121,7 @@ To stabilize training, we adopt the **Norm-Growth Limiter (NL)** from [Fira](htt There are two ways to apply the Norm-Growth Limiter based on when it's used relative to the heuristical (`scale`): 1. **After Scaling**: NL is applied after the gradient is multiplied by the `scale`. - - Recommended for smaller models or when training involves fewer warmup steps. + - Recommended when training involves fewer warmup steps, e.g., LLaMA 60M and 130M with APOLLO-Mini. - Enable this by setting `--scale_front`. 2. **Before Scaling**: NL is applied before the gradient is scaled. - With sufficient warmup steps, both methods yield similar performance for large models. 
diff --git a/scripts/pretrain_c4/llama_130m_apollo.sh b/scripts/pretrain_c4/llama_130m_apollo.sh index c697d34..b549c87 100755 --- a/scripts/pretrain_c4/llama_130m_apollo.sh +++ b/scripts/pretrain_c4/llama_130m_apollo.sh @@ -14,7 +14,6 @@ torchrun --standalone --nproc_per_node 4 main_pretrain.py \ --warmup_steps 2000 \ --num_training_steps 20000 \ --optimizer apollo_adamw \ - --scale_front \ --apollo_scale ${apollo_scale} \ --rank ${num_rank} \ --scale_type ${scale_type} \ diff --git a/scripts/pretrain_c4/llama_60m_apollo.sh b/scripts/pretrain_c4/llama_60m_apollo.sh index c8be21b..81e104d 100755 --- a/scripts/pretrain_c4/llama_60m_apollo.sh +++ b/scripts/pretrain_c4/llama_60m_apollo.sh @@ -14,7 +14,6 @@ torchrun --standalone --nproc_per_node 1 main_pretrain.py \ --warmup_steps 1000 \ --num_training_steps 10000 \ --optimizer apollo_adamw \ - --scale_front \ --apollo_scale ${apollo_scale} \ --rank ${num_rank} \ --scale_type ${scale_type} \