From 91effac2535e00ff854bc2de1eb826c52843a603 Mon Sep 17 00:00:00 2001
From: Cheng Li
Date: Tue, 28 May 2024 09:09:20 -0700
Subject: [PATCH] add dbrx training example

---
 examples/dbrx/run_train.sh           | 24 ++++++++++++++++++++++++
 llm_analysis/model_configs/dbrx.json | 13 +++++++++++++
 2 files changed, 37 insertions(+)
 create mode 100755 examples/dbrx/run_train.sh
 create mode 100644 llm_analysis/model_configs/dbrx.json

diff --git a/examples/dbrx/run_train.sh b/examples/dbrx/run_train.sh
new file mode 100755
index 0000000..7c3c70a
--- /dev/null
+++ b/examples/dbrx/run_train.sh
@@ -0,0 +1,24 @@
+model_name=dbrx # model name (hf model name) or model config json file path
+seq_len=4096
+gpu_name='h100-sxm-80gb' # python -m llm_analysis.config list_gpu_configs
+dtype_name="w16a16e16" # 16-bit weights, activations, and embeddings
+batch_size_per_gpu=1 # device_train_microbatch_size
+num_gpus=2048 # total number of GPUs
+activation_recomputation=0 # 0: no activation recomputation; 1: checkpoint attention compute; 2: checkpoint attention; 3: checkpoint layernorm-attention-layernorm; 4: checkpoint the entire transformer layer
+dp_size=128 # data parallelization size for sharding
+ep_size=8 # expert parallelization size, moe_dp_sharding_size = dp_size / ep_size
+tp_size=1 # tensor parallelization size
+ds_zero=3 # dp sharding strategy, https://github.com/cli99/llm-analysis#parallelism-scheme
+mlp_activation_quant_bits=8 # number of bits used for mlp activations
+mlp_recompute_gelu=True # whether to recompute the GeLU in the MLP backward pass
+flops_efficiency=0.36721 # model FLOPs utilization (MFU)
+hbm_memory_efficiency=1 # GPU HBM memory efficiency
+intra_node_memory_efficiency=0.8
+inter_node_memory_efficiency=0.8
+total_num_tokens=12e12 # number of tokens to train on
+master_weights_dtype_bytes=4 # FP32 master weights
+other_op_bytes=4 # Lion optimizer states
+output_dir=output_dbrx
+output_file_suffix="bs${batch_size_per_gpu}-ar${activation_recomputation}-zero${ds_zero}-"
+
+python -m llm_analysis.analysis train --model_name=${model_name} --seq_len=${seq_len} --gpu_name=${gpu_name} --dtype_name=${dtype_name} --output_dir=${output_dir} --output_file_suffix=${output_file_suffix} --activation_recomputation ${activation_recomputation} --ds_zero ${ds_zero} --batch_size_per_gpu=${batch_size_per_gpu} --total_num_gpus=${num_gpus} --tp_size=${tp_size} --ep_size=${ep_size} --flops_efficiency=${flops_efficiency} --hbm_memory_efficiency=${hbm_memory_efficiency} --total_num_tokens ${total_num_tokens} --mlp_activation_quant_bits ${mlp_activation_quant_bits} --layernorm_dtype_bytes 2 --mlp_recompute_gelu ${mlp_recompute_gelu} --master_weights_dtype_bytes ${master_weights_dtype_bytes} --other_op_bytes ${other_op_bytes} --intra_node_memory_efficiency ${intra_node_memory_efficiency} --inter_node_memory_efficiency ${inter_node_memory_efficiency} --bwd_prefetch 1
diff --git a/llm_analysis/model_configs/dbrx.json b/llm_analysis/model_configs/dbrx.json
new file mode 100644
index 0000000..bf39c83
--- /dev/null
+++ b/llm_analysis/model_configs/dbrx.json
@@ -0,0 +1,13 @@
+{
+    "name": "dbrx",
+    "max_seq_len": 4096,
+    "num_layers": 40,
+    "n_head": 48,
+    "hidden_dim": 6144,
+    "vocab_size": 100352,
+    "expansion_ratio": 1.75,
+    "num_key_value_heads": 8,
+    "mlp_gated_linear_units": true,
+    "moe_num_experts": 16,
+    "moe_top_k": 4
+}
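
To try the example end to end (a minimal sketch, assuming the patch is applied
to a checkout of https://github.com/cli99/llm-analysis and the commands are run
from the repo root):

    pip install -e .   # or: pip install llm-analysis
    cd examples/dbrx
    bash run_train.sh
    ls output_dbrx/    # the analysis summary lands here, named with the configured suffix

llm-analysis is an analytical cost model, so the run itself needs no GPUs; the
2048-GPU configuration above can be explored on a laptop.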
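
As a quick sanity check on dbrx.json (a back-of-envelope sketch, not part of
the patch: routers and norms are omitted, untied input/output embeddings are
assumed, and the three weight matrices per expert FFN follow from
mlp_gated_linear_units), the config lands close to DBRX's published 132B total
and 36B active parameters:

    # rough parameter count from the dbrx.json fields, in pure bash integer math
    d=6144 layers=40 heads=48 kv_heads=8
    vocab=100352 experts=16 top_k=4
    ffn_hidden=$(( d * 7 / 4 ))                           # expansion_ratio 1.75
    expert=$(( 3 * d * ffn_hidden ))                      # gated linear units: gate, up, down
    moe=$(( layers * experts * expert ))                  # all experts, all layers
    head_dim=$(( d / heads ))
    attn=$(( layers * (2*d*d + 2*d*kv_heads*head_dim) ))  # Q/out plus GQA K/V projections
    embed=$(( 2 * vocab * d ))
    total=$(( moe + attn + embed ))
    active=$(( moe * top_k / experts + attn + embed ))    # each token routes to top_k experts
    echo "total  ~$(( total / 1000000000 ))B"             # ~131B (published: 132B; routers/norms cover the gap)
    echo "active ~$(( active / 1000000000 ))B"            # ~36B

Only the expert parameters scale down by moe_top_k / moe_num_experts in the
active count; attention, embeddings, and the output head are always exercised.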