From e88cf81ae84e34f30d8503524605b66c336f67c8 Mon Sep 17 00:00:00 2001
From: HL
Date: Thu, 9 Jan 2025 21:09:05 -0800
Subject: [PATCH] [megatron] docs: clean up unused code, update megatron backend docs and installation docs (#89)

* [megatron] style: clean up unused code in megatron

* update docs

* add install from docker section for docs

---------

Co-authored-by: Your Name
---
 README.md                                     | 11 ++--
 docs/advance/fsdp_extension.rst               |  4 +-
 docs/advance/megatron_extension.rst           | 11 ++--
 docs/start/install.rst                        | 59 ++++++++++++++++++-
 requirements.txt                              |  3 +-
 .../llama/megatron/modeling_llama_megatron.py | 15 ++---
 6 files changed, 78 insertions(+), 25 deletions(-)

diff --git a/README.md b/README.md
index 375c41b..55d9919 100644
--- a/README.md
+++ b/README.md
@@ -42,13 +42,13 @@ Below are the steps to install veRL in your environment.
 veRL supports various backends. Currently, the following configurations are available:

 - **FSDP** and **Megatron-LM** for training.
-- **vLLM** for rollout generation.
+- **vLLM** for rollout generation, with **SGLang** support coming soon.

 **Training backends**

 We recommend using **FSDP** backend to investigate, research and prototype different models, datasets and RL algorithms. The guide for using FSDP backend can be found in [PyTorch FSDP Backend](https://verl.readthedocs.io/en/latest/workers/fsdp_workers.html)

-For users who pursue better scalability, we recommend using **Megatron-LM** backend. Currently, we support Megatron-LM@core_v0.4.0 and we fix some internal issues of Megatron-LM. Here's the additional installation guide. The guide for using Megatron-LM backend can be found in [Megatron-LM Backend](https://verl.readthedocs.io/en/latest/workers/megatron_workers.html)
+For users who pursue better scalability, we recommend using the **Megatron-LM** backend. Currently, we support Megatron-LM@core_v0.4.0 with some internal patches (soon to be updated to the latest version, relying directly on upstream Megatron-LM). The guide for using the Megatron-LM backend can be found in [Megatron-LM Backend](https://verl.readthedocs.io/en/latest/workers/megatron_workers.html)

 ### Installation Options

@@ -72,7 +72,7 @@ git clone https://github.com/volcengine/verl && cd verl && pip3 install -e .
 # or install from pypi via `pip3 install verl`
 ```

-4. Setup Megatron (optional)
+3. Setup Megatron (optional)

 If you want to enable training with Megatron, Megatron code must be added to PYTHONPATH:

@@ -176,8 +176,8 @@ Visit our [documentation](https://verl.readthedocs.io/en/latest/index.html) to l
 - Advance Usage and Extension
   - [Ray API Design Tutorial](https://verl.readthedocs.io/en/latest/advance/placement.html)
   - [Extend to other RL(HF) algorithms](https://verl.readthedocs.io/en/latest/advance/dpo_extension.html)
-  - [Add models to FSDP backend](https://verl.readthedocs.io/en/latest/advance/fsdp_extension.html)
-  - [Add models to Megatron-LM backend](https://verl.readthedocs.io/en/latest/advance/megatron_extension.html)
+  - [Add models with the FSDP backend](https://verl.readthedocs.io/en/latest/advance/fsdp_extension.html)
+  - [Add models with the Megatron-LM backend](https://verl.readthedocs.io/en/latest/advance/megatron_extension.html)

 ## Citation

@@ -201,3 +201,4 @@ Visit our [documentation](https://verl.readthedocs.io/en/latest/index.html) to l
 ## Publications Using veRL
 - [Enhancing Multi-Step Reasoning Abilities of Language Models through Direct Q-Function Optimization](https://arxiv.org/abs/2410.09302)
 - [Flaming-hot Initiation with Regular Execution Sampling for Large Language Models](https://arxiv.org/abs/2410.21236)
+- [Process Reinforcement Through Implicit Rewards](https://github.com/PRIME-RL/PRIME/)
\ No newline at end of file
diff --git a/docs/advance/fsdp_extension.rst b/docs/advance/fsdp_extension.rst
index ae26c59..a7a41aa 100644
--- a/docs/advance/fsdp_extension.rst
+++ b/docs/advance/fsdp_extension.rst
@@ -1,6 +1,6 @@
-Add models to FSDP backend
-===========================
+Add models with the FSDP backend
+==================================

 Model
 --------------------------

diff --git a/docs/advance/megatron_extension.rst b/docs/advance/megatron_extension.rst
index b2b02e0..91d744a 100644
--- a/docs/advance/megatron_extension.rst
+++ b/docs/advance/megatron_extension.rst
@@ -1,13 +1,13 @@
-Add models to Megatron-LM backend
-===================================
+Add models with the Megatron-LM backend
+=========================================

 Model
 -----------

-The most challenging aspect to use Megatron-LM backend is implementing
+The most challenging aspect of using the Megatron-LM backend is implementing
 the models for training. Currently, we implement Llama model that support
 data parallelism, tensor parallelism, pipeline parallelism (also
-vPP) and sequence parallelism. We also implement remove padding on Llama
+vPP) and sequence parallelism. We also implement remove padding (sequence packing) for the Llama
 model, which can be found in `modeling_llama_megatron.py `_.

 To support other model, users are required to implement:
@@ -22,4 +22,5 @@ To support other model, users are required to implement:
    (vLLM) model. Note that both the actor model and rollout model are
    partitioned during runtime. So, it's advisable to map the model name
    in actor model implementation. Otherwise, you may need an additional
-   name mapping and even weight transformation.
+   name mapping and even weight transformation. The weight loader implementation
+   is in `megatron_weight_loaders.py `_.
\ No newline at end of file
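The `megatron_weight_loaders.py` file referenced above maps HuggingFace checkpoint tensors onto Megatron's tensor-parallel shards. A minimal sketch of that idea is shown below; the helper functions and the name map are hypothetical illustrations, not verl's actual loader registry:

```python
# Hypothetical sketch of a HuggingFace -> Megatron weight loader.
# The helper names and the name map are illustrative; see verl's
# megatron_weight_loaders.py for the actual registry and mappings.
import torch


def load_column_parallel(full_weight: torch.Tensor, param: torch.Tensor,
                         tp_rank: int, tp_size: int) -> None:
    """ColumnParallelLinear shards its output dim: split the HF weight on dim 0."""
    shard = torch.chunk(full_weight, tp_size, dim=0)[tp_rank]
    assert shard.shape == param.shape, f"{shard.shape} vs {param.shape}"
    param.data.copy_(shard)


def load_row_parallel(full_weight: torch.Tensor, param: torch.Tensor,
                      tp_rank: int, tp_size: int) -> None:
    """RowParallelLinear shards its input dim: split the HF weight on dim 1."""
    shard = torch.chunk(full_weight, tp_size, dim=1)[tp_rank]
    assert shard.shape == param.shape, f"{shard.shape} vs {param.shape}"
    param.data.copy_(shard)


# Illustrative mapping from HuggingFace Llama parameter names to a loader:
# gate/up projections are column-parallel, o_proj/down_proj are row-parallel.
HF_NAME_TO_LOADER = {
    "model.layers.{i}.self_attn.o_proj.weight": load_row_parallel,
    "model.layers.{i}.mlp.down_proj.weight": load_row_parallel,
    "model.layers.{i}.mlp.gate_proj.weight": load_column_parallel,
    "model.layers.{i}.mlp.up_proj.weight": load_column_parallel,
}

if __name__ == "__main__":
    # Copy the second shard of an 8x4 column-parallel weight into a 4x4 param.
    full, param = torch.randn(8, 4), torch.empty(4, 4)
    load_column_parallel(full, param, tp_rank=1, tp_size=2)
    print(param.shape)  # torch.Size([4, 4])
```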
diff --git a/docs/start/install.rst b/docs/start/install.rst
index 5c198f3..c9e450d 100644
--- a/docs/start/install.rst
+++ b/docs/start/install.rst
@@ -1,7 +1,64 @@
 Installation
 ============

-To install the veRL, we recommend using conda:
+Requirements
+------------
+
+- **Python**: Version >= 3.9
+- **CUDA**: Version >= 12.1
+
+veRL supports various backends. Currently, the following configurations are available:
+
+- **FSDP** and **Megatron-LM** (optional) for training.
+- **vLLM** for rollout generation, with **SGLang** support coming soon.
+
+Install from docker image
+-------------------------
+
+We provide pre-built Docker images for quick setup.
+
+Image and tag: ``verlai/verl:vemlp-th2.4.0-cu124-vllm0.6.3-ray2.10-te1.7-v0.0.3``. See the files under ``docker/`` if you want to build your own image.
+
+1. Launch the desired Docker image:
+
+.. code:: bash
+
+    docker run --runtime=nvidia -it --rm --shm-size="10g" --cap-add=SYS_ADMIN -v <image:tag>
+
+2. Inside the container, install veRL:
+
+.. code:: bash
+
+    # install the nightly version (recommended)
+    git clone https://github.com/volcengine/verl && cd verl && pip3 install -e .
+    # or install from pypi via `pip3 install verl`
+
+3. Setup Megatron (optional)
+
+If you want to enable training with Megatron, Megatron code must be added to PYTHONPATH:
+
+.. code:: bash
+
+    cd ..
+    git clone -b core_v0.4.0 https://github.com/NVIDIA/Megatron-LM.git
+    cp verl/patches/megatron_v4.patch Megatron-LM/
+    cd Megatron-LM && git apply megatron_v4.patch
+    pip3 install -e .
+    export PYTHONPATH=$PYTHONPATH:$(pwd)
+
+You can also get the Megatron code after verl's patch via
+
+.. code:: bash
+
+    git clone -b core_v0.4.0_verl https://github.com/eric-haibin-lin/Megatron-LM
+
+Install from custom environment
+---------------------------------
+
+To manage the environment, we recommend using conda:

 .. code:: bash
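After step 3 above, a quick, illustrative way to confirm that the patched Megatron-LM is picked up is to import it from Python (assuming the clone and `PYTHONPATH` export succeeded):

```python
# Illustrative sanity check: run inside the container after the Megatron setup
# above, in the same shell where PYTHONPATH was exported.
import megatron
from megatron.core import parallel_state  # noqa: F401  (importable in core_v0.4.0)

print("Megatron-LM imported from:", megatron.__file__)
```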
-""" PyTorch LLaMA model.""" +"""PyTorch LLaMA model with Megatron-style acceleration.""" from typing import Optional, Tuple, Union @@ -26,7 +26,6 @@ from megatron.core import tensor_parallel from megatron.core import ModelParallelConfig from torch import nn -from torch.nn import init from transformers.modeling_outputs import BaseModelOutputWithPast from transformers.models.llama.configuration_llama import LlamaConfig from transformers.models.llama.modeling_llama import CausalLMOutputWithPast @@ -482,8 +481,6 @@ def forward(self, """ if self.pre_process: - # if torch.cuda.current_device() == 0: - # print(f'rank {torch.cuda.current_device()}: input_ids shape before embedding: {input_ids.shape}') inputs_embeds = self.embed_tokens(input_ids) # (1, total_nnz) -> (1, total_nnz, hidden_size) # vocab parallel embedding will not do sequence parallel reduce-scatter in open source megatron @@ -493,8 +490,6 @@ def forward(self, if self.megatron_config.sequence_parallel: inputs_embeds = tensor_parallel.scatter_to_sequence_parallel_region(inputs_embeds) - # if torch.cuda.current_device() == 0: - # print(f'rank {torch.cuda.current_device()}: input_embeds shape after embedding: {inputs_embeds.shape}') hidden_states = inputs_embeds else: # self.hidden_states should be passed by Megatron @@ -558,9 +553,9 @@ def _init_head(self): def _forward_head(self, hidden_states): # all_gather from sequence parallel region is performed inside lm_head - # print(f'logits shape before forward_head: {hidden_states.shape}, vocab_size = {self.config.vocab_size}') # [4, 32, 4096] + # logits shape before forward_head hidden_states.shape: [4, 32, 4096] logits = self.lm_head(hidden_states)[0] - # print(f'logits shape after forward_head: {logits.shape}') # [8, 32, 8] + # logits shape after forward_head logits.shape: [8, 32, 8] logits = logits.float() # (total_nnz_padded, 1, vocab_size // tp) return logits @@ -588,7 +583,7 @@ def forward( # remove padding here input_ids_rmpad, indices, cu_seqlens, max_seqlen_in_batch = unpad_input(input_ids.unsqueeze(dim=-1), attention_mask) # (total_nnz, 1) - # print(f'input_ids.shape = {input_ids.shape}, input_ids_rmpad.shape = {input_ids_rmpad.shape}, indices.shape = {indices.shape}, cu_seqlens[-1] = {cu_seqlens[-1]}') + # pad input_ids to multiple of tp for all tp ranks # TODO: for better performance, the sp padding should be removed at each layer. Not sure the performance gap if self.megatron_config.sequence_parallel: @@ -607,7 +602,6 @@ def forward( hidden_states = outputs # print(f'hidden_states.shape = {hidden_states.shape}') # torch.Size([4, 32, 4096]) logits = self._forward_head(hidden_states) - # print(f'logits.shape = {logits.shape}') logits = torch.squeeze(logits, dim=1) # remove the artificial batch dimension # torch.Size([8, 32, 16]) # remove padding from sequence parallel @@ -615,7 +609,6 @@ def forward( totol_nnz = cu_seqlens[-1] logits = logits[:totol_nnz] # (total_nnz_padded) # add removed padding back. If input is already rmpad, we let the caller pad_input - # print(f'logits.shape = {logits.shape}, indices.shape = {indices.shape}, batch_size = {batch_size}, seq_len = {sequence_length}') logits = pad_input(logits, indices, batch_size, seqlen=sequence_length) # (batch_size, sequence_length, vocab_size)