[pull] main from NVIDIA:main #26

Merged: 63 commits, Oct 25, 2024

Commits
df69965
[JAX] Fix unit tests to work around cuDNN 9.4 regression of 0 length…
mgoldfarb-nvidia Sep 16, 2024
af5daa0
Add dtensor support for TE optimizers (#1171)
blahBlahhhJ Sep 16, 2024
d2d4cf9
Update CI users (#1181)
timmoon10 Sep 16, 2024
9101a78
[JAX] Context Parallel Attention with All-Gather (#1106)
mgoldfarb-nvidia Sep 17, 2024
44fd316
[Common] Default CUDA_HOME to /usr/local/cuda when dynamically loadin…
denera Sep 17, 2024
528d44b
Changed VERSION to 1.12.0.dev
ptrendx Sep 17, 2024
28f95bd
Allow specifying cmake setup directory (#1186)
ryxli Sep 17, 2024
eb60b1a
Add docs for installing from PyPI (#1184)
ksivaman Sep 17, 2024
7e1068b
[PyTorch] Port fused optimizer tests to pytest (#1185)
timmoon10 Sep 18, 2024
841634c
[PyTorch] Check network interface name when initializing Userbuffers …
denera Sep 18, 2024
c0caadb
Expose `rotary_base` as an arg instead of hardcoding (#944)
sudhakarsingh27 Sep 18, 2024
0ee5ccd
[PyTorch] Relax the contiguous check for flash attention (#1176)
yaox12 Sep 19, 2024
195d703
Allow downloading of model weights automatically (#1172)
sudhakarsingh27 Sep 20, 2024
0c74535
Restore compatibility with Python 3.8 (#1189)
ptrendx Sep 20, 2024
a68acd7
Update list of CI users (#1198)
timmoon10 Sep 23, 2024
99af5c0
Allow passing architectures like 90a without being overridden (#1178)
aurianer Sep 24, 2024
a44cb72
Update list of CI users (#1203)
ksivaman Sep 24, 2024
209b8e5
fix NVTE_UB_WITH_MPI read (#1194)
erhoo82 Sep 25, 2024
c4a5cb8
[PyTorch] Add GroupedLinear to the docs and fix typos (#1206)
pggPL Sep 27, 2024
8a1b7ee
[PyTorch] Fix detection of 3 in 3hd/h3d layouts (#1187)
cyanguwa Sep 27, 2024
7b152a8
Fix CP unit test on A100 and L40s (#1211)
xrennvidia Sep 27, 2024
728c558
[PyTorch] Add pool argument to make_graphed_callable (#1218)
ksivaman Oct 1, 2024
46075b9
[PyTorch] Fix distributed testing (#1219)
ksivaman Oct 1, 2024
fb74961
Removed the unused options from GroupedLinear docs and fixed the bug …
ptrendx Oct 1, 2024
10cceae
[PyTorch] Move `block_table` argument to FA varlen function (#1222)
cyanguwa Oct 3, 2024
9d976bc
[PyTorch] Minor optimizations to reduce CPU overheads in modules (#1191)
timmoon10 Oct 4, 2024
f8eb799
[PyTorch] remove duplicate code (#1215)
emmanuel-ferdman Oct 6, 2024
60f738f
Tests for distributed (#1196)
pggPL Oct 7, 2024
c24a4c4
Hierarchical CP implementation (Ulysses + Ring) (#1209)
xrennvidia Oct 7, 2024
c3b3cd2
Fix cuDNN sliding window size (#1212)
cyanguwa Oct 7, 2024
e762592
[PyTorch] Miscellaneous fixes for FA3 attention (#1174)
cyanguwa Oct 8, 2024
5b89f1a
[PyTorch] Debug dtype casting in operation-based API (#1202)
timmoon10 Oct 9, 2024
2d87552
[PyTorch] Add documentation for FP8 attention checkpointing (#1223)
cyanguwa Oct 9, 2024
5b6546c
[PyTorch] Improve `get_qkv_layout` (#1214)
cyanguwa Oct 9, 2024
85e60e6
[JAX] Expose sliding window attn to TE-JAX API (#1205)
huanghua1994 Oct 10, 2024
3b89c36
Small fixes to Float8Tensor (#1225)
ptrendx Oct 10, 2024
9ee2dbd
Fix bug in torch compile and seqdim is integer (#1217)
wplf Oct 11, 2024
b36bd0a
Add FlashAttention3 to CP implementations (#1232)
xrennvidia Oct 11, 2024
55dcbb4
[PyTorch] Let Fused RoPE support CP with THD format (#1238)
yaox12 Oct 12, 2024
86f07be
Do not link against CUDA driver when building (#1240)
timmoon10 Oct 14, 2024
20c55e4
Check for backend support in Jax context parallel fused attention tes…
mgoldfarb-nvidia Oct 15, 2024
54aa12a
Create README.md for examples/ (#1221)
sbhavani Oct 15, 2024
f6b766b
[PyTorch] Build custom ORT ops before running ONNX export tests (#1252)
timmoon10 Oct 16, 2024
43b9e1e
fix assertion bug for SWA API in TE-JAX (#1242)
kocchop Oct 16, 2024
161b1d9
[PyTorch] Drop FA as an installation requirement (#1226)
cyanguwa Oct 16, 2024
6e90fcb
Upgrade pylint to 3.3.1 (#1257)
ksivaman Oct 16, 2024
a518151
[PyTorch] Fix FP8 activation recompute (#1254)
ksivaman Oct 16, 2024
9001081
Changed VERSION to 1.13.0.dev
ptrendx Oct 16, 2024
2d7020e
[PyTorch] Fix wgrads for GroupedLinear when weights don't require gra…
yaox12 Oct 17, 2024
8e97c8d
[Bugfix] Fix bias for 0-dim tensors in gemm (#1246)
yaox12 Oct 17, 2024
12f30ea
[TE/JAX] Enabling CudaGraph for custom calls with FFI (#1228)
phu0ngng Oct 17, 2024
a488b8b
Fix seq_dim in CP implementation (#1264)
xrennvidia Oct 17, 2024
41fe1e5
[PyTorch] Reorganize L1 tests (#1255)
timmoon10 Oct 18, 2024
927bca7
[Paddle] Debug wheel test (#1265)
timmoon10 Oct 18, 2024
3ea7dd3
[PyTorch] Remove PyTorch L0 distributed test (#1273)
timmoon10 Oct 18, 2024
29e3a09
[PyTorch] Reduce the number of FA versions in L3 tests (#1280)
cyanguwa Oct 21, 2024
7b18f23
Fused Attention Support 64-bit Ragged Offsets for Large THD Tensors (…
mgoldfarb-nvidia Oct 22, 2024
35f7d26
[JAX] Skip V100 encoder tests (#1262)
zlsh80826 Oct 22, 2024
d9b4bfb
Add THD + GQA supports (#1260)
zlsh80826 Oct 22, 2024
20c7529
[JAX] Fix correctness of JAX fused attention with CP and improve nume…
mgoldfarb-nvidia Oct 24, 2024
18c2234
[JAX] XLA Custom Calls with FFI for FusedAttnFwd, Quantize, Transpose…
huanghua1994 Oct 24, 2024
7a5fd0c
[Paddle] Update type names for Paddle 3.0 (#1286)
timmoon10 Oct 24, 2024
7b284fe
[Pytorch] Check gradient in test numerics (#1229)
pggPL Oct 24, 2024
4 changes: 4 additions & 0 deletions .github/workflows/trigger-ci.yml
@@ -35,6 +35,10 @@ jobs:
|| github.actor == 'xrennvidia'
|| github.actor == 'yaox12'
|| github.actor == 'huanghua1994'
|| github.actor == 'mgoldfarb-nvidia'
|| github.actor == 'pggPL'
|| github.actor == 'vasunvidia'
|| github.actor == 'erhoo82'
)
steps:
- name: Check if comment is issued by authorized person
1 change: 1 addition & 0 deletions .gitignore
@@ -39,3 +39,4 @@ develop-eggs/
dist/
downloads/
.pytest_cache/
compile_commands.json
10 changes: 9 additions & 1 deletion README.rst
@@ -174,7 +174,15 @@ To install the latest stable version of Transformer Engine,
pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable
This will automatically detect if any supported deep learning frameworks are installed and build Transformer Engine support for them. To explicitly specify frameworks, set the environment variable NVTE_FRAMEWORK to a comma-separated list (e.g. NVTE_FRAMEWORK=jax,pytorch).
This will automatically detect if any supported deep learning frameworks are installed and build Transformer Engine support for them. To explicitly specify frameworks, set the environment variable NVTE_FRAMEWORK to a comma-separated list (e.g. NVTE_FRAMEWORK=jax,pytorch,paddle).

Alternatively, the package can be directly installed from `Transformer Engine's PyPI <https://pypi.org/project/transformer-engine/>`_, e.g.

.. code-block:: bash
pip install transformer_engine[pytorch]
To obtain the necessary Python bindings for Transformer Engine, the frameworks needed must be explicitly specified as extra dependencies in a comma-separated list (e.g. [jax,pytorch,paddle]). Transformer Engine ships wheels for the core library as well as the PaddlePaddle extensions. Source distributions are shipped for the JAX and PyTorch extensions.

From source
^^^^^^^^^^^
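The two install paths covered in the README hunk above can be driven the same way; a minimal sketch, with the framework selections purely illustrative:

    # Build the stable branch from source, restricting the build to chosen frameworks
    NVTE_FRAMEWORK=pytorch,jax pip install git+https://github.com/NVIDIA/TransformerEngine.git@stable

    # Or install from PyPI, naming the required frameworks as extras
    pip install "transformer_engine[pytorch,jax]"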
2 changes: 1 addition & 1 deletion build_tools/VERSION.txt
@@ -1 +1 @@
1.11.0.dev0
1.13.0.dev0
8 changes: 6 additions & 2 deletions build_tools/build_ext.py
@@ -106,8 +106,12 @@ def run(self) -> None:
if isinstance(ext, CMakeExtension):
print(f"Building CMake extension {ext.name}")
# Set up incremental builds for CMake extensions
setup_dir = Path(__file__).resolve().parent.parent
build_dir = setup_dir / "build" / "cmake"
build_dir = os.getenv("NVTE_CMAKE_BUILD_DIR")
if build_dir:
build_dir = Path(build_dir).resolve()
else:
root_dir = Path(__file__).resolve().parent.parent
build_dir = root_dir / "build" / "cmake"

# Ensure the directory exists
build_dir.mkdir(parents=True, exist_ok=True)
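The build_ext.py change above lets the CMake build tree be placed anywhere via NVTE_CMAKE_BUILD_DIR, falling back to build/cmake under the repo root when unset. A hedged usage sketch from a source checkout (the directory path is illustrative):

    # Keep a persistent CMake build directory for faster incremental rebuilds
    NVTE_CMAKE_BUILD_DIR=$HOME/.cache/te-cmake pip install .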
10 changes: 5 additions & 5 deletions build_tools/pytorch.py
@@ -80,15 +80,15 @@ def setup_pytorch_extension(
)
)

if "80" in cuda_architectures:
nvcc_flags.extend(["-gencode", "arch=compute_80,code=sm_80"])
if "90" in cuda_architectures:
nvcc_flags.extend(["-gencode", "arch=compute_90,code=sm_90"])
for arch in cuda_architectures.split(";"):
if arch == "70":
continue # Already handled
nvcc_flags.extend(["-gencode", f"arch=compute_{arch},code=sm_{arch}"])

# Libraries
library_dirs = []
libraries = []
if os.getenv("NVTE_UB_WITH_MPI"):
if bool(int(os.getenv("NVTE_UB_WITH_MPI", 0))):
assert (
os.getenv("MPI_HOME") is not None
), "MPI_HOME must be set when compiling with NVTE_UB_WITH_MPI=1"
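With the stricter parsing above, NVTE_UB_WITH_MPI is read as an integer flag (so "0" now disables it instead of counting as a non-empty string), and MPI_HOME must be set when it is enabled. A hedged build invocation (the MPI path is illustrative):

    # Build the PyTorch extension with Userbuffers compiled against MPI
    NVTE_UB_WITH_MPI=1 MPI_HOME=/opt/openmpi pip install .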
3 changes: 3 additions & 0 deletions docs/api/pytorch.rst
@@ -9,6 +9,9 @@ pyTorch
.. autoapiclass:: transformer_engine.pytorch.Linear(in_features, out_features, bias=True, **kwargs)
:members: forward, set_tensor_parallel_group

.. autoapiclass:: transformer_engine.pytorch.GroupedLinear(in_features, out_features, bias=True, **kwargs)
:members: forward, set_tensor_parallel_group

.. autoapiclass:: transformer_engine.pytorch.LayerNorm(hidden_size, eps=1e-5, **kwargs)

.. autoapiclass:: transformer_engine.pytorch.RMSNorm(hidden_size, eps=1e-5, **kwargs)
9 changes: 6 additions & 3 deletions docs/examples/te_llama/te_llama.py
@@ -102,8 +102,11 @@ def from_pretrained_local(cls, pretrained_model_name_or_path, *args, config, **k
Custom method adapted from `from_pretrained` method in HuggingFace
Transformers repo: https://github.com/huggingface/transformers/blob/f497f564bb76697edab09184a252fc1b1a326d1e/src/transformers/modeling_utils.py#L2579
"""
vanilla_model = cls(config).to(kwargs["torch_dtype"])
is_local = os.path.isdir(pretrained_model_name_or_path)
# Before loading the model, set the default dtype for torch
torch.set_default_dtype(kwargs["torch_dtype"])

# Load the vanilla model weights
vanilla_model = cls(config)
subfolder = ""
variant = None
if os.path.isfile(
@@ -133,7 +136,7 @@ def from_pretrained_local(cls, pretrained_model_name_or_path, *args, config, **k
else:
raise AssertionError("Only sharded PyTorch ckpt format supported at the moment")

resolved_archive_file, sharded_metadata = get_checkpoint_shard_files(
resolved_archive_file, _ = get_checkpoint_shard_files(
pretrained_model_name_or_path,
archive_file,
)
65 changes: 46 additions & 19 deletions docs/examples/te_llama/tutorial_accelerate_hf_llama_with_te.ipynb
@@ -247,15 +247,24 @@
"restart_jupyter_notebook()\n",
"\n",
"\n",
"# Import necessary packages and methods\n",
"# Import necessary packages, methods and variables\n",
"from utils import *\n",
"\n",
"\n",
"# Default hyperparams, also defined in `utils.py` in class `Hyperparameters`\n",
"## !!! `model_name` attr must point to the location of the model weights !!!\n",
"# For Llama 2, download weights from https://huggingface.co/meta-llama/Llama-2-7b-hf (Hugging Face weight format).\n",
"# For Llama 3, download weights from https://huggingface.co/meta-llama/Meta-Llama-3-8B (Hugging Face weight format).\n",
"hyperparams.model_name = \"\" # <== Add model weight location here e.g. \"/path/to/downloaded/llama/weights\"\n",
"# Provide Huggingface Access Token\n",
"hyperparams.hf_access_token = \"\"\n",
"assert hyperparams.hf_access_token, \"Provide a HF API Access Token!\"\n",
"\n",
"# Provide a directory to cache weights in to avoid downloading them every time.\n",
"# (By default, weights are cached in `~/.cache/huggingface/hub/models`)\n",
"hyperparams.weights_cache_dir = \"\"\n",
"\n",
"# For Llama 2, uncomment this line (also set by default)\n",
"hyperparams.model_name = \"meta-llama/Llama-2-7b-hf\"\n",
"\n",
"# For Llama 3, uncomment this line\n",
"# hyperparams.model_name = \"meta-llama/Meta-Llama-3-8B\"\n",
"\n",
"hyperparams.mixed_precision = \"bf16\"\n",
"\n",
"\n",
@@ -554,7 +563,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 1,
"id": "bdb34b91",
"metadata": {},
"outputs": [
@@ -573,15 +582,24 @@
"restart_jupyter_notebook()\n",
"\n",
"\n",
"# Import necessary packages and methods\n",
"# Import necessary packages, methods and variables\n",
"from utils import *\n",
"\n",
"\n",
"# Default hyperparams, also defined in `utils.py` in class `Hyperparameters`\n",
"## !!! `model_name` attr must point to the location of the model weights !!!\n",
"# For Llama 2, download weights from https://huggingface.co/meta-llama/Llama-2-7b-hf (Hugging Face weight format).\n",
"# For Llama 3, download weights from https://huggingface.co/meta-llama/Meta-Llama-3-8B (Hugging Face weight format).\n",
"hyperparams.model_name = \"\" # <== Add model weight location here e.g. \"/path/to/downloaded/llama/weights\"\n",
"# Provide Huggingface Access Token\n",
"hyperparams.hf_access_token = \"\"\n",
"assert hyperparams.hf_access_token, \"Provide a HF API Access Token!\"\n",
"\n",
"# Provide a directory to cache weights in to avoid downloading them every time.\n",
"# (By default, weights are cached in `~/.cache/huggingface/hub/models`)\n",
"hyperparams.weights_cache_dir = \"\"\n",
"\n",
"# For Llama 2, uncomment this line (also set by default)\n",
"hyperparams.model_name = \"meta-llama/Llama-2-7b-hf\"\n",
"\n",
"# For Llama 3, uncomment this line\n",
"# hyperparams.model_name = \"meta-llama/Meta-Llama-3-8B\"\n",
"\n",
"hyperparams.mixed_precision = \"bf16\"\n",
"\n",
"\n",
@@ -653,15 +671,24 @@
"restart_jupyter_notebook()\n",
"\n",
"\n",
"# Import necessary packages and methods\n",
"# Import necessary packages, methods and variables\n",
"from utils import *\n",
"\n",
"\n",
"# Default hyperparams, also defined in `utils.py` in class `Hyperparameters`\n",
"## !!! `model_name` attr must point to the location of the model weights !!!\n",
"# For Llama 2, download weights from https://huggingface.co/meta-llama/Llama-2-7b-hf (Hugging Face weight format).\n",
"# For Llama 3, download weights from https://huggingface.co/meta-llama/Meta-Llama-3-8B (Hugging Face weight format).\n",
"hyperparams.model_name = \"\" # <== Add model weight location here e.g. \"/path/to/downloaded/llama/weights\"\n",
"# Provide Huggingface Access Token\n",
"hyperparams.hf_access_token = \"\"\n",
"assert hyperparams.hf_access_token, \"Provide a HF API Access Token!\"\n",
"\n",
"# Provide a directory to cache weights in to avoid downloading them every time.\n",
"# (By default, weights are cached in `~/.cache/huggingface/hub/models`)\n",
"hyperparams.weights_cache_dir = \"\"\n",
"\n",
"# For Llama 2, uncomment this line (also set by default)\n",
"hyperparams.model_name = \"meta-llama/Llama-2-7b-hf\"\n",
"\n",
"# For Llama 3, uncomment this line\n",
"# hyperparams.model_name = \"meta-llama/Meta-Llama-3-8B\"\n",
"\n",
"hyperparams.mixed_precision = \"fp8\"\n",
"\n",
"\n",
56 changes: 51 additions & 5 deletions docs/examples/te_llama/utils.py
@@ -25,7 +25,10 @@
class HyperParameters:
def __init__(self):
self.mixed_precision = "bf16"
# self.model_name = "" # <== Add model weight location here

# Set to Meta Llama 2 by default.
self.model_name = "meta-llama/Llama-2-7b-hf"

self.dataset_name = "timdettmers/openassistant-guanaco"
self.dataset_text_field = "text"
self.learning_rate = 1.41e-5
@@ -35,6 +38,10 @@ def __init__(self):
self.num_warmup_steps = 5
self.num_training_steps = 10

# This is either provided by the user or it will be set when the
# model weights are downloaded.
self.weights_cache_dir = ""


hyperparams = HyperParameters()

@@ -76,13 +83,49 @@ def tokenize(element):
return train_dataloader


def ensure_model_is_downloaded(hyperparams):
assert hyperparams.model_name in [
"meta-llama/Meta-Llama-3-8B",
"meta-llama/Llama-2-7b-hf",
], "Only Meta Llama 2 7B and Meta Llama 3 8B models are supported!"

# Login using Huggingface Hub API
from huggingface_hub import login

try:
login(hyperparams.hf_access_token)
except Exception as e:
if "Invalid token passed!" in str(e):
print(
"Please pass a valid HF Access Token! More info at"
" https://huggingface.co/docs/hub/en/security-tokens."
)
else:
print(f"Exception is {e}")

# Download the model if it doesn't exist
from huggingface_hub import snapshot_download

supplied_cache_dir = (
hyperparams.weights_cache_dir if hyperparams.weights_cache_dir != "" else None
)
hyperparams.weights_cache_dir = snapshot_download(
repo_id=hyperparams.model_name, cache_dir=supplied_cache_dir
)

print(f"Model cache directory : {hyperparams.weights_cache_dir}")


def init_baseline_model(hyperparams):
# Download and cache the weights
ensure_model_is_downloaded(hyperparams)

# Init the model
config = AutoConfig.from_pretrained(hyperparams.model_name)
config = AutoConfig.from_pretrained(hyperparams.weights_cache_dir)
# make sure to use flash_attention to do iso comparison with TELlamaModel
config._attn_implementation = "flash_attention_2"
model = AutoModelForCausalLM.from_pretrained(
hyperparams.model_name,
hyperparams.weights_cache_dir,
config=config,
torch_dtype=torch.bfloat16,
)
@@ -94,13 +137,16 @@ def init_baseline_model(hyperparams):


def init_te_llama_model(hyperparams):
# Download and cache the weights
ensure_model_is_downloaded(hyperparams)

# Init the model
from te_llama import TELlamaForCausalLM

config = AutoConfig.from_pretrained(hyperparams.model_name)
config = AutoConfig.from_pretrained(hyperparams.weights_cache_dir)
config._attn_implementation = "flash_attention_2"
model = TELlamaForCausalLM.from_pretrained_local(
hyperparams.model_name,
hyperparams.weights_cache_dir,
config=config,
torch_dtype=torch.bfloat16,
)
75 changes: 75 additions & 0 deletions docs/faq.rst
@@ -0,0 +1,75 @@
..
Copyright (c) 2022-2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved.

See LICENSE for license information.

Frequently Asked Questions (FAQ)
================================

FP8 checkpoint compatibility
----------------------------

Transformer Engine starts to support FP8 attention in 1.6. It stores the FP8 metadata, i.e. scaling factors and amax histories, under a `._extra_state` key in the checkpoint. As the FP8 attention support expands from one backend to multiple backends, the location of the `._extra_state` key has also shifted.

Here, we take the `MultiheadAttention` module as an example. Its FP8 attention metadata in Transformer Engine 1.11 is stored as `core_attention._extra_state` as shown below.

.. code-block:: python

>>> from transformer_engine.pytorch import MultiheadAttention, fp8_model_init
>>> with fp8_model_init(enabled=True):
... mha = MultiheadAttention(
... hidden_size=1024,
... num_attention_heads=16,
... bias=True,
... params_dtype=torch.bfloat16,
... input_layernorm=False,
... fuse_qkv_params=True,
... attention_type="self",
... qkv_weight_interleaved=True,
... ).to(dtype=torch.bfloat16, device="cuda")
...
>>> state_dict = mha.state_dict()
>>> print(state_dict.keys())
odict_keys(['qkv.weight', 'qkv.bias', 'qkv._extra_state', 'core_attention._extra_state', 'proj.weight', 'proj.bias', 'proj._extra_state'])

Here is a full list of the checkpoint save/load behaviors from all Transformer Engine versions.

.. list-table::

* - **Version: <= 1.5**

- Saves no FP8 metadata since FP8 attention is not supported
- Loading behavior for checkpoints created by the following versions:

:<= 1.5: Loads no FP8 metadata
:> 1.5: Error: unexpected key
* - **Version: 1.6, 1.7**

- Saves FP8 metadata to `core_attention.fused_attention._extra_state`
- Loading behavior for checkpoints created by the following versions:

:<= 1.5: Initializes FP8 metadata to the default, i.e. 1s for scaling factors, and 0s for amaxes
:1.6, 1.7: Loads FP8 metadata from checkpoint
:>= 1.8: Error: unexpected key
* - **Version: >=1.8, <= 1.11**

- Saves FP8 metadata to `core_attention._extra_state`
- Loading behavior for checkpoints created by the following versions:

:<= 1.5: Initializes FP8 metadata to the default, i.e. 1s for scaling factors, and 0s for amaxes
:1.6, 1.7: This save/load combination relies on users to map the 1.6/1.7 key to the 1.8-1.11 key. Otherwise, it initializes FP8 metadata to the default, i.e. 1s for scaling factors, and 0s for amaxes. The mapping can be done, in this `MultiheadAttention` example, by

.. code-block:: python

>>> state_dict["core_attention._extra_state"] = \
state_dict["core_attention.fused_attention._extra_state"]
>>> del state_dict["core_attention.fused_attention._extra_state"]

:>= 1.8: Loads FP8 metadata from checkpoint
* - **Version: >=1.12**

- Saves FP8 metadata to `core_attention._extra_state`
- Loading behavior for checkpoints created by the following versions:

:<= 1.5: Initializes FP8 metadata to the default, i.e. 1s for scaling factors, and 0s for amaxes
:>= 1.6: Loads FP8 metadata from checkpoint
1 change: 1 addition & 0 deletions docs/index.rst
@@ -30,6 +30,7 @@ Transformer Engine documentation

installation
examples/quickstart.ipynb
faq

.. toctree::
:hidden: