From e2333ddf6f037d8fcc79ed62f71e051df7257a81 Mon Sep 17 00:00:00 2001 From: zzhhjjj Date: Tue, 23 Apr 2024 12:53:12 +0000 Subject: [PATCH 1/6] num_samples --- run_generate.py | 11 +++++++---- src/nanotron/generation/decode.py | 10 ++++++++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/run_generate.py b/run_generate.py index 0f52b8ed..419e0ba3 100644 --- a/run_generate.py +++ b/run_generate.py @@ -78,7 +78,7 @@ def main(): tp=args.tp or config.parallelism.tp, pp_engine=OneForwardOneBackwardPipelineEngine(), tp_mode=TensorParallelLinearMode.ALL_REDUCE, - tp_linear_async_communication=True, + tp_linear_async_communication=False, ) # Initialise all process groups @@ -164,9 +164,12 @@ def main(): tokenizer.padding_side = "left" tokenizer.truncation_side = "left" # TODO @nouamane: do we want this? dummy_inputs = [ - # "Passage: Daniel went back to the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?\nAnswer:", + "Passage: Daniel went back to the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?\nAnswer:", "def fib(n)", - # "This film was probably inspired by Godzilla", + "This film was probably inspired by Godzilla", + "The future of AI is", + "Advancements in technology will lead to", + "Tomorrow's world is shaped by", ] outputs = decode_text( @@ -177,7 +180,7 @@ def main(): parallel_context=parallel_context, max_new_tokens=args.max_new_tokens, max_micro_batch_size=2, - generation_config=GenerationArgs(sampler="greedy", use_cache=True), + generation_config=GenerationArgs(sampler="top_k", use_cache=True, n_samples=2), tokenizer_config=TokenizerConfig(max_input_length=None), is_bench=os.environ.get("USE_BENCH", "0") == "1", ) diff --git a/src/nanotron/generation/decode.py b/src/nanotron/generation/decode.py index f6c1f1a8..5ebafa80 100644 --- a/src/nanotron/generation/decode.py +++ b/src/nanotron/generation/decode.py @@ -190,6 +190,16 @@ def decode_text( p2p = model.p2p + if generation_config and generation_config.n_samples: + assert isinstance(generation_config.n_samples, int) and generation_config.n_samples > 0 + if sampler_type != SamplerType.TOP_P and sampler_type != SamplerType.TOP_K: + raise ValueError("Only support n_samples for TOP_P and TOP_K sampler") + new_input_iter = [] + for input in input_iter: + for _ in range(generation_config.n_samples): + new_input_iter.append(GenerationInput(text=input.text)) + input_iter = new_input_iter + # That's annoying but I need this as soon as there's a change communication "cross" pipeline_state = PipelineEvalBatchState() with attach_pipeline_state_to_model(model=model, pipeline_state=pipeline_state): From 47476ea344f1e20508988df13f6fa4f75febd9a1 Mon Sep 17 00:00:00 2001 From: zzhhjjj Date: Thu, 25 Apr 2024 09:24:14 +0000 Subject: [PATCH 2/6] add comment.remove assert --- run_generate.py | 2 +- src/nanotron/generation/decode.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/run_generate.py b/run_generate.py index 419e0ba3..04f75cf0 100644 --- a/run_generate.py +++ b/run_generate.py @@ -180,7 +180,7 @@ def main(): parallel_context=parallel_context, max_new_tokens=args.max_new_tokens, max_micro_batch_size=2, - generation_config=GenerationArgs(sampler="top_k", use_cache=True, n_samples=2), + generation_config=GenerationArgs(sampler="top_k", top_k=10, use_cache=True, 
n_samples=2), tokenizer_config=TokenizerConfig(max_input_length=None), is_bench=os.environ.get("USE_BENCH", "0") == "1", ) diff --git a/src/nanotron/generation/decode.py b/src/nanotron/generation/decode.py index 5ebafa80..f0f010bf 100644 --- a/src/nanotron/generation/decode.py +++ b/src/nanotron/generation/decode.py @@ -190,8 +190,8 @@ def decode_text( p2p = model.p2p + # replicate input for n_samples times when using TOP_P or TOP_K samplers, in order to get diverse results if generation_config and generation_config.n_samples: - assert isinstance(generation_config.n_samples, int) and generation_config.n_samples > 0 if sampler_type != SamplerType.TOP_P and sampler_type != SamplerType.TOP_K: raise ValueError("Only support n_samples for TOP_P and TOP_K sampler") new_input_iter = [] From 20a3aebb28c5f90fac67e282f7cc4c672c46c0e2 Mon Sep 17 00:00:00 2001 From: zzhhjjj Date: Thu, 25 Apr 2024 12:15:02 +0000 Subject: [PATCH 3/6] some examples --- run_generate.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/run_generate.py b/run_generate.py index 04f75cf0..03a81564 100644 --- a/run_generate.py +++ b/run_generate.py @@ -164,10 +164,10 @@ def main(): tokenizer.padding_side = "left" tokenizer.truncation_side = "left" # TODO @nouamane: do we want this? dummy_inputs = [ + "The future of AI is", "Passage: Daniel went back to the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?\nAnswer:", "def fib(n)", "This film was probably inspired by Godzilla", - "The future of AI is", "Advancements in technology will lead to", "Tomorrow's world is shaped by", ] @@ -180,7 +180,7 @@ def main(): parallel_context=parallel_context, max_new_tokens=args.max_new_tokens, max_micro_batch_size=2, - generation_config=GenerationArgs(sampler="top_k", top_k=10, use_cache=True, n_samples=2), + generation_config=GenerationArgs(sampler="top_k", top_k=50, use_cache=False, n_samples=2), tokenizer_config=TokenizerConfig(max_input_length=None), is_bench=os.environ.get("USE_BENCH", "0") == "1", ) From 5f025dc5b263673c8ad4053dfb08d8f16cd8ac83 Mon Sep 17 00:00:00 2001 From: zzhhjjj Date: Fri, 26 Apr 2024 12:47:12 +0000 Subject: [PATCH 4/6] fix some bugs --- run_train.py | 2 +- src/nanotron/trainer.py | 11 ++++------- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/run_train.py b/run_train.py index 8dc16f7a..617d231b 100644 --- a/run_train.py +++ b/run_train.py @@ -133,7 +133,7 @@ def get_dataloader_from_data_stage( ) assert num_tokens_needed_for_training <= total_tokens_dataset, ( f"Dataset is too small for steps ({total_tokens_dataset} < {num_tokens_needed_for_training}), " - f"Try train_steps<={len(dataloader.dataset) // trainer.global_batch_size + trainer.start_iteration_step}" + f"Try train_steps<={len(dataloader.dataset) // trainer.global_batch_size + trainer.iteration_step}" ) else: raise ValueError(f"Unhandled case of `self.config.data.dataset`. Got: {data.dataset}") diff --git a/src/nanotron/trainer.py b/src/nanotron/trainer.py index b23b99b3..c2f1808a 100644 --- a/src/nanotron/trainer.py +++ b/src/nanotron/trainer.py @@ -656,13 +656,10 @@ def init_model(self) -> Union[NanotronModel, DistributedDataParallel]: rank=0, ) else: - log_rank( - f"Setting max_position_embeddings to {self.config.tokens.sequence_length}. 
Previous value was {self.model_config.max_position_embeddings}.", - logger=logger, - level=logging.INFO, - rank=0, - ) - self.model_config.max_position_embeddings = self.config.tokens.sequence_length + # Rather than indirectly setting max_position_embeddings to tokens.sequence_length, we should directly assert their equality. + assert ( + self.config.tokens.sequence_length == self.model_config.max_position_embeddings + ), "The tokenizer's sequence length does not match the model's maximum position embeddings." log_rank("Config:\n" + pformat(self.config), logger=logger, level=logging.INFO, rank=0) log_rank("Model Config:\n" + pformat(self.model_config), logger=logger, level=logging.INFO, rank=0) From 24cfbbd6d7b99d6bd670e87955f07547294e4b71 Mon Sep 17 00:00:00 2001 From: zzhhjjj Date: Fri, 26 Apr 2024 17:09:56 +0000 Subject: [PATCH 5/6] clean code --- run_generate.py | 4 ++-- src/nanotron/generation/decode.py | 8 +++----- src/nanotron/trainer.py | 1 - 3 files changed, 5 insertions(+), 8 deletions(-) diff --git a/run_generate.py b/run_generate.py index 03a81564..afdf32b4 100644 --- a/run_generate.py +++ b/run_generate.py @@ -167,7 +167,7 @@ def main(): "The future of AI is", "Passage: Daniel went back to the garden. Mary travelled to the kitchen. Sandra journeyed to the kitchen. Sandra went to the hallway. John went to the bedroom. Mary went back to the garden. Where is Mary?\nAnswer:", "def fib(n)", - "This film was probably inspired by Godzilla", + 'Here is an extract from a webpage: "Have you ever experienced heel pain after a heavy physical activity, or even right after a long period of standing? If you regard this as something usual and normal, then think again. Miscalled as heel pain, plantar fasciitis causes these frequent mild pains experienced in the soles of the feet. It is the inflammation and enlargement the plantar fascia tissue that is located in the heels of the feet, stretching to the base of the toes. This tissue is responsible for absorbing shock in the feet and for supporting the arches. It also plays a vital role in foot movements during walking and standing. Many factors such as excessive walking, standing, and running trigger heel pain and plantar fasciitis. A sudden increase in intensity of activities, increase in weight, and abrupt change of footwear also cause the swelling of the ligament. Non-supportive footwear lacking arch cushions and improper and worn out running or training can also lead to the problem. It is also most evident among those". Write an extensive and detailed course unit suitable for a textbook targeted at college students, related to the given extract, within the context of "Medicine". Do not just list concepts, but develop each one in detail before moving to the next, as we prioritize depth of understanding and comprehensive exploration of the subject matter over breadth. Focus on: - Rigor: Ensure in-depth coverage of the concepts/sections. - Engagement: Write with an academic, professional and engaging tone that captivates interest. - Application: Incorporate specific, practical examples, such as proofs in calculus or critical dates and figures in history. Do not include a title or an introduction, simply write the content without headlines and introductory phrases. 
Do not use images.', "Advancements in technology will lead to", "Tomorrow's world is shaped by", ] @@ -180,7 +180,7 @@ def main(): parallel_context=parallel_context, max_new_tokens=args.max_new_tokens, max_micro_batch_size=2, - generation_config=GenerationArgs(sampler="top_k", top_k=50, use_cache=False, n_samples=2), + generation_config=GenerationArgs(sampler="greedy", use_cache=True), tokenizer_config=TokenizerConfig(max_input_length=None), is_bench=os.environ.get("USE_BENCH", "0") == "1", ) diff --git a/src/nanotron/generation/decode.py b/src/nanotron/generation/decode.py index f0f010bf..6ab71fad 100644 --- a/src/nanotron/generation/decode.py +++ b/src/nanotron/generation/decode.py @@ -194,11 +194,9 @@ def decode_text( if generation_config and generation_config.n_samples: if sampler_type != SamplerType.TOP_P and sampler_type != SamplerType.TOP_K: raise ValueError("Only support n_samples for TOP_P and TOP_K sampler") - new_input_iter = [] - for input in input_iter: - for _ in range(generation_config.n_samples): - new_input_iter.append(GenerationInput(text=input.text)) - input_iter = new_input_iter + input_iter = [ + GenerationInput(text=input.text) for input in input_iter for _ in range(generation_config.n_samples) + ] # That's annoying but I need this as soon as there's a change communication "cross" pipeline_state = PipelineEvalBatchState() diff --git a/src/nanotron/trainer.py b/src/nanotron/trainer.py index c2f1808a..0eda00dc 100644 --- a/src/nanotron/trainer.py +++ b/src/nanotron/trainer.py @@ -656,7 +656,6 @@ def init_model(self) -> Union[NanotronModel, DistributedDataParallel]: rank=0, ) else: - # Rather than indirectly setting max_position_embeddings to tokens.sequence_length, we should directly assert their equality. assert ( self.config.tokens.sequence_length == self.model_config.max_position_embeddings ), "The tokenizer's sequence length does not match the model's maximum position embeddings." 
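A side illustration (a standalone sketch, not code from the patches above): PATCH 1/6 replicates every prompt n_samples times, and PATCH 5/6 rewrites that replication as a list comprehension, so that a stochastic sampler (top-k / top-p) can return several different completions per prompt; other samplers are rejected with a ValueError, presumably because greedy decoding would only produce identical duplicates. The snippet below mirrors that pattern. GenerationInput is redefined here as a stand-in dataclass so the snippet runs on its own, and replicate_inputs is a hypothetical helper, not a nanotron API.

from dataclasses import dataclass
from typing import List, Optional

@dataclass
class GenerationInput:
    # Stand-in for nanotron's GenerationInput (src/nanotron/generation/decode.py).
    text: str

def replicate_inputs(inputs: List[GenerationInput], n_samples: Optional[int]) -> List[GenerationInput]:
    # Repeat each prompt n_samples times; the replicated list keeps prompt order,
    # so completions can later be regrouped per prompt in chunks of n_samples.
    if not n_samples:
        return inputs
    return [GenerationInput(text=inp.text) for inp in inputs for _ in range(n_samples)]

if __name__ == "__main__":
    prompts = [GenerationInput(text="The future of AI is"), GenerationInput(text="def fib(n)")]
    assert [g.text for g in replicate_inputs(prompts, n_samples=2)] == [
        "The future of AI is",
        "The future of AI is",
        "def fib(n)",
        "def fib(n)",
    ]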
From 693628ec17040b1b41d926ed536481889f46491c Mon Sep 17 00:00:00 2001 From: NouamaneTazi Date: Tue, 30 Apr 2024 16:20:07 +0000 Subject: [PATCH 6/6] update config_tiny_llama.py --- examples/config_tiny_llama.py | 30 +++++++++----- examples/config_tiny_llama.yaml | 70 +++++++++------------------------ 2 files changed, 39 insertions(+), 61 deletions(-) diff --git a/examples/config_tiny_llama.py b/examples/config_tiny_llama.py index dfbee136..765a353d 100644 --- a/examples/config_tiny_llama.py +++ b/examples/config_tiny_llama.py @@ -81,11 +81,26 @@ tp_linear_async_communication=True, ) -tokens = TokensArgs(sequence_length=32, train_steps=10, micro_batch_size=2, batch_accumulation_per_replica=1) +tokens = TokensArgs(sequence_length=256, train_steps=15, micro_batch_size=2, batch_accumulation_per_replica=1) -dataset = PretrainDatasetsArgs( - hf_dataset_or_datasets="HuggingFaceH4/testing_alpaca_small", text_column_name="completion" -) +data_stages = [ + DatasetStageArgs( + name="Stable Training Stage", + start_training_step=1, + data=DataArgs( + dataset=PretrainDatasetsArgs(hf_dataset_or_datasets="stas/openwebtext-10k", text_column_name="text"), + seed=seed, + ), + ), + DatasetStageArgs( + name="Annealing Phase", + start_training_step=10, + data=DataArgs( + dataset=PretrainDatasetsArgs(hf_dataset_or_datasets="stas/openwebtext-10k", text_column_name="text"), + seed=seed, + ), + ), +] checkpoints_path = os.path.dirname(os.path.dirname(__file__)) + "/checkpoints" os.makedirs(checkpoints_path, exist_ok=True) @@ -99,12 +114,7 @@ optimizer=optimizer, logging=LoggingArgs(), tokens=tokens, - data_stages=[ - DatasetStageArgs( - name="Stable Training Stage", start_training_step=1, data=DataArgs(dataset=dataset, seed=seed) - ), - DatasetStageArgs(name="Annealing Phase", start_training_step=10, data=DataArgs(dataset=dataset, seed=seed)), - ], + data_stages=data_stages, profiler=None, ) diff --git a/examples/config_tiny_llama.yaml b/examples/config_tiny_llama.yaml index 0e87c663..ab358b05 100644 --- a/examples/config_tiny_llama.yaml +++ b/examples/config_tiny_llama.yaml @@ -1,6 +1,6 @@ checkpoints: checkpoint_interval: 10 - checkpoints_path: /fsx/ferdinandmom/ferdinand-hf/nanotron/checkpoints + checkpoints_path: /fsx/nouamane/projects/nanotron/checkpoints checkpoints_path_is_shared_file_system: false resume_checkpoint_path: null save_initial_state: false @@ -10,9 +10,9 @@ data_stages: dataset_overwrite_cache: false dataset_processing_num_proc_per_process: 1 hf_dataset_config_name: null - hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small + hf_dataset_or_datasets: stas/openwebtext-10k hf_dataset_splits: train - text_column_name: completion + text_column_name: text num_loading_workers: 1 seed: 42 name: Stable Training Stage @@ -22,9 +22,9 @@ data_stages: dataset_overwrite_cache: false dataset_processing_num_proc_per_process: 1 hf_dataset_config_name: null - hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small + hf_dataset_or_datasets: stas/openwebtext-10k hf_dataset_splits: train - text_column_name: completion + text_column_name: text num_loading_workers: 1 seed: 42 name: Annealing Phase @@ -37,24 +37,28 @@ general: run: tiny_llama_%date_%jobid seed: 42 step: null +lighteval: null +logging: + iteration_step_info_interval: 1 + log_level: info + log_level_replica: info model: ddp_bucket_cap_mb: 25 dtype: bfloat16 init_method: std: 0.025 - # use_mup: true # uncomment this and comment the std line above to use spectral µTransfer make_vocab_size_divisible_by: 1 model_config: bos_token_id: 1 
eos_token_id: 2 hidden_act: silu - hidden_size: 32 + hidden_size: 16 initializer_range: 0.02 - intermediate_size: 128 + intermediate_size: 64 is_llama_config: true max_position_embeddings: 256 num_attention_heads: 4 - num_hidden_layers: 10 + num_hidden_layers: 2 num_key_value_heads: 4 pad_token_id: null pretraining_tp: 1 @@ -67,11 +71,11 @@ optimizer: accumulate_grad_in_fp32: true clip_grad: 1.0 learning_rate_scheduler: - learning_rate: 0.001 + learning_rate: 0.0003 lr_decay_starting_step: null - lr_decay_steps: null + lr_decay_steps: 13 lr_decay_style: cosine - lr_warmup_steps: 2000 # 20% of the total steps + lr_warmup_steps: 2 lr_warmup_style: linear min_decay_lr: 1.0e-05 optimizer_factory: @@ -85,37 +89,12 @@ optimizer: parallelism: dp: 2 expert_parallel_size: 1 - pp: 1 + pp: 2 pp_engine: 1f1b tp: 2 tp_linear_async_communication: true tp_mode: REDUCE_SCATTER -data_stages: - - name: Stable Training Stage - start_training_step: 1 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 1 - hf_dataset_config_name: null - hf_dataset_or_datasets: HuggingFaceH4/testing_alpaca_small - hf_dataset_splits: train - text_column_name: completion - num_loading_workers: 1 - seed: 42 - - name: Annealing Phase - start_training_step: 10 - data: - dataset: - dataset_overwrite_cache: false - dataset_processing_num_proc_per_process: 1 - hf_dataset_config_name: null - hf_dataset_or_datasets: HuggingFaceH4/testing_codealpaca_small - hf_dataset_splits: train - text_column_name: completion - num_loading_workers: 1 - seed: 42 -lighteval: null +profiler: null tokenizer: tokenizer_max_length: null tokenizer_name_or_path: gpt2 @@ -125,17 +104,6 @@ tokens: limit_test_batches: 0 limit_val_batches: 0 micro_batch_size: 2 - sequence_length: 32 + sequence_length: 256 train_steps: 15 val_check_interval: -1 -checkpoints: - checkpoint_interval: 10 - checkpoints_path: checkpoints - checkpoints_path_is_shared_file_system: false - resume_checkpoint_path: checkpoints - save_initial_state: false -profiler: null -logging: - iteration_step_info_interval: 1 - log_level: info - log_level_replica: info
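The updated examples/config_tiny_llama.py builds data_stages explicitly: a "Stable Training Stage" starting at step 1 and an "Annealing Phase" starting at step 10, both reading stas/openwebtext-10k, with train_steps=15 and sequence_length=256 kept in sync between the Python config and the YAML (sequence_length has to match the model's max_position_embeddings after the assert introduced in PATCH 4/6). Below is a small standalone sketch of a consistency check one might run over such a stage schedule; check_data_stages and the (name, start_step) tuples are illustrative assumptions, not nanotron APIs.

from typing import List, Tuple

def check_data_stages(stages: List[Tuple[str, int]], train_steps: int) -> None:
    # A stage schedule is consistent if the first stage starts at step 1,
    # start steps are strictly increasing, and every start falls within train_steps.
    starts = [start for _, start in stages]
    assert starts[0] == 1, "the first stage must start at training step 1"
    assert starts == sorted(set(starts)), "stage start steps must be strictly increasing"
    assert all(1 <= s <= train_steps for s in starts), "stage starts must lie within train_steps"

if __name__ == "__main__":
    # Values copied from the patched config: two stages, train_steps=15.
    check_data_stages([("Stable Training Stage", 1), ("Annealing Phase", 10)], train_steps=15)
    print("data_stages schedule is consistent")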