Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add scripts of high quality cpt experiments #62

Open
wants to merge 5 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Dataset mixture for exp1B: ja/cc-1 resharded in ascending-perplexity order.
# `basedir` values name keys under `common:`; the consumer resolves them.
common:
  v3_0_info_root: "/home/shared/corpus/llm-jp-corpus/v3.0.0/training_resharded_tokenize_ver3.0/token_info"

datasets:
  code:
    basedir: "v3_0_info_root"
    file: "2024_0410_code.sakura_home.csv"
    repeat: 0.1014
  en:
    basedir: "v3_0_info_root"
    file: "2024_0410_en.sakura_home.csv"
    repeat: 0.1014
  ja_cc1:
    # Absolute path: this token-info CSV lives in the experiment workspace,
    # not under v3_0_info_root (sorted by ascending perplexity).
    file: "/home/shared/experiments/0022_v3-high-quality-cpt/corpus/llm-jp-corpus/v3.0.0/training_resharded_sorted_tokenize_ver3.0/ppl_asc/token_info.csv"
    filter:
      - "train/ja/cc-1"
    repeat: 0.4318
  ja_wiki:
    basedir: "v3_0_info_root"
    file: "2024_0410_ja.sakura_home.csv"
    filter:
      - "train/ja/wiki"
    repeat: 2
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
# Launch 1.7B continual pretraining (exp1B: ja/cc-1 ascending-perplexity order)
# on 4 nodes x 8 GPUs via Slurm + Open MPI.
#SBATCH --job-name=0022_1.7b-high-quality-cpt-exp1B_cc1_asc_ppl
#SBATCH --partition=gpu-small
#SBATCH --nodes=4
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=8
#SBATCH --output=outputs/%x-%j.out
#SBATCH --error=outputs/%x-%j.err

set -eu -o pipefail

# change directory if each experiment will be handled as one experimental issue
EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt
CONF_DIR=exp1B_cc1_asc_ppl

ENV_DIR=${EXPERIMENT_DIR}/environment
SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt

# Shared toolchain settings and the experiment's Python virtualenv.
source "${ENV_DIR}/scripts/environment.sh"
source "${ENV_DIR}/venv/bin/activate"

# First node of the allocation is the rendezvous master; derive a
# job-unique port so concurrent jobs on a node do not collide.
export MASTER_ADDR=$(scontrol show hostname "$SLURM_JOB_NODELIST" | head -n1)
export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000)))

echo "MASTER_ADDR=${MASTER_ADDR}"

NUM_NODES=$SLURM_JOB_NUM_NODES
# SLURM_TASKS_PER_NODE looks like "8(x4)"; strip the repetition suffix.
NUM_GPUS_PER_NODE=$(echo "$SLURM_TASKS_PER_NODE" | cut -d '(' -f 1)
NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE))

echo NUM_NODES=$NUM_NODES
echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE
echo NUM_GPUS=$NUM_GPUS

# One MPI rank per GPU; -x forwards the variables the training script reads.
mpirun \
  -np "$NUM_GPUS" \
  --npernode "$NUM_GPUS_PER_NODE" \
  -bind-to none \
  -map-by slot \
  -x EXPERIMENT_DIR="$EXPERIMENT_DIR" \
  -x SCRIPT_ROOT="$SCRIPT_ROOT" \
  -x CONF_DIR="$CONF_DIR" \
  -x MASTER_ADDR="$MASTER_ADDR" \
  -x MASTER_PORT="$MASTER_PORT" \
  bash "${SCRIPT_ROOT}/train-1.7b.sh"
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
# Launch 13B continual pretraining (exp1B: ja/cc-1 ascending-perplexity order)
# on 8 nodes x 8 GPUs via Slurm + Open MPI.
#SBATCH --job-name=0022_13b-high-quality-cpt-exp1B_cc1_asc_ppl
#SBATCH --partition=gpu-small
#SBATCH --nodes=8
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=8
#SBATCH --output=outputs/%x-%j.out
#SBATCH --error=outputs/%x-%j.err

set -eu -o pipefail

# change directory if each experiment will be handled as one experimental issue
EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt
CONF_DIR=exp1B_cc1_asc_ppl

ENV_DIR=${EXPERIMENT_DIR}/environment
SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt

# Shared toolchain settings and the experiment's Python virtualenv.
source "${ENV_DIR}/scripts/environment.sh"
source "${ENV_DIR}/venv/bin/activate"

# First node of the allocation is the rendezvous master; derive a
# job-unique port so concurrent jobs on a node do not collide.
export MASTER_ADDR=$(scontrol show hostname "$SLURM_JOB_NODELIST" | head -n1)
export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000)))

echo "MASTER_ADDR=${MASTER_ADDR}"

NUM_NODES=$SLURM_JOB_NUM_NODES
# SLURM_TASKS_PER_NODE looks like "8(x8)"; strip the repetition suffix.
NUM_GPUS_PER_NODE=$(echo "$SLURM_TASKS_PER_NODE" | cut -d '(' -f 1)
NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE))

echo NUM_NODES=$NUM_NODES
echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE
echo NUM_GPUS=$NUM_GPUS

# One MPI rank per GPU; -x forwards the variables the training script reads.
mpirun \
  -np "$NUM_GPUS" \
  --npernode "$NUM_GPUS_PER_NODE" \
  -bind-to none \
  -map-by slot \
  -x EXPERIMENT_DIR="$EXPERIMENT_DIR" \
  -x SCRIPT_ROOT="$SCRIPT_ROOT" \
  -x CONF_DIR="$CONF_DIR" \
  -x MASTER_ADDR="$MASTER_ADDR" \
  -x MASTER_PORT="$MASTER_PORT" \
  bash "${SCRIPT_ROOT}/train-13b.sh"
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
# Launch 3.7B continual pretraining (exp1B: ja/cc-1 ascending-perplexity order)
# on 4 nodes x 8 GPUs via Slurm + Open MPI.
#SBATCH --job-name=0022_3.7b-high-quality-cpt-exp1B_cc1_asc_ppl
#SBATCH --partition=gpu-small
#SBATCH --nodes=4
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=8
#SBATCH --output=outputs/%x-%j.out
#SBATCH --error=outputs/%x-%j.err

set -eu -o pipefail

# change directory if each experiment will be handled as one experimental issue
EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt
CONF_DIR=exp1B_cc1_asc_ppl

ENV_DIR=${EXPERIMENT_DIR}/environment
SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt

# Shared toolchain settings and the experiment's Python virtualenv.
source "${ENV_DIR}/scripts/environment.sh"
source "${ENV_DIR}/venv/bin/activate"

# First node of the allocation is the rendezvous master; derive a
# job-unique port so concurrent jobs on a node do not collide.
export MASTER_ADDR=$(scontrol show hostname "$SLURM_JOB_NODELIST" | head -n1)
export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000)))

echo "MASTER_ADDR=${MASTER_ADDR}"

NUM_NODES=$SLURM_JOB_NUM_NODES
# SLURM_TASKS_PER_NODE looks like "8(x4)"; strip the repetition suffix.
NUM_GPUS_PER_NODE=$(echo "$SLURM_TASKS_PER_NODE" | cut -d '(' -f 1)
NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE))

echo NUM_NODES=$NUM_NODES
echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE
echo NUM_GPUS=$NUM_GPUS

# One MPI rank per GPU; -x forwards the variables the training script reads.
mpirun \
  -np "$NUM_GPUS" \
  --npernode "$NUM_GPUS_PER_NODE" \
  -bind-to none \
  -map-by slot \
  -x EXPERIMENT_DIR="$EXPERIMENT_DIR" \
  -x SCRIPT_ROOT="$SCRIPT_ROOT" \
  -x CONF_DIR="$CONF_DIR" \
  -x MASTER_ADDR="$MASTER_ADDR" \
  -x MASTER_PORT="$MASTER_PORT" \
  bash "${SCRIPT_ROOT}/train-3.7b.sh"
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Dataset mixture for exp1B: ja/cc-1 resharded in descending-perplexity order.
# `basedir` values name keys under `common:`; the consumer resolves them.
common:
  v3_0_info_root: "/home/shared/corpus/llm-jp-corpus/v3.0.0/training_resharded_tokenize_ver3.0/token_info"

datasets:
  code:
    basedir: "v3_0_info_root"
    file: "2024_0410_code.sakura_home.csv"
    repeat: 0.1014
  en:
    basedir: "v3_0_info_root"
    file: "2024_0410_en.sakura_home.csv"
    repeat: 0.1014
  ja_cc1:
    # Absolute path: this token-info CSV lives in the experiment workspace,
    # not under v3_0_info_root (sorted by descending perplexity).
    file: "/home/shared/experiments/0022_v3-high-quality-cpt/corpus/llm-jp-corpus/v3.0.0/training_resharded_sorted_tokenize_ver3.0/ppl_desc/token_info.csv"
    filter:
      - "train/ja/cc-1"
    repeat: 0.4318
  ja_wiki:
    basedir: "v3_0_info_root"
    file: "2024_0410_ja.sakura_home.csv"
    filter:
      - "train/ja/wiki"
    repeat: 2
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
# Launch 1.7B continual pretraining (exp1B: ja/cc-1 descending-perplexity order)
# on 4 nodes x 8 GPUs via Slurm + Open MPI.
#SBATCH --job-name=0022_1.7b-high-quality-cpt-exp1B_cc1_desc_ppl
#SBATCH --partition=gpu-small
#SBATCH --nodes=4
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=8
#SBATCH --output=outputs/%x-%j.out
#SBATCH --error=outputs/%x-%j.err

set -eu -o pipefail

# change directory if each experiment will be handled as one experimental issue
EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt
CONF_DIR=exp1B_cc1_desc_ppl

ENV_DIR=${EXPERIMENT_DIR}/environment
SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt

# Shared toolchain settings and the experiment's Python virtualenv.
source "${ENV_DIR}/scripts/environment.sh"
source "${ENV_DIR}/venv/bin/activate"

# First node of the allocation is the rendezvous master; derive a
# job-unique port so concurrent jobs on a node do not collide.
export MASTER_ADDR=$(scontrol show hostname "$SLURM_JOB_NODELIST" | head -n1)
export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000)))

echo "MASTER_ADDR=${MASTER_ADDR}"

NUM_NODES=$SLURM_JOB_NUM_NODES
# SLURM_TASKS_PER_NODE looks like "8(x4)"; strip the repetition suffix.
NUM_GPUS_PER_NODE=$(echo "$SLURM_TASKS_PER_NODE" | cut -d '(' -f 1)
NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE))

echo NUM_NODES=$NUM_NODES
echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE
echo NUM_GPUS=$NUM_GPUS

# One MPI rank per GPU; -x forwards the variables the training script reads.
mpirun \
  -np "$NUM_GPUS" \
  --npernode "$NUM_GPUS_PER_NODE" \
  -bind-to none \
  -map-by slot \
  -x EXPERIMENT_DIR="$EXPERIMENT_DIR" \
  -x SCRIPT_ROOT="$SCRIPT_ROOT" \
  -x CONF_DIR="$CONF_DIR" \
  -x MASTER_ADDR="$MASTER_ADDR" \
  -x MASTER_PORT="$MASTER_PORT" \
  bash "${SCRIPT_ROOT}/train-1.7b.sh"
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
# Launch 13B continual pretraining (exp1B: ja/cc-1 descending-perplexity order)
# on 8 nodes x 8 GPUs via Slurm + Open MPI.
#SBATCH --job-name=0022_13b-high-quality-cpt-exp1B_cc1_desc_ppl
#SBATCH --partition=gpu-small
#SBATCH --nodes=8
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=8
#SBATCH --output=outputs/%x-%j.out
#SBATCH --error=outputs/%x-%j.err

set -eu -o pipefail

# change directory if each experiment will be handled as one experimental issue
EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt
CONF_DIR=exp1B_cc1_desc_ppl

ENV_DIR=${EXPERIMENT_DIR}/environment
SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt

# Shared toolchain settings and the experiment's Python virtualenv.
source "${ENV_DIR}/scripts/environment.sh"
source "${ENV_DIR}/venv/bin/activate"

# First node of the allocation is the rendezvous master; derive a
# job-unique port so concurrent jobs on a node do not collide.
export MASTER_ADDR=$(scontrol show hostname "$SLURM_JOB_NODELIST" | head -n1)
export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000)))

echo "MASTER_ADDR=${MASTER_ADDR}"

NUM_NODES=$SLURM_JOB_NUM_NODES
# SLURM_TASKS_PER_NODE looks like "8(x8)"; strip the repetition suffix.
NUM_GPUS_PER_NODE=$(echo "$SLURM_TASKS_PER_NODE" | cut -d '(' -f 1)
NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE))

echo NUM_NODES=$NUM_NODES
echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE
echo NUM_GPUS=$NUM_GPUS

# One MPI rank per GPU; -x forwards the variables the training script reads.
mpirun \
  -np "$NUM_GPUS" \
  --npernode "$NUM_GPUS_PER_NODE" \
  -bind-to none \
  -map-by slot \
  -x EXPERIMENT_DIR="$EXPERIMENT_DIR" \
  -x SCRIPT_ROOT="$SCRIPT_ROOT" \
  -x CONF_DIR="$CONF_DIR" \
  -x MASTER_ADDR="$MASTER_ADDR" \
  -x MASTER_PORT="$MASTER_PORT" \
  bash "${SCRIPT_ROOT}/train-13b.sh"
Original file line number Diff line number Diff line change
@@ -0,0 +1,45 @@
#!/bin/bash
# Launch 3.7B continual pretraining (exp1B: ja/cc-1 descending-perplexity order)
# on 4 nodes x 8 GPUs via Slurm + Open MPI.
#SBATCH --job-name=0022_3.7b-high-quality-cpt-exp1B_cc1_desc_ppl
#SBATCH --partition=gpu-small
#SBATCH --nodes=4
#SBATCH --gpus-per-node=8
#SBATCH --ntasks-per-node=8
#SBATCH --output=outputs/%x-%j.out
#SBATCH --error=outputs/%x-%j.err

set -eu -o pipefail

# change directory if each experiment will be handled as one experimental issue
EXPERIMENT_DIR=/home/shared/experiments/0022_v3-high-quality-cpt
CONF_DIR=exp1B_cc1_desc_ppl

ENV_DIR=${EXPERIMENT_DIR}/environment
SCRIPT_ROOT=${EXPERIMENT_DIR}/scripts/pretrain/scripts/v3-high-quality-cpt

# Shared toolchain settings and the experiment's Python virtualenv.
source "${ENV_DIR}/scripts/environment.sh"
source "${ENV_DIR}/venv/bin/activate"

# First node of the allocation is the rendezvous master; derive a
# job-unique port so concurrent jobs on a node do not collide.
export MASTER_ADDR=$(scontrol show hostname "$SLURM_JOB_NODELIST" | head -n1)
export MASTER_PORT=$((10000 + (SLURM_JOBID % 50000)))

echo "MASTER_ADDR=${MASTER_ADDR}"

NUM_NODES=$SLURM_JOB_NUM_NODES
# SLURM_TASKS_PER_NODE looks like "8(x4)"; strip the repetition suffix.
NUM_GPUS_PER_NODE=$(echo "$SLURM_TASKS_PER_NODE" | cut -d '(' -f 1)
NUM_GPUS=$((NUM_NODES * NUM_GPUS_PER_NODE))

echo NUM_NODES=$NUM_NODES
echo NUM_GPUS_PER_NODE=$NUM_GPUS_PER_NODE
echo NUM_GPUS=$NUM_GPUS

# One MPI rank per GPU; -x forwards the variables the training script reads.
mpirun \
  -np "$NUM_GPUS" \
  --npernode "$NUM_GPUS_PER_NODE" \
  -bind-to none \
  -map-by slot \
  -x EXPERIMENT_DIR="$EXPERIMENT_DIR" \
  -x SCRIPT_ROOT="$SCRIPT_ROOT" \
  -x CONF_DIR="$CONF_DIR" \
  -x MASTER_ADDR="$MASTER_ADDR" \
  -x MASTER_PORT="$MASTER_PORT" \
  bash "${SCRIPT_ROOT}/train-3.7b.sh"
30 changes: 30 additions & 0 deletions pretrain/scripts/v3-high-quality-cpt/exp4A/data_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Dataset mixture for exp4A: en + v3.1 warp-pdf subsets + remaining ja subsets.
# `basedir` values name keys under `common:`; the consumer resolves them.
common:
  v3_0_info_root: "/home/shared/corpus/llm-jp-corpus/v3.0.0/training_resharded_tokenize_ver3.0/token_info"
  v3_1_info_root: "/home/shared/corpus/llm-jp-corpus/v3.1.0/tokenize/v3.0b1/token_info"

datasets:
  en:
    basedir: "v3_0_info_root"
    file: "2024_0410_en.sakura_home.csv"
    repeat: 0.1658
  ja_v3_1_pdf00:
    basedir: "v3_1_info_root"
    file: "2024_0718_ja_train2.sakura_home.csv"
    filter:
      - "train2/ja/warp-pdf-e00"
    repeat: 0.1043
  ja_v3_1_pdf02:
    basedir: "v3_1_info_root"
    file: "2024_0718_ja_train2.sakura_home.csv"
    filter:
      - "train2/ja/warp-pdf-e02"
    repeat: 0.0522
  ja_other:
    basedir: "v3_0_info_root"
    file: "2024_0410_ja.sakura_home.csv"
    filter:
      - "train/ja/cc"
      - "train/ja/kaken"
      - "train/ja/warp-html"
      - "train/ja/wiki"
    repeat: 0.1043
Loading