# demo_in_readme.yaml (91 lines, 81 loc, 2.72 KB)
# Workflow header: name, trigger, and environment shared by all jobs.
name: demo-in-readme

# Runs for pull requests targeting main or develop. The run is skipped when
# every changed file is under docs/ or is a Markdown file.
on:
  pull_request:
    branches:
      - 'main'
      - 'develop'
    paths-ignore:
      - 'docs/**'
      - '**.md'

env:
  # Literal shell snippet; nothing evaluates it here. It only expands when it
  # is pasted into a `run:` script through ${{ env.WORKSPACE_PREFIX }}, where
  # the shell then yields fields 1-4 of $GITHUB_WORKSPACE split on '/'.
  WORKSPACE_PREFIX: $(echo $GITHUB_WORKSPACE |cut -d '/' -f 1-4)
  # Slurm partition handed to `srun -p` by the convert-model-then-load job.
  SLURM_PARTITION: llm_s
# Three jobs, all on the self-hosted `t_cluster` runner label. None declares
# `needs:`, so no ordering between them is enforced by this file.
#
# NOTE(review): $path_prefix and ${evo_env_torch21_flash2} are not defined in
# this workflow; presumably they come from the runner's environment — confirm.
jobs:
  # Runs the two data-preparation scripts under ci_scripts/data.
  dataset-preparation:
    runs-on: [t_cluster]
    steps:
      # Register the values as masks so they are redacted from the job logs.
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
      - uses: actions/checkout@v3
      - name: raw-chinese-data
        run: |
          source activate ${evo_env_torch21_flash2}
          sh ./ci_scripts/data/tokenizer_chinese.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
      - name: alpaca-data
        run: |
          source activate ${evo_env_torch21_flash2}
          sh ./ci_scripts/data/tokenizer_alpaca.sh

  # Runs the training and checkpoint-loading scripts under ci_scripts/train.
  train:
    runs-on: [t_cluster]
    timeout-minutes: 30
    steps:
      # Register the values as masks so they are redacted from the job logs.
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
      - uses: actions/checkout@v3
      - name: slurm-train
        id: basic_train
        run: |
          source activate ${evo_env_torch21_flash2}
          sh ./ci_scripts/train/slurm_train.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
      # Runs only when the job is already failing and the slurm-train step
      # (id basic_train) is the step that concluded with failure.
      - name: load_preset_ckpt
        if: ${{ failure() && steps.basic_train.conclusion == 'failure' }}
        run: |
          source activate ${evo_env_torch21_flash2}
          export PYTHONPATH=$PWD:$PYTHONPATH
          sh ./ci_scripts/train/load_ckpt.sh 7B_load_preset_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
      # No `if:` here, so the default success() condition applies: this step
      # and the next are skipped once any earlier step has failed.
      - name: load_new_ckpt
        run: |
          source activate ${evo_env_torch21_flash2}
          export PYTHONPATH=$PWD:$PYTHONPATH
          sh ./ci_scripts/train/load_ckpt.sh 7B_load_new_ckpt ${GITHUB_RUN_ID}-${GITHUB_JOB}
          rm -rf $GITHUB_WORKSPACE/llm_ckpts
      - name: torchrun-train
        run: |
          source activate ${evo_env_torch21_flash2}
          sh ./ci_scripts/train/torchrun.sh ${GITHUB_RUN_ID}-${GITHUB_JOB}
          rm -rf $GITHUB_WORKSPACE/llm_ckpts

  # Converts the model with convert_to_hf.sh, then runs
  # loaded_as_transformer.py from inside ./hf_ckpt through Slurm.
  convert-model-then-load:
    runs-on: [t_cluster]
    timeout-minutes: 15
    steps:
      # Register the values as masks so they are redacted from the job logs.
      - name: mask env
        run: |
          echo "::add-mask::${{env.WORKSPACE_PREFIX}}"
          echo "::add-mask::$path_prefix"
      - uses: actions/checkout@v3
      - name: convert-model-then-load
        run: |
          source activate ${evo_env_torch21_flash2}
          export PYTHONPATH=$PWD:$PYTHONPATH
          sh ./ci_scripts/model/convert_to_hf.sh
          cd ./hf_ckpt
          jobname=${GITHUB_RUN_ID}-${GITHUB_JOB}-${GITHUB_RUN_ATTEMPT}
          # The default `run` shell is `bash -e`, which aborts the script as
          # soon as srun fails. Capturing the status with `||` keeps the script
          # alive so the cleanup and the cancellation check below still run.
          exit_code=0
          srun -p ${SLURM_PARTITION} --kill-on-bad-exit=1 --job-name=$jobname --gpus-per-task=2 \
            python ../ci_scripts/model/loaded_as_transformer.py || exit_code=$?
          cd ..
          rm -rf $GITHUB_WORKSPACE/hf_ckpt
          # The helper is the last command, so its exit status becomes the
          # step's result.
          # NOTE(review): confirm it exits non-zero for a genuine srun failure.
          sh ./ci_scripts/common/check_slurm_cancled.sh $exit_code $jobname