-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Showing
10 changed files
with
499 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,2 @@ | ||
text-to-sql* | ||
checkpoints |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
__pycache__ | ||
.DS_Store | ||
text-to-sql* | ||
checkpoints | ||
*.png |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
# Finetuning Mistral-7B using LoRA and DeepSpeed | ||
|
||
We finetune [Mistral-7B](https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.2) using [LoRA](https://arxiv.org/abs/2106.09685) and [DeepSpeed](https://github.com/microsoft/DeepSpeed). We ran LoRA on two 40 GB A100 GPUs utilizing DeepSpeed. | ||
|
||
To get started, first install Determined on your local machine: | ||
```bash | ||
pip install determined | ||
``` | ||
|
||
Then finetune with LoRA: | ||
```bash | ||
det e create lora.yaml . | ||
``` | ||
|
||
You can view the actual training code in `finetune.py`. | ||
|
||
|
||
## Configuration | ||
|
||
Change configuration options in `lora.yaml`. Some important options are: | ||
- `slots_per_trial`: the number of GPUs to use. | ||
- `dataset_subset`: the difficulty subset to train on (`easy`, `medium`, or `hard`). | ||
- `per_device_train_batch_size`: the batch size per GPU. | ||
|
||
|
||
DeepSpeed configuration files are in the `ds_configs` folder. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
# Chat template (Jinja syntax) that set_special_tokens assigns to
# tokenizer.chat_template for the TinyLlama model. Each message is rendered as
# "<|im_start|>{role}\n{content}<|im_end|>"; user and system content is
# stripped of surrounding whitespace, assistant content is emitted as-is.
CHAT_ML_TEMPLATE = """ | ||
{% for message in messages %} | ||
{% if message['role'] == 'user' %} | ||
{{'<|im_start|>user\n' + message['content'].strip() + '<|im_end|>' }} | ||
{% elif message['role'] == 'system' %} | ||
{{'<|im_start|>system\n' + message['content'].strip() + '<|im_end|>' }} | ||
{% elif message['role'] == 'assistant' %} | ||
{{'<|im_start|>assistant\n' + message['content'] + '<|im_end|>' }} | ||
{% endif %} | ||
{% endfor %} | ||
""" | ||
|
||
|
||
# Marker that closes every message in CHAT_ML_TEMPLATE; set_special_tokens
# installs it as tokenizer.eos_token for the TinyLlama model.
CHAT_ML_EOS_TOKEN = "<|im_end|>" | ||
|
||
|
||
def get_chat_format(element, model_name, with_assistant_response=True):
    """Build the list of chat messages for one text-to-SQL example.

    Args:
        element: Mapping providing ``instruction`` and ``input`` for the user
            prompt, plus ``response`` when the assistant turn is requested.
        model_name: Model identifier. For
            ``mistralai/Mistral-7B-Instruct-v0.2`` the system prompt is
            prepended to the user message; for any other name it is emitted
            as a separate ``system`` message.
        with_assistant_response: When true, append an ``assistant`` message
            holding ``element["response"]``.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dictionaries.
    """
    system_prompt = (
        "You are a helpful programmer assistant that excels at SQL. "
        "When prompted with a task and a definition of an SQL table, you "
        "respond with a SQL query to retrieve information from the table. "
        "Don't explain your reasoning, only provide the SQL query."
    )
    task_template = "Task: {instruction}\nSQL table: {input}\nSQL query: "
    user_content = task_template.format_map(element)

    messages = []
    if model_name == "mistralai/Mistral-7B-Instruct-v0.2":
        # No separate system turn for this model: fold the system prompt
        # into the user message instead.
        user_content = system_prompt + "\n" + user_content
    else:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": user_content})

    if with_assistant_response:
        messages.append({"role": "assistant", "content": element["response"]})

    return messages
|
||
|
||
def set_special_tokens(tokenizer, model_name):
    """Configure chat template, EOS and padding tokens on ``tokenizer`` in place.

    For ``TinyLlama/TinyLlama-1.1B-Chat-v0.4`` the ChatML template and EOS
    token are installed. Afterwards, if the tokenizer has no pad token id,
    the EOS token id is reused for padding.
    """
    uses_chat_ml = model_name == "TinyLlama/TinyLlama-1.1B-Chat-v0.4"
    if uses_chat_ml:
        tokenizer.chat_template = CHAT_ML_TEMPLATE
        tokenizer.eos_token = CHAT_ML_EOS_TOKEN

    # Leave an existing pad token alone; only fill the gap with EOS.
    if tokenizer.pad_token_id is not None:
        return
    tokenizer.pad_token_id = tokenizer.eos_token_id
|
||
|
||
def get_assistant_prompt(model_name):
    """Return the text that precedes the assistant's reply for ``model_name``.

    ``TinyLlama/TinyLlama-1.1B-Chat-v0.4`` gets the ChatML assistant header;
    every other model name gets ``[/INST]``.
    """
    is_chat_ml = model_name == "TinyLlama/TinyLlama-1.1B-Chat-v0.4"
    return "<|im_start|>assistant\n" if is_chat_ml else "[/INST]"
|
||
|
||
def get_response_template_ids(tokenizer, model_name):
    """Encode the model's assistant prompt without adding special tokens.

    Returns whatever ``tokenizer.encode`` produces for the string returned by
    ``get_assistant_prompt(model_name)``.
    """
    assistant_prompt = get_assistant_prompt(model_name)
    return tokenizer.encode(assistant_prompt, add_special_tokens=False)
|
||
|
||
def maybe_add_generation_prompt(x, model_name):
    """Append the assistant prompt to ``x`` for the TinyLlama model only.

    For any other model name ``x`` is returned unchanged.
    """
    is_chat_ml = model_name == "TinyLlama/TinyLlama-1.1B-Chat-v0.4"
    if not is_chat_ml:
        return x
    return x + get_assistant_prompt(model_name)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
import datasets | ||
import pandas as pd | ||
|
||
|
||
def add_length_column(dataset) -> pd.DataFrame:
    """Convert ``dataset`` to a DataFrame and add a ``total_length`` column.

    ``total_length`` is the per-row sum of whitespace-separated word counts
    across the ``instruction``, ``input`` and ``response`` columns.

    Args:
        dataset: Object exposing ``to_pandas()``.

    Returns:
        The DataFrame produced by ``to_pandas()`` with ``total_length`` added.
    """
    frame = dataset.to_pandas()
    frame["total_length"] = 0
    for text_column in ("instruction", "input", "response"):
        # Cells are cast to str first, so every value contributes the word
        # count of its string form.
        words_per_row = frame[text_column].astype(str).str.split()
        frame["total_length"] = frame["total_length"] + words_per_row.apply(len)

    return frame
|
||
|
||
def filter_by_total_length(df, difficulty, number_of_samples):
    """Select up to ``number_of_samples`` rows in the length bucket for ``difficulty``.

    Buckets are defined on the ``total_length`` column, with both bounds
    inclusive: ``easy`` is 10-100, ``medium`` is 101-200, ``hard`` is 201-800.

    Args:
        df: DataFrame containing a ``total_length`` column.
        difficulty: One of ``"easy"``, ``"medium"`` or ``"hard"``.
        number_of_samples: Maximum number of rows to keep, taken from the top
            of the filtered frame.

    Returns:
        The filtered DataFrame, truncated to ``number_of_samples`` rows.

    Raises:
        ValueError: If ``difficulty`` is not a known bucket. Previously this
            case fell through and returned ``None``, which only failed later
            when the caller tried to use the result as a DataFrame.
    """
    length_ranges = {
        "easy": (10, 100),
        "medium": (101, 200),
        "hard": (201, 800),
    }
    bounds = length_ranges.get(difficulty)
    if bounds is None:
        raise ValueError(
            f"Unknown difficulty {difficulty!r}; "
            f"expected one of {sorted(length_ranges)}"
        )

    lower, upper = bounds
    in_bucket = df["total_length"].between(lower, upper)
    return df[in_bucket].iloc[:number_of_samples]
|
||
|
||
def get_dataset_subset_name(difficulty: str) -> str:
    """Return the dataset name used for the given difficulty subset."""
    return "text-to-sql-v1-{}".format(difficulty)
|
||
|
||
def create_and_save_datasets(
    df, difficulty, train_ratio=0.8, val_ratio=0.1, test_ratio=0.1
):
    """Split ``df`` into train/valid/test, save the result to disk and return it.

    Args:
        df: DataFrame that includes a ``total_length`` column, which is
            dropped before the dataset is built.
        difficulty: Subset name; determines the save path via
            ``get_dataset_subset_name``.
        train_ratio: Fraction of rows assigned to the training split.
        val_ratio: Relative weight of the validation split within the
            held-out rows.
        test_ratio: Relative weight of the test split within the held-out
            rows.

    Returns:
        A ``datasets.DatasetDict`` with ``train``, ``valid`` and ``test`` keys.
    """
    split_seed = 123

    # total_length was only needed for filtering, so it is not stored.
    full_dataset = datasets.Dataset.from_pandas(
        df.drop(columns=["total_length"]), preserve_index=False
    )

    # First cut: training rows versus everything held out.
    train_and_holdout = full_dataset.train_test_split(
        train_size=train_ratio, seed=split_seed
    )

    # Second cut: divide the held-out rows between validation and test.
    test_share = test_ratio / (test_ratio + val_ratio)
    valid_and_test = train_and_holdout["test"].train_test_split(
        test_size=test_share, seed=split_seed
    )

    splits = datasets.DatasetDict(
        {
            "train": train_and_holdout["train"],
            "valid": valid_and_test["train"],
            "test": valid_and_test["test"],
        }
    )
    save_path = get_dataset_subset_name(difficulty)
    splits.save_to_disk(save_path)
    return splits
|
||
|
||
def load_dataset(difficulty):
    """Load the previously saved dataset for ``difficulty`` from disk."""
    saved_path = get_dataset_subset_name(difficulty)
    return datasets.load_from_disk(saved_path)
|
||
|
||
def load_or_create_dataset(difficulty, num_samples=10000):
    """Return the saved dataset for ``difficulty``, building it if missing.

    If loading from disk raises ``FileNotFoundError``, the
    ``Clinton/Text-to-sql-v1`` training split is loaded, its ``text`` and
    ``source`` columns are removed, rows are filtered by length for
    ``difficulty`` (at most ``num_samples`` rows), and the resulting splits
    are created, saved and returned.
    """
    try:
        return load_dataset(difficulty)
    except FileNotFoundError:
        raw_train = datasets.load_dataset("Clinton/Text-to-sql-v1")["train"]
        trimmed = raw_train.remove_columns(["text", "source"])
        selected = filter_by_total_length(
            add_length_column(trimmed), difficulty, num_samples
        )
        return create_and_save_datasets(selected, difficulty)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
{ | ||
"fp16": { | ||
"enabled": "auto", | ||
"loss_scale": 0, | ||
"loss_scale_window": 1000, | ||
"initial_scale_power": 16, | ||
"hysteresis": 2, | ||
"min_loss_scale": 1 | ||
}, | ||
"bf16": { | ||
"enabled": "auto" | ||
}, | ||
"optimizer": { | ||
"type": "AdamW", | ||
"params": { | ||
"lr": "auto", | ||
"betas": "auto", | ||
"eps": "auto", | ||
"weight_decay": "auto" | ||
} | ||
}, | ||
"scheduler": { | ||
"type": "WarmupDecayLR", | ||
"params": { | ||
"warmup_min_lr": "auto", | ||
"warmup_max_lr": "auto", | ||
"warmup_num_steps": "auto", | ||
"total_num_steps": "auto" | ||
} | ||
}, | ||
"zero_optimization": { | ||
"stage": 3, | ||
"overlap_comm": true, | ||
"contiguous_gradients": true, | ||
"sub_group_size": 1e9, | ||
"reduce_bucket_size": "auto", | ||
"stage3_prefetch_bucket_size": "auto", | ||
"stage3_param_persistence_threshold": "auto", | ||
"stage3_max_live_parameters": 1e9, | ||
"stage3_max_reuse_distance": 1e9, | ||
"stage3_gather_16bit_weights_on_model_save": true | ||
}, | ||
"gradient_accumulation_steps": "auto", | ||
"gradient_clipping": "auto", | ||
"train_batch_size": "auto", | ||
"train_micro_batch_size_per_gpu": "auto" | ||
} |
Oops, something went wrong.