DPO Part 1: SFT
In [ ]:
! pip install transformers trl peft accelerate datasets bitsandbytes
In [ ]:
from huggingface_hub import notebook_login
notebook_login()
In [ ]:
import os
from dataclasses import dataclass, field
from typing import Optional

import torch
from datasets import load_dataset
from peft import AutoPeftModelForCausalLM, LoraConfig
from tqdm import tqdm
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, HfArgumentParser, TrainingArguments
from trl import SFTTrainer
from trl.trainer import ConstantLengthDataset


def chars_token_ratio(dataset, tokenizer, nb_examples=400):
    """
    Estimate the average number of characters per token in the dataset.
    """
    total_characters, total_tokens = 0, 0
    for _, example in tqdm(zip(range(nb_examples), iter(dataset)), total=nb_examples):
        text = prepare_sample_text(example)
        total_characters += len(text)
        if tokenizer.is_fast:
            total_tokens += len(tokenizer(text).tokens())
        else:
            total_tokens += len(tokenizer.tokenize(text))
    return total_characters / total_tokens


def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )


def prepare_sample_text(example):
    """Prepare the text from a sample of the dataset."""
    text = f"{example['prompt']}\n {example['response']}"
    return text


def create_datasets(tokenizer):
    # The "test" split of Dahoas/full-hh-rlhf is loaded to keep the run small,
    # then re-split into train/validation sets.
    dataset = load_dataset(
        "Dahoas/full-hh-rlhf",
        split="test",
        use_auth_token=True,
    )
    dataset = dataset.train_test_split(test_size=0.005, seed=None)
    train_data = dataset["train"]
    valid_data = dataset["test"]
    print(f"Size of the train set: {len(train_data)}. Size of the validation set: {len(valid_data)}")

    chars_per_token = chars_token_ratio(train_data, tokenizer)
    print(f"The character to token ratio of the dataset is: {chars_per_token:.2f}")

    # Pack formatted samples into constant-length sequences for efficient training
    train_dataset = ConstantLengthDataset(
        tokenizer,
        train_data,
        formatting_func=prepare_sample_text,
        infinite=True,
        seq_length=256,
        chars_per_token=chars_per_token,
    )
    valid_dataset = ConstantLengthDataset(
        tokenizer,
        valid_data,
        formatting_func=prepare_sample_text,
        infinite=False,
        seq_length=256,
        chars_per_token=chars_per_token,
    )
    return train_dataset, valid_dataset


# 4-bit (NF4) quantization config for QLoRA-style fine-tuning
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

base_model = AutoModelForCausalLM.from_pretrained(
    "TabbyML/SantaCoder-1B",
    quantization_config=bnb_config,
    device_map={"": 0},
    trust_remote_code=True,
    use_auth_token=True,
)
base_model.config.use_cache = False

# LoRA adapter applied to the fused attention projection
peft_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["c_attn"],
    bias="none",
    task_type="CAUSAL_LM",
)

tokenizer = AutoTokenizer.from_pretrained("TabbyML/SantaCoder-1B", trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"  # Fix weird overflow issue with fp16 training

training_args = TrainingArguments(
    output_dir="sft_santacoder1b",
    per_device_train_batch_size=10,
    gradient_accumulation_steps=2,
    per_device_eval_batch_size=10,
    learning_rate=2e-4,
    logging_steps=10,
    max_steps=105,
    report_to="tensorboard",
    save_steps=100,
    lr_scheduler_type="cosine",
    warmup_steps=1,
    optim="paged_adamw_32bit",
    fp16=True,
    remove_unused_columns=False,
    run_name="sft_santacoder1b",
)

train_dataset, eval_dataset = create_datasets(tokenizer)

trainer = SFTTrainer(
    model=base_model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    peft_config=peft_config,
    packing=True,
    max_seq_length=None,
    tokenizer=tokenizer,
    args=training_args,
)
trainer.train()
trainer.save_model("sft_santacoder1b")

# Save the LoRA adapter separately before merging it into the base model
output_dir = os.path.join("sft_santacoder1b", "final_checkpoint")
trainer.model.save_pretrained(output_dir)

# Free memory for merging weights
del base_model
torch.cuda.empty_cache()

model = AutoPeftModelForCausalLM.from_pretrained(output_dir, device_map="auto", torch_dtype=torch.float16)
model = model.merge_and_unload()

output_merged_dir = os.path.join("sft_santacoder1b", "final_merged_checkpoint")
model.save_pretrained(output_merged_dir, safe_serialization=True)
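As a quick sanity check (not part of the original notebook), the merged checkpoint can be reloaded like any regular Transformers model and prompted directly. This is only a sketch: the variable names and the example prompt are illustrative, the prompt loosely follows the Human/Assistant formatting of the Dahoas/full-hh-rlhf data, and it assumes the custom SantaCoder code still resolves via trust_remote_code=True when loading from the local directory.

In [ ]:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Illustrative sanity check: reload the merged SFT checkpoint and generate a short reply
sft_tokenizer = AutoTokenizer.from_pretrained("TabbyML/SantaCoder-1B", trust_remote_code=True)
sft_model = AutoModelForCausalLM.from_pretrained(
    "sft_santacoder1b/final_merged_checkpoint",
    torch_dtype=torch.float16,
    device_map="auto",
    trust_remote_code=True,
)

prompt = "Human: How do I brew a good cup of tea?\n\nAssistant:"  # example prompt, not taken from the dataset
inputs = sft_tokenizer(prompt, return_tensors="pt").to(sft_model.device)
with torch.no_grad():
    output_ids = sft_model.generate(**inputs, max_new_tokens=64, do_sample=False)
print(sft_tokenizer.decode(output_ids[0], skip_special_tokens=True))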
In [ ]:
! cp -r /content/sft_santacoder1b /content/drive/MyDrive
In [ ]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig
from peft import LoraConfig
import torch

# Reload the 4-bit base model fresh from the Hub (without the SFT adapter)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
)

base_model = AutoModelForCausalLM.from_pretrained(
    "TabbyML/SantaCoder-1B",
    quantization_config=bnb_config,
    device_map={"": 0},
    trust_remote_code=True,
    use_auth_token=True,
)
base_model.config.use_cache = False
In [ ]:
print(base_model)
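Printing the model is a convenient way to confirm that the module names given to the LoRA config actually exist in this architecture. As a small illustrative sketch (not from the original notebook), the same check can be done programmatically by filtering named_modules for the targeted "c_attn" projections:

In [ ]:

# Illustrative: list the modules that LoraConfig(target_modules=["c_attn"]) will wrap
for name, module in base_model.named_modules():
    if name.endswith("c_attn"):
        print(name, type(module).__name__)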
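If the follow-up stage (for example the DPO step) should start from the supervised fine-tune rather than the raw base model, one possible approach is to attach the LoRA adapter saved above using peft's PeftModel. This is a sketch only, not necessarily how Part 2 proceeds; loading the adapter injects the LoRA layers into base_model in place, and the name sft_peft_model is purely illustrative.

In [ ]:

from peft import PeftModel

# Illustrative: attach the SFT LoRA adapter saved earlier to the freshly loaded 4-bit base model
sft_peft_model = PeftModel.from_pretrained(base_model, "sft_santacoder1b/final_checkpoint")
print(sft_peft_model)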