Cricket News To Cricket Match Report
In [ ]:
! pip install transformers datasets accelerate peft
In [ ]:
from huggingface_hub import notebook_login
notebook_login()
In [ ]:
class config:
    DATA_PATH = "/content/match_report_gen.csv"
    MODEL_CKPT = "google/flan-t5-large"
    SRC_COLUMN = "Commentary_Highlights"
    TGT_COLUMN = "Match_Report"
    SRC_MAX_LEN = 1200
    TGT_MAX_LEN = 256
    LABEL_PAD_TOKEN_ID = -100
    PADDING = "max_length"
    TRUNCATION = True
    LORA_R = 8
    LORA_ALPHA = 16
    LORA_TGT_MODULES = ["q", "v"]
    LORA_DROPOUT = 0.1
    LORA_TASK_TYPE = "SEQ_2_SEQ_LM"
    PAD_TO_MULTIPLE = 8
    MODEL_OUT_DIR = "criccomm_to_cricnewss"
    TRAIN_BATCH_SIZE = 1
    LR = 2e-4
    NUM_TRAIN_EPOCHS = 3
    LOGGING_DIR = f"{MODEL_OUT_DIR}/logs"
    LOGGING_STRATEGY = "epoch"
    SAVE_STRATEGY = "epoch"
    SAVE_LIMIT = 1
    PUSH_TO_HUB = True
In [ ]:
import pandas as pd
from datasets import Dataset
from transformers import AutoTokenizer

class CricketCommentaryReportGenerationDatasetModule:
    """Loads the commentary/report CSV and tokenizes it for seq2seq training."""
    def __init__(self):
        self.data_path = config.DATA_PATH
        self.model_ckpt = config.MODEL_CKPT
        self.src_column = config.SRC_COLUMN
        self.tgt_column = config.TGT_COLUMN
        self.src_max_len = config.SRC_MAX_LEN
        self.tgt_max_len = config.TGT_MAX_LEN
        self.padding = config.PADDING
        self.truncation = config.TRUNCATION
        self.label_pad_token_id = config.LABEL_PAD_TOKEN_ID
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_ckpt)

    def load_data(self):
        df = pd.read_csv(self.data_path)
        data = Dataset.from_pandas(df)
        return data

    def preprocess_function(self, example):
        # Tokenize the commentary highlights (encoder input).
        model_inp = self.tokenizer(
            example[self.src_column],
            max_length=self.src_max_len,
            padding=self.padding,
            truncation=self.truncation
        )
        # Tokenize the match report (decoder target).
        labels = self.tokenizer(
            example[self.tgt_column],
            max_length=self.tgt_max_len,
            padding=self.padding,
            truncation=self.truncation
        )
        # Replace pad tokens in the labels with -100 so they are ignored by the loss.
        labels["input_ids"] = [
            (label if label != self.tokenizer.pad_token_id else self.label_pad_token_id)
            for label in labels["input_ids"]
        ]
        model_inp["labels"] = labels["input_ids"]
        return model_inp

    def gen_data(self):
        data = self.load_data()
        tokenized_data = data.map(
            self.preprocess_function,
            batched=False,
            remove_columns=data.column_names
        )
        return tokenized_data
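Before moving on, it can help to spot-check what the dataset module produces. The snippet below is not part of the original notebook; it is a minimal sketch that assumes match_report_gen.csv is already at config.DATA_PATH and simply prints the shape of one tokenized example and the decoded target text.

data_module = CricketCommentaryReportGenerationDatasetModule()
tokenized = data_module.gen_data()
print(tokenized)  # row count and the tokenized column names

sample = tokenized[0]
# With padding="max_length", these should be 1200 and 256 respectively.
print(len(sample["input_ids"]), len(sample["labels"]))

# Decode the label ids (dropping the -100 placeholders) to recover the target report text.
report_ids = [t for t in sample["labels"] if t != -100]
print(data_module.tokenizer.decode(report_ids, skip_special_tokens=True)[:200])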
In [ ]:
from peft import get_peft_model, LoraConfig
from transformers import DataCollatorForSeq2Seq, Seq2SeqTrainingArguments

class CricketCommentaryReportGenerationUtils:
    """Builds the LoRA-wrapped model, the data collator, and the training arguments."""
    def __init__(self):
        self.lora_r = config.LORA_R
        self.lora_alpha = config.LORA_ALPHA
        self.lora_tgt_modules = config.LORA_TGT_MODULES
        self.lora_dropout = config.LORA_DROPOUT
        self.lora_task_type = config.LORA_TASK_TYPE
        self.label_pad_token_id = config.LABEL_PAD_TOKEN_ID
        self.pad_to_multiple = config.PAD_TO_MULTIPLE
        self.model_out_dir = config.MODEL_OUT_DIR
        self.train_batch_size = config.TRAIN_BATCH_SIZE
        self.lr = config.LR
        self.num_train_epochs = config.NUM_TRAIN_EPOCHS
        self.logging_dir = config.LOGGING_DIR
        self.logging_strategy = config.LOGGING_STRATEGY
        self.save_strategy = config.SAVE_STRATEGY
        self.save_limit = config.SAVE_LIMIT
        self.push_to_hub = config.PUSH_TO_HUB

    def lora_model(self, model):
        # Attach LoRA adapters to the query and value projections of the base model.
        lora_config = LoraConfig(
            r=self.lora_r,
            target_modules=self.lora_tgt_modules,
            lora_alpha=self.lora_alpha,
            lora_dropout=self.lora_dropout,
            bias="none",
            task_type=self.lora_task_type
        )
        lora_model = get_peft_model(model, lora_config)
        # print_trainable_parameters() already prints its summary and returns None.
        lora_model.print_trainable_parameters()
        return lora_model

    def train_data_collator(self, tokenizer, model):
        # Pads inputs and labels per batch; -100 label positions are ignored by the loss.
        training_data_collator = DataCollatorForSeq2Seq(
            tokenizer,
            model=model,
            label_pad_token_id=self.label_pad_token_id,
            pad_to_multiple_of=self.pad_to_multiple
        )
        return training_data_collator

    def set_training_args(self):
        training_args = Seq2SeqTrainingArguments(
            output_dir=self.model_out_dir,
            per_device_train_batch_size=self.train_batch_size,
            learning_rate=self.lr,
            num_train_epochs=self.num_train_epochs,
            logging_dir=self.logging_dir,
            logging_strategy=self.logging_strategy,
            save_strategy=self.save_strategy,
            save_total_limit=self.save_limit,
            push_to_hub=self.push_to_hub
        )
        return training_args
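The ["q", "v"] target modules follow the common LoRA recipe of adapting only the query and value projections; in T5-family models such as FLAN-T5 these are Linear sub-modules literally named q and v inside each attention block. If you want to confirm the module names before training, something like the sketch below works; it is not part of the original notebook, and flan-t5-small could be substituted to avoid downloading the large checkpoint just for this check.

import torch.nn as nn
from transformers import AutoModelForSeq2SeqLM

base = AutoModelForSeq2SeqLM.from_pretrained(config.MODEL_CKPT)
# Collect the final name component of every Linear layer that lives inside an attention block.
proj_names = {
    name.split(".")[-1]
    for name, module in base.named_modules()
    if isinstance(module, nn.Linear) and "Attention" in name
}
print(sorted(proj_names))  # expect ['k', 'o', 'q', 'v']; LORA_TGT_MODULES adapts 'q' and 'v'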
In [ ]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, Seq2SeqTrainer

class CricketCommentaryReportGenerationTrainer:
    """Wires together the dataset module, the LoRA model, the collator, and the trainer."""
    def __init__(self):
        self.model_ckpt = config.MODEL_CKPT
        self.model_out_dir = config.MODEL_OUT_DIR
        self.data_module = CricketCommentaryReportGenerationDatasetModule()
        self.train_dataset = self.data_module.gen_data()
        self.utils_module = CricketCommentaryReportGenerationUtils()
        self.tokenizer = AutoTokenizer.from_pretrained(self.model_ckpt)
        self.model = AutoModelForSeq2SeqLM.from_pretrained(self.model_ckpt)

    def prepare_training(self):
        self.lora_model = self.utils_module.lora_model(self.model)
        self.data_collator = self.utils_module.train_data_collator(self.tokenizer, self.model)
        self.training_args = self.utils_module.set_training_args()
        # Disable the decoder cache during training; it is only useful for generation.
        self.model.config.use_cache = False

    def model_trainer(self):
        self.prepare_training()
        trainer = Seq2SeqTrainer(
            model=self.lora_model,
            args=self.training_args,
            data_collator=self.data_collator,
            train_dataset=self.train_dataset,
        )
        return trainer

    def model_train_save_push_to_hub(self):
        trainer = self.model_trainer()
        print("Training Started")
        trainer.train()
        # Push the trained adapter (via the trainer), the underlying model, and the tokenizer to the Hub.
        trainer.push_to_hub()
        trainer.model.base_model.push_to_hub(self.model_out_dir)
        self.tokenizer.push_to_hub(self.model_out_dir)
In [ ]:
if __name__ == "__main__":
    model_trainer = CricketCommentaryReportGenerationTrainer()
    model_trainer.model_train_save_push_to_hub()
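Once training finishes and the artifacts are on the Hub, the adapter can be loaded back on top of the base checkpoint for inference. The snippet below is a sketch, not part of the original notebook: the repo id "your-username/criccomm_to_cricnewss" and the sample commentary string are placeholders, so substitute the account that notebook_login authenticated and your own highlights text.

import torch
from peft import PeftModel
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

base_ckpt = config.MODEL_CKPT
adapter_repo = "your-username/criccomm_to_cricnewss"  # hypothetical Hub id; replace with your own

tokenizer = AutoTokenizer.from_pretrained(base_ckpt)
base_model = AutoModelForSeq2SeqLM.from_pretrained(base_ckpt)
model = PeftModel.from_pretrained(base_model, adapter_repo)
model.eval()

commentary = "Rohit Sharma smashes a six over long-on to bring up his fifty..."  # placeholder highlights
inputs = tokenizer(commentary, return_tensors="pt", truncation=True, max_length=config.SRC_MAX_LEN)
with torch.no_grad():
    output_ids = model.generate(**inputs, max_new_tokens=config.TGT_MAX_LEN)
print(tokenizer.decode(output_ids[0], skip_special_tokens=True))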