In [ ]:
! pip install transformers datasets evaluate seqeval accelerate
Imports and Config
In [ ]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer
import numpy as np
import evaluate
Dataset
In [ ]:
class NERDataset:
    def __init__(self, data_id, tokenizer_ckpt):
        self.data_id = data_id
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_ckpt)

    def load_data(self):
        # Load the dataset and read the label names off the ner_tags feature.
        self.dataset = load_dataset(self.data_id)
        self.train = self.dataset["train"]
        self.test = self.dataset["test"]
        ner_feature = self.dataset["train"].features["ner_tags"]
        label_names = ner_feature.feature.names
        return self.train, self.test, label_names

    def align_labels_with_tokens(self, labels, word_ids):
        # Map word-level labels onto subword tokens. Special tokens get -100
        # (ignored by the loss); continuation subwords of a B- word get the
        # matching I- label.
        new_labels = []
        current_word = None
        for word_id in word_ids:
            if word_id != current_word:
                # Start of a new word (or a special token).
                current_word = word_id
                label = -100 if word_id is None else labels[word_id]
                new_labels.append(label)
            elif word_id is None:
                # Special token ([CLS], [SEP], padding).
                new_labels.append(-100)
            else:
                # Same word as the previous token: convert B-X (odd id) to
                # I-X (the next even id) for continuation subwords.
                label = labels[word_id]
                if label % 2 == 1:
                    label += 1
                new_labels.append(label)
        return new_labels

    def preprocess_function(self, examples):
        tokenized_inputs = self.tokenizer(
            examples["tokens"], truncation=True, is_split_into_words=True
        )
        all_labels = examples["ner_tags"]
        new_labels = []
        for i, labels in enumerate(all_labels):
            word_ids = tokenized_inputs.word_ids(i)
            new_labels.append(self.align_labels_with_tokens(labels, word_ids))
        tokenized_inputs["labels"] = new_labels
        return tokenized_inputs

    def create_data(self):
        # Tokenize both splits and drop the raw columns so only model inputs remain.
        self.train, self.test, label_names = self.load_data()
        tokenized_train_dataset = self.train.map(
            self.preprocess_function,
            batched=True,
            remove_columns=self.train.column_names,
        )
        tokenized_test_dataset = self.test.map(
            self.preprocess_function,
            batched=True,
            remove_columns=self.test.column_names,
        )
        return tokenized_train_dataset, tokenized_test_dataset, label_names
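As a quick sanity check of the alignment logic, here is a minimal sketch (same dataset and checkpoint as the training cell below) that tokenizes one training sentence and prints each subword next to its aligned label; [CLS] and [SEP] should come out as -100.

In [ ]:
nerdataset = NERDataset("conll2003", "bert-base-cased")
train, test, label_names = nerdataset.load_data()

sample = train[0]
encoding = nerdataset.tokenizer(sample["tokens"], is_split_into_words=True)
aligned = nerdataset.align_labels_with_tokens(sample["ner_tags"], encoding.word_ids())
print(list(zip(encoding.tokens(), aligned)))  # special tokens map to -100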
Model Training - Make sure to log in
If you are in a notebook:
from huggingface_hub import notebook_login
notebook_login()
If you are running a script:
huggingface-cli login
In [ ]:
from huggingface_hub import notebook_login
notebook_login()
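For scripts, an alternative to the CLI is programmatic login via huggingface_hub; a sketch, assuming your access token is exported as the HF_TOKEN environment variable:

In [ ]:
import os
from huggingface_hub import login

# Reads the token from the environment rather than prompting interactively.
login(token=os.environ["HF_TOKEN"])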
In [ ]:
class NERTrainer:
    def __init__(self):
        self.nerdataset = NERDataset("conll2003", "bert-base-cased")
        self.train_data, self.test_data, self.ner_labels = self.nerdataset.create_data()
        self.id2label = {i: label for i, label in enumerate(self.ner_labels)}
        self.label2id = {v: k for k, v in self.id2label.items()}
        self.model = AutoModelForTokenClassification.from_pretrained(
            "bert-base-cased", id2label=self.id2label, label2id=self.label2id
        )
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

    def compute_metrics(self, eval_preds):
        # seqeval reports entity-level precision/recall/F1; positions labeled
        # -100 (special tokens) are filtered out before scoring.
        metric = evaluate.load("seqeval")
        logits, labels = eval_preds
        predictions = np.argmax(logits, axis=-1)
        true_labels = [
            [self.ner_labels[l] for l in label if l != -100] for label in labels
        ]
        true_predictions = [
            [self.ner_labels[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        all_metrics = metric.compute(predictions=true_predictions, references=true_labels)
        return {
            "precision": all_metrics["overall_precision"],
            "recall": all_metrics["overall_recall"],
            "f1": all_metrics["overall_f1"],
            "accuracy": all_metrics["overall_accuracy"],
        }

    def set_training_args(self):
        return TrainingArguments(
            output_dir="bert-ner-custom",
            evaluation_strategy="epoch",
            save_strategy="epoch",
            learning_rate=2e-5,
            num_train_epochs=3,
            weight_decay=0.01,
            push_to_hub=True,
        )

    def train_and_save_model(self):
        trainer = Trainer(
            model=self.model,
            args=self.set_training_args(),
            train_dataset=self.train_data,
            eval_dataset=self.test_data,
            data_collator=DataCollatorForTokenClassification(tokenizer=self.tokenizer),
            compute_metrics=self.compute_metrics,
            tokenizer=self.tokenizer,
        )
        trainer.train()
        # Upload the final model and tokenizer to the Hub.
        trainer.push_to_hub()

nertrainer = NERTrainer()
nertrainer.train_and_save_model()
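To see what the -100 masking in compute_metrics does, here is a minimal sketch with toy logits; the label set and values below are illustrative, not outputs from the run above:

In [ ]:
import numpy as np

toy_labels = ["O", "B-PER", "I-PER"]
logits = np.array([[[2.0, 0.1, 0.1],    # argmax -> "O"
                    [0.1, 2.0, 0.1],    # argmax -> "B-PER"
                    [0.1, 0.1, 2.0]]])  # argmax -> "I-PER"
labels = np.array([[-100, 1, 2]])       # -100 marks a special token

predictions = np.argmax(logits, axis=-1)
true_predictions = [
    [toy_labels[p] for p, l in zip(pred, lab) if l != -100]
    for pred, lab in zip(predictions, labels)
]
print(true_predictions)  # [['B-PER', 'I-PER']] -- the masked position is dropped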
Inference
In [ ]:
from transformers import pipeline
model_checkpoint = "Vasanth/bert-ner-custom"
token_classifier = pipeline(
"token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
token_classifier("Vasanth lives in Chennai.")
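With aggregation_strategy="simple", the pipeline merges subword tokens into whole entities and returns one dict per entity group (with entity_group, score, word, start, and end keys), so the output can be consumed like this; a sketch, assuming the fine-tuned model above has been pushed to the Hub under that name:

In [ ]:
for entity in token_classifier("Vasanth lives in Chennai."):
    # e.g. PER Vasanth 0.998 / LOC Chennai 0.997
    print(entity["entity_group"], entity["word"], f"{entity['score']:.3f}")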