In [ ]:
! pip install transformers datasets evaluate seqeval accelerate
Prepare Data for NER
Turn the raw ner.csv annotations into tokens/ner_tags train and test splits saved as parquet, plus a tags.json file that records the label set.
In [ ]:
import pandas as pd
from sklearn.model_selection import train_test_split
import json
In [ ]:
data = pd.read_csv("ner.csv")
data
In [ ]:
unique_ner_tags = ['O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim',
                   'B-art', 'I-art', 'I-per', 'I-gpe', 'I-tim', 'B-nat', 'B-eve',
                   'I-eve', 'I-nat']
tags2id = {tag: i for i, tag in enumerate(unique_ner_tags)}
tags2id
In [ ]:
from ast import literal_eval

# The "Tag" column stores each label list as a string; parse it back into a real list
data["Tags"] = data["Tag"].apply(literal_eval)
data
In [ ]:
def map_tags2id(x):
    return [tags2id[i] for i in x]

data["ner_tags"] = data["Tags"].apply(map_tags2id)
data
In [ ]:
# Whitespace tokenization; the token list must line up one-to-one with the tag list
data["tokens"] = data["Sentence"].str.split()
data
In [ ]:
data = data[["tokens", "ner_tags"]]
train_data, test_data = train_test_split(data, test_size=0.2, shuffle=True, random_state=0)
In [ ]:
train_data
In [ ]:
# Drop rows where the csv had missing sentences or tags
train_data.dropna(inplace=True)
In [ ]:
test_data.dropna(inplace=True)
In [ ]:
train_data
In [ ]:
# Stash the ordered label list in the same dict so it round-trips through tags.json
tags2id["ner_categories"] = unique_ner_tags
In [ ]:
train_data.to_parquet("train_ner.parquet", index=False)
test_data.to_parquet("test_ner.parquet", index=False)
with open("tags.json", "w") as outfile:
    json.dump(tags2id, outfile)
In [ ]:
with open("tags.json", "r") as infile:
    print(json.load(infile))
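The parquet splits can be spot-checked the same way; a quick check (read_parquet needs pyarrow, which the datasets install above already pulls in):

# Reload one split to confirm the list columns survived the round trip
pd.read_parquet("train_ner.parquet").head()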
Imports and Config
In [ ]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, TrainingArguments, Trainer
import numpy as np
import evaluate
import json
Dataset
NERDataset loads the parquet splits, tokenizes with the BERT tokenizer, and aligns the word-level tags with the resulting sub-word tokens.
In [ ]:
class NERDataset:
    def __init__(self, train_data_path, test_data_path, tokenizer_ckpt):
        self.train_data_path = train_data_path
        self.test_data_path = test_data_path
        self.tokenizer = AutoTokenizer.from_pretrained(tokenizer_ckpt)
        # Map each B-XXX id to its matching I-XXX id by name. The common
        # `label % 2 == 1 -> label + 1` shortcut assumes B-/I- pairs sit next
        # to each other in the label list, which is not true for our tag order.
        with open("tags.json", "r") as infile:
            labels = json.load(infile)["ner_categories"]
        self.b2i = {
            i: labels.index("I" + tag[1:])
            for i, tag in enumerate(labels)
            if tag.startswith("B-") and "I" + tag[1:] in labels
        }

    def load_data(self):
        self.train = load_dataset("parquet", data_files=self.train_data_path)["train"]
        self.test = load_dataset("parquet", data_files=self.test_data_path)["train"]

    def align_labels_with_tokens(self, labels, word_ids):
        new_labels = []
        current_word = None
        for word_id in word_ids:
            if word_id != current_word:
                # Start of a new word
                current_word = word_id
                try:
                    label = -100 if word_id is None else labels[word_id]
                except IndexError:
                    # Row where tokens and tags are misaligned: ignore the token
                    label = -100
                new_labels.append(label)
            elif word_id is None:
                # Special token ([CLS], [SEP], padding)
                new_labels.append(-100)
            else:
                # Continuation sub-token of the same word
                try:
                    label = labels[word_id]
                    # If the label is B-XXX, switch it to the matching I-XXX
                    label = self.b2i.get(label, label)
                except IndexError:
                    label = -100
                new_labels.append(label)
        return new_labels

    def preprocess_function(self, examples):
        tokenized_inputs = self.tokenizer(
            examples["tokens"], truncation=True, is_split_into_words=True
        )
        all_labels = examples["ner_tags"]
        new_labels = []
        for i, labels in enumerate(all_labels):
            word_ids = tokenized_inputs.word_ids(i)
            new_labels.append(self.align_labels_with_tokens(labels, word_ids))
        tokenized_inputs["labels"] = new_labels
        return tokenized_inputs

    def create_data(self):
        self.load_data()
        tokenized_train_dataset = self.train.map(
            self.preprocess_function,
            batched=True,
            remove_columns=self.train.column_names,
        )
        tokenized_test_dataset = self.test.map(
            self.preprocess_function,
            batched=True,
            remove_columns=self.test.column_names,
        )
        return tokenized_train_dataset, tokenized_test_dataset
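A quick sanity check of the alignment; a minimal sketch, assuming the parquet files and tags.json from the preparation step are in the working directory (the paths and the bert-base-uncased checkpoint mirror what the trainer uses below):

# Inspect one tokenized example to verify that labels line up with sub-word tokens
nerdataset = NERDataset("train_ner.parquet", "test_ner.parquet", "bert-base-uncased")
train_ds, test_ds = nerdataset.create_data()

example = train_ds[0]
tokens = nerdataset.tokenizer.convert_ids_to_tokens(example["input_ids"])
for tok, lab in zip(tokens, example["labels"]):
    print(f"{tok:15s} {lab}")

Special tokens such as [CLS] and [SEP] should show -100, and continuation sub-tokens of an entity should carry the I- variant of its tag.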
Model Training
Log in to the Hugging Face Hub before training, since the TrainingArguments below use push_to_hub=True.
In a notebook:
from huggingface_hub import notebook_login
notebook_login()
From a script or terminal:
huggingface-cli login
In [ ]:
from huggingface_hub import notebook_login

notebook_login()
In [ ]:
class NERTrainer:
    def __init__(self):
        with open("tags.json", "r") as infile:
            self.ner_labels = json.load(infile)["ner_categories"]
        self.nerdataset = NERDataset("/content/train_ner.parquet", "/content/test_ner.parquet", "bert-base-uncased")
        self.train_data, self.test_data = self.nerdataset.create_data()
        self.id2label = {i: label for i, label in enumerate(self.ner_labels)}
        self.label2id = {v: k for k, v in self.id2label.items()}
        self.model = AutoModelForTokenClassification.from_pretrained(
            "bert-base-uncased", id2label=self.id2label, label2id=self.label2id
        )
        self.tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
        self.metric = evaluate.load("seqeval")  # load once instead of on every eval pass

    def compute_metrics(self, eval_preds):
        logits, labels = eval_preds
        predictions = np.argmax(logits, axis=-1)
        # Remove the ignored index (-100, special tokens) and convert ids to tag names
        true_labels = [[self.ner_labels[l] for l in label if l != -100] for label in labels]
        true_predictions = [
            [self.ner_labels[p] for (p, l) in zip(prediction, label) if l != -100]
            for prediction, label in zip(predictions, labels)
        ]
        all_metrics = self.metric.compute(predictions=true_predictions, references=true_labels)
        return {
            "precision": all_metrics["overall_precision"],
            "recall": all_metrics["overall_recall"],
            "f1": all_metrics["overall_f1"],
            "accuracy": all_metrics["overall_accuracy"],
        }

    def set_training_args(self):
        return TrainingArguments(
            output_dir="bert-ner-custom",
            eval_strategy="epoch",  # named `evaluation_strategy` in older transformers releases
            save_strategy="epoch",
            learning_rate=2e-5,
            num_train_epochs=3,
            weight_decay=0.01,
            push_to_hub=True,
        )

    def train_and_save_model(self):
        trainer = Trainer(
            model=self.model,
            args=self.set_training_args(),
            train_dataset=self.train_data,
            eval_dataset=self.test_data,
            data_collator=DataCollatorForTokenClassification(tokenizer=self.tokenizer),
            compute_metrics=self.compute_metrics,
            tokenizer=self.tokenizer,
        )
        trainer.train()
        trainer.push_to_hub()  # upload the final model so the method lives up to its name

nertrainer = NERTrainer()
nertrainer.train_and_save_model()
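Once training finishes, the in-memory model can be smoke-tested directly; a minimal sketch, assuming the NERTrainer run above completed (the sentence is an arbitrary example):

import torch

text = "India and Japan signed the deal in Tokyo ."
inputs = nertrainer.tokenizer(text, return_tensors="pt").to(nertrainer.model.device)
with torch.no_grad():
    logits = nertrainer.model(**inputs).logits
pred_ids = logits.argmax(dim=-1)[0].tolist()
tokens = nertrainer.tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())
for tok, pid in zip(tokens, pred_ids):
    print(f"{tok:12s} {nertrainer.id2label[pid]}")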
Inference
Load the pushed checkpoint through a token-classification pipeline.
In [ ]:
from transformers import pipeline

# Replace this with your own checkpoint
model_checkpoint = "Vasanth/bert-ner-custom"
token_classifier = pipeline(
    "token-classification", model=model_checkpoint, aggregation_strategy="simple"
)
token_classifier("I live in Chennai.")
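With aggregation_strategy="simple", the pipeline returns one dict per entity span, with entity_group, score, word, and character start/end offsets. A quick way to eyeball the results:

for ent in token_classifier("I live in Chennai."):
    print(f"{ent['word']:15s} {ent['entity_group']:6s} {ent['score']:.3f}")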