IDEFICS Image Captioning
Installing Dependencies
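This notebook fine-tunes IDEFICS-9B-instruct for image captioning. We need datasets for the MS COCO captions, transformers for the model and Trainer, bitsandbytes for 4-bit quantization, and peft for LoRA. IDEFICS has been part of mainline transformers since v4.32.0, so a regular release install works; the original add-model-idefics development branch no longer exists.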
In [1]:
!pip install -q datasets
!pip install -q "transformers>=4.32.0"
!pip install -q bitsandbytes sentencepiece accelerate loralib
!pip install -q -U git+https://github.com/huggingface/peft.git
Importing Dependencies
In [1]:
from datasets import load_dataset
from peft import LoraConfig, get_peft_model
from PIL import Image
from transformers import IdeficsForVisionText2Text, AutoProcessor, Trainer, TrainingArguments, BitsAndBytesConfig
from torchvision import transforms
import torch
In [ ]:
from huggingface_hub import notebook_login
notebook_login()
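Log in with a Hugging Face access token: it is used to download the IDEFICS checkpoint and processor, and later to push the fine-tuned adapter to the Hub.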
Dataset
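CocoCaptionDataset wraps the cat-state/mscoco-1st-caption dataset, which pairs each MS COCO image URL with its first caption. It holds out a small validation split, converts images to RGB (compositing any alpha channel onto a white background), applies the resize/normalize transforms the IDEFICS processor expects, and packs each example into an interleaved image-and-text prompt for causal language modeling.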
In [3]:
class CocoCaptionDataset:
    def __init__(self):
        self.model_ckpt = "HuggingFaceM4/idefics-9b-instruct"
        self.processor = AutoProcessor.from_pretrained(self.model_ckpt, token=True)

    def load_data(self):
        # One caption per MS COCO image; hold out a tiny slice for evaluation.
        data = load_dataset("cat-state/mscoco-1st-caption")
        data = data["train"].train_test_split(test_size=0.0002)
        train_data = data["train"]
        val_data = data["test"]
        return train_data, val_data

    def img_convert_to_rgb(self, image):
        # `image.convert("RGB")` alone would blend transparent pixels with black;
        # compositing onto a white background first gives cleaner results.
        if image.mode == "RGB":
            return image
        image_rgba = image.convert("RGBA")
        background = Image.new("RGBA", image_rgba.size, (255, 255, 255))
        alpha_composite = Image.alpha_composite(background, image_rgba)
        alpha_composite = alpha_composite.convert("RGB")
        return alpha_composite

    def transform_data(self, example):
        # Reuse the image size and normalization statistics of the IDEFICS processor.
        img_size = self.processor.image_processor.image_size
        img_mean = self.processor.image_processor.image_mean
        img_std = self.processor.image_processor.image_std
        img_transform = transforms.Compose([
            self.img_convert_to_rgb,
            transforms.RandomResizedCrop((img_size, img_size), scale=(0.9, 1.0), interpolation=transforms.InterpolationMode.BICUBIC),
            transforms.ToTensor(),
            transforms.Normalize(mean=img_mean, std=img_std),
        ])
        # Each prompt interleaves the image (given by URL) with a question/answer
        # pair; </s> marks the end of the target caption.
        prompts = []
        for i in range(len(example["caption"])):
            caption = example["caption"][i]
            prompts.append(
                [
                    example["url"][i],
                    f"Question: Explain the picture. Answer: {caption}</s>",
                ],
            )
        inputs = self.processor(prompts, transform=img_transform, return_tensors="pt").to("cuda")
        # Causal LM objective: the labels are the input ids themselves.
        inputs["labels"] = inputs["input_ids"]
        return inputs

    def gen_data(self):
        train_dataset, val_dataset = self.load_data()
        train_dataset.set_transform(self.transform_data)
        val_dataset.set_transform(self.transform_data)
        return train_dataset, val_dataset
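As a quick sanity check, you can instantiate the dataset and inspect the tensors produced for a couple of examples. This is a minimal sketch, assuming a CUDA device and access to the checkpoint; the exact key set comes from the IDEFICS processor.

In [ ]:
dataset = CocoCaptionDataset()
train_dataset, val_dataset = dataset.gen_data()

# set_transform runs transform_data lazily, so slicing yields processed tensors
batch = train_dataset[0:2]
for key, tensor in batch.items():
    print(key, tuple(tensor.shape))
# typically: input_ids, attention_mask, pixel_values, image_attention_mask, labels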
Model Training
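A 9-billion-parameter model is too large to fine-tune in full precision on a single consumer GPU, so we use a QLoRA-style setup: the base model is loaded in 4-bit NF4 quantization via bitsandbytes, and only small LoRA adapters on the attention projections (q_proj, k_proj, v_proj) are trained while the quantized base weights stay frozen.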
In [4]:
class ImageCaptioningModel:
    def __init__(self):
        cococaptiondataset = CocoCaptionDataset()
        self.train_data, self.val_data = cococaptiondataset.gen_data()
        self.model_ckpt = "HuggingFaceM4/idefics-9b-instruct"

    def load_model(self):
        # 4-bit NF4 quantization with double quantization (QLoRA-style);
        # the lm_head and embeddings are kept in higher precision.
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.float16,
            llm_int8_skip_modules=["lm_head", "embed_tokens"],
        )
        model = IdeficsForVisionText2Text.from_pretrained(self.model_ckpt, quantization_config=bnb_config, device_map="auto")
        return model

    def create_lora_model(self, model):
        # Train low-rank adapters on the attention projections only.
        config = LoraConfig(
            r=16,
            lora_alpha=32,
            target_modules=["q_proj", "k_proj", "v_proj"],
            lora_dropout=0.05,
            bias="none",
        )
        lora_model = get_peft_model(model, config)
        lora_model.print_trainable_parameters()
        return lora_model

    def set_training_args(self):
        training_args = TrainingArguments(
            output_dir="idefics-mscoco-captioner",
            learning_rate=2e-4,
            fp16=True,
            per_device_train_batch_size=2,
            per_device_eval_batch_size=2,
            gradient_accumulation_steps=8,  # effective batch size: 2 * 8 = 16
            dataloader_pin_memory=False,
            save_total_limit=1,
            evaluation_strategy="steps",
            save_strategy="steps",
            save_steps=50,
            eval_steps=50,
            logging_steps=50,
            max_steps=100,
            remove_unused_columns=False,
            load_best_model_at_end=True,
            optim="paged_adamw_8bit",
            label_names=["labels"],
        )
        return training_args

    def train_and_push_to_hub(self):
        img_cap_model = self.load_model()
        img_cap_model = self.create_lora_model(img_cap_model)
        trainer = Trainer(
            model=img_cap_model,
            args=self.set_training_args(),
            train_dataset=self.train_data,
            eval_dataset=self.val_data,
        )
        trainer.train()
        # Only the LoRA adapter weights are uploaded, not the 9B base model.
        img_cap_model.push_to_hub("idefics-mscoco-captioner", private=False)
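Note that this is a short demonstration run: with per_device_train_batch_size=2 and gradient_accumulation_steps=8, each optimizer step sees an effective batch of 2 × 8 = 16 examples, so max_steps=100 covers only 1,600 training examples. Increase max_steps for a seriously trained captioner.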
In [ ]:
if __name__ == "__main__":
    imagecaptioningmodel = ImageCaptioningModel()
    imagecaptioningmodel.train_and_push_to_hub()
Inference
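For inference we rebuild the same 4-bit quantized base model, attach the LoRA adapter trained above with PEFT, and generate a caption, stopping at the </s> token and blocking the image placeholder tokens from appearing in the output.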
In [6]:
def check_inference(model, processor, prompts, max_new_tokens=50):
    tokenizer = processor.tokenizer
    # Keep the model from emitting image placeholder tokens in the caption.
    bad_words = ["<image>", "<fake_token_around_image>"]
    if len(bad_words) > 0:
        bad_words_ids = tokenizer(bad_words, add_special_tokens=False).input_ids

    # Stop generation at the end-of-sequence token the captions were trained with.
    eos_token = "</s>"
    eos_token_id = tokenizer.convert_tokens_to_ids(eos_token)

    inputs = processor(prompts, return_tensors="pt").to("cuda")
    generated_ids = model.generate(**inputs, eos_token_id=[eos_token_id], bad_words_ids=bad_words_ids, max_new_tokens=max_new_tokens)
    generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
    print(generated_text)
In [ ]:
# Reload the quantized base model exactly as it was configured for training.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    llm_int8_skip_modules=["lm_head", "embed_tokens"],
)
model = IdeficsForVisionText2Text.from_pretrained("HuggingFaceM4/idefics-9b-instruct", quantization_config=bnb_config, device_map="auto")
In [3]:
processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics-9b-instruct", token=True)
In [4]:
from peft import PeftModel, PeftConfig

# Attach the trained LoRA adapter (saved by the Trainer) on top of the quantized base model.
config = PeftConfig.from_pretrained("/content/idefics-mscoco-captioner")
model = PeftModel.from_pretrained(model, "/content/idefics-mscoco-captioner")
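If the adapter was pushed to the Hub in the training step, it can equally be loaded from there instead of the local Trainer output directory. A minimal sketch, with a hypothetical repo id you would replace with your own namespace:

In [ ]:
# "your-username" is a placeholder -- substitute your Hub namespace
model = PeftModel.from_pretrained(model, "your-username/idefics-mscoco-captioner")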
Sample Image:
Let's run a prediction with the quantized, LoRA-adapted model on the image below.
In [7]:
url = "https://cdn.pixabay.com/photo/2018/01/14/23/12/nature-3082832_1280.jpg"
prompts = [
    url,
    "Question: Explain the picture. Answer:",
]
check_inference(model, processor, prompts, max_new_tokens=50)
Question: Explain the picture. Answer: A mountain lake with lightning striking the mountain in the background.