I’m trying to fine-tune LLaVA on a custom dataset, following the code presented here: https://colab.research.google.com/drive/10NLrfBKgt9ntPoQYQ24rEVWU-2rr1xf1#scrollTo=4ycDwt9G1RWN. I’ve been debugging with print statements, but I’m not sure what I’m doing wrong to get this error:
<code>    raise ValueError(
ValueError: The input provided to the model are wrong. The number of image tokens is 0 while the number of image given to the model is 8. This prevents correct indexing and breaks batch generation.
</code>
My code looks like this:
<code>import os
import torch
import pathlib
import wandb
from datasets import load_dataset
from trl import SFTTrainer
from peft import LoraConfig
from torchvision import transforms
from PIL import Image
from transformers import AutoTokenizer, AutoProcessor, TrainingArguments, LlavaForConditionalGeneration, BitsAndBytesConfig

model_id = "llava-hf/llava-1.5-7b-hf"
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
model = LlavaForConditionalGeneration.from_pretrained(model_id, quantization_config=quantization_config, torch_dtype=torch.float16)

LLAVA_CHAT_TEMPLATE = """
A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions.
{% for message in messages %}
{% if message['role'] == 'user' %}
USER:
{% else %}
ASSISTANT:
{% endif %}
{% for item in message['content'] %}
{% if item['type'] == 'text' %}
{{ item['text'] }}
{% elif item['type'] == 'image' %}
<image>
{% endif %}
{% endfor %}
{% if message['role'] == 'user' %}
{% else %}
{{ eos_token }}
{% endif %}
{% endfor %}
"""

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.chat_template = LLAVA_CHAT_TEMPLATE
processor = AutoProcessor.from_pretrained(model_id)
processor.tokenizer = tokenizer


class LLavaDataCollator:
    def __init__(self, processor):
        self.processor = processor

    def __call__(self, examples):
        texts = []
        images = []
        for example in examples:
            conversations = example["conversations"]
            formatted_conversation = {"messages": []}
            for convo in conversations:
                role = "user" if convo["from"] == "human" else "assistant"
                content_items = []
                for line in convo["value"].split("\n"):
                    line = line.strip()
                    if line == "<image>":
                        content_items.append({"type": "image", "text": None})
                    else:
                        content_items.append({"type": "text", "text": line})
                formatted_conversation["messages"].append({"role": role, "content": content_items})
            text = self.processor.tokenizer.apply_chat_template(
                formatted_conversation["messages"], tokenize=False, add_generation_prompt=False
            )
            texts.append(text)
            # Load and append the image
            image_path = example["image"]
            image = Image.open(image_path).convert("RGB")
            images.append(image)
        batch = self.processor(texts, images, return_tensors="pt", padding=True)
        labels = batch["input_ids"].clone()
        if self.processor.tokenizer.pad_token_id is not None:
            labels[labels == self.processor.tokenizer.pad_token_id] = -100
        batch["labels"] = labels
        return batch


# Collator instance used for the test below and passed to the trainer
data_collator = LLavaDataCollator(processor)

train_dataset = load_dataset('json', data_files=f'{dataset_path}/train/dataset.json', split='train')
val_dataset = load_dataset('json', data_files=f'{dataset_path}/validation/dataset.json', split='train')
</code>
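To narrow things down with print statements, this is the kind of rough check I run to see whether any <image> placeholder actually survives into the rendered prompt for a single example; it just reuses the collator’s own parsing logic and the objects defined above:
<code># Debugging sketch: render the chat template for one example and count how many
# <image> placeholders end up in the prompt text (same parsing as the collator).
sample = train_dataset[0]
messages = []
for convo in sample["conversations"]:
    role = "user" if convo["from"] == "human" else "assistant"
    content_items = []
    for line in convo["value"].split("\n"):
        line = line.strip()
        if line == "<image>":
            content_items.append({"type": "image", "text": None})
        else:
            content_items.append({"type": "text", "text": line})
    messages.append({"role": role, "content": content_items})

rendered = processor.tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False)
print(rendered)
print("image placeholders in rendered prompt:", rendered.count("<image>"))
</code>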
My dataset looks like this:
<code>> train_dataset[0]
{'id': '3d0304ef-c94c-4212-a831-c6553e697d13',
 'conversations': [{'from': 'human',
   'value': "<image src='/mnt/user_storage/llava_dataset/train/images/3d0304ef-c94c-4212-a831-c6553e697d13.jpg'>\nHow would you evaluate the overall quality of this design?"},
  {'from': 'gpt', 'value': 'This design is good.'}],
 'image': '/mnt/user_storage/llava_dataset/train/images/3d0304ef-c94c-4212-a831-c6553e697d13.jpg'}
</code>
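Since the collator only emits an image content item when a stripped line is exactly "<image>", I also print the raw lines of the first example to see what that comparison actually receives (a quick sketch on the loaded train_dataset):
<code># Print each stripped line of the first example's conversation and whether it
# matches the exact "<image>" string the collator checks for.
sample = train_dataset[0]
for convo in sample["conversations"]:
    for line in convo["value"].split("\n"):
        line = line.strip()
        print(repr(line), "->", line == "<image>")
</code>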
And when I test the data collator, this is what I get:
<code>> batch = data_collator([train_dataset[0]])
> print("Batch Input IDs:", batch["input_ids"].shape)
> print("Batch Attention Mask:", batch["attention_mask"].shape)
> print("Batch Pixel Values:", batch["pixel_values"].shape)
> print("Batch Labels:", batch["labels"].shape)
Batch Input IDs: torch.Size([1, 115])
Batch Attention Mask: torch.Size([1, 115])
Batch Pixel Values: torch.Size([1, 3, 336, 336])
Batch Labels: torch.Size([1, 115])
</code>
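On top of the shapes, I can count how many image tokens actually made it into input_ids versus the number of images handed to the processor (I look up the <image> token id from the tokenizer rather than hard-coding 32000, which I’m assuming is how LLaVA marks image positions):
<code># Compare the number of <image> token ids in the batch with the number of images.
image_token_id = processor.tokenizer.convert_tokens_to_ids("<image>")
print("image token id:", image_token_id)
print("image tokens in input_ids:", (batch["input_ids"] == image_token_id).sum().item())
print("images in pixel_values:", batch["pixel_values"].shape[0])
</code>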
To train, I do:
<code>training_args = TrainingArguments(
    output_dir=f"{dataset_path}/training_output",
    report_to="wandb",
    learning_rate=1.4e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    logging_steps=5,
    num_train_epochs=100,
    push_to_hub=False,
    gradient_checkpointing=True,
    remove_unused_columns=False,
    fp16=True,
    bf16=False,
)

lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules="all-linear",
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    peft_config=lora_config,
    dataset_text_field="text",  # need a dummy field
    tokenizer=tokenizer,
    data_collator=data_collator,
    dataset_kwargs={"skip_prepare_dataset": True},
)
</code>
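As one more sanity check (a sketch, assuming the trainer is fully constructed as above), I can pull a single batch through the trainer’s own dataloader to confirm the custom collator output is what actually reaches the model:
<code># Fetch one batch from the trainer's dataloader and inspect the tensor shapes
# (input_ids / attention_mask / pixel_values / labels should all be present).
train_dl = trainer.get_train_dataloader()
first_batch = next(iter(train_dl))
print({k: tuple(v.shape) for k, v in first_batch.items()})
</code>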
Unfortunately, I can’t share my dataset here. When I call the trainer, I get:
<code>ValueError Traceback (most recent call last)
Cell In[22], line 1
----> 1 trainer.train()
File ~/anaconda3/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:361, in SFTTrainer.train(self, *args, **kwargs)
358 if self.neftune_noise_alpha is not None and not self._trainer_supports_neftune:
359 self.model = self._trl_activate_neftune(self.model)
--> 361 output = super().train(*args, **kwargs)
363 # After training we make sure to retrieve back the original forward pass method
364 # for the embedding layer by removing the forward post hook.
365 if self.neftune_noise_alpha is not None and not self._trainer_supports_neftune:
File ~/anaconda3/lib/python3.10/site-packages/transformers/trainer.py:1885, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1883 hf_hub_utils.enable_progress_bars()
1884 else:
-> 1885 return inner_training_loop(
1886 args=args,
1887 resume_from_checkpoint=resume_from_checkpoint,
1888 trial=trial,
1889 ignore_keys_for_eval=ignore_keys_for_eval,
1890 )
File ~/anaconda3/lib/python3.10/site-packages/transformers/trainer.py:2216, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
2213 self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
2215 with self.accelerator.accumulate(model):
-> 2216 tr_loss_step = self.training_step(model, inputs)
2218 if (
2219 args.logging_nan_inf_filter
2220 and not is_torch_xla_available()
2221 and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
2222 ):
2223 # if loss is nan or inf simply add the average of previous logged losses
2224 tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)
File ~/anaconda3/lib/python3.10/site-packages/transformers/trainer.py:3238, in Trainer.training_step(self, model, inputs)
3235 return loss_mb.reduce_mean().detach().to(self.args.device)
3237 with self.compute_loss_context_manager():
-> 3238 loss = self.compute_loss(model, inputs)
3240 del inputs
3241 torch.cuda.empty_cache()
File ~/anaconda3/lib/python3.10/site-packages/transformers/trainer.py:3264, in Trainer.compute_loss(self, model, inputs, return_outputs)
3262 else:
3263 labels = None
-> 3264 outputs = model(**inputs)
3265 # Save past state if it exists
3266 # TODO: this needs to be fixed and made cleaner later.
3267 if self.args.past_index >= 0:
File ~/anaconda3/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
1530 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1531 else:
-> 1532 return self._call_impl(*args, **kwargs)
File ~/anaconda3/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
1536 # If we don't have any hooks, we want to skip the rest of the logic in
1537 # this function, and just call forward.
1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1539 or _global_backward_pre_hooks or _global_backward_hooks
1540 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541 return forward_call(*args, **kwargs)
1543 try:
1544 result = None
File ~/anaconda3/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py:185, in DataParallel.forward(self, *inputs, **kwargs)
183 return self.module(*inputs[0], **module_kwargs[0])
184 replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
--> 185 outputs = self.parallel_apply(replicas, inputs, module_kwargs)
186 return self.gather(outputs, self.output_device)
File ~/anaconda3/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py:200, in DataParallel.parallel_apply(self, replicas, inputs, kwargs)
199 def parallel_apply(self, replicas: Sequence[T], inputs: Sequence[Any], kwargs: Any) -> List[Any]:
--> 200 return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File ~/anaconda3/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py:108, in parallel_apply(modules, inputs, kwargs_tup, devices)
106 output = results[i]
107 if isinstance(output, ExceptionWrapper):
--> 108 output.reraise()
109 outputs.append(output)
110 return outputs
File ~/anaconda3/lib/python3.10/site-packages/torch/_utils.py:705, in ExceptionWrapper.reraise(self)
701 except TypeError:
702 # If the exception takes multiple arguments, don't try to
703 # instantiate since we don't know how to
704 raise RuntimeError(msg) from None
--> 705 raise exception
ValueError: Caught ValueError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/home/ray/anaconda3/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in _worker
output = module(*input, **kwargs)
File "/home/ray/anaconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/ray/anaconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "/home/ray/anaconda3/lib/python3.10/site-packages/peft/peft_model.py", line 642, in forward
return self.get_base_model()(*args, **kwargs)
File "/home/ray/anaconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
File "/home/ray/anaconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
return forward_call(*args, **kwargs)
File "/home/ray/anaconda3/lib/python3.10/site-packages/accelerate/hooks.py", line 166, in new_forward
output = module._old_forward(*args, **kwargs)
File "/home/ray/anaconda3/lib/python3.10/site-packages/transformers/models/llava/modeling_llava.py", line 439, in forward
inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features(
File "/home/ray/anaconda3/lib/python3.10/site-packages/transformers/models/llava/modeling_llava.py", line 335, in _merge_input_ids_with_image_features
raise ValueError(
ValueError: The input provided to the model are wrong. The number of image tokens is 0 while the number of image given to the model is 8. This prevents correct indexing and breaks batch generation.
</code>
I’m not sure what is wrong with my data collator, which I believe is the source of the issue, since it’s the main difference from the original code. Any help would be appreciated.