Thiết kế website giá rẻ

Question

I’m trying to fine-tune LLaVA on a custom dataset, following the code presented here: https://colab.research.google.com/drive/10NLrfBKgt9ntPoQYQ24rEVWU-2rr1xf1#scrollTo=4ycDwt9G1RWN. I’ve been debugging and print steps but I’m not sure what I’m doing wrong to get this error:

    raise ValueError(
ValueError: The input provided to the model are wrong. The number of image tokens is 0 while the number of image given to the model is 8. This prevents correct indexing and breaks batch generation.

My code looks like this:

import os
import torch
import pathlib
import wandb

from datasets import load_dataset
from trl import SFTTrainer
from peft import LoraConfig
from torchvision import transforms
from PIL import Image
from transformers import AutoTokenizer, AutoProcessor, TrainingArguments, LlavaForConditionalGeneration, BitsAndBytesConfig

model_id = "llava-hf/llava-1.5-7b-hf"
quantization_config = BitsAndBytesConfig(load_in_4bit=True)
model = LlavaForConditionalGeneration.from_pretrained(model_id, quantization_config=quantization_config, torch_dtype=torch.float16)

LLAVA_CHAT_TEMPLATE = """
A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. 
{% for message in messages %}
{% if message['role'] == 'user' %}
USER: 
{% else %}
ASSISTANT: 
{% endif %}
{% for item in message['content'] %}
{% if item['type'] == 'text' %}
{{ item['text'] }}
{% elif item['type'] == 'image' %}
<image>
{% endif %}
{% endfor %}
{% if message['role'] == 'user' %} 
{% else %}
{{ eos_token }}
{% endif %}
{% endfor %}
"""

tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.chat_template = LLAVA_CHAT_TEMPLATE
processor = AutoProcessor.from_pretrained(model_id)
processor.tokenizer = tokenizer

from PIL import Image

class LLavaDataCollator:
    def __init__(self, processor):
        self.processor = processor

    def __call__(self, examples):
        texts = []
        images = []

        for example in examples:
            conversations = example["conversations"]
            formatted_conversation = {"messages": []}
            for convo in conversations:
                role = "user" if convo["from"] == "human" else "assistant"
                content_items = []
                for line in convo["value"].split("n"):
                    line = line.strip()
                    if line == "<image>":
                        content_items.append({"type": "image", "text": None})
                    else:
                        content_items.append({"type": "text", "text": line})
                formatted_conversation["messages"].append({"role": role, "content": content_items})

            text = self.processor.tokenizer.apply_chat_template(
                formatted_conversation["messages"], tokenize=False, add_generation_prompt=False
            )
            texts.append(text)

            # Load and append the image
            image_path = example["image"]
            image = Image.open(image_path).convert("RGB")
            images.append(image)

        batch = self.processor(texts, images, return_tensors="pt", padding=True)
        
        labels = batch["input_ids"].clone()
        if self.processor.tokenizer.pad_token_id is not None:
            labels[labels == self.processor.tokenizer.pad_token_id] = -100
        batch["labels"] = labels

        return batch

train_dataset = load_dataset('json', data_files=f'{dataset_path}/train/dataset.json', split='train')
val_dataset = load_dataset('json', data_files=f'{dataset_path}/validation/dataset.json', split='train')

My dataset looks like this:

> train_dataset[0]

{'id': '3d0304ef-c94c-4212-a831-c6553e697d13',
 'conversations': [{'from': 'human',
   'value': "<image src='/mnt/user_storage/llava_dataset/train/images/3d0304ef-c94c-4212-a831-c6553e697d13.jpg'>nHow would you evaluate the overall quality of this design?"},
  {'from': 'gpt', 'value': 'This design is good.'}],
 'image': '/mnt/user_storage/llava_dataset/train/images/3d0304ef-c94c-4212-a831-c6553e697d13.jpg'}

And when I test the data collator, this is what I get:

> batch = data_collator([train_dataset[0]])
> print("Batch Input IDs:", batch["input_ids"].shape)
> print("Batch Attention Mask:", batch["attention_mask"].shape)
> print("Batch Pixel Values:", batch["pixel_values"].shape)
> print("Batch Labels:", batch["labels"].shape)

Batch Input IDs: torch.Size([1, 115])
Batch Attention Mask: torch.Size([1, 115])
Batch Pixel Values: torch.Size([1, 3, 336, 336])
Batch Labels: torch.Size([1, 115])

To train, I do:

training_args = TrainingArguments(
    output_dir=f"{dataset_path}/training_output",
    report_to="wandb",
    learning_rate=1.4e-5,
    per_device_train_batch_size=8,
    gradient_accumulation_steps=1,
    logging_steps=5,
    num_train_epochs=100,
    push_to_hub=False,
    gradient_checkpointing=True,
    remove_unused_columns=False,
    fp16=True,
    bf16=False
)

lora_config = LoraConfig(
    r=64,
    lora_alpha=16,
    target_modules="all-linear"
)

trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    peft_config=lora_config,
    dataset_text_field="text",  # need a dummy field
    tokenizer=tokenizer,
    data_collator=data_collator,
    dataset_kwargs={"skip_prepare_dataset": True},
)

Unfortunately, I can’t share my dataset here. When I call the trainer, I get:

ValueError                                Traceback (most recent call last)
Cell In[22], line 1
----> 1 trainer.train()

File ~/anaconda3/lib/python3.10/site-packages/trl/trainer/sft_trainer.py:361, in SFTTrainer.train(self, *args, **kwargs)
    358 if self.neftune_noise_alpha is not None and not self._trainer_supports_neftune:
    359     self.model = self._trl_activate_neftune(self.model)
--> 361 output = super().train(*args, **kwargs)
    363 # After training we make sure to retrieve back the original forward pass method
    364 # for the embedding layer by removing the forward post hook.
    365 if self.neftune_noise_alpha is not None and not self._trainer_supports_neftune:

File ~/anaconda3/lib/python3.10/site-packages/transformers/trainer.py:1885, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1883         hf_hub_utils.enable_progress_bars()
   1884 else:
-> 1885     return inner_training_loop(
   1886         args=args,
   1887         resume_from_checkpoint=resume_from_checkpoint,
   1888         trial=trial,
   1889         ignore_keys_for_eval=ignore_keys_for_eval,
   1890     )

File ~/anaconda3/lib/python3.10/site-packages/transformers/trainer.py:2216, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   2213     self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
   2215 with self.accelerator.accumulate(model):
-> 2216     tr_loss_step = self.training_step(model, inputs)
   2218 if (
   2219     args.logging_nan_inf_filter
   2220     and not is_torch_xla_available()
   2221     and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
   2222 ):
   2223     # if loss is nan or inf simply add the average of previous logged losses
   2224     tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)

File ~/anaconda3/lib/python3.10/site-packages/transformers/trainer.py:3238, in Trainer.training_step(self, model, inputs)
   3235     return loss_mb.reduce_mean().detach().to(self.args.device)
   3237 with self.compute_loss_context_manager():
-> 3238     loss = self.compute_loss(model, inputs)
   3240 del inputs
   3241 torch.cuda.empty_cache()

File ~/anaconda3/lib/python3.10/site-packages/transformers/trainer.py:3264, in Trainer.compute_loss(self, model, inputs, return_outputs)
   3262 else:
   3263     labels = None
-> 3264 outputs = model(**inputs)
   3265 # Save past state if it exists
   3266 # TODO: this needs to be fixed and made cleaner later.
   3267 if self.args.past_index >= 0:

File ~/anaconda3/lib/python3.10/site-packages/torch/nn/modules/module.py:1532, in Module._wrapped_call_impl(self, *args, **kwargs)
   1530     return self._compiled_call_impl(*args, **kwargs)  # type: ignore[misc]
   1531 else:
-> 1532     return self._call_impl(*args, **kwargs)

File ~/anaconda3/lib/python3.10/site-packages/torch/nn/modules/module.py:1541, in Module._call_impl(self, *args, **kwargs)
   1536 # If we don't have any hooks, we want to skip the rest of the logic in
   1537 # this function, and just call forward.
   1538 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1539         or _global_backward_pre_hooks or _global_backward_hooks
   1540         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1541     return forward_call(*args, **kwargs)
   1543 try:
   1544     result = None

File ~/anaconda3/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py:185, in DataParallel.forward(self, *inputs, **kwargs)
    183     return self.module(*inputs[0], **module_kwargs[0])
    184 replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
--> 185 outputs = self.parallel_apply(replicas, inputs, module_kwargs)
    186 return self.gather(outputs, self.output_device)

File ~/anaconda3/lib/python3.10/site-packages/torch/nn/parallel/data_parallel.py:200, in DataParallel.parallel_apply(self, replicas, inputs, kwargs)
    199 def parallel_apply(self, replicas: Sequence[T], inputs: Sequence[Any], kwargs: Any) -> List[Any]:
--> 200     return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])

File ~/anaconda3/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py:108, in parallel_apply(modules, inputs, kwargs_tup, devices)
    106     output = results[i]
    107     if isinstance(output, ExceptionWrapper):
--> 108         output.reraise()
    109     outputs.append(output)
    110 return outputs

File ~/anaconda3/lib/python3.10/site-packages/torch/_utils.py:705, in ExceptionWrapper.reraise(self)
    701 except TypeError:
    702     # If the exception takes multiple arguments, don't try to
    703     # instantiate since we don't know how to
    704     raise RuntimeError(msg) from None
--> 705 raise exception

ValueError: Caught ValueError in replica 0 on device 0.
Original Traceback (most recent call last):
  File "/home/ray/anaconda3/lib/python3.10/site-packages/torch/nn/parallel/parallel_apply.py", line 83, in _worker
    output = module(*input, **kwargs)
  File "/home/ray/anaconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/ray/anaconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ray/anaconda3/lib/python3.10/site-packages/peft/peft_model.py", line 642, in forward
    return self.get_base_model()(*args, **kwargs)
  File "/home/ray/anaconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1532, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
  File "/home/ray/anaconda3/lib/python3.10/site-packages/torch/nn/modules/module.py", line 1541, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/ray/anaconda3/lib/python3.10/site-packages/accelerate/hooks.py", line 166, in new_forward
    output = module._old_forward(*args, **kwargs)
  File "/home/ray/anaconda3/lib/python3.10/site-packages/transformers/models/llava/modeling_llava.py", line 439, in forward
    inputs_embeds, attention_mask, labels, position_ids = self._merge_input_ids_with_image_features(
  File "/home/ray/anaconda3/lib/python3.10/site-packages/transformers/models/llava/modeling_llava.py", line 335, in _merge_input_ids_with_image_features
    raise ValueError(
ValueError: The input provided to the model are wrong. The number of image tokens is 0 while the number of image given to the model is 8. This prevents correct indexing and breaks batch generation.

I’m not sure what is wrong with my data collator (which I believe is the source of the issue, as it’s the main difference with the original code). Any help would be appreciated.

Thiết kế website giá rẻ

Danh mục

LLaVA fine-tuning: The input provided to the model are wrong. The number of image tokens is 0 while the number of image given to the model is 8