Hi everyone,
I’m currently looking for a way to fine-tune the Hugging Face TFWhisperForConditionalGeneration
model (https://huggingface.co/docs/transformers/en/model_doc/whisper) so that I end up with a model in TensorFlow .h5 format. I need this format because I want to convert the fine-tuned model to TensorFlow Lite (TFLite) for deployment on edge devices (https://www.tensorflow.org/lite).
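For context, the conversion step I have in mind looks roughly like this. This is an untested sketch: it assumes whisper-tiny's fixed log-mel input shape of (1, 80, 3000), wraps generate() in a tf.function so the decoding loop gets exported, and uses placeholder paths; the fine-tuned model would be loaded instead of the base checkpoint.

import tensorflow as tf
from transformers import TFWhisperForConditionalGeneration

# stand-in for the fine-tuned model I want to end up with
model = TFWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

class GenerateModel(tf.Module):
    def __init__(self, model):
        super().__init__()
        self.model = model

    @tf.function(
        # whisper-tiny expects log-mel spectrograms of shape (batch, 80, 3000)
        input_signature=[tf.TensorSpec((1, 80, 3000), tf.float32, name="input_features")],
    )
    def serving(self, input_features):
        outputs = self.model.generate(
            input_features, max_new_tokens=128, return_dict_in_generate=True
        )
        return {"sequences": outputs["sequences"]}

saved_model_dir = "whisper_saved_model"  # placeholder path
generate_model = GenerateModel(model=model)
tf.saved_model.save(
    generate_model, saved_model_dir, signatures={"serving_default": generate_model.serving}
)

converter = tf.lite.TFLiteConverter.from_saved_model(saved_model_dir)
converter.target_spec.supported_ops = [
    tf.lite.OpsSet.TFLITE_BUILTINS,  # use TFLite built-in ops where possible
    tf.lite.OpsSet.SELECT_TF_OPS,    # fall back to TF ops where needed
]
converter.optimizations = [tf.lite.Optimize.DEFAULT]
tflite_model = converter.convert()

with open("whisper-tiny.tflite", "wb") as f:
    f.write(tflite_model)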
I have already done some research but only found fine-tuning examples for WhisperForConditionalGeneration
in PyTorch. Has anyone already fine-tuned the Whisper model in TensorFlow, or does anyone know how to approach this problem? I have also thought about converting the fine-tuned PyTorch model to TensorFlow if fine-tuning is not possible in TensorFlow directly.
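If I end up fine-tuning in PyTorch, my understanding is that the checkpoint could be loaded into the TF class with from_pt=True and saved as .h5, roughly like this (untested sketch; "path_to_pt_checkpoint" is a placeholder for the directory with the fine-tuned PyTorch weights):

from transformers import TFWhisperForConditionalGeneration

# load the PyTorch weights into the TensorFlow model class
tf_model = TFWhisperForConditionalGeneration.from_pretrained(
    "path_to_pt_checkpoint",  # placeholder: fine-tuned PyTorch checkpoint directory
    from_pt=True,
)

# save_pretrained writes the weights as tf_model.h5 in the output directory
tf_model.save_pretrained("whisper-tiny-tf")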
I have already experimented with transformers.TFTrainer for fine-tuning, as follows:
from typing import Any, Dict, List, Union

import evaluate
import tensorflow as tf
import wandb
from datasets import load_from_disk
from transformers import (
    WhisperProcessor,
    WhisperFeatureExtractor,
    WhisperTokenizerFast,
    TFWhisperForConditionalGeneration,
    TFTrainingArguments,
    TFTrainer,
)
# language_whisper, language, data_path and BATCH_SIZE are set earlier in my script
processor = WhisperProcessor.from_pretrained(
    "openai/whisper-tiny", language=language_whisper, task="transcribe"
)
model = TFWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")
feature_extractor = WhisperFeatureExtractor.from_pretrained("openai/whisper-tiny")
tokenizer = WhisperTokenizerFast.from_pretrained(
    "openai/whisper-tiny", language=language_whisper, task="transcribe"
)

dataset = load_from_disk(data_path)
n_train = dataset["train"].num_rows
max_steps = n_train // BATCH_SIZE
eval_and_save_steps = max_steps // 10

metric = evaluate.load("wer")
wandb.init(project="whisper-finetune")
training_args = TFTrainingArguments(
    output_dir=f"./whisper-tiny-{language}",
    per_device_train_batch_size=BATCH_SIZE,
    gradient_accumulation_steps=1,  # increase by 2x for every 2x decrease in batch size
    learning_rate=6.25e-6,
    weight_decay=0.01,
    warmup_steps=50,
    max_steps=max_steps,
    gradient_checkpointing=True,
    fp16=bool(tf.config.list_physical_devices("GPU")),  # fp16 expects a bool
    evaluation_strategy="steps",
    per_device_eval_batch_size=16,
    save_steps=eval_and_save_steps,
    eval_steps=eval_and_save_steps,
    report_to=["wandb"],
    load_best_model_at_end=True,
    metric_for_best_model="wer",
    greater_is_better=False,
    push_to_hub=False,
)
def preprocess_function(batch):
    # get the tokenized label sequences
    label_features = [{"input_ids": feature} for feature in batch["labels"]]
    # pad the labels to max length
    labels_batch = processor.tokenizer.pad(label_features, return_tensors="tf")
    # replace padding positions with -100 so they are ignored by the loss
    labels = tf.where(tf.not_equal(labels_batch.attention_mask, 1), -100, labels_batch.input_ids)
    # check if the first token of each sequence is the BOS token
    first_tokens_equal_bos = tf.reduce_all(tf.equal(labels[:, 0], processor.tokenizer.bos_token_id))
    # convert to a scalar boolean
    first_tokens_equal_bos = first_tokens_equal_bos.numpy()
    # if all sequences start with the BOS token, remove the first token
    if first_tokens_equal_bos:
        labels = labels[:, 1:]
    # pad the log-mel input features
    input_features = [
        {"input_features": feature} for feature in batch["input_features"]
    ]
    padded = processor.feature_extractor.pad(input_features, return_tensors="tf")
    # keep both the padded inputs and the labels in the returned batch
    batch["input_features"] = padded["input_features"]
    batch["labels"] = labels
    return batch
train_dataset = dataset["train"].map(preprocess_function, batched=True)
eval_dataset = dataset["validation"].map(preprocess_function, batched=True)

tf_ds_train = train_dataset.to_tf_dataset(
    columns=["input_features"],
    label_cols=["labels"],
    shuffle=True,
)
tf_ds_test = eval_dataset.to_tf_dataset(
    columns=["input_features"],
    label_cols=["labels"],
    shuffle=True,
)

# get_metric computes WER from the predictions (defined elsewhere in my script)
trainer = TFTrainer(
    args=training_args,
    model=model,
    train_dataset=tf_ds_train,
    eval_dataset=tf_ds_test,
    compute_metrics=get_metric,
)
Calling trainer.train() raises the following error:
ValueError: Trying to create optimizer slot variable under the scope for tf.distribute.Strategy (<tensorflow.python.distribute.one_device_strategy.OneDeviceStrategy object at 0x7ffe8c4a0550>), which is different from the scope used for the original variable (<tf.Variable 'tf_whisper_for_conditional_generation/model/encoder/conv1/kernel:0' shape=(3, 80, 384) dtype=float32
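I suspect this happens because I create the model outside the tf.distribute strategy that TFTrainer sets up internally; the old TFTrainer examples build the model inside training_args.strategy.scope(), roughly like this (untested on my side, training_args as defined above):

with training_args.strategy.scope():
    model = TFWhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny")

trainer = TFTrainer(
    args=training_args,
    model=model,
    train_dataset=tf_ds_train,
    eval_dataset=tf_ds_test,
    compute_metrics=get_metric,
)

I have not verified whether this is the right fix, though.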
Requirements:
Python 3.10.13
transformers==4.30.1
tensorflow==2.9.1
tensorflow-estimator==2.9.0
tensorflow-io-gcs-filesystem==0.37.1
Can anybody help?