I have been trying to train a model to extract handwritten text from images. For this I am using a hybrid model: a ResNet for feature extraction and an LSTM for handling the character sequences. However, I am running into an error that I have not been able to fix.
This is the error I am getting (screenshot):
Here is my full code:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Lambda, Bidirectional, LSTM, Dropout, Reshape, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.sequence import pad_sequences
import cv2
import os
from sklearn.model_selection import train_test_split
# Step 1: Character Mapping
characters = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"
char_to_num = {char: i for i, char in enumerate(characters)}
num_classes = len(characters) + 1 # Include blank label for CTC
max_text_length = 11 # Adjust based on your dataset
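# For illustration, assuming the mapping above: char_to_num['A'] == 0,
# char_to_num['a'] == 26, char_to_num['0'] == 52; index 62 (num_classes - 1)
# is left free for the CTC blank label, which Keras places last by convention.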
# Step 2: Image Preprocessing Function
def load_and_process_image(image_path, img_height=32, img_width=256):
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    img = cv2.resize(img, (img_width, img_height))
    img = img.astype('float32') / 255.0
    img = np.expand_dims(img, axis=-1)  # Add channel dimension
    return img
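# For reference: the returned array has shape (img_height, img_width, 1),
# i.e. (32, 256, 1) with the defaults, and pixel values scaled to [0, 1].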
# Step 3: Convert Text to Sequence
def text_to_sequence(text, char_to_num):
    return [char_to_num[char] for char in text if char in char_to_num]
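# Example (with the mapping from Step 1): text_to_sequence("AB1", char_to_num)
# returns [0, 1, 53]; characters missing from the mapping are silently dropped.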
# Step 4: Load Dataset
def load_dataset(image_dir, label_dir, char_to_num, max_text_length, img_height=32, img_width=256):
    image_data = []
    label_data = []
    label_lengths = []
    for img_file in os.listdir(image_dir):
        if img_file.endswith(".png") or img_file.endswith(".jpg"):
            img_path = os.path.join(image_dir, img_file)
            image = load_and_process_image(img_path, img_height, img_width)
            image_data.append(image)
            label_file = img_file.replace(".png", ".txt").replace(".jpg", ".txt")
            label_path = os.path.join(label_dir, label_file)
            with open(label_path, 'r') as f:
                text = f.readline().strip()
            sequence = text_to_sequence(text, char_to_num)
            label_length = len(sequence)
            padded_sequence = pad_sequences([sequence], maxlen=max_text_length, padding='post')[0]
            label_data.append(padded_sequence)
            label_lengths.append(label_length)  # Store actual label length before padding
    image_data = np.array(image_data, dtype=np.float32)
    label_data = np.array(label_data, dtype=np.int32)
    label_lengths = np.array(label_lengths, dtype=np.int32)
    return image_data, label_data, label_lengths
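# Resulting shapes: image_data (N, 32, 256, 1), label_data (N, max_text_length),
# label_lengths (N,), where N is the number of matched image/label pairs.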
# Step 5: Build the Model
def build_resnet_feature_extractor(input_shape):
    base_model = ResNet50(include_top=False, input_shape=input_shape)
    return Model(inputs=base_model.input, outputs=base_model.output)
def build_bilstm_layer(feature_extractor_output, lstm_units=256):
    lstm = Bidirectional(LSTM(lstm_units, return_sequences=True))(feature_extractor_output)
    lstm = Dropout(0.25)(lstm)
    return lstm
def apply_attention_layer(lstm_output):
    attention = tf.keras.layers.Attention()([lstm_output, lstm_output])
    return attention
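# Note: tf.keras.layers.Attention with [lstm_output, lstm_output] is
# self-attention (query == value), so the output keeps lstm_output's shape.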
def ctc_loss_lambda_func(args):
    labels, y_pred, input_length, label_length = args
    labels = tf.cast(labels, dtype=tf.int32)
    input_length = tf.cast(input_length, dtype=tf.int32)
    label_length = tf.cast(label_length, dtype=tf.int32)
    # Ensure label_length is squeezed to the expected shape
    label_length = tf.squeeze(label_length, axis=-1)  # Squeeze to ensure it's 1D
    return tf.keras.backend.ctc_batch_cost(labels, y_pred, input_length, label_length)
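# Per the Keras docs, ctc_batch_cost expects labels of shape (batch, max_label_len),
# y_pred of shape (batch, time_steps, num_classes), and input_length / label_length
# as (batch, 1) tensors.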
def build_ctc_loss_model(inputs, outputs, max_text_length):
    labels = Input(shape=(max_text_length,), name='labels')
    input_length = Input(shape=(1,), name='input_length')
    label_length = Input(shape=(1,), name='label_length')
    ctc_loss = Lambda(ctc_loss_lambda_func, output_shape=(1,), name='ctc')([labels, outputs, input_length, label_length])
    return Model(inputs=[inputs, labels, input_length, label_length], outputs=ctc_loss)
def build_handwriting_ocr_model(input_shape, max_text_length, lstm_units=256):
    inputs = Input(shape=input_shape, name='image_input')
    # Concatenate the grayscale image to 3 channels
    concatenated = Concatenate()([inputs, inputs, inputs])
    # Feature extraction with ResNet50
    feature_extractor = build_resnet_feature_extractor((input_shape[0], input_shape[1], 3))  # update input to 3 channels
    features = feature_extractor(concatenated)
    # Reshape for LSTM input
    features = Reshape((-1, features.shape[-1]))(features)
    # BiLSTM + Dropout
    lstm_output = build_bilstm_layer(features, lstm_units=lstm_units)
    # Attention Layer
    attention_output = apply_attention_layer(lstm_output)
    # Output Layer
    outputs = Dense(num_classes, activation='softmax', name='output')(attention_output)
    # CTC Loss Model
    ctc_model = build_ctc_loss_model(inputs, outputs, max_text_length)
    return ctc_model
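# Shape walk-through (assuming a (32, 256, 1) input): ResNet50 has a total
# stride of 32, so its feature map is (1, 8, 2048); the Reshape then gives 8
# time steps of 2048 features, and the softmax output is (batch, 8, num_classes).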
# Step 6: Load Data and Split
image_dir = '/content/drive/MyDrive/ifsc_croped'
label_dir = '/content/drive/MyDrive/ifsc_cropped_txt'
train_images, train_labels, train_label_lengths = load_dataset(image_dir, label_dir, char_to_num, max_text_length)
train_images, val_images, train_labels, val_labels, train_label_lengths, val_label_lengths = train_test_split(
    train_images, train_labels, train_label_lengths, test_size=0.2, random_state=42
)
# Step 7: Padding Function
def pad_labels(label_sequences, max_text_length):
    return pad_sequences(label_sequences, maxlen=max_text_length, padding='post')
# Step 8: Data Generator
def data_generator(images, labels, label_lengths, batch_size, max_text_length):
    downsample_factor = 8  # Adjust based on your model
    time_steps = images.shape[2] // downsample_factor
    while True:
        for i in range(0, len(images), batch_size):
            batch_images = images[i:i + batch_size]
            batch_labels = labels[i:i + batch_size]
            batch_label_lengths = label_lengths[i:i + batch_size]
            input_lengths = np.ones((len(batch_images), 1), dtype=np.int32) * time_steps
            label_lengths_batch = np.array(batch_label_lengths, dtype=np.int32).reshape(-1, 1)  # Ensure shape is (batch_size, 1)
            inputs = {
                'image_input': batch_images,
                'labels': batch_labels,
                'input_length': input_lengths,
                'label_length': label_lengths_batch
            }
            outputs = np.zeros((len(batch_images), 1))  # Dummy outputs for CTC
            yield (inputs, outputs)
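# Each yielded batch maps the model's named inputs to arrays of shape
# (batch, 32, 256, 1), (batch, max_text_length), (batch, 1) and (batch, 1),
# plus a dummy all-zeros target for the CTC output.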
# Step 9: Compile and Train the Model
input_shape = (32, 256, 1) # Adjust according to your images
model = build_handwriting_ocr_model(input_shape, max_text_length)
model.compile(optimizer=tf.keras.optimizers.Adam())
# Define training parameters
batch_size = 32
epochs = 20
history = model.fit(
    data_generator(train_images, train_labels, train_label_lengths, batch_size, max_text_length),
    validation_data=data_generator(val_images, val_labels, val_label_lengths, batch_size, max_text_length),
    steps_per_epoch=len(train_images) // batch_size,
    validation_steps=len(val_images) // batch_size,
    epochs=epochs
)