I used a Sinhala-English dataset and trained a model to translate Java-related questions from English to Sinhala. It gives good training accuracy, but the problem is that after saving I am unable to load the model and use it to build a translation API. I saved the model in H5 format, and trying to load it gives the error shown below the code.
import os
import shutil
import subprocess
import warnings
from pathlib import Path
import numpy as np
import pandas as pd
import plotly.express as px
import tensorflow as tf
import tensorflow_datasets as tfds
from tensorflow import keras
from keras import layers
!pip install colorama
from colorama import Fore, Style
from IPython.core.display import HTML
warnings.filterwarnings("ignore")

# Styling constants referenced later (the colour values here are placeholders).
FONT_COLOR = "#141b4d"
BACKGROUND_COLOR = "#f6f5f5"
CLR = Style.RESET_ALL
RED = Fore.RED
BLUE = Fore.BLUE
data_dir = Path("data")

from google.colab import drive

drive.mount('/content/drive')

easy_dataset_path = Path("/content/drive/MyDrive/java_sinhala_vocabulary.csv")
easy_dataset = pd.read_csv(easy_dataset_path, encoding="utf-8", engine="pyarrow")
easy_dataset = easy_dataset.sample(len(easy_dataset), random_state=42)
easy_dataset.head()
easy_dataset.info()
easy_dataset["English Words in Sentence"] = (
easy_dataset["english"].str.split().apply(len)
)
easy_dataset["French Words in Sentence"] = (
easy_dataset["sinhala"].str.split().apply(len)
)
fig = px.histogram(
easy_dataset,
x=["English Words in Sentence", "French Words in Sentence"],
color_discrete_sequence=["#3f384a", "#e04c5f"],
labels={"variable": "Variable", "value": "Words in Sentence"},
marginal="box",
barmode="group",
height=540,
width=840,
title="Easy Dataset - Words in Sentence",
)
fig.update_layout(
font_color=FONT_COLOR,
title_font_size=18,
plot_bgcolor=BACKGROUND_COLOR,
paper_bgcolor=BACKGROUND_COLOR,
bargap=0.2,
bargroupgap=0.1,
legend=dict(orientation="h", yanchor="bottom", xanchor="right", y=1.02, x=1),
yaxis_title="Count",
)
fig.show()
sentences_en = easy_dataset["english"].to_numpy()
sentences_fr = easy_dataset["sinhala"].to_numpy()
valid_fraction = 0.1
valid_len = int(valid_fraction * len(easy_dataset))
sentences_en_train = sentences_en[:-valid_len]
sentences_fr_train = sentences_fr[:-valid_len]
sentences_en_valid = sentences_en[-valid_len:]
sentences_fr_valid = sentences_fr[-valid_len:]
def prepare_input_and_target(sentences_en, sentences_fr):
    """Return data in the format: `((encoder_input, decoder_input), target)`."""
    return (sentences_en, b"startofseq " + sentences_fr), sentences_fr + b" endofseq"
def from_sentences_dataset(
    sentences_en,
    sentences_fr,
    batch_size=32,
    cache=True,
    shuffle=False,
    shuffle_buffer_size=10_000,
    seed=None,
):
    dataset = tf.data.Dataset.from_tensor_slices((sentences_en, sentences_fr))
    dataset = dataset.map(prepare_input_and_target, num_parallel_calls=tf.data.AUTOTUNE)
    if cache:
        dataset = dataset.cache()
    if shuffle:
        dataset = dataset.shuffle(shuffle_buffer_size, seed=seed)
    return dataset.batch(batch_size)
benchmark_ds = from_sentences_dataset(sentences_en_train, sentences_fr_train)
benchmark_ds = benchmark_ds.prefetch(tf.data.AUTOTUNE)
bench_results = tfds.benchmark(benchmark_ds, batch_size=32)
example_ds = from_sentences_dataset(
sentences_en_train, sentences_fr_train, batch_size=4
)
list(example_ds.take(1))[0]
example_ds.cardinality() # Number of batches per epoch.
class ColoramaVerbose(keras.callbacks.Callback):
    def on_epoch_end(self, epoch, logs=None):
        print(
            f"{CLR}Epoch: {RED}{epoch + 1:02d}{CLR} -",
            f"{CLR}loss: {RED}{logs['loss']:.5f}{CLR} -",
            f"{CLR}accuracy: {RED}{logs['accuracy']:.5f}{CLR} -",
            f"{CLR}val_loss: {RED}{logs['val_loss']:.5f}{CLR} -",
            f"{CLR}val_accuracy: {RED}{logs['val_accuracy']:.5f}",
        )
def adapt_compile_and_fit(
    model,
    train_dataset,
    valid_dataset,
    n_epochs=50,
    n_patience=5,
    init_lr=0.001,
    lr_decay_rate=0.1,
    colorama_verbose=False,
):
    model.vectorization_en.adapt(
        train_dataset.map(
            lambda sentences, target: sentences[0],  # English sentences.
            num_parallel_calls=tf.data.AUTOTUNE,
        )
    )
    model.vectorization_fr.adapt(
        train_dataset.map(
            lambda sentences, target: sentences[1] + b" endofseq",  # Sinhala sentences.
            num_parallel_calls=tf.data.AUTOTUNE,
        )
    )
    train_dataset_prepared = train_dataset.map(
        lambda sentences, target: (sentences, model.vectorization_fr(target)),
        num_parallel_calls=tf.data.AUTOTUNE,
    ).prefetch(tf.data.AUTOTUNE)
    valid_dataset_prepared = valid_dataset.map(
        lambda sentences, target: (sentences, model.vectorization_fr(target)),
        num_parallel_calls=tf.data.AUTOTUNE,
    ).prefetch(tf.data.AUTOTUNE)
    early_stopping_cb = keras.callbacks.EarlyStopping(
        monitor="val_accuracy", patience=n_patience, restore_best_weights=True
    )
    # The line below doesn't work with multi-file interleaving.
    # n_decay_steps = n_epochs * train_dataset_prepared.cardinality().numpy()
    # Less elegant solution.
    n_decay_steps = n_epochs * len(list(train_dataset_prepared))
    scheduled_lr = keras.optimizers.schedules.ExponentialDecay(
        initial_learning_rate=init_lr,
        decay_steps=n_decay_steps,
        decay_rate=lr_decay_rate,
    )
    model_callbacks = [early_stopping_cb]
    verbose_level = 1
    if colorama_verbose:
        model_callbacks.append(ColoramaVerbose())
        verbose_level = 0
    model.compile(
        loss="sparse_categorical_crossentropy",
        optimizer=keras.optimizers.RMSprop(learning_rate=scheduled_lr),
        metrics=["accuracy"],
    )
    return model.fit(
        train_dataset_prepared,
        epochs=n_epochs,
        validation_data=valid_dataset_prepared,
        callbacks=model_callbacks,
        verbose=verbose_level,
    )
def translate(model, sentence_en):
    translation = ""
    for word_idx in range(model.max_sentence_len):
        X_encoder = np.array([sentence_en])
        X_decoder = np.array(["startofseq " + translation])
        # Last token's probas.
        y_proba = model.predict((X_encoder, X_decoder), verbose=0)[0, word_idx]
        predicted_word_id = np.argmax(y_proba)
        predicted_word = model.vectorization_fr.get_vocabulary()[predicted_word_id]
        if predicted_word == "endofseq":
            break
        translation += " " + predicted_word
    return translation.strip()
class BidirectionalEncoderDecoderWithAttention(keras.Model):
    def __init__(
        self,
        vocabulary_size=5000,
        max_sentence_len=50,
        embedding_size=256,
        n_units_lstm=512,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.max_sentence_len = max_sentence_len
        self.vectorization_en = layers.TextVectorization(
            vocabulary_size, output_sequence_length=max_sentence_len
        )
        self.vectorization_fr = layers.TextVectorization(
            vocabulary_size, output_sequence_length=max_sentence_len
        )
        self.encoder_embedding = layers.Embedding(
            vocabulary_size, embedding_size, mask_zero=True
        )
        self.decoder_embedding = layers.Embedding(
            vocabulary_size, embedding_size, mask_zero=True
        )
        self.encoder = layers.Bidirectional(
            layers.LSTM(n_units_lstm // 2, return_sequences=True, return_state=True)
        )
        self.decoder = layers.LSTM(n_units_lstm, return_sequences=True)
        self.attention = layers.Attention()
        self.output_layer = layers.Dense(vocabulary_size, activation="softmax")

    def call(self, inputs):
        encoder_inputs, decoder_inputs = inputs
        encoder_input_ids = self.vectorization_en(encoder_inputs)
        decoder_input_ids = self.vectorization_fr(decoder_inputs)
        encoder_embeddings = self.encoder_embedding(encoder_input_ids)
        decoder_embeddings = self.decoder_embedding(decoder_input_ids)
        # The final hidden state of the encoder, representing the entire
        # input sequence, is used to initialize the decoder.
        encoder_output, *encoder_state = self.encoder(encoder_embeddings)
        encoder_state = [
            tf.concat(encoder_state[0::2], axis=-1),  # Short-term state (0 & 2).
            tf.concat(encoder_state[1::2], axis=-1),  # Long-term state (1 & 3).
        ]
        decoder_output = self.decoder(decoder_embeddings, initial_state=encoder_state)
        attention_output = self.attention([decoder_output, encoder_output])
        return self.output_layer(attention_output)
keras.backend.clear_session()  # Resets all state generated by Keras.
tf.random.set_seed(42) # Ensure reproducibility on CPU.
easy_train_ds = from_sentences_dataset(
sentences_en_train, sentences_fr_train, shuffle=True, seed=42
)
easy_valid_ds = from_sentences_dataset(sentences_en_valid, sentences_fr_valid)
bidirect_encoder_decoder = BidirectionalEncoderDecoderWithAttention(max_sentence_len=15)
bidirect_history = adapt_compile_and_fit(
bidirect_encoder_decoder,
easy_train_ds,
easy_valid_ds,
init_lr=0.01,
lr_decay_rate=0.01,
colorama_verbose=True,
)
fig = px.line(
bidirect_history.history,
markers=True,
height=540,
width=840,
symbol="variable",
labels={"variable": "Variable", "value": "Value", "index": "Epoch"},
title="Easy Dataset - Encoder-Decoder RNN Training Process",
color_discrete_sequence=px.colors.diverging.balance_r,
)
fig.update_layout(
font_color=FONT_COLOR,
title_font_size=18,
plot_bgcolor=BACKGROUND_COLOR,
paper_bgcolor=BACKGROUND_COLOR,
)
fig.show()
translation1 = translate(bidirect_encoder_decoder, "Hello, how are you?")
translation2 = translate(bidirect_encoder_decoder, "This is a test sentence")
translation3 = translate(bidirect_encoder_decoder, "you will receive a confirmation code after completing the registration.")
print(CLR + "Actual Possible Translations:")
print(BLUE + "Hello, how are you?".ljust(25), RED + "-> ", BLUE + "හෙලෝ, ඔයාට කෙසේද?")
print(
BLUE + "This is a test sentence".ljust(25),
RED + "-> ",
BLUE + "මෙය පරීක්ෂණ වාක්යයකි.",
)
print(
BLUE + "you will receive a confirmation code after completing the registration".ljust(25),
RED + "-> ",
BLUE + "ලියාපදිංචිය සම්පූර්ණ කලාට පස්සෙ ඔයාට තහවුරු කිරීමේ කේතයක් හම්බවේවි",
)
print()
print(CLR + "Model Translations:")
print(BLUE + "Hello, how are you?".ljust(25), RED + "-> ", BLUE + translation1)
print(BLUE + "This is a test sentence".ljust(25), RED + "-> ", BLUE + translation2)
print(BLUE + "you will receive a confirmation code after completing the registration".ljust(25), RED + "-> ", BLUE + translation3)
PS D:\app> python model.py
2024-07-21 00:08:39.758670: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-07-21 00:08:40.515736: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
Traceback (most recent call last):
File "D:appmodel.py", line 15, in <module>
model = tf.keras.models.load_model(model_path)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
Please help me build an API from this saved model. I want the API to return the Sinhala meaning of a given English sentence. In which format do we need to save the model?
This is how I tried to develop the API:
import numpy as np
import tensorflow as tf
from fastapi import FastAPI, HTTPException
app = FastAPI()
# Load the model
model_path = "english_sinhala_transformer_model.h5"
model = tf.keras.models.load_model(model_path)
def translate(sentence_en):
    translation = ""
    for word_idx in range(model.max_sentence_len):
        X_encoder = np.array([sentence_en])
        X_decoder = np.array(["startofseq " + translation])
        y_proba = model.predict((X_encoder, X_decoder), verbose=0)[0, word_idx]
        predicted_word_id = np.argmax(y_proba)
        predicted_word = model.vectorization_fr.get_vocabulary()[predicted_word_id]
        if predicted_word == "endofseq":
            break
        translation += " " + predicted_word
    return translation.strip()
@app.get("/translate")
def get_translation(sentence: str):
    try:
        translation = translate(sentence)
        return {"translation": translation}
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
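For completeness, this is how the endpoint is meant to be started and queried locally once the model loads (the file is model.py, as in the traceback above; using uvicorn and the requests library here is an assumption):

# Start the server from the project folder:
#   uvicorn model:app --reload
import requests

response = requests.get(
    "http://127.0.0.1:8000/translate",
    params={"sentence": "Hello, how are you?"},
)
print(response.json())  # Expected shape: {"translation": "..."}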