Hello, I'm currently working on a web system prototype that predicts the accuracy of a user's pronunciation of certain words.
The model trained with the code below reports a final model accuracy of 0.992, a precision of 0.996, and an F1 score of 0.993. I'm using a CNN architecture for this model, and the dataset has a total of 500 audio recordings covering 10 different words.
I'm using this trained model in a web server built with the Flask framework, which predicts the user's pronunciation accuracy for a given word. The words are: (one syllable) "Sup", "Bas", "Jam", "Wang", "Sos" and (two syllables) "Roti", "Ayam", "Pintu", "Kipas", "Sampah". The user clicks the start-recording button, pronounces the word, then clicks the stop-recording button. The system then predicts the accuracy of the pronounced word and gives the user a percentage value as feedback.
The problem is that when I use the trained model in the web app, the system does not seem to judge the correctness of the user's pronunciation accurately. For example, if the given word is "Sup" and I pronounce "Batu", the system should flag it as incorrect and return a low accuracy percentage, but instead it gives me a high percentage (like 97.5% or 100%). Even when I deliberately pronounce the word in different, incorrect ways, it still returns a range of high percentages.
Below I provide the model training and server code.
- Model Training (Model_Training.py)
import os
import numpy as np
import librosa
import seaborn as sns
import matplotlib.pyplot as plt
import joblib
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, Flatten, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.optimizers import Adam
# Add noise with different types
def add_noise(audio_data, noise_level=0.005):
    noise = np.random.normal(0, 1, len(audio_data))
    scaled_noise = noise_level * noise
    return audio_data + scaled_noise
# Add reverberation (note: librosa has no reverb effect; this actually applies
# a pre-emphasis filter, i.e. a spectral tilt, not true reverberation)
def add_reverb(audio_data, sr, intensity=0.5):
    return librosa.effects.preemphasis(audio_data, coef=intensity)
# Function to preprocess audio data (defined here but never called below; the
# dataset loader computes MFCCs inline instead)
def preprocess_audio(file_path):
    audio_data, sr = librosa.load(file_path, sr=None)
    audio_data = librosa.effects.preemphasis(audio_data)
    audio_data = librosa.util.normalize(audio_data)
    mfccs = librosa.feature.mfcc(y=audio_data, sr=sr, n_mfcc=20)
    return mfccs.T  # Transpose for Conv1D layer
# Function to augment audio data
def augment_audio(audio_data, sr):
    augmented_audios = []
    # Existing augmentations
    augmented_audios.append(audio_data + 0.005 * np.random.randn(len(audio_data)))  # Noise
    augmented_audios.append(np.roll(audio_data, int(np.random.uniform(-0.1, 0.1) * sr)))  # Time shifting
    # Pitch shifting (note: np.random.randint's upper bound is exclusive, so this draws from -2..1 semitones)
    augmented_audios.append(librosa.effects.pitch_shift(audio_data, sr=sr, n_steps=np.random.randint(-2, 2)))
    augmented_audios.append(audio_data)  # Original audio
    # Advanced augmentations
    augmented_audios.append(add_noise(audio_data))
    augmented_audios.append(add_reverb(audio_data, sr))
    return augmented_audios
# Function to load and preprocess a dataset of audio files
def load_and_preprocess_dataset(dataset_folder):
    X, y = [], []
    max_length = 222  # starting value; can grow below if any clip yields more frames
    for root, dirs, files in os.walk(dataset_folder):
        for directory in dirs:
            word_folder = os.path.join(root, directory)
            for participant_folder in os.listdir(word_folder):
                participant_folder_path = os.path.join(word_folder, participant_folder)
                if os.path.isdir(participant_folder_path):
                    for filename in os.listdir(participant_folder_path):
                        if filename.endswith(".wav"):
                            file_path = os.path.join(participant_folder_path, filename)
                            audio_data, sr = librosa.load(file_path, sr=None)
                            augmented_audios = augment_audio(audio_data, sr)
                            for audio in augmented_audios:
                                mfccs_features = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=20).T
                                max_length = max(max_length, mfccs_features.shape[0])
                                X.append(mfccs_features)
                                label = 1 if "Correct" in filename else 0
                                y.append(label)
    # Pad the sequences to have the same length
    X_padded = []
    for mfccs_features in X:
        if mfccs_features.shape[0] < max_length:
            pad_width = max_length - mfccs_features.shape[0]
            mfccs_features = np.pad(mfccs_features, ((0, pad_width), (0, 0)), mode='constant')
        else:
            mfccs_features = mfccs_features[:max_length, :]
        X_padded.append(mfccs_features)
    X_reshaped = np.array(X_padded).reshape(-1, max_length * 20)  # matches your max_length * n_mfcc
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_reshaped)
    X_scaled = X_scaled.reshape(len(X_padded), max_length, 20)
    return X_scaled, np.array(y)
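# For reference, the walker above expects a layout roughly like the one below,
# with the Correct/incorrect label taken from each .wav filename. The folder
# and file names here are illustrative, not from the original post:
#
#   Word_Dataset/
#       Sup/
#           Participant01/
#               Sup_Correct_01.wav
#               Sup_Wrong_01.wav
#       Roti/
#           ...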
# Path to the dataset folder
dataset_folder = r'C:\Users\ACER\Desktop\Word_Dataset'
# Load and preprocess the dataset
X, y = load_and_preprocess_dataset(dataset_folder)
# Standardize the features (note: X returned above was already standardized
# inside load_and_preprocess_dataset, so this fits a second scaler on already
# scaled data; it is this second scaler that gets saved below)
scaler = StandardScaler()
X_reshaped = X.reshape(-1, X.shape[1] * X.shape[2])
X_scaled = scaler.fit_transform(X_reshaped)
X_scaled = X_scaled.reshape(X.shape[0], X.shape[1], X.shape[2])
# Define a simple CNN model
def create_model(learning_rate=0.0005):
    model = Sequential()
    model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(X_scaled.shape[1], X_scaled.shape[2])))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.4))
    model.add(Conv1D(128, kernel_size=3, activation='relu'))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.4))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    optimizer = Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model
# Cross-validation
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
all_accuracies = []
for train_index, test_index in skf.split(X_scaled, y):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y[train_index], y[test_index]
    class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
    class_weight_dict = dict(enumerate(class_weights))
    model = create_model(learning_rate=0.0005)
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
    reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6)
    history = model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stopping, reduce_lr], class_weight=class_weight_dict)
    loss, accuracy = model.evaluate(X_test, y_test)
    all_accuracies.append(accuracy)
    print(f"Fold Accuracy: {accuracy}")
# Final evaluation on the entire dataset (note: this evaluates the last fold's
# model on data it has already seen during training, so the score is optimistic)
loss, accuracy = model.evaluate(X_scaled, y)
print("Final Model Accuracy:", accuracy)
# Save the trained model
MODEL_DIR = os.path.join(os.path.dirname(os.path.abspath(__file__)), '..', 'models')
os.makedirs(MODEL_DIR, exist_ok=True)
MODEL_PATH = os.path.join(MODEL_DIR, 'pronunciation_assessment_model_final4.h5')
model.save(MODEL_PATH)
print(f"Model saved successfully at {MODEL_PATH}")
# Save the scaler
SCALER_PATH = os.path.join(MODEL_DIR, 'scaler.pkl')
joblib.dump(scaler, SCALER_PATH)
print(f"Scaler saved successfully at {SCALER_PATH}")
# Predict labels for the dataset
y_pred = model.predict(X_scaled)
y_pred_classes = (y_pred > 0.5).astype("int32")
# Calculate evaluation metrics
accuracy = accuracy_score(y, y_pred_classes)
precision = precision_score(y, y_pred_classes)
recall = recall_score(y, y_pred_classes)
f1 = f1_score(y, y_pred_classes)
print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
# Generate confusion matrix
cm = confusion_matrix(y, y_pred_classes)
# Plot confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=["Incorrect", "Correct"], yticklabels=["Incorrect", "Correct"])
plt.xlabel("Predicted labels")
plt.ylabel("True labels")
plt.title("Confusion Matrix")
plt.show()
# Print classification report
print(classification_report(y, y_pred_classes, target_names=["Incorrect", "Correct"]))
- Server (Pronounce_System.py)
from flask import Flask, render_template, request, jsonify, session, redirect, url_for
from datetime import timedelta
import os
import librosa
import numpy as np
from tensorflow.keras.models import load_model
from sklearn.preprocessing import StandardScaler
import joblib # Import joblib for saving and loading objects
import random
app = Flask(__name__, static_folder='../static', static_url_path='/static')
app.secret_key = 'f3cfe9ed8fae309f02079dbf' # Set a secret key for session management
# Set session to expire after 30 minutes of inactivity
app.config['PERMANENT_SESSION_LIFETIME'] = timedelta(minutes=30)
CURRENT_DIR = os.path.dirname(os.path.abspath(__file__))
TEMPLATES_DIR = os.path.join(CURRENT_DIR, '..', 'templates')
app.template_folder = TEMPLATES_DIR
MODEL_PATH = os.path.join(CURRENT_DIR, '..', 'models', 'pronunciation_assessment_model_final4.h5')
pronunciation_model = load_model(MODEL_PATH)
SCALER_PATH = os.path.join(CURRENT_DIR, '..', 'models', 'scaler.pkl')
scaler = joblib.load(SCALER_PATH)
# Function to preprocess user audio input
def preprocess_user_input(audio_data):
    audio_data = audio_data.astype(np.float32) / np.iinfo(np.int16).max
    audio_data = librosa.effects.preemphasis(audio_data)
    audio_data = librosa.util.normalize(audio_data)
    mfccs = librosa.feature.mfcc(y=audio_data, sr=22050, n_mfcc=20).T
    max_length = 222  # Adjust this to match training exactly
    print("MFCCs shape before padding/truncating:", mfccs.shape)  # Debugging
    if mfccs.shape[0] < max_length:
        pad_width = max_length - mfccs.shape[0]
        mfccs = np.pad(mfccs, ((0, pad_width), (0, 0)), mode='constant')
    else:
        mfccs = mfccs[:max_length, :]
    print("MFCCs shape after padding/truncating:", mfccs.shape)  # Debugging
    # Flatten and reshape as done in training
    mfccs_flattened = mfccs.flatten()
    print("Features flattened shape:", mfccs_flattened.shape)  # Debugging
    mfccs_scaled = scaler.transform(mfccs_flattened.reshape(1, -1))
    mfccs_scaled = mfccs_scaled.reshape(1, max_length, 20)
    return mfccs_scaled
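# Hypothetical shape check (my addition, not in the original post): one second
# of int16 silence should come out as a (1, 222, 20) tensor, provided the saved
# scaler was fitted with max_length = 222; uncomment to verify.
# _dummy = np.zeros(22050, dtype=np.int16)
# print(preprocess_user_input(_dummy).shape)  # expected: (1, 222, 20)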
@app.route('/')
def index():
    return render_template('index.html')
# Words lists for different categories
WORDS_SATU = ["Sup", "Bas", "Jam", "Wang", "Sos"]
WORDS_DUA = ["Roti", "Ayam", "Pintu", "Kipas", "Sampah"]
# Encouragement phrases shown to the user (in Malay), keyed by the accuracy
# percentage range of the prediction
feedback_phrases = {
    '0-10': [
        "Jangan risau, cuba lagi. Awak pasti boleh lakukannya dengan lebih baik!",
        "Tak mengapa, latihan membuat awak lebih bagus. Mari kita cuba lagi!"
    ],
    '10-20': [
        "Bagus usaha awak! Mari cuba sekali lagi, ya?",
        "Sudah hampir betul, mari kita ulang semula"
    ],
    '20-30': [
        "Awak boleh melakukannya! Cuba sebut sekali lagi",
        "Awak sudah mula faham. Teruskan berusaha!"
    ],
    '30-40': [
        "Bagus! Mari kita cuba lagi",
        "Semakin baik! Sedikit lagi untuk sebutan yang betul"
    ],
    '40-50': [
        "Hebat! Kita cuba sekali lagi untuk sebutan lebih bagus",
        "Awak semakin pandai! Teruskan berlatih"
    ],
    '50-60': [
        "Bagus! Teruskan usaha, kita cuba lagi sekali",
        "Awak hampir berjaya! Sikit lagi pasti boleh"
    ],
    '60-70': [
        "Hebat! Sebutan awak makin bagus",
        "Tahniah! Sebutan awak sudah bagus"
    ],
    '70-80': [
        "Bagus sekali! Kamu semakin pandai",
        "Wow! Sebutan yang bagus. Teruskan usaha!"
    ],
    '80-90': [
        "Hebat! Sebutan awak sangat bagus",
        "Sangat bagus! Sebutan awak sudah hampir sempurna"
    ],
    '90-100': [
        "Cemerlang! Sebutan awak sangat bagus!",
        "Luar biasa! Awak sebut dengan betul"
    ]
}
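The server snippet above ends before the prediction route itself, so here is a minimal sketch of how the pieces could be tied together. The endpoint name /predict, the form field 'audio', and the use of the soundfile library to decode the upload are my assumptions, since that part of the code is not shown in the post; it also assumes the client sends WAV/PCM audio (raw browser MediaRecorder output may need conversion first).

import soundfile as sf  # assumed dependency, not in the original imports

@app.route('/predict', methods=['POST'])
def predict():
    # Decode the uploaded recording into int16 samples, the format
    # preprocess_user_input expects before it rescales to float32
    audio_data, _sr = sf.read(request.files['audio'], dtype='int16')
    if audio_data.ndim > 1:
        audio_data = audio_data[:, 0]  # keep the first channel if stereo
    features = preprocess_user_input(audio_data)
    # The model outputs a single sigmoid probability for the "Correct" class
    probability = float(pronunciation_model.predict(features)[0][0])
    percentage = round(probability * 100, 1)
    # Map the percentage onto one of the ten feedback buckets defined above
    bucket = min(int(percentage // 10), 9)
    phrase = random.choice(feedback_phrases[f"{bucket * 10}-{bucket * 10 + 10}"])
    return jsonify({'accuracy': percentage, 'feedback': phrase})

if __name__ == '__main__':
    app.run(debug=True)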
I'm trying to make the system correctly predict the accuracy of the user's pronounced word.
If the current word is "Sup" and the user pronounces it as "Jam" or "Sip", then the accuracy should be low, and vice versa.
Please help me if you have any idea what is causing this problem. Thank you in advance and have a great day!