I have a deep neural network that I am trying to use to classify sign-language digits (0-9) in 64×64 pictures. The cost always converges to around 2.3, which corresponds to about 10% accuracy, i.e. random guessing.
I tested the model on a generated dataset with 20 features and 4 classes, and it works fine there: the decision boundaries look right and the cost drops to a low value. However, when I run it on my dataset, the cost always converges to 2.3. I have adjusted the learning rate, changed the layer dimensions, and rechecked my network code, but I can't find where it's going wrong. I also made sure the X and Y .npy files I am loading are correct and have the right shapes.
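For what it's worth, 2.3 is almost exactly the cross-entropy of a uniform prediction over 10 classes, so the network appears to be learning nothing at all:

import numpy as np
print(-np.log(1 / 10))  # ≈ 2.3026, the cost of predicting 0.1 for every class

Here is my network code (src/deep_nn.py):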
import numpy as np
def initialize_layer_parameters(layer_dims):
    parameters = {}
    L = len(layer_dims)  # number of layers in the network
    # Initialize parameters for each layer
    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) * 0.1
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))
        assert(parameters['W' + str(l)].shape == (layer_dims[l], layer_dims[l - 1]))
        assert(parameters['b' + str(l)].shape == (layer_dims[l], 1))
    return parameters
def sigmoid(Z):
    A = 1 / (1 + np.exp(-Z))
    cache = Z
    return A, cache
def relu(Z):
    A = np.maximum(0, Z)
    cache = Z
    return A, cache
def softmax(Z):
    Z_shifted = Z - np.max(Z, axis=0, keepdims=True)  # To prevent overflow
    exp_Z = np.exp(Z_shifted)
    A = exp_Z / np.sum(exp_Z, axis=0, keepdims=True)
    cache = Z
    return A, cache
def relu_backward(dA, cache):
    Z = cache
    dZ = np.array(dA, copy=True)
    dZ[Z <= 0] = 0
    assert (dZ.shape == Z.shape)
    return dZ
def sigmoid_backward(dA, cache):
    Z = cache
    s = 1 / (1 + np.exp(-Z))
    dZ = dA * s * (1 - s)
    assert (dZ.shape == Z.shape)
    return dZ
def softmax_backward(dAL, cache):
    Z = cache
    m = Z.shape[1]
    dZ = dAL / m
    return dZ
def one_hot_encode(Y, num_classes):
    Y = np.array(Y).astype(int)  # Ensure Y is an array of integers
    Y = Y.reshape(-1)  # Flatten Y to be a 1D array if needed
    if np.any(Y >= num_classes) or np.any(Y < 0):
        raise ValueError("Labels must be in the range [0, num_classes-1]")
    Y_encoded = np.eye(num_classes)[Y]
    return Y_encoded.T
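# Note: one_hot_encode returns shape (num_classes, m), one column per example,
# which matches the (num_classes, m) softmax output AL from forward_propagation.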
def forward_linear(A, W, b):
    Z = np.dot(W, A) + b
    cache = (A, W, b)
    return Z, cache
def forward_activation(A_prev, W, b, activation):
    if activation == "sigmoid":
        Z, linear_cache = forward_linear(A_prev, W, b)
        A, activation_cache = sigmoid(Z)
    elif activation == "relu":
        Z, linear_cache = forward_linear(A_prev, W, b)
        A, activation_cache = relu(Z)
    elif activation == "softmax":
        Z, linear_cache = forward_linear(A_prev, W, b)
        A, activation_cache = softmax(Z)
    else:
        raise ValueError("Invalid activation function!")
    cache = (linear_cache, activation_cache)
    return A, cache
def forward_propagation(X, parameters):
    caches = []
    A = X
    L = len(parameters) // 2  # number of layers in the network
    activation = "sigmoid" if classification_type == "binary" else "softmax"
    # Forward propagation for each hidden layer (ReLU)
    for l in range(1, L):
        A_prev = A
        A, cache = forward_activation(A_prev, parameters['W' + str(l)], parameters['b' + str(l)], "relu")
        caches.append(cache)
    # AL is the final output layer
    AL, cache = forward_activation(A, parameters['W' + str(L)], parameters['b' + str(L)], activation)
    caches.append(cache)
    return AL, caches
def cost_function(AL, Y):
    if classification_type == "binary":
        cost = -np.mean(np.multiply(Y, np.log(AL)) + np.multiply((1 - Y), np.log(1 - AL)))
    elif classification_type == "multivariable":
        epsilon = 1e-8  # Small constant to prevent log(0)
        AL = np.clip(AL, epsilon, 1 - epsilon)  # Clip AL to avoid log(0) and division by zero
        cost = -np.sum(Y * np.log(AL)) / Y.shape[1]
    else:
        raise ValueError("Invalid classification type!")
    return cost
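# For the multi-class ("multivariable") case this is the categorical cross-entropy
# J = -(1/m) * sum_i sum_k Y[k, i] * log(AL[k, i]),
# where m = Y.shape[1] is the number of examples (columns).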
def backward_linear(dZ, cache):
    A_prev, W, b = cache
    m = A_prev.shape[1]
    dW = 1/m * np.dot(dZ, A_prev.T)
    db = 1/m * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)
    return dA_prev, dW, db
def backward_activation(dA, cache, activation):
    linear_cache, activation_cache = cache
    if activation == "relu":
        dZ = relu_backward(dA, activation_cache)
        dA_prev, dW, db = backward_linear(dZ, linear_cache)
    elif activation == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)
        dA_prev, dW, db = backward_linear(dZ, linear_cache)
    elif activation == "softmax":
        dZ = softmax_backward(dA, activation_cache)
        dA_prev, dW, db = backward_linear(dZ, linear_cache)
    else:
        raise ValueError("Invalid activation function!")
    return dW, dA_prev, db
def backward_propagation(AL, Y, caches):
    grads = {}
    L = len(caches)
    activation = "sigmoid" if classification_type == "binary" else "softmax"
    if activation == "sigmoid":
        dAL = -(np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    if activation == "softmax":
        dAL = AL - Y
    # Gradients of the output layer
    current_cache = caches[L - 1]
    dW_temp, dA_prev_temp, db_temp = backward_activation(dAL, current_cache, activation)
    grads["dW" + str(L)] = dW_temp
    grads["dA" + str(L-1)] = dA_prev_temp
    grads["db" + str(L)] = db_temp
    # Gradients for the remaining layers
    for l in reversed(range(L-1)):
        dW_temp, dA_prev_temp, db_temp = backward_activation(grads["dA" + str(l+1)], caches[l], "relu")
        grads["dW" + str(l + 1)] = dW_temp
        grads["dA" + str(l)] = dA_prev_temp
        grads["db" + str(l + 1)] = db_temp
    return grads
def update_parameters(parameters, grads, learning_rate):
    L = len(parameters) // 2  # number of layers in the neural network
    # Update each parameter by layer
    for l in range(L):
        parameters["W" + str(l+1)] -= learning_rate * grads["dW" + str(l+1)]
        parameters["b" + str(l+1)] -= learning_rate * grads["db" + str(l+1)]
    return parameters
def deep_nn_model(X, Y, num_iterations, layer_dims, learning_rate, classification_method):
    costs = []
    parameters = initialize_layer_parameters(layer_dims)
    global classification_type
    classification_type = classification_method
    if classification_type == "multivariable":
        Y = one_hot_encode(Y, layer_dims[-1])
    for i in range(0, num_iterations):
        AL, caches = forward_propagation(X, parameters)
        cost = cost_function(AL, Y)
        grads = backward_propagation(AL, Y, caches)
        parameters = update_parameters(parameters, grads, learning_rate)
        if i % 100 == 0 or i == num_iterations - 1:
            print("Cost after iteration {}: {}".format(i, np.squeeze(cost)))
        if i % 100 == 0 or i == num_iterations - 1:
            costs.append(cost)
    return parameters, costs
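The code above is what I import as src.deep_nn. For context, this is roughly how I sanity-tested it on a generated dataset (simplified here; the layer sizes and the make_classification arguments are only an approximation of what I actually ran):

from sklearn.datasets import make_classification
# 500 examples, 20 features, 4 classes, to mirror the toy test described above
X_toy, Y_toy = make_classification(n_samples=500, n_features=20, n_informative=10,
                                   n_classes=4, random_state=0)
# deep_nn_model expects X with shape (features, m) and integer labels 0..3
toy_params, toy_costs = deep_nn_model(X_toy.T, Y_toy, num_iterations=1000,
                                      layer_dims=[20, 16, 8, 4], learning_rate=0.1,
                                      classification_method="multivariable")

On data like this the cost drops and the plotted decision boundaries look right, as mentioned above.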
# Separate code running the model
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import sys
import os
import pickle
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from src.deep_nn import forward_propagation, deep_nn_model
X = np.load("datasets/signdigits/X.npy")
Y = np.load("datasets/signdigits/Y.npy")
# Shuffle the data and hold out 15% of it as a test set
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15, random_state=42)
m_train = X_train.shape[0]
X_train_flat = X_train.reshape(m_train, X_train.shape[1] * X_train.shape[2]).T
Y_train = Y_train[np.newaxis, :]
m_test = X_test.shape[0]
X_test_flat = X_test.reshape(m_test, X_test.shape[1] * X_test.shape[2]).T
Y_test = Y_test[np.newaxis, :]
print(X_train_flat.shape)
print(Y_train.shape)
print(X_test_flat.shape)
print(Y_test.shape)
layer_dims = [4096, 1024, 512, 256, 128, 10]
parameters, _ = deep_nn_model(X_train_flat, Y_train, num_iterations=1000, layer_dims=layer_dims, learning_rate=1, classification_method="multivariable")
def predict(X, parameters):
    AL, _ = forward_propagation(X, parameters)
    predictions = np.argmax(AL, axis=0)
    return predictions
def calculate_accuracy(predictions, labels):
    return np.mean(predictions == labels)
with open('model_parameters.pkl', 'wb') as f:
    pickle.dump(parameters, f)
train_predictions = predict(X_train_flat, parameters)
train_accuracy = calculate_accuracy(train_predictions, Y_train)
print(f"Training Accuracy: {train_accuracy * 100:.2f}%")
test_predictions = predict(X_test_flat, parameters)
test_accuracy = calculate_accuracy(test_predictions, Y_test)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")