Thiết kế website giá rẻ

Question

I have a deep neural network, and I am trying to use it to classify signed numbers in 64×64 pcitures of numbers from 0-9. The cost seems to always converge around 2.3, which is 10% accuracy or just random.

I tested the nn model on a generated dataset with 20 features and 4 classes, and it seems to work fine in plotting the decision boundaries and having low cost, however when I try it on my dataset, it always converges to 2.3 cost. I have adjusted the learning rate, changed the layer dimensions, and rechecked my nn code. I can’t seem to find where it’s going wrong. I made sure the X and Y npy’s I am pulling from are correct, as well as the right shape.

import numpy as np

def initialize_layer_parameters(layer_dims):
    parameters = {}
    L = len(layer_dims) # number of layers in the network

    # Initialize parameters for each layer
    for l in range(1, L):
        parameters['W' + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1]) * 0.1
        parameters['b' + str(l)] = np.zeros((layer_dims[l], 1))
        
        assert(parameters['W' + str(l)].shape == (layer_dims[l], layer_dims[l - 1]))
        assert(parameters['b' + str(l)].shape == (layer_dims[l], 1))

    return parameters

def sigmoid(Z):
    A = 1 / (1 + np.exp(-Z))
    cache = Z
    return A, cache

def relu(Z):
    A = np.maximum(0, Z)
    cache = Z
    return A, cache

def softmax(Z):
    Z_shifted = Z - np.max(Z, axis=0, keepdims=True)  # To prevent overflow
    exp_Z = np.exp(Z_shifted)
    A = exp_Z / np.sum(exp_Z, axis=0, keepdims=True)
    cache = Z
    return A, cache

def relu_backward(dA, cache):
    Z = cache
    dZ = np.array(dA, copy=True)
    dZ[Z <= 0] = 0

    assert (dZ.shape == Z.shape)
    
    return dZ

def sigmoid_backward(dA, cache):
    Z = cache
    s = 1/(1+np.exp(-Z))
    dZ = dA * s * (1-s)
    
    assert (dZ.shape == Z.shape)
    
    return dZ

def softmax_backward(dAL, cache):
    Z = cache
    m = Z.shape[1]
    dZ = dAL / m

    return dZ

def one_hot_encode(Y, num_classes):
    Y = np.array(Y).astype(int)  # Ensure Y is an array of integers
    Y = Y.reshape(-1)           # Flatten Y to be a 1D array if needed
    if np.any(Y >= num_classes) or np.any(Y < 0):
        raise ValueError("Labels must be in the range [0, num_classes-1]")
    
    Y_encoded = np.eye(num_classes)[Y]
    return Y_encoded.T

def forward_linear(A, W, b):
    Z = np.dot(W, A) + b
    cache = (A, W, b)

    return Z, cache

def forward_activation(A_prev, W, b, activation):
    if activation == "sigmoid":
        Z, linear_cache = forward_linear(A_prev, W, b)
        A, activation_cache = sigmoid(Z)
    elif activation == "relu":
        Z, linear_cache = forward_linear(A_prev, W, b)
        A, activation_cache = relu(Z)
    elif activation == "softmax":
        Z, linear_cache = forward_linear(A_prev, W, b)
        A, activation_cache = softmax(Z)
    else:
        raise ValueError("Invalid activation function!")

    cache = (linear_cache, activation_cache)

    return A, cache

def forward_propagation(X, parameters):
    caches = []
    A = X
    L = len(parameters) // 2  # number of layers in the network
    activation = "sigmoid" if classification_type == "binary" else "softmax"

    # Forward propagation for each layer
    for l in range(1, L):
        A_prev = A
        A, cache = forward_activation(A_prev, parameters['W' + str(l)], parameters['b' + str(l)], "relu")
        caches.append(cache)

    #AL is final output layer
    AL, cache = forward_activation(A, parameters['W' + str(L)], parameters['b' + str(L)], activation)
    caches.append(cache)

    return AL, caches

def cost_function(AL, Y):
    if classification_type == "binary":
        cost = -np.mean(np.multiply(Y, np.log(AL)) + np.multiply((1 - Y), np.log(1 - AL)))
    elif classification_type == "multivariable":
        epsilon = 1e-8  # Small constant to prevent log(0)
        AL = np.clip(AL, epsilon, 1 - epsilon)  # Clip AL to avoid log(0) and division by zero
        cost = -np.sum(Y * np.log(AL)) / Y.shape[1]  
    else:
        raise ValueError("Invalid classification type!")
    
    return cost

def backward_linear(dZ, cache):
    A_prev, W, b = cache
    m = A_prev.shape[1]

    dW = 1/m * np.dot(dZ, A_prev.T)
    db = 1/m * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)

    return dA_prev, dW, db

def backward_activation(dA, cache, activation):
    linear_cache, activation_cache = cache

    if activation == "relu":
        dZ = relu_backward(dA, activation_cache)
        dA_prev, dW, db = backward_linear(dZ, linear_cache)
    elif activation == "sigmoid":
        dZ = sigmoid_backward(dA, activation_cache)
        dA_prev, dW, db = backward_linear(dZ, linear_cache)
    elif activation == "softmax":
        dZ = softmax_backward(dA, activation_cache)
        dA_prev, dW, db = backward_linear(dZ, linear_cache)
    else:
        raise ValueError("Invalid activation function!")


    return dW, dA_prev, db

def backward_propagation(AL, Y, caches):
    grads = {}
    L = len(caches)
    activation = "sigmoid" if classification_type == "binary" else "softmax"
    
    if activation == "sigmoid":
        dAL = -(np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    if activation == "softmax":
        dAL = AL - Y

    # Gradient descent of output layer
    current_cache = caches[L - 1]
    dW_temp, dA_prev_temp, db_temp = backward_activation(dAL, current_cache, activation)
    grads["dW" + str(L)] = dW_temp
    grads["dA" + str(L-1)] = dA_prev_temp
    grads["db" + str(L)] = db_temp

    # Store gradient descent for remaining layers
    for l in reversed(range(L-1)):
        dW_temp, dA_prev_temp, db_temp = backward_activation(grads["dA" + str(l+1)], caches[l], "relu")
        grads["dW" + str(l + 1)] = dW_temp
        grads["dA" + str(l)] = dA_prev_temp  
        grads["db" + str(l + 1)] = db_temp

    return grads

def update_parameters(parameters, grads, learning_rate):
    L = len(parameters) // 2 # number of layers in the neural network

    # Update each parameter by layer  
    for l in range(L):
        parameters["W" + str(l+1)] -= learning_rate * grads["dW" + str(l+1)]
        parameters["b" + str(l+1)] -= learning_rate * grads["db" + str(l+1)]
        
    return parameters

def deep_nn_model(X, Y, num_iterations, layer_dims, learning_rate, classification_method):
    costs = []
    parameters = initialize_layer_parameters(layer_dims)
    global classification_type
    classification_type = classification_method

    if classification_type == "multivariable":
        Y = one_hot_encode(Y, layer_dims[-1])

    for i in range(0, num_iterations):
        AL, caches = forward_propagation(X, parameters)
        cost = cost_function(AL, Y)
        grads = backward_propagation(AL, Y, caches)
        parameters = update_parameters(parameters, grads, learning_rate)

        if i % 100 == 0 or i == num_iterations - 1:
            print("Cost after iteration {}: {}".format(i, np.squeeze(cost)))
        if i % 100 == 0 or i == num_iterations:
            costs.append(cost)

    return parameters, costs

# Seperate code running the model

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import sys
import os
import pickle
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
from src.deep_nn import forward_propagation, deep_nn_model

X = np.load("datasets/signdigits/X.npy")
Y = np.load("datasets/signdigits/Y.npy")

# Split dataset into some training examples(15%) and shuffle data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.15, random_state=42)

m_train = X_train.shape[0]
X_train_flat = X_train.reshape(m_train, X_train.shape[1] * X_train.shape[2]).T
Y_train = Y_train[np.newaxis, :]

m_test = X_test.shape[0]
X_test_flat = X_test.reshape(m_test, X_test.shape[1] * X_test.shape[2]).T
Y_test = Y_test[np.newaxis, :]

print(X_train_flat.shape)
print(Y_train.shape)

print(X_test_flat.shape)
print(Y_test.shape)

layer_dims = [4096, 1024, 512, 256, 128, 10]
parameters, _ = deep_nn_model(X_train_flat, Y_train, num_iterations=1000, layer_dims=layer_dims, learning_rate=1, classification_method="multivariable")

def predict(X, parameters):
    AL, _ = forward_propagation(X, parameters)
    predictions = np.argmax(AL, axis=0)  
    return predictions

def calculate_accuracy(predictions, labels):
    return np.mean(predictions == labels)

with open('model_parameters.pkl', 'wb') as f:
    pickle.dump(parameters, f)

train_predictions = predict(X_train_flat, parameters)
train_accuracy = calculate_accuracy(train_predictions, Y_train)
print(f"Training Accuracy: {train_accuracy * 100:.2f}%")

test_predictions = predict(X_test_flat, parameters)
test_accuracy = calculate_accuracy(test_predictions, Y_test)
print(f"Test Accuracy: {test_accuracy * 100:.2f}%")

Thiết kế website giá rẻ

Danh mục

Deep NN converging at same cost