I am working on a multi-class classification problem using TensorFlow and am having trouble achieving satisfactory accuracy. I have 7 classes. Each class is a folder containing 2000 .csv files (each file has two columns). When I train the model with a binary classification approach, testing one class against another, the accuracy and val_accuracy are high (0.85 to 0.95), but when I test with all 7 classes (multi-class), the accuracy reaches a maximum of 0.47. Below is the code containing the data preprocessing and the multi-class model.
import pandas as pd
import numpy as np
import os
import tensorflow as tf
from tensorflow.keras.callbacks import ModelCheckpoint
from google.colab import drive
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from keras.utils import to_categorical, normalize
# One folder of CSV files per class; the position of a folder in the
# resulting list is what the loading loop uses as its integer class label.
_DATASET_ROOT = '/content/drive/MyDrive/medical_chem'
_CLASS_FOLDERS = ('Aa', 'Ab', 'Ac', 'Ba', 'Bb', 'Cc', 'DD')
folder_paths = [f'{_DATASET_ROOT}/{class_folder}' for class_folder in _CLASS_FOLDERS]
data = []
labels = []
# Load every CSV file of every class folder into a DataFrame and record the
# index of the folder it came from as its integer class label.
for class_index, folder_path in enumerate(folder_paths):
    # os.listdir returns entries in arbitrary order; sorting keeps the sample
    # order (and therefore the seeded train/test split) reproducible.
    for file in sorted(os.listdir(folder_path)):
        # Skip anything that is not a CSV file (e.g. notebook checkpoint
        # directories), which pd.read_csv could not parse.
        if not file.lower().endswith('.csv'):
            continue
        file_path = os.path.join(folder_path, file)
        # NOTE(review): read_csv treats the first row as a header by default;
        # confirm the files really have a header row.
        df = pd.read_csv(file_path)
        data.append(df)
        labels.append(class_index)
X = data
y = labels
# Row count of the shortest DataFrame in the dataset
min_length = min(map(len, X))
# Crop every DataFrame to that many leading rows so all samples share a shape
truncated_dfs = [frame.iloc[:min_length] for frame in X]
# Stack the equally sized tables into a single numpy array
X = np.array([frame.to_numpy() for frame in truncated_dfs])
# Split data: hold out 20% of the samples for testing. Stratifying on the
# labels keeps the share of each class the same in both subsets, so no class
# ends up under-represented in either split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
# normalize the data: keras `normalize` rescales to unit L2 norm along the
# given axis (here axis=1 of each subset).
# NOTE(review): this discards the absolute scale along that axis; confirm the
# classes are not distinguished by amplitude.
X_train = normalize(X_train, axis=1)
X_test = normalize(X_test, axis=1)
# One-hot encode the integer labels; one column per class folder.
num_classes = len(folder_paths)
y_train = to_categorical(y_train, num_classes=num_classes)
y_test = to_categorical(y_test, num_classes=num_classes)
# Printed explicitly so the shapes are shown outside a notebook cell as well.
print((X_train.shape, y_train.shape, X_test.shape, y_test.shape))
# output ((8943, 2906, 2), (8943, 7), (2236, 2906, 2), (2236, 7))
# Fully connected classifier over the flattened samples.
model = tf.keras.Sequential([
    # Dense layers only act on the last axis of their input. Without this
    # Flatten, a batch of 3-D samples would yield one 7-way prediction per
    # row instead of one per sample, which does not match the one-hot labels.
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(32, activation='relu'),
    tf.keras.layers.Dense(7, activation='softmax'),  # Output layer for 7 classes
])
# Location the checkpoint callback writes the model weights to
checkpoint_path = "training_checkpoint/cp.ckpt"
checkpoint_dir = os.path.split(checkpoint_path)[0]
# Save weights only (not the full model), and only when the monitored
# validation loss is the best seen so far; verbose=1 logs each save.
checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    monitor='val_loss',
    save_best_only=True,
    save_weights_only=True,
    verbose=1,
)
# Configure training: Adam optimizer, categorical cross-entropy against the
# one-hot labels, and accuracy reported as the metric.
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# Uncomment to resume from previously saved weights:
#model.load_weights(checkpoint_path)
# Train for 100 epochs; the held-out split is evaluated after every epoch and
# drives the checkpoint callback.
validation_pair = (X_test, y_test)
history = model.fit(
    X_train,
    y_train,
    validation_data=validation_pair,
    callbacks=[checkpoint_callback],
    epochs=100,
)
I have tried adjusting the architecture of the neural network, experimenting with different activation functions, and optimizing hyperparameters such as learning rate and batch size. However, I am still not achieving the desired accuracy.
I’m sure the problem is either in the data pre-processing or in the model, since binary training gives good results.
For comparison, binary training reaches an accuracy of 0.85 to 0.95.
**Expected accuracy in multiclass: above 0.90**
The dataset: https://drive.google.com/drive/folders/1UAt50dPH7ABeoLu16nfa19g4oVccPeFO?usp=sharing
Antonio Neto is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.