I’m training a RNN on the IDS-2018 dataset.
I wrote a pipeline for data preprocessing and applied it to the training set, during which I used one hot encoding using fit_transform. However, when I apply the same preprocessing to the validation set (using transform instead of fit_transform), I get an error saying there are new categories not found during the fitting phase (but these categories were already found).
How can I fix it?
Here’s the code:
# Read the data from the specified CSV files
file_1 = pd.read_csv("Friday-02-03-2018_TrafficForML_CICFlowMeter.csv")
file_2 = pd.read_csv("Friday-16-02-2018_TrafficForML_CICFlowMeter.csv")
# Concatenate the two files vertically
df_data = pd.concat([file_1, file_2], axis=0, ignore_index=True)
# Create a test and a val set
from sklearn.model_selection import train_test_split
train_set, test_set = train_test_split(df_data, test_size=0.2, random_state=42)
train_set, val_set = train_test_split(train_set, test_size=0.25, random_state=42)
# Analyze the target column
df["Label"].unique()
# Drop instances of 'Label' in the 'Label' feature
df.drop(df.loc[df["Label"] == "Label"].index, inplace=True)
# To simplify the model and transform it into a binary classification, convert the Label feature (target column) to be 0 if the attack is benign and 1 if it is malicious (whatever it is)
# Replace values in the 'Label' column with numeric values
df['Label'] = df['Label'].replace({'Benign': 0, 'DoS attacks-Hulk': 1, 'Bot': 1, 'DoS attacks-SlowHTTPTest': 1})
# Verify
df["Label"].value_counts()
# Check for NaN values in the dataframe
nan_values = df.isna().any()
# Total number of NaN values in the dataframe
total_nan = df.isna().sum().sum()
# I want the output not to be truncated
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
# Print it to see which features are missing (True if missing)
print("Total number of NaN values in the DataFrame:", total_nan)
print(nan_values)
# Check which features contain np.inf values
features_with_inf = df.columns[df.isin([np.inf]).any()]
# Print the features that contain np.inf values
print("Features with np.inf values:")
print(features_with_inf)
# Based on these observations, I decide to drop some features:
to_drop = ["Flow Byts/s", # contains null and inf values
"Flow Pkts/s", # contains inf values
"Dst Port", # of little relevance (--> would be categorical but too many categories)
"Timestamp"] # of little relevance
df.drop(columns=to_drop, inplace=True)
# Features that must be numerical
num_attributes = ['Fwd Pkt Len Max', 'Fwd Pkt Len Min', 'Fwd Pkt Len Mean', 'Fwd Pkt Len Std', 'Bwd Pkt Len Max',
'Bwd Pkt Len Min', 'Bwd Pkt Len Mean', 'Bwd Pkt Len Std', 'Flow IAT Mean', 'Flow IAT Std', 'Flow IAT Max',
'Flow IAT Min', 'Fwd IAT Tot', 'Fwd IAT Mean', 'Fwd IAT Std', 'Fwd IAT Max', 'Fwd IAT Min', 'Bwd IAT Tot',
'Bwd IAT Mean', 'Bwd IAT Std', 'Bwd IAT Max', 'Bwd IAT Min', 'Pkt Len Min', 'Pkt Len Max', 'Pkt Len Mean',
'Pkt Len Std', 'Pkt Len Var', 'Pkt Size Avg', 'Fwd Seg Size Avg', 'Bwd Seg Size Avg', 'Fwd Byts/b Avg',
'Fwd Pkts/b Avg', 'Fwd Blk Rate Avg', 'Bwd Byts/b Avg', 'Bwd Pkts/b Avg', 'Bwd Blk Rate Avg', 'Fwd Seg Size Min',
'Active Mean', 'Idle Mean', 'Active Std', 'Idle Min', 'Idle Std', 'Idle Max', 'Active Min', 'Active Max',
'Bwd Pkts/s', 'Fwd Pkts/s', 'Flow Duration', 'TotLen Fwd Pkts', 'TotLen Bwd Pkts', 'Fwd Act Data Pkts', 'Fwd Header Len',
'Bwd Header Len', 'Init Fwd Win Byts','Init Bwd Win Byts', 'Subflow Fwd Byts','Subflow Bwd Byts', 'Subflow Fwd Pkts',
'Subflow Bwd Pkts', 'Down/Up Ratio', 'Tot Bwd Pkts', 'Tot Fwd Pkts']
# Features that must be categorical
cat_attributes = ['PSH Flag Cnt', 'SYN Flag Cnt', 'CWE Flag Count', 'Bwd PSH Flags', 'Fwd URG Flags', 'ACK Flag Cnt',
'RST Flag Cnt', 'ECE Flag Cnt', 'Bwd URG Flags', 'Fwd PSH Flags', 'FIN Flag Cnt', 'URG Flag Cnt']
# Set the type of numerical features to float
df[num_attributes] = df[num_attributes].astype(float)
# Apply one-hot-encoder to categorical features
for feature in cat_attributes:
df[feature] = df[feature].astype(str) #first transform it into a string (otherwise it gives an error)
feature_cat = df[[feature]]
feature_cat_1hot = cat_encoder.fit_transform(feature_cat)
# Display the categories
print("Categories for the feature:", feature)
print(cat_encoder.categories_)
# Select only the numerical columns in the copy of the df, so they can be better visualized on the graph
df_num = train_set
df_num = train_set.loc[:, num_attributes]
# Now apply standardization to all numerical features:
min_max_scaler = MinMaxScaler()
df_num_min_max_scaled = min_max_scaler.fit_transform(df_num)
# Define a function that goes through the preprocessing phase:
def preprocess_data(df):
# Remove instances of 'Label' in the 'Label' feature
df.drop(df.loc[df["Label"] == "Label
# Converto la feature Label in 0 per benigno e 1 per maligno
df['Label'] = df['Label'].replace({'Benign': 0, 'DoS attacks-Hulk': 1, 'Bot': 1, 'DoS attacks-SlowHTTPTest': 1})
to_drop = ["Flow Byts/s", "Flow Pkts/s", "Dst Port", "Timestamp"]
df.drop(columns=to_drop, inplace=True)
# Converto le colonne numeriche in float
df[num_attributes] = df[num_attributes].astype(float)
# Applico one-hot encoding alle feature categoriche
for feature in cat_attributes:
df[feature] = df[feature].astype(str)
# Verifica se ci sono nuove categorie nel set di validazione
new_categories = set(df[feature]) - set(cat_encoder.categories_[0])
if new_categories:
raise ValueError(f"New categories found in the feature '{feature}': {new_categories}")
feature_cat = df[[feature]]
feature_cat_1hot = cat_encoder.transform(feature_cat)
# Applico la standardizzazione alle feature numeriche
df[num_attributes] = min_max_scaler.transform(df[num_attributes])
return df
train_set_processed = df
val_set_processed = preprocess_data(val_set)
The error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-247-20bf03b71629> in <cell line: 3>()
1 train_set_processed = df
----> 2 val_set_processed = preprocess_data(val_set.copy())
3 test_set_processed = preprocess_data(test_set.copy())
<ipython-input-246-cbd178d3c19b> in preprocess_data(df)
26 new_categories = set(df[feature]) - set(cat_encoder.categories_[0])
27 if new_categories:
---> 28 raise ValueError(f"New categories found in the feature '{feature}': {new_categories}")
29
30 feature_cat = df[[feature]]
ValueError: New categories found in the feature': {'17', '6'}
I verified that these categories were already present in the training set printing cat_encoder.categories_ after using the encoder
Agnese Castellani is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.