I’m new to data science, so when I build a model I’m never sure whether I’ve gone as far as it can go or how else I could improve it. Below is a random forest classifier for which I used RandomizedSearchCV to tune the hyperparameters, eventually reaching 73% accuracy, which was my best result. I also used class_weight to balance the classes, and I scaled and cleaned the data, as you’ll see. I’m not sure whether this is the best way to do it, whether 73% counts as good, or whether I’ve hit the limit of what a random forest classifier can do here.
<code>import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
df = pd.read_csv('pokemon_data.csv')
duplicates = df.duplicated()
if duplicates.any():
    print(df[duplicates], "DUPLICATE")
else:
    print("NO DUPLICATES")
class_dist = df['type1'].value_counts()
print(class_dist)
# The dataset uses an em dash to mark missing values
df.replace('—', np.nan, inplace=True)
numerical_cols = df.select_dtypes(include=np.number).columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())
# Now we fill missing data for categorical columns
categorical_cols = df.select_dtypes(include='object').columns
df[categorical_cols] = df[categorical_cols].fillna('Unknown')
features = ['species', 'ability1', 'ability2', 'egg_group1', 'egg_group2']
X = df[features]  # variables we use to predict
y_type1 = df['type1']  # what we predict
y_type2 = df['type2']  # second target, not used below
# All selected features are categorical, so the numeric branch is empty
# ('dexnum' is not in the feature list, so there is nothing to scale here)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), []),
        ('cat', OneHotEncoder(handle_unknown='ignore'),
         ['species', 'ability1', 'ability2', 'egg_group1', 'egg_group2'])
    ])
# Train_test_split is used to split the dataset into two subsets, a training and testing set
X_train, X_test, y_type1_train, y_type1_test = train_test_split(X, y_type1, test_size=0.4, random_state=42)
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)
# Note: every entry below holds a single value, so this "search space"
# contains exactly one combination and the random search fits just one model
param_dist = {
    'n_estimators': [9000],
    'max_depth': [1000],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'max_features': ['log2'],
    'bootstrap': [False]
}
rf_type1 = RandomForestClassifier(class_weight='balanced', random_state=42)
random_search = RandomizedSearchCV(estimator=rf_type1, param_distributions=param_dist, n_iter=50, cv=5, verbose=2, random_state=42, n_jobs=-1)
random_search.fit(X_train, y_type1_train)
# Get the best parameters
print("Best parameters found: ", random_search.best_params_)
print("Best accuracy: ", random_search.best_score_)
# Re-train the model with the best parameters
best_rf = random_search.best_estimator_
y_type1_pred_best = best_rf.predict(X_test)
# Evaluate the model
print("Type1 Classification Report with Best Parameters:")
print(classification_report(y_type1_test, y_type1_pred_best))
print("Type1 Accuracy with Best Parameters:", accuracy_score(y_type1_test, y_type1_pred_best))
</code>
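One thing I noticed while writing this up: every hyperparameter in param_dist is pinned to a single value, so the random search only ever fits one configuration. Here is a rough sketch of the wider search space I plan to try next, reusing rf_type1, X_train, and y_type1_train from above; the ranges are guesses on my part, not tuned values.
<code>from scipy.stats import randint

# Illustrative ranges only; useful values depend on the data
param_dist_wide = {
    'n_estimators': randint(200, 1500),
    'max_depth': [None, 10, 20, 40],      # None lets trees grow fully
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['sqrt', 'log2', 0.5],
    'bootstrap': [True, False]
}
random_search_wide = RandomizedSearchCV(
    estimator=rf_type1,
    param_distributions=param_dist_wide,
    n_iter=50,  # now samples 50 genuinely different configurations
    cv=5,
    random_state=42,
    n_jobs=-1
)
random_search_wide.fit(X_train, y_type1_train)
print(random_search_wide.best_params_, random_search_wide.best_score_)
</code>
As I understand it, passing scipy distributions instead of one-element lists is what lets RandomizedSearchCV draw a different value on each of its n_iter samples.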
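Besides class_weight='balanced', I was also considering oversampling with SMOTE. This is only a sketch of how I think it would plug in via imblearn's Pipeline, so that the synthetic samples stay inside the training folds; I'm not sure SMOTE is even sensible on mostly one-hot features, and its default k_neighbors=5 will fail for any type with fewer than six training samples.
<code>from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

# Resampling inside the pipeline keeps synthetic samples out of the test set
smote_rf = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42))  # no class_weight; SMOTE balances instead
])
smote_rf.fit(X_train, y_type1_train)
print(classification_report(y_type1_test, smote_rf.predict(X_test)))
</code>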