I’m new to data science, so when I build a model I’m never sure whether I’ve gone as far as it can go or how else I could improve it. Below is a random forest classifier for which I used RandomizedSearchCV to tune the hyperparameters, eventually reaching 73% accuracy, which was my best result. I also used class_weight to balance the classes, and I scaled and cleaned the data, as you’ll see. I’m not sure whether this is the best way to do it, whether 73% counts as good, or whether I’ve hit the limit of what a random forest classifier can do here.
<code>import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
df = pd.read_csv('pokemon_data.csv')
duplicates = df.duplicated()
if duplicates.any():
    print(df[duplicates], "DUPLICATE")
else:
    print("NO DUPLICATES")
class_dist = df['type1'].value_counts()
print(class_dist)
# The dataset uses an em dash to mark missing values
df.replace('—', np.nan, inplace=True)
numerical_cols = df.select_dtypes(include=np.number).columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].median())
# Now we fill missing data for categorical columns
categorical_cols = df.select_dtypes(include='object').columns
df[categorical_cols] = df[categorical_cols].fillna('Unknown')
features = ['species', 'ability1', 'ability2', 'egg_group1', 'egg_group2']
X = df[features]  # variables we use to predict
y_type1 = df['type1']  # what we predict
y_type2 = df['type2']  # second target, not used below
# All selected features are categorical, so the numeric branch is empty
# ('dexnum' is not in the feature list, so there is nothing to scale here)
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), []),
        ('cat', OneHotEncoder(handle_unknown='ignore'),
         ['species', 'ability1', 'ability2', 'egg_group1', 'egg_group2'])
    ])
# Train_test_split is used to split the dataset into two subsets, a training and testing set
X_train, X_test, y_type1_train, y_type1_test = train_test_split(X, y_type1, test_size=0.4, random_state=42)
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)
# Note: every entry below holds a single value, so this "search space"
# contains exactly one combination and the random search fits just one model
param_dist = {
    'n_estimators': [9000],
    'max_depth': [1000],
    'min_samples_split': [2],
    'min_samples_leaf': [1],
    'max_features': ['log2'],
    'bootstrap': [False]
}
rf_type1 = RandomForestClassifier(class_weight='balanced', random_state=42)
random_search = RandomizedSearchCV(estimator=rf_type1, param_distributions=param_dist, n_iter=50, cv=5, verbose=2, random_state=42, n_jobs=-1)
random_search.fit(X_train, y_type1_train)
# Get the best parameters
print("Best parameters found: ", random_search.best_params_)
print("Best accuracy: ", random_search.best_score_)
# Re-train the model with the best parameters
best_rf = random_search.best_estimator_
y_type1_pred_best = best_rf.predict(X_test)
# Evaluate the model
print("Type1 Classification Report with Best Parameters:")
print(classification_report(y_type1_test, y_type1_pred_best))
print("Type1 Accuracy with Best Parameters:", accuracy_score(y_type1_test, y_type1_pred_best))
</code>
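One thing I noticed while writing this up: every hyperparameter in param_dist is pinned to a single value, so the random search only ever fits one configuration. Here is a rough sketch of the wider search space I plan to try next, reusing rf_type1, X_train, and y_type1_train from above; the ranges are guesses on my part, not tuned values.
<code>from scipy.stats import randint

# Illustrative ranges only; useful values depend on the data
param_dist_wide = {
    'n_estimators': randint(200, 1500),
    'max_depth': [None, 10, 20, 40],      # None lets trees grow fully
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 10),
    'max_features': ['sqrt', 'log2', 0.5],
    'bootstrap': [True, False]
}
random_search_wide = RandomizedSearchCV(
    estimator=rf_type1,
    param_distributions=param_dist_wide,
    n_iter=50,  # now samples 50 genuinely different configurations
    cv=5,
    random_state=42,
    n_jobs=-1
)
random_search_wide.fit(X_train, y_type1_train)
print(random_search_wide.best_params_, random_search_wide.best_score_)
</code>
As I understand it, passing scipy distributions instead of one-element lists is what lets RandomizedSearchCV draw a different value on each of its n_iter samples.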
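Besides class_weight='balanced', I was also considering oversampling with SMOTE. This is only a sketch of how I think it would plug in via imblearn's Pipeline, so that the synthetic samples stay inside the training folds; I'm not sure SMOTE is even sensible on mostly one-hot features, and its default k_neighbors=5 will fail for any type with fewer than six training samples.
<code>from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE

# Resampling inside the pipeline keeps synthetic samples out of the test set
smote_rf = ImbPipeline(steps=[
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42))  # no class_weight; SMOTE balances instead
])
smote_rf.fit(X_train, y_type1_train)
print(classification_report(y_type1_test, smote_rf.predict(X_test)))
</code>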