I am trying to pass k=[5, 20, 50] into SelectKBest using a pipeline and GridSearchCV. The parameters are not getting passed and only the default value of 10 is being used. Here is a reproducible set of code – be kind, this is my first post.
# Import Libraries
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression, Lasso, RidgeCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.model_selection import KFold, GridSearchCV, cross_val_score, cross_validate
from sklearn.feature_selection import SelectKBest, SelectFromModel, mutual_info_regression
from sklearn import set_config
from pprint import pprint
df = sns.load_dataset('diamonds')
# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns='price'), df['price'], test_size=0.3, random_state=42)
# Organize columns by data type
cat_cols = list(X_train.select_dtypes(include='category').columns)
num_cols = list(X_train.select_dtypes(include=['float64', 'int64']).columns)
num_cols.remove('depth')
passthru_cols = ['depth']
set_config(transform_output="default")
num_pipeline = Pipeline(steps=[
('imputer', SimpleImputer(strategy='mean')),
('scaler', StandardScaler())
])
cat_pipeline = Pipeline(steps=[
('imputer', SimpleImputer(strategy='most_frequent')),
('onehot', OneHotEncoder(handle_unknown='ignore'))
])
passthru_pipeline = Pipeline(steps=[
('imputer', SimpleImputer(strategy='constant', fill_value=-1))
])
col_transformer = ColumnTransformer(transformers=[
('num', num_pipeline, num_cols),
('cat', cat_pipeline, cat_cols),
('passthru', passthru_pipeline, passthru_cols)
],
remainder='drop',
n_jobs=-1
)
cv = KFold(n_splits=5, shuffle=True, random_state=42)
alpha_space = 10.0 ** np.arange(-5, 4)
pipe_final = Pipeline(steps=[
('preprocessor', col_transformer),
('feature_selection', SelectKBest(mutual_info_regression)),
('regression', LinearRegression())
])
scoring = 'r2'
#scoring = 'neg_mean_squared_error'
search_space = [{'feature_selection__k': [5, 20, 25]},
{'regression':[GradientBoostingRegressor(random_state=42)],
'regression__learning_rate': [0.1, 0.4, 0.5, 0.75]}
]
grid_search = GridSearchCV(pipe_final, search_space, cv=cv,
n_jobs=-1, scoring=scoring, return_train_score=False, verbose=1)
fit=grid_search.fit(X_train, y_train)
np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})
print('Best parameters found:', fit.best_params_)
print('Best score found:', fit.best_score_)
print('Train Score:', fit.score(X_train, y_train))
print('Test Score:', fit.score(X_test, y_test))
features = fit.best_estimator_.named_steps['preprocessor'].get_feature_names_out()
print('Features In:', features)
print(fit.best_estimator_.named_steps['feature_selection'].get_support())
select_features = features[fit.best_estimator_.named_steps['feature_selection'].get_support()]
print('Selected Features:', select_features)
print('Number of Selected Features:', len(select_features))
Here is the output – the key bit is that there are 10 features selected. That is the default value – the parameter grid was for 5, 20, 25.
Best parameters found: {‘regression’:
GradientBoostingRegressor(random_state=42), 'regression__learning_rate': 0.4}
Best score found: 0.9300842648407406
Train Score: 0.9404150505958049
Test Score: 0.9306286177474039
Features In: ['num__carat' 'num__table' 'num__x' 'num__y' 'num__z' 'cat__cut_Fair'
'cat__cut_Good' 'cat__cut_Ideal' 'cat__cut_Premium' 'cat__cut_Very Good'
'cat__color_D' 'cat__color_E' 'cat__color_F' 'cat__color_G'
'cat__color_H' 'cat__color_I' 'cat__color_J' 'cat__clarity_I1'
'cat__clarity_IF' 'cat__clarity_SI1' 'cat__clarity_SI2'
'cat__clarity_VS1' 'cat__clarity_VS2' 'cat__clarity_VVS1'
'cat__clarity_VVS2' 'passthru__depth']
[ True False True True True False False False False False True True
False True False False False False False True True False True False
False False]
Selected Features: ['num__carat' 'num__x' 'num__y' 'num__z' 'cat__color_D' 'cat__color_E'
'cat__color_G' 'cat__clarity_SI1' 'cat__clarity_SI2' 'cat__clarity_VS2']
Number of Selected Features: 10
I was expecting the number of features to be either 5, 20, 25 and not 10.
Mike is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.