I am working on the classic California Housing dataset, trying to predict the median house value. The dataset contains NaN values in the total_bedrooms column. I used SimpleImputer to replace them with the median, but when I go on to train the model I still get NaN values. The imputer strategy is median, so the usual fixes suggested for the constant strategy don’t apply. I have checked the dataframe and there are no NaN values left after my preprocessing. But when I preprocess and fit the model together via a pipeline, something goes wrong and I can’t tell what. I have copied the relevant code below and linked the Jupyter notebook with the rest of the code for reference. Any help will be greatly appreciated. Thanks!
https://colab.research.google.com/drive/1gpaI2xJE2tY0gxEAD1oFGUGsBgRIal5q?usp=sharing
<code>from sklearn.cluster import KMeans
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.metrics.pairwise import rbf_kernel
class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """Map each sample to its RBF similarity to fitted k-means cluster centers.

    ``fit`` runs ``KMeans`` on the input; ``transform`` returns the
    ``rbf_kernel`` between the input rows and the learned cluster centers.
    """

    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        # gamma has to be stored: transform() reads self.gamma.
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Fit k-means on X (optionally weighted) and return this estimator."""
        self.kmeans_ = KMeans(self.n_clusters, random_state=self.random_state, n_init=10)
        self.kmeans_.fit(X, sample_weight=sample_weight)
        # Return the estimator, not the transformed data, so that
        # fit(...).transform(...) chaining works.
        return self

    def transform(self, X):
        """Return the RBF similarity of each row of X to every cluster center."""
        return rbf_kernel(X, self.kmeans_.cluster_centers_, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        """Return one output feature name per cluster."""
        return [f"Cluster {i} similarity" for i in range(self.n_clusters)]
<code>from sklearn.cluster import KMeans
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.metrics.pairwise import rbf_kernel
class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """Transformer producing RBF similarities to k-means cluster centers.

    Fitting clusters the input with ``KMeans``; transforming evaluates
    ``rbf_kernel`` between the input rows and the fitted centers.
    """

    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Cluster X with k-means (optionally weighted); return self."""
        self.kmeans_ = KMeans(
            n_clusters=self.n_clusters,
            random_state=self.random_state,
            n_init=10,
        )
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self

    def transform(self, X):
        """Compute the RBF kernel between X and the fitted cluster centers."""
        centers = self.kmeans_.cluster_centers_
        return rbf_kernel(X, centers, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        """List the output feature names, one per cluster."""
        labels = []
        for index in range(self.n_clusters):
            labels.append(f"Cluster {index} similarity")
        return labels
</code>
from sklearn.cluster import KMeans
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.metrics.pairwise import rbf_kernel
class ClusterSimilarity(BaseEstimator, TransformerMixin):
    """Transformer producing RBF similarities to k-means cluster centers.

    Fitting clusters the input with ``KMeans``; transforming evaluates
    ``rbf_kernel`` between the input rows and the fitted centers.
    """

    def __init__(self, n_clusters=10, gamma=1.0, random_state=None):
        self.n_clusters = n_clusters
        self.gamma = gamma
        self.random_state = random_state

    def fit(self, X, y=None, sample_weight=None):
        """Cluster X with k-means (optionally weighted); return self."""
        self.kmeans_ = KMeans(
            n_clusters=self.n_clusters,
            random_state=self.random_state,
            n_init=10,
        )
        self.kmeans_.fit(X, sample_weight=sample_weight)
        return self

    def transform(self, X):
        """Compute the RBF kernel between X and the fitted cluster centers."""
        centers = self.kmeans_.cluster_centers_
        return rbf_kernel(X, centers, gamma=self.gamma)

    def get_feature_names_out(self, names=None):
        """List the output feature names, one per cluster."""
        labels = []
        for index in range(self.n_clusters):
            labels.append(f"Cluster {index} similarity")
        return labels
def column_ratio(X):
    """Return column 0 of X divided element-wise by column 1.

    List indexing (``[0]`` / ``[1]``) keeps both operands two-dimensional,
    so the result is a single-column 2-D array.
    """
    return X[:, [0]] / X[:, [1]]


def ratio_name(function_transformer, feature_names_in):
    """Return the output feature name for a ratio transformer."""
    return ['ratio']


def ratio_pipeline():
    """Build a new impute -> column ratio -> standardize pipeline."""
    return make_pipeline(
        SimpleImputer(strategy='median', missing_values=pd.NA),
        FunctionTransformer(column_ratio, feature_names_out=ratio_name),
        StandardScaler(),
    )
# Impute, take the natural log, then standardize.
log_pipeline = make_pipeline(
    SimpleImputer(strategy='median', missing_values=pd.NA),
    FunctionTransformer(np.log, feature_names_out='one-to-one'),
    StandardScaler(),
)
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=69)
# Used as the ColumnTransformer remainder below.
default_num_pipeline = make_pipeline(
    SimpleImputer(strategy='median', missing_values=pd.NA),
    StandardScaler(),
)
# cat_pipeline is expected to be defined elsewhere in the notebook.
preprocessing = ColumnTransformer(
    [
        ("bedrooms_per_room", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
        ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
        ("people_per_house", ratio_pipeline(), ["population", "households"]),
        (
            "log",
            log_pipeline,
            ["total_bedrooms", "total_rooms", "population", "households", "median_income"],
        ),
        ("coordinates_adjustments", cluster_simil, ["latitude", "longitude"]),
        ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
    ],
    remainder=default_num_pipeline,
)
def column_ratio(X):
    """Divide the first column of X by the second, element-wise.

    Both columns are selected with list indices, so the quotient stays a
    single-column 2-D array.
    """
    numerator = X[:, [0]]
    denominator = X[:, [1]]
    return numerator / denominator
def ratio_name(function_transformer, feature_names_in):
    """Return the fixed output feature name list; both arguments are unused."""
    output_names = ['ratio']
    return output_names
def ratio_pipeline():
    """Create a new pipeline: median imputation, column ratio, standardization."""
    imputer = SimpleImputer(strategy='median', missing_values=pd.NA)
    ratio_step = FunctionTransformer(column_ratio, feature_names_out=ratio_name)
    scaler = StandardScaler()
    return make_pipeline(imputer, ratio_step, scaler)
# Impute, take the natural log, then standardize.
log_pipeline = make_pipeline(
    SimpleImputer(strategy='median', missing_values=pd.NA),
    FunctionTransformer(np.log, feature_names_out='one-to-one'),
    StandardScaler(),
)
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=69)
# Used as the ColumnTransformer remainder below.
default_num_pipeline = make_pipeline(
    SimpleImputer(strategy='median', missing_values=pd.NA),
    StandardScaler(),
)
# cat_pipeline is expected to be defined elsewhere in the notebook.
preprocessing = ColumnTransformer(
    [
        ("bedrooms_per_room", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
        ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
        ("people_per_house", ratio_pipeline(), ["population", "households"]),
        (
            "log",
            log_pipeline,
            ["total_bedrooms", "total_rooms", "population", "households", "median_income"],
        ),
        ("coordinates_adjustments", cluster_simil, ["latitude", "longitude"]),
        ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
    ],
    remainder=default_num_pipeline,
)
</code>
def column_ratio(X):
    """Divide the first column of X by the second, element-wise.

    Both columns are selected with list indices, so the quotient stays a
    single-column 2-D array.
    """
    numerator = X[:, [0]]
    denominator = X[:, [1]]
    return numerator / denominator
def ratio_name(function_transformer, feature_names_in):
    """Return the fixed output feature name list; both arguments are unused."""
    output_names = ['ratio']
    return output_names
def ratio_pipeline():
    """Create a new pipeline: median imputation, column ratio, standardization."""
    imputer = SimpleImputer(strategy='median', missing_values=pd.NA)
    ratio_step = FunctionTransformer(column_ratio, feature_names_out=ratio_name)
    scaler = StandardScaler()
    return make_pipeline(imputer, ratio_step, scaler)
# Impute, take the natural log, then standardize.
log_pipeline = make_pipeline(
    SimpleImputer(strategy='median', missing_values=pd.NA),
    FunctionTransformer(np.log, feature_names_out='one-to-one'),
    StandardScaler(),
)
cluster_simil = ClusterSimilarity(n_clusters=10, gamma=1., random_state=69)
# Used as the ColumnTransformer remainder below.
default_num_pipeline = make_pipeline(
    SimpleImputer(strategy='median', missing_values=pd.NA),
    StandardScaler(),
)
# cat_pipeline is expected to be defined elsewhere in the notebook.
preprocessing = ColumnTransformer(
    [
        ("bedrooms_per_room", ratio_pipeline(), ["total_bedrooms", "total_rooms"]),
        ("rooms_per_house", ratio_pipeline(), ["total_rooms", "households"]),
        ("people_per_house", ratio_pipeline(), ["population", "households"]),
        (
            "log",
            log_pipeline,
            ["total_bedrooms", "total_rooms", "population", "households", "median_income"],
        ),
        ("coordinates_adjustments", cluster_simil, ["latitude", "longitude"]),
        ("cat", cat_pipeline, make_column_selector(dtype_include=object)),
    ],
    remainder=default_num_pipeline,
)
<code>from sklearn.linear_model import LinearRegression
# Chain the preprocessing step and the regressor into one estimator.
lin_reg = make_pipeline(preprocessing, LinearRegression())
# The final step is a regressor with no transform(), so the pipeline is
# trained with fit() rather than fit_transform().
lin_reg.fit(housing, housing_labels)
<code>from sklearn.linear_model import LinearRegression
# Chain the preprocessing step and the regressor into one estimator.
lin_reg = make_pipeline(preprocessing, LinearRegression())
# The final step is a regressor with no transform(), so the pipeline is
# trained with fit() rather than fit_transform().
lin_reg.fit(housing, housing_labels)
</code>
from sklearn.linear_model import LinearRegression
# Chain the preprocessing step and the regressor into one estimator.
lin_reg = make_pipeline(preprocessing, LinearRegression())
# The final step is a regressor with no transform(), so the pipeline is
# trained with fit() rather than fit_transform().
lin_reg.fit(housing, housing_labels)
I tried using SimpleImputer in my pipeline to remove the NaN values, but even after that I am still getting a NaN error. I manually ran my preprocessing pipeline and checked its output for NaN values; the result was empty, implying there are no NaN values. Yet when I use the same pipeline with a Linear Regression model, I get the NaN error. I also tried setting missing_values to both pd.NA and np.none, besides None, and none of them made a difference.