I am trying to clean a dataset that contains a lot of missing values from different features that are both numerical and categorical. My idea is the following:
- Use OrdinalEncoder to have only numerical values and keep the missing values as NaN (can’t with OneHotEncoder since it creates new columns which stand as NaN)
- Use KNNImputer to impute the missing values
- Reverse the encoding since there is no reasonable ground to draw some order in the categories
Here is my code so far:
import pandas as pd
import numpy as np
from sklearn import set_config
set_config(transform_output="pandas")
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.base import BaseEstimator, TransformerMixin
class RoundingToIntegerTransformer(BaseEstimator, TransformerMixin):
def fit(self, X, y=None):
return self
def transform(self, X):
return np.round(X)
class ReverseOrdinalEncoder(BaseEstimator, TransformerMixin):
def __init__(self, encoder_name, categorical_columns):
self.encoder_name = encoder_name
self.categorical_columns = categorical_columns
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
X_ = X.copy()
encoder = self.encoder_name
X_categorical = X_[categorical_columns]
X_categorical = encoder.inverse_transform(X_categorical)
X_[categorical_columns] = X_categorical
return X_
data = pd.DataFrame({
'Category1': ['a', 'b', 'a', np.nan, 'c', 'b'],
'Category2': ['x', 'y', 'x', 'z', np.nan, 'y'],
'Numerical1': [1.1, np.nan, 3.3, 4.4, 5.5, np.nan],
'Numerical2': [np.nan, 2.2, np.nan, 4.4, 5.5, 6.6]
})
numerical_columns = data.select_dtypes(include='number').columns.tolist()
categorical_columns = data.select_dtypes(include='object').columns.tolist()
ordinal_encoder = OrdinalEncoder(encoded_missing_value=np.nan)
first_encoder = ColumnTransformer(transformers=[
('ordinal_cat', ordinal_encoder, categorical_columns)
],
remainder='passthrough',
verbose_feature_names_out=False)
imputer = Pipeline([
('KNN_imputing', KNNImputer()),
('rounding', ColumnTransformer(transformers=[('round_cat', RoundingToIntegerTransformer() , categorical_columns)],
remainder='passthrough',
verbose_feature_names_out=False))
])
reverse_encoder = ReverseOrdinalEncoder(
encoder_name=first_encoder.transformers[0][1],
categorical_columns=categorical_columns
)
preprocessor = Pipeline([
('encoding', first_encoder),
('imputing', imputer),
('decoding', reverse_encoder)
])
I have been trying for some time but I never succeeded, I always receive the following error:
NotFittedError: This OrdinalEncoder instance is not fitted yet. Call ‘fit’ with appropriate arguments before using this estimator.
I understand the error but I thought that since the ordinal encoder appears before in the pipeline, it would be fitted when it is then used in the reverse encoder. Is there any way to make that work?
Thank you very much