I am trying to perform some preprocessing on my data for a sales prediction. i used make_column_selector to select specific columns in order to apply different encoders to different column. i was trying to create a make selector column object to access column in the feature variable X. it works well for the numerical columns but the category column are the ones giving issues. averytime i use the selector object to select column in the data for the category, i get “TypeError: unhashable type: ‘list'”
# @let start with data preprocessing
# set the seed
seed = 200
# set the feature and target variable
X = bigmart_copy.drop('Item_Outlet_Sales', axis = 1)
y = bigmart_copy.Item_Outlet_Sales
# split data into train and test set
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.3, shuffle = True, random_state=seed)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=seed)
# create a list of numerical column
num_selector = make_column_selector(dtype_exclude='object')
# create two set of of category list, one of ordinal type and
# the other of nominal type
cat_selector_ord = ['Item_Fat_Content', 'Outlet_Size', 'Item_MRP_Category', 'Outlet_location_type']
cat_selector_nom = [x for x in bigmart_copy.columns if x not in cat_selector_ord]
cat_selector_nom_nom = make_column_selector(pattern=cat_selector_nom)
cat_selector_ord_ord = make_column_selector(pattern=cat_selector_ord)
# sekect this column from the data
num_cols = num_selector(X)
cat_cols_ord = cat_selector_ord_ord(X)
cat_cols_nom = cat_selector_nom_nom(X)
# initiate the preprocessor for each selctor
num_preprocessor = RobustScaler()
cat_selector_nom_preprocessor = OneHotEncoder()
cat_selector_ord_preprocessor = OrdinalEncoder()
# set the preprocessor
preprocesor = ColumnTransformer([
('RobustScaler', num_preprocessor, num_cols),
('OneHotEncoder', cat_selector_nom_preprocessor, cat_cols_nom),
('OrdinalEncoder', cat_selector_ord_preprocessor, cat_cols_ord)
])
# create a machine model pipeline
pipelines = {
'Linear Regression':make_pipeline(preprocesor, LinearRegression),
'Random Forest Regressor': make_pipeline(preprocesor, RandomForestRegressor),
'Gradient Boost Regression':make_pipeline(preprocesor, GradientBoostingRegressor),
'Extra Tree Regressor':make_pipeline(preprocesor, ExtraTreesRegressor)
}
TypeError Traceback (most recent call last)
Cell In[29], line 27
25 # sekect this column from the data
26 num_cols = num_selector(X)
---> 27 cat_cols_ord = cat_selector_ord_ord(X)
28 cat_cols_nom = cat_selector_nom_nom(X)
30 # initiate the preprocessor for each selctor
File ~anaconda3libsite-packagessklearncompose_column_transformer.py:1121, in make_column_selector.__call__(self, df)
1119 cols = df_row.columns
1120 if self.pattern is not None:
-> 1121 cols = cols[cols.str.contains(self.pattern, regex=True)]
1122 return cols.tolist()
File ~anaconda3libsite-packagespandascorestringsaccessor.py:129, in forbid_nonstring_types.<locals>._forbid_nonstring_types.<locals>.wrapper(self, *args, **kwargs)
124 msg = (
125 f"Cannot use .str.{func_name} with values of "
126 f"inferred dtype '{self._inferred_dtype}'."
127 )
128 raise TypeError(msg)
--> 129 return func(self, *args, **kwargs)
File ~anaconda3libsite-packagespandascorestringsaccessor.py:1252, in StringMethods.contains(self, pat, case, flags, na, regex)
1127 @forbid_nonstring_types(["bytes"])
1128 def contains(self, pat, case=True, flags=0, na=None, regex=True):
1129 r"""
1130 Test if pattern or regex is contained within a string of a Series or Index.
1131
(...)
1250 dtype: bool
1251 """
-> 1252 if regex and re.compile(pat).groups:
1253 warnings.warn(
1254 "This pattern is interpreted as a regular expression, and has "
1255 "match groups. To actually get the groups, use str.extract.",
1256 UserWarning,
1257 stacklevel=find_stack_level(),
1258 )
1260 result = self._data.array._str_contains(pat, case, flags, na, regex)
File ~anaconda3libre.py:251, in compile(pattern, flags)
249 def compile(pattern, flags=0):
250 "Compile a regular expression pattern, returning a Pattern object."
--> 251 return _compile(pattern, flags)
File ~anaconda3libre.py:293, in _compile(pattern, flags)
291 flags = flags.value
292 try:
--> 293 return _cache[type(pattern), pattern, flags]
294 except KeyError:
295 pass
TypeError: unhashable type: 'list'
I was expecting no error to be present and expect that the code work as expected