I'm trying a standard Bayesian search optimisation (BayesSearchCV) of XGBClassifier parameters on some datasets that have been manipulated to remove NaNs, drop rows, change the index, etc.
This is an overview of the data I am working with:
x_train shape: (249212, 44)
x_train type: <class 'pandas.core.frame.DataFrame'>
y_train shape: (249212,)
y_train type: <class 'pandas.core.series.Series'> (it has also been a DataFrame at points during testing)
X_train dtypes:
col_1_name    float32
col_2_name    float32
col_3_name    float32
…
col_40_name    float32
col_41_name    float32
col_42_name    float32
col_43_name    float32
col_44_name    float32
dtype: object
y_train dtype:
int8
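For reference, this overview comes from plain inspection code along these lines (a sketch, not copied verbatim from my notebook):

print("x_train shape:", X_train.shape)
print("x_train type:", type(X_train))
print("y_train shape:", y_train.shape)
print("y_train type:", type(y_train))
print("X_train dtypes:")
print(X_train.dtypes)
print("y_train dtype:", y_train.dtype)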
assert X_train.index.equals(y_train.index), "Indices do not match"
assert X_train.shape[0] == y_train.shape[0], "Mismatched number of rows"
Neither assertion fails.
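Given that the failing check in the traceback is m == 1 || n == 1 (as far as I can tell, XGBoost is asserting that an array it receives is one-dimensional), these are some extra sanity checks along the same lines; this is a sketch, not code from my notebook:

import numpy as np

# The label should be a flat 1-D vector
assert np.asarray(y_train).ndim == 1

# Duplicate column names would make X_train[name] return a 2-D
# DataFrame instead of a 1-D Series
assert not X_train.columns.duplicated().any()

# Each column's underlying array should itself be 1-D
for col in X_train.columns:
    assert X_train[col].to_numpy().ndim == 1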
This is the error that occurs. (Note: for ease of reading I have renamed opt_X_train to X_train in this post, and likewise for the y and test variables, so ignore the variable-name differences in the traceback.)
{
"name": "XGBoostError",
"message": "[12:37:08] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-06abd128ca6c1688d-1\xgboost\xgboost-ci-windows\src\data\array_interface.h:218: Check failed: m == 1 || n == 1: ",
"stack": "---------------------------------------------------------------------------
XGBoostError Traceback (most recent call last)
Cell In[35], line 196
193 assert opt_X_train.shape[0] == opt_y_train.shape[0], "Mismatched number of rows!"
195 simple_model = XGBClassifier(objective='binary:logistic', eval_metric=['auc', 'logloss'])
--> 196 simple_model.fit(opt_X_train, opt_y_train)
197 # search = opt.fit(opt_X_train.drop(['y', 'x'], axis = 1), opt_y_train,
198 # eval_set =[(opt_X_train.drop(['y', 'x'], axis = 1), opt_y_train),
199 # (opt_X_test.drop(['y', 'x'], axis = 1), opt_y_test)],
(...)
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\core.py:726, in require_keyword_args.<locals>.throw_if.<locals>.inner_f(*args, **kwargs)
724 for k, arg in zip(sig.parameters, args):
725 kwargs[k] = arg
--> 726 return func(**kwargs)
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\sklearn.py:1512, in XGBClassifier.fit(self, X, y, sample_weight, base_margin, eval_set, verbose, xgb_model, sample_weight_eval_set, base_margin_eval_set, feature_weights)
1509 params["num_class"] = self.n_classes_
1511 model, metric, params = self._configure_fit(xgb_model, params)
-> 1512 train_dmatrix, evals = _wrap_evaluation_matrices(
1513 missing=self.missing,
1514 X=X,
1515 y=y,
1516 group=None,
1517 qid=None,
1518 sample_weight=sample_weight,
1519 base_margin=base_margin,
1520 feature_weights=feature_weights,
1521 eval_set=eval_set,
1522 sample_weight_eval_set=sample_weight_eval_set,
1523 base_margin_eval_set=base_margin_eval_set,
1524 eval_group=None,
1525 eval_qid=None,
1526 create_dmatrix=self._create_dmatrix,
1527 enable_categorical=self.enable_categorical,
1528 feature_types=self.feature_types,
1529 )
1531 self._Booster = train(
1532 params,
1533 train_dmatrix,
(...)
1542 callbacks=self.callbacks,
1543 )
1545 if not callable(self.objective):
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\sklearn.py:596, in _wrap_evaluation_matrices(missing, X, y, group, qid, sample_weight, base_margin, feature_weights, eval_set, sample_weight_eval_set, base_margin_eval_set, eval_group, eval_qid, create_dmatrix, enable_categorical, feature_types)
576 def _wrap_evaluation_matrices(
577 missing: float,
578 X: Any,
(...)
592 feature_types: Optional[FeatureTypes],
593 ) -> Tuple[Any, List[Tuple[Any, str]]]:
594 """Convert array_like evaluation matrices into DMatrix. Perform validation on the
595 way."""
--> 596 train_dmatrix = create_dmatrix(
597 data=X,
598 label=y,
599 group=group,
600 qid=qid,
601 weight=sample_weight,
602 base_margin=base_margin,
603 feature_weights=feature_weights,
604 missing=missing,
605 enable_categorical=enable_categorical,
606 feature_types=feature_types,
607 ref=None,
608 )
610 n_validation = 0 if eval_set is None else len(eval_set)
612 def validate_or_none(meta: Optional[Sequence], name: str) -> Sequence:
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\sklearn.py:1003, in XGBModel._create_dmatrix(self, ref, **kwargs)
1001 if _can_use_qdm(self.tree_method) and self.booster != "gblinear":
1002 try:
-> 1003 return QuantileDMatrix(
1004 **kwargs, ref=ref, nthread=self.n_jobs, max_bin=self.max_bin
1005 )
1006 except TypeError: # `QuantileDMatrix` supports lesser types than DMatrix
1007 pass
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\core.py:726, in require_keyword_args.<locals>.throw_if.<locals>.inner_f(*args, **kwargs)
724 for k, arg in zip(sig.parameters, args):
725 kwargs[k] = arg
--> 726 return func(**kwargs)
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\core.py:1573, in QuantileDMatrix.__init__(self, data, label, weight, base_margin, missing, silent, feature_names, feature_types, nthread, max_bin, ref, group, qid, label_lower_bound, label_upper_bound, feature_weights, enable_categorical, data_split_mode)
1553 if any(
1554 info is not None
1555 for info in (
(...)
1566 )
1567 ):
1568 raise ValueError(
1569 "If data iterator is used as input, data like label should be "
1570 "specified as batch argument."
1571 )
-> 1573 self._init(
1574 data,
1575 ref=ref,
1576 label=label,
1577 weight=weight,
1578 base_margin=base_margin,
1579 group=group,
1580 qid=qid,
1581 label_lower_bound=label_lower_bound,
1582 label_upper_bound=label_upper_bound,
1583 feature_weights=feature_weights,
1584 feature_names=feature_names,
1585 feature_types=feature_types,
1586 enable_categorical=enable_categorical,
1587 )
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\core.py:1632, in QuantileDMatrix._init(self, data, ref, enable_categorical, **meta)
1620 config = make_jcargs(
1621 nthread=self.nthread, missing=self.missing, max_bin=self.max_bin
1622 )
1623 ret = _LIB.XGQuantileDMatrixCreateFromCallback(
1624 None,
1625 it.proxy.handle,
(...)
1630 ctypes.byref(handle),
1631 )
-> 1632 it.reraise()
1633 # delay check_call to throw intermediate exception first
1634 _check_call(ret)
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\core.py:569, in DataIter.reraise(self)
567 exc = self._exception
568 self._exception = None
--> 569 raise exc
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\core.py:550, in DataIter._handle_exception(self, fn, dft_ret)
547 return dft_ret
549 try:
--> 550 return fn()
551 except Exception as e: # pylint: disable=broad-except
552 # Defer the exception in order to return 0 and stop the iteration.
553 # Exception inside a ctype callback function has no effect except
554 # for printing to stderr (doesn't stop the execution).
555 tb = sys.exc_info()[2]
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\core.py:637, in DataIter._next_wrapper.<locals>.<lambda>()
635 self._temporary_data = None
636 # pylint: disable=not-callable
--> 637 return self._handle_exception(lambda: self.next(input_data), 0)
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\data.py:1416, in SingleBatchInternalIter.next(self, input_data)
1414 return 0
1415 self.it += 1
-> 1416 input_data(**self.kwargs)
1417 return 1
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\core.py:726, in require_keyword_args.<locals>.throw_if.<locals>.inner_f(*args, **kwargs)
724 for k, arg in zip(sig.parameters, args):
725 kwargs[k] = arg
--> 726 return func(**kwargs)
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\core.py:625, in DataIter._next_wrapper.<locals>.input_data(data, feature_names, feature_types, **kwargs)
623 # Stage the data, meta info are copied inside C++ MetaInfo.
624 self._temporary_data = (new, cat_codes, feature_names, feature_types)
--> 625 dispatch_proxy_set_data(self.proxy, new, cat_codes)
626 self.proxy.set_info(
627 feature_names=feature_names,
628 feature_types=feature_types,
629 **kwargs,
630 )
631 self._data_ref = ref
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\data.py:1492, in dispatch_proxy_set_data(proxy, data, cat_codes)
1490 # Host
1491 if isinstance(data, PandasTransformed):
-> 1492 proxy._ref_data_from_pandas(data) # pylint: disable=W0212
1493 return
1494 if _is_np_array_like(data):
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\core.py:1466, in _ProxyDMatrix._ref_data_from_pandas(self, data)
1464 def _ref_data_from_pandas(self, data: DataType) -> None:
1465 """Reference data from a pandas DataFrame. The input is a PandasTransformed instance."""
-> 1466 _check_call(
1467 _LIB.XGProxyDMatrixSetDataColumnar(self.handle, data.array_interface())
1468 )
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\core.py:284, in _check_call(ret)
273 """Check the return value of C API call
274
275 This function will raise exception when error occurs.
(...)
281 return value from API calls
282 """
283 if ret != 0:
--> 284 raise XGBoostError(py_str(_LIB.XGBGetLastError()))
**XGBoostError: [12:37:08] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-06abd128ca6c1688d-1\xgboost\xgboost-ci-windows\src\data\array_interface.h:218: Check failed: m == 1 || n == 1: "**
}
These are the code sections I believe are relevant:
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from skopt import BayesSearchCV

# Base estimator for the search
estimator = XGBClassifier(
    objective='binary:logistic',
    scale_pos_weight=0.75,
    eval_metric=['auc', 'logloss'],
    missing=np.nan,
    early_stopping_rounds=200,
    verbosity=2,
)

search_space = {
    'learning_rate': (0.000001, 0.2, 'log-uniform'),
    'min_child_weight': (0, 10),
    'max_depth': (0, 10),
    'max_delta_step': (0, 20),
    'subsample': (0.01, 1.0, 'uniform'),
    'colsample_bytree': (0.01, 1.0, 'uniform'),
    'colsample_bylevel': (0.01, 1.0, 'uniform'),
    'reg_lambda': (1e-9, 1000, 'log-uniform'),
    'reg_alpha': (1e-9, 1.0, 'log-uniform'),
    'gamma': (1e-9, 0.5, 'log-uniform'),
    'n_estimators': (1, 100),
    # 'scale_pos_weight': (1e-6, 500, 'log-uniform'),
}

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

opt = BayesSearchCV(
    estimator=estimator,
    search_spaces=search_space,
    scoring='f1',
    # fit_params=param_grid,
    cv=cv,
    n_jobs=-1,
    verbose=2,
    n_iter=75,
    refit=True,
    random_state=42,
)

…

search = opt.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=2,
)
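For what it's worth, the failure is not specific to BayesSearchCV: as the traceback shows (Cell In[35], line 196), even a plain fit with a bare classifier raises the same error:

simple_model = XGBClassifier(objective='binary:logistic', eval_metric=['auc', 'logloss'])
simple_model.fit(X_train, y_train)  # raises the XGBoostError shown above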
This is the function that does most of the manipulation of the original X_train and y_train:
import pandas as pd

def sample_group(group, indices):
    # Take the first min(len(group), len(indices)) rows (currently unused helper)
    vals = group.iloc[indices[:min(len(group), len(indices))]]
    return vals

def strat_split(X, Y):
    # Combine features and labels so they can be sampled together
    df_all = pd.concat([X, Y], axis=1)

    unique_datetimes_sorted = sorted(df_all.index.unique())
    print("Unique datetime indices:")
    for dt in unique_datetimes_sorted:
        print(dt)

    # Sample up to 5000 positive rows per timestamp
    truths = df_all[df_all['truth'] == 1].groupby('time', group_keys=False).apply(
        lambda x: x.sample(min(len(x), 5000))
    )
    sample_sizes = truths.groupby('time').size()

    # Sample the same number of negative rows per timestamp
    def sample_falses(group):
        n_falses = sample_sizes.get(group.name, 0)
        return group.sample(n=n_falses)

    falses = df_all[df_all['truth'] == 0].groupby('time', group_keys=False).apply(sample_falses)

    all_new = pd.concat([truths, falses], axis=0).sort_values(by='time')
    # all_bands and planet_bands are lists of feature-column names defined elsewhere
    all_new = all_new.dropna(subset=all_bands, how='all')
    all_new = all_new.dropna(subset=planet_bands, how='all')
    all_new = all_new[['y', 'x'] + all_bands + ['truth']]

    X_new = all_new.drop(columns=['truth'])
    Y_new = all_new['truth']
    return X_new, Y_new

X_train, y_train = strat_split(X_train, y_train)
X_test, y_test = strat_split(X_test, y_test)

# Replace the datetime index with explicit Year/Month/Day feature columns
X_train_reindex = X_train.copy()
y_train_reindex = y_train.copy()
X_train_reindex['Year'] = X_train_reindex.index.year.astype('float32')
X_train_reindex['Month'] = X_train_reindex.index.month.astype('float32')
X_train_reindex['Day'] = X_train_reindex.index.day.astype('float32')

X_test_reindex = X_test.copy()
y_test_reindex = y_test.copy()
X_test_reindex['Year'] = X_test_reindex.index.year.astype('float32')
X_test_reindex['Month'] = X_test_reindex.index.month.astype('float32')
X_test_reindex['Day'] = X_test_reindex.index.day.astype('float32')

# Drop the datetime index and the y/x coordinate columns before training
X_train_reindex = X_train_reindex.reset_index(drop=True).drop(['y', 'x'], axis=1)
y_train_reindex = y_train_reindex.reset_index(drop=True)
X_test_reindex = X_test_reindex.reset_index(drop=True).drop(['y', 'x'], axis=1)
y_test_reindex = y_test_reindex.reset_index(drop=True)
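Since the traceback dies while XGBoost is ingesting the pandas data column-by-column (XGProxyDMatrixSetDataColumnar), my current suspicion is that this reshaping leaves the final frames with something structurally odd. A sketch of what I intend to inspect on the reindexed outputs (illustrative lines, not from my notebook):

# Any duplicated column labels in the final training frame?
print(X_train_reindex.columns[X_train_reindex.columns.duplicated()].tolist())

# Dtypes should all be plain float32 after the manipulation
print(X_train_reindex.dtypes.value_counts())

# Label shape and type after reset_index
print(type(y_train_reindex), np.asarray(y_train_reindex).shape)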
I expected model training to run exactly as it does with the non-manipulated dataset.