I'm trying a standard Bayesian search optimisation (BayesSearchCV) of XGBClassifier parameters on some datasets that have been manipulated to remove NaNs, drop rows, change the index, etc.
This is an overview of the data I am working with:
x_train shape: (249212, 44)
x_train type: <class 'pandas.core.frame.DataFrame'>
y_train shape: (249212,)
y_train type: <class 'pandas.core.series.Series'> (it has also been a DataFrame at points during testing)
X_train dtypes:
col_1_name    float32
col_2_name    float32
col_3_name    float32
…
col_40_name    float32
col_41_name    float32
col_42_name    float32
col_43_name    float32
col_44_name    float32
dtype: object
y_train dtype:
int8
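For reference, this overview comes from plain inspection code along these lines (a sketch, not copied verbatim from my notebook):

print("x_train shape:", X_train.shape)
print("x_train type:", type(X_train))
print("y_train shape:", y_train.shape)
print("y_train type:", type(y_train))
print("X_train dtypes:")
print(X_train.dtypes)
print("y_train dtype:", y_train.dtype)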
assert X_train.index.equals(y_train.index), "Indices do not match"
assert X_train.shape[0] == y_train.shape[0], "Mismatched number of rows"
Neither assertion fails.
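Given that the failing check in the traceback is m == 1 || n == 1 (as far as I can tell, XGBoost is asserting that an array it receives is one-dimensional), these are some extra sanity checks along the same lines; this is a sketch, not code from my notebook:

import numpy as np

# The label should be a flat 1-D vector
assert np.asarray(y_train).ndim == 1

# Duplicate column names would make X_train[name] return a 2-D
# DataFrame instead of a 1-D Series
assert not X_train.columns.duplicated().any()

# Each column's underlying array should itself be 1-D
for col in X_train.columns:
    assert X_train[col].to_numpy().ndim == 1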
This is the error that occurs. (Note: for ease of reading I have renamed opt_X_train to X_train in this post, and likewise for the y and test variables, so ignore the variable-name differences in the traceback.)
{
"name": "XGBoostError",
"message": "[12:37:08] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-06abd128ca6c1688d-1\xgboost\xgboost-ci-windows\src\data\array_interface.h:218: Check failed: m == 1 || n == 1: ",
"stack": "---------------------------------------------------------------------------
XGBoostError Traceback (most recent call last)
Cell In[35], line 196
193 assert opt_X_train.shape[0] == opt_y_train.shape[0], "Mismatched number of rows!"
195 simple_model = XGBClassifier(objective='binary:logistic', eval_metric=['auc', 'logloss'])
--> 196 simple_model.fit(opt_X_train, opt_y_train)
197 # search = opt.fit(opt_X_train.drop(['y', 'x'], axis = 1), opt_y_train,
198 # eval_set =[(opt_X_train.drop(['y', 'x'], axis = 1), opt_y_train),
199 # (opt_X_test.drop(['y', 'x'], axis = 1), opt_y_test)],
(...)
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\core.py:726, in require_keyword_args.<locals>.throw_if.<locals>.inner_f(*args, **kwargs)
724 for k, arg in zip(sig.parameters, args):
725 kwargs[k] = arg
--> 726 return func(**kwargs)
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\sklearn.py:1512, in XGBClassifier.fit(self, X, y, sample_weight, base_margin, eval_set, verbose, xgb_model, sample_weight_eval_set, base_margin_eval_set, feature_weights)
1509 params["num_class"] = self.n_classes_
1511 model, metric, params = self._configure_fit(xgb_model, params)
-> 1512 train_dmatrix, evals = _wrap_evaluation_matrices(
1513 missing=self.missing,
1514 X=X,
1515 y=y,
1516 group=None,
1517 qid=None,
1518 sample_weight=sample_weight,
1519 base_margin=base_margin,
1520 feature_weights=feature_weights,
1521 eval_set=eval_set,
1522 sample_weight_eval_set=sample_weight_eval_set,
1523 base_margin_eval_set=base_margin_eval_set,
1524 eval_group=None,
1525 eval_qid=None,
1526 create_dmatrix=self._create_dmatrix,
1527 enable_categorical=self.enable_categorical,
1528 feature_types=self.feature_types,
1529 )
1531 self._Booster = train(
1532 params,
1533 train_dmatrix,
(...)
1542 callbacks=self.callbacks,
1543 )
1545 if not callable(self.objective):
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\sklearn.py:596, in _wrap_evaluation_matrices(missing, X, y, group, qid, sample_weight, base_margin, feature_weights, eval_set, sample_weight_eval_set, base_margin_eval_set, eval_group, eval_qid, create_dmatrix, enable_categorical, feature_types)
576 def _wrap_evaluation_matrices(
577 missing: float,
578 X: Any,
(...)
592 feature_types: Optional[FeatureTypes],
593 ) -> Tuple[Any, List[Tuple[Any, str]]]:
594 """Convert array_like evaluation matrices into DMatrix. Perform validation on the
595 way."""
--> 596 train_dmatrix = create_dmatrix(
597 data=X,
598 label=y,
599 group=group,
600 qid=qid,
601 weight=sample_weight,
602 base_margin=base_margin,
603 feature_weights=feature_weights,
604 missing=missing,
605 enable_categorical=enable_categorical,
606 feature_types=feature_types,
607 ref=None,
608 )
610 n_validation = 0 if eval_set is None else len(eval_set)
612 def validate_or_none(meta: Optional[Sequence], name: str) -> Sequence:
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\sklearn.py:1003, in XGBModel._create_dmatrix(self, ref, **kwargs)
1001 if _can_use_qdm(self.tree_method) and self.booster != "gblinear":
1002 try:
-> 1003 return QuantileDMatrix(
1004 **kwargs, ref=ref, nthread=self.n_jobs, max_bin=self.max_bin
1005 )
1006 except TypeError: # `QuantileDMatrix` supports lesser types than DMatrix
1007 pass
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\core.py:726, in require_keyword_args.<locals>.throw_if.<locals>.inner_f(*args, **kwargs)
724 for k, arg in zip(sig.parameters, args):
725 kwargs[k] = arg
--> 726 return func(**kwargs)
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\core.py:1573, in QuantileDMatrix.__init__(self, data, label, weight, base_margin, missing, silent, feature_names, feature_types, nthread, max_bin, ref, group, qid, label_lower_bound, label_upper_bound, feature_weights, enable_categorical, data_split_mode)
1553 if any(
1554 info is not None
1555 for info in (
(...)
1566 )
1567 ):
1568 raise ValueError(
1569 "If data iterator is used as input, data like label should be "
1570 "specified as batch argument."
1571 )
-> 1573 self._init(
1574 data,
1575 ref=ref,
1576 label=label,
1577 weight=weight,
1578 base_margin=base_margin,
1579 group=group,
1580 qid=qid,
1581 label_lower_bound=label_lower_bound,
1582 label_upper_bound=label_upper_bound,
1583 feature_weights=feature_weights,
1584 feature_names=feature_names,
1585 feature_types=feature_types,
1586 enable_categorical=enable_categorical,
1587 )
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\core.py:1632, in QuantileDMatrix._init(self, data, ref, enable_categorical, **meta)
1620 config = make_jcargs(
1621 nthread=self.nthread, missing=self.missing, max_bin=self.max_bin
1622 )
1623 ret = _LIB.XGQuantileDMatrixCreateFromCallback(
1624 None,
1625 it.proxy.handle,
(...)
1630 ctypes.byref(handle),
1631 )
-> 1632 it.reraise()
1633 # delay check_call to throw intermediate exception first
1634 _check_call(ret)
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\core.py:569, in DataIter.reraise(self)
567 exc = self._exception
568 self._exception = None
--> 569 raise exc
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\core.py:550, in DataIter._handle_exception(self, fn, dft_ret)
547 return dft_ret
549 try:
--> 550 return fn()
551 except Exception as e: # pylint: disable=broad-except
552 # Defer the exception in order to return 0 and stop the iteration.
553 # Exception inside a ctype callback function has no effect except
554 # for printing to stderr (doesn't stop the execution).
555 tb = sys.exc_info()[2]
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\core.py:637, in DataIter._next_wrapper.<locals>.<lambda>()
635 self._temporary_data = None
636 # pylint: disable=not-callable
--> 637 return self._handle_exception(lambda: self.next(input_data), 0)
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\data.py:1416, in SingleBatchInternalIter.next(self, input_data)
1414 return 0
1415 self.it += 1
-> 1416 input_data(**self.kwargs)
1417 return 1
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\core.py:726, in require_keyword_args.<locals>.throw_if.<locals>.inner_f(*args, **kwargs)
724 for k, arg in zip(sig.parameters, args):
725 kwargs[k] = arg
--> 726 return func(**kwargs)
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\core.py:625, in DataIter._next_wrapper.<locals>.input_data(data, feature_names, feature_types, **kwargs)
623 # Stage the data, meta info are copied inside C++ MetaInfo.
624 self._temporary_data = (new, cat_codes, feature_names, feature_types)
--> 625 dispatch_proxy_set_data(self.proxy, new, cat_codes)
626 self.proxy.set_info(
627 feature_names=feature_names,
628 feature_types=feature_types,
629 **kwargs,
630 )
631 self._data_ref = ref
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\data.py:1492, in dispatch_proxy_set_data(proxy, data, cat_codes)
1490 # Host
1491 if isinstance(data, PandasTransformed):
-> 1492 proxy._ref_data_from_pandas(data) # pylint: disable=W0212
1493 return
1494 if _is_np_array_like(data):
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\core.py:1466, in _ProxyDMatrix._ref_data_from_pandas(self, data)
1464 def _ref_data_from_pandas(self, data: DataType) -> None:
1465 """Reference data from a pandas DataFrame. The input is a PandasTransformed instance."""
-> 1466 _check_call(
1467 _LIB.XGProxyDMatrixSetDataColumnar(self.handle, data.array_interface())
1468 )
File c:\Users\name\miniconda3\envs\pifind\lib\site-packages\xgboost\core.py:284, in _check_call(ret)
273 """Check the return value of C API call
274
275 This function will raise exception when error occurs.
(...)
281 return value from API calls
282 """
283 if ret != 0:
--> 284 raise XGBoostError(py_str(_LIB.XGBGetLastError()))
**XGBoostError: [12:37:08] C:\buildkite-agent\builds\buildkite-windows-cpu-autoscaling-group-i-06abd128ca6c1688d-1\xgboost\xgboost-ci-windows\src\data\array_interface.h:218: Check failed: m == 1 || n == 1: "**
}
These are the code sections I believe are relevant:
import numpy as np
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold
from skopt import BayesSearchCV

# Base estimator for the search
estimator = XGBClassifier(
    objective='binary:logistic',
    scale_pos_weight=0.75,
    eval_metric=['auc', 'logloss'],
    missing=np.nan,
    early_stopping_rounds=200,
    verbosity=2,
)

search_space = {
    'learning_rate': (0.000001, 0.2, 'log-uniform'),
    'min_child_weight': (0, 10),
    'max_depth': (0, 10),
    'max_delta_step': (0, 20),
    'subsample': (0.01, 1.0, 'uniform'),
    'colsample_bytree': (0.01, 1.0, 'uniform'),
    'colsample_bylevel': (0.01, 1.0, 'uniform'),
    'reg_lambda': (1e-9, 1000, 'log-uniform'),
    'reg_alpha': (1e-9, 1.0, 'log-uniform'),
    'gamma': (1e-9, 0.5, 'log-uniform'),
    'n_estimators': (1, 100),
    # 'scale_pos_weight': (1e-6, 500, 'log-uniform'),
}

cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

opt = BayesSearchCV(
    estimator=estimator,
    search_spaces=search_space,
    scoring='f1',
    # fit_params=param_grid,
    cv=cv,
    n_jobs=-1,
    verbose=2,
    n_iter=75,
    refit=True,
    random_state=42,
)

…

search = opt.fit(
    X_train, y_train,
    eval_set=[(X_train, y_train), (X_test, y_test)],
    verbose=2,
)
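For what it's worth, the failure is not specific to BayesSearchCV: as the traceback shows (Cell In[35], line 196), even a plain fit with a bare classifier raises the same error:

simple_model = XGBClassifier(objective='binary:logistic', eval_metric=['auc', 'logloss'])
simple_model.fit(X_train, y_train)  # raises the XGBoostError shown above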
This is the function that does most of the manipulation of the original X_train and y_train:
import pandas as pd

def sample_group(group, indices):
    # Take the first min(len(group), len(indices)) rows (currently unused helper)
    vals = group.iloc[indices[:min(len(group), len(indices))]]
    return vals

def strat_split(X, Y):
    # Combine features and labels so they can be sampled together
    df_all = pd.concat([X, Y], axis=1)

    unique_datetimes_sorted = sorted(df_all.index.unique())
    print("Unique datetime indices:")
    for dt in unique_datetimes_sorted:
        print(dt)

    # Sample up to 5000 positive rows per timestamp
    truths = df_all[df_all['truth'] == 1].groupby('time', group_keys=False).apply(
        lambda x: x.sample(min(len(x), 5000))
    )
    sample_sizes = truths.groupby('time').size()

    # Sample the same number of negative rows per timestamp
    def sample_falses(group):
        n_falses = sample_sizes.get(group.name, 0)
        return group.sample(n=n_falses)

    falses = df_all[df_all['truth'] == 0].groupby('time', group_keys=False).apply(sample_falses)

    all_new = pd.concat([truths, falses], axis=0).sort_values(by='time')
    # all_bands and planet_bands are lists of feature-column names defined elsewhere
    all_new = all_new.dropna(subset=all_bands, how='all')
    all_new = all_new.dropna(subset=planet_bands, how='all')
    all_new = all_new[['y', 'x'] + all_bands + ['truth']]

    X_new = all_new.drop(columns=['truth'])
    Y_new = all_new['truth']
    return X_new, Y_new

X_train, y_train = strat_split(X_train, y_train)
X_test, y_test = strat_split(X_test, y_test)

# Replace the datetime index with explicit Year/Month/Day feature columns
X_train_reindex = X_train.copy()
y_train_reindex = y_train.copy()
X_train_reindex['Year'] = X_train_reindex.index.year.astype('float32')
X_train_reindex['Month'] = X_train_reindex.index.month.astype('float32')
X_train_reindex['Day'] = X_train_reindex.index.day.astype('float32')

X_test_reindex = X_test.copy()
y_test_reindex = y_test.copy()
X_test_reindex['Year'] = X_test_reindex.index.year.astype('float32')
X_test_reindex['Month'] = X_test_reindex.index.month.astype('float32')
X_test_reindex['Day'] = X_test_reindex.index.day.astype('float32')

# Drop the datetime index and the y/x coordinate columns before training
X_train_reindex = X_train_reindex.reset_index(drop=True).drop(['y', 'x'], axis=1)
y_train_reindex = y_train_reindex.reset_index(drop=True)
X_test_reindex = X_test_reindex.reset_index(drop=True).drop(['y', 'x'], axis=1)
y_test_reindex = y_test_reindex.reset_index(drop=True)
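Since the traceback dies while XGBoost is ingesting the pandas data column-by-column (XGProxyDMatrixSetDataColumnar), my current suspicion is that this reshaping leaves the final frames with something structurally odd. A sketch of what I intend to inspect on the reindexed outputs (illustrative lines, not from my notebook):

# Any duplicated column labels in the final training frame?
print(X_train_reindex.columns[X_train_reindex.columns.duplicated()].tolist())

# Dtypes should all be plain float32 after the manipulation
print(X_train_reindex.dtypes.value_counts())

# Label shape and type after reset_index
print(type(y_train_reindex), np.asarray(y_train_reindex).shape)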
I expected model training to run exactly as it does with the non-manipulated dataset.