I want to find the optimal decision threshold for class prediction in my GroupKFold cross-validation, and I found the TunedThresholdClassifierCV class. My code is listed below — is the way I am using this class correct? How can I print out the tuned threshold for each model, and how can I plot the F1 score vs. threshold curve?
def run_model_augmentation_TunedThreshold(X, y, groups, selection_threshold=0.5):
gkf = GroupKFold(n_splits=10)
pipelines = {
'scaledLR': ImbPipeline([
('scaled', MinMaxScaler()),
('smote_tomek', SMOTETomek(random_state=0)),
('LR', LogisticRegression(solver='lbfgs', random_state=0))
]),
'scaledXGB': ImbPipeline([
('scaled', MinMaxScaler()),
('smote_tomek', SMOTETomek(random_state=0)),
('XGB', XGBClassifier(random_state=0))
]),
'scaledLR_ENN': ImbPipeline([
('scaled', MinMaxScaler()),
('enn', EditedNearestNeighbours()),
('LR', LogisticRegression(solver='lbfgs', random_state=0))
]),
'scaledXGB_ENN': ImbPipeline([
('scaled', MinMaxScaler()),
('enn', EditedNearestNeighbours()),
('XGB', XGBClassifier(random_state=0))
]),
'scaledSVM_RBF_ENN': ImbPipeline([
('scaled', MinMaxScaler()),
('enn', EditedNearestNeighbours()),
('SVM', SVC(kernel='rbf', probability=True, C=175, gamma=0.033, random_state=0))
]),
'scaledSVM_linear_ENN': ImbPipeline([
('scaled', MinMaxScaler()),
('enn', EditedNearestNeighbours()),
('SVM', SVC(kernel='linear', probability=True, C=175, gamma=0.01, random_state=0))
])
}
# Canvas 1: Confusion matrices
fig_cm, axes_cm = plt.subplots(1, len(pipelines), figsize=(20, 5))
# Canvas 2: PR Curve, ROC Curve, and Calibration plot
fig_main, (ax_pr, ax_roc, ax_cal) = plt.subplots(1, 3, figsize=(18, 6))
for i, (pipe, model) in enumerate(pipelines.items()):
# # Use cross_val_predict to get predicted probabilities
pos_label = 1
scorer = make_scorer(f1_score, pos_label=pos_label)
tuned_model = TunedThresholdClassifierCV(model, scoring=scorer,random_state=0, store_cv_results=True)
predicted_probabilities = cross_val_predict(tuned_model, X, y.values.ravel(), groups=groups, cv=gkf, method='predict_proba')[:, 1]
# print(predicted_probabilities,len(predicted_probabilities))
y_pred = cross_val_predict(tuned_model, X, y.values.ravel(), groups=groups, cv=gkf, method='predict')
# Compute metrics from the predictions
accuracy = accuracy_score(y.values.ravel(), y_pred)
precision_metric = precision_score(y.values.ravel(), y_pred)
recall_metric = recall_score(y.values.ravel(), y_pred)
f1 = f1_score(y.values.ravel(), y_pred)
mse = mean_squared_error(y.values.ravel(), y_pred)