I am wondering the following about how TunedThresholdClassifierCV works internally:
When combined with 5-fold cross-validation, for example, does TunedThresholdClassifierCV take the four training folds, perform 5-fold cross-validation within those 4 folds to determine the optimal threshold, and then evaluate the model on the 5th original fold?
What if I use cv=0.8 instead of the default 5 folds? Does it take 80% of the 4 folds, select the threshold on the remaining 20% of those 4 folds, and then evaluate the model on the 1 fold held out for testing?
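To make the cv=0.8 case concrete, here is a rough manual sketch of the data flow I have in mind: an outer 5-fold loop, and inside each outer training set a single split on which the threshold is picked before refitting and scoring on the held-out outer fold. This is only my mental model of what might happen, not the library's confirmed internals; the threshold grid and the balanced accuracy metric are arbitrary choices for illustration.

import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import StratifiedKFold, train_test_split

X, y = load_breast_cancer(return_X_y=True)

outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
outer_scores = []
for train_idx, test_idx in outer_cv.split(X, y):
    X_tr, y_tr = X[train_idx], y[train_idx]
    X_te, y_te = X[test_idx], y[test_idx]

    # Single inner split of the outer training folds: fit on one part,
    # pick the threshold on the held-out part (my reading of cv=0.8).
    X_fit, X_val, y_fit, y_val = train_test_split(
        X_tr, y_tr, test_size=0.2, stratify=y_tr, random_state=42
    )
    clf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_fit, y_fit)
    val_proba = clf.predict_proba(X_val)[:, 1]
    thresholds = np.linspace(0.01, 0.99, 99)
    best_threshold = max(
        thresholds,
        key=lambda t: balanced_accuracy_score(y_val, (val_proba >= t).astype(int)),
    )

    # Refit on the full outer training folds and evaluate on the outer test
    # fold using the tuned threshold.
    clf.fit(X_tr, y_tr)
    test_pred = (clf.predict_proba(X_te)[:, 1] >= best_threshold).astype(int)
    outer_scores.append(balanced_accuracy_score(y_te, test_pred))

print("Outer-fold scores:", outer_scores)

Is that roughly what happens under the hood (modulo the exact metric and threshold grid)?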
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import (
    TunedThresholdClassifierCV,
    cross_val_score,
    train_test_split,
)
# Load the breast cancer dataset
data = load_breast_cancer()
X = data.data
y = data.target
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Initialize the RandomForestClassifier model
rf = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
# Wrap the classifier so the decision threshold is tuned on internal splits
model = TunedThresholdClassifierCV(rf, cv=5)
# model = TunedThresholdClassifierCV(rf, cv=0.8)  # single-split variant from the question above
# Perform 5-fold cross-validation on the training data
cv_scores = cross_val_score(model, X_train, y_train, cv=5)
print("Cross-Validation Scores:", cv_scores)
print("Mean CV Score:", np.mean(cv_scores))