I am trying to calculate the optimism-adjusted AUC’s via a double for-loop over n = n_eucl_sil ... n_gow_gap
and for three clustering algorithms. This should result in three print statements (see end). I am encountering two errors: first, “index out of bounds” for the K-medoids algorithm; second, print statements for each bootstrap iteration, instead of one (averaged) print. This is possibly due to wrong code indentation.
algorithms = {
'KMedoids': KMedoids,
'AgglomerativeClustering': AgglomerativeClustering,
'SpectralClustering': SpectralClustering
}
# Iterate through clustering algorithms and calculate AUC Apparent
for algorithm, labels in cluster_labels.items():
# Create DataFrame for logistic regression with cluster labels
df_orig = pd.DataFrame({"y_orig": np.ravel(y_dich), "labels_orig": np.ravel(labels)})
# Fit logistic regression model
model = sm.formula.glm(formula="y_orig ~ labels_orig", family=sm.families.Binomial(), data=df_orig).fit()
# Calculate AUC Apparent
y_pred_apparent = model.predict(df_orig['labels_orig'])
auc_apparent = roc_auc_score(df_orig['y_orig'], y_pred_apparent)
# Append results to auc_apparents DataFrame
auc_apparents = auc_apparents.append(
{
'method': algorithm,
'auc_apparent': auc_apparent
}, ignore_index=True)
# Display AUC Apparent values
print(auc_apparents)
print()
# Iterate through clustering algorithms and perform bootstrap iterations
for algorithm_name, algorithm in algorithms.items():
optimisms_eucl_sil, optimisms_eucl_gap, optimisms_gow_sil, optimisms_gow_gap = [], [], [], []
ls_orig_eucl_sil, ls_orig_eucl_gap, ls_orig_gow_sil, ls_orig_gow_gap = [pd.DataFrame() for _ in range(4)]
# Perform bootstrap iterations
for i in range(n_bootstraps):
sample = resample(orig_data, replace=True, n_samples=4509)
sample_gap = resample(orig_data_gap, replace=True, n_samples=4509)
y_boot = sample['y_orig']
n_eucl_sil = best_silhouette_euclidean(algorithm, sample.iloc[:, 1:], range_n_clusters)[0]
n_eucl_gap = GapStatEucl.fit_predict(algorithm, K, sample_gap.iloc[:,1:])
n_gow_sil = best_silhouette_gower(algorithm, gower.gower_matrix(sample.iloc[:, 1:]), range_n_clusters)[0]
n_gow_gap = GapStatGow.fit_predict(algorithm, K, pd.DataFrame(sample_gap.iloc[:, 1:]))
# (2) AUC Bootstrap
try:
if algorithm_name == 'KMedoids':
clusterer_eucl_sil = KMedoids(n_clusters=n_eucl_sil).fit(sample.iloc[:, 1:])
clusterer_eucl_gap = KMedoids(n_clusters=n_eucl_gap).fit(sample.iloc[:, 1:])
clusterer_gow_sil = KMedoids(n_clusters=n_gow_sil, metric='precomputed').fit(gower.gower_matrix(sample.iloc[:,1:]))
clusterer_gow_gap = KMedoids(n_clusters=n_gow_gap, metric='precomputed').fit(gower.gower_matrix(sample.iloc[:,1:]))
l_orig_eucl_sil = pd.DataFrame(clusterer_eucl_sil.predict(orig_data.iloc[:, 1:]))
l_orig_eucl_gap = pd.DataFrame(clusterer_eucl_gap.predict(orig_data.iloc[:, 1:]))
l_orig_gow_sil = pd.DataFrame(clusterer_gow_sil.predict(orig_data.iloc[:, 1:]))
l_orig_gow_gap = pd.DataFrame(clusterer_gow_gap.predict(orig_data.iloc[:, 1:]))
elif algorithm_name == 'AgglomerativeClustering':
clusterer_eucl_sil = AgglomerativeClustering(n_clusters=n_eucl_sil,linkage='average').fit(sample.iloc[:, 1:])
clusterer_eucl_gap = AgglomerativeClustering(n_clusters=n_eucl_gap,linkage='average').fit(sample.iloc[:, 1:])
clusterer_gow_sil = AgglomerativeClustering(n_clusters=n_gow_sil, metric='precomputed', linkage='average').fit(gower.gower_matrix(sample.iloc[:, 1:]))
clusterer_gow_gap = AgglomerativeClustering(n_clusters=n_gow_gap, metric='precomputed', linkage='average').fit(gower.gower_matrix(sample.iloc[:, 1:]))
l_orig_eucl_sil = pd.DataFrame(clusterer_eucl_sil.fit_predict(orig_data.iloc[:, 1:]))
l_orig_eucl_gap = pd.DataFrame(clusterer_eucl_gap.fit_predict(orig_data.iloc[:, 1:]))
l_orig_gow_sil = pd.DataFrame(clusterer_gow_sil.fit_predict(gower_matrix))
l_orig_gow_gap = pd.DataFrame(clusterer_gow_gap.fit_predict(gower_matrix))
elif algorithm_name == 'SpectralClustering':
clusterer_eucl_sil = SpectralClustering(n_clusters=n_eucl_sil, affinity='nearest_neighbors').fit(sample.iloc[:, 1:])
clusterer_eucl_gap = SpectralClustering(n_clusters=n_eucl_gap, affinity='nearest_neighbors').fit(sample.iloc[:, 1:])
clusterer_gow_sil = SpectralClustering(n_clusters=n_gow_sil, affinity='precomputed').fit(1 - gower.gower_matrix(sample.iloc[:,1:]))
clusterer_gow_gap = SpectralClustering(n_clusters=n_gow_gap, affinity='precomputed').fit(1 - gower.gower_matrix(sample.iloc[:, 1:]))
l_orig_eucl_sil = pd.DataFrame(clusterer_eucl_sil.fit_predict(orig_data.iloc[:, 1:]))
l_orig_eucl_gap = pd.DataFrame(clusterer_eucl_gap.fit_predict(orig_data.iloc[:, 1:]))
l_orig_gow_sil = pd.DataFrame(clusterer_gow_sil.fit_predict(gower.gower_matrix(sample.iloc[:,1:])))
l_orig_gow_gap = pd.DataFrame(clusterer_gow_gap.fit_predict(gower.gower_matrix(sample.iloc[:,1:])))
l_boot_eucl_sil = pd.DataFrame(clusterer_eucl_sil.labels_)
l_boot_eucl_gap = pd.DataFrame(clusterer_eucl_gap.labels_)
l_boot_gow_sil = pd.DataFrame(clusterer_gow_sil.labels_)
l_boot_gow_gap = pd.DataFrame(clusterer_gow_gap.labels_)
df_boot_eucl_sil = pd.DataFrame({"y_boot": np.ravel(y_boot), "l_boot_eucl_sil": np.ravel(l_boot_eucl_sil)})
df_boot_eucl_gap = pd.DataFrame({"y_boot": np.ravel(y_boot), "l_boot_eucl_gap": np.ravel(l_boot_eucl_gap)})
df_boot_gow_sil = pd.DataFrame({"y_boot": np.ravel(y_boot), "l_boot_gow_sil": np.ravel(l_boot_gow_sil)})
df_boot_gow_gap = pd.DataFrame({"y_boot": np.ravel(y_boot), "l_boot_gow_gap": np.ravel(l_boot_gow_gap)})
m_eucl_sil = sm.formula.glm(formula="y_boot ~ l_boot_eucl_sil", family=sm.families.Binomial(), data=df_boot_eucl_sil).fit()
m_eucl_gap = sm.formula.glm(formula="y_boot ~ l_boot_eucl_gap", family=sm.families.Binomial(), data=df_boot_eucl_gap).fit()
m_gow_sil = sm.formula.glm(formula="y_boot ~ l_boot_gow_sil", family=sm.families.Binomial(), data=df_boot_gow_sil).fit()
m_gow_gap = sm.formula.glm(formula="y_boot ~ l_boot_gow_gap", family=sm.families.Binomial(), data=df_boot_gow_gap).fit()
# Calculate AUC Bootstrap
models = [m_eucl_sil, m_eucl_gap, m_gow_sil, m_gow_gap]
label_dfs = [l_boot_eucl_sil, l_boot_eucl_gap, l_boot_gow_sil, l_boot_gow_gap]
auc_scores = []
# Iterate over models and label dataframes
for model, label_df in zip(models, label_dfs):
# Predictions
y_pred_boot = model.predict(label_df)
# AUC score
auc_boot = roc_auc_score(y_boot, y_pred_boot)
# Append to the list of AUC scores
auc_scores.append(auc_boot)
# Separate the AUC scores for each algorithm
auc_boot_eucl_sil, auc_boot_eucl_gap, auc_boot_gow_sil, auc_boot_gow_gap = [auc_scores[i] for i in range(4)]
# Append to the respective lists
aucs_boot_eucl_sil.append(auc_boot_eucl_sil)
aucs_boot_eucl_gap.append(auc_boot_eucl_gap)
aucs_boot_gow_sil.append(auc_boot_gow_sil)
aucs_boot_gow_gap.append(auc_boot_gow_gap)
# Calculate AUC Original
y_pred_orig_eucl_sil = m_eucl_sil.predict(pd.DataFrame(df_orig['labels_orig']))
y_pred_orig_eucl_gap = m_eucl_gap.predict(pd.DataFrame(df_orig['labels_orig']))
y_pred_orig_gow_sil = m_gow_sil.predict(pd.DataFrame(df_orig['labels_orig']))
y_pred_orig_gow_gap = m_gow_gap.predict(pd.DataFrame(df_orig['labels_orig']))
auc_orig_eucl_sil = roc_auc_score(df_orig['y_orig'], y_pred_orig_eucl_sil)
auc_orig_eucl_gap = roc_auc_score(df_orig['y_orig'], y_pred_orig_eucl_gap)
auc_orig_gow_sil = roc_auc_score(df_orig['y_orig'], y_pred_orig_gow_sil)
auc_orig_gow_gap = roc_auc_score(df_orig['y_orig'], y_pred_orig_gow_gap)
# Calculate optimism and append results
optimism_eucl_sil = auc_boot_eucl_sil - auc_orig_eucl_sil
optimism_eucl_gap = auc_boot_eucl_gap - auc_orig_eucl_gap
optimism_gow_sil = auc_boot_gow_sil - auc_orig_gow_sil
optimism_gow_gap = auc_boot_gow_gap - auc_orig_gow_gap
optimisms_eucl_sil.append(optimism_eucl_sil)
optimisms_eucl_gap.append(optimism_eucl_gap)
optimisms_gow_sil.append(optimism_gow_sil)
optimisms_gow_gap.append(optimism_gow_gap)
ls_orig_eucl_sil = pd.concat([ls_orig_eucl_sil, l_orig_eucl_sil.reset_index(drop=True)], axis=1)
ls_orig_eucl_gap = pd.concat([ls_orig_eucl_gap, l_orig_eucl_gap.reset_index(drop=True)], axis=1)
ls_orig_gow_sil = pd.concat([ls_orig_gow_sil, l_orig_gow_sil.reset_index(drop=True)], axis=1)
ls_orig_gow_gap = pd.concat([ls_orig_gow_gap, l_orig_gow_gap.reset_index(drop=True)], axis=1)
# Calculate average optimism
optimism_eucl_sil_avg = np.sum(optimisms_eucl_sil) / n_bootstraps
optimism_eucl_gap_avg = np.sum(optimisms_eucl_gap) / n_bootstraps
optimism_gow_sil_avg = np.sum(optimisms_gow_sil) / n_bootstraps
optimism_gow_gap_avg = np.sum(optimisms_gow_gap) / n_bootstraps
std_auc_boot_eucl_sil = np.std(aucs_boot_eucl_sil)
std_auc_boot_eucl_gap = np.std(aucs_boot_eucl_gap)
std_auc_boot_gow_sil = np.std(aucs_boot_gow_sil)
std_auc_boot_gow_gap = np.std(aucs_boot_gow_gap)
except Exception as e:
print("An error occurred:", e)
continue
print(f"Algorithm: {algorithm_name}n"
f"Optimism Metrics:n"
f" - Euclidean Silhouette: {optimism_eucl_sil_avg}n"
f" - Euclidean Gap: {optimism_eucl_gap_avg}n"
f" - Gower Silhouette: {optimism_gow_sil_avg}n"
f" - Gower Gap: {optimism_gow_gap_avg}n"
f"Standard Deviation of Bootstrap AUCs:n"
f" - Euclidean Silhouette: {std_auc_boot_eucl_sil}n"
f" - Euclidean Gap: {std_auc_boot_eucl_gap}n"
f" - Gower Silhouette: {std_auc_boot_eucl_gap}n"
f" - Gower Gap: {std_auc_boot_gow_gap}n")