I am performing meta-feature extraction, but at some point the computation consumes all available memory, so Ubuntu keeps killing the process (`Killed`) for excessive memory utilisation.
I decided to split the computation into smaller tasks and then aggregate the final result. This seems fine, but I notice that some metric computations fail. I tested this on a small dataset in two scenarios: 1) using the entire dataset (since it fits into memory), 2) using the n_splits computation I implemented.
I expect scenario 2 to give a final result approximately equal to that of scenario 1; however, scenario 2 fails to compute several of the measures.
To give an MWE, I illustrate this using the iris dataset as follows:
<code>!pip -q install pymfe # library for meta-feature comput.
from sklearn.datasets import load_iris
from pymfe.mfe import MFE
X, y= data.data, data.target
<code>!pip -q install pymfe # library for meta-feature comput.
import numpy as np
from sklearn.datasets import load_iris
from pymfe.mfe import MFE
data = load_iris()
X, y= data.data, data.target
</code>
!pip -q install pymfe # library for meta-feature comput.
import numpy as np
from sklearn.datasets import load_iris
from pymfe.mfe import MFE
data = load_iris()
X, y= data.data, data.target
Scenario 1 computing over whole data
<code>features_to_compute = ['f1', 'f2', 'f3', 't1'] # meta-features
summaries = ['min', 'max', 'mean', 'sd'] # summary
extractor = MFE(features=features_to_compute, groups=["complexity"], summary=summaries)
res = extractor.extract()
for i in range(len(res[0])):
var_sp = res[0][i].split('.')
print(f"{res[0][i]} = {res[1][i]}n")
f1.max = 0.599217152923665
f1.mean = 0.2775641932566493
f1.min = 0.05862828094263208
f1.sd = 0.2612622587707819
f2.max = 0.01914529914529914
f2.mean = 0.0063817663817663794
f2.sd = 0.011053543615254369
f3.mean = 0.12333333333333334
f3.sd = 0.21361959960016152
<code>features_to_compute = ['f1', 'f2', 'f3', 't1'] # meta-features
summaries = ['min', 'max', 'mean', 'sd'] # summary
extractor = MFE(features=features_to_compute, groups=["complexity"], summary=summaries)
extractor.fit(X,y)
res = extractor.extract()
# results
for i in range(len(res[0])):
var_sp = res[0][i].split('.')
g_name = var_sp[0]
print(f"{res[0][i]} = {res[1][i]}n")
f1.max = 0.599217152923665
f1.mean = 0.2775641932566493
f1.min = 0.05862828094263208
f1.sd = 0.2612622587707819
f2.max = 0.01914529914529914
f2.mean = 0.0063817663817663794
f2.min = 0.0
f2.sd = 0.011053543615254369
f3.max = 0.37
f3.mean = 0.12333333333333334
f3.min = 0.0
f3.sd = 0.21361959960016152
t1 = 0.12
</code>
features_to_compute = ['f1', 'f2', 'f3', 't1'] # meta-features
summaries = ['min', 'max', 'mean', 'sd'] # summary
extractor = MFE(features=features_to_compute, groups=["complexity"], summary=summaries)
extractor.fit(X,y)
res = extractor.extract()
# results
for i in range(len(res[0])):
var_sp = res[0][i].split('.')
g_name = var_sp[0]
print(f"{res[0][i]} = {res[1][i]}n")
f1.max = 0.599217152923665
f1.mean = 0.2775641932566493
f1.min = 0.05862828094263208
f1.sd = 0.2612622587707819
f2.max = 0.01914529914529914
f2.mean = 0.0063817663817663794
f2.min = 0.0
f2.sd = 0.011053543615254369
f3.max = 0.37
f3.mean = 0.12333333333333334
f3.min = 0.0
f3.sd = 0.21361959960016152
t1 = 0.12
This seems fine.
Scenario 2 splitting to smaller tasks for large dataset beyond available memory.
def split_dataset(X, y, n_splits, interleave=False):
    """Split X and y into n_splits smaller datasets.

    Parameters
    ----------
    X, y : array-like
        Feature matrix and label vector, split in lockstep.
    n_splits : int
        Number of parts to produce.
    interleave : bool, default False
        If True, deal rows round-robin (X[i::n_splits]) instead of taking
        contiguous chunks. On a dataset sorted by label (iris is), a
        contiguous chunk may contain a single class, which makes the
        class-based complexity measures come back NaN — interleaving
        gives every split samples of every class.

    Returns
    -------
    (list of ndarray, list of ndarray)
        The per-split feature matrices and label vectors.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if interleave:
        split_X = [X[i::n_splits] for i in range(n_splits)]
        split_y = [y[i::n_splits] for i in range(n_splits)]
    else:
        split_X = np.array_split(X, n_splits)
        split_y = np.array_split(y, n_splits)
    # fix: the return statement was missing, so callers got None
    return split_X, split_y
def compute_meta_features(X, y, features, summary):
    """Compute meta-features for a given dataset split.

    Builds a fresh MFE extractor, fits it on (X, y), and returns the
    (names, values) pair produced by extract().
    """
    extractor = MFE(features=features, groups=["complexity"], summary=summary)
    # fix: extract() was being called on an unfitted extractor —
    # pymfe's MFE requires fit(X, y) before extract().
    extractor.fit(X, y)
    return extractor.extract()
def average_results(results):
    """Average the results from multiple splits.

    Parameters
    ----------
    results : list of (names, values) pairs
        One pair per split, as returned by MFE.extract().

    Returns
    -------
    (list, ndarray)
        The feature names (taken from the first split) and the
        element-wise mean of the value vectors.
    """
    # fix: `features` was undefined (NameError) — take the names from
    # the first split; pymfe emits them in the same order every time.
    features = results[0][0]
    # nanmean averages only the splits where a measure succeeded,
    # instead of letting one NaN poison the whole aggregate.
    summary_values = np.nanmean(
        np.asarray([result[1] for result in results], dtype=float), axis=0
    )
    return features, summary_values
split_X, split_y = split_dataset(X, y, n_splits)
results = [compute_meta_features(X_part, y_part, features=features_to_compute,
summary=summaries) for X_part, y_part in zip(split_X, split_y)]
# here stack trace issued several warnings, e.g.
0/dist-packages/pymfe/_internal.py:731: RuntimeWarning: Can't summarize feature 'f2' with summary 'mean'. Will set it as 'np.nan'.
/usr/local/lib/python3.10/dist-packages/pymfe/_internal.py:731: RuntimeWarning: Can't summarize feature 'f2' with summary 'max'. Will set it as 'np.nan'.
/usr/local/lib/python3.10/dist-packages/pymfe/_internal.py:731: RuntimeWarning: Can't summarize feature 'f2' with summary 'min'. Will set it as 'np.nan'.
/usr/local/lib/python3.10/dist-packages/pymfe/_internal.py:731: RuntimeWarning: Can't summarize feature 'f2' with summary 'sd'. Will set it as 'np.nan'.
/usr/local/lib/python3.10/dist-packages/pymfe/_internal.py:731: RuntimeWarning: Can't summarize feature 'f3' with summary 'mean'. Will set it as 'np.nan'.
<code># helper functions
def split_dataset(X, y, n_splits):
    """Partition X and y into n_splits consecutive chunks of near-equal size."""
    # np.array_split tolerates sizes that do not divide evenly.
    parts_X = np.array_split(X, n_splits)
    parts_y = np.array_split(y, n_splits)
    return parts_X, parts_y
def compute_meta_features(X, y, features, summary):
    """Fit a pymfe extractor on one split and return its (names, values) pair."""
    mfe = MFE(features=features, groups=["complexity"], summary=summary)
    mfe.fit(X, y)
    return mfe.extract()
def average_results(results):
    """Combine per-split results by averaging the summary values element-wise."""
    names = results[0][0]
    # Stack the per-split value vectors and take the column-wise mean.
    # NOTE(review): np.mean propagates NaN from any failed split — TODO
    # consider np.nanmean if partial failures should be tolerated.
    value_matrix = [res[1] for res in results]
    return names, np.mean(value_matrix, axis=0)
n_splits = 10
split_X, split_y = split_dataset(X, y, n_splits)
results = [compute_meta_features(X_part, y_part, features=features_to_compute,
summary=summaries) for X_part, y_part in zip(split_X, split_y)]
# here stack trace issued several warnings, e.g.
0/dist-packages/pymfe/_internal.py:731: RuntimeWarning: Can't summarize feature 'f2' with summary 'mean'. Will set it as 'np.nan'.
warnings.warn(
/usr/local/lib/python3.10/dist-packages/pymfe/_internal.py:731: RuntimeWarning: Can't summarize feature 'f2' with summary 'max'. Will set it as 'np.nan'.
warnings.warn(
/usr/local/lib/python3.10/dist-packages/pymfe/_internal.py:731: RuntimeWarning: Can't summarize feature 'f2' with summary 'min'. Will set it as 'np.nan'.
warnings.warn(
/usr/local/lib/python3.10/dist-packages/pymfe/_internal.py:731: RuntimeWarning: Can't summarize feature 'f2' with summary 'sd'. Will set it as 'np.nan'.
warnings.warn(
/usr/local/lib/python3.10/dist-packages/pymfe/_internal.py:731: RuntimeWarning: Can't summarize feature 'f3' with summary 'mean'. Will set it as 'np.nan'.
</code>
# helper functions
def split_dataset(X, y, n_splits):
# Split the data into n_splits smaller datasets.
# NOTE(review): np.array_split takes contiguous chunks; on a dataset
# sorted by class (iris is), each chunk may hold a single class, which
# is likely why the complexity measures come back NaN — TODO confirm
# by shuffling or interleaving before splitting.
split_X = np.array_split(X, n_splits)
split_y = np.array_split(y, n_splits)
return split_X, split_y
def compute_meta_features(X, y, features, summary):
# Compute meta-features for a given dataset split.
# Builds a fresh MFE extractor per split, fits it on (X, y), and
# returns extract()'s (names, values) pair.
extractor = MFE(features=features, groups=["complexity"], summary=summary)
extractor.fit(X,y)
return extractor.extract()
def average_results(results):
# Average the results from multiple splits.
features = results[0][0]
# NOTE(review): np.mean propagates NaN, so a single split that failed a
# measure turns the whole average NaN; np.nanmean would average only
# the successful splits — TODO confirm which behavior is wanted.
summary_values = np.mean([result[1] for result in results], axis=0)
return features, summary_values
n_splits = 10
split_X, split_y = split_dataset(X, y, n_splits)
results = [compute_meta_features(X_part, y_part, features=features_to_compute,
summary=summaries) for X_part, y_part in zip(split_X, split_y)]
# here stack trace issued several warnings, e.g.
0/dist-packages/pymfe/_internal.py:731: RuntimeWarning: Can't summarize feature 'f2' with summary 'mean'. Will set it as 'np.nan'.
warnings.warn(
/usr/local/lib/python3.10/dist-packages/pymfe/_internal.py:731: RuntimeWarning: Can't summarize feature 'f2' with summary 'max'. Will set it as 'np.nan'.
warnings.warn(
/usr/local/lib/python3.10/dist-packages/pymfe/_internal.py:731: RuntimeWarning: Can't summarize feature 'f2' with summary 'min'. Will set it as 'np.nan'.
warnings.warn(
/usr/local/lib/python3.10/dist-packages/pymfe/_internal.py:731: RuntimeWarning: Can't summarize feature 'f2' with summary 'sd'. Will set it as 'np.nan'.
warnings.warn(
/usr/local/lib/python3.10/dist-packages/pymfe/_internal.py:731: RuntimeWarning: Can't summarize feature 'f3' with summary 'mean'. Will set it as 'np.nan'.
Results from scenario 2:
<code>final_features, final_summary = average_results(results)
for i in range(len(final_features)):
var_sp = final_features[i].split('.')
print(f"{final_features[i]} = {final_summary[i]}n")
f1.max = 0.9378374974227318
f1.mean = 0.8736795575459622
f1.min = 0.8198589466408711
f1.sd = 0.058203211724503635
<code>final_features, final_summary = average_results(results)
for i in range(len(final_features)):
var_sp = final_features[i].split('.')
g_name = var_sp[0]
print(f"{final_features[i]} = {final_summary[i]}n")
f1.max = 0.9378374974227318
f1.mean = 0.8736795575459622
f1.min = 0.8198589466408711
f1.sd = 0.058203211724503635
f2.max = nan
f2.mean = nan
f2.min = nan
f2.sd = nan
f3.max = nan
f3.mean = nan
f3.min = nan
f3.sd = nan
t1 = nan
</code>
final_features, final_summary = average_results(results)
for i in range(len(final_features)):
var_sp = final_features[i].split('.')
g_name = var_sp[0]
print(f"{final_features[i]} = {final_summary[i]}n")
f1.max = 0.9378374974227318
f1.mean = 0.8736795575459622
f1.min = 0.8198589466408711
f1.sd = 0.058203211724503635
f2.max = nan
f2.mean = nan
f2.min = nan
f2.sd = nan
f3.max = nan
f3.mean = nan
f3.min = nan
f3.sd = nan
t1 = nan
f1, f2, f3, ...
are all nan
. A similar result is obtained using the OpenML volcanoesa1 dataset.
I cannot understand what is causing this or where the problem arises. How can it be fixed?