I have a very large DF (~200GB) of features that I want to perform cross validation on a random forest model with these features.
The features are from a huggingface model in the form of a .arrow file.
I am trying to figure out how I can minimize the amount of RAM needed to process this code– even with 600GB of RAM, my code crashes.
I believe the issue is I am transforming from a huggingface dataset (https://huggingface.co/docs/datasets/en/loading) to a pandas df to a numpy array (see the first 5 lines of code). I tried to figure out a way around this, but unfortunately, I haven’t been able to figure it out.
If there are other ways to make my code more RAM efficient, please let me know.
# Load data
file_path = 'data.arrow' #200 GB in size
df = Dataset.from_file(file_path)
df = df.data
df = df.to_pandas()
X = np.array([np.array(x) for x in df['embeddings'].values])
y = df['label'].values
groups = df['Chromosome'].values
group_kfold = GroupKFold(n_splits=10)
# Initialize figure for plotting
fig, axes = plt.subplots(1, 2, figsize=(15, 6))
all_fpr = []
all_tpr = []
all_accuracy = []
all_pr_auc = []
best_score = 0
best_model = None
for i, (train_idx, val_idx) in enumerate(group_kfold.split(X, y, groups)):
X_train_fold, X_val_fold = X[train_idx], X[val_idx]
y_train_fold, y_val_fold = y[train_idx], y[val_idx]
# Initialize classifier
rf_classifier = RandomForestClassifier(n_estimators=30, random_state=42, n_jobs=-1)
# Train the classifier on this fold
rf_classifier.fit(X_train_fold, y_train_fold)
# Make predictions on the validation set
y_pred_proba = rf_classifier.predict_proba(X_val_fold)[:, 1]
# Calculate ROC curve
fpr, tpr, _ = roc_curve(y_val_fold, y_pred_proba)
all_fpr.append(fpr)
all_tpr.append(tpr)
roc_auc = auc(fpr, tpr) #
# Keep track of the best model based on ROC AUC
if roc_auc > best_score:
best_score = roc_auc
best_model = rf_classifier
# Plot ROC curve for this fold
axes[0].plot(fpr, tpr, lw=1, alpha=0.7, label=f'ROC Fold {i+1} (AUC = {roc_auc:.2f})')
# Calculate precision-recall curve
precision, recall, _ = precision_recall_curve(y_val_fold, y_pred_proba)
# Calculate PR AUC
pr_auc = auc(recall, precision)
all_pr_auc.append(pr_auc)
# Plot PR curve for this fold
axes[1].plot(recall, precision, lw=1, alpha=0.7, label=f'PR Curve Fold {i+1} (AUC = {pr_auc:.2f})')
# Calculate accuracy
accuracy = accuracy_score(y_val_fold, rf_classifier.predict(X_val_fold))
all_accuracy.append(accuracy)
# Save the best model
joblib.dump(best_model, 'model.pkl')