I am trying to solve the following problem. I have a sample of stores selected by geographies, and I’d like to select 20% of the sample in such a way that the fraction of sales per geography for the selection mimics that of the overall sales distribution by geography. I tried several iterations of the code below, but for some reason the ‘selection’ never changes inside the objective function, so test_fraction is always the same and there’s no real optimization. I have no idea what I am doing wrong. Could someone please help?
Here’s my code with sample data:
import pandas as pd
import numpy as np
from scipy.optimize import minimize
from scipy.special import expit
# Sample data
np.random.seed(0)
data = pd.DataFrame({
'Store': range(1, 101),
'Geography': np.random.choice(['North', 'South', 'East', 'West'], 100),
'TotalSales': np.random.rand(100) * 1000
})
# Define the number of stores to select
num_stores = len(data)
target_percentage = 0.2
target_num_stores = int(target_percentage * num_stores)
# Calculate overall sales fraction per geography
overall_sales = data.groupby('Geography')['TotalSales'].sum()
overall_fraction = overall_sales / overall_sales.sum()
# Objective function to minimize squared error and approach target percentage
def objective(selection):
selection = expit(selection).round() # Ensure binary selection using sigmoid function
selected_stores = data.iloc[np.where(selection == 1)]
if len(selected_stores) == 0:
return np.inf # Prevent division by zero
test_sales = selected_stores.groupby('Geography')['TotalSales'].sum()
test_fraction = test_sales / test_sales.sum()
print(test_fraction)
print()
# Calculate the squared error
error = ((overall_fraction - test_fraction) ** 2).sum()
# Add penalty for deviation from the target number of stores
penalty = ((np.sum(selection) - target_num_stores) / target_num_stores) ** 2
return error +penalty
# Constraint to select approximately 20% of the stores
def constraint(selection):
return np.sum(expit(selection).round()) - target_num_stores
# Set up bounds for the decision variables
bounds = [(0, 1) for _ in range(num_stores)]
# Create an initial feasible solution by randomly selecting approximately 20% of the stores
initial_selection = np.random.uniform(low=0, high=1, size=num_stores)
# Set up constraints
constraints = {'type': 'eq', 'fun': constraint}
# Solve the optimization problem
result = minimize(objective, initial_selection, bounds=bounds, method='SLSQP', options={'disp': True})
# Solve the optimization problem
result = minimize(objective, x0=initial_selection)#, bounds=bounds, constraints=constraints, method='SLSQP', options={'disp': True})