Below I demonstrate a workflow in R for fitting the same model to many datasets: nest the data by test_id, fit the same model to each nested dataset, and extract a statistic from each fit. My goal is to create the equivalent workflow in Python using polars, but I will use pandas if that is necessary.
Demonstration in R
library(tidyverse)
SIMS <- 3
TRIALS <- 1e3
PROB_A <- .65
PROB_B <- .67
df <- bind_rows(
  tibble(
    recipe = "A",
    trials = TRIALS,
    events = rbinom(n=SIMS, size=trials, prob=PROB_A),
    rate = events/trials) |>
    mutate(test_id = 1:n()),
  tibble(
    recipe = "B",
    trials = TRIALS,
    events = rbinom(n=SIMS, size=trials, prob=PROB_B),
    rate = events/trials) |>
    mutate(test_id = 1:n())
)
df
df_nest <- df |>
  group_by(test_id) |>
  nest()
df_nest
Define two functions to map over my nested data:
glm_foo <- function(.data){
  glm(formula = rate ~ recipe,
      data = .data,
      weights = trials,
      family = binomial)
}
glm_foo(df_nest$data[[1]])
fit_and_extract <- function(.data){
  m <- glm(formula = rate ~ recipe,
           data = .data,
           weights = trials,
           family = binomial)
  m$coefficients['recipeB']
}
fit_and_extract(df_nest$data[[1]])
df_nest |>
  mutate(
    model = map(.x = data, .f = glm_foo),
    trt_b = map_dbl(.x = data, .f = fit_and_extract)
  )
Python Section
I can create the same nested data structure in polars, but I am unsure how to fit the model to each nested dataset within the list column called data.
import polars as pl
from polars import col
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
SIMS = 3
TRIALS = int(1e3)
PROB_A = .65
PROB_B = .67
df_a = pl.DataFrame({
    'recipe': "A",
    'trials': TRIALS,
    'events': np.random.binomial(n=TRIALS, p=PROB_A, size=SIMS),
    'test_id': np.arange(SIMS)
})
df_b = pl.DataFrame({
    'recipe': "B",
    'trials': TRIALS,
    'events': np.random.binomial(n=TRIALS, p=PROB_B, size=SIMS),
    'test_id': np.arange(SIMS)
})
df = (pl.concat([df_a, df_b], rechunk=True)
        .with_columns(
            fails = col('trials') - col('events')
        ))
df
df_agg = df.group_by('test_id').agg(data = pl.struct('events', 'fails', 'recipe'))
df_agg.sort('test_id')
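As a sanity check, the Python analogue of calling glm_foo(df_nest$data[[1]]) on a single nested dataset does seem to work if I filter the flat frame to one test_id and hand it to statsmodels via pandas. This is only a minimal sketch, using the statsmodels imports above and assuming patsy names the treatment contrast 'recipe[T.B]':
# Fit the binomial GLM to a single test_id, mirroring glm_foo(df_nest$data[[1]]).
# The 'events + fails ~ recipe' formula gives statsmodels a (successes, failures) response.
single = df.filter(col('test_id') == 0).to_pandas()
fit_one = smf.glm('events + fails ~ recipe',
                  data=single,
                  family=sm.families.Binomial()).fit()
fit_one.params['recipe[T.B]']  # assumed patsy name for the B-vs-A coefficient
So the per-group fit itself is not the problem; the question is how to apply it across the nested data column.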
At this point my mental model starts to crumble: polars has so many mapping options and I'm not really sure how to troubleshoot at this stage.
df_agg.with_columns(
    pl.struct(["data"])
      .map_batches(
          lambda x: smf.glm('events + fails ~ recipe',
                            family=sm.families.Binomial(),
                            data=x.struct.field('data').to_pandas()).fit()
      )
      .alias("model")
)
ComputeError: PatsyError: Error evaluating factor: TypeError: cannot use __getitem__ on Series of dtype List(Struct({'events': Int64, 'fails': Int64, 'recipe': String})) with argument 'recipe' of type 'str'
    events + fails ~ recipe
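For completeness, the closest I can get is a plain Python loop over partition_by('test_id'), converting each group to pandas and fitting with statsmodels. A rough sketch of that workaround is below (again assuming patsy's 'recipe[T.B]' coefficient name), but it sidesteps the list column entirely, which is what I would like to avoid:
# Workaround sketch: loop over per-test_id partitions of the flat frame,
# fit the binomial GLM on each, and collect the recipe B coefficient.
rows = []
for group in df.partition_by('test_id', maintain_order=True):
    fit = smf.glm('events + fails ~ recipe',
                  data=group.to_pandas(),
                  family=sm.families.Binomial()).fit()
    rows.append({
        'test_id': group['test_id'][0],
        'trt_b': fit.params['recipe[T.B]'],  # assumed patsy name for the B-vs-A term
    })

pl.DataFrame(rows)
Is there an idiomatic way to do this fit-and-extract step directly on the nested data column in polars, as map()/map_dbl() do over the list column in R?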