I am trying to test the fit of several probability distributions on my data and perform a Maximum Likelihood estimation and KS test on the fit of each probability distribution to my data. My code works for continuous probability distributions but not for discrete (because they do not have a .fit
object. To fix the discrete distributions I am attempting to use a chi-square test but I cannot get it to work.
import pandas as pd
import numpy as np
import scipy.stats as st
from scipy.stats import kstest
data = pd.read_csv(r'...demo_data.csv')
data = pd.DataFrame(data)
#continuous variables
continuous_results = []
for cvar in ["cvar1", "cva2", "cvar3"]:
#probability distributions
cdistributions = [
st.arcsine,
st.alpha,
st.beta,
st.betaprime,
st.bradford,
st.burr,
st.chi,
st.chi2,
st.cosine,
st.dgamma,
st.dweibull,
st.expon,
st.exponweib,
st.exponpow,
st.genlogistic,
st.genpareto,
st.genexpon,
st.gengamma,
st.genhyperbolic,
st.geninvgauss,
st.gennorm,
st.f,
st.gamma,
st.invgamma,
st.invgauss,
st.invweibull,
st.laplace,
st.logistic,
st.loggamma,
st.loglaplace,
st.loguniform,
st.nakagami,
st.norm,
st.pareto,
st.powernorm,
st.powerlaw,
st.rdist,
st.semicircular,
st.t,
st.trapezoid,
st.triang,
st.tukeylambda,
st.uniform,
st.wald,
st.weibull_max,
st.weibull_min
]
for distribution in cdistributions:
try:
#fit each probability distribution to each variable in the data
pars = distribution.fit(data[cvar])
mle = distribution.nnlf(pars, data[cvar])
#perform ks test
ks_result = kstest(data[cvar], distribution.cdf, args = pars)
#create dictionary to store results for each variable/distribution
result = {
"variable": cvar,
"distribution": distribution.name,
"type": "continuous",
"mle": mle,
"ks_stat": ks_result.statistic,
"ks_pvalue": ks_result.pvalue
}
continuous_results.append(result)
except Exception as e:
# assign 0 for error
mle = 0
this code isn’t running a chi-square test or ks test and I am unsure how to fix it:
#discrete variables
discrete_results = []
for var in ["dvar1", "dvar2"]:
#probability distributions
distributions = [
st.bernoulli,
st.betabinom,
st.binom,
st.boltzmann,
st.poisson,
st.geom,
st.nbinom,
st.hypergeom,
st.zipf,
st.zipfian,
st.logser,
st.randint,
st.dlaplace
]
for distribution in distributions:
try:
#fit each probability distribution to each variable in the data
distfit = getattr(st, distribution.name)
chisq_stat, pval = st.chisquare(data[var], f = distfit.pmf(range(len(data[var]))))
#perform ks test
ks_result = kstest(data[var], distribution.cdf, args = distfit.pmf(range(len(data[var]))))
#create dictionary to store results for each variable/distribution
result = {
"variable": var,
"distribution": distribution.name,
"type": "continuous",
"chisq": chisq_stat,
"pvalue": pval,
"ks_stat": ks_result.statistic,
"ks_pvalue": ks_result.pvalue
}
discrete_results.append(result)
except Exception as e:
# assign 0 for error
chisq_stat = 0