skeleton for ensemble model mimicking basic scipy.stats functions implemented, minimal skeleton for error handling implemented, basic testing for ensemble distributions, official MVP
mbi6245 committed Aug 14, 2024
1 parent 3168b18 commit 6613985
Showing 4 changed files with 139 additions and 80 deletions.
42 changes: 5 additions & 37 deletions simulations.ipynb
@@ -4,19 +4,7 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([-0.13232203, 0.43740831, -1.17082002, -0.23752026, -0.29514267,\n",
" -0.12570246, -0.0199769 , 0.584793 , -1.23747854, -0.87275655])"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"import numpy as np\n",
"import scipy\n",
@@ -60,7 +48,7 @@
" unif_samp = scipy.stats.uniform.rvs(size=size)\n",
" return ppf_vec(unif_samp)\n",
"\n",
"ensemble_rvs(10)"
"temp = ensemble_rvs(1000)"
]
},
{
@@ -72,36 +60,16 @@
"name": "stdout",
"output_type": "stream",
"text": [
"[0.77402318 0.2793297 ]\n"
"(('normal', 0.6655539267825432), ('gumbel', 0.3337741806768127))\n"
]
}
],
"source": [
"# std_norm = distribution_dict[\"normal\"](0, 1).rvs(100)\n",
"temp = ensemble_rvs(10)\n",
"# temp = ensemble_rvs(1000)\n",
"model = EnsembleFitter([\"normal\", \"gumbel\"], None)\n",
"res = model.fit(temp)\n",
"print(res.weights)\n",
"\n",
"\n",
"\n",
"# mean = 1\n",
"# var = 2\n",
"# x = np.linspace(0, 5, 100)\n",
"# y = np.linspace(-4, 5, 100)\n",
"# std_norm = distribution_dict[\"normal\"](0, 1)\n",
"# q = std_norm.ppf(0.05)\n",
"# p = std_norm.cdf(q)\n",
"# print(q, p)\n",
"\n",
"# a = 0.3\n",
"# b = 0.7\n",
"# exp = distribution_dict[\"exponential\"](mean, var)\n",
"# fisk = distribution_dict[\"fisk\"](mean, var)\n",
"# # print(a * exp.(x) + b * norm.pdf(x))\n",
"\n",
"# temp = EnsembleFitter([\"exponential\", \"fisk\"], None)\n",
"# temp.fit(x)\n"
"print(res.weights)"
]
},
{
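The notebook cell above samples 1000 draws from the 70/30 normal–Gumbel ensemble by numerically inverting its CDF with brentq, then refits the weights. Below is a minimal sketch of the same round trip, assuming the module layout introduced in this commit and a "gumbel" key in distribution_dict; it samples the mixture by picking a component per draw instead of inverting the CDF.

import numpy as np

from ensemble.distributions import distribution_dict
from ensemble.ensemble_model import EnsembleFitter

rng = np.random.default_rng(0)

# Draw from a known 70/30 mixture of mean-0, variance-1 components,
# then refit; the recovered weights should land near [0.7, 0.3].
n = 1000
normal = distribution_dict["normal"](0, 1)
gumbel = distribution_dict["gumbel"](0, 1)
picks = rng.random(n) < 0.7
draws = np.where(picks, normal.rvs(n), gumbel.rvs(n))

res = EnsembleFitter(["normal", "gumbel"], None).fit(draws)
print(res.weights)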
26 changes: 13 additions & 13 deletions src/ensemble/distributions.py
@@ -2,8 +2,8 @@

import numpy as np
import numpy.typing as npt
import scipy.optimize
import scipy.stats
import scipy.optimize as opt
import scipy.stats as stats
from scipy.special import gamma as gamma_func

# from scipy.special import gammainccinv, gammaincinv
@@ -48,7 +48,7 @@ class Exponential(Distribution):
def _create_scipy_dist(self) -> None:
positive_support(self.mean)
lambda_ = 1 / self.mean
self._scipy_dist = scipy.stats.expon(scale=1 / lambda_)
self._scipy_dist = stats.expon(scale=1 / lambda_)


# analytic sol
@@ -59,7 +59,7 @@ def _create_scipy_dist(self) -> None:
strict_positive_support(self.mean)
alpha = self.mean**2 / self.variance
beta = self.mean / self.variance
self._scipy_dist = scipy.stats.gamma(a=alpha, scale=1 / beta)
self._scipy_dist = stats.gamma(a=alpha, scale=1 / beta)


# analytic sol
@@ -70,7 +70,7 @@ def _create_scipy_dist(self) -> None:
strict_positive_support(self.mean)
alpha = self.mean**2 / self.variance + 2
beta = self.mean * (self.mean**2 / self.variance + 1)
self._scipy_dist = scipy.stats.invgamma(a=alpha, scale=beta)
self._scipy_dist = stats.invgamma(a=alpha, scale=beta)


# numerical sol
@@ -79,7 +79,7 @@ class Fisk(Distribution):

def _create_scipy_dist(self):
positive_support(self.mean)
optim_params = scipy.optimize.minimize(
optim_params = opt.minimize(
fun=self._shape_scale,
# start beta at 1.1 and solve for alpha
x0=[self.mean * 1.1 * np.sin(np.pi / 1.1) / np.pi, 1.1],
@@ -89,7 +89,7 @@ def _create_scipy_dist(self):
alpha, beta = np.abs(optim_params.x)
# parameterization notes: scipy's c is wikipedia's beta, scipy's scale is wikipedia's alpha
# additional note: analytical solution doesn't work b/c dependent on derivative
self._scipy_dist = scipy.stats.fisk(c=beta, scale=alpha)
self._scipy_dist = stats.fisk(c=beta, scale=alpha)

def _shape_scale(self, x: list, samp_mean: float, samp_var: float) -> None:
alpha = x[0]
@@ -109,7 +109,7 @@ class GumbelR(Distribution):
def _create_scipy_dist(self) -> None:
loc = self.mean - np.sqrt(self.variance * 6) * np.euler_gamma / np.pi
scale = np.sqrt(self.variance * 6) / np.pi
self._scipy_dist = scipy.stats.gumbel_r(loc=loc, scale=scale)
self._scipy_dist = stats.gumbel_r(loc=loc, scale=scale)


# hopelessly broken
@@ -118,7 +118,7 @@ class Weibull(Distribution):

def _create_scipy_dist(self) -> None:
positive_support(self.mean)
optim_params = scipy.optimize.minimize(
optim_params = opt.minimize(
fun=self._shape_scale,
# ideally can invert gamma function for k, then use mean / sd as a guess for lambda
x0=[self.mean / gamma_func(1 + 1 / 1.5), 1.5],
@@ -127,7 +127,7 @@ def _create_scipy_dist(self) -> None:
)
lambda_, k = np.abs(optim_params.x)
print("params from optim: ", lambda_, k)
self._scipy_dist = scipy.stats.weibull_min(c=k, scale=lambda_)
self._scipy_dist = stats.weibull_min(c=k, scale=lambda_)

def _shape_scale(self, x: list, samp_mean: float, samp_var: float) -> float:
# TODO: TAKE A LOOK AT JAX SINCE IT DOES AUTOMATIC DERIVATIVES
@@ -151,15 +151,15 @@ def _create_scipy_dist(self) -> None:
# scipy multiplies in the argument passed to `scale` so in the exponentiated space,
# you're essentially adding `mu` within the exponentiated expression within the
# lognormal's PDF; hence, scale is with exponentiation instead of loc
self._scipy_dist = scipy.stats.lognorm(scale=np.exp(mu), s=sigma)
self._scipy_dist = stats.lognorm(scale=np.exp(mu), s=sigma)


# analytic sol
class Normal(Distribution):
support = "real line"

def _create_scipy_dist(self) -> None:
self._scipy_dist = scipy.stats.norm(
self._scipy_dist = stats.norm(
loc=self.mean, scale=np.sqrt(self.variance)
)

@@ -179,7 +179,7 @@ def _create_scipy_dist(self) -> None:
/ self.variance
)
print(alpha, beta)
self._scipy_dist = scipy.stats.beta(a=alpha, b=beta)
self._scipy_dist = stats.beta(a=alpha, b=beta)


# exp, gamma, invgamma, llogis, gumbel, weibull, lognormal, normal, mgamma, mgumbel, beta
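Each class in this file moment-matches its family: it converts a desired mean and variance into scipy's native parameters. A quick self-contained spot check of the Gamma case (shape alpha = mean^2 / variance, rate beta = mean / variance, so scipy's scale is 1 / beta):

import numpy as np
import scipy.stats as stats

mean, variance = 3.0, 2.0
alpha = mean**2 / variance   # shape from moment matching
beta = mean / variance       # rate; scipy takes scale = 1 / rate
dist = stats.gamma(a=alpha, scale=1 / beta)

m, v = dist.stats(moments="mv")
assert np.isclose(m, mean) and np.isclose(v, variance)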
88 changes: 58 additions & 30 deletions src/ensemble/ensemble_model.py
@@ -1,25 +1,54 @@
from typing import List
from typing import List, Tuple

import numpy as np
import numpy.typing as npt
import scipy.optimize
import scipy.stats
import scipy.linalg as linalg
import scipy.optimize as opt
import scipy.stats as stats

from ensemble.distributions import distribution_dict


class EnsembleResult:
weights: npt.NDArray
ensemble_density: npt.NDArray

def __init__(self, weights, ensemble_density) -> None:
class EnsembleModel:
def __init__(self, distributions, weights, mean, variance):
self.distributions = distributions
self.weights = weights
self.ensemble_density = ensemble_density
self.mean = mean
self.variance = variance

def pdf(self, x):
raise NotImplementedError

def cdf(self, q):
raise NotImplementedError

# class EnsembleModel:
# def __init__(self, distributions, weights):
# self.distributions =
def ppf(self, p):
raise NotImplementedError

def rvs(self, size=1):
raise NotImplementedError

def stats_temp(self, moments="mv"):
res_list = []
if "m" in moments:
res_list.append(self.mean)
if "v" in moments:
res_list.append(self.variance)

res_list = [res[()] for res in res_list]
if len(res_list) == 1:
return res_list[0]
else:
return tuple(res_list)


class EnsembleResult:
weights: Tuple[str, float]
ensemble_model: EnsembleModel

def __init__(self, weights, ensemble_model) -> None:
self.weights = weights
self.ensemble_model = ensemble_model


class EnsembleFitter:
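The pdf, cdf, ppf, and rvs methods above are deliberate stubs. One plausible way to fill in the first two, assuming self.distributions holds fitted objects exposing scipy-like methods, is the weighted sum that ensemble_cdf in tests/test_ensemble_model.py already uses; these are drop-in bodies for the stubs, not part of the commit:

def pdf(self, x):
    # Mixture density: weighted sum of component densities.
    return sum(
        w * dist.pdf(x)
        for dist, w in zip(self.distributions, self.weights)
    )

def cdf(self, q):
    # Mixture CDF: weighted sum of component CDFs.
    return sum(
        w * dist.cdf(q)
        for dist, w in zip(self.distributions, self.weights)
    )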
@@ -34,34 +63,28 @@ def __init__(self, distributions: List[str], objective):
+ str(self.supports)
)
self.distributions = distributions
self.objective = objective

def objective(self, vec):
return scipy.linalg.norm(vec, 2)

def ensemble_obj(self, weights):
# return data - F @ weights
pass

def get_ensemble_density(
self, pdfs: np.ndarray, fitted_weights: np.ndarray
):
return pdfs @ fitted_weights
def objective_func(self, vec, objective):
if objective is not None:
raise NotImplementedError
return linalg.norm(vec, 2)

def ensemble_func(self, weights: list, ecdf: np.ndarray, cdfs: np.ndarray):
return self.objective(ecdf - cdfs @ weights)
return self.objective_func(ecdf - cdfs @ weights, self.objective)

def fit(self, data: npt.ArrayLike) -> EnsembleResult:
# TODO: SWITCH CASE STATEMENT FOR BOUNDS OF DATA NOT MATCHING THE ELEMENT OF SELF.SUPPORTS
# sample stats, ecdf
sample_mean = np.mean(data)
sample_variance = np.var(data, ddof=1)
ecdf = scipy.stats.ecdf(data).cdf.probabilities
ecdf = stats.ecdf(data).cdf.probabilities

# may be able to condense into 1 line if ub and lb are not used elsewhere
# support_lb = np.min(data)
# support_ub = np.max(data)
# support = np.linspace(support_lb, support_ub, len(data))
equantiles = scipy.stats.ecdf(data).cdf.quantiles
equantiles = stats.ecdf(data).cdf.quantiles

# fill matrix with cdf values over support of data
num_distributions = len(self.distributions)
@@ -77,19 +100,24 @@ def fit(self, data: npt.ArrayLike) -> EnsembleResult:
# initialize equal weights for all dists and optimize
initial_guess = np.zeros(num_distributions) + 1 / num_distributions
bounds = tuple((0, 1) for i in range(num_distributions))
minimizer_result = scipy.optimize.minimize(
minimizer_result = opt.minimize(
fun=self.ensemble_func,
x0=initial_guess,
args=(ecdf, cdfs),
bounds=bounds,
)
fitted_weights = minimizer_result.x

# return minimizer_result.x

res = EnsembleResult(
# weights=tuple(
# (distribution, weight)
# for distribution, weight in zip(
# self.distributions, fitted_weights
# )
# ),
# distributions=self.distributions,
weights=fitted_weights,
ensemble_density=self.get_ensemble_density(cdfs, fitted_weights),
ensemble_model=EnsembleModel(None, None, None, None),
)

return res
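fit() stacks each candidate's CDF, evaluated at the empirical quantiles, into a matrix and minimizes the L2 distance to the ECDF over box-bounded weights. A self-contained sketch of that optimization using plain scipy frozen distributions; note that, as written in this commit, each weight is bounded to [0, 1] but the weights are not constrained to sum to 1:

import numpy as np
import scipy.optimize as opt
import scipy.stats as stats

def fit_weights(data, frozen_dists):
    # Empirical CDF values and the quantiles they are attached to.
    ecdf_obj = stats.ecdf(data).cdf
    ecdf, quantiles = ecdf_obj.probabilities, ecdf_obj.quantiles
    # One column of CDF values per candidate distribution.
    cdfs = np.column_stack([d.cdf(quantiles) for d in frozen_dists])
    k = len(frozen_dists)
    result = opt.minimize(
        fun=lambda w: np.linalg.norm(ecdf - cdfs @ w, 2),
        x0=np.full(k, 1 / k),      # start from equal weights
        bounds=[(0, 1)] * k,
    )
    return result.x

data = stats.norm.rvs(size=200, random_state=1)
print(fit_weights(data, [stats.norm(0, 1), stats.gumbel_r()]))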
63 changes: 63 additions & 0 deletions tests/test_ensemble_model.py
@@ -0,0 +1,63 @@
import numpy as np
import scipy.optimize as opt
import scipy.stats as stats

from ensemble.distributions import distribution_dict
from ensemble.ensemble_model import EnsembleFitter


def ensemble_cdf(x, distributions, weights, mean, variance):
my_objs = []
for distribution in distributions:
my_objs.append(distribution_dict[distribution](mean, variance))
return sum(
weight * distribution.cdf(x)
for distribution, weight in zip(my_objs, weights)
)


def ppf_to_solve(x, q):
return ensemble_cdf(x, ["normal", "gumbel"], [0.7, 0.3], 0, 1) - q


def ppf_single(q):
factor = 10.0
left = -factor
right = factor

while ppf_to_solve(left, q) > 0:
left, right = left * factor, left

while ppf_to_solve(right, q) < 0:
left, right = right, right * factor

return opt.brentq(ppf_to_solve, left, right, args=q)


def ensemble_rvs(size):
ppf_vec = np.vectorize(ppf_single, otypes="d")
unif_samp = stats.uniform.rvs(size=size)
return ppf_vec(unif_samp)


STD_NORMAL_DRAWS = distribution_dict["normal"](0, 1).rvs(100)
# TODO: REMOVE HARDCODED RVS FUNCTION IN TEST FILE AND TEST MORE VARIED DISTRIBUTIONS ONCE THIS IS IMPLEMENTED IN ENSEMBLE_MODEL.PY
ENSEMBLE_RAND_DRAWS = ensemble_rvs(100)


def test_1_dist():
model = EnsembleFitter(["normal"], None)
res = model.fit(STD_NORMAL_DRAWS)
print(res.weights)
assert np.isclose(res.weights[0], 1)
wrong_model = EnsembleFitter(["normal", "gumbel"], None)
res = wrong_model.fit(STD_NORMAL_DRAWS)
print(res.weights)
assert np.allclose(res.weights, [1, 0])


def test_2_dists():
model = EnsembleFitter(["normal", "gumbel"], None)
res = model.fit(ENSEMBLE_RAND_DRAWS)
print(res.weights)
assert np.allclose(res.weights, [0.7, 0.3])
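The hardcoded ensemble_rvs above inverts the mixture CDF with brentq once per uniform draw, which is robust but slow. An equivalent sampler picks a component per draw with probability equal to its weight and samples that component directly; this is a sketch, not part of the commit, and the Gumbel moment matching mirrors GumbelR in src/ensemble/distributions.py (loc = mean - scale * euler_gamma, scale = sqrt(6 * variance) / pi):

import numpy as np
import scipy.stats as stats

def ensemble_rvs_direct(size, weights=(0.7, 0.3), seed=None):
    # Same mean-0, variance-1 normal/Gumbel pair the tests use.
    rng = np.random.default_rng(seed)
    scale = np.sqrt(6) / np.pi
    components = [
        stats.norm(loc=0, scale=1),
        stats.gumbel_r(loc=-scale * np.euler_gamma, scale=scale),
    ]
    picks = rng.choice(len(components), size=size, p=list(weights))
    counts = np.bincount(picks, minlength=len(components))
    draws = np.concatenate(
        [d.rvs(size=c, random_state=rng) for d, c in zip(components, counts)]
    )
    rng.shuffle(draws)  # avoid grouping draws by component
    return draws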
