skeleton for ensemble model mimicking basic scipy.stats functions implemented, minimal skeleton for error handling implemented, basic testing for ensemble distributions, official MVP
mbi6245 committed Aug 14, 2024
1 parent 3168b18 commit 6613985
Showing 4 changed files with 139 additions and 80 deletions.
42 changes: 5 additions & 37 deletions simulations.ipynb
@@ -4,19 +4,7 @@
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"array([-0.13232203, 0.43740831, -1.17082002, -0.23752026, -0.29514267,\n",
" -0.12570246, -0.0199769 , 0.584793 , -1.23747854, -0.87275655])"
]
},
"execution_count": 1,
"metadata": {},
"output_type": "execute_result"
}
],
"outputs": [],
"source": [
"import numpy as np\n",
"import scipy\n",
@@ -60,7 +48,7 @@
" unif_samp = scipy.stats.uniform.rvs(size=size)\n",
" return ppf_vec(unif_samp)\n",
"\n",
"ensemble_rvs(10)"
"temp = ensemble_rvs(1000)"
]
},
{
@@ -72,36 +60,16 @@
"name": "stdout",
"output_type": "stream",
"text": [
"[0.77402318 0.2793297 ]\n"
"(('normal', 0.6655539267825432), ('gumbel', 0.3337741806768127))\n"
]
}
],
"source": [
"# std_norm = distribution_dict[\"normal\"](0, 1).rvs(100)\n",
"temp = ensemble_rvs(10)\n",
"# temp = ensemble_rvs(1000)\n",
"model = EnsembleFitter([\"normal\", \"gumbel\"], None)\n",
"res = model.fit(temp)\n",
"print(res.weights)\n",
"\n",
"\n",
"\n",
"# mean = 1\n",
"# var = 2\n",
"# x = np.linspace(0, 5, 100)\n",
"# y = np.linspace(-4, 5, 100)\n",
"# std_norm = distribution_dict[\"normal\"](0, 1)\n",
"# q = std_norm.ppf(0.05)\n",
"# p = std_norm.cdf(q)\n",
"# print(q, p)\n",
"\n",
"# a = 0.3\n",
"# b = 0.7\n",
"# exp = distribution_dict[\"exponential\"](mean, var)\n",
"# fisk = distribution_dict[\"fisk\"](mean, var)\n",
"# # print(a * exp.(x) + b * norm.pdf(x))\n",
"\n",
"# temp = EnsembleFitter([\"exponential\", \"fisk\"], None)\n",
"# temp.fit(x)\n"
"print(res.weights)"
]
},
{
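The notebook cell above samples 1000 draws from the 70/30 normal–Gumbel ensemble by numerically inverting its CDF with brentq, then refits the weights. Below is a minimal sketch of the same round trip, assuming the module layout introduced in this commit and a "gumbel" key in distribution_dict; it samples the mixture by picking a component per draw instead of inverting the CDF.

import numpy as np

from ensemble.distributions import distribution_dict
from ensemble.ensemble_model import EnsembleFitter

rng = np.random.default_rng(0)

# Draw from a known 70/30 mixture of mean-0, variance-1 components,
# then refit; the recovered weights should land near [0.7, 0.3].
n = 1000
normal = distribution_dict["normal"](0, 1)
gumbel = distribution_dict["gumbel"](0, 1)
picks = rng.random(n) < 0.7
draws = np.where(picks, normal.rvs(n), gumbel.rvs(n))

res = EnsembleFitter(["normal", "gumbel"], None).fit(draws)
print(res.weights)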
26 changes: 13 additions & 13 deletions src/ensemble/distributions.py
@@ -2,8 +2,8 @@

import numpy as np
import numpy.typing as npt
import scipy.optimize
import scipy.stats
import scipy.optimize as opt
import scipy.stats as stats
from scipy.special import gamma as gamma_func

# from scipy.special import gammainccinv, gammaincinv
@@ -48,7 +48,7 @@ class Exponential(Distribution):
def _create_scipy_dist(self) -> None:
positive_support(self.mean)
lambda_ = 1 / self.mean
self._scipy_dist = scipy.stats.expon(scale=1 / lambda_)
self._scipy_dist = stats.expon(scale=1 / lambda_)


# analytic sol
@@ -59,7 +59,7 @@ def _create_scipy_dist(self) -> None:
strict_positive_support(self.mean)
alpha = self.mean**2 / self.variance
beta = self.mean / self.variance
self._scipy_dist = scipy.stats.gamma(a=alpha, scale=1 / beta)
self._scipy_dist = stats.gamma(a=alpha, scale=1 / beta)


# analytic sol
@@ -70,7 +70,7 @@ def _create_scipy_dist(self) -> None:
strict_positive_support(self.mean)
alpha = self.mean**2 / self.variance + 2
beta = self.mean * (self.mean**2 / self.variance + 1)
self._scipy_dist = scipy.stats.invgamma(a=alpha, scale=beta)
self._scipy_dist = stats.invgamma(a=alpha, scale=beta)


# numerical sol
@@ -79,7 +79,7 @@ class Fisk(Distribution):

def _create_scipy_dist(self):
positive_support(self.mean)
optim_params = scipy.optimize.minimize(
optim_params = opt.minimize(
fun=self._shape_scale,
# start beta at 1.1 and solve for alpha
x0=[self.mean * 1.1 * np.sin(np.pi / 1.1) / np.pi, 1.1],
@@ -89,7 +89,7 @@ def _create_scipy_dist(self):
alpha, beta = np.abs(optim_params.x)
# parameterization notes: scipy's c is wikipedia's beta, scipy's scale is wikipedia's alpha
# additional note: analytical solution doesn't work b/c dependent on derivative
self._scipy_dist = scipy.stats.fisk(c=beta, scale=alpha)
self._scipy_dist = stats.fisk(c=beta, scale=alpha)

def _shape_scale(self, x: list, samp_mean: float, samp_var: float) -> None:
alpha = x[0]
@@ -109,7 +109,7 @@ class GumbelR(Distribution):
def _create_scipy_dist(self) -> None:
loc = self.mean - np.sqrt(self.variance * 6) * np.euler_gamma / np.pi
scale = np.sqrt(self.variance * 6) / np.pi
self._scipy_dist = scipy.stats.gumbel_r(loc=loc, scale=scale)
self._scipy_dist = stats.gumbel_r(loc=loc, scale=scale)


# hopelessly broken
@@ -118,7 +118,7 @@ class Weibull(Distribution):

def _create_scipy_dist(self) -> None:
positive_support(self.mean)
optim_params = scipy.optimize.minimize(
optim_params = opt.minimize(
fun=self._shape_scale,
# ideally can invert gamma function for k, then use mean / sd as a guess for lambda
x0=[self.mean / gamma_func(1 + 1 / 1.5), 1.5],
@@ -127,7 +127,7 @@ def _create_scipy_dist(self) -> None:
)
lambda_, k = np.abs(optim_params.x)
print("params from optim: ", lambda_, k)
self._scipy_dist = scipy.stats.weibull_min(c=k, scale=lambda_)
self._scipy_dist = stats.weibull_min(c=k, scale=lambda_)

def _shape_scale(self, x: list, samp_mean: float, samp_var: float) -> float:
# TODO: TAKE A LOOK AT JAX SINCE IT DOES AUTOMATIC DERIVATIVES
@@ -151,15 +151,15 @@ def _create_scipy_dist(self) -> None:
# scipy multiplies in the argument passed to `scale` so in the exponentiated space,
# you're essentially adding `mu` within the exponentiated expression within the
# lognormal's PDF; hence, scale is with exponentiation instead of loc
self._scipy_dist = scipy.stats.lognorm(scale=np.exp(mu), s=sigma)
self._scipy_dist = stats.lognorm(scale=np.exp(mu), s=sigma)


# analytic sol
class Normal(Distribution):
support = "real line"

def _create_scipy_dist(self) -> None:
self._scipy_dist = scipy.stats.norm(
self._scipy_dist = stats.norm(
loc=self.mean, scale=np.sqrt(self.variance)
)

@@ -179,7 +179,7 @@ def _create_scipy_dist(self) -> None:
/ self.variance
)
print(alpha, beta)
self._scipy_dist = scipy.stats.beta(a=alpha, b=beta)
self._scipy_dist = stats.beta(a=alpha, b=beta)


# exp, gamma, invgamma, llogis, gumbel, weibull, lognormal, normal, mgamma, mgumbel, beta
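Each class in this file moment-matches its family: it converts a desired mean and variance into scipy's native parameters. A quick self-contained spot check of the Gamma case (shape alpha = mean^2 / variance, rate beta = mean / variance, so scipy's scale is 1 / beta):

import numpy as np
import scipy.stats as stats

mean, variance = 3.0, 2.0
alpha = mean**2 / variance   # shape from moment matching
beta = mean / variance       # rate; scipy takes scale = 1 / rate
dist = stats.gamma(a=alpha, scale=1 / beta)

m, v = dist.stats(moments="mv")
assert np.isclose(m, mean) and np.isclose(v, variance)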
88 changes: 58 additions & 30 deletions src/ensemble/ensemble_model.py
@@ -1,25 +1,54 @@
from typing import List
from typing import List, Tuple

import numpy as np
import numpy.typing as npt
import scipy.optimize
import scipy.stats
import scipy.linalg as linalg
import scipy.optimize as opt
import scipy.stats as stats

from ensemble.distributions import distribution_dict


class EnsembleResult:
weights: npt.NDArray
ensemble_density: npt.NDArray

def __init__(self, weights, ensemble_density) -> None:
class EnsembleModel:
def __init__(self, distributions, weights, mean, variance):
self.distributions = distributions
self.weights = weights
self.ensemble_density = ensemble_density
self.mean = mean
self.variance = variance

def pdf(self, x):
raise NotImplementedError

def cdf(self, q):
raise NotImplementedError

# class EnsembleModel:
# def __init__(self, distributions, weights):
# self.distributions =
def ppf(self, p):
raise NotImplementedError

def rvs(self, size=1):
raise NotImplementedError

def stats_temp(self, moments="mv"):
res_list = []
if "m" in moments:
res_list.append(self.mean)
if "v" in moments:
res_list.append(self.variance)

res_list = [res[()] for res in res_list]
if len(res_list) == 1:
return res_list[0]
else:
return tuple(res_list)


class EnsembleResult:
weights: Tuple[str, float]
ensemble_model: EnsembleModel

def __init__(self, weights, ensemble_model) -> None:
self.weights = weights
self.ensemble_model = ensemble_model


class EnsembleFitter:
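The pdf, cdf, ppf, and rvs methods above are deliberate stubs. One plausible way to fill in the first two, assuming self.distributions holds fitted objects exposing scipy-like methods, is the weighted sum that ensemble_cdf in tests/test_ensemble_model.py already uses; these are drop-in bodies for the stubs, not part of the commit:

def pdf(self, x):
    # Mixture density: weighted sum of component densities.
    return sum(
        w * dist.pdf(x)
        for dist, w in zip(self.distributions, self.weights)
    )

def cdf(self, q):
    # Mixture CDF: weighted sum of component CDFs.
    return sum(
        w * dist.cdf(q)
        for dist, w in zip(self.distributions, self.weights)
    )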
@@ -34,34 +63,28 @@ def __init__(self, distributions: List[str], objective):
+ str(self.supports)
)
self.distributions = distributions
self.objective = objective

def objective(self, vec):
return scipy.linalg.norm(vec, 2)

def ensemble_obj(self, weights):
# return data - F @ weights
pass

def get_ensemble_density(
self, pdfs: np.ndarray, fitted_weights: np.ndarray
):
return pdfs @ fitted_weights
def objective_func(self, vec, objective):
if objective is not None:
raise NotImplementedError
return linalg.norm(vec, 2)

def ensemble_func(self, weights: list, ecdf: np.ndarray, cdfs: np.ndarray):
return self.objective(ecdf - cdfs @ weights)
return self.objective_func(ecdf - cdfs @ weights, self.objective)

def fit(self, data: npt.ArrayLike) -> EnsembleResult:
# TODO: SWITCH CASE STATEMENT FOR BOUNDS OF DATA NOT MATCHING THE ELEMENT OF SELF.SUPPORTS
# sample stats, ecdf
sample_mean = np.mean(data)
sample_variance = np.var(data, ddof=1)
ecdf = scipy.stats.ecdf(data).cdf.probabilities
ecdf = stats.ecdf(data).cdf.probabilities

# may be able to condense into 1 line if ub and lb are not used elsewhere
# support_lb = np.min(data)
# support_ub = np.max(data)
# support = np.linspace(support_lb, support_ub, len(data))
equantiles = scipy.stats.ecdf(data).cdf.quantiles
equantiles = stats.ecdf(data).cdf.quantiles

# fill matrix with cdf values over support of data
num_distributions = len(self.distributions)
@@ -77,19 +100,24 @@ def fit(self, data: npt.ArrayLike) -> EnsembleResult:
# initialize equal weights for all dists and optimize
initial_guess = np.zeros(num_distributions) + 1 / num_distributions
bounds = tuple((0, 1) for i in range(num_distributions))
minimizer_result = scipy.optimize.minimize(
minimizer_result = opt.minimize(
fun=self.ensemble_func,
x0=initial_guess,
args=(ecdf, cdfs),
bounds=bounds,
)
fitted_weights = minimizer_result.x

# return minimizer_result.x

res = EnsembleResult(
# weights=tuple(
# (distribution, weight)
# for distribution, weight in zip(
# self.distributions, fitted_weights
# )
# ),
# distributions=self.distributions,
weights=fitted_weights,
ensemble_density=self.get_ensemble_density(cdfs, fitted_weights),
ensemble_model=EnsembleModel(None, None, None, None),
)

return res
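fit() stacks each candidate's CDF, evaluated at the empirical quantiles, into a matrix and minimizes the L2 distance to the ECDF over box-bounded weights. A self-contained sketch of that optimization using plain scipy frozen distributions; note that, as written in this commit, each weight is bounded to [0, 1] but the weights are not constrained to sum to 1:

import numpy as np
import scipy.optimize as opt
import scipy.stats as stats

def fit_weights(data, frozen_dists):
    # Empirical CDF values and the quantiles they are attached to.
    ecdf_obj = stats.ecdf(data).cdf
    ecdf, quantiles = ecdf_obj.probabilities, ecdf_obj.quantiles
    # One column of CDF values per candidate distribution.
    cdfs = np.column_stack([d.cdf(quantiles) for d in frozen_dists])
    k = len(frozen_dists)
    result = opt.minimize(
        fun=lambda w: np.linalg.norm(ecdf - cdfs @ w, 2),
        x0=np.full(k, 1 / k),      # start from equal weights
        bounds=[(0, 1)] * k,
    )
    return result.x

data = stats.norm.rvs(size=200, random_state=1)
print(fit_weights(data, [stats.norm(0, 1), stats.gumbel_r()]))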
63 changes: 63 additions & 0 deletions tests/test_ensemble_model.py
@@ -0,0 +1,63 @@
import numpy as np
import scipy.optimize as opt
import scipy.stats as stats

from ensemble.distributions import distribution_dict
from ensemble.ensemble_model import EnsembleFitter


def ensemble_cdf(x, distributions, weights, mean, variance):
my_objs = []
for distribution in distributions:
my_objs.append(distribution_dict[distribution](mean, variance))
return sum(
weight * distribution.cdf(x)
for distribution, weight in zip(my_objs, weights)
)


def ppf_to_solve(x, q):
return ensemble_cdf(x, ["normal", "gumbel"], [0.7, 0.3], 0, 1) - q


def ppf_single(q):
factor = 10.0
left = -factor
right = factor

while ppf_to_solve(left, q) > 0:
left, right = left * factor, left

while ppf_to_solve(right, q) < 0:
left, right = right, right * factor

return opt.brentq(ppf_to_solve, left, right, args=q)


def ensemble_rvs(size):
ppf_vec = np.vectorize(ppf_single, otypes="d")
unif_samp = stats.uniform.rvs(size=size)
return ppf_vec(unif_samp)


STD_NORMAL_DRAWS = distribution_dict["normal"](0, 1).rvs(100)
# TODO: REMOVE HARDCODED RVS FUNCTION IN TEST FILE AND TEST MORE VARIED DISTRIBUTIONS ONCE THIS IS IMPLEMENTED IN ENSEMBLE_MODEL.PY
ENSEMBLE_RAND_DRAWS = ensemble_rvs(100)


def test_1_dist():
model = EnsembleFitter(["normal"], None)
res = model.fit(STD_NORMAL_DRAWS)
print(res.weights)
assert np.isclose(res.weights[0], 1)
wrong_model = EnsembleFitter(["normal", "gumbel"], None)
res = wrong_model.fit(STD_NORMAL_DRAWS)
print(res.weights)
assert np.allclose(res.weights, [1, 0])


def test_2_dists():
model = EnsembleFitter(["normal", "gumbel"], None)
res = model.fit(ENSEMBLE_RAND_DRAWS)
print(res.weights)
assert np.allclose(res.weights, [0.7, 0.3])
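The hardcoded ensemble_rvs above inverts the mixture CDF with brentq once per uniform draw, which is robust but slow. An equivalent sampler picks a component per draw with probability equal to its weight and samples that component directly; this is a sketch, not part of the commit, and the Gumbel moment matching mirrors GumbelR in src/ensemble/distributions.py (loc = mean - scale * euler_gamma, scale = sqrt(6 * variance) / pi):

import numpy as np
import scipy.stats as stats

def ensemble_rvs_direct(size, weights=(0.7, 0.3), seed=None):
    # Same mean-0, variance-1 normal/Gumbel pair the tests use.
    rng = np.random.default_rng(seed)
    scale = np.sqrt(6) / np.pi
    components = [
        stats.norm(loc=0, scale=1),
        stats.gumbel_r(loc=-scale * np.euler_gamma, scale=scale),
    ]
    picks = rng.choice(len(components), size=size, p=list(weights))
    counts = np.bincount(picks, minlength=len(components))
    draws = np.concatenate(
        [d.rvs(size=c, random_state=rng) for d, c in zip(components, counts)]
    )
    rng.shuffle(draws)  # avoid grouping draws by component
    return draws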
