Merge pull request #5 from ihmeuw-msca/feature/edge-case-and-ecdf

Feature/edge case and ecdf
ihmeuw-msca · Sep 25, 2024 · f8ece98 · f8ece98
2 parents a3e95ce + e4194f8
commit f8ece98
Show file tree

Hide file tree

Showing 6 changed files with 119 additions and 61 deletions.
diff --git a/docs/getting_started/quickstart.rst b/docs/getting_started/quickstart.rst
@@ -10,50 +10,22 @@ Example
     import scipy.stats
     from ensemble.ensemble_model import EnsembleModel, EnsembleFitter
     # creates an ensemble distribution composed of the normal and gumbel distributions both sharing
-    # the same mean and variance; the normal distribution can be thought of as contributing a
-    # quarter of the "height" of the density curve to the ensemble's density, and the gumbel as
-    # contributing the remaining 3 quarters
+    # the same mean and variance
     normal_gumbel = EnsembleModel(distributions=["normal", "gumbel"],
                                   weights=[0.25, 0.75],
                                   mean=4,
                                   variance=1)
 
     # fits an EnsembleModel object to standard normal draws. Here, the user has specified a
-    # distribution (the gumbel) that is not reflected in the truth. The model typically correctly
-    # identifies this and will give weights close to 1 for the normal, and 0 for the gumbel
+    # distribution (the gumbel) that is not reflected in the truth. Try on your own to see how the
+    # model reflects this!
     std_norm_draws = scipy.stats.norm.rvs(0, 1, size=100)
     model = EnsembleFitter(["normal", "gumbel"], "L2").fit(std_norm_draws)
 
     fitted_weights = model.weights
     fitted_distribution = model.ensemble_distribution
 
-Using both the draws from the standard normal and the fitted :code:`EnsembleModel` object from
-above, we can also plot the results with the help of the :code:`matplotlib` package. There are many
-things that you may want to plot, but 2 useful plots that will be demonstrated below are a density
-histogram overlaid with the ensemble PDF, as well as a comparison of the eCDF and the ensemble's
-CDF.
+    # default plotting function for a demo visualization
+    normal_gumbel.plot()
 
-Plotting
---------
-
-.. code-block:: python
-
-    import numpy as np
-    import matplotlib.pyplot as plt
-
-    fig, ax = plt.subplots(1, 2)
-    support = np.linspace(np.min(std_norm_draws), np.max(std_norm_draws), 1000)
-
-    # plot histogram vs fitted PDF
-    ax[0].hist(std_norm_draws, density=True, bins=30)
-    ax[0].plot(support, fitted_distribution.pdf(support))
-    ax[0].set_xlabel("DATA VALUES (UNITS)")
-    ax[0].set_ylabel("density")
-    ax[0].set_title("DATA histogram w/ensemble PDF Overlay")
-
-    # plot eCDF vs fitted CDF
-    stats.ecdf(std_norm_draws).cdf.plot(ax[1])
-    ax[1].plot(support, fitted_distribution.cdf(support))
-    ax[1].set_xlabel("DATA VALUES (UNITS)")
-    ax[1].set_ylabel("density")
-    ax[1].set_title("Empirical vs Ensemble CDF Comparison")
+**Please see** :ref:`Plotting` **for a practical guide on plotting with ensemble.**
diff --git a/docs/user_guide/ensemble_fitting.rst b/docs/user_guide/ensemble_fitting.rst
@@ -25,9 +25,9 @@ Finally, the function of interest for this use case is the :code:`fit()` functio
 Example: Fitting an Ensemble
 ----------------------------
 
-Suppose we have microdata for systolic blood pressure (SBP) from a certain population of young men
-in Seattle. Since SBP must be positive, let's use all the distributions (except the exponential)
-with a positive support to fit this data.
+Suppose we have microdata for systolic blood pressure (SBP) from a certain population of young
+people in Seattle. Since SBP must be positive, let's use all the distributions (except the
+exponential) with a positive support to fit this data.
 
 .. code-block:: python
 

diff --git a/docs/user_guide/index.rst b/docs/user_guide/index.rst
@@ -7,6 +7,7 @@ User guide
 
    ensemble_fitting
    ensemble_model
+   plotting
    concepts
 
 .. note::

diff --git a/docs/user_guide/plotting.rst b/docs/user_guide/plotting.rst
@@ -0,0 +1,43 @@
+Plotting
+========
+
+Let's reuse the example from :ref:`Example: Fitting an Ensemble` with the SBP. There are many
+things that you may want to plot, but 2 useful plots that will be demonstrated below are:
+
+* a density histogram of the SBP overlaid with the ensemble PDF
+* a comparison between the eCDF of the SBP data and the ensemble's CDF
+
+.. code-block:: python
+
+    import numpy as np
+    import matplotlib.pyplot as plt
+    import scipy.stats as stats
+    from ensemble.ensemble_model import EnsembleFitter
+
+    # generate SBP data and fit model as before
+    data = stats.norm(loc=120, scale=7).rvs(size=100)
+    model = EnsembleFitter(
+        distributions=["gamma", "invgamma", "fisk", "lognormal"],
+        objective="L2"
+    )
+    res = model.fit(data)
+
+    # set up matplotlib plotting
+    fig, ax = plt.subplots(1, 2)
+    support = np.linspace(np.min(data), np.max(data), 1000)
+
+    # plot histogram w/fitted PDF
+    ax[0].hist(data, density=True, bins=30)
+    ax[0].plot(support, res.ensemble_distribution.pdf(support))
+    ax[0].set_xlabel("SBP (mmHg)")
+    ax[0].set_ylabel("density")
+    ax[0].set_title("SBP histogram w/ensemble PDF Overlay")
+
+    # plot eCDF vs fitted CDF
+    stats.ecdf(data).cdf.plot(ax[1])
+    ax[1].plot(support, res.ensemble_distribution.cdf(support))
+    ax[1].set_xlabel("SBP (mmHg)")
+    ax[1].set_ylabel("cumulative probability")
+    ax[1].set_title("Empirical vs Ensemble CDF Comparison")
+
+**What is** :code:`support` **?** You can think of :code:`support` as the x values (in the space of the
+data) at which we evaluate the corresponding y values, whether that be the density (PDF) or the
+cumulative probability (CDF).
diff --git a/plots.ipynb b/plots.ipynb
diff --git a/src/ensemble/model.py b/src/ensemble/model.py
@@ -1,6 +1,7 @@
 from typing import List, Tuple, Union
 
 import cvxpy as cp
+import matplotlib.pyplot as plt
 import numpy as np
 import numpy.typing as npt
 import scipy.optimize as opt
@@ -219,6 +220,28 @@ def stats_temp(
         else:
             return tuple(res_list)
 
+    def plot(self):
+        """THIS IS A DEMONSTRATION FUNCTION. SEE DOCUMENTATION FOR MORE PRACTICAL PLOTS
+
+        plots the PDF and CDF of an ensemble distribution
+        """
+        fig, ax = plt.subplots(1, 2, figsize=(12, 6))
+        scaling = 3 * np.sqrt(self.variance)
+        lb = np.max([self.support[0], self.mean - scaling])
+        ub = np.min([self.support[1], self.mean + scaling])
+        support = np.linspace(lb, ub, 100)
+        pdf = self.pdf(support)
+        cdf = self.cdf(support)
+        ax[0].plot(support, pdf)
+        ax[0].set_xlabel("DATA VALUES (UNITS)")
+        ax[0].set_ylabel("density")
+        ax[0].set_title("ensemble PDF")
+
+        ax[1].plot(support, cdf)
+        ax[1].set_xlabel("DATA VALUES (UNITS)")
+        ax[1].set_ylabel("density")
+        ax[1].set_title("ensemble CDF")
+
 
 class EnsembleResult:
     """Result from ensemble distribution fitting
@@ -333,11 +356,23 @@ def fit(self, data: npt.ArrayLike) -> EnsembleResult:
             raise ValueError(
                 "data exceeds bounds of the support of your ensemble"
             )
+
+        if len(data) <= 1:
+            raise ValueError(
+                "you may only run this function with 2 or more data points"
+            )
+
         # sample stats, ecdf
         sample_mean = np.mean(data)
         sample_variance = np.var(data, ddof=1)
-        ecdf = stats.ecdf(data).cdf.probabilities
-        equantiles = stats.ecdf(data).cdf.quantiles
+        ecdf = stats.ecdf(data).cdf
+
+        # reintroduce duplicates into scipy's ecdf for fitting only
+        sorted_indices = np.argsort(data)
+        equantiles = data[sorted_indices]
+        eprobabilities = np.interp(
+            equantiles, ecdf.quantiles, ecdf.probabilities
+        )
 
         # fill matrix with cdf values over support of data
         num_distributions = len(self.distributions)
@@ -352,7 +387,7 @@ def fit(self, data: npt.ArrayLike) -> EnsembleResult:
 
         # CVXPY implementation
         w = cp.Variable(num_distributions)
-        objective = cp.Minimize(self._objective_func(ecdf - cdfs @ w))
+        objective = cp.Minimize(self._objective_func(eprobabilities - cdfs @ w))
         constraints = [0 <= w, cp.sum(w) == 1]
         prob = cp.Problem(objective, constraints)
         prob.solve()