Skip to content

Commit

Permalink
[94] feat: create exact power calculation class (#169)
Browse files Browse the repository at this point in the history
* relaxed precommit python config for black

* Refactor power analysis module adding abstract class interface

* call super in check inputs

* add simulation parameter back to parent class

* refactor power classes to prepare for AA power computation

* draft move of loops interface to parent class

* add normalpoweranalysis and tests

* add tests

* fix tests

* add mlm in test

* add more tests

* bump to 0150

* add failing test

* add normal power

* add alternative power computation

* add comment

* add ludos correction

---------

Co-authored-by: David <[email protected]>
  • Loading branch information
ludovico-lanni and david26694 authored May 27, 2024
1 parent 16b6b5c commit 03a2eb0
Show file tree
Hide file tree
Showing 12 changed files with 1,037 additions and 22 deletions.
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ repos:
rev: 22.12.0
hooks:
- id: black
language_version: python3.8
language_version: python3
- repo: https://github.com/charliermarsh/ruff-pre-commit
rev: 'v0.0.261'
hooks:
Expand Down
30 changes: 26 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ https://codecov.io/gh/david26694/cluster-experiments/branch/main/graph/badge.svg
![License](https://img.shields.io/github/license/david26694/cluster-experiments)
[![Pypi version](https://img.shields.io/pypi/pyversions/cluster-experiments.svg)](https://pypi.python.org/pypi/cluster-experiments)

A library to run simulation-based power analysis, including clustered data. Also useful to design and analyse clustered and switchback experiments.
A library to run simulation-based power analysis, including cluster-randomized trial data. Also useful to design and analyse cluster-randomized and switchback experiments.


<img src="theme/flow.png">
Expand Down Expand Up @@ -50,6 +50,19 @@ power = pw.power_analysis(df, average_effect=0.1)
# You may also get the power curve by running the power analysis with different average effects
power_line = pw.power_line(df, average_effects=[0, 0.1, 0.2])


# A faster method can be used to run the power analysis, using the approximation of
# the central limit theorem, which is stable with fewer simulations
from cluster_experiments import NormalPowerAnalysis
npw = NormalPowerAnalysis.from_dict(
{
"analysis": "ols_non_clustered",
"splitter": "non_clustered",
"n_simulations": 5,
}
)
power_line_normal = npw.power_line(df, average_effects=[0, 0.1, 0.2])

```

### Switchback
Expand Down Expand Up @@ -93,7 +106,7 @@ print(f"{power = }")

### Long example

This is a comprehensive example of how to use this library. There are simpler ways to run this power analysis above but this shows all the building blocks of the library.
This is a more comprehensive example of how to use this library. There are simpler ways to run this power analysis above but this shows all the building blocks of the library.

```python title="Switchback - using classes"
from datetime import date
Expand All @@ -102,7 +115,7 @@ import numpy as np
import pandas as pd
from cluster_experiments.experiment_analysis import GeeExperimentAnalysis
from cluster_experiments.perturbator import ConstantPerturbator
from cluster_experiments.power_analysis import PowerAnalysis
from cluster_experiments.power_analysis import PowerAnalysis, NormalPowerAnalysis
from cluster_experiments.random_splitter import ClusteredSplitter

# Create fake data
Expand Down Expand Up @@ -138,14 +151,23 @@ pw = PowerAnalysis(
# Keep in mind that the average effect is the absolute effect added, this is not relative!
power = pw.power_analysis(df, average_effect=0.1)
print(f"{power = }")

# You can also use normal power analysis, which uses the central limit theorem to estimate power and should be stable with fewer simulations
npw = NormalPowerAnalysis(
splitter=sw, analysis=analysis, n_simulations=50, seed=123
)
power = npw.power_analysis(df, average_effect=0.1)
print(f"{power = }")

```

## Features

The library offers the following classes:

* Regarding power analysis:
* `PowerAnalysis`: to run power analysis on a clustered/switchback design
* `PowerAnalysis`: to run power analysis on any experiment design, using simulation
* `NormalPowerAnalysis`: to run power analysis on any experiment design using the central limit theorem for the distribution of the estimator
* `ConstantPerturbator`: to artificially perturb treated group with constant perturbations
* `BinaryPerturbator`: to artificially perturb treated group for binary outcomes
* `RelativePositivePerturbator`: to artificially perturb treated group with relative positive perturbations
Expand Down
3 changes: 2 additions & 1 deletion cluster_experiments/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
SegmentedBetaRelativePerturbator,
UniformPerturbator,
)
from cluster_experiments.power_analysis import PowerAnalysis
from cluster_experiments.power_analysis import NormalPowerAnalysis, PowerAnalysis
from cluster_experiments.power_config import PowerConfig
from cluster_experiments.random_splitter import (
BalancedClusteredSplitter,
Expand Down Expand Up @@ -48,6 +48,7 @@
"BetaRelativePerturbator",
"SegmentedBetaRelativePerturbator",
"PowerAnalysis",
"NormalPowerAnalysis",
"PowerConfig",
"EmptyRegressor",
"TargetAggregation",
Expand Down
80 changes: 70 additions & 10 deletions cluster_experiments/experiment_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,19 @@ def analysis_point_estimate(
"""
raise NotImplementedError("Point estimate not implemented for this analysis")

def analysis_standard_error(self, df: pd.DataFrame, verbose: bool = False) -> float:
    """
    Returns the standard error of the analysis. Expects treatment to be 0-1 variable.
    This implementation provides no computation and always raises.

    Arguments:
        df: dataframe containing the data to analyze
        verbose (Optional): bool, prints the regression summary if True

    Raises:
        NotImplementedError: always, in this implementation
    """
    message = "Standard error not implemented for this analysis"
    raise NotImplementedError(message)

def _data_checks(self, df: pd.DataFrame) -> None:
"""Checks that the data is correct"""
if df[self.target_col].isnull().any():
Expand Down Expand Up @@ -116,6 +129,17 @@ def get_point_estimate(self, df: pd.DataFrame) -> float:
self._data_checks(df=df)
return self.analysis_point_estimate(df)

def get_standard_error(self, df: pd.DataFrame) -> float:
    """Returns the standard error of the analysis

    Works on a copy of the input, so the caller's dataframe is not the object
    passed to the treatment-binarisation step.

    Arguments:
        df: dataframe containing the data to analyze
    """
    prepared = self._create_binary_treatment(df.copy())
    # Validate the prepared data before delegating to the analysis-specific method
    self._data_checks(df=prepared)
    return self.analysis_standard_error(prepared)

def pvalue_based_on_hypothesis(
self, model_result
) -> float: # todo add typehint statsmodels result
Expand Down Expand Up @@ -234,6 +258,15 @@ def analysis_point_estimate(self, df: pd.DataFrame, verbose: bool = False) -> fl
results_gee = self.fit_gee(df)
return results_gee.params[self.treatment_col]

def analysis_standard_error(self, df: pd.DataFrame, verbose: bool = False) -> float:
    """Returns the standard error of the treatment coefficient of the model
    fitted by `fit_gee`

    Arguments:
        df: dataframe containing the data to analyze
        verbose (Optional): bool, prints the regression summary if True
    """
    results_gee = self.fit_gee(df)
    # The flag was documented but previously ignored; honour it here.
    if verbose:
        print(results_gee.summary())
    return results_gee.bse[self.treatment_col]


class ClusteredOLSAnalysis(ExperimentAnalysis):
"""
Expand Down Expand Up @@ -287,16 +320,20 @@ def __init__(
self.formula = f"{self.target_col} ~ {' + '.join(self.regressors)}"
self.cov_type = "cluster"

def fit_ols_clustered(self, df: pd.DataFrame):
    """Returns the fitted OLS model

    The model is built from `self.formula` on `df` and fitted with the
    covariance type stored in `self.cov_type`, using the cluster column of
    `df` as groups.
    """
    ols_model = sm.OLS.from_formula(self.formula, data=df)
    covariance_options = {"groups": self._get_cluster_column(df)}
    return ols_model.fit(cov_type=self.cov_type, cov_kwds=covariance_options)

def analysis_pvalue(self, df: pd.DataFrame, verbose: bool = False) -> float:
"""Returns the p-value of the analysis
Arguments:
df: dataframe containing the data to analyze
verbose (Optional): bool, prints the regression summary if True
"""
results_ols = sm.OLS.from_formula(self.formula, data=df,).fit(
cov_type=self.cov_type,
cov_kwds={"groups": self._get_cluster_column(df)},
)
results_ols = self.fit_ols_clustered(df)
if verbose:
print(results_ols.summary())

Expand All @@ -309,13 +346,18 @@ def analysis_point_estimate(self, df: pd.DataFrame, verbose: bool = False) -> fl
df: dataframe containing the data to analyze
verbose (Optional): bool, prints the regression summary if True
"""
# Keep in mind that the point estimate of the OLS is the same as the ClusteredOLS
results_ols = sm.OLS.from_formula(
self.formula,
data=df,
).fit()
results_ols = self.fit_ols_clustered(df)
return results_ols.params[self.treatment_col]

def analysis_standard_error(self, df: pd.DataFrame, verbose: bool = False) -> float:
    """Returns the standard error of the treatment coefficient of the model
    fitted by `fit_ols_clustered`

    Arguments:
        df: dataframe containing the data to analyze
        verbose (Optional): bool, prints the regression summary if True
    """
    results_ols = self.fit_ols_clustered(df)
    # The flag was documented but previously ignored; honour it here.
    if verbose:
        print(results_ols.summary())
    return results_ols.bse[self.treatment_col]


class TTestClusteredAnalysis(ExperimentAnalysis):
"""
Expand Down Expand Up @@ -557,7 +599,7 @@ def __init__(
self.formula = f"{self.target_col} ~ {' + '.join(self.regressors)}"
self.hypothesis = hypothesis

def fit_ols(self, df: pd.DataFrame) -> sm.GEE:
def fit_ols(self, df: pd.DataFrame):
    """Returns the fitted OLS model built from `self.formula` on `df`"""
    ols_model = sm.OLS.from_formula(self.formula, data=df)
    return ols_model.fit()

Expand All @@ -583,6 +625,15 @@ def analysis_point_estimate(self, df: pd.DataFrame, verbose: bool = False) -> fl
results_ols = self.fit_ols(df=df)
return results_ols.params[self.treatment_col]

def analysis_standard_error(self, df: pd.DataFrame, verbose: bool = False) -> float:
    """Returns the standard error of the treatment coefficient of the model
    fitted by `fit_ols`

    Arguments:
        df: dataframe containing the data to analyze
        verbose (Optional): bool, prints the regression summary if True
    """
    results_ols = self.fit_ols(df=df)
    # The flag was documented but previously ignored; honour it here.
    if verbose:
        print(results_ols.summary())
    return results_ols.bse[self.treatment_col]

@classmethod
def from_config(cls, config):
"""Creates an OLSAnalysis object from a PowerConfig object"""
Expand Down Expand Up @@ -680,3 +731,12 @@ def analysis_point_estimate(self, df: pd.DataFrame, verbose: bool = False) -> fl
"""
results_mlm = self.fit_mlm(df)
return results_mlm.params[self.treatment_col]

def analysis_standard_error(self, df: pd.DataFrame, verbose: bool = False) -> float:
    """Returns the standard error of the treatment coefficient of the model
    fitted by `fit_mlm`

    Arguments:
        df: dataframe containing the data to analyze
        verbose (Optional): bool, prints the regression summary if True
    """
    results_mlm = self.fit_mlm(df)
    # The flag was documented but previously ignored; honour it here.
    if verbose:
        print(results_mlm.summary())
    return results_mlm.bse[self.treatment_col]
Loading

0 comments on commit 03a2eb0

Please sign in to comment.