Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: add experiment analysis functionalities #185

Draft
wants to merge 49 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 42 commits
Commits
Show all changes
49 commits
Select commit Hold shift + click to select a range
bf3bf2e
add metric and variant classes
ludovico-lanni Jul 19, 2024
5514c7e
add dimension class
ludovico-lanni Jul 19, 2024
0a786b3
add hypothesis test class
ludovico-lanni Jul 19, 2024
d830e8e
add hypothesis test class
ludovico-lanni Jul 19, 2024
1477e8a
add analysis plan class
ludovico-lanni Jul 19, 2024
0f51902
add analysis results data structures
ludovico-lanni Jul 19, 2024
4524381
add analysis plan analyze signature
ludovico-lanni Jul 19, 2024
45fd7a2
add analysis plan analyze draft
ludovico-lanni Jul 20, 2024
278929c
change signature for metric components
ludovico-lanni Jul 20, 2024
7fb2a15
fix bugs
ludovico-lanni Jul 20, 2024
5f9e1a2
improve metric system
ludovico-lanni Jul 21, 2024
62562a3
add confidence interval dataclass
ludovico-lanni Jul 21, 2024
55ee65b
add confidence interval logic
ludovico-lanni Jul 21, 2024
8345596
add alpha in results contract
ludovico-lanni Jul 21, 2024
2933355
add support for default total dimension
ludovico-lanni Jul 21, 2024
85e12aa
refactor directory structure into analytics submodule
ludovico-lanni Jul 26, 2024
423d285
edit docstring for analysis plan
ludovico-lanni Jul 26, 2024
5ba0e29
make Variant a dataclass
ludovico-lanni Jul 26, 2024
d55a56d
make Dimension a dataclass
ludovico-lanni Jul 26, 2024
d924165
create inference results logic for abstract experiment analysis class
ludovico-lanni Jul 26, 2024
bda6e0c
add inference results schema and computation method
ludovico-lanni Jul 26, 2024
a711995
refactor analysis plan to use inference results
ludovico-lanni Jul 26, 2024
1da6b84
move analysis class selection to hypothesis test class
ludovico-lanni Jul 26, 2024
00b06d2
added support for cupac
ludovico-lanni Jul 26, 2024
53cc381
refactor directory name
ludovico-lanni Jul 30, 2024
85f1e61
change validation method for variant
ludovico-lanni Jul 30, 2024
fda49f3
change validation method for dimension and rename default dimension
ludovico-lanni Jul 30, 2024
ac5cb9a
change validation method for analysis plan
ludovico-lanni Jul 30, 2024
54b24ea
define variant properties
ludovico-lanni Jul 30, 2024
045d4a1
define variant properties
ludovico-lanni Jul 30, 2024
abade82
move inference results to hypothesis test
ludovico-lanni Jul 30, 2024
c5f4b16
move inference results to hypothesis test
ludovico-lanni Jul 30, 2024
c98639d
move cupac logic and config logic from plan to test
ludovico-lanni Jul 30, 2024
ceda3b0
move prepare data from plan to test
ludovico-lanni Jul 30, 2024
d48b621
fix bug in cupac handler instantiation
ludovico-lanni Jul 30, 2024
3b7cf5b
raise error to handle missing cupac covariate
ludovico-lanni Aug 2, 2024
e618629
raise error to handle missing cupac covariate
ludovico-lanni Aug 2, 2024
42a8297
restructure loops for readability
ludovico-lanni Aug 2, 2024
4bf5e47
refactor analysis results data structures
ludovico-lanni Aug 2, 2024
98b490c
add data checks in analyse method
ludovico-lanni Aug 2, 2024
83791ed
add simplified from_metrics interface for analysis plan
ludovico-lanni Aug 2, 2024
08943bb
add verbose logging
ludovico-lanni Aug 2, 2024
d20e10b
fix logging config bad practise
ludovico-lanni Oct 4, 2024
05be002
make use of class property for metric's target_column
ludovico-lanni Oct 4, 2024
ccba301
move add_covariates logic to hypothesis test
ludovico-lanni Oct 4, 2024
697c5fe
move get_test_results code to hypothesis test
ludovico-lanni Oct 4, 2024
0cc21d3
make use of default fields for AnalysisPlanResults dataclass
ludovico-lanni Oct 4, 2024
8d12b88
add check for analysis type allowed values
ludovico-lanni Oct 11, 2024
e4f14e6
fix analysis class handling to ensure flexibility and extensibility o…
ludovico-lanni Oct 11, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
240 changes: 240 additions & 0 deletions cluster_experiments/experiment_analysis.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import logging
from abc import ABC, abstractmethod
from dataclasses import dataclass
from typing import Dict, List, Optional

import numpy as np
Expand All @@ -12,6 +13,29 @@
from cluster_experiments.utils import HypothesisEntries


@dataclass
class ConfidenceInterval:
    """
    Class to define the structure of a confidence interval.

    Attributes:
        lower: lower bound of the interval
        upper: upper bound of the interval
        alpha: significance level the interval was computed with
    """

    lower: float
    upper: float
    alpha: float


@dataclass
class InferenceResults:
    """
    Class to define the structure of complete statistical analysis results.

    Attributes:
        ate: estimated treatment effect (in the regression-based analyses,
            the fitted coefficient of the treatment column)
        p_value: p-value associated with the treatment effect
        std_error: standard error of the treatment effect estimate
        conf_int: confidence interval of the treatment effect, together with
            the significance level used to build it
    """

    ate: float
    p_value: float
    std_error: float
    conf_int: ConfidenceInterval


class ExperimentAnalysis(ABC):
"""
Abstract class to run the analysis of a given experiment
Expand Down Expand Up @@ -97,6 +121,40 @@ def analysis_standard_error(
"""
raise NotImplementedError("Standard error not implemented for this analysis")

def analysis_confidence_interval(
    self,
    df: pd.DataFrame,
    alpha: float,
    verbose: bool = False,
) -> ConfidenceInterval:
    """
    Returns the confidence interval of the analysis. Expects treatment to be 0-1 variable.

    This default implementation always raises; analysis classes that can
    produce a confidence interval override it.

    Arguments:
        df: dataframe containing the data to analyze
        alpha: significance level
        verbose (Optional): bool, prints the regression summary if True

    Raises:
        NotImplementedError: always, unless overridden.
    """
    message = "Confidence Interval not implemented for this analysis"
    raise NotImplementedError(message)

def analysis_inference_results(
    self,
    df: pd.DataFrame,
    alpha: float,
    verbose: bool = False,
) -> InferenceResults:
    """
    Returns the InferenceResults object of the analysis. Expects treatment to be 0-1 variable.

    This default implementation always raises; analysis classes that can
    produce full inference results override it.

    Arguments:
        df: dataframe containing the data to analyze
        alpha: significance level
        verbose (Optional): bool, prints the regression summary if True

    Raises:
        NotImplementedError: always, unless overridden.
    """
    raise NotImplementedError(
        "Inference results are not implemented for this analysis"
    )

def _data_checks(self, df: pd.DataFrame) -> None:
"""Checks that the data is correct"""
if df[self.target_col].isnull().any():
Expand Down Expand Up @@ -142,6 +200,32 @@ def get_standard_error(self, df: pd.DataFrame) -> float:
self._data_checks(df=df)
return self.analysis_standard_error(df)

def get_confidence_interval(
    self, df: pd.DataFrame, alpha: float
) -> ConfidenceInterval:
    """Returns the confidence interval of the analysis

    A copy of df is passed through _create_binary_treatment and
    _data_checks before delegating to analysis_confidence_interval
    (called without verbose, so its default applies).

    Arguments:
        df: dataframe containing the data to analyze
        alpha: significance level
    """
    # Work on a copy so the preparation step does not touch the caller's frame
    prepared_df = self._create_binary_treatment(df.copy())
    self._data_checks(df=prepared_df)
    return self.analysis_confidence_interval(prepared_df, alpha)
ludovico-lanni marked this conversation as resolved.
Show resolved Hide resolved

def get_inference_results(self, df: pd.DataFrame, alpha: float) -> InferenceResults:
    """Returns the inference results of the analysis

    A copy of df is passed through _create_binary_treatment and
    _data_checks before delegating to analysis_inference_results
    (called without verbose, so its default applies).

    Arguments:
        df: dataframe containing the data to analyze
        alpha: significance level
    """
    # Work on a copy so the preparation step does not touch the caller's frame
    prepared_df = self._create_binary_treatment(df.copy())
    self._data_checks(df=prepared_df)
    return self.analysis_inference_results(prepared_df, alpha)

def pvalue_based_on_hypothesis(
self, model_result
) -> float: # todo add typehint statsmodels result
Expand Down Expand Up @@ -274,6 +358,58 @@ def analysis_standard_error(self, df: pd.DataFrame, verbose: bool = False) -> fl
results_gee = self.fit_gee(df)
return results_gee.bse[self.treatment_col]

def analysis_confidence_interval(
    self, df: pd.DataFrame, alpha: float, verbose: bool = False
) -> ConfidenceInterval:
    """Returns the confidence interval of the analysis, based on the fitted GEE model

    Arguments:
        df: dataframe containing the data to analyze
        alpha: significance level
        verbose (Optional): bool, prints the regression summary if True
    """
    gee_fit = self.fit_gee(df)

    # conf_int() has one row per model term; keep only the treatment row
    treatment_bounds = gee_fit.conf_int(alpha=alpha).loc[self.treatment_col]
    lower, upper = treatment_bounds

    if verbose:
        print(gee_fit.summary())

    return ConfidenceInterval(lower=lower, upper=upper, alpha=alpha)

def analysis_inference_results(
    self, df: pd.DataFrame, alpha: float, verbose: bool = False
) -> InferenceResults:
    """Returns the inference results of the analysis, based on a single GEE fit

    Arguments:
        df: dataframe containing the data to analyze
        alpha: significance level
        verbose (Optional): bool, prints the regression summary if True
    """
    gee_fit = self.fit_gee(df)
    treatment = self.treatment_col

    # Point estimate, its standard error and p-value for the treatment term
    std_error = gee_fit.bse[treatment]
    ate = gee_fit.params[treatment]
    # NOTE(review): the p-value goes through pvalue_based_on_hypothesis while
    # the bounds below come straight from conf_int() — confirm both describe
    # the same test when a one-sided hypothesis is configured.
    p_value = self.pvalue_based_on_hypothesis(gee_fit)

    # conf_int() has one row per model term; keep only the treatment row
    lower, upper = gee_fit.conf_int(alpha=alpha).loc[treatment]

    if verbose:
        print(gee_fit.summary())

    # Bundle every statistic, including the interval, in one result object
    interval = ConfidenceInterval(lower=lower, upper=upper, alpha=alpha)
    return InferenceResults(
        ate=ate,
        p_value=p_value,
        std_error=std_error,
        conf_int=interval,
    )


class ClusteredOLSAnalysis(ExperimentAnalysis):
"""
Expand Down Expand Up @@ -365,6 +501,58 @@ def analysis_standard_error(self, df: pd.DataFrame, verbose: bool = False) -> fl
results_ols = self.fit_ols_clustered(df)
return results_ols.bse[self.treatment_col]

def analysis_confidence_interval(
    self, df: pd.DataFrame, alpha: float, verbose: bool = False
) -> ConfidenceInterval:
    """Returns the confidence interval of the analysis, based on the fitted clustered OLS model

    Arguments:
        df: dataframe containing the data to analyze
        alpha: significance level
        verbose (Optional): bool, prints the regression summary if True
    """
    ols_fit = self.fit_ols_clustered(df)

    # conf_int() has one row per model term; keep only the treatment row
    treatment_bounds = ols_fit.conf_int(alpha=alpha).loc[self.treatment_col]
    lower, upper = treatment_bounds

    if verbose:
        print(ols_fit.summary())

    return ConfidenceInterval(lower=lower, upper=upper, alpha=alpha)

def analysis_inference_results(
    self, df: pd.DataFrame, alpha: float, verbose: bool = False
) -> InferenceResults:
    """Returns the inference results of the analysis, based on a single clustered OLS fit

    Arguments:
        df: dataframe containing the data to analyze
        alpha: significance level
        verbose (Optional): bool, prints the regression summary if True
    """
    ols_fit = self.fit_ols_clustered(df)
    treatment = self.treatment_col

    # Point estimate, its standard error and p-value for the treatment term
    std_error = ols_fit.bse[treatment]
    ate = ols_fit.params[treatment]
    # NOTE(review): the p-value goes through pvalue_based_on_hypothesis while
    # the bounds below come straight from conf_int() — confirm both describe
    # the same test when a one-sided hypothesis is configured.
    p_value = self.pvalue_based_on_hypothesis(ols_fit)

    # conf_int() has one row per model term; keep only the treatment row
    lower, upper = ols_fit.conf_int(alpha=alpha).loc[treatment]

    if verbose:
        print(ols_fit.summary())

    # Bundle every statistic, including the interval, in one result object
    interval = ConfidenceInterval(lower=lower, upper=upper, alpha=alpha)
    return InferenceResults(
        ate=ate,
        p_value=p_value,
        std_error=std_error,
        conf_int=interval,
    )


class TTestClusteredAnalysis(ExperimentAnalysis):
"""
Expand Down Expand Up @@ -641,6 +829,58 @@ def analysis_standard_error(self, df: pd.DataFrame, verbose: bool = False) -> fl
results_ols = self.fit_ols(df=df)
return results_ols.bse[self.treatment_col]

def analysis_confidence_interval(
    self, df: pd.DataFrame, alpha: float, verbose: bool = False
) -> ConfidenceInterval:
    """Returns the confidence interval of the analysis, based on the fitted OLS model

    Arguments:
        df: dataframe containing the data to analyze
        alpha: significance level
        verbose (Optional): bool, prints the regression summary if True
    """
    ols_fit = self.fit_ols(df)

    # conf_int() has one row per model term; keep only the treatment row
    treatment_bounds = ols_fit.conf_int(alpha=alpha).loc[self.treatment_col]
    lower, upper = treatment_bounds

    if verbose:
        print(ols_fit.summary())

    return ConfidenceInterval(lower=lower, upper=upper, alpha=alpha)

def analysis_inference_results(
    self, df: pd.DataFrame, alpha: float, verbose: bool = False
) -> InferenceResults:
    """Returns the inference results of the analysis, based on a single OLS fit

    Arguments:
        df: dataframe containing the data to analyze
        alpha: significance level
        verbose (Optional): bool, prints the regression summary if True
    """
    ols_fit = self.fit_ols(df)
    treatment = self.treatment_col

    # Point estimate, its standard error and p-value for the treatment term
    std_error = ols_fit.bse[treatment]
    ate = ols_fit.params[treatment]
    # NOTE(review): the p-value goes through pvalue_based_on_hypothesis while
    # the bounds below come straight from conf_int() — confirm both describe
    # the same test when a one-sided hypothesis is configured.
    p_value = self.pvalue_based_on_hypothesis(ols_fit)

    # conf_int() has one row per model term; keep only the treatment row
    lower, upper = ols_fit.conf_int(alpha=alpha).loc[treatment]

    if verbose:
        print(ols_fit.summary())

    # Bundle every statistic, including the interval, in one result object
    interval = ConfidenceInterval(lower=lower, upper=upper, alpha=alpha)
    return InferenceResults(
        ate=ate,
        p_value=p_value,
        std_error=std_error,
        conf_int=interval,
    )

@classmethod
def from_config(cls, config):
"""Creates an OLSAnalysis object from a PowerConfig object"""
Expand Down
Empty file.
Loading
Loading