Skip to content

Commit

Permalink
reorg tests (#141)
Browse files Browse the repository at this point in the history
* reorg tests

* add seed
  • Loading branch information
david26694 authored Jan 15, 2024
1 parent ee33c1f commit 6c22bff
Show file tree
Hide file tree
Showing 15 changed files with 206 additions and 144 deletions.
14 changes: 14 additions & 0 deletions tests/analysis/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import pandas as pd
import pytest


@pytest.fixture
def analysis_df():
    """Minimal 4-row experiment frame: binary target, A/B treatment,
    and a single cluster/date pair shared by every row."""
    rows = {
        "target": [0, 1, 0, 1],
        "treatment": ["A", "B", "B", "A"],
        "cluster": ["Cluster 1"] * 4,
        "date": ["2022-01-01"] * 4,
    }
    return pd.DataFrame(rows)
8 changes: 4 additions & 4 deletions tests/analysis/test_analysis.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
PairedTTestClusteredAnalysis,
TTestClusteredAnalysis,
)
from tests.examples import analysis_df, generate_clustered_data, generate_random_data
from tests.utils import generate_clustered_data, generate_random_data


@pytest.fixture
Expand All @@ -27,14 +27,14 @@ def analysis_df_diff():
return analysis_df_full


def test_cluster_column():
def test_cluster_column(analysis_df):
    """The composite cluster column concatenates every column listed in
    ``cluster_cols``, in order (here: cluster + date)."""
    analysis = GeeExperimentAnalysis(cluster_cols=["cluster", "date"])
    combined = analysis._get_cluster_column(analysis_df)
    assert (combined == "Cluster 12022-01-01").all()


def test_binary_treatment():
def test_binary_treatment(analysis_df):
analyser = GeeExperimentAnalysis(
cluster_cols=["cluster", "date"],
)
Expand All @@ -44,7 +44,7 @@ def test_binary_treatment():
).all()


def test_get_pvalue():
def test_get_pvalue(analysis_df):
analysis_df_full = pd.concat([analysis_df for _ in range(100)])
analyser = GeeExperimentAnalysis(
cluster_cols=["cluster", "date"],
Expand Down
10 changes: 5 additions & 5 deletions tests/analysis/test_hypothesis.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,12 @@
OLSAnalysis,
TTestClusteredAnalysis,
)
from tests.examples import analysis_df, generate_clustered_data
from tests.utils import generate_clustered_data


@pytest.mark.parametrize("hypothesis", ["less", "greater", "two-sided"])
@pytest.mark.parametrize("analysis_class", [OLSAnalysis])
def test_get_pvalue_hypothesis(analysis_class, hypothesis):
def test_get_pvalue_hypothesis(analysis_class, hypothesis, analysis_df):
analysis_df_full = pd.concat([analysis_df for _ in range(100)])
analyser = analysis_class(hypothesis=hypothesis)
assert analyser.get_pvalue(analysis_df_full) >= 0
Expand All @@ -37,14 +37,14 @@ def test_get_pvalue_hypothesis_clustered(analysis_class, hypothesis):


@pytest.mark.parametrize("analysis_class", [OLSAnalysis])
def test_get_pvalue_hypothesis_default(analysis_class):
def test_get_pvalue_hypothesis_default(analysis_class, analysis_df):
analysis_df_full = pd.concat([analysis_df for _ in range(100)])
analyser = analysis_class()
assert analyser.get_pvalue(analysis_df_full) >= 0


@pytest.mark.parametrize("analysis_class", [OLSAnalysis])
def test_get_pvalue_hypothesis_wrong_input(analysis_class):
def test_get_pvalue_hypothesis_wrong_input(analysis_class, analysis_df):
analysis_df_full = pd.concat([analysis_df for _ in range(100)])

# Use pytest.raises to check for ValueError
Expand All @@ -57,7 +57,7 @@ def test_get_pvalue_hypothesis_wrong_input(analysis_class):


@pytest.mark.parametrize("analysis_class", [OLSAnalysis])
def test_several_hypothesis(analysis_class):
def test_several_hypothesis(analysis_class, analysis_df):
analysis_df_full = pd.concat([analysis_df for _ in range(100)])
analysis_less = analysis_class(hypothesis="less")
analysis_greater = analysis_class(hypothesis="greater")
Expand Down
5 changes: 2 additions & 3 deletions tests/analysis/test_ols_analysis.py
Original file line number Diff line number Diff line change
@@ -1,18 +1,17 @@
import pandas as pd

from cluster_experiments.experiment_analysis import OLSAnalysis
from tests.examples import analysis_df


def test_binary_treatment():
def test_binary_treatment(analysis_df):
    """Treatment labels are binarized: "B" maps to 1, "A" maps to 0."""
    analysis = OLSAnalysis()
    binarized = analysis._create_binary_treatment(analysis_df)["treatment"]
    expected = pd.Series([0, 1, 1, 0])
    assert (binarized == expected).all()


def test_get_pvalue():
def test_get_pvalue(analysis_df):
    """OLS p-value on the replicated frame is well-defined (>= 0)."""
    replicated = pd.concat(100 * [analysis_df])
    assert OLSAnalysis().get_pvalue(replicated) >= 0
12 changes: 12 additions & 0 deletions tests/cupac/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import pandas as pd
import pytest


@pytest.fixture
def binary_df():
    """Four observations with a 0/1 target and A/B/B/A treatment labels."""
    return pd.DataFrame(
        {"target": [0, 1, 0, 1], "treatment": list("ABBA")}
    )
13 changes: 6 additions & 7 deletions tests/cupac/test_aggregator.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,14 +3,13 @@
import pandas as pd

from cluster_experiments.cupac import TargetAggregation
from tests.examples import binary_df


def split_x_y(binary_df: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
return binary_df.drop("target", axis=1), binary_df["target"]
def split_x_y(binary_df_agg: pd.DataFrame) -> Tuple[pd.DataFrame, pd.Series]:
    """Split a frame into features (every column except ``target``) and
    the ``target`` series."""
    features = binary_df_agg.drop(columns=["target"])
    labels = binary_df_agg["target"]
    return features, labels


def test_set_target_aggs():
def test_set_target_aggs(binary_df):
binary_df["user"] = [1, 1, 1, 1]
ta = TargetAggregation(agg_col="user")
X, y = split_x_y(binary_df)
Expand All @@ -20,7 +19,7 @@ def test_set_target_aggs():
assert ta.pre_experiment_mean == 0.5


def test_smoothing_0():
def test_smoothing_0(binary_df):
binary_df["user"] = binary_df["target"]
ta = TargetAggregation(agg_col="user", smoothing_factor=0)
X, y = split_x_y(binary_df)
Expand All @@ -31,7 +30,7 @@ def test_smoothing_0():
).all()


def test_smoothing_non_0():
def test_smoothing_non_0(binary_df):
binary_df["user"] = binary_df["target"]
ta = TargetAggregation(agg_col="user", smoothing_factor=2)
X, y = split_x_y(binary_df)
Expand All @@ -45,7 +44,7 @@ def test_smoothing_non_0():
).all()


def test_add_aggs():
def test_add_aggs(binary_df):
binary_df["user"] = binary_df["target"]
ta = TargetAggregation(agg_col="user", smoothing_factor=2)
X, y = split_x_y(binary_df)
Expand Down
2 changes: 1 addition & 1 deletion tests/cupac/test_cupac_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from sklearn.ensemble import HistGradientBoostingRegressor

from cluster_experiments.cupac import CupacHandler, TargetAggregation
from tests.examples import generate_random_data
from tests.utils import generate_random_data

N = 1_000

Expand Down
93 changes: 0 additions & 93 deletions tests/examples.py

This file was deleted.

57 changes: 57 additions & 0 deletions tests/perturbator/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import pandas as pd
import pytest


@pytest.fixture
def binary_df():
    """Binary-outcome frame used by the perturbator tests."""
    data = {
        "target": [0, 1, 0, 1],
        "treatment": ["A", "B", "B", "A"],
    }
    return pd.DataFrame.from_dict(data)


@pytest.fixture
def continuous_df():
    """Constant continuous target (0.5) across four A/B observations."""
    data = {
        "target": [0.5] * 4,
        "treatment": ["A", "B", "B", "A"],
    }
    return pd.DataFrame.from_dict(data)


@pytest.fixture
def generate_clustered_data() -> pd.DataFrame:
    """16 rows spanning 4 countries x 4 dates with user-level
    randomization: every row of a given user carries the same treatment."""
    countries = ["ES"] * 4 + ["IT"] * 4 + ["PL"] * 4 + ["RO"] * 4
    cities = (
        ["BCN", "BCN", "MAD", "BCN"]
        + ["NAP"] * 4
        + ["WAW"] * 4
        + ["BUC"] * 4
    )
    users = [1, 1, 2, 1, 3, 4, 5, 6, 7, 8, 8, 8, 9, 9, 9, 10]
    dates = ["2022-01-01", "2022-01-02", "2022-01-03", "2022-01-04"] * 4
    # Same user always receives the same arm (user-level randomization).
    treatments = list("AABABBABBAAABBBA")
    # One outlier target in the last row.
    targets = [0.01] * 15 + [0.1]
    return pd.DataFrame(
        {
            "country_code": countries,
            "city_code": cities,
            "user_id": users,
            "date": dates,
            "treatment": treatments,
            "target": targets,
        }
    )
Loading

0 comments on commit 6c22bff

Please sign in to comment.