NAPSU-MQ inference configuration #55

Merged: 20 commits merged on Jul 7, 2023
5 changes: 5 additions & 0 deletions ChangeLog.txt
@@ -1,8 +1,13 @@
- master:
- changed InferenceModel.fit: added show_progress and return_diagnostics arguments, removed model specific kwargs (breaking)
NapsuMQModel.fit and DPVIModel.fit changed accordingly
- DPVIResult no longer contains the final ELBO from model fitting (breaking)
this is now returned as diagnostic information from DPVIModel.fit if return_diagnostics=True
- fixed: reused RNG key in NAPSU-MQ data sampling
- added integers_handler to DataDescription.from_dataframe
to determine how integer typed columns are handled
- changed NapsuMQModel.fit to reject integer typed columns (all values must be categorical)
- added NapsuMQInferenceConfig to better encapsulate configuration of the NapsuMQ algorithm (breaking)

- 1.0.0:
- introducing new API centered around twinify.InferenceModel, twinify.InferenceResult
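The changelog entries above summarize the API rework that the tests and notebooks below exercise. As a quick orientation, here is a minimal sketch of the new NAPSU-MQ call pattern pieced together from the updated tests in this PR; the data construction, column names, and seeds are illustrative and not part of the PR itself.

```python
import d3p.random
import numpy as np
import pandas as pd

from twinify.napsu_mq.napsu_mq import NapsuMQModel, NapsuMQInferenceConfig

# Illustrative categorical data; NapsuMQModel.fit now rejects integer-typed columns,
# so the columns are cast to pandas' categorical dtype.
n = 2000
data = pd.DataFrame({
    'A': np.random.randint(0, 2, size=n),
    'B': np.random.randint(0, 2, size=n),
    'C': np.random.randint(0, 2, size=n),
}).astype("category")

rng = d3p.random.PRNGKey(123)
inference_rng, sampling_rng = d3p.random.split(rng)

# Inference settings are now grouped in NapsuMQInferenceConfig (replacing the old
# use_laplace_approximation flag), and required_marginals was renamed to
# forced_queries_in_automatic_selection.
config = NapsuMQInferenceConfig(method="laplace+mcmc")
model = NapsuMQModel(
    forced_queries_in_automatic_selection=[('A', 'B'), ('B', 'C'), ('A', 'C')],
    inference_config=config,
)

result = model.fit(data=data, rng=inference_rng, epsilon=1., delta=n ** (-2))

datasets = result.generate(
    rng=sampling_rng,
    num_data_per_parameter_sample=500,
    num_parameter_samples=5,
    single_dataframe=False,
)
```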
4,526 changes: 10 additions & 4,516 deletions examples/NAPSU-MQ_coverage_test.ipynb

Large diffs are not rendered by default.

62 changes: 23 additions & 39 deletions examples/NapsuMQ example.ipynb

Large diffs are not rendered by default.

44 changes: 31 additions & 13 deletions tests/dpvi/dpvi_test.py
@@ -80,7 +80,7 @@ def test_inference_and_sampling(self) -> None:

rng = d3p.random.PRNGKey(96392153)
dpvi_model = DPVIModel(model, clipping_threshold=10., num_epochs=300, subsample_ratio=0.01)
dpvi_fit = dpvi_model.fit(xs_df, rng, epsilon, delta, silent=True)
dpvi_fit = dpvi_model.fit(xs_df, rng, epsilon, delta, show_progress=False)

self.assertEqual(epsilon, dpvi_fit.privacy_level.epsilon)
self.assertEqual(delta, dpvi_fit.privacy_level.delta)
@@ -120,7 +120,7 @@ def test_fit_aborts_for_nan(self) -> None:
rng = d3p.random.PRNGKey(96392153)
dpvi_model = DPVIModel(model, clipping_threshold=10., num_epochs=1, subsample_ratio=0.1)
with self.assertRaises(InferenceException):
dpvi_model.fit(xs_df, rng, epsilon, delta, silent=True)
dpvi_model.fit(xs_df, rng, epsilon, delta, show_progress=False)

def test_fit_works(self) -> None:
xs_df = self.xs_df
@@ -129,7 +129,7 @@ def test_fit_works(self) -> None:

rng = d3p.random.PRNGKey(96392153)
dpvi_model = DPVIModel(model, clipping_threshold=10., num_epochs=1, subsample_ratio=0.1)
dpvi_fit = dpvi_model.fit(xs_df, rng, epsilon, delta, silent=False)
dpvi_fit = dpvi_model.fit(xs_df, rng, epsilon, delta, show_progress=True)

self.assertEqual(epsilon, dpvi_fit.privacy_level.epsilon)
self.assertEqual(delta, dpvi_fit.privacy_level.delta)
@@ -142,6 +142,28 @@ def test_fit_works(self) -> None:
self.assertIn('auto_scale', dpvi_fit.parameters)
self.assertEqual((6,), dpvi_fit.parameters['auto_scale'].shape)

def test_fit_with_diagnostics(self) -> None:
xs_df = self.xs_df
epsilon = 4.
delta = 1e-6

rng = d3p.random.PRNGKey(96392153)
dpvi_model = DPVIModel(model, clipping_threshold=10., num_epochs=1, subsample_ratio=0.1)
dpvi_fit, diagnostics = dpvi_model.fit(xs_df, rng, epsilon, delta, show_progress=True, return_diagnostics=True)

self.assertEqual(epsilon, dpvi_fit.privacy_level.epsilon)
self.assertEqual(delta, dpvi_fit.privacy_level.delta)
self.assertTrue(dpvi_fit.privacy_level.dp_noise > 0)
self.assertIsNotNone(dpvi_fit.parameters)
self.assertEqual(self.data_description, dpvi_fit.data_description)

self.assertIn('auto_loc', dpvi_fit.parameters)
self.assertEqual((6,), dpvi_fit.parameters['auto_loc'].shape)
self.assertIn('auto_scale', dpvi_fit.parameters)
self.assertEqual((6,), dpvi_fit.parameters['auto_scale'].shape)

self.assertIsInstance(diagnostics, dict)
self.assertIn("final_elbo", diagnostics)

def test_fit_works_silent(self) -> None:
xs_df = self.xs_df
@@ -150,7 +172,7 @@ def test_fit_works_silent(self) -> None:

rng = d3p.random.PRNGKey(96392153)
dpvi_model = DPVIModel(model, clipping_threshold=10., num_epochs=1, subsample_ratio=0.1)
dpvi_fit = dpvi_model.fit(xs_df, rng, epsilon, delta, silent=True)
dpvi_fit = dpvi_model.fit(xs_df, rng, epsilon, delta, show_progress=False)

self.assertEqual(epsilon, dpvi_fit.privacy_level.epsilon)
self.assertEqual(delta, dpvi_fit.privacy_level.delta)
@@ -224,7 +246,6 @@ def guide(data = None, num_obs_total = None):
}

self.privacy_params = PrivacyLevel(1., 1e-4, 2.1)
self.final_elbo = 1.67

self.data_description = DataDescription({
'first': np.dtype(np.float64),
@@ -235,7 +256,7 @@

def test_init(self) -> None:
result = DPVIResult(
self.model, self.guide, self.params, self.privacy_params, self.final_elbo, self.data_description
self.model, self.guide, self.params, self.privacy_params, self.data_description
)

self.assertTrue(
@@ -244,11 +265,10 @@ def test_init(self) -> None:
)
)
self.assertEqual(self.privacy_params, result.privacy_level)
self.assertEqual(self.final_elbo, result.final_elbo)

def test_generate(self) -> None:
result = DPVIResult(
self.model, self.guide, self.params, self.privacy_params, self.final_elbo, self.data_description
self.model, self.guide, self.params, self.privacy_params, self.data_description
)

num_data_per_parameter = 100
@@ -267,7 +287,7 @@

def test_generate_single_dataset(self) -> None:
result = DPVIResult(
self.model, self.guide, self.params, self.privacy_params, self.final_elbo, self.data_description
self.model, self.guide, self.params, self.privacy_params, self.data_description
)

num_data_per_parameter = 100
@@ -284,7 +304,7 @@ def test_generate_single_dataset(self) -> None:

def test_store_and_load(self) -> None:
result = DPVIResult(
self.model, self.guide, self.params, self.privacy_params, self.final_elbo, self.data_description
self.model, self.guide, self.params, self.privacy_params, self.data_description
)

with tempfile.TemporaryFile("w+b") as f:
@@ -299,7 +319,6 @@ def test_store_and_load(self) -> None:
)
)
self.assertEqual(self.privacy_params, loaded_result.privacy_level)
self.assertEqual(self.final_elbo, loaded_result.final_elbo)
self.assertEqual(self.data_description, loaded_result.data_description)

result_samples = result.generate(d3p.random.PRNGKey(567), 10, 1)
@@ -314,7 +333,7 @@ def test_store_and_load_with_loadable_autoguide(self) -> None:
guide = LoadableAutoGuide(model, ["ys", "xs", "cats"], AutoDiagonalNormal)

result = DPVIResult(
self.model, guide, self.params, self.privacy_params, self.final_elbo, self.data_description
self.model, guide, self.params, self.privacy_params, self.data_description
)

with tempfile.TemporaryFile("w+b") as f:
@@ -329,7 +348,6 @@ def test_store_and_load_with_loadable_autoguide(self) -> None:
)
)
self.assertEqual(self.privacy_params, loaded_result.privacy_level)
self.assertEqual(self.final_elbo, loaded_result.final_elbo)
self.assertEqual(self.data_description, loaded_result.data_description)

result_samples = result.generate(d3p.random.PRNGKey(567), 10, 1)
106 changes: 96 additions & 10 deletions tests/napsu_mq/napsu_mq_test.py
@@ -23,7 +23,7 @@
import pytest
from tempfile import NamedTemporaryFile, TemporaryFile
from binary_logistic_regression_generator import BinaryLogisticRegressionDataGenerator
from twinify.napsu_mq.napsu_mq import NapsuMQResult, NapsuMQModel
from twinify.napsu_mq.napsu_mq import NapsuMQResult, NapsuMQModel, NapsuMQInferenceConfig
from twinify.napsu_mq.marginal_query import FullMarginalQuerySet
from twinify.dataframe_data import DataDescription

@@ -58,7 +58,7 @@ def test_NAPSUMQ_model_without_IO(self):
rng = d3p.random.PRNGKey(54363731)
inference_rng, sampling_rng = d3p.random.split(rng)

model = NapsuMQModel(required_marginals=required_marginals, use_laplace_approximation=False)
model = NapsuMQModel(forced_queries_in_automatic_selection=required_marginals)
result = model.fit(data=self.dataframe, rng=inference_rng, epsilon=1, delta=(self.n ** (-2)))

datasets = result.generate(
@@ -86,7 +86,7 @@ def test_NAPSUMQ_model_with_IO(self):

rng = d3p.random.PRNGKey(69700241)
inference_rng, sampling_rng = d3p.random.split(rng)
model = NapsuMQModel(required_marginals=required_marginals, use_laplace_approximation=False)
model = NapsuMQModel(forced_queries_in_automatic_selection=required_marginals)
result = model.fit(data=self.dataframe, rng=inference_rng, epsilon=1, delta=(self.n ** (-2)))

napsu_result_file = NamedTemporaryFile("wb")
@@ -133,7 +133,7 @@ def test_NAPSUMQ_model_for_storing_defects(self):

rng = d3p.random.PRNGKey(74249069)
inference_rng, sampling_rng = d3p.random.split(rng)
model = NapsuMQModel(required_marginals=required_marginals, use_laplace_approximation=False)
model = NapsuMQModel(forced_queries_in_automatic_selection=required_marginals)
result = model.fit(data=self.dataframe, rng=inference_rng, epsilon=1, delta=(self.n ** (-2)))

# Use the sampling rng with both generate calls to expect the same generation outcome
@@ -170,15 +170,46 @@ def test_NAPSUMQ_model_for_storing_defects(self):

# Takes about ~ 1 minute to run
@pytest.mark.slow
def test_NAPSUMQ_model_with_laplace_approximation_without_IO(self):
def test_NAPSUMQ_model_with_laplace_plus_mcmc_without_IO(self):
required_marginals = [
('A', 'B'), ('B', 'C'), ('A', 'C')
]

rng = d3p.random.PRNGKey(85532350)
inference_rng, sampling_rng = d3p.random.split(rng)

model = NapsuMQModel(required_marginals=required_marginals, use_laplace_approximation=True)
config = NapsuMQInferenceConfig(method="laplace+mcmc")
model = NapsuMQModel(forced_queries_in_automatic_selection=required_marginals, inference_config=config)
result = model.fit(data=self.dataframe, rng=inference_rng, epsilon=1, delta=(self.n ** (-2)))

datasets = result.generate(
rng=sampling_rng, num_data_per_parameter_sample=500, num_parameter_samples=5, single_dataframe=False
)

self.assertEqual(len(datasets), 5)
self.assertEqual(datasets[0].shape, (500, 3))

original_means = self.dataframe.mean()
original_stds = self.dataframe.std()

for i, df in enumerate(datasets):
means = df.mean()
stds = df.std()
pd.testing.assert_series_equal(means, original_means, check_exact=False, rtol=0.3)
pd.testing.assert_series_equal(stds, original_stds, check_exact=False, rtol=0.3)

# Takes about ~ 1 minute to run
@pytest.mark.slow
def test_NAPSUMQ_model_with_laplace_approximation_without_IO(self):
required_marginals = [
('A', 'B'), ('B', 'C'), ('A', 'C')
]

rng = d3p.random.PRNGKey(897236)
inference_rng, sampling_rng = d3p.random.split(rng)

config = NapsuMQInferenceConfig(method="laplace")
model = NapsuMQModel(forced_queries_in_automatic_selection=required_marginals, inference_config=config)
result = model.fit(data=self.dataframe, rng=inference_rng, epsilon=1, delta=(self.n ** (-2)))

datasets = result.generate(
@@ -207,7 +238,8 @@ def test_NAPSUMQ_model_without_IO_single_dataset(self):
rng = d3p.random.PRNGKey(85511235)
inference_rng, sampling_rng = d3p.random.split(rng)

model = NapsuMQModel(required_marginals=required_marginals, use_laplace_approximation=True)
config = NapsuMQInferenceConfig(method="laplace+mcmc")
model = NapsuMQModel(forced_queries_in_automatic_selection=required_marginals, inference_config=config)
result = model.fit(data=self.dataframe, rng=inference_rng, epsilon=1, delta=(self.n ** (-2)))

dataset = result.generate(
Expand Down Expand Up @@ -239,10 +271,9 @@ def test_NAPSUMQ_model_fit_rejects_pure_integer_data(self) -> None:

rng = d3p.random.PRNGKey(42)

model = NapsuMQModel(required_marginals=[])
model = NapsuMQModel(forced_queries_in_automatic_selection=[])
with self.assertRaises(ValueError):
model.fit(data=data, rng=rng, epsilon=1, delta=(n ** (-2)),
use_laplace_approximation=True)
model.fit(data=data, rng=rng, epsilon=1, delta=(n ** (-2)))


class TestNapsuMQResult(unittest.TestCase):
@@ -302,3 +333,58 @@ def test_store_and_load(self) -> None:
loaded_samples = loaded_result.generate(d3p.random.PRNGKey(15412), 100)

self.assertTrue(np.all(samples.values == loaded_samples.values))


class TestNapsuMQInferenceConfig(unittest.TestCase):

def test_correct_methods(self):
config = NapsuMQInferenceConfig()
config.method = "mcmc"
config.method = "laplace"
config.method = "laplace+mcmc"

def test_incorrect_methods(self):
config = NapsuMQInferenceConfig()
with pytest.raises(ValueError):
config.method = "hfjsdhfk"

def test_correct_no_laplace_config(self):
config = NapsuMQInferenceConfig()
config.method = "mcmc"
config.laplace_approximation_config = None

def test_incorrect_no_laplace_config(self):
config = NapsuMQInferenceConfig()
config.method = "laplace"
with pytest.raises(ValueError):
config.laplace_approximation_config = None

config = NapsuMQInferenceConfig()
config.method = "laplace+mcmc"
with pytest.raises(ValueError):
config.laplace_approximation_config = None

def test_correct_no_mcmc_config(self):
config = NapsuMQInferenceConfig()
config.method = "laplace"
config.mcmc_config = None

def test_incorrect_no_mcmc_config(self):
config = NapsuMQInferenceConfig()
config.method = "mcmc"
with pytest.raises(ValueError):
config.mcmc_config = None

config = NapsuMQInferenceConfig()
config.method = "laplace+mcmc"
with pytest.raises(ValueError):
config.mcmc_config = None

def test_remove_config_then_change_method(self):
config = NapsuMQInferenceConfig()
config.method = "mcmc"
config.laplace_approximation_config = None
with pytest.raises(ValueError):
config.method = "laplace"
with pytest.raises(ValueError):
config.method = "laplace+mcmc"
21 changes: 14 additions & 7 deletions twinify/base.py
@@ -15,22 +15,29 @@

import abc
import pandas as pd
from typing import Union, Optional, Iterable, BinaryIO
from typing import Union, Optional, Iterable, BinaryIO, Tuple, Any
import d3p.random

class InferenceModel(metaclass=abc.ABCMeta):
""" A statistical model to generate privacy-preserving synthetic twins data sets from sensitive data. """

@abc.abstractmethod
def fit(self, data: pd.DataFrame, rng: d3p.random.PRNGState, epsilon: float, delta: float, **kwargs) -> 'InferenceResult':
def fit(self,
data: pd.DataFrame,
rng: d3p.random.PRNGState,
epsilon: float,
delta: float,
show_progress: bool,
return_diagnostics: bool) -> Union['InferenceResult', Tuple['InferenceResult', Any]]:
""" Compute the parameter posterior (approximation) for a given data set, hyperparameters and privacy bounds.

Args:
data: A `pandas.DataFrame` containing (sensitive) data.
rng: A seeded state for the d3p.random secure random number generator.
epsilon: Privacy bound ε.
delta: Privacy bound δ.
kwargs: Optional (model specific) hyperparameters.
data (pd.DataFrame): A `pandas.DataFrame` containing (sensitive) data.
rng (d3p.random.PRNGState): A seeded state for the d3p.random secure random number generator.
epsilon (float): Privacy bound ε.
delta (float): Privacy bound δ.
show_progress (bool): Display progress bars.
return_diagnostics (bool): Return diagnostics from inference.
"""
pass
