diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index c59e57b6..b08927db 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -3,13 +3,15 @@
 # R specific hooks: https://github.com/lorenzwalthert/precommit
 repos:
   - repo: https://github.com/lorenzwalthert/precommit
-    rev: v0.3.2.9003
+    rev: v0.3.2.9019
     hooks:
       - id: style-files
         args: []
       - id: roxygenize
         additional_dependencies:
           - knitr
+          - stan-dev/cmdstanr@*release
+          - mvtnorm
       # codemeta must be above use-tidy-description when both are used
       # -   id: codemeta-description-updated
       - id: use-tidy-description
@@ -49,11 +51,11 @@ repos:
       - id: no-browser-statement
       - id: deps-in-desc
   - repo: https://github.com/pre-commit/mirrors-prettier
-    rev: v3.0.0-alpha.3
+    rev: v3.0.1
     hooks:
       - id: prettier
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.3.0
+    rev: v4.4.0
     hooks:
       - id: check-added-large-files
         args: ["--maxkb=200"]
@@ -79,7 +81,7 @@ repos:
         files: '\.Rhistory|\.RData|\.Rds|\.rds$'
         # `exclude: <regex>` to allow committing specific files.
   - repo: https://github.com/igorshubovych/markdownlint-cli
-    rev: v0.32.2
+    rev: v0.34.0
     hooks:
       - id: markdownlint
         args:
diff --git a/DESCRIPTION b/DESCRIPTION
index 4dfd5a24..86903442 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -42,7 +42,10 @@ Imports:
     glue,
     methods,
     graphics,
-    posterior
+    posterior,
+    generics,
+    Matrix,
+    mvtnorm
 Suggests:
     survival,
     flexsurv,
@@ -121,6 +124,7 @@ Collate:
     'sim_is_null_effect_covered.R'
     'sim_is_true_effect_covered.R'
     'sim_samplesize.R'
+    'simulate_data.R'
     'simvar_class.R'
     'trim_data_matrix.R'
     'uniform_prior.R'
diff --git a/NAMESPACE b/NAMESPACE
index 062d3593..8351686a 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -1,22 +1,28 @@
 # Generated by roxygen2: do not edit by hand
 
 export(add_covariates)
+export(baseline_covariates)
 export(bernoulli_prior)
 export(beta_prior)
 export(bin_var)
+export(binary_cutoff)
 export(borrowing_details)
 export(cauchy_prior)
 export(check_cmdstanr)
 export(check_data_matrix_has_columns)
 export(cont_var)
+export(covariance_matrix)
 export(create_analysis_obj)
+export(create_baseline_object)
 export(create_data_matrix)
 export(create_simulation_obj)
 export(eval_constraints)
 export(exp_surv_dist)
 export(exponential_prior)
 export(gamma_prior)
+export(generate)
 export(get_cmd_stan_models)
+export(get_quantiles)
 export(get_results)
 export(get_vars)
 export(half_cauchy_prior)
@@ -41,9 +47,12 @@ export(treatment_details)
 export(uniform_prior)
 export(variable_dictionary)
 export(weib_ph_surv_dist)
+exportMethods(generate)
 exportMethods(mcmc_sample)
 import(checkmate)
 import(cmdstanr)
+importFrom(Matrix,nearPD)
+importFrom(generics,generate)
 importFrom(graphics,axis)
 importFrom(graphics,lines)
 importFrom(graphics,par)
@@ -53,7 +62,9 @@ importFrom(methods,callNextMethod)
 importFrom(methods,is)
 importFrom(methods,new)
 importFrom(methods,show)
+importFrom(mvtnorm,rmvnorm)
 importFrom(stats,complete.cases)
 importFrom(stats,formula)
 importFrom(stats,model.matrix)
+importFrom(stats,pnorm)
 importFrom(stats,setNames)
diff --git a/R/cmdstan.R b/R/cmdstan.R
index feb46d72..5a94df06 100644
--- a/R/cmdstan.R
+++ b/R/cmdstan.R
@@ -1,18 +1,21 @@
 #' Check Stan
 #'
 #' @description Check whether `cmdstanr` is available and prints version and logistic example.
+#' @param check_sampling Compile and sample from the "logistic" example model.
 #'
 #' @export
 #'
 #' @examples
 #' check_cmdstanr()
-check_cmdstanr <- function() {
+check_cmdstanr <- function(check_sampling = FALSE) {
   cat("cmdstan version: \n")
   print(cmdstan_version())
 
   cat("Example program:\n")
   print_example_program(example = c("logistic"))
 
-  cat("Example program results:\n")
-  cmdstanr_example("logistic", quiet = FALSE, chains = 1)
+  if (check_sampling) {
+    cat("Example program results:\n")
+    cmdstanr_example("logistic", quiet = FALSE, chains = 1)
+  }
 }
diff --git a/R/generics.R b/R/generics.R
index b57d0ad0..488b2d4c 100644
--- a/R/generics.R
+++ b/R/generics.R
@@ -1,4 +1,3 @@
-
 #' Plot Prior Objects
 #'
 #' Plot prior distributions as densities. Continuous distributions are plotted as curves and
@@ -85,3 +84,13 @@ setGeneric("get_results", function(object) standardGeneric("get_results"))
 #' @export
 #'
 setGeneric("get_cmd_stan_models", function(object) standardGeneric("get_cmd_stan_models"))
+
+#' Generate Data from Object
+#'
+#' @param x object
+#' @param ... Other arguments passed to methods
+#'
+#' @rdname generate
+#'
+#' @export
+setGeneric("generate", function(x, ...) standardGeneric("generate"))
diff --git a/R/simulate_data.R b/R/simulate_data.R
new file mode 100644
index 00000000..d72aadf6
--- /dev/null
+++ b/R/simulate_data.R
@@ -0,0 +1,359 @@
+#' Create Covariance Matrix
+#'
+#' @param diag Diagonal entries of the covariance matrix
+#' @param upper_tri Upper triangle entries of the matrix, specified column wise.
+#'
+#' @return A symmetric matrix with `diag` values on the main diagonal and
+#' `upper_tri` values in the lower and upper triangles.
+#' @export
+#'
+#' @examples
+#' m1 <- covariance_matrix(c(1, 1, 1, 1), c(.8, .3, .8, 0, 0, 0))
+#' m1
+#' mvtnorm::rmvnorm(5, mean = c(0, 0, 0, 0), sigma = m1)
+#'
+#' # No correlation
+#' covariance_matrix(c(1, 2, 3))
+#'
+#' @importFrom Matrix nearPD
+covariance_matrix <- function(diag, upper_tri) {
+  assert_numeric(diag, lower = 0)
+  dim <- length(diag)
+  if (missing(upper_tri)) upper_tri <- rep(0, sum(seq.int(1, dim - 1)))
+
+  if (dim == 1) {
+    cov_mat <- matrix(diag)
+  } else if (dim > 1) {
+    assert_numeric(upper_tri, len = sum(seq.int(1, dim - 1)))
+    cov_mat <- diag(diag, nrow = dim, ncol = dim)
+    cov_mat[upper.tri(cov_mat)] <- upper_tri
+    cov_mat[lower.tri(cov_mat)] <- t(cov_mat)[lower.tri(cov_mat)]
+  }
+  if (any(eigen(cov_mat, only.values = TRUE)$values < 0)) {
+    warning(
+      "Provided parameters do not define a semi-positive definite matrix. ",
+      "Finding nearest positive definite matrix with Matrix::nearPD()."
+    )
+    cov_mat <- Matrix::nearPD(cov_mat, ensureSymmetry = TRUE, keepDiag = TRUE, base.matrix = TRUE)$mat
+  }
+  cov_mat
+}
+
+
+#' Specify Correlated Baseline Covariates
+#'
+#' Set parameters to generate correlated multivariate normal data for internal and external patients.
+#'
+#' @param names character vector of variable names.
+#' @param means_int numeric vector of means for internal patients. Must have same length as `names`
+#' @param means_ext numeric vector of means for external patients. Must have same length as `names`
+#' @param covariance_int variance-covariance matrix for generating multivariate normal for internal patients.
+#' Must be square matrix with same number of rows and `length(names)`
+#' @param covariance_ext variance-covariance matrix for generating multivariate normal data for external patients.
+#' Must be square matrix with same number of rows and `length(names)`
+#'
+#' @return [BaselineObject][BaselineObject-class] to build simulated dataset
+#' @export
+#'
+#' @examples
+#' corr_covs <- baseline_covariates(
+#'   names = c("b1", "b2"),
+#'   means_int = c(5, 25),
+#'   covariance_int = covariance_matrix(diag = c(1, 1), upper_tri = 0.4)
+#' )
+baseline_covariates <- function(names,
+                                means_int,
+                                means_ext = means_int,
+                                covariance_int,
+                                covariance_ext = covariance_int) {
+  assert_character(names)
+  n <- length(names)
+  assert_numeric(means_int, finite = TRUE, len = n, any.missing = FALSE)
+  assert_numeric(means_ext, finite = TRUE, len = n, any.missing = FALSE)
+  assert_matrix(covariance_int, nrows = n, ncols = n, any.missing = FALSE)
+  assert_matrix(covariance_ext, nrows = n, ncols = n, any.missing = FALSE)
+
+  .correlated_covariates(
+    names = names,
+    means_int = means_int,
+    means_ext = means_ext,
+    covariance_int = covariance_int,
+    covariance_ext = covariance_ext
+  )
+}
+
+
+.correlated_covariates <- setClass(
+  "CorrelatedCovariates",
+  slots = c(
+    names = "character",
+    means_int = "numeric",
+    means_ext = "numeric",
+    covariance_int = "matrix",
+    covariance_ext = "matrix"
+  )
+)
+
+
+# BaselineObject -----
+
+#' `BaselineObject` class for data simulation
+#'
+#' @slot n_trt_int integer. Number of internal treated patients
+#' @slot n_ctrl_int integer. Number of internal control patients
+#' @slot n_ctrl_ext integer. Number of external control patients
+#' @slot covariates list. List of correlated covariates objects, see [baseline_covariates()]
+#' @slot transformations list. List of named transformation functions.
+#'
+.baseline_object <- setClass(
+  "BaselineObject",
+  slots = c(
+    n_trt_int = "integer",
+    n_ctrl_int = "integer",
+    n_ctrl_ext = "integer",
+    covariates = "list",
+    transformations = "list"
+  )
+)
+
+#' Create Baseline Data Simulation Object
+#'
+#' @param n_trt_int Number of internal treated patients
+#' @param n_ctrl_int Number of internal control patients
+#' @param n_ctrl_ext Number of external control patients
+#' @param covariates List of correlated covariates objects, see [baseline_covariates()]
+#' @param transformations List of named transformation functions.
+#'
+#' @details
+#' Transformation functions are evaluated in order and create or overwrite a column
+#' in the data.frame with that name. The function should take a `data.frame` (specifically
+#' a `BaselineDataFrame` object from `generate(BaselineObject)`) and return a vector with
+#' length identical to the total number of patients.
+#' The `@BaselineObject` slot may be accessed directly or with [get_quantiles()] to
+#' create transformations. See [binary_cutoff()]
+#'
+#' @return A [BaselineObject][BaselineObject-class]
+#' @export
+#'
+#' @examples
+#' bl_no_covs <- create_baseline_object(
+#'   n_trt_int = 100,
+#'   n_ctrl_int = 50,
+#'   n_ctrl_ext = 100
+#' )
+#'
+#'
+#' bl_biomarkers <- create_baseline_object(
+#'   n_trt_int = 100,
+#'   n_ctrl_int = 50,
+#'   n_ctrl_ext = 100,
+#'   covariates = baseline_covariates(
+#'     c("b1", "b2", "b3"),
+#'     means_int = c(0, 0, 0),
+#'     covariance_int = covariance_matrix(c(1, 1, 1), c(.8, .3, .8))
+#'   ),
+#'   transformations = list(
+#'     exp_b1 = function(data) exp(data$b1),
+#'     b2 = binary_cutoff("b2", int_cutoff = 0.7, ext_cutoff = 0.5)
+#'   )
+#' )
+#'
+create_baseline_object <- function(n_trt_int, n_ctrl_int, n_ctrl_ext, covariates, transformations) {
+  assert_integerish(n_trt_int, len = 1, lower = 1)
+  assert_integerish(n_ctrl_int, len = 1, lower = 1)
+  assert_integerish(n_ctrl_ext, len = 1, lower = 1)
+
+  if (!missing(covariates)) {
+    if (is(covariates, "CorrelatedCovariates")) covariates <- list(covariates)
+    assert_list(covariates)
+  } else {
+    covariates <- list(.correlated_covariates())
+  }
+
+  if (!missing(transformations)) {
+    assert_list(transformations)
+  } else {
+    transformations <- list()
+  }
+
+  .baseline_object(
+    n_trt_int = as.integer(n_trt_int),
+    n_ctrl_int = as.integer(n_ctrl_int),
+    n_ctrl_ext = as.integer(n_ctrl_ext),
+    covariates = covariates,
+    transformations = transformations
+  )
+}
+
+#' @importFrom generics generate
+#' @importFrom mvtnorm rmvnorm
+# nolint start
+generate.BaselineObject <- function(x, ...) {
+  # nolint end
+  arm_data <- data.frame(
+    patid = seq_len(x@n_trt_int + x@n_ctrl_int + x@n_ctrl_ext),
+    ext = rep(c(0, 1), times = c(x@n_trt_int + x@n_ctrl_int, x@n_ctrl_ext)),
+    trt = rep(c(1, 0), times = c(x@n_trt_int, x@n_ctrl_int + x@n_ctrl_ext))
+  )
+
+  # If any covariates are defined, generate multivariate normal data and combine with arm data
+  cov_defined <- vapply(x@covariates, function(x) length(x@names) > 0, logical(1L))
+  if (any(cov_defined)) {
+    cor_data_list <- lapply(
+      x@covariates[cov_defined],
+      function(cor_cov, n_int = x@n_trt_int + x@n_ctrl_int, n_ext = x@n_ctrl_ext) {
+        mvnorm_data <- rbind(
+          mvtnorm::rmvnorm(n = n_int, mean = cor_cov@means_int, sigma = cor_cov@covariance_int),
+          mvtnorm::rmvnorm(n = n_ext, mean = cor_cov@means_ext, sigma = cor_cov@covariance_ext)
+        )
+        colnames(mvnorm_data) <- cor_cov@names
+        as.data.frame(mvnorm_data)
+      }
+    )
+    arm_data <- cbind(arm_data, do.call("cbind", cor_data_list))
+  }
+
+  bl_df <- .baseline_dataframe(arm_data, BaselineObject = x)
+
+  # For each named transformation, either create a new column or overwrite
+  for (i in names(x@transformations)) {
+    var_index <- match(i, bl_df@names)
+    if (!is.na(var_index)) {
+      bl_df@.Data[[var_index]] <- x@transformations[[i]](bl_df)
+    } else {
+      next_index <- length(bl_df@names) + 1
+      bl_df@.Data[[next_index]] <- x@transformations[[i]](bl_df)
+      bl_df@names[next_index] <- i
+    }
+  }
+  bl_df
+}
+
+
+#' Generate Data for a `BaselineObject`
+#'
+#' @param x a `BaselineObject` object created by [create_baseline_object]
+#' @param ... additional parameters are ignored
+#'
+#' @return A [BaselineDataFrame][psborrow2::BaselineDataFrame-class] object
+#' @export
+#'
+#' @examples
+#' bl_biomarkers <- create_baseline_object(
+#'   n_trt_int = 100,
+#'   n_ctrl_int = 50,
+#'   n_ctrl_ext = 100,
+#'   covariates = baseline_covariates(
+#'     c("b1", "b2", "b3"),
+#'     means_int = c(0, 0, 0),
+#'     covariance_int = covariance_matrix(c(1, 1, 1), c(.8, .3, .8))
+#'   ),
+#'   transformations = list(
+#'     exp_b1 = function(data) exp(data$b1),
+#'     b2 = binary_cutoff("b2", int_cutoff = 0.7, ext_cutoff = 0.5)
+#'   )
+#' )
+#' generate(bl_biomarkers)
+setMethod(
+  f = "generate",
+  signature = "BaselineObject",
+  definition = generate.BaselineObject
+)
+
+
+#' Baseline Data Frame Object
+#'
+#' A `data.frame` with additional slot containing simulation definition
+#'
+#' @slot BaselineObject Simulated covariates definitions as `BaselineObject`. See [create_baseline_object()]
+#'
+#' @return A `BaselineDataFrame`
+.baseline_dataframe <- setClass(
+  "BaselineDataFrame",
+  contains = "data.frame",
+  slots = c(
+    BaselineObject = "BaselineObject"
+  )
+)
+
+# Validity checks for Baseline Object
+setValidity("BaselineObject", function(object) {
+  msg <- NULL
+  c(msg, check_list(object@covariates, types = "CorrelatedCovariates"))
+  c(msg, check_list(object@transformations, types = "function", names = "named"))
+  if (is.null(msg)) TRUE else msg
+})
+
+# show method
+setMethod(
+  f = "show",
+  signature = "BaselineDataFrame",
+  definition = function(object) {
+    cat("Simulated Baseline Data")
+    cat("\n")
+    print.data.frame(object)
+  }
+)
+
+#' Get Quantiles of Random Data
+#'
+#' Helper for use within transformation functions for [create_baseline_object()].
+#'
+#' @param object a `BaselineDataFrame`
+#' @param var character string name of the variable
+#'
+#' @return A numeric vector containing quantiles based on the data
+#'  generating distribution.
+#' @export
+#' @importFrom stats pnorm
+#'
+get_quantiles <- function(object, var) {
+  assert_class(object, "BaselineDataFrame")
+
+  covs <- object@BaselineObject@covariates
+  names <- unlist(lapply(covs, function(x) x@names))
+  assert_subset(var, choices = names)
+
+  index <- which(names == var)
+
+  mean_int <- unlist(lapply(covs, function(x) x@means_int))[index]
+  mean_ext <- unlist(lapply(covs, function(x) x@means_ext))[index]
+
+  sd_int <- unlist(lapply(covs, function(x) sqrt(diag(x@covariance_int))))[index]
+  sd_ext <- unlist(lapply(covs, function(x) sqrt(diag(x@covariance_ext))))[index]
+
+  ifelse(
+    object[["ext"]] == 0,
+    pnorm(object[[var]], mean_int, sd_int),
+    pnorm(object[[var]], mean_ext, sd_ext)
+  )
+}
+
+
+#' Binary Cut-Off Transformation
+#'
+#' @param name variable to transform
+#' @param int_cutoff cut-off for internal patients, numeric between 0 and 1
+#' @param ext_cutoff cut-off for external patients, numeric between 0 and 1
+#'
+#' @return Transformation function to be used in [create_baseline_object()].
+#'  Sets quantile values larger than cut-off value to `TRUE` otherwise `FALSE`.
+#' @export
+#' @examples
+#' # Creates a simple function, where `data` is a `BaselineDataFrame`:
+#' function(data) {
+#'   ext <- data$ext == 0
+#'   q <- get_quantiles(data, name)
+#'   ifelse(ext, q > int_cutoff, q > ext_cutoff)
+#' }
+#'
+binary_cutoff <- function(name, int_cutoff, ext_cutoff) {
+  assert_character(name, len = 1, any.missing = FALSE)
+  assert_numeric(int_cutoff, lower = 0, upper = 1, any.missing = FALSE, len = 1)
+  assert_numeric(ext_cutoff, lower = 0, upper = 1, any.missing = FALSE, len = 1)
+  function(data) {
+    ext <- data$ext == 0
+    q <- get_quantiles(data, name)
+    as.integer(ifelse(ext, q > int_cutoff, q > ext_cutoff))
+  }
+}
diff --git a/_pkgdown.yml b/_pkgdown.yml
index 47c7b920..92ca693d 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -30,3 +30,4 @@ articles:
       - "`propensity_scores`"
       - "`overview_slides`"
       - "`weighting`"
+      - "`data_simulation`"
diff --git a/design/Simulate_data_baseline.Rmd b/design/Simulate_data_baseline.Rmd
new file mode 100644
index 00000000..7e1417ea
--- /dev/null
+++ b/design/Simulate_data_baseline.Rmd
@@ -0,0 +1,107 @@
+---
+title: "Simulate Data: Baseline"
+author: "Isaac Gravestock"
+date: "2023-03-31"
+output: html_document
+---
+
+# 1. Baseline Design
+
+To prepare for simulating survival data, we need to first create the baseline data for the cohort.
+
+Requirements:
+
+1. number of patients in the 3 arms
+2. arbitrary number of correlated variables
+3. transform to binary variables by cut-off value
+4. set names
+5. set means for internal and external arms
+
+```{r, eval = FALSE}
+# generate correlated variables
+correlated_covs <- add_covariates(
+  names = c("x1", "x2", "x3"),
+  means_int = c(0, 0, 0),
+  means_ext = means,
+  covariance = covariance_matrix(diag = c(1, 1, 1), upper_tri = c(0, .2, .3))
+)
+
+# generate uncorrelated variables
+indep_covs <- add_covariates(
+  names = c("age", "sex"),
+  means = c(60, 0.5),
+  means_ext = means,
+  covariance = covariance_matrix(diag = c(1, 1), upper_tri = c(0, 0))
+)
+
+baseline_obj <- sim_baseline(
+  sample_size = c(trt_int = 100, ctrl_int = 50, ctrl_ext = 200),
+  covariates = list(correlated_covs, indep_covs)
+) %>% transform(
+  x1 = binary_tranform(x1, cut_off = 0.4, cut_off_ext = 0.45),
+  age2 = \(data) data$age^2
+)
+
+
+
+baseline_obj <- sim_baseline(
+  sample_size = c(trt_int = 100, ctrl_int = 50, ctrl_ext = 200),
+  covariates = list(correlated_covs, indep_covs),
+  transformations = list(
+    x1 = binary_tranform(cut_off = 0.4, cut_off_ext = 0.45),
+    age2 = \(data) data$age^2
+  )
+)
+```
+
+`sim_baseline` is enough to specify a cohort to generate survival times.
+Implementation of the above functions can be a first phase, generating survival data will be in another
+design doc
+
+# Functions descriptions
+
+```{r}
+# Specify covariates to include in baseline
+#
+# names: covariates names
+# means_int: means for internal trial patients
+# means_ext: means for external patients
+# covariance_int: covariance matrix for multivariate normal for internal
+# covariance_ext: covariance matrix for multivariate normal for external
+#
+# Returns object capturing these parameters for later processing
+add_covariates(names, means_int, means_ext, covariance_int, covariance_ext)
+
+# Transform variables
+#
+# ... Arguments corresponding to the names in `add_covariates` taking functions
+#  which takes a list of vectors of percentiles (int, ext), transforms them, and returns
+#  a list of vectors of the same dimension.
+#
+#  Returns a modified covariates object
+transform(...)
+
+binary_tranform <- function(cut_off_int, cut_off_ext) {
+  function(qlist) {
+    list(int = qlist$int > cut_off_int, ext = qlist$ext > cut_off_ext)
+  }
+}
+
+
+# Covariance matrix
+#
+# diag: variances to put in the main diagonal
+# upper_tri: covariance terms to put in the upper and lower triangles. Given by column
+covariance_matrix(diag, upper_tri)
+
+# Simulate Baseline
+#
+# sample_size: named vector of sample sizes (maybe these should be directly parameters)
+# covariates: a list of covariates objects
+#
+# Constructs the multivariate normal distribution by combining the means and covariance matrices blockwise.
+# Generates a data frame using mvrnorm().
+# Any variables to be transformed and calculated back to percentiles using marginal distribution and then
+# transformed.
+sim_baseline(sample_size, covariates)
+```
diff --git a/design/Simulate_data_survival.Rmd b/design/Simulate_data_survival.Rmd
new file mode 100644
index 00000000..e55920b9
--- /dev/null
+++ b/design/Simulate_data_survival.Rmd
@@ -0,0 +1,73 @@
+---
+title: 'Simulate Data: Survival'
+author: "Isaac Gravestock"
+date: "2023-06-20"
+output: html_document
+---
+
+Continuing from the design of baseline covariates, we consider how to implement survival data simulation.
+
+# 2. Survival Design
+
+For survival we need to specify in addition:
+
+ - baseline hazard parameters
+ - HR:
+   - internal/external HR "drift"
+   - covariate HR coefficients
+   - treatment effect HR
+
+```{r}
+# each of these builds up the object
+# nothing needs to generate any data
+#
+# implement generate() method for all of them
+# generate methods are recursive,
+# highest level would take n=N repeat from highest level
+#
+
+
+baseline_obj %>%
+  design_matrx(~ sex + trt + ext) %>%
+  linear_predictor(betas = c(sex = 0.3, trt = 1, ext = 1.1)) %>%
+  simulate_survival(hazard_model = weibull_ph(shape = 1, scale = 0.9)) %>%
+  survival_matrix() %>% # think of better name....
+  generate(n = 100)
+
+generate(baseline_obj()) %>%
+  mutate(age2 = age^2) %>%
+  design_matrix(~ sex + trt + ext)
+
+baseline_design <- design_matrix(baseline_obj, ~ sex + I(age^2) + trt + ext, preprocess_fun = function(data) data)
+baseline_design2 <- design_matrix(baseline_obj, ~., preprocess_fun = function(data) data$age2 <- data$age^2)
+colnames(baseline_design)
+
+
+baseline_predictor <- linear_predictor(
+  design = baseline_design, # matrix
+  betas = c(
+    sex = .3,
+    "I(age^2)" = .4,
+    trt = 1,
+    ext = 1.1
+  )
+)
+
+simulate_survival(
+  baseline_predictor,
+  hazard_model = weibull_ph(shape = 0.76, lambda = 1) # is a function that takes vector of LPs and gives vector of times
+)
+
+survival_matrix(
+  baseline_design,
+  surv_times = simulate_survival(baseline_predictor, hazard_model = weibull_ph(shape = 0.76, lambda = 1)),
+  censor_times = censor(),
+  recruitment = recruitment()
+)
+```
+
+Refer to psborrow for how these can be calculated and combined.
+
+- recruitment rate
+- cut off
+- censoring
diff --git a/man/BaselineDataFrame-class.Rd b/man/BaselineDataFrame-class.Rd
new file mode 100644
index 00000000..f4b57056
--- /dev/null
+++ b/man/BaselineDataFrame-class.Rd
@@ -0,0 +1,19 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/simulate_data.R
+\docType{class}
+\name{BaselineDataFrame-class}
+\alias{BaselineDataFrame-class}
+\alias{.baseline_dataframe}
+\title{Baseline Data Frame Object}
+\value{
+A \code{BaselineDataFrame}
+}
+\description{
+A \code{data.frame} with additional slot containing simulation definition
+}
+\section{Slots}{
+
+\describe{
+\item{\code{BaselineObject}}{Simulated covariates definitions as \code{BaselineObject}. See \code{\link[=create_baseline_object]{create_baseline_object()}}}
+}}
+
diff --git a/man/BaselineObject-class.Rd b/man/BaselineObject-class.Rd
new file mode 100644
index 00000000..4c4ede19
--- /dev/null
+++ b/man/BaselineObject-class.Rd
@@ -0,0 +1,24 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/simulate_data.R
+\docType{class}
+\name{BaselineObject-class}
+\alias{BaselineObject-class}
+\alias{.baseline_object}
+\title{\code{BaselineObject} class for data simulation}
+\description{
+\code{BaselineObject} class for data simulation
+}
+\section{Slots}{
+
+\describe{
+\item{\code{n_trt_int}}{integer. Number of internal treated patients}
+
+\item{\code{n_ctrl_int}}{integer. Number of internal control patients}
+
+\item{\code{n_ctrl_ext}}{integer. Number of external control patients}
+
+\item{\code{covariates}}{list. List of correlated covariates objects, see \code{\link[=baseline_covariates]{baseline_covariates()}}}
+
+\item{\code{transformations}}{list. List of named transformation functions.}
+}}
+
diff --git a/man/baseline_covariates.Rd b/man/baseline_covariates.Rd
new file mode 100644
index 00000000..db83bd21
--- /dev/null
+++ b/man/baseline_covariates.Rd
@@ -0,0 +1,40 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/simulate_data.R
+\name{baseline_covariates}
+\alias{baseline_covariates}
+\title{Specify Correlated Baseline Covariates}
+\usage{
+baseline_covariates(
+  names,
+  means_int,
+  means_ext = means_int,
+  covariance_int,
+  covariance_ext = covariance_int
+)
+}
+\arguments{
+\item{names}{character vector of variable names.}
+
+\item{means_int}{numeric vector of means for internal patients. Must have same length as \code{names}}
+
+\item{means_ext}{numeric vector of means for external patients. Must have same length as \code{names}}
+
+\item{covariance_int}{variance-covariance matrix for generating multivariate normal for internal patients.
+Must be square matrix with same number of rows and \code{length(names)}}
+
+\item{covariance_ext}{variance-covariance matrix for generating multivariate normal data for external patients.
+Must be square matrix with same number of rows and \code{length(names)}}
+}
+\value{
+\link[=BaselineObject-class]{BaselineObject} to build simulated dataset
+}
+\description{
+Set parameters to generate correlated multivariate normal data for internal and external patients.
+}
+\examples{
+corr_covs <- baseline_covariates(
+  names = c("b1", "b2"),
+  means_int = c(5, 25),
+  covariance_int = covariance_matrix(diag = c(1, 1), upper_tri = 0.4)
+)
+}
diff --git a/man/binary_cutoff.Rd b/man/binary_cutoff.Rd
new file mode 100644
index 00000000..52af170d
--- /dev/null
+++ b/man/binary_cutoff.Rd
@@ -0,0 +1,31 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/simulate_data.R
+\name{binary_cutoff}
+\alias{binary_cutoff}
+\title{Binary Cut-Off Transformation}
+\usage{
+binary_cutoff(name, int_cutoff, ext_cutoff)
+}
+\arguments{
+\item{name}{variable to transform}
+
+\item{int_cutoff}{cut-off for internal patients, numeric between 0 and 1}
+
+\item{ext_cutoff}{cut-off for external patients, numeric between 0 and 1}
+}
+\value{
+Transformation function to be used in \code{\link[=create_baseline_object]{create_baseline_object()}}.
+Sets quantile values larger than cut-off value to \code{TRUE} otherwise \code{FALSE}.
+}
+\description{
+Binary Cut-Off Transformation
+}
+\examples{
+# Creates a simple function, where `data` is a `BaselineDataFrame`:
+function(data) {
+  ext <- data$ext == 0
+  q <- get_quantiles(data, name)
+  ifelse(ext, q > int_cutoff, q > ext_cutoff)
+}
+
+}
diff --git a/man/check_cmdstanr.Rd b/man/check_cmdstanr.Rd
index 20088067..9c5dc749 100644
--- a/man/check_cmdstanr.Rd
+++ b/man/check_cmdstanr.Rd
@@ -4,7 +4,10 @@
 \alias{check_cmdstanr}
 \title{Check Stan}
 \usage{
-check_cmdstanr()
+check_cmdstanr(check_sampling = FALSE)
+}
+\arguments{
+\item{check_sampling}{Compile and sample from the "logistic" example model.}
 }
 \description{
 Check whether \code{cmdstanr} is available and prints version and logistic example.
diff --git a/man/covariance_matrix.Rd b/man/covariance_matrix.Rd
new file mode 100644
index 00000000..edd9d4d6
--- /dev/null
+++ b/man/covariance_matrix.Rd
@@ -0,0 +1,29 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/simulate_data.R
+\name{covariance_matrix}
+\alias{covariance_matrix}
+\title{Create Covariance Matrix}
+\usage{
+covariance_matrix(diag, upper_tri)
+}
+\arguments{
+\item{diag}{Diagonal entries of the covariance matrix}
+
+\item{upper_tri}{Upper triangle entries of the matrix, specified column wise.}
+}
+\value{
+A symmetric matrix with \code{diag} values on the main diagonal and
+\code{upper_tri} values in the lower and upper triangles.
+}
+\description{
+Create Covariance Matrix
+}
+\examples{
+m1 <- covariance_matrix(c(1, 1, 1, 1), c(.8, .3, .8, 0, 0, 0))
+m1
+mvtnorm::rmvnorm(5, mean = c(0, 0, 0, 0), sigma = m1)
+
+# No correlation
+covariance_matrix(c(1, 2, 3))
+
+}
diff --git a/man/create_baseline_object.Rd b/man/create_baseline_object.Rd
new file mode 100644
index 00000000..f1d349cf
--- /dev/null
+++ b/man/create_baseline_object.Rd
@@ -0,0 +1,63 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/simulate_data.R
+\name{create_baseline_object}
+\alias{create_baseline_object}
+\title{Create Baseline Data Simulation Object}
+\usage{
+create_baseline_object(
+  n_trt_int,
+  n_ctrl_int,
+  n_ctrl_ext,
+  covariates,
+  transformations
+)
+}
+\arguments{
+\item{n_trt_int}{Number of internal treated patients}
+
+\item{n_ctrl_int}{Number of internal control patients}
+
+\item{n_ctrl_ext}{Number of external control patients}
+
+\item{covariates}{List of correlated covariates objects, see \code{\link[=baseline_covariates]{baseline_covariates()}}}
+
+\item{transformations}{List of named transformation functions.}
+}
+\value{
+A \link[=BaselineObject-class]{BaselineObject}
+}
+\description{
+Create Baseline Data Simulation Object
+}
+\details{
+Transformation functions are evaluated in order and create or overwrite a column
+in the data.frame with that name. The function should take a \code{data.frame} (specifically
+a \code{BaselineDataFrame} object from \code{generate(BaselineObject)}) and return a vector with
+length identical to the total number of patients.
+The \verb{@BaselineObject} slot may be accessed directly or with \code{\link[=get_quantiles]{get_quantiles()}} to
+create transformations. See \code{\link[=binary_cutoff]{binary_cutoff()}}
+}
+\examples{
+bl_no_covs <- create_baseline_object(
+  n_trt_int = 100,
+  n_ctrl_int = 50,
+  n_ctrl_ext = 100
+)
+
+
+bl_biomarkers <- create_baseline_object(
+  n_trt_int = 100,
+  n_ctrl_int = 50,
+  n_ctrl_ext = 100,
+  covariates = baseline_covariates(
+    c("b1", "b2", "b3"),
+    means_int = c(0, 0, 0),
+    covariance_int = covariance_matrix(c(1, 1, 1), c(.8, .3, .8))
+  ),
+  transformations = list(
+    exp_b1 = function(data) exp(data$b1),
+    b2 = binary_cutoff("b2", int_cutoff = 0.7, ext_cutoff = 0.5)
+  )
+)
+
+}
diff --git a/man/generate-BaselineObject-method.Rd b/man/generate-BaselineObject-method.Rd
new file mode 100644
index 00000000..42f37977
--- /dev/null
+++ b/man/generate-BaselineObject-method.Rd
@@ -0,0 +1,36 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/simulate_data.R
+\name{generate,BaselineObject-method}
+\alias{generate,BaselineObject-method}
+\title{Generate Data for a \code{BaselineObject}}
+\usage{
+\S4method{generate}{BaselineObject}(x, ...)
+}
+\arguments{
+\item{x}{a \code{BaselineObject} object created by \link{create_baseline_object}}
+
+\item{...}{additional parameters are ignored}
+}
+\value{
+A \link[=BaselineDataFrame-class]{BaselineDataFrame} object
+}
+\description{
+Generate Data for a \code{BaselineObject}
+}
+\examples{
+bl_biomarkers <- create_baseline_object(
+  n_trt_int = 100,
+  n_ctrl_int = 50,
+  n_ctrl_ext = 100,
+  covariates = baseline_covariates(
+    c("b1", "b2", "b3"),
+    means_int = c(0, 0, 0),
+    covariance_int = covariance_matrix(c(1, 1, 1), c(.8, .3, .8))
+  ),
+  transformations = list(
+    exp_b1 = function(data) exp(data$b1),
+    b2 = binary_cutoff("b2", int_cutoff = 0.7, ext_cutoff = 0.5)
+  )
+)
+generate(bl_biomarkers)
+}
diff --git a/man/generate.Rd b/man/generate.Rd
new file mode 100644
index 00000000..213766ba
--- /dev/null
+++ b/man/generate.Rd
@@ -0,0 +1,16 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/generics.R
+\name{generate}
+\alias{generate}
+\title{Generate Data from Object}
+\usage{
+generate(x, ...)
+}
+\arguments{
+\item{x}{object}
+
+\item{...}{Other arguments passed to methods}
+}
+\description{
+Generate Data from Object
+}
diff --git a/man/get_quantiles.Rd b/man/get_quantiles.Rd
new file mode 100644
index 00000000..c3424aa2
--- /dev/null
+++ b/man/get_quantiles.Rd
@@ -0,0 +1,20 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/simulate_data.R
+\name{get_quantiles}
+\alias{get_quantiles}
+\title{Get Quantiles of Random Data}
+\usage{
+get_quantiles(object, var)
+}
+\arguments{
+\item{object}{a \code{BaselineDataFrame}}
+
+\item{var}{character string name of the variable}
+}
+\value{
+A numeric vector containing quantiles based on the data
+generating distribution.
+}
+\description{
+Helper for use within transformation functions for \code{\link[=create_baseline_object]{create_baseline_object()}}.
+}
diff --git a/tests/testthat/test-cmdstan.R b/tests/testthat/test-cmdstan.R
index 19dd31be..1d7c1cb5 100644
--- a/tests/testthat/test-cmdstan.R
+++ b/tests/testthat/test-cmdstan.R
@@ -1,8 +1,12 @@
 test_that("check_cmdstanr runs as expected", {
   expect_output(
-    result <- check_cmdstanr(),
+    check_cmdstanr(check_sampling = FALSE),
     "int<lower=0> N;",
     fixed = TRUE
   )
+
+  skip_on_ci()
+  skip_on_cran()
+  result <- check_cmdstanr(check_sampling = TRUE)
   expect_class(result, "CmdStanMCMC")
 })
diff --git a/tests/testthat/test-simulate_data.R b/tests/testthat/test-simulate_data.R
new file mode 100644
index 00000000..5ea9219f
--- /dev/null
+++ b/tests/testthat/test-simulate_data.R
@@ -0,0 +1,38 @@
+test_that("covariance_matrix works", {
+  result <- covariance_matrix(c(1:4), c(1, 0, 0, 0, 0, 0))
+  expected <- matrix(
+    c(
+      1, 1, 0, 0,
+      1, 2, 0, 0,
+      0, 0, 3, 0,
+      0, 0, 0, 4
+    ),
+    nrow = 4
+  )
+
+  expect_equal(result, expected)
+})
+
+test_that("covariance_matrix finds positive definite matrices", {
+  expect_warning(
+    result <- covariance_matrix(c(1:4), c(1, 4, -2, 1, 0, 0)),
+    "Finding nearest positive definite matrix"
+  )
+  expected <- matrix(
+    c(
+      1, -0.173717259157214, 1.55477286166658, 0.606677145265074,
+      -0.173717259157214, 2, -1.25191988052734, 0.120331114550728,
+      1.55477286166658, -1.25191988052734, 3, 0.250687992911699,
+      0.606677145265074, 0.120331114550728, 0.250687992911699, 4
+    ),
+    nrow = 4
+  )
+
+  expect_equal(result, expected)
+})
+
+
+test_that("covariance_matrix fails with incompatible arguments", {
+  expect_error(covariance_matrix(c(1:3), c(0, 0)))
+  expect_error(covariance_matrix(c("a", "b", "c")))
+})
diff --git a/vignettes/data_simulation.Rmd b/vignettes/data_simulation.Rmd
new file mode 100644
index 00000000..c6e599e4
--- /dev/null
+++ b/vignettes/data_simulation.Rmd
@@ -0,0 +1,104 @@
+---
+title: "8. Data Simulation"
+output: rmarkdown::html_vignette
+vignette: >
+  %\VignetteIndexEntry{8. Data Simulation}
+  %\VignetteEngine{knitr::rmarkdown}
+  %\VignetteEncoding{UTF-8}
+---
+
+```{r, include = FALSE}
+knitr::opts_chunk$set(
+  collapse = TRUE,
+  comment = "#>"
+)
+options("max.print" = 20)
+```
+
+This vignette will show the internal functions for generating data that can be used in simulation studies.
+
+```{r setup}
+library(psborrow2)
+```
+
+## Generating Baseline Data
+
+First we define how to generate baseline data for our study. In its simplest form we only need to define how many
+patients are in the arms: treated internal, control internal, control external.
+
+```{r}
+simple_baseline <- create_baseline_object(
+  n_trt_int = 100,
+  n_ctrl_int = 50,
+  n_ctrl_ext = 100
+)
+```
+
+This object defines how the data will be created. To actually generate a sample, use `generate()` to produce a
+`data.frame`.
+
+```{r}
+generate(simple_baseline)
+```
+
+Often we are interested in some additional covariates. Internally the package uses multivariate normal distributions,
+so we need to specify the mean and (co-)variance. We have the option to specify different distribution parameters
+for the internal and external arms. The internal arms are assumed to be randomized and therefore come from the same
+distribution. If no external parameters are specified, the internal ones are used for both.
+
+```{r}
+with_age <- create_baseline_object(
+  n_trt_int = 100,
+  n_ctrl_int = 50,
+  n_ctrl_ext = 100,
+  covariates = baseline_covariates(
+    names = "age",
+    means_int = 55,
+    covariance_int = covariance_matrix(5)
+  )
+)
+generate(with_age)
+```
+
+In a more complex setting, we can generate correlated baseline covariates.
+```{r}
+with_age_score <- create_baseline_object(
+  n_trt_int = 100,
+  n_ctrl_int = 50,
+  n_ctrl_ext = 100,
+  covariates = baseline_covariates(
+    names = c("age", "score"),
+    means_int = c(55, 5),
+    means_ext = c(60, 5),
+    covariance_int = covariance_matrix(c(5, 1))
+  )
+)
+data_age_score <- generate(with_age_score)
+```
+
+```{r}
+plot(
+  x = data_age_score$age,
+  y = data_age_score$score,
+  col = data_age_score$trt + 1,
+  pch = data_age_score$ext * 16,
+  xlab = "Age",
+  ylab = "Score"
+)
+legend(
+  "topright",
+  legend = c("internal treated", "internal control", "external control"),
+  col = c(2, 1, 1),
+  pch = c(0, 0, 16)
+)
+```
+
+Finally, we have the option to transform non-normal covariates, such as with binary cut-offs.
+These can be added to existing objects and can use built-in functions or used defined.
+```{r}
+with_age_score@transformations <- list(
+  score_high = binary_cutoff("score", int_cutoff = 0.7, ext_cutoff = 0.7),
+  score_rounded = function(data) round(data$score)
+)
+generate(with_age_score)
+```
diff --git a/vignettes/propensity_scores.Rmd b/vignettes/propensity_scores.Rmd
index fb8d5e36..5d93a451 100644
--- a/vignettes/propensity_scores.Rmd
+++ b/vignettes/propensity_scores.Rmd
@@ -27,78 +27,11 @@ library(psborrow2)
 ```
 
 Propensity scores (PS) methods offer various ways to adjust analyses for differences in groups of patients.
-
 @austin2013 discusses various approaches for using PS with survival analyses to obtain effect measures similar to
-randomized controlled trials.
-
-@wang2021 discuss using PS for IPTW, matching and stratification in combination with a Bayesian analysis. These methods
-allow for the separation of the design and analysis into two stages, which may be attractive in a regulatory setting.
-
+randomized controlled trials. @wang2021 discuss using PS for IPTW, matching and stratification in combination with a Bayesian analysis.
+These methods allow for the separation of the design and analysis into two stages, which may be attractive in a regulatory setting. 
 Another approach is the direct inclusion of the PS as a covariate in the outcome model.
 
-## Propensity Score Calculation
-
-Propensity scores are the probabilities that an observation belongs to one group or another, in this setting the
-internal or external patients. The PS models are fitted based on baseline covariates that could be confounders between
-outcome and being in the internal/external group.
-
-The simplest approach for calculating PS is a logistic generalized linear model. First, let's load some demo data from
-`psborrow2`.
-
-```{r data}
-example_dataframe <- as.data.frame(example_matrix)
-example_dataframe$int <- 1 - example_dataframe$ext
-```
-
-Let's now fit a logistic generalized linear model and add the PS as an additional covariate.
-
-```{r glm}
-ps_model <- glm(int ~ cov1 + cov2 + cov3 + cov4,
-  data = example_dataframe,
-  family = binomial
-)
-ps <- predict(ps_model, type = "response")
-example_dataframe$ps <- ps
-
-example_dataframe$ps_cat_ <- cut(
-  example_dataframe$ps,
-  breaks = 5,
-  include.lowest = TRUE
-)
-levels(example_dataframe$ps_cat_) <- c(
-  "ref", "low",
-  "low_med", "high_med", "high"
-)
-```
-
-```{r include = FALSE}
-example_matrix_ps <- create_data_matrix(
-  example_dataframe,
-  outcome = c("time", "cnsr"),
-  trt_flag_col = "trt",
-  ext_flag_col = "ext",
-  covariates = ~ ps_cat_ + ps
-)
-```
-
-The calculated PS can be included as a covariate in the analysis object, for the "covariate adjustment" PS approach:
-
-```{r analysis_obj}
-analysis_ps_cuts <- create_analysis_obj(
-  data_matrix = example_matrix_ps,
-  covariates = add_covariates(
-    c("ps_cat_low", "ps_cat_low_med", "ps_cat_high_med", "ps_cat_high"),
-    priors = normal_prior(0, 10000)
-  ),
-  outcome = exp_surv_dist("time", "cnsr", normal_prior(0, 10000)),
-  borrowing = borrowing_details("Full borrowing", "ext"),
-  treatment = treatment_details("trt", normal_prior(0, 10000))
-)
-
-result_ps_cuts <- mcmc_sample(analysis_ps_cuts)
-```
-
-
 ## Alternative PS Weights with `WeightIt`
 
 The [`WeightIt`](https://cran.r-project.org/package=WeightIt) package can calculate PS and other balancing
@@ -109,6 +42,9 @@ In addition, weights can be calculated differently for different estimands. Here
 ```{r WeightIt}
 library(WeightIt)
 
+example_dataframe <- as.data.frame(example_matrix)
+example_dataframe$int <- 1 - example_dataframe$ext
+
 weightit_model <- weightit(
   int ~ cov1 + cov2 + cov3 + cov4,
   data = example_dataframe,
@@ -128,6 +64,12 @@ library(cobalt)
 bal.plot(weightit_model)
 ```
 
+"Love plots" can also be generated that compare the populations before and after weighting:
+
+```{r loveplot}
+love.plot(weightit_model, stars = "std")
+```
+
 The PS values can be copied into the data set and the analysis object can be constructed as before.
 
 ```{r covs}