diff --git a/.github/workflows/build-book.yaml b/.github/workflows/build-book.yaml index 61635ac..c9ef0d5 100644 --- a/.github/workflows/build-book.yaml +++ b/.github/workflows/build-book.yaml @@ -33,7 +33,7 @@ jobs: shell: Rscript {0} run: | cmdstanr::check_cmdstan_toolchain(fix = TRUE) - cmdstanr::install_cmdstan() + cmdstanr::install_cmdstan(version = "2.32.2") - uses: quarto-dev/quarto-actions/setup@v2 diff --git a/_quarto-public.yml b/_quarto-public.yml index 3ffec23..484d08a 100644 --- a/_quarto-public.yml +++ b/_quarto-public.yml @@ -1,6 +1,5 @@ project: output-dir: docs/public - post-render: src/move_videos.R # workaround for Quarto issue https://github.com/quarto-dev/quarto-cli/issues/3892 book: repo-url: https://github.com/Novartis/bamdd/ @@ -11,12 +10,15 @@ book: chapters: - src/01a_introduction.qmd - src/01b_basic_workflow.qmd + - src/01c_priors.qmd - part: src/02_case_studies.qmd chapters: - src/02a_meta_analysis.qmd - src/02ab_meta_analysis_trtdiff.qmd + - src/02ac_meta_analysis_strata.qmd - src/02b_dose_finding.qmd - src/02c_dose_escalation.qmd + - src/02cb_tte_dose_escalation.qmd - src/02e_multiple_imputation.qmd - src/02g_longitudinal.qmd - src/02h_mmrm.qmd diff --git a/_quarto.yml b/_quarto.yml index dd8e37c..2ab73c8 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -13,7 +13,7 @@ execute: freeze: false book: - title: "(Bayesian) Applied Modelling in Drug Development (BAMDD)" + title: "Applied Modelling in Drug Development" subtitle: "Flexible regression modelling in Stan via `brms`" repo-actions: [issue] repo-branch: master @@ -25,6 +25,8 @@ book: - Lukas Widmer - - Andrew Bean - date: today + sidebar: + logo: bamdd_logo.png # we use per chapter references bibliography: diff --git a/bamdd_logo.png b/bamdd_logo.png new file mode 100644 index 0000000..554d74d Binary files /dev/null and b/bamdd_logo.png differ diff --git a/index.qmd b/index.qmd index 87280c6..d5bfb68 100644 --- a/index.qmd +++ b/index.qmd @@ -34,6 +34,10 @@ appropiate. Key changes to the web-site are tracked here: ::: {.content-visible when-profile="public"} +- 29th April 2024: Third edition course from Paul Bürkner with the new + case studies on probability of success, meta-analytic priors with + covariates & time-to-event modelling in Oncology phase I + dose-escalation. Pre-read version. - 19th April 2024: First public version at [opensource.nibr.com/bamdd](http://opensource.nibr.com/bamdd) - 5th January 2024: Release web-site in more modern quarto book based format diff --git a/src/01c_priors.qmd b/src/01c_priors.qmd new file mode 100644 index 0000000..90c5973 --- /dev/null +++ b/src/01c_priors.qmd @@ -0,0 +1,425 @@ +--- +author: + - Sebastian Weber - +bibliography: references.bib +--- + +{{< include _macros.qmd >}} + +# Model setup & priors {#sec-model-priors} + +```{r, include=FALSE} +source(here::here("src", "setup.R")) +##knitr::opts_chunk$set(cache=FALSE) +library(ggplot2) +library(dplyr) +library(knitr) +library(brms) +library(posterior) +library(ggdist) +library(bayesplot) +library(tidybayes) +library(forcats) +library(RBesT) +library(here) +# instruct brms to use cmdstanr as backend and cache all Stan binaries +options(brms.backend="cmdstanr", cmdstanr_write_stan_file_dir=here("_brms-cache")) +# create cache directory if not yet available +dir.create(here("_brms-cache"), FALSE) +set.seed(593467) +theme_set(theme_bw(12)) +``` + +Within the Bayesian regression modeling in Stan framework `brms` +priors are required to perform inference. 
This introductory material is intended to provide some pragmatic considerations on how to set up priors and to point readers to material for further reading. The case studies themselves provide details on setting up priors for the respective problem. By default, these priors have been chosen to have minimal impact on the posterior whenever appropriate and to ensure stable numerical inference. As a consequence, the results from the case studies will resemble the respective Frequentist model results in most cases. Nonetheless, priors are defined explicitly for all models, since they are an integral part of any Bayesian analysis. This can easily be seen by considering Bayes' rule to obtain the posterior:

$$ p(\theta | y) = \frac{p(y|\theta) \, p(\theta)}{p(y)} \propto p(y|\theta) \, p(\theta)$$

The marginal likelihood term $p(y)$ can be dropped, since it is merely a normalization constant once we condition the inference on the observed data $y$. What is left is the product of the likelihood $p(y|\theta)$ and the prior $p(\theta)$. The posterior information on $\theta$ is hence determined jointly by the likelihood and the prior.

From a practical perspective one may ask under which circumstances it actually matters which prior we set. In many cases the posterior is dominated by the data, which means that the likelihood term $p(y|\theta)$ is far more informative than the prior term $p(\theta)$. This is the case for most case studies. It is tempting to drop the prior in these cases and not worry about it. However, this is not recommended, since it is often of interest to eventually study a model with small sample sizes (by applying the model to subsets, increasing model complexity, etc.). Another practical aspect is the numerical stability and quality of the Markov Chain Monte Carlo (MCMC) sample we obtain from Stan. Without a prior the inference problem becomes much harder to solve: dropping the prior entirely implies an improper prior, and the MCMC sampler in Stan then has to explore an unbounded sampling space. Rather than dropping the prior entirely, we strongly recommend so-called weakly informative priors. While no formal definition is given in the literature, these priors aim to identify the *scale* of parameters. This requires a basic understanding of the parameters for which priors are being defined. Thus, a prior can only be understood in the overall context of the likelihood, the parametrization and the problem at hand [see also @Gelman2017]. For most statistical analyses this means considering the endpoint and the applied transformations.

To get started with `brms` one may choose not to specify priors when calling `brm`. In most cases `brms` will then provide reasonable default priors. These default priors are intended to avoid any influence on the calculated posterior. Hence, the results are fully data-driven and will be very close to the respective Frequentist maximum likelihood result. However, the default priors are not guaranteed to stay stable between releases and can thus change whenever the `brms` version changes.

Given that any Bayesian analysis requires a prior, we recommend always defining the priors explicitly - even if they merely repeat the default priors from `brms`, which one can easily obtain.
Here we use binomially distributed data as an example (responders in the control arms of several studies) together with the simplest possible model, which pools the information across the different studies (in practice one would allow for between-trial heterogeneity):

```{r}
model <- bf(r | trials(n) ~ 1, family=binomial)

## with brms >= 2.21.0 we can directly write...
## default_prior(model, data=RBesT::AS)

## a workaround is to create an empty brms model and obtain the
## defined prior like
prior_summary(brm(model, data=RBesT::AS, empty=TRUE)) |> kable()
```

Note that we have first defined the model via the `bf` call. This defines the linear predictor and the likelihood using the `family` argument of `bf`. We recommend always doing this step first explicitly, as it defines the likelihood and the model parametrization in one step. Choosing these is a critical first step in defining the model and, most importantly, the chosen priors can only be understood in the context of the model (likelihood and parametrization) [see @Gelman2017].

Rather than running the model with no explicit prior definition, we encourage users to explicitly define the priors used for the analysis - even if these are simply the default priors:

```{r warning=FALSE, message=FALSE}
model_def <- bf(r | trials(n) ~ 1, family=binomial)
model_prior <- prior(student_t(3, 0, 2.5), class=Intercept)

fit <- brm(model_def, data=RBesT::AS, prior=model_prior, seed=467657, refresh=0)
fixef(fit)
```

The provided default prior is indeed not informative, as can be seen by comparing the posterior estimate to the respective Frequentist fit from `glm`:

```{r warning=FALSE, message=FALSE}
fit_freq <- glm(cbind(r, n-r) ~ 1, data=RBesT::AS, family=binomial)
## intercept estimate
coef(fit_freq)
## standard error estimate
sqrt(vcov(fit_freq)[1,1])
```

In this case the data, with a total of `r sum(RBesT::AS$n)` subjects, dominates the posterior. Nonetheless, it is illustrative to consider the prior here in more detail as an example. Importantly, the intercept of the model is defined on the *logit* scale of the response probability, as is common for a standard logistic regression. The transformation changes the scale from the interval 0 to 1 to a new scale covering the entire real line. While formally very large or very small logits are admissible, a response rate in the range of 5% to 95% corresponds approximately to -3 to 3 on the logit scale. Thus, logit values outside this range would be considered extreme effects.

When defining a prior for a model we would typically want to compare different choices of the prior to one another. As an example, for the intercept-only model we may compare three different priors: (1) a very wide prior $\N(0,10^2)$, (2) the default `brms` prior and (3) the density $\N(0,2^2)$ used as the default prior in `RBesT` for this problem. In a first step we sample these priors and assess them graphically.
We could use `brms` to sample these priors directly with +the argument `sample_prior="only"`, but we instead use basic R +functions for simplicity: + + +```{r warning=FALSE, message=FALSE} +#| code-fold: true +#| code-summary: "Show the code" + +num_draws <- 1E4 +priors_cmp <- tibble(case=c("wide", "brms", "RBesT"), + density=c("N(0,10^2)", "S(3, 0, 2.5^2)", "N(0, 2^2)"), + prior=rvar(cbind(rnorm(num_draws, 0, 10), + rstudent_t(num_draws, 3, 0, 2.5), + rnorm(num_draws, 0, 2)))) + +kable(priors_cmp) + +priors_logit <- priors_cmp |> + ggplot(aes(y=case, xdist=prior)) + + stat_slab() + + xlab("Intercept\nlinear predictor") + + ylab(NULL) + + labs(title="Prior logit scale") + + coord_cartesian(xlim=c(-30, 30)) + +priors_response <- priors_cmp |> + ggplot(aes(y=case, xdist=rdo(inv_logit(prior)))) + + stat_slab() + + xlab("Intercept\nresponse scale") + + ylab(NULL) + + labs(title="Prior response scale") + + coord_cartesian(xlim=c(0, 1)) + +bayesplot_grid(priors_logit, priors_response, grid_args=list(nrow=1)) +``` + +As one can easily see, the wide prior has density at very extreme +logit values of -20 and 20. In comparison, the `brms` and the `RBesT` +default prior place most probability mass in the vicinity of +zero. While the wide prior may suggest to be very non-informative it's +meaning becomes clearer when back transforming to the original +response scale from 0 to 1. It is clear that the wide prior implies +that we expect extreme response rates of either 0 or 1. In contrast, +the `RBesT` default prior has an almost uniform distribution on the 0 +to 1 range while the default `brms` prior has some "U" shape. To now +further discriminate these priors a helpful concept is to consider +**interval probabilities** for pre-defined categories. The definition +of cut-points of the categories is problem specific, but can be +defined with usual common sense for most endpoints. The critical step +here is in defining these categories and documenting these along with +the analysis. Here we categorize the response as extreme 0-1% & +99-100%, very small 1-5% & 95-99%, small 5-10% & 90-95% and +moderate for the remainder. Comparing then these priors gives: + +```{r warning=FALSE, message=FALSE} +#| code-fold: true +#| code-summary: "Show the code" + +response_category <- function(r) { + fct_rev(cut(pmin(r, 1-r), c(0, 0.01, 0.05, 0.1, 0.5), labels=c("extreme", "very_small", "small", "moderate"))) +} + +priors_cmp |> unnest_rvars()|> + ggplot(aes(x=case, fill=response_category(inv_logit(prior)))) + + geom_bar(position="fill") + + xlab("Prior") + + ylab("Probability") + + labs(title="Prior interval probabilities", fill="Category") + + scale_y_continuous(breaks=seq(0,1,by=0.2)) + + theme(legend.position="right") + + +mutate(priors_cmp, + prior_response=rfun(inv_logit)(prior), + summarise_draws(prior_response, + extreme=~100*E(.x < 0.01 | .x > 0.99), + very_small=~100*E( (0.01 <= .x & .x < 0.05) | (0.95 < .x & .x <= 0.99) ), + small=~100*E( (0.05 <= .x & .x < 0.1) | (0.9 < .x & .x <= 0.95) ), + moderate=~100*E( (0.1 <= .x & .x < 0.9) ) + )) |> + select(case, density, extreme, very_small, small, moderate) |> + kable(digits=1, caption="Interval probabilities per prior given in percent") +``` + +It is apparent that the seemingly non-informative prior "wide" is in +fact an informative prior implying that the expected response rates +are extreme. This is the consequence of transforming to the logit +scale. + +In practice one would most certainly not fit the intercept only model, +which pools the information. 
The `RBesT::AS` data set is in fact the standard example for the use of historical control data.

```{r warning=FALSE, message=FALSE}
kable(RBesT::AS)
```

To use this data as historical control data one replaces the intercept-only model with a random intercept model, which casts it into a meta-analytic model:

```{r warning=FALSE, message=FALSE}
model_meta_def <- bf(r | trials(n) ~ 1 + (1 | study), family=binomial)
model_meta_prior <- prior(normal(0, 2), class=Intercept) +
    prior(normal(0, 0.5), class=sd, coef=Intercept, group=study)

fit_meta <- brm(model_meta_def, data=RBesT::AS, prior=model_meta_prior,
                seed=982345, refresh=0, control=list(adapt_delta=0.95))
summary(fit_meta)
```

The meta-analytic model is generative in the sense that it allows predicting the mean response rate of future studies by way of the hierarchical model structure. The prior on the overall intercept now becomes more relevant, since the information from each study is discounted through the random effects model, leaving less data to estimate this parameter than under the full pooling model. However, the key parameter in this model is the between-trial heterogeneity parameter. In the example a half-normal prior with scale 0.5 is used (`brms` knows that the parameter must be positive and hence truncates the prior at zero automatically). The half-normal density has been studied extensively in the literature and has been found to be a robust choice in a wide range of problems. For this reason, it is a very rational choice to use this prior in the current (and future) analyses. A scale of 0.5 is often referred to as a conservative choice in this problem, while 1 is a very conservative choice and 0.25 a less conservative choice. A complication with the hierarchical model is that $\tau$ itself implies a distribution. To simplify this, we first consider the case of a known between-trial heterogeneity parameter and study what it implies. For a known $\tau$ the range of implied log odds can be characterized by the 95% probability mass range given by $2 \cdot 1.96 \, \tau$, which is interpretable as the largest log odds ratio. Recalling that the between-study heterogeneity parameter controls the (random) differences in the logits of response rates between studies, it is helpful to consider the distribution of implied log odds ratios between random pairs of studies. These differences (log odds ratios) follow a $\N(0, (\sqrt{2} \, \tau)^2)$ distribution. Considering the absolute value of these differences, the distribution becomes a half-normal with scale $\sqrt{2} \, \tau$, which has its median at $1.09 \, \tau$. For a range of values of $\tau$ this translates on the odds ratio scale to:

```{r echo=FALSE}
tau_known <- tibble(tau=c(0, 0.125, 0.25, 0.5, 1.0, 1.5, 2)) |>
    mutate(largest_odds_ratio=exp(3.92 * tau),
           median_odds_ratio=exp(1.09 * tau))

tau_known |> kable(digits=3)
```

@Neuenschwander2020 suggest categorizing the between-study heterogeneity parameter $\tau$ as large ($1$), substantial ($0.5$), moderate ($0.25$), and small ($0.125$). Comparing these values with the table above shows that these categories are plausible. An alternative approach to the categorization is to consider what an extreme value for $\tau$ would be (like unity) and then choose the prior on $\tau$ such that a large quantile (like the 95% quantile) corresponds to this extreme value.
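As a small illustration of this quantile-matching idea (a quick sketch, not part of the case studies), one can solve for the half-normal scale $s$ such that the 95% prior quantile of $\tau$ equals the extreme value of $1$:

```{r}
## illustration: half-normal scale s such that Pr(tau <= 1) = 95%
## for tau ~ HN(s^2) the 95% quantile equals s * qnorm(0.975)
s_matched <- 1 / qnorm(0.975)
s_matched
## check via simulation: the 95% quantile of |N(0, s_matched^2)| is close to 1
quantile(abs(rnorm(1E5, 0, s_matched)), 0.95)
```

The resulting scale of roughly $0.5$ coincides with the "conservative" choice discussed above.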
With the categorization of specific values for $\tau$ we can now proceed and consider the interval probabilities of these categories for the different prior choices $\tau \sim \HN(s^2)$ with $s=1/4$, $s=1/2$ and $s=1$.

```{r}
#| code-fold: true
#| code-summary: "Show the code"
priors_cmp_tau <- tibble(case=c("less_conservative", "conservative", "very_conservative"),
                         density=c("HN(0, (1/4)^2)", "HN(0, (1/2)^2)", "HN(0, 1^2)"),
                         prior=rvar(cbind(abs(rnorm(num_draws, 0, 0.25)),
                                          abs(rnorm(num_draws, 0, 0.5)),
                                          abs(rnorm(num_draws, 0, 1))))
                         )

kable(priors_cmp_tau)

priors_hetero <- priors_cmp_tau |>
    ggplot(aes(y=case, xdist=prior)) +
    stat_slab() +
    xlab("Between-study heterogeneity") +
    ylab(NULL) +
    scale_x_continuous(breaks=0:3) +
    labs(title="Between-study\nheterogeneity") +
    coord_cartesian(xlim=c(0, 3))

priors_lor <- priors_cmp_tau |>
    mutate(lor1=rvar_rng(rnorm, n=n(), mean=0, prior),
           lor2=rvar_rng(rnorm, n=n(), mean=0, prior)) |>
    ggplot(aes(y=case, xdist=lor1-lor2)) +
    stat_slab() +
    xlab("Log-odds ratio") +
    ylab(NULL) +
    scale_x_continuous(breaks=-5:5) +
    labs(title="Log-odds ratio\nof random pair") +
    coord_cartesian(xlim=c(-3, 3))

bayesplot_grid(priors_hetero, priors_lor, grid_args=list(nrow=1))
```

```{r warning=FALSE, message=FALSE}
#| code-fold: true
#| code-summary: "Show the code"

tau_category <- function(tau) {
    fct_rev(cut(tau, c(0, 0.125, 0.25, 0.5, 1, Inf), labels=c("small", "moderate", "substantial", "large", "very_large")))
}

priors_cmp_tau |> unnest_rvars() |>
    ggplot(aes(x=case, fill=tau_category(prior))) +
    geom_bar(position="fill") +
    xlab("Prior") +
    ylab("Probability") +
    labs(title="Prior interval probabilities", fill="Category") +
    ##scale_y_continuous(breaks=seq(0,1,by=0.2)) +
    theme(legend.position="right")


mutate(priors_cmp_tau,
       summarise_draws(prior,
                       small=~100*E(.x < 0.125),
                       moderate=~100*E( 0.125 <= .x & .x < 0.25 ),
                       substantial=~100*E( 0.25 <= .x & .x < 0.5 ),
                       large=~100*E( 0.5 <= .x & .x < 1 ),
                       very_large=~100*E( .x >= 1 )
                       )) |>
    select(case, density, small, moderate, substantial, large, very_large) |>
    kable(digits=1, caption="Interval probabilities per prior given in percent")
```

We can see that the "conservative" choice places only about 5% tail probability on values of $\tau$ exceeding $1$, the value considered large. This implies that some degree of homogeneity between the studies is assumed. This is usually the case whenever meta-analyses are conducted, since the inclusion & exclusion criteria of the studies are aligned to a certain degree in order to ensure that a similar patient population enters the analysis. In contrast, the "very conservative" choice admits with about 30% probability mass values in the "very large" domain, for which practically no borrowing of historical information occurs. The "less conservative" choice, on the other hand, has about 5% tail probability above $0.5$, the value considered substantial. In this way large heterogeneity is considered unlikely, as can be appropriate for twin Phase III studies, for example.
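The interval probabilities shown above can also be cross-checked analytically without sampling (a quick verification sketch, not part of the original analysis), since for $\tau \sim \HN(s^2)$ we have $\Pr(\tau > c) = 2\,(1 - \Phi(c/s))$:

```{r}
## analytic tail probabilities Pr(tau > c) = 2 * (1 - pnorm(c / s))
## probability of "very large" heterogeneity (tau > 1) for the three scales
sapply(c(less_conservative=0.25, conservative=0.5, very_conservative=1),
       function(s) 2 * (1 - pnorm(1 / s)))
## probability of exceeding "substantial" (tau > 0.5) for the least conservative scale
2 * (1 - pnorm(0.5 / 0.25))
```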
+ +Additional literature for consideration: + +- Empirical priors study for HTA treatment effect evaluation by the + German IQWIG [@Lilienthal2023] +- Empirical priors for meta-analyses organized in disease specific + manner [@Turner2015] +- Endpoint specific considerations for between-trial heterogeneity + parameter priors in random effect meta-analyses [@Rover2021] +- Comprehensive introductory book to applied Bayesian data analysis + with detailed discussion on many examples [@Gelman2014] +- Live wiki document maintained by Stan user community (heavily + influenced by Andrew Gelman & Aki Vehtari) [@stanwikiPrior] +- Prior strategy based on nested modeling considerations (penalization + of more complex models), [@Simpson2014] +- Global model shrinkage regularized horseshoe prior [@Piironen2017] + or R2D2 prior (overall $R^2$) [@Zhang2022] + diff --git a/src/02_case_studies.qmd b/src/02_case_studies.qmd index 15d0405..b0e7edc 100644 --- a/src/02_case_studies.qmd +++ b/src/02_case_studies.qmd @@ -12,15 +12,15 @@ format: ```{r, echo = FALSE} overview <- dplyr::tribble( ~Problem, ~Technique, - "[Use of historical control data](02a_meta_analysis.qmd)", "nested random effects", - "[Meta-analysis to estimate treatment effects](02ab_meta_analysis_trtdiff.qmd)", "aggregate data modeling & varying exposure times of count data", - "[Dose finding](02b_dose_finding.qmd)", "non-linear models", - "[Oncology dose escalation](02c_dose_escalation.qmd)", "constrained parameters", - "[Multiple imputation](02e_multiple_imputation.qmd)", "multi-variate outcome modeling", - "[Longitudinal data](02g_longitudinal.qmd)", "longitudinal modeling with different covariance structures (MMRM)", - "[Bayesian Mixed effects Model for Repeated Measures (MMRM)](02h_mmrm.qmd)", "unstructured MMRM for a continuous endpoint", - "[Time-to-event data](02i_time_to_event.qmd)", "parametric time-to-event modeling with customized parametrization by using user-defined contrasts", - "[Network meta-analysis](02j_network_meta_analysis.qmd)", "arm based network meta-analysis" + "@sec-use-hist-control-data", "nested random effects", + "@sec-map-treat-effects", "aggregate data modeling & varying exposure times of count data", + "@sec-dose-finding", "non-linear models", + "@sec-onc-escalation", "constrained parameters", + "@sec-multiple-imputation", "multi-variate outcome modeling", + "@sec-longitudinal-data", "longitudinal modeling with different covariance structures (MMRM)", + "@sec-mmrm", "unstructured MMRM for a continuous endpoint", + "@sec-tte-data", "parametric time-to-event modeling with customized parametrization by using user-defined contrasts", + "@sec-network-meta", "arm based network meta-analysis" ) knitr::kable(overview) ``` diff --git a/src/02ab_meta_analysis_trtdiff.qmd b/src/02ab_meta_analysis_trtdiff.qmd index f00d10e..a0433bf 100644 --- a/src/02ab_meta_analysis_trtdiff.qmd +++ b/src/02ab_meta_analysis_trtdiff.qmd @@ -4,7 +4,7 @@ author: - Sebastian Weber - --- -# Meta-analysis to estimate treatment effects +# Meta-analysis to estimate treatment effects {#sec-map-treat-effects} ```{r, include=FALSE} source(here::here("src", "setup.R")) diff --git a/src/02ac_meta_analysis_strata.qmd b/src/02ac_meta_analysis_strata.qmd new file mode 100644 index 0000000..3076ac3 --- /dev/null +++ b/src/02ac_meta_analysis_strata.qmd @@ -0,0 +1,703 @@ +--- +author: + - Nicolas Ballarini - + - Sebastian Weber - +--- + +# Use of historical control data with stratification {#sec-use-hist-control-data-strata} + +```{r, include=FALSE} 
source(here::here("src", "setup.R"))
## use caching explicitly here as for quarto 1.3.450 the local
## _quarto.yml is an issue
# knitr::opts_chunk$set(cache=TRUE)
```

{{< include _macros.qmd >}}

Here we will demonstrate the use of historical control data as an example of the meta-analytic predictive (MAP) prior approach for the case of data divided into strata, which are modelled using covariates.

This case study demonstrates

- setting up a random effect meta-analysis with covariates for data strata
- how to summarize the MAP prior with covariates as a multi-variate normal mixture
- how to use the multi-variate normal mixture MAP prior as a prior for the main analysis using `brms`

To run the R code of this section please ensure to load these libraries and options first:

```{r, eval=TRUE,echo=TRUE,message=FALSE,warning=FALSE}
library(ggplot2)
library(dplyr)
library(knitr)
library(brms)
library(posterior)
library(bayesplot)
library(RBesT)
library(GGally)
library(here)
# instruct brms to use cmdstanr as backend and cache all Stan binaries
options(brms.backend="cmdstanr", cmdstanr_write_stan_file_dir=here("_brms-cache"))
# create cache directory if not yet available
dir.create(here("_brms-cache"), FALSE)
set.seed(57339)
# allow for wider and prettier printing
options(width=120, digits=2)
```

## Background

Assume we wish to evaluate in a Phase III study the effectiveness of an investigational treatment by comparing it to placebo in a population with a rare disease. The study focuses on evaluating the treatment's efficacy in different age groups spanning the range from 2 to under 18 years of age. The primary endpoint of the study is the change in a Patient Reported Outcome (PRO) total score at the end of the follow-up period (12 months).

The statistical hypothesis being tested for the primary endpoint is that there is no difference in the change from baseline in the PRO total score between the treatment group and the placebo group.

A Bayesian linear regression model will be applied, adjusting for age classified into 3 groups (2 to under 5, 5 to under 13, and 13 to under 18) and the baseline score. The age group of 5 to under 13 years will serve as the reference category, allowing the priors on the intercept and the treatment coefficient to represent the mean change in the score at 12 months in this subgroup of patients.

In order to inform the model, informative priors will be included for the mean change in the PRO score at 12 months in each age subgroup. These priors will be derived from historical studies.

## Data

At the design stage of the trial, data from two historical studies were available for the control group.

- Study 1: A natural history study of patients with the disease.
- Study 2: Control arm of a randomized study evaluating an alternative therapy.

These historical studies provide valuable information for the primary evaluation of the treatment effect in the trial of interest. Casting the historical data into an informative prior on the control response is expected to increase the precision of the treatment effect estimate for the Phase III study of interest.

The data set obtained from these two studies consists of summary statistics for the PRO score at baseline (BASE), the mean change from baseline (CHGm) and its standard error (CHGse), stratified by age group (AGEGRP).
+ +```{r} +hdata <- tibble::tribble( + ~Study , ~AGEGRPN, ~AGEGRP, ~n, ~BASE, ~CHGm, ~CHGse, + "1. STUDY 1" , "1" , "Age >=2 to <5" , 28, 16.8, -0.7, 1.00, + "1. STUDY 1" , "2" , "Age >=5 to <13" , 50, 26.7, -1.1, 0.73, + "1. STUDY 1" , "3" , "Age >=13 to <18" , 23, 18.5, -1.1, 1.27, + "2. STUDY 2" , "1" , "Age >=2 to <5" , 42, 19.9, 0.2, 0.66) + +base_mean <- with(hdata, round(weighted.mean(BASE, n), 2)) +hdata <- hdata |> + mutate( + # make the age group a factor and set the middle age group to be the reference + age_group = factor(AGEGRPN, c("2", "1", "3")), + # add a centered baseline covariate + cBASE = BASE - base_mean, + # back-calculate the sd + CHGsd = CHGse * sqrt(n), + # make Study a factor for better handling + Study = factor(Study), + # make a variable 'label' for later use in plots + label = paste0(Study, "/", age_group), + label2 = paste0(Study, "/", age_group, ". ", AGEGRP), + # include active treatment indicator (in this case, is set to 0 as all patients are 'control') + active = 0L) + +hdata |> + select(Study, AGEGRP, n, BASE, cBASE, CHGm, CHGse, CHGsd) |> + kable() +``` + +## Model description + +The primary endpoint will be analyzed using a Bayesian regression model with +the mean change from baseline in the PRO score as the response $Y$ and with +treatment $Z$ ($Z = 1$ is treatment, $Z = 0$ is control). The basic +analysis model for the Phase III model can be written as + +$$Y|\mu,\sigma \sim \N(\mu, \sigma^2)$$ + +$$\mu = \beta_0 + \beta_1 \, x_1 + \beta_2 \, x_2 + \beta_3 \, cBASE + \theta \, Z$$ + +where: + +- $\beta_0$ is the mean change from baseline in the score on the control + arm for the 5 to 13 years old age group, +- $x_1,x_2$ are indicators for the Age \>=2 to \<-5 and Age \>=13 to + \<18 groups, +- $\beta_3$ is the covariate for the centered baseline score, +- $\theta$ is the treatment effect comparing mean change from + baseline for the treatment arm vs control arm, +- $\sigma$ is the sample standard deviation. + +Inference for the primary analysis focuses on the posterior +distributions of the treatment effect comparing mean change from +baseline in the PRO score on the treatment arm vs control arm, $\theta$. + +To form priors, the mean change from baseline as well the standard +errors from each historical study and age subgroup are combined using +the meta-analytic predictive approach (MAP, Neuenschwander et al +2010). While the MAP approach is mostly used for an intercept only +model, the use of covariates complicates the analysis here. However, +this complication can be resolved by considering the equivalent +meta-analytic (MAC) combined approach. Recall, that whereas the MAP +approach is a 2-step procedure (derived MAP prior from historical +data, then apply to current data), the MAC approach performs a joint +analysis of the historical data and the current data. Both approaches +result in exactly the same results (Schmidli et al 2014). Thus, it is +useful to first consider the respective MAC model. For this we expand +the basic model shown above to also include a random intercept effect, +which accounts for between-study heterogeneity. Denoting with $i$ the +study, we thus model the data as + +$$Y_i|\mu_i,\sigma \sim \N(\mu_i, \sigma^2)$$ + +$$\mu_i = b_i + \beta_1 \, x_1 + \beta_2 \, x_2 + \beta_3 \, cBASE + \theta \, Z$$ + +$$b_i|\beta_0,\tau \sim \N(\beta_0, \tau^2).$$ + +The key assumption of this model is that the covariate effects for +age, centered baseline score and treatment effect are *pooled* accross +all studies. 
That is, no between-study heterogeneity is admitted in these effects; heterogeneity is only accounted for in the intercept term representing the mean change from baseline for control in the reference age group at the baseline value used for centering (`r base_mean`). Note that since we only include control data for the historical studies, we will infer the treatment effect $\theta$ for the new Phase III study without any influence from the historical studies.

However, in practice we will still want to run the primary analysis in a 2-step MAP approach. The desire here is to be able to evaluate the properties of the MAP prior derived from the historical data in advance. The challenge is then that we have to use the information from fitting the above model to the historical data for the *entire model posterior* and not merely the intercept. Therefore, the MAP prior becomes multi-variate, since it includes the covariate effects.

Hence, the model is fitted first to the historical data under the assumption of a known sampling standard deviation (accounting for the reported standard errors).

As priors for $\beta_0,\beta_1,\beta_2,\beta_3,\theta$ weakly informative $\N(0,s^2)$ distributions are used, with $s = 5$. The choice of $5$ follows from the observation that the historical data has approximately a sampling standard deviation of $5$ (the pooled estimate from the historical data is `r round(sqrt(weighted.mean(hdata$CHGsd^2, hdata$n)), 2)`). For the between-trial standard deviation $\tau$ a half-normal prior with scale $s / 4$ is used. This is considered a plausible conservative, weakly-informative prior, as values of the ratio $\tau/\sigma \ge 1/4$ are generally considered to represent substantial between-trial variability (Neuenschwander et al 2010) and the proposed prior places considerable prior probability on this region.

## Implementation

Using `brms` we now specify the MAP model step by step. We first define the model:

```{r}
# Set up a model formula for use in 'brms'
hist_model <- bf(CHGm | se(CHGse) ~ 1 + age_group + cBASE + active + (1 | Study),
                 family=gaussian, center=FALSE)
# Display prior model across all parameters
get_prior(hist_model, hdata)
```

**Note** that we explicitly set the option `center=FALSE` in the call to `bf`. This avoids the automatic centering otherwise done by `brms`, which is needed here as we require full control over the parametrization of the model.

With the model (and data) defined, we are left to specify the model priors:

```{r}
## Fit historical data with between-study heterogeneity -----------------------
# For β_0,β_1,β_2,β_3,θ weakly informative N(0,5^2) distributions are used.
# For the group-level (study) intercept sd, a half-normal prior with scale 5.0/4 is used.
# Set sd = 5 for later use
sd <- 5.0
sd_tau <- sd/4
hist_prior <- prior_string(glue::glue("normal(0, {sd})"), class="b") +
    prior_string(glue::glue("normal(0, {sd_tau})"), class="sd", coef="Intercept", group="Study")
```

Now we are ready to run the model in `brms`:

```{r}
# Apply brms model using historical data hdata and the model defined in hist_model
map_mc_brms <- brm(hist_model,
                   data = hdata,
                   prior = hist_prior,
                   seed = 234324, refresh = 0,
                   control = list(adapt_delta = 0.99))
map_mc_brms
```

## Results

Once the MAP prior is obtained in MCMC form, a model check is recommended.
In `brms` model diagnostic functions are directly available +and essentially expose the functionality found in the +[`bayesplot`](https://mc-stan.org/bayesplot/index.html) R package. A +suitable `bayesplot` plot in this situation could be an `intervals` plot +as: + +```{r} +## Posterior predictive check for the mean change in Score at 12mo by age group ------- +pp_check(map_mc_brms, type="intervals") + + scale_x_continuous("Study / Strata", + breaks = 1:nrow(hdata), + labels = hdata$label2, + trans = "reverse") + + ylab("Mean change from baseline") + + xlab(NULL) + + coord_flip() + + hline_0(linetype=2) + + ggtitle("Posterior predictive check") + + theme(plot.background = element_rect(fill = "white"), + legend.position="right", + # suppress vertical grid lines for better readability of intervals + panel.grid.major.y = element_blank()) + +``` + +The call of the `pp_check` function is forwarded to the respective +`ppc_*` functions for posterior predictive checks from `bayesplot` +(depending on the `type` argument). The plots are designed to compare +the *posterior predictive* distribution to the observed data rather +than comparing mean estimates to one another. Thus, the outcome of +each study/age group in the historical data set is sampled according +to the fitted model and the resulting predictive distribution of the +outcome (change in score) is compared to the observed outcome. The +`intervals` predictive probability check summarises the predictive +distributions using a light color for an outer credible interval range +and a darker line for an inner credible interval. The outer defaults +to a 90% credible interval (`prob_outer` argument) while the inner +uses a 50% credible interval (`prob` argument). The light dot in the +middle is the median of the predictive distribution and the dark dot +is the outcome $y$. As we can observe, the outcomes $y$ for all age +groups in historical studies all are contained within outer credible +intervals of the predictive distributions for the simulated replicate +data $y_{rep}$. We can conclude that the model is consistent with the +data. + +Once the model has been checked for plausibility, we can proceed and +derive the main target of the MAP analysis, which is the MAP prior in +*parametric* form. The MAP priors are stored in the `post_coefs_mix` object. +Note that since we will be predicting for a *new* study, we sample a random +intercept using the between study variation. + +```{r} +# For the simple case of this case study for which only a random +# effect is assigned to the intercept, we can use the brms prediction +# functions. We are seeking a sample of the intercept with +# between-trial heterogeneity, but without having other terms +# included in the linear predictor. Hence, we can predict the mean +# change from baseline for a new study arm which is has the covariate +# values set to the reference + +new_study_ref <- tibble(Study="new_study", age_group="2", cBASE=0, CHGse=0, active=0L) +rv_study_sim_inter <- rvar(posterior_epred(map_mc_brms, + newdata=new_study_ref, + allow_new_levels=TRUE, + sample_new_levels="gaussian")) +rv_study_sim_inter + +# Get random draws from the posterior of each parameter +# after fitting historical data +hist_post_draws <- as_draws_rvars(map_mc_brms) + + +# Whenever one also models the covariate effects to be varying between +# studies, one needs to smaple the random effect directly, since brms +# always simulates the linear predictor by convention. 
In this case +# one needs to sample the random intercept directly from the posterior. + +# Add variability to the intercept to capture random between study variation +rv_study_sim_inter_alt <- with(hist_post_draws, + rvar_rng(rnorm, 1, b_Intercept, sd_Study__Intercept, ndraws=ndraws(hist_post_draws))) + +# Get draws for the fixed effects from the model posterior +hist_post_coefs <- subset_draws(hist_post_draws, + variable="^b_", + regex=TRUE) |> + as_draws_matrix() + +# Replace the intercept draws with the intercept and added variability +hist_post_coefs[,"b_Intercept"] <- as_draws_matrix(rv_study_sim_inter)[,1] + +# Get a descriptive statistics for posterior draws by coefficient +summarize_draws(hist_post_coefs) |> kable() + +# Summarize as mixture multi-variate normal MAP distribution +post_coefs_mix <- mixfit(sample = hist_post_coefs, type = "mvnorm", Nc = 3) +print(post_coefs_mix) + +# notice the considerable correlations present in the posterior +round(cor(hist_post_coefs), 2) +``` + +And a comparison of the fitted density vs the histogram of the MCMC +sample is available as: + +```{r} +plot(post_coefs_mix) +``` + +```{r} +# Create summary statistics for the posteriors draws of coefficients +hist_post_coefs_df <- as_draws_df(hist_post_coefs) +hist_post_coefs_df[,1:5] +ggpairs(hist_post_coefs_df[1:1000,1:5]) +``` + +These MAP priors summaries for the coefficients of the model are displayed in +the table below. Note that since the historical data did not contain data in +both the treatment and control arms, the parameter related to the treatment +effect (`b_active`) will remain with a weakly informative prior. + +```{r} +map_coef_table <- hist_post_coefs_df |> + summarise_draws(median, ~quantile2(.x, probs = c(0.025, 0.975))) |> + rename(lower = q2.5, + upper = q97.5) +``` + +Often it is of interest to evaluate the properties of the MAP prior at +the design stage. This is complicated in this example due to the +requirement to assume a certain baseline value for the used +covariates. However, note that these evaluations can be repeated once +the trial has enrolled the study population given that only baseline +data is needed for this. Here we assume a baseline score of 20: + +```{r} +agegroups <- c("Age >=2 to <5", + "Age >=5 to <13", + "Age >=13 to <18") +# Set a value for the baseline score +base_score_new <- 20 +# Create summary statistics for the posteriors draws of means across subgroups +new_study_base20 <- tibble(Study="new_study", + age_group=c("1", "2", "3"), + cBASE=base_score_new-base_mean, + CHGse=0, + active=0L) + +map_resp_table <- new_study_base20 |> + tidybayes::add_epred_rvars(map_mc_brms, allow_new_levels=TRUE, sample_new_levels="gaussian", value="MAP") |> + mutate(summarise_draws(MAP, ~quantile2(.x, probs = c(0.025, 0.5, 0.975))), + MAP_ess=ess(mixfit(draws_of(MAP)[,1], Nc=3), sigma=5), + treatment = "Control") |> + rename(median= q50, + lower = q2.5, + upper = q97.5) |> + select(MAP, treatment, MAP_ess, median, lower, upper) +map_resp_table |> kable() +``` + +At this point we can use these MAP priors in the analysis of the pivotal +trial as outlined in the following section. + +## Analysis of a new study + +Assume now that the trial reads out and you have to analyze the study +using the MAP prior that were derived. For this exercise will use a simulated +data set, see code below for reference. 
+ +```{r simulate_trial } +#| code-fold: true +#| code-summary: "Show the code" +# Simulate summary data for 1 trial +# +# @param n1 The sample size in the 3 strata of the treatment arm. A vector of length 3 +# @param n0 The sample size in the 3 strata of the control arm. A vector of length 3 +# @param e1 The mean change from baseline at month 12 for the 3 strata of the treatment arm. A vector of length 3 +# @param e0 The mean change from baseline at month 12 for the 3 strata of the control arm. A vector of length 3 +# @param s1 The se of the response in the 3 strata of the treatment arm. A vector of length 3 +# @param s0 The se of the response in the 3 strata of the control arm. A vector of length 3 +# @param s1 The baseline score in the 3 strata of the treatment arm. A vector of length 3 +# @param s0 The baseline score in the 3 strata of the control arm. A vector of length 3 +# +# @return A data frame with simulated trial data +# +simulate_1_trial_ipd <- function(n1 = c(39, 18, 18), + n0 = c(26, 12, 12), + e1 = c( 5, 5, 5), + e0 = c( 0, 0, 0), + s1 = rep(5, 3), + s0 = rep(5, 3), + bs1 = c( 33,33,33), + bs0 = c( 33,33,33), + base_center=base_mean) { + # Arm 1: Treatment + # Arm 0: Control + + # AGEGRPN 1: Age >=2 to <5 + # AGEGRPN 2: Age >=5 to <13 + # AGEGRPN 3: Age >=13 to <18 + + # Simulate individual patient data ------- + d1 <- sapply(1:3, function(i) rnorm(n = n1[i], mean = e1[i], sd = s1[i])) + d0 <- sapply(1:3, function(i) rnorm(n = n0[i], mean = e0[i], sd = s0[i])) + + # Simulate individual patient data for the baseline score ------- + b1 <- sapply(1:3, function(i) rnorm(n = n1[i], mean = bs1[i], sd = s1[i])) + b0 <- sapply(1:3, function(i) rnorm(n = n0[i], mean = bs0[i], sd = s0[i])) + + agegr <- c("Age >=2 to <5", "Age >=5 to <13", "Age >=13 to <18") + AGEGROUP <- c(rep(agegr, n1), rep(agegr, n0)) + strataN <- c(rep(1:3, n1), rep(1:3, n0)) + + BASE <- c(unlist(b1), unlist(b0)) + y <- c(unlist(d1), unlist(d0)) + + ### Put generated values into a data frame ----------------------------------- + out <- data.frame(AGEGRPN = strataN, + AGEGRP = AGEGROUP, + age_group = relevel(as.factor(strataN), "2"), + active = rep(c(1, 0), c(sum(n1), sum(n0))), + BASE = BASE, + cBASE= BASE - base_center, + CHG = y + ) + out +} +``` + +```{r} +new_dat <- simulate_1_trial_ipd( + n1 = c(39, 18, 18), + n0 = c(26, 12, 12), + e1 = c( 2, 1, 0), + e0 = c(-4, -3, -2), + s1 = rep(5, 3), + s0 = rep(5, 3), + bs1 = c( 33,33,33), + bs0 = c( 33,33,33), + base_center=base_mean) +new_dat <- new_dat |> + mutate(Study = "New") + +head(new_dat) |> kable() +``` + +We first calculate simple summary statistics for the simulated data: +```{r} +new_dat |> + summarise(BASE = mean(BASE), + cBASE = mean(cBASE), + CHGm = mean(CHG), + CHGsd = sd(CHG), + n = n(), + CHGse = CHGsd / sqrt(n), + .by=c(Study, AGEGRPN, AGEGRP, age_group, active)) |> + kable() +``` + +We must define the model to be used. Note that now, since we have individual patient +level data, we use only `CHG` as response variable. Hence, the +sampling standard deviation is estimated from the data (requiring us +to set a prior for this parameter as well). It is obviously important +that we must use the very same model specification as before, since +otherwise we could not use the derived MAP prior easily: + +```{r} +joint_model_ipd <- bf(CHG ~ 1 + age_group + cBASE + active, + family=gaussian, center=FALSE) +``` + +And then use `brm` to put it altogether. To use the previously derived priors, +we need to use two options in the brm funciton. 
The `prior` option indicates that a multivariate normal mixture prior will be used, while the `stanvars` option passes the actual prior distribution, i.e. the fitted mixture.

```{r message=FALSE, warning=FALSE}
# Fit with historical data priors -------------------------------------------
mcmc_seed <- 573391
# use the MAP prior for the coefficients and encode with the gamma
# prior on the sigma that we anticipate an sd of 5
map_prior <- prior(mixmvnorm(prior_mix_w, prior_mix_m, prior_mix_sigma_L), class=b) +
    prior(gamma(5,1), class=sigma)
bfit_stage2_ipd <- brm(formula = joint_model_ipd,
                       prior = map_prior,
                       stanvars = mixstanvar(prior_mix = post_coefs_mix),
                       data = new_dat,
                       seed = mcmc_seed,
                       refresh = 0,
                       backend = "cmdstanr")
bfit_stage2_ipd
```

Having fit the model, the last step is to use it to test the statistical hypothesis:

```{r}
hypothesis(bfit_stage2_ipd, "active > 0", alpha = 0.05)
```


## Conclusion

This case study demonstrated how a random-effects meta-analysis model can be implemented with `brms`. The model was implemented in a two-stage approach: first, the model is fit to the historical data to obtain meta-analytic-predictive priors; later, when data from the study of interest becomes available, the model is fit to the study data using the MAP priors.

The model enables borrowing between historical studies and age groups and hence yields a more informative MAP prior.

## Exercises

1. Fit a model where only weakly informative priors are used to analyze the data from the new study. This will help provide insight into the impact of using the historical data.

2. To robustify the MAP prior, a non-informative component can be added to the mixture distribution to protect against prior-data conflicts (Schmidli et al 2014). Fit a model where the MAP priors are robustified with a weight of 20% for the non-informative component. The non-informative component is normally distributed with mean 0 and variance $\sigma^2$.
## Solution to exercises

```{r}
get2sidedCI <- function(h1side, h2side) {
    xx = h2side$hypothesis |>
        select(Hypothesis, CI.Lower, CI.Upper) |>
        rename(CI.95.Lower=CI.Lower,
               CI.95.Upper=CI.Upper)
    data.frame(h1side$hypothesis, xx)
}
get1sidehyp2sidedCI = function(brm_res) {
    h1side = hypothesis(brm_res, "active > 0", alpha=0.025)
    h2side = hypothesis(brm_res, "active > 0", alpha=0.05)

    xx = h2side$hypothesis |>
        select(Hypothesis, CI.Lower, CI.Upper) |>
        rename(CI.95.Lower=CI.Lower,
               CI.95.Upper=CI.Upper)
    data.frame(h1side$hypothesis, xx)
}
```


```{r eval=FALSE}
#| code-fold: true
#| code-summary: "Show the code"
## Fit Non-Informative Prior using default priors -----------------------
head(new_dat)
bfit_trial_ipd_noninf_flat <- brm(joint_model_ipd,
                                  seed = mcmc_seed,
                                  refresh = 0,
                                  data = new_dat)

## Fit Non-Informative Prior using wide Normal priors ------------------------
# prior referencing the mixture passed via mixstanvar (same structure as
# 'map_prior' used in the main analysis); defined here since it is reused below
mix_prior <- prior(mixmvnorm(prior_mix_w, prior_mix_m, prior_mix_sigma_L), class=b) +
    prior(gamma(5, 1), class=sigma)
trial_prior_mix <- mixmvnorm(flat = c(1, rep(0, 5), diag(sd^2, 5)))
joint_model_ipd <- bf(CHG ~ 1 + age_group + cBASE + active,
                      family=gaussian, center=FALSE)
bfit_trial_ipd_noninf_norm <- brm(joint_model_ipd,
                                  prior = mix_prior,
                                  stanvars = mixstanvar(prior_mix = trial_prior_mix),
                                  seed = mcmc_seed, refresh = 0,
                                  backend = "cmdstanr",
                                  data = new_dat)

## Fit with historical data MAP Priors -------------------------------------------
bfit_stage2_ipd <- brm(joint_model_ipd,
                       prior = mix_prior,
                       stanvars = mixstanvar(prior_mix = post_coefs_mix),
                       seed = mcmc_seed, refresh = 0, backend = "cmdstanr",
                       data = new_dat)

## Fit with historical data MAP Priors with robustification ----------------------
ps = seq(0, 1, 0.2)
hp_ipd = hn_ipd = list()
for (i in seq_along(ps)){
    p = ps[i]
    rob_post_coefs_mix <- mixcombine(flat = trial_prior_mix,
                                     MAP = post_coefs_mix,
                                     weight=c(p, 1 - p))

    bfit_stage2_rob_ipd <- brm(joint_model_ipd,
                               prior = mix_prior,
                               stanvars = mixstanvar(prior_mix = rob_post_coefs_mix),
                               seed = mcmc_seed, refresh = 0, backend = "cmdstanr",
                               data = new_dat)
    hp_ipd[[i]] = get1sidehyp2sidedCI(bfit_stage2_rob_ipd)
    hn_ipd[[i]] = glue::glue("MAP prior {p*100}% robust")
}

h1_ipd_noninf_flat = get1sidehyp2sidedCI(bfit_trial_ipd_noninf_flat)
h1_ipd_noninf_norm = get1sidehyp2sidedCI(bfit_trial_ipd_noninf_norm)
h1_ipd_map = get1sidehyp2sidedCI(bfit_stage2_ipd)

h1n_ipd = unlist(hn_ipd)
h1_ipd_rob = do.call(rbind, hp_ipd)

res = data.frame(type = c("Non-informative prior. brm default",
                          "Non-informative prior. 
wide normal prior", + "MAP prior", + h1n_ipd), + bind_rows( + h1_ipd_noninf_flat, + h1_ipd_noninf_norm, + h1_ipd_map, + h1_ipd_rob + )) +res +``` + diff --git a/src/02b_dose_finding.qmd b/src/02b_dose_finding.qmd index 21cc636..37e0872 100644 --- a/src/02b_dose_finding.qmd +++ b/src/02b_dose_finding.qmd @@ -3,7 +3,7 @@ author: - Björn Holzhauer - --- -# Dose finding +# Dose finding {#sec-dose-finding} ```{r, include=FALSE} source(here::here("src", "setup.R")) diff --git a/src/02c_dose_escalation.qmd b/src/02c_dose_escalation.qmd index 0652d2f..770e6ef 100644 --- a/src/02c_dose_escalation.qmd +++ b/src/02c_dose_escalation.qmd @@ -4,7 +4,7 @@ author: - Lukas Widmer - --- -# Oncology dose escalation +# Oncology dose escalation {#sec-onc-escalation} ```{r, include=FALSE} source(here::here("src", "setup.R")) diff --git a/src/02cb_tte_dose_escalation.qmd b/src/02cb_tte_dose_escalation.qmd new file mode 100644 index 0000000..c0a93f7 --- /dev/null +++ b/src/02cb_tte_dose_escalation.qmd @@ -0,0 +1,1057 @@ +--- +author: + - Lukas Widmer - + - Sebastian Weber - +--- + +# Time-to-event modelling in Oncology dose escalation {#sec-tte-oncology} + +```{r, include=FALSE} +source(here::here("src", "setup.R")) +``` + +{{< include _macros.qmd >}} + +This case study demonstrates + +- setting up a time-to-event model and prior with explicit modeling of a + piecewise-constant log-hazard in `brms` using Poisson regression +- how this time-to-event model relates to the standard BLRM +- computing and visualizing conditional and cumulative event probabilities +- how to determine the impact of the Monte Carlo Standard Error (MCSE) on + decision criteria, e.g., Escalation With Overdose Control (EWOC) +- in the exercise: setting up custom `brms` link functions for the two-drug case + +## Background + +The aim of Oncology Phase-I trials is to identify a drug dose which is +reasonable safe to use and efficacious at the same time. As cytotoxic +drugs aim to kill cancer cells, it is expected that toxicity events +are to some extent a surrogate for efficacy of the drug. Due to the +life-threatening nature of the disease one thereby uses *dose +escalation* trial designs to identify the maximum tolerated dose (MTD) +or recommended dose (RD) rapidly. These trials enroll small patient +cohorts at some drug dose and then assess at a dose escalation meeting +(DEM) all safety events occurring over the time-course of a treatment +cycle (often 4 weeks). The safety events classified as dose limiting +toxicities (DLTs) then determine which dose levels are safe for use in +the next patient cohort. In this way an increasing data set on the +toxicity of the drug emerges as the trial continues. + +There are many approaches on how to guide dose escalation trials at a +DEM. Traditional designs such as the 3+3 design are based on +algorithmic rules governing the decision for the subsequent cohort +based on the outcome at the current dose. Specifically, 3 patients are +enrolled onto a cohort and the drug dose is increased in case no DLT +occurs, if 1 DLT occurs the drug dose is kept constant while it is +decreased if more DLTs occur. Whenever two cohorts in sequence observe +1 out of 3, the 3+3 trial completes. This is in line with the goal +of a 33% DLT rate at the MTD. It has been shown that these rule-based +approaches have rather poor statistical properties and model based +approaches have been proposed. 
In particular, Bayesian model-based +designs have proven to provide greater flexibility and improved +performance for estimating the MTD, while protecting patient +safety. In the model-based paradigm for dose escalation, one develops +a model for the dose-toxicity relationship. As DLT data accumulates +on-study, the model is used to adaptively guide dose escalation +decisions. Bayesian approaches are well-suited for this setting, due +in part to the limited amount of available data (Phase-I studies are +generally small). + +Recent, more targeted treatments, have challenged several aspects +taken for granted for traditional cytotoxic therapies, such as + +* **Estimating the MTD over just one treatment cycle**: since targeted + therapies are typically safer than cytotoxic ones and are applied + over longer time periods, estimating longer-term (multi-cycle) + dose-toxicity relationships has become more important. However, we + typically do not want to wait for the full time period (e.g., 3 + cycles) to decide the dose for the next cohort. That is, after the + first cycle, one may want to extrapolate to the dose-toxicity + relationship over 3 cycles given a prior on how the conditional + toxicity of the next cycles changes with respect to the first + one. However, this is not easily feasible with methods that only use + binary (DLT yes/no) data. +* **Constant dosing regimen**: For some targeted treatments it may be + advantageous to vary the dosing regimen, for instance, by performing + within-patient dose-ramp-up over several weeks to avoid cytokine + release syndrome. However, classical binary cycle-1 methods such as + the 3+3, BOIN or the Bayesian Logistic Regression Model (BLRM) are not able + to directly incorporate time-varying dosing or hazard reductions due to + ramp-up dosing. +* **Ignoring dropout**: In classical binary dose-DLT methods, where + DLT yes/no was measured for each patient over the first treatment + cycle, data from patients that dropped out during this cycle (for + instance due to disease progression) is typically ignored. This is + no longer an option with modern cancer treatments in Phase I that + are both taken over longer time periods where dropout is more + likely, as well as administered in last-line treatment populations + with aggressive cancers / rapid dropout due to disease progression. + +These challenges can be addressed by switching to time-to-first-event +(time-to-DLT) modeling with piecewise-constant hazards, where the +constant period is chosen to align with a period of interest. For +instance, if the dose-toxicity relationship over 3 cycles is of +interest, one might choose to model piecewise-constant hazards per +cycle, or if we are interested in the dose-toxicity relationship of +several regimens that contain within-patient ramp-up of dosing on a +weekly basis, we might choose piecewise-constant hazards for each +week. 
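To make the data requirements of this approach concrete, the following is a small schematic sketch (the column names follow the long data format introduced below; the values are made up and not taken from any trial) showing how a patient who drops out DLT-free part-way through cycle 2 still contributes the observed DLT-free follow-up time instead of being discarded:

```{r}
## schematic only: one row per patient and cycle; a mid-cycle dropout simply
## contributes a shortened DLT-free follow-up in that cycle (made-up values)
dropout_example <- tibble::tibble(
  cycle_index    = c(1, 2),
  dose_A         = c(10, 10),
  num_toxicities = c(0, 0),   # no DLT observed in either cycle
  follow_up      = c(28, 10)  # full 4-week cycle 1, dropout after 10 days of cycle 2
)
dropout_example
```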
+ +## Required libraries + +To run the R code of this section please ensure to load these +libraries and options first: + +```{r, eval=TRUE,echo=TRUE,message=FALSE,warning=FALSE} +library(lubridate) +library(tibble) +library(dplyr) +library(tidyr) +library(brms) +library(bayesplot) +library(tidybayes) +library(posterior) +library(knitr) +library(ggplot2) +library(purrr) +library(here) +library(assertthat) +library(gt) +theme_set(theme_bw(12)) +# instruct brms to use cmdstanr as backend and cache all Stan binaries +options(brms.backend="cmdstanr", cmdstanr_write_stan_file_dir=here("_brms-cache")) +# create cache directory if not yet available +dir.create(here("_brms-cache"), FALSE) +set.seed(8794567) +# Table formatting helper function: +gt_format <- function(x) { + x |> fmt_number(decimals=2) |> + opt_interactive( + page_size_default = 5, + use_text_wrapping = F, + use_compact_mode = T + ) +} +``` + +```{r include=FALSE} +# invisible to the reader we move the cache directory +options(cmdstanr_write_stan_file_dir=brms_cache_dir) +``` + +## Example trial +In the following, we will assume a simple setting of a first-in-human +(FIH) study of a new Oncology treatment. Here, we consider the +dose-DLT relationship over a time horizon of 3 treatment cycles, a cycle +length of 4 weeks, and daily (QD) dosing of a drug A at a pre-defined +dose set with reference dose $\tilde{d} = 50$ (at the reference dose +we anticipate the MTD *a priori*): + +```{r} +cycle_lengths <- rep(weeks(4), 3) +dref_A <- 50 +doses <- c(1, 2.5, 5, 10, 20, 30, 40, 45, 50) + +cycle_seq <- seq_along(cycle_lengths) +dose_seq <- seq_along(doses) +``` + +We will use the logarithm of the normalized dose (with respect to a +reference dose $\tilde{d}$ ), $\log\left(\tilde{d}_{j}\right) = +\log\left(d_{j} / \tilde{d}\right)$, where $j$ is the cycle index. We +define a helper function to add this column as `std_A`, and whether +the log-dose is finite as `finite_A`, which will become relevant in the +case of drug combinations. + +```{r} +add_std_dose_col <- function(dose_info, drug = "A", dref) { + dose_info |> mutate( + "std_{drug}" := log(get(paste0("dose_", drug)) / dref), + "finite_{drug}" := 1L * (get(paste0("dose_", drug)) != 0) + ) +} +``` + +## Data +We now need individual patient data (IPD) containing: + +* The doses planned to be administered in each of the three cycles, +* The start and end dates of each cycle +* The date of censoring (either at the end of the last cycle due to + end of the observation period, or during the trial due to dropout) +* The date of DLT + +As an example, we create 3-cycle data in long format reflecting the +data used in the BLRM dose escalation case study: + +```{r} +# Create a single patient +example_ipd <- tibble( + cycle_index = cycle_seq, + dose_A = 1, + num_toxicities = 0, + follow_up = as.numeric(cycle_lengths, "days") +) + +# Create long-format individual-patient data (IPD) corresponding to the one +# in the BLRM dose escalation case study +ipd <- bind_rows( + example_ipd |> slice(rep(row_number(), 3)), + example_ipd |> mutate(dose_A = 2.5) |> slice(rep(row_number(), 4)), + example_ipd |> mutate(dose_A = 5) |> slice(rep(row_number(), 5)), + example_ipd |> mutate(dose_A = 10) |> slice(rep(row_number(), 4)), + example_ipd |> mutate(dose_A = 25, num_toxicities = 1) |> slice(rep(1, 2)) +) |> add_std_dose_col(drug = "A", dref = dref_A) + +ipd |> gt() |> gt_format() +``` +We define dosing schedules that we want to compute DLT probabilities for. 
Here, we choose schedules that are constant over all three cycles:

```{r}
# Dosing information per cycle
dose_info_long <- expand_grid(schedule_id = dose_seq, cycle = cycle_seq) |>
  mutate(dose_A = doses[schedule_id])

# Cycle information
cycle_info <- tibble(
  cycle = cycle_seq,
  follow_up = as.numeric(cycle_lengths, "days")
)
```

We also generate a reference dosing data set, which is formatted like an analysis data set but is only set up to evaluate the final model on the predefined set of doses. It will be needed to compute the conditional (or cumulative) DLT probabilities in (up to) cycles 1, 2 and 3:

```{r}
dose_ref_data <- dose_info_long |>
  left_join(cycle_info, by="cycle") |>
  mutate(num_toxicities=0L) |>
  add_std_dose_col(drug = "A", dref = dref_A)
```

## Model description
For each cycle $j$, we model the probability of observing a DLT event conditional on entering cycle $j$ as a function of the dose $d_{j}$ being administered during this cycle.

For each patient $i \in \{1, \ldots, N\}$, we observe either the DLT event time $T_i$ or the censoring time $C_i$, whichever occurs first. We denote this as

$$
(U_i, \delta_i), \textrm{ where } U_i = \min(T_i, C_i) \textrm{ and } \delta_i = 1(T_i \leq C_i).
$$

Here, $U_i$ represents the observed time, and $\delta_i$ indicates whether a DLT occurred ($\delta_i = 1$) or not ($\delta_i = 0$).

The primary focus of the model is the risk for DLT events over the course of a sequence of cycles. Thus, we do not aim to accurately model the risk for an event within each cycle. While one could model the event and censoring times as interval-censored observations, we disregard the interval censoring here and use a continuous-time representation of the time-to-event process. The continuous-time representation can be understood as an approximation of the interval-censored process with time lapsing in full units of a cycle. As a consequence, the observed event times $T_i$ and censoring times $C_i$ are always recorded at the planned cycle completion time-point (time lapses in units of cycles). Here we assume for simplicity that each cycle $j$ has the same duration $\Delta t$, such that any recorded time is a multiple of $\Delta t$, and we denote time $t$ lying within cycle $j$ by $t \in ((j-1) \, \Delta t, j \, \Delta t]$ or, for brevity, $t \in I_j$.

### Model definition
To define our model, we use the following basic definitions from survival analysis. The survival function $S(t)$ and cumulative event probability $F(t)$ are defined as

\begin{align*}
S(t) := & \Pr(\textrm{"No event up to time t"}) = \Pr(T \geq t) = 1 - F(t).
\end{align*}

We now have two quantities that are of particular interest:

1. The cumulative DLT probability up to cycle $j$ given a prescribed
   dosing $\boldsymbol{d} = (d_{1}, d_{2}, d_{3})$ over the three
   cycles: $$\Pr(\textrm{"DLT up to cycle j"}|\boldsymbol{d}) =
   F(t).$$

2. The conditional DLT probability of cycle $j$ given survival up to cycle $j-1$ (i.e., not having observed a DLT event up to the end of cycle $j-1$):
   \begin{align*}
   \, & \Pr(\textrm{"DLT in cycle j"}|\textrm{"No DLT up to cycle j-1"}, d_j) \\
   = \, & \Pr(T \leq t + \Delta t|T > t, d_j)\\
   = \, & 1 - \exp\left(-\int_{t}^{t + \Delta t} h(u) du \right),
   \end{align*}
   where $h(u)$ is the hazard function and $H(t) = \int_{0}^{t} h(u) du$ the cumulative hazard.
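Before specifying how the hazard depends on the dose, here is a minimal numerical illustration of these two quantities for a piecewise-constant hazard; the per-cycle hazard values below are made up purely for illustration.

```{r}
# Illustrative only: made-up constant hazards per 4-week cycle (events per day)
haz <- c(cycle1 = 0.004, cycle2 = 0.003, cycle3 = 0.002)
delta_t <- 28  # days per cycle

# Conditional DLT probability per cycle: 1 - exp(-hazard integrated over the cycle)
cond_dlt <- 1 - exp(-haz * delta_t)
# Cumulative DLT probability up to each cycle: 1 - exp(-cumulative hazard)
cum_dlt <- 1 - exp(-cumsum(haz * delta_t))

round(rbind(conditional = cond_dlt, cumulative = cum_dlt), 3)
```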
Given the drug dose $d_{j}$ administered in cycle $j$, we model the log-hazard $\log(h_j(t|\tilde{d}_{j}))$ as
$$
\log(h_j(t|\tilde{d}_{j})) = \alpha + \beta \log\left(\tilde{d}_{j}\right), \textrm{ where } \tilde{d}_{j} = \frac{d_{j}}{\tilde{d}}
$$
and $\tilde{d}$ is the reference dose. Then (see @sec-onco-tte-appendix for the full derivation), the conditional DLT probability is closely related to the classic two-parameter Bayesian Logistic Regression Model (BLRM), since

\begin{align}
\mathrm{cloglog}\Pr(T \leq t + \Delta t|T > t, d_j) & = \alpha + \beta \log\left(\tilde{d}_{j}\right) + \log(\Delta t)
\end{align}

is the same as for the BLRM in cycle one if we let time elapse in cycles (then $\Delta t = 1$ and $\log(\Delta t)$ is zero), except for the link function changing from $\mathrm{logit}$ to $\mathrm{cloglog}(x) = \log(-\log(1 - x))$.

The cumulative DLT probability can also be derived, where $j_t$ is the index of the cycle that time $t$ lies in:

\begin{align}
\mathrm{cloglog}\Pr(T \leq t|\boldsymbol{d}) & = \log\left(\sum_{j = 1}^{j_t} \exp\left(\alpha + \beta \log\left(\tilde{d}_{j}\right) + \log(\Delta t)\right)\right) = \mathrm{cloglog} F(t|\boldsymbol{d}).
\end{align}

Last, but not least, the likelihood for all patients $i$ is then
$$
L(U|\alpha, \beta) = \prod_i \underbrace{ f(T_i)^{\delta_i} }_\textrm{DLT events} \, \underbrace{S(C_i)^{1 - \delta_i}}_\textrm{Censoring event} \,.
$$
In order to fit these models to time-to-first-event data it is helpful to consider their representation as a Poisson regression. Since $f(t) = h(t) \, S(t)$ and $S(t) = \exp\left(-H(t)\right)$ from basic survival analysis, it follows that the likelihood contribution in the absence of an event is $\exp(-H(t))$, while it is $h(t) \, \exp(-H(t))$ whenever an event occurs.

The Poisson distribution for observing $k$ events,

$$ \Pr(k|\lambda) = \frac{\lambda^k \, \exp(-\lambda)}{k!}, $$

becomes $\exp(-\lambda)$ for no event ($k=0$) and $\lambda \, \exp(-\lambda)$ for an event ($k=1$). Therefore, we can model the data using a Poisson regression approach whenever the hazard is constant over the time units we model, $h(t) = \lambda$. In this case the counts on each time interval are $0$ or $1$. Moreover, we use a $\log$ link function for the counting rate $\lambda$ and model the $\log$ counting rate with a linear function, which ensures positivity of $\lambda$. Importantly, we also add to the linear predictor an offset equal to the $\log$ of the exposure time, $\log(\Delta t)$. The offset multiplies $\lambda$ by $\Delta t$ and thereby turns the constant counting rate $\lambda$ into the cumulative hazard $H(t) = \lambda \, \Delta t$, which yields the desired likelihood.

Note that when using the Poisson approach in `brms` we pass the cumulative hazard $\lambda \, \Delta t$ as the argument for the counting rate, such that the likelihood term for an event becomes $\lambda \, \Delta t \, \exp(-\lambda \, \Delta t)$. This is not quite equal to $f(t)$ as required above, but instead equal to $\Delta t \, f(t)$. However, as $\Delta t$ is known, it is merely a constant and hence irrelevant for sampling the posterior.

### Model prior
For the above model, we must define a prior for the parameters $\alpha$ and $\beta$. The interpretation of these parameters depends on two reference values, the reference *dose* $\tilde{d}$ and the reference *time point* $\tilde{t}$.
The reference dose $\tilde{d}$, with respect to which the dose is normalized, defines the meaning of the intercept parameter $\alpha$. Here we choose to define the meaning of the intercept by considering the reference *time point* $\tilde{t}$ such that for $d=\tilde{d}$ and $t=\tilde{t}$ we have that

$$
\mathrm{cloglog}\Pr(T \leq \tilde{t}|\tilde{d}) = \alpha + \log(\tilde{t}).
$$

Hence, when setting the mean of $\alpha$ to $\mathrm{cloglog}(\tilde{\pi}) - \log(\tilde{t})$ it becomes apparent that the intercept corresponds to an event probability of $\tilde{\pi}$ over the time period up to the reference time point $\tilde{t}$ whenever the dose $\tilde{d}$ is given.

If we are interested in modeling treatment cycles, we might let the time unit elapse in full cycles and set the reference time point to the end of cycle 1 ($\tilde{t} = 1$). Then, the interpretation of the prior is the same as for the classic BLRM, for which the DLT probability over one cycle is modeled as a binomial logistic regression ($\mathrm{logit}(\pi(d)) = \alpha + \beta \, \log(d/\tilde{d})$). The only difference is that the link function is now $\mathrm{cloglog}$ instead of $\mathrm{logit}$.

Given the similarity of the Poisson TTE model to the cycle-1 focused BLRM, we will use the same rationale for the slope parameter $\beta$. The slope parameter $\beta$ can be interpreted as the Hill coefficient of a sigmoid logistic curve. We can hence use mechanistic knowledge and prior *in vivo* and *in vitro* data to motivate its feasible range of values, and condense this prior knowledge into an appropriate prior for the slope. In an efficacy meta-analysis, "modeling showed the Hill parameters were concentrated near 1.0", see

- Thomas *et al.* (2014) [doi:10.1080/19466315.2014.924876](https://doi.org/10.1080/19466315.2014.924876),
- Bayesian Methods in Pharmaceutical Research (2020) [doi:10.1201/9781315180212](https://doi.org/10.1201/9781315180212) and
- Thomas *et al.* (2017) [doi:10.1080/19466315.2016.1256229](https://doi.org/10.1080/19466315.2016.1256229) for details.

Therefore, in the absence of more specific knowledge (e.g. pre-clinical data, historical clinical data for the mechanism of action, etc.) we assume a prior for $\log\left(\beta\right)$ with mean 0. Furthermore, we wish to avoid extremely steep or flat dose-response curves. To this end, we parameterize the 95% central credible interval to allow for a four-fold increase or decrease by setting

$$ \log\left(\beta\right) \sim \N\left(0,
\left(\frac{\log(4)}{1.96}\right)^2\right).$$

As discussed above, the intercept $\alpha$ defines the reference event probability $\tilde{\pi}$ at the reference dose $\tilde{d}$ over the time span up to the reference time $\tilde{t}$. By setting $\tilde{\pi}$ to 20\% we imply that the reference dose $\tilde{d}$ corresponds to our prior guess of the MTD. These considerations define the prior mean for $\alpha$. The standard deviation for the intercept is set to unity. This choice results from an extensive study of the model properties under various data scenarios, which examined how the model guides a dose-escalation trial at an early stage. At an early stage of the trial one requires sufficiently conservative dose recommendations from the safety model, while at the same time one wishes to ensure that the model allows the trial to continue enrolling patients at low doses should a limited number of DLT events be observed in the early cohorts.
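As a small numerical check (a sketch only; the inline `cloglog` helper is defined here just for illustration), the prior mean for $\alpha$ implied by a reference DLT probability of $\tilde{\pi} = 20\%$ over one 4-week cycle, with time measured in days, can be computed directly and reproduces the value used in the implementation further below:

```{r}
# Illustrative check of the prior mean for the intercept alpha:
# cloglog(pi_tilde) - log(t_tilde) with pi_tilde = 20% and t_tilde = 28 days
cloglog <- function(p) log(-log(1 - p))
cloglog(0.2) - log(28)  # approximately -4.83
```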
The rationale behind the prior

$$ \alpha \sim \N\left(\mathrm{cloglog}(\tilde{\pi} = 20\%) - \log\left(\tilde{t}\right), 1^2\right) $$

is that we have some level of confidence that the set of doses studied in the trial is reasonable. That is, the fact that we choose to study a concrete dose range is by itself prior knowledge, driven by a multitude of (pre-clinical & clinical) considerations. This prior choice deliberately avoids that a limited number of DLT events at the start of the trial can entirely undermine our initial understanding of the drug safety profile. Only with sufficient data will the model stop a trial.

For a better intuition of the model prior, it is helpful to study data scenarios (left out here) and to visualize the joint model prior rather than considering the prior on each parameter individually. This is covered in @sec-onco-tte-prior-vis.

## Implementation
For the case of a single-agent trial we can cast the problem into a Poisson regression framework as detailed above and define the `brms` model formula as follows. We use a non-linear model formula syntax here to enforce a positive slope:
```{r}
tte_model <-
  bf(num_toxicities | rate(follow_up) ~ interA + exp(slopeA) * std_A,
     interA ~ 1,
     slopeA ~ 1,
     nl=TRUE, family=poisson()
     )
```
`brms` helps us to get the list of model parameters requiring prior specifications:
```{r}
get_prior(tte_model, data = ipd)
```
We can, for instance, define a prior implying a 20% DLT probability after the first treatment cycle at the reference dose. For more details on the $\beta$ parameter prior, see the bite-size guidance on [go/BLRM](https://go/BLRM).
```{r}
## -4.83 approx cloglog(0.2) - log(7*4) for drug A
tte_prior <-
  prior(normal(-4.83, 1), nlpar=interA, class=b, coef=Intercept) +
  prior(normal(0, log(4)/1.96), nlpar=slopeA)
```

We then create the Stan model code
```{r}
tte_stanmodel <- tte_model |> make_stancode(
  data = ipd, prior = tte_prior
)
```
and data:
```{r}
tte_standata <- tte_model |> make_standata(
  data = ipd, prior = tte_prior
)
```
After inspecting the generated code and data for correctness, we compile the model without yet sampling from it (`chains = 0`):
```{r message=FALSE, warning=FALSE}
tte_brms_model <- tte_model |> brm(
  data = ipd, prior = tte_prior, chains = 0, silent = 2
)
```

### Computing conditional and cumulative DLT probabilities for a given dosing schedule

We define a function that computes the conditional and cumulative DLT probabilities over a given schedule. This function needs to take the hazards for each piecewise-constant interval and compute the respective DLT probabilities.
We write the function following the same logic as the `add_*_rvars` functions from `tidybayes`, such that we can easily complement analysis data sets with the probabilities of interest:

```{r}
add_risk_rvars <- function(newdata, model, time, .by) {
  ## P(T =< t) = 1 - P(T > t) = inv_cloglog(log(H(t)))
  ## <=> P(T =< t) = 1 - P(T > t) = inv_clog(H(t))
  ## inv_cloglog(cll) = 1 - exp(-exp(cll))
  ## => inv_clog(cl) = 1 - exp(-cl)
  inv_clog <- function(cl) { 1 - exp(-cl) }

  ## data must be sorted by time to ensure that the cumulative sum
  ## works correctly below
  time_order <- order(pull(newdata, {{.by}}), pull(newdata, {{time}}))
  ## permutation which restores the original row order at the end
  orig_order <- order(time_order)
  riskdata <- newdata[time_order,]
  H <- rvar(posterior_epred(model,
                            newdata=riskdata,
                            allow_new_levels=TRUE,
                            sample_new_levels="gaussian"))
  riskdata <- mutate(riskdata,
                     prob=inv_clog(cumsum(H[cur_group_rows()])),
                     cprob=inv_clog(H[cur_group_rows()]),
                     .by={{.by}})
  riskdata[orig_order,]
}
```

### Dose escalation decisions
As in the basic dose-escalation chapter, we will apply Escalation With Overdose Control (EWOC). That is, we define a threshold $\pi_{over}$ above which doses are "excessively toxic" (typically $\pi_{over} = 33\%$), and a so-called feasibility bound $c$ (typically $c = 25\%$):

$$ \text{EWOC satisfied at }d \iff \Pr( \pi(d) > \pi_{over}) \leq c $$

Furthermore, we often define another cutoff $\pi_{targ}$, and summarize the posterior probabilities for three intervals,

\begin{align*} \Pr(d\text{ is an underdose}) &= \Pr( \pi(d) < \pi_{targ} ) \\
\Pr(d\text{ is in the target range}) &= \Pr( \pi_{targ} \leq \pi(d) < \pi_{over} ) \\
\Pr(d\text{ is an overdose}) &= \Pr( \pi(d) > \pi_{over} ),
\end{align*}

and evaluate EWOC by checking whether the last quantity exceeds $c$.

For the set of MCMC draws, we can compute the median, mean, sd, 25% and 75% quantiles, the interval probabilities as well as the EWOC criterion using the `summarize_draws` function from the `posterior` package. Especially for the EWOC metric, we would also like to know whether there are any pre-specified doses where the Monte Carlo error could flip the decision on whether a dose is considered safe enough.

We test for this using a Z-test (one-sample location test) for the 75% quantile and its distance from the critical threshold $\pi_{over}$, using the Monte Carlo Standard Error (MCSE) of the 75% quantile (`mcse_q75`). We define the EWOC metric to be sufficiently accurate if the probability of flipping the decision due to insufficient accuracy (and hence a large MCSE) is smaller than 2.5%, on either side of the decision threshold. In turn, we are at least 97.5% certain that the decision is robust with respect to fluctuations implied by MCMC sampling variability.
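Note that with $c = 25\%$ the EWOC condition $\Pr(\pi(d) > \pi_{over}) \leq c$ is (up to ties) equivalent to requiring that the 75% posterior quantile of $\pi(d)$ lies below $\pi_{over}$, which is the form used in the helper function below. A minimal sketch with made-up draws (a Beta sample standing in for posterior draws of the DLT probability at a single dose) illustrates this:

```{r}
set.seed(5678)
# Illustrative only: made-up "posterior draws" of a DLT probability at one dose
pi_draws <- rbeta(4000, shape1 = 2, shape2 = 8)

# EWOC via the overdose probability ...
mean(pi_draws > 0.33) <= 0.25
# ... and, equivalently, via the 75% quantile
quantile(pi_draws, probs = 0.75) < 0.33
```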
For this Z-test, we assume that the test statistic is approximately normally distributed:
```{r}
summarize_probs <- function(draws) {
  summarize_draws(
    draws,
    median, mean, sd,
    ~quantile2(.x, probs = c(0.25, 0.75)),
    ~prop.table(table(cut(.x, breaks = c(0, 0.16, 0.33, 1)))),
    ~mcse_quantile(.x, probs = 0.75),
    default_convergence_measures()
  ) |> mutate(
    ewoc_ok = q75 < 0.33,
    stat = (q75 - .33) / mcse_q75,
    ewoc_accurate = abs(stat) >= qnorm(0.975)
  ) |> select(-variable)
}
```

To inspect the prior, we first sample the model ignoring the data:
```{r, message=FALSE, warning=FALSE}
tte_model_prioronly <- update(tte_brms_model, chains = 4, sample_prior = "only")
```

Now we can compute the conditional and cumulative DLT probabilities implied by the prior,
```{r}
prior_risk <- dose_ref_data |>
  add_risk_rvars(tte_model_prioronly, cycle, schedule_id)
```
and display them as tables, i.e., the conditional DLT probabilities
```{r}
prior_risk |>
  mutate(summarize_probs(cprob)) |>
  select(-num_toxicities, -std_A, -prob, -cprob) |>
  gt() |> gt_format()
```
as well as the cumulative probabilities
```{r}
prior_risk |>
  mutate(summarize_probs(prob)) |>
  select(-num_toxicities, -std_A, -prob, -cprob) |>
  gt() |> gt_format()
```

### Visualizing the model prior {#sec-onco-tte-prior-vis}
We can also visualize the prior in terms of the sampled curves and see whether this distribution makes sense to us. We can see, for example, that the prior we defined above is quite sensible in terms of the dose-response curves it implies:

#### Conditional DLT probability in cycle 1
```{r}
#| code-fold: true
#| code-summary: "Show the code"
add_plot_cols <- function(data) {
  data |> mutate(
    EWOC = factor(ewoc_ok, c(TRUE, FALSE), c("OK", "Not OK")),
    dose_A = factor(dose_A),
    Cycle = cycle
  )
}

doses_dense <- seq(0, 2*dref_A, length.out = 100 + 1)
dose_info_dense <- expand_grid(
  schedule_id = seq_along(doses_dense),
  cycle = cycle_seq
  ) |>
  mutate(dose_A = doses_dense[schedule_id])

dose_ref_data_dense <- dose_info_dense |>
  left_join(cycle_info, by="cycle") |>
  mutate(num_toxicities=0L) |>
  add_std_dose_col(drug = "A", dref = dref_A)

plot_cycle1_dist <- function(model, schedules, doses) {
  plot_data <- schedules |>
    add_risk_rvars(model, cycle, schedule_id) |>
    mutate(summarize_probs(cprob)) |>
    filter(cycle == 1)

  plot_data_long <- plot_data |>
    select(dose_A, cprob) |>
    unnest_rvars()

  plot_data_long |>
    ggplot(aes(dose_A, y = cprob, group = .draw)) +
    geom_line(alpha=0.01) +
    scale_x_continuous(breaks = doses) +
    scale_y_continuous(breaks = seq(0, 1, by = 0.1)) +
    hline_at(0.33, linetype = I(2)) +
    ggtitle(
      "Risk of a DLT in cycle 1"
    ) +
    xlab("Dose [mg]") +
    ylab("P(DLT in cycle | survival to cycle start)") +
    theme(axis.text.x = element_text(angle = 90))
}

plot_cycle1_dist(tte_model_prioronly, dose_ref_data_dense, c(doses, 2*dref_A))
```

If one chooses a very wide prior on a parameter without considering the response scale, this can result in priors that are not biologically justifiable.
For instance, if we chose a much higher standard deviation for the prior of the slope $\beta$, this would lead to a prior that is bimodal in the sense that it implies either flat dose-response curves or step functions at the reference dose (which is easier to overlook when only plotting the point intervals):

```{r, message=FALSE, warning=FALSE}
#| code-fold: true
#| code-summary: "Show the code"
tte_prior_wide <-
  prior(normal(-4.83, 1), nlpar=interA, class=b, coef=Intercept) +
  prior(normal(0, 3), nlpar=slopeA)

tte_brms_model_wide_prioronly <- tte_model |> brm(
  data = ipd, prior = tte_prior_wide, silent = 2, sample_prior = "only"
)

plot_cycle1_dist(tte_brms_model_wide_prioronly, dose_ref_data_dense, c(doses, 2*dref_A))
```

Nevertheless, for the pre-specified doses it is easier to visualize the distribution of the estimated dose-response curves using the `stat_pointinterval` stat from the `ggdist` package:

#### Conditional DLT probability in each of the 3 cycles
```{r}
plot_cprob <- function(model, schedules) {
  schedules |>
    add_risk_rvars(model, cycle, schedule_id) |>
    mutate(summarize_probs(cprob)) |>
    add_plot_cols() |>
    ggplot(aes(dose_A, ydist = cprob, colour = EWOC)) +
    facet_wrap(~Cycle, labeller = label_both) +
    stat_pointinterval(.width = 0.5) +
    scale_y_continuous(breaks = seq(0, 1, by = 0.1)) +
    coord_cartesian(ylim = c(0, 0.5)) +
    hline_at(0.33, linetype = I(2)) +
    ggtitle(
      "Conditional risk for one DLT per cycle",
      "Risk is conditional on survival at the same dose up to cycle start.\nShown is the median (dot) and central 50% CrI (line)."
    ) +
    xlab("Dose [mg]") +
    ylab("P(DLT in cycle | survival to cycle start)")
}

plot_cprob(tte_model_prioronly, dose_ref_data)
```

#### Cumulative DLT probability over 3 cycles
```{r}
plot_prob <- function(model, schedules) {
  schedules |>
    add_risk_rvars(model, cycle, schedule_id) |>
    mutate(summarize_probs(prob)) |>
    add_plot_cols() |>
    ggplot(aes(dose_A, ydist = prob, colour = EWOC)) +
    stat_pointinterval(.width = 0.5) +
    facet_wrap(~Cycle, labeller = label_both) +
    scale_y_continuous(breaks = seq(0, 1, by = 0.1)) +
    coord_cartesian(ylim = c(0, 0.5)) +
    hline_at(0.33, linetype = I(2)) +
    ggtitle(
      "Overall risk for DLT up to cycle",
      "Given same dose over all cycles.\nShown is the median (dot) and central 50% CrI (line)."
    ) +
    xlab("Dose [mg]") +
    ylab("P(DLT up to cycle)")
}

plot_prob(tte_model_prioronly, dose_ref_data)
```

## Examining the model posterior
We now sample the model including the data:
```{r}
tte_model_posterior <- update(tte_brms_model, chains = 4)
```

### Inference for model parameters
It is simple to get summary statistics and graphical displays of the posterior distributions for the model parameters:

```{r}
posterior_summary(tte_model_posterior)
```
```{r}
brms::mcmc_plot(tte_model_posterior, type = "dens", facet_args = list(ncol = 1))
```

### Inference for conditional and cumulative DLT probabilities
The conditional and cumulative DLT probabilities are now higher, as expected from the data:
```{r}
plot_cprob(tte_model_posterior, dose_ref_data)
```

```{r}
plot_prob(tte_model_posterior, dose_ref_data)
```

## Conclusion

`brms` can handle time-to-event modelling for DLTs in an early Oncology dose escalation trial using Poisson regression with an offset for the follow-up, and produce all the posterior summaries necessary for guiding such trials.
There are, of course, *many* possible extensions and complications to this methodology, such as hazards varying by treatment cycle (e.g., monotonically decreasing), or by prior treatment (e.g., a decreasing hazard in case of prior ramp-up dosing), or drug combinations. The latter will be the topic of the exercise for this chapter, where we showcase how to define custom link functions in `brms` to handle the combination case.

## Exercise
Often, investigational cancer drugs are tested on top of a standard-of-care (SoC) treatment. In this exercise, we will explore how we can extend the single-agent time-to-event model to a combination of an investigational drug and a SoC treatment.

In the following, we assume that the SoC treatment also has a hazard rate when present, and that there is no drug-drug interaction, i.e., the hazards are *additive*.

We model the log-hazard $\log\left(h_{B,j}(t|\tilde{d}_{B,j})\right)$ of the SoC treatment (called "drug B" here) in a simplified way, i.e., either as present "at the reference dose" or absent:
$$
\log\left(h_{B,j}(t|\tilde{d}_{B,j})\right) = \begin{cases}
  \alpha_B & \textrm{if } \tilde{d}_{B,j} = 1 \\
  -\infty & \textrm{if } \tilde{d}_{B,j} = 0
\end{cases}
$$
First, we add a second component to the model:

```{r}
## Stan function to make the combo2 non-linear link come to life. To avoid
## issues with negative infinity (log(0)) whenever one of the drugs is
## not present, we have to pass in this information. The arguments are:
## 1. muA: log-hazard contribution of drug A
## 2. muB: log-hazard contribution of drug B
## 3. finiteA: whether drug A is present (indicator 1 = present, 0 = absent)
## 4. finiteB: whether drug B is present (indicator 1 = present, 0 = absent)
combo2_tte_stan_code <- "
// log(h) = log(exp(muA) + exp(muB))
real combo2_tte_log_inv_link(real muA, real muB, int finiteA, int finiteB) {
  real log_h;
  if(finiteA == 1 && finiteB == 1) {
    log_h = log_sum_exp(muA, muB);
  } else if(finiteA == 1 && finiteB != 1) {
    log_h = muA;
  } else if(finiteA != 1 && finiteB == 1) {
    log_h = muB;
  } else if(finiteA != 1 && finiteB != 1) {
    // Need to use negative_infinity() instead of -std::numeric_limits::infinity()
    // to avoid autodiff issues
    log_h = negative_infinity();
  }
  return log_h;
}
"

combo2_tte_stanvar <- stanvar(scode = combo2_tte_stan_code, block = "functions")

## Define the respective R function, which is used for simulating the
## posterior in R. Note that the inputs are given as matrices of draws,
## with rows representing draws and columns corresponding to observation
## rows of the modeling data set for which the posterior is simulated.
combo2_tte_log_inv_link <- function(muA, muB, finiteA, finiteB) {
  N <- ncol(muA) # number of observations
  D <- nrow(muA) # number of draws

  ## flatten the matrices to vectors (column-major ordering)
  muA <- as.vector(muA)
  muB <- as.vector(muB)
  finiteA <- as.vector(finiteA)
  finiteB <- as.vector(finiteB)

  log_muAB <- matrixStats::rowLogSumExps(cbind(muA, muB))

  log_h <- case_when(
    finiteA == 1 & finiteB == 1 ~ log_muAB,
    finiteA == 1 & finiteB != 1 ~ muA,
    finiteA != 1 & finiteB == 1 ~ muB,
    finiteA != 1 & finiteB != 1 ~ rep.int(-Inf, times=D*N)
  )

  ## cast result into draws x observation matrix
  matrix(log_h, D, N)
}
```

Note that the `brms` model corresponding to the combination model described above is quite different from the single-drug case due to the need for a custom non-linear link function:
```{r}
combo2_tte_model <-
  bf(num_toxicities | rate(follow_up) ~ combo2_tte_log_inv_link(muA, muB, finite_A, finite_B),
     nlf(muA ~ interA + exp(slopeA) * std_A),
     slopeA ~ 1,
     interA ~ 1,
     muB ~ 1,
     nl=TRUE, loop=TRUE, family=poisson
     )
```
In addition, we now need the indicators `finite_A` and `finite_B` in order to numerically deal with cases where one or both of the drugs have a dose of zero.

Now for the exercise:

1. Update the `ipd` data to include the SoC as drug B (hint: use drug name "B"
   and a reference dose of 1 for SoC present).
```{r}
combo2_ipd <- ipd |>
  mutate(dose_B = 1) |>
  add_std_dose_col(drug = "B", dref = 1)
```
2. Define a prior for the SoC treatment that corresponds to a DLT probability of 5% in
   the first cycle with a standard deviation of 1.
```{r}
get_prior(combo2_tte_model, data = combo2_ipd)
```

```{r}
## -6.3 approx cloglog(0.05) - log(7*4) for the SoC
combo2_tte_prior <-
  prior(normal(-4.83, 1), nlpar=interA, class=b, coef=Intercept) +
  prior(normal(0, log(4)/1.96), nlpar=slopeA) +
  prior(normal(-6.3, 1), nlpar=muB, coef=Intercept)
```
3. Which dose of the investigational drug would still be sufficiently safe according
   to the standard EWOC criterion when administered on top of the SoC treatment?
```{r}
combo2_tte_brms_model <- combo2_tte_model |> brm(
  data = combo2_ipd, stanvars = combo2_tte_stanvar,
  prior = combo2_tte_prior, silent = 2
)
```

```{r}
combo2_dose_ref_data <- dose_ref_data |>
  mutate(dose_B = 1) |>
  add_std_dose_col(drug = "B", dref = 1)
```

```{r}
combo2_risk <- combo2_dose_ref_data |>
  add_risk_rvars(combo2_tte_brms_model, cycle, schedule_id)
```

```{r}
combo2_risk |>
  mutate(summarize_probs(cprob)) |>
  select(-num_toxicities, -std_A, -prob, -cprob) |>
  gt() |> gt_format()
```

```{r}
combo2_risk |>
  mutate(summarize_probs(prob)) |>
  filter(cycle == 3, ewoc_ok) |>
  select(-num_toxicities, -std_A, -std_B, -prob, -cprob, -finite_A, -finite_B) |>
  gt() |> gt_format()
```
That is, we can see that the highest dose of drug A that is safe according to the EWOC criterion until the end of cycle 3 would be a dose of 10.

## Appendix {#sec-onco-tte-appendix .appendix}

### Model derivation {.appendix}
For each cycle $j$, we model the probability of observing a DLT event conditional on entering cycle $j$ as a function of the dose $d_{j}$ being administered during this cycle.

For each patient $i \in \{1, \ldots, N\}$, we observe either the DLT event time $T_i$ or the censoring time $C_i$, whichever occurs first.
We denote this as + +$$ +(U_i, \delta_i), \textrm{ where } U_i = \min(T_i, C_i) \textrm{ and } \delta_i = 1(T_i \leq C_i). +$$ + +Here, $U_i$ represents the observed time, and $\delta_i$ indicates +whether a DLT occurred ($\delta_i = 1$) or not ($\delta_i = 0$). + +The primary focus of the model is the risk for DLT events over the +course of a sequence of cycles. Thus, we do not aim to model +accurately the risk for an event within each cycle. While one could +model the event and censoring times as interval censored observations, +we disregard here the interval censoring and use a continuous time +representation of the time to event process. The continuous time +representation can be understood as an approximation of the interval +censored process with time lapsing in full units of a cycle. As a +consequence, the observed event time $T_i$ and censoring times $C_i$ +are always recorded at the planned cycle completion time-point (time +lapses in units of cycles). Here we assume for simplicity that each +cycle $j$ has the same duration $\Delta t$ such that any time being +recorded is a multiple of $\Delta t$ and we denote time $t$ being +during a cycle $j$ with $t \in ((j-1) \, \Delta t, j \, \Delta t]$ or +equivalently $t \in I_j$ for brevity. + +### Definitions from Survival Analysis {.appendix} +To define our model, we use the following basic definitions from +survival analysis. The survival function $S(t)$ and cumulative event +probability $F(t)$ are defined as + +\begin{align*} +S(t) := & \Pr(\textrm{"No event up to time t"}) \\ + = & \Pr(T \geq t) \\ + = & 1 - F(t), +\end{align*} + +where $F(t) = \Pr(T < t)$. The respective event densities $s(t)$ and $f(t)$ are +$$ +s(t) = \frac{d}{dt}S(t) + = \frac{d}{dt} \int_t^\infty f(u) du + = \frac{d}{dt}[1 - F(t)] + = -f(t) +$$ +The hazard function $h(t)$ is defined as +$$ +h(t) := \lim_{\Delta t \rightarrow 0}\frac{\Pr(t \leq T < t + \Delta t)}{\Delta t \cdot S(t)} += \lim_{\Delta t \rightarrow 0}\frac{S(t) - S(t + \Delta t)}{\Delta t \cdot S(t)} += \frac{f(t)}{S(t)} = -\frac{s(t)}{S(t)} +$$ + +and the cumulative hazard function $H(t)$ is given by +$$ +H(t) = \int_0^t h(u) du = -\log(S(t)). +$$ + +We now have two quantities that are of particular interest: + +1. The cumulative DLT probability up to cycle $j$ given a prescribed + dosing $\boldsymbol{d} = (d_{1}, d_{2}, d_{3})$ over the three + cycles: $$\Pr(\textrm{"DLT up to cycle j"}|\boldsymbol{d}) = + F(t).$$ + +2. The conditional DLT probability of cycle $j$ given survival up to cycle $j-1$ (i.e., not having observed a DLT event up to the end of cycle $j-1$): + \begin{align*} + \, & \Pr(\textrm{"DLT in cycle j"}|\textrm{"No DLT up to cycle j-1"}, d_j) \\ + = \, & \Pr(T \leq t + \Delta t|T > t, d_j)\\ + = \, & 1 - \exp\left(-\int_{t}^{t + \Delta t} h(u) du \right). + \end{align*} + +In the following, we will also use the complementary log-log link $\mathrm{cloglog}$, as well as its inverse $\mathrm{cloglog}^{-1}:$ + +\begin{align} +\mathrm{cloglog}(x) :=&\ \log(-\log(1 - x)), \textrm{ and} \\ +\mathrm{cloglog}^{-1}(y) =&\ 1 - \exp(-\exp(y)). +\end{align} + +### Model definition {.appendix} +Given the drug dose $d_{j}$ administered in cycle $j$, we model the log-hazard $\log(h_j(t|\tilde{d}_{j}))$ as +$$ +\log(h_j(t|\tilde{d}_{j})) = \alpha + \beta \log\left(\tilde{d}_{j}\right), \textrm{ where } \tilde{d}_{j} = \frac{d_{j}}{\tilde{d}} +$$ +and $\tilde{d}$ is the reference dose. 
The conditional DLT probability over 1 cycle in this model is then + + +\begin{align} +\Pr(T \leq t + \Delta t|T > t, \tilde{d}_{j}) & = 1 - \exp\left(-\int_{t}^{t + \Delta t} h(u) du \right)\\ +& = 1 - \exp\left(-\int_{t}^{t + \Delta t} \exp\left(\alpha + \beta \log\left(\tilde{d}_{j}\right)\right) du \right) \\ +& = 1 - \exp\left(-\Delta t \exp\left(\alpha + \beta \log\left(\tilde{d}_{j}\right)\right) \right) \\ +& = 1 - \exp\left(-\exp\left(\alpha + \beta \log\left(\tilde{d}_{j}\right) + \log(\Delta t)\right) \right). +\end{align} + +This is closely related to the classic two-parameter Bayesian Logistic +Regression Model (BLRM), since the conditional probability of +observing a DLT, + +\begin{align} +\mathrm{cloglog}\Pr(T \leq t + \Delta t|T > t, d_j) & = \alpha + \beta \log\left(\tilde{d}_{j}\right) + \log(\Delta t), +\end{align} + +is the same as for the BLRM in cycle one if we elapse time in cycles +(then $\Delta t = 1$ and the $\log(\Delta t)$ is zero), except for the +link function changing from $\mathrm{logit}$ to $\mathrm{cloglog}$. + +The cumulative DLT probability can also be derived, where $j_t$ is the +index of the cycle that time $t$ lies in: + +\begin{align} +\mathrm{cloglog}\Pr(T \leq t|\boldsymbol{d}) & = \log\left(\sum_{j = 1}^{j_t} \exp\left(\alpha + \beta \log\left(\tilde{d}_{j}\right) + \log(\Delta t)\right)\right) = \mathrm{cloglog} F(t|\boldsymbol{d}). +\end{align} + + +Last, but not least, the likelihood for all patients $i$ is then +$$ +L(U|\alpha, \beta) = \prod_i \underbrace{ f(T_i)^{\delta_i} }_\textrm{DLT + events} \, \underbrace{S(C_i)^{1 - \delta_i}}_\textrm{Survival + events} \,, +$$ +and the log-likelihood +$$ +LL(U|\alpha, \beta) = \underbrace{\sum_i \log\left(f(T_i)\right) \delta_i}_\textrm{DLT + events} + \underbrace{\sum_i \log\left(S(C_i)\right)\left(1 - \delta_i\right)}_\textrm{Survival + events} \,. +$$ diff --git a/src/02e_multiple_imputation.qmd b/src/02e_multiple_imputation.qmd index efd9581..5b1fc6c 100644 --- a/src/02e_multiple_imputation.qmd +++ b/src/02e_multiple_imputation.qmd @@ -3,7 +3,7 @@ author: - Björn Holzhauer - --- -# Multiple imputation +# Multiple imputation {#sec-multiple-imputation} ```{r, include=FALSE} source(here::here("src", "setup.R")) diff --git a/src/02g_longitudinal.qmd b/src/02g_longitudinal.qmd index 92429b5..15859fb 100644 --- a/src/02g_longitudinal.qmd +++ b/src/02g_longitudinal.qmd @@ -3,7 +3,7 @@ author: - Andrew Bean - --- -# Longitudinal data +# Longitudinal data {#sec-longitudinal-data} ```{r setup, echo = FALSE, include = FALSE} source(here::here("src", "setup.R")) diff --git a/src/02h_mmrm.qmd b/src/02h_mmrm.qmd index 4ae3b19..f75ea5b 100644 --- a/src/02h_mmrm.qmd +++ b/src/02h_mmrm.qmd @@ -6,7 +6,7 @@ author: {{< include _macros.qmd >}} -# Bayesian Mixed effects Model for Repeated Measures +# Bayesian Mixed effects Model for Repeated Measures {#sec-mmrm} This case study covers diff --git a/src/02i_time_to_event.qmd b/src/02i_time_to_event.qmd index c832770..04d9ca6 100644 --- a/src/02i_time_to_event.qmd +++ b/src/02i_time_to_event.qmd @@ -4,7 +4,7 @@ author: - Sebastian Weber - --- -# Time-to-event data +# Time-to-event data {#sec-tte-data} In this case study the design of a time-to-event trial is presented. 
The trial evaluates a combination treatment of an active diff --git a/src/02j_network_meta_analysis.qmd b/src/02j_network_meta_analysis.qmd index ab6ae3c..8a92123 100644 --- a/src/02j_network_meta_analysis.qmd +++ b/src/02j_network_meta_analysis.qmd @@ -4,7 +4,7 @@ author: - Andrew Bean - --- -# Network meta-analysis +# Network meta-analysis {#sec-network-meta} This case study covers: diff --git a/src/plot_distributions.R b/src/plot_distributions.R new file mode 100644 index 0000000..b8448dd --- /dev/null +++ b/src/plot_distributions.R @@ -0,0 +1,110 @@ +library(dplyr) +library(tidyr) +library(ggplot2) +# library(patchwork) +# library(latex2exp) +theme_set(bayesplot::theme_default()) + +# additional distributions +dbeta2 <- function(x, mu, nu, ...) { + shape1 <- mu * nu + shape2 <- (1-mu) * nu + dbeta(x, shape1, shape2, ...) +} + +dbeta2_prime <- function(x, mu, nu) { + shape1 <- mu * nu + shape2 <- (1-mu) * nu + x^(shape1-1) * (1 + x)^(-shape1 - shape2) / beta(shape1, shape2) +} + +dbernoulli <- function(x, prob, log = FALSE) { + dbinom(x, 1, prob, log = log) +} + +dcategorical <- function(x, ...) { + probs <- c(...) + probs[x] +} + +dhyper2 <- function(x, size, k, prob, log = FALSE) { + dhyper(x, m = size * prob, n = size * (1 - prob), k = k, log = log) +} + +plot_dist <- function(dist, bounds, pars, xtype = c("c", "d"), + prefix = c("d", "p", "q"), parnames = NULL, + package = NULL, ...) { + xtype <- match.arg(xtype) + prefix <- match.arg(prefix) + pos <- -1 + if (!is.null(package)) { + pos <- asNamespace(package) + } + dist_fun <- get(paste0(prefix, dist), pos = pos, mode = "function") + if (xtype == "c") { + # continuous + df <- data.frame(x = seq(bounds[1], bounds[2], 0.001)) + } else if (xtype == "d") { + # discrete + df <- data.frame(x = bounds[1]:bounds[2]) + } + if (!is.null(parnames)) { + parnames <- paste0(parnames, " = ") + } + cnames <- rep(NA, length(pars)) + for (i in seq_along(pars)) { + tmp <- do.call(dist_fun, c(list(df$x), pars[[i]], list(...))) + cnames[i] <- paste0("$", parnames, pars[[i]], "$", collapse = ", ") + df[paste0(parnames, pars[[i]], collapse = ", ")] <- tmp + } + df <- df %>% + gather("pars", "dens", -x) %>% + mutate(pars = factor(pars, unique(pars))) + + gg <- ggplot(df, aes(x, dens, color = pars)) + if (xtype == "c") { + gg <- gg + geom_line(size = 1) + } else if (xtype == "d") { + gg <- gg + + geom_linerange(aes(ymin=0, ymax=dens), size = 1) + + geom_line(size = 0.8, linetype = "dotted", alpha = 0.8) + } + gg <- gg + + scale_color_viridis_d(labels = unname(latex2exp::TeX(cnames))) + + labs(x = "x", y = "", color = "") + + theme( + axis.text.y = element_blank(), + axis.ticks.y = element_blank(), + axis.line.y = element_blank(), + legend.position = "bottom", + legend.text = element_text(size = 10) + ) + if (prefix == "p") { + gg <- gg + + scale_y_continuous(breaks = c(0, 0.5, 1)) + + theme(axis.ticks.y = element_line(), + axis.text.y = element_text(), + axis.line.y = element_line()) + } else if (prefix == "q") { + gg <- gg + + scale_y_continuous() + + theme(axis.ticks.y = element_line(), + axis.text.y = element_text(), + axis.line.y = element_line()) + } + gg +} + +histogram <- function(x, bins = 30) { + xname <- deparse(substitute(x)) + xexpr <- eval(parse(text = paste0("expression(", xname, ")"))) + ggplot(data.frame(x), aes(x, y = ..density..)) + + geom_histogram(fill = "lightgrey", color = "black", bins = bins) + + xlab(xexpr) + + theme( + axis.text.y = element_blank(), + axis.ticks.y = element_blank(), + axis.line.y = element_blank(), + axis.title.y 
= element_blank() + ) +} diff --git a/src/references.bib b/src/references.bib index 8b28583..6ca9725 100644 --- a/src/references.bib +++ b/src/references.bib @@ -676,3 +676,145 @@ @article{allison2012equip } @Comment{jabref-meta: databaseType:bibtex;} + +@article{Gelman2017, + abstract = {A key sticking point of Bayesian analysis is the choice of prior distribution, and there is a vast literature on potential defaults including uniform priors, Jeffreys' priors, reference priors, maximum entropy priors, and weakly informative priors. These methods, however, often manifest a key conceptual tension in prior modeling: a model encoding true prior information should be chosen without reference to the model of the measurement process, but almost all common prior modeling techniques are implicitly motivated by a reference likelihood. In this paper we resolve this apparent paradox by placing the choice of prior into the context of the entire Bayesian analysis, from inference to prediction to model evaluation.}, + author = {Andrew Gelman and Daniel Simpson and Michael Betancourt}, + doi = {10.3390/e19100555}, + issn = {10994300}, + issue = {10}, + journal = {Entropy}, + keywords = {Bayesian inference,Default priors,Prior distribution}, + month = {10}, + publisher = {MDPI AG}, + title = {The prior can often only be understood in the context of the likelihood}, + volume = {19}, + year = {2017}, +} + +@misc{Neuenschwander2020, + author = {Beat Neuenschwander and Heinz Schmidli}, + isbn = {9781315180212}, + title = {Use of Historical Data}, + url = {https://CRAN.R-project.org/package=RBesT}, + year = {2020}, +} + +@article{Lilienthal2023, + abstract = {In Bayesian random‐effects meta‐analysis, the use of weakly informative prior distributions is of particular benefit in cases where only a few studies are included, a situation often encountered in health technology assessment (HTA). Suggestions for empirical prior distributions are available in the literature but it is unknown whether these are adequate in the context of HTA. Therefore, a database of all relevant meta‐analyses conducted by the Institute for Quality and Efficiency in Health Care (IQWiG, Germany) was constructed to derive empirical prior distributions for the heterogeneity parameter suitable for HTA. Previously, an extension to the normal‐normal hierarchical model had been suggested for this purpose. For different effect measures, this extended model was applied on the database to conservatively derive a prior distribution for the heterogeneity parameter. Comparison of a Bayesian approach using the derived priors with IQWiG's current standard approach for evidence synthesis shows favorable properties. 
Therefore, these prior distributions are recommended for future meta‐analyses in HTA settings and could be embedded into the IQWiG evidence synthesis approach in the case of very few studies.}, + author = {Jona Lilienthal and Sibylle Sturtz and Christoph Schürmann and Matthias Maiworm and Christian Röver and Tim Friede and Ralf Bender}, + doi = {10.1002/jrsm.1685}, + issn = {1759-2879}, + journal = {Research Synthesis Methods}, + month = {12}, + publisher = {Wiley}, + title = {Bayesian random‐effects meta‐analysis with empirical heterogeneity priors for application in health technology assessment with very few studies}, + year = {2023}, +} + +@book{Gelman2014, + abstract = {Incorporating new and updated information, this second edition of THE bestselling text in Bayesian data analysis continues to emphasize practice over theory, describing how to conceptualize, perform, and critique statistical analyses from a Bayesian perspective. Its world-class authors provide guidance on all aspects of Bayesian data analysis and include examples of real statistical analyses, based on their own research, that demonstrate how to solve complicated problems. Changes in the new edition include: Stronger focus on MCMCRevision of the computational advice in Part IIINew chapters on nonlinear models and decision analysisSeveral additional applied examples from the authors' recent researchAdditional chapters on current models for Bayesian data analysis such as nonlinear models, generalized linear mixed models, and moreReorganization of chapters 6 and 7 on model checking and data collectionBayesian computation is currently at a stage where there are many reasonable ways to compute any given posterior distribution. However, the best approach is not always clear ahead of time. Reflecting this, the new edition offers a more pluralistic presentation, giving advice on performing computations from many perspectives while making clear the importance of being aware that there are different ways to implement any given iterative simulation computation. The new approach, additional examples, and updated information make Bayesian Data Analysis an excellent introductory text and a reference that working scientists will use throughout their professional life.}, + author = {Andrew Gelman and John B Carlin and Hal S Stern and David B Dunson and Aki Vehtari and Donald B Rubin}, + doi = {10.1007/s13398-014-0173-7.2}, + edition = {Third edit}, + isbn = {978-1-4398-9820-8}, + issn = {1467-9280}, + journal = {Chapman Texts in Statistical Science Series}, + pages = {696}, + pmid = {25052830}, + title = {Bayesian Data Analysis}, + year = {2014}, +} + +@article{Turner2015, + abstract = {Numerous meta-analyses in healthcare research combine results from only a small number of studies, for which the variance representing between-study heterogeneity is estimated imprecisely. A Bayesian approach to estimation allows external evidence on the expected magnitude of heterogeneity to be incorporated. The aim of this paper is to provide tools that improve the accessibility of Bayesian meta-analysis. We present two methods for implementing Bayesian meta-analysis, using numerical integration and importance sampling techniques. Based on 14,886 binary outcome meta-analyses in the Cochrane Database of Systematic Reviews, we derive a novel set of predictive distributions for the degree of heterogeneity expected in 80 settings depending on the outcomes assessed and comparisons made. 
These can be used as prior distributions for heterogeneity in future meta-analyses. The two methods are implemented in R, for which code is provided. Both methods produce equivalent results to standard but more complex Markov chain Monte Carlo approaches. The priors are derived as log-normal distributions for the between-study variance, applicable to meta-analyses of binary outcomes on the log odds-ratio scale. The methods are applied to two example meta-analyses, incorporating the relevant predictive distributions as prior distributions for between-study heterogeneity. We have provided resources to facilitate Bayesian meta-analysis, in a form accessible to applied researchers, which allow relevant prior information on the degree of heterogeneity to be incorporated.}, + author = {Rebecca M Turner and Dan Jackson and Yinghui Wei and Simon G Thompson and Julian P T Higgins}, + doi = {10.1002/sim.6381}, + isbn = {0277-6715}, + issn = {10970258}, + issue = {6}, + journal = {Statistics in Medicine}, + keywords = {Bayesian methods,Heterogeneity,Meta-analysis,Prior distributions}, + pages = {984-998}, + pmid = {25475839}, + title = {Predictive distributions for between-study heterogeneity and simple methods for their application in Bayesian meta-analysis}, + volume = {34}, + url = {https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4383649/pdf/sim0034-0984.pdf}, + year = {2015}, +} + +@online{stanwikiPrior, + author = {Stan}, + title = {Prior Choice Recommendations}, + year = 2024, + url = {https://github.com/stan-dev/stan/wiki/Prior-Choice-Recommendations}, + urldate = {2024-04-22} +} + +@article{Simpson2014, + abstract = {In this paper, we introduce a new concept for constructing prior distributions. We exploit the natural nested structure inherent to many model components, which defines the model component to be a flexible extension of a base model. Proper priors are defined to penalise the complexity induced by deviating from the simpler base model and are formulated after the input of a user-defined scaling parameter for that model component, both in the univariate and the multivariate case. These priors are invariant to reparameterisations, have a natural connection to Jeffreys' priors, are designed to support Occam's razor and seem to have excellent robustness properties, all which are highly desirable and allow us to use this approach to define default prior distributions. Through examples and theoretical results, we demonstrate the appropriateness of this approach and how it can be applied in various situations.}, + author = {Daniel P. Simpson and Håvard Rue and Thiago G. Martins and Andrea Riebler and Sigrunn H. Sørbye}, + doi = {10.1214/16-STS576}, + isbn = {0012-9658}, + issn = {0883-4237}, + issue = {1}, + journal = {Statistical Science}, + keywords = {Bayesian theory,disease mapping,hierarchical models,information geometry,interpretable prior distributions,prior on correlation matrices}, + month = {2}, + pages = {1-28}, + pmid = {17645027}, + publisher = {Institute of Mathematical Statistics}, + title = {Penalising model component complexity: A principled, practical approach to constructing priors}, + volume = {32}, + url = {http://projecteuclid.org/euclid.ss/1491465621 http://arxiv.org/abs/1403.4630}, + year = {2014}, +} + +@article{Piironen2017, + abstract = {The horseshoe prior has proven to be a noteworthy alternative for sparse Bayesian estimation, but has previously suffered from two problems. 
First, there has been no systematic way of specifying a prior for the global shrinkage hyperparameter based on the prior information about the degree of sparsity in the parameter vector. Second, the horseshoe prior has the undesired property that there is no possibility of specifying separately information about sparsity and the amount of regularization for the largest coefficients, which can be problematic with weakly identified parameters, such as the logistic regression coefficients in the case of data separation. This paper proposes solutions to both of these problems. We introduce a concept of effective number of nonzero parameters, show an intuitive way of formulating the prior for the global hyperparameter based on the sparsity assumptions, and argue that the previous default choices are dubious based on their tendency to favor solutions with more unshrunk parameters than we typically expect a priori. Moreover, we introduce a generalization to the horseshoe prior, called the regularized horseshoe, that allows us to specify a minimum level of regularization to the largest values. We show that the new prior can be considered as the continuous counterpart of the spike-and-slab prior with a finite slab width, whereas the original horseshoe resembles the spike-and-slab with an infinitely wide slab. Numerical experiments on synthetic and real world data illustrate the benefit of both of these theoretical advances.}, + author = {Juho Piironen and Aki Vehtari}, + doi = {10.1214/17-EJS1337SI}, + issn = {1935-7524}, + issue = {2}, + journal = {Electronic Journal of Statistics}, + keywords = {Bayesian inference,Horseshoe prior,Shrinkage priors,Sparse estimation}, + month = {1}, + pages = {5018-5051}, + title = {Sparsity information and regularization in the horseshoe and other shrinkage priors}, + volume = {11}, + url = {https://projecteuclid.org/journals/electronic-journal-of-statistics/volume-11/issue-2/Sparsity-information-and-regularization-in-the-horseshoe-and-other-shrinkage/10.1214/17-EJS1337SI.full}, + year = {2017}, +} + +@article{Zhang2022, + abstract = {Prior distributions for high-dimensional linear regression require specifying a joint distribution for the unobserved regression coefficients, which is inherently difficult. We instead propose a new class of shrinkage priors for linear regression via specifying a prior first on the model fit, in particular, the coefficient of determination, and then distributing through to the coefficients in a novel way. The proposed method compares favorably to previous approaches in terms of both concentration around the origin and tail behavior, which leads to improved performance both in posterior contraction and in empirical performance. The limiting behavior of the proposed prior is (Formula presented.), both around the origin and in the tails. This behavior is optimal in the sense that it simultaneously lies on the boundary of being an improper prior both in the tails and around the origin. None of the existing shrinkage priors obtain this behavior in both regions simultaneously. We also demonstrate that our proposed prior leads to the same near-minimax posterior contraction rate as the spike-and-slab prior. Supplementary materials for this article are available online.}, + author = {Yan Dora Zhang and Brian P. Naughton and Howard D. Bondell and Brian J. 
Reich}, + doi = {10.1080/01621459.2020.1825449}, + issn = {1537274X}, + issue = {538}, + journal = {Journal of the American Statistical Association}, + keywords = {Beta-prime distribution,Coefficient of determination,Global-local shrinkage,High-dimensional regression}, + pages = {862-874}, + publisher = {American Statistical Association}, + title = {Bayesian Regression Using a Prior on the Model Fit: The R2-D2 Shrinkage Prior}, + volume = {117}, + year = {2022}, +} + +@article{Rover2021, + abstract = {The normal-normal hierarchical model (NNHM) constitutes a simple and widely used framework for meta-analysis. In the common case of only few studies contributing to the meta-analysis, standard approaches to inference tend to perform poorly, and Bayesian meta-analysis has been suggested as a potential solution. The Bayesian approach, however, requires the sensible specification of prior distributions. While noninformative priors are commonly used for the overall mean effect, the use of weakly informative priors has been suggested for the heterogeneity parameter, in particular in the setting of (very) few studies. To date, however, a consensus on how to generally specify a weakly informative heterogeneity prior is lacking. Here we investigate the problem more closely and provide some guidance on prior specification.}, + author = {Christian Röver and Ralf Bender and Sofia Dias and Christopher H. Schmid and Heinz Schmidli and Sibylle Sturtz and Sebastian Weber and Tim Friede}, + doi = {10.1002/jrsm.1475}, + issn = {17592887}, + issue = {4}, + journal = {Research Synthesis Methods}, + keywords = {Bayes factor,GLMM,hierarchical model,marginal likelihood,variance component}, + month = {1}, + pages = {448-474}, + pmid = {33486828}, + publisher = {John Wiley & Sons, Ltd}, + title = {On weakly informative prior distributions for the heterogeneity parameter in Bayesian random-effects meta-analysis}, + volume = {12}, + url = {https://onlinelibrary.wiley.com/doi/10.1002/jrsm.1475}, + year = {2021}, +}