density, cdf, quantiles for weighted rvar

stan-dev · Jan 19, 2024 · b0a778a · b0a778a
1 parent 534d7fb
commit b0a778a
Show file tree

Hide file tree

Showing 3 changed files with 176 additions and 19 deletions.
diff --git a/R/rvar-dist.R b/R/rvar-dist.R
@@ -40,8 +40,9 @@
 #' @name rvar-dist
 #' @export
 density.rvar <- function(x, at, ...) {
+  weights <- weights(x)
   summarise_rvar_by_element(x, function(draws) {
-    d <- density(draws, cut = 0, ...)
+    d <- density(draws, weights = weights, cut = 0, ...)
     f <- approxfun(d$x, d$y, yleft = 0, yright = 0)
     f(at)
   })
@@ -50,11 +51,12 @@ density.rvar <- function(x, at, ...) {
 #' @rdname rvar-dist
 #' @export
 density.rvar_factor <- function(x, at, ...) {
+  weights <- weights(x)
   at <- as.numeric(factor(at, levels = levels(x)))
-  nbins <- nlevels(x)
 
   summarise_rvar_by_element(x, function(draws) {
-    props <- prop.table(tabulate(draws, nbins = nbins))[at]
+    tab <- weighted_simple_table(draws, weights)
+    props <- prop.table(tab$count)[at]
     props
   })
 }
@@ -66,8 +68,9 @@ distributional::cdf
 #' @rdname rvar-dist
 #' @export
 cdf.rvar <- function(x, q, ...) {
+  weights <- weights(x)
   summarise_rvar_by_element(x, function(draws) {
-    ecdf(draws)(q)
+    weighted_ecdf(draws, weights)(q)
   })
 }
 
@@ -76,7 +79,7 @@ cdf.rvar <- function(x, q, ...) {
 cdf.rvar_factor <- function(x, q, ...) {
   # CDF is not defined for unordered distributions
   # generate an all-NA array of the appropriate shape
-  out <- rep_len(NA, length(x) * length(q))
+  out <- rep_len(NA_real_, length(x) * length(q))
   if (length(x) > 1) dim(out) <- c(length(q), dim(x))
   out
 }
@@ -91,14 +94,10 @@ cdf.rvar_ordered <- function(x, q, ...) {
 #' @rdname rvar-dist
 #' @export
 quantile.rvar <- function(x, probs, ...) {
-  summarise_rvar_by_element_via_matrix(x,
-    "quantile",
-    function(draws) {
-      t(matrixStats::colQuantiles(draws, probs = probs, useNames = TRUE, ...))
-    },
-    .extra_dim = length(probs),
-    .extra_dimnames = list(NULL)
-  )
+  weights <- weights(x)
+  summarise_rvar_by_element(x, function(draws) {
+    weighted_quantile(draws, probs = probs, weights = weights, ...)
+  })
 }
 
 #' @rdname rvar-dist

diff --git a/R/weighted.R b/R/weighted.R
@@ -0,0 +1,118 @@
+# weighted distribution functions --------------------------------------------
+
+#' Weighted version of [stats::ecdf()].
+#' Based on ggdist::weighted_ecdf().
+#' @noRd
+weighted_ecdf = function(x, weights = NULL) {
+  n = length(x)
+  if (n < 1) stop("Need at least 1 or more values to calculate an ECDF")
+
+  weights = if (is.null(weights)) rep(1, n) else weights
+
+  #sort only if necessary
+  if (is.unsorted(x)) {
+    sort_order = order(x)
+    x = x[sort_order]
+    weights = weights[sort_order]
+  }
+
+  # calculate weighted cumulative probabilities
+  p = cumsum(weights)
+  p = p/p[n]
+
+  approxfun(x, p, yleft = 0, yright = 1, ties = "ordered", method = "constant")
+}
+
+#' Weighted version of [stats::quantile()].
+#' Based on ggdist::weighted_quantile().
+#' @noRd
+weighted_quantile = function(x,
+  probs = seq(0, 1, 0.25),
+  weights = NULL,
+  na.rm = FALSE,
+  type = 7
+) {
+  weighted_quantile_fun(
+    x,
+    weights = weights,
+    na.rm = na.rm,
+    type = type
+  )(probs)
+}
+
+#' @rdname weighted_quantile
+#' @export
+weighted_quantile_fun = function(x, weights = NULL, na.rm = FALSE, type = 7) {
+  na.rm <- as_one_logical(na.rm)
+  if (!isTRUE(type %in% 1:9)) {
+    stop0("Quantile type `", deparse0(type), "` is invalid. It must be in 1:9.")
+  }
+
+  if (na.rm) {
+    keep = !is.na(x) & !is.na(weights)
+    x = x[keep]
+    weights = weights[keep]
+  }
+
+  # determine weights
+  weights = weights %||% rep(1, length(x))
+  non_zero = weights != 0
+  x = x[non_zero]
+  weights = weights[non_zero]
+  weights = weights / sum(weights)
+
+  # if there is only 0 or 1 x values, we don't need the weighted version (and
+  # we couldn't calculate it anyway as we need > 2 points for the interpolation)
+  if (length(x) <= 1) {
+    return(function(p) quantile(x, p, names = FALSE))
+  }
+
+  # sort values if necessary
+  if (is.unsorted(x)) {
+    x_order = order(x)
+    x = x[x_order]
+    weights = weights[x_order]
+  }
+
+  # calculate the weighted CDF
+  F_k = cumsum(weights)
+
+  # generate the function for the approximate inverse CDF
+  if (1 <= type && type <= 3) {
+    # discontinuous quantiles
+    switch(type,
+      # type 1
+      stepfun(F_k, c(x, x[length(x)]), right = TRUE),
+      # type 2
+      {
+        x_over_2 = c(x, x[length(x)])/2
+        inverse_cdf_type2_left = stepfun(F_k, x_over_2, right = FALSE)
+        inverse_cdf_type2_right = stepfun(F_k, x_over_2, right = TRUE)
+        function(x) inverse_cdf_type2_left(x) + inverse_cdf_type2_right(x)
+      },
+      # type 3
+      stepfun(F_k - weights/2, c(x[[1]], x), right = TRUE)
+    )
+  } else {
+    # Continuous quantiles. These are based on the definition of p_k as described
+    # in the documentation of `quantile()`. The trick to re-writing those formulas
+    # (which use `n` and `k`) for the weighted case is that `k` = `F_k * n` and
+    # `1/n` = `weight_k`. Using these two facts, we can express the formulas for
+    # `p_k` without using `n` or `k`, which don't really apply in the weighted case.
+    p_k = switch(type - 3,
+      # type 4
+      F_k,
+      # type 5
+      F_k - weights/2,
+      # type 6
+      F_k / (1 + weights),
+      # type 7
+      (F_k - weights) / (1 - weights),
+      # type 8
+      (F_k - weights/3) / (1 + weights/3),
+      # type 9
+      (F_k - weights*3/8) / (1 + weights/4)
+    )
+    approxfun(p_k, x, rule = 2, ties = "ordered")
+  }
+}
diff --git a/tests/testthat/test-rvar-dist.R b/tests/testthat/test-rvar-dist.R
@@ -33,7 +33,7 @@ test_that("distributional functions work on an rvar array", {
   q21 <- quantile(4:6, p)
   q12 <- quantile(7:9, p)
   q22 <- quantile(10:12, p)
-  x_quantiles <- array(c(q11, q21, q12, q22), dim = c(9, 2, 2), dimnames = list(NULL))
+  x_quantiles <- array(c(q11, q21, q12, q22), dim = c(9, 2, 2))
   expect_equal(quantile(x, p), x_quantiles)
 })
 
@@ -43,14 +43,14 @@ test_that("distributional functions work on an rvar_factor", {
   x <- rvar_factor(x_letters, levels = letters[1:5])
   x2 <- c(rvar_factor(letters), rvar_factor(letters))
 
-  expect_equal(density(x, letters[1:6]), c(0, .3, .2, .4, .1, NA))
+  expect_equal(density(x, letters[1:6]), c(0, .3, .2, .4, .1, NA_real_))
   expect_equal(density(x2, letters[1:3]), array(rep(1/26, 6), dim = c(3,2)))
 
-  expect_equal(cdf(x, letters[1:5]), c(NA, NA, NA, NA, NA))
-  expect_equal(cdf(x2, letters[1:3]), array(rep(NA, 6), dim = c(3,2)))
+  expect_equal(cdf(x, letters[1:5]), c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_))
+  expect_equal(cdf(x2, letters[1:3]), array(rep(NA_real_, 6), dim = c(3,2)))
 
-  expect_equal(quantile(x, 1:4/4), c(NA, NA, NA, NA))
-  expect_equal(quantile(x2, 1:3/3), array(rep(NA, 6), dim = c(3,2)))
+  expect_equal(quantile(x, 1:4/4), c(NA_real_, NA_real_, NA_real_, NA_real_))
+  expect_equal(quantile(x2, 1:3/3), array(rep(NA_real_, 6), dim = c(3,2)))
 })
 
 test_that("distributional functions work on an rvar_ordered", {
@@ -64,3 +64,43 @@ test_that("distributional functions work on an rvar_ordered", {
 
   expect_equal(quantile(x, c(.3, .5, .9, 1)), letters[2:5])
 })
+
+# weighted rvar -----------------------------------------------------------
+
+test_that("weighted rvar works", {
+  x1_draws = qnorm(ppoints(10))
+  x2_draws = qnorm(ppoints(10), 5)
+  w1 = rep(1, 10)
+  w2 = rep(2, 10)
+  w3 = rep(0, 10)
+  x = rvar(c(x1_draws, x2_draws, rep(10, 10)), log_weights = log(c(w1, w2, w3)))
+
+  expect_equal(
+    density(x, 0:9, bw = 2.25),
+    density(draws_of(x), weights = weights(x), bw = 2.25, from = 0, to = 9, n = 10)$y,
+    tolerance = 1e-4
+  )
+  expect_equal(cdf(x, 0:9), ecdf(x1_draws)(0:9)/3 + ecdf(x2_draws)(0:9)*2/3)
+  expect_equal(quantile(x, cdf(x, c(x1_draws, x2_draws)), type = 1), c(x1_draws, x2_draws))
+  expect_equal(quantile(x, cdf(x, c(x1_draws, x2_draws)), type = 4), c(x1_draws, x2_draws))
+})
+
+test_that("weighted rvar_factor works", {
+  x = rvar_factor(c("b", "g", "f", "g"), levels = letters, log_weights = log(c(1/2, 1/6, 1/6, 1/6)))
+
+  expect_equal(density(x, letters), c(0, 1/2, 0, 0, 0, 1/6, 1/3, rep(0, 19)))
+  expect_equal(cdf(x, letters), rep(NA_real_, 26))
+  expect_equal(quantile(x, c(0.2, 0.8)), rep(NA_real_, 2))
+})
+
+test_that("weighted rvar_ordered works", {
+  x = rvar_ordered(c("b", "g", "f", "g"), levels = letters, log_weights = log(c(1/2, 1/6, 1/6, 1/6)))
+
+  expect_equal(density(x, letters), c(0, 1/2, 0, 0, 0, 1/6, 1/3, rep(0, 19)))
+  expect_equal(cdf(x, letters), cumsum(c(0, 1/2, 0, 0, 0, 1/6, 1/3, rep(0, 19))))
+  expect_equal(quantile(x, c(0.2, 0.6, 0.8)), c("b", "f", "g"))
+
+  xl = weight_draws(rvar_ordered(letters), 1:26)
+  expect_equal(quantile(xl, cdf(xl, letters) - .Machine$double.eps), letters)
+})
+