Skip to content

Commit

Permalink
density, cdf, quantiles for weighted rvar
Browse files Browse the repository at this point in the history
  • Loading branch information
mjskay committed Jan 19, 2024
1 parent 534d7fb commit b0a778a
Show file tree
Hide file tree
Showing 3 changed files with 176 additions and 19 deletions.
25 changes: 12 additions & 13 deletions R/rvar-dist.R
Original file line number Diff line number Diff line change
Expand Up @@ -40,8 +40,9 @@
#' @name rvar-dist
#' @export
density.rvar <- function(x, at, ...) {
weights <- weights(x)
summarise_rvar_by_element(x, function(draws) {
d <- density(draws, cut = 0, ...)
d <- density(draws, weights = weights, cut = 0, ...)
f <- approxfun(d$x, d$y, yleft = 0, yright = 0)
f(at)
})
Expand All @@ -50,11 +51,12 @@ density.rvar <- function(x, at, ...) {
#' @rdname rvar-dist
#' @export
density.rvar_factor <- function(x, at, ...) {
weights <- weights(x)
at <- as.numeric(factor(at, levels = levels(x)))
nbins <- nlevels(x)

summarise_rvar_by_element(x, function(draws) {
props <- prop.table(tabulate(draws, nbins = nbins))[at]
tab <- weighted_simple_table(draws, weights)
props <- prop.table(tab$count)[at]
props
})
}
Expand All @@ -66,8 +68,9 @@ distributional::cdf
#' @rdname rvar-dist
#' @export
cdf.rvar <- function(x, q, ...) {
weights <- weights(x)
summarise_rvar_by_element(x, function(draws) {
ecdf(draws)(q)
weighted_ecdf(draws, weights)(q)
})
}

Expand All @@ -76,7 +79,7 @@ cdf.rvar <- function(x, q, ...) {
cdf.rvar_factor <- function(x, q, ...) {
# CDF is not defined for unordered distributions
# generate an all-NA array of the appropriate shape
out <- rep_len(NA, length(x) * length(q))
out <- rep_len(NA_real_, length(x) * length(q))
if (length(x) > 1) dim(out) <- c(length(q), dim(x))
out
}
Expand All @@ -91,14 +94,10 @@ cdf.rvar_ordered <- function(x, q, ...) {
#' @rdname rvar-dist
#' @export
quantile.rvar <- function(x, probs, ...) {
summarise_rvar_by_element_via_matrix(x,
"quantile",
function(draws) {
t(matrixStats::colQuantiles(draws, probs = probs, useNames = TRUE, ...))
},
.extra_dim = length(probs),
.extra_dimnames = list(NULL)
)
weights <- weights(x)
summarise_rvar_by_element(x, function(draws) {
weighted_quantile(draws, probs = probs, weights = weights, ...)
})
}

#' @rdname rvar-dist
Expand Down
118 changes: 118 additions & 0 deletions R/weighted.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
# weighted distribution functions --------------------------------------------

#' Weighted version of [stats::ecdf()].
#' Based on ggdist::weighted_ecdf().
#' @noRd
weighted_ecdf = function(x, weights = NULL) {
n = length(x)
if (n < 1) stop("Need at least 1 or more values to calculate an ECDF")

weights = if (is.null(weights)) rep(1, n) else weights

#sort only if necessary
if (is.unsorted(x)) {
sort_order = order(x)
x = x[sort_order]
weights = weights[sort_order]
}

# calculate weighted cumulative probabilities
p = cumsum(weights)
p = p/p[n]

approxfun(x, p, yleft = 0, yright = 1, ties = "ordered", method = "constant")
}

#' Weighted version of [stats::quantile()].
#' Based on ggdist::weighted_quantile().
#' @noRd
weighted_quantile = function(x,
probs = seq(0, 1, 0.25),
weights = NULL,
na.rm = FALSE,
type = 7
) {
weighted_quantile_fun(
x,
weights = weights,
na.rm = na.rm,
type = type
)(probs)
}

#' @rdname weighted_quantile
#' @export
weighted_quantile_fun = function(x, weights = NULL, na.rm = FALSE, type = 7) {
na.rm <- as_one_logical(na.rm)
if (!isTRUE(type %in% 1:9)) {
stop0("Quantile type `", deparse0(type), "` is invalid. It must be in 1:9.")
}

if (na.rm) {
keep = !is.na(x) & !is.na(weights)
x = x[keep]
weights = weights[keep]
}

# determine weights
weights = weights %||% rep(1, length(x))
non_zero = weights != 0
x = x[non_zero]
weights = weights[non_zero]
weights = weights / sum(weights)

# if there is only 0 or 1 x values, we don't need the weighted version (and
# we couldn't calculate it anyway as we need > 2 points for the interpolation)
if (length(x) <= 1) {
return(function(p) quantile(x, p, names = FALSE))
}

# sort values if necessary
if (is.unsorted(x)) {
x_order = order(x)
x = x[x_order]
weights = weights[x_order]
}

# calculate the weighted CDF
F_k = cumsum(weights)

# generate the function for the approximate inverse CDF
if (1 <= type && type <= 3) {
# discontinuous quantiles
switch(type,
# type 1
stepfun(F_k, c(x, x[length(x)]), right = TRUE),
# type 2
{
x_over_2 = c(x, x[length(x)])/2
inverse_cdf_type2_left = stepfun(F_k, x_over_2, right = FALSE)
inverse_cdf_type2_right = stepfun(F_k, x_over_2, right = TRUE)
function(x) inverse_cdf_type2_left(x) + inverse_cdf_type2_right(x)
},
# type 3
stepfun(F_k - weights/2, c(x[[1]], x), right = TRUE)
)
} else {
# Continuous quantiles. These are based on the definition of p_k as described
# in the documentation of `quantile()`. The trick to re-writing those formulas
# (which use `n` and `k`) for the weighted case is that `k` = `F_k * n` and
# `1/n` = `weight_k`. Using these two facts, we can express the formulas for
# `p_k` without using `n` or `k`, which don't really apply in the weighted case.
p_k = switch(type - 3,
# type 4
F_k,
# type 5
F_k - weights/2,
# type 6
F_k / (1 + weights),
# type 7
(F_k - weights) / (1 - weights),
# type 8
(F_k - weights/3) / (1 + weights/3),
# type 9
(F_k - weights*3/8) / (1 + weights/4)
)
approxfun(p_k, x, rule = 2, ties = "ordered")
}
}
52 changes: 46 additions & 6 deletions tests/testthat/test-rvar-dist.R
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ test_that("distributional functions work on an rvar array", {
q21 <- quantile(4:6, p)
q12 <- quantile(7:9, p)
q22 <- quantile(10:12, p)
x_quantiles <- array(c(q11, q21, q12, q22), dim = c(9, 2, 2), dimnames = list(NULL))
x_quantiles <- array(c(q11, q21, q12, q22), dim = c(9, 2, 2))
expect_equal(quantile(x, p), x_quantiles)
})

Expand All @@ -43,14 +43,14 @@ test_that("distributional functions work on an rvar_factor", {
x <- rvar_factor(x_letters, levels = letters[1:5])
x2 <- c(rvar_factor(letters), rvar_factor(letters))

expect_equal(density(x, letters[1:6]), c(0, .3, .2, .4, .1, NA))
expect_equal(density(x, letters[1:6]), c(0, .3, .2, .4, .1, NA_real_))
expect_equal(density(x2, letters[1:3]), array(rep(1/26, 6), dim = c(3,2)))

expect_equal(cdf(x, letters[1:5]), c(NA, NA, NA, NA, NA))
expect_equal(cdf(x2, letters[1:3]), array(rep(NA, 6), dim = c(3,2)))
expect_equal(cdf(x, letters[1:5]), c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_))
expect_equal(cdf(x2, letters[1:3]), array(rep(NA_real_, 6), dim = c(3,2)))

expect_equal(quantile(x, 1:4/4), c(NA, NA, NA, NA))
expect_equal(quantile(x2, 1:3/3), array(rep(NA, 6), dim = c(3,2)))
expect_equal(quantile(x, 1:4/4), c(NA_real_, NA_real_, NA_real_, NA_real_))
expect_equal(quantile(x2, 1:3/3), array(rep(NA_real_, 6), dim = c(3,2)))
})

test_that("distributional functions work on an rvar_ordered", {
Expand All @@ -64,3 +64,43 @@ test_that("distributional functions work on an rvar_ordered", {

expect_equal(quantile(x, c(.3, .5, .9, 1)), letters[2:5])
})

# weighted rvar -----------------------------------------------------------

test_that("weighted rvar works", {
x1_draws = qnorm(ppoints(10))
x2_draws = qnorm(ppoints(10), 5)
w1 = rep(1, 10)
w2 = rep(2, 10)
w3 = rep(0, 10)
x = rvar(c(x1_draws, x2_draws, rep(10, 10)), log_weights = log(c(w1, w2, w3)))

expect_equal(
density(x, 0:9, bw = 2.25),
density(draws_of(x), weights = weights(x), bw = 2.25, from = 0, to = 9, n = 10)$y,
tolerance = 1e-4
)
expect_equal(cdf(x, 0:9), ecdf(x1_draws)(0:9)/3 + ecdf(x2_draws)(0:9)*2/3)
expect_equal(quantile(x, cdf(x, c(x1_draws, x2_draws)), type = 1), c(x1_draws, x2_draws))
expect_equal(quantile(x, cdf(x, c(x1_draws, x2_draws)), type = 4), c(x1_draws, x2_draws))
})

test_that("weighted rvar_factor works", {
x = rvar_factor(c("b", "g", "f", "g"), levels = letters, log_weights = log(c(1/2, 1/6, 1/6, 1/6)))

expect_equal(density(x, letters), c(0, 1/2, 0, 0, 0, 1/6, 1/3, rep(0, 19)))
expect_equal(cdf(x, letters), rep(NA_real_, 26))
expect_equal(quantile(x, c(0.2, 0.8)), rep(NA_real_, 2))
})

test_that("weighted rvar_ordered works", {
x = rvar_ordered(c("b", "g", "f", "g"), levels = letters, log_weights = log(c(1/2, 1/6, 1/6, 1/6)))

expect_equal(density(x, letters), c(0, 1/2, 0, 0, 0, 1/6, 1/3, rep(0, 19)))
expect_equal(cdf(x, letters), cumsum(c(0, 1/2, 0, 0, 0, 1/6, 1/3, rep(0, 19))))
expect_equal(quantile(x, c(0.2, 0.6, 0.8)), c("b", "f", "g"))

xl = weight_draws(rvar_ordered(letters), 1:26)
expect_equal(quantile(xl, cdf(xl, letters) - .Machine$double.eps), letters)
})

0 comments on commit b0a778a

Please sign in to comment.