diff --git a/DESCRIPTION b/DESCRIPTION
index 81f4c84fa..aee54a845 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -45,6 +45,7 @@ Imports:
     mlr3misc (>= 0.1.4),
     paradox,
     R6,
+    R.cache,
     withr
 Suggests:
     ggplot2,
diff --git a/R/Graph.R b/R/Graph.R
index 72fdbb0d1..48861c556 100644
--- a/R/Graph.R
+++ b/R/Graph.R
@@ -58,6 +58,15 @@
 #' (and therefore their `$param_set$values`) and a hash of `$edges`.
 #' * `keep_results` :: `logical(1)` \cr
 #' Whether to store intermediate results in the [`PipeOp`]'s `$.result` slot, mostly for debugging purposes. Default `FALSE`.
+#' * `cache` :: `logical(1)` \cr
+#' Whether to cache individual [`PipeOp`]s during "train" and "predict". Default `FALSE`.
+#' Caching is performed using the \pkg{R.cache} package.
+#' Caching can be disabled/enabled globally using `getOption("R.cache.enabled", TRUE)`.
+#' By default, files are cached in `R.cache::getCacheRootPath()`.
+#' For more information on how to set the cache path or retrieve cached items, please consult
+#' the \pkg{R.cache} documentation.
+#' Caching can be controlled for each [`PipeOp`] individually by adjusting its
+#' `cache`, `cache_state` and `stochastic` fields.
 #'
 #' @section Methods:
 #' * `ids(sorted = FALSE)` \cr
@@ -407,6 +416,13 @@ Graph = R6Class("Graph",
       } else {
         map(self$pipeops, "state")
       }
+    },
+    cache = function(val) {
+      if (!missing(val)) {
+        private$.cache = assert_flag(val)
+      } else {
+        private$.cache
+      }
     }
   ),
 
@@ -419,7 +435,8 @@ Graph = R6Class("Graph",
         value
       )
     },
-    .param_set = NULL
+    .param_set = NULL,
+    .cache = FALSE
   )
 )
@@ -539,7 +556,7 @@ graph_reduce = function(self, input, fun, single_input) {
 
     input = input_tbl$payload
     names(input) = input_tbl$name
-    output = op[[fun]](input)
+    output = cached_pipeop_eval(self, op, fun, input)
     if (self$keep_results) {
       op$.result = output
     }
@@ -609,3 +626,70 @@ predict.Graph = function(object, newdata, ...) {
   }
   result
 }
+
+# Cached train/predict of a PipeOp.
+# 1) Caching of a PipeOp is only performed if the Graph and the PipeOp have `cache = TRUE`,
+#    i.e. both the Graph AND the PipeOp want to be cached.
+# 2) Additionally, caching is only performed if 'train' or 'predict' is not stochastic
+#    for a given PipeOp. This can be obtained from `.$stochastic` and can be set
+#    for each PipeOp.
+# 3) During training we have two options.
+#    Each PipeOp stores whether it wants to do I. or II. in `.$cache_state`.
+#    I.  Cache only the state:
+#        This is possible if the train transform is the same as the predict transform
+#        and predict is comparatively cheap (e.g. filters).
+#    II. Cache state and output
+#        (all other cases).
+
+cached_pipeop_eval = function(self, op, fun, input) {
+
+  if (self$cache && op$cache) {
+    require_namespaces("R.cache")
+    cache_key = list(map_chr(input, get_hash), op$hash)
+    if (fun == "train") {
+      if (fun %nin% op$stochastic) {
+        # Two options:
+        # I.  cache only the state (we can then predict on the train set using that state)
+        # II. cache state and output (if I. is not possible)
+        if (op$cache_state) {
+          # only cache the state (I.)
+          R.cache::evalWithMemoization({
+            op[[fun]](input)
+            state = op$state
+          }, key = cache_key)
+          # Set state if PipeOp was cached (and "train" was therefore not called)
+          if (is.null(op$state) && fun == "train") op$state = state
+          # We call "predict" on the train inputs; this avoids storing the outputs
+          # computed during training on disk.
+          # This is only done for pipeops where 'cache_state' is TRUE.
+          return(cached_pipeop_eval(self, op, "predict", input))
+        } else {
+          # Otherwise we cache state and output (II.)
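+          # On a cache hit, `op[[fun]]` is not evaluated at all, so both the
+          # output and the state must be restored from the memoized `result` below.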
+          R.cache::evalWithMemoization({
+            result = list(output = op[[fun]](input), state = op$state)
+          }, key = cache_key)
+          # Set state if PipeOp was cached before (and thus no state was set)
+          if (is.null(op$state) && fun == "train") op$state = result$state
+          return(result$output)
+        }
+      }
+    } else if (fun == "predict" && !op$cache_state) {
+      # During predict, only cache if cache_state is FALSE and the op is not stochastic.
+      if (fun %nin% op$stochastic) {
+        R.cache::evalWithMemoization(
+          {output = op[[fun]](input)},
+          key = cache_key)
+        return(output)
+      }
+    }
+  }
+  # No-caching fallback for everything that does not match the conditions above.
+  return(op[[fun]](input))
+}
+
+get_hash = function(x) {
+  hash = try(x$hash, silent = TRUE)
+  if (inherits(hash, "try-error") || is.null(hash))
+    hash = digest(x, algo = "xxhash64")
+  hash
+}
diff --git a/R/PipeOp.R b/R/PipeOp.R
index aee3d5958..c7500978e 100644
--- a/R/PipeOp.R
+++ b/R/PipeOp.R
@@ -115,6 +115,21 @@
 #' If the [`Graph`]'s `$keep_results` flag is set to `TRUE`, then the intermediate Results of `$train()` and `$predict()`
 #' are saved to this slot, exactly as they are returned by these functions. This is mainly for debugging purposes
 #' and done, if requested, by the [`Graph`] backend itself; it should *not* be done explicitly by `private$.train()` or `private$.predict()`.
+#' * `cache` :: `logical(1)` \cr
+#' Whether to cache the [`PipeOp`]'s state and/or output during "train" and "predict". Defaults to `TRUE`.
+#' See the `cache` field in [`Graph`] for more detailed information on caching, as well as `cache_state` and
+#' `stochastic` below.
+#' * `cache_state` :: `logical(1)` \cr
+#' Whether the [`PipeOp`]'s behaviour during training is identical to its behaviour during prediction
+#' (apart from setting a state). In this case, only the [`PipeOp`]'s state is cached.
+#' This avoids caching possibly large intermediate results.
+#' Defaults to `TRUE`.
+#' * `stochastic` :: `character` \cr
+#' The phases during which a [`PipeOp`] behaves stochastically: `"train"`, `"predict"`, or both.
+#' Defaults to `character(0)` (deterministic). Stochastic [`PipeOp`]s are not cached during the
+#' respective phase, i.e. a [`PipeOp`] is only cached during phases in which it is deterministic.
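+#' For example, [`PipeOpSmote`] sets `stochastic` to `"train"` because it samples synthetic
+#' instances during training, so its `$train()` call is never memoized.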
+#'
 #'
 #' @section Methods:
 #' * `train(input)`\cr
@@ -254,7 +269,6 @@ PipeOp = R6Class("PipeOp",
       if (is_noop(self$state)) {
         stopf("Pipeop %s got NO_OP during train but no NO_OP during predict.", self$id)
       }
-
       input = check_types(self, input, "input", "predict")
       output = private$.predict(input)
       output = check_types(self, output, "output", "predict")
@@ -296,6 +310,26 @@ PipeOp = R6Class("PipeOp",
     hash = function() {
       digest(list(class(self), self$id, self$param_set$values), algo = "xxhash64")
+    },
+    cache = function(val) {
+      if (!missing(val)) {
+        private$.cache = assert_flag(val)
+      } else {
+        private$.cache
+      }
+    },
+    cache_state = function(val) {
+      if (!missing(val)) {
+        stop("cache_state is read-only!")
+      }
+      private$.cache_state
+    },
+    stochastic = function(val) {
+      if (!missing(val)) {
+        private$.stochastic = assert_subset(val, c("train", "predict"))
+      } else {
+        private$.stochastic
+      }
     }
   ),
 
@@ -318,7 +352,10 @@ PipeOp = R6Class("PipeOp",
     .predict = function(input) stop("abstract"),
     .param_set = NULL,
     .param_set_source = NULL,
-    .id = NULL
+    .id = NULL,
+    .cache = TRUE,
+    .cache_state = TRUE,
+    .stochastic = character(0)
   )
 )
diff --git a/R/PipeOpBranch.R b/R/PipeOpBranch.R
index f02cae9f7..8986c4459 100644
--- a/R/PipeOpBranch.R
+++ b/R/PipeOpBranch.R
@@ -117,7 +117,8 @@ PipeOpBranch = R6Class("PipeOpBranch",
       ret = named_list(self$output$name, NO_OP)
       ret[[self$param_set$values$selection]] = inputs[[1]]
       ret
-    }
+    },
+    .cache = FALSE
   )
 )
diff --git a/R/PipeOpChunk.R b/R/PipeOpChunk.R
index 2f784ac5d..b2b7470cf 100644
--- a/R/PipeOpChunk.R
+++ b/R/PipeOpChunk.R
@@ -75,6 +75,17 @@ PipeOpChunk = R6Class("PipeOpChunk",
       )
     }
   ),
+  active = list(
+    stochastic = function(val) {
+      if (!missing(val)) {
+        private$.stochastic = assert_subset(val, c("train", "predict"))
+      } else {
+        # stochastic during "train" whenever shuffling is enabled
+        if (self$param_set$values$shuffle) return(union(private$.stochastic, "train"))
+        private$.stochastic
+      }
+    }
+  ),
   private = list(
     .train = function(inputs) {
       self$state = list()
@@ -88,7 +99,8 @@ PipeOpChunk = R6Class("PipeOpChunk",
     },
     .predict = function(inputs) {
       rep(inputs, self$outnum)
-    }
+    },
+    .cache = FALSE
   )
 )
diff --git a/R/PipeOpClassBalancing.R b/R/PipeOpClassBalancing.R
index 2790a2f9f..60094881f 100644
--- a/R/PipeOpClassBalancing.R
+++ b/R/PipeOpClassBalancing.R
@@ -160,7 +160,10 @@ PipeOpClassBalancing = R6Class("PipeOpClassBalancing",
       task_filter_ex(task, new_ids)
     },
-    .predict_task = identity
+    .predict_task = identity,
+    .cache = FALSE,
+    .stochastic = "train",
+    .cache_state = FALSE
   )
 )
diff --git a/R/PipeOpCopy.R b/R/PipeOpCopy.R
index 2dc9a7b11..fe3ba709b 100644
--- a/R/PipeOpCopy.R
+++ b/R/PipeOpCopy.R
@@ -99,7 +99,8 @@ PipeOpCopy = R6Class("PipeOpCopy",
     },
     .predict = function(inputs) {
       rep_len(inputs, self$outnum)
-    }
+    },
+    .cache = FALSE
   )
 )
diff --git a/R/PipeOpImputeHist.R b/R/PipeOpImputeHist.R
index e44b85f71..a5bd0fb0e 100644
--- a/R/PipeOpImputeHist.R
+++ b/R/PipeOpImputeHist.R
@@ -74,7 +74,9 @@ PipeOpImputeHist = R6Class("PipeOpImputeHist",
       }
       feature[is.na(feature)] = sampled
       feature
-    }
+    },
+    .cache = FALSE,
+    .stochastic = c("train", "predict")
   )
 )
diff --git a/R/PipeOpImputeSample.R b/R/PipeOpImputeSample.R
index f19171950..177bdb50b 100644
--- a/R/PipeOpImputeSample.R
+++ b/R/PipeOpImputeSample.R
@@ -85,7 +85,9 @@ PipeOpImputeSample = R6Class("PipeOpImputeSample",
         feature[is.na(feature)] = sample(model, outlen, replace = TRUE)
       }
       feature
-    }
+    },
+    .cache = FALSE,
+    .stochastic = c("train", "predict")
   )
 )
diff --git a/R/PipeOpNOP.R b/R/PipeOpNOP.R
index c2d70c5c8..87235cd72 100644
--- a/R/PipeOpNOP.R
+++ b/R/PipeOpNOP.R
@@ -75,7 +75,8 @@ PipeOpNOP = R6Class("PipeOpNOP",
     .predict = function(inputs) {
       inputs
-    }
+    },
+    .cache = FALSE
   )
 )
diff --git a/R/PipeOpProxy.R b/R/PipeOpProxy.R
index 56600f825..89612c33c 100644
--- a/R/PipeOpProxy.R
+++ b/R/PipeOpProxy.R
@@ -105,6 +105,35 @@ PipeOpProxy = R6Class("PipeOpProxy",
       )
     }
   ),
+  active = list(
+    cache = function(val) {
+      if (!missing(val)) {
+        self$param_set$values$content$cache = assert_flag(val)
+      } else {
+        self$param_set$values$content$cache
+      }
+    },
+    stochastic = function(val) {
+      if (!missing(val)) {
+        assert_subset(val, c("train", "predict"))
+        if (inherits(self$param_set$values$content, "Graph")) {
+          stop("'stochastic' cannot be set when content is a Graph!")
+        } else {
+          self$param_set$values$content$stochastic = val
+        }
+      } else {
+        if (inherits(self$param_set$values$content, "Graph")) return(character(0))
+        self$param_set$values$content$stochastic
+      }
+    },
+    cache_state = function(val) {
+      if (!missing(val)) {
+        stop("cache_state is read-only!")
+      } else {
+        if (inherits(self$param_set$values$content, "Graph")) return(TRUE)
+        self$param_set$values$content$cache_state
+      }
+    }
+  ),
   private = list(
     .param_set = NULL,
     .param_set_source = NULL,
diff --git a/R/PipeOpSmote.R b/R/PipeOpSmote.R
index 3f56ebef7..4c8a042ea 100644
--- a/R/PipeOpSmote.R
+++ b/R/PipeOpSmote.R
@@ -106,7 +106,9 @@ PipeOpSmote = R6Class("PipeOpSmote",
       }
       setnames(st, "class", task$target_names)
       task$rbind(st)
-    }
+    },
+    .cache = FALSE,
+    .stochastic = "train"
   )
 )
diff --git a/R/PipeOpSubsample.R b/R/PipeOpSubsample.R
index f746a3b2f..cea2606e0 100644
--- a/R/PipeOpSubsample.R
+++ b/R/PipeOpSubsample.R
@@ -93,8 +93,8 @@ PipeOpSubsample = R6Class("PipeOpSubsample",
       self$state = list()
       task_filter_ex(task, keep)
     },
-
-    .predict_task = identity
+    .predict_task = identity,
+    .cache_state = FALSE
   )
 )
diff --git a/R/PipeOpThreshold.R b/R/PipeOpThreshold.R
index 6ba93507d..a90f5c1b0 100644
--- a/R/PipeOpThreshold.R
+++ b/R/PipeOpThreshold.R
@@ -82,7 +82,9 @@ PipeOpThreshold = R6Class("PipeOpThreshold",
       }
 
       list(prd$set_threshold(thr))
-    }
+    },
+    .cache = FALSE,
+    .cache_state = FALSE
   )
 )
diff --git a/R/PipeOpUnbranch.R b/R/PipeOpUnbranch.R
index bcb4e6753..6255bfd7f 100644
--- a/R/PipeOpUnbranch.R
+++ b/R/PipeOpUnbranch.R
@@ -88,7 +88,8 @@ PipeOpUnbranch = R6Class("PipeOpUnbranch",
     },
     .predict = function(inputs) {
       filter_noop(inputs)
-    }
+    },
+    .cache = FALSE
   )
 )
diff --git a/attic/caching.md b/attic/caching.md
new file mode 100644
index 000000000..1aa076c10
--- /dev/null
+++ b/attic/caching.md
@@ -0,0 +1,216 @@
+# Caching
+
+These docs describe *opportunistic caching*, i.e. results are cached after the first function call.
+If the same function is executed twice in parallel, this does not save any time/cores.
+The example currently uses the `R.cache` package by Henrik Bengtsson for caching.
+`R.cache` is a very simple caching package that provides a clean API; it could
+theoretically be replaced by other packages.
+
+
+## Preliminaries
+
+- Pipelines should be cached at the PipeOp level, as there are rarely situations where
+  caching a full Graph would be required (e.g. tuning a graph requires caching of individual steps).
+
+- PipeOps could either cache `state` and `result` during training, or alternatively only `state`
+  when `predict` is comparatively cheap and the same transform steps can be done during `train`
+  and `predict`. For now we will call the latter pipeops `"predict_like_train"`.
+  This should be annotated in each `PipeOp` and can default to `"predict_like_train"`.
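+
+  A minimal sketch of the two modes (pseudo-code; `memoize()` is a placeholder helper,
+  not an actual API):
+  ```
+  # "predict_like_train": memoize only the state; the train output is then
+  # recomputed via the (comparatively cheap) predict transform.
+  memoize(key, { op$train(input); op$state })
+  output = op$predict(input)
+
+  # all other cases: memoize state and output together.
+  result = memoize(key, list(output = op$train(input), state = op$state))
+  ```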
+
+- PipeOps can be **stochastic**, either during `"train"`, `"predict"`, both, or not at all (deterministic).
+  Implementation suggestion:
+  ```
+  stochastic = c("train", "predict")  # character(0) means not stochastic.
+  ```
+  This needs to be annotated in each `PipeOp`. Could default to deterministic.
+
+- Caching can be turned on/off for a full `Graph` or for individual `PipeOp`s.
+  The API for this could e.g. be:
+  - `Graph` has a `cache` slot, can be set to `TRUE` or `FALSE`. Default `FALSE`?
+  - `PipeOp` has a `cache` slot, can be set to `TRUE` or `FALSE`. Default `TRUE`?
+    `PipeOp`s that should never be cached (stochastic, meta, ...) are set to `FALSE`.
+  - If `Graph$cache && PipeOp$cache`, caching is active.
+
+
+**Current implementation:**
+
+- `PipeOp` gets the following new slots:
+  - `stochastic`: a subset of `c("train", "predict")`; `character(0)` means deterministic. Default `character(0)`, set for some pos.
+  - `cache`: whether the `PipeOp` should be cached. Default `TRUE`, set to `FALSE` for some pos.
+  - `cache_state`: whether it is sufficient to cache the `$state`.
+
+  These slots are active bindings `xxx` backed by `private$.xxx`.
+
+- `Graph` gets the following new slot:
+  - `cache`: whether the `Graph` should be cached. Default `FALSE`.
+
+- New function called within `graph_reduce`: `cached_pipeop_eval`. See **Implementation Details** below.
+
+
+
+## Implementation Details
+
+Ideally we would like to do caching on an abstract level, instead of writing a caching mechanism
+for each `PipeOp`.
+
+`R.cache::evalWithMemoization` memoizes the provided expression.
+The `hash` is computed from its `key` argument.
+
+Possible solution: apply caching during `graph_reduce` (`Graph.R`):
+
+The call to `op[[fun]](input)` calls each `PipeOp`'s "train" and "predict" fun.
+Note: this is a simplified version; see the actual implementation `cached_pipeop_eval` in `Graph.R`.
+
+```
+cached_pipeop_eval = function(self, op, fun, input) {
+
+  if (self$cache && op$cache) {
+    cache_key = list(map_chr(input, get_hash), op$hash)
+    if (fun == "train") {
+      if (fun %nin% op$stochastic) {
+        # Two options: cache only the state (we can then predict on the train set using that state)
+        # or: cache state and output (if the former is not possible)
+        if (op$cache_state) {
+          R.cache::evalWithMemoization({
+            op[[fun]](input)
+            state = op$state
+          }, key = cache_key)
+          # Set state if PipeOp was cached
+          if (is.null(op$state) && fun == "train") op$state = state
+          # We call "predict" on the train inputs; this avoids storing the outputs
+          # computed during training on disk. This is only possible for some pipeops.
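+          # Recursing into "predict" below means the train output itself is never
+          # written to the cache; only the memoized state above is stored.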
+          cached_pipeop_eval(self, op, "predict", input)
+        } else {
+          R.cache::evalWithMemoization({
+            result = list(output = op[[fun]](input), state = op$state)
+          }, key = cache_key)
+          # Set state if PipeOp was cached
+          if (is.null(op$state) && fun == "train") op$state = result$state
+          return(result$output)
+        }
+      }
+    } else if (fun == "predict" && !op$cache_state) {
+      if (fun %nin% op$stochastic) {
+        R.cache::evalWithMemoization(
+          {output = op[[fun]](input)},
+          key = cache_key)
+        return(output)
+      }
+    }
+  }
+  # No-caching fallback for everything that does not match the conditions above.
+  return(op[[fun]](input))
+}
+```
+
+where `get_hash` is:
+```
+get_hash = function(x) {
+  hash = try(x$hash, silent = TRUE)
+  if (inherits(hash, "try-error") || is.null(hash))
+    hash = digest(x, algo = "xxhash64")
+  hash
+}
+```
+
+
+## Possible problems:
+
+A) Unfortunately `private$.train()` is not a pure function, but
+   instead has side-effects:
+   - it sets a `$state`
+
+If we can ensure that the only side-effect of `$.train` is a modified state,
+we could also memoize the state during `$train` (see above).
+If other fields are updated, we need to have a list of fields that are updated, or go a different route.
+
+## Further Issues:
+
+F) Should caching be optional?
+   Probably yes!
+
+G) How do we globally enable/disable caching?
+   1. A global option
+      (ugly; might not work with parallelization)
+
+   2. Caching can be turned on in `Graph` | `GraphLearner`:
+      ```
+      Graph = R6Class(
+        ...
+        caching = TRUE,
+        ...
+      )
+      ```
+      `GraphLearner` gets an active binding to turn caching of its graph on/off.
+      This could also be added as an argument to the `GraphLearner`'s constructor.
+
+      The caching of individual steps is then done by adjusting calls to `graph_reduce`:
+      `graph_reduce(..., caching = self$caching)`
+
+H) Caching can be manually disabled for any individual `PipeOp` via its `cache` field.
+
+Open Questions:
+  - How do `$train` and `$predict` know whether to do caching or not?
+    Add a second argument `caching`?
+  - How do caching and parallelization interact?
+  - Does `R.cache::evalWithMemoization`'s `key` arg need anything else?
+  - If `state` is obtained from a stochastic function, how do we want this to behave?
+
+From @mb706:
+
+- PipeOps should contain metadata about whether they are deterministic or not, and whether
+  their `.train()` and `.predict()` results are the same whenever the input to both is the same
+  (use a common vs. separate cache).
+
+  **Possible solution**
+
+  1. Add a new field:
+     ```
+     cacheable = TRUE  # or deterministic = TRUE
+     ```
+     Only `PipeOp`s where this holds are cached.
+
+  2. For `cacheable = FALSE`, the `.Random.seed` is added to the caching `key`.
+     This would allow caching of reproducible workflows.
+
+- With some operations it may make more sense to save just the `$state` and not the result.
+  Then during `$train()` the caching mechanism can set the state from cache and call `$.predict()`.
+
+  Question: How do we decide this? We should maybe think about an **API** for this.
+
+### Caching a full graph
+
+- Caching in mlrCPO was done via a wrapper-PipeOp; we could also have that here.
+  Pro: For multiple operations only the last output needs to be saved; it makes the configuration of different caching mechanisms easier.
+  Cons: We get the drawbacks of wrapping: the graph structure gets obscured. Also, when wrapping multiple operations and just one of them is nondeterministic, everything falls apart.
+  We may want a `ppl()` function that wraps a graph optimally, so that linear deterministic segments
+  are cached together and only the output of the last PipeOp is kept. (This also works for arbitrary Graphs.)
+
+  Comments:
+  - Caching the graph: Yes!
+    Caching segments of the graph?
+    This makes things unnecessarily complicated. We could instead either cache the whole graph **or**,
+    if any po is nondeterministic, cache only the deterministic pipeops.
+
+  - **Possible solution**
+    1. Wrap the graph as described above, with the pros and cons listed.
+
+    2. Cache the graph's `graph_reduce()` call in `$train`/`$predict` (in `Graph.R`),
+       similarly to how `PipeOp`s are cached above.
+       This is only possible if all pos in a graph are deterministic.
+
+
+### Caching non-deterministic `PipeOp`s
+
+This could be done if we add the `.Random.seed` to the `key`.
+Additionally, we would have to advance the `.Random.seed` properly.
+This could be added in future work, but might not be relevant now.
+
+It should be possible to enforce caching for stochastic `PipeOp`s.
+Example: I want to evaluate choices (branches) made after or before a stochastic pipeop.
+This would allow me to circumvent stochasticity.
+
+
+### User Control for caching
+
+This could basically be handled via `R.cache`'s functionality, but should somehow be documented.
+
+### Testthat
+
+How do we disable caching during unit tests?
\ No newline at end of file
diff --git a/man/Graph.Rd b/man/Graph.Rd
index 36e85150a..d8b571a7b 100644
--- a/man/Graph.Rd
+++ b/man/Graph.Rd
@@ -66,6 +66,15 @@ Stores a checksum calculated on the \code{\link{Graph}} configuration, which inc
 (and therefore their \verb{$param_set$values}) and a hash of \verb{$edges}.
 \item \code{keep_results} :: \code{logical(1)} \cr
 Whether to store intermediate results in the \code{\link{PipeOp}}'s \verb{$.result} slot, mostly for debugging purposes. Default \code{FALSE}.
+\item \code{cache} :: \code{logical(1)} \cr
+Whether to cache individual \code{\link{PipeOp}}s during "train" and "predict". Default \code{FALSE}.
+Caching is performed using the \pkg{R.cache} package.
+Caching can be disabled/enabled globally using \code{getOption("R.cache.enabled", TRUE)}.
+By default, files are cached in \code{R.cache::getCacheRootPath()}.
+For more information on how to set the cache path or retrieve cached items, please consult
+the \pkg{R.cache} documentation.
+Caching can be controlled for each \code{\link{PipeOp}} individually by adjusting its
+\code{cache}, \code{cache_state} and \code{stochastic} fields.
 }
 }
 
diff --git a/man/PipeOp.Rd b/man/PipeOp.Rd
index 21479adfc..d3ef834b2 100644
--- a/man/PipeOp.Rd
+++ b/man/PipeOp.Rd
@@ -120,6 +120,20 @@ binding and calculate the hash as \verb{digest(list(super$hash, ),
 If the \code{\link{Graph}}'s \verb{$keep_results} flag is set to \code{TRUE}, then the intermediate Results of \verb{$train()} and \verb{$predict()}
 are saved to this slot, exactly as they are returned by these functions. This is mainly for debugging purposes
 and done, if requested, by the \code{\link{Graph}} backend itself; it should \emph{not} be done explicitly by \code{private$.train()} or \code{private$.predict()}.
+\item \code{cache} :: \code{logical(1)} \cr
+Whether to cache the \code{\link{PipeOp}}'s state and/or output during "train" and "predict". Defaults to \code{TRUE}.
+See the \code{cache} field in \code{\link{Graph}} for more detailed information on caching, as well as \code{cache_state} and
+\code{stochastic} below.
+\item \code{cache_state} :: \code{logical(1)} \cr
+Whether the \code{\link{PipeOp}}'s behaviour during training is identical to its behaviour during prediction
+(apart from setting a state). In this case, only the \code{\link{PipeOp}}'s state is cached.
+This avoids caching possibly large intermediate results.
+Defaults to \code{TRUE}.
+\item \code{stochastic} :: \code{character} \cr
+The phases during which a \code{\link{PipeOp}} behaves stochastically: \code{"train"}, \code{"predict"}, or both.
+Defaults to \code{character(0)} (deterministic). Stochastic \code{\link{PipeOp}}s are not cached during the
+respective phase, i.e. a \code{\link{PipeOp}} is only cached during phases in which it is deterministic.
 }
 }
 
diff --git a/tests/testthat/helper_functions.R b/tests/testthat/helper_functions.R
index 8113af53e..28f7b2e80 100644
--- a/tests/testthat/helper_functions.R
+++ b/tests/testthat/helper_functions.R
@@ -105,6 +105,9 @@ expect_pipeop = function(po) {
   expect_names(names(po$output), permutation.of = c("name", "train", "predict"))
   expect_int(po$innum, lower = 1)
   expect_int(po$outnum, lower = 1)
+  expect_flag(po$cache)
+  expect_flag(po$cache_state)
+  expect_character(po$stochastic)
 
   # at least one of "train" or "predict" must be in every parameter's tag
   testthat::expect_true(every(po$param_set$tags, function(x) length(intersect(c("train", "predict"), x)) > 0))
diff --git a/tests/testthat/test_Graph.R b/tests/testthat/test_Graph.R
index 64c5fd2fa..494836595 100644
--- a/tests/testthat/test_Graph.R
+++ b/tests/testthat/test_Graph.R
@@ -365,3 +365,10 @@ test_that("Graph with vararg input", {
   expect_equal(list(nop.output = 1, featureunion.output = tcombined, nop2.output = 2, nop3.output = 3),
     gr$train(list(1, t1, t2, 2, 3), single_input = FALSE))
 })
+
+test_that("Caching active bindings", {
+  gr = as_graph(po("scale"))
+  expect_true(!gr$cache)
+  gr$cache = TRUE
+  expect_true(gr$cache)
+})
diff --git a/tests/testthat/test_PipeOp.R b/tests/testthat/test_PipeOp.R
index 0342f0525..c870e18ee 100644
--- a/tests/testthat/test_PipeOp.R
+++ b/tests/testthat/test_PipeOp.R
@@ -67,3 +67,15 @@ test_that("Errors occur for inputs", {
     po$param_set = ParamSet$new()
   }, "read-only")
 })
+
+test_that("Caching active bindings", {
+  po = po("scale")
+  expect_true(po$cache)
+  expect_true(po$cache_state)
+  expect_true(length(po$stochastic) == 0L)
+  po$cache = FALSE
+  expect_error({po$cache_state = TRUE})
+  po$stochastic = "train"
+  expect_true(!po$cache)
+  expect_true(po$stochastic == "train")
+})
diff --git a/tests/testthat/test_caching.R b/tests/testthat/test_caching.R
new file mode 100644
index 000000000..d5e7b266a
--- /dev/null
+++ b/tests/testthat/test_caching.R
@@ -0,0 +1,151 @@
+context("Caching")
+
+test_that("Caching works for test hash pipeop", {
+  skip_on_cran()
+  require("R.cache")
+
+  # cache to a fresh directory below tempdir
+  old_tmpdir = R.cache::getCacheRootPath()
+  test_tmpdir = file.path(tempdir(), "R.cache")
+  dir.create(test_tmpdir)
+  R.cache::setCacheRootPath(test_tmpdir)
+
+  PipeOpTestHash = R6Class("PipeOpTestHash",
+    inherit = PipeOp,
+    public = list(
+      initialize = function(id = "test.hash", param_set = ParamSet$new()) {
+        super$initialize(id = id, param_set = param_set,
+          input = data.table(name = "input", train = "*", predict = "*"),
+          output = data.table(name = "output", train = "*", predict = "*")
+        )
+      }),
+    active = list(
+      cache_state = function(val) {
+        if (missing(val))
+          return(private$.cache_state)
+        private$.cache_state = val
+      }
+    ),
+    private = list(
+      .train = function(inputs) {
+        Sys.sleep(1)
+        message("sleeping train")
+        self$state = list("train")
+        inputs
+      },
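+      # .predict only sleeps when it receives the "predict" token, so the
+      # cached-state train path (which calls predict on the train input) stays fast.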
+      .predict = function(inputs) {
+        if (inputs[[1]] == "predict") {
+          Sys.sleep(1)
+          message("sleeping predict")
+        }
+        inputs
+      },
+      .cache = TRUE,
+      .cache_state = TRUE
+    )
+  )
+
+  # caching is only performed when going through a Graph
+  gr = as_graph(PipeOpTestHash$new())
+  gr$cache = TRUE
+
+  # takes > 1 second
+  R.cache::clearCache(prompt = FALSE)
+  st = Sys.time()
+  expect_message(gr$train("train"), "sleeping train")
+  expect_true(st < Sys.time() - 1)
+
+  # takes > 1 second
+  st = Sys.time()
+  expect_message(gr$predict("predict"), "sleeping predict")
+  expect_true(st < Sys.time() - 1)
+
+  # cached train takes < 1 second
+  st = Sys.time()
+  expect_silent(gr$train("train"))
+  expect_true(gr$train("train") == "train")
+  expect_true(st > (Sys.time() - 1))
+
+  # uncached (cache_state) predict takes > 1 second
+  st = Sys.time()
+  expect_message(gr$predict("predict"), "sleeping predict")
+  expect_true(st < Sys.time() - 1)
+
+  # Obtain result from cache:
+  key = list(map_chr(list(input = "train"), get_hash), PipeOpTestHash$new()$hash)
+  # R.cache appends the expression to the key before storing
+  expr = substitute({
+    op[[fun]](input)
+    state = op$state
+  })
+  key = c(list(expr = expr), key)
+  expect_equal(R.cache::loadCache(key)$results[[1]], "train")
+
+
+  # PO stochastic is respected -----------------------
+  po = PipeOpTestHash$new()
+  po$stochastic = c("train", "predict")
+  po$cache_state = FALSE
+  gr = as_graph(po)
+  gr$cache = TRUE
+
+  # takes > 1 second
+  R.cache::clearCache(prompt = FALSE)
+  st = Sys.time()
+  expect_message(gr$train("train"), "sleeping train")
+  expect_true(st < Sys.time() - 1)
+
+  # takes > 1 second
+  st = Sys.time()
+  expect_message(gr$predict("predict"), "sleeping predict")
+  expect_true(st < Sys.time() - 1)
+  # Nothing was cached:
+  expect_true(all(list.files(R.cache::getCacheRootPath()) == "README.txt"))
+
+
+  # PO cache = FALSE is respected -----------------------
+  po = PipeOpTestHash$new()
+  po$cache = FALSE
+  gr = as_graph(po)
+  gr$cache = TRUE
+
+  # takes > 1 second
+  R.cache::clearCache(prompt = FALSE)
+  st = Sys.time()
+  expect_message(gr$train("train"), "sleeping train")
+  expect_true(st < Sys.time() - 1)
+  expect_true(all(list.files(R.cache::getCacheRootPath()) == "README.txt"))
+
+
+  # PO cache_state = FALSE is respected -------------------
+  po = PipeOpTestHash$new()
+  po$cache_state = FALSE
+  gr = as_graph(po)
+  gr$cache = TRUE
+
+  # takes > 1 second
+  R.cache::clearCache(prompt = FALSE)
+  st = Sys.time()
+  expect_message(gr$train("train"), "sleeping train")
+  expect_true(st < Sys.time() - 1)
+
+  # takes > 1 second
+  st = Sys.time()
+  expect_message(gr$predict("predict"), "sleeping predict")
+  expect_true(st < Sys.time() - 1)
+
+  # cached train takes < 1 second
+  st = Sys.time()
+  expect_silent(gr$train("train"))
+  expect_true(gr$train("train") == "train")
+  expect_true(st > (Sys.time() - 1))
+
+  # cached predict takes < 1 second
+  st = Sys.time()
+  expect_silent(gr$predict("predict"))
+  expect_true(st > Sys.time() - 1)
+  expect_true(length(list.files(R.cache::getCacheRootPath())) == 3)
+
+  # Reset the old cache path
+  R.cache::setCacheRootPath(old_tmpdir)
+  unlink(test_tmpdir, recursive = TRUE)
+})
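+
+# A sketch of a possible additional test (not part of the original suite): it relies
+# on the `R.cache.enabled` option documented above; with the option set to FALSE,
+# training a cached Graph should still work, just without memoization. `withr` is
+# already in Imports.
+test_that("caching can be disabled globally", {
+  skip_on_cran()
+  gr = as_graph(po("scale"))
+  gr$cache = TRUE
+  withr::with_options(list("R.cache.enabled" = FALSE), {
+    train_out = gr$train(mlr_tasks$get("iris"))
+    expect_true(inherits(train_out[[1]], "Task"))
+  })
+})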