diff --git a/DESCRIPTION b/DESCRIPTION
index 81f4c84fa..aee54a845 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -45,6 +45,7 @@ Imports:
     mlr3misc (>= 0.1.4),
     paradox,
     R6,
+    R.cache,
     withr
 Suggests:
     ggplot2,
diff --git a/R/Graph.R b/R/Graph.R
index 72fdbb0d1..48861c556 100644
--- a/R/Graph.R
+++ b/R/Graph.R
@@ -58,6 +58,15 @@
 #' (and therefore their `$param_set$values`) and a hash of `$edges`.
 #' * `keep_results` :: `logical(1)` \cr
 #' Whether to store intermediate results in the [`PipeOp`]'s `$.result` slot, mostly for debugging purposes. Default `FALSE`.
+#' * `cache` :: `logical(1)` \cr
+#' Whether to cache individual [`PipeOp`]s during "train" and "predict". Default `FALSE`.
+#' Caching is performed using the \pkg{R.cache} package.
+#' Caching can be disabled/enabled globally using `getOption("R.cache.enabled", TRUE)`.
+#' By default, files are cached in `R.cache::getCacheRootPath()`.
+#' For more information on how to set the cache path or retrieve cached items, please consult
+#' the \pkg{R.cache} documentation.
+#' Caching can be controlled for each [`PipeOp`] individually by adjusting its
+#' `cache`, `cache_state` and `stochastic` fields.
 #'
 #' @section Methods:
 #' * `ids(sorted = FALSE)` \cr
@@ -407,6 +416,13 @@ Graph = R6Class("Graph",
       } else {
         map(self$pipeops, "state")
       }
+    },
+    cache = function(val) {
+      if (!missing(val)) {
+        private$.cache = assert_flag(val)
+      } else {
+        private$.cache
+      }
     }
   ),
 
@@ -419,7 +435,8 @@ Graph = R6Class("Graph",
         value
       )
     },
-    .param_set = NULL
+    .param_set = NULL,
+    .cache = FALSE
   )
 )
@@ -539,7 +556,7 @@ graph_reduce = function(self, input, fun, single_input) {
 
     input = input_tbl$payload
     names(input) = input_tbl$name
-    output = op[[fun]](input)
+    output = cached_pipeop_eval(self, op, fun, input)
     if (self$keep_results) {
       op$.result = output
     }
@@ -609,3 +626,70 @@ predict.Graph = function(object, newdata, ...) {
   }
   result
 }
+
+# Cached train/predict of a PipeOp.
+# 1) Caching of a PipeOp is only performed if the Graph and the PipeOp have `cache = TRUE`,
+#    i.e. both the Graph AND the PipeOp want to be cached.
+# 2) Additionally, caching is only performed if 'train' or 'predict' is not stochastic
+#    for a given PipeOp. This can be obtained from `.$stochastic` and can be set
+#    for each PipeOp.
+# 3) During training we have two options.
+#    Each PipeOp stores whether it wants to do I. or II. in `.$cache_state`.
+#    I.  Cache only the state:
+#        This is possible if the train transform is the same as the predict transform
+#        and predict is comparatively cheap (e.g. filters).
+#    II. Cache state and output
+#        (all other cases).
+
+cached_pipeop_eval = function(self, op, fun, input) {
+
+  if (self$cache && op$cache) {
+    require_namespaces("R.cache")
+    cache_key = list(map_chr(input, get_hash), op$hash)
+    if (fun == "train") {
+      if (fun %nin% op$stochastic) {
+        # Two options:
+        # I.  cache only the state (we can then predict on the train set using that state)
+        # II. cache state and output (if I. is not possible)
+        if (op$cache_state) {
+          # only cache the state (I.)
+          R.cache::evalWithMemoization({
+            op[[fun]](input)
+            state = op$state
+          }, key = cache_key)
+          # Set state if PipeOp was cached (and "train" was therefore not called)
+          if (is.null(op$state) && fun == "train") op$state = state
+          # We call "predict" on the train inputs; this avoids storing the outputs
+          # computed during training on disk.
+          # This is only done for pipeops where 'cache_state' is TRUE.
+          return(cached_pipeop_eval(self, op, "predict", input))
+        } else {
+          # Otherwise we cache state and output (II.)
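+          # On a cache hit, `op[[fun]]` is not evaluated at all, so both the
+          # output and the state must be restored from the memoized `result` below.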
+          R.cache::evalWithMemoization({
+            result = list(output = op[[fun]](input), state = op$state)
+          }, key = cache_key)
+          # Set state if PipeOp was cached before (and thus no state was set)
+          if (is.null(op$state) && fun == "train") op$state = result$state
+          return(result$output)
+        }
+      }
+    } else if (fun == "predict" && !op$cache_state) {
+      # During predict, only cache if cache_state is FALSE and the op is not stochastic.
+      if (fun %nin% op$stochastic) {
+        R.cache::evalWithMemoization(
+          {output = op[[fun]](input)},
+          key = cache_key)
+        return(output)
+      }
+    }
+  }
+  # No-caching fallback for everything that does not match the conditions above.
+  return(op[[fun]](input))
+}
+
+get_hash = function(x) {
+  hash = try(x$hash, silent = TRUE)
+  if (inherits(hash, "try-error") || is.null(hash))
+    hash = digest(x, algo = "xxhash64")
+  hash
+}
diff --git a/R/PipeOp.R b/R/PipeOp.R
index aee3d5958..c7500978e 100644
--- a/R/PipeOp.R
+++ b/R/PipeOp.R
@@ -115,6 +115,21 @@
 #' If the [`Graph`]'s `$keep_results` flag is set to `TRUE`, then the intermediate Results of `$train()` and `$predict()`
 #' are saved to this slot, exactly as they are returned by these functions. This is mainly for debugging purposes
 #' and done, if requested, by the [`Graph`] backend itself; it should *not* be done explicitly by `private$.train()` or `private$.predict()`.
+#' * `cache` :: `logical(1)` \cr
+#' Whether to cache the [`PipeOp`]'s state and/or output during "train" and "predict". Defaults to `TRUE`.
+#' See the `cache` field in [`Graph`] for more detailed information on caching, as well as `cache_state` and
+#' `stochastic` below.
+#' * `cache_state` :: `logical(1)` \cr
+#' Whether the [`PipeOp`]'s behaviour during training is identical to its behaviour during prediction
+#' (apart from setting a state). In this case, only the [`PipeOp`]'s state is cached.
+#' This avoids caching possibly large intermediate results.
+#' Defaults to `TRUE`.
+#' * `stochastic` :: `character` \cr
+#' The phases during which a [`PipeOp`] behaves stochastically: `"train"`, `"predict"`, or both.
+#' Defaults to `character(0)` (deterministic). Stochastic [`PipeOp`]s are not cached during the
+#' respective phase, i.e. a [`PipeOp`] is only cached during phases in which it is deterministic.
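+#' For example, [`PipeOpSmote`] sets `stochastic` to `"train"` because it samples synthetic
+#' instances during training, so its `$train()` call is never memoized.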
+#'
 #'
 #' @section Methods:
 #' * `train(input)`\cr
@@ -254,7 +269,6 @@ PipeOp = R6Class("PipeOp",
       if (is_noop(self$state)) {
         stopf("Pipeop %s got NO_OP during train but no NO_OP during predict.", self$id)
       }
-
       input = check_types(self, input, "input", "predict")
       output = private$.predict(input)
       output = check_types(self, output, "output", "predict")
@@ -296,6 +310,26 @@ PipeOp = R6Class("PipeOp",
     hash = function() {
       digest(list(class(self), self$id, self$param_set$values), algo = "xxhash64")
+    },
+    cache = function(val) {
+      if (!missing(val)) {
+        private$.cache = assert_flag(val)
+      } else {
+        private$.cache
+      }
+    },
+    cache_state = function(val) {
+      if (!missing(val)) {
+        stop("cache_state is read-only!")
+      }
+      private$.cache_state
+    },
+    stochastic = function(val) {
+      if (!missing(val)) {
+        private$.stochastic = assert_subset(val, c("train", "predict"))
+      } else {
+        private$.stochastic
+      }
     }
   ),
 
@@ -318,7 +352,10 @@ PipeOp = R6Class("PipeOp",
     .predict = function(input) stop("abstract"),
     .param_set = NULL,
     .param_set_source = NULL,
-    .id = NULL
+    .id = NULL,
+    .cache = TRUE,
+    .cache_state = TRUE,
+    .stochastic = character(0)
   )
 )
diff --git a/R/PipeOpBranch.R b/R/PipeOpBranch.R
index f02cae9f7..8986c4459 100644
--- a/R/PipeOpBranch.R
+++ b/R/PipeOpBranch.R
@@ -117,7 +117,8 @@ PipeOpBranch = R6Class("PipeOpBranch",
       ret = named_list(self$output$name, NO_OP)
       ret[[self$param_set$values$selection]] = inputs[[1]]
       ret
-    }
+    },
+    .cache = FALSE
   )
 )
diff --git a/R/PipeOpChunk.R b/R/PipeOpChunk.R
index 2f784ac5d..b2b7470cf 100644
--- a/R/PipeOpChunk.R
+++ b/R/PipeOpChunk.R
@@ -75,6 +75,17 @@ PipeOpChunk = R6Class("PipeOpChunk",
       )
     }
   ),
+  active = list(
+    stochastic = function(val) {
+      if (!missing(val)) {
+        private$.stochastic = assert_subset(val, c("train", "predict"))
+      } else {
+        # stochastic during "train" whenever shuffling is enabled
+        if (self$param_set$values$shuffle) return(union(private$.stochastic, "train"))
+        private$.stochastic
+      }
+    }
+  ),
   private = list(
     .train = function(inputs) {
       self$state = list()
@@ -88,7 +99,8 @@ PipeOpChunk = R6Class("PipeOpChunk",
     },
     .predict = function(inputs) {
       rep(inputs, self$outnum)
-    }
+    },
+    .cache = FALSE
   )
 )
diff --git a/R/PipeOpClassBalancing.R b/R/PipeOpClassBalancing.R
index 2790a2f9f..60094881f 100644
--- a/R/PipeOpClassBalancing.R
+++ b/R/PipeOpClassBalancing.R
@@ -160,7 +160,10 @@ PipeOpClassBalancing = R6Class("PipeOpClassBalancing",
       task_filter_ex(task, new_ids)
     },
-    .predict_task = identity
+    .predict_task = identity,
+    .cache = FALSE,
+    .stochastic = "train",
+    .cache_state = FALSE
   )
 )
diff --git a/R/PipeOpCopy.R b/R/PipeOpCopy.R
index 2dc9a7b11..fe3ba709b 100644
--- a/R/PipeOpCopy.R
+++ b/R/PipeOpCopy.R
@@ -99,7 +99,8 @@ PipeOpCopy = R6Class("PipeOpCopy",
     },
     .predict = function(inputs) {
       rep_len(inputs, self$outnum)
-    }
+    },
+    .cache = FALSE
   )
 )
diff --git a/R/PipeOpImputeHist.R b/R/PipeOpImputeHist.R
index e44b85f71..a5bd0fb0e 100644
--- a/R/PipeOpImputeHist.R
+++ b/R/PipeOpImputeHist.R
@@ -74,7 +74,9 @@ PipeOpImputeHist = R6Class("PipeOpImputeHist",
       }
       feature[is.na(feature)] = sampled
       feature
-    }
+    },
+    .cache = FALSE,
+    .stochastic = c("train", "predict")
   )
 )
diff --git a/R/PipeOpImputeSample.R b/R/PipeOpImputeSample.R
index f19171950..177bdb50b 100644
--- a/R/PipeOpImputeSample.R
+++ b/R/PipeOpImputeSample.R
@@ -85,7 +85,9 @@ PipeOpImputeSample = R6Class("PipeOpImputeSample",
         feature[is.na(feature)] = sample(model, outlen, replace = TRUE)
       }
       feature
-    }
+    },
+    .cache = FALSE,
+    .stochastic = c("train", "predict")
   )
 )
diff --git a/R/PipeOpNOP.R b/R/PipeOpNOP.R
index c2d70c5c8..87235cd72 100644
--- a/R/PipeOpNOP.R
+++ b/R/PipeOpNOP.R
@@ -75,7 +75,8 @@ PipeOpNOP = R6Class("PipeOpNOP",
     .predict = function(inputs) {
       inputs
-    }
+    },
+    .cache = FALSE
   )
 )
diff --git a/R/PipeOpProxy.R b/R/PipeOpProxy.R
index 56600f825..89612c33c 100644
--- a/R/PipeOpProxy.R
+++ b/R/PipeOpProxy.R
@@ -105,6 +105,35 @@ PipeOpProxy = R6Class("PipeOpProxy",
       )
     }
   ),
+  active = list(
+    cache = function(val) {
+      if (!missing(val)) {
+        self$param_set$values$content$cache = assert_flag(val)
+      } else {
+        self$param_set$values$content$cache
+      }
+    },
+    stochastic = function(val) {
+      if (!missing(val)) {
+        assert_subset(val, c("train", "predict"))
+        if (inherits(self$param_set$values$content, "Graph")) {
+          stop("'stochastic' cannot be set when content is a Graph!")
+        } else {
+          self$param_set$values$content$stochastic = val
+        }
+      } else {
+        if (inherits(self$param_set$values$content, "Graph")) return(character(0))
+        self$param_set$values$content$stochastic
+      }
+    },
+    cache_state = function(val) {
+      if (!missing(val)) {
+        stop("cache_state is read-only!")
+      } else {
+        if (inherits(self$param_set$values$content, "Graph")) return(TRUE)
+        self$param_set$values$content$cache_state
+      }
+    }
+  ),
   private = list(
     .param_set = NULL,
     .param_set_source = NULL,
diff --git a/R/PipeOpSmote.R b/R/PipeOpSmote.R
index 3f56ebef7..4c8a042ea 100644
--- a/R/PipeOpSmote.R
+++ b/R/PipeOpSmote.R
@@ -106,7 +106,9 @@ PipeOpSmote = R6Class("PipeOpSmote",
       }
       setnames(st, "class", task$target_names)
       task$rbind(st)
-    }
+    },
+    .cache = FALSE,
+    .stochastic = "train"
   )
 )
diff --git a/R/PipeOpSubsample.R b/R/PipeOpSubsample.R
index f746a3b2f..cea2606e0 100644
--- a/R/PipeOpSubsample.R
+++ b/R/PipeOpSubsample.R
@@ -93,8 +93,8 @@ PipeOpSubsample = R6Class("PipeOpSubsample",
       self$state = list()
       task_filter_ex(task, keep)
     },
-
-    .predict_task = identity
+    .predict_task = identity,
+    .cache_state = FALSE
   )
 )
diff --git a/R/PipeOpThreshold.R b/R/PipeOpThreshold.R
index 6ba93507d..a90f5c1b0 100644
--- a/R/PipeOpThreshold.R
+++ b/R/PipeOpThreshold.R
@@ -82,7 +82,9 @@ PipeOpThreshold = R6Class("PipeOpThreshold",
       }
 
       list(prd$set_threshold(thr))
-    }
+    },
+    .cache = FALSE,
+    .cache_state = FALSE
   )
 )
diff --git a/R/PipeOpUnbranch.R b/R/PipeOpUnbranch.R
index bcb4e6753..6255bfd7f 100644
--- a/R/PipeOpUnbranch.R
+++ b/R/PipeOpUnbranch.R
@@ -88,7 +88,8 @@ PipeOpUnbranch = R6Class("PipeOpUnbranch",
     },
     .predict = function(inputs) {
       filter_noop(inputs)
-    }
+    },
+    .cache = FALSE
   )
 )
diff --git a/attic/caching.md b/attic/caching.md
new file mode 100644
index 000000000..1aa076c10
--- /dev/null
+++ b/attic/caching.md
@@ -0,0 +1,216 @@
+# Caching
+
+These docs describe *opportunistic caching*, i.e. results are cached after the first function call.
+If the same function is executed twice in parallel, this does not save any time/cores.
+The example currently uses the `R.cache` package by Henrik Bengtsson for caching.
+`R.cache` is a very simple caching package that provides a clean API; it could
+theoretically be replaced by other packages.
+
+
+## Preliminaries
+
+- Pipelines should be cached at the PipeOp level, as there are rarely situations where
+  caching a full Graph would be required (e.g. tuning a graph requires caching of individual steps).
+
+- PipeOps could either cache `state` and `result` during training, or alternatively only `state`
+  when `predict` is comparatively cheap and the same transform steps can be done during `train`
+  and `predict`. For now we will call the latter pipeops `"predict_like_train"`.
+  This should be annotated in each `PipeOp` and can default to `"predict_like_train"`.
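+
+  A minimal sketch of the two modes (pseudo-code; `memoize()` is a placeholder helper,
+  not an actual API):
+  ```
+  # "predict_like_train": memoize only the state; the train output is then
+  # recomputed via the (comparatively cheap) predict transform.
+  memoize(key, { op$train(input); op$state })
+  output = op$predict(input)
+
+  # all other cases: memoize state and output together.
+  result = memoize(key, list(output = op$train(input), state = op$state))
+  ```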
+
+- PipeOps can be **stochastic**, either during `"train"`, `"predict"`, both, or not at all (deterministic).
+  Implementation suggestion:
+  ```
+  stochastic = c("train", "predict")  # character(0) means not stochastic.
+  ```
+  This needs to be annotated in each `PipeOp`. Could default to deterministic.
+
+- Caching can be turned on/off for a full `Graph` or for individual `PipeOp`s.
+  The API for this could e.g. be:
+  - `Graph` has a `cache` slot, can be set to `TRUE` or `FALSE`. Default `FALSE`?
+  - `PipeOp` has a `cache` slot, can be set to `TRUE` or `FALSE`. Default `TRUE`?
+    `PipeOp`s that should never be cached (stochastic, meta, ...) are set to `FALSE`.
+  - If `Graph$cache && PipeOp$cache`, caching is active.
+
+
+**Current implementation:**
+
+- `PipeOp` gets the following new slots:
+  - `stochastic`: a subset of `c("train", "predict")`; `character(0)` means deterministic. Default `character(0)`, set for some pos.
+  - `cache`: whether the `PipeOp` should be cached. Default `TRUE`, set to `FALSE` for some pos.
+  - `cache_state`: whether it is sufficient to cache the `$state`.
+
+  These slots are active bindings `xxx` backed by `private$.xxx`.
+
+- `Graph` gets the following new slot:
+  - `cache`: whether the `Graph` should be cached. Default `FALSE`.
+
+- New function called within `graph_reduce`: `cached_pipeop_eval`. See **Implementation Details** below.
+
+
+
+## Implementation Details
+
+Ideally we would like to do caching on an abstract level, instead of writing a caching mechanism
+for each `PipeOp`.
+
+`R.cache::evalWithMemoization` memoizes the provided expression.
+The `hash` is computed from its `key` argument.
+
+Possible solution: apply caching during `graph_reduce` (`Graph.R`):
+
+The call to `op[[fun]](input)` calls each `PipeOp`'s "train" and "predict" fun.
+Note: this is a simplified version; see the actual implementation `cached_pipeop_eval` in `Graph.R`.
+
+```
+cached_pipeop_eval = function(self, op, fun, input) {
+
+  if (self$cache && op$cache) {
+    cache_key = list(map_chr(input, get_hash), op$hash)
+    if (fun == "train") {
+      if (fun %nin% op$stochastic) {
+        # Two options: cache only the state (we can then predict on the train set using that state)
+        # or: cache state and output (if the former is not possible)
+        if (op$cache_state) {
+          R.cache::evalWithMemoization({
+            op[[fun]](input)
+            state = op$state
+          }, key = cache_key)
+          # Set state if PipeOp was cached
+          if (is.null(op$state) && fun == "train") op$state = state
+          # We call "predict" on the train inputs; this avoids storing the outputs
+          # computed during training on disk. This is only possible for some pipeops.
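+          # Recursing into "predict" below means the train output itself is never
+          # written to the cache; only the memoized state above is stored.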
+          cached_pipeop_eval(self, op, "predict", input)
+        } else {
+          R.cache::evalWithMemoization({
+            result = list(output = op[[fun]](input), state = op$state)
+          }, key = cache_key)
+          # Set state if PipeOp was cached
+          if (is.null(op$state) && fun == "train") op$state = result$state
+          return(result$output)
+        }
+      }
+    } else if (fun == "predict" && !op$cache_state) {
+      if (fun %nin% op$stochastic) {
+        R.cache::evalWithMemoization(
+          {output = op[[fun]](input)},
+          key = cache_key)
+        return(output)
+      }
+    }
+  }
+  # No-caching fallback for everything that does not match the conditions above.
+  return(op[[fun]](input))
+}
+```
+
+where `get_hash` is:
+```
+get_hash = function(x) {
+  hash = try(x$hash, silent = TRUE)
+  if (inherits(hash, "try-error") || is.null(hash))
+    hash = digest(x, algo = "xxhash64")
+  hash
+}
+```
+
+
+## Possible problems:
+
+A) Unfortunately `private$.train()` is not a pure function, but
+   instead has side-effects:
+   - it sets a `$state`
+
+If we can ensure that the only side-effect of `$.train` is a modified state,
+we could also memoize the state during `$train` (see above).
+If other fields are updated, we need to have a list of fields that are updated, or go a different route.
+
+## Further Issues:
+
+F) Should caching be optional?
+   Probably yes!
+
+G) How do we globally enable/disable caching?
+   1. A global option
+      (ugly; might not work with parallelization)
+
+   2. Caching can be turned on in `Graph` | `GraphLearner`:
+      ```
+      Graph = R6Class(
+        ...
+        caching = TRUE,
+        ...
+      )
+      ```
+      `GraphLearner` gets an active binding to turn caching of its graph on/off.
+      This could also be added as an argument to the `GraphLearner`'s constructor.
+
+      The caching of individual steps is then done by adjusting calls to `graph_reduce`:
+      `graph_reduce(..., caching = self$caching)`
+
+H) Caching can be manually disabled for any individual `PipeOp` via its `cache` field.
+
+Open Questions:
+  - How do `$train` and `$predict` know whether to do caching or not?
+    Add a second argument `caching`?
+  - How do caching and parallelization interact?
+  - Does `R.cache::evalWithMemoization`'s `key` arg need anything else?
+  - If `state` is obtained from a stochastic function, how do we want this to behave?
+
+From @mb706:
+
+- PipeOps should contain metadata about whether they are deterministic or not, and whether
+  their `.train()` and `.predict()` results are the same whenever the input to both is the same
+  (use a common vs. separate cache).
+
+  **Possible solution**
+
+  1. Add a new field:
+     ```
+     cacheable = TRUE  # or deterministic = TRUE
+     ```
+     Only `PipeOp`s where this holds are cached.
+
+  2. For `cacheable = FALSE`, the `.Random.seed` is added to the caching `key`.
+     This would allow caching of reproducible workflows.
+
+- With some operations it may make more sense to save just the `$state` and not the result.
+  Then during `$train()` the caching mechanism can set the state from cache and call `$.predict()`.
+
+  Question: How do we decide this? We should maybe think about an **API** for this.
+
+### Caching a full graph
+
+- Caching in mlrCPO was done via a wrapper-PipeOp; we could also have that here.
+  Pro: For multiple operations only the last output needs to be saved; it makes the configuration of different caching mechanisms easier.
+  Cons: We get the drawbacks of wrapping: the graph structure gets obscured. Also, when wrapping multiple operations and just one of them is nondeterministic, everything falls apart.
+  We may want a `ppl()` function that wraps a graph optimally, so that linear deterministic segments
+  are cached together and only the output of the last PipeOp is kept. (This also works for arbitrary Graphs.)
+
+  Comments:
+  - Caching the graph: Yes!
+    Caching segments of the graph?
+    This makes things unnecessarily complicated. We could instead either cache the whole graph **or**,
+    if any po is nondeterministic, cache only the deterministic pipeops.
+
+  - **Possible solution**
+    1. Wrap the graph as described above, with the pros and cons listed.
+
+    2. Cache the graph's `graph_reduce()` call in `$train`/`$predict` (in `Graph.R`),
+       similarly to how `PipeOp`s are cached above.
+       This is only possible if all pos in a graph are deterministic.
+
+
+### Caching non-deterministic `PipeOp`s
+
+This could be done if we add the `.Random.seed` to the `key`.
+Additionally, we would have to advance the `.Random.seed` properly.
+This could be added in future work, but might not be relevant now.
+
+It should be possible to enforce caching for stochastic `PipeOp`s.
+Example: I want to evaluate choices (branches) made after or before a stochastic pipeop.
+This would allow me to circumvent stochasticity.
+
+
+### User Control for caching
+
+This could basically be handled via `R.cache`'s functionality, but should somehow be documented.
+
+### Testthat
+
+How do we disable caching during unit tests?
\ No newline at end of file
diff --git a/man/Graph.Rd b/man/Graph.Rd
index 36e85150a..d8b571a7b 100644
--- a/man/Graph.Rd
+++ b/man/Graph.Rd
@@ -66,6 +66,15 @@ Stores a checksum calculated on the \code{\link{Graph}} configuration, which inc
 (and therefore their \verb{$param_set$values}) and a hash of \verb{$edges}.
 \item \code{keep_results} :: \code{logical(1)} \cr
 Whether to store intermediate results in the \code{\link{PipeOp}}'s \verb{$.result} slot, mostly for debugging purposes. Default \code{FALSE}.
+\item \code{cache} :: \code{logical(1)} \cr
+Whether to cache individual \code{\link{PipeOp}}s during "train" and "predict". Default \code{FALSE}.
+Caching is performed using the \pkg{R.cache} package.
+Caching can be disabled/enabled globally using \code{getOption("R.cache.enabled", TRUE)}.
+By default, files are cached in \code{R.cache::getCacheRootPath()}.
+For more information on how to set the cache path or retrieve cached items, please consult
+the \pkg{R.cache} documentation.
+Caching can be controlled for each \code{\link{PipeOp}} individually by adjusting its
+\code{cache}, \code{cache_state} and \code{stochastic} fields.
 }
 }
 
diff --git a/man/PipeOp.Rd b/man/PipeOp.Rd
index 21479adfc..d3ef834b2 100644
--- a/man/PipeOp.Rd
+++ b/man/PipeOp.Rd
@@ -120,6 +120,20 @@ binding and calculate the hash as \verb{digest(list(super$hash, ),
 If the \code{\link{Graph}}'s \verb{$keep_results} flag is set to \code{TRUE}, then the intermediate Results of \verb{$train()} and \verb{$predict()}
 are saved to this slot, exactly as they are returned by these functions. This is mainly for debugging purposes
 and done, if requested, by the \code{\link{Graph}} backend itself; it should \emph{not} be done explicitly by \code{private$.train()} or \code{private$.predict()}.
+\item \code{cache} :: \code{logical(1)} \cr
+Whether to cache the \code{\link{PipeOp}}'s state and/or output during "train" and "predict". Defaults to \code{TRUE}.
+See the \code{cache} field in \code{\link{Graph}} for more detailed information on caching, as well as \code{cache_state} and
+\code{stochastic} below.
+\item \code{cache_state} :: \code{logical(1)} \cr
+Whether the \code{\link{PipeOp}}'s behaviour during training is identical to its behaviour during prediction
+(apart from setting a state). In this case, only the \code{\link{PipeOp}}'s state is cached.
+This avoids caching possibly large intermediate results.
+Defaults to \code{TRUE}.
+\item \code{stochastic} :: \code{character} \cr
+The phases during which a \code{\link{PipeOp}} behaves stochastically: \code{"train"}, \code{"predict"}, or both.
+Defaults to \code{character(0)} (deterministic). Stochastic \code{\link{PipeOp}}s are not cached during the
+respective phase, i.e. a \code{\link{PipeOp}} is only cached during phases in which it is deterministic.
 }
 }
 
diff --git a/tests/testthat/helper_functions.R b/tests/testthat/helper_functions.R
index 8113af53e..28f7b2e80 100644
--- a/tests/testthat/helper_functions.R
+++ b/tests/testthat/helper_functions.R
@@ -105,6 +105,9 @@ expect_pipeop = function(po) {
   expect_names(names(po$output), permutation.of = c("name", "train", "predict"))
   expect_int(po$innum, lower = 1)
   expect_int(po$outnum, lower = 1)
+  expect_flag(po$cache)
+  expect_flag(po$cache_state)
+  expect_character(po$stochastic)
 
   # at least one of "train" or "predict" must be in every parameter's tag
   testthat::expect_true(every(po$param_set$tags, function(x) length(intersect(c("train", "predict"), x)) > 0))
diff --git a/tests/testthat/test_Graph.R b/tests/testthat/test_Graph.R
index 64c5fd2fa..494836595 100644
--- a/tests/testthat/test_Graph.R
+++ b/tests/testthat/test_Graph.R
@@ -365,3 +365,10 @@ test_that("Graph with vararg input", {
   expect_equal(list(nop.output = 1, featureunion.output = tcombined, nop2.output = 2, nop3.output = 3),
     gr$train(list(1, t1, t2, 2, 3), single_input = FALSE))
 })
+
+test_that("Caching active bindings", {
+  gr = as_graph(po("scale"))
+  expect_true(!gr$cache)
+  gr$cache = TRUE
+  expect_true(gr$cache)
+})
diff --git a/tests/testthat/test_PipeOp.R b/tests/testthat/test_PipeOp.R
index 0342f0525..c870e18ee 100644
--- a/tests/testthat/test_PipeOp.R
+++ b/tests/testthat/test_PipeOp.R
@@ -67,3 +67,15 @@ test_that("Errors occur for inputs", {
     po$param_set = ParamSet$new()
   }, "read-only")
 })
+
+test_that("Caching active bindings", {
+  po = po("scale")
+  expect_true(po$cache)
+  expect_true(po$cache_state)
+  expect_true(length(po$stochastic) == 0L)
+  po$cache = FALSE
+  expect_error({po$cache_state = TRUE})
+  po$stochastic = "train"
+  expect_true(!po$cache)
+  expect_true(po$stochastic == "train")
+})
diff --git a/tests/testthat/test_caching.R b/tests/testthat/test_caching.R
new file mode 100644
index 000000000..d5e7b266a
--- /dev/null
+++ b/tests/testthat/test_caching.R
@@ -0,0 +1,151 @@
+context("Caching")
+
+test_that("Caching works for test hash pipeop", {
+  skip_on_cran()
+  require("R.cache")
+
+  # cache to a fresh directory below tempdir
+  old_tmpdir = R.cache::getCacheRootPath()
+  test_tmpdir = file.path(tempdir(), "R.cache")
+  dir.create(test_tmpdir)
+  R.cache::setCacheRootPath(test_tmpdir)
+
+  PipeOpTestHash = R6Class("PipeOpTestHash",
+    inherit = PipeOp,
+    public = list(
+      initialize = function(id = "test.hash", param_set = ParamSet$new()) {
+        super$initialize(id = id, param_set = param_set,
+          input = data.table(name = "input", train = "*", predict = "*"),
+          output = data.table(name = "output", train = "*", predict = "*")
+        )
+      }),
+    active = list(
+      cache_state = function(val) {
+        if (missing(val))
+          return(private$.cache_state)
+        private$.cache_state = val
+      }
+    ),
+    private = list(
+      .train = function(inputs) {
+        Sys.sleep(1)
+        message("sleeping train")
+        self$state = list("train")
+        inputs
+      },
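+      # .predict only sleeps when it receives the "predict" token, so the
+      # cached-state train path (which calls predict on the train input) stays fast.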
+      .predict = function(inputs) {
+        if (inputs[[1]] == "predict") {
+          Sys.sleep(1)
+          message("sleeping predict")
+        }
+        inputs
+      },
+      .cache = TRUE,
+      .cache_state = TRUE
+    )
+  )
+
+  # caching is only performed when going through a Graph
+  gr = as_graph(PipeOpTestHash$new())
+  gr$cache = TRUE
+
+  # takes > 1 second
+  R.cache::clearCache(prompt = FALSE)
+  st = Sys.time()
+  expect_message(gr$train("train"), "sleeping train")
+  expect_true(st < Sys.time() - 1)
+
+  # takes > 1 second
+  st = Sys.time()
+  expect_message(gr$predict("predict"), "sleeping predict")
+  expect_true(st < Sys.time() - 1)
+
+  # cached train takes < 1 second
+  st = Sys.time()
+  expect_silent(gr$train("train"))
+  expect_true(gr$train("train") == "train")
+  expect_true(st > (Sys.time() - 1))
+
+  # uncached (cache_state) predict takes > 1 second
+  st = Sys.time()
+  expect_message(gr$predict("predict"), "sleeping predict")
+  expect_true(st < Sys.time() - 1)
+
+  # Obtain result from cache:
+  key = list(map_chr(list(input = "train"), get_hash), PipeOpTestHash$new()$hash)
+  # R.cache appends the expression to the key before storing
+  expr = substitute({
+    op[[fun]](input)
+    state = op$state
+  })
+  key = c(list(expr = expr), key)
+  expect_equal(R.cache::loadCache(key)$results[[1]], "train")
+
+
+  # PO stochastic is respected -----------------------
+  po = PipeOpTestHash$new()
+  po$stochastic = c("train", "predict")
+  po$cache_state = FALSE
+  gr = as_graph(po)
+  gr$cache = TRUE
+
+  # takes > 1 second
+  R.cache::clearCache(prompt = FALSE)
+  st = Sys.time()
+  expect_message(gr$train("train"), "sleeping train")
+  expect_true(st < Sys.time() - 1)
+
+  # takes > 1 second
+  st = Sys.time()
+  expect_message(gr$predict("predict"), "sleeping predict")
+  expect_true(st < Sys.time() - 1)
+  # Nothing was cached:
+  expect_true(all(list.files(R.cache::getCacheRootPath()) == "README.txt"))
+
+
+  # PO cache = FALSE is respected -----------------------
+  po = PipeOpTestHash$new()
+  po$cache = FALSE
+  gr = as_graph(po)
+  gr$cache = TRUE
+
+  # takes > 1 second
+  R.cache::clearCache(prompt = FALSE)
+  st = Sys.time()
+  expect_message(gr$train("train"), "sleeping train")
+  expect_true(st < Sys.time() - 1)
+  expect_true(all(list.files(R.cache::getCacheRootPath()) == "README.txt"))
+
+
+  # PO cache_state = FALSE is respected -------------------
+  po = PipeOpTestHash$new()
+  po$cache_state = FALSE
+  gr = as_graph(po)
+  gr$cache = TRUE
+
+  # takes > 1 second
+  R.cache::clearCache(prompt = FALSE)
+  st = Sys.time()
+  expect_message(gr$train("train"), "sleeping train")
+  expect_true(st < Sys.time() - 1)
+
+  # takes > 1 second
+  st = Sys.time()
+  expect_message(gr$predict("predict"), "sleeping predict")
+  expect_true(st < Sys.time() - 1)
+
+  # cached train takes < 1 second
+  st = Sys.time()
+  expect_silent(gr$train("train"))
+  expect_true(gr$train("train") == "train")
+  expect_true(st > (Sys.time() - 1))
+
+  # cached predict takes < 1 second
+  st = Sys.time()
+  expect_silent(gr$predict("predict"))
+  expect_true(st > Sys.time() - 1)
+  expect_true(length(list.files(R.cache::getCacheRootPath())) == 3)
+
+  # Reset the old cache path
+  R.cache::setCacheRootPath(old_tmpdir)
+  unlink(test_tmpdir, recursive = TRUE)
+})
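+
+# A sketch of a possible additional test (not part of the original suite): it relies
+# on the `R.cache.enabled` option documented above; with the option set to FALSE,
+# training a cached Graph should still work, just without memoization. `withr` is
+# already in Imports.
+test_that("caching can be disabled globally", {
+  skip_on_cran()
+  gr = as_graph(po("scale"))
+  gr$cache = TRUE
+  withr::with_options(list("R.cache.enabled" = FALSE), {
+    train_out = gr$train(mlr_tasks$get("iris"))
+    expect_true(inherits(train_out[[1]], "Task"))
+  })
+})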