Merge pull request #75 from 0UmfHxcvx5J7JoaOhFSs5mncnisTJJ6q/dev/filt…

…er_read.quitte allow filtering of large data sets in read.quitte()
pik-piam · Oct 18, 2023 · baf83e4 · baf83e4
2 parents a5b69f0 + 36532fe
commit baf83e4
Show file tree

Hide file tree

Showing 7 changed files with 176 additions and 34 deletions.
diff --git a/.buildlibrary b/.buildlibrary
@@ -1,4 +1,4 @@
-ValidationKey: '6.14e+08'
+ValidationKey: '614196480'
 AcceptedWarnings:
 - 'Warning: package ''.*'' was built under R version'
 - 'Warning: namespace ''.*'' is not available and has been replaced'

diff --git a/CITATION.cff b/CITATION.cff
@@ -2,7 +2,7 @@ cff-version: 1.2.0
 message: If you use this software, please cite it using the metadata from this file.
 type: software
 title: 'quitte: Bits and pieces of code to use with quitte-style data frames'
-version: 0.3125.0
+version: 0.3126.0
 date-released: '2023-10-18'
 abstract: A collection of functions for easily dealing with quitte-style data frames,
   doing multi-model comparisons and plots.

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,7 +1,7 @@
 Type: Package
 Package: quitte
 Title: Bits and pieces of code to use with quitte-style data frames
-Version: 0.3125.0
+Version: 0.3126.0
 Date: 2023-10-18
 Authors@R: c(
     person("Michaja", "Pehl", , "[email protected]", role = c("aut", "cre")),

diff --git a/NAMESPACE b/NAMESPACE
@@ -131,8 +131,9 @@ importFrom(magclass,getSets)
 importFrom(magrittr,"%>%")
 importFrom(plyr,revalue)
 importFrom(purrr,map)
+importFrom(readr,DataFrameCallback)
 importFrom(readr,problems)
-importFrom(readr,read_delim)
+importFrom(readr,read_delim_chunked)
 importFrom(readr,read_lines)
 importFrom(readr,write_lines)
 importFrom(readxl,excel_sheets)

diff --git a/R/read.quitte.R b/R/read.quitte.R
@@ -21,6 +21,38 @@
 #'   present, is returned as a `comment_header` attribute.  If multiple files
 #'   are read, the `comment_header` attribute is a list of comment headers with
 #'   file paths as names.
+#' @param filter.function A function used to filter data during read.  See
+#'     Details.
+#' @param chunk_size Number of lines to read at a time.  Defaults to 200000.
+#'     (REMIND .mif files have between 55000 and 105000 lines for H12 and EU21
+#'     regional settings, respectively.)
+#'
+#' @details
+#' In order to process large data sets, like IIASA data base snapshots,
+#' `read.quitte()` reads provided files (other then Excel files) in chunks of
+#' `chunk_size` lines, and applies `filter.function()` to the chunks.  This
+#' allows for filtering data piece-by-piece, without exceeding available memory.
+#' `filter.function` is a function taking one argument, a quitte data frame of
+#' the read chunk, and is expected to return a data frame.  Usually it should
+#' simply contain all the filters usually applied after all the data is read in.
+#' Suppose there is a file `big_IIASA_snapshot.csv`, from which only data for
+#' the REMIND and MESSAGE models between the years 2020 to 2050 is of interest.
+#' Normally, this data would be processed as
+#' ```
+#' read.quitte(file = 'big_IIASA_snapshot.csv') %>%
+#'     filter(grepl('^(REMIND|MESSAGE)', .data$model),
+#'            between(.data$period, 2020, 2060))
+#' ```
+#' If however `big_IIASA_snapshot.csv` is too large to be read in completely,
+#' it can be read using
+#' ```
+#' read.quitte(file = 'big_IIASA_snapshot.csv',
+#'             filter.function = function(x) {
+#'                 x %>%
+#'                     filter(grepl('^(REMIND|MESSAGE)', .data$model),
+#'                            between(.data$period, 2020, 2060))
+#'             })
+#' ```
 #'
 #' @return A quitte data frame.
 #'
@@ -37,7 +69,7 @@
 #' @importFrom forcats as_factor
 #' @importFrom magrittr %>%
 #' @importFrom rlang .data is_empty
-#' @importFrom readr problems read_delim read_lines
+#' @importFrom readr DataFrameCallback problems read_delim_chunked read_lines
 #' @importFrom readxl read_excel excel_sheets
 #' @importFrom tidyr pivot_longer
 #' @importFrom tidyselect all_of
@@ -52,13 +84,20 @@ read.quitte <- function(file,
                         check.duplicates = TRUE,
                         factors = TRUE,
                         drop.na = FALSE,
-                        comment = '#') {
+                        comment = '#',
+                        filter.function = NULL,
+                        chunk_size = 200000L) {
 
     if (!length(file))
         stop('\'file\' is empty.')
 
+    if (   !is.null(filter.function)
+           && !is.function(filter.function)
+           && 1 != length(formals(filter.function)))
+        stop('`filter.function` must be a function taking only one argument.')
+
     .read.quitte <- function(f, sep, quote, na.strings, convert.periods,
-                             drop.na, comment) {
+                             drop.na, comment, filter.function, chunk_size) {
 
         default.columns  <- c("model", "scenario", "region", "variable", "unit")
 
@@ -104,30 +143,60 @@ read.quitte <- function(file,
                             collapse = '')
 
         # read data ----
+
+        # the callback function accepts a chunk of data, `x`, pivots the periods
+        # to long format (dropping NAs if required), converts the periods to
+        # integer or POSIXct values as required, and applies the
+        # `filter.function`.  If the `filter.function` is `NULL`, it just
+        # returns the processed data.
+        chunk_callback <- DataFrameCallback$new(
+            (function(F, convert.periods, drop.na) {
+                if (is.null(F))
+                    F <- function(x) { x }
+
+                function(x, pos) {
+                    if ('problems' %in% names(attributes(x))) {
+                        p <- problems(x)
+                    }
+                    else {
+                        p <- NULL
+                    }
+
+                    x <- x %>%
+                        relocate(all_of(default.columns)) %>%
+                        # convert to long format
+                        pivot_longer(all_of(periods), names_to = 'period',
+                                     values_drop_na = drop.na) %>%
+                        # convert periods
+                        mutate(period = gsub('^[A-Za-z]?', '', .data$period),
+                               period = if (convert.periods) {
+                                   ISOyear(.data$period)
+                               } else {
+                                   as.integer(as.character(.data$period))
+                               }) %>%
+                        # apply filter
+                        F()
+
+                    if (!is.null(p))
+                        attr(x, 'problems') <- p
+
+                    return(x)
+                }
+            })(filter.function, convert.periods, drop.na)
+        )
+
         data <- suppressWarnings(
-            read_delim(file = f, quote = quote, col_names = c(header),
-                       col_types = colClasses, delim = sep, na = na.strings,
-                       skip = length(comment_header) + 1, comment = comment,
-                       trim_ws = TRUE)
+            read_delim_chunked(
+                file = f, callback = chunk_callback, delim = sep,
+                chunk_size = chunk_size, quote = quote, col_names = header,
+                col_types = colClasses, na = na.strings, comment = comment,
+                trim_ws = TRUE, skip = length(comment_header) + 1)
         )
 
         # catch any parsing problems
         data_problems <- if (nrow(problems(data))) {
-            problems(data)
-        }
-
-        data <- data %>%
-            relocate(all_of(default.columns)) %>%
-            # convert to long format
-            pivot_longer(all_of(periods), names_to = 'period',
-                         values_drop_na = drop.na)
-
-        # convert periods ----
-        data$period <- gsub("^[A-Za-z]?", "", data$period)
-        if (convert.periods) {
-            data$period <- ISOyear(data$period)
-        } else {
-            data$period <- as.integer(as.character(data$period))
+            problems(data) %>%
+                mutate(file = f)
         }
 
         # re-attach parsing problems
@@ -143,7 +212,7 @@ read.quitte <- function(file,
     comment_header <- list()
     for (f in file) {
         data <- .read.quitte(f, sep, quote, na.strings, convert.periods,
-                             drop.na, comment)
+                             drop.na, comment, filter.function, chunk_size)
         quitte <- bind_rows(quitte, data)
         quitte_problems <- bind_rows(quitte_problems, attr(data, 'problems'))
         comment_header <- c(comment_header,

diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # Bits and pieces of code to use with quitte-style data frames
 
-R package **quitte**, version **0.3125.0**
+R package **quitte**, version **0.3126.0**
 
 [![CRAN status](https://www.r-pkg.org/badges/version/quitte)](https://cran.r-project.org/package=quitte)  [![R build status](https://github.com/pik-piam/quitte/workflows/check/badge.svg)](https://github.com/pik-piam/quitte/actions) [![codecov](https://codecov.io/gh/pik-piam/quitte/branch/master/graph/badge.svg)](https://app.codecov.io/gh/pik-piam/quitte) [![r-universe](https://pik-piam.r-universe.dev/badges/quitte)](https://pik-piam.r-universe.dev/builds)
 
@@ -47,7 +47,7 @@ In case of questions / problems please contact Michaja Pehl <michaja.pehl@pik-po
 
 To cite package **quitte** in publications use:
 
-Pehl M, Bauer N, Hilaire J, Levesque A, Luderer G, Schultes A, Dietrich J, Richters O (2023). _quitte: Bits and pieces of code to use with quitte-style data frames_. R package version 0.3125.0, <https://github.com/pik-piam/quitte>.
+Pehl M, Bauer N, Hilaire J, Levesque A, Luderer G, Schultes A, Dietrich J, Richters O (2023). _quitte: Bits and pieces of code to use with quitte-style data frames_. R package version 0.3126.0, <https://github.com/pik-piam/quitte>.
 
 A BibTeX entry for LaTeX users is
 
@@ -56,7 +56,7 @@ A BibTeX entry for LaTeX users is
   title = {quitte: Bits and pieces of code to use with quitte-style data frames},
   author = {Michaja Pehl and Nico Bauer and Jérôme Hilaire and Antoine Levesque and Gunnar Luderer and Anselm Schultes and Jan Philipp Dietrich and Oliver Richters},
   year = {2023},
-  note = {R package version 0.3125.0},
+  note = {R package version 0.3126.0},
   url = {https://github.com/pik-piam/quitte},
 }
 ```
diff --git a/tests/testthat/test-read.quitte.R b/tests/testthat/test-read.quitte.R
@@ -1,8 +1,6 @@
-context('read.quitte()')
-
-# read.quitte() results ----
+# read.quitte() reads .mif files correctly ----
 test_that(
-  'read.quitte() results',
+  'read.quitte() reads .mif files correctly',
   {
     object = read.quitte(
       system.file('extdata',
@@ -51,3 +49,77 @@ test_that(
       object = d1,
       expected = d2)
   })
+
+# read.quitte() reports problems ----
+test_that(
+  desc = 'read.quitte() reports problems',
+  code = {
+    # insert parsing problem into temporary file
+    x <- read_lines(system.file('extdata', 'extra_column.mif',
+                                package = 'quitte', mustWork = TRUE))
+    x[[length(x)]] <- sub('[0-9]+$', 'Inf', x[[length(x)]])
+    tmp <- tempfile('read_delim_problem', tempdir(), '.mif')
+    write_lines(x, tmp)
+
+    # warns about problems
+    expect_warning(
+      object = x <- read.quitte(tmp),
+      regexp = 'One or more parsing issues')
+
+    # returns a data frame as the `problems` attribute
+    expect_s3_class(object = problems(x), class = 'data.frame')
+    # data frame has more then one row
+    expect_gt(object = nrow(problems(x)), expected = 0)
+    # data frame has a `file` column
+    expect_contains(object = colnames(problems(x)), expected = 'file')
+
+    unlink(tmp)
+  })
+
+# read.quitte(filter.function) filters correctly ----
+df_diff <- function(lhs, rhs, value = value) {
+  value <- sub('^"(.*)"$', '\\1', deparse(substitute(value)))
+  lhs.value <- paste0('lhs.', value)
+  rhs.value <- paste0('rhs.', value)
+
+  tmp <- full_join(
+    lhs %>%
+      rename(!!sym(lhs.value) := all_of(value)),
+
+    rhs %>%
+      rename(!!sym(rhs.value) := all_of(value))
+  ) %>%
+    filter(!!sym(lhs.value) != !!sym(rhs.value))
+
+  if (0 == nrow(tmp)) {
+    return(NULL)
+  }
+  else {
+    return(tmp)
+  }
+}
+
+lhs <- read.quitte(
+  file = system.file('extdata',
+                     c('REMIND_generic_r7552c_1p5C_Def-rem-5.mif',
+                       'REMIND_generic_r7552c_1p5C_UBA_Sust-rem-5.mif',
+                       'REMIND_generic_r7552c_2C_Def-rem-5.mif',
+                       'REMIND_generic_r7552c_2C_UBA_Sustlife-rem-5.mif',
+                       'REMIND_generic_r7552c_REF_Def05-rem-5.mif'),
+                     package = 'quitte')) %>%
+  filter(!grepl('^PE\\|', variable))
+
+rhs <- read.quitte(
+  file = system.file('extdata',
+                     c('REMIND_generic_r7552c_1p5C_Def-rem-5.mif',
+                       'REMIND_generic_r7552c_1p5C_UBA_Sust-rem-5.mif',
+                       'REMIND_generic_r7552c_2C_Def-rem-5.mif',
+                       'REMIND_generic_r7552c_2C_UBA_Sustlife-rem-5.mif',
+                       'REMIND_generic_r7552c_REF_Def05-rem-5.mif'),
+                     package = 'quitte'),
+  filter.function = function(x) {
+    x %>%
+      filter(!grepl('^PE\\|', variable))
+  })
+
+expect_null(df_diff(lhs, rhs))