Skip to content

Commit

Permalink
Merge pull request #75 from 0UmfHxcvx5J7JoaOhFSs5mncnisTJJ6q/dev/filt…
Browse files Browse the repository at this point in the history
…er_read.quitte

allow filtering of large data sets in read.quitte()
  • Loading branch information
0UmfHxcvx5J7JoaOhFSs5mncnisTJJ6q authored Oct 18, 2023
2 parents a5b69f0 + 36532fe commit baf83e4
Show file tree
Hide file tree
Showing 7 changed files with 176 additions and 34 deletions.
2 changes: 1 addition & 1 deletion .buildlibrary
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
ValidationKey: '6.14e+08'
ValidationKey: '614196480'
AcceptedWarnings:
- 'Warning: package ''.*'' was built under R version'
- 'Warning: namespace ''.*'' is not available and has been replaced'
Expand Down
2 changes: 1 addition & 1 deletion CITATION.cff
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ cff-version: 1.2.0
message: If you use this software, please cite it using the metadata from this file.
type: software
title: 'quitte: Bits and pieces of code to use with quitte-style data frames'
version: 0.3125.0
version: 0.3126.0
date-released: '2023-10-18'
abstract: A collection of functions for easily dealing with quitte-style data frames,
doing multi-model comparisons and plots.
Expand Down
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
Type: Package
Package: quitte
Title: Bits and pieces of code to use with quitte-style data frames
Version: 0.3125.0
Version: 0.3126.0
Date: 2023-10-18
Authors@R: c(
person("Michaja", "Pehl", , "[email protected]", role = c("aut", "cre")),
Expand Down
3 changes: 2 additions & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -131,8 +131,9 @@ importFrom(magclass,getSets)
importFrom(magrittr,"%>%")
importFrom(plyr,revalue)
importFrom(purrr,map)
importFrom(readr,DataFrameCallback)
importFrom(readr,problems)
importFrom(readr,read_delim)
importFrom(readr,read_delim_chunked)
importFrom(readr,read_lines)
importFrom(readr,write_lines)
importFrom(readxl,excel_sheets)
Expand Down
115 changes: 92 additions & 23 deletions R/read.quitte.R
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,38 @@
#' present, is returned as a `comment_header` attribute. If multiple files
#' are read, the `comment_header` attribute is a list of comment headers with
#' file paths as names.
#' @param filter.function A function used to filter data during read. See
#' Details.
#' @param chunk_size Number of lines to read at a time. Defaults to 200000.
#' (REMIND .mif files have between 55000 and 105000 lines for H12 and EU21
#' regional settings, respectively.)
#'
#' @details
#' In order to process large data sets, like IIASA data base snapshots,
#' `read.quitte()` reads provided files (other then Excel files) in chunks of
#' `chunk_size` lines, and applies `filter.function()` to the chunks. This
#' allows for filtering data piece-by-piece, without exceeding available memory.
#' `filter.function` is a function taking one argument, a quitte data frame of
#' the read chunk, and is expected to return a data frame. Usually it should
#' simply contain all the filters usually applied after all the data is read in.
#' Suppose there is a file `big_IIASA_snapshot.csv`, from which only data for
#' the REMIND and MESSAGE models between the years 2020 to 2050 is of interest.
#' Normally, this data would be processed as
#' ```
#' read.quitte(file = 'big_IIASA_snapshot.csv') %>%
#' filter(grepl('^(REMIND|MESSAGE)', .data$model),
#' between(.data$period, 2020, 2060))
#' ```
#' If however `big_IIASA_snapshot.csv` is too large to be read in completely,
#' it can be read using
#' ```
#' read.quitte(file = 'big_IIASA_snapshot.csv',
#' filter.function = function(x) {
#' x %>%
#' filter(grepl('^(REMIND|MESSAGE)', .data$model),
#' between(.data$period, 2020, 2060))
#' })
#' ```
#'
#' @return A quitte data frame.
#'
Expand All @@ -37,7 +69,7 @@
#' @importFrom forcats as_factor
#' @importFrom magrittr %>%
#' @importFrom rlang .data is_empty
#' @importFrom readr problems read_delim read_lines
#' @importFrom readr DataFrameCallback problems read_delim_chunked read_lines
#' @importFrom readxl read_excel excel_sheets
#' @importFrom tidyr pivot_longer
#' @importFrom tidyselect all_of
Expand All @@ -52,13 +84,20 @@ read.quitte <- function(file,
check.duplicates = TRUE,
factors = TRUE,
drop.na = FALSE,
comment = '#') {
comment = '#',
filter.function = NULL,
chunk_size = 200000L) {

if (!length(file))
stop('\'file\' is empty.')

if ( !is.null(filter.function)
&& !is.function(filter.function)
&& 1 != length(formals(filter.function)))
stop('`filter.function` must be a function taking only one argument.')

.read.quitte <- function(f, sep, quote, na.strings, convert.periods,
drop.na, comment) {
drop.na, comment, filter.function, chunk_size) {

default.columns <- c("model", "scenario", "region", "variable", "unit")

Expand Down Expand Up @@ -104,30 +143,60 @@ read.quitte <- function(file,
collapse = '')

# read data ----

# the callback function accepts a chunk of data, `x`, pivots the periods
# to long format (dropping NAs if required), converts the periods to
# integer or POSIXct values as required, and applies the
# `filter.function`. If the `filter.function` is `NULL`, it just
# returns the processed data.
chunk_callback <- DataFrameCallback$new(
(function(F, convert.periods, drop.na) {
if (is.null(F))
F <- function(x) { x }

function(x, pos) {
if ('problems' %in% names(attributes(x))) {
p <- problems(x)
}
else {
p <- NULL
}

x <- x %>%
relocate(all_of(default.columns)) %>%
# convert to long format
pivot_longer(all_of(periods), names_to = 'period',
values_drop_na = drop.na) %>%
# convert periods
mutate(period = gsub('^[A-Za-z]?', '', .data$period),
period = if (convert.periods) {
ISOyear(.data$period)
} else {
as.integer(as.character(.data$period))
}) %>%
# apply filter
F()

if (!is.null(p))
attr(x, 'problems') <- p

return(x)
}
})(filter.function, convert.periods, drop.na)
)

data <- suppressWarnings(
read_delim(file = f, quote = quote, col_names = c(header),
col_types = colClasses, delim = sep, na = na.strings,
skip = length(comment_header) + 1, comment = comment,
trim_ws = TRUE)
read_delim_chunked(
file = f, callback = chunk_callback, delim = sep,
chunk_size = chunk_size, quote = quote, col_names = header,
col_types = colClasses, na = na.strings, comment = comment,
trim_ws = TRUE, skip = length(comment_header) + 1)
)

# catch any parsing problems
data_problems <- if (nrow(problems(data))) {
problems(data)
}

data <- data %>%
relocate(all_of(default.columns)) %>%
# convert to long format
pivot_longer(all_of(periods), names_to = 'period',
values_drop_na = drop.na)

# convert periods ----
data$period <- gsub("^[A-Za-z]?", "", data$period)
if (convert.periods) {
data$period <- ISOyear(data$period)
} else {
data$period <- as.integer(as.character(data$period))
problems(data) %>%
mutate(file = f)
}

# re-attach parsing problems
Expand All @@ -143,7 +212,7 @@ read.quitte <- function(file,
comment_header <- list()
for (f in file) {
data <- .read.quitte(f, sep, quote, na.strings, convert.periods,
drop.na, comment)
drop.na, comment, filter.function, chunk_size)
quitte <- bind_rows(quitte, data)
quitte_problems <- bind_rows(quitte_problems, attr(data, 'problems'))
comment_header <- c(comment_header,
Expand Down
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# Bits and pieces of code to use with quitte-style data frames

R package **quitte**, version **0.3125.0**
R package **quitte**, version **0.3126.0**

[![CRAN status](https://www.r-pkg.org/badges/version/quitte)](https://cran.r-project.org/package=quitte) [![R build status](https://github.com/pik-piam/quitte/workflows/check/badge.svg)](https://github.com/pik-piam/quitte/actions) [![codecov](https://codecov.io/gh/pik-piam/quitte/branch/master/graph/badge.svg)](https://app.codecov.io/gh/pik-piam/quitte) [![r-universe](https://pik-piam.r-universe.dev/badges/quitte)](https://pik-piam.r-universe.dev/builds)

Expand Down Expand Up @@ -47,7 +47,7 @@ In case of questions / problems please contact Michaja Pehl <michaja.pehl@pik-po

To cite package **quitte** in publications use:

Pehl M, Bauer N, Hilaire J, Levesque A, Luderer G, Schultes A, Dietrich J, Richters O (2023). _quitte: Bits and pieces of code to use with quitte-style data frames_. R package version 0.3125.0, <https://github.com/pik-piam/quitte>.
Pehl M, Bauer N, Hilaire J, Levesque A, Luderer G, Schultes A, Dietrich J, Richters O (2023). _quitte: Bits and pieces of code to use with quitte-style data frames_. R package version 0.3126.0, <https://github.com/pik-piam/quitte>.

A BibTeX entry for LaTeX users is

Expand All @@ -56,7 +56,7 @@ A BibTeX entry for LaTeX users is
title = {quitte: Bits and pieces of code to use with quitte-style data frames},
author = {Michaja Pehl and Nico Bauer and Jérôme Hilaire and Antoine Levesque and Gunnar Luderer and Anselm Schultes and Jan Philipp Dietrich and Oliver Richters},
year = {2023},
note = {R package version 0.3125.0},
note = {R package version 0.3126.0},
url = {https://github.com/pik-piam/quitte},
}
```
80 changes: 76 additions & 4 deletions tests/testthat/test-read.quitte.R
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
context('read.quitte()')

# read.quitte() results ----
# read.quitte() reads .mif files correctly ----
test_that(
'read.quitte() results',
'read.quitte() reads .mif files correctly',
{
object = read.quitte(
system.file('extdata',
Expand Down Expand Up @@ -51,3 +49,77 @@ test_that(
object = d1,
expected = d2)
})

# read.quitte() reports problems ----
test_that(
desc = 'read.quitte() reports problems',
code = {
# insert parsing problem into temporary file
x <- read_lines(system.file('extdata', 'extra_column.mif',
package = 'quitte', mustWork = TRUE))
x[[length(x)]] <- sub('[0-9]+$', 'Inf', x[[length(x)]])
tmp <- tempfile('read_delim_problem', tempdir(), '.mif')
write_lines(x, tmp)

# warns about problems
expect_warning(
object = x <- read.quitte(tmp),
regexp = 'One or more parsing issues')

# returns a data frame as the `problems` attribute
expect_s3_class(object = problems(x), class = 'data.frame')
# data frame has more then one row
expect_gt(object = nrow(problems(x)), expected = 0)
# data frame has a `file` column
expect_contains(object = colnames(problems(x)), expected = 'file')

unlink(tmp)
})

# read.quitte(filter.function) filters correctly ----
df_diff <- function(lhs, rhs, value = value) {
value <- sub('^"(.*)"$', '\\1', deparse(substitute(value)))
lhs.value <- paste0('lhs.', value)
rhs.value <- paste0('rhs.', value)

tmp <- full_join(
lhs %>%
rename(!!sym(lhs.value) := all_of(value)),

rhs %>%
rename(!!sym(rhs.value) := all_of(value))
) %>%
filter(!!sym(lhs.value) != !!sym(rhs.value))

if (0 == nrow(tmp)) {
return(NULL)
}
else {
return(tmp)
}
}

lhs <- read.quitte(
file = system.file('extdata',
c('REMIND_generic_r7552c_1p5C_Def-rem-5.mif',
'REMIND_generic_r7552c_1p5C_UBA_Sust-rem-5.mif',
'REMIND_generic_r7552c_2C_Def-rem-5.mif',
'REMIND_generic_r7552c_2C_UBA_Sustlife-rem-5.mif',
'REMIND_generic_r7552c_REF_Def05-rem-5.mif'),
package = 'quitte')) %>%
filter(!grepl('^PE\\|', variable))

rhs <- read.quitte(
file = system.file('extdata',
c('REMIND_generic_r7552c_1p5C_Def-rem-5.mif',
'REMIND_generic_r7552c_1p5C_UBA_Sust-rem-5.mif',
'REMIND_generic_r7552c_2C_Def-rem-5.mif',
'REMIND_generic_r7552c_2C_UBA_Sustlife-rem-5.mif',
'REMIND_generic_r7552c_REF_Def05-rem-5.mif'),
package = 'quitte'),
filter.function = function(x) {
x %>%
filter(!grepl('^PE\\|', variable))
})

expect_null(df_diff(lhs, rhs))

0 comments on commit baf83e4

Please sign in to comment.