Merge pull request #78 from orichters/read.snapshot

Add read.snapshot to load IIASA database snapshots
pik-piam · Oct 19, 2023 · f6f6147 · f6f6147
2 parents ab88750 + 1843108
commit f6f6147
Show file tree

Hide file tree

Showing 11 changed files with 214 additions and 11 deletions.
diff --git a/.buildlibrary b/.buildlibrary
@@ -1,4 +1,4 @@
-ValidationKey: '614216128'
+ValidationKey: '614424230'
 AcceptedWarnings:
 - 'Warning: package ''.*'' was built under R version'
 - 'Warning: namespace ''.*'' is not available and has been replaced'

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -15,7 +15,7 @@ repos:
     -   id: mixed-line-ending
 
 -   repo: https://github.com/lorenzwalthert/precommit
-    rev: v0.3.2.9019
+    rev: v0.3.2.9021
     hooks:
     -   id: parsable-R
     -   id: deps-in-desc

diff --git a/CITATION.cff b/CITATION.cff
@@ -2,8 +2,8 @@ cff-version: 1.2.0
 message: If you use this software, please cite it using the metadata from this file.
 type: software
 title: 'quitte: Bits and pieces of code to use with quitte-style data frames'
-version: 0.3126.1
-date-released: '2023-10-18'
+version: 0.3127.0
+date-released: '2023-10-19'
 abstract: A collection of functions for easily dealing with quitte-style data frames,
   doing multi-model comparisons and plots.
 authors:

diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Type: Package
 Package: quitte
 Title: Bits and pieces of code to use with quitte-style data frames
-Version: 0.3126.1
-Date: 2023-10-18
+Version: 0.3127.0
+Date: 2023-10-19
 Authors@R: c(
     person("Michaja", "Pehl", , "[email protected]", role = c("aut", "cre")),
     person("Nico", "Bauer", , "[email protected]", role = "aut"),
@@ -53,7 +53,7 @@ Suggests:
     knitr,
     mip,
     rmarkdown,
-    testthat,
+    testthat (>= 3.2.0),
     tidyverse,
 VignetteBuilder:
     knitr

diff --git a/NAMESPACE b/NAMESPACE
@@ -69,6 +69,7 @@ export(quitte2quantiles)
 export(quitteSort)
 export(read.gdx)
 export(read.quitte)
+export(read.snapshot)
 export(removeColNa)
 export(replace_column)
 export(replace_column_)

diff --git a/R/read.snapshot.R b/R/read.snapshot.R
@@ -0,0 +1,71 @@
+#' Reads IAMC-style .csv files obtained as a IIASA snapshot into a quitte data frame,
+#' filtering the data. This function is helpful if the csv file is large and R runs out
+#' of memory loading it completely. This function requires head, tail and grep on your system.
+#' If not supported, use read.quitte().
+#'
+#' @md
+#' @param file Path of single IAMC-style .csv/.mif file
+#' @param keep list with quitte columns as names and data points that should be kept.
+#'
+#' @return A quitte data frame.
+#'
+#' @author Oliver Richters
+#'
+#' @examples
+#' \dontrun{
+#' read.filter.snapshot("snapshot.csv", list(scenario = c("CurPol", "NDC"), region = "World"))
+#' }
+#'
+#' @importFrom dplyr filter
+#'
+#' @export
+
+read.snapshot <- function(file, keep = list()) {
+  unknowntype <- setdiff(names(keep), c("model", "scenario", "region", "variable", "unit", "period"))
+  if (length(unknowntype) > 0) {
+    stop("Unknown types to be kept: ", toString(unknowntype))
+  }
+  testcommand <- c("grep", "head", "tail", "sed")
+  exitcodes <- suppressWarnings(
+      sapply(paste(testcommand, '--version'), system,
+             ignore.stdout = TRUE, ignore.stderr = TRUE))
+  if (any(0 != exitcodes)) {
+      stop(paste(paste0('`', testcommand[0 != exitcodes], '`', collapse = ', '),
+                "are not available system commands, please use 'read.quitte'."))
+  }
+
+  # temporary file
+  tmpfile <- tempfile(pattern = "data", fileext = ".csv")
+  if (length(setdiff(names(keep), "period")) > 0) {
+    # always keep first lines of original file (comments, colnames), grep in the rest
+    alwayskeep <- 20
+    system(paste("head -n", alwayskeep, file, ">", tmpfile))
+    # the goal of the next lines is to grep one after the other through the elements of keep
+    # keep = list(variable = "GDP|PPP", region = c("World", "FRA")) should get you
+    # | grep -E '(GDP\|PPP)' | grep -E '(World|FRA)'
+    # 1. escape | in variable names and do not grep for period
+    cleanup <- function(x) {
+      x <- gsub("[^A-Za-z0-9\\| ]", ".", x)
+      x <- gsub("|", "\\|", x, fixed = TRUE)        
+    }
+    keepescaped <- lapply(keep[setdiff(names(keep), "period")], cleanup)
+    # 2. collapse each element with a |
+    keepcollapsed <- unlist(lapply(keepescaped, paste0, collapse = "|"))
+    # generate a grep -E statement for each element of keep list
+    greptext <- paste0(" | grep -E '(", keepcollapsed, ")'", collapse = "")
+    command <- paste0("tail -n +", (alwayskeep + 1), " ", file, greptext, " >> ", tmpfile)
+    system(command)
+  } else {
+    file.copy(file, tmpfile, overwrite = TRUE)
+  }
+  # read file and do correct filtering
+  data <- read.quitte(tmpfile,
+                      na.strings = c("UNDF", "NA", "N/A", "n_a", ""),
+                      quote = '"',
+                      drop.na = TRUE)
+  unlink(tmpfile)
+  for (t in names(keep)) {
+    data <- droplevels(filter(data, .data[[t]] %in% keep[[t]]))
+  }
+  return(data)
+}
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 # Bits and pieces of code to use with quitte-style data frames
 
-R package **quitte**, version **0.3126.1**
+R package **quitte**, version **0.3127.0**
 
 [![CRAN status](https://www.r-pkg.org/badges/version/quitte)](https://cran.r-project.org/package=quitte)  [![R build status](https://github.com/pik-piam/quitte/workflows/check/badge.svg)](https://github.com/pik-piam/quitte/actions) [![codecov](https://codecov.io/gh/pik-piam/quitte/branch/master/graph/badge.svg)](https://app.codecov.io/gh/pik-piam/quitte) [![r-universe](https://pik-piam.r-universe.dev/badges/quitte)](https://pik-piam.r-universe.dev/builds)
 
@@ -47,7 +47,7 @@ In case of questions / problems please contact Michaja Pehl <michaja.pehl@pik-po
 
 To cite package **quitte** in publications use:
 
-Pehl M, Bauer N, Hilaire J, Levesque A, Luderer G, Schultes A, Dietrich J, Richters O (2023). _quitte: Bits and pieces of code to use with quitte-style data frames_. R package version 0.3126.1, <https://github.com/pik-piam/quitte>.
+Pehl M, Bauer N, Hilaire J, Levesque A, Luderer G, Schultes A, Dietrich J, Richters O (2023). _quitte: Bits and pieces of code to use with quitte-style data frames_. R package version 0.3127.0, <URL: https://github.com/pik-piam/quitte>.
 
 A BibTeX entry for LaTeX users is
 
@@ -56,7 +56,7 @@ A BibTeX entry for LaTeX users is
   title = {quitte: Bits and pieces of code to use with quitte-style data frames},
   author = {Michaja Pehl and Nico Bauer and Jérôme Hilaire and Antoine Levesque and Gunnar Luderer and Anselm Schultes and Jan Philipp Dietrich and Oliver Richters},
   year = {2023},
-  note = {R package version 0.3126.1},
+  note = {R package version 0.3127.0},
   url = {https://github.com/pik-piam/quitte},
 }
 ```
diff --git a/man/df_variation.Rd b/man/df_variation.Rd
diff --git a/man/read.quitte.Rd b/man/read.quitte.Rd
diff --git a/man/read.snapshot.Rd b/man/read.snapshot.Rd
diff --git a/tests/testthat/test-read.snapshot.R b/tests/testthat/test-read.snapshot.R
@@ -0,0 +1,37 @@
+test_that("read.snapshot works", {
+  qe <- droplevels(quitteSort(as.quitte(quitte_example_dataAR6, na.rm = TRUE)))
+  tmpfile <- tempfile(pattern = "data", fileext = ".csv")
+  write.table(pivot_wider(qe, names_from = "period"), 
+              file = tmpfile, append = FALSE, quote = FALSE, sep = ",",
+              eol = "\n", na = "", dec = ".", row.names = FALSE,
+              col.names = TRUE) # mimick IIASA snapshot format
+  fails <- tryCatch(read.snapshot(tmpfile), error = function(e) { paste(e) })
+  if (is.character(fails) && length(fails) == 1 && grepl("not available system commands", fails)) {
+    skip(paste0(gsub("Error in ", "", gsub(", pleas.*", "", fails)), ", skipping tests."))
+  }
+  system(paste("sed -i 's/GCAM/\"GCAM\"/g;'", tmpfile))
+  system(paste("sed -i 's/Delayed transition/\"Delayed transition\"/g;'", tmpfile))
+  rtests <- list(head(levels(qe$region), 1), head(levels(qe$region), 2))
+  for (r in rtests) {
+    expect_equal(droplevels(dplyr::filter(qe, region %in% r)),
+                 read.snapshot(tmpfile, list(region = r)))
+  }
+  stests <- list(head(levels(qe$scenario), 1), head(levels(qe$scenario), 2))
+  for (s in stests) {
+    expect_equal(droplevels(dplyr::filter(qe, scenario %in% s)),
+                 read.snapshot(tmpfile, list(scenario = s)))
+  }
+  vtests <- list(head(levels(qe$variable), 1), head(levels(qe$variable), 2), levels(qe$variable))
+  for (v in vtests) {
+    expect_equal(droplevels(dplyr::filter(qe, variable %in% v)),
+                 read.snapshot(tmpfile, list(variable = v)))
+  }
+  ptests <- list(head(unique(qe$period), 1), head(unique(qe$period, 2)), unique(qe$period))  
+  for (p in ptests) {
+    expect_equal(droplevels(dplyr::filter(qe, period %in% p)),
+                 read.snapshot(tmpfile, list(period = p)))
+  }
+  # test all jointly with last setting
+  expect_equal(droplevels(dplyr::filter(qe, period %in% p, variable %in% v, region %in% r, scenario %in% s)),
+               read.snapshot(tmpfile, list(period = p, variable = v, region = r, scenario = s)))
+})