From e616d8e96b83ddd78812c5b97466bbdeb4e531ac Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Paul=20Li=C3=A9tar?= Date: Wed, 11 Sep 2024 16:18:29 +0100 Subject: [PATCH] Refactor directory expansion. There were three separate implementations of directory expansion spread across the codebase, each accepting slightly different inputs. The one used by `orderly_resource` operated over a single vector of paths. The one used by `orderly_shared_resource` operated over a dataframe of `there`/`here` columns. Finally the one used by `orderly_copy_files` used the packet metadata to enumerate files, instead of looking at the filesystem. The three implementations are replaced by just one, which uses callbacks to determine which files are directories and to enumerate their contents. This enables it to be used either with real files on disk or off of a packet's metadata. --- DESCRIPTION | 2 +- R/metadata.R | 26 ++------ R/outpack_helpers.R | 28 +++----- R/util.R | 57 ++++++++++++---- tests/testthat/test-outpack-packet.R | 8 +-- tests/testthat/test-run.R | 2 +- tests/testthat/test-util.R | 98 ++++++++++++++++++++++++++++ 7 files changed, 165 insertions(+), 56 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 935e8154..61f46f4d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: orderly2 Title: Orderly Next Generation -Version: 1.99.36 +Version: 1.99.37 Authors@R: c(person("Rich", "FitzJohn", role = c("aut", "cre"), email = "rich.fitzjohn@gmail.com"), person("Robert", "Ashton", role = "aut"), diff --git a/R/metadata.R b/R/metadata.R index 0843c16d..e6fb2d39 100644 --- a/R/metadata.R +++ b/R/metadata.R @@ -391,28 +391,14 @@ copy_shared_resource <- function(path_root, path_dest, config, files, call) { shared_dir)) } - here <- files$here - there <- files$there - - assert_file_exists_relative(there, workdir = shared_path, + assert_file_exists_relative(files$there, workdir = shared_path, name = "Shared resource file", call = call) - src <- file.path(shared_path, there) - dst <- file.path(path_dest, here) - - is_dir <- is_directory(file.path(shared_path, there)) - fs::dir_create(file.path(path_dest, dirname(here))) - if (any(is_dir)) { - fs::dir_copy(src[is_dir], dst[is_dir]) - ## Update the names that will be used in the metadata: - files <- lapply(src[is_dir], dir) - here <- replace_ragged(here, is_dir, Map(file.path, here[is_dir], files)) - there <- replace_ragged(there, is_dir, Map(file.path, there[is_dir], files)) - } - if (any(!is_dir)) { - copy_files(src[!is_dir], dst[!is_dir], overwrite = TRUE) - } - data_frame(here = here, there = there) + files_expanded <- expand_dirs(files, shared_path) + copy_files(fs::path(shared_path, files_expanded$there), + fs::path(path_dest, files_expanded$here), + overwrite = TRUE) + files_expanded } diff --git a/R/outpack_helpers.R b/R/outpack_helpers.R index d19354fd..167c8da9 100644 --- a/R/outpack_helpers.R +++ b/R/outpack_helpers.R @@ -115,7 +115,7 @@ orderly_copy_files <- function(..., files, dest, overwrite = TRUE, } } - plan <- plan_copy_files(root, id, files$there, files$here, environment()) + plan <- plan_copy_files(root, id, files, environment()) name <- outpack_metadata_core(id, root)$name tryCatch( @@ -149,25 +149,17 @@ orderly_copy_files <- function(..., files, dest, overwrite = TRUE, } -plan_copy_files <- function(root, id, there, here, call = NULL) { - assert_relative_path(there, name = "File", workdir = id, call = call) - validate_packet_has_file(root, id, there, call) - is_dir <- grepl("/$", there) - if (any(is_dir)) { - meta <- outpack_metadata_core(id, root) - files <- meta$files$path - expanded <- lapply(which(is_dir), function(i) { - p <- there[[i]] - j <- string_starts_with(p, files) - nms <- file.path(sub("/+$", "", here[[i]]), - string_drop_prefix(p, files[j])) - set_names(files[j], nms) - }) +plan_copy_files <- function(root, id, files, call = NULL) { + assert_relative_path(files$there, name = "File", workdir = id, call = call) + validate_packet_has_file(root, id, files$there, call) - there <- replace_ragged(there, is_dir, lapply(expanded, unname)) - here <- replace_ragged(here, is_dir, lapply(expanded, names)) + meta <- outpack_metadata_core(id, root) + is_dir <- function(p) grepl("/$", p) + list_files <- function(p) { + j <- string_starts_with(p, meta$files$path) + string_drop_prefix(p, meta$files$path[j]) } - data_frame(there, here) + expand_dirs_virtual(files, is_dir, list_files) } diff --git a/R/util.R b/R/util.R index d7c0a34f..636c25d1 100644 --- a/R/util.R +++ b/R/util.R @@ -134,20 +134,53 @@ vcapply <- function(X, FUN, ...) { # nolint } -## TODO: also replace copy_shared with this -expand_dirs <- function(paths, workdir) { - if (length(paths) == 0) { - return(character()) +#' Expand directories into their content lists. +#' +#' This function does not access the filesystem directly and instead calls the +#' given `is_dir` and `list_files` callback. This allows using the function with +#' files that do not exist on disk yet, such as those listed in a packet's +#' metadata. +#' +#' @param files either a character vector or a dataframe with columns `there` +#' and `here`. +#' @param is_dir a function from a character vector to a logical vector, +#' indicating whether each path is a directory needing expansion or not. +#' @param list_files a function from a character scalar to a character vector, +#' enumerating the contents of the directory. The return values must *not* +#' include the directory path as a prefix. +#' @return a modified version of `files`, where directories have been replaced +#' by their contents. If `files` was a data_frame, both the `there` and `here` +#' columns are modified. +#' @noRd +expand_dirs_virtual <- function(files, is_dir, list_files) { + if (is.character(files)) { + dirs <- is_dir(files) + expanded <- lapply(files[dirs], list_files) + replace_ragged(files, dirs, Map(fs::path, files[dirs], expanded)) + } else { + dirs <- is_dir(files$there) + expanded <- lapply(files$there[dirs], list_files) + + there <- replace_ragged(files$there, dirs, + Map(fs::path, files$there[dirs], expanded)) + here <- replace_ragged(files$here, dirs, + Map(fs::path, files$here[dirs], expanded)) + + data_frame(here, there) } - withr::local_dir(workdir) - i <- is_directory(paths) - if (any(i)) { - contents <- lapply(paths[i], function(p) { - as.character(fs::dir_ls(p, all = TRUE, type = "file", recurse = TRUE)) - }) - paths <- replace_ragged(paths, i, contents) +} + + +expand_dirs <- function(files, workdir) { + assert_scalar_character(workdir) + + is_dir <- function(p) is_directory(fs::path(workdir, p)) + list_files <- function(p) { + full_path <- fs::path(workdir, p) + files <- fs::dir_ls(full_path, all = TRUE, type = "file", recurse = TRUE) + fs::path_rel(files, full_path) } - paths + expand_dirs_virtual(files, is_dir, list_files) } diff --git a/tests/testthat/test-outpack-packet.R b/tests/testthat/test-outpack-packet.R index 02e35225..5933695a 100644 --- a/tests/testthat/test-outpack-packet.R +++ b/tests/testthat/test-outpack-packet.R @@ -583,7 +583,7 @@ test_that("can depend based on a simple query", { p$depends[[1]], list(packet = id$b[[3]], query = "latest()", - files = data.frame(there = "data.rds", here = "1.rds"))) + files = data.frame(here = "1.rds", there = "data.rds"))) query <- orderly_query("latest(parameter:i < 3)", name = "a") outpack_packet_use_dependency(p, query, c("2.rds" = "data.rds")) @@ -591,7 +591,7 @@ test_that("can depend based on a simple query", { p$depends[[2]], list(packet = id$a[[2]], query = 'latest(parameter:i < 3 && name == "a")', - files = data.frame(there = "data.rds", here = "2.rds"))) + files = data.frame(here = "2.rds", there = "data.rds"))) }) @@ -790,8 +790,8 @@ test_that("can pull in directories", { p2 <- outpack_packet_start_quietly(path_src2, "b", root = root) outpack_packet_use_dependency(p2, 'latest(name == "a")', c(d = "data/")) expect_equal(p2$depends[[1]]$files, - data_frame(there = file.path("data", letters[1:6]), - here = file.path("d", letters[1:6]))) + data_frame(here = file.path("d", letters[1:6]), + there = file.path("data", letters[1:6]))) }) diff --git a/tests/testthat/test-run.R b/tests/testthat/test-run.R index 05fd7050..122125f5 100644 --- a/tests/testthat/test-run.R +++ b/tests/testthat/test-run.R @@ -1347,7 +1347,7 @@ test_that("can read about dependencies", { r <- readRDS(file.path(path, "archive", "depends", id2, "depends.rds")) expect_equal(r$id, id1) expect_equal(r$name, "data") - expect_equal(r$files, data_frame(there = "data.rds", here = "input.rds")) + expect_equal(r$files, data_frame(here = "input.rds", there = "data.rds")) res <- withr::with_dir( path_src, diff --git a/tests/testthat/test-util.R b/tests/testthat/test-util.R index c86ce449..0408adca 100644 --- a/tests/testthat/test-util.R +++ b/tests/testthat/test-util.R @@ -321,6 +321,104 @@ test_that("read_string strips newlines", { expect_equal(result, "12345678") }) +describe("expand_dirs_virtual", { + files <- list( + "d1" = c("f2", "f3"), + "d2" = c("f4", "d3/f5"), + "d2/d3" = c("f5")) + is_dir <- function(p) p %in% c("d1", "d2", "d2/d3") + list_files <- function(p) files[[p]] + + check <- function(object, expected) { + expect_equal(expand_dirs_virtual(object, is_dir, list_files), expected) + } + + it("accepts a character vector", { + check("f1", "f1") + check("d1", c("d1/f2", "d1/f3")) + check(c("f1", "d1"), c("f1", "d1/f2", "d1/f3")) + check("d2", c("d2/f4", "d2/d3/f5")) + }) + + it("accepts a dataframe", { + check( + data_frame(here = "g1", there = "f1"), + data_frame(here = "g1", there = "f1")) + + check( + data_frame(here = "dest", there = "d1"), + data_frame(here = c("dest/f2", "dest/f3"), + there = c("d1/f2", "d1/f3"))) + + check( + data_frame(here = c("g1", "dest"), there = c("f1", "d1")), + data_frame(here = c("g1", "dest/f2", "dest/f3"), + there = c("f1", "d1/f2", "d1/f3"))) + + check( + data_frame(here = "dest", there = "d2"), + data_frame(here = c("dest/f4", "dest/d3/f5"), + there = c("d2/f4", "d2/d3/f5"))) + + check( + data_frame(here = c("foo", "bar"), there = c("d2", "d2")), + data_frame(here = c("foo/f4", "foo/d3/f5", "bar/f4", "bar/d3/f5"), + there = c("d2/f4", "d2/d3/f5", "d2/f4", "d2/d3/f5"))) + }) +}) + +describe("expand_dirs", { + p <- withr::local_tempdir() + files <- c("f1", "d1/f2", "d1/f3", "d2/f4", "d2/d3/f5") + fs::dir_create(fs::path(p, unique(dirname(files)))) + fs::file_create(fs::path(p, files)) + + it("accepts a character vector", { + check <- function(object, expected) { + expect_setequal(expand_dirs(object, p), expected) + } + + check("f1", "f1") + check("d1", c("d1/f2", "d1/f3")) + check(c("f1", "d1"), c("f1", "d1/f2", "d1/f3")) + check("d2", c("d2/f4", "d2/d3/f5")) + }) + + it("accepts a dataframe", { + check <- function(object, expected) { + result <- expand_dirs(object, p) + # This compares the dataframes ignoring the order, which is not + # deterministic + expect_setequal(unname(split(result, seq_len(nrow(result)))), + unname(split(expected, seq_len(nrow(expected))))) + } + + check( + data_frame(here = "g1", there = "f1"), + data_frame(here = "g1", there = "f1")) + + check( + data_frame(here = "dest", there = "d1"), + data_frame(here = c("dest/f2", "dest/f3"), there = c("d1/f2", "d1/f3"))) + + check( + data_frame(here = c("g1", "dest"), there = c("f1", "d1")), + data_frame(here = c("g1", "dest/f2", "dest/f3"), + there = c("f1", "d1/f2", "d1/f3"))) + + check( + data_frame(here = "dest", there = "d2"), + data_frame(here = c("dest/f4", "dest/d3/f5"), + there = c("d2/f4", "d2/d3/f5"))) + + check( + data_frame(here = c("foo", "bar"), there = c("d2", "d2")), + data_frame(here = c("foo/f4", "foo/d3/f5", "bar/f4", "bar/d3/f5"), + there = c("d2/f4", "d2/d3/f5", "d2/f4", "d2/d3/f5"))) + }) +}) + + test_that("fill_missing_names works", { expect_equal(fill_missing_names(NULL), NULL) expect_equal(fill_missing_names(c("a", "b")), c(a = "a", b = "b"))