From b8a0315a081cbf75fa086447f04a83ee6c6f12a4 Mon Sep 17 00:00:00 2001 From: David Blodgett Date: Sat, 27 Jul 2024 22:07:08 -0500 Subject: [PATCH 1/4] add archive support to get_nhdplushr fixes #404 --- R/A_nhdplusTools.R | 2 ++ R/downloading_tools.R | 15 +++++++++++++-- man/download_nhdplushr.Rd | 10 +++++++++- tests/testthat/test_get_nhdplushr.R | 6 ++++++ 4 files changed, 30 insertions(+), 3 deletions(-) diff --git a/R/A_nhdplusTools.R b/R/A_nhdplusTools.R index 5bc89a75..d1ad9cdb 100644 --- a/R/A_nhdplusTools.R +++ b/R/A_nhdplusTools.R @@ -259,10 +259,12 @@ assign("default_nhdplus_path", default_nhdplus_path, envir = nhdplusTools_env) nhd_bucket <- "https://prd-tnm.s3.amazonaws.com/" nhdhr_file_list <- "?prefix=StagedProducts/Hydrography/NHDPlusHR/VPU/Current/GDB/" +archive_nhdhr_file_list <- "?prefix=StagedProducts/Hydrography/NHDPlusHR/VPU/Archive/GDB/" nhd_file_list <- "?prefix=StagedProducts/Hydrography/NHD/HU4/GDB/" assign("nhd_bucket", nhd_bucket, envir = nhdplusTools_env) assign("nhdhr_file_list", nhdhr_file_list, envir = nhdplusTools_env) +assign("archive_nhdhr_file_list", archive_nhdhr_file_list, envir = nhdplusTools_env) assign("nldi_tier", "prod", envir = nhdplusTools_env) diff --git a/R/downloading_tools.R b/R/downloading_tools.R index 95e589ec..00e6eaef 100644 --- a/R/downloading_tools.R +++ b/R/downloading_tools.R @@ -5,6 +5,11 @@ #' and four digit codes. #' @param download_files boolean if FALSE, only URLs to files will be returned #' can be hu02s and/or hu04s +#' @param archive pull data from the "archive" folder rather than "current". +#' The archive contains the original releases of NHDPlusHR data that were updated +#' in subsequent processing. Not all subsets of NHDPlusHR were updated. See: +#' https://www.usgs.gov/national-hydrography/access-national-hydrography-products +#' for more details. #' #' @return character Paths to geodatabases created. #' @export @@ -16,11 +21,17 @@ #' (hu <- substr(hu$huc8, 1, 2)) #' #' download_nhdplushr(tempdir(), c(hu, "0203"), download_files = FALSE) +#' +#' download_nhdplushr(tempdir(), c(hu, "0203"), download_files = FALSE, archive = TRUE) #' } -download_nhdplushr <- function(nhd_dir, hu_list, download_files = TRUE) { +download_nhdplushr <- function(nhd_dir, hu_list, download_files = TRUE, archive = FALSE) { + + list_source <- get("nhdhr_file_list", envir = nhdplusTools_env) + + if(archive) list_source <- get("archive_nhdhr_file_list", envir = nhdplusTools_env) download_nhd_internal(get("nhd_bucket", envir = nhdplusTools_env), - get("nhdhr_file_list", envir = nhdplusTools_env), + list_source, "NHDPLUS_H_", nhd_dir, hu_list, download_files) } diff --git a/man/download_nhdplushr.Rd b/man/download_nhdplushr.Rd index 3ec9239c..e562262c 100644 --- a/man/download_nhdplushr.Rd +++ b/man/download_nhdplushr.Rd @@ -4,7 +4,7 @@ \alias{download_nhdplushr} \title{Download NHDPlus HiRes} \usage{ -download_nhdplushr(nhd_dir, hu_list, download_files = TRUE) +download_nhdplushr(nhd_dir, hu_list, download_files = TRUE, archive = FALSE) } \arguments{ \item{nhd_dir}{character directory to save output into} @@ -15,6 +15,12 @@ and four digit codes.} \item{download_files}{boolean if FALSE, only URLs to files will be returned can be hu02s and/or hu04s} + +\item{archive}{pull data from the "archive" folder rather than "current". +The archive contains the original releases of NHDPlusHR data that were updated +in subsequent processing. Not all subsets of NHDPlusHR were updated. See: +https://www.usgs.gov/national-hydrography/access-national-hydrography-products +for more details.} } \value{ character Paths to geodatabases created. @@ -30,5 +36,7 @@ hu <- get_huc(sf::st_sfc(sf::st_point(c(-73, 42)), crs = 4326), (hu <- substr(hu$huc8, 1, 2)) download_nhdplushr(tempdir(), c(hu, "0203"), download_files = FALSE) + +download_nhdplushr(tempdir(), c(hu, "0203"), download_files = FALSE, archive = TRUE) } } diff --git a/tests/testthat/test_get_nhdplushr.R b/tests/testthat/test_get_nhdplushr.R index 77c492db..3d8f1521 100644 --- a/tests/testthat/test_get_nhdplushr.R +++ b/tests/testthat/test_get_nhdplushr.R @@ -10,9 +10,15 @@ test_that("we get urls for nhdplushr and base", { expect_equal(length(urls), 11) + urls <- download_nhdplushr(work_dir, c("01", "0203"), download_files = FALSE, archive = TRUE) + + expect_true(all(grepl("Archive", urls))) + urls <- download_nhd(work_dir, c("01", "0203"), download_files = FALSE) expect_equal(length(urls), 11) + + }) test_that("get_nhdplushr layers and gpkg", { From 6a2ad2aa40f8219c53f16fc0ed4927bbbb905178 Mon Sep 17 00:00:00 2001 From: David Blodgett Date: Sat, 27 Jul 2024 22:44:55 -0500 Subject: [PATCH 2/4] remove duplicate hu4s from nhdplushr loads fixes #360 --- R/get_nhdplushr.R | 9 +++++++++ tests/testthat/test_get_nhdplushr.R | 24 +++++++++++++++++++++++- 2 files changed, 32 insertions(+), 1 deletion(-) diff --git a/R/get_nhdplushr.R b/R/get_nhdplushr.R index 56304d3e..dc4cda88 100644 --- a/R/get_nhdplushr.R +++ b/R/get_nhdplushr.R @@ -67,6 +67,15 @@ get_nhdplushr <- function(hr_dir, out_gpkg = NULL, gdbs <- list.files(hr_dir, pattern = "sub.gpkg", full.names = TRUE) } + dup_list_key <- regmatches(gdbs, regexpr("[0-9][0-9][0-9][0-9]", gdbs)) + + if(any(duplicated(dup_list_key))) { + remove <- gdbs[duplicated(dup_list_key)] + warning("Found duplicate HU04s in nhdplushr directory? Will not use: \n", + paste(remove, collapse = "\n")) + gdbs <- gdbs[!duplicated(dup_list_key)] + } + if(is.null(layers)) { layers <- st_layers(gdbs[1]) diff --git a/tests/testthat/test_get_nhdplushr.R b/tests/testthat/test_get_nhdplushr.R index 3d8f1521..7ca17bef 100644 --- a/tests/testthat/test_get_nhdplushr.R +++ b/tests/testthat/test_get_nhdplushr.R @@ -18,7 +18,6 @@ test_that("we get urls for nhdplushr and base", { expect_equal(length(urls), 11) - }) test_that("get_nhdplushr layers and gpkg", { @@ -47,6 +46,29 @@ test_that("get_nhdplushr layers and gpkg", { expect(length(names(out)), 7) }) +test_that("get_nhdplushr duplicate vpus", { + skip_on_cran() + + get_test_file(work_dir) + + f <- file.path(work_dir, "03_sub.gpkg") + ftemp <- file.path(work_dir, "03.gpkg") + f1 <- file.path(work_dir, "0303_sub.gpkg") + f2 <- file.path(work_dir, "0303_2sub.gpkg") + + file.copy(f, f1) + file.copy(f, f2) + + file.rename(f, ftemp) + + expect_warning(out <- get_nhdplushr(work_dir, out_gpkg = out_gpkg), + "Found duplicate HU04s") + + file.rename(ftemp, f) + unlink(f1) + unlink(f2) +}) + test_that("nhdplus hr waterbody", { skip_on_cran() From 917cb342e26bcebe012464b091569b603fa5f971 Mon Sep 17 00:00:00 2001 From: David Blodgett Date: Sun, 28 Jul 2024 20:10:35 -0500 Subject: [PATCH 3/4] test cleanup --- tests/testthat/helper.R | 11 +++ tests/testthat/test_get_nhdplushr.R | 130 ++++++++++++++++------------ 2 files changed, 88 insertions(+), 53 deletions(-) diff --git a/tests/testthat/helper.R b/tests/testthat/helper.R index 8434ee27..490118b1 100644 --- a/tests/testthat/helper.R +++ b/tests/testthat/helper.R @@ -39,3 +39,14 @@ check_layers <- function(out_file) { expect_true(sf::st_crs(sf::read_sf(out_file, "NHDFlowline_Network")) == sf::st_crs(4269)) } + +setup_workdir <- function() { + work_dir <- file.path(tempdir(), "test_hr") + dir.create(work_dir, recursive = TRUE, showWarnings = FALSE) + out_gpkg <- file.path(work_dir, "temp.gpkg") + list(wd = work_dir, og = out_gpkg) +} + +teardown_workdir <- function(work_dir) { + unlink(work_dir, recursive = TRUE, force = TRUE) +} diff --git a/tests/testthat/test_get_nhdplushr.R b/tests/testthat/test_get_nhdplushr.R index 7ca17bef..8ad4b893 100644 --- a/tests/testthat/test_get_nhdplushr.R +++ b/tests/testthat/test_get_nhdplushr.R @@ -1,141 +1,158 @@ - -work_dir <- file.path(tempdir(), "test_hr") -dir.create(work_dir, recursive = TRUE, showWarnings = FALSE) -out_gpkg <- file.path(work_dir, "temp.gpkg") - test_that("we get urls for nhdplushr and base", { skip_on_cran() - urls <- download_nhdplushr(work_dir, c("01", "0203"), download_files = FALSE) + + sw <- setup_workdir() + + urls <- download_nhdplushr(sw$wd, c("01", "0203"), download_files = FALSE) expect_equal(length(urls), 11) - urls <- download_nhdplushr(work_dir, c("01", "0203"), download_files = FALSE, archive = TRUE) + urls <- download_nhdplushr(sw$wd, c("01", "0203"), download_files = FALSE, archive = TRUE) expect_true(all(grepl("Archive", urls))) - urls <- download_nhd(work_dir, c("01", "0203"), download_files = FALSE) + urls <- download_nhd(sw$wd, c("01", "0203"), download_files = FALSE) expect_equal(length(urls), 11) + teardown_workdir(sw$wd) }) test_that("get_nhdplushr layers and gpkg", { skip_on_cran() - get_test_file(work_dir) + sw <- setup_workdir() + + get_test_file(sw$wd) - out <- get_nhdplushr(work_dir, out_gpkg = out_gpkg) + out <- get_nhdplushr(sw$wd, out_gpkg = sw$og) - layers <- sf::st_layers(out_gpkg) + layers <- sf::st_layers(sw$og) expect_equal(layers$name, c("NHDFlowline", "NHDPlusCatchment")) expect_equal(layers$features, c(2691, 2603)) expect_equal(layers$features, c(nrow(out[[1]]), nrow(out[[2]]))) expect_equal(layers$name, names(out)) - out <- get_nhdplushr(work_dir, out_gpkg = out_gpkg, + out <- get_nhdplushr(sw$wd, out_gpkg = sw$og, layers = NULL, overwrite = TRUE) - layers <- sf::st_layers(out_gpkg) + layers <- sf::st_layers(sw$og) expect_equal(length(layers$name), 7) expect_equal(layers$fields[which(layers$name == "NHDFlowline")], 57) - out <- get_nhdplushr(work_dir, layers = NULL) + out <- get_nhdplushr(sw$wd, layers = NULL) expect(length(names(out)), 7) + + teardown_workdir(sw$wd) }) test_that("get_nhdplushr duplicate vpus", { skip_on_cran() - get_test_file(work_dir) + sw <- setup_workdir() + + get_test_file(sw$wd) - f <- file.path(work_dir, "03_sub.gpkg") - ftemp <- file.path(work_dir, "03.gpkg") - f1 <- file.path(work_dir, "0303_sub.gpkg") - f2 <- file.path(work_dir, "0303_2sub.gpkg") + f <- file.path(sw$wd, "03_sub.gpkg") + ftemp <- file.path(sw$wd, "03.gpkg") + f1 <- file.path(sw$wd, "0303_sub.gpkg") + f2 <- file.path(sw$wd, "0303_2sub.gpkg") file.copy(f, f1) file.copy(f, f2) file.rename(f, ftemp) - expect_warning(out <- get_nhdplushr(work_dir, out_gpkg = out_gpkg), + expect_warning(out <- get_nhdplushr(sw$wd, out_gpkg = sw$og), "Found duplicate HU04s") file.rename(ftemp, f) unlink(f1) unlink(f2) + + teardown_workdir(sw$wd) }) test_that("nhdplus hr waterbody", { skip_on_cran() - get_test_file(work_dir) + sw <- setup_workdir() - out <- get_nhdplushr(work_dir, out_gpkg = out_gpkg) + get_test_file(sw$wd) - out <- get_nhdplushr(work_dir, layers = c("NHDFlowline", + out <- get_nhdplushr(sw$wd, out_gpkg = sw$og) + + out <- get_nhdplushr(sw$wd, layers = c("NHDFlowline", "NHDWaterbody"), - out_gpkg = out_gpkg) + out_gpkg = sw$og) wb <- out$NHDWaterbody[out$NHDWaterbody$Permanent_Identifier == 46376571,] expect_equal(get_wb_outlet(wb$Permanent_Identifier, out$NHDFlowline)$Permanent_Identifier, "46338320") + + teardown_workdir(sw$wd) }) test_that("get_nhdplushr overwrite gpkg and pattern", { skip_on_cran() - get_test_file(work_dir) + sw <- setup_workdir() - out <- get_nhdplushr(work_dir, out_gpkg = out_gpkg) + get_test_file(sw$wd) + + out <- get_nhdplushr(sw$wd, out_gpkg = sw$og) layer <- c("NHDFlowline") - out_sub <- get_nhdplushr(work_dir, out_gpkg = out_gpkg, + out_sub <- get_nhdplushr(sw$wd, out_gpkg = sw$og, layers = layer, overwrite = FALSE) expect_equal(names(out_sub), layer) - layers <- sf::st_layers(out_gpkg) - expect_equal(length(layers$name), 7) + layers <- sf::st_layers(sw$og) + expect_equal(length(layers$name), 2) - fl <- sf::read_sf(out_gpkg, layer) + fl <- sf::read_sf(sw$og, layer) - out_sub <- get_nhdplushr(work_dir, out_gpkg = out_gpkg, + out_sub <- get_nhdplushr(sw$wd, out_gpkg = sw$og, layers = layer, min_size_sqkm = 10, overwrite = TRUE) - layers <- sf::st_layers(out_gpkg) - expect_equal(length(layers$name), 7) + layers <- sf::st_layers(sw$og) + expect_equal(length(layers$name), 2) - fl2 <- sf::read_sf(out_gpkg, layer) + fl2 <- sf::read_sf(sw$og, layer) expect_true(nrow(fl2) < nrow(fl)) - devnull <- file.copy(file.path(work_dir, "03_sub.gpkg"), - file.path(work_dir, "04_sub.gpkg")) + devnull <- file.copy(file.path(sw$wd, "03_sub.gpkg"), + file.path(sw$wd, "04_sub.gpkg")) - fl <- read_sf(file.path(work_dir, "04_sub.gpkg"), "NHDFlowline") + fl <- read_sf(file.path(sw$wd, "04_sub.gpkg"), "NHDFlowline") fl$NHDPlusID <- fl$NHDPlusID + max(fl$NHDPlusID) - write_sf(fl, file.path(work_dir, "04_sub.gpkg"), "NHDFlowline") + write_sf(fl, file.path(sw$wd, "04_sub.gpkg"), "NHDFlowline") - out_sub <- get_nhdplushr(work_dir, pattern = ".*sub.gpkg$") + out_sub <- get_nhdplushr(sw$wd, pattern = ".*sub.gpkg$") expect_equal(nrow(out_sub$NHDFlowline), 2*nrow(fl)) - unlink(file.path(work_dir, "04_sub.gpkg")) + unlink(file.path(sw$wd, "04_sub.gpkg")) + + teardown_workdir(sw$wd) }) test_that("get_nhdplushr simp and proj", { skip_on_cran() - get_test_file(work_dir) + sw <- setup_workdir() + + get_test_file(sw$wd) - out <- get_nhdplushr(work_dir) + out <- get_nhdplushr(sw$wd) - out_sub <- get_nhdplushr(work_dir, proj = 5070) + out_sub <- get_nhdplushr(sw$wd, proj = 5070) expect_equal(st_crs(out_sub$NHDFlowline), st_crs(5070)) @@ -143,43 +160,50 @@ test_that("get_nhdplushr simp and proj", { expect_equal(st_crs(out_sub$NHDPlusCatchment), st_crs(5070)) - out_sub2 <- get_nhdplushr(work_dir, proj = 5070, simp = 20) + out_sub2 <- get_nhdplushr(sw$wd, proj = 5070, simp = 20) expect_true(length(st_geometry(out_sub$NHDFlowline)[[1]]) > length(st_geometry(out_sub2$NHDFlowline)[[1]])) expect_true(nrow(st_geometry(out_sub$NHDPlusCatchment)[[1]][[1]][[1]]) > nrow(st_geometry(out_sub2$NHDPlusCatchment)[[1]][[1]])) + teardown_workdir(sw$wd) }) test_that("get_nhdplushr rename and keep_cols", { skip_on_cran() - get_test_file(work_dir) + sw <- setup_workdir() - out <- get_nhdplushr(work_dir, out_gpkg = out_gpkg) + get_test_file(sw$wd) - out_sub <- get_nhdplushr(work_dir, + out <- get_nhdplushr(sw$wd, out_gpkg = sw$og) + + out_sub <- get_nhdplushr(sw$wd, keep_cols = c("COMID", "FEATUREID", "StreamOrde", "AreaSqKM"), check_terminals = FALSE) expect_equal(names(out_sub$NHDFlowline), c("COMID", "StreamOrde", "AreaSqKM", "geom")) expect_equal(names(out_sub$NHDPlusCatchment), c("FEATUREID", "AreaSqKM", "geom")) - out_sub <- get_nhdplushr(work_dir, rename = FALSE, check_terminals = FALSE) + out_sub <- get_nhdplushr(sw$wd, rename = FALSE, check_terminals = FALSE) expect_true("NHDPlusID" %in% names(out_sub$NHDFlowline)) + + teardown_workdir(sw$wd) }) test_that("make_standalone", { skip_on_cran() - get_test_file(work_dir) + sw <- setup_workdir() + + get_test_file(sw$wd) - fl <- get_nhdplushr(work_dir, check_terminals = FALSE)$NHDFlowline + fl <- get_nhdplushr(sw$wd, check_terminals = FALSE)$NHDFlowline sa <- make_standalone(fl) - sa_check <- get_nhdplushr(work_dir, check_terminals = TRUE)$NHDFlowline + sa_check <- get_nhdplushr(sw$wd, check_terminals = TRUE)$NHDFlowline expect_true(all(sa$LevelPathI == sa_check$LevelPathI)) expect_true(!all(fl$LevelPathI == sa_check$LevelPathI)) @@ -205,6 +229,6 @@ test_that("make_standalone", { sample_flines <- make_standalone(sample_flines) expect_true(0 %in% sample_flines$toCOMID) -}) -unlink(work_dir, recursive = TRUE) + teardown_workdir(sw$wd) +}) From a609fa781dc1d54035034b8efe236472ca34e978 Mon Sep 17 00:00:00 2001 From: David Blodgett Date: Sun, 28 Jul 2024 20:38:42 -0500 Subject: [PATCH 4/4] only check basename for dups --- R/get_nhdplushr.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/R/get_nhdplushr.R b/R/get_nhdplushr.R index dc4cda88..1b54e858 100644 --- a/R/get_nhdplushr.R +++ b/R/get_nhdplushr.R @@ -67,7 +67,7 @@ get_nhdplushr <- function(hr_dir, out_gpkg = NULL, gdbs <- list.files(hr_dir, pattern = "sub.gpkg", full.names = TRUE) } - dup_list_key <- regmatches(gdbs, regexpr("[0-9][0-9][0-9][0-9]", gdbs)) + dup_list_key <- regmatches(basename(gdbs), regexpr("[0-9][0-9][0-9][0-9]", basename(gdbs))) if(any(duplicated(dup_list_key))) { remove <- gdbs[duplicated(dup_list_key)]