diff --git a/README.md b/README.md
index 7614a11..7d30767 100644
--- a/README.md
+++ b/README.md
@@ -143,7 +143,7 @@ Attributes from non-standardized datasets may need to be acquired for RaFTS mode
 Run [`flow.install.proc.attr.hydfab.R`](https://github.com/NOAA-OWP/formulation-selector/blob/main/pkg/proc.attr.hydfab/flow/flow.install.proc.attr.hydfab.R) to install the package. Note that a user may need to modify the section that creates the `fs_dir` for their custom path to this repo's directory.

 ## Usage - `proc.attr.hydfab`
-The following is an example script that runs the attribute grabber: [`fs_attrs_grab`](https://github.com/NOAA-OWP/formulation-selector/blob/main/pkg/proc.attr.hydfab/flow/fsds_attrs_grab.R).
+The following is an example script that runs the attribute grabber: [`fs_attrs_grab`](https://github.com/NOAA-OWP/formulation-selector/blob/main/pkg/proc.attr.hydfab/flow/fs_attrs_grab.R).

 This script grabs attribute data corresponding to locations of interest, and saves those attribute data inside a directory as multiple parquet files. The `proc.attr.hydfab::retrieve_attr_exst()` function may then efficiently query and retrieve desired data by variable name and comid from those parquet files.
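The query pattern that paragraph describes looks roughly like the following sketch (the comids, variable names, and directory are illustrative values drawn from elsewhere in this changeset, not required inputs; the signature matches the hunk header below):

    library(proc.attr.hydfab)
    # Query previously-downloaded attributes for two comids from the local parquet store
    attrs <- retrieve_attr_exst(comids = c("6716129", "7388043"),
                                vars = c("TOT_TWI", "TOT_PET"),
                                dir_db_attrs = "~/noaa/regionalization/data/input/attributes")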
diff --git a/pkg/proc.attr.hydfab/R/proc_attr_grabber.R b/pkg/proc.attr.hydfab/R/proc_attr_grabber.R
index 46536b8..cd3c7b9 100644
--- a/pkg/proc.attr.hydfab/R/proc_attr_grabber.R
+++ b/pkg/proc.attr.hydfab/R/proc_attr_grabber.R
@@ -105,16 +105,16 @@ retrieve_attr_exst <- function(comids, vars, dir_db_attrs, bucket_conn=NA){
 }

-proc_attr_std_hfsub_name <- function(comid,custom_name='', ext='gpkg'){
+proc_attr_std_hfsub_name <- function(comid,custom_name='', fileext='gpkg'){
   #' @title Standardize hydrofabric subsetter's local filename
   #' @description Internal function that ensures consistent filename
   #' @param comid the USGS common identifier, generated by nhdplusTools
   #' @param custom_name Desired custom name following 'hydrofab_'
-  #' @param ext file extension of the hydrofrabric data. Default 'gpkg'
+  #' @param fileext file extension of the hydrofabric data. Default 'gpkg'
   hfsub_fn <- base::gsub(pattern = paste0(custom_name,"__"),
                          replacement = "_",
-                         base::paste0('hydrofab_',custom_name,'_',comid,'.',ext))
+                         base::paste0('hydrofab_',custom_name,'_',comid,'.',fileext))
   return(hfsub_fn)
 }
@@ -185,7 +185,7 @@ proc_attr_usgs_nhd <- function(comid,usgs_vars){
 }

-proc_attr_hf <- function(comid, dir_db_hydfab,custom_name="{lyrs}_",ext = 'gpkg',
+proc_attr_hf <- function(comid, dir_db_hydfab,custom_name="{lyrs}_",fileext = 'gpkg',
                          lyrs=c('divides','network')[2],
                          hf_cat_sel=TRUE, overwrite=FALSE){
@@ -195,7 +195,7 @@ proc_attr_hf <- function(comid, dir_db_hydfab,custom_name="{lyrs}_",ext = 'gpkg'
   #' @param comid character class. The common identifier USGS location code for a surface water feature.
   #' @param dir_db_hydfab character class. Local directory path for storing hydrofabric data
   #' @param custom_name character class. A custom name to insert into hydrofabric file. Default \code{glue("{lyrs}_")}
-  #' @param ext character class. file extension of hydrofabric file. Default 'gpkg'
+  #' @param fileext character class. file extension of hydrofabric file. Default 'gpkg'
   #' @param lyrs character class. The layer name(s) of interest from hydrofabric. Default 'network'.
   #' @param hf_cat_sel boolean. TRUE for a total catchment characterization specific to a single comid, FALSE (or anything else) for all subcatchments
   #' @param overwrite boolean. Overwrite local data when pulling from hydrofabric s3 bucket? Default FALSE.
@@ -204,7 +204,7 @@ proc_attr_hf <- function(comid, dir_db_hydfab,custom_name="{lyrs}_",ext = 'gpkg'
   # Build the hydfab filepath
   name_file <- proc.attr.hydfab:::proc_attr_std_hfsub_name(comid=comid,
                                                            custom_name=glue::glue('{lyrs}_'),
-                                                           ext=ext)
+                                                           fileext=fileext)
   fp_cat <- base::file.path(dir_db_hydfab, name_file)
   if(!base::dir.exists(dir_db_hydfab)){
@@ -225,7 +225,7 @@ proc_attr_hf <- function(comid, dir_db_hydfab,custom_name="{lyrs}_",ext = 'gpkg'
   # Read the hydrofabric file gpkg for each layer
   hfab_ls <- list()
-  if (ext == 'gpkg') {
+  if (fileext == 'gpkg') {
     # Define layers
     layers <- sf::st_layers(dsn = fp_cat)
     for (lyr in layers$name){
@@ -469,7 +469,7 @@ proc_attr_gageids <- function(gage_ids,featureSource,featureID,Retr_Params,
     # Retrieve the variables corresponding to datasets of interest & update database
     loc_attrs <- try(proc.attr.hydfab::proc_attr_wrap(comid=comid,
                                                       Retr_Params=Retr_Params,
-                                                      lyrs='network',overwrite=FALSE))
+                                                      lyrs=lyrs,overwrite=FALSE))
     if("try-error" %in% class(loc_attrs)){
       message(glue::glue("Skipping gage_id {gage_id} corresponding to comid {comid}"))
     }
diff --git a/pkg/proc.attr.hydfab/flow/fs_attrs_grab.R b/pkg/proc.attr.hydfab/flow/fs_attrs_grab.R
index bfa71ba..e0d0f7c 100644
--- a/pkg/proc.attr.hydfab/flow/fs_attrs_grab.R
+++ b/pkg/proc.attr.hydfab/flow/fs_attrs_grab.R
@@ -12,6 +12,7 @@
 #' term 'gage_id' is used as a variable in glue syntax to create featureID
 #' @seealso [fs_proc] A python package that processes input data for the
 #' formulation-selector
+#' @usage Rscript fs_attrs_grab.R "/path/to/attribute_config.yaml"

 # Changelog / Contributions
 #   2024-07-24 Originally created, GL
@@ -21,61 +22,81 @@
 library(yaml)
 library(ncdf4)
 library(proc.attr.hydfab)
 library(glue)
+
 # TODO is AWS_NO_SIGN_REQUEST necessary??
 # Sys.setenv(AWS_NO_SIGN_REQUEST="YES")

-# TODO create config yaml.
-
-# TODO read in config yaml, must populate NA for items that are empty.
-
-# Define input directory:
-# TODO change this to reading the standardized metadata, not the generated data
+# Define command line argument
+cmd_args <- commandArgs(trailingOnly = TRUE)

-# raw_config <- yaml::read_yaml("/Users/guylitt/git/formulation-selector/scripts/eval_ingest/xssa/xssa_attr_config.yaml")
-raw_config <- yaml::read_yaml("/Users/guylitt/git/formulation-selector/scripts/eval_ingest/SI/SI_attr_config.yaml")
+if(base::length(cmd_args)!=1){
+  warning("Expected exactly one argument: Rscript fs_attrs_grab.R /path/to/attribute_config.yaml")
+}
+# Read in config file, e.g. "~/git/formulation-selector/scripts/eval_ingest/SI/SI_attr_config.yaml"
+path_attr_config <- cmd_args[1] # "~/git/formulation-selector/scripts/eval_ingest/xssa/xssa_attr_config.yaml"
+raw_config <- yaml::read_yaml(path_attr_config)
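# For orientation, the keys read below imply an attribute config shaped roughly
# like this (an illustrative sketch only; consult the example configs under
# scripts/eval_ingest/ for the authoritative schema):
#
#   formulation_metadata:
#     - datasets: ['juliemai-xSSA']
#   file_io:
#     - dir_base: '{home_dir}/noaa/regionalization/data'
#     - dir_std_base: '{dir_base}/input/user_data_std'
#     - dir_db_hydfab: '{dir_base}/input/hydrofabric'
#     - dir_db_attrs: '{dir_base}/input/attributes'
#   hydfab_config:
#     - s3_base: 's3://lynker-spatial/tabular-resources'
#     - s3_bucket: 'lynker-spatial'
#     - hf_cat_sel: 'total'
#     - ext: 'gpkg'
#   attr_select:
#     - s3_path_hydatl: '{s3_base}/hydroATLAS/hydroatlas_vars.parquet'
#     - usgs_vars: ['TOT_TWI', 'TOT_PET']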

-datasets <- ds <- raw_config$formulation_metadata[[grep("datasets",raw_config$formulation_metadata)]]$datasets#c("juliemai-xSSA",'all')[1] # A listing of datasets to grab attributes. Dataset names match what is inside dir_std_base. 'all' processes all datasets inside dir_std_base.
-#ds_nc_filenames <- c('juliemai-xSSA_Raven_blended.nc','*.nc')[1]
-
+# A listing of datasets to grab attributes. Dataset names match what is inside
+# dir_std_base. 'all' processes all datasets inside dir_std_base.
+datasets <- raw_config$formulation_metadata[[grep("datasets",
+                        raw_config$formulation_metadata)]]$datasets #c("juliemai-xSSA",'all')[1]

+# Define directory paths from the config file
 home_dir <- Sys.getenv("HOME")
-dir_base <- file.path(home_dir,'noaa','regionalization','data')
-
-dir_std_base <- file.path(dir_base,"input","user_data_std") # The location of standardized data generated by fs_proc python package
-dir_db_hydfab <- file.path(dir_base,'input','hydrofabric') # The local dir where hydrofabric data are stored to limit s3 connections
-dir_db_attrs <- file.path(dir_base,'input','attributes') # The parent dir where each comid's attribute parquet file is stored in the subdirectory 'comid/', and each dataset's aggregated parquet attributes are stored in the subdirectory '{dataset_name}/'
-
-s3_base <- "s3://lynker-spatial/tabular-resources" # s3 path containing hydrofabric-formatted attribute datasets
-s3_bucket <- 'lynker-spatial' # s3 bucket containing hydrofabric data
-
-s3_path_hydatl <- glue::glue('{s3_base}/hydroATLAS/hydroatlas_vars.parquet') # path to hydroatlas data formatted for hydrofabric
+dir_base <- glue::glue(base::unlist(raw_config$file_io)[['dir_base']]) # file.path(home_dir,'noaa','regionalization','data')
+dir_std_base <- glue::glue(base::unlist(raw_config$file_io)[['dir_std_base']]) # file.path(dir_base,"input","user_data_std") # The location of standardized data generated by the fs_proc python package
+dir_db_hydfab <- glue::glue(base::unlist(raw_config$file_io)[['dir_db_hydfab']]) # file.path(dir_base,'input','hydrofabric') # The local dir where hydrofabric data are stored to limit s3 connections
+dir_db_attrs <- glue::glue(base::unlist(raw_config$file_io)[['dir_db_attrs']]) # file.path(dir_base,'input','attributes') # The parent dir where each comid's attribute parquet file is stored in the subdirectory 'comid/', and each dataset's aggregated parquet attributes are stored in the subdirectory '{dataset_name}/'
+
+# Read s3 connection details
+s3_base <- base::unlist(raw_config$hydfab_config)[['s3_base']] # "s3://lynker-spatial/tabular-resources" # s3 path containing hydrofabric-formatted attribute datasets
+s3_bucket <- base::unlist(raw_config$hydfab_config)[['s3_bucket']] # 'lynker-spatial' # s3 bucket containing hydrofabric data
+
+# s3 path to hydroatlas data formatted for hydrofabric
+if ("s3_path_hydatl" %in% names(base::unlist(raw_config$attr_select))){
+  s3_path_hydatl <- glue::glue(base::unlist(raw_config$attr_select)[['s3_path_hydatl']]) # glue::glue('{s3_base}/hydroATLAS/hydroatlas_vars.parquet')
+} else {
+  s3_path_hydatl <- NULL
+}

 # Additional config options
-hf_cat_sel <- c("total","all")[1] # total: interested in the single location's aggregated catchment data; all: all subcatchments of interest
-ext <- 'gpkg'
-attr_sources <- c("hydroatlas","usgs") # "streamcat",
-# TODO communicate to user that these are standardized variable names
-ha_vars <- c('pet_mm_s01', 'cly_pc_sav', 'cly_pc_uav','cly_pc_sav','ari_ix_sav') # hydroatlas variables
-sc_vars <- c() # TODO look up variables. May need to select datasets first
-usgs_vars <- c('TOT_TWI','TOT_PRSNOW','TOT_POPDENS90','TOT_EWT','TOT_RECHG','TOT_PPT7100_ANN','TOT_AET','TOT_PET','TOT_SILTAVE','TOT_BASIN_AREA','TOT_BASIN_SLOPE','TOT_ELEV_MEAN','TOT_ELEV_MAX','TOT_Intensity','TOT_Wet','TOT_Dry') # list of variables retrievable using nhdplusTools::get_characteristics_metadata()
+hf_cat_sel <- base::unlist(raw_config$hydfab_config)[['hf_cat_sel']] # c("total","all")[1] # total: interested in the single location's aggregated catchment data; all: all subcatchments of interest
+ext <- base::unlist(raw_config$hydfab_config)[['ext']] # 'gpkg'
+
+#-----------------------------------------------------
+# Variable listings:
+names_attr_sel <- base::unlist(base::lapply(raw_config$attr_select,
+                                            function(x) base::names(x)))
+
+# Transform into single named list of lists rather than nested sublists
+idxs_vars <- base::grep("_vars", names_attr_sel)
+var_names <- names_attr_sel[idxs_vars]
+sub_attr_sel <- base::lapply(idxs_vars, function(i)
+  raw_config$attr_select[[i]][[1]])
+base::names(sub_attr_sel) <- var_names
+
+# Subset to only those non-null variables:
+sub_attr_sel <- sub_attr_sel[base::unlist(base::lapply(sub_attr_sel,
+                               function(x) base::any(!base::is.null(unlist(x)))))]
+var_names_sub <- names(sub_attr_sel)
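# A hypothetical worked example of the reshaping above (not executed by this
# script): with a parsed config where
#   raw_config$attr_select = list(list(s3_path_hydatl = "..."),
#                                 list(usgs_vars = c("TOT_TWI","TOT_PET")),
#                                 list(sc_vars = NULL))
# names_attr_sel is c("s3_path_hydatl","usgs_vars","sc_vars"); the "_vars" grep
# keeps the last two, and the non-null filter then drops the empty sc_vars,
# leaving sub_attr_sel = list(usgs_vars = c("TOT_TWI","TOT_PET")).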

 #-----------------------------------------------------
+message(glue::glue("Attribute dataset sources include the following:\n
+                   {paste0(var_names_sub,collapse='\n')}"))

-# TODO generate this listing structure based on what is provided in yaml config
-# & accounting for empty entries
+message(glue::glue("Attribute variables to be acquired include:\n
+                   {paste0(sub_attr_sel,collapse='\n')}"))

-Retr_Params <- list(paths = list(# Note that if a path is provided, ensure the
+Retr_Params <- base::list(paths = base::list(
+                     # Note that if a path is provided, ensure the
                      # name includes 'path'. Same for directory having variable name with 'dir'
                      dir_db_hydfab=dir_db_hydfab,
                      dir_db_attrs=dir_db_attrs,
                      s3_path_hydatl = s3_path_hydatl,
                      dir_std_base = dir_std_base),
-                    vars = list(usgs_vars = usgs_vars,
-                                ha_vars = ha_vars,
-                                sc_vars = sc_vars),
+                    vars = sub_attr_sel,
                     datasets = datasets
                     )

+# PROCESS ATTRIBUTES
 ls_comids <- proc.attr.hydfab:::grab_attrs_datasets_fs_wrap(Retr_Params,overwrite = TRUE)
diff --git a/pkg/proc.attr.hydfab/man/proc_attr_hf.Rd b/pkg/proc.attr.hydfab/man/proc_attr_hf.Rd
index f1f2e92..6264aff 100644
--- a/pkg/proc.attr.hydfab/man/proc_attr_hf.Rd
+++ b/pkg/proc.attr.hydfab/man/proc_attr_hf.Rd
@@ -8,7 +8,7 @@ proc_attr_hf(
   comid,
   dir_db_hydfab,
   custom_name = "{lyrs}_",
-  ext = "gpkg",
+  fileext = "gpkg",
   lyrs = c("divides", "network")[2],
   hf_cat_sel = TRUE,
   overwrite = FALSE
@@ -21,7 +21,7 @@ proc_attr_hf(
 \item{custom_name}{character class. A custom name to insert into hydrofabric file. Default \code{glue("{lyrs}_")}}

-\item{ext}{character class. file extension of hydrofabric file. Default 'gpkg'}
+\item{fileext}{character class. file extension of hydrofabric file. Default 'gpkg'}

 \item{lyrs}{character class. The layer name(s) of interest from hydrofabric. Default 'network'.}
diff --git a/pkg/proc.attr.hydfab/man/proc_attr_std_hfsub_name.Rd b/pkg/proc.attr.hydfab/man/proc_attr_std_hfsub_name.Rd
index 232d21e..ba64afa 100644
--- a/pkg/proc.attr.hydfab/man/proc_attr_std_hfsub_name.Rd
+++ b/pkg/proc.attr.hydfab/man/proc_attr_std_hfsub_name.Rd
@@ -4,14 +4,14 @@
 \alias{proc_attr_std_hfsub_name}
 \title{Standardize hydrofabric subsetter's local filename}
 \usage{
-proc_attr_std_hfsub_name(comid, custom_name = "", ext = "gpkg")
+proc_attr_std_hfsub_name(comid, custom_name = "", fileext = "gpkg")
 }
 \arguments{
 \item{comid}{the USGS common identifier, generated by nhdplusTools}

 \item{custom_name}{Desired custom name following 'hydrofab_'}

-\item{ext}{file extension of the hydrofrabric data. Default 'gpkg'}
+\item{fileext}{file extension of the hydrofabric data. Default 'gpkg'}
 }
 \description{
 Internal function that ensures consistent filename
diff --git a/pkg/proc.attr.hydfab/tests/testthat/test_proc_attr_grabber.R b/pkg/proc.attr.hydfab/tests/testthat/test_proc_attr_grabber.R
index 6618113..fe66c99 100644
--- a/pkg/proc.attr.hydfab/tests/testthat/test_proc_attr_grabber.R
+++ b/pkg/proc.attr.hydfab/tests/testthat/test_proc_attr_grabber.R
@@ -292,7 +292,7 @@ testthat::test_that("proc_attr_usgs_nhd", {
 testthat::test_that("proc_attr_hf not a comid",{
   testthat::expect_error(proc.attr.hydfab::proc_attr_hf(comid="13Notacomid14",
                                                         dir_db_hydfab,
-                                                        custom_name="{lyrs}_",ext = 'gpkg',
+                                                        custom_name="{lyrs}_",fileext = 'gpkg',
                                                         lyrs=c('divides','network')[2],
                                                         hf_cat_sel=TRUE, overwrite=FALSE))
 })
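# For contrast with the expected error tested above, a well-formed call now
# supplies the renamed `fileext` argument (a sketch only; the comid and
# directory are illustrative values reused from elsewhere in this changeset):
# proc.attr.hydfab::proc_attr_hf(comid = "6716129",
#                                dir_db_hydfab = "~/noaa/regionalization/data/input/hydrofabric",
#                                custom_name = "{lyrs}_", fileext = "gpkg",
#                                lyrs = "network", hf_cat_sel = TRUE, overwrite = FALSE)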
diff --git a/scripts/config/hfsubsetRv2.2_test.R b/scripts/config/hfsubsetRv2.2_test.R
new file mode 100644
index 0000000..58d09f0
--- /dev/null
+++ b/scripts/config/hfsubsetRv2.2_test.R
@@ -0,0 +1,322 @@
+library(hfsubsetR)
+library(dplyr)
+library(glue)
+library(DBI)
+library(tidyselect)
+
+#comid <- c("6716129","7388043")
+source <- "~/noaa/hydrofabric/v2.2/ls_conus.gpkg"
+hf_ver <- '2.2'
+#outfile <- glue::glue('~/noaa/hydrofabric/test/{comid}.gpkg')
+dir_save <- '~/noaa/hydrofabric/test/'
+hfab_vars <- "all"
+
+weight_attrs_by_area <- function(df, hfab_vars, area_col = 'areasqkm'){
+  #' @title Summarize a multi-divide attribute dataset into area-weighted average attribute values
+  #' @param df data.frame class. Attribute dataframe
+  #' @param hfab_vars character class. The column names of attributes inside \code{df}
+  #' @param area_col character class. The area column name inside \code{df}
+  #' @return single-row data.frame of area-averaged attribute values from \code{df}
+  #' @export
+  tot_area <- base::sum(df[[area_col]])
+  if(any(is.na(df))){ # Determine area-weighted mean, ignoring NA
+    ls_wt_area <- list()
+    for(hfvar in hfab_vars){
+      prod_var_area <- df[[hfvar]]*df[[area_col]]
+      if(any(is.na(prod_var_area))){
+        idxs_na <- which(is.na(prod_var_area))
+        sub_tot_area <- sum(df[[area_col]][-idxs_na]) # remove NA areas from consideration
+        ls_wt_area[[hfvar]] <- sum(prod_var_area,na.rm = TRUE)/sub_tot_area
+      } else {
+        ls_wt_area[[hfvar]] <- sum(prod_var_area)/tot_area
+      }
+    }
+  } else {
+    ls_wt_area <- base::lapply(hfab_vars, function(a) try(base::sum(df[[a]]*df[[area_col]])/tot_area))
+  }
+
+  names(ls_wt_area) <- hfab_vars
+  return(as.data.frame(ls_wt_area))
+}
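# A quick toy check of weight_attrs_by_area() (hypothetical numbers, not
# hydrofabric output): two divides of 1 and 3 km^2 with attribute values 2 and
# 6 give (2*1 + 6*3)/(1+3) = 5; if the 6 were NA instead, that divide's area is
# dropped and the weighted mean becomes 2*1/1 = 2.
# df_toy <- data.frame(areasqkm = c(1, 3), x = c(2, 6))
# weight_attrs_by_area(df_toy, hfab_vars = "x")  # returns data.frame(x = 5)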
+
+proc_attr_hfab <- function(comid, dir_save="~/", hfab_vars="all",
+                           attr_name = c("model-attributes","divide-attributes")[2],
+                           div_name = "divides",
+                           source="s3://lynker-spatial/hydrofabric", hf_ver = '2.2',
+                           id_col = "divide_id", type = 'nextgen',
+                           domain='conus'){
+  #' @title Process hydrofabric attributes
+  #' @description Connects to the hydrofabric via s3 or local connection and
+  #' acquires attributes of interest
+  #' @details If the retrieved attribute data are empty, generates a warning.
+  #' @param comid atomic
+  #' @param dir_save The temporary directory location for saving the subset hydrofabric.
+  #' @param hfab_vars character vector. The attributes of interest to acquire
+  #' from hydrofabric. Default "all".
+  #' Refer to the formulation-selector github wiki for options.
+  #' @param attr_name The list name for attribute data as retrieved from
+  #' \code{hfsubsetR::get_subset()}. Default 'divide-attributes' corresponds to
+  #' hydrofabric v2.2. Previous versions used 'model-attributes'
+  #' @param div_name The list name for the divides data as retrieved from
+  #' \code{hfsubsetR::get_subset()}. Default 'divides'.
+  #' @param source A local hydrofabric geopackage path, or the s3 hydrofabric location
+  #' @param hf_ver Hydrofabric version. Default "2.2"
+  #' @param id_col The identifier column for each divide inside the divides and
+  #' divide-attributes layers. Default 'divide_id'
+  #' @param type hydrofabric type, as used in \code{hfsubsetR::get_subset()}, default "nextgen"
+  #' @param domain hydrofabric domain, as used in \code{hfsubsetR::get_subset()}, default "conus"
+  #' @export
+
+  lyrs <- c(attr_name, div_name)
+  # TODO add local source check. Should this happen here or elsewhere?
+
+  if (base::length(comid) > 1){
+    stop("hfsubsetR only processes one comid at a time as of v0.0.9.
+         If multi-comid processing becomes available, the
+         weighted-area average calc needs to account for multiple
+         comids")
+  }
+
+  if ((base::as.numeric(hf_ver)<2.2) && !('model-attributes' %in% lyrs)){
+    warning("The geopackage layer 'model-attributes' was used prior to
+            hydrofabric version 2.2 and may need to be defined in the `lyrs`
+            argument of proc_attr_hfab() for use in hfsubsetR::get_subset().")
+  }
+  # ----------- READ HYDROFABRIC ATTRIBUTE DATA ------------ #
+  # Read in the divide and divide-attributes layers
+  if (tools::file_ext(source) == 'gpkg'){ # In case a local version of hydrofab exists
+    outfile <- file.path(dir_save,glue::glue("hfab_attr_temp.gpkg"))
+    rslt <- hfsubsetR::get_subset(
+      gpkg = source,
+      lyrs = lyrs,
+      comid = comid[1],
+      outfile = outfile, overwrite=TRUE)
+
+    # Create a list of layers from the temporary location
+    attr_div <- base::lapply(lyrs, function(l) sf::st_read(outfile, layer = l, quiet = TRUE))
+    names(attr_div) <- lyrs
+
+  } else { # Otherwise try the AWS connection
+    attr_div <- hfsubsetR::get_subset(comid = comid,
+                                      lyrs = lyrs,
+                                      source = source,
+                                      hf_version = hf_ver,
+                                      type = type,
+                                      domain = domain)
+  }
+
+  # ----------- SUBSET TO VARIABLES OF INTEREST ------------ #
+  if (base::identical(hfab_vars, 'all')){ # identical() avoids a length>1 comparison in the condition
+    hfab_vars <- names(attr_div[[attr_name]])
+    hfab_vars <- hfab_vars[-grep("vpuid", hfab_vars)]
+    hfab_vars <- hfab_vars[-grep("divide_id", hfab_vars)]
+  }
+  attr_df <- attr_div[[attr_name]] %>%
+    dplyr::select(dplyr::all_of(c("divide_id",hfab_vars))) %>%
+    dplyr::select(-tidyselect::matches("vpuid"))
+
+  if(nrow(attr_df) == 0){
+    warning(glue::glue("COMID {comid} has no data in the hydrofabric. Skipping."))
+  } else {
+    # Combine model attributes with divides
+    attrs_area <- dplyr::left_join(x=attr_df, attr_div[[div_name]], by="divide_id")
+
+    if('dist_4.twi' %in% names(attrs_area)){
+      # Expand dist_4.twi into multiple cols, by converting json format to df
+      ls_dist_4_twi <- list()
+      for(i in 1:nrow(attrs_area)){
+        df_twi <- jsonlite::fromJSON(attrs_area[["dist_4.twi"]][[i]])
+        ls_dist_4_twi[[i]] <- data.table::as.data.table(base::t(base::unlist(df_twi)))
+      }
+      df_twi_all <- data.table::rbindlist(ls_dist_4_twi,fill = TRUE) %>% base::as.data.frame()
+      df_twi_sub <- df_twi_all[,c("v1","v2","v3","v4")]
+      new_twi_vars <- c("twi_25pctl", "twi_50pctl","twi_75pctl","twi_100pctl")
+      names(df_twi_sub) <- new_twi_vars
+
+      attrs_area <- base::cbind(attrs_area,df_twi_sub) %>%
+        dplyr::select(-dplyr::all_of("dist_4.twi"))
+
+      hfab_vars <- hfab_vars[-base::grep("dist_4.twi", hfab_vars)]
+      hfab_vars <- c(hfab_vars,new_twi_vars)
+    }
+
+    if (base::length(base::unique(attr_div$divides$divide_id)) !=
+        base::length(base::unique(attr_div[[attr_name]]$divide_id))){
+
+      tot_area <- base::sum(attr_div[[div_name]]$areasqkm)
+      attr_area <- base::sum(attrs_area$areasqkm)
+
+      warning(glue::glue("COMID {comid} basin area totals {round(tot_area,2)}km^2,
+                         but {attr_name} only covers {round(attr_area,2)}km^2"))
+    }
+    # ----------- AREAL WEIGHTING OF ATTRIBUTES ------------ #
+    df_wt <- weight_attrs_by_area(df=attrs_area,hfab_vars=hfab_vars)
+    df_wt$comid <- comid
+
+    df_wt <- df_wt %>% dplyr::relocate(comid)
+    return(df_wt)
+  }
+}
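# Illustration of the dist_4.twi expansion inside proc_attr_hfab() above. The
# exact JSON schema comes from the hydrofabric divide-attributes layer; the
# shape sketched here (a 4-element array with a "v" field) is an assumption for
# demonstration only:
# json_twi <- '[{"v": 4.2, "frequency": 0.25}, {"v": 6.8, "frequency": 0.25},
#               {"v": 9.1, "frequency": 0.25}, {"v": 12.3, "frequency": 0.25}]'
# df_twi <- jsonlite::fromJSON(json_twi)  # 4-row data.frame with columns v, frequency
# base::unlist(df_twi)                    # named vector: v1..v4, frequency1..frequency4
# data.table::as.data.table(base::t(base::unlist(df_twi)))  # one row; v1..v4 become the twi_*pctl columns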
{comid}")) + rslt <- try(proc_attr_hfab(comid,dir_save=dir_save, hfab_vars="all", + attr_name ="divide-attributes", + div_name = "divides", + source=source, hf_ver = '2.2', + id_col = "divide_id", type = 'nextgen', + domain='conus')) + + if("try-error" %in% rslt){ # sometimes trying again seems to fix things + Sys.sleep(5) + rslt <- try(proc_attr_hfab(comid,dir_save=dir_save, hfab_vars="all", + attr_name ="divide-attributes", + div_name = "divides", + source=source, hf_ver = '2.2', + id_col = "divide_id", type = 'nextgen', + domain='conus')) + + } + + if(!"try-error" %in% rslt){ + attr_dat[[comid]] <- rslt + } +} + + + +# Review all attribute data +check_empty_func <- function(x) { + if (length(x) > 1) { + return(x) + } +} +sub_ls_attrs <- lapply(attr_dat, check_empty_func) + +dt_all_attrs <- data.table::rbindlist(sub_ls_attrs) + +summary(dt_all_attrs) + +# TODO review all areas: +areas_all_ls <- list() +for(comid in comids){ + subset_dat <- hfsubsetR::get_subset(comid = comid, + lyrs = 'divides', + gpkg =source, + hf_version = hf_ver, + type = type, + domain =domain) + areas_all_ls[[comid]] <- sum(subset_dat$divides$areasqkm) + +} +library(ggplot2) +area_data <- unlist(unname(areas_all_ls)) %>% as.data.frame() +ggplot2::ggplot(area_data) + + ggplot2::geom_histogram() + +###### LOOKUP COMID LOCATION +library(nhdplusTools) + +rslt <- nhdplusTools::get_nhdplus(comid="23864616",realization = 'outlet') +rslt$geometry +# Paste the coordinates into here: https://apps.nationalmap.gov/viewer/ + # TODO add divide id and attr areal coverages to final attribute dataset + + +# +# +# +# # Note that there are additional divide ids inside the divides layer, meaning the model-attributes layer is missing some divide ids +# print(glue::glue("Total divide ids inside divides layer: {length(unique(attr_div$divides$divide_id))}")) +# print(glue::glue("Total divide ids inside model-attributes layer: {length(unique(attr_div$`model-attributes`$divide_id))}")) +# +# # Total area accounted for in each layer: +# print(glue::glue("Total catchment area inside the divides layer: {sum(attr_div$divides$areasqkm)} km^2")) +# +# print(glue::glue("Total catchment area inside the model-attributes layer: {sum(attrs_area$areasqkm)} km^2")) +# +# ###### +# +# # https://noaa-owp.github.io/hydrofabric/articles/05-subsetting.html +# library(dplyr) +# library(arrow) +# library(hydrofabric) +# proc_attr_hfab <- function(comid,usgs_vars){ +# +# +# } +# (g = open_dataset('s3://lynker-spatial/hydrofabric/v2.2/conus_hl') |> +# select("vpuid", 'hl_reference', "hl_link") %>% +# filter(hl_link == '06752260') %>% +# collect()) +# +# dat <- arrow::read_parquet("/Users/guylitt/Downloads/nextgen_04.parquet") +# +# dim(dat) +# +# (hydLoc <- hfsubsetR::findOrigin(network_path, comid=comid)) +# +# attr_data <- arrow::open_dataset("s3://lynker-spatial/hydrofabric/v2.1.1/nextgen/conus_model-attributes") |> +# filter(vpuid == hydLoc$vpuid) |> +# # filter(divide_id==hydLoc$id) |> +# collect() +# +# +# s3 <- "s3://lynker-spatial/hydrofabric" +# version <- 'v2.1.1' +# type <- "nextgen" +# domain <- "conus" +# network_path <- glue("{s3}/{version}/{type}/conus_network") +# net <- open_dataset(network_path) +# +# glimpse(filter(net,hl_uri == "HUC12-010100100101")) +# +# subset_enhanced = get_subset(comid = comid, +# lyrs = c("model-attributes","divides") +# #lyrs = c("forcing-weights", "model-attributes","divides","nexus"), +# source =s3, +# hf_version = '2.1.1', +# type = "nextgen", +# domain = "conus") +# 
+
+# # Note that there are additional divide ids inside the divides layer, meaning the model-attributes layer is missing some divide ids
+# print(glue::glue("Total divide ids inside divides layer: {length(unique(attr_div$divides$divide_id))}"))
+# print(glue::glue("Total divide ids inside model-attributes layer: {length(unique(attr_div$`model-attributes`$divide_id))}"))
+#
+# # Total area accounted for in each layer:
+# print(glue::glue("Total catchment area inside the divides layer: {sum(attr_div$divides$areasqkm)} km^2"))
+# print(glue::glue("Total catchment area inside the model-attributes layer: {sum(attrs_area$areasqkm)} km^2"))
+#
+# ######
+#
+# # https://noaa-owp.github.io/hydrofabric/articles/05-subsetting.html
+# library(dplyr)
+# library(arrow)
+# library(hydrofabric)
+# proc_attr_hfab <- function(comid,usgs_vars){
+#
+# }
+# (g = open_dataset('s3://lynker-spatial/hydrofabric/v2.2/conus_hl') |>
+#    select("vpuid", 'hl_reference', "hl_link") %>%
+#    filter(hl_link == '06752260') %>%
+#    collect())
+#
+# dat <- arrow::read_parquet("/Users/guylitt/Downloads/nextgen_04.parquet")
+#
+# dim(dat)
+#
+# (hydLoc <- hfsubsetR::findOrigin(network_path, comid=comid))
+#
+# attr_data <- arrow::open_dataset("s3://lynker-spatial/hydrofabric/v2.1.1/nextgen/conus_model-attributes") |>
+#   filter(vpuid == hydLoc$vpuid) |>
+#   # filter(divide_id==hydLoc$id) |>
+#   collect()
+#
+# s3 <- "s3://lynker-spatial/hydrofabric"
+# version <- 'v2.1.1'
+# type <- "nextgen"
+# domain <- "conus"
+# network_path <- glue("{s3}/{version}/{type}/conus_network")
+# net <- open_dataset(network_path)
+#
+# glimpse(filter(net,hl_uri == "HUC12-010100100101"))
+#
+# subset_enhanced = get_subset(comid = comid,
+#                              lyrs = c("model-attributes","divides"),
+#                              #lyrs = c("forcing-weights", "model-attributes","divides","nexus"),
+#                              source = s3,
+#                              hf_version = '2.1.1',
+#                              type = "nextgen",
+#                              domain = "conus")
+# length(unique(subset_enhanced$`model-attributes`$divide_id))
+# length(unique(subset_enhanced$divides$divide_id))
+#
+# intersect(subset_enhanced$`model-attributes`$divide_id,subset_enhanced$divides$divide_id)
+#
+# attrs_area <- dplyr::left_join(x=subset_enhanced$`model-attributes`, subset_enhanced$divides, by="divide_id")
+# # TODO why does the total area differ for comid "6716129"? Some divide_ids do not have attributes calculated (~10% of total catchment area)
+# (totl_actl_area <- sum(subset_enhanced$divides$areasqkm))
+# tot_area <- sum(attrs_area$areasqkm)
+# gwCoefWt <- sum(attrs_area$gw_Coeff*attrs_area$areasqkm)/tot_area
+# sum(attrs_area$impervious_mean*attrs_area$areasqkm)/tot_area
+#
+# merge(subset_enhanced)
+#
+# subset_enhanced$divides %>% filter(divide_id %in% subset_enhanced$`model-attributes`$divide_id)
+#
+# length(subset_enhanced$`forcing-weights`$)
+# subset_enhanced$divides
+
+# Map an NLDI feature to its comid. The gage below ('06752260') appears in the
+# exploratory code above; any featureSource/featureID pair recognized by the
+# NLDI should work here.
+library(nhdplusTools)
+nldi_feat <- list(featureSource = "nwissite", featureID = "USGS-06752260")
+nhdplusTools::discover_nhdplus_id(nldi_feature = nldi_feat)