forked from NOAA-OWP/ngen-forcing
-
Notifications
You must be signed in to change notification settings - Fork 0
/
regrid_ExactExtract.py
384 lines (311 loc) · 16.7 KB
/
regrid_ExactExtract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
import numpy as np
import geopandas as gpd
import netCDF4 as nc4
import os
import glob
from pathlib import Path
from os.path import join
import argparse
import pandas as pd
from multiprocessing import Process, Lock
import multiprocessing
import time
import datetime
import re
def get_date_time_csv(path):
"""
Extract the date-time from the file path
"""
path = Path(path)
name = path.stem
date_time = name.split('.')[0]
date_time = date_time.split('_')[2] #this index may depend on the naming format of the forcing data
date_time = re.sub('\D','',date_time)
return date_time
def get_date_time(path):
"""
Extract the date-time from the file path
"""
path = Path(path)
name = path.stem
date_time = name.split('.')[0]
date_time = date_time.split('_')[1] #this index may depend on the naming format of the forcing data
date_time = re.sub('\D','',date_time)
return date_time
def csv_to_netcdf(num_catchments, weighted_csv_files, aorc_ncfile):
"""
Convert from csv to netcdf format
"""
# first read AORC metadata in and extract scale factors and offsets if needed
ds = nc4.Dataset(aorc_ncfile)
# process the srcfield data
srcfield_vars = []
srcfield_vars.append('APCP_surface')
srcfield_vars.append('DLWRF_surface')
srcfield_vars.append('DSWRF_surface')
srcfield_vars.append('PRES_surface')
srcfield_vars.append('SPFH_2maboveground')
srcfield_vars.append('TMP_2maboveground')
srcfield_vars.append('UGRD_10maboveground')
srcfield_vars.append('VGRD_10maboveground')
# get scale_factor and offset keys if available
# (AORC-OWP files for HUC01 scenario has this metadata)
add_offset = np.zeros([len(srcfield_vars)])
scale_factor = np.zeros([len(srcfield_vars)])
i = 0
for key in srcfield_vars:
try:
scale_factor[i] = ds.variables[key].scale_factor
except AttributeError as e:
scale_factor[i] = 1.0
try:
add_offset[i] = ds.variables[key].add_offset
except AttributeError as e:
add_offset[i] = 0.0
i += 1
# pre-allocate arrays for netcdf files
num_files = len(weighted_csv_files)
print("num_catchments = {}".format(num_catchments))
cat_id = np.zeros(num_catchments, dtype=str)
print("num_time_steps = {}".format(num_files))
time = np.zeros(num_files, dtype=float)
APCP_surface = np.zeros((num_catchments,num_files), dtype=float)
DLWRF_surface = np.zeros((num_catchments,num_files), dtype=float)
DSWRF_surface = np.zeros((num_catchments,num_files), dtype=float)
PRES_surface = np.zeros((num_catchments,num_files), dtype=float)
SPFH_2maboveground = np.zeros((num_catchments,num_files), dtype=float)
TMP_2maboveground = np.zeros((num_catchments,num_files), dtype=float)
UGRD_10maboveground = np.zeros((num_catchments,num_files), dtype=float)
VGRD_10maboveground = np.zeros((num_catchments,num_files), dtype=float)
# get catchment ids and reference start time for NextGen forcing file
df = pd.read_csv(weighted_csv_files[0])
cat_id[:] = df['id'].values
start_time = get_date_time(weighted_csv_files[0])
# AORC reference time to use
ref_date = pd.Timestamp("1970-01-01 00:00:00")
# Loop over ech ExactExtract csv file, read and append data to netcdf arrays
for i in np.arange(num_files):
file_date = get_date_time_csv(weighted_csv_files[i])
time[i] = (pd.Timestamp(datetime.datetime.strptime(file_date,'%Y%m%d%H')) - ref_date).total_seconds()
df = pd.read_csv(weighted_csv_files[i])
#since ExactExtract module doesn't account for scale_factor and offset
# keys for a given variable, we will have to finish calculating the lumped
# sum for the ExactExtract csv files
df['APCP_surface_weighted_mean'] = df['APCP_surface_weighted_mean']*scale_factor[0] + add_offset[0]
df['DLWRF_surface_weighted_mean'] = df['DLWRF_surface_weighted_mean']*scale_factor[1] + add_offset[1]
df['DSWRF_surface_weighted_mean'] = df['DSWRF_surface_weighted_mean']*scale_factor[2] + add_offset[2]
df['PRES_surface_weighted_mean'] = df['PRES_surface_weighted_mean']*scale_factor[3] + add_offset[3]
df['SPFH_2maboveground_weighted_mean'] = df['SPFH_2maboveground_weighted_mean']*scale_factor[4] + add_offset[4]
df['TMP_2maboveground_weighted_mean'] = df['TMP_2maboveground_weighted_mean']*scale_factor[5] + add_offset[5]
df['UGRD_10maboveground_weighted_mean'] = df['UGRD_10maboveground_weighted_mean']*scale_factor[6] + add_offset[6]
df['VGRD_10maboveground_weighted_mean'] = df['VGRD_10maboveground_weighted_mean']*scale_factor[7] + add_offset[7]
# Now add the ExactExtract csv data to netcdf variables
APCP_surface[:,i] = df['APCP_surface_weighted_mean'].values
DLWRF_surface[:,i] = df['DLWRF_surface_weighted_mean'].values
DSWRF_surface[:,i] = df['DSWRF_surface_weighted_mean'].values
PRES_surface[:,i] = df['PRES_surface_weighted_mean'].values
SPFH_2maboveground[:,i] = df['SPFH_2maboveground_weighted_mean'].values
TMP_2maboveground[:,i] = df['TMP_2maboveground_weighted_mean'].values
UGRD_10maboveground[:,i] = df['UGRD_10maboveground_weighted_mean'].values
VGRD_10maboveground[:,i] = df['VGRD_10maboveground_weighted_mean'].values
# save csv file with updated data after accounting for
# scale factor and offset within netcdf metadata
df.to_csv(weighted_csv_files[i],header=True,index=False)
#create output netcdf file name
output_path = join(output_root, forcing, "NextGen_forcing_"+start_time+".nc")
#make the data set
filename = output_path
filename_out = output_path
# write data to netcdf files
filename_out = output_path
ncfile_out = nc4.Dataset(filename_out, 'w', format='NETCDF4')
#add the dimensions
time_dim = ncfile_out.createDimension('time', num_files)
catchment_id_dim = ncfile_out.createDimension('catchment-id', num_catchments)
string_dim =ncfile_out.createDimension('str_dim', 1)
# create variables
cat_id_out = ncfile_out.createVariable('ids', 'str', ('catchment-id'), fill_value="None")
time_out = ncfile_out.createVariable('Time', 'double', ('time',), fill_value=-99999)
APCP_surface_out = ncfile_out.createVariable('RAINRATE', 'f4', ('catchment-id', 'time',), fill_value=-99999)
TMP_2maboveground_out = ncfile_out.createVariable('T2D', 'f4', ('catchment-id', 'time',), fill_value=-99999)
SPFH_2maboveground_out = ncfile_out.createVariable('Q2D', 'f4', ('catchment-id', 'time',), fill_value=-99999)
UGRD_10maboveground_out = ncfile_out.createVariable('U2D', 'f4', ('catchment-id', 'time',), fill_value=-99999)
VGRD_10maboveground_out = ncfile_out.createVariable('V2D', 'f4', ('catchment-id', 'time',), fill_value=-99999)
PRES_surface_out = ncfile_out.createVariable('PSFC', 'f4', ('catchment-id', 'time',), fill_value=-99999)
DSWRF_surface_out = ncfile_out.createVariable('SWDOWN', 'f4', ('catchment-id', 'time',), fill_value=-99999)
DLWRF_surface_out = ncfile_out.createVariable('LWDOWN', 'f4', ('catchment-id', 'time',), fill_value=-99999)
#set output netcdf file atributes
varout_dict = {'time':time_out,
'APCP_surface':APCP_surface_out, 'DLWRF_surface':DLWRF_surface_out, 'DSWRF_surface':DSWRF_surface_out,
'PRES_surface':PRES_surface_out, 'SPFH_2maboveground':SPFH_2maboveground_out, 'TMP_2maboveground':TMP_2maboveground_out,
'UGRD_10maboveground':UGRD_10maboveground_out, 'VGRD_10maboveground':VGRD_10maboveground_out}
#copy all attributes from input netcdf file
for name, variable in ds.variables.items():
if name == 'latitude' or name == 'longitude':
pass
else:
varout_name = varout_dict[name]
for attrname in variable.ncattrs():
if attrname != '_FillValue':
setattr(varout_name, attrname, getattr(variable, attrname))
#drop the scale_factor from the output netcdf forcing file attributes
for key, varout_name in varout_dict.items():
if key != 'time':
try:
del varout_name.scale_factor
except:
print("No scale factor in forcing files. No keys to tweak for output netcdf")
#####################################################################
#set attributes for additional variables
setattr(cat_id_out, 'description', 'catchment_id')
cat_id_out[:] = cat_id[:]
time_out[:] = time[:]
APCP_surface_out[:,:] = APCP_surface[:,:]
DLWRF_surface_out[:,:] = DLWRF_surface[:,:]
DSWRF_surface_out[:,:] = DSWRF_surface[:,:]
PRES_surface_out[:,:] = PRES_surface[:,:]
SPFH_2maboveground_out[:,:] = SPFH_2maboveground[:,:]
TMP_2maboveground_out[:,:] = TMP_2maboveground[:,:]
UGRD_10maboveground_out[:,:] = UGRD_10maboveground[:,:]
VGRD_10maboveground_out[:,:] = VGRD_10maboveground[:,:]
# Now close NextGen netcdf file
# and AORC file
ncfile_out.close()
ds.close()
def process_sublist(data : dict, lock: Lock, num: int):
num_files = len(data["forcing_files"])
for i in range(num_files):
# extract forcing file
aorc_file = data["forcing_files"][i]
# get datetime of forcing file to append
# to ExactExtract csv output file
date_time = get_date_time(aorc_file)
NextGen_csv = join(exactextract_files,'NextGen_forcings_'+str(date_time)+'.csv')
# create string for ExactExtract module output file
exactextract_output_file = ' -o ' + NextGen_csv
# Since we now have the pathway to the location file
# We can go ahead and finish the ExactExtract command
# string, which specifies which variables to call for
# this particular AORC file
file_vars = ''
for i in np.arange(len(AORC_met_vars)):
file_vars += ' -r "'+AORC_met_vars[i]+':NETCDF:'+aorc_file+':'+AORC_met_vars[i]+'"'
''''''''''''''''''''''''''''
Example call for ExactExtract Module
exactextract -r "VGRD_10maboveground:NETCDF:AORC_Charlotte_2015121309.nc4:VGRD_10maboveground" -p catchment_data.geojson -f id -s "weighted_mean(VGRD_10maboveground)" -o NextGen_VGRD_Lumped_Sum_Forcings.csv
'''''''''''''''''''''''''''
# Call ExactExtract Module to produce NextGen csv weighted lumped sum data
os.system(exactextract_executable + file_vars + shapefile + shapefile_variable + avg_variable_technique + exactextract_output_file)
if __name__ == '__main__':
#example: python code_name -i /home/jason.ducker/esmf_forcing_files_test/ExactExtract_sugar_creek -o /home/jason.ducker/esmf_forcing_files_test/ExactExtract_sugar_creek -a /apd_common/test/test_data/aorc_netcdf/AORC/2015 -w weight_files/ -f forcing_files/ -e_csv csv_files/ -e /home/jason.ducker/ExactExtract/build/exactextract
#parse the input and output root directory
parser = argparse.ArgumentParser()
parser.add_argument("-i", dest="input_root", type=str, required=True, help="The input directory with csv files")
parser.add_argument("-o", dest="output_root", type=str, required=True, help="The output file path")
parser.add_argument("-a", dest="aorc_netcdf", type=str, required=True, help="The input aorc netcdf files directory")
parser.add_argument("-w", dest="weight_files", type=str, required=True, help="The output weight files sub_dir")
parser.add_argument("-f", dest="forcing", type=str, required=True, help="The output forcing files sub_dir")
parser.add_argument("-e_csv", dest="ExactExtract_csv_files", type=str, required=True, help="The output sub_dir for ExactExtract csv files created from each AORC file")
parser.add_argument("-e", dest="ExactExtract_executable", type=str, required=True, help="The pathway pointing to the ExactExtract executable")
args = parser.parse_args()
#retrieve parsed values
input_root = args.input_root
output_root = args.output_root
aorc_netcdf = args.aorc_netcdf
weight_files = join(output_root,args.weight_files)
forcing = join(output_root,args.forcing)
exactextract_files = join(output_root,args.ExactExtract_csv_files)
exactextract_executable = args.ExactExtract_executable
#generate catchment geometry from hydrofabric
#hyfabfile = "/home/jason.ducker/sugar_creek/catchment_data.geojson"
hyfabfile = "/home/jason.ducker/hydrofabric/huc01/catchment_data.geojson"
cat_df_full = gpd.read_file(hyfabfile)
g = [i for i in cat_df_full.geometry]
h = [i for i in cat_df_full.id]
n_cats = len(g)
num_catchments = n_cats
print("number of catchments = {}".format(n_cats))
# Data paths for either sugar_creek (AOR_Charlotte) or
# Huc01 (AORC-OWP) files on Linux clustet
#datafile_path = join(aorc_netcdf, "AORC_Charlotte_*.nc4")
datafile_path = join(aorc_netcdf, "AORC-OWP_*.nc4")
#get list of files
datafiles = glob.glob(datafile_path)
print("number of forcing files = {}".format(len(datafiles)))
#process data with time ordered
datafiles.sort()
# Extract variable names from AORC netcdf data
nc_file = nc4.Dataset(datafiles[0])
# Get variable list from AORC file
nc_vars = list(nc_file.variables.keys())
# Get indices corresponding to Meteorological data
indices = [nc_vars.index(i) for i in nc_vars if '_' in i]
# Make array with variable names to use for ExactExtract module
AORC_met_vars = np.array(nc_vars)[indices]
# Close netcdf file
nc_file.close()
# We want to now go ahead and quickly build part of the system call string
# for python to call the ExactExtract Module for our given AORC
# variables. This portion just simply calls what variables to calculate
# a weighted mean and which shape file we will use
shapefile = ' -p ' + hyfabfile
shapefile_variable = ' -f id'
avg_variable_technique = ''
for i in np.arange(len(AORC_met_vars)):
avg_variable_technique += ' -s "weighted_mean('+AORC_met_vars[i]+')"'
#prepare for processing
num_forcing_files = len(datafiles)
num_processes = 25
#generate the data objects for child processes
file_groups = np.array_split(np.array(datafiles), num_processes)
process_data = []
process_list = []
lock = Lock()
for i in range(num_processes):
# fill the dictionary with needed at
data = {}
data["forcing_files"] = file_groups[i]
#append to the list
process_data.append(data)
p = Process(target=process_sublist, args=(data, lock, i))
process_list.append(p)
#start all processes
for p in process_list:
p.start()
#wait for termination
for p in process_list:
p.join()
# Since we have finished creating the ExactExtract csv files,
# we can now go ahead and call the ExactExtract module once
# to just extract the weight file for AORC forcing dataset
weight_csv = join(weight_files,'AORC_weight_file.csv')
# create string for ExactExtract module output weight file
exactextract_output_file = ' -o ' + weight_csv
# create string to request coverage fraction (weight)
# file from the ExactExtract module
extract_weights = ' --strategy "coverage_fraction"'
# Just take the first AORC file to get weights
aorc_file = datafiles[0]
# Since we now have the pathway to the location file
# We can go ahead and finish the ExactExtract command
# string, which specifies which variables to call for
# this particular AORC file
file_vars = ''
for i in np.arange(len(AORC_met_vars)):
file_vars += ' -r "'+AORC_met_vars[i]+':NETCDF:'+aorc_file+':'+AORC_met_vars[i]+'"'
''''''''''''''''''''''''''''
Example call for ExactExtract Module requesting weight file
exactextract -r "VGRD_10maboveground:NETCDF:AORC_Charlotte_2015121309.nc4:VGRD_10maboveground" -p catchment_data.geojson -f id -s "weighted_mean(VGRD_10maboveground)" --strategy "coverage_fraction" -o NextGen_AORC_weights.csv
'''''''''''''''''''''''''''
# Call ExactExtract Module to produce weighted csv file for AORC forcings file
os.system(exactextract_executable + file_vars + shapefile + shapefile_variable + avg_variable_technique + extract_weights + exactextract_output_file)
# Now get file paths for created ExactExtract csv files
ExactExtract_path = join(exactextract_files, "*.csv")
weighted_csv_files = glob.glob(ExactExtract_path)
print("Number of ExactExtract csv files = {}".format(len(weighted_csv_files)))
#process data with time ordered
weighted_csv_files.sort()
# set aorc netcdf file pathway to get variable attribute data
aorc_ncfile = datafiles[0]
#generate single NextGen netcdf file from generated ExactExtract weighted csv files
csv_to_netcdf(num_catchments, weighted_csv_files, aorc_ncfile)