From 807a0539db4433533300266519cf2e220ed41b90 Mon Sep 17 00:00:00 2001 From: minghangli-uni Date: Mon, 9 Sep 2024 09:30:20 +1000 Subject: [PATCH 1/8] Add standardise_mom6_filenames.sh script (#32) --- payu_config/archive.sh | 1 + .../standardise_mom6_filenames.sh | 53 +++++++++++++++++++ 2 files changed, 54 insertions(+) create mode 100755 payu_config/archive_scripts/standardise_mom6_filenames.sh diff --git a/payu_config/archive.sh b/payu_config/archive.sh index b5d3ff4..f098f15 100644 --- a/payu_config/archive.sh +++ b/payu_config/archive.sh @@ -2,4 +2,5 @@ source $(dirname "$0")/archive_scripts/archive_cice_restarts.sh source $(dirname "$0")/archive_scripts/concat_ice_daily.sh +source $(dirname "$0")/archive_scripts/standardise_mom6_filenames.sh python3 $(dirname "$0")/archive_scripts/build_intake_ds.py diff --git a/payu_config/archive_scripts/standardise_mom6_filenames.sh b/payu_config/archive_scripts/standardise_mom6_filenames.sh new file mode 100755 index 0000000..8c1e0bf --- /dev/null +++ b/payu_config/archive_scripts/standardise_mom6_filenames.sh @@ -0,0 +1,53 @@ +#!/usr/bin/bash +# Copyright 2024 ACCESS-NRI and contributors. See the top-level COPYRIGHT file for details. +# SPDX-License-Identifier: Apache-2.0. +# +# Standardise file naming for MOM6 output files from access-om3. +# This was written assuming it would be used as a payu "userscript" at the "archive" stage, but alternatively a path to an "archive" directory can be provided. +# For more details, see https://github.com/COSIMA/om3-scripts/issues/32 + +Help() +{ + # Display help + echo -e "Standardise file naming for MOM6 output files.\n" + echo "Syntax: scriptTemplate [-h|d DIRECTORY]" + echo "options:" + echo "h Print this help message." + echo -e "d Process files in the specified 'DIRECTORY'." +} + +while getopts ":hd:" option; do + case $option in + h) # display help + Help + exit;; + d) # Enter a directory + out_dir=$OPTARG + if [ ! -d $out_dir ]; then + echo $out_dir Does not exist + exit + fi;; + \?) # Invalid option + echo "Error: Invalid option" + exit;; + esac +done + +# if no directory was specified, collect all directories from 'archive' +if [ -z $out_dir ]; then + out_dirs=$(ls -rd archive/output*[0-9] 2>/dev/null) +else + out_dirs=$out_dir +fi + +# process each output directory +for dir in ${out_dirs[@]}; do + # process each mom6 file + for current_file in $dir/access-om3.mom6.*.nc; do + if [ -f $current_file ]; then + new_filename=$(echo $current_file | sed -E 's/_([0-9]{4})\./\1./') + # rename the file without overwriting exisiting files + mv -n $current_file $new_filename + fi + done +done From ccd4f5f25bd8b94a6e25ffe593164bc58e6c6059 Mon Sep 17 00:00:00 2001 From: anton-seaice Date: Tue, 10 Sep 2024 14:12:58 +1000 Subject: [PATCH 2/8] mom6 filenames test --- test/test_payu_conf/test_mom6_filenames.py | 148 +++++++++++++++++++++ 1 file changed, 148 insertions(+) create mode 100644 test/test_payu_conf/test_mom6_filenames.py diff --git a/test/test_payu_conf/test_mom6_filenames.py b/test/test_payu_conf/test_mom6_filenames.py new file mode 100644 index 0000000..83a0a17 --- /dev/null +++ b/test/test_payu_conf/test_mom6_filenames.py @@ -0,0 +1,148 @@ +import pytest +import xarray as xr +import numpy as np +import pandas as pd + +from os import makedirs, chdir +from subprocess import run +from pathlib import Path + +scripts_base = Path(__file__).parents[2] +run_str = f"{scripts_base}/payu_config/archive_scripts/standardise_mom6_filenames.sh" + + +def assert_file_exists(p): + if not Path(p).resolve().is_file(): + raise AssertionError("File does not exist: %s" % str(p)) + + +def assert_f_not_exists(p): + if Path(p).resolve().is_file(): + raise AssertionError("File exists and should not: %s" % str(p)) + + +def monthly_files(dir_name, hist_base, nmonths, tmp_path): + """ + Make 12 months of empty data files data, and then write it into 12 files + + request = (path, ndays) + e.g. request = ("archive/output000", "365") + + """ + + times = pd.date_range("2010-01-01 12:00", freq="ME", periods=nmonths+1) + + out_dir = str(tmp_path) + "/" + dir_name + "/" + paths = [f"{out_dir}{hist_base}_{str(t)[0:7]}.nc" for t in times] + + makedirs(out_dir) + + for path in paths: + with open(path, "w") as f: + f.close() + + return paths + + +@pytest.fixture( + params=["access-om3.mom.h.test"] #, "access-om3.cice", "access-om3.cice.1day.mean"] +) +def hist_base(request): + return str(request.param) + + +@pytest.mark.parametrize( + "hist_dir, use_dir, nmonths", + [ + ("Default", False, 12), + ("archive/output999", False, 1), + ("archive/output9999", False, 1), + ("archive/output574", True, 12), + ], +) # run this test with a several folder names and lengths, provide the directory as an argument sometimes +def test_true_case(hist_dir, use_dir, nmonths, hist_base, tmp_path): + + + monthly_paths = monthly_files(hist_dir, hist_base, nmonths, tmp_path) + chdir(tmp_path) + output_dir = Path(monthly_paths[0]).parents[0] + + if not use_dir: # default path + run([run_str]) + expected_months = pd.date_range("2010-01-01", freq="ME", periods=nmonths + 1) + else: # provide path + run( + [ + run_str, + "-d", + output_dir, + ], + ) + expected_months = pd.date_range("2010-01-01", freq="ME", periods=nmonths + 1) + + # valid output filenames + monthly_paths = [ + f"{output_dir}/{hist_base}.{str(t)[0:7]}.nc" for t in expected_months + ] + + for p in monthly_paths[0:nmonths]: + assert_file_exists(p) + + for p in monthly_paths[nmonths]: + assert_f_not_exists(p) + + for p in daily_paths: + assert_f_not_exists(p) + + +# @pytest.mark.parametrize("hist_dir, ndays", [("Default", 1), ("Default", 30)]) +# def test_incomplete_month(hist_dir, ndays, hist_base, tmp_path): +# """ +# Run the script to convert the daily data into monthly files, with less than 28 days data, and check no things happen. +# """ + +# daily_paths = daily_files(hist_dir, hist_base, ndays, tmp_path) + +# chdir(tmp_path) +# output_dir = Path(daily_paths[0]).parents[0] + +# run([run_str]) +# expected_months = pd.date_range("2010-01-01", freq="ME", periods=1) + +# monthly_paths = [ +# f"{output_dir}/{hist_base}.{str(t)[0:7]}.nc" for t in expected_months +# ] + +# for p in daily_paths: +# assert_file_exists(p) + +# for p in monthly_paths: +# assert_f_not_exists(p) + + +# @pytest.mark.parametrize("hist_dir, ndays", [("Default", 31), ("Default", 27)]) +# def test_no_override(hist_dir, ndays, hist_base, tmp_path): +# """ +# Run the script to convert the daily data into monthly files, but the output filename already exists, and check nothing happens. +# """ + +# daily_paths = daily_files(hist_dir, hist_base, ndays, tmp_path) + +# chdir(tmp_path) +# output_dir = Path(daily_paths[0]).parents[0] + +# expected_months = pd.date_range("2010-01-01", freq="ME", periods=1) + +# monthly_paths = [ +# f"{output_dir}/{hist_base}.{str(t)[0:7]}.nc" for t in expected_months +# ] +# for p in monthly_paths: +# Path(p).touch() + +# run([run_str]) + +# for p in daily_paths: +# assert_file_exists(p) + +# for p in monthly_paths: +# assert_file_exists(p) From fa8ba3cd1500ad12cb05d5b738cd3dea90310451 Mon Sep 17 00:00:00 2001 From: anton-seaice Date: Tue, 10 Sep 2024 14:48:29 +1000 Subject: [PATCH 3/8] formatting and tweaks --- test/test_payu_conf/test_mom6_filenames.py | 41 +++++++++++----------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/test/test_payu_conf/test_mom6_filenames.py b/test/test_payu_conf/test_mom6_filenames.py index 83a0a17..514fffd 100644 --- a/test/test_payu_conf/test_mom6_filenames.py +++ b/test/test_payu_conf/test_mom6_filenames.py @@ -10,6 +10,8 @@ scripts_base = Path(__file__).parents[2] run_str = f"{scripts_base}/payu_config/archive_scripts/standardise_mom6_filenames.sh" +DIAG_BASE = "access-om3.mom6.h.test" + def assert_file_exists(p): if not Path(p).resolve().is_file(): @@ -21,7 +23,7 @@ def assert_f_not_exists(p): raise AssertionError("File exists and should not: %s" % str(p)) -def monthly_files(dir_name, hist_base, nmonths, tmp_path): +def monthly_files(dir_name, nmonths, tmp_path): """ Make 12 months of empty data files data, and then write it into 12 files @@ -30,25 +32,22 @@ def monthly_files(dir_name, hist_base, nmonths, tmp_path): """ - times = pd.date_range("2010-01-01 12:00", freq="ME", periods=nmonths+1) + times = pd.date_range("2010-01-01", freq="ME", periods=nmonths) out_dir = str(tmp_path) + "/" + dir_name + "/" - paths = [f"{out_dir}{hist_base}_{str(t)[0:7]}.nc" for t in times] - + paths = [f"{out_dir}{DIAG_BASE}_{str(t)[0:4]}_{str(t)[5:7]}.nc" for t in times] + makedirs(out_dir) - for path in paths: - with open(path, "w") as f: + for p in paths: + with open(p, "w") as f: + # f.write("blank") f.close() - return paths - + for p in paths: + assert_file_exists(p) -@pytest.fixture( - params=["access-om3.mom.h.test"] #, "access-om3.cice", "access-om3.cice.1day.mean"] -) -def hist_base(request): - return str(request.param) + return paths @pytest.mark.parametrize( @@ -60,10 +59,9 @@ def hist_base(request): ("archive/output574", True, 12), ], ) # run this test with a several folder names and lengths, provide the directory as an argument sometimes -def test_true_case(hist_dir, use_dir, nmonths, hist_base, tmp_path): - +def test_true_case(hist_dir, use_dir, nmonths, tmp_path): - monthly_paths = monthly_files(hist_dir, hist_base, nmonths, tmp_path) + monthly_paths = monthly_files(hist_dir, nmonths, tmp_path) chdir(tmp_path) output_dir = Path(monthly_paths[0]).parents[0] @@ -81,17 +79,18 @@ def test_true_case(hist_dir, use_dir, nmonths, hist_base, tmp_path): expected_months = pd.date_range("2010-01-01", freq="ME", periods=nmonths + 1) # valid output filenames - monthly_paths = [ - f"{output_dir}/{hist_base}.{str(t)[0:7]}.nc" for t in expected_months + expected_paths = [ + f"{output_dir}/{DIAG_BASE}_{str(t)[0:4]}-{str(t)[5:7]}.nc" + for t in expected_months ] - for p in monthly_paths[0:nmonths]: + for p in expected_paths[0:nmonths]: assert_file_exists(p) - for p in monthly_paths[nmonths]: + for p in expected_paths[nmonths]: assert_f_not_exists(p) - for p in daily_paths: + for p in monthly_paths: assert_f_not_exists(p) From b7c0bc67246a88e9f804d2471fd2c123c454335c Mon Sep 17 00:00:00 2001 From: "minghang.li" Date: Tue, 10 Sep 2024 15:01:33 +1000 Subject: [PATCH 4/8] Update payu_config/archive_scripts/standardise_mom6_filenames.sh Co-authored-by: Anton Steketee <79179784+anton-seaice@users.noreply.github.com> --- payu_config/archive_scripts/standardise_mom6_filenames.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/payu_config/archive_scripts/standardise_mom6_filenames.sh b/payu_config/archive_scripts/standardise_mom6_filenames.sh index 8c1e0bf..ba49fa3 100755 --- a/payu_config/archive_scripts/standardise_mom6_filenames.sh +++ b/payu_config/archive_scripts/standardise_mom6_filenames.sh @@ -46,7 +46,7 @@ for dir in ${out_dirs[@]}; do for current_file in $dir/access-om3.mom6.*.nc; do if [ -f $current_file ]; then new_filename=$(echo $current_file | sed -E 's/_([0-9]{4})\./\1./') - # rename the file without overwriting exisiting files + # rename the file without overwriting existing files mv -n $current_file $new_filename fi done From d53e9231d579299c786bbdb206c8fbc8b552a7b5 Mon Sep 17 00:00:00 2001 From: Anton Steketee <79179784+anton-seaice@users.noreply.github.com> Date: Tue, 10 Sep 2024 15:47:25 +1000 Subject: [PATCH 5/8] Update payu_config/archive_scripts/standardise_mom6_filenames.sh Co-authored-by: minghang.li --- payu_config/archive_scripts/standardise_mom6_filenames.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/payu_config/archive_scripts/standardise_mom6_filenames.sh b/payu_config/archive_scripts/standardise_mom6_filenames.sh index ba49fa3..58ff59c 100755 --- a/payu_config/archive_scripts/standardise_mom6_filenames.sh +++ b/payu_config/archive_scripts/standardise_mom6_filenames.sh @@ -2,7 +2,7 @@ # Copyright 2024 ACCESS-NRI and contributors. See the top-level COPYRIGHT file for details. # SPDX-License-Identifier: Apache-2.0. # -# Standardise file naming for MOM6 output files from access-om3. +# Standardise file naming for MOM6 output files in access-om3 by removing the underscore before the four-digit year, i.e., replacing '_YYYY' with 'YYYY' # This was written assuming it would be used as a payu "userscript" at the "archive" stage, but alternatively a path to an "archive" directory can be provided. # For more details, see https://github.com/COSIMA/om3-scripts/issues/32 From 7587496c07572586e80fe3c9c8517b62bcf53038 Mon Sep 17 00:00:00 2001 From: "minghang.li" Date: Tue, 10 Sep 2024 15:57:57 +1000 Subject: [PATCH 6/8] Update payu_config/archive_scripts/standardise_mom6_filenames.sh --- payu_config/archive_scripts/standardise_mom6_filenames.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/payu_config/archive_scripts/standardise_mom6_filenames.sh b/payu_config/archive_scripts/standardise_mom6_filenames.sh index 58ff59c..7253ff2 100755 --- a/payu_config/archive_scripts/standardise_mom6_filenames.sh +++ b/payu_config/archive_scripts/standardise_mom6_filenames.sh @@ -45,7 +45,7 @@ for dir in ${out_dirs[@]}; do # process each mom6 file for current_file in $dir/access-om3.mom6.*.nc; do if [ -f $current_file ]; then - new_filename=$(echo $current_file | sed -E 's/_([0-9]{4})\./\1./') + new_filename=$(echo $current_file | sed -E 's/_([0-9]{4})/\1/') # rename the file without overwriting existing files mv -n $current_file $new_filename fi From 4f1d5bbdfb8273fce4938e2e3847c39393fe64d9 Mon Sep 17 00:00:00 2001 From: anton-seaice Date: Tue, 10 Sep 2024 16:08:49 +1000 Subject: [PATCH 7/8] fix tests --- test/test_payu_conf/test_mom6_filenames.py | 53 ---------------------- 1 file changed, 53 deletions(-) diff --git a/test/test_payu_conf/test_mom6_filenames.py b/test/test_payu_conf/test_mom6_filenames.py index 514fffd..ebe3c71 100644 --- a/test/test_payu_conf/test_mom6_filenames.py +++ b/test/test_payu_conf/test_mom6_filenames.py @@ -92,56 +92,3 @@ def test_true_case(hist_dir, use_dir, nmonths, tmp_path): for p in monthly_paths: assert_f_not_exists(p) - - -# @pytest.mark.parametrize("hist_dir, ndays", [("Default", 1), ("Default", 30)]) -# def test_incomplete_month(hist_dir, ndays, hist_base, tmp_path): -# """ -# Run the script to convert the daily data into monthly files, with less than 28 days data, and check no things happen. -# """ - -# daily_paths = daily_files(hist_dir, hist_base, ndays, tmp_path) - -# chdir(tmp_path) -# output_dir = Path(daily_paths[0]).parents[0] - -# run([run_str]) -# expected_months = pd.date_range("2010-01-01", freq="ME", periods=1) - -# monthly_paths = [ -# f"{output_dir}/{hist_base}.{str(t)[0:7]}.nc" for t in expected_months -# ] - -# for p in daily_paths: -# assert_file_exists(p) - -# for p in monthly_paths: -# assert_f_not_exists(p) - - -# @pytest.mark.parametrize("hist_dir, ndays", [("Default", 31), ("Default", 27)]) -# def test_no_override(hist_dir, ndays, hist_base, tmp_path): -# """ -# Run the script to convert the daily data into monthly files, but the output filename already exists, and check nothing happens. -# """ - -# daily_paths = daily_files(hist_dir, hist_base, ndays, tmp_path) - -# chdir(tmp_path) -# output_dir = Path(daily_paths[0]).parents[0] - -# expected_months = pd.date_range("2010-01-01", freq="ME", periods=1) - -# monthly_paths = [ -# f"{output_dir}/{hist_base}.{str(t)[0:7]}.nc" for t in expected_months -# ] -# for p in monthly_paths: -# Path(p).touch() - -# run([run_str]) - -# for p in daily_paths: -# assert_file_exists(p) - -# for p in monthly_paths: -# assert_file_exists(p) From f54da51cf06298625b9e6fc18f111f545554d567 Mon Sep 17 00:00:00 2001 From: anton-seaice Date: Tue, 10 Sep 2024 16:19:41 +1000 Subject: [PATCH 8/8] x2 --- test/test_payu_conf/test_mom6_filenames.py | 111 ++++++++++++++++----- 1 file changed, 88 insertions(+), 23 deletions(-) diff --git a/test/test_payu_conf/test_mom6_filenames.py b/test/test_payu_conf/test_mom6_filenames.py index ebe3c71..1d45c8a 100644 --- a/test/test_payu_conf/test_mom6_filenames.py +++ b/test/test_payu_conf/test_mom6_filenames.py @@ -1,6 +1,4 @@ import pytest -import xarray as xr -import numpy as np import pandas as pd from os import makedirs, chdir @@ -23,25 +21,20 @@ def assert_f_not_exists(p): raise AssertionError("File exists and should not: %s" % str(p)) -def monthly_files(dir_name, nmonths, tmp_path): +def yearly_files(dir_name, n, tmp_path): """ - Make 12 months of empty data files data, and then write it into 12 files - - request = (path, ndays) - e.g. request = ("archive/output000", "365") - + Make empty data files """ - times = pd.date_range("2010-01-01", freq="ME", periods=nmonths) + times = pd.date_range("2010-01-01", freq="YE", periods=n) out_dir = str(tmp_path) + "/" + dir_name + "/" - paths = [f"{out_dir}{DIAG_BASE}_{str(t)[0:4]}_{str(t)[5:7]}.nc" for t in times] + paths = [f"{out_dir}{DIAG_BASE}._{str(t)[0:4]}.nc" for t in times] makedirs(out_dir) for p in paths: with open(p, "w") as f: - # f.write("blank") f.close() for p in paths: @@ -51,23 +44,22 @@ def monthly_files(dir_name, nmonths, tmp_path): @pytest.mark.parametrize( - "hist_dir, use_dir, nmonths", + "hist_dir, use_dir, n", [ - ("Default", False, 12), + ("archive/output000", False, 12), ("archive/output999", False, 1), ("archive/output9999", False, 1), ("archive/output574", True, 12), ], ) # run this test with a several folder names and lengths, provide the directory as an argument sometimes -def test_true_case(hist_dir, use_dir, nmonths, tmp_path): +def test_true_case(hist_dir, use_dir, n, tmp_path): - monthly_paths = monthly_files(hist_dir, nmonths, tmp_path) + yearly_paths = yearly_files(hist_dir, n, tmp_path) chdir(tmp_path) - output_dir = Path(monthly_paths[0]).parents[0] + output_dir = Path(yearly_paths[0]).parents[0] if not use_dir: # default path run([run_str]) - expected_months = pd.date_range("2010-01-01", freq="ME", periods=nmonths + 1) else: # provide path run( [ @@ -76,19 +68,92 @@ def test_true_case(hist_dir, use_dir, nmonths, tmp_path): output_dir, ], ) - expected_months = pd.date_range("2010-01-01", freq="ME", periods=nmonths + 1) + + expected_years = pd.date_range("2010-01-01", freq="YE", periods=n + 1) # valid output filenames expected_paths = [ - f"{output_dir}/{DIAG_BASE}_{str(t)[0:4]}-{str(t)[5:7]}.nc" - for t in expected_months + f"{output_dir}/{DIAG_BASE}.{str(t)[0:4]}.nc" for t in expected_years ] - for p in expected_paths[0:nmonths]: + for p in expected_paths[0:n]: assert_file_exists(p) - for p in expected_paths[nmonths]: + for p in expected_paths[n]: assert_f_not_exists(p) - for p in monthly_paths: + for p in yearly_paths: assert_f_not_exists(p) + + +@pytest.mark.parametrize( + "hist_dir, use_dir, n", + [ + ("archive/output000", False, 12), + ], +) +def test_dont_override(hist_dir, use_dir, n, tmp_path): + """ + make some empty data files, and make some files where the files should be renamed to, + and confirm it doesn't delete any of them + """ + + yearly_paths = yearly_files(hist_dir, n, tmp_path) + chdir(tmp_path) + output_dir = Path(yearly_paths[0]).parents[0] + + # write the expected output too + expected_years = pd.date_range("2010-01-01", freq="YE", periods=n) + + expected_paths = [ + f"{output_dir}/{DIAG_BASE}.{str(t)[0:4]}.nc" for t in expected_years + ] + + for p in expected_paths: + with open(p, "w") as f: + f.close() + + if not use_dir: # default path + run([run_str]) + else: # provide path + run( + [ + run_str, + "-d", + output_dir, + ], + ) + + for p in expected_paths: + assert_file_exists(p) + + for p in yearly_paths: + assert_file_exists(p) + + +# @pytest.mark.parametrize("hist_dir, ndays", [("Default", 31), ("Default", 27)]) +# def test_no_override(hist_dir, ndays, hist_base, tmp_path): +# """ +# Run the script to convert the daily data into monthly files, but the output filename already exists, and check nothing happens. +# """ + +# daily_paths = daily_files(hist_dir, hist_base, ndays, tmp_path) + +# chdir(tmp_path) +# output_dir = Path(daily_paths[0]).parents[0] + +# expected_months = pd.date_range("2010-01-01", freq="ME", periods=1) + +# monthly_paths = [ +# f"{output_dir}/{hist_base}.{str(t)[0:7]}.nc" for t in expected_months +# ] +# for p in monthly_paths: +# Path(p).touch() + +# run([run_str]) + +# for p in daily_paths: +# assert_file_exists(p) + +# for p in monthly_paths: +# assert_file_exists(p)