diff --git a/.github/workflows/ci_unit_tests.yaml b/.github/workflows/ci_unit_tests.yaml new file mode 100644 index 0000000000..e22f63bf56 --- /dev/null +++ b/.github/workflows/ci_unit_tests.yaml @@ -0,0 +1,64 @@ +name: CI Unit Tests +on: [pull_request, push, workflow_dispatch] + +jobs: + + ci_pytest: + runs-on: ubuntu-latest + name: Run unit tests on CI system + permissions: + checks: write + + steps: + - name: Checkout code + uses: actions/checkout@v4 + + - name: Set up Python + uses: actions/setup-python@v5 + with: + python-version: 3.11.8 + + - name: Install dependencies + run: | + sudo apt-get update + sudo apt-get install -y perl libxml-libxml-perl libxml-libxslt-perl libdatetime-perl + python -m pip install --upgrade pip + pip install pytest + pip install wxflow + pip install wget + + - name: Cache Rocoto Install + uses: actions/cache@v4 + with: + path: ~/rocoto + key: ${{ runner.os }}-rocoto-${{ hashFiles('**/ci-unit_tests.yaml') }} + + - name: Install Rocoto + run: | + if [ ! -d "$HOME/rocoto/bin" ]; then + git clone https://github.com/christopherwharrop/rocoto.git $HOME/rocoto + cd $HOME/rocoto + ./INSTALL + fi + echo "$HOME/rocoto/bin" >> $GITHUB_PATH + + - name: Run tests + shell: bash + run: | + sudo mkdir -p /scratch1/NCEPDEV + cd $GITHUB_WORKSPACE/sorc + git submodule update --init --recursive + ./link_workflow.sh + cd $GITHUB_WORKSPACE/ci/scripts/tests + ln -s ../wxflow + + pytest -v --junitxml $GITHUB_WORKSPACE/ci/scripts/tests/test-results.xml + + + - name: Publish Test Results + if: always() + uses: EnricoMi/publish-unit-test-result-action@v2 + with: + files: ci/scripts/tests/test-results.xml + job_summary: true + comment_mode: off diff --git a/ci/Jenkinsfile b/ci/Jenkinsfile index 956bd692dd..05d38b7898 100644 --- a/ci/Jenkinsfile +++ b/ci/Jenkinsfile @@ -14,7 +14,7 @@ pipeline { options { skipDefaultCheckout() - //parallelsAlwaysFailFast() + parallelsAlwaysFailFast() } stages { // This initial stage is used to get the Machine name from the GitHub labels on the PR @@ -90,9 +90,6 @@ pipeline { stage('3. Build System') { matrix { agent { label NodeName[machine].toLowerCase() } - //options { - // throttle(['global_matrix_build']) - //} axes { axis { name 'system' @@ -102,6 +99,7 @@ pipeline { stages { stage('build system') { steps { + catchError(buildResult: 'UNSTABLE', stageResult: 'FAILURE') { script { def HOMEgfs = "${CUSTOM_WORKSPACE}/${system}" // local HOMEgfs is used to build the system on per system basis under the custome workspace for each buile system sh(script: "mkdir -p ${HOMEgfs}") @@ -120,8 +118,8 @@ pipeline { if (env.CHANGE_ID) { sh(script: """${GH} pr comment ${env.CHANGE_ID} --repo ${repo_url} --body "Checkout **Failed** on ${Machine}: ${e.getMessage()}" """) } - echo "Failed to checkout: ${e.getMessage()}" STATUS = 'Failed' + error("Failed to checkout: ${e.getMessage()}") } def gist_url = "" def error_logs = "" @@ -155,6 +153,7 @@ pipeline { } catch (Exception error_comment) { echo "Failed to comment on PR: ${error_comment.getMessage()}" } + STATUS = 'Failed' error("Failed to build system on ${Machine}") } } @@ -174,6 +173,7 @@ pipeline { } } } + } } } } @@ -181,7 +181,9 @@ pipeline { } stage('4. Run Tests') { - failFast false + when { + expression { STATUS != 'Failed' } + } matrix { agent { label NodeName[machine].toLowerCase() } axes { @@ -198,14 +200,21 @@ pipeline { expression { return caseList.contains(Case) } } steps { + catchError(buildResult: 'UNSTABLE', stageResult: 'FAILURE') { script { sh(script: "sed -n '/{.*}/!p' ${CUSTOM_WORKSPACE}/gfs/ci/cases/pr/${Case}.yaml > ${CUSTOM_WORKSPACE}/gfs/ci/cases/pr/${Case}.yaml.tmp") def yaml_case = readYaml file: "${CUSTOM_WORKSPACE}/gfs/ci/cases/pr/${Case}.yaml.tmp" system = yaml_case.experiment.system def HOMEgfs = "${CUSTOM_WORKSPACE}/${system}" // local HOMEgfs is used to populate the XML on per system basis env.RUNTESTS = "${CUSTOM_WORKSPACE}/RUNTESTS" - sh(script: "${HOMEgfs}/ci/scripts/utils/ci_utils_wrapper.sh create_experiment ${HOMEgfs}/ci/cases/pr/${Case}.yaml") + try { + error_output = sh(script: "${HOMEgfs}/ci/scripts/utils/ci_utils_wrapper.sh create_experiment ${HOMEgfs}/ci/cases/pr/${Case}.yaml", returnStdout: true).trim() + } catch (Exception error_create) { + sh(script: """${GH} pr comment ${env.CHANGE_ID} --repo ${repo_url} --body "${Case} **FAILED** to create experment on ${Machine}\n with the error:\n\\`\\`\\`\n${error_output}\\`\\`\\`" """) + error("Case ${Case} failed to create experment directory") + } } + } } } @@ -213,7 +222,6 @@ pipeline { when { expression { return caseList.contains(Case) } } - failFast false steps { script { HOMEgfs = "${CUSTOM_WORKSPACE}/gfs" // common HOMEgfs is used to launch the scripts that run the experiments @@ -255,11 +263,11 @@ pipeline { STATUS = 'Failed' try { sh(script: """${GH} pr edit ${env.CHANGE_ID} --repo ${repo_url} --remove-label "CI-${Machine}-Running" --add-label "CI-${Machine}-${STATUS}" """, returnStatus: true) - sh(script: """${GH} pr comment ${env.CHANGE_ID} --repo ${repo_url} --body "Experiment ${Case} **FAILED** on ${Machine} in\n\\`${CUSTOM_WORKSPACE}/RUNTESTS/${pslot}\\`" """) + sh(script: """${GH} pr comment ${env.CHANGE_ID} --repo ${repo_url} --body "Experiment ${Case} **FAILED** on ${Machine} in\n\\`${CUSTOM_WORKSPACE}/RUNTESTS/EXPDIR/${pslot}\\`" """) } catch (Exception e) { echo "Failed to update label from Running to ${STATUS}: ${e.getMessage()}" } - error("Failed to run experiments ${Case} on ${Machine}") + echo "Failed to run experiments ${Case} on ${Machine}" } } } @@ -268,6 +276,7 @@ pipeline { } } } + stage( '5. FINALIZE' ) { agent { label NodeName[machine].toLowerCase() } steps { diff --git a/ci/cases/yamls/gefs_ci_defaults.yaml b/ci/cases/yamls/gefs_ci_defaults.yaml index ceb36d4acb..05a97edd90 100644 --- a/ci/cases/yamls/gefs_ci_defaults.yaml +++ b/ci/cases/yamls/gefs_ci_defaults.yaml @@ -1,4 +1,4 @@ defaults: !INC {{ HOMEgfs }}/parm/config/gefs/yaml/defaults.yaml base: - HPC_ACCOUNT: {{ 'HPC_ACCOUNT' | getenv }} + ACCOUNT: {{ 'HPC_ACCOUNT' | getenv }} diff --git a/ci/scripts/tests/test_create_experiment.py b/ci/scripts/tests/test_create_experiment.py new file mode 100644 index 0000000000..03f3a30805 --- /dev/null +++ b/ci/scripts/tests/test_create_experiment.py @@ -0,0 +1,29 @@ +from wxflow import Executable +from shutil import rmtree +import os +import copy + +_here = os.path.dirname(__file__) +HOMEgfs = os.sep.join(_here.split(os.sep)[:-3]) +RUNDIR = os.path.join(_here, 'testdata/RUNDIR') + + +def test_create_experiment(): + + create_experiment_script = Executable(f'{HOMEgfs}/workflow/create_experiment.py') + yaml_dir = yaml_dir = os.path.join(HOMEgfs, 'ci/cases/pr') + env = os.environ.copy() + env['RUNTESTS'] = RUNDIR + + for case in os.listdir(yaml_dir): + if case.endswith('.yaml'): + with open(os.path.join(yaml_dir, case), 'r') as file: + file_contents = file.read() + if 'ICSDIR_ROOT' not in file_contents: + create_experiment = copy.deepcopy(create_experiment_script) + create_experiment.add_default_arg(['-y', f'../../cases/pr/{case}', '--overwrite']) + env['pslot'] = os.path.splitext(case)[0] + create_experiment(env=env) + assert (create_experiment.returncode == 0) + + rmtree(RUNDIR) diff --git a/ci/scripts/tests/test_rocotostat.py b/ci/scripts/tests/test_rocotostat.py new file mode 100755 index 0000000000..f43f8df2f8 --- /dev/null +++ b/ci/scripts/tests/test_rocotostat.py @@ -0,0 +1,90 @@ +import sys +import os +from shutil import rmtree +import wget + +script_dir = os.path.dirname(os.path.abspath(__file__)) +sys.path.append(os.path.join(os.path.dirname(script_dir), 'utils')) + +from rocotostat import rocoto_statcount, rocotostat_summary, is_done, is_stalled, CommandNotFoundError +from wxflow import which + +test_data_url = 'https://noaa-nws-global-pds.s3.amazonaws.com/data/CI/' + +testdata_path = 'testdata/rocotostat' +testdata_full_path = os.path.join(script_dir, testdata_path) + + +if not os.path.isfile(os.path.join(testdata_full_path, 'database.db')): + os.makedirs(testdata_full_path, exist_ok=True) + workflow_url = test_data_url + str(testdata_path) + '/workflow.xml' + workflow_destination = os.path.join(testdata_full_path, 'workflow.xml') + wget.download(workflow_url, workflow_destination) + + database_url = test_data_url + str(testdata_path) + '/database.db' + database_destination = os.path.join(testdata_full_path, 'database.db') + wget.download(database_url, database_destination) + +try: + rocotostat = which('rocotostat') +except CommandNotFoundError: + raise CommandNotFoundError("rocotostat not found in PATH") + +rocotostat.add_default_arg(['-w', os.path.join(testdata_path, 'workflow.xml'), '-d', os.path.join(testdata_path, 'database.db')]) + + +def test_rocoto_statcount(): + + result = rocoto_statcount(rocotostat) + + assert result['SUCCEEDED'] == 20 + assert result['FAIL'] == 0 + assert result['DEAD'] == 0 + assert result['RUNNING'] == 0 + assert result['SUBMITTING'] == 0 + assert result['QUEUED'] == 0 + + +def test_rocoto_summary(): + + result = rocotostat_summary(rocotostat) + + assert result['CYCLES_TOTAL'] == 1 + assert result['CYCLES_DONE'] == 1 + + +def test_rocoto_done(): + + result = rocotostat_summary(rocotostat) + + assert is_done(result) + + rmtree(testdata_full_path) + + +def test_rocoto_stalled(): + testdata_path = 'testdata/rocotostat_stalled' + testdata_full_path = os.path.join(script_dir, testdata_path) + xml = os.path.join(testdata_full_path, 'stalled.xml') + db = os.path.join(testdata_full_path, 'stalled.db') + + if not os.path.isfile(os.path.join(testdata_full_path, 'stalled.db')): + os.makedirs(testdata_full_path, exist_ok=True) + workflow_url = test_data_url + str(testdata_path) + '/stalled.xml' + database_url = test_data_url + str(testdata_path) + '/stalled.db' + + workflow_destination = os.path.join(testdata_full_path, 'stalled.xml') + wget.download(workflow_url, workflow_destination) + + database_destination = os.path.join(testdata_full_path, 'stalled.db') + wget.download(database_url, database_destination) + + rocotostat = which('rocotostat') + rocotostat.add_default_arg(['-w', xml, '-d', db]) + + result = rocoto_statcount(rocotostat) + + assert result['SUCCEEDED'] == 11 + assert is_stalled(result) + + rmtree(testdata_full_path) diff --git a/ci/scripts/tests/test_setup.py b/ci/scripts/tests/test_setup.py new file mode 100755 index 0000000000..77a36369f4 --- /dev/null +++ b/ci/scripts/tests/test_setup.py @@ -0,0 +1,89 @@ +from wxflow import Executable, Configuration, ProcessError +from shutil import rmtree +import pytest +import os + +_here = os.path.dirname(__file__) +HOMEgfs = os.sep.join(_here.split(os.sep)[:-3]) +RUNDIR = os.path.join(_here, 'testdata/RUNDIR') +pslot = "C48_ATM" +account = "fv3-cpu" +foobar = "foobar" + + +def test_setup_expt(): + + arguments = [ + "gfs", "forecast-only", + "--pslot", pslot, "--app", "ATM", "--resdetatmos", "48", + "--comroot", f"{RUNDIR}", "--expdir", f"{RUNDIR}", + "--idate", "2021032312", "--edate", "2021032312", "--overwrite" + ] + setup_expt_script = Executable(os.path.join(HOMEgfs, "workflow", "setup_expt.py")) + setup_expt_script.add_default_arg(arguments) + setup_expt_script() + assert (setup_expt_script.returncode == 0) + + +def test_setup_xml(): + + setup_xml_script = Executable(os.path.join(HOMEgfs, "workflow/setup_xml.py")) + setup_xml_script.add_default_arg(f"{RUNDIR}/{pslot}") + setup_xml_script() + assert (setup_xml_script.returncode == 0) + + cfg = Configuration(f"{RUNDIR}/{pslot}") + base = cfg.parse_config('config.base') + assert base.ACCOUNT == account + + assert "UNKNOWN" not in base.values() + + with open(f"{RUNDIR}/{pslot}/{pslot}.xml", 'r') as file: + contents = file.read() + assert contents.count(account) > 5 + + rmtree(RUNDIR) + + +def test_setup_xml_fail_config_env_cornercase(): + + script_content = ('''#!/usr/bin/env bash +export HOMEgfs=foobar +../../../workflow/setup_xml.py "${1}"\n +''') + + with open('run_setup_xml.sh', 'w') as file: + file.write(script_content) + os.chmod('run_setup_xml.sh', 0o755) + + try: + setup_xml_script = Executable(os.path.join(HOMEgfs, "ci", "scripts", "tests", "run_setup_xml.sh")) + setup_xml_script.add_default_arg(f"{RUNDIR}/{pslot}") + setup_xml_script() + assert (setup_xml_script.returncode == 0) + + cfg = Configuration(f"{RUNDIR}/{pslot}") + base = cfg.parse_config('config.base') + assert base.ACCOUNT == account + + assert foobar not in base.values() + assert "UNKNOWN" not in base.values() + + with open(f"{RUNDIR}/{pslot}/{pslot}.xml", 'r') as file: + contents = file.read() + assert contents.count(account) > 5 + + except ProcessError as e: + # We expect this fail becuse ACCOUNT=fv3-cpu in config.base and environment + pass + except Exception as e: + # If an exception occurs, pass the test with a custom message + pytest.fail(f"Expected exception occurred: {e}") + + finally: + # Cleanup code to ensure it runs regardless of test outcome + os.remove('run_setup_xml.sh') + try: + rmtree(RUNDIR) + except FileNotFoundError: + pass diff --git a/ci/scripts/utils/publish_logs.py b/ci/scripts/utils/publish_logs.py index 7768c17c10..283c84a8d1 100755 --- a/ci/scripts/utils/publish_logs.py +++ b/ci/scripts/utils/publish_logs.py @@ -46,7 +46,8 @@ def add_logs_to_gist(args, emcbot_gh): gist_files = {} for file in args.file: - file_content = file.read() + with open(file.name, 'r', encoding='latin-1') as file: + file_content = file.read() gist_files[os.path.basename(file.name)] = emcbot_gh.InputFileContent(file_content) gist = emcbot_gh.user.create_gist(public=True, files=gist_files, description=f"error log file from CI run {args.gist[0]}") @@ -85,7 +86,8 @@ def upload_logs_to_repo(args, emcbot_gh, emcbot_ci_url): break for file in args.file: - file_content = file.read() + with open(file.name, 'r', encoding='latin-1') as file: + file_content = file.read() file_path_in_repo = f"{repo_path}/{path_header}/" + str(os.path.basename(file.name)) emcbot_gh.repo.create_file(file_path_in_repo, "Adding error log file", file_content, branch="error_logs") diff --git a/ci/scripts/utils/rocotostat.py b/ci/scripts/utils/rocotostat.py index 9b1d8dcc3a..70c672f0e8 100755 --- a/ci/scripts/utils/rocotostat.py +++ b/ci/scripts/utils/rocotostat.py @@ -14,6 +14,35 @@ def attempt_multiple_times(expression, max_attempts, sleep_duration=0, exception_class=Exception): + """ + Retries a function multiple times. + + Try to execute the function expression up to max_attempts times ignoring any exceptions + of the type exception_class, It waits for sleep_duration seconds between attempts. + + Parameters + ---------- + expression : callable + The function to be executed. + max_attempts : int + The maximum number of attempts to execute the function. + sleep_duration : int, optional + The number of seconds to wait between attempts. Default is 0. + exception_class : Exception, optional + The type of exception to catch. Default is the base Exception class, catching all exceptions. + + Returns + ------- + The return value of the function expression. + + Raises + ------ + exception_class + If the function expression raises an exception of type exception_class + in all max_attempts attempts. + + """ + attempt = 0 last_exception = None while attempt < max_attempts: @@ -189,7 +218,7 @@ def is_stalled(rocoto_status): error_return = rocoto_status['UNKNOWN'] rocoto_state = 'UNKNOWN' elif is_stalled(rocoto_status): - rocoto_status = attempt_multiple_times(rocoto_statcount(rocotostat), 2, 120, ProcessError) + rocoto_status = attempt_multiple_times(lambda: rocoto_statcount(rocotostat), 2, 120, ProcessError) if is_stalled(rocoto_status): error_return = 3 rocoto_state = 'STALLED' diff --git a/sorc/wxflow b/sorc/wxflow index 5dad7dd61c..1356acdb2b 160000 --- a/sorc/wxflow +++ b/sorc/wxflow @@ -1 +1 @@ -Subproject commit 5dad7dd61cebd9b3f2b163b3b06bb75eae1860a9 +Subproject commit 1356acdb2bbca28e442597699da1a295faa18fe3