From 8bc16320effbed0fcdfceab33aec560da0dba9d3 Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Tue, 14 May 2024 22:21:10 -0700 Subject: [PATCH] build: Add --exclude-from-upload and --exclude-from-download options Excluding files from upload is very handy for ad-hoc skipping of large ancillary files or previous output files in the build directory that the user wants to ignore for the remote build on AWS Batch (i.e. to start it "fresh"). Existing workarounds for the lack of an exclusion mechanism include git worktrees to obtain a clean state and moving files or directories temporarily out of the build directory. A future improvement would be adding support to also specify these patterns via a file, which can then be checked in to a build repo and shared by all users. As a broader improvement, I'd like to design away this issue by separating out the workflow-as-program (rules, code, etc.) from the workflow-as-state (config, inputs, outputs, etc.). This comes out of several other goals, but would apply to this "excluded files" need too. For example, instead of relying on the implicit state combined with the code like now in a single build directory, we'd instead explicitly pass an empty starting state separate from the workflow program. This change includes a small bug fix for --download to allow _only_ negated patterns. Resolves: --- CHANGES.md | 22 ++++++++ doc/commands/build.rst | 50 ++++++++++++++++++ nextstrain/cli/command/build.py | 58 +++++++++++++++++++++ nextstrain/cli/runner/aws_batch/__init__.py | 2 +- nextstrain/cli/runner/aws_batch/s3.py | 23 +++++++- nextstrain/cli/util.py | 6 ++- tests/command-build.py | 29 +++++++++++ 7 files changed, 185 insertions(+), 5 deletions(-) create mode 100644 tests/command-build.py diff --git a/CHANGES.md b/CHANGES.md index 393d3565..b899be18 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -24,6 +24,15 @@ installation method we recommend in the [Nextstrain installation documentation](https://docs.nextstrain.org/page/install.html). In that case, a supported Python version is always bundled with `nextstrain`. +## Features + +* `nextstrain build` now supports two new options when using the AWS Batch + runtime: [`--exclude-from-upload`][] and [`--exclude-from-download`][]. The + former is useful for avoiding the upload of large, ancillary files not needed + by the build. The latter exists to parallel the former and make it easier to + exclude files from both upload and download. + ([#370](https://github.com/nextstrain/cli/pull/370)) + ## Improvements * The Conda runtime now uses Micromamba 1.5.8 (upgraded from 1.1.0) to manage @@ -32,6 +41,19 @@ supported Python version is always bundled with `nextstrain`. the Conda runtime. ([#367](https://github.com/nextstrain/cli/pull/367)) +## Bug fixes + +* The [`--download`][] option of `nextstrain build` now supports passing _only_ + negated patterns (e.g. `!…` and `!(…)`). All files which _don't_ match the + negated patterns will be downloaded. Previously, no files were downloaded + unless at least one positive pattern was given. + ([#370](https://github.com/nextstrain/cli/pull/370)) + + +[`--exclude-from-upload`]: https://docs.nextstrain.org/projects/cli/en/__NEXT__/commands/build/#cmdoption-nextstrain-build-exclude-from-upload +[`--exclude-from-download`]: https://docs.nextstrain.org/projects/cli/en/__NEXT__/commands/build/#cmdoption-nextstrain-build-exclude-from-download +[`--download`]: https://docs.nextstrain.org/projects/cli/en/__NEXT__/commands/build/#cmdoption-nextstrain-build-download + # 8.3.0 (30 April 2024) diff --git a/doc/commands/build.rst b/doc/commands/build.rst index a2c61e99..264ce200 100644 --- a/doc/commands/build.rst +++ b/doc/commands/build.rst @@ -104,6 +104,56 @@ options Do not download any files from the remote build when it completes. Currently only supported when also using :option:`--aws-batch`. +.. option:: --exclude-from-download + + Exclude files matching ```` from being downloaded from + the remote build. Equivalent to passing a negated pattern to + :option:`--download`. That is, the following are equivalent:: + + --exclude-from-download 'xyz' + --download '!xyz' + + Refer to :option:`--download` for usage details, but note that this + option doesn't support already-negated patterns (e.g. ``!…`` or + ``!(…)``). + + This option exists to parallel :option:`--exclude-from-upload`. + + + + +.. option:: --exclude-from-upload + + Exclude files matching ```` from being uploaded as part of + the remote build. Shell-style advanced globbing is supported, but + be sure to escape wildcards or quote the whole pattern so your + shell doesn't expand them. May be passed more than once. + Currently only supported when also using :option:`--aws-batch`. + Default is to upload the entire pathogen build directory (except + for some ancillary files which are always excluded). + + Note that files excluded from upload may still be downloaded from + the remote build, e.g. if they're created by it, and if downloaded + will overwrite the local files. Use :option:`--no-download` to + avoid downloading any files, or :option:`--exclude-from-download` + to avoid downloading specific files, e.g.:: + + --exclude-from-upload 'results/**' \ + --exclude-from-download 'results/**' + + Your shell's brace expansion can also be used to shorten this, e.g.:: + + --exclude-from-{up,down}load='results/**' + + Besides basic glob features like single-part wildcards (``*``), + character classes (``[…]``), and brace expansion (``{…, …}``), + several advanced globbing features are also supported: multi-part + wildcards (``**``), extended globbing (``@(…)``, ``+(…)``, etc.), + and negation (``!…``). + + + + .. option:: --no-logs Do not show the log messages of the remote build. Currently only supported when also using :option:`--aws-batch`. Default is to show all log messages, even when attaching to a completed build. diff --git a/nextstrain/cli/command/build.py b/nextstrain/cli/command/build.py index 1c489304..3f1461e8 100644 --- a/nextstrain/cli/command/build.py +++ b/nextstrain/cli/command/build.py @@ -117,6 +117,64 @@ def register_parser(subparser): dest = "download", action = "store_false") + parser.add_argument( + "--exclude-from-download", + metavar = "", + help = dedent(f"""\ + Exclude files matching ```` from being downloaded from + the remote build. Equivalent to passing a negated pattern to + :option:`--download`. That is, the following are equivalent:: + + --exclude-from-download 'xyz' + --download '!xyz' + + Refer to :option:`--download` for usage details, but note that this + option doesn't support already-negated patterns (e.g. ``!…`` or + ``!(…)``). + + This option exists to parallel :option:`--exclude-from-upload`. + + {SKIP_AUTO_DEFAULT_IN_HELP} + """), + dest = "download", + type = lambda pattern: f"!{pattern}", + action = AppendOverwriteDefault) + + parser.add_argument( + "--exclude-from-upload", + metavar = "", + help = dedent(f"""\ + Exclude files matching ```` from being uploaded as part of + the remote build. Shell-style advanced globbing is supported, but + be sure to escape wildcards or quote the whole pattern so your + shell doesn't expand them. May be passed more than once. + Currently only supported when also using :option:`--aws-batch`. + Default is to upload the entire pathogen build directory (except + for some ancillary files which are always excluded). + + Note that files excluded from upload may still be downloaded from + the remote build, e.g. if they're created by it, and if downloaded + will overwrite the local files. Use :option:`--no-download` to + avoid downloading any files, or :option:`--exclude-from-download` + to avoid downloading specific files, e.g.:: + + --exclude-from-upload 'results/**' \\ + --exclude-from-download 'results/**' + + Your shell's brace expansion can also be used to shorten this, e.g.:: + + --exclude-from-{{up,down}}load='results/**' + + Besides basic glob features like single-part wildcards (``*``), + character classes (``[…]``), and brace expansion (``{{…, …}}``), + several advanced globbing features are also supported: multi-part + wildcards (``**``), extended globbing (``@(…)``, ``+(…)``, etc.), + and negation (``!…``). + + {SKIP_AUTO_DEFAULT_IN_HELP} + """), + action = "append") + # A --logs option doesn't make much sense right now for most of our # runtimes, but I can see how it might in the future. So we're ready if # that future comes to pass, set up --no-logs as if there's a --logs option diff --git a/nextstrain/cli/runner/aws_batch/__init__.py b/nextstrain/cli/runner/aws_batch/__init__.py index 78ad13a4..eac5bfb4 100644 --- a/nextstrain/cli/runner/aws_batch/__init__.py +++ b/nextstrain/cli/runner/aws_batch/__init__.py @@ -213,7 +213,7 @@ def run(opts, argv, working_volume = None, extra_env: Env = {}, cpus: int = None print_stage("Uploading %s to S3" % local_workdir) bucket = s3.bucket(opts.s3_bucket) - remote_workdir = s3.upload_workdir(local_workdir, bucket, run_id) + remote_workdir = s3.upload_workdir(local_workdir, bucket, run_id, opts.exclude_from_upload) print("uploaded:", s3.object_url(remote_workdir)) diff --git a/nextstrain/cli/runner/aws_batch/s3.py b/nextstrain/cli/runner/aws_batch/s3.py index 6aa7331a..e4338b7a 100644 --- a/nextstrain/cli/runner/aws_batch/s3.py +++ b/nextstrain/cli/runner/aws_batch/s3.py @@ -38,17 +38,20 @@ def object_from_url(s3url: str) -> S3Object: return bucket(url.netloc).Object(key) -def upload_workdir(workdir: Path, bucket: S3Bucket, run_id: str) -> S3Object: +def upload_workdir(workdir: Path, bucket: S3Bucket, run_id: str, patterns: List[str] = None) -> S3Object: """ Upload a ZIP archive of the local *workdir* to the remote S3 *bucket* for the given *run_id*. + An optional list of *patterns* (shell-style advanced globs) can be passed + to selectively exclude part of the local *workdir* from being uploaded. + Returns the S3.Object instance of the uploaded archive. """ remote_workdir = bucket.Object(run_id + ".zip") - excluded = path_matcher([ + always_excluded = path_matcher([ # Jobs don't use .git, so save the bandwidth/space/time. It may also # contain information in history that shouldn't be uploaded. ".git/", @@ -65,6 +68,13 @@ def upload_workdir(workdir: Path, bucket: S3Bucket, run_id: str) -> S3Object: "__pycache__/", ]) + if patterns: + deselected = glob_matcher(patterns) + else: + deselected = lambda path: False + + excluded = lambda path: always_excluded(path) or deselected(path) + # Stream writes directly to the remote ZIP file remote_file: Any with fsspec.open(object_url(remote_workdir), "wb", auto_mkdir = False) as remote_file: @@ -86,6 +96,15 @@ def download_workdir(remote_workdir: S3Object, workdir: Path, patterns: List[str to selectively download only part of the remote workdir. """ + # XXX TODO: Consider extending excluded patterns with the globs from any + # --exclude-from-upload options given, so that files excluded from upload + # are by default also excluded from download. That behaviour would seems + # sometimes useful but other times confusing. See also my discussion with + # Trevor and James about this.¹ + # -trs, 23 May 2024 + # + # ¹ + excluded = path_matcher([ # Jobs don't use .git and it may also contain information that # shouldn't be uploaded. diff --git a/nextstrain/cli/util.py b/nextstrain/cli/util.py index 7ffc65e3..d020e320 100644 --- a/nextstrain/cli/util.py +++ b/nextstrain/cli/util.py @@ -13,7 +13,7 @@ from shlex import quote as shquote from shutil import which from textwrap import dedent, indent -from wcmatch.glob import globmatch, GLOBSTAR, EXTGLOB, BRACE, MATCHBASE, NEGATE +from wcmatch.glob import globmatch, GLOBSTAR, EXTGLOB, BRACE, MATCHBASE, NEGATE, NEGATEALL from .__version__ import __version__ from .debug import debug from .types import RunnerModule, RunnerTestResults, RunnerTestResultStatus @@ -559,10 +559,12 @@ def glob_match(path: Union[str, Path], patterns: Union[str, Sequence[str]]) -> b globbing features are also supported: multi-part wildcards (``**``), extended globbing (``@(…)``, ``+(…)``, etc.), basename matching for patterns containing only a single path part, and negation (``!…``). + Passing only negated patterns is explicitly supported and will match + anything that doesn't match the given patterns. Implemented with with :func:`wcmatch.glob.globmatch`. """ - return globmatch(path, patterns, flags = GLOBSTAR | BRACE | EXTGLOB | MATCHBASE | NEGATE) + return globmatch(path, patterns, flags = GLOBSTAR | BRACE | EXTGLOB | MATCHBASE | NEGATE | NEGATEALL) def runner_tests_ok(tests: RunnerTestResults) -> bool: diff --git a/tests/command-build.py b/tests/command-build.py new file mode 100644 index 00000000..78d4ff86 --- /dev/null +++ b/tests/command-build.py @@ -0,0 +1,29 @@ +from nextstrain.cli import make_parser + + +def pytest_build_download_options(): + parser = make_parser() + + opts = parser.parse_args(["build", "."]) + assert opts.download is True + + opts = parser.parse_args(["build", "--no-download", "."]) + assert opts.download is False + + opts = parser.parse_args(["build", "--download", "x", "."]) + assert opts.download == ["x"] + + opts = parser.parse_args(["build", "--download", "x", "--download", "y", "."]) + assert opts.download == ["x", "y"] + + opts = parser.parse_args(["build", "--exclude-from-download", "z", "."]) + assert opts.download == ["!z"] + + opts = parser.parse_args(["build", "--exclude-from-download", "z", "--exclude-from-download", "a", "."]) + assert opts.download == ["!z", "!a"] + + opts = parser.parse_args(["build", "--download", "y", "--exclude-from-download", "z", "."]) + assert opts.download == ["y", "!z"] + + opts = parser.parse_args(["build", "--download", "y", "--download", "!z", "."]) + assert opts.download == ["y", "!z"]