Skip to content

Commit

Permalink
Clarify behavior for options that use strain ID
Browse files: browse the repository at this point in the history
Add reference to --metadata-id-columns, which affects the behavior of
these options.
  • Loading branch information
victorlin committed Mar 5, 2024
1 parent 14302ac commit 348613b
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 20 deletions.
16 changes: 10 additions & 6 deletions augur/filter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,8 @@ def register_arguments(parser):
"201X-10-01"). Similarly, an ambiguous month makes the
corresponding day ambiguous (e.g., "2010-XX-01").""")
metadata_filter_group.add_argument('--exclude', type=str, nargs="+", metavar="FILE", default=argparse.SUPPRESS,
help="File(s) with list of strains to exclude.")
help="""File(s) with list of strain IDs to exclude. The ID column is
determined by --metadata-id-columns.""")
metadata_filter_group.add_argument('--exclude-where', nargs='+', metavar="CONDITION", default=argparse.SUPPRESS,
help="""Exclude strains matching these conditions. Ex: \"host=rat\" or
\"host!=rat\". Multiple values are processed as OR (matching any of
Expand All @@ -75,8 +76,9 @@ def register_arguments(parser):
help="""Exclude all strains by default. Use this with the include
arguments to select a specific subset of strains.""")
metadata_filter_group.add_argument('--include', type=str, nargs="+", metavar="FILE", default=argparse.SUPPRESS,
help="""File(s) with list of strains to include regardless of
priorities, subsampling, or absence of an entry in --sequences.""")
help="""File(s) with list of strain IDs to include regardless of
priorities, subsampling, or absence of an entry in --sequences. The
ID column is determined by --metadata-id-columns.""")
metadata_filter_group.add_argument('--include-where', nargs='+', metavar="CONDITION", default=argparse.SUPPRESS,
help="""Include strains with these values. ex: host=rat. Multiple values
are processed as OR (having any of those specified will be
Expand Down Expand Up @@ -128,13 +130,14 @@ def register_arguments(parser):

subsample_group.add_argument('--priority', type=str, metavar="FILE", default=argparse.SUPPRESS,
help="""Tab-delimited file with list of priority scores for strains
(e.g., "<strain>\\t<priority>") and no header. When scores are
(e.g., "<strain ID>\\t<priority>") and no header. When scores are
provided, Augur converts scores to floating point values, sorts
strains within each subsampling group from highest to lowest
priority, and selects the top N strains per group where N is the
calculated or requested number of strains per group. Higher numbers
indicate higher priority. Since priorities represent relative
values between strains, these values can be arbitrary.""")
values between strains, these values can be arbitrary. The ID
column is determined by --metadata-id-columns.""")
subsample_group.add_argument('--subsample-seed', type=int, metavar="N", default=argparse.SUPPRESS,
help="""Random number generator seed to allow reproducible subsampling
(with same input data).""")
Expand All @@ -149,7 +152,8 @@ def register_arguments(parser):
output_group.add_argument('--output-metadata', metavar="FILE", default=argparse.SUPPRESS,
help="Metadata for strains that passed filters.")
output_group.add_argument('--output-strains', metavar="FILE", default=argparse.SUPPRESS,
help="List of strains that passed filters (no header).")
help="""List of strain IDs that passed filters (no header). The ID
column is determined by --metadata-id-columns.""")
output_group.add_argument('--output-log', metavar="FILE", default=argparse.SUPPRESS,
help="""Tab-delimited file with one row for each filtered strain and the
reason it was filtered. Keyword arguments used for a given filter
Expand Down
32 changes: 18 additions & 14 deletions tests/functional/filter/cram/filter-help.t
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,8 @@ Show help text
"201X-10-01"). Similarly, an ambiguous month makes the
corresponding day ambiguous (e.g., "2010-XX-01").
--exclude FILE [FILE ...]
File(s) with list of strains to exclude.
File(s) with list of strain IDs to exclude. The ID
column is determined by --metadata-id-columns.
--exclude-where CONDITION [CONDITION ...]
Exclude strains matching these conditions. Ex:
"host=rat" or "host!=rat". Multiple values are
Expand All @@ -102,9 +103,10 @@ Show help text
include arguments to select a specific subset of
strains.
--include FILE [FILE ...]
File(s) with list of strains to include regardless of
priorities, subsampling, or absence of an entry in
--sequences.
File(s) with list of strain IDs to include regardless
of priorities, subsampling, or absence of an entry in
--sequences. The ID column is determined by
--metadata-id-columns.
--include-where CONDITION [CONDITION ...]
Include strains with these values. ex: host=rat.
Multiple values are processed as OR (having any of
Expand Down Expand Up @@ -151,15 +153,16 @@ Show help text
max-sequences` is provided. (default: True)
--no-probabilistic-sampling
--priority FILE Tab-delimited file with list of priority scores for
strains (e.g., "<strain>\t<priority>") and no header.
When scores are provided, Augur converts scores to
floating point values, sorts strains within each
subsampling group from highest to lowest priority, and
selects the top N strains per group where N is the
calculated or requested number of strains per group.
Higher numbers indicate higher priority. Since
priorities represent relative values between strains,
these values can be arbitrary.
strains (e.g., "<strain ID>\t<priority>") and no
header. When scores are provided, Augur converts
scores to floating point values, sorts strains within
each subsampling group from highest to lowest
priority, and selects the top N strains per group
where N is the calculated or requested number of
strains per group. Higher numbers indicate higher
priority. Since priorities represent relative values
between strains, these values can be arbitrary. The ID
column is determined by --metadata-id-columns.
--subsample-seed N Random number generator seed to allow reproducible
subsampling (with same input data).

Expand All @@ -173,7 +176,8 @@ Show help text
--output-metadata FILE
Metadata for strains that passed filters.
--output-strains FILE
List of strains that passed filters (no header).
List of strain IDs that passed filters (no header).
The ID column is determined by --metadata-id-columns.
--output-log FILE Tab-delimited file with one row for each filtered
strain and the reason it was filtered. Keyword
arguments used for a given filter are reported in JSON
Expand Down

0 comments on commit 348613b

Please sign in to comment.