From 19c685f7c4b265dbabe5b71ff50c67a55b18db4e Mon Sep 17 00:00:00 2001 From: Victor Lin <13424970+victorlin@users.noreply.github.com> Date: Wed, 3 Jul 2024 17:52:22 -0700 Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A7=20add=20--output-group-by-missing-?= =?UTF-8?q?weights?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- augur/filter/__init__.py | 6 ++++++ augur/filter/_run.py | 1 + augur/filter/subsample.py | 16 ++++++++------- .../filter/cram/subsample-weighted.t | 20 ++++++++++++++++--- 4 files changed, 33 insertions(+), 10 deletions(-) diff --git a/augur/filter/__init__.py b/augur/filter/__init__.py index 7b221a8c3..dd4b9a0c6 100644 --- a/augur/filter/__init__.py +++ b/augur/filter/__init__.py @@ -92,6 +92,12 @@ def register_arguments(parser): output_group.add_argument('--output-metadata', help="metadata for strains that passed filters") output_group.add_argument('--output-strains', help="list of strains that passed filters (no header)") output_group.add_argument('--output-log', help="tab-delimited file with one row for each filtered strain and the reason it was filtered. Keyword arguments used for a given filter are reported in JSON format in a `kwargs` column.") + output_group.add_argument('--output-group-by-missing-weights', type=str, metavar="FILE", help=""" + TSV file formatted for --group-by-weights with an empty weight column. + Represents groups that had sequences in the metadata but were dropped + due to weights missing from the file given to --group-by-weights. If + this is not set, augur filter will error upon missing weights. + """) # FIXME: 2 options: # (1) add validation to ensure this is only added with --group-by-weights # (2) implement this for all sampling methods diff --git a/augur/filter/_run.py b/augur/filter/_run.py index e8a19a00b..b7ea036de 100644 --- a/augur/filter/_run.py +++ b/augur/filter/_run.py @@ -286,6 +286,7 @@ def run(args): group_by, args.group_by_weights, args.subsample_max_sequences, + args.output_group_by_missing_weights, args.output_group_by_sizes, args.subsample_seed, ) diff --git a/augur/filter/subsample.py b/augur/filter/subsample.py index 4ffb21ed2..8bfc08080 100644 --- a/augur/filter/subsample.py +++ b/augur/filter/subsample.py @@ -299,7 +299,7 @@ def get_probabilistic_group_sizes(groups, target_group_size, random_seed=None): return max_sizes_per_group -def get_weighted_group_sizes(groups, group_by, weights_file, target_total_size, output_sizes_file, random_seed): +def get_weighted_group_sizes(groups, group_by, weights_file, target_total_size, output_missing_weights, output_sizes_file, random_seed): """Return group sizes based on weights defined in ``weights_file``. Returns @@ -363,12 +363,14 @@ def get_weighted_group_sizes(groups, group_by, weights_file, target_total_size, missing_groups = set(groups) - set(weights[group_by].apply(tuple, axis=1)) if missing_groups: n_missing = len(missing_groups) - print_err(f"WARNING: {n_missing} groups appear in the metadata but are missing from the weights file. Sequences from these groups will be dropped.") - missing_weights = pd.DataFrame(sorted(missing_groups), columns=group_by) - missing_weights[WEIGHTS_COLUMN] = '' - missing_weights_file = 'missing_weights.tsv' - missing_weights.to_csv(missing_weights_file, index=False, sep='\t') - print_err(f"All missing groups added to a file {missing_weights_file!r}.") + if not output_missing_weights: + raise AugurError(f"{n_missing} groups appear in the metadata but are missing from the weights file. Re-run with --output-group-by-missing-weights to continue.") + else: + print_err(f"WARNING: {n_missing} groups appear in the metadata but are missing from the weights file. Sequences from these groups will be dropped.") + missing_weights = pd.DataFrame(sorted(missing_groups), columns=group_by) + missing_weights[WEIGHTS_COLUMN] = '' + missing_weights.to_csv(output_missing_weights, index=False, sep='\t') + print_err(f"All missing groups added to {output_missing_weights!r}.") # Calculate maximum group sizes based on weights SIZE_COLUMN_FLOAT = '_augur_filter_target_size_float' diff --git a/tests/functional/filter/cram/subsample-weighted.t b/tests/functional/filter/cram/subsample-weighted.t index 1bdfc0d6f..5fa0ed7f6 100644 --- a/tests/functional/filter/cram/subsample-weighted.t +++ b/tests/functional/filter/cram/subsample-weighted.t @@ -72,7 +72,7 @@ month) should work. 2 were dropped because of subsampling criteria 6 strains passed all filters -Sampling with incomplete weights should show a warning. +Sampling with incomplete weights should show an error. $ cat >weights.tsv <<~~ > location weight @@ -87,13 +87,27 @@ Sampling with incomplete weights should show a warning. > --subsample-seed 0 \ > --output-strains strains.txt Sampling with weights defined by weights.tsv. + ERROR: 1 groups appear in the metadata but are missing from the weights file. Re-run with --output-group-by-missing-weights to continue. + [2] + +Re-running with --output-group-by-missing-weights shows a warning and a file to use for fixing. + + $ ${AUGUR} filter \ + > --metadata metadata.tsv \ + > --group-by location \ + > --group-by-weights weights.tsv \ + > --subsample-max-sequences 6 \ + > --subsample-seed 0 \ + > --output-group-by-missing-weights missing-weights.tsv \ + > --output-strains strains.txt + Sampling with weights defined by weights.tsv. WARNING: 1 groups appear in the metadata but are missing from the weights file. Sequences from these groups will be dropped. - All missing groups added to a file 'missing_weights.tsv'. + All missing groups added to 'missing-weights.tsv'. 4 strains were dropped during filtering 4 were dropped because of subsampling criteria 4 strains passed all filters - $ cat missing_weights.tsv + $ cat missing-weights.tsv location weight B