🚧 add --output-group-by-missing-weights

nextstrain · Jul 4, 2024 · 19c685f · 19c685f
1 parent 2d84b0d
commit 19c685f
Show file tree

Hide file tree

Showing 4 changed files with 33 additions and 10 deletions.
diff --git a/augur/filter/__init__.py b/augur/filter/__init__.py
@@ -92,6 +92,12 @@ def register_arguments(parser):
     output_group.add_argument('--output-metadata', help="metadata for strains that passed filters")
     output_group.add_argument('--output-strains', help="list of strains that passed filters (no header)")
     output_group.add_argument('--output-log', help="tab-delimited file with one row for each filtered strain and the reason it was filtered. Keyword arguments used for a given filter are reported in JSON format in a `kwargs` column.")
+    output_group.add_argument('--output-group-by-missing-weights', type=str, metavar="FILE", help="""
+        TSV file formatted for --group-by-weights with an empty weight column.
+        Represents groups that had sequences in the metadata but were dropped
+        due to weights missing from the file given to --group-by-weights. If
+        this is not set, augur filter will error upon missing weights.
+    """)
     # FIXME: 2 options:
     # (1) add validation to ensure this is only added with --group-by-weights
     # (2) implement this for all sampling methods

diff --git a/augur/filter/_run.py b/augur/filter/_run.py
@@ -286,6 +286,7 @@ def run(args):
                     group_by,
                     args.group_by_weights,
                     args.subsample_max_sequences,
+                    args.output_group_by_missing_weights,
                     args.output_group_by_sizes,
                     args.subsample_seed,
                 )

diff --git a/augur/filter/subsample.py b/augur/filter/subsample.py
@@ -299,7 +299,7 @@ def get_probabilistic_group_sizes(groups, target_group_size, random_seed=None):
     return max_sizes_per_group
 
 
-def get_weighted_group_sizes(groups, group_by, weights_file, target_total_size, output_sizes_file, random_seed):
+def get_weighted_group_sizes(groups, group_by, weights_file, target_total_size, output_missing_weights, output_sizes_file, random_seed):
     """Return group sizes based on weights defined in ``weights_file``.
 
     Returns
@@ -363,12 +363,14 @@ def get_weighted_group_sizes(groups, group_by, weights_file, target_total_size,
     missing_groups = set(groups) - set(weights[group_by].apply(tuple, axis=1))
     if missing_groups:
         n_missing = len(missing_groups)
-        print_err(f"WARNING: {n_missing} groups appear in the metadata but are missing from the weights file. Sequences from these groups will be dropped.")
-        missing_weights = pd.DataFrame(sorted(missing_groups), columns=group_by)
-        missing_weights[WEIGHTS_COLUMN] = ''
-        missing_weights_file = 'missing_weights.tsv'
-        missing_weights.to_csv(missing_weights_file, index=False, sep='\t')
-        print_err(f"All missing groups added to a file {missing_weights_file!r}.")
+        if not output_missing_weights:
+            raise AugurError(f"{n_missing} groups appear in the metadata but are missing from the weights file. Re-run with --output-group-by-missing-weights to continue.")
+        else:
+            print_err(f"WARNING: {n_missing} groups appear in the metadata but are missing from the weights file. Sequences from these groups will be dropped.")
+            missing_weights = pd.DataFrame(sorted(missing_groups), columns=group_by)
+            missing_weights[WEIGHTS_COLUMN] = ''
+            missing_weights.to_csv(output_missing_weights, index=False, sep='\t')
+            print_err(f"All missing groups added to {output_missing_weights!r}.")
 
     # Calculate maximum group sizes based on weights
     SIZE_COLUMN_FLOAT = '_augur_filter_target_size_float'

diff --git a/tests/functional/filter/cram/subsample-weighted.t b/tests/functional/filter/cram/subsample-weighted.t
@@ -72,7 +72,7 @@ month) should work.
   	2 were dropped because of subsampling criteria
   6 strains passed all filters
 
-Sampling with incomplete weights should show a warning.
+Sampling with incomplete weights should show an error.
 
   $ cat >weights.tsv <<~~
   > location	weight
@@ -87,13 +87,27 @@ Sampling with incomplete weights should show a warning.
   >   --subsample-seed 0 \
   >   --output-strains strains.txt
   Sampling with weights defined by weights.tsv.
+  ERROR: 1 groups appear in the metadata but are missing from the weights file. Re-run with --output-group-by-missing-weights to continue.
+  [2]
+
+Re-running with --output-group-by-missing-weights should show a warning and write the missing groups to a file, with an empty weight column to fill in.
+
+  $ ${AUGUR} filter \
+  >   --metadata metadata.tsv \
+  >   --group-by location \
+  >   --group-by-weights weights.tsv \
+  >   --subsample-max-sequences 6 \
+  >   --subsample-seed 0 \
+  >   --output-group-by-missing-weights missing-weights.tsv \
+  >   --output-strains strains.txt
+  Sampling with weights defined by weights.tsv.
   WARNING: 1 groups appear in the metadata but are missing from the weights file. Sequences from these groups will be dropped.
-  All missing groups added to a file 'missing_weights.tsv'.
+  All missing groups added to 'missing-weights.tsv'.
   4 strains were dropped during filtering
   	4 were dropped because of subsampling criteria
   4 strains passed all filters
 
-  $ cat missing_weights.tsv
+  $ cat missing-weights.tsv
   location	weight
   B