Skip to content

Commit

Permalink
🚧 add --output-group-by-missing-weights
Browse files Browse the repository at this point in the history
  • Loading branch information
victorlin committed Jul 4, 2024
1 parent 2d84b0d commit 19c685f
Show file tree
Hide file tree
Showing 4 changed files with 33 additions and 10 deletions.
6 changes: 6 additions & 0 deletions augur/filter/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,12 @@ def register_arguments(parser):
output_group.add_argument('--output-metadata', help="metadata for strains that passed filters")
output_group.add_argument('--output-strains', help="list of strains that passed filters (no header)")
output_group.add_argument('--output-log', help="tab-delimited file with one row for each filtered strain and the reason it was filtered. Keyword arguments used for a given filter are reported in JSON format in a `kwargs` column.")
output_group.add_argument('--output-group-by-missing-weights', type=str, metavar="FILE", help="""
TSV file formatted for --group-by-weights with an empty weight column.
Represents groups that had sequences in the metadata but were dropped
due to weights missing from the file given to --group-by-weights. If
this is not set, augur filter will error upon missing weights.
""")
# FIXME: choose one of two options:
# (1) add validation to ensure --output-group-by-missing-weights is only used together with --group-by-weights
# (2) implement --output-group-by-missing-weights for all sampling methods
Expand Down
1 change: 1 addition & 0 deletions augur/filter/_run.py
Original file line number Diff line number Diff line change
Expand Up @@ -286,6 +286,7 @@ def run(args):
group_by,
args.group_by_weights,
args.subsample_max_sequences,
args.output_group_by_missing_weights,
args.output_group_by_sizes,
args.subsample_seed,
)
Expand Down
16 changes: 9 additions & 7 deletions augur/filter/subsample.py
Original file line number Diff line number Diff line change
Expand Up @@ -299,7 +299,7 @@ def get_probabilistic_group_sizes(groups, target_group_size, random_seed=None):
return max_sizes_per_group


def get_weighted_group_sizes(groups, group_by, weights_file, target_total_size, output_sizes_file, random_seed):
def get_weighted_group_sizes(groups, group_by, weights_file, target_total_size, output_missing_weights, output_sizes_file, random_seed):
"""Return group sizes based on weights defined in ``weights_file``.
Returns
Expand Down Expand Up @@ -363,12 +363,14 @@ def get_weighted_group_sizes(groups, group_by, weights_file, target_total_size,
missing_groups = set(groups) - set(weights[group_by].apply(tuple, axis=1))
if missing_groups:
n_missing = len(missing_groups)
print_err(f"WARNING: {n_missing} groups appear in the metadata but are missing from the weights file. Sequences from these groups will be dropped.")
missing_weights = pd.DataFrame(sorted(missing_groups), columns=group_by)
missing_weights[WEIGHTS_COLUMN] = ''
missing_weights_file = 'missing_weights.tsv'
missing_weights.to_csv(missing_weights_file, index=False, sep='\t')
print_err(f"All missing groups added to a file {missing_weights_file!r}.")
if not output_missing_weights:
raise AugurError(f"{n_missing} groups appear in the metadata but are missing from the weights file. Re-run with --output-group-by-missing-weights to continue.")
else:
print_err(f"WARNING: {n_missing} groups appear in the metadata but are missing from the weights file. Sequences from these groups will be dropped.")
missing_weights = pd.DataFrame(sorted(missing_groups), columns=group_by)
missing_weights[WEIGHTS_COLUMN] = ''
missing_weights.to_csv(output_missing_weights, index=False, sep='\t')
print_err(f"All missing groups added to {output_missing_weights!r}.")

# Calculate maximum group sizes based on weights
SIZE_COLUMN_FLOAT = '_augur_filter_target_size_float'
Expand Down
20 changes: 17 additions & 3 deletions tests/functional/filter/cram/subsample-weighted.t
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ month) should work.
2 were dropped because of subsampling criteria
6 strains passed all filters

Sampling with incomplete weights should show a warning.
Sampling with incomplete weights should show an error.

$ cat >weights.tsv <<~~
> location weight
Expand All @@ -87,13 +87,27 @@ Sampling with incomplete weights should show a warning.
> --subsample-seed 0 \
> --output-strains strains.txt
Sampling with weights defined by weights.tsv.
ERROR: 1 groups appear in the metadata but are missing from the weights file. Re-run with --output-group-by-missing-weights to continue.
[2]

Re-running with --output-group-by-missing-weights should show a warning and write a file listing the groups with missing weights, which can be filled in to fix the weights file.

$ ${AUGUR} filter \
> --metadata metadata.tsv \
> --group-by location \
> --group-by-weights weights.tsv \
> --subsample-max-sequences 6 \
> --subsample-seed 0 \
> --output-group-by-missing-weights missing-weights.tsv \
> --output-strains strains.txt
Sampling with weights defined by weights.tsv.
WARNING: 1 groups appear in the metadata but are missing from the weights file. Sequences from these groups will be dropped.
All missing groups added to a file 'missing_weights.tsv'.
All missing groups added to 'missing-weights.tsv'.
4 strains were dropped during filtering
4 were dropped because of subsampling criteria
4 strains passed all filters

$ cat missing_weights.tsv
$ cat missing-weights.tsv
location weight
B

Expand Down

0 comments on commit 19c685f

Please sign in to comment.