diff --git a/freyja/_cli.py b/freyja/_cli.py index e4bf335..230f7ba 100644 --- a/freyja/_cli.py +++ b/freyja/_cli.py @@ -82,10 +82,15 @@ def print_barcode_version(ctx, param, value): help='Pathogen of interest.' + 'Not used if using --barcodes option.', show_default=True) +@click.option('--autoadapt', default=False, + is_flag=True, + help='use error profile to set adapt', + show_default=True) def demix(variants, depths, output, eps, barcodes, meta, covcut, confirmedonly, depthcutoff, lineageyml, adapt, a_eps, region_of_interest, - relaxedmrca, relaxedthresh, solver, pathogen): + relaxedmrca, relaxedthresh, solver, pathogen, + autoadapt): """ Generate relative lineage abundances from VARIANTS and DEPTHS """ @@ -110,9 +115,9 @@ def demix(variants, depths, output, eps, barcodes, meta, # drop intra-lineage diversity naming (keeps separate barcodes) indexSimplified = [dfi.split('_')[0] for dfi in df_barcodes.index] df_barcodes = df_barcodes.loc[indexSimplified, :] - df_depth = pd.read_csv(depths, sep='\t', header=None, index_col=1) if depthcutoff != 0: + df_depth = pd.read_csv(depths, sep='\t', header=None, index_col=1) df_barcodes = collapse_barcodes(df_barcodes, df_depth, depthcutoff, lineageyml, locDir, output, relaxedmrca, relaxedthresh, @@ -121,11 +126,20 @@ def demix(variants, depths, output, eps, barcodes, meta, mapDict = buildLineageMap(meta) print('building mix/depth matrices') # assemble data from (possibly) mixed samples - mix, depths_, cov = build_mix_and_depth_arrays(variants, depths, muts, - covcut) - print('demixing') + if autoadapt: + mix, depths_, cov, adapt = build_mix_and_depth_arrays(variants, + depths, + muts, + covcut, + autoadapt) + else: + mix, depths_, cov, _ = build_mix_and_depth_arrays(variants, + depths, + muts, + covcut, + autoadapt) df_barcodes, mix, depths_ = reindex_dfs(df_barcodes, mix, depths_) - + print('demixing') try: sample_strains, abundances, error = solve_demixing_problem(df_barcodes, mix, @@ -507,10 +521,14 @@ def variants(bamfile, ref, variants, depths, refname, minq, annot, varthresh): help='Pathogen of interest.' + 'Not used if using --barcodes option.', show_default=True) +@click.option('--autoadapt', default=False, + is_flag=True, + help='use error profile to set adapt', + show_default=True) def boot(variants, depths, output_base, eps, barcodes, meta, nb, nt, boxplot, confirmedonly, lineageyml, depthcutoff, rawboots, relaxedmrca, relaxedthresh, bootseed, - solver, pathogen): + solver, pathogen, autoadapt): """ Perform bootstrapping method for freyja using VARIANTS and DEPTHS """ @@ -546,8 +564,11 @@ def boot(variants, depths, output_base, eps, barcodes, meta, print('building mix/depth matrices') # assemble data from (possibly) mixed samples covcut = 10 # set value, coverage estimate not returned to user from boot - mix, depths_, cov = build_mix_and_depth_arrays(variants, depths, muts, - covcut) + mix, depths_, cov, adapt = build_mix_and_depth_arrays(variants, + depths, + muts, + covcut, + autoadapt) print('demixing') df_barcodes, mix, depths_ = reindex_dfs(df_barcodes, mix, depths_) lin_df, constell_df = perform_bootstrap(df_barcodes, mix, depths_, diff --git a/freyja/sample_deconv.py b/freyja/sample_deconv.py index 1e1f90c..52a67a4 100644 --- a/freyja/sample_deconv.py +++ b/freyja/sample_deconv.py @@ -58,15 +58,31 @@ def buildLineageMap(locDir): return mapDict -def build_mix_and_depth_arrays(fn, depthFn, muts, covcut): +def get_error_rate(muts, df0, df_depths0): + from freyja.convert_paths2barcodes import sortFun + sites = set([sortFun(m) for m in muts]) + df_depths0 = df_depths0[~df_depths0.index.isin(sites)] + df0 = df0[~df0['POS'].isin(sites)] + df0 = df0[df0['ALT_FREQ'] > 0] + error_rate = df0['ALT_FREQ'].median() + return error_rate + + +def build_mix_and_depth_arrays(fn, depthFn, muts, covcut, autoadapt): input_is_vcf = fn.lower().endswith('vcf') if input_is_vcf: df = read_snv_frequencies_vcf(fn, depthFn, muts) else: df = read_snv_frequencies_ivar(fn, depthFn, muts) + df = df[['REF', 'POS', 'ALT', 'ALT_FREQ', 'ALT_DP']] # only works for substitutions, but that's what we get from usher tree - df_depth = pd.read_csv(depthFn, sep='\t', header=None, index_col=1) + df_depth = pd.read_csv(depthFn, sep='\t', header=None, index_col=1)[[3]] + + if autoadapt: + adapt = get_error_rate(muts, df, df_depth) + else: + adapt = 0 df['mutName'] = df['REF'] + df['POS'].astype(str) + df['ALT'] df = df.drop_duplicates(subset='mutName') @@ -77,7 +93,7 @@ def build_mix_and_depth_arrays(fn, depthFn, muts, covcut): depths = pd.Series({kI: df_depth.loc[int(re.findall(r'\d+', kI)[0]), 3] .astype(float) for kI in muts}, name=fn) coverage = 100.*np.sum(df_depth.loc[:, 3] >= covcut)/df_depth.shape[0] - return mix, depths, coverage + return mix, depths, coverage, adapt def read_snv_frequencies_ivar(fn, depthFn, muts): diff --git a/freyja/tests/test_deconv.py b/freyja/tests/test_deconv.py index c89d46b..97b0995 100644 --- a/freyja/tests/test_deconv.py +++ b/freyja/tests/test_deconv.py @@ -21,8 +21,10 @@ def test_build_mix_and_depth_plus_reindex(self): varFn = 'freyja/data/mixture.tsv' depthFn = 'freyja/data/mixture.depth' covcut = 10 - mix, depths, cov = build_mix_and_depth_arrays(varFn, depthFn, muts, - covcut) + autoadapt = False + mix, depths, cov, adapt = build_mix_and_depth_arrays(varFn, depthFn, + muts, covcut, + autoadapt) # just making sure the files we've read in are ready for use self.assertTrue(ptypes.is_float_dtype(mix)) self.assertTrue(ptypes.is_float_dtype(depths))