######## snakemake preamble start (automatically inserted, do not edit) ########
import sys; sys.path.extend(['/home/ckikawa/.conda/envs/ZIKV_DMS_NS5_EvansLab/lib/python3.8/site-packages', '/fh/fast/bloom_j/computational_notebooks/ckikawa/2022/ZIKV_DMS_NS5_EvansLab']); import pickle; snakemake = pickle.loads(b'\x80\x04\x95\x9e\x07\x00\x00\x00\x00\x00\x00\x8c\x10snakemake.script\x94\x8c\tSnakemake\x94\x93\x94)\x81\x94}\x94(\x8c\x05input\x94\x8c\x0csnakemake.io\x94\x8c\nInputFiles\x94\x93\x94)\x81\x94(\x8c\x1adata/tile_5_amplicon.fasta\x94\x8c&data/tile_5_subamplicon_alignspecs.txt\x94\x8c\x1adata/tile_5_samplelist.csv\x94e}\x94(\x8c\x06_names\x94}\x94(\x8c\x08amplicon\x94K\x00N\x86\x94\x8c\nalignspecs\x94K\x01N\x86\x94\x8c\nsamplelist\x94K\x02N\x86\x94u\x8c\x12_allowed_overrides\x94]\x94(\x8c\x05index\x94\x8c\x04sort\x94eh\x18\x8c\tfunctools\x94\x8c\x07partial\x94\x93\x94h\x06\x8c\x19Namedlist._used_attribute\x94\x93\x94\x85\x94R\x94(h\x1e)}\x94\x8c\x05_name\x94h\x18sNt\x94bh\x19h\x1ch\x1e\x85\x94R\x94(h\x1e)}\x94h"h\x19sNt\x94bh\x10h\nh\x12h\x0bh\x14h\x0cub\x8c\x06output\x94h\x06\x8c\x0bOutputFiles\x94\x93\x94)\x81\x94(\x8c\x0eresults/tile_5\x94\x8c results/tile_5/dms_view/data.csv\x94e}\x94(h\x0e}\x94(\x8c\nresultsdir\x94K\x00N\x86\x94\x8c\x08dms_view\x94K\x01N\x86\x94uh\x16]\x94(h\x18h\x19eh\x18h\x1ch\x1e\x85\x94R\x94(h\x1e)}\x94h"h\x18sNt\x94bh\x19h\x1ch\x1e\x85\x94R\x94(h\x1e)}\x94h"h\x19sNt\x94bh0h,h2h-ub\x8c\x06params\x94h\x06\x8c\x06Params\x94\x93\x94)\x81\x94(\x8c\x11wt-plasmid-210921\x94M\xc2\x01e}\x94(h\x0e}\x94(\x8c\x06errpre\x94K\x00N\x86\x94\x8c\x12site_number_offset\x94K\x01N\x86\x94uh\x16]\x94(h\x18h\x19eh\x18h\x1ch\x1e\x85\x94R\x94(h\x1e)}\x94h"h\x18sNt\x94bh\x19h\x1ch\x1e\x85\x94R\x94(h\x1e)}\x94h"h\x19sNt\x94bhDhAhFM\xc2\x01ub\x8c\twildcards\x94h\x06\x8c\tWildcards\x94\x93\x94)\x81\x94\x8c\x06tile_5\x94a}\x94(h\x0e}\x94\x8c\x04tile\x94K\x00N\x86\x94sh\x16]\x94(h\x18h\x19eh\x18h\x1ch\x1e\x85\x94R\x94(h\x1e)}\x94h"h\x18sNt\x94bh\x19h\x1ch\x1e\x85\x94R\x94(h\x1e)}\x94h"h\x19sNt\x94b\x8c\x04tile\x94hUub\x8c\x07threads\x94K\x10\x8c\tresources\x94h\x06\x8c\tResources\x94\x93\x94)\x81\x94(K\x10K\x01\x8c\x14/loc/scratch/1544219\x94e}\x94(h\x0e}\x94(\x8c\x06_cores\x94K\x00N\x86\x94\x8c\x06_nodes\x94K\x01N\x86\x94\x8c\x06tmpdir\x94K\x02N\x86\x94uh\x16]\x94(h\x18h\x19eh\x18h\x1ch\x1e\x85\x94R\x94(h\x1e)}\x94h"h\x18sNt\x94bh\x19h\x1ch\x1e\x85\x94R\x94(h\x1e)}\x94h"h\x19sNt\x94bhlK\x10hnK\x01hphiub\x8c\x03log\x94h\x06\x8c\x03Log\x94\x93\x94)\x81\x94\x8c+results/notebooks/dms_tile_5_analysis.ipynb\x94a}\x94(h\x0e}\x94\x8c\x08notebook\x94K\x00N\x86\x94sh\x16]\x94(h\x18h\x19eh\x18h\x1ch\x1e\x85\x94R\x94(h\x1e)}\x94h"h\x18sNt\x94bh\x19h\x1ch\x1e\x85\x94R\x94(h\x1e)}\x94h"h\x19sNt\x94bh\x82h\x7fub\x8c\x06config\x94}\x94(\x8c\x08max_cpus\x94KH\x8c\x05tiles\x94}\x94(\x8c\x06tile_1\x94}\x94(\x8c\x06errpre\x94\x8c\x11wt-plasmid-201112\x94\x8c\x12site_number_offset\x94J\xff\xff\xff\xffu\x8c\x06tile_2\x94}\x94(\x8c\x06errpre\x94\x8c\x11wt-plasmid-210528\x94\x8c\x12site_number_offset\x94Knu\x8c\x06tile_3\x94}\x94(\x8c\x06errpre\x94\x8c\x11wt-plasmid-210921\x94\x8c\x12site_number_offset\x94K\xe0u\x8c\x06tile_4\x94}\x94(\x8c\x06errpre\x94\x8c\x11wt-plasmid-210921\x94\x8c\x12site_number_offset\x94MQ\x01u\x8c\x06tile_5\x94}\x94(\x8c\x06errpre\x94hA\x8c\x12site_number_offset\x94M\xc2\x01u\x8c\x06tile_6\x94}\x94(\x8c\x06errpre\x94\x8c\x11wt-plasmid-210921\x94\x8c\x12site_number_offset\x94M3\x02u\x8c\x06tile_7\x94}\x94(\x8c\x06errpre\x94\x8c\x11wt-plasmid-210702\x94\x8c\x12site_number_offset\x94M\xa4\x02u\x8c\x06tile_8\x94}\x94(\x8c\x06errpre\x94\x8c\x11wt-plasmid-210702\x94\x8c\x12site_number_offset\x94M\x15\x03uuu\x8c\x04rule\x94\x8c\x11dms_tile_analysis\x94\x8c\x0fbench_iteration\x94N\x8c\tscriptdir\x94\x8cK/fh/fast/bloom_j/computational_notebooks/ckikawa/2022/ZIKV_DMS_NS5_EvansLab\x94ub.'); from snakemake.logging import logger; logger.printshellcmds = False; import os; os.chdir(r'/fh/fast/bloom_j/computational_notebooks/ckikawa/2022/ZIKV_DMS_NS5_EvansLab');
######## snakemake preamble end #########

Deep mutational scanning of ZIKV E protein NS5

Mutational antigenic profiling of ZIKV E from the MR766 strain. Experiments performed by Blake Richardson and Matt Evans. Analysis by Jesse Bloom, Caroline Kikawa, David Bacsik, Allie Greaney.

The NS5 mutagensis was performed in "tiles" along the length of the gene.

Set up for analysis

Import Python packages and modules:

import glob
import os
import subprocess
import shutil

import Bio.SeqIO

import dms_tools2
from dms_tools2 import AAS
from dms_tools2.ipython_utils import showPDF
from dms_tools2.plot import COLOR_BLIND_PALETTE_GRAY as CBPALETTE
import dms_tools2.prefs
import dms_tools2.utils
print(f"Using dms_tools2 {dms_tools2.__version__}")

from IPython.display import display, HTML

import pandas as pd

import altair as alt
from plotnine import *

import numpy

import dms_variants.plotnine_themes

Using dms_tools2 2.6.10

Get variables from snakemake:

ncpus = snakemake.threads
refseqfile = snakemake.input.amplicon
samplelist = snakemake.input.samplelist
alignspecsfile = snakemake.input.alignspecs
resultsdir = snakemake.output.resultsdir
errpre = snakemake.params.errpre
site_number_offset = snakemake.params.site_number_offset

Some additional configuration for analysis:

use_existing = 'no' # use existing output

os.makedirs(resultsdir, exist_ok=True)

Read in the wildtype (reference) sequence and its protein translation:

refseqrecord = Bio.SeqIO.read(refseqfile, 'fasta')
refprot = str(refseqrecord.seq.translate())
refseq = str(refseqrecord.seq)

print(f"Read reference sequence of {len(refseq)} nucleotides from {refseqfile} "
      f"that translates to protein of {len(refprot)} amino acids.")

Read reference sequence of 339 nucleotides from data/tile_5_amplicon.fasta that translates to protein of 113 amino acids.

Process deep sequencing data

We process the data from the barcoded subamplicon deep sequencing to count the frequency of each codon in each sample.

First, we read in the samples:

samples = (pd.read_csv(samplelist)
           .assign(name=lambda x: x.library + '-' + x.selection + '-' + x.date.astype(str))
           )

display(HTML(samples.to_html(index=False)))

library	selection	date	R1	SRA_accession	name
lib2	Huh-7.5	210921	/shared/ngs/illumina/bloom_lab/210921_D00300_1326_BHMTYKBCX3/Unaligned/Project_bloom_lab/Tile_5-2_Sample_S20_R1_001.fastq.gz	NaN	lib2-Huh-7.5-210921
lib2	plasmid	210921	/shared/ngs/illumina/bloom_lab/210921_D00300_1326_BHMTYKBCX3/Unaligned/Project_bloom_lab/Tile_5-2_Plasmid_S24_R1_001.fastq.gz	NaN	lib2-plasmid-210921
lib3	plasmid	210921	/shared/ngs/illumina/bloom_lab/210921_D00300_1326_BHMTYKBCX3/Unaligned/Project_bloom_lab/Tile_5-3_Plasmid_S25_R1_001.fastq.gz	NaN	lib3-plasmid-210921
lib1	plasmid	210921	/shared/ngs/illumina/bloom_lab/210921_D00300_1326_BHMTYKBCX3/Unaligned/Project_bloom_lab/Tile_5-1_Plasmid_S23_R1_001.fastq.gz	NaN	lib1-plasmid-210921
lib3	Huh-7.5	210921	/shared/ngs/illumina/bloom_lab/210921_D00300_1326_BHMTYKBCX3/Unaligned/Project_bloom_lab/Tile_5-3_Sample_S21_R1_001.fastq.gz	NaN	lib3-Huh-7.5-210921
lib1	Huh-7.5	210921	/shared/ngs/illumina/bloom_lab/210921_D00300_1326_BHMTYKBCX3/Unaligned/Project_bloom_lab/Tile_5-1_Sample_S19_R1_001.fastq.gz	NaN	lib1-Huh-7.5-210921
wt	Huh-7.5	210921	/shared/ngs/illumina/bloom_lab/210921_D00300_1326_BHMTYKBCX3/Unaligned/Project_bloom_lab/WT_Tile_5_Sample_S18_R1_001.fastq.gz	NaN	wt-Huh-7.5-210921
wt	plasmid	210921	/shared/ngs/illumina/bloom_lab/210921_D00300_1326_BHMTYKBCX3/Unaligned/Project_bloom_lab/WT_Tile_5_Plasmid_S22_R1_001.fastq.gz	NaN	wt-plasmid-210921

Now we read in the alignment specs for the barcoded subamplicon sequencing:

with open(alignspecsfile) as f:
    alignspecs = f.read().strip()
print(alignspecs)

1,339,30,30

Now we use the dms2_batch_bcsubamp program to process the deep sequencing data to obtain codon counts:

countsdir = os.path.join(resultsdir, 'codoncounts')
os.makedirs(countsdir, exist_ok=True)

bcsubamp_batchfile = os.path.join(countsdir, 'batch.csv')
samples[['name', 'R1']].to_csv(bcsubamp_batchfile, index=False)

log = ! dms2_batch_bcsubamp \
        --batchfile {bcsubamp_batchfile} \
        --refseq {refseqfile} \
        --alignspecs {alignspecs} \
        --outdir {countsdir} \
        --summaryprefix summary \
        --R1trim 210 \
        --R2trim 210 \
        --ncpus {ncpus} \
        --use_existing {use_existing}

samples['codoncounts'] = countsdir + '/' + samples['name'] + '_codoncounts.csv'

# check that expected codon counts files created
assert all(map(os.path.isfile, samples.codoncounts)), '\n'.join(log)

print(f"Processed sequencing data to create codon counts files in {countsdir}")

Processed sequencing data to create codon counts files in results/tile_5/codoncounts

Now we look at the plots. They will all have the following prefix:

bcsubamp_plot_prefix = os.path.join(countsdir, 'summary_')
print(f"Plots prefix is {bcsubamp_plot_prefix}")

Plots prefix is results/tile_5/codoncounts/summary_

First, we look at the number of reads and barcodes per sample.

showPDF(bcsubamp_plot_prefix + 'readstats.pdf')
showPDF(bcsubamp_plot_prefix + 'bcstats.pdf')

Next we look at number of reads per barcode.

showPDF(bcsubamp_plot_prefix + 'readsperbc.pdf')

Now we look at the depth across the gene. Note that this is still 1, 2, ... numbering of the reference sequence for this tile alone.

showPDF(bcsubamp_plot_prefix + 'depth.pdf')

Here are the mutation frequencies across the gene. As expected, the library plasmids have higher mutation rates than the wildtype control:

showPDF(bcsubamp_plot_prefix + 'mutfreq.pdf')

Here are the overall per-codon mutation rate averages:

showPDF(bcsubamp_plot_prefix + 'codonmuttypes.pdf')

We have single and multi-nucleotide changes in the libraries, although the single nucleotide changes are perhaps over-represented:

showPDF(bcsubamp_plot_prefix + 'codonntchanges.pdf')

Here are the frequencies of different types of mutations among single-nucleotide codon changes. There is no massive over-representation of any class as would be expected if oxidative damage, which leads to C->A or G->T mutations:

showPDF(bcsubamp_plot_prefix + 'singlentchanges.pdf')

Finally, we look at mutation sampling. We can see that most possible mutations are sampled very well in the plasmid samples, although the overall coverage is still pretty low so some are missed:

showPDF(bcsubamp_plot_prefix + 'cumulmutcounts.pdf')

Now re-number the sites

Above everything is numbered 1, 2, ... for that tile. We want to renumber for the whole gene:

print(f"Renumbering by adding an offset of {site_number_offset}")

Renumbering by adding an offset of 450

Create a directory for the re-numbered codon counts:

renumb_countsdir = os.path.join(resultsdir, 'renumbered_codoncounts')
os.makedirs(renumb_countsdir, exist_ok=True)
print(f"Putting renumbered codon counts in {renumb_countsdir}")

Putting renumbered codon counts in results/tile_5/renumbered_codoncounts

Create a renumbering file:

ncodons = len(refseq)
assert 0 == ncodons % 3, f"invalid {ncodons=}"

renumbfile = os.path.join(renumb_countsdir, 'renumbering.csv')
with open(renumbfile, 'w') as f:
    f.write('original,new\n')
    for orig in range(1, ncodons + 1):
        f.write(f"{orig},{orig + site_number_offset}\n")

Renumber all CSVs:

counts_files = glob.glob(f"{countsdir}/*_codoncounts.csv")
print(f"Renumbering {len(counts_files)} files")

dms_tools2.utils.renumberSites(renumbfile, counts_files, outdir=renumb_countsdir)

Renumbering 8 files

Correct our 'samples' file to include renumb_codoncounts

samples['renumb_codoncounts'] = renumb_countsdir + '/' + samples['name'] + '_codoncounts.csv'

Functional effects of mutations of viral growth

Compute the functional effects of mutations on viral growth by comparing the passaged virus to the original plasmid.

To do this, we compute the amino-acid preferences under selection for viral growth. We do this using dms2_batch_prefs.

First, make a data frame with the batch file:

prefs_batch = (
    samples
    .query('library != "wt"')
    .query('selection != "plasmid"')
    .assign(post=lambda x: x['name'])
    .merge(samples.query('selection == "plasmid"')
                  .assign(pre=lambda x: x['name'])
                  [['library', 'pre']],
           on=['library'], how='left', validate='many_to_one',
           )
    [['name', 'selection', 'library', 'pre', 'post', 'date']]
    .assign(errpre=errpre)
    .merge(samples.query('library == "wt"')
                  .assign(errpost=lambda x: x['name'])
                  [['selection', 'errpost', 'date']],
           on=['selection', 'date'], how='left'
           )
    )
assert prefs_batch.notnull().all().all()

display(prefs_batch)

.dataframe tbody tr th {
    vertical-align: top;
}

.dataframe thead th {
    text-align: right;
}

</style>

	name	selection	library	pre	post	date	errpre	errpost
0	lib2-Huh-7.5-210921	Huh-7.5	lib2	lib2-plasmid-210921	lib2-Huh-7.5-210921	210921	wt-plasmid-210921	wt-Huh-7.5-210921
1	lib3-Huh-7.5-210921	Huh-7.5	lib3	lib3-plasmid-210921	lib3-Huh-7.5-210921	210921	wt-plasmid-210921	wt-Huh-7.5-210921
2	lib1-Huh-7.5-210921	Huh-7.5	lib1	lib1-plasmid-210921	lib1-Huh-7.5-210921	210921	wt-plasmid-210921	wt-Huh-7.5-210921

Now run dms2_batch_prefs:

prefsdir = os.path.join(resultsdir, 'prefs')
os.makedirs(prefsdir, exist_ok=True)

prefs_batchfile = os.path.join(prefsdir, 'batch.csv')
prefs_batch.to_csv(prefs_batchfile, index=False)

log = ! dms2_batch_prefs \
        --indir {renumb_countsdir} \
        --batchfile {prefs_batchfile} \
        --outdir {prefsdir} \
        --summaryprefix summary \
        --method ratio \
        --use_existing {use_existing} \
        --ncpus {ncpus}

assert all(map(os.path.isfile, [os.path.join(prefsdir, name + '_prefs.csv') 
                                for name in prefs_batch.name])), '\n'.join(log)

print("Amino-acid preferences calculated for all samples.")

Amino-acid preferences calculated for all samples.

Look at correlation among the amino-acid preferences for the individual libraries:

showPDF(os.path.join(prefsdir, 'summary_prefscorr.pdf'))

Now let's get the amino-acid preferences for all samples, and for each condition separately:

# file with preferences for all samples
prefs_files = {'all': os.path.join(prefsdir, 'prefs_all.csv')}
pd.read_csv(os.path.join(prefsdir, 'summary_avgprefs.csv')).to_csv(prefs_files['all'],
                                                                   index=False,
                                                                   float_format='%.5f')

# file with preferences for each condition
for selection, df in prefs_batch.groupby('selection'):
    selection_prefsfiles = [os.path.join(prefsdir, f"{name}_prefs.csv") for name in df['name']]
    assert all(map(os.path.isfile, selection_prefsfiles)), selection_prefsfiles
    prefs_files[selection] = os.path.join(prefsdir, f"prefs_{selection}.csv")
    dms_tools2.prefs.avgPrefs(selection_prefsfiles).to_csv(prefs_files[selection],
                                                           index=False,
                                                           float_format='%.5f')
    
print('Average preferences across conditions are in the following files:')
display(HTML(pd.Series(prefs_files).rename('file').to_frame().to_html()))

Average preferences across conditions are in the following files:

	file
all	results/tile_5/prefs/prefs_all.csv
Huh-7.5	results/tile_5/prefs/prefs_Huh-7.5.csv

Now we will make a logo plot of the average of the amino-acid preferences across all samples, and each group of samples. We do this using dms2_logoplot. Note that this logo plot shows the raw unscaled (not re-scaled) preferences. In this plot, the height of each letter is proportional to the "preference" for that amino acid at that site, so taller letters are more preferred at a site. If the site tolerates everything, there will just be lots of small letters as all amino acids equally tolerated:

logodir = os.path.join(resultsdir, 'logoplots')
os.makedirs(logodir, exist_ok=True)

# get wildtype amino acids to use as overlay
wt_aas = pd.DataFrame.from_records(
            [(r + 1 + site_number_offset, a) for r, a in enumerate(refprot) if a != '*'],
            columns=['site', 'wildtype'])
wtoverlayfile = os.path.join(logodir, 'wt_overlay.csv')
wt_aas.to_csv(wtoverlayfile, index=False)

for selection, prefs_csv in prefs_files.items():

    logoplot = os.path.join(logodir, f"{selection}_prefs.pdf")

    log = ! dms2_logoplot \
            --prefs {prefs_csv} \
            --name {selection} \
            --outdir {logodir} \
            --nperline 56 \
            --overlay1 {wtoverlayfile} wildtype wildtype \
            --letterheight 1.2 \
            --use_existing {use_existing}

    assert os.path.isfile(logoplot), '\n'.join(log)

    print(f"\n\nPreferences for {selection} samples:")
    showPDF(logoplot)

Preferences for all samples:

Preferences for Huh-7.5 samples:

We can also represent the effects of mutations in a different way than the amino acid preferences. Specifically, the ratio of the preference for the mutant amino-acid to the wildtype amino-acid is a measure of its enrichment (this is just the ratio of letter heights in the plot above). If we take the log of this mutational effect, negative values indicate deleterious mutations and positive values indicate favorable mutations The potential advantage of this representation is that it better shows the detailed differences between mutations to amino acids with small preferences, which can be useful for figuring out if we think a mutation is just very mildly deleterious or highly deleterious.

Here we calculate the mutational effects and then plot their log2 values on a logo plot.

First, create a subdirectory for these analyses:

muteffectsdir = os.path.join(resultsdir, 'muteffects')
os.makedirs(muteffectsdir, exist_ok=True)

Convert the amino-acid preferences into mutational effects:

# ensure stop codons are not in the character list
if '*' in AAS:
    AAS.remove('*')

# calculate mutational effects 
muteffects_files = {}
for selection, prefs_csv in prefs_files.items():
    muteffects = dms_tools2.prefs.prefsToMutFromWtEffects(
                    prefs=pd.read_csv(prefs_csv),
                    charlist=AAS,
                    wts=wt_aas)
    muteffects_files[selection] = os.path.join(muteffectsdir, f"{selection}_muteffects.csv")
    print(f"Writing mutational effects for {selection} to {muteffects_files[selection]}")
    muteffects.to_csv(muteffects_files[selection], index=False, float_format='%.5g')

Writing mutational effects for all to results/tile_5/muteffects/all_muteffects.csv
Writing mutational effects for Huh-7.5 to results/tile_5/muteffects/Huh-7.5_muteffects.csv

Now make a logo plots showing the mutational effects for all samples, and for each condition. Letters below the line indicate deleterious mutations, and letters above the line indicate beneficial ones. We include a scale bar indicating the fold-enrichment implied by each letter height:

for selection, muteffects_csv in muteffects_files.items():

    logoplot = os.path.join(logodir, f"{selection}_muteffects.pdf")

    log = ! dms2_logoplot \
            --muteffects {muteffects_csv} \
            --name {selection} \
            --outdir {logodir} \
            --nperline 56 \
            --overlay1 {wtoverlayfile} wildtype wildtype \
            --scalebar 6.64 "100-fold change (log scale)" \
            --use_existing {use_existing}

    assert os.path.isfile(logoplot), '\n'.join(log)

    print(f"\n\nMutational effects for {selection} samples:")
    showPDF(logoplot)

Mutational effects for all samples:

Mutational effects for Huh-7.5 samples:

Create `dms-view` input files

Now we create a file to visualize the results of the deep mutational scanning using dms-view, setting up the mapping for the 6WCZ PDB file. In this PDB file, chain A is human STAT2 and chain B is ZIKV NS5.

offset_to_pdb = 0
pdb_chain = 'B'

dms_view_data = pd.DataFrame()

# preferences for all conditions
for condition, csvfile in prefs_files.items():
    prefs = pd.read_csv(csvfile)
    dms_view_data = dms_view_data.append(
        prefs
        .melt(id_vars='site',
              var_name='mutation',
              value_name='mut_preference',
              )
        .merge(dms_tools2.prefs.prefsEntropy(prefs, prefs.columns[1:].tolist())
               [['site', 'entropy', 'neffective']],
               on='site', validate='many_to_one')
        .rename(columns={'entropy': 'site_entropy', 'neffective': 'site_neffective'})
        .assign(condition=condition)
        )

# add PDB information
dms_view_data = dms_view_data.assign(label_site=lambda x: x['site'] + offset_to_pdb,
                                     protein_site=lambda x: x['label_site'],
                                     protein_chain=pdb_chain)

# display and print
dms_view_dir = os.path.join(resultsdir, 'dms_view')
os.makedirs(dms_view_dir, exist_ok=True)
dms_view_csv = os.path.join(dms_view_dir, 'data.csv')
print(f"Writing CSV to {dms_view_csv}; here are first few lines:")
dms_view_data.to_csv(dms_view_csv, index=False, float_format='%.3g')
display(HTML(dms_view_data.head().to_html()))

Writing CSV to results/tile_5/dms_view/data.csv; here are first few lines:

	site	mutation	mut_preference	site_entropy	site_neffective	condition	label_site	protein_site	protein_chain
0	451	A	0.01129	2.039572	7.687319	all	451	451	B
1	451	C	0.50791	2.039572	7.687319	all	451	451	B
2	451	D	0.02563	2.039572	7.687319	all	451	451	B
3	451	E	0.02488	2.039572	7.687319	all	451	451	B
4	451	F	0.02452	2.039572	7.687319	all	451	451	B

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

dms_tile_5_analysis.md

dms_tile_5_analysis.md

Deep mutational scanning of ZIKV E protein NS5

Set up for analysis

Process deep sequencing data

Now re-number the sites

Functional effects of mutations of viral growth

Create `dms-view` input files

Files

dms_tile_5_analysis.md

Latest commit

History

dms_tile_5_analysis.md

File metadata and controls

Deep mutational scanning of ZIKV E protein NS5

Set up for analysis

Process deep sequencing data

Now re-number the sites

Functional effects of mutations of viral growth

Create dms-view input files

Create `dms-view` input files