-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathget_dms_data.py
87 lines (77 loc) · 2.66 KB
/
get_dms_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import io
import sys
import urllib
import Bio.SeqIO
import pandas as pd
sys.stdout = sys.stderr = open(snakemake.log[0], "w")
# get DMS protein, removing trailing stop codon if there is one
with urllib.request.urlopen(snakemake.params.prot) as response:
fasta_content = response.read().decode("utf-8")
fasta_io = io.StringIO(fasta_content)
prot = Bio.SeqIO.read(fasta_io, "fasta")
protseq = str(prot.seq)
if prot[-1] == "*":
prot = prot[: -1]
Bio.SeqIO.write([prot], snakemake.output.prot, "fasta")
# get the site numbering map and add the wildtype
site_numbering_map = pd.read_csv(snakemake.params.site_numbering_map)
assert "wildtype" not in site_numbering_map.columns
assert "sequential_site" in site_numbering_map.columns
site_numbering_map["wildtype"] = site_numbering_map["sequential_site"].map(
lambda r: protseq[r - 1]
)
site_numbering_map.to_csv(snakemake.output.site_numbering_map, index=False)
# Get the phenotypes
phenotypes = (
pd.read_csv(snakemake.params.phenotypes)
.merge(
(
pd.read_csv(snakemake.params.escape_by_species)
.assign(serum=lambda x: x["antibody"] + "_sera_escape")
.pivot_table(
index=["site", "wildtype", "mutant"],
values="escape",
columns="serum",
)
.reset_index()
),
on=["site", "wildtype", "mutant"],
validate="one_to_one",
how="outer",
)
.rename(
columns={
"site": "mature_H3_site",
"SA26 usage increase": "SA26_usage_increase",
"entry in 293T cells": "entry_in_293T_cells"
}
)
[
[f"{scheme}_site" for scheme in snakemake.params.site_numbering_schemes]
+ [
"wildtype",
"mutant",
"entry_in_293T_cells",
"stability",
"SA26_usage_increase",
"ferret_sera_escape",
"mouse_sera_escape",
]
]
.assign(
stability_increase=lambda x: x["stability"].clip(lower=0),
ferret_sera_escape_increase=lambda x: x["ferret_sera_escape"].clip(lower=0),
mouse_sera_escape_increase=lambda x: x["mouse_sera_escape"].clip(lower=0),
)
.sort_values(["sequential_site", "mutant"])
)
assert len(phenotypes) == len(phenotypes.groupby(["sequential_site", "mutant"]))
# make sure wildtypes in protein match those in phenotypes
wts = phenotypes.set_index("sequential_site")["wildtype"].to_dict()
wts_prot = dict(enumerate(str(prot.seq), start=1))
assert all(aa == wts_prot[i] for (i, aa) in wts.items())
phenotypes.to_csv(
snakemake.output.phenotypes,
index=False,
float_format=f"%.{snakemake.params.decimal_scale}f",
)