-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathconfig.yaml
executable file
·117 lines (85 loc) · 4.01 KB
/
config.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
# config for analysis

# conda environment file
# environment: environment.yml

# NOTE(review): this flag is undocumented in this file — presumably it toggles
# whether the pipeline starts from the raw sequencing reads; confirm against
# the workflow that reads it.
# Written as canonical lowercase `false` so every YAML parser reads a boolean.
run_from_ngs: false

# most CPUs to ever use at once
max_cpus: 8
# ---- input data -------------------------------------------------------------

# amplicons sequenced by PacBio
amplicon: data/PacBio_amplicon.gb

# how to parse the amplicon
feature_parse_specs: data/feature_parse_specs.yaml

# where the sequencing data comes from: 'HutchServer' or 'SRA'
seqdata_source: HutchServer

# list of PacBio sequencing runs linking barcodes to variants
pacbio_runs: data/PacBio_runs.csv

# list of Illumina sequencing runs of barcodes
barcode_runs: data/barcode_runs.csv

# wildtype sequence of mutagenized gene
wildtype_sequence: data/wildtype_sequence.fasta

# table with info on renumbering of sites, heavy/light chain, wt codon, etc.
CGGnaive_site_info: data/CGGnaive_sites.csv

# target for mapping the codon variant table (somewhat redundant, but kept in
# the code for when other unmutated targets are spiked into the library)
primary_target: CGG_naive

# cryo-EM refined structure of CGGnaive bound to CGG (two complexes)
pdb: data/IgY-CH2_EH2_final_real_space_refined_021.pdb
# ---- output directories / files ---------------------------------------------
# Everything below is written under results/.

# summary and figures
summary_dir: results/summary
figs_dir: results/figures

# CCS outputs
ccs_dir: results/ccs
process_ccs_dir: results/process_ccs
processed_ccs_file: results/process_ccs/processed_ccs.csv

# variant tables
variants_dir: results/variants
nt_variant_table_file: results/variants/nucleotide_variant_table.csv
codon_variant_table_file: results/variants/codon_variant_table.csv

# counts
counts_dir: results/counts
variant_counts_file: results/counts/variant_counts.csv
aggregated_counts_dir: results/aggregated_counts
prepped_variant_counts_file: results/aggregated_counts/prepped_variant_counts.csv
prepped_barcode_counts_file: results/aggregated_counts/prepped_barcode_counts.csv

# expression
expression_sortseq_dir: results/expression_meanF
expression_sortseq_file: results/expression_meanF/variant_expression.csv

# binding
Titeseq_Kds_dir: results/binding_Kd
Titeseq_Kds_file: results/binding_Kd/variant_binding.csv

# final variant scores
final_variant_scores_dir: results/final_variant_scores
final_variant_scores_mut_file: results/final_variant_scores/final_variant_scores.csv

# structural mapping
structural_mapping_dir: results/structural_mapping
# parameters for running PacBio `ccs` program
# (accuracy, pass-count, and length bounds, as the key names indicate)
min_ccs_accuracy: 0.999
min_ccs_passes: 3
min_ccs_length: 50
max_ccs_length: 5000
# Minimum reads, summed across bins, concentrations, and barcodes, required to
# keep a variant
min_Titeseq_reads_per_variant: 100
# Minimum reads, summed across bins and barcodes, required to keep a variant
min_Sortseq_reads_per_variant: 100
# How many supporting barcodes a variant must have for its phenotypes to be
# reported in the final variants table. Values are masked with NA if a variant
# falls below this value. Applies separately to binding and expression data.
min_variant_barcode_replicates: 3
# How many library replicates a variant must have for its phenotypes to be
# reported in the final variants table. Values are masked with NA if a variant
# falls below this value. Applies separately to binding and expression data.
min_variant_library_replicates: 2
# FACS file pattern for Titeseq-modeling.ipynb
# NOTE(review): the trailing `*` is presumably a glob wildcard expanded by the
# notebook — confirm there.
facs_file_pattern: exptl_data/210624_TiteSeq_CGG/FCS_scFvpos/Specimen_001_sample_*
# Concentrations, keyed by name (here only `CGG`).
# Exponent literals carry an explicit decimal point (`1.0e-6`, not `1e-6`):
# YAML 1.1 parsers such as PyYAML load the dot-less form as a string.
concentrations:
  CGG: [1.0e-6, 1.0e-7, 1.0e-8, 1.0e-9, 1.0e-10, 1.0e-11, 1.0e-12, 1.0e-13, 0]

# Bin boundaries, keyed the same way as `concentrations`.
# We're assuming bins are sequentially numbered, starting at 1,
# so if len(facs_boundaries) = n, then there are n-1 bins
# (the five boundaries below give four bins).
bins:
  CGG: [-288., 136., 2000., 29421., 262143.]
# max error rate in gene / barcode CCSs retained for consensus building
max_error_rate: 0.0001
# Parameters for processing Illumina barcodes, assuming this structure:
#   [R2 binds] - [upstream] - [barcode] - [downstream] - [R1 binds]
# This orientation is hard-wired in the barcode parser, but here it is the
# opposite. Therefore, the "downstream" sequence is supplied as the
# reverse-complemented upstream.
# Passed to `dms_variants.illuminabarcodeparser.IlluminaBarcodeParser`.
# The entries must be nested under the key so they load as one mapping.
illumina_barcode_parser_params:
  upstream: GGCCGC
  downstream: ''
  minq: 20
  upstream_mismatch: 1
  downstream_mismatch: 0
# Input files for analyses from the data subdirectory
# NOTE(review): this is the same path already configured under
# `CGGnaive_site_info` earlier in this file — keep the two in sync.
CGGnaive_sites: data/CGGnaive_sites.csv