# config.yaml

# config for analysis

# conda environment file
# environment: environment.yml

# NOTE(review): presumably toggles whether the pipeline starts from the raw
# sequencing data -- confirm against the workflow that reads this key.
# Written as the canonical lowercase boolean so every YAML loader agrees.
run_from_ngs: false

# most CPUs to ever use at once
max_cpus: 8

# amplicons sequenced by PacBio
amplicon: data/PacBio_amplicon.gb

# how to parse the amplicon
feature_parse_specs: data/feature_parse_specs.yaml

# do we get the sequencing data from the 'HutchServer' or 'SRA'?
seqdata_source: HutchServer

# list of PacBio sequencing runs linking barcodes to variants
pacbio_runs: data/PacBio_runs.csv

# list of Illumina sequencing runs of barcodes
barcode_runs: data/barcode_runs.csv

# wildtype sequence of mutagenized gene
wildtype_sequence: data/wildtype_sequence.fasta

# table with info on renumbering of sites, heavy/light chain, wt codon, etc.
CGGnaive_site_info: data/CGGnaive_sites.csv

# target for mapping the codon variant table (sort of redundant, but kept in
# the code for when other unmutated targets are spiked into the library)
primary_target: CGG_naive

# cryo-EM refined structure of CGGnaive bound to CGG (two complexes)
pdb: data/IgY-CH2_EH2_final_real_space_refined_021.pdb


# output directories / files

# summary and figures
summary_dir: results/summary
figs_dir: results/figures

# PacBio CCS outputs and the nucleotide / codon variant tables
ccs_dir: results/ccs
process_ccs_dir: results/process_ccs
processed_ccs_file: results/process_ccs/processed_ccs.csv
variants_dir: results/variants
nt_variant_table_file: results/variants/nucleotide_variant_table.csv
codon_variant_table_file: results/variants/codon_variant_table.csv

# variant counts
counts_dir: results/counts
variant_counts_file: results/counts/variant_counts.csv

# aggregated ("prepped") variant and barcode counts
aggregated_counts_dir: results/aggregated_counts
prepped_variant_counts_file: results/aggregated_counts/prepped_variant_counts.csv
prepped_barcode_counts_file: results/aggregated_counts/prepped_barcode_counts.csv

# Sort-seq expression (meanF) results
expression_sortseq_dir: results/expression_meanF
expression_sortseq_file: results/expression_meanF/variant_expression.csv

# Tite-seq binding (Kd) results
Titeseq_Kds_dir: results/binding_Kd
Titeseq_Kds_file: results/binding_Kd/variant_binding.csv

# final per-variant scores
final_variant_scores_dir: results/final_variant_scores
final_variant_scores_mut_file: results/final_variant_scores/final_variant_scores.csv

# structural mapping outputs
structural_mapping_dir: results/structural_mapping

# parameters for running the PacBio `ccs` program
min_ccs_accuracy: 0.999
min_ccs_passes: 3
min_ccs_length: 50
max_ccs_length: 5000

# minimum number of reads, summed across bins, concentrations, and barcodes,
# required to keep a variant
min_Titeseq_reads_per_variant: 100

# minimum number of reads, summed across bins and barcodes, required to keep
# a variant
min_Sortseq_reads_per_variant: 100

# minimum number of supporting barcodes a variant must have for its
# phenotypes to be reported in the final variants table; values are masked
# with NA if a variant falls below this threshold
# (applies separately to binding and expression data)
min_variant_barcode_replicates: 3

# minimum number of library replicates a variant must have for its
# phenotypes to be reported in the final variants table; values are masked
# with NA if a variant falls below this threshold
# (applies separately to binding and expression data)
min_variant_library_replicates: 2


# FACS file pattern for Titeseq-modeling.ipynb
facs_file_pattern: exptl_data/210624_TiteSeq_CGG/FCS_scFvpos/Specimen_001_sample_*

# Concentration series per antigen (units are not stated in this file --
# presumably molar; TODO confirm).
# The mantissas carry an explicit decimal point on purpose: YAML 1.1 loaders
# such as PyYAML read `1e-6` as the *string* '1e-6', whereas `1.0e-6` is
# parsed as a float by both YAML 1.1 and YAML 1.2 loaders.
concentrations:
  CGG: [1.0e-6, 1.0e-7, 1.0e-8, 1.0e-9, 1.0e-10, 1.0e-11, 1.0e-12, 1.0e-13, 0]

# Bin boundaries per antigen.
# We're assuming bins are sequentially numbered, starting at 1, so the n
# boundary values listed here define n-1 bins.
bins:
  CGG: [-288.0, 136.0, 2000.0, 29421.0, 262143.0]

# max error rate in gene / barcode CCSs retained for consensus building
max_error_rate: 0.0001

# Parameters for processing Illumina barcodes, passed to
# `dms_variants.illuminabarcodeparser.IlluminaBarcodeParser`.
# The barcode parser hard-wires this read structure:
#   [R2 binds] - [upstream] - [barcode] - [downstream] - [R1 binds]
# Here the orientation is the opposite, so the "downstream" sequence is
# supplied, reverse complemented, as `upstream`, and `downstream` is left
# empty.
illumina_barcode_parser_params:
  upstream: GGCCGC
  downstream: ''
  minq: 20
  upstream_mismatch: 1
  downstream_mismatch: 0

# Input files for analyses from the data subdirectory
# NOTE(review): this is the same path as `CGGnaive_site_info`; both keys are
# kept because consumers of either one are not visible from this file.
CGGnaive_sites: data/CGGnaive_sites.csv