diff --git a/dockerfiles/sv-pipeline-base/Dockerfile b/dockerfiles/sv-pipeline-base/Dockerfile index 92daae96d..cd4ecaad6 100644 --- a/dockerfiles/sv-pipeline-base/Dockerfile +++ b/dockerfiles/sv-pipeline-base/Dockerfile @@ -49,7 +49,7 @@ ARG CONDA_DEP_TRANSIENT="make git wget" ARG CONDA_DEP="software-properties-common zlib1g-dev libbz2-dev liblzma-dev libcurl4-openssl-dev libssl-dev libblas-dev liblapack-dev libatlas-base-dev g++ gfortran ${CONDA_DEP_TRANSIENT}" # versions of bedtools > 2.27.0 seem to have lost the ability to read gzipped files # pandas 1.0.0 causes problem with bedtools in aggregate.py -ARG PYTHON_PKGS="setuptools=52.0.0 wheel=0.34.2 bzip2=1.0.8 cython=0.29.14 numpy=1.18.1 pandas=0.25.3 scikit-learn=0.22.1 scipy=1.4.1 intervaltree=3.0.2 matplotlib=3.1.3 natsort=7.0.1 bedtools=2.27.0 pybedtools=0.8.1 pysam=0.14.1=py36_htslib1.7_0" +ARG PYTHON_PKGS="pip=21.2.2 setuptools=52.0.0 wheel=0.34.2 bzip2=1.0.8 cython=0.29.14 numpy=1.18.1 pandas=0.25.3 scikit-learn=0.22.1 scipy=1.4.1 intervaltree=3.0.2 matplotlib=3.1.3 natsort=7.0.1 bedtools=2.27.0 pybedtools=0.8.1 pysam=0.14.1=py36_htslib1.7_0" ENV LANG=C.UTF-8 ENV LC_ALL=C.UTF-8 ARG CONDA_INSTALL_DIR="/opt/conda" diff --git a/dockerfiles/sv-pipeline-hail/Dockerfile b/dockerfiles/sv-pipeline-hail/Dockerfile new file mode 100644 index 000000000..5f19f7b4e --- /dev/null +++ b/dockerfiles/sv-pipeline-hail/Dockerfile @@ -0,0 +1,9 @@ +# GATK SV Pipeline Hail dockerfile + +# IMPORTANT: these arguments must be specified at the beginning to take advantage of multi-stage build AND runtime specification of base images +ARG SV_PIPELINE_IMAGE=gatksv/sv-pipeline:latest +FROM ${SV_PIPELINE_IMAGE} + +# Dependencies for creating a Hail cluster on GCP Dataproc +RUN pip3 --no-cache-dir install hail==0.2.71 && \ +    pip3 --no-cache-dir install google-cloud-dataproc diff --git a/dockerfiles/sv-pipeline/Dockerfile b/dockerfiles/sv-pipeline/Dockerfile index 341d80af5..5e64eb469 100644 --- a/dockerfiles/sv-pipeline/Dockerfile +++
b/dockerfiles/sv-pipeline/Dockerfile @@ -48,3 +48,51 @@ RUN apt-get -qqy update --fix-missing && \ /usr/share/man/?? \ /usr/share/man/??_* ENV PATH="/opt/:${PATH}" + +# Compile StitchFragmentedCNVs Java program +ENV STITCH_JAR="/opt/sv-pipeline/java/build/StitchFragmentedCNVs.jar" +RUN cd /opt/sv-pipeline/java && \ + mkdir -p build/classes && \ + javac -d build/classes org/broadinstitute/svpipeline/StitchFragmentedCNVs.java org/broadinstitute/svpipeline/VCFParser.java && \ + jar cfe build/StitchFragmentedCNVs.jar "org.broadinstitute.svpipeline.StitchFragmentedCNVs" -C build/classes . && \ + rm -r build/classes + +# Compile StitchFragmentedCNVs unit tests +ENV STITCH_UNIT_TEST_JAR="/opt/sv-pipeline/java/build/StitchFragmentedCNVsUnitTest.jar" +RUN cd /opt/sv-pipeline/java && \ + mkdir -p build/classes && \ + javac -d build/classes org/broadinstitute/svpipeline/StitchFragmentedCNVsUnitTest.java org/broadinstitute/svpipeline/StitchFragmentedCNVs.java org/broadinstitute/svpipeline/VCFParser.java && \ + jar cfe build/StitchFragmentedCNVsUnitTest.jar "org.broadinstitute.svpipeline.StitchFragmentedCNVsUnitTest" -C build/classes . && \ + echo "Running StitchFragmentedCNVsUnitTest..." && \ + java -enableassertions -jar $STITCH_UNIT_TEST_JAR && \ + rm -r build/classes $STITCH_UNIT_TEST_JAR + +# Compile VCFParser unit tests +ENV VCF_PARSER_UNIT_TEST_JAR="/opt/sv-pipeline/java/build/VCFParserUnitTest.jar" +RUN cd /opt/sv-pipeline/java && \ + mkdir -p build/classes && \ + javac -d build/classes org/broadinstitute/svpipeline/VCFParserUnitTest.java org/broadinstitute/svpipeline/VCFParser.java && \ + jar cfe build/VCFParserUnitTest.jar "org.broadinstitute.svpipeline.VCFParserUnitTest" -C build/classes . && \ + echo "Running VCFParserUnitTest..." 
&& \ + java -enableassertions -jar $VCF_PARSER_UNIT_TEST_JAR && \ + rm -r build/classes $VCF_PARSER_UNIT_TEST_JAR + +# Compile CleanVCFPart1 Java program +ENV CLEAN_VCF_PART_1_JAR="/opt/sv-pipeline/java/build/CleanVCFPart1.jar" +RUN cd /opt/sv-pipeline/java && \ + mkdir -p build/classes && \ + javac -d build/classes org/broadinstitute/svpipeline/CleanVCFPart1.java org/broadinstitute/svpipeline/VCFParser.java && \ + jar cfe build/CleanVCFPart1.jar "org.broadinstitute.svpipeline.CleanVCFPart1" -C build/classes . && \ + rm -r build/classes + +# Compile and run CleanVCFPart1 unit tests +ENV CLEAN_VCF_PART_1_UNIT_TEST_JAR="/opt/sv-pipeline/java/build/CleanVCFPart1UnitTest.jar" +RUN cd /opt/sv-pipeline/java && \ + mkdir -p build/classes && \ + javac -d build/classes org/broadinstitute/svpipeline/CleanVCFPart1UnitTest.java org/broadinstitute/svpipeline/CleanVCFPart1.java org/broadinstitute/svpipeline/VCFParser.java && \ + jar cfe build/CleanVCFPart1UnitTest.jar "org.broadinstitute.svpipeline.CleanVCFPart1UnitTest" -C build/classes . && \ + echo "Running CleanVCFPart1UnitTest..." 
&& \ + java -enableassertions -jar $CLEAN_VCF_PART_1_UNIT_TEST_JAR && \ + rm -r build/classes $CLEAN_VCF_PART_1_UNIT_TEST_JAR && \ + rm -rf /tmp/* /var/tmp/* + diff --git a/input_templates/GATKSVPipelineBatch.ref_panel_1kg.json.tmpl b/input_templates/GATKSVPipelineBatch.ref_panel_1kg.json.tmpl index ff2a0a725..0fc97f0a9 100644 --- a/input_templates/GATKSVPipelineBatch.ref_panel_1kg.json.tmpl +++ b/input_templates/GATKSVPipelineBatch.ref_panel_1kg.json.tmpl @@ -28,6 +28,8 @@ "GATKSVPipelineBatch.sv_pipeline_base_docker": {{ dockers.sv_pipeline_base_docker | tojson }}, "GATKSVPipelineBatch.sv_base_mini_docker": {{ dockers.sv_base_mini_docker | tojson }}, "GATKSVPipelineBatch.sv_pipeline_docker": {{ dockers.sv_pipeline_docker | tojson }}, + "GATKSVPipelineBatch.sv_pipeline_hail_docker": {{ dockers.sv_pipeline_hail_docker | tojson }}, + "GATKSVPipelineBatch.sv_pipeline_updates_docker": {{ dockers.sv_pipeline_updates_docker | tojson }}, "GATKSVPipelineBatch.sv_pipeline_qc_docker": {{ dockers.sv_pipeline_qc_docker | tojson }}, "GATKSVPipelineBatch.sv_pipeline_rdtest_docker": {{ dockers.sv_pipeline_rdtest_docker | tojson }}, "GATKSVPipelineBatch.samtools_cloud_docker": {{ dockers.samtools_cloud_docker | tojson }}, @@ -117,6 +119,9 @@ "GATKSVPipelineBatch.RegenotypeCNVs.n_RdTest_bins": "100000", "GATKSVPipelineBatch.RegenotypeCNVs.n_per_split": "5000", + "GATKSVPipelineBatch.MakeCohortVcf.chr_x": {{ reference_resources.chr_x | tojson }}, + "GATKSVPipelineBatch.MakeCohortVcf.chr_y": {{ reference_resources.chr_y | tojson }}, + "GATKSVPipelineBatch.MakeCohortVcf.bin_exclude": {{ reference_resources.bin_exclude | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.empty_file" : {{ reference_resources.empty_file | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.cytobands": {{ reference_resources.cytobands | tojson }}, @@ -126,7 +131,9 @@ "GATKSVPipelineBatch.MakeCohortVcf.min_sr_background_fail_batches": 0.5, "GATKSVPipelineBatch.MakeCohortVcf.max_shards_per_chrom_clean_vcf_step1": 
200, "GATKSVPipelineBatch.MakeCohortVcf.min_records_per_shard_clean_vcf_step1": 5000, + "GATKSVPipelineBatch.MakeCohortVcf.clean_vcf1b_records_per_shard": 10000, "GATKSVPipelineBatch.MakeCohortVcf.samples_per_clean_vcf_step2_shard": 100, + "GATKSVPipelineBatch.MakeCohortVcf.clean_vcf5_records_per_shard": 5000, "GATKSVPipelineBatch.MakeCohortVcf.random_seed": 0, "GATKSVPipelineBatch.MakeCohortVcf.max_shard_size_resolve": 500, diff --git a/input_templates/GATKSVPipelineSingleSample.ref_panel_1kg.na12878.json.tmpl b/input_templates/GATKSVPipelineSingleSample.ref_panel_1kg.na12878.json.tmpl index 09fec86b2..efd5c5fc1 100644 --- a/input_templates/GATKSVPipelineSingleSample.ref_panel_1kg.na12878.json.tmpl +++ b/input_templates/GATKSVPipelineSingleSample.ref_panel_1kg.na12878.json.tmpl @@ -34,6 +34,8 @@ "GATKSVPipelineSingleSample.sv_pipeline_base_docker": {{ dockers.sv_pipeline_base_docker | tojson }}, "GATKSVPipelineSingleSample.sv_base_mini_docker": {{ dockers.sv_base_mini_docker | tojson }}, "GATKSVPipelineSingleSample.sv_pipeline_docker": {{ dockers.sv_pipeline_docker | tojson }}, + "GATKSVPipelineSingleSample.sv_pipeline_hail_docker": {{ dockers.sv_pipeline_hail_docker | tojson }}, + "GATKSVPipelineSingleSample.sv_pipeline_updates_docker": {{ dockers.sv_pipeline_updates_docker | tojson }}, "GATKSVPipelineSingleSample.sv_pipeline_qc_docker": {{ dockers.sv_pipeline_qc_docker | tojson }}, "GATKSVPipelineSingleSample.sv_pipeline_rdtest_docker": {{ dockers.sv_pipeline_rdtest_docker | tojson }}, "GATKSVPipelineSingleSample.wham_docker": {{ dockers.wham_docker | tojson }}, @@ -84,7 +86,9 @@ "GATKSVPipelineSingleSample.max_shard_size_resolve" : 500, "GATKSVPipelineSingleSample.clean_vcf_max_shards_per_chrom_clean_vcf_step1": 200, "GATKSVPipelineSingleSample.clean_vcf_min_records_per_shard_clean_vcf_step1": 5000, + "GATKSVPipelineSingleSample.clean_vcf1b_records_per_shard": 10000, "GATKSVPipelineSingleSample.clean_vcf_samples_per_clean_vcf_step2_shard": 100, + 
"GATKSVPipelineSingleSample.clean_vcf5_records_per_shard": 5000, "GATKSVPipelineSingleSample.clean_vcf_random_seed": 0, "GATKSVPipelineSingleSample.run_vcf_qc" : false, diff --git a/input_templates/GATKSVPipelineSingleSample.ref_panel_1kg.na12878.no_melt.json.tmpl b/input_templates/GATKSVPipelineSingleSample.ref_panel_1kg.na12878.no_melt.json.tmpl index 54fd6767d..5b341f079 100644 --- a/input_templates/GATKSVPipelineSingleSample.ref_panel_1kg.na12878.no_melt.json.tmpl +++ b/input_templates/GATKSVPipelineSingleSample.ref_panel_1kg.na12878.no_melt.json.tmpl @@ -36,6 +36,8 @@ "GATKSVPipelineSingleSample.sv_pipeline_base_docker": {{ dockers.sv_pipeline_base_docker | tojson }}, "GATKSVPipelineSingleSample.sv_base_mini_docker": {{ dockers.sv_base_mini_docker | tojson }}, "GATKSVPipelineSingleSample.sv_pipeline_docker": {{ dockers.sv_pipeline_docker | tojson }}, + "GATKSVPipelineSingleSample.sv_pipeline_hail_docker": {{ dockers.sv_pipeline_hail_docker | tojson }}, + "GATKSVPipelineSingleSample.sv_pipeline_updates_docker": {{ dockers.sv_pipeline_updates_docker | tojson }}, "GATKSVPipelineSingleSample.sv_pipeline_qc_docker": {{ dockers.sv_pipeline_qc_docker | tojson }}, "GATKSVPipelineSingleSample.sv_pipeline_rdtest_docker": {{ dockers.sv_pipeline_rdtest_docker | tojson }}, "GATKSVPipelineSingleSample.wham_docker": {{ dockers.wham_docker | tojson }}, @@ -85,8 +87,10 @@ "GATKSVPipelineSingleSample.max_shard_size_resolve" : 500, "GATKSVPipelineSingleSample.clean_vcf_max_shards_per_chrom_clean_vcf_step1": 200, "GATKSVPipelineSingleSample.clean_vcf_min_records_per_shard_clean_vcf_step1": 5000, + "GATKSVPipelineSingleSample.clean_vcf1b_records_per_shard": 10000, "GATKSVPipelineSingleSample.clean_vcf_samples_per_clean_vcf_step2_shard": 100, "GATKSVPipelineSingleSample.clean_vcf_random_seed": 0, + "GATKSVPipelineSingleSample.clean_vcf5_records_per_shard": 5000, "GATKSVPipelineSingleSample.run_vcf_qc" : false, "GATKSVPipelineSingleSample.protein_coding_gtf" : {{ 
reference_resources.protein_coding_gtf | tojson }}, diff --git a/input_templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.SingleBatch.json.tmpl b/input_templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.SingleBatch.json.tmpl index 73be24740..a02554989 100644 --- a/input_templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.SingleBatch.json.tmpl +++ b/input_templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.SingleBatch.json.tmpl @@ -14,12 +14,16 @@ "MakeCohortVcf.min_sr_background_fail_batches": 0.5, "MakeCohortVcf.max_shards_per_chrom_clean_vcf_step1": 200, "MakeCohortVcf.min_records_per_shard_clean_vcf_step1": 5000, + "MakeCohortVcf.clean_vcf1b_records_per_shard": 10000, + "MakeCohortVcf.clean_vcf5_records_per_shard": 5000, "MakeCohortVcf.samples_per_clean_vcf_step2_shard": 100, "MakeCohortVcf.random_seed": 0, "MakeCohortVcf.max_shard_size_resolve": 500, "MakeCohortVcf.linux_docker": "${workspace.linux_docker}", "MakeCohortVcf.sv_pipeline_docker": "${workspace.sv_pipeline_docker}", + "MakeCohortVcf.sv_pipeline_hail_docker": "${workspace.sv_pipeline_hail_docker}", + "MakeCohortVcf.sv_pipeline_updates_docker": "${workspace.sv_pipeline_updates_docker}", "MakeCohortVcf.sv_base_mini_docker": "${workspace.sv_base_mini_docker}", "MakeCohortVcf.sv_pipeline_rdtest_docker": "${workspace.sv_pipeline_rdtest_docker}", "MakeCohortVcf.sv_pipeline_qc_docker": "${workspace.sv_pipeline_qc_docker}", @@ -27,6 +31,9 @@ "MakeCohortVcf.primary_contigs_list": "${workspace.primary_contigs_list}", "MakeCohortVcf.sv_pipeline_base_docker": "${workspace.sv_pipeline_base_docker}", + "MakeCohortVcf.chr_x": "${workspace.chr_x}", + "MakeCohortVcf.chr_y": "${workspace.chr_y}", + "MakeCohortVcf.cohort_name": "${this.sample_set_id}", "MakeCohortVcf.batches": "${this.sample_set_id}", "MakeCohortVcf.ped_file": "${workspace.cohort_ped_file}", diff --git 
a/input_templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.json.tmpl b/input_templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.json.tmpl index 450fa879b..c19ab5a4e 100644 --- a/input_templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.json.tmpl +++ b/input_templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.json.tmpl @@ -14,12 +14,16 @@ "MakeCohortVcf.min_sr_background_fail_batches": 0.5, "MakeCohortVcf.max_shards_per_chrom_clean_vcf_step1": 200, "MakeCohortVcf.min_records_per_shard_clean_vcf_step1": 5000, + "MakeCohortVcf.clean_vcf1b_records_per_shard": 10000, + "MakeCohortVcf.clean_vcf5_records_per_shard": 5000, "MakeCohortVcf.samples_per_clean_vcf_step2_shard": 100, "MakeCohortVcf.random_seed": 0, "MakeCohortVcf.max_shard_size_resolve": 500, "MakeCohortVcf.linux_docker": "${workspace.linux_docker}", "MakeCohortVcf.sv_pipeline_docker": "${workspace.sv_pipeline_docker}", + "MakeCohortVcf.sv_pipeline_hail_docker": "${workspace.sv_pipeline_hail_docker}", + "MakeCohortVcf.sv_pipeline_updates_docker": "${workspace.sv_pipeline_updates_docker}", "MakeCohortVcf.sv_base_mini_docker": "${workspace.sv_base_mini_docker}", "MakeCohortVcf.sv_pipeline_rdtest_docker": "${workspace.sv_pipeline_rdtest_docker}", "MakeCohortVcf.sv_pipeline_qc_docker": "${workspace.sv_pipeline_qc_docker}", @@ -27,6 +31,9 @@ "MakeCohortVcf.primary_contigs_list": "${workspace.primary_contigs_list}", "MakeCohortVcf.sv_pipeline_base_docker": "${workspace.sv_pipeline_base_docker}", + "MakeCohortVcf.chr_x": "${workspace.chr_x}", + "MakeCohortVcf.chr_y": "${workspace.chr_y}", + "MakeCohortVcf.cohort_name": "${this.sample_set_set_id}", "MakeCohortVcf.batches": "${this.sample_sets.sample_set_id}", "MakeCohortVcf.ped_file": "${workspace.cohort_ped_file}", diff --git a/input_templates/terra_workspaces/cohort_mode/workspace.tsv.tmpl b/input_templates/terra_workspaces/cohort_mode/workspace.tsv.tmpl index 
456361948..a8e6fe515 100644 --- a/input_templates/terra_workspaces/cohort_mode/workspace.tsv.tmpl +++ b/input_templates/terra_workspaces/cohort_mode/workspace.tsv.tmpl @@ -1,2 +1,2 @@ -workspace:cloud_sdk_docker cnmops_docker condense_counts_docker gatk_docker gatk_docker_pesr_override gcnv_gatk_docker genomes_in_the_cloud_docker linux_docker manta_docker samtools_cloud_docker sv_base_docker sv_base_mini_docker sv_pipeline_base_docker sv_pipeline_docker sv_pipeline_qc_docker sv_pipeline_rdtest_docker wham_docker allosome_file autosome_file bin_exclude cnmops_exclude_list cohort_ped_file contig_ploidy_priors copy_number_autosomal_contigs cytobands dbsnp_vcf delly_exclude_intervals_file depth_exclude_list empty_file exclude_intervals_for_gcnv_filter_intervals external_af_ref_bed external_af_ref_bed_prefix genome_file inclusion_bed linc_rna_gtf manta_region_bed mei_bed melt_standard_vcf_header noncoding_bed pesr_exclude_list preprocessed_intervals primary_contigs_fai primary_contigs_list promoter_bed protein_coding_gtf reference_build reference_dict reference_fasta reference_index reference_version rmsk segdups seed_cutoffs unpadded_intervals_file wgd_scoring_mask wham_include_list_bed_file -{{ dockers.cloud_sdk_docker }} {{ dockers.cnmops_docker }} {{ dockers.condense_counts_docker }} {{ dockers.gatk_docker }} {{ dockers.gatk_docker_pesr_override }} {{ dockers.gatk_docker }} {{ dockers.genomes_in_the_cloud_docker }} {{ dockers.linux_docker }} {{ dockers.manta_docker }} {{ dockers.samtools_cloud_docker }} {{ dockers.sv_base_docker }} {{ dockers.sv_base_mini_docker }} {{ dockers.sv_pipeline_base_docker }} {{ dockers.sv_pipeline_docker }} {{ dockers.sv_pipeline_qc_docker }} {{ dockers.sv_pipeline_rdtest_docker }} {{ dockers.wham_docker }} {{ reference_resources.allosome_file }} {{ reference_resources.autosome_file }} {{ reference_resources.bin_exclude }} {{ reference_resources.cnmops_exclude_list }} gs://broad-dsde-methods-eph/ped_1kgp_all.ped {{ 
reference_resources.contig_ploidy_priors }} {{ reference_resources.copy_number_autosomal_contigs }} {{ reference_resources.cytobands }} {{ reference_resources.dbsnp_vcf }} {{ reference_resources.delly_exclude_intervals_file }} {{ reference_resources.depth_exclude_list }} {{ reference_resources.empty_file }} {{ reference_resources.exclude_intervals_for_gcnv_filter_intervals }} {{ reference_resources.external_af_ref_bed | tojson }} {{ reference_resources.external_af_ref_bed_prefix | tojson }} {{ reference_resources.genome_file }} {{ reference_resources.inclusion_bed }} {{ reference_resources.linc_rna_gtf | tojson }} {{ reference_resources.manta_region_bed }} {{ reference_resources.mei_bed }} {{ reference_resources.melt_std_vcf_header }} {{ reference_resources.noncoding_bed | tojson }} {{ reference_resources.pesr_exclude_list }} {{ reference_resources.preprocessed_intervals }} {{ reference_resources.primary_contigs_fai }} {{ reference_resources.primary_contigs_list }} {{ reference_resources.promoter_bed | tojson }} {{ reference_resources.protein_coding_gtf | tojson }} {{ reference_resources.reference_build }} {{ reference_resources.reference_dict }} {{ reference_resources.reference_fasta }} {{ reference_resources.reference_index }} {{ reference_resources.reference_version }} {{ reference_resources.rmsk }} {{ reference_resources.segdups }} {{ reference_resources.seed_cutoffs }} {{ reference_resources.unpadded_intervals_file }} {{ reference_resources.wgd_scoring_mask }} {{ reference_resources.wham_include_list_bed_file }} +workspace:cloud_sdk_docker cnmops_docker condense_counts_docker gatk_docker gatk_docker_pesr_override gcnv_gatk_docker genomes_in_the_cloud_docker linux_docker manta_docker samtools_cloud_docker sv_base_docker sv_base_mini_docker sv_pipeline_base_docker sv_pipeline_docker sv_pipeline_hail_docker sv_pipeline_updates_docker sv_pipeline_qc_docker sv_pipeline_rdtest_docker wham_docker allosome_file autosome_file bin_exclude cnmops_exclude_list 
cohort_ped_file contig_ploidy_priors copy_number_autosomal_contigs cytobands dbsnp_vcf delly_exclude_intervals_file depth_exclude_list empty_file exclude_intervals_for_gcnv_filter_intervals external_af_ref_bed external_af_ref_bed_prefix genome_file inclusion_bed linc_rna_gtf manta_region_bed mei_bed melt_standard_vcf_header noncoding_bed pesr_exclude_list preprocessed_intervals primary_contigs_fai primary_contigs_list promoter_bed protein_coding_gtf reference_build reference_dict reference_fasta reference_index reference_version rmsk segdups seed_cutoffs unpadded_intervals_file wgd_scoring_mask wham_include_list_bed_file chr_x chr_y +{{ dockers.cloud_sdk_docker }} {{ dockers.cnmops_docker }} {{ dockers.condense_counts_docker }} {{ dockers.gatk_docker }} {{ dockers.gatk_docker_pesr_override }} {{ dockers.gatk_docker }} {{ dockers.genomes_in_the_cloud_docker }} {{ dockers.linux_docker }} {{ dockers.manta_docker }} {{ dockers.samtools_cloud_docker }} {{ dockers.sv_base_docker }} {{ dockers.sv_base_mini_docker }} {{ dockers.sv_pipeline_base_docker }} {{ dockers.sv_pipeline_docker }} {{ dockers.sv_pipeline_hail_docker }} {{ dockers.sv_pipeline_updates_docker }} {{ dockers.sv_pipeline_qc_docker }} {{ dockers.sv_pipeline_rdtest_docker }} {{ dockers.wham_docker }} {{ reference_resources.allosome_file }} {{ reference_resources.autosome_file }} {{ reference_resources.bin_exclude }} {{ reference_resources.cnmops_exclude_list }} gs://broad-dsde-methods-eph/ped_1kgp_all.ped {{ reference_resources.contig_ploidy_priors }} {{ reference_resources.copy_number_autosomal_contigs }} {{ reference_resources.cytobands }} {{ reference_resources.dbsnp_vcf }} {{ reference_resources.delly_exclude_intervals_file }} {{ reference_resources.depth_exclude_list }} {{ reference_resources.empty_file }} {{ reference_resources.exclude_intervals_for_gcnv_filter_intervals }} {{ reference_resources.external_af_ref_bed | tojson }} {{ reference_resources.external_af_ref_bed_prefix | tojson }} {{ 
reference_resources.genome_file }} {{ reference_resources.inclusion_bed }} {{ reference_resources.linc_rna_gtf | tojson }} {{ reference_resources.manta_region_bed }} {{ reference_resources.mei_bed }} {{ reference_resources.melt_std_vcf_header }} {{ reference_resources.noncoding_bed | tojson }} {{ reference_resources.pesr_exclude_list }} {{ reference_resources.preprocessed_intervals }} {{ reference_resources.primary_contigs_fai }} {{ reference_resources.primary_contigs_list }} {{ reference_resources.promoter_bed | tojson }} {{ reference_resources.protein_coding_gtf | tojson }} {{ reference_resources.reference_build }} {{ reference_resources.reference_dict }} {{ reference_resources.reference_fasta }} {{ reference_resources.reference_index }} {{ reference_resources.reference_version }} {{ reference_resources.rmsk }} {{ reference_resources.segdups }} {{ reference_resources.seed_cutoffs }} {{ reference_resources.unpadded_intervals_file }} {{ reference_resources.wgd_scoring_mask }} {{ reference_resources.wham_include_list_bed_file }} {{ reference_resources.chr_x }} {{ reference_resources.chr_y }} diff --git a/input_templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.ref_panel_1kg.na12878.no_melt.json.tmpl b/input_templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.ref_panel_1kg.na12878.no_melt.json.tmpl index ce930bc57..5853242cc 100644 --- a/input_templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.ref_panel_1kg.na12878.no_melt.json.tmpl +++ b/input_templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.ref_panel_1kg.na12878.no_melt.json.tmpl @@ -36,6 +36,8 @@ "GATKSVPipelineSingleSample.sv_pipeline_base_docker": "${workspace.sv_pipeline_base_docker}", "GATKSVPipelineSingleSample.sv_base_mini_docker": "${workspace.sv_base_mini_docker}", "GATKSVPipelineSingleSample.sv_pipeline_docker": "${workspace.sv_pipeline_docker}", + "GATKSVPipelineSingleSample.sv_pipeline_hail_docker": 
"${workspace.sv_pipeline_hail_docker}", + "GATKSVPipelineSingleSample.sv_pipeline_updates_docker": "${workspace.sv_pipeline_updates_docker}", "GATKSVPipelineSingleSample.sv_pipeline_qc_docker": "${workspace.sv_pipeline_qc_docker}", "GATKSVPipelineSingleSample.sv_pipeline_rdtest_docker": "${workspace.sv_pipeline_rdtest_docker}", "GATKSVPipelineSingleSample.wham_docker": "${workspace.wham_docker}", @@ -84,7 +86,9 @@ "GATKSVPipelineSingleSample.max_shard_size_resolve" : 500, "GATKSVPipelineSingleSample.clean_vcf_max_shards_per_chrom_clean_vcf_step1": 200, "GATKSVPipelineSingleSample.clean_vcf_min_records_per_shard_clean_vcf_step1": 5000, + "GATKSVPipelineSingleSample.clean_vcf1b_records_per_shard": 10000, "GATKSVPipelineSingleSample.clean_vcf_samples_per_clean_vcf_step2_shard": 100, + "GATKSVPipelineSingleSample.clean_vcf5_records_per_shard": 5000, "GATKSVPipelineSingleSample.clean_vcf_random_seed": 0, "GATKSVPipelineSingleSample.run_vcf_qc" : false, diff --git a/input_templates/terra_workspaces/single_sample/workspace.tsv.tmpl b/input_templates/terra_workspaces/single_sample/workspace.tsv.tmpl index f338efa17..ba6e6ff4f 100644 --- a/input_templates/terra_workspaces/single_sample/workspace.tsv.tmpl +++ b/input_templates/terra_workspaces/single_sample/workspace.tsv.tmpl @@ -1,2 +1,2 @@ -workspace:cnmops_docker condense_counts_docker gatk_docker gatk_docker_pesr_override genomes_in_the_cloud_docker linux_docker manta_docker samtools_cloud_docker sv_base_docker sv_base_mini_docker sv_pipeline_base_docker sv_pipeline_docker sv_pipeline_qc_docker sv_pipeline_rdtest_docker wham_docker ref_panel_name ref_panel_bincov_matrix ref_panel_contig_ploidy_model_tar ref_panel_cutoffs ref_panel_del_bed ref_panel_dup_bed ref_panel_genotype_pesr_pesr_sepcutoff ref_panel_genotype_pesr_depth_sepcutoff ref_panel_genotype_depth_pesr_sepcutoff ref_panel_genotype_depth_depth_sepcutoff ref_panel_ped_file ref_panel_PE_metrics ref_panel_qc_definitions ref_panel_requester_pays_crams 
ref_panel_SR_metrics ref_panel_vcf reference_name reference_allosome_file reference_autosome_file reference_bin_exclude reference_cnmops_exclude_list reference_contig_ploidy_priors reference_copy_number_autosomal_contigs reference_cytobands reference_dbsnp_vcf reference_delly_exclude_intervals_file reference_depth_exclude_list reference_empty_file reference_exclude_intervals_for_gcnv_filter_intervals reference_external_af_ref_bed reference_external_af_ref_bed_prefix reference_genome_file reference_inclusion_bed reference_linc_rna_gtf reference_manta_region_bed reference_mei_bed reference_melt_std_vcf_header reference_noncoding_bed reference_pesr_exclude_list reference_preprocessed_intervals reference_primary_contigs_list reference_primary_contigs_fai reference_promoter_bed reference_protein_coding_gtf reference_dict reference_fasta reference_index reference_version reference_rmsk reference_segdups reference_seed_cutoffs reference_unpadded_intervals_file reference_wgd_scoring_mask reference_wham_include_list_bed_file -{{ dockers.cnmops_docker }} {{ dockers.condense_counts_docker }} {{ dockers.gatk_docker }} {{ dockers.gatk_docker_pesr_override }} {{ dockers.genomes_in_the_cloud_docker }} {{ dockers.linux_docker }} {{ dockers.manta_docker }} {{ dockers.samtools_cloud_docker }} {{ dockers.sv_base_docker }} {{ dockers.sv_base_mini_docker }} {{ dockers.sv_pipeline_base_docker }} {{ dockers.sv_pipeline_docker }} {{ dockers.sv_pipeline_qc_docker }} {{ dockers.sv_pipeline_rdtest_docker }} {{ dockers.wham_docker }} {{ ref_panel.name }} {{ ref_panel.bincov_matrix }} {{ ref_panel.contig_ploidy_model_tar }} {{ ref_panel.cutoffs }} {{ ref_panel.del_bed }} {{ ref_panel.dup_bed }} {{ ref_panel.genotype_pesr_pesr_sepcutoff }} {{ ref_panel.genotype_pesr_depth_sepcutoff }} {{ ref_panel.genotype_depth_pesr_sepcutoff }} {{ ref_panel.genotype_depth_depth_sepcutoff }} {{ ref_panel.ped_file }} {{ ref_panel.PE_metrics }} {{ ref_panel.qc_definitions }} {{ ref_panel.requester_pays_crams }} 
{{ ref_panel.SR_metrics }} {{ ref_panel.vcf }} {{ reference_resources.name }} {{ reference_resources.allosome_file }} {{ reference_resources.autosome_file }} {{ reference_resources.bin_exclude }} {{ reference_resources.cnmops_exclude_list }} {{ reference_resources.contig_ploidy_priors }} {{ reference_resources.copy_number_autosomal_contigs }} {{ reference_resources.cytobands }} {{ reference_resources.dbsnp_vcf }} {{ reference_resources.delly_exclude_intervals_file }} {{ reference_resources.depth_exclude_list }} {{ reference_resources.empty_file }} {{ reference_resources.exclude_intervals_for_gcnv_filter_intervals }} {{ reference_resources.external_af_ref_bed }} {{ reference_resources.external_af_ref_bed_prefix }} {{ reference_resources.genome_file }} {{ reference_resources.inclusion_bed }} {{ reference_resources.linc_rna_gtf }} {{ reference_resources.manta_region_bed }} {{ reference_resources.mei_bed }} {{ reference_resources.melt_std_vcf_header }} {{ reference_resources.noncoding_bed }} {{ reference_resources.pesr_exclude_list }} {{ reference_resources.preprocessed_intervals }} {{ reference_resources.primary_contigs_list }} {{ reference_resources.primary_contigs_fai }} {{ reference_resources.promoter_bed }} {{ reference_resources.protein_coding_gtf }} {{ reference_resources.reference_dict }} {{ reference_resources.reference_fasta }} {{ reference_resources.reference_index }} {{ reference_resources.reference_version }} {{ reference_resources.rmsk }} {{ reference_resources.segdups }} {{ reference_resources.seed_cutoffs }} {{ reference_resources.unpadded_intervals_file }} {{ reference_resources.wgd_scoring_mask }} {{ reference_resources.wham_include_list_bed_file }} +workspace:cnmops_docker condense_counts_docker gatk_docker gatk_docker_pesr_override genomes_in_the_cloud_docker linux_docker manta_docker samtools_cloud_docker sv_base_docker sv_base_mini_docker sv_pipeline_base_docker sv_pipeline_docker sv_pipeline_hail_docker sv_pipeline_updates_docker 
sv_pipeline_qc_docker sv_pipeline_rdtest_docker wham_docker ref_panel_name ref_panel_bincov_matrix ref_panel_contig_ploidy_model_tar ref_panel_cutoffs ref_panel_del_bed ref_panel_dup_bed ref_panel_genotype_pesr_pesr_sepcutoff ref_panel_genotype_pesr_depth_sepcutoff ref_panel_genotype_depth_pesr_sepcutoff ref_panel_genotype_depth_depth_sepcutoff ref_panel_ped_file ref_panel_PE_metrics ref_panel_qc_definitions ref_panel_requester_pays_crams ref_panel_SR_metrics ref_panel_vcf reference_name reference_allosome_file reference_autosome_file reference_bin_exclude reference_cnmops_exclude_list reference_contig_ploidy_priors reference_copy_number_autosomal_contigs reference_cytobands reference_dbsnp_vcf reference_delly_exclude_intervals_file reference_depth_exclude_list reference_empty_file reference_exclude_intervals_for_gcnv_filter_intervals reference_external_af_ref_bed reference_external_af_ref_bed_prefix reference_genome_file reference_inclusion_bed reference_linc_rna_gtf reference_manta_region_bed reference_mei_bed reference_melt_std_vcf_header reference_noncoding_bed reference_pesr_exclude_list reference_preprocessed_intervals reference_primary_contigs_list reference_primary_contigs_fai reference_promoter_bed reference_protein_coding_gtf reference_dict reference_fasta reference_index reference_version reference_rmsk reference_segdups reference_seed_cutoffs reference_unpadded_intervals_file reference_wgd_scoring_mask reference_wham_include_list_bed_file +{{ dockers.cnmops_docker }} {{ dockers.condense_counts_docker }} {{ dockers.gatk_docker }} {{ dockers.gatk_docker_pesr_override }} {{ dockers.genomes_in_the_cloud_docker }} {{ dockers.linux_docker }} {{ dockers.manta_docker }} {{ dockers.samtools_cloud_docker }} {{ dockers.sv_base_docker }} {{ dockers.sv_base_mini_docker }} {{ dockers.sv_pipeline_base_docker }} {{ dockers.sv_pipeline_docker }} {{ dockers.sv_pipeline_hail_docker }} {{ dockers.sv_pipeline_updates_docker }} {{ dockers.sv_pipeline_qc_docker }} {{ 
dockers.sv_pipeline_rdtest_docker }} {{ dockers.wham_docker }} {{ ref_panel.name }} {{ ref_panel.bincov_matrix }} {{ ref_panel.contig_ploidy_model_tar }} {{ ref_panel.cutoffs }} {{ ref_panel.del_bed }} {{ ref_panel.dup_bed }} {{ ref_panel.genotype_pesr_pesr_sepcutoff }} {{ ref_panel.genotype_pesr_depth_sepcutoff }} {{ ref_panel.genotype_depth_pesr_sepcutoff }} {{ ref_panel.genotype_depth_depth_sepcutoff }} {{ ref_panel.ped_file }} {{ ref_panel.PE_metrics }} {{ ref_panel.qc_definitions }} {{ ref_panel.requester_pays_crams }} {{ ref_panel.SR_metrics }} {{ ref_panel.vcf }} {{ reference_resources.name }} {{ reference_resources.allosome_file }} {{ reference_resources.autosome_file }} {{ reference_resources.bin_exclude }} {{ reference_resources.cnmops_exclude_list }} {{ reference_resources.contig_ploidy_priors }} {{ reference_resources.copy_number_autosomal_contigs }} {{ reference_resources.cytobands }} {{ reference_resources.dbsnp_vcf }} {{ reference_resources.delly_exclude_intervals_file }} {{ reference_resources.depth_exclude_list }} {{ reference_resources.empty_file }} {{ reference_resources.exclude_intervals_for_gcnv_filter_intervals }} {{ reference_resources.external_af_ref_bed }} {{ reference_resources.external_af_ref_bed_prefix }} {{ reference_resources.genome_file }} {{ reference_resources.inclusion_bed }} {{ reference_resources.linc_rna_gtf }} {{ reference_resources.manta_region_bed }} {{ reference_resources.mei_bed }} {{ reference_resources.melt_std_vcf_header }} {{ reference_resources.noncoding_bed }} {{ reference_resources.pesr_exclude_list }} {{ reference_resources.preprocessed_intervals }} {{ reference_resources.primary_contigs_list }} {{ reference_resources.primary_contigs_fai }} {{ reference_resources.promoter_bed }} {{ reference_resources.protein_coding_gtf }} {{ reference_resources.reference_dict }} {{ reference_resources.reference_fasta }} {{ reference_resources.reference_index }} {{ reference_resources.reference_version }} {{ reference_resources.rmsk 
}} {{ reference_resources.segdups }} {{ reference_resources.seed_cutoffs }} {{ reference_resources.unpadded_intervals_file }} {{ reference_resources.wgd_scoring_mask }} {{ reference_resources.wham_include_list_bed_file }} diff --git a/input_values/dockers.json b/input_values/dockers.json index 747e515cb..f2ec7fff9 100644 --- a/input_values/dockers.json +++ b/input_values/dockers.json @@ -10,10 +10,12 @@ "manta_docker" : "us.gcr.io/broad-dsde-methods/manta:8645aa", "melt_docker" : "us.gcr.io/talkowski-sv-gnomad/melt:vj-4ff9de9f", "samtools_cloud_docker" : "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:mw-gnomad-02-6a66c96", - "sv_base_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-base:mw-gnomad-0506-pr-087d4df", + "sv_base_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-base:mw-gnomad-0506-pr-2-6d104d7", "sv_base_mini_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-base-mini:mw-gnomad-0506-pr-087d4df", - "sv_pipeline_base_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-pipeline-base:mw-gnomad-0506-pr-087d4df", - "sv_pipeline_docker" : "us.gcr.io/broad-dsde-methods/eph/sv-pipeline:eph_hotfix_no_evidence-1f461ed", + "sv_pipeline_base_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-pipeline-base:mw-gnomad-0506-pr-2-6d104d7", + "sv_pipeline_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-pipeline:mw-gnomad-0506-pr-2-6d104d7", + "sv_pipeline_hail_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-pipeline-hail:mw-gnomad-0506-pr-2-b7988f0", + "sv_pipeline_updates_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-pipeline-updates:mw-gnomad-0506-superscale-dev-304ffa1", "sv_pipeline_qc_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-pipeline-qc:mw-gnomad-0506-pr-087d4df", "sv_pipeline_rdtest_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-pipeline-rdtest:mw-gnomad-0506-pr-087d4df", "wham_docker" : "us.gcr.io/broad-dsde-methods/wham:8645aa", diff --git a/input_values/resources_hg38.json b/input_values/resources_hg38.json index eb727d077..dddcec7bf 100644 --- 
a/input_values/resources_hg38.json +++ b/input_values/resources_hg38.json @@ -2,6 +2,8 @@ "name" : "resources_hg38", "allosome_file" : "gs://gcp-public-data--broad-references/hg38/v0/sv-resources/resources/v1/allosome.fai", "allosomal_contigs" : ["chrX", "chrY"], + "chr_x" : "chrX", + "chr_y" : "chrY", "asc_tarballs" : [ "gs://gatk-sv-resources-secure/resources/hg38_benchmarking/ASC_Werling/ASC_Werling.SV.ALL.bed.gz", "gs://gatk-sv-resources-secure/resources/hg38_benchmarking/ASC_Werling/ASC_Werling.SV.EUR.bed.gz", diff --git a/scripts/docker/build_docker.py b/scripts/docker/build_docker.py index 0b048ab00..e3ea2b892 100755 --- a/scripts/docker/build_docker.py +++ b/scripts/docker/build_docker.py @@ -32,6 +32,8 @@ class to track dependencies, control build and push of entire job 'sv-pipeline-base': {'sv-base': "SVBASE_IMAGE"}, 'sv-pipeline': {'sv-pipeline-base': "SV_PIPELINE_BASE_IMAGE"}, 'sv-pipeline-children-r': {'sv-pipeline-base': "SV_PIPELINE_BASE_IMAGE"}, + 'sv-pipeline-hail': {'sv-pipeline': "SV_PIPELINE_IMAGE"}, + 'sv-pipeline-updates': {'sv-pipeline': "SV_PIPELINE_IMAGE"}, 'sv-pipeline-rdtest': {'sv-pipeline-children-r': "SV_PIPELINE_BASE_R_IMAGE"}, 'sv-pipeline-qc': {'sv-pipeline-children-r': "SV_PIPELINE_BASE_R_IMAGE"} } diff --git a/src/sv-pipeline/04_variant_resolution/scripts/calculate_sr_bothside_support.py b/src/sv-pipeline/04_variant_resolution/scripts/calculate_sr_bothside_support.py new file mode 100644 index 000000000..58656aab9 --- /dev/null +++ b/src/sv-pipeline/04_variant_resolution/scripts/calculate_sr_bothside_support.py @@ -0,0 +1,30 @@ +#!/bin/python + +import sys +from collections import defaultdict + + +def count_vids(list_path): + counts = defaultdict(lambda: 0) + with open(list_path, 'r') as f_list: + for path in f_list: + with open(path.strip(), 'r') as f: + for vid in f: + counts[vid.strip()] += 1 + return counts + + +NON_REF_VIDS_LIST = sys.argv[1] +BOTHSIDE_PASS_LIST = sys.argv[2] + +non_ref_counts = count_vids(NON_REF_VIDS_LIST) 
+bothside_pass_counts = count_vids(BOTHSIDE_PASS_LIST) + +for vid, bothside_pass_count in bothside_pass_counts.items(): + if bothside_pass_count == 0: + continue + non_ref_count = non_ref_counts[vid] + if non_ref_count == 0: + continue + fraction_support = min(1., bothside_pass_count / float(non_ref_count)) + sys.stdout.write("{}\t{}\n".format(fraction_support, vid)) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_VCF_script.sh b/src/sv-pipeline/04_variant_resolution/scripts/clean_VCF_script.sh deleted file mode 100755 index 03155de8c..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_VCF_script.sh +++ /dev/null @@ -1,682 +0,0 @@ -#!/bin/bash -# -# clean_VCF.sh -# - -##requires >= vcftools/0.1.15 ## - -set -e - -##gzipped vcf## -vcf=$1 -backgroundlist=$2 - - -##get sampleids from VCF## -zcat $vcf \ - |egrep "^#" \ - |tail -n -1 \ - |cut -f10- \ - |tr '\t' '\n' \ - >whitelist.txt - -##convert EV integer back into string## -zcat $vcf \ - | awk '{print $0 "\t"}' \ - | sed -e 's/:7'"\t"'/:RD,PE,SR'"\t"'/g' \ - | sed -e 's/:6'"\t"'/:PE,SR'"\t"'/g' \ - | sed -e 's/:5'"\t"'/:RD,SR'"\t"'/g' \ - | sed -e 's/:3'"\t"'/:RD,PE'"\t"'/g' \ - | sed -e 's/:2'"\t"'/:PE'"\t"'/g' \ - -e 's/:4'"\t"'/:SR'"\t"'/g' \ - -e 's/:1'"\t"'/:RD'"\t"'/g' \ - |sed 's/'"\t"'$//g' \ - |sed 's/ID=EV,Number=1,Type=Integer/ID=EV,Number=1,Type=String/g' \ - | bgzip > EV.update.vcf.gz - -##convert all alt to svtype and alt to N## -svtk vcf2bed EV.update.vcf.gz stdout -i SVTYPE \ - |awk '{ if ($5!~"ME")$5=$7; print $4"\t" "<"$5 ">"}' \ - |gzip \ - >vcf.convert.svtype.gz - -zcat EV.update.vcf.gz \ - |awk 'NR==FNR{inFileA[$1]=$2; next} {if ($3 in inFileA && $1!~"#") $5=inFileA[$3]; print }' OFS='\t' \ - <(zcat vcf.convert.svtype.gz) - \ - |awk '{if ($1!~"#") $4="N"; print}' OFS='\t' \ - |bgzip \ - >convertsvtype.vcf.gz - -##get rid of multiallelic tage in INFO field and add varGQ to QUAL column## -svtk vcf2bed convertsvtype.vcf.gz stdout -i varGQ \ - |awk '{print $4 "\t" 
$7}' \ - >vargq.persample - -zcat convertsvtype.vcf.gz \ - |sed 's/;MULTIALLELIC//g' \ - |sed 's/;varGQ=[0-9]*//g' \ - |awk 'NR==FNR{inFileA[$1]=$2; next} {if ($3 in inFileA && $1!~"#") $6=inFileA[$3]; print }' OFS='\t' vargq.persample - \ - |bgzip \ - >cleaninfo.vcf.gz - - -##change tag for SR background failures## -zcat cleaninfo.vcf.gz \ - |awk 'NR==FNR{inFileA[$1]; next} {if ($3 in inFileA && $1!~"#") sub($7,"HIGH_SR_BACKGROUND"); print }' $backgroundlist - \ - |awk '{if (NR==2) print $0 "\n" "##FILTER=" ;else print}' \ - |bgzip \ - >int.vcf.gz - - -##Remove CNVs that are improperly genotyped by depth because they are nested within a real CNV## - -##Only affects CNV so pull those out## -svtk vcf2bed int.vcf.gz stdout \ - |awk '{if ($5=="DEL" || $5=="DUP") print}' \ - |gzip>int.bed.gz - -##list of potenital overlaps with a normal copy state variant (>5kb variants require depth but nested events could be missed; i.e a duplication with a nest deletion will have a normal copy state for the deletion)## -bedtools intersect -wa -wb -a <(zcat int.bed.gz|awk '{if ($3-$2>=5000 ) print}') \ --b <(zcat int.bed.gz|awk '{if ($3-$2>=5000) print}') \ - |awk -F"\t" '{if ($4!=$10 && $3-$2>=$9-$8 && $5!=$11) print ;\ - else if ($4!=$10 && $5!=$11) print $7,$8,$9,$10,$11,$12,$1,$2,$3,$4,$5,$6}' OFS='\t' \ - |awk -F'\t' '{if ($6!="") print}' \ - |sort -u \ - >normaloverlap.txt - - -##pull out the depth based copy number variant for each normal overlapping variant## -zcat int.vcf.gz \ - |awk '{if ($1!~"#") sub($1,$3);print}' \ - |awk '{if ($1~"#" || $5=="" || $5=="") print}' \ - |vcftools --vcf - --stdout --extract-FORMAT-info RD_CN \ - |awk -F"\t" 'NR==1{for (i=3;i<=NF;i++) header[i]=$i} NR>1{for(j=3;j<=NF;j++) print $1"@"header[j] "\t" $j }' \ - |sort -k1,1 \ - |gzip \ - >RD_CN.normalcheck.FORMAT.gz - - -##pull out evidence supporting each normal overlapping variant## -cat <(zcat int.vcf.gz|awk -F"\t" '{if ($1~"#") print}') \ - <(awk '{print $4 "\n" $10}' normaloverlap.txt|sort 
-u|fgrep -wf - <(zcat int.vcf.gz))\ - |awk '{if ($1!~"#") sub($1,$3);print}' \ - |vcftools --vcf - --stdout --extract-FORMAT-info EV \ - |awk -F"\t" 'NR==1{for (i=3;i<=NF;i++) header[i]=$i} NR>1{for(j=3;j<=NF;j++) print $1"@"header[j] "\t" $j }' \ - |sort -k1,1 \ - |gzip \ - >EV.normalcheck.FORMAT.gz - -##check if nested is incorrectly classified as normal## - -while read bed -do - echo $bed|tr ' ' '\t'|cut -f1-6 >large.bed - echo $bed|tr ' ' '\t'|cut -f7-12>small.bed - ##require at least 50% coverage to consider a variant overlapping## - overlap=$(bedtools coverage -a small.bed -b large.bed|awk '{if ($NF>=0.50) print "YES";else print "NO"}') - - if [ "$overlap" == "YES" ] - then - smallid=$(awk '{print $4}' small.bed) - - ##pull out variants that are called a variants for both the smaller and larger CNVs (don't have normal copy state to check for)## - awk '{print $NF}' small.bed \ - |tr ',' '\n' \ - |fgrep -wvf - <(awk -F"[,\t]" -v var=$smallid '{for(i=6;i<=NF;i++) print var"@"$i "\t" $4"@"$i "\t" $5}' large.bed) \ - >>overlap.test.txt - fi -donegeno.normal.revise.txt - -##Update genotypes## - -##Determine columns of VCF after header## -zcat int.vcf.gz \ - |egrep ^# \ - |tail -n 1 \ - |tr '\t' '\n' \ - |cat -n - \ - >col.txt - - -##seed the vcf lines file which will provide the revisions to vcf file## -echo "">normal.revise.vcf.lines.txt - - -##pull out and revise vcf line that needs to be edited## -while read line -do - id=$(echo $line|awk '{print $2}' ) - col=$(fgrep -w $id col.txt|awk '{print $1}') - variant=$(echo $line|awk '{print $1}') - cn=$(echo $line|awk '{print $3}') - - zcat int.vcf.gz |fgrep -w $variant >line.txt - - ##Updated genotype and rebuild Format field ## - GT=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $1}') - GQ=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $2}') - RD_CN=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $3}') - RD_GQ=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $4}') - 
PE_GT=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $5}') - PE_GQ=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $6}') - SR_GT=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $7}') - SR_GQ=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $8}') - EV=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $9}') - - if [ $(cat normal.revise.vcf.lines.txt|fgrep -w $variant|wc -l) -gt 0 ] - then - cat normal.revise.vcf.lines.txt \ - |awk -v col=$col -v var=$variant -v GT=$GT -v GQ=$GQ -v RD_CN=$cn -v RD_GQ=$RD_GQ -v PE_GT=$PE_GT -v PE_GQ=$PE_GQ -v SR_GT=$SR_GT -v SR_GQ=$SR_GT -v EV=$EV '{if ($3==var ) $col="0/1:"GQ":"RD_CN":"RD_GQ":"PE_GT":"PE_GQ":"SR_GT":"SR_GQ":"EV ;print}' \ - >int.lines.txt - - cat int.lines.txt > normal.revise.vcf.lines.txt - - else - cat line.txt \ - |awk -v col=$col -v var=$variant -v GT=$GT -v GQ=$GQ -v RD_CN=$cn -v RD_GQ=$RD_GQ -v PE_GT=$PE_GT -v PE_GQ=$PE_GQ -v SR_GT=$SR_GT -v SR_GQ=$SR_GT -v EV=$EV '{if ($3==var ) $col="0/1:"GQ":"RD_CN":"RD_GQ":"PE_GT":"PE_GQ":"SR_GT":"SR_GQ":"EV ;print}' \ - >>normal.revise.vcf.lines.txt - fi - -donenormal.revise.vcf.gz - -##create new bed with updated genotypes### -svtk vcf2bed normal.revise.vcf.gz stdout \ - |awk '{if ($5=="DEL" || $5=="DUP") print}' \ - |sort -k4,4 \ - |gzip \ - >int.afternormalfix.bed.gz - - -###Find overlapping depth based variants and reassign depth based; note this is necessary because depth call >5kb genotypes are 100% driven by depth ## - -## generate a sample list based on depth for depth overlap check below. 
Necessary because genotype is capped at 1/1 and by direction (i.e no dels in dups)## -zcat normal.revise.vcf.gz \ - |awk '{if ($1~"#" || ($5=="" || $5=="")) print}'\ - |awk '{if ($1!~"#") sub($1,$3);print}' \ - |vcftools --vcf - --stdout --extract-FORMAT-info RD_CN \ - |awk -F"\t" 'NR==1{for (i=3;i<=NF;i++) header[i]=$i} NR>1{for(j=3;j<=NF;j++) print $1 "\t" header[j] "\t" $j }' \ - |sort -k1,1 \ - |gzip \ - >RD_CN.afternormalfix.FORMAT.gz - -##grab all samples per variant with a non normal copy state## -zcat RD_CN.afternormalfix.FORMAT.gz \ - |awk '{if ($3!="2") print $1 "\t" $2}' \ - |awk '{a[$1]=a[$1]?a[$1]","$2:$2;}END{for (i in a)print i "\t" a[i];}' \ - |sort -k1,1 \ - >afternormal.combined.RD_CN.list.txt - -#overlapping## -zcat int.afternormalfix.bed.gz \ - |cut -f1-5 \ - |join -1 4 -2 1 -t $'\t' - afternormal.combined.RD_CN.list.txt \ - |awk -F"\t" '{if ($6!="") print }' \ - |awk -F'[,\t]' '{for(i=6;i<=NF;i++) print $2"_"$i,$3,$4,$1,$5,$i,$1"@"$i}' \ - |tr ' ' '\t' \ - |gzip \ - >all.bed.gz - - -##intersect variants and always set larger to left## -bedtools intersect -wa -wb -a all.bed.gz -b all.bed.gz \ - |awk '{if ($4!=$11 && $3-$2>=$10-$9) print $0;else if ($4!=$11) print $8,$9,$10,$11,$12,$13,$14,$1,$2,$3,$4,$5,$6,$7}' \ - |tr ' ' '\t' \ - |sort -u \ - |sort -k7,7 \ - |gzip \ - >bed.overlap.txt.gz - - -##pull out per variant metrics from the INFO field## - -for var in EV RD_CN PE_GT SR_GT PE_GQ SR_GQ -do - cat <(zcat normal.revise.vcf.gz|awk -F"\t" '{if ($1~"#") print}') \ - <(zcat bed.overlap.txt.gz|awk '{print $4 "\n" $11}' |sort -u|fgrep -wf - <(zcat normal.revise.vcf.gz)) \ - |awk '{if ($1!~"#") sub($1,$3);print}' \ - |vcftools --vcf - --stdout --extract-FORMAT-info ${var} \ - |awk -F"\t" 'NR==1{for (i=3;i<=NF;i++) header[i]=$i} NR>1{for(j=3;j<=NF;j++) print $1"@"header[j] "\t" $j }' \ - |sort -k1,1 \ - |gzip \ - >${var}.FORMAT.gz -done - -##Append info field to bed file## -join -1 7 -2 1 <(zcat bed.overlap.txt.gz) \ - <(zcat EV.FORMAT.gz)|join -j 1 
- <(zcat RD_CN.FORMAT.gz) \ - |join -j 1 - <(zcat PE_GT.FORMAT.gz) \ - |join -j 1 - <(zcat PE_GQ.FORMAT.gz) \ - |join -j 1 - <(zcat SR_GT.FORMAT.gz) \ - |join -j 1 - <(zcat SR_GQ.FORMAT.gz) \ - |sort -k14,14 \ - |join -1 14 -2 1 - <(zcat EV.FORMAT.gz) \ - |join -j 1 - <(zcat RD_CN.FORMAT.gz) \ - |join -j 1 - <(zcat PE_GT.FORMAT.gz) \ - |join -j 1 - <(zcat PE_GQ.FORMAT.gz) \ - |join -j 1 - <(zcat SR_GT.FORMAT.gz) \ - |join -j 1 - <(zcat SR_GQ.FORMAT.gz) \ - |tr ' ' '\t' \ - |cut -f3- \ - |awk '{print $3-$2,$10-$9,$0}' \ - |tr ' ' '\t' \ - |sort -nrk1,1 -k2,2nr \ - |cut -f3- \ - |gzip \ - >all.combined.bed.gz - - -####If Multi-allelic is driving depth difference ignore### - -##get copy state per variant## -zcat normal.revise.vcf.gz \ - |awk '{if ($1!~"#") sub($1,$3);print}' \ - |vcftools --vcf - --stdout --extract-FORMAT-info RD_CN \ - |gzip \ - >copystate.RD_CN.FORMAT.gz - -##get copy state per variant## -zcat copystate.RD_CN.FORMAT.gz \ - |awk 'NR>1{for(i=3;i<=NF;i++) lines[$1 "\t" $i]++ } END{for (x in lines) print x}' \ - |gzip \ - >copystate.per.variant.txt.gz - -##Find multi-allelic for del or dup ; CNV >1kb we trust depth ## -##del## -zcat copystate.per.variant.txt.gz \ - |awk '{if ($2!="." && $2>2) print $1}' \ - |sort -u \ - |fgrep -wf <(zcat int.bed.gz|awk -F"\t" '{if ($5=="DEL" && $3-$2>=1000) print $4}' ) \ - >multi.cnvs.txt - -##dup## -zcat copystate.per.variant.txt.gz \ - |awk '{if ($2!="." 
&& ($2<2 || $2>4)) print $1}' \ - |sort -u \ - |fgrep -wf <(zcat int.bed.gz|awk -F"\t" '{if ($5=="DUP" && $3-$2>=1000) print $4}' ) \ - >>multi.cnvs.txt - - -##update copy state which will lead to a new genotype when genotyping is rerun towards end of script ## -echo "">RD_CN.revise.txt - -while read id -do - echo $id - zcat all.combined.bed.gz \ - |awk -v id=$id '{if ($6==id) print $0 "\t" $4"@"$10}' \ - >overlap.bed.ids.txt - - while read bed - do - compareID=$(echo $bed |awk '{print $NF}') - id1=$(echo $bed |awk '{print $4"@"$6}') - id2=$(echo $bed |awk '{print $10"@"$12}') - vID1=$(echo $bed |awk '{print $4}') - vID2=$(echo $bed |awk '{print $10}') - svtype1=$(echo $bed |awk '{print $5}') - svtype2=$(echo $bed |awk '{print $11}') - support1=$(echo $bed |awk '{print $13}') - support2=$(echo $bed |awk '{print $19}') - length1=$(echo $bed|awk '{print $3-$2}') - length2=$(echo $bed|awk '{print $9-$8}') - RD_CN1=$(echo $bed|awk '{print $14}') - RD_CN2=$(echo $bed|awk '{print $20}') - PE_GT1=$(echo $bed|awk '{print $15}') - PE_GT2=$(echo $bed|awk '{print $21}') - PE_GQ1=$(echo $bed|awk '{print $16}') - PE_GQ2=$(echo $bed|awk '{print $22}') - SR_GT1=$(echo $bed|awk '{print $17}') - SR_GT2=$(echo $bed|awk '{print $23}') - SR_GQ1=$(echo $bed|awk '{print $18}') - SR_GQ2=$(echo $bed|awk '{print $24}') - - echo $bed|tr ' ' '\t'|cut -f1-6 >large.bed - echo $bed|tr ' ' '\t'|cut -f7-12>small.bed - overlap=$(bedtools coverage -a small.bed -b large.bed|awk '{if ($NF>0.50) print "YES";else print "NO"}') - - ##remove any large CNV comparisons that have been revised to normal copy state of 2## - awk '{if ($2==2) print $1}' RD_CN.revise.txt>depthnormal.exclude.txt - - if [ $(fgrep -w $id1 depthnormal.exclude.txt |wc -l) -eq 0 ] - then - ##classification## - ##Call where smaller depth call is being driven by larger## - if [[ $support1 =~ "RD" ]] && [[ $support2 = "RD" ]] && [ "$overlap" == "YES" ] && [[ $support1 != "RD" ]] && [ $(fgrep -w $vID1 multi.cnvs.txt |wc -l) -eq 0 ] - then 
- echo $bed \ - |awk -v id2=$id2 -v svtype1=$svtype1 -v RD_CN1=$RD_CN1 -v RD_CN2=$RD_CN2 '{if (RD_CN1==1) print id2 "\t" RD_CN2+RD_CN1 ; \ - else if(RD_CN1>1) print id2 "\t" RD_CN2-(RD_CN1-2) }' \ - >>RD_CN.revise.txt - ##Smaller CNV driving larger CNV genotype## - elif [[ $support1 = "RD" ]] && [[ $support2 =~ "RD" ]] && [ "$overlap" == "YES" ] && [[ $support2 != "RD" ]] && [ $(fgrep -w $vID2 multi.cnvs.txt |wc -l) -eq 0 ] - then - echo $bed \ - |awk -v id1=$id1 -v svtype1=$svtype1 -v RD_CN1=$RD_CN1 -v RD_CN2=$RD_CN2 '{if (RD_CN2==1) print id1 "\t" RD_CN1+RD_CN2 ; \ - else if(RD_CN2>1) print id1 "\t" RD_CN1-(RD_CN2-2) }' \ - >>RD_CN.revise.txt - ##Depth only calls where smaller call is being driven by larger## - elif [[ $support1 = "RD" ]] && [[ $support2 = "RD" ]] && [ "$overlap" == "YES" ] && [ "$svtype1" == "$svtype2" ] && [ $(fgrep -w $vID1 multi.cnvs.txt |wc -l) -eq 0 ] - then - echo $bed \ - |awk -v id2=$id2 -v svtype1=$svtype1 -v RD_CN1=$RD_CN1 -v RD_CN2=$RD_CN2 '{if (RD_CN1==1 && RD_CN1>RD_CN2 ) print id2 "\t" 1; \ - else if (RD_CN1>1 && RD_CN1>RD_CN.revise.txt - ##Any other time a larger call is driving a smaller call## - elif [[ $support1 =~ "RD" ]] && [ "$overlap" == "YES" ] && [ $length2 -gt 5000 ] && [ $(fgrep -w $vID1 multi.cnvs.txt |wc -l) -eq 0 ] - then - echo $bed \ - |awk -v id2=$id2 -v svtype1=$svtype1 -v RD_CN1=$RD_CN1 -v RD_CN2=$RD_CN2 '{if (RD_CN1==1) print id2 "\t" RD_CN2+RD_CN1 ; \ - else if(RD_CN1>1) print id2 "\t" RD_CN2-(RD_CN1-2) }' \ - >>RD_CN.revise.txt - fi - fi -doneRD_CN.revise.forgeno.txt - -##Determine columns of VCF after header## -zcat normal.revise.vcf.gz\ - |egrep ^# \ - |tail -n 1 \ - |tr '\t' '\n' \ - |cat -n - \ - >col.txt - - -##seed the vcf lines file which will provide the revisions to vcf file## -echo "">revise.vcf.lines.txt - - -##pull out and revise vcf line that needs to be edited## -while read line -do - id=$(echo $line|awk '{print $2}' ) - col=$(fgrep -w $id col.txt|awk '{print $1}') - variant=$(echo $line|awk 
'{print $1}') - cn=$(echo $line|awk '{print $3}') - - zcat normal.revise.vcf.gz |fgrep -w $variant >line.txt - - echo $variant $id - ##Updated genotype and rebuild Format field ## - GT=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $1}') - GQ=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $2}') - RD_CN=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $3}') - RD_GQ=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $4}') - PE_GT=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $5}') - PE_GQ=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $6}') - SR_GT=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $7}') - SR_GQ=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $8}') - EV=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $9}') - - if [ $(cat revise.vcf.lines.txt|fgrep -w $variant|wc -l) -gt 0 ] - then - cat revise.vcf.lines.txt \ - |awk -v col=$col -v var=$variant -v GT=$GT -v GQ=$GQ -v RD_CN=$cn -v RD_GQ=$RD_GQ -v PE_GT=$PE_GT -v PE_GQ=$PE_GQ -v SR_GT=$SR_GT -v SR_GQ=$SR_GT -v EV=$EV '{if ($3==var ) $col="0/1:"GQ":"RD_CN":"RD_GQ":"PE_GT":"PE_GQ":"SR_GT":"SR_GQ":"EV ;print}' \ - >int.lines.txt - - cat int.lines.txt > revise.vcf.lines.txt - - else - cat line.txt \ - |awk -v col=$col -v var=$variant -v GT=$GT -v GQ=$GQ -v RD_CN=$cn -v RD_GQ=$RD_GQ -v PE_GT=$PE_GT -v PE_GQ=$PE_GQ -v SR_GT=$SR_GT -v SR_GQ=$SR_GT -v EV=$EV '{if ($3==var ) $col="0/1:"GQ":"RD_CN":"RD_GQ":"PE_GT":"PE_GQ":"SR_GT":"SR_GQ":"EV ;print}' \ - >>revise.vcf.lines.txt - fi - -doneoverlap.revise.vcf.gz - -##multi check## -zcat overlap.revise.vcf.gz \ - |awk '{if ($1!~"#") sub($1,$3);print}' \ - |vcftools --vcf - --stdout --extract-FORMAT-info RD_CN \ - |gzip \ - >copystate.RD_CN.FORMAT.gz - -zcat copystate.RD_CN.FORMAT.gz \ - |awk 'NR>1{for(i=3;i<=NF;i++) lines[$1 "\t" $i]++ } END{for (x in lines) print x}' \ - |gzip \ - >copystate.per.variant.txt.gz - -##Copy state just del and dup ; CNV >1kb we 
trust depth ## -zcat copystate.per.variant.txt.gz \ - |awk '{if ($2!="." && $2>2) print $1}' \ - |sort -u \ - |fgrep -wf <(zcat int.bed.gz|awk -F"\t" '{if ($5=="DEL" && $3-$2>=1000) print $4}' ) \ - |gzip \ - >multi.del.ids.txt.gz - -zcat copystate.per.variant.txt.gz \ - |awk '{if ($2!="." && ($2<2 || $2>4)) print $1}' \ - |sort -u \ - |fgrep -wf <(zcat int.bed.gz|awk -F"\t" '{if ($5=="DUP" && $3-$2>=1000) print $4}' ) \ - |gzip \ - >multi.dup.ids.txt.gz - -##Regenotype to determine multiallelic## -##Genotype big dup## -svtk vcf2bed overlap.revise.vcf.gz stdout \ - |gzip>regeno.bed.gz - -##generate list## -##CNV >5kb, split del and dup ## -## ## -zcat regeno.bed.gz \ - |awk '{if ($3-$2>=5000 && $5=="DUP")print $4}' \ - |fgrep -wvf <(zcat multi.dup.ids.txt.gz) \ - >gt5kb.dup.ids.txt - -zcat regeno.bed.gz \ - |awk '{if ($3-$2>=5000 && $5=="DEL")print $4}' \ - |fgrep -wvf <(zcat multi.del.ids.txt.gz) \ - >gt5kb.del.ids.txt - -end=$(zcat overlap.revise.vcf.gz|awk '{if ($1!~"#") print}'|head -n 1 |awk -F'[:\t]' '{print NF}' ) - -zcat overlap.revise.vcf.gz \ - |fgrep -wf gt5kb.dup.ids.txt \ - >dup.int.txt - -zcat overlap.revise.vcf.gz \ - |fgrep -wf gt5kb.del.ids.txt \ - >del.int.txt - -##regenotype VCF## -for ((i=18;i<=$end;i+=9)) -do - echo $i - cat dup.int.txt \ - |awk -F'[:\t]' -v i=$i '{if ($(i+2)==2) sub($i,"0/0"); \ - else if ($(i+2)==3) sub($i,"0/1"); \ - else sub($i,"1/1");print}' \ - >dup.revise.txt - - cat del.int.txt \ - |awk -F'[:\t]' -v i=$i '{if ($(i+2)==2) sub($i,"0/0"); \ - else if ($(i+2)==1) sub($i,"0/1"); \ - else sub($i,"1/1");print}' \ - >del.revise.txt - - cat dup.revise.txt>dup.int.txt - cat del.revise.txt>del.int.txt -done - -cat <(zcat overlap.revise.vcf.gz|fgrep -wvf <(cat gt5kb.dup.ids.txt gt5kb.del.ids.txt)) \ - <(cat dup.revise.txt del.revise.txt) \ - |vcf-sort \ - |bgzip \ - >newdepth.geno.vcf.gz - - -##Tag VCF## -##find individual level metrics to determine multi allelic by PE/SR genotypes## - -for var in PE_GT SR_GT PE_GQ SR_GQ -do - zcat 
newdepth.geno.vcf.gz \ - |awk '{if ($1!~"#") sub($1,$3);print}' \ - |vcftools --vcf - --stdout --extract-FORMAT-info ${var} \ - |awk -F"\t" 'NR==1{for (i=3;i<=NF;i++) header[i]=$i} NR>1{for(j=3;j<=NF;j++) print $1"@"header[j] "\t" $j }' \ - |sort -k1,1 \ - |gzip \ - >multicheck.${var}.FORMAT.gz -done - -##concatenate metrics## -join -j 1 <(zcat multicheck.PE_GT.FORMAT.gz) \ - <(zcat multicheck.PE_GQ.FORMAT.gz) \ - |join -j 1 - <(zcat multicheck.SR_GT.FORMAT.gz) \ - |join -j 1 - <(zcat multicheck.SR_GQ.FORMAT.gz) \ - |tr ' ' '\t' \ - |gzip \ - >multi.combined.format.gz - - -##check by genotype## -zcat multi.combined.format.gz \ - |awk '{if ($2>0 && $4==0) print $1"\t" $2; \ - else if ($2==0) print $1 "\t" $4; \ - else if ($3>=$5)print $1"\t" $2; \ - else print $1"\t" $4 }' \ - |tr '@' '\t' \ - |awk '{if ($3>2 && $2!=".") print $1}' \ - |sort -u \ - |gzip \ - >multi.geno.ids.txt.gz - -##Tag multi## -zcat newdepth.geno.vcf.gz \ - |awk 'NR==FNR{inFileA[$1]; next} {if ($3 in inFileA && $1!~"#") sub($7,"MULTIALLELIC"); print }' OFS='\t' \ - <(zcat multi.del.ids.txt.gz multi.dup.ids.txt.gz multi.geno.ids.txt.gz|sort -u) - \ - |bgzip \ - >multitagged.vcf.gz - -###genotype multiallelics## -##pull out multiallelic lines of vcf### -zcat multitagged.vcf.gz \ - |fgrep -wf <(zcat multi.geno.ids.txt.gz) \ - >multi.gt.int.txt - -zcat multitagged.vcf.gz \ - |fgrep -wf <(zcat multi.dup.ids.txt.gz) \ - >multi.dup.int.txt - -zcat multitagged.vcf.gz \ - |fgrep -wf <(zcat multi.del.ids.txt.gz) \ - >multi.del.int.txt - -end=$(zcat multitagged.vcf.gz|awk '{if ($1!~"#") print}'|head -n 1 |awk -F'[:\t]' '{print NF}' ) - -for ((i=18;i<=$end;i+=9)) -do - echo $i - cat multi.dup.int.txt \ - |awk -F'[:\t]' -v i=$i '{sub($i,"./"$(i+2));print}' \ - >dup.multi.revise.txt - - cat multi.del.int.txt \ - |awk -F'[:\t]' -v i=$i '{sub($i,"./"$(i+2));print}' \ - >del.multi.revise.txt - - cat multi.gt.int.txt \ - |awk -F'[:\t]' -v i=$i '{if ($(i+4)>0 && $(i+6)==0) sub($i,"./"$(i+4)); \ - else if 
($(i+4)==0) sub($i,"./"$(i+6)); \ - else if ($(i+5)>=$(i+7)) sub($i,"./"$(i+4)); \ - else sub($i,"./"$(i+6)) ;print }' \ - >gt.multi.revise.txt - - cat dup.multi.revise.txt>multi.dup.int.txt - cat del.multi.revise.txt>multi.del.int.txt - cat gt.multi.revise.txt>multi.gt.int.txt -done - -##remove overlapping multi### -zcat multitagged.vcf.gz \ - |awk '{if ($1~"#" || ($7=="MULTIALLELIC" && ($5=="" || $5==""))) print}' \ - |svtk vcf2bed stdin stdout \ - |gzip \ - >multi.bed.gz - -##strip out overlapping multiallelics## -bedtools intersect -wa -wb -a multi.bed.gz -b multi.bed.gz \ - |awk '{if ($4!=$10 && $3-$2>=$9-$8) print $0; \ - else if ($4!=$10) print $7,$8,$9,$10,$11,$12,$1,$2,$3,$4,$5,$6}' \ - |tr ' ' '\t' \ - |sort -u \ - |awk '{print $3-$2,$9-$8,$0}' \ - |tr ' ' '\t' \ - |sort -nrk1,1 -k2,2nr \ - |cut -f3- \ - |gzip \ - >multi.bed.overlap.txt.gz - -echo "">multi.remove.txt - -while read bed -do - echo $bed|tr ' ' '\t'|cut -f1-6 >large.bed - echo $bed|tr ' ' '\t'|cut -f7-12>small.bed - overlap=$(bedtools coverage -a small.bed -b large.bed|awk '{if ($NF>0.50) print "YES";else print "NO"}') - - if [ "$overlap" == "YES" ] && [ $(awk '{print $4}' large.bed|fgrep -wf - multi.remove.txt|wc -l) -eq 0 ] - then - awk '{print $4}' small.bed >>multi.remove.txt - fi -done< <(zcat multi.bed.overlap.txt.gz) - - -##strip out variants with no genotypes and overlapping multiallelics## -### Find missing genotype and then add multiallelics that need to be removed### - -svtk vcf2bed multitagged.vcf.gz stdout \ - |awk -F'\t' '{if ($6=="") print $4}' \ - |cat - multi.remove.txt \ - |sed '/^$/d' \ - |fgrep -wvf - <(zcat multitagged.vcf.gz) \ - |gzip \ - >cleantagandmulti.vcf.gz - -##Fix header## -##get header to clean## -##add new filters## -zcat cleantagandmulti.vcf.gz \ - |awk '{if (NR==2) print $0 "\n" "##FILTER=" ;else print}' \ - |awk '{if ($1~"##" && NR>1) print}' \ - |sort -k1,1 \ - |egrep -v 
"CIPOS|CIEND|RMSSTD|MEMBERS|UNRESOLVED|source|MULTIALLELIC|varGQ|bcftools|ALT=polished.vcf.gz - diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1.sh b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1.sh deleted file mode 100755 index c1798b378..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1.sh +++ /dev/null @@ -1,268 +0,0 @@ -#!/bin/bash -# -# clean_VCF.sh -# - -##requires >= vcftools/0.1.15 ## -##requires >= bcftools/1.9 ## - -set -euxo pipefail - -# use BCFTOOLS 1.9, installed in /usr/local/bin/bcftools in our docker -BCFTOOLS=/usr/local/bin/bcftools - -##gzipped vcf## -vcf=$1 -backgroundlist=$2 -famfile=$3 -allosome_fai=$4 - -##get sampleids from VCF## -zcat $vcf \ - |sed -n '1,1000p' \ - |egrep "^#" \ - |tail -n -1 \ - |cut -f10- \ - |tr '\t' '\n' \ - > includelist.txt - -##convert EV integer back into string## -/opt/sv-pipeline/04_variant_resolution/scripts/replace_ev_numeric_code_with_string.py ${vcf} - | bgzip -c > EV.update.vcf.gz -rm $vcf - -##convert all alt to svtype and alt to N## -svtk vcf2bed EV.update.vcf.gz stdout -i SVTYPE \ - |awk -F"\t" '{ if ($5!~"ME")$5=$7; print $4"\t" "<"$5 ">"}' \ - |gzip \ - >vcf.convert.svtype.gz - -zcat EV.update.vcf.gz \ - |awk 'NR==FNR{inFileA[$1]=$2; next} {if ($3 in inFileA && $1!~"#") $5=inFileA[$3]; print }' OFS='\t' \ - <(zcat vcf.convert.svtype.gz) - \ - |awk '{if ($1!~"#") $4="N"; print}' OFS='\t' \ - |bgzip \ - >convertsvtype.vcf.gz - -##get rid of multiallelic tage in INFO field and add varGQ to QUAL column and Members field## -svtk vcf2bed convertsvtype.vcf.gz stdout -i varGQ \ - |awk -F"\t" '{print $4 "\t" $7}' \ - >vargq.persample - -zcat convertsvtype.vcf.gz \ - |sed 's/;MULTIALLELIC//g' \ - |sed 's/UNRESOLVED;//g' \ - |sed 's/;varGQ=[0-9]*//g' \ - |awk 'NR==FNR{inFileA[$1]=$2; next} {if ($3 in inFileA && $1!~"#") $6=inFileA[$3]; print }' OFS='\t' vargq.persample - \ - |bgzip \ - >cleaninfo.vcf.gz - tabix -p vcf cleaninfo.vcf.gz - - -##fix 
sex chr if necessary## -if [ $(zcat cleaninfo.vcf.gz|awk '{if (($1~"X" || $1~"Y") && $1!~"#" ) print}'|wc -l) -gt 0 ] -then - - -svtk vcf2bed cleaninfo.vcf.gz stdout \ - |awk '{if (($5=="DEL" || $5=="DUP") && $3-$2>=5000 && ($1~"X" || $1~"Y") && $1!~"#") print}' \ - >clean.bed || true - -awk '{print $4}' clean.bed>clean.bed.ids.txt - - -##male## -awk '{if ($5==1) print $2}' $famfile \ - |fgrep -wf <(zcat cleaninfo.vcf.gz|head -n 1000|fgrep "CHROM"|fgrep POS|cut -f10-|tr '\t' '\n') >male.txt - -##female## -awk '{if ($5==2) print $2}' $famfile \ - |fgrep -wf <(zcat cleaninfo.vcf.gz|head -n 1000|fgrep "CHROM"|fgrep POS|cut -f10-|tr '\t' '\n') >female.txt - - if [ $(cat clean.bed.ids.txt|wc -l) -gt 0 ] - then - - awk '{print $1"\t0\t"$2}' < ${allosome_fai} > allosomes.list - ${BCFTOOLS} query -R allosomes.list -S male.txt -i 'ID=@clean.bed.ids.txt' -f '[%ID\t%SAMPLE\t%RD_CN\n]' cleaninfo.vcf.gz \ - | awk '{if ($3!=".") print}' \ - | gzip > RD_CN.sexcheck.FORMAT.male.gz - - ${BCFTOOLS} query -R allosomes.list -S female.txt -i 'ID=@clean.bed.ids.txt' -f '[%ID\t%SAMPLE\t%RD_CN\n]' cleaninfo.vcf.gz \ - | awk '{if ($3!=".") print}' \ - | gzip > RD_CN.sexcheck.FORMAT.female.gz - - zcat RD_CN.sexcheck.FORMAT.male.gz| Rscript -e 'd<-read.table("stdin")' \ - -e 'x<-tapply(d[,3],d[,1],median)' \ - -e 'write.table(x,"male.median.value.pervar.txt",col.names=FALSE,quote=FALSE,sep = "\t")' - - zcat RD_CN.sexcheck.FORMAT.female.gz| Rscript -e 'd<-read.table("stdin")' \ - -e 'x<-tapply(d[,3],d[,1],median)' \ - -e 'write.table(x,"female.median.value.pervar.txt",col.names=FALSE,quote=FALSE,sep = "\t")' - fi -##Pull out ids where male copy state 1 to normal when female normal and on X## - echo "">sexchr.revise.txt - - if [ $(awk '{if (($5=="DEL" || $5=="DUP") && $3-$2>=5000) print }' clean.bed|awk '{if (($1~"X") && $1!~"#" ) print}'|wc -l) -gt 0 ] - then - awk '{if ($2==1) print $1}' male.median.value.pervar.txt \ - |fgrep -wf <(awk '{if ($2==2) print $1}' female.median.value.pervar.txt) 
\ - |fgrep -wf - <(zcat cleaninfo.vcf.gz|awk '{if ($1~"X" && $1!~"#") print $3}') \ - >sexchr.revise.txt || true - fi - - if [ $(awk '{if (($5=="DEL" || $5=="DUP") && $3-$2>=5000) print }' clean.bed|awk '{if (($1~"Y") && $1!~"#" ) print}'|wc -l) -gt 0 ] - then - awk '{if ($2==1) print $1}' male.median.value.pervar.txt \ - |fgrep -wf <(awk '{if ($2==0) print $1}' female.median.value.pervar.txt) \ - |fgrep -wf - <(zcat cleaninfo.vcf.gz|awk '{if ($1~"Y" && $1!~"#") print $3}') \ - >>sexchr.revise.txt || true - fi - - -${BCFTOOLS} index cleaninfo.vcf.gz - -##Pull out male and females sex chr## -${BCFTOOLS} view cleaninfo.vcf.gz -S male.txt -r chrX:1-1000000000,chrY:1-1000000000,X:1-1000000000,Y:1-1000000000 --no-update|bgzip>male.vcf.gz -${BCFTOOLS} view cleaninfo.vcf.gz -S female.txt -r chrX:1-1000000000,chrY:1-1000000000,X:1-1000000000,Y:1-1000000000 --no-update|bgzip>female.vcf.gz - -${BCFTOOLS} index male.vcf.gz -${BCFTOOLS} index female.vcf.gz - -zcat male.vcf.gz\ - |awk -F'\t' '{if ($5~"DEL" && $1!~"#") print $0 "\t" "ENDOFLINE"}' \ - |fgrep -wf sexchr.revise.txt \ - |tr '\t' '\n' \ - |awk -F':' '{if ($3>=1 && NF>4 && $1!="GT") $1="0/0";else if ($3==0 && NF>4 && $1!="GT" ) $1="0/1"; if (NF>4 && $1!="GT") $3=$3+1;print}' OFS=":" \ - |tr '\n' '\t' \ - |sed 's/ENDOFLINE/\n/g' \ - |sed -e 's/^[ \t]*//' \ - |sed -e 's/[\t]$//g' \ - |bgzip \ - >male_del.revise.txt.gz ||true - - -zcat male.vcf.gz\ - |awk -F'\t' '{if ($5~"DUP" && $1!~"#") print $0 "\t" "ENDOFLINE"}' \ - |fgrep -wf sexchr.revise.txt \ - |tr '\t' '\n' \ - |awk -F':' '{if ($3<=1 && NF>4 && $1!="GT") $1="0/0";else if ($3==2 && NF>4 && $1!="GT" ) $1="0/1";else if (NF>4 && $1!="GT" ) $1="1/1"; if (NF>4 && $1!="GT" ) $3=$3+1;print}' OFS=":" \ - |tr '\n' '\t' \ - |sed 's/ENDOFLINE/\n/g' \ - |sed -e 's/^[ \t]*//' \ - |sed -e 's/[\t]$//g' \ - |bgzip \ - >male_dup.revise.txt.gz ||true - - if [ $(cat male_dup.revise.txt.gz male_del.revise.txt.gz|wc -l) -gt 0 ] - then - cat <(zcat male.vcf.gz|fgrep -wvf <(zcat 
male_dup.revise.txt.gz male_del.revise.txt.gz|awk '{print $3}' )) \ - <(zcat male_del.revise.txt.gz male_dup.revise.txt.gz|awk '{if ($1!="") print}'|tr ' ' '\t') \ - |vcf-sort \ - |bgzip \ - >cleanmale.vcf.gz - else - cp male.vcf.gz cleanmale.vcf.gz - fi - - ${BCFTOOLS} index cleanmale.vcf.gz - - ##Modify female only for chrY### - if [ $(zcat cleaninfo.vcf.gz |awk '{if ($1~"Y" && $1!~"#") print}'|wc -l) -gt 0 ] - then - zcat female.vcf.gz\ - |awk -F'\t' '{if ($1!~"#" && $1~"Y") print $0 "\t" "ENDOFLINE"}' \ - |tr '\t' '\n' \ - |awk -F':' '{ if (NF>4 && $1!="GT" ) $1="./." \ - ;if (NF>4 && $1!="GT" ) $2=$3=$4=$5=$6=$7=$8=$9=".";print}' OFS=":" \ - |tr '\n' '\t' \ - |sed 's/ENDOFLINE/\n/g' \ - |sed -e 's/^[ \t]*//' \ - |sed -e 's/[\t]$//g' \ - |bgzip \ - >female.y.revise.txt.gz - - cat <(zcat female.vcf.gz \ - |fgrep -wvf <(zcat female.y.revise.txt.gz|awk '{print $3}' )) \ - <(zcat female.y.revise.txt.gz) \ - |vcf-sort \ - |bgzip \ - >cleanfemale.vcf.gz - - ${BCFTOOLS} index cleanfemale.vcf.gz - - else - cp female.vcf.gz cleanfemale.vcf.gz - ${BCFTOOLS} index cleanfemale.vcf.gz - fi - - - ##replace genotype to ./. 
for other sex calls## - ##sex anueplodies ## - - if [ $(awk '{if ($5!=2 && $5!=1) print $2}' $famfile|wc -l) -gt 0 ] - then - awk '{if ($5!=2 && $5!=1) print $2}' $famfile>other.txt - ${BCFTOOLS} view cleaninfo.vcf.gz -S other.txt -r chrX:1-1000000000,chrY:1-1000000000,X:1-1000000000,Y:1-1000000000 --no-update|bgzip>other.vcf.gz - ${BCFTOOLS} index other.vcf.gz - - zcat other.vcf.gz\ - |awk -F'\t' '{if ($1!~"#") print $0 "\t" "ENDOFLINE"}' \ - |tr '\t' '\n' \ - |awk -F':' '{ if (NF>4 && $1!="GT" ) $1="./.";print}' OFS=":" \ - |tr '\n' '\t' \ - |sed 's/ENDOFLINE/\n/g' \ - |sed -e 's/^[ \t]*//' \ - |sed -e 's/[\t]$//g' \ - |bgzip \ - >other.revise.txt.gz - - cat <(zcat other.vcf.gz \ - |fgrep -wvf <(zcat other.revise.txt.gz|awk '{print $3}' )) \ - <(zcat other.revise.txt.gz) \ - |vcf-sort \ - |bgzip \ - >cleanother.vcf.gz - - ${BCFTOOLS} index cleanother.vcf.gz - - cat <(zcat cleanmale.vcf.gz|egrep "##") \ - <(paste <(zcat cleanmale.vcf.gz|egrep -v "##") <(zcat cleanfemale.vcf.gz|cut -f10-|egrep -v "##") <(zcat cleanother.vcf.gz|cut -f10-|egrep -v "##") ) \ - |bgzip \ - >combinedsex.vcf.gz - -else - cat <(zcat cleanmale.vcf.gz|egrep "##") \ - <(paste <(zcat cleanmale.vcf.gz|egrep -v "##") <(zcat cleanfemale.vcf.gz|cut -f10-|egrep -v "##")) \ - |bgzip \ - >combinedsex.vcf.gz -fi - - - - tabix -p vcf combinedsex.vcf.gz - -zcat combinedsex.vcf.gz|awk '{if ($1!~"#") print $3}'>modified.ids.txt - -##shuffle sex ids backinto place to match original vcf and back to initial vcf## - vcf-shuffle-cols -t cleaninfo.vcf.gz combinedsex.vcf.gz \ - |awk '{if ($1!~"#") print}' \ - |cat <(zcat cleaninfo.vcf.gz|fgrep -wvf modified.ids.txt ) - \ - |vcf-sort \ - |bgzip \ - >cleanallo.vcf.gz - -else - cp cleaninfo.vcf.gz cleanallo.vcf.gz - echo "">sexchr.revise.txt -fi - -# the code below will not print any lines if the background list file is empty, so add a dummy sentinel record at the end -cat $backgroundlist <(echo "XXX_SENTINEL_XXX") > background_list_with_sentinel.list - -##change 
tag for SR background failures and Unresolved## -zcat cleanallo.vcf.gz\ - |awk 'NR==FNR{inFileA[$1]; next} {if ($3 in inFileA && $1!~"#") $7=$7";HIGH_SR_BACKGROUND"; print }' OFS='\t' <(awk '{print $NF}' background_list_with_sentinel.list) - \ - |awk '{if (NR==2) print $0 "\n" "##FILTER=" ;else print}' \ - |awk '{if ($8~"UNRESOLVED") $7=$7";UNRESOLVED";print}' OFS='\t' \ - |awk '{if (NR==2) print $0 "\n" "##FILTER=" ;else print}' \ - |bgzip \ - >int.vcf.gz diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b.sh b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b.sh deleted file mode 100755 index 40df71ea7..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b.sh +++ /dev/null @@ -1,166 +0,0 @@ -#!/bin/bash -# -# clean_vcf_part1b.sh -# - -set -euxo pipefail - -##gzipped vcf from clean vcf part1.sh## -int_vcf_gz=$1 - -##Remove CNVs that are improperly genotyped by depth because they are nested within a real CNV## - -##Determine columns of VCF after header## -zcat $int_vcf_gz\ - |sed -n '1,1000p'\ - |egrep ^# \ - |tail -n 1 \ - |tr '\t' '\n' \ - |cat -n - \ - >col.txt - -##Only affects CNV so pull those out## -zcat $int_vcf_gz \ - |awk '{if ($5~"DEL" || $5~"DUP" || $1~"#") print}' \ - |svtk vcf2bed stdin stdout \ - |awk -F"\t" '{if ($6=="") print $6="blanksample";print $0}' OFS='\t' \ - |gzip>int.bed.gz - -##list of potenital overlaps with a normal copy state variant (>5kb variants require depth but nested events could be missed; i.e a duplication with a nest deletion will have a normal copy state for the deletion)## -##flip bed intersect so largest is CNV is always first## -bedtools intersect -wa -wb -a <(zcat int.bed.gz|awk '{if ($3-$2>=5000 ) print}') \ --b <(zcat int.bed.gz|awk '{if ($3-$2>=5000) print}') \ - |awk -F'\t' '{if ($4!=$10 && $3-$2>=$9-$8 && $5!=$11) print ;\ - else if ($4!=$10 && $5!=$11) print $7,$8,$9,$10,$11,$12,$1,$2,$3,$4,$5,$6}' OFS='\t' \ - |awk -F'\t' '{if ($6!="blanksample") print}' 
\ - |sort -u \ - >normaloverlap.txt - - -##pull out the depth based copy number variant for each normal overlapping variant## -{ cat <(zcat $int_vcf_gz|awk -F"\t" '{if ($1~"#") print}') \ - <(awk '{print $4 "\n" $10}' normaloverlap.txt|sort -u|fgrep -wf - <(zcat $int_vcf_gz)) || true; }\ - |awk '{if ($1!~"#") $1=$3;print}' OFS="\t" \ - |awk '{if ($1~"#" || $5=="" || $5=="") print}' \ - |vcftools --vcf - --stdout --extract-FORMAT-info RD_CN \ - |awk -F"\t" 'NR==1{for (i=3;i<=NF;i++) header[i]=$i} NR>1{for(j=3;j<=NF;j++) print $1"@"header[j] "\t" $j }' \ - |sort -k1,1 \ - |gzip \ - >RD_CN.normalcheck.FORMAT.gz - - -##pull out evidence supporting each normal overlapping variant## -{ cat <(zcat $int_vcf_gz|awk -F"\t" '{if ($1~"#") print}') \ - <(awk '{print $4 "\n" $10}' normaloverlap.txt|sort -u|fgrep -wf - <(zcat $int_vcf_gz)) || true; }\ - |awk '{if ($1!~"#") $1=$3;print}' OFS="\t"\ - |vcftools --vcf - --stdout --extract-FORMAT-info EV \ - |awk -F"\t" 'NR==1{for (i=3;i<=NF;i++) header[i]=$i} NR>1{for(j=3;j<=NF;j++) print $1"@"header[j] "\t" $j }' \ - |sort -k1,1 \ - |gzip \ - >EV.normalcheck.FORMAT.gz - - -##check if nested is incorrectly classified as normal## -touch overlap.test.txt -while read bed -do - echo $bed|tr ' ' '\t'|cut -f1-6 >large.bed - echo $bed|tr ' ' '\t'|cut -f7-12>small.bed - ##require at least 50% coverage to consider a variant overlapping## - overlap=$(bedtools coverage -a small.bed -b large.bed|awk '{if ($NF>=0.50) print "YES";else print "NO"}') - - if [ "$overlap" == "YES" ] - then - smallid=$(awk '{print $4}' small.bed) - - ##pull out variants that are called a variants for both the smaller and larger CNVs (don't have normal copy state to check for)## - if [ $(awk '{print $NF}' small.bed \ - |tr ',' '\n' \ - |fgrep -wvf - <(awk -F"[,\t]" -v var=$smallid '{for(i=6;i<=NF;i++) print var"@"$i "\t" $4"@"$i "\t" $5}' large.bed)|wc -l) -gt 0 ] - then - awk '{print $NF}' small.bed \ - |tr ',' '\n' \ - |fgrep -wvf - <(awk -F"[,\t]" -v var=$smallid 
'{for(i=6;i<=NF;i++) print var"@"$i "\t" $4"@"$i "\t" $5}' large.bed) \ - >>overlap.test.txt - fi - fi -donegeno.normal.revise.txt - -##Update genotypes## -{ zfgrep -wf <(awk '{print $1}' geno.normal.revise.txt|sort -u) $int_vcf_gz || true; }\ - |bgzip \ - >subset.vcf.gz || true - -##pull out and revise vcf line that needs to be edited## -while read variant -do - - echo $variant - #note no longer change depth from id.txt (column 2)## - { fgrep $variant geno.normal.revise.txt || true; }|awk '{print $2 "\t" $3}'>id.txt - zcat subset.vcf.gz |{ fgrep -w $variant || true; }>line.txt - - cat line.txt \ - |tr '\t' '\n' \ - |paste col.txt - \ - |tr ':' '\t' \ - |awk 'NR==FNR{inFileA[$1]=$2; next} {if ($2 in inFileA ) $3="0/1"; print }' OFS='\t' id.txt - \ - |awk 'NR==FNR{inFileA[$1]=$2; next} {if ($2 in inFileA ) $4=$6; print }' OFS='\t' id.txt - \ - |cut -f3-|tr '\t' ':' \ - |tr '\n' '\t' \ - |awk '{print $0}' \ - >>normal.revise.vcf.lines.txt - -done< <(awk '{print $1}' geno.normal.revise.txt|sort -u) - - -##rewrite vcf with updated genotypes## - -cat <(zcat $int_vcf_gz|fgrep -wvf <(awk '{print $3}' normal.revise.vcf.lines.txt|sort -u)) \ - <(sed 's/\t$//' normal.revise.vcf.lines.txt) \ - |vcf-sort \ - |bgzip \ - >normal.revise.vcf.gz || true - - bcftools index normal.revise.vcf.gz - -##get copy state per variant## -zcat normal.revise.vcf.gz \ - |awk '{if ($1!~"#") $1=$3;print}' OFS="\t" \ - |vcftools --vcf - --stdout --extract-FORMAT-info RD_CN \ - |gzip \ - >copystate.RD_CN.FORMAT.gz - -##get copy state per variant## -zcat copystate.RD_CN.FORMAT.gz \ - |awk 'NR>1{for(i=3;i<=NF;i++) lines[$1 "\t" $i]++ } END{for (x in lines) print x}' \ - |gzip \ - >copystate.per.variant.txt.gz - -##Find multi-allelic for del or dup ; CNV >1kb we trust depth ## -##del## -zcat copystate.per.variant.txt.gz \ - |awk '{if ($2!="." 
&& $2>3) print $1}' \ - |sort -u \ - |fgrep -wf <(zcat int.bed.gz|awk -F"\t" '{if ($5=="DEL" && $3-$2>=1000) print $4}' ) \ - >multi.cnvs.txt || true - -##dup## -zcat copystate.per.variant.txt.gz \ - |awk '{if ($2!="." && ($2<1 || $2>4)) print $1}' \ - |sort -u \ - |fgrep -wf <(zcat int.bed.gz|awk -F"\t" '{if ($5=="DUP" && $3-$2>=1000) print $4}' ) \ - >>multi.cnvs.txt || true diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_build_dict.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_build_dict.py new file mode 100644 index 000000000..b7da153cb --- /dev/null +++ b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_build_dict.py @@ -0,0 +1,154 @@ +""" +Remove CNVs that are improperly genotyped by depth because they are nested +within a real CNV +""" + +import logging +import pybedtools +import pysam +import sys +import json + +from collections import defaultdict + +SVTYPE = "SVTYPE" +BLANK_SAMPLES = "blanksample" + + +class SVType: + DUP = "DUP" + DEL = "DEL" + + +class VariantFormatTypes: + # Predicted copy state + RD_CN = "RD_CN" + # Classes of evidence supporting final genotype + EV = "EV" + + +class VCFReviser: + def __init__(self): + self.rd_cn = {} + self.sample_indices_dict = {} + self.sample_list = [] + + def _update_rd_cn(self, variant, sample_indices): + self.rd_cn[variant.id] = {s: variant.samples[s][VariantFormatTypes.RD_CN] for s in sample_indices} + + @staticmethod + def get_wider(f): + # f[1] : first interval start + # f[2] : first interval end + # f[7] : second interval start + # f[8] : second interval end + if int(f[2]) - int(f[1]) >= int(f[8]) - int(f[7]): + return f[0:6], f[6:12] + else: + return f[6:12], f[0:6] + + @staticmethod + def get_coverage(wider, narrower): + n_start = int(narrower[1]) + n_stop = int(narrower[2]) + w_start = int(wider[1]) + w_stop = int(wider[2]) + + coverage = 0 + if w_start <= n_stop and n_start <= w_stop: + intersection_size = min(n_stop, w_stop) - max(n_start, 
w_start) + coverage = intersection_size / (n_stop - n_start) + return coverage + + def get_geno_normal_revise(self, vcf_file, bed_file): + overlap_test_text = defaultdict(dict) + with pysam.VariantFile(vcf_file, "r") as f: + header = f.header + i = -1 + for sample in header.samples: + i += 1 + self.sample_indices_dict[sample] = i + self.sample_list.append(sample) + + logging.info("Filtering intersect results") + bed = pybedtools.BedTool(bed_file) + for interval in bed.intervals: + wider, narrower = self.get_wider(interval.fields) + # wider and narrower are lists/tuples with the following fields: + # [0] : contig + # [1] : start position + # [2] : end position + # [3] : variant ID + # [4] : SV type + # [5] : comma-delimited sample lists, or BLANK_SAMPLES if none + if wider[5] == BLANK_SAMPLES: + continue + + coverage = self.get_coverage(wider, narrower) + if coverage >= 0.5: + wider_samples = set(wider[5].split(",")) + narrower_samples = set(narrower[5].split(",")) + non_common_samples = [self.sample_indices_dict[s] for s in wider_samples - narrower_samples] + for x in non_common_samples: + vid = narrower[3] + overlap_test_text[vid][x] = (wider[3], wider[4]) + + # Determine for which vid/sample pairs we need RD_CN + # Substantially reduces memory + logging.info('Getting revised variant IDs') + revise_vids = defaultdict(set) + for var_id, samples_dict in overlap_test_text.items(): + for sample_index, v in samples_dict.items(): + # v[0] : variant ID + # v[1] : SV type + if v[1] == SVType.DUP or v[1] == SVType.DEL: + revise_vids[var_id].add(sample_index) + revise_vids[v[0]].add(sample_index) + + logging.info('Getting RD_CN/EV') + for variant in f: + if variant.id in revise_vids: + sample_indices = revise_vids[variant.id] + self._update_rd_cn(variant, sample_indices) + + logging.info('Generating geno_normal_revise_dict') + geno_normal_revise_dict = {} + for var_id, samples_dict in overlap_test_text.items(): + for sample_index, v in samples_dict.items(): + # v[0] : 
variant ID + # v[1] : SV type + new_val = None + if sample_index not in revise_vids[v[0]]: + sys.stderr.write("{} {}\n".format(sample_index, v[0])) + if v[1] == SVType.DUP and \ + self.rd_cn[var_id][sample_index] == 2 and \ + self.rd_cn[v[0]][sample_index] == 3: + new_val = 1 + elif v[1] == SVType.DEL and \ + self.rd_cn[var_id][sample_index] == 2 \ + and self.rd_cn[v[0]][sample_index] == 1: + new_val = 3 + + if new_val: + if var_id not in geno_normal_revise_dict: + geno_normal_revise_dict[var_id] = {} + sample_id = self.sample_list[sample_index] + geno_normal_revise_dict[var_id][sample_id] = new_val + + return geno_normal_revise_dict + + +def main(args): + logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) + logging.info('Starting script') + reviser = VCFReviser() + filtered_vcf = args[1] + intersected_bed = args[2] + geno_normal_revise_dict = reviser.get_geno_normal_revise(filtered_vcf, intersected_bed) + logging.info('Dumping dictionary') + sys.stdout.write(json.dumps(geno_normal_revise_dict)) + logging.info('Done') + + +if __name__ == '__main__': + main(sys.argv) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_filter.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_filter.py new file mode 100644 index 000000000..e63b890cd --- /dev/null +++ b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_filter.py @@ -0,0 +1,82 @@ +""" +Remove CNVs that are improperly genotyped by depth because they are nested +within a real CNV +""" + +import os +import logging +import pysam +import sys +from pathlib import Path +import json +import gzip + +SVTYPE = "SVTYPE" +BLANK_SAMPLES = "B" + + +class SVType: + DUP = "DUP" + DEL = "DEL" + + +class VariantFormatTypes: + # Predicted copy state + RD_CN = "RD_CN" + # Classes of evidence supporting final genotype + EV = "EV" + + +def modify_variants(dict_file_gz, vcf, multi_cnvs): + logging.info('Loading dictionary') + with gzip.open(dict_file_gz, 'rt') as 
f: + geno_normal_revise_dict = json.load(f) + + logging.info('Filtering variants') + with pysam.VariantFile(vcf, "r") as f_in: + header = f_in.header + sys.stdout.write(str(header)) + with open(multi_cnvs, "w") as multi_cnvs_f: + variants = f_in.fetch() + for variant in variants: + if variant.id in geno_normal_revise_dict: + for sample_id in geno_normal_revise_dict[variant.id]: + o = variant.samples[sample_id] + o.update({"GT": (0, 1)}) + o.update({"GQ": o["RD_GQ"]}) + + if variant.stop - variant.start >= 1000: + if variant.info[SVTYPE] in [SVType.DEL, SVType.DUP]: + is_del = variant.info[SVTYPE] == SVType.DEL + for k, v in variant.samples.items(): + rd_cn = v[VariantFormatTypes.RD_CN] + if rd_cn is None: + continue + if (is_del and rd_cn > 3) or \ + (not is_del and (rd_cn < 1 or rd_cn > 4)): + multi_cnvs_f.write(variant.id + "\n") + break + + sys.stdout.write(str(variant)) + + +def ensure_file(filename): + filename = os.path.join(".", filename) + filename = Path(filename) + if filename.exists(): + os.remove(filename) + return filename.name + + +def main(args): + logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) + logging.info('Starting script') + multi_cnvs_filename = ensure_file("multi.cnvs.txt") + dict_file_gz = args[1] + vcf_file = args[2] + modify_variants(dict_file_gz, vcf_file, multi_cnvs_filename) + logging.info('Done') + + +if __name__ == '__main__': + main(sys.argv) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part3.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part3.py new file mode 100644 index 000000000..86e869e46 --- /dev/null +++ b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part3.py @@ -0,0 +1,58 @@ +#!/bin/python + +import argparse +from collections import defaultdict +from os import mkdir, path + + +def count_variants(infile): + variant_counts = defaultdict(int) + with open(infile, 'r') as IN: + for line in IN: + var_id = line.strip().split('\t')[0] + 
variant_counts[var_id] += 1 + return dict(sorted(variant_counts.items(), key=lambda item: item[1], reverse=True)) + + +def assign_shards(variant_counts, max_samples): + shard_assignments = {} + shard_number = 0 + sample_counter = 0 + first = True + for variant in variant_counts.keys(): + if not first and (sample_counter + variant_counts[variant] > max_samples): + shard_number += 1 + sample_counter = 0 + shard_assignments[variant] = shard_number + sample_counter += variant_counts[variant] + first = False + return shard_number, shard_assignments + + +def create_shards(infile, shard_assignments, num_shards): + if not path.isdir("./shards"): + mkdir("./shards") + with open(infile, 'r') as IN: + for line in IN: + var_id = line.strip().split('\t')[0] + shard = shard_assignments[var_id] + shard_file = f"shards/out.{shard}_{num_shards}.txt" + with open(shard_file, 'a') as OUT: + OUT.write(line) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("combined_file", help="rd_cn_revise file with variant ID, sample ID, and CN columns") + parser.add_argument("-s", "--max-samples", + help="Maximum number of variant x sample entries in a shard (default = 7,000)", + default=7000, type=int) + args = parser.parse_args() + + variant_counts = count_variants(args.combined_file) + num_shards, shard_assignments = assign_shards(variant_counts, args.max_samples) + create_shards(args.combined_file, shard_assignments, num_shards) + + +if __name__ == "__main__": + main() diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part3.sh b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part3.sh deleted file mode 100755 index d2defd28b..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part3.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/bin/bash -# -# clean_VCF_part3.sh -# - -set -euo pipefail - -combined_file=$1 - -awk '{print $1}' $combined_file \ - | sort \ - | uniq -c \ - | sort -nrk1,1 \ - > variant.count.txt - -final=0 -prev=0 -var=0 
- -while read line -do - i=$(echo $line|awk -v prev=$prev '{print $1+prev}' ) - let "var=$var+1" - if [ $i -gt 5000 ] || [ $var -gt 100 ] - then - final=$(echo $final|awk '{print $1+1}') - prev=0 - var=0 - else - prev=$i - fi -done < variant.count.txt - -j=0 -prev=0 -mkdir shards - -while read line -do - i=$(echo $line|awk -v prev=$prev '{print $1+prev}' ) - let "var=$var+1" - if [ $i -gt 5000 ] || [ $var -gt 100 ] - then - j=$(echo $j|awk '{print $1+1}') - prev=0 - var=0 - else - prev=$i - fi - out=$(echo $j"_"$final) - echo $line|awk '{print $2}'|fgrep -wf - $combined_file >>shards/out.$out.txt || true -done < variant.count.txt diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part4.sh b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part4.sh deleted file mode 100755 index 0b29ae8cc..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part4.sh +++ /dev/null @@ -1,132 +0,0 @@ -#!/bin/bash -# -# clean_VCF_part4.sh -# - -set -euxo pipefail - -##gzipped combined bed file## -##combined output file from clean_vcf_part2.sh## -RD_CN_revise_forgeno=$1 -normal_revise_vcf=$2 - - -##seed the vcf lines file which will provide the revisions to vcf file## -echo "">revise.vcf.lines.txt - - -##reduce vcf to lines for given shard## - cat <(zcat $normal_revise_vcf|sed -n '1,1000p' |egrep ^# ) \ - <(zcat $normal_revise_vcf |fgrep -wf <(awk '{print $1}' $RD_CN_revise_forgeno|sort -u)) \ - |bgzip \ - >int.vcf.gz || true - - -##get column ids## -zcat $normal_revise_vcf \ - |sed -n '1,1000p' \ - |egrep ^# \ - |tail -n 1 \ - |tr '\t' '\n' \ - |cat -n - \ - >col.txt - - -##pull out and revise vcf line that needs to be edited## -while read line -do - id=$(echo $line|awk '{print $2}' ) - col=$(awk -v id=$id '{if($2==id) print $1}' col.txt) - variant=$(echo $line|awk '{print $1}') - cn=$(echo $line|awk '{print $3}') - - zcat int.vcf.gz \ - |{ fgrep -w $variant || true; } \ - >line.txt - - echo $variant $id - ##Updated genotype and rebuild 
Format field ## - GT=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $1}') - GQ=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $2}') - RD_CN=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $3}') - RD_GQ=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $4}') - PE_GT=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $5}') - PE_GQ=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $6}') - SR_GT=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $7}') - SR_GQ=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $8}') - EV=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $9}') - - if [ $(cat revise.vcf.lines.txt|fgrep -w $variant|wc -l) -gt 0 ] - then - cat revise.vcf.lines.txt \ - |awk -v col=$col -v var=$variant -v GT=$GT -v GQ=$GQ -v RD_CN=$cn -v RD_GQ=$RD_GQ -v PE_GT=$PE_GT -v PE_GQ=$PE_GQ -v SR_GT=$SR_GT -v SR_GQ=$SR_GQ -v EV=$EV '{if ($3==var ) $col="0/1:"GQ":"RD_CN":"RD_GQ":"PE_GT":"PE_GQ":"SR_GT":"SR_GQ":"EV ;print}' \ - >int.lines.txt - - cat int.lines.txt > revise.vcf.lines.txt - - else - cat line.txt \ - |awk -v col=$col -v var=$variant -v GT=$GT -v GQ=$GQ -v RD_CN=$cn -v RD_GQ=$RD_GQ -v PE_GT=$PE_GT -v PE_GQ=$PE_GQ -v SR_GT=$SR_GT -v SR_GQ=$SR_GQ -v EV=$EV '{if ($3==var ) $col="0/1:"GQ":"RD_CN":"RD_GQ":"PE_GT":"PE_GQ":"SR_GT":"SR_GQ":"EV ;print}' \ - >>revise.vcf.lines.txt - fi - -done<$RD_CN_revise_forgeno - - -bgzip revise.vcf.lines.txt - - -##get multilallelic genotypes## -##pull out lines for normal vcf for given batch## -total_lines=$(zcat $normal_revise_vcf|egrep -v "^#"|wc -l) -batch=$(ls $RD_CN_revise_forgeno|awk -F'/' '{print $NF}'|awk -F'[._]' '{print $2}'|awk '{if ($1==0) print 1; else print}') -total_batch=$(ls $RD_CN_revise_forgeno|awk -F'/' '{print $NF}'|awk -F'[._]' '{print $3}'|awk '{if ($1==0) print 1; else print}') - -segments=$(echo $total_batch $total_lines|awk '{print $2/$1}') - - cat <(zcat $normal_revise_vcf|sed -n '1,1000p' |egrep ^# ) \ - 
<(zcat $normal_revise_vcf |egrep -v "^#"|awk -v batch=$batch -v segments=$segments '{if (NR<=batch*segments && NR>=((batch-1)*segments) ) print }') \ - |bgzip \ - >split.vcf.gz - - -for var in PE_GT SR_GT PE_GQ SR_GQ -do - zcat split.vcf.gz\ - |awk -F'\t' '{if ($1!~"#") $1=$3;print}' OFS="\t" \ - |vcftools --vcf - --stdout --extract-FORMAT-info ${var} \ - |awk -F"\t" 'NR==1{for (i=3;i<=NF;i++) header[i]=$i} NR>1{for(j=3;j<=NF;j++) print $1"@"header[j] "\t" $j }' \ - |sort -k1,1 \ - |gzip \ - >multicheck.${var}.FORMAT.gz -done - -##concatenate metrics## -# get each line formatted as SITE@SAMPLE PE_GT PE_GQ SR_GT SR_GQ -join -j 1 <(zcat multicheck.PE_GT.FORMAT.gz) \ - <(zcat multicheck.PE_GQ.FORMAT.gz) \ - |join -j 1 - <(zcat multicheck.SR_GT.FORMAT.gz) \ - |join -j 1 - <(zcat multicheck.SR_GQ.FORMAT.gz) \ - |tr ' ' '\t' \ - |gzip \ - >multi.combined.format.gz - -# Set the maximum allowable number of samples with a PE or SR GT > 3 to be 1% or 2, whichever is greater -vf_1=$(zcat split.vcf.gz |sed -n '1,1000p' |egrep -v "^##"|cut -f10-|awk 'NR==1{print (NF) * 0.01}' |awk '{if ($1 <= 2) {$1 = 2}; print $1}') - -# Choose the best of PE and SR genotypes for each site / sample -# Count the number of samples with a GT over 3 for each site -# Add site IDs with sample counts over $vf_1 to the multi.geno.ids.txt.gz file -zcat multi.combined.format.gz \ - |awk '{if ($2>0 && $4==0) print $1"\t" $2; \ - else if ($2==0) print $1 "\t" $4; \ - else if ($3>=$5)print $1"\t" $2; \ - else print $1"\t" $4 }' \ - |tr '@' '\t' \ - |awk '{if ($3>2 && $2!=".") print $1}' \ - |sort \ - |uniq -c \ - |awk -v vf_1=$vf_1 '{if ($1>vf_1)print $2}' \ - |gzip \ - >multi.geno.ids.txt.gz - diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5.sh b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5.sh deleted file mode 100755 index 3704e086d..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5.sh +++ /dev/null @@ -1,332 +0,0 @@ -#!/bin/bash -# 
-# clean_VCF_part5.sh -# - -set -euo pipefail - -##gzipped combined bed file## -##combined output file from clean_vcf_part2.sh## -revise_vcf_lines=$1 -normal_revise_vcf=$2 -famfile=$3 -sexchr_revise=$4 -multi_geno_ids_txt=$5 -outliers_samples_list=$6 - -# use BCFTOOLS 1.9 -BCFTOOLS=/usr/local/bin/bcftools - -cat <(zcat $normal_revise_vcf|fgrep -wvf <(zcat $revise_vcf_lines|awk '{if ($1!="") print $3}'|sort -u)) \ - <(zcat $revise_vcf_lines|awk '{if ($1!="") print}' |tr ' ' '\t') \ - |vcf-sort \ - |bgzip \ - >overlap.revise.vcf.gz || true - -##create bed of VCF## -svtk vcf2bed overlap.revise.vcf.gz stdout|gzip> overlap.revise.bed.gz - -##multi check## -zcat overlap.revise.vcf.gz \ - |awk '{if ($1!~"#") $1=$3;print}' OFS="\t" \ - |vcftools --vcf - --remove $outliers_samples_list --stdout --extract-FORMAT-info RD_CN \ - |gzip \ - >copystate.RD_CN.FORMAT.gz - -zcat overlap.revise.vcf.gz \ - |awk '{if ($1!~"#") $1=$3;print}' OFS="\t" \ - |vcftools --vcf - --remove $outliers_samples_list --stdout --extract-FORMAT-info GT \ - |gzip \ - >genotype.gt.FORMAT.gz - -##New method for determining copy state based on >1% of people having an multi-allelic copy state as define above## -vf_1=$(zcat copystate.RD_CN.FORMAT.gz|awk 'NR==1{print (NF-2) * 0.01}'|awk '{if ($1<=1) print 2; else print }' ) - -zcat copystate.RD_CN.FORMAT.gz \ - |fgrep -wf <(zcat overlap.revise.bed.gz|awk -F"\t" '{if ($5=="DEL" && $3-$2>=1000) print $4}' ) \ - |awk 'NR>1{for(i=3;i<=NF;i++) if ($i!="." && $i>3) print $1 }' \ - |sort \ - |uniq -c \ - |awk -v vf_1=$vf_1 '{if ($1>vf_1)print $2}' \ - |gzip \ - >multi.del.ids.txt.gz || true - -zcat copystate.RD_CN.FORMAT.gz \ - |fgrep -wf <(zcat overlap.revise.bed.gz|awk -F"\t" '{if ($5=="DUP" && $3-$2>=1000) print $4}' ) \ - |awk 'NR>1{for(i=3;i<=NF;i++) if ($i!="." 
&& $i>4) print $1 }' \ - |sort \ - |uniq -c \ - |awk -v vf_1=$vf_1 '{if ($1>vf_1)print $2}' \ - >multi.dup.ids.txt || true -##Case with CN 0,1,2,3,4## -zcat copystate.RD_CN.FORMAT.gz \ - |fgrep -wf <(zcat overlap.revise.bed.gz \ - |awk -F"\t" '{if ($5=="DUP" && $3-$2>=1000) print $4}') \ - |awk 'NR>1{for(i=3;i<=NF;i++) if ($i!="." && ($i<1 || $i>4)) print $1 "\t" $i }'\ - |sort -u \ - |awk '{print $1}' \ - |sort \ - |uniq -c \ - |awk '{if ($1>4) print $2}'>gt4copystate.txt ||true -zcat copystate.RD_CN.FORMAT.gz \ - |fgrep -wf <(zcat overlap.revise.bed.gz|awk -F"\t" '{if ($5=="DUP" && $3-$2>=1000) print $4}' ) \ - |awk 'NR>1{for(i=3;i<=NF;i++) if ($i!="." && ($i<1 || $i>4)) print $1 }' \ - |sort \ - |uniq -c \ - |fgrep -wf gt4copystate.txt \ - |awk -v vf_1=$vf_1 '{if ($1>vf_1)print $2}' \ - >>multi.dup.ids.txt || true -sort -u multi.dup.ids.txt|gzip >multi.dup.ids.txt.gz||true - -##Regenotype to determine multiallelic; we just change copy state for some nested variants and we need to make sure we get proper genotype for these; also previous stages have different notaion for multiallelic and we need to make this uniform; this is a CN based regenotyping so restricted to >5kb ## -##Genotype big dup## -svtk vcf2bed overlap.revise.vcf.gz stdout \ - |gzip>regeno.bed.gz - -##add variants that are <5kb because clustering but have a mutliallelic genotype from before## -zcat genotype.gt.FORMAT.gz \ - |awk '{if ($1~"DUP") print}' \ - |awk '{for (i = 3; i <= NF; ++i) print $1 "\t" $i}' \ - |awk '{if ($2!="1/1" && $2!="0/0" && $2!="0/1" && $2!="./.") print $1}' \ - |fgrep -wvf <(zcat multi.dup.ids.txt.gz) \ - |sort -u>gt5kb.dup.ids.txt || true - -zcat genotype.gt.FORMAT.gz \ - |awk '{if ($1~"DEL") print}' \ - |awk '{for (i = 3; i <= NF; ++i) print $1 "\t" $i}' \ - |awk '{if ($2!="1/1" && $2!="0/0" && $2!="0/1" && $2!="./.") print $1}' \ - |fgrep -wvf <(zcat multi.del.ids.txt.gz) \ - |sort -u>gt5kb.del.ids.txt || true - -##generate list## -##CNV >5kb, split del and dup ## -if [ 
-f multi.dup.ids.txt.gz ] -then - zcat regeno.bed.gz \ - |awk '{if ($3-$2>=5000 && $5=="DUP")print $4}' \ - |fgrep -wvf <(zcat multi.dup.ids.txt.gz) \ - >>gt5kb.dup.ids.txt || true -else - zcat regeno.bed.gz \ - |awk '{if ($3-$2>=5000 && $5=="DUP")print $4}' \ - >>gt5kb.dup.ids.txt -fi - -if [ -f multi.del.ids.txt.gz ] -then - zcat regeno.bed.gz \ - |awk '{if ($3-$2>=5000 && $5=="DEL")print $4}' \ - |fgrep -wvf <(zcat multi.del.ids.txt.gz) \ - >>gt5kb.del.ids.txt || true -else - zcat regeno.bed.gz \ - |awk '{if ($3-$2>=5000 && $5=="DEL")print $4}' \ - >>gt5kb.del.ids.txt -fi - - -zcat overlap.revise.vcf.gz \ - |fgrep -wf gt5kb.dup.ids.txt \ - >>dup.int.txt || true - -zcat overlap.revise.vcf.gz \ - |fgrep -wf gt5kb.del.ids.txt \ - >>del.int.txt || true - -##regenotype VCF## -dellen=$(cat del.int.txt|wc -l) -columnlen=$(less del.int.txt|cut -f10-|tr '\t' '\n' |wc -l) -dellenchange=$(echo $dellen $columnlen|awk '{if ($1 == 0) { print "0" } else { print $2/$1}}') - -paste <(less del.int.txt|cut -f1-9) <(less del.int.txt|cut -f10-|tr '\t' '\n' \ - |awk -F':' '{if ($3>=2 && $1!="./.") $1="0/0"; \ - else if ($3==1 && $1!="./.") $1="0/1"; \ - else if ($1!="./.")$1="1/1";print}' OFS=":" \ - |awk -v lenchange=$dellenchange 'NR%lenchange {printf("%s\t", $0); next} \ - {print $0}')>del.revise.txt - -duplen=$(cat dup.int.txt|wc -l) -columnlen=$(less dup.int.txt|cut -f10-|tr '\t' '\n' |wc -l) -duplenchange=$(echo $duplen $columnlen|awk '{if ($1 == 0) { print "0" } else { print $2/$1}}') - - -paste <(less dup.int.txt|cut -f1-9) <(less dup.int.txt|cut -f10-|tr '\t' '\n' \ - |awk -F':' '{if ($3<=2 && $1!="./.") $1="0/0"; \ - else if ($3==3 && $1!="./.") $1="0/1"; \ - else if ($1!="./.") $1="1/1";print}' OFS=":" \ - |awk -v lenchange=$duplenchange 'NR%lenchange {printf("%s\t", $0); next} \ - {print $0}') >dup.revise.txt - - -cat <(zcat overlap.revise.vcf.gz|fgrep -wvf <(cat gt5kb.dup.ids.txt gt5kb.del.ids.txt)) \ - <(cat dup.revise.txt del.revise.txt) \ - |vcf-sort \ - |bgzip \ - 
>newdepth.geno.vcf.gz || true - - -##Tag multi## -##Add filters to header## -zcat newdepth.geno.vcf.gz \ - |awk -F'\t' 'NR==FNR{inFileA[$1]; next} {if ($3 in inFileA && $1!~"#" && $7!~"PESR_GT_OVERDISPERSION") $7=$7";PESR_GT_OVERDISPERSION"; print }' OFS='\t' <(cat <(zcat $multi_geno_ids_txt) <(printf "\n")) - \ - |awk -F'\t' 'NR==FNR{inFileA[$1]; next} {if ($3 in inFileA && $1!~"#") $7=$7";MULTIALLELIC"; print }' OFS='\t' \ - <(cat <(zcat multi.del.ids.txt.gz multi.dup.ids.txt.gz |sort -u) <(printf "\n")) - \ - |sed 's\PASS;\\g' \ - |awk '{if (NR==2) print $0 "\n" "##FILTER=" ;else print}' \ - |awk '{if (NR==2) print $0 "\n" "##FILTER=" ;else print}' \ - |bgzip \ - >multitagged.vcf.gz -tabix multitagged.vcf.gz - -touch all.multi.revised.list - -touch dup.multi.revise.vcf -if [ $(zcat multi.dup.ids.txt.gz|wc -l) -ge 1 ] -then - /opt/sv-pipeline/04_variant_resolution/scripts/reset_multiallelic_format_fields.py multitagged.vcf.gz <(zcat multi.dup.ids.txt.gz) > dup.multi.revise.vcf - ${BCFTOOLS} query -f '%ID\n' dup.multi.revise.vcf >> all.multi.revised.list -fi - -touch del.multi.revise.vcf -if [ $(zcat multi.del.ids.txt.gz|wc -l) -ge 1 ] -then - /opt/sv-pipeline/04_variant_resolution/scripts/reset_multiallelic_format_fields.py multitagged.vcf.gz <(zcat multi.del.ids.txt.gz) > del.multi.revise.vcf - ${BCFTOOLS} query -f '%ID\n' del.multi.revise.vcf >> all.multi.revised.list -fi - -# make sure that the new header includes CN and CNQ format fields if we set any -if [ -s dup.multi.revise.vcf ] -then - grep '^#' dup.multi.revise.vcf > new_header.vcf -elif [ -s del.multi.revise.vcf ] -then - grep '^#' del.multi.revise.vcf > new_header.vcf -else - zcat multitagged.vcf.gz | grep '^#' > new_header.vcf -fi - -# combine the revised variants with the unrevised variants, reheader, resort, and compress - cat <(zcat multitagged.vcf.gz| \ - fgrep -wvf all.multi.revised.list) \ - <(cat del.multi.revise.vcf dup.multi.revise.vcf \ - | grep -v '^#' \ - |awk '!seen[$3]++') \ - 
|${BCFTOOLS} reheader -h new_header.vcf \ - |vcf-sort \ - |bgzip \ - >multitagged.geno.vcf.gz || true - -##remove overlapping multi### -zcat multitagged.vcf.gz \ - |awk -F'\t' '{if ($1~"#" || ($7~"MULTIALLELIC" && ($5=="" || $5==""))) print}' \ - |svtk vcf2bed stdin stdout \ - |cut -f1-5 \ - |gzip \ - >multi.bed.gz - -##strip out overlapping multiallelics## -bedtools intersect -wa -wb -a multi.bed.gz -b multi.bed.gz \ - |awk -F'\t' '{if ($4!=$9 && $3-$2>=$8-$7) print $0; \ - else if ($4!=$9) print $6,$7,$8,$9,$10,$1,$2,$3,$4,$5}' OFS="\t" \ - |sort -u \ - |awk '{print $3-$2,$8-$7,$0}' OFS="\t" \ - |sort -nrk1,1 -k2,2nr \ - |cut -f3- \ - >multi.bed.overlap.txt - -echo "">multi.remove.txt - -while read bed -do - echo "$bed"|cut -d$'\t' -f1-5 >large.bed - echo "$bed"|cut -d$'\t' -f6-10>small.bed - overlap=$(bedtools coverage -a small.bed -b large.bed|awk '{if ($NF>0.50) print "YES";else print "NO"}') - echo $bed|awk '{print $4}' - if [ "$overlap" == "YES" ] && [ $(awk '{print $4}' large.bed|fgrep -wf - multi.remove.txt|wc -l) -eq 0 ] - then - awk '{print $4}' small.bed >>multi.remove.txt - fi -done< multi.bed.overlap.txt - -##get alt tag for multiallelics## -## produces a file with a row for each distinct multialllic variant ID and copy number combination -${BCFTOOLS} query -i 'FILTER = "MULTIALLELIC"' -f '[%ID\t%CN\n]' multitagged.geno.vcf.gz \ - |sort -u >multi.cn.txt - -##strip out variants with no genotypes and overlapping multiallelics## -### Find missing genotype and then add multiallelics that need to be removed### -##change multiallelics svtype into mCNV## -##add CN information to ALT column## -zcat multitagged.geno.vcf.gz \ - |${BCFTOOLS} view -e 'FILTER == "MULTIALLELIC"' \ - |svtk vcf2bed stdin stdout \ - |awk -F'\t' '{if ($6=="") print $4}' \ - |cat - multi.remove.txt \ - |sed '/^$/d' \ - |fgrep -wvf - <(zcat multitagged.geno.vcf.gz ) \ - |awk -F';' '{if ($1~"MULTIALLELIC" && ( $2~"DEL" || $2~"DUP")) $2="SVTYPE=CNV"; print}' OFS=';' \ - |awk '{OFS="\t"; if 
($8~"SVTYPE=CNV;") $5=""; print}' \ - |bgzip \ - >cleantagandmulti.vcf.gz || true - -##add back original CN for sex variants which had to be changed for multiallelic## - -if [ $(zcat cleantagandmulti.vcf.gz|awk '{if (($1~"X" || $1~"Y") && $1!~"#") print}'|wc -l) -gt 0 ] -then -##Determine columns male columns## -zcat cleantagandmulti.vcf.gz\ - |egrep ^# \ - |tail -n 1 \ - |tr '\t' '\n' \ - |cat -n - \ - >col.txt - -awk '{if ($5==1) print $2}' $famfile \ - |fgrep -wf - col.txt \ - >malecols.txt || true - -##regenotype male calls on sex chr and add 1 to copy state for multialleic check## -zcat cleantagandmulti.vcf.gz \ - |fgrep -wf <(grep . $sexchr_revise || true) \ - |awk -v OFS='\t' 'NR == FNR {list[$1]; next} { for (col in list) $col="MALE"$col; print $0 }' malecols.txt - \ - |awk '{print $0 "\t" "ENDOFLINE"}' \ - |tr '\t' '\n' \ - |awk -F':' '{ if ($0!~"SVTYPE" && NF>4 && $1~"MALE" && $1!="GT" && $3-1>=0 && $3!=".") $3=$3-1;print}' OFS=":" \ - |sed 's/^MALE//g' \ - |tr '\n' '\t' \ - |sed 's/ENDOFLINE/\n/g' \ - |sed -e 's/^[ \t]*//' \ - |sed -e 's/[\t]$//g' \ - |bgzip \ - >sexchr.backtoorig.txt.gz || true - -cat <(zcat cleantagandmulti.vcf.gz|fgrep -wvf <(zcat sexchr.backtoorig.txt.gz|awk '{print $3}' )) \ - <(zcat sexchr.backtoorig.txt.gz |awk '{if ($1!="") print}' |tr ' ' '\t') \ - |vcf-sort \ - |bgzip \ - >cleansexCN.vcf.gz || true - -else -cp cleantagandmulti.vcf.gz cleansexCN.vcf.gz - -fi - -mv cleansexCN.vcf.gz cleanGQ.vcf.gz - -##find blank variants with no samples## -svtk vcf2bed cleanGQ.vcf.gz stdout \ - |awk -F'\t' '{if ($5!~"CN" && $6=="") print $4}' \ - >blankcheck.ids.txt - -##Fix header## -##get header to clean## -##add new filters## -zcat cleanGQ.vcf.gz \ - |awk '{if ($1~"##" && NR>1) print}' \ - |fgrep -v "MULTIALLELIC" \ - |awk '{if (NR==2) print $0 "\n" "##FILTER=" ;else print}' \ - |awk '{if (NR==2) print $0 "\n" "##ALT=" ;else print}' \ - |sort -k1,1 \ - |egrep -v "CIPOS|CIEND|RMSSTD|EVENT|INFO=polished.vcf.gz || true diff --git 
a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_find_redundant_multiallelics.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_find_redundant_multiallelics.py new file mode 100755 index 000000000..ad2b744a5 --- /dev/null +++ b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_find_redundant_multiallelics.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python + +import argparse +import sys +import svtk.utils as svu + + +def process_features_for_size1(features_for_size1, redundant_multiallelics): + for intersection in sorted(features_for_size1, key=lambda x: int(x[9]) - int(x[8]), reverse=True): + b_len = int(intersection.fields[9]) - int(intersection.fields[8]) + overlap = int(intersection.fields[14]) + small_coverage = overlap / b_len + if small_coverage > 0.50: + if intersection.fields[3] not in redundant_multiallelics: + redundant_multiallelics.add(intersection.fields[10]) + + +def main(): + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument('multiallelic_filename') + parser.add_argument('fout') + args = parser.parse_args() + + print("finding redundant overlapping sites", file=sys.stderr) + multiallelic_bed = svu.vcf2bedtool(args.multiallelic_filename, include_filters=True) + + redundant_multiallelics = set() + # feature fields: + # [1] : first interval start + # [2] : first interval end + # [3] : first interval variant ID + # [8] : second interval start + # [9] : second interval end + # [10] : second interval variant ID + self_inter = multiallelic_bed.intersect(multiallelic_bed, wo=True)\ + .filter(lambda feature: feature[3] != feature[10]) \ + .filter(lambda feature: (int(feature[2]) - int(feature[1])) >= (int(feature[9]) - int(feature[8]))) \ + .sort(sizeD=True) + current_size1 = -1 + features_for_size1 = [] + for feature in self_inter: + size1 = int(feature[2]) - int(feature[1]) + if size1 != current_size1: + 
process_features_for_size1(features_for_size1, redundant_multiallelics) + features_for_size1 = [] + + current_size1 = size1 + features_for_size1.append(feature) + + process_features_for_size1(features_for_size1, redundant_multiallelics) + print("identified {} redundant multiallelic sites".format(len(redundant_multiallelics)), file=sys.stderr) + with open(args.fout, "w") as list_file: + for vid in redundant_multiallelics: + print(vid, file=list_file) + + +if __name__ == '__main__': + main() diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py new file mode 100755 index 000000000..1e28b90af --- /dev/null +++ b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python + +import argparse +from collections import Counter +import gzip +import pysam +import sys +import svtk.utils as svu + + +def main(): + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument('revise_vcf_lines', type=argparse.FileType('r')) + parser.add_argument('normal_revise_vcf') + parser.add_argument('famfile', type=argparse.FileType('r')) + parser.add_argument('sexchr_revise') + parser.add_argument('multi_geno_ids_txt') + parser.add_argument('outlier_samples_list', type=argparse.FileType('r')) + parser.add_argument('out_prefix') + parser.add_argument('--threads_per_file', required=False, default=2, type=int) + args = parser.parse_args() + + # load the revised lines and index by ID + revised_lines_by_id = {} + with pysam.VariantFile(args.revise_vcf_lines, threads=args.threads_per_file) as revise_vcf: + header2 = revise_vcf.header + revised_lines_by_id = {record.id: record for record in revise_vcf} + print("loaded {} revised lines".format(len(revised_lines_by_id)), file=sys.stderr) + + outlier_samples = set([line.rstrip() for line in 
args.outlier_samples_list if not line.isspace()]) + print("loaded {} outlier samples".format(len(outlier_samples)), file=sys.stderr) + + male_samples = set() + for line in args.famfile: + if line.isspace(): + continue + fields = line.rstrip().split("\t") + if fields[4] == '1': + male_samples.add(fields[1]) + print("identified {} male samples".format(len(male_samples)), file=sys.stderr) + + if args.sexchr_revise.endswith(".gz"): + sexchr_revise = {line.rstrip() for line in gzip.open(args.sexchr_revise, 'rt')} + else: + sexchr_revise = {line.rstrip() for line in open(args.sexchr_revise, 'rt')} + print("{} sites to revise on sex chromosomes".format(len(sexchr_revise)), file=sys.stderr) + + if args.multi_geno_ids_txt.endswith(".gz"): + multi_geno_ids = {line.rstrip() for line in gzip.open(args.multi_geno_ids_txt, 'rt')} + else: + multi_geno_ids = {line.rstrip() for line in open(args.multi_geno_ids_txt, 'rt')} + print("{} multiallelic sites".format(len(multi_geno_ids)), file=sys.stderr) + + NEW_HEADER_LINES = ['##ALT=', + '##FORMAT=', + '##FORMAT=', + '##FILTER=', + '##FILTER='] + + with pysam.VariantFile(args.normal_revise_vcf) as normal_vcf: + + # # Add metadata lines for annotations + header1 = normal_vcf.header + + for f in NEW_HEADER_LINES: + header1.add_line(f) + header2.add_line(f) + + non_outlier_samples = {s for s in header1.samples if s not in outlier_samples} + vf_1 = max(len(non_outlier_samples) * 0.01, 2) + + biallelic_gts = {(1, 1), (0, 0), (0, 1), (None, None)} + + print("reformatting records", file=sys.stderr) + cleangq_filename = args.out_prefix + ".cleanGQ.vcf.gz" + multiallelic_filename = args.out_prefix + ".multiallelic.vcf.gz" + no_variant_samples_list_file = args.out_prefix + ".no_called_samples.list" + + with pysam.VariantFile(cleangq_filename, 'w', header=normal_vcf.header, threads=args.threads_per_file) as cleanqg_out, \ + pysam.VariantFile(multiallelic_filename, 'w', header=normal_vcf.header) as multiallelic_out, \ + 
open(no_variant_samples_list_file, 'w') as no_variant_samples_out: + for idx, record in enumerate(normal_vcf): + multi_del = False + multi_dup = False + gt4_copystate = False + gt5kb_dup = False + gt5kb_del = False + if (idx - 1) % 1000 == 0: + print("processed {} records".format(idx), file=sys.stderr) + if record.id in revised_lines_by_id: + record = revised_lines_by_id[record.id] + if record.info.get('SVTYPE', None) == 'DEL': + if abs(record.stop - record.pos) >= 1000: + sample_cn_map = {s: record.samples[s]['RD_CN'] for s in non_outlier_samples} + if len([s for s in sample_cn_map if (sample_cn_map[s] is not None and sample_cn_map[s] > 3)]) > vf_1: + multi_del = True + gts = [record.samples[s]['GT'] for s in non_outlier_samples] + if any(gt not in biallelic_gts for gt in gts): + gt5kb_del = True + if abs(record.stop - record.pos) >= 5000: + if not multi_del: + gt5kb_del = True + + if record.info.get('SVTYPE', None) == 'DUP': + if abs(record.stop - record.pos) >= 1000: + sample_cn_map = {s: record.samples[s]['RD_CN'] for s in non_outlier_samples} + if sum(1 for s in sample_cn_map if sample_cn_map[s] is not None and sample_cn_map[s] > 4) > vf_1: + multi_dup = True + if sum(1 for x in Counter(sample_cn_map.values()) if x is not None and (x < 1 or x > 4)) > 4: + gt4_copystate = True + if sum(1 for s in sample_cn_map if sample_cn_map[s] is not None and + (sample_cn_map[s] < 1 or sample_cn_map[s] > 4) and + gt4_copystate) > vf_1: + multi_dup = True + gts = [record.samples[s]['GT'] for s in non_outlier_samples] + if any(gt not in biallelic_gts for gt in gts): + gt5kb_dup = True + if abs(record.stop - record.pos) >= 5000: + if not multi_dup: + gt5kb_dup = True + + if gt5kb_del: + for sample_obj in record.samples.itervalues(): + if not sample_obj['GQ'] is None and sample_obj['RD_CN'] >= 2: + sample_obj['GT'] = (0, 0) + elif not sample_obj['GQ'] is None and sample_obj['RD_CN'] == 1: + sample_obj['GT'] = (0, 1) + elif not sample_obj['GQ'] is None: + sample_obj['GT'] = (1, 
1) # RD_CN 0 DEL + + if gt5kb_dup: + for sample_obj in record.samples.itervalues(): + if not sample_obj['GQ'] is None and sample_obj['RD_CN'] <= 2: + sample_obj['GT'] = (0, 0) + elif not sample_obj['GQ'] is None and sample_obj['RD_CN'] == 3: + sample_obj['GT'] = (0, 1) + elif not sample_obj['GQ'] is None: + sample_obj['GT'] = (1, 1) # RD_CN > 3 DUP + + if record.id in multi_geno_ids: + record.filter.add('PESR_GT_OVERDISPERSION') + + if multi_del or multi_dup: + record.filter.add('MULTIALLELIC') + for j, sample in enumerate(record.samples): + record.samples[sample]['GT'] = None + record.samples[sample]['GQ'] = None + record.samples[sample]['CN'] = record.samples[sample]['RD_CN'] + record.samples[sample]['CNQ'] = record.samples[sample]['RD_GQ'] + + if len(record.filter) > 1 and 'PASS' in record.filter: + del record.filter['PASS'] + + if 'MULTIALLELIC' in record.filter and ('' in record.alts or '' in record.alts): + record.alts = ('',) + record.info['SVTYPE'] = 'CNV' + + if record.id in sexchr_revise: + for sample in record.samples: + if sample in male_samples: + cn = int(record.samples[sample]['RD_CN']) + if cn is not None and cn > 0: + record.samples[sample]['RD_CN'] = cn - 1 + if 'CN' in record.samples[sample]: + record.samples[sample]['CN'] = cn - 1 # the old script didn't do this but I think it should + + cleanqg_out.write(record) + + if 'MULTIALLELIC' in record.filter: + multiallelic_out.write(record) + + if len(svu.get_called_samples(record)) == 0: + print(record.id, file=no_variant_samples_out) + + print("done", file=sys.stderr) + + +if __name__ == '__main__': + main() diff --git a/src/sv-pipeline/04_variant_resolution/scripts/merge_pesr_depth.py b/src/sv-pipeline/04_variant_resolution/scripts/merge_pesr_depth.py index f5c830eba..f380cd18e 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/merge_pesr_depth.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/merge_pesr_depth.py @@ -8,10 +8,10 @@ import svtk.utils as svu -def merge_pesr_depth(vcf, 
fout, prefix, frac=0.5, sample_overlap=0.5): +def merge_pesr_depth(vcf, fout, prefix, frac, sample_overlap, min_depth_only_size): - sample_overlap_cache = {} - sample_id_to_index_dict = {s: i for i, s in enumerate(vcf.header.samples)} + def _get_shard_path(base_path, index): + return "{}.shard_{}.vcf.gz".format(base_path, index) # Given one pesr record and one depth record, merge depth attributes into the pesr record def _merge_pair(record_a, record_b): @@ -89,6 +89,8 @@ def _flush_sample_overlap_cache(): sample_overlap_cache.clear() def _sample_overlap(record_a, record_b): + if sample_overlap == 0: + return True _cache_sample_overlap(record_a) _cache_sample_overlap(record_b) return svu.samples_overlap(sample_overlap_cache[record_a.id], sample_overlap_cache[record_b.id], @@ -106,6 +108,11 @@ def _get_base_record(vcf): vcf.reset() return record + sample_overlap_cache = {} + sample_id_to_index_dict = {s: i for i, s in enumerate(vcf.header.samples)} + cnv_types = ['DEL', 'DUP'] + min_svlen = min_depth_only_size * frac + base_record = _get_base_record(vcf) if base_record is None: raise ValueError("No PESR records were found") @@ -118,9 +125,19 @@ def _get_base_record(vcf): count = 0 for record in vcf.fetch(): + if count > 0 and count % 1000 == 0: + sys.stderr.write("Traversed {} records; {} active records; {} record sample sets cached\n" + .format(count, len(active_records), len(sample_overlap_cache))) + count += 1 + # Seed MEMBERS info with original VID record.info['MEMBERS'] = (record.id,) + if record.info['SVTYPE'] not in cnv_types \ + or record.info['SVLEN'] < min_svlen: + _write_record(record, False) + continue + # Write all-ref sites as "salvaged" samples = _cache_sample_overlap(record) if len(samples) == 0: @@ -150,9 +167,6 @@ def _get_base_record(vcf): clustered_depth_ids.add(ar.id) active_records.append(record) active_records = [r for r in active_records if r.id not in finalized_record_ids] - if count % 1000 == 0: - sys.stderr.write("{}: {}\n".format(count, 
len(sample_overlap_cache))) - count += 1 _flush_active_records() @@ -191,19 +205,23 @@ def main(): description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument('vcf', help='Combined but unmerged VCF of PE/SR calls') - parser.add_argument('fout', help='Output VCF (unsorted!), can be "-" or "stdout"') + parser.add_argument('fout', help='Output VCF (unsorted!)') + parser.add_argument('--interval-overlap', help='Interval reciprocal overlap fraction', + type=float, default=0.5) + parser.add_argument('--sample-overlap', help='Sample overlap fraction', + type=float, default=0.5) + parser.add_argument('--min-depth-only-size', help='Smallest depth only call SVLEN', + type=int, default=5000) parser.add_argument('--prefix', default='pesr_rd_merged') args = parser.parse_args() vcf = pysam.VariantFile(args.vcf) check_header(vcf) - - if args.fout in '- stdout'.split(): - fout = pysam.VariantFile(sys.stdout, 'w', header=vcf.header) - else: - fout = pysam.VariantFile(args.fout, 'w', header=vcf.header) - - merge_pesr_depth(vcf, fout, args.prefix) + fout = pysam.VariantFile(args.fout, 'w', header=vcf.header) + merge_pesr_depth(vcf, fout=fout, prefix=args.prefix, + frac=args.interval_overlap, + sample_overlap=args.sample_overlap, + min_depth_only_size=args.min_depth_only_size) fout.close() diff --git a/src/sv-pipeline/04_variant_resolution/scripts/overlap_breakpoint_filter.py b/src/sv-pipeline/04_variant_resolution/scripts/overlap_breakpoint_filter.py index 18ffd5271..ec1748747 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/overlap_breakpoint_filter.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/overlap_breakpoint_filter.py @@ -12,6 +12,11 @@ VCF_PATH = sys.argv[1] BOTHSIDE_PASS_PATH = sys.argv[2] BACKGROUND_FAIL_PATH = sys.argv[3] +DROPPED_RECORD_OUTPUT_VCF_PATH = sys.argv[4] +if len(sys.argv) >= 6: + DEBUG_OUTPUT_PATH = sys.argv[5] +else: + DEBUG_OUTPUT_PATH = None # Sorts list xs by specified attributes @@ -25,7 +30,10 @@ def 
multisort(xs, specs): class RecordData: def __init__(self, record): self.id = record.id - ev = set(record.info['EVIDENCE']) + if 'EVIDENCE' in record.info: + ev = set(record.info['EVIDENCE']) + else: + ev = set() if 'PE' in ev and 'SR' in ev and 'RD' in ev: self.level_of_support = 1 elif 'PE' in ev and 'RD' in ev: @@ -55,9 +63,12 @@ def __init__(self, record): self.freq = len(self.called_samples) self.length = record.info['SVLEN'] self.gt_50bp = self.length >= 50 + self.is_mei = 'melt' in record.info['ALGORITHMS'] - def __repr__(self): - return repr((self.level_of_support, -self.both_end_support, self.sr_fail, self.is_bnd, -self.vargq, -self.freq, self.gt_50bp, self.length, self.id)) + def __str__(self): + return ",".join(str(x) for x in + (self.is_bnd, self.level_of_support, self.is_mei, self.both_end_support, + self.sr_fail, self.vargq, self.freq, self.gt_50bp, self.length, self.id)) vcf = pysam.VariantFile(VCF_PATH) @@ -117,10 +128,11 @@ def __repr__(self): # This is how we sort record pairs to determine which one gets filtered sort_spec = [ + ('is_bnd', False), ('level_of_support', False), + ('is_mei', True), ('both_end_support', True), ('sr_fail', False), - ('is_bnd', False), ('vargq', True), ('freq', True), ('gt_50bp', False), @@ -129,7 +141,10 @@ def __repr__(self): ] # Iterate through record pairs and generate list of record ids to filter out -ids_to_remove = set([]) +ids_to_remove_dict = dict() +if DEBUG_OUTPUT_PATH is not None: + debug = open(DEBUG_OUTPUT_PATH, 'w') + debug.write("#record_kept\trecord_dropped\n") for data_list in pairwise_record_data: # Check for 50% sample overlap sample_intersection = set(data_list[0].called_samples).intersection(data_list[1].called_samples) @@ -138,13 +153,29 @@ def __repr__(self): continue # Determine which to filter sorted_data_list = multisort(list(data_list), sort_spec) - ids_to_remove.add(sorted_data_list[1].id) + ids_to_remove_dict[sorted_data_list[1].id] = sorted_data_list[0].id + if DEBUG_OUTPUT_PATH is not 
None: + debug.write("\t".join(str(x) for x in sorted_data_list) + "\n") +if DEBUG_OUTPUT_PATH is not None: + debug.close() # Perform filtering -sys.stderr.write("Filtering {} records\n".format(len(ids_to_remove))) +sys.stderr.write("Filtering {} records\n".format(len(ids_to_remove_dict))) vcf = pysam.VariantFile(VCF_PATH) -sys.stdout.write(str(vcf.header)) +header = vcf.header +sys.stdout.write(str(header)) + +# Create +header.add_line( + '##INFO=') +dropped_record_vcf = pysam.VariantFile(DROPPED_RECORD_OUTPUT_VCF_PATH, 'w', header=header) + for record in vcf: - if record.id not in ids_to_remove: + if record.id in ids_to_remove_dict: + record.info['BPID'] = ids_to_remove_dict[record.id] + dropped_record_vcf.write(record) + else: sys.stdout.write(str(record)) vcf.close() +dropped_record_vcf.close() diff --git a/src/sv-pipeline/04_variant_resolution/scripts/overlapbpchange.sh b/src/sv-pipeline/04_variant_resolution/scripts/overlapbpchange.sh deleted file mode 100755 index 41b7692a1..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/overlapbpchange.sh +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash -# -# overlapbpchange.sh -# - -set -euo pipefail - -##Inputs## -vcf=$1 -##sr fail## -backgroundlist=$2 -##sr support on both sides## -bothendSR=$3 - -##clean out variants that overlap at one site## -##pull out variants with duplicate bp that are not driven by depth which will be integrated in the clean vcf## -##make sure to flip bed as well so second bp location can be compared with first from other variants## -svtk vcf2bed $vcf stdout -i CHR2 -i STRANDS -i SVLEN -i varGQ -i END -i EVIDENCE -i SVTYPE --split-bnd \ - | sed "s/+-/+ "$'\t -/g' \ - | sed "s/-+/- "$'\t +/g' \ - | sed "s/++/+ "$'\t +/g' \ - | sed "s/--/- "$'\t -/g' | \ - ##Convert back to 1-based positions## - awk -v OFS='\t' '{$2=$2+1; print $0}' \ - | awk -v OFS='\t' \ - '{if (!(($NF=="DEL" || $NF=="DUP") && $10>=5000)) print $0 "\n" $7,$12,$2,$4,$5,$6,$1,$9,$8,$10,$11,$2,$13,$14 }' | \ - ###Find 
duplicated variants that overlap at same bp one side## - awk 'cnt[$1"_"$2"_"$8]++{if (cnt[$1"_"$2"_"$8]==2) print prev[$1"_"$2"_"$8] "\t" $1"_"$2"_"$8 \ - ; print $0 "\t" $1"_"$2"_"$8} {prev[$1"_"$2"_"$8]=$0}' \ - | awk '!seen[$4"_"$NF]++' \ - | awk 'cnt[$NF]++{if (cnt[$NF]==2) print prev[$NF] \ - ; print $0 } {prev[$NF]=$0}' \ - >dupside1.bed - - -##Find 50% overlap between samples for overlaps## -join -j 2 <(awk '{print $NF "\t" $6}' dupside1.bed \ - | awk -F'[,\t]' '{for (i=2;i<=NF;i++) print $1 "\t" $i}' \ - | sort \ - | uniq -D \ - | awk '{print $1}'|sort|uniq -c ) \ - <(awk '{print $NF "\t" $6}' dupside1.bed \ - | awk -F'[,\t]' '{for (i=2;i<=NF;i++) print $1 "\t" $i}' \ - | awk '{print $1}' \ - | sort \ - | uniq -c) \ - | awk '{if ($2 >= 0.5 * $3) print $1}' \ - | (fgrep -wf - dupside1.bed || printf "") \ - > dupside1.freq50.txt - -##Add SRfail### -{ fgrep -wf <(awk '{print $NF}' $backgroundlist) dupside1.freq50.txt || true; } \ - | awk '{print $0 "\t" 0}' \ - > dupside1.passSR.txt - -{ fgrep -wvf <(awk '{print $NF}' $backgroundlist) dupside1.freq50.txt || true; } \ - | awk '{print $0 "\t" 1}' \ - >> dupside1.passSR.txt - -##Attach the % of variants that show SR support at bothends## -join -1 4 -2 1 -e "0" -a 1 -o 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 1.12 1.13 1.14 1.15 1.16 2.2 \ - <(sort -k4,4 dupside1.passSR.txt) \ - <(awk '{print $NF "\t" $1}' $bothendSR | sort -k1,1) \ - | tr ' ' '\t' \ - > dupside1.bothpassfilter.txt -rm dupside1.passSR.txt - -##count number of samples and indiciate if size gt 50bp## -join -1 4 -2 1 dupside1.bothpassfilter.txt \ - <(awk '{print $4 "\t" $6}' dupside1.bed \ - | awk -F'[,\t]' '{print $1 "\t" NF-1}' \ - | sort -k1,1) \ - | tr ' ' '\t' \ - | awk '{if ($10>=50) print $0 "\t" 1;else print $0 "\t" 0}' \ - > dupside1.samplecountfilter.txt -rm dupside1.bed dupside1.bothpassfilter.txt - -##Convert Evidence column into Integers for scoring and ## -##RD,PE,SR-1,RD,PE-2,PE,SR-3,RD,SR-4,PE-5,RD-6,SR-7## -sed 's/BAF,//g' 
dupside1.samplecountfilter.txt \ - | awk -v OFS='\t' ' - { - if ($13=="PE,RD,SR") print $0 "\t" 1 - else if ($13=="PE,RD") print $0 "\t" 2 - else if ($13=="PE,SR") print $0 "\t" 3 - else if ($13=="RD,SR") print $0 "\t" 4 - else if ($13=="PE") print $0 "\t" 5 - else if ($13=="RD") print $0 "\t" 6 - else if ($13=="SR") print $0 "\t" 7 - }' | \ - ##assign BND to bottom - awk '{if ($14=="BND") print $0 "\t" 0;else print $0 "\t" 1}' \ - > dupside1.allfilter.txt -rm dupside1.samplecountfilter.txt -###DO THIS##### -## - - -##sort file with overlapping samples LevelofSupport->BothEndsupport->SRfail-> Not BND->Higher varq-> Higher Freq -> Smallest size if gt 5kb## -sort -k20,20n -k17,17nr -nrk16,16 -k21,21nr -k11,11nr -k18,18nr -k19,19nr -k10,10n dupside1.allfilter.txt \ - | awk '!seen[$15]++' \ - | awk '{print $1}' \ - | (fgrep -wvf - dupside1.freq50.txt || printf "") \ - | awk '{print $4}' \ - > remove.side1.var.txt -rm dupside1.freq50.txt dupside1.allfilter.txt - -##remove variants with samebp## -(zgrep -wvf remove.side1.var.txt $vcf || printf "") \ - | bgzip \ - > non_redundant.vcf.gz diff --git a/src/sv-pipeline/04_variant_resolution/scripts/patch_sr_bothside_support.py b/src/sv-pipeline/04_variant_resolution/scripts/patch_sr_bothside_support.py new file mode 100644 index 000000000..ac7e2b879 --- /dev/null +++ b/src/sv-pipeline/04_variant_resolution/scripts/patch_sr_bothside_support.py @@ -0,0 +1,46 @@ +#!/bin/python + +import sys +from collections import defaultdict + + +def count_vids(list_path): + counts = defaultdict(lambda: 0) + with open(list_path, 'r') as f_list: + for path in f_list: + with open(path.strip(), 'r') as f: + for vid in f: + counts[vid.strip()] += 1 + return counts + + +def count_sr_pass(path, n): + counts = defaultdict(lambda: 0) + with open(path, 'r') as f: + for line in f: + tokens = line.strip().split('\t') + n_support = round(float(tokens[0]) * n) + vid = tokens[-1] + counts[vid] = n_support + return counts + + +NON_REF_VIDS_LIST = sys.argv[1] 
+BOTHSIDE_PASS_FILE = sys.argv[2] +NUM_BATCHES = int(sys.argv[3]) + +non_ref_counts = count_vids(NON_REF_VIDS_LIST) +bothside_pass_counts = count_sr_pass(BOTHSIDE_PASS_FILE, NUM_BATCHES) + +with open(BOTHSIDE_PASS_FILE, 'r') as f: + for line in f: + tokens = line.strip().split('\t') + vid = tokens[-1] + bothside_pass_count = bothside_pass_counts[vid] + if bothside_pass_count == 0: + continue + non_ref_count = non_ref_counts[vid] + if non_ref_count == 0: + continue + fraction_support = min(1., bothside_pass_count / float(non_ref_count)) + sys.stdout.write("{}\t{}\n".format(fraction_support, "\t".join(tokens[1:]))) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/process_posthoc_cpx_depth_regenotyping.sh b/src/sv-pipeline/04_variant_resolution/scripts/process_posthoc_cpx_depth_regenotyping.sh index e533f07d7..20751103b 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/process_posthoc_cpx_depth_regenotyping.sh +++ b/src/sv-pipeline/04_variant_resolution/scripts/process_posthoc_cpx_depth_regenotyping.sh @@ -2,7 +2,7 @@ # Reassign variant labels based on depth regenotyping in mod04b -set -exo pipefail +set -eo pipefail ###USAGE usage(){ cat < Minimum insertion site size (in bp) to be considered for + -D Minimum insertion site size (in bp) to be considered for distinguishing insertion site deletions [default: 150 bp] -T Minimum size (in bp) at which to prioritize an inverted dDUP classification over a dupINV or INVdup classification [default: 1000000 bp] - -R Path to table containing the final reclassification - decision made per variant. [default: no table output] + -R Path to table containing the final reclassification + decision made per variant. [default: no table output] -G Path to table containing the raw genotype counts table - per interval per variant. [default: no table output] + per interval per variant. 
[default: no table output] Notes: @@ -221,9 +221,9 @@ while read chr start end VID samps trash; do unset medCN ###Get list of samples & reference CN to consider, dependent on chr of call - #ChrX: use only diploid females if possible, otherwise use haploid males + #ChrX: use only diploid females if possible, otherwise use haploid males if [ ${chr} == "X" ] || [ ${chr} == "chrX" ]; then - + #Try to get female carrier samples echo -e "${samps}" | sed 's/,/\n/g' \ | fgrep -wf - ${GTDIR}/female.samples.list \ @@ -323,7 +323,7 @@ while read chr start end VID samps trash; do fi - #For predicted carriers, count number of genotypes lower than, + #For predicted carriers, count number of genotypes lower than, # equal to, and greater than the overall median if [ $( cat ${GTDIR}/carrier_samples.tmp | wc -l ) -gt 0 ]; then fgrep -wf ${GTDIR}/carrier_samples.tmp \ @@ -338,8 +338,8 @@ while read chr start end VID samps trash; do else echo -e "0\n0\n0" fi - - #For predicted non-carriers, count number of genotypes lower than, + + #For predicted non-carriers, count number of genotypes lower than, # equal to, and greater than the overall median if [ $( cat ${GTDIR}/control_samples.tmp | wc -l ) -gt 0 ]; then fgrep -wf ${GTDIR}/control_samples.tmp \ @@ -432,7 +432,7 @@ awk -v ENDidx=${ENDidx} -v OFS="\t" '{ $3=$ENDidx; print }' \ ${GTDIR}/inv_se_vcf2bed.precut.bed \ | fgrep -v "#" || true \ >> ${GTDIR}/variants_to_reclassify.vcf2bed.bed - + ###MAKE FINAL ASSESSMENT FOR EACH VARIANT #Print header @@ -994,14 +994,25 @@ while read VID MOD REASON svtype cpxtype cpxintervals SVLEN SOURCE START END; do #Modify info as needed INFO=$( fgrep -w ${VID} ${GTDIR}/variants_to_be_reassessed.vcf \ | cut -f8 \ - | sed -r -e "s/END=[^;]*;/END=$END;/" \ + | sed -r -e "s/^END=[^;]*;/END=$END;/" \ + | sed -r -e "s/;END=[^;]*;/;END=$END;/" \ + | sed -r -e "s/;END=[^;]*$/;END=$END/" \ + | sed -r -e "s/^SVTYPE=[^;]*;/SVTYPE=$svtype;/" \ | sed -r -e "s/;SVTYPE=[^;]*;/;SVTYPE=$svtype;/" \ + | sed -r -e 
"s/;SVTYPE=[^;]*$/;SVTYPE=$svtype/" \ + | sed -r -e "s/^SVLEN=[^;]*;/SVLEN=$SVLEN;/" \ | sed -r -e "s/;SVLEN=[^;]*;/;SVLEN=$SVLEN;/" \ - | sed -r -e "s/;CPX_TYPE=[^;]*$/;CPX_TYPE=${cpxtype}/" \ + | sed -r -e "s/;SVLEN=[^;]*$/;SVLEN=$SVLEN/" \ + | sed -r -e "s/^CPX_TYPE=[^;]*;/CPX_TYPE=${cpxtype};/" \ | sed -r -e "s/;CPX_TYPE=[^;]*;/;CPX_TYPE=${cpxtype};/" \ + | sed -r -e "s/;CPX_TYPE=[^;]*$/;CPX_TYPE=${cpxtype}/" \ + | sed -r -e 's/^UNRESOLVED;//' \ | sed -r -e 's/;UNRESOLVED;/;/' \ + | sed -r -e 's/;UNRESOLVED$//' \ + | sed -r -e 's/^UNRESOLVED_TYPE=[^;]*;//' \ | sed -r -e 's/;UNRESOLVED_TYPE=[^;]*;/;/' \ | sed -r -e 's/;UNRESOLVED_TYPE=[^;]*$//' \ + | sed -r -e 's/^EVENT=[^;]*;//' \ | sed -r -e 's/;EVENT=[^;]*;/;/' \ | sed -r -e 's/;EVENT=[^;]*$//' ) #Add/remove/modify CPX_TYPE, if needed diff --git a/src/sv-pipeline/04_variant_resolution/scripts/reset_cnv_gts.py b/src/sv-pipeline/04_variant_resolution/scripts/reset_cnv_gts.py new file mode 100644 index 000000000..7c1073f6a --- /dev/null +++ b/src/sv-pipeline/04_variant_resolution/scripts/reset_cnv_gts.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# + +""" +Sets CNV GT fields to "." 
+This is needed following any HailMerge step for VCFs containing CNVs +""" + +import argparse +import sys +import pysam + + +def reset_cnv_gts(vcf, fout): + + for record in vcf: + if record.info['SVTYPE'] == 'CNV': + for sample in record.samples: + record.samples[sample]['GT'] = (None,) + fout.write(record) + + +def main(): + + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument('vcf') + parser.add_argument('fout') + + args = parser.parse_args() + + if args.vcf in '- stdin'.split(): + vcf = pysam.VariantFile(sys.stdin) + else: + vcf = pysam.VariantFile(args.vcf) + + header = vcf.header + + if args.fout in '- stdout'.split(): + fout = pysam.VariantFile(sys.stdout, 'w', header=header) + else: + fout = pysam.VariantFile(args.fout, 'w', header=header) + + reset_cnv_gts(vcf, fout) + + +if __name__ == '__main__': + main() diff --git a/src/sv-pipeline/04_variant_resolution/scripts/resolve_CPX_CNV_redundancies.sh b/src/sv-pipeline/04_variant_resolution/scripts/resolve_CPX_CNV_redundancies.sh deleted file mode 100755 index 73bc405a5..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/resolve_CPX_CNV_redundancies.sh +++ /dev/null @@ -1,238 +0,0 @@ -#!/bin/bash - -# Resolve redundancies between simple CNVs and unbalanced complex SV in mod04b - -# TODO : Missing pipefail -set -e - -###USAGE -usage(){ -cat < ${PROCDIR}/intervals.preclustered.bed.gz - - -###REMOVE CNVS REDUNDANT WITH COMPLEX EVENTS -#Subset to only variants that share some overlap (at least 10% recip) with at least one CPX variant -bedtools intersect -wa -r -f 0.1 \ - -a ${PROCDIR}/intervals.preclustered.bed.gz \ - -b <( zcat ${PROCDIR}/intervals.preclustered.bed.gz | fgrep "CPX" ) \ - | sort -Vk1,1 -k2,2n -k3,3n -k4,4V \ - | uniq \ - | bgzip -c \ - > ${PROCDIR}/intervals.preclustered.subset.bed.gz -#Melt subsetted variants -while read chr start end VID samples CNV; do - echo -e "${samples}" \ - | sed 's/,/\n/g' \ - | 
awk -v OFS="\t" -v chr=${chr} -v start=${start} -v end=${end} -v VID=${VID} -v CNV=${CNV} \ - '{ print chr, start, end, VID, $1, CNV }' -done < <( zcat ${PROCDIR}/intervals.preclustered.subset.bed.gz ) \ - | bgzip -c \ - > ${PROCDIR}/intervals.preclustered.subset.melted.bed.gz -#Cluster BED intervals (50% RO) -svtk bedcluster -f 0.5 \ - ${PROCDIR}/intervals.preclustered.subset.melted.bed.gz - \ - | bgzip -c > \ - ${PROCDIR}/intervals.clustered.bed.gz -#Get list of all variants that cluster with a complex variant, -# evaluate sample overlap from original intervals file, -# and, if overlap >50%, write that ID to be stripped from the output VCF -while read VIDs; do - #Get nonredundant list of sample IDs involved in any clustered variant - echo -e "${VIDs}" | sed 's/,/\n/g' \ - | fgrep -wf - <( zcat ${PROCDIR}/intervals.preclustered.bed.gz ) \ - | cut -f5 | sort | uniq \ - > ${PROCDIR}/nonredundant_samples.list - #Iterate over VIDs and print non-CPX VID if sample overlap >50% - while read VID samples; do - #Get list of samples in variant - echo -e "${samples}" | sed 's/,/\n/g' \ - | sort | uniq > ${PROCDIR}/query_samples.list - nsamp=$( cat ${PROCDIR}/query_samples.list | wc -l ) - - #Compare - frac=$( fgrep -wf ${PROCDIR}/query_samples.list \ - ${PROCDIR}/nonredundant_samples.list | wc -l \ - | awk -v nsamp=${nsamp} '{ print 100*($1/nsamp) }' \ - | cut -f1 -d\. 
) - if [ ${frac} -ge 50 ]; then - echo "${VID}" - fi - - #Clean up - rm ${PROCDIR}/query_samples.list - - done < <( echo -e "${VIDs}" | sed 's/,/\n/g' \ - | fgrep -wf - <( zcat ${PROCDIR}/intervals.preclustered.bed.gz ) \ - | cut -f4,5 | sort | uniq | fgrep -v "CPX" ) - - #Clean up - rm ${PROCDIR}/nonredundant_samples.list - -done < <( zcat ${PROCDIR}/intervals.clustered.bed.gz \ - | cut -f7 | fgrep "CPX" | grep -e "DEL\|DUP" ) \ - | sort -V | uniq \ - > ${PROCDIR}/VIDs_to_remove.list - - -###FIND REMAINING REDUNDANT CNVS WITH STRONG (80%) OVERLAP IN SAMPLES AND SIZE -#Find CNV intervals that have 80% reciprocal overlap -bedtools intersect -wa -wb -r -f 0.8 \ - -a ${PROCDIR}/intervals.preclustered.bed.gz \ - -b ${PROCDIR}/intervals.preclustered.bed.gz \ - | awk -v FS="\t" '{ if ($4!=$10 && $6==$12) print $0 }' \ - | awk -v OFS="\t" '$4 ~ /DEL|DUP/ { print $0 }' \ - | awk -v OFS="\t" '$10 ~ /DEL|DUP/ { print $0 }' \ - | bgzip -c \ - > ${PROCDIR}/step2.intervals.preclustered.subset.bed.gz -#Determine which events share 80% sample overlap -while read VIDa sa VIDb sb; do - na=$( echo -e "${sa}" | sed 's/,/\n/g' | sort | uniq | wc -l ) - nb=$( echo -e "${sb}" | sed 's/,/\n/g' | sort | uniq | wc -l ) - denom=$( echo -e "${sa},${sb}" | sed 's/,/\n/g' | sort | uniq | wc -l ) - numer=$( echo -e "${sa}" | sed 's/,/\n/g' | fgrep -wf - \ - <( echo -e "${sb}" | sed 's/,/\n/g' ) \ - | sort | uniq | wc -l ) - if [ ${denom} -gt 0 ]; then - ovr=$(( 100 * ${numer} / ${denom} )) - fi - if [ -z ${ovr} ]; then - ovr=0 - fi - if [ ${ovr} -ge 80 ]; then - echo -e "${VIDa}\n${VIDb}" \ - | sort | uniq | paste -s -d, - fi -done < <( zcat ${PROCDIR}/step2.intervals.preclustered.subset.bed.gz \ - | cut -f4,5,10,11 ) \ - | sort | uniq \ - > ${PROCDIR}/step2.variants_to_resolve.list -#Iterate over variants, pick info & coords from variant with largest N, -# and consolidate genotypes -sed 's/,/\n/g' ${PROCDIR}/step2.variants_to_resolve.list \ - | sort | uniq \ - > 
${PROCDIR}/step2.variants_to_resolve.melted.list -if [ -e ${PROCDIR}/records_to_add.vcf ]; then - rm ${PROCDIR}/records_to_add.vcf -fi -until [ $( cat ${PROCDIR}/step2.variants_to_resolve.melted.list | wc -l ) -eq 0 ]; do - #get next variant - VID=$( head -n1 ${PROCDIR}/step2.variants_to_resolve.melted.list ) - #get all other variants from clusters containing this variant - fgrep -w ${VID} ${PROCDIR}/step2.variants_to_resolve.list \ - | sed 's/,/\n/g' | sort | uniq \ - > ${PROCDIR}/step2.partners.tmp - #Print all genotypes to tmp file - zcat ${INVCF} | fgrep -v "#" \ - | fgrep -wf ${PROCDIR}/step2.partners.tmp | cut -f10- \ - > ${PROCDIR}/gts.tmp - #Select best genotypes to keep - ${BIN}/selectBestGT.R ${PROCDIR}/gts.tmp ${PROCDIR}/gts.best.tmp - #Select record with greatest total number of samples - bVID=$( zcat ${PROCDIR}/intervals.preclustered.bed.gz \ - | fgrep -wf ${PROCDIR}/step2.partners.tmp \ - | cut -f4-5 | sed 's/,/\t/g' \ - | awk -v OFS="\t" '{ print $1, NF }' \ - | sort -nrk2,2 \ - | cut -f1 \ - | head -n1 ) - #Add new record to final append tmp file - paste <( zcat ${INVCF} | fgrep -w ${bVID} | cut -f1-9 ) \ - ${PROCDIR}/gts.best.tmp \ - >> ${PROCDIR}/records_to_add.vcf - #Write list of variants to exclude from original VCF - cat ${PROCDIR}/step2.partners.tmp >> ${PROCDIR}/VIDs_to_remove.list - #Exclude variants from list of VIDs to resolve - fgrep -wvf ${PROCDIR}/step2.partners.tmp \ - ${PROCDIR}/step2.variants_to_resolve.melted.list \ - > ${PROCDIR}/step2.variants_to_resolve.melted.list2 \ - || true - mv ${PROCDIR}/step2.variants_to_resolve.melted.list2 \ - ${PROCDIR}/step2.variants_to_resolve.melted.list -done - - -###CLEAN UP FINAL OUTPUT -zcat ${INVCF} \ - | fgrep -wvf ${PROCDIR}/VIDs_to_remove.list \ - | cat - ${PROCDIR}/records_to_add.vcf \ - | vcf-sort \ - | bgzip -c \ - > ${OUTVCF} - - -###CLEAN UP -rm -rf ${PROCDIR} - - diff --git a/src/sv-pipeline/04_variant_resolution/scripts/resolve_cpx_cnv_redundancies.py 
#!/usr/bin/env python

import sys
import os
import pybedtools
import pysam
import numpy
import scipy.sparse
# "import scipy.sparse" does not load the csgraph submodule; it must be
# imported explicitly or scipy.sparse.csgraph.connected_components raises
# AttributeError on scipy versions without lazy submodule loading
import scipy.sparse.csgraph
import argparse
from typing import List, Text, Optional, Iterable, Iterator, Tuple, Set, Dict, Mapping
from types import MappingProxyType
import multiprocessing


class Keys:  # static class with re-used strings (to avoid typo errors, allow easy refactoring)
    svtype = "SVTYPE"
    ins = "INS"
    deletion = "DEL"
    dup = "DUP"
    cpx = "CPX"
    cnv = "CNV"
    unresolved = "UNRESOLVED"
    cpx_intervals = "CPX_INTERVALS"
    cpx_type = "CPX_TYPE"


class Default:  # static class with default values for kwargs
    min_cpx_reciprocal_overlap = 0.1
    cnv_cpx_reciprocal_overlap = 0.5
    cnv_cpx_sample_overlap = 0.5
    cnv_cnv_reciprocal_overlap = 0.8
    cnv_cnv_sample_overlap = 0.8
    clusterable_sv_types = frozenset({Keys.deletion, Keys.dup, Keys.cnv})
    cpx_ins_classes = frozenset({"dDUP", "dDUP_iDEL", "INS_iDEL"})
    temp_dir = "/tmp"
    num_threads = multiprocessing.cpu_count()


# 0-based field indices into the 6-column bed tuples produced below:
# (contig, start, end, variant_id, sv_type, is_cpx)
name_field = 3
sv_type_field = 4
is_cpx_field = 5
ref_ploidy = 2  # note, even for autosome, VCFs always have ploidy=2 calls
ref_gt = (0, 0)
non_carrier_gts = {None, (None, None), (0, 0), (0, None), (None, 0)}


def _fix_coords(start: int, end: int) -> Tuple[int, int]:
    """ ensure that start precedes end, and is >= 0 """
    start, end = (start, end) if (start <= end) else (end, start)  # ensure in sorted order
    return max(start - 1, 0), end  # convert from VCF to bed format


def _get_carrier_status(
        record: pysam.VariantRecord
) -> Tuple[numpy.ndarray, numpy.ndarray]:
    """
    Get boolean numpy arrays detailing carrier status for each sample
    Parameters
    ----------
    record: VariantRecord
        pysam record for this variant
    Returns
    -------
    is_carrier: numpy.ndarray
        boolean array that is True for samples called non-ref for this Variant, and False otherwise (including no-call)
    is_ref: numpy.ndarray
        boolean array that is True for samples called ref for this Variant, and False otherwise (including no-call)
    """
    if record.info.get(Keys.svtype, None) == Keys.cnv:  # genotype is always no-call, check info.CN
        copy_numbers = [sample_rec.get("CN") for sample_rec in record.samples.itervalues()]
        is_carrier = numpy.fromiter(
            (copy_number is not None and copy_number != ref_ploidy for copy_number in copy_numbers),
            dtype=bool, count=len(copy_numbers)
        )
        is_ref = numpy.fromiter(
            (copy_number == ref_ploidy for copy_number in copy_numbers),
            dtype=bool, count=len(copy_numbers)
        )
    else:
        genotypes = [sample_rec.get("GT") for sample_rec in record.samples.itervalues()]
        is_carrier = numpy.fromiter(
            (genotype not in non_carrier_gts for genotype in genotypes), dtype=bool, count=len(genotypes)
        )
        is_ref = numpy.fromiter(
            (genotype == ref_gt for genotype in genotypes), dtype=bool, count=len(genotypes)
        )
    return is_carrier, is_ref


def _unfiltered_vcf_records_to_bed_intervals(
        vcf_records: Iterable[pysam.VariantRecord],
        is_carrier: Dict[Text, numpy.ndarray],
        is_ref: Dict[Text, numpy.ndarray],
        clusterable_sv_types: Set[Text] = Default.clusterable_sv_types,
        cpx_ins_classes: Set[Text] = Default.cpx_ins_classes
) -> Iterator[Tuple]:
    f"""
    Iterate over input VCF, yielding records that may be redundant. Also gather is_carrier and is_ref mappings.
    Parameters
    ----------
    vcf_records: Iterable[VariantRecord]
        Iterable with pysam records from input VCF file.
    is_carrier: Dict[Text, numpy.ndarray]
        Dict from variant ID to boolean array that is True for samples called non-ref for this Variant, and False
        otherwise (including no-call). NOTE: this function *updates* is_carrier in place.
    is_ref: Dict[Text, numpy.ndarray]
        Dict from variant ID to boolean array that is True for samples called ref for this Variant, and False
        otherwise (including no-call). NOTE: this function *updates* is_ref in place.
    clusterable_sv_types: Set[Text] (default={Default.clusterable_sv_types})
        SV types that may be redundant (or needed for clustering with redundant SVs).
    cpx_ins_classes: Set[Text] (default={Default.cpx_ins_classes})
        CPX SV types that should produce an INS sink (modeled as a DEL)
    Yields
    -------
    bed_tuple: Tuple
        successive records for bed object, with fields: contig, start, end, variant_id, sv_type, is_cpx
    """
    for record in vcf_records:
        sv_type = record.info[Keys.svtype]
        if sv_type == Keys.cpx:
            if Keys.unresolved in record.filter:
                continue
            # If complex, all constituent intervals are in CPX_INTERVALS
            variant_id = record.id
            is_carrier[variant_id], is_ref[variant_id] = _get_carrier_status(record)
            for cpx_interval in record.info[Keys.cpx_intervals]:
                # CPX_INTERVALS entries look like "DEL_chr1:100-200"
                sv_type, region = cpx_interval.split('_', 1)
                contig, coords = region.split(':', 1)
                start, end = _fix_coords(*(int(c) for c in coords.split('-', 1)))
                yield contig, start, end, variant_id, sv_type, 1
            if record.info.get(Keys.cpx_type, None) in cpx_ins_classes:
                # If complex insertion, return insertion point as 1bp DEL
                sv_type = Keys.deletion
                contig = record.contig
                end = record.pos
                start = max(0, end - 1)
                yield contig, start, end, variant_id, sv_type, 1
        elif sv_type in clusterable_sv_types:
            start, end = _fix_coords(record.pos, record.stop)
            variant_id = record.id
            is_carrier[variant_id], is_ref[variant_id] = _get_carrier_status(record)
            yield record.contig, start, end, variant_id, sv_type, 0


def _vcf_records_to_bed_intervals(
        vcf_records: Iterable[pysam.VariantRecord],
        is_carrier: Dict[Text, numpy.ndarray],
        is_ref: Dict[Text, numpy.ndarray],
        clusterable_sv_types: Set[Text] = Default.clusterable_sv_types,
        cpx_ins_classes: Set[Text] = Default.cpx_ins_classes
) -> Iterator[Tuple]:
    f"""
    Iterate over input VCF, yielding records that may be redundant. Also gather is_carrier and is_ref mappings.
    This function mainly passes results from _unfiltered_vcf_records_to_bed_intervals, but potentially filters out
    unneeded SV intervals that originated in CPX events, and duplicates SVTYPE=CNV into one DUP and one DEL.
    Parameters
    ----------
    vcf_records: Iterable[VariantRecord]
        Iterable with pysam records from input VCF file.
    is_carrier: Dict[Text, numpy.ndarray]
        Dict from variant ID to boolean array that is True for samples called non-ref for this Variant, and False
        otherwise (including no-call). NOTE: this function *updates* is_carrier in place.
    is_ref: Dict[Text, numpy.ndarray]
        Dict from variant ID to boolean array that is True for samples called ref for this Variant, and False
        otherwise (including no-call). NOTE: this function *updates* is_ref in place.
    clusterable_sv_types: Set[Text] (default={Default.clusterable_sv_types})
        SV types that may be redundant (or needed for clustering with redundant SVs).
    cpx_ins_classes: Set[Text] (default={Default.cpx_ins_classes})
        CPX SV types that should produce an INS sink (modeled as a DEL)
    Yields
    -------
    bed_tuple: Tuple
        successive records for bed object, with fields: contig, start, end, variant_id, sv_type, is_cpx
    """
    for contig, start, end, variant_id, sv_type, is_cpx in _unfiltered_vcf_records_to_bed_intervals(
            vcf_records, is_carrier, is_ref, clusterable_sv_types=clusterable_sv_types,
            cpx_ins_classes=cpx_ins_classes
    ):
        if sv_type in clusterable_sv_types:
            # store sv_type in interval.score, is_cpx in interval.strand
            if sv_type == Keys.cnv:  # ensure CNVs cluster with both insertions and deletions
                yield contig, start, end, variant_id, Keys.deletion, is_cpx
                yield contig, start, end, variant_id, Keys.dup, is_cpx
            else:  # yield this interval normally
                yield contig, start, end, variant_id, sv_type, is_cpx


def jaccard_index(is_carrier_a: numpy.ndarray, is_carrier_b: numpy.ndarray) -> float:
    """ return Jaccard index of carrier samples based on two boolean arrays of carrier status """
    return numpy.logical_and(is_carrier_a, is_carrier_b).sum() / numpy.logical_or(is_carrier_a, is_carrier_b).sum()


def _iter_pairwise_connections(
        clusterable_bedtool: pybedtools.BedTool,
        min_reciprocal_overlap: float,
        min_sample_overlap: float = 0,
        is_carrier: Mapping[Text, numpy.ndarray] = MappingProxyType({})
) -> Iterator[Tuple[Text, Text]]:
    """
    Iterate over pairs of variant intervals that meet minimum requirement for reciprocal overlap. Exclude
    self-overlaps. Optionally impose requirement of minimum Jaccard index for carrier samples.
    Parameters
    ----------
    clusterable_bedtool: BedTool
        bed object with intervals that may overlap each other
    min_reciprocal_overlap: float
        minimum reciprocal overlap for two intervals to be connected
    min_sample_overlap: float (default=0)
        minimum Jaccard index of carrier samples for two intervals to be connected
    is_carrier: Mapping[Text, numpy.ndarray]
        map from variant ID to carrier status (array boolean True/False for each sample)
    Yields
    -------
    variant_id_1, variant_id_2: Tuple[Text, Text]
        successive pairs of variant IDs that meet the overlap requirements
    """
    # Cluster intervals based on reciprocal overlap
    if len(clusterable_bedtool) == 0:
        return
    overlap_bedtool = clusterable_bedtool.intersect(clusterable_bedtool, f=min_reciprocal_overlap, r=True, wa=True,
                                                    wb=True, sorted=True, nonamecheck=True)
    # the -wa -wb output concatenates the two intervals' fields; compute the
    # offsets of the name/sv_type fields for each side of the pair
    num_1_fields = clusterable_bedtool.field_count()
    name_1_field = name_field
    sv_type_1_field = sv_type_field
    name_2_field = num_1_fields + name_field
    sv_type_2_field = num_1_fields + sv_type_field

    if min_sample_overlap > 0:
        for overlap in overlap_bedtool:
            fields = overlap.fields
            if fields[sv_type_1_field] != fields[sv_type_2_field]:
                continue  # only cluster same sv_type
            name_1 = fields[name_1_field]
            name_2 = fields[name_2_field]
            if name_1 != name_2 and jaccard_index(is_carrier[name_1], is_carrier[name_2]) >= min_sample_overlap:
                yield name_1, name_2
    else:
        for overlap in overlap_bedtool:
            fields = overlap.fields
            if fields[sv_type_1_field] != fields[sv_type_2_field]:
                continue  # only cluster same sv_type
            name_1 = fields[name_1_field]
            name_2 = fields[name_2_field]
            if name_1 != name_2:
                yield name_1, name_2


def _get_clusters(
        clusterable_bedtool: pybedtools.BedTool,
        min_reciprocal_overlap: float,
        min_sample_overlap: float = 0,
        is_carrier: Mapping[Text, numpy.ndarray] = MappingProxyType({})
) -> List[numpy.ndarray]:
    """
    Perform single-linkage clustering of variant intervals based on reciprocal overlap. Potentially impose a
    clustering requirement of high Jaccard index for carrier samples.
    Parameters
    ----------
    clusterable_bedtool: BedTool
        bed object with intervals that may cluster with each other
    min_reciprocal_overlap: float
        minimum reciprocal overlap for two intervals to be placed in a cluster
    min_sample_overlap: float (default=0)
        minimum Jaccard index of carrier samples for two intervals to be placed in a cluster
    is_carrier: Mapping[Text, numpy.ndarray]
        map from variant ID to carrier status (array boolean True/False for each sample)
    Returns
    -------
    clusters: List[numpy.ndarray]
        each element is an object numpy array of intervals that are in a cluster
    """
    # form map from variant IDs to unique indices for this clustering
    name_to_index = {name: index for index, name in enumerate({interval.name for interval in clusterable_bedtool})}
    num_vertices = len(name_to_index)
    sparse_connections = scipy.sparse.eye(num_vertices, dtype=numpy.uint8, format="lil")
    for name_1, name_2 in _iter_pairwise_connections(
            clusterable_bedtool, min_reciprocal_overlap=min_reciprocal_overlap, min_sample_overlap=min_sample_overlap,
            is_carrier=is_carrier
    ):
        sparse_connections[(name_to_index[name_1], name_to_index[name_2])] = 1

    # Cluster graph. Use "weak" connection because bedtools will list "A overlaps B" and "B overlaps A"
    num_clusters, cluster_labels = scipy.sparse.csgraph.connected_components(sparse_connections, connection="weak")

    # Build lists of clustered Intervals
    clusters = [[] for _ in range(num_clusters)]
    for interval in clusterable_bedtool:
        cluster_label = cluster_labels[name_to_index[interval.name]]
        clusters[cluster_label].append(interval)

    # convert lists to numpy object arrays for faster indexing
    def _to_numpy_array(_cluster: List[pybedtools.Interval]) -> numpy.ndarray:
        # use builtin "object" dtype: the "numpy.object" alias is deprecated
        # and removed in numpy >= 1.24
        _cluster_array = numpy.empty((len(_cluster),), dtype=object)
        _cluster_array[:] = _cluster
        return _cluster_array

    return [_to_numpy_array(cluster) for cluster in clusters]


def _is_cpx(interval: pybedtools.Interval) -> int:
    """ returns 1 if this interval originated as a CPX interval, 0 otherwise """
    return int(interval.strand)  # is_cpx is stored in strand


def _is_not_cpx(interval: pybedtools.Interval) -> bool:
    """ returns False if this interval originated as a CPX interval, True otherwise """
    return int(interval.strand) == 0  # is_cpx is stored in strand


def _get_redundant_cluster_cnv_cpx_vids(
        cluster: numpy.ndarray,
        is_carrier: Mapping[Text, numpy.ndarray],
        cnv_cpx_sample_overlap: float
) -> Iterator[Text]:
    """
    Find CNVs that are redundant with CPX events
    for each sample that participates in this interval-cluster:
        join every variant ID that the sample participates in into a sample-cluster
        if the sample-cluster contains >= 1 CPX and >= 1 non-CPX:
            find logical-or carrier status over variant IDs in sample-cluster
            for every non-CPX variant ID in sample_cluster:
                if its Jaccard index is >= cnv_cpx_sample_overlap, it's redundant
    Parameters
    ----------
    cluster: numpy.ndarray
        numpy object array of pybedtools.Interval holding intervals that cluster together
    is_carrier: Mapping[Text, numpy.ndarray]
        Map from variant ID to boolean array that is True for samples called non-ref for this Variant, and False
        otherwise (including no-call).
    cnv_cpx_sample_overlap: float
        Minimum Jaccard index for variant interval to have with sample cluster in order for it to be redundant.
    Yields
    -------
    redundant_variant_id: str
        Successive redundant variant IDs
    """
    interval_is_cpx = numpy.fromiter((_is_cpx(interval) for interval in cluster), dtype=bool, count=len(cluster))

    def _is_valid_sample_cluster(indices_in_sample_cluster: numpy.ndarray) -> bool:
        # valid sample clusters have some CPX intervals and some non-CPX intervals
        return 0 < interval_is_cpx.take(indices_in_sample_cluster).sum() < len(indices_in_sample_cluster)

    if not _is_valid_sample_cluster(numpy.arange(len(cluster))):
        # no hope of finding valid sample-clusters if the whole thing won't work
        return

    # loop over unique combinations of intervals that are all non-ref for a single sample
    is_carrier_matrix = numpy.concatenate(
        [is_carrier[interval.name].reshape(1, -1) for interval in cluster], axis=0
    )

    for interval_in_potential_cluster in numpy.unique(is_carrier_matrix, axis=1).transpose():
        indices_in_potential_cluster = numpy.nonzero(interval_in_potential_cluster)[0]
        if not _is_valid_sample_cluster(indices_in_potential_cluster):
            continue  # not a valid cluster, skip it
        # can check jaccard index a little more quickly because each interval is in the cluster, so the intersection
        # is equal to the carrier status of the interval
        num_cluster_carriers = numpy.logical_or.reduce(
            is_carrier_matrix.take(indices_in_potential_cluster, axis=0), axis=0
        ).sum()
        for index in indices_in_potential_cluster:
            if not interval_is_cpx.take(index) and \
                    is_carrier_matrix.take(index, axis=0).sum() / num_cluster_carriers >= cnv_cpx_sample_overlap:
                yield cluster.take(index).name


def _find_cnv_cpx_redundancies(
        potentially_clusterable: pybedtools.BedTool,
        is_carrier: Mapping[Text, numpy.ndarray],
        min_cpx_reciprocal_overlap: float,
        cnv_cpx_reciprocal_overlap: float,
        cnv_cpx_sample_overlap: float
) -> Set[Text]:
    """
    Subset potentially clusterable intervals to those that meet required minimum overlap with a CPX event.
    Then find clusters, and remove redundant CNVs from those clusters.
    Parameters
    ----------
    potentially_clusterable: BedTool
        bed object with intervals that could potentially be used for clustering
    is_carrier: Mapping[Text, numpy.ndarray]
        Map from variant ID to boolean array that is True for samples called non-ref for this Variant, and False
        otherwise (including no-call).
    min_cpx_reciprocal_overlap: float
        Minimum reciprocal overlap with a CPX interval for a CNV interval to be clusterable.
    cnv_cpx_reciprocal_overlap: float
        Minimum reciprocal overlap between two intervals to be part of a cluster.
    cnv_cpx_sample_overlap: float
        Minimum Jaccard index for variant interval to have with sample cluster in order for it to be redundant.
    Returns
    -------
    vids_to_remove: Set[Text]
        Set of variant IDs that are redundant and should be removed from the output VCF.
    """
    # find all potentially clusterable intervals that meet required minimum overlap with CPX
    precluster_subset = potentially_clusterable.intersect(
        potentially_clusterable.filter(_is_cpx), u=True, f=min_cpx_reciprocal_overlap, r=True, sorted=True,
        nonamecheck=True
    )

    # find clusters of intervals with high reciprocal overlap, then check each cluster for redundant variant IDs
    return {
        variant_id
        for cluster in _get_clusters(precluster_subset, min_reciprocal_overlap=cnv_cpx_reciprocal_overlap)
        for variant_id in _get_redundant_cluster_cnv_cpx_vids(cluster, is_carrier,
                                                              cnv_cpx_sample_overlap=cnv_cpx_sample_overlap)
    }


def _update_cnv_cnv_redundances(
        vids_to_remove: Set[Text],
        potentially_clusterable: pybedtools.BedTool,
        is_carrier: Mapping[Text, numpy.ndarray],
        is_ref: Mapping[Text, numpy.ndarray],
        cnv_cnv_reciprocal_overlap: float,
        cnv_cnv_sample_overlap: float
):
    """
    Update vids_to_remove by finding CNVs that are redundant with other CNVs (as opposed to CPX)
        -Find CNVs with very high reciprocal overlap, and very high carrier sample Jaccard index
        -For each CNV that is connected to any other CNVs
            Add that CNV and all its connections to vids_to_remove
            Find the "best" CNV: the maximum choosing 1st by number of carriers, 2nd by number of called refs
            Add the best CNV to set of vids that will be put back in (no matter what, even if previously or
            subsequently "removed")
        -Update vids_to_remove by removing the "best" variant IDs

    Parameters
    ----------
    vids_to_remove: Set[Text]
        set of variant IDs that are redundant and should be removed. NOTE: this function updates this set in place.
    potentially_clusterable: BedTool
        bed object with intervals that could potentially be used for clustering
    is_carrier: Mapping[Text, numpy.ndarray]
        Map from variant ID to boolean array that is True for samples called non-ref for this Variant, and False
        otherwise (including no-call).
    is_ref: Mapping[Text, numpy.ndarray]
        Map from variant ID to boolean array that is True for samples called ref for this Variant, and False
        otherwise (including no-call).
    cnv_cnv_reciprocal_overlap: float
        minimum reciprocal overlap for two CNVs to be connected
    cnv_cnv_sample_overlap: float
        minimum carrier samples Jaccard index for two CNVs to be connected
    """
    # for each non-CPX interval, find all non-CPX intervals it has sufficient reciprocal overlap and sample overlap
    # with
    variant_pairwise_connections = {}

    non_cpx_potentially_clusterable = potentially_clusterable.filter(_is_not_cpx).saveas()
    for name_1, name_2 in _iter_pairwise_connections(
            non_cpx_potentially_clusterable, min_reciprocal_overlap=cnv_cnv_reciprocal_overlap,
            min_sample_overlap=cnv_cnv_sample_overlap, is_carrier=is_carrier
    ):
        variant_pairwise_connections[name_1] = variant_pairwise_connections.get(name_1, (name_1,)) + (name_2,)

    vids_to_remove.update(variant_pairwise_connections.keys())  # set all the clustered variants to be removed

    # for each of these variant and its direct connections
    #   - choose one "best" variant to represent it, with priority given to most carriers, followed by most ref calls
    #   - keep the "best" variant (even if it's previously or subsequently "removed") and remove all others
    num_carrier = {variant_id: variant_is_carrier.sum() for variant_id, variant_is_carrier in is_carrier.items()}
    num_ref = {variant_id: variant_is_ref.sum() for variant_id, variant_is_ref in is_ref.items()}

    def _best_variant_id(variant_id: Text) -> Tuple[int, int, Text]:
        # sort key: carriers, then ref calls, then variant ID as tie-breaker
        return num_carrier[variant_id], num_ref[variant_id], variant_id
    # then remove the best ones
    vids_to_remove.difference_update(
        max(variant_id_cluster, key=_best_variant_id) for variant_id_cluster in variant_pairwise_connections.values()
    )


def resolve_cpx_cnv_redundancies(
        input_vcf: Text,
        output_vcf: Text,
        min_cpx_reciprocal_overlap: float = Default.min_cpx_reciprocal_overlap,
        cnv_cpx_reciprocal_overlap: float = Default.cnv_cpx_reciprocal_overlap,
        cnv_cpx_sample_overlap: float = Default.cnv_cpx_sample_overlap,
        cnv_cnv_reciprocal_overlap: float = Default.cnv_cnv_reciprocal_overlap,
        cnv_cnv_sample_overlap: float = Default.cnv_cnv_sample_overlap,
        clusterable_sv_types: Set[Text] = Default.clusterable_sv_types,
        cpx_ins_classes: Set[Text] = Default.cpx_ins_classes,
        temp_dir: str = Default.temp_dir,
        num_threads: int = Default.num_threads
):
    f"""
    From input VCF, find redundant CNVs:
        CNVs that have sufficient reciprocal overlap and carrier sample Jaccard index with a CPX
        CNVs that have sufficient reciprocal overlap and carrier sample Jaccard index with another CNV
    Write new VCF without redundant CNVs.
    Parameters
    ----------
    input_vcf: Text
        path to input vcf
    output_vcf: Text
        path to write output vcf
    min_cpx_reciprocal_overlap: float (default={Default.min_cpx_reciprocal_overlap})
        Minimum reciprocal overlap with a CPX interval for a CNV interval to be clusterable.
    cnv_cpx_reciprocal_overlap: float (default={Default.cnv_cpx_reciprocal_overlap})
        Minimum reciprocal overlap between two intervals to be part of a cluster.
    cnv_cpx_sample_overlap: float (default={Default.cnv_cpx_sample_overlap})
        Minimum Jaccard index for variant interval to have with sample cluster in order for it to be redundant.
    cnv_cnv_reciprocal_overlap: float (default={Default.cnv_cnv_reciprocal_overlap})
        Minimum reciprocal overlap for two CNVs to be connected
    cnv_cnv_sample_overlap: float (default={Default.cnv_cnv_sample_overlap})
        Minimum carrier samples Jaccard index for two CNVs to be connected
    clusterable_sv_types: Set[Text] (default={Default.clusterable_sv_types})
        SV types that may be redundant (or needed for clustering with redundant SVs).
    cpx_ins_classes: Set[Text] (default={Default.cpx_ins_classes})
        CPX SV types that should produce an INS sink (modeled as a DEL)
    temp_dir: str (default={Default.temp_dir})
        Base folder to create new temp folder in.
    num_threads: int (default={Default.num_threads})
        Number of threads to use for compression/decompression of VCF files.
    """
    temp_dir = os.path.abspath(os.path.expanduser(temp_dir))
    os.makedirs(temp_dir, exist_ok=True)
    pybedtools.set_tempdir(temp_dir)
    is_carrier, is_ref = {}, {}
    with pysam.VariantFile(input_vcf, 'r', threads=num_threads) as f_in:
        header = f_in.header
        potentially_clusterable = pybedtools.BedTool(
            _vcf_records_to_bed_intervals(f_in.fetch(), is_carrier, is_ref,
                                          clusterable_sv_types=clusterable_sv_types,
                                          cpx_ins_classes=cpx_ins_classes)
        ).saveas().sort()

    # get all the potentially clusterable intervals
    vids_to_remove = _find_cnv_cpx_redundancies(
        potentially_clusterable, is_carrier, min_cpx_reciprocal_overlap=min_cpx_reciprocal_overlap,
        cnv_cpx_reciprocal_overlap=cnv_cpx_reciprocal_overlap, cnv_cpx_sample_overlap=cnv_cpx_sample_overlap
    )
    _update_cnv_cnv_redundances(
        vids_to_remove, potentially_clusterable, is_carrier, is_ref,
        cnv_cnv_reciprocal_overlap=cnv_cnv_reciprocal_overlap, cnv_cnv_sample_overlap=cnv_cnv_sample_overlap
    )

    output_folder = os.path.dirname(os.path.abspath(os.path.expanduser(output_vcf)))
    os.makedirs(output_folder, exist_ok=True)
    with pysam.VariantFile(input_vcf, 'r', threads=num_threads) as f_in, \
            pysam.VariantFile(output_vcf, 'w', header=header, threads=num_threads) as f_out:
        for record in f_in.fetch():
            if record.id not in vids_to_remove:
                f_out.write(record)


def __parse_arguments(argv: List[Text]) -> argparse.Namespace:
    # noinspection PyTypeChecker
    parser = argparse.ArgumentParser(
        description="Remove CNVs that are redundant with CPX variants, or each other",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("input_vcf", type=str, help="VCF with potentially redundant CNVs")
    parser.add_argument("output_vcf", type=str, help="VCF with redundant CNVs removed")
    parser.add_argument("--min-cpx-reciprocal-overlap", type=float, default=Default.min_cpx_reciprocal_overlap,
                        help="Minimum reciprocal overlap with a CPX for an interval to be possibly redundant")
    parser.add_argument("--cnv-cpx-reciprocal-overlap", type=float, default=Default.cnv_cpx_reciprocal_overlap,
                        help="Minimum reciprocal interval overlap for clustering CNV with CPX")
    parser.add_argument("--cnv-cpx-sample-overlap", type=float, default=Default.cnv_cpx_sample_overlap,
                        help="Minimum Jaccard index (intersection/union) of samples for clustering CNV with CPX")
    parser.add_argument("--cnv-cnv-reciprocal-overlap", type=float, default=Default.cnv_cnv_reciprocal_overlap,
                        help="Minimum reciprocal interval overlap for clustering CNV with other CNV")
    parser.add_argument("--cnv-cnv-sample-overlap", type=float, default=Default.cnv_cnv_sample_overlap,
                        help="Minimum Jaccard index (intersection/union) of samples for clustering CNV with other CNV")
    parser.add_argument("--temp-dir", "-t", type=str, default=Default.temp_dir, help="directory for temp files")
    parser.add_argument("--num-threads", type=int, default=Default.num_threads,
                        help="number of threads for compressing/decompressing bgzipped files")

    if len(argv) <= 1:
        parser.parse_args(["--help"])
        sys.exit(0)
    parsed_arguments = parser.parse_args(argv[1:])
    if parsed_arguments.input_vcf is None:
        raise ValueError("Must supply input-vcf")
    if parsed_arguments.output_vcf is None:
        raise ValueError("Must supply output-vcf")

    return parsed_arguments


def main(argv: Optional[List[Text]] = None):
    if argv is None:
        argv = sys.argv
    arguments = __parse_arguments(argv)
    resolve_cpx_cnv_redundancies(**vars(arguments))


if __name__ == "__main__":
    main()
a/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_backend_part1.sh b/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_backend_part1.sh deleted file mode 100755 index bf9d4994e..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_backend_part1.sh +++ /dev/null @@ -1,419 +0,0 @@ -#!/bin/bash - -# Intelligently shards a VCF prior to complex resolution (for parallelization) - -# Subsetted to first half, to just output lists of which variants should go in each shard - -set -Eeu -o pipefail - -# ARGS defaults and hard-coded values -DIST=${DEFAULT_DIST:-1000} -RECIP=${DEFAULT_RECIP:-0.1} -MIN_LINES_PER_SHARD=${DEFAULT_MIN_LINES_PER_SHARD:-10} -MAX_SHARDS=${DEFAULT_MAX_SHARDS:-100} -NONCLUSTER_SHARDS=${DEFAULT_NONCLUSTER_SHARDS:-30} -PREFIX=${DEFAULT_PREFIX:-"vcf_shard"} -BREAKPOINT_PADDING=${DEFAULT_BREAKPOINT_PADDING:-5000} -IGNORE_SV_TYPES=${DEFAULT_IGNORE_SV_TYPES:-false} -ADD_SINGLE_REC=${DEFAULT_ADD_SINGLE_REC:-false} -SHARD_LARGE_CLUSTERS=${DEFAULT_SHARD_LARGE_CLUSTERS:-true} -SCRIPT_NAME=${SCRIPT_NAME:-$(basename "${BASH_SOURCE[0]}")} -#Set path to execution directory -BIN=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) -#Set default output folder -OUTDIR=`pwd` - -###USAGE -usage(){ -cat < ${SHARD_VCF_TMP}/single_sample_input.vcf.gz -#Identify all candidate complex variant clusters (generous 1kb clustering) -svtk vcfcluster \ - -d ${DIST} \ - -f ${RECIP} \ - -p candidate_complex_clusters \ - ${IGNORE_SV_TYPES_ARG} \ - -o 0 \ - --preserve-ids \ - <( echo "${SHARD_VCF_TMP}/single_sample_input.vcf.gz" ) \ - ${SHARD_VCF_TMP}/input_vcf.clustered.vcf - -#Convert clustered variants to bed -svtk vcf2bed \ - --no-samples \ - --info ALL \ - ${SHARD_VCF_TMP}/input_vcf.clustered.vcf \ - ${SHARD_VCF_TMP}/input_vcf.clustered.bed - -#Write list of clusters with >1 constituent variant -mem_idx=$( head -n1 ${SHARD_VCF_TMP}/input_vcf.clustered.bed \ - | sed 's/\t/\n/g' | awk '{ if ($1=="MEMBERS") print NR }' ) -awk -v idx=${mem_idx} -v OFS="\t" 
'$idx ~ /,/ { print $1, $2, $3, $idx }' \ - ${SHARD_VCF_TMP}/input_vcf.clustered.bed \ - | (grep -v "^#" || printf "") \ - > ${SHARD_VCF_TMP}/candidate_complex_clusters.bed -if ADD_SINGLE_REC; then - #Add all non-CNV single-record variants - class_idx=$( head -n1 ${SHARD_VCF_TMP}/input_vcf.clustered.bed | \ - sed 's/\t/\n/g' | awk '{ if ($1=="SVTYPE") print NR }' ) - awk -v idx=${class_idx} -v mem_idx=${mem_idx} -v OFS="\t" \ - '$mem_idx !~ /,/ { print $1, $2, $3, $idx, $mem_idx }' \ - ${SHARD_VCF_TMP}/input_vcf.clustered.bed \ - | awk -v OFS="\t" '$4 !~ /DEL|DUP|CNV|MCNV|mCNV/ { print $1, $2, $3, $5 }' \ - | (grep -v "^#" || printf "") \ - >> ${SHARD_VCF_TMP}/candidate_complex_clusters.bed -fi - -#Get min/max coordinates of all variants in list of VIDs -{ - zcat ${VCF} \ - | (grep "^#" || printf "") \ - | cut -f1-10; - cut -f4 ${SHARD_VCF_TMP}/candidate_complex_clusters.bed \ - | sed 's/\,/\n/g' \ - | sort -Vk1,1 \ - | uniq \ - | (fgrep -wf - <(zcat ${VCF}) || printf "") \ - | cut -f1-10; -} \ - | svtk vcf2bed --no-samples /dev/stdin /dev/stdout \ - > ${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed - -###ONLY PERFORM CLUSTER-BASED SHARDING IF ANY VARIANTS PREDICTED TO CLUSTER -if grep -vq "^#" ${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed; then - #Split into breakpoints and pad all breakpoints by ±BREAKPOINT_PADDING - ###DETERMINE SET OF NONREDUNDANT INTERVALS FOR ALL CLUSTERS - grep -v "^#" ${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed \ - | awk -v OFS="\t" -v buffer=$BREAKPOINT_PADDING \ - '{ print $1, $2-buffer, $2+buffer, $4; - print $1, $3-buffer, $3+buffer, $4 }' \ - | awk -v OFS="\t" '{ if ($2<1) $2=1; print $1, $2, $3, $4 }' \ - | sort -Vk1,1 -k2,2n -k3,3n \ - | bedtools merge -i - -c 4 -o distinct \ - > ${SHARD_VCF_TMP}/breakpoint_intervals.bed - #Iterate over breakpoint intervals and write list of maximum nonredundant intervals - in_cluster=`mktemp` - remaining=`mktemp` - cp 
${SHARD_VCF_TMP}/breakpoint_intervals.bed ${remaining} - while read chr start end VIDs; do - #Get all lines associated with current VIDs - echo -e "${VIDs}" \ - | sed 's/,/\n/g' \ - | (fgrep -wf - ${remaining} || printf "") \ - > ${in_cluster} - #Only run if at least one line added to ${in_cluster} - if [ -s "${in_cluster}" ]; then - #Exclude all lines in ${in_cluster} from ${remaining} - bedtools intersect -v -a ${remaining} -b ${in_cluster} > ${remaining}2 - mv ${remaining}2 ${remaining} - #Iterate until no more related VIDs are present in ${remaining} - until ! cut -f4 ${in_cluster} | sed 's/\,/\n/g' | fgrep -qwf - ${remaining}; do - #Add new lines to ${in_cluster} - cut -f4 ${in_cluster} \ - | sed 's/\,/\n/g' \ - | (fgrep -wf - ${remaining} || printf "") \ - >> ${in_cluster} - #Exclude all lines in ${in_cluster} from ${remaining} - bedtools intersect -v -a ${remaining} -b ${in_cluster} > ${remaining}2 - mv ${remaining}2 ${remaining} - done - #Write out final interval - for wrapper in 1; do - #Print list of coordinates - cut -f1-3 ${in_cluster} \ - | sort -Vk1,1 -k2,2 -k3,3 \ - | bedtools merge -i - \ - | awk '{ print $1":"$2"-"$3 }' \ - | paste -s -d\; - #Print list of involved VIDs - cut -f4 ${in_cluster} \ - | sed 's/,/\n/g' \ - | sort \ - | uniq \ - | paste -s -d, - done | paste -s - fi - done < ${SHARD_VCF_TMP}/breakpoint_intervals.bed \ - > ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt - - if SHARD_LARGE_CLUSTERS; then - #Pull out exceptionally large clusters to the side to be placed in their own shards - while read ints VIDs; do - if [ $( echo ${VIDs} | sed 's/,/\n/g' | wc -l ) -ge ${MIN_LINES_PER_SHARD} ]; then - echo -e "${ints}\t${VIDs}" - fi - done < ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - > ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt - if [ -s "${SHARD_VCF_TMP}/large_intervals_to_test.final.txt" ]; then - cut -f2 ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt \ - | sed 's/,/\n/g' \ - | (fgrep -wvf - 
${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt || printf "") \ - > ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt2 - mv ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt2 \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt - fi - fi - - ###DETERMINE COORDINATES FOR EACH SHARD - #Split variants into shards based on number of variants - #If total number of intervals/MAX_SHARDS < MIN_LINES_PER_SHARD, evenly split into MIN_LINES_PER_SHARD sites per shard - if [ $(( $( wc -l < "${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt" ) / ${MAX_SHARDS} )) -lt ${MIN_LINES_PER_SHARD} ]; then - ${BIN}/evenSplitter.R \ - -L ${MIN_LINES_PER_SHARD} \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_ - #Otherwise, split into MAX_SHARDS evenly-sized shards - else - ${BIN}/evenSplitter.R \ - -S ${MAX_SHARDS} \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_ - fi - #Determine number of shards - n_shards=$( find ${SHARD_VCF_TMP} -name "${PREFIX}.shard_intervals_*" | wc -l ) - #Writes exceptionally large clusters to their own shards - if [ -s "${SHARD_VCF_TMP}/large_intervals_to_test.final.txt" ]; then - while read ints VIDs; do - ((++n_shards)) - echo -e "${ints}\t${VIDs}" > ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${n_shards} - done < ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt - fi - #Reformat interval shards - for i in $( seq 1 ${n_shards} ); do - cut -f1 ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i} \ - | sed -e 's/\;/\n/g' -e 's/\:/\t/g' -e 's/\-/\t/g' \ - | sort -Vk1,1 -k2,2n -k3,3n \ - | bedtools merge -i - \ - > ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i}.bed - rm ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i} - done - - - ###SHARD CLUSTERABLE VCF - #Convert full, original VCF to BED - svtk vcf2bed --no-samples ${VCF} int.bed - #Harrison's patch for sharding - awk '{ if ($1!~"#") { print $1,$2,$2+1,$4,$5; - print $1,$3-1,$3,$4,$5 } - 
else print }' OFS='\t' int.bed \ - > ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed - rm int.bed - - #Create exclusion list of VIDs already used in earlier shards - touch ${SHARD_VCF_TMP}/used_VIDs.tmp - #Iterate over all sharded intervals - for i in $( seq 1 ${n_shards} ); do - if [ ${i} -gt 1 ]; then - cat ${OUTDIR}/${PREFIX}.shard_*.VIDs.list \ - | sort \ - | uniq \ - > ${SHARD_VCF_TMP}/used_VIDs.tmp - else - touch ${SHARD_VCF_TMP}/used_VIDs.tmp - fi - #Get list of IDs to be used in shard - bedtools intersect \ - -a ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed \ - -b ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i}.bed \ - | cut -f4 \ - | (fgrep -wvf ${SHARD_VCF_TMP}/used_VIDs.tmp || printf "")\ - | sort \ - | uniq \ - > ${OUTDIR}/${PREFIX}.shard_${i}.VIDs.list - - #Update exclusion list of VIDs already used in earlier shards - cat ${OUTDIR}/${PREFIX}.shard_*.VIDs.list \ - | sort \ - | uniq \ - > ${SHARD_VCF_TMP}/used_VIDs.tmp - done - - #Write list of eligible VIDs - (grep -v "^#" ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed || printf "") \ - | cut -f4 \ - | (fgrep -wvf ${SHARD_VCF_TMP}/used_VIDs.tmp || printf "") \ - | sort \ - | uniq \ - > ${SHARD_VCF_TMP}/remaining_VIDs.list -else - n_shards=0 - zcat ${VCF} \ - | (grep -v "^#" || printf "") \ - | cut -f3 \ - | sort \ - | uniq \ - > ${SHARD_VCF_TMP}/remaining_VIDs.list -fi - - -###SHARD NONCLUSTERABLE VCF -#Shard remainder intervals into no more than $NONCLUSTER_SHARDS shards -#If total number of variants/NONCLUSTER_SHARDS < MIN_LINES_PER_SHARD, evenly split into MIN_LINES_PER_SHARD sites per shard -if [ -s "${SHARD_VCF_TMP}/remaining_VIDs.list" ]; then - if [ $(( $(wc -l < ${SHARD_VCF_TMP}/remaining_VIDs.list) / ${NONCLUSTER_SHARDS} )) -lt ${MIN_LINES_PER_SHARD} ]; then - ${BIN}/evenSplitter.R \ - -L ${MIN_LINES_PER_SHARD} \ - ${SHARD_VCF_TMP}/remaining_VIDs.list \ - ${SHARD_VCF_TMP}/${PREFIX}.remaining_variants_ - #Otherwise, split into MAX_SHARDS evenly-sized shards - else - ${BIN}/evenSplitter.R \ - -S ${NONCLUSTER_SHARDS} \ 
- ${SHARD_VCF_TMP}/remaining_VIDs.list \ - ${SHARD_VCF_TMP}/${PREFIX}.remaining_variants_ - fi - n_nonclusterable_shards=$( find ${SHARD_VCF_TMP} -name "${PREFIX}.remaining_variants_*" | wc -l ) - #Iterate over all sharded variant lists - for i in $( seq 1 ${n_nonclusterable_shards} ); do - idx=$(( ${n_shards} + ${i} )) - mv ${SHARD_VCF_TMP}/${PREFIX}.remaining_variants_${i} \ - ${OUTDIR}/${PREFIX}.shard_${idx}.VIDs.list - done -fi - -###CLEAN UP -rm -rf ${SHARD_VCF_TMP} - -1>&2 echo "$(basename $0): Success" diff --git a/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preClustering.sh b/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preClustering.sh deleted file mode 100755 index 2d3062daa..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preClustering.sh +++ /dev/null @@ -1,328 +0,0 @@ -#!/bin/bash - -# Intelligently shards a VCF prior to complex resolution (for parallelization) - -set -e - -###USAGE -usage(){ -cat < ${SHARD_VCF_TMP}/single_sample_input.vcf.gz -#Identify all candidate complex variant clusters (generous 1kb clustering) -svtk vcfcluster \ - -d ${DIST} \ - -f ${RECIP} \ - -p candidate_complex_clusters \ - -o 0 \ - --preserve-ids \ - <( echo "${SHARD_VCF_TMP}/single_sample_input.vcf.gz" ) \ - ${SHARD_VCF_TMP}/input_vcf.clustered.vcf -#Convert clustered variants to bed -svtk vcf2bed \ - --no-samples \ - --info ALL \ - ${SHARD_VCF_TMP}/input_vcf.clustered.vcf \ - ${SHARD_VCF_TMP}/input_vcf.clustered.bed -#Write list of clusters with >1 constituent variant -mem_idx=$( head -n1 ${SHARD_VCF_TMP}/input_vcf.clustered.bed | \ - sed 's/\t/\n/g' | awk '{ if ($1=="MEMBERS") print NR }' ) -awk -v idx=${mem_idx} -v OFS="\t" \ -'$idx ~ /,/ { print $1, $2, $3, $idx }' \ -${SHARD_VCF_TMP}/input_vcf.clustered.bed | fgrep -v "#" > \ -${SHARD_VCF_TMP}/candidate_complex_clusters.bed -#Get min/max coordinates of all variants in list of VIDs -cat <( zcat ${VCF} | fgrep "#" | cut -f1-10 ) \ -<( cut -f4 
${SHARD_VCF_TMP}/candidate_complex_clusters.bed | \ - sed 's/\,/\n/g' | sort -Vk1,1 | uniq | fgrep -wf - \ - <( zcat ${VCF} ) | cut -f1-10 ) | \ -svtk vcf2bed --no-samples /dev/stdin /dev/stdout > \ -${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed - - -###DETERMINE SET OF NONREDUNDANT INTERVALS FOR ALL CLUSTERS -#Split into breakpoints and pad all breakpoints by ±5kb -fgrep -v "#" ${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed | \ -awk -v OFS="\t" -v buffer=5000 \ -'{ print $1, $2-buffer, $2+buffer, $4"\n"$1, $3-buffer, $3+buffer, $4 }' | \ -awk -v OFS="\t" '{ if ($2<1) $2=1; print $1, $2, $3, $4 }' | \ -sort -Vk1,1 -k2,2n -k3,3n | bedtools merge -i - -c 4 -o distinct > \ -${SHARD_VCF_TMP}/breakpoint_intervals.bed -#Iterate over breakpoint intervals and write list of maximum nonredundant intervals -in_cluster=`mktemp` -remaining=`mktemp` -cp ${SHARD_VCF_TMP}/breakpoint_intervals.bed ${remaining} -while read chr start end VIDs; do - #Get all lines associated with current VIDs - echo -e "${VIDs}" | sed 's/,/\n/g' | fgrep -wf - \ - ${remaining} > ${in_cluster} - #Only run if at least one line added to ${in_cluster} - if [ $( cat ${in_cluster} | wc -l ) -gt 0 ]; then - #Exclude all lines in ${in_cluster} from ${remaining} - bedtools intersect -v -a ${remaining} -b ${in_cluster} > ${remaining}2 - mv ${remaining}2 ${remaining} - #Iterate until no more related VIDs are present in ${remaining} - until [ $( cut -f4 ${in_cluster} | sed 's/\,/\n/g' | fgrep -wf - ${remaining} | wc -l ) -eq 0 ]; do - #Add new lines to ${in_cluster} - cut -f4 ${in_cluster} | sed 's/\,/\n/g' | fgrep -wf - ${remaining} >> ${in_cluster} - #Exclude all lines in ${in_cluster} from ${remaining} - bedtools intersect -v -a ${remaining} -b ${in_cluster} > ${remaining}2 - mv ${remaining}2 ${remaining} - done - #Write out final interval - for wrapper in 1; do - #Print list of coordinates - cut -f1-3 ${in_cluster} | sort -Vk1,1 -k2,2 -k3,3 | bedtools merge -i - | \ - 
awk '{ print $1":"$2"-"$3 }' | paste -s -d\; - #Print list of involved VIDs - cut -f4 ${in_cluster} | sed 's/,/\n/g' | sort | uniq | paste -s -d, - done | paste -s - fi -done < ${SHARD_VCF_TMP}/breakpoint_intervals.bed > \ -${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt - - -###DETERMINE COORDINATES FOR EACH SHARD -#Split variants into shards based on number of variants -#If total number of intervals/MAX_SHARDS < MIN_LINES_PER_SHARD, evenly split into MIN_LINES_PER_SHARD sites per shard -if [ $(( $( cat ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt | wc -l ) / ${MAX_SHARDS} )) -lt ${MIN_LINES_PER_SHARD} ]; then - ${BIN}/evenSplitter.R \ - -L ${MIN_LINES_PER_SHARD} \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_ -#Otherwise, split into MAX_SHARDS evenly-sized shards -else - ${BIN}/evenSplitter.R \ - -S ${MAX_SHARDS} \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_ -fi -#Determine number of shards -n_shards=$( find ${SHARD_VCF_TMP} -name "${PREFIX}*" | wc -l ) -#Reformat interval shards -for i in $( seq 1 ${n_shards} ); do - cut -f1 ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i} | \ - sed -e 's/\;/\n/g' -e 's/\:/\t/g' -e 's/\-/\t/g' | \ - sort -Vk1,1 -k2,2n -k3,3n | bedtools merge -i - > \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i}.bed - rm ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i} -done - - -###SHARD CLUSTERABLE VCF -#Convert full, original VCF to BED -svtk vcf2bed --no-samples \ - ${VCF} int.bed -#Harrison's patch for sharding -awk '{if ($1!~"#") print $1,$2,$2+1,$4,$5 \ - "\n" $1,$3-1,$3,$4,$5;else print}' OFS='\t' int.bed \ - > ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed -rm int.bed -#Iterate over all sharded intervals -for i in $( seq 1 ${n_shards} ); do - #Write exclusion list of VIDs already used in earlier shards - touch ${SHARD_VCF_TMP}/used_VIDs.tmp - if [ ${i} -gt 1 ]; then - for j in $( seq 1 $(( ${i} - 1 )) ); do - cat 
${SHARD_VCF_TMP}/${PREFIX}.shard_${j}.VIDs.list - done | sort | uniq > ${SHARD_VCF_TMP}/used_VIDs.tmp - fi - #Get list of IDs to be used in shard - bedtools intersect \ - -a ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed \ - -b ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i}.bed | \ - cut -f4 | fgrep -wvf ${SHARD_VCF_TMP}/used_VIDs.tmp > \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_${i}.VIDs.list - #Print header - zcat ${VCF} | head -n1000 | fgrep "#" > \ - ${OUTDIR}/${PREFIX}.shard_${i}.vcf - #Shard based on VIDs (slower than tabix, but avoids omitting variants) - zcat ${VCF} | fgrep -wf ${SHARD_VCF_TMP}/${PREFIX}.shard_${i}.VIDs.list >> \ - ${OUTDIR}/${PREFIX}.shard_${i}.vcf - #Sanity check shard - if [ $( fgrep -v "#" ${OUTDIR}/${PREFIX}.shard_${i}.vcf | wc -l ) -gt 0 ]; then - #Bgzip & tabix shard - bgzip -f ${OUTDIR}/${PREFIX}.shard_${i}.vcf - tabix -f ${OUTDIR}/${PREFIX}.shard_${i}.vcf.gz - else - rm ${OUTDIR}/${PREFIX}.shard_${i}.vcf - fi - #Clean up used VID list - rm ${SHARD_VCF_TMP}/used_VIDs.tmp -done - - -###SHARD NONCLUSTERABLE VCF -#Get list of variant IDs not present in any previous shard -vcf-concat ${OUTDIR}/${PREFIX}.shard_*.vcf.gz \ - | fgrep -v "#" | cut -f3 \ - > ${SHARD_VCF_TMP}/used_VIDs.tmp -#Get list of eligible variant IDs -zcat ${VCF} | fgrep -v "#" | cut -f3 \ - | fgrep -wvf ${SHARD_VCF_TMP}/used_VIDs.tmp \ - > ${SHARD_VCF_TMP}/remaining_VIDs.tmp -#Shard remainder intervals into no more than $NONCLUSTER_SHARDS shards -#If total number of variants/NONCLUSTER_SHARDS < MIN_LINES_PER_SHARD, evenly split into MIN_LINES_PER_SHARD sites per shard -if [ $(( $( cat ${SHARD_VCF_TMP}/remaining_VIDs.tmp | wc -l ) / ${NONCLUSTER_SHARDS} )) -lt ${MIN_LINES_PER_SHARD} ]; then - ${BIN}/evenSplitter.R \ - -L ${MIN_LINES_PER_SHARD} \ - ${SHARD_VCF_TMP}/remaining_VIDs.tmp \ - ${SHARD_VCF_TMP}/${PREFIX}.remaining_variants_ -#Otherwise, split into MAX_SHARDS evenly-sized shards -else - ${BIN}/evenSplitter.R \ - -S ${NONCLUSTER_SHARDS} \ - ${SHARD_VCF_TMP}/remaining_VIDs.tmp 
\ - ${SHARD_VCF_TMP}/${PREFIX}.remaining_variants_ -fi -n_nonclusterable_shards=$( find ${SHARD_VCF_TMP} -name "${PREFIX}.remaining_variants_*" | wc -l ) -#Iterate over all sharded variant lists -for i in $( seq 1 ${n_nonclusterable_shards} ); do - #Print header - zcat ${VCF} | head -n1000 | fgrep "#" > \ - ${OUTDIR}/${PREFIX}.shard_$(( ${n_shards} + ${i} )).vcf - #Shard based on VIDs (slower than tabix, but avoids omitting variants) - zcat ${VCF} | fgrep -wf ${SHARD_VCF_TMP}/${PREFIX}.remaining_variants_${i} >> \ - ${OUTDIR}/${PREFIX}.shard_$(( ${n_shards} + ${i} )).vcf - #Sanity check shard - if [ $( fgrep -v "#" ${OUTDIR}/${PREFIX}.shard_$(( ${n_shards} + ${i} )).vcf | wc -l ) -gt 0 ]; then - #Bgzip & tabix shard - bgzip -f ${OUTDIR}/${PREFIX}.shard_$(( ${n_shards} + ${i} )).vcf - tabix -f ${OUTDIR}/${PREFIX}.shard_$(( ${n_shards} + ${i} )).vcf.gz - else - rm ${OUTDIR}/${PREFIX}.shard_$(( ${n_shards} + ${i} )).vcf - fi -done - - -###CLEAN UP -rm -rf ${SHARD_VCF_TMP} diff --git a/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preClustering_part1.sh b/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preClustering_part1.sh deleted file mode 100755 index 7434b0eaa..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preClustering_part1.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -# Intelligently shards a VCF prior to complex resolution (for parallelization) - -# Subsetted to first half, to just output lists of which variants should go in each shard - -set -Eeu -o pipefail - -# ARGS defaults and hard-coded values -# generous 1kb clustering, 10% RO clustering -DEFAULT_DIST=1000 -DEFAULT_RECIP=0.1 -DEFAULT_MIN_LINES_PER_SHARD=10 -DEFAULT_MAX_SHARDS=100 -DEFAULT_NONCLUSTER_SHARDS=30 -DEFAULT_PREFIX="vcf_shard" -DEFAULT_BREAKPOINT_PADDING=5000 -DEFAULT_IGNORE_SV_TYPES=false -DEFAULT_ADD_SINGLE_REC=false -DEFAULT_SHARD_LARGE_CLUSTERS=true -SCRIPT_NAME=$(basename "${BASH_SOURCE[0]}") - -BIN=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && 
pwd ) -source $BIN/shardVCF_backend_part1.sh diff --git a/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preResolveCPX.sh b/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preResolveCPX.sh deleted file mode 100755 index 0793f75c7..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preResolveCPX.sh +++ /dev/null @@ -1,368 +0,0 @@ -#!/bin/bash - -# Intelligently shards a VCF prior to complex resolution (for parallelization) - -set -euo pipefail - -###USAGE -usage(){ -cat < ${SHARD_VCF_TMP}/single_sample_input.vcf.gz -#Identify all candidate complex variant clusters (generous 1kb clustering) -svtk vcfcluster \ - -d 1000 \ - -f 0 \ - -p candidate_complex_clusters \ - --ignore-svtypes \ - -o 0 \ - --preserve-ids \ - <( echo "${SHARD_VCF_TMP}/single_sample_input.vcf.gz" ) \ - ${SHARD_VCF_TMP}/input_vcf.clustered.vcf -#Convert clustered variants to bed -svtk vcf2bed \ - --no-samples \ - --info ALL \ - ${SHARD_VCF_TMP}/input_vcf.clustered.vcf \ - ${SHARD_VCF_TMP}/input_vcf.clustered.bed -#Write list of clusters with >1 constituent variant -mem_idx=$( head -n1 ${SHARD_VCF_TMP}/input_vcf.clustered.bed | \ - sed 's/\t/\n/g' | awk '{ if ($1=="MEMBERS") print NR }' ) -awk -v idx=${mem_idx} -v OFS="\t" \ -'$idx ~ /,/ { print $1, $2, $3, $idx }' \ -${SHARD_VCF_TMP}/input_vcf.clustered.bed | fgrep -v "#" > \ -${SHARD_VCF_TMP}/candidate_complex_clusters.bed -#Add all non-CNV single-record variants -class_idx=$( head -n1 ${SHARD_VCF_TMP}/input_vcf.clustered.bed | \ - sed 's/\t/\n/g' | awk '{ if ($1=="SVTYPE") print NR }' ) -awk -v idx=${class_idx} -v mem_idx=${mem_idx} -v OFS="\t" \ -'$mem_idx !~ /,/ { print $1, $2, $3, $idx, $mem_idx }' \ -${SHARD_VCF_TMP}/input_vcf.clustered.bed | \ -awk -v OFS="\t" '$4 !~ /DEL|DUP|CNV|MCNV|mCNV/ { print $1, $2, $3, $5 }' | fgrep -v "#" >> \ -${SHARD_VCF_TMP}/candidate_complex_clusters.bed -#Get min/max coordinates of all variants in list of VIDs -cat <( zcat ${VCF} | fgrep "#" | cut -f1-10 ) \ -<( cut -f4 
${SHARD_VCF_TMP}/candidate_complex_clusters.bed | \ - sed 's/\,/\n/g' | sort -Vk1,1 | uniq | { fgrep -wf - <( zcat ${VCF} ) || true; } \ - | cut -f1-10 ) | \ -svtk vcf2bed --no-samples /dev/stdin /dev/stdout > \ -${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed - - -#Split into breakpoints and pad all breakpoints by ±0kb -#DEV NOTE: padding breakpoints for large chromosomes & many samples was causing -# issues where tens of thousands of breakpoints would end up in the same shard -# and take >36h to resolve, defeating the purpose of sharding -fgrep -v "#" ${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed | \ -awk -v OFS="\t" -v buffer=0 \ -'{ print $1, $2-buffer, $2+buffer, $4"\n"$1, $3-buffer, $3+buffer, $4 }' | \ -awk -v OFS="\t" '{ if ($2<1) $2=1; print $1, $2, $3, $4 }' | \ -sort -Vk1,1 -k2,2n -k3,3n | bedtools merge -i - -c 4 -o distinct > \ -${SHARD_VCF_TMP}/breakpoint_intervals.bed -#Iterate over breakpoint intervals and write list of maximum nonredundant intervals -in_cluster=`mktemp` -remaining=`mktemp` -cp ${SHARD_VCF_TMP}/breakpoint_intervals.bed ${remaining} -while read chr start end VIDs; do - #Get all lines associated with current VIDs - echo -e "${VIDs}" | sed 's/,/\n/g' | { fgrep -wf - ${remaining} || true; } > ${in_cluster} - #Only run if at least one line added to ${in_cluster} - if [ $( cat ${in_cluster} | wc -l ) -gt 0 ]; then - #Exclude all lines in ${in_cluster} from ${remaining} - bedtools intersect -v -a ${remaining} -b ${in_cluster} > ${remaining}2 - mv ${remaining}2 ${remaining} - #Iterate until no more related VIDs are present in ${remaining} - until [ $( cut -f4 ${in_cluster} | sed 's/\,/\n/g' | { fgrep -wf - ${remaining} || true; } | wc -l ) -eq 0 ]; do - #Add new lines to ${in_cluster} - cut -f4 ${in_cluster} | sed 's/\,/\n/g' | { fgrep -wf - ${remaining} || true; } >> ${in_cluster} - #Exclude all lines in ${in_cluster} from ${remaining} - bedtools intersect -v -a ${remaining} -b ${in_cluster} > 
${remaining}2 - mv ${remaining}2 ${remaining} - done - #Write out final interval - for wrapper in 1; do - #Print list of coordinates - cut -f1-3 ${in_cluster} | sort -Vk1,1 -k2,2 -k3,3 | bedtools merge -i - | \ - awk '{ print $1":"$2"-"$3 }' | paste -s -d\; - #Print list of involved VIDs - cut -f4 ${in_cluster} | sed 's/,/\n/g' | sort | uniq | paste -s -d, - done | paste -s - fi -done < ${SHARD_VCF_TMP}/breakpoint_intervals.bed > \ -${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt -#Pull out exceptionally large clusters to the side to be placed in their own shards -while read ints VIDs; do - if [ $( echo ${VIDs} | sed 's/,/\n/g' | wc -l ) -ge ${MIN_LINES_PER_SHARD} ]; then - echo -e "${ints}\t${VIDs}" - fi -done < ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - > ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt -if [ $( cat ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt | wc -l ) -gt 0 ]; then - cut -f2 ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt \ - | sed 's/,/\n/g' \ - | { fgrep -wvf - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt || true; } \ - > ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt2 - mv ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt2 \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt -fi - - -###DETERMINE COORDINATES FOR EACH SHARD -#Split variants into shards based on number of variants -#If total number of intervals/MAX_SHARDS < MIN_LINES_PER_SHARD, evenly split into MIN_LINES_PER_SHARD sites per shard -if [ $(( $( cat ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt | wc -l ) / ${MAX_SHARDS} )) -lt ${MIN_LINES_PER_SHARD} ]; then - ${BIN}/evenSplitter.R \ - -L ${MIN_LINES_PER_SHARD} \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_ -#Otherwise, split into MAX_SHARDS evenly-sized shards -else - ${BIN}/evenSplitter.R \ - -S ${MAX_SHARDS} \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_ 
-fi -#Writes exceptionally large clusters to their own shards -n_shards=$( find ${SHARD_VCF_TMP} -name "${PREFIX}*" | wc -l ) -if [ $( cat ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt | wc -l ) -gt 0 ]; then - while read ints VIDs; do - n_shards=$(( ${n_shards} + 1 )) - echo -e "${ints}\t${VIDs}" > ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${n_shards} - done < ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt -fi -#Reformat interval shards -for i in $( seq 1 ${n_shards} ); do - cut -f1 ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i} | \ - sed -e 's/\;/\n/g' -e 's/\:/\t/g' -e 's/\-/\t/g' | \ - sort -Vk1,1 -k2,2n -k3,3n | bedtools merge -i - > \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i}.bed - rm ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i} -done - - -###SHARD VCF -#Convert full, original VCF to BED -svtk vcf2bed --no-samples \ - ${VCF} int.bed -#Harrison's patch for sharding -awk '{if ($1!~"#") print $1,$2,$2+1,$4,$5 \ - "\n" $1,$3-1,$3,$4,$5;else print}' OFS='\t' int.bed \ - > ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed -rm int.bed -#Iterate over all sharded intervals -for i in $( seq 1 $(( ${n_shards} )) ); do - #Write exclusion list of VIDs already used in earlier shards - touch ${SHARD_VCF_TMP}/used_VIDs.tmp - if [ ${i} -gt 1 ]; then - for j in $( seq 1 $(( ${i} - 1 )) ); do - cat ${SHARD_VCF_TMP}/${PREFIX}.shard_${j}.VIDs.list - done | sort | uniq > ${SHARD_VCF_TMP}/used_VIDs.tmp - fi - #Get list of IDs to be used in shard - bedtools intersect \ - -a ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed \ - -b ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i}.bed | \ - cut -f4 | { fgrep -wvf ${SHARD_VCF_TMP}/used_VIDs.tmp || true; } > \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_${i}.VIDs.list - #Print header - zcat ${VCF} | head -n1000 | fgrep "#" > \ - ${OUTDIR}/${PREFIX}.shard_${i}.vcf - #Shard based on VIDs (slower than tabix, but avoids omitting variants) - zcat ${VCF} | { fgrep -wf ${SHARD_VCF_TMP}/${PREFIX}.shard_${i}.VIDs.list || true; } >> \ - 
${OUTDIR}/${PREFIX}.shard_${i}.vcf - #Sanity check shard - if [ $( { fgrep -v "#" ${OUTDIR}/${PREFIX}.shard_${i}.vcf || true; } | wc -l ) -gt 0 ]; then - #Bgzip & tabix shard - bgzip -f ${OUTDIR}/${PREFIX}.shard_${i}.vcf - tabix -f ${OUTDIR}/${PREFIX}.shard_${i}.vcf.gz - else - rm ${OUTDIR}/${PREFIX}.shard_${i}.vcf - fi - #Clean up used VID list - rm ${SHARD_VCF_TMP}/used_VIDs.tmp -done -#Write list of all VIDs used in cluster shards -zcat ${OUTDIR}/${PREFIX}.shard_*.vcf.gz \ - | cut -f1-3 | fgrep -v "#" | cut -f3 \ - > ${SHARD_VCF_TMP}/used_VIDs.cluster_shards.list -#Write list of eligible VIDs -fgrep -v "#" ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed \ - | cut -f4 \ - | { fgrep -wvf ${SHARD_VCF_TMP}/used_VIDs.cluster_shards.list || true; } \ - > ${SHARD_VCF_TMP}/remaining_VIDs.list -#Shard remaining records into no more than $NONCLUSTER_SHARDS shards -#If total number of records/NONCLUSTER_SHARDS < MIN_LINES_PER_SHARD, evenly split into MIN_LINES_PER_SHARD sites per shard -if [ $(( $( cat ${SHARD_VCF_TMP}/remaining_VIDs.list | wc -l ) / ${NONCLUSTER_SHARDS} )) -lt ${MIN_LINES_PER_SHARD} ]; then - ${BIN}/evenSplitter.R \ - -L ${MIN_LINES_PER_SHARD} \ - ${SHARD_VCF_TMP}/remaining_VIDs.list \ - ${SHARD_VCF_TMP}/${PREFIX}.remainder_VIDs_ -#Otherwise, split into MAX_SHARDS evenly-sized shards -else - ${BIN}/evenSplitter.R \ - -S ${NONCLUSTER_SHARDS} \ - ${SHARD_VCF_TMP}/remaining_VIDs.list \ - ${SHARD_VCF_TMP}/${PREFIX}.remainder_VIDs_ -fi -#Iterate over all non-cluster shards and generate VCF shards -n_noncluster_shards=$( find ${SHARD_VCF_TMP} -name "${PREFIX}.remainder_VIDs_*" | wc -l ) -for i in $( seq 1 ${n_noncluster_shards} ); do - idx=$(( ${n_shards} + ${i} )) - #Print header - zcat ${VCF} | head -n1000 | fgrep "#" > \ - ${OUTDIR}/${PREFIX}.shard_${idx}.vcf - #Shard based on VIDs (slower than tabix, but avoids omitting variants) - zcat ${VCF} | { fgrep -wf ${SHARD_VCF_TMP}/${PREFIX}.remainder_VIDs_${i} || true; } >> \ - ${OUTDIR}/${PREFIX}.shard_${idx}.vcf - 
#Sanity check shard - if [ $( { fgrep -v "#" ${OUTDIR}/${PREFIX}.shard_${idx}.vcf || true; } | wc -l ) -gt 0 ]; then - #Bgzip & tabix shard - bgzip -f ${OUTDIR}/${PREFIX}.shard_${idx}.vcf - tabix -f ${OUTDIR}/${PREFIX}.shard_${idx}.vcf.gz - else - rm ${OUTDIR}/${PREFIX}.shard_${idx}.vcf - fi -done - - -###SANITY CHECK SHARDS -while read shard; do - zcat ${shard} | cut -f1 | { fgrep -v "#" || true; } | wc -l -done < <( find ${OUTDIR} -name "${PREFIX}.shard_*.vcf.gz" ) \ - | sort -nrk1,1 \ - > ${SHARD_VCF_TMP}/vars_per_shard.txt -echo -e "FINISHED SHARDING VCF. RESULTING RECORDS PER SHARD FOR LARGEST 100 SHARDS:" -head -n100 ${SHARD_VCF_TMP}/vars_per_shard.txt | paste -s -d',' -#If shard with most variants is >10-fold more than next-largest shard, exit with code 1 -if [ $( find ${OUTDIR} -name "${PREFIX}.shard_*.vcf.gz" | wc -l ) -gt 1 ]; then - first=$( sed -n '1p' ${SHARD_VCF_TMP}/vars_per_shard.txt ) - second=$( sed -n '2p' ${SHARD_VCF_TMP}/vars_per_shard.txt ) - if [ ! -z ${second} ] && [ ${second} -gt 0 ]; then - if [ $(( ${first} / ${second} )) -ge 10 ]; then - echo -e "CRITICAL WARNING: LARGEST SHARD IS AT LEAST 10 TIMES LARGER THAN SECOND-LARGEST SHARD" - exit 1 - fi - fi -fi - - -###CLEAN UP -rm -rf ${SHARD_VCF_TMP} diff --git a/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preResolveCPX_invOnly.sh b/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preResolveCPX_invOnly.sh deleted file mode 100755 index e8d6f18bd..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preResolveCPX_invOnly.sh +++ /dev/null @@ -1,369 +0,0 @@ -#!/bin/bash - -# Intelligently shards a VCF prior to complex resolution (for parallelization) - -# Version modified for inv-only sharding (RO 10%, no restriction on breakpoint distance) - -set -euo pipefail - -###USAGE -usage(){ -cat < ${SHARD_VCF_TMP}/single_sample_input.vcf.gz -#Identify all candidate complex variant clusters (generous 50Mb, 10% RO clustering) -svtk vcfcluster \ - -d 50000000 \ - -f 0.10 
\ - -p candidate_complex_clusters \ - --ignore-svtypes \ - -o 0 \ - --preserve-ids \ - <( echo "${SHARD_VCF_TMP}/single_sample_input.vcf.gz" ) \ - ${SHARD_VCF_TMP}/input_vcf.clustered.vcf -#Convert clustered variants to bed -svtk vcf2bed \ - --no-samples \ - --info ALL \ - ${SHARD_VCF_TMP}/input_vcf.clustered.vcf \ - ${SHARD_VCF_TMP}/input_vcf.clustered.bed -#Write list of clusters with >1 constituent variant -mem_idx=$( head -n1 ${SHARD_VCF_TMP}/input_vcf.clustered.bed | \ - sed 's/\t/\n/g' | awk '{ if ($1=="MEMBERS") print NR }' ) -awk -v idx=${mem_idx} -v OFS="\t" \ -'$idx ~ /,/ { print $1, $2, $3, $idx }' \ -${SHARD_VCF_TMP}/input_vcf.clustered.bed | fgrep -v "#" > \ -${SHARD_VCF_TMP}/candidate_complex_clusters.bed -#Add all non-CNV single-record variants -class_idx=$( head -n1 ${SHARD_VCF_TMP}/input_vcf.clustered.bed | \ - sed 's/\t/\n/g' | awk '{ if ($1=="SVTYPE") print NR }' ) -awk -v idx=${class_idx} -v mem_idx=${mem_idx} -v OFS="\t" \ -'$mem_idx !~ /,/ { print $1, $2, $3, $idx, $mem_idx }' \ -${SHARD_VCF_TMP}/input_vcf.clustered.bed | \ -awk -v OFS="\t" '$4 !~ /DEL|DUP|CNV|MCNV|mCNV/ { print $1, $2, $3, $5 }' | fgrep -v "#" >> \ -${SHARD_VCF_TMP}/candidate_complex_clusters.bed -#Get min/max coordinates of all variants in list of VIDs -cat <( zcat ${VCF} | fgrep "#" | cut -f1-10 ) \ -<( cut -f4 ${SHARD_VCF_TMP}/candidate_complex_clusters.bed | \ - sed 's/\,/\n/g' | sort -Vk1,1 | uniq | { fgrep -wf - <( zcat ${VCF} ) || true; } | \ - cut -f1-10 ) | \ -svtk vcf2bed --no-samples /dev/stdin /dev/stdout > \ -${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed - - -#Split into breakpoints and pad all breakpoints by ±0kb -#DEV NOTE: padding breakpoints for large chromosomes & many samples was causing -# issues where tens of thousands of breakpoints would end up in the same shard -# and take >36h to resolve, defeating the purpose of sharding -fgrep -v "#" ${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed | \ -awk -v OFS="\t" -v 
buffer=0 \ -'{ print $1, $2-buffer, $2+buffer, $4"\n"$1, $3-buffer, $3+buffer, $4 }' | \ -awk -v OFS="\t" '{ if ($2<1) $2=1; print $1, $2, $3, $4 }' | \ -sort -Vk1,1 -k2,2n -k3,3n | bedtools merge -i - -c 4 -o distinct > \ -${SHARD_VCF_TMP}/breakpoint_intervals.bed -#Iterate over breakpoint intervals and write list of maximum nonredundant intervals -in_cluster=`mktemp` -remaining=`mktemp` -cp ${SHARD_VCF_TMP}/breakpoint_intervals.bed ${remaining} -while read chr start end VIDs; do - #Get all lines associated with current VIDs - echo -e "${VIDs}" | sed 's/,/\n/g' | { fgrep -wf - ${remaining} || true; } \ - > ${in_cluster} - #Only run if at least one line added to ${in_cluster} - if [ $( cat ${in_cluster} | wc -l ) -gt 0 ]; then - #Exclude all lines in ${in_cluster} from ${remaining} - bedtools intersect -v -a ${remaining} -b ${in_cluster} > ${remaining}2 - mv ${remaining}2 ${remaining} - #Iterate until no more related VIDs are present in ${remaining} - until [ $( cut -f4 ${in_cluster} | sed 's/\,/\n/g' | { fgrep -wf - ${remaining} || true; } | wc -l ) -eq 0 ]; do - #Add new lines to ${in_cluster} - cut -f4 ${in_cluster} | sed 's/\,/\n/g' | { fgrep -wf - ${remaining} || true; } >> ${in_cluster} - #Exclude all lines in ${in_cluster} from ${remaining} - bedtools intersect -v -a ${remaining} -b ${in_cluster} > ${remaining}2 - mv ${remaining}2 ${remaining} - done - #Write out final interval - for wrapper in 1; do - #Print list of coordinates - cut -f1-3 ${in_cluster} | sort -Vk1,1 -k2,2 -k3,3 | bedtools merge -i - | \ - awk '{ print $1":"$2"-"$3 }' | paste -s -d\; - #Print list of involved VIDs - cut -f4 ${in_cluster} | sed 's/,/\n/g' | sort | uniq | paste -s -d, - done | paste -s - fi -done < ${SHARD_VCF_TMP}/breakpoint_intervals.bed > \ -${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt -#Pull out exceptionally large clusters to the side to be placed in their own shards -while read ints VIDs; do - if [ $( echo ${VIDs} | sed 's/,/\n/g' | wc -l ) -ge 
${MIN_LINES_PER_SHARD} ]; then - echo -e "${ints}\t${VIDs}" - fi -done < ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - > ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt -cut -f2 ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt \ -| sed 's/,/\n/g' \ -| { fgrep -wvf - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt || true; } \ -> ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt2 -mv ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt2 \ -${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt - - -###DETERMINE COORDINATES FOR EACH SHARD -#Split variants into shards based on number of variants -#If total number of intervals/MAX_SHARDS < MIN_LINES_PER_SHARD, evenly split into MIN_LINES_PER_SHARD sites per shard -if [ $(( $( cat ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt | wc -l ) / ${MAX_SHARDS} )) -lt ${MIN_LINES_PER_SHARD} ]; then - ${BIN}/evenSplitter.R \ - -L ${MIN_LINES_PER_SHARD} \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_ -#Otherwise, split into MAX_SHARDS evenly-sized shards -else - ${BIN}/evenSplitter.R \ - -S ${MAX_SHARDS} \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_ -fi -#Writes exceptionally large clusters to their own shards -n_shards=$( find ${SHARD_VCF_TMP} -name "${PREFIX}*" | wc -l ) -if [ $( cat ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt | wc -l ) -gt 0 ]; then - while read ints VIDs; do - n_shards=$(( ${n_shards} + 1 )) - echo -e "${ints}\t${VIDs}" > ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${n_shards} - done < ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt -fi -#Reformat interval shards -for i in $( seq 1 ${n_shards} ); do - cut -f1 ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i} | \ - sed -e 's/\;/\n/g' -e 's/\:/\t/g' -e 's/\-/\t/g' | \ - sort -Vk1,1 -k2,2n -k3,3n | bedtools merge -i - > \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i}.bed - rm 
${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i} -done - - -###SHARD VCF -#Convert full, original VCF to BED -svtk vcf2bed --no-samples \ - ${VCF} int.bed -#Harrison's patch for sharding -awk '{if ($1!~"#") print $1,$2,$2+1,$4,$5 \ - "\n" $1,$3-1,$3,$4,$5;else print}' OFS='\t' int.bed \ - > ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed -rm int.bed -#Iterate over all sharded intervals -for i in $( seq 1 $(( ${n_shards} )) ); do - #Write exclusion list of VIDs already used in earlier shards - touch ${SHARD_VCF_TMP}/used_VIDs.tmp - if [ ${i} -gt 1 ]; then - for j in $( seq 1 $(( ${i} - 1 )) ); do - cat ${SHARD_VCF_TMP}/${PREFIX}.shard_${j}.VIDs.list - done | sort | uniq > ${SHARD_VCF_TMP}/used_VIDs.tmp - fi - #Get list of IDs to be used in shard - bedtools intersect \ - -a ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed \ - -b ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i}.bed | \ - cut -f4 | { fgrep -wvf ${SHARD_VCF_TMP}/used_VIDs.tmp || true; } > \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_${i}.VIDs.list - #Print header - zcat ${VCF} | head -n1000 | fgrep "#" > \ - ${OUTDIR}/${PREFIX}.shard_${i}.vcf - #Shard based on VIDs (slower than tabix, but avoids omitting variants) - zcat ${VCF} | { fgrep -wf ${SHARD_VCF_TMP}/${PREFIX}.shard_${i}.VIDs.list || true; } >> \ - ${OUTDIR}/${PREFIX}.shard_${i}.vcf - #Sanity check shard - if [ $( fgrep -v "#" ${OUTDIR}/${PREFIX}.shard_${i}.vcf | wc -l ) -gt 0 ]; then - #Bgzip & tabix shard - bgzip -f ${OUTDIR}/${PREFIX}.shard_${i}.vcf - tabix -f ${OUTDIR}/${PREFIX}.shard_${i}.vcf.gz - else - rm ${OUTDIR}/${PREFIX}.shard_${i}.vcf - fi - #Clean up used VID list - rm ${SHARD_VCF_TMP}/used_VIDs.tmp -done -#Write list of all VIDs used in cluster shards -zcat ${OUTDIR}/${PREFIX}.shard_*.vcf.gz \ - | cut -f1-3 | fgrep -v "#" | cut -f3 \ - > ${SHARD_VCF_TMP}/used_VIDs.cluster_shards.list -#Write list of eligible VIDs -fgrep -v "#" ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed \ - | cut -f4 \ - | { fgrep -wvf ${SHARD_VCF_TMP}/used_VIDs.cluster_shards.list || true; } \ - > 
${SHARD_VCF_TMP}/remaining_VIDs.list -#Shard remaining records into no more than $NONCLUSTER_SHARDS shards -#If total number of records/NONCLUSTER_SHARDS < MIN_LINES_PER_SHARD, evenly split into MIN_LINES_PER_SHARD sites per shard -if [ $(( $( cat ${SHARD_VCF_TMP}/remaining_VIDs.list | wc -l ) / ${NONCLUSTER_SHARDS} )) -lt ${MIN_LINES_PER_SHARD} ]; then - ${BIN}/evenSplitter.R \ - -L ${MIN_LINES_PER_SHARD} \ - ${SHARD_VCF_TMP}/remaining_VIDs.list \ - ${SHARD_VCF_TMP}/${PREFIX}.remainder_VIDs_ -#Otherwise, split into MAX_SHARDS evenly-sized shards -else - ${BIN}/evenSplitter.R \ - -S ${NONCLUSTER_SHARDS} \ - ${SHARD_VCF_TMP}/remaining_VIDs.list \ - ${SHARD_VCF_TMP}/${PREFIX}.remainder_VIDs_ -fi -#Iterate over all non-cluster shards and generate VCF shards -n_noncluster_shards=$( find ${SHARD_VCF_TMP} -name "${PREFIX}.remainder_VIDs_*" | wc -l ) -for i in $( seq 1 ${n_noncluster_shards} ); do - idx=$(( ${n_shards} + ${i} )) - #Print header - zcat ${VCF} | head -n1000 | fgrep "#" > \ - ${OUTDIR}/${PREFIX}.shard_${idx}.vcf - #Shard based on VIDs (slower than tabix, but avoids omitting variants) - zcat ${VCF} | { fgrep -wf ${SHARD_VCF_TMP}/${PREFIX}.remainder_VIDs_${i} || true; } >> \ - ${OUTDIR}/${PREFIX}.shard_${idx}.vcf - #Sanity check shard - if [ $( { fgrep -v "#" ${OUTDIR}/${PREFIX}.shard_${idx}.vcf || true; } | wc -l ) -gt 0 ]; then - #Bgzip & tabix shard - bgzip -f ${OUTDIR}/${PREFIX}.shard_${idx}.vcf - tabix -f ${OUTDIR}/${PREFIX}.shard_${idx}.vcf.gz - else - rm ${OUTDIR}/${PREFIX}.shard_${idx}.vcf - fi -done - - -###SANITY CHECK SHARDS -while read shard; do - zcat ${shard} | cut -f1 | { fgrep -v "#" || true; } | wc -l -done < <( find ${OUTDIR} -name "${PREFIX}.shard_*.vcf.gz" ) \ - | sort -nrk1,1 \ - > ${SHARD_VCF_TMP}/vars_per_shard.txt -echo -e "FINISHED SHARDING VCF. 
RESULTING RECORDS PER SHARD FOR LARGEST 100 SHARDS:" -head -n100 ${SHARD_VCF_TMP}/vars_per_shard.txt | paste -s -d',' -#If shard with most variants is >10-fold more than next-largest shard, exit with code 1 -if [ $( find ${OUTDIR} -name "${PREFIX}.shard_*.vcf.gz" | wc -l ) -gt 1 ]; then - first=$( sed -n '1p' ${SHARD_VCF_TMP}/vars_per_shard.txt ) - second=$( sed -n '2p' ${SHARD_VCF_TMP}/vars_per_shard.txt ) - if [ ! -z ${second} ] && [ ${second} -gt 0 ]; then - if [ $(( ${first} / ${second} )) -ge 10 ]; then - echo -e "CRITICAL WARNING: LARGEST SHARD IS AT LEAST 10 TIMES LARGER THAN SECOND-LARGEST SHARD" - exit 1 - fi - fi -fi - - -###CLEAN UP -rm -rf ${SHARD_VCF_TMP} diff --git a/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preResolveCPX_invOnly_part1.sh b/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preResolveCPX_invOnly_part1.sh deleted file mode 100755 index 6551fc60f..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preResolveCPX_invOnly_part1.sh +++ /dev/null @@ -1,337 +0,0 @@ -#!/bin/bash - -# Intelligently shards a VCF prior to complex resolution (for parallelization) - -# Version modified for inv-only sharding (RO 10%, no restriction on breakpoint distance) - -# Subsetted to first half, to just output lists of which variants should go in each shard - -set -euo pipefail - -###USAGE -usage(){ -cat < ${SHARD_VCF_TMP}/single_sample_input.vcf.gz -#Identify all candidate complex variant clusters (generous 50Mb, 10% RO clustering) -svtk vcfcluster \ - -d 50000000 \ - -f 0.10 \ - -p candidate_complex_clusters \ - --ignore-svtypes \ - -o 0 \ - --preserve-ids \ - <( echo "${SHARD_VCF_TMP}/single_sample_input.vcf.gz" ) \ - ${SHARD_VCF_TMP}/input_vcf.clustered.vcf -#Convert clustered variants to bed -svtk vcf2bed \ - --no-samples \ - --info ALL \ - ${SHARD_VCF_TMP}/input_vcf.clustered.vcf \ - ${SHARD_VCF_TMP}/input_vcf.clustered.bed -#Write list of clusters with >1 constituent variant -mem_idx=$( head -n1 
${SHARD_VCF_TMP}/input_vcf.clustered.bed | \ - sed 's/\t/\n/g' | awk '{ if ($1=="MEMBERS") print NR }' ) -awk -v idx=${mem_idx} -v OFS="\t" \ -'$idx ~ /,/ { print $1, $2, $3, $idx }' \ -${SHARD_VCF_TMP}/input_vcf.clustered.bed | (fgrep -v "#" || printf "") > \ -${SHARD_VCF_TMP}/candidate_complex_clusters.bed -#Add all non-CNV single-record variants -class_idx=$( head -n1 ${SHARD_VCF_TMP}/input_vcf.clustered.bed | \ - sed 's/\t/\n/g' | awk '{ if ($1=="SVTYPE") print NR }' ) -awk -v idx=${class_idx} -v mem_idx=${mem_idx} -v OFS="\t" \ -'$mem_idx !~ /,/ { print $1, $2, $3, $idx, $mem_idx }' \ -${SHARD_VCF_TMP}/input_vcf.clustered.bed | \ -awk -v OFS="\t" '$4 !~ /DEL|DUP|CNV|MCNV|mCNV/ { print $1, $2, $3, $5 }' | (fgrep -v "#" || printf "") >> \ -${SHARD_VCF_TMP}/candidate_complex_clusters.bed -#Get min/max coordinates of all variants in list of VIDs -cat <( zcat ${VCF} | fgrep "#" | cut -f1-10 ) \ -<( cut -f4 ${SHARD_VCF_TMP}/candidate_complex_clusters.bed | \ - sed 's/\,/\n/g' | sort -Vk1,1 | uniq | fgrep -wf - \ - <( zcat ${VCF} ) | cut -f1-10 ) | \ -svtk vcf2bed --no-samples /dev/stdin /dev/stdout > \ -${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed - -if [ $( cat ${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed | (fgrep -v "#" || printf "") | wc -l ) -gt 0 ]; then - #Split into breakpoints and pad all breakpoints by ±1bp - #DEV NOTE: padding breakpoints for large chromosomes & many samples was causing - # issues where tens of thousands of breakpoints would end up in the same shard - # and take >36h to resolve, defeating the purpose of sharding - (fgrep -v "#" ${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed || printf "") | \ - awk -v OFS="\t" -v buffer=1 \ - '{ print $1, $2-buffer, $2+buffer, $4"\n"$1, $3-buffer, $3+buffer, $4 }' | \ - awk -v OFS="\t" '{ if ($2<0) $2=0; print $1, $2, $3, $4 }' | \ - sort -Vk1,1 -k2,2n -k3,3n | bedtools merge -i - -c 4 -o distinct > \ - 
${SHARD_VCF_TMP}/breakpoint_intervals.bed - #Iterate over breakpoint intervals and write list of maximum nonredundant intervals - in_cluster=`mktemp` - remaining=`mktemp` - cp ${SHARD_VCF_TMP}/breakpoint_intervals.bed ${remaining} - while read chr start end VIDs; do - #Get all lines associated with current VIDs - echo -e "${VIDs}" | sed 's/,/\n/g' | (fgrep -wf - ${remaining} || printf "") \ - > ${in_cluster} || true - #Only run if at least one line added to ${in_cluster} - if [ $( cat ${in_cluster} | wc -l ) -gt 0 ]; then - #Exclude all lines in ${in_cluster} from ${remaining} - bedtools intersect -v -a ${remaining} -b ${in_cluster} > ${remaining}2 - mv ${remaining}2 ${remaining} - #Iterate until no more related VIDs are present in ${remaining} - until [ $( cut -f4 ${in_cluster} | sed 's/\,/\n/g' | (fgrep -wf - ${remaining} || printf "") | wc -l || true ) -eq 0 ]; do - #Add new lines to ${in_cluster} - cut -f4 ${in_cluster} | sed 's/\,/\n/g' | (fgrep -wf - ${remaining} || printf "") >> ${in_cluster} || true - #Exclude all lines in ${in_cluster} from ${remaining} - bedtools intersect -v -a ${remaining} -b ${in_cluster} > ${remaining}2 - mv ${remaining}2 ${remaining} - done - #Write out final interval - for wrapper in 1; do - #Print list of coordinates - cut -f1-3 ${in_cluster} | sort -Vk1,1 -k2,2 -k3,3 | bedtools merge -i - | \ - awk '{ print $1":"$2"-"$3 }' | paste -s -d\; - #Print list of involved VIDs - cut -f4 ${in_cluster} | sed 's/,/\n/g' | sort | uniq | paste -s -d, - done | paste -s - fi - done < ${SHARD_VCF_TMP}/breakpoint_intervals.bed > \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt - #Pull out exceptionally large clusters to the side to be placed in their own shards - while read ints VIDs; do - if [ $( echo ${VIDs} | sed 's/,/\n/g' | wc -l ) -ge ${MIN_LINES_PER_SHARD} ]; then - echo -e "${ints}\t${VIDs}" - fi - done < ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - > ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt - if [ $( cat 
${SHARD_VCF_TMP}/large_intervals_to_test.final.txt | wc -l ) -gt 0 ]; then - cut -f2 ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt \ - | sed 's/,/\n/g' \ - | { fgrep -wvf - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt || true; } \ - > ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt2 - mv ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt2 \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt - fi - - - ###DETERMINE COORDINATES FOR EACH SHARD - #Split variants into shards based on number of variants - #If total number of intervals/MAX_SHARDS < MIN_LINES_PER_SHARD, evenly split into MIN_LINES_PER_SHARD sites per shard - if [ $(( $( cat ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt | wc -l ) / ${MAX_SHARDS} )) -lt ${MIN_LINES_PER_SHARD} ]; then - ${BIN}/evenSplitter.R \ - -L ${MIN_LINES_PER_SHARD} \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_ - #Otherwise, split into MAX_SHARDS evenly-sized shards - else - ${BIN}/evenSplitter.R \ - -S ${MAX_SHARDS} \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_ - fi - #Writes exceptionally large clusters to their own shards - n_shards=$( find ${SHARD_VCF_TMP} -name "${PREFIX}*" | wc -l ) - if [ $( cat ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt | wc -l ) -gt 0 ]; then - while read ints VIDs; do - n_shards=$(( ${n_shards} + 1 )) - echo -e "${ints}\t${VIDs}" > ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${n_shards} - done < ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt - fi - #Reformat interval shards - for i in $( seq 1 ${n_shards} ); do - cut -f1 ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i} | \ - sed -e 's/\;/\n/g' -e 's/\:/\t/g' -e 's/\-/\t/g' | \ - sort -Vk1,1 -k2,2n -k3,3n | bedtools merge -i - > \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i}.bed - rm ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i} - done - - - ###SHARD VCF - #Convert full, original VCF to BED - svtk 
vcf2bed --no-samples \ - ${VCF} int.bed - #Harrison's patch for sharding - awk '{if ($1!~"#") print $1,$2,$2+1,$4,$5 \ - "\n" $1,$3-1,$3,$4,$5;else print}' OFS='\t' int.bed \ - > ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed - rm int.bed - #Iterate over all sharded intervals - for i in $( seq 1 $(( ${n_shards} )) ); do - #Write exclusion list of VIDs already used in earlier shards - touch ${SHARD_VCF_TMP}/used_VIDs.tmp - if [ ${i} -gt 1 ]; then - for j in $( seq 1 $(( ${i} - 1 )) ); do - cat ${OUTDIR}/${PREFIX}.shard_${j}.VIDs.list - done | sort | uniq > ${SHARD_VCF_TMP}/used_VIDs.tmp - fi - #Get list of IDs to be used in shard - bedtools intersect \ - -a ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed \ - -b ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i}.bed | \ - cut -f4 | (fgrep -wvf ${SHARD_VCF_TMP}/used_VIDs.tmp || printf "") \ - | sort | uniq > \ - ${OUTDIR}/${PREFIX}.shard_${i}.VIDs.list || true - #Clean up used VID list - rm ${SHARD_VCF_TMP}/used_VIDs.tmp - done - #Write list of all VIDs used in cluster shards - cat ${OUTDIR}/${PREFIX}.shard_*.VIDs.list \ - | sort | uniq \ - > ${SHARD_VCF_TMP}/used_VIDs.cluster_shards.list || true - #Write list of eligible VIDs - (fgrep -v "#" ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed || printf "") \ - | cut -f4 \ - | (fgrep -wvf ${SHARD_VCF_TMP}/used_VIDs.cluster_shards.list || printf "") \ - | sort | uniq \ - > ${SHARD_VCF_TMP}/remaining_VIDs.list || true -else - n_shards=0 - zcat ${VCF} | cut -f1-3 | (fgrep -v "#" || printf "") | cut -f3 | sort | uniq \ - > ${SHARD_VCF_TMP}/remaining_VIDs.list -fi - - -if [ $( cat ${SHARD_VCF_TMP}/remaining_VIDs.list | wc -l ) -gt 0 ]; then - #Shard remaining records into no more than $NONCLUSTER_SHARDS shards - #If total number of records/NONCLUSTER_SHARDS < MIN_LINES_PER_SHARD, evenly split into MIN_LINES_PER_SHARD sites per shard - if [ $(( $( cat ${SHARD_VCF_TMP}/remaining_VIDs.list | wc -l ) / ${NONCLUSTER_SHARDS} )) -lt ${MIN_LINES_PER_SHARD} ]; then - ${BIN}/evenSplitter.R \ - -L 
${MIN_LINES_PER_SHARD} \ - ${SHARD_VCF_TMP}/remaining_VIDs.list \ - ${SHARD_VCF_TMP}/${PREFIX}.remainder_VIDs_ - #Otherwise, split into MAX_SHARDS evenly-sized shards - else - ${BIN}/evenSplitter.R \ - -S ${NONCLUSTER_SHARDS} \ - ${SHARD_VCF_TMP}/remaining_VIDs.list \ - ${SHARD_VCF_TMP}/${PREFIX}.remainder_VIDs_ - fi - #Iterate over all non-cluster shards and generate VCF shards - n_noncluster_shards=$( find ${SHARD_VCF_TMP} -name "${PREFIX}.remainder_VIDs_*" | wc -l ) - for i in $( seq 1 ${n_noncluster_shards} ); do - idx=$(( ${n_shards} + ${i} )) - mv ${SHARD_VCF_TMP}/${PREFIX}.remainder_VIDs_${i} \ - ${OUTDIR}/${PREFIX}.shard_${idx}.VIDs.list - done -fi - - -###CLEAN UP -rm -rf ${SHARD_VCF_TMP} diff --git a/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preResolveCPX_part1.sh b/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preResolveCPX_part1.sh deleted file mode 100755 index d3422745f..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preResolveCPX_part1.sh +++ /dev/null @@ -1,335 +0,0 @@ -#!/bin/bash - -# Intelligently shards a VCF prior to complex resolution (for parallelization) - -# Subsetted to first half, to just output lists of which variants should go in each shard - -set -euo pipefail - -###USAGE -usage(){ -cat < ${SHARD_VCF_TMP}/single_sample_input.vcf.gz -#Identify all candidate complex variant clusters (generous 1kb clustering) -svtk vcfcluster \ - -d 1000 \ - -f 0 \ - -p candidate_complex_clusters \ - --ignore-svtypes \ - -o 0 \ - --preserve-ids \ - <( echo "${SHARD_VCF_TMP}/single_sample_input.vcf.gz" ) \ - ${SHARD_VCF_TMP}/input_vcf.clustered.vcf -#Convert clustered variants to bed -svtk vcf2bed \ - --no-samples \ - --info ALL \ - ${SHARD_VCF_TMP}/input_vcf.clustered.vcf \ - ${SHARD_VCF_TMP}/input_vcf.clustered.bed -#Write list of clusters with >1 constituent variant -mem_idx=$( head -n1 ${SHARD_VCF_TMP}/input_vcf.clustered.bed | \ - sed 's/\t/\n/g' | awk '{ if ($1=="MEMBERS") print NR }' ) -awk -v 
idx=${mem_idx} -v OFS="\t" \ -'$idx ~ /,/ { print $1, $2, $3, $idx }' \ -${SHARD_VCF_TMP}/input_vcf.clustered.bed | (fgrep -v "#" || printf "") > \ -${SHARD_VCF_TMP}/candidate_complex_clusters.bed -#Add all non-CNV single-record variants -class_idx=$( head -n1 ${SHARD_VCF_TMP}/input_vcf.clustered.bed | \ - sed 's/\t/\n/g' | awk '{ if ($1=="SVTYPE") print NR }' ) -awk -v idx=${class_idx} -v mem_idx=${mem_idx} -v OFS="\t" \ -'$mem_idx !~ /,/ { print $1, $2, $3, $idx, $mem_idx }' \ -${SHARD_VCF_TMP}/input_vcf.clustered.bed | \ -awk -v OFS="\t" '$4 !~ /DEL|DUP|CNV|MCNV|mCNV/ { print $1, $2, $3, $5 }' | (fgrep -v "#" || printf "") >> \ -${SHARD_VCF_TMP}/candidate_complex_clusters.bed -#Get min/max coordinates of all variants in list of VIDs -cat <( zcat ${VCF} | fgrep "#" | cut -f1-10 ) \ -<( cut -f4 ${SHARD_VCF_TMP}/candidate_complex_clusters.bed | \ - sed 's/\,/\n/g' | sort -Vk1,1 | uniq | fgrep -wf - \ - <( zcat ${VCF} ) | cut -f1-10 ) | \ -svtk vcf2bed --no-samples /dev/stdin /dev/stdout > \ -${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed - - -###ONLY RUN IF ANY CANDIDATE COMPLEX CLUSTERS ARE IDENTIFIED -if [ $( cat ${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed | (fgrep -v "#" || printf "") | wc -l ) -gt 0 ]; then - #Split into breakpoints and pad all breakpoints by ±1bp - #DEV NOTE: padding breakpoints for large chromosomes & many samples was causing - # issues where tens of thousands of breakpoints would end up in the same shard - # and take >36h to resolve, defeating the purpose of sharding - (fgrep -v "#" ${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed || printf "") | \ - awk -v OFS="\t" -v buffer=1 \ - '{ print $1, $2-buffer, $2+buffer, $4"\n"$1, $3-buffer, $3+buffer, $4 }' | \ - awk -v OFS="\t" '{ if ($2<0) $2=0; print $1, $2, $3, $4 }' | \ - sort -Vk1,1 -k2,2n -k3,3n | bedtools merge -i - -c 4 -o distinct > \ - ${SHARD_VCF_TMP}/breakpoint_intervals.bed - #Iterate over breakpoint intervals and 
write list of maximum nonredundant intervals - in_cluster=`mktemp` - remaining=`mktemp` - cp ${SHARD_VCF_TMP}/breakpoint_intervals.bed ${remaining} - while read chr start end VIDs; do - #Get all lines associated with current VIDs - echo -e "${VIDs}" | sed 's/,/\n/g' | (fgrep -wf - ${remaining} || printf "") \ - > ${in_cluster} || true - #Only run if at least one line added to ${in_cluster} - if [ $( cat ${in_cluster} | wc -l ) -gt 0 ]; then - #Exclude all lines in ${in_cluster} from ${remaining} - bedtools intersect -v -a ${remaining} -b ${in_cluster} > ${remaining}2 - mv ${remaining}2 ${remaining} - #Iterate until no more related VIDs are present in ${remaining} - until [ $( cut -f4 ${in_cluster} | sed 's/\,/\n/g' | (fgrep -wf - ${remaining} || printf "") | wc -l || true ) -eq 0 ]; do - #Add new lines to ${in_cluster} - cut -f4 ${in_cluster} | sed 's/\,/\n/g' | (fgrep -wf - ${remaining} || printf "") >> ${in_cluster} || true - #Exclude all lines in ${in_cluster} from ${remaining} - bedtools intersect -v -a ${remaining} -b ${in_cluster} > ${remaining}2 - mv ${remaining}2 ${remaining} - done - #Write out final interval - for wrapper in 1; do - #Print list of coordinates - cut -f1-3 ${in_cluster} | sort -Vk1,1 -k2,2 -k3,3 | bedtools merge -i - | \ - awk '{ print $1":"$2"-"$3 }' | paste -s -d\; - #Print list of involved VIDs - cut -f4 ${in_cluster} | sed 's/,/\n/g' | sort | uniq | paste -s -d, - done | paste -s - fi - done < ${SHARD_VCF_TMP}/breakpoint_intervals.bed > \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt - #Pull out exceptionally large clusters to the side to be placed in their own shards - while read ints VIDs; do - if [ $( echo ${VIDs} | sed 's/,/\n/g' | wc -l ) -ge ${MIN_LINES_PER_SHARD} ]; then - echo -e "${ints}\t${VIDs}" - fi - done < ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - > ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt - if [ $( cat ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt | wc -l ) -gt 0 ]; then - cut -f2 
${SHARD_VCF_TMP}/large_intervals_to_test.final.txt \ - | sed 's/,/\n/g' \ - | { fgrep -wvf - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt || true; } \ - > ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt2 - mv ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt2 \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt - fi - - - ###DETERMINE COORDINATES FOR EACH SHARD - #Split variants into shards based on number of variants - #If total number of intervals/MAX_SHARDS < MIN_LINES_PER_SHARD, evenly split into MIN_LINES_PER_SHARD sites per shard - if [ $(( $( cat ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt | wc -l ) / ${MAX_SHARDS} )) -lt ${MIN_LINES_PER_SHARD} ]; then - ${BIN}/evenSplitter.R \ - -L ${MIN_LINES_PER_SHARD} \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_ - #Otherwise, split into MAX_SHARDS evenly-sized shards - else - ${BIN}/evenSplitter.R \ - -S ${MAX_SHARDS} \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_ - fi - #Writes exceptionally large clusters to their own shards - n_shards=$( find ${SHARD_VCF_TMP} -name "${PREFIX}*" | wc -l ) - if [ $( cat ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt | wc -l ) -gt 0 ]; then - while read ints VIDs; do - n_shards=$(( ${n_shards} + 1 )) - echo -e "${ints}\t${VIDs}" > ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${n_shards} - done < ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt - fi - #Reformat interval shards - for i in $( seq 1 ${n_shards} ); do - cut -f1 ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i} | \ - sed -e 's/\;/\n/g' -e 's/\:/\t/g' -e 's/\-/\t/g' | \ - sort -Vk1,1 -k2,2n -k3,3n | bedtools merge -i - > \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i}.bed - rm ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i} - done - - - ###SHARD VCF - #Convert full, original VCF to BED - svtk vcf2bed --no-samples \ - ${VCF} int.bed - #Harrison's patch for sharding - awk '{if 
($1!~"#") print $1,$2,$2+1,$4,$5 \ - "\n" $1,$3-1,$3,$4,$5;else print}' OFS='\t' int.bed \ - > ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed - rm int.bed - #Iterate over all sharded intervals - for i in $( seq 1 $(( ${n_shards} )) ); do - #Write exclusion list of VIDs already used in earlier shards - touch ${SHARD_VCF_TMP}/used_VIDs.tmp - if [ ${i} -gt 1 ]; then - for j in $( seq 1 $(( ${i} - 1 )) ); do - cat ${OUTDIR}/${PREFIX}.shard_${j}.VIDs.list - done | sort | uniq > ${SHARD_VCF_TMP}/used_VIDs.tmp - fi - #Get list of IDs to be used in shard - bedtools intersect \ - -a ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed \ - -b ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i}.bed | \ - cut -f4 | (fgrep -wvf ${SHARD_VCF_TMP}/used_VIDs.tmp || printf "") \ - | sort | uniq > \ - ${OUTDIR}/${PREFIX}.shard_${i}.VIDs.list - #Clean up used VID list - rm ${SHARD_VCF_TMP}/used_VIDs.tmp - done - #Write list of all VIDs used in cluster shards - cat ${OUTDIR}/${PREFIX}.shard_*.VIDs.list \ - | sort | uniq \ - > ${SHARD_VCF_TMP}/used_VIDs.cluster_shards.list || true - #Write list of eligible VIDs - (fgrep -v "#" ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed || printf "") \ - | cut -f4 \ - | (fgrep -wvf ${SHARD_VCF_TMP}/used_VIDs.cluster_shards.list || printf "") \ - | sort | uniq \ - > ${SHARD_VCF_TMP}/remaining_VIDs.list || true -else - n_shards=0 - zcat ${VCF} | cut -f1-3 | (fgrep -v "#" || printf "") | cut -f3 | sort | uniq \ - > ${SHARD_VCF_TMP}/remaining_VIDs.list -fi - - -#Shard remaining records into no more than $NONCLUSTER_SHARDS shards -#If total number of records/NONCLUSTER_SHARDS < MIN_LINES_PER_SHARD, evenly split into MIN_LINES_PER_SHARD sites per shard -if [ $(( $( cat ${SHARD_VCF_TMP}/remaining_VIDs.list | wc -l ) / ${NONCLUSTER_SHARDS} )) -lt ${MIN_LINES_PER_SHARD} ]; then - ${BIN}/evenSplitter.R \ - -L ${MIN_LINES_PER_SHARD} \ - ${SHARD_VCF_TMP}/remaining_VIDs.list \ - ${SHARD_VCF_TMP}/${PREFIX}.remainder_VIDs_ -#Otherwise, split into MAX_SHARDS evenly-sized shards -else - 
${BIN}/evenSplitter.R \ - -S ${NONCLUSTER_SHARDS} \ - ${SHARD_VCF_TMP}/remaining_VIDs.list \ - ${SHARD_VCF_TMP}/${PREFIX}.remainder_VIDs_ -fi -#Iterate over all non-cluster shards and generate VCF shards -n_noncluster_shards=$( find ${SHARD_VCF_TMP} -name "${PREFIX}.remainder_VIDs_*" | wc -l ) -for i in $( seq 1 ${n_noncluster_shards} ); do - idx=$(( ${n_shards} + ${i} )) - mv ${SHARD_VCF_TMP}/${PREFIX}.remainder_VIDs_${i} \ - ${OUTDIR}/${PREFIX}.shard_${idx}.VIDs.list -done - - -###CLEAN UP -rm -rf ${SHARD_VCF_TMP} diff --git a/src/sv-pipeline/05_annotation/scripts/compute_AFs.py b/src/sv-pipeline/05_annotation/scripts/compute_AFs.py index 93a4196c9..31b5a0c0a 100755 --- a/src/sv-pipeline/05_annotation/scripts/compute_AFs.py +++ b/src/sv-pipeline/05_annotation/scripts/compute_AFs.py @@ -82,7 +82,7 @@ def update_sex_freqs(record, pop=None): return record -def gather_allele_freqs(record, all_samples, males, females, parbt, pop_dict, pops, +def gather_allele_freqs(record, samples, males_set, females_set, parbt, pop_dict, pops, sex_chroms, no_combos=False): """ Wrapper to compute allele frequencies for all sex & population pairings @@ -99,40 +99,40 @@ def gather_allele_freqs(record, all_samples, males, females, parbt, pop_dict, po rec_in_par = False # Get allele frequencies for all populations - calc_allele_freq(record, all_samples) - if len(males) > 0: + calc_allele_freq(record, samples) + if len(males_set) > 0: if record.chrom in sex_chroms and not rec_in_par: - calc_allele_freq(record, males, prefix='MALE', hemi=True) + calc_allele_freq(record, males_set, prefix='MALE', hemi=True) else: - calc_allele_freq(record, males, prefix='MALE') - if len(females) > 0: - calc_allele_freq(record, females, prefix='FEMALE') + calc_allele_freq(record, males_set, prefix='MALE') + if len(females_set) > 0: + calc_allele_freq(record, females_set, prefix='FEMALE') # Adjust global allele frequencies on sex chromosomes, if famfile provided if record.chrom in sex_chroms and not rec_in_par 
\ - and svu.is_biallelic(record) and len(males) + len(females) > 0: + and svu.is_biallelic(record) and len(males_set) + len(females_set) > 0: update_sex_freqs(record) # Get allele frequencies per population if len(pops) > 0: for pop in pops: pop_samps = [ - s for s in all_samples if pop_dict.get(s, None) == pop] + s for s in samples if pop_dict.get(s, None) == pop] calc_allele_freq(record, pop_samps, prefix=pop) - if len(males) > 0 and not no_combos: + if len(males_set) > 0 and not no_combos: if record.chrom in sex_chroms and not rec_in_par: - calc_allele_freq(record, [s for s in pop_samps if s in males], + calc_allele_freq(record, list([s for s in pop_samps if s in males_set]), prefix=pop + '_MALE', hemi=True) else: - calc_allele_freq(record, [s for s in pop_samps if s in males], + calc_allele_freq(record, list([s for s in pop_samps if s in males_set]), prefix=pop + '_MALE') - if len(females) > 0 and not no_combos: - calc_allele_freq(record, [s for s in pop_samps if s in females], + if len(females_set) > 0 and not no_combos: + calc_allele_freq(record, list([s for s in pop_samps if s in females_set]), prefix=pop + '_FEMALE') # Adjust per-pop allele frequencies on sex chromosomes, if famfile provided if record.chrom in sex_chroms and not rec_in_par \ - and svu.is_biallelic(record) and len(males) + len(females) > 0: + and svu.is_biallelic(record) and len(males_set) + len(females_set) > 0: update_sex_freqs(record, pop=pop) # Get POPMAX AF biallelic sites only @@ -154,7 +154,7 @@ def calc_allele_freq(record, samples, prefix=None, hemi=False): if svu.is_biallelic(record): # Get all sample GTs - GTs = [s['GT'] for s in record.samples.values() if s.name in samples] + GTs = [record.samples[s]['GT'] for s in samples] # Count alleles & genotypes AC = 0 @@ -237,8 +237,7 @@ def calc_allele_freq(record, samples, prefix=None, hemi=False): else: # Get all sample CNs and remove Nones - CNs_wNones = [s['CN'] - for s in record.samples.values() if s.name in samples] + CNs_wNones = 
[record.samples[s]['CN'] for s in samples] CNs = [c for c in CNs_wNones if c is not None and c not in '. NA'.split()] if len(CNs) == 0: @@ -306,24 +305,26 @@ def main(): vcf = pysam.VariantFile(args.vcf) # Get list of all samples in vcf - all_samples = list(vcf.header.samples) + samples_list = list(vcf.header.samples) # Get lists of males and females parbt = pbt.BedTool('', from_string=True) if args.famfile is not None: famfile = [line.rstrip('\n') for line in open(args.famfile)] - males = [line.split('\t')[1] - for line in famfile if line.split('\t')[4] == '1'] - females = [line.split('\t')[1] - for line in famfile if line.split('\t')[4] == '2'] + males_set = set([line.split('\t')[1] + for line in famfile if line.split('\t')[4] == '1']) + males_set = set(s for s in samples_list if s in males_set) + females_set = set([line.split('\t')[1] + for line in famfile if line.split('\t')[4] == '2']) + females_set = set(s for s in samples_list if s in females_set) sexes = 'MALE FEMALE'.split() if args.par is not None: parbt = pbt.BedTool(args.par) else: - males = [] - females = [] - sexes = [] + males_set = set() + females_set = set() + sexes = list() # Get dictionary of populations if args.popfile is not None: @@ -335,6 +336,7 @@ def main(): pop_dict = {} pops = [] + # Get list of sex chromosomes, if optioned if args.allosomes_list is not None: sex_chroms = [l.split('\t')[0] @@ -491,7 +493,7 @@ def main(): # Get allele frequencies for each record & write to new VCF for r in vcf.fetch(): - newrec = gather_allele_freqs(r, all_samples, males, females, parbt, pop_dict, + newrec = gather_allele_freqs(r, samples_list, males_set, females_set, parbt, pop_dict, pops, sex_chroms, args.no_combos) fout.write(newrec) diff --git a/src/sv-pipeline/java/StitchFragmentedCNVs.java b/src/sv-pipeline/java/StitchFragmentedCNVs.java deleted file mode 100644 index e7e9e519e..000000000 --- a/src/sv-pipeline/java/StitchFragmentedCNVs.java +++ /dev/null @@ -1,332 +0,0 @@ -import java.io.*; -import 
java.util.*; - -/** Read a VCF, and try to stitch together adjacent copy-number variations. - * Eligible Records (which we call "stitchable") must meet certain criteria as specified by the - * isStitchable method of the StitchableIterator. - * If two stitchables overlap appropriately, and all their samples have identical genotypes, we can - * replace the first one by adding on the interval covered by the second one. - */ -public class StitchFragmentedCNVs { - private static final VCFParser.ByteSequence END = new VCFParser.ByteSequence("END"); - private static final VCFParser.ByteSequence SVLEN = new VCFParser.ByteSequence("SVLEN"); - - // These 3 values will always be overwritten, but are initialized to reasonable defaults as documentation - private static double PAD_FACTOR = .2; - private static int MAX_PAD = 200000; - private static double MAX_OVERLAP_FACTOR = .2; - - public static void main( final String[] args ) { - if ( args.length != 4 ) { - System.err.println("Usage: java StitchFragmentedCNVs PAD% MAXPAD OVRLAP% VCFFILE"); - System.err.println("E.g.: java StitchFragmentedCNVs .2 200000 .2 input.vcf.gz"); - System.err.println("Combines neighboring CNVs with matching genotypes into a larger event."); - System.err.println("Writes an uncompressed vcf to stdout."); - System.exit(1); - } - - initCommandLineArgs(args); - - try ( final OutputStream os - = new BufferedOutputStream(new FileOutputStream(FileDescriptor.out)) ) { - try ( final VCFParser vcfParser = new VCFParser(args[3]) ) { - while ( vcfParser.hasMetadata() ) { - vcfParser.nextMetaData().write(os); - } - final StitchableIterator sItr = new StitchableIterator(vcfParser); - VCFParser.Record stitchableRecord; - while ( (stitchableRecord = sItr.nextSubject(os)) != null ) { - findExtension(stitchableRecord, sItr); - stitchableRecord.write(os); - } - } - } catch ( final IOException ioe ) { - throw new VCFParser.MalformedVCFException("can't write to stdout", ioe); - } - } - - /** Look for a stitchable 
downstream of the subject that can be joined to it to - * make a larger event. */ - private static void findExtension( final VCFParser.Record stitchable, - final StitchableIterator sItr ) throws IOException { - final PaddedInterval originalPaddedInterval = new PaddedInterval(stitchable); - PaddedInterval paddedInterval = originalPaddedInterval; - - // sItr.hasNext returns false at EOF, or when the next record is too far away to - // overlap the subject - while ( sItr.hasNext() ) { - final VCFParser.Record record = sItr.next(); - final PaddedInterval paddedInterval2 = new PaddedInterval(record); - if ( paddedInterval.canCoalesceWith(paddedInterval2) && - genotypesMatch(stitchable, record) ) { - paddedInterval = paddedInterval2; - sItr.remove(); - } - } - - if ( paddedInterval != originalPaddedInterval ) { - final int endPos = paddedInterval.getVCFEnd(); - // this won't be null -- it was checked in isStitchable - final VCFParser.ByteSequence endField = stitchable.getInfoField(END); - stitchable.setInfoField(endField, new VCFParser.ByteSequence(Integer.toString(endPos))); - final VCFParser.ByteSequence svLenField = stitchable.getInfoField(SVLEN); - if ( svLenField == null ) { - throw new VCFParser.MalformedVCFException(stitchable.getID().toString() + " has no SVLEN field"); - } - final int svLength = endPos + 1 - stitchable.getPosition(); - final VCFParser.ByteSequence svLenValue = - new VCFParser.ByteSequence(Integer.toString(svLength)); - stitchable.setInfoField(svLenField, svLenValue); - } - } - - private static boolean genotypesMatch( final VCFParser.Record rec1, final VCFParser.Record rec2 ) { - final List gt1 = rec1.getGenotypes(); - final List gt2 = rec2.getGenotypes(); - final int nGTs = gt1.size(); - if ( gt2.size() != nGTs ) { - throw new IllegalStateException("two records have a different number of genotypes"); - } - for ( int idx = 0; idx != nGTs; ++idx ) { - final VCFParser.ByteIterator itr1 = gt1.get(idx).iterator(); - final VCFParser.ByteIterator itr2 = 
gt2.get(idx).iterator(); - byte b1; - do { - b1 = itr1.hasNext() ? itr1.next() : -1; - final byte b2 = itr2.hasNext() ? itr2.next() : -1; - if ( b1 != b2 ) return false; - } while ( b1 != ':' ); - } - return true; - } - - private static void initCommandLineArgs( final String[] args ) { - try { - PAD_FACTOR = Double.parseDouble(args[0]); - } catch ( final NumberFormatException nfe ) { - System.err.println("Can't interpret 1st argument (padding fraction) as a floating point number."); - System.exit(2); - } - if ( PAD_FACTOR < 0.0 ) { - System.err.println("First argument should be a padding fraction >= 0."); - System.exit(2); - } - try { - MAX_PAD = Integer.parseInt(args[1]); - } catch ( final NumberFormatException nfe ) { - System.err.println("Can't interpret 2nd argument (maximum padding in bases) as an integer."); - System.exit(2); - } - if ( MAX_PAD < 0 ) { - System.err.println("Second argument must be a maximum padding in bases >= 0."); - System.exit(2); - } - try { - MAX_OVERLAP_FACTOR = Double.parseDouble(args[0]); - } catch ( final NumberFormatException nfe ) { - System.err.println("Can't interpret 3rd argument (maximum overlap fraction) as a floating point number."); - System.exit(2); - } - if ( MAX_OVERLAP_FACTOR < 0.0 || MAX_OVERLAP_FACTOR > 1.0 ) { - System.err.println("Third argument should be a maximum overlap fraction between 0 and 1."); - System.exit(2); - } - } - - /** A little helper class to do padding and overlap calculations - * Note: this class uses half-open intervals, unlike a vcf */ - private final static class PaddedInterval { - private final int start; - private final int end; - private final int padding; - private final int maxOverlap; - - public PaddedInterval( final VCFParser.Record record ) { - this.start = record.getPosition(); - // getInfoField can't return null -- it's been checked in isStitchable - // + 1 because vcf has closed intervals, we use half-open - this.end = record.getInfoField(END).asInt() + 1; - final int length = end - 
start; - this.padding = Math.min(MAX_PAD, (int)(length * PAD_FACTOR)); - this.maxOverlap = (int)(length * MAX_OVERLAP_FACTOR); - } - - public boolean canCoalesceWith( final PaddedInterval downstreamInterval ) { - // Check that the padded intervals overlap. - // Only have to check one end, because we know the downstream interval starts as late - // or later than this one. - if ( end + padding <= downstreamInterval.start - downstreamInterval.padding ) { - return false; - } - // but the unpadded intervals mustn't overlap too much - final int overlap = Math.min(end, downstreamInterval.end) - downstreamInterval.start; - return overlap < maxOverlap && overlap < downstreamInterval.maxOverlap; - } - - public int getVCFEnd() { return end - 1; } - } - - /** As we go through the VCF we create a new Chunk whenever we encounter a stitchable record. - * So, a Chunk consists of a mess of non-stitchables, and a trailing stitchable. */ - private final static class Chunk { - private final List nonStitchables; - private final VCFParser.Record stitchable; - - public Chunk( final List nonStitchables, - final VCFParser.Record stitchable ) { - this.nonStitchables = nonStitchables; - this.stitchable = stitchable; - } - - public List getNonStitchables() { return nonStitchables; } - public VCFParser.Record getStitchable() { return stitchable; } - } - - /** Maintains a list of chunks so that the client just sees the stitchables, while making sure - * that the record ordering of the input file is maintained. - * This is a kind of double iterator: At the outer level, calling nextSubject repeatedly - * until it returns null lets you simply iterate over each stitchable record in the input file. - * Each time you do so, the inner iterator (hasNext/next) gets reset to iterate over the - * stitchable records downstream of the subject. 
The inner iterator is smart enough to quit - * (i.e., hasNext will return false) when we've read so far ahead that we can't possibly find - * a stitchable that can be joined to the subject. - */ - private final static class StitchableIterator implements Iterator { - private final VCFParser vcfParser; - private final List chunks; - private int subjectIndex; - private VCFParser.ByteSequence subjectChromosome; - private int subjectMinNoOverlapPosition; // far enough downstream that MAX_PAD will ensure there's no overlap - private int iterationIndex; - private VCFParser.Record nextRecord; // this is a pushback for a record that's too far downstream - - private static final VCFParser.ByteSequence MULTIALLELIC = new VCFParser.ByteSequence("MULTIALLELIC"); - private static final VCFParser.ByteSequence SVTYPE = new VCFParser.ByteSequence("SVTYPE"); - private static final VCFParser.ByteSequence SVTYPE_DEL = new VCFParser.ByteSequence("DEL"); - private static final VCFParser.ByteSequence SVTYPE_DUP = new VCFParser.ByteSequence("DUP"); - private static final VCFParser.ByteSequence EVIDENCE = new VCFParser.ByteSequence("EVIDENCE"); - private static final String EVIDENCE_RD = "RD"; - private static final String EVIDENCE_SR = "SR"; - private static final String EVIDENCE_PE = "PE"; - private static final String EVIDENCE_BAF = "BAF"; - - public StitchableIterator( final VCFParser vcfParser ) { - this.vcfParser = vcfParser; - this.chunks = new ArrayList<>(); - } - - /** write the non-stitchables that precede the first stitchable, - * and return the next stitchable */ - public VCFParser.Record nextSubject( final OutputStream os ) throws IOException { - final int nChunks = chunks.size(); - while ( subjectIndex < nChunks ) { - final Chunk chunk = chunks.get(subjectIndex); - // clean out the chunks as we use them: in coordinate-dense vcfs the chunks array - // can get quite large, and, especially in vcfs with lots of sample, a chunk can - // occupy quite a large amount of memory. 
we want to release the chunks for garbage - // collection ASAP to control memory use. - chunks.set(subjectIndex, null); - iterationIndex = ++subjectIndex; - for ( final VCFParser.Record rec : chunk.getNonStitchables() ) { - rec.write(os); - } - final VCFParser.Record stitchable = chunk.getStitchable(); - if ( stitchable != null ) { - return setSubject(stitchable); - } - } - - // there are no more chunks to serve as subjects, reset the queue - chunks.clear(); - subjectIndex = iterationIndex = 0; - - while ( nextRecord != null || vcfParser.hasRecord() ) { - final VCFParser.Record record = nextRecord != null ? nextRecord: vcfParser.nextRecord(); - nextRecord = null; - if ( isStitchable(record) ) { - return setSubject(record); - } - record.write(os); - } - return null; - } - - /** Is there another stitchable downstream of the subject that is within joining range? **/ - @Override public boolean hasNext() { - final int nChunks = chunks.size(); - while ( iterationIndex < nChunks ) { - final VCFParser.Record stitchable = chunks.get(iterationIndex).getStitchable(); - if ( stitchable != null ) { - return true; - } - ++iterationIndex; - } - if ( nextRecord != null || vcfParser.hasRecord() ) { - List nonStitchables = null; - do { - final VCFParser.Record record = - nextRecord != null ? 
nextRecord : vcfParser.nextRecord(); - nextRecord = null; - if ( !record.getChromosome().equals(subjectChromosome) || - record.getPosition() >= subjectMinNoOverlapPosition ) { - nextRecord = record; - if ( nonStitchables != null ) { - chunks.add(new Chunk(nonStitchables, null)); - } - return false; - } - if ( isStitchable(record) ) { - if ( nonStitchables == null ) { - nonStitchables = Collections.emptyList(); - } - chunks.add(new Chunk(nonStitchables, record)); - return true; - } - if ( nonStitchables == null ) { - nonStitchables = new ArrayList<>(); - } - nonStitchables.add(record); - } while ( vcfParser.hasRecord() ); - chunks.add(new Chunk(nonStitchables, null)); - } - return false; - } - - @Override public VCFParser.Record next() { - if ( !hasNext() ) { - throw new NoSuchElementException(); - } - return chunks.get(iterationIndex++).getStitchable(); - } - - @Override public void remove() { - final int idx = iterationIndex - 1; - chunks.set(idx, new Chunk(chunks.get(idx).getNonStitchables(), null)); - } - - private static boolean isStitchable( final VCFParser.Record record ) { - if ( MULTIALLELIC.equals(record.getFilter()) ) return false; - final Map infoMap = record.getInfoAsMap(); - final VCFParser.ByteSequence svType = infoMap.get(SVTYPE); - if ( !SVTYPE_DEL.equals(svType) && !SVTYPE_DUP.equals(svType) ) return false; - // you can't be a stitchable if you don't have an "END" info field. 
- // code elsewhere assumes it can grab this value without checking for its existence - final VCFParser.ByteSequence end = infoMap.get(END); - if ( end == null || end.asInt() == VCFParser.ByteSequence.MISSING_VALUE ) return false; - final VCFParser.ByteSequence evidence = infoMap.get(EVIDENCE); - if ( evidence == null ) return false; - final String evStr = evidence.toString(); - return !evStr.contains(EVIDENCE_PE) && !evStr.contains(EVIDENCE_SR) && - (evStr.contains(EVIDENCE_RD) || evStr.contains(EVIDENCE_BAF)); - } - - private VCFParser.Record setSubject( final VCFParser.Record record ) { - subjectChromosome = record.getChromosome(); - final int start = record.getPosition(); - final int end = record.getInfoField(END).asInt(); // can't be null -- checked in isStitchable - subjectMinNoOverlapPosition = - end + Math.min(MAX_PAD, (int)(PAD_FACTOR * (end - start))) + MAX_PAD; - return record; - } - } -} diff --git a/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1.java b/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1.java new file mode 100644 index 000000000..1633ed010 --- /dev/null +++ b/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1.java @@ -0,0 +1,320 @@ +package org.broadinstitute.svpipeline; + +import java.io.*; +import java.nio.charset.StandardCharsets; +import java.util.*; +import java.util.regex.Pattern; +import org.broadinstitute.svpipeline.VCFParser.*; + +public class CleanVCFPart1 { + private static final ByteSequence[] EV_VALS = { + null, + new ByteSequence("RD"), + new ByteSequence("PE"), + new ByteSequence("RD,PE"), + new ByteSequence("SR"), + new ByteSequence("RD,SR"), + new ByteSequence("PE,SR"), + new ByteSequence("RD,PE,SR") + }; + private static final ByteSequence FORMAT_LINE = new ByteSequence("FORMAT"); + private static final ByteSequence ID_KEY = new ByteSequence("ID"); + private static final ByteSequence EV_VALUE = new ByteSequence("EV"); + private static final ByteSequence TYPE_KEY = new 
ByteSequence("Type"); + private static final ByteSequence STRING_VALUE = new ByteSequence("String"); + private static final ByteSequence NUMBER_KEY = new ByteSequence("Number"); + private static final ByteSequence SVTYPE_KEY = new ByteSequence("SVTYPE"); + private static final ByteSequence ME_VALUE = new ByteSequence(":ME"); + private static final ByteSequence LT_VALUE = new ByteSequence("<"); + private static final ByteSequence GT_VALUE = new ByteSequence(">"); + private static final ByteSequence N_VALUE = new ByteSequence("N"); + private static final ByteSequence END_KEY = new ByteSequence("END"); + private static final ByteSequence VARGQ_KEY = new ByteSequence("varGQ"); + private static final ByteSequence MULTIALLELIC_KEY = new ByteSequence("MULTIALLELIC"); + private static final ByteSequence UNRESOLVED_KEY = new ByteSequence("UNRESOLVED"); + private static final ByteSequence HIGH_SR_BACKGROUND = new ByteSequence("HIGH_SR_BACKGROUND"); + private static final ByteSequence PASS_VALUE = new ByteSequence("PASS"); + private static final ByteSequence BOTHSIDES_VALUE = new ByteSequence("BOTHSIDES_SUPPORT"); + private static final ByteSequence DEL_VALUE = new ByteSequence("DEL"); + private static final ByteSequence DUP_VALUE = new ByteSequence("DUP"); + private static final ByteSequence RDCN_VALUE = new ByteSequence("RD_CN"); + private static final ByteSequence MISSING_VALUE = new ByteSequence("."); + private static final ByteSequence MISSING_GENOTYPE = new ByteSequence("./."); + private static final ByteSequence GT_REF_REF = new ByteSequence("0/0"); + private static final ByteSequence GT_REF_ALT = new ByteSequence("0/1"); + private static final ByteSequence GT_ALT_ALT = new ByteSequence("1/1"); + + private static final int MIN_ALLOSOME_EVENT_SIZE = 5000; + + public static void main( final String[] args ) { + if ( args.length != 8 ) { + System.err.println("Usage: java org.broadinstitute.svpipeline.CleanVCFPart1 " + + "INPUTVCFFILE PEDIGREES XCHR YCHR NOISYEVENTS 
BOTHSIDES SAMPLESOUT REVISEDEVENTSOUT"); + System.exit(1); + } + final VCFParser parser = new VCFParser(args[0]); + final ByteSequence xChrName = new ByteSequence(args[2]); + final ByteSequence yChrName = new ByteSequence(args[3]); + final Set noisyEvents = readLastColumn(args[4]); + final Set bothsidesSupportEvents = readLastColumn(args[5]); + try ( final OutputStream os + = new BufferedOutputStream(new FileOutputStream(FileDescriptor.out)); + final OutputStream osSamples = new BufferedOutputStream(new FileOutputStream(args[6])); + final OutputStream osRevEvents = new BufferedOutputStream(new FileOutputStream(args[7])) ) { + int[] sexForSample = null; + while ( parser.hasMetadata() ) { + final Metadata metadata = parser.nextMetaData(); + if ( metadata instanceof ColumnHeaderMetadata ) { + final ColumnHeaderMetadata cols = ((ColumnHeaderMetadata)metadata); + final List colNames = cols.getValue(); + final int nCols = colNames.size(); + for ( int idx = 9; idx < nCols; ++idx ) { + colNames.get(idx).write(osSamples); + osSamples.write('\n'); + } + sexForSample = readPedFile(args[1], cols.getValue()); + os.write(("##FILTER=\n") + .getBytes(StandardCharsets.UTF_8)); + os.write("##FILTER=\n" + .getBytes(StandardCharsets.UTF_8)); + os.write(("##FILTER=\n") + .getBytes(StandardCharsets.UTF_8)); + } else if ( metadata instanceof KeyAttributesMetadata ) { + final KeyAttributesMetadata keyAttrs = (KeyAttributesMetadata)metadata; + if ( keyAttrs.getKey().equals(FORMAT_LINE) ) { + final List kvs = keyAttrs.getValue(); + final int nKVs = kvs.size(); + if ( nKVs > 2 ) { + final KeyValue kv0 = kvs.get(0); + final KeyValue kv1 = kvs.get(1); + final KeyValue kv2 = kvs.get(2); + if ( kv0.getKey().equals(ID_KEY) && kv0.getValue().equals(EV_VALUE) ) { + if ( kv1.getKey().equals(NUMBER_KEY) ) { + kvs.set(1, new KeyValue(NUMBER_KEY, MISSING_VALUE)); + } + if ( kv2.getKey().equals(TYPE_KEY) ) { + kvs.set(2, new KeyValue(TYPE_KEY, STRING_VALUE)); + } + } + } + } + } + metadata.write(os); + 
} + if ( sexForSample == null ) { + throw new RuntimeException("header line with sample names is missing."); + } + while ( parser.hasRecord() ) { + final Record record = parser.nextRecord(); + + // replace the numeric EV value with a text value + final int evIdx = record.getFormat().indexOf(EV_VALUE); + if ( evIdx >= 0 ) { + for ( final CompoundField genotypeVals : record.getGenotypes() ) { + genotypeVals.set(evIdx, EV_VALS[genotypeVals.get(evIdx).asInt()]); + } + } + + // move the SVTYPE to the ALT field (except for MEs) + final InfoField info = record.getInfo(); + final ByteSequence svType = info.get(SVTYPE_KEY); + if ( !record.getAlt().contains(ME_VALUE) ) { + if ( svType != null ) { + record.setAlt(new ByteSequence(LT_VALUE, svType, GT_VALUE)); + } + } + record.setRef(N_VALUE); + + // move varGQ info field to quality column + final ByteSequence varGQ = info.get(VARGQ_KEY); + if ( varGQ != null ) { + record.setQuality(varGQ); + info.remove(VARGQ_KEY); + } + + // remove MULTIALLELIC flag, if present + info.remove(MULTIALLELIC_KEY); + + // remove UNRESOLVED flag and add it as a filter + if ( info.containsKey(UNRESOLVED_KEY) ) { + record.getFilter().add(UNRESOLVED_KEY); + info.remove(UNRESOLVED_KEY); + } + + // mark noisy events + if ( noisyEvents.contains(record.getID()) ) { + record.getFilter().add(HIGH_SR_BACKGROUND); + } + + // mark bothsides support + if ( bothsidesSupportEvents.contains(record.getID()) ) { + final CompoundField filters = record.getFilter(); + if ( filters.size() == 1 && filters.get(0).equals(PASS_VALUE) ) { + record.setFilter(BOTHSIDES_VALUE); + } else { + filters.add(BOTHSIDES_VALUE); + } + } + + // fix genotypes on allosomes + final boolean isY; + if ( (isY = yChrName.equals(record.getChromosome())) || + xChrName.equals(record.getChromosome())) { + final List genotypes = record.getGenotypes(); + final int rdCNIndex = record.getFormat().indexOf(RDCN_VALUE); + final ByteSequence end = info.get(END_KEY); + boolean adjustMale = false; + final 
boolean isDel; + if ( ((isDel = DEL_VALUE.equals(svType)) || DUP_VALUE.equals(svType)) && rdCNIndex >= 0 && end != null && + end.asInt() + 1 - record.getPosition() > MIN_ALLOSOME_EVENT_SIZE ) { + adjustMale = isRevisableEvent(genotypes, rdCNIndex, sexForSample, isY); + if ( adjustMale ) { + record.getID().write(osRevEvents); + osRevEvents.write('\n'); + } + } + CompoundField emptyGenotype = null; + final int nSamples = genotypes.size(); + for ( int sampleIdx = 0; sampleIdx < nSamples; ++sampleIdx ) { + final int sampleSex = sexForSample[sampleIdx]; + final CompoundField genotype = genotypes.get(sampleIdx); + if ( sampleSex == 1 ) { + if ( adjustMale ) { + final ByteSequence rdCN = genotype.get(rdCNIndex); + if ( rdCN.equals(MISSING_VALUE) ) { + continue; + } + final int rdCNVal = rdCN.asInt(); + genotype.set(rdCNIndex, new ByteSequence(Integer.toString(rdCNVal + 1))); + if ( isDel ) { + if ( rdCNVal >= 1 ) genotype.set(0, GT_REF_REF); + else if ( rdCNVal == 0 ) genotype.set(0, GT_REF_ALT); + } else { + if ( rdCNVal <= 1 ) genotype.set(0, GT_REF_REF); + else if ( rdCNVal == 2 ) genotype.set(0, GT_REF_ALT); + else genotype.set(0, GT_ALT_ALT); + } + } + } else if ( sampleSex == 2 ) { + if ( isY ) { + if ( emptyGenotype == null ) { + emptyGenotype = new CompoundField(MISSING_GENOTYPE, ':'); + int nFields = genotype.size(); + while ( --nFields > 0 ) { + emptyGenotype.add(MISSING_VALUE); + } + emptyGenotype.getValue(); // performance hack to put the pieces together + } + genotypes.set(sampleIdx, emptyGenotype); + } + } else { + genotype.set(0, MISSING_GENOTYPE); + } + } + } + + record.write(os); + } + } catch ( final IOException ioe ) { + throw new RuntimeException("Can't write to stdout", ioe); + } + } + + private static boolean isRevisableEvent( final List genotypes, + final int rdCNIndex, + final int[] sexForColumn, + final boolean isY ) { + // We're going to calculate the median rdCN values for males and females. 
+ // We only care if the median is 0, 1, 2, or something larger, so we'll use 4 bins to + // sum up the counts: all values >2 go into the last bucket. + final int[] maleCounts = new int[4]; + final int[] femaleCounts = new int[4]; + final int nSamples = genotypes.size(); + for ( int sampleIdx = 0; sampleIdx < nSamples; ++sampleIdx ) { + final ByteSequence rdCN = genotypes.get(sampleIdx).get(rdCNIndex); + if ( MISSING_VALUE.equals(rdCN) ) { + continue; + } + int rdCNVal = rdCN.asInt(); + if ( rdCNVal > 2 ) { + rdCNVal = 3; + } + final int sampleSex = sexForColumn[sampleIdx]; + if ( sampleSex == 1 ) { + maleCounts[rdCNVal] += 1; + } else if ( sampleSex == 2 ) { + femaleCounts[rdCNVal] += 1; + } + } + final double maleMedian = calcMedian(maleCounts); + double femaleMedian = calcMedian(femaleCounts); + return maleMedian == 1. && (isY ? femaleMedian == 0. : femaleMedian == 2.); + } + + // visible for testing + static double calcMedian( final int[] counts ) { + final double target = (counts[0] + counts[1] + counts[2] + counts[3]) / 2.; + if ( target == 0. 
) { + return Double.NaN; + } + int total = 0; + for ( int iii = 0; iii < 4; ++iii ) { + total += counts[iii]; + if ( total == target ) { + return iii + .5; + } else if ( total > target ) { + return (double)iii; + } + } + throw new IllegalStateException("we should never reach this statement"); + } + + private static Set readLastColumn( final String filename ) { + final Set values = new HashSet<>(); + try { + final BufferedReader neRdr = + new BufferedReader(new InputStreamReader(new FileInputStream(filename))); + String line; + while ( (line = neRdr.readLine()) != null ) { + final String lastCol = line.substring(line.lastIndexOf('\t') + 1); + values.add(new ByteSequence(lastCol)); + } + } catch ( final IOException ioe ) { + throw new RuntimeException("can't read table file " + filename, ioe); + } + return values; + } + + private static int[] readPedFile( final String pedFilename, List sampleNames ) { + final int nCols = sampleNames.size() - 9; + final Map sexForSampleMap = new HashMap<>(2*nCols); + final int[] sexForSample = new int[nCols]; + try { + final BufferedReader pedRdr = + new BufferedReader(new InputStreamReader(new FileInputStream(pedFilename))); + final Pattern tabPattern = Pattern.compile("\\t"); + String line; + while ( (line = pedRdr.readLine()) != null ) { + final Scanner scanner = new Scanner(line).useDelimiter(tabPattern); + scanner.next(); // family ignored + final String sampleName = scanner.next(); + scanner.next(); // dad ignored + scanner.next(); // mom ignored + final int sex = scanner.nextInt(); + sexForSampleMap.put(new ByteSequence(sampleName), sex); + } + } catch ( final IOException ioe ) { + throw new RuntimeException("can't read " + pedFilename, ioe); + } + for ( int col = 0; col < nCols; ++col ) { + final Integer sex = sexForSampleMap.get(sampleNames.get(col + 9)); + if ( sex == null ) { + throw new RuntimeException("can't determine sex for sample " + sampleNames.get(col + 9)); + } + sexForSample[col] = sex; + } + return sexForSample; + } +}
diff --git a/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1UnitTest.java b/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1UnitTest.java new file mode 100644 index 000000000..77a6b5658 --- /dev/null +++ b/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1UnitTest.java @@ -0,0 +1,40 @@ +package org.broadinstitute.svpipeline; + +public class CleanVCFPart1UnitTest { + public static void main( final String[] args ) { + testAsserts(); + testMedianCalculation(); + System.out.println("OK"); + } + + public static void testAsserts() { + boolean caughtIt = false; + try { + assert(false); + } catch ( final AssertionError ae ) { + caughtIt = true; + } + if ( !caughtIt ) { + throw new AssertionError("assertions aren't turned on, so you're not testing anything."); + } + } + + public static void testMedianCalculation() { + final int[] counts = new int[4]; + assert(Double.isNaN(CleanVCFPart1.calcMedian(counts))); + counts[0] = 1; + assert(CleanVCFPart1.calcMedian(counts) == 0.0); + counts[1] = 1; + assert(CleanVCFPart1.calcMedian(counts) == 0.5); + counts[2] = 1; + assert(CleanVCFPart1.calcMedian(counts) == 1.0); + counts[3] = 1; + assert(CleanVCFPart1.calcMedian(counts) == 1.5); + counts[2] = 2; + assert(CleanVCFPart1.calcMedian(counts) == 2.0); + counts[3] = 4; + assert(CleanVCFPart1.calcMedian(counts) == 2.5); + counts[3] = 5; + assert(CleanVCFPart1.calcMedian(counts) == 3.0); + } +} diff --git a/src/sv-pipeline/java/org/broadinstitute/svpipeline/StitchFragmentedCNVs.java b/src/sv-pipeline/java/org/broadinstitute/svpipeline/StitchFragmentedCNVs.java new file mode 100644 index 000000000..a41a508a5 --- /dev/null +++ b/src/sv-pipeline/java/org/broadinstitute/svpipeline/StitchFragmentedCNVs.java @@ -0,0 +1,269 @@ +package org.broadinstitute.svpipeline; + +import java.io.*; +import java.util.*; +import org.broadinstitute.svpipeline.VCFParser.*; + +/** Read a VCF, and try to stitch together adjacent copy-number variations. 
+ * Eligible Records (which we call "stitchable") must meet certain criteria as specified by the + * isStitchable method of the StitchableIterator. + * If two stitchables overlap appropriately, and all their samples have identical genotypes, we can + * replace the first one by adding on the interval covered by the second one. + */ +public class StitchFragmentedCNVs { + // These 3 values will always be overwritten, but are initialized to reasonable defaults as documentation + private static double PAD_FACTOR = .2; + static int MAX_PAD = 200000; // visible for testing + private static double MAX_OVERLAP_FACTOR = .2; + + // A new "end position" for the disposition map signalling that the record is to be removed because it was combined + // with another record. + private static final int ENDPOS_REMOVED_RECORD = -1; + + // relevant INFO field keys and values--these are visible for testing + static final ByteSequence END = new ByteSequence("END"); + static final ByteSequence SVLEN = new ByteSequence("SVLEN"); + static final ByteSequence SVTYPE = new ByteSequence("SVTYPE"); + static final ByteSequence SVTYPE_DEL = new ByteSequence("DEL"); + static final ByteSequence SVTYPE_DUP = new ByteSequence("DUP"); + static final ByteSequence MULTIALLELIC = new ByteSequence("MULTIALLELIC"); + static final ByteSequence EVIDENCE = new ByteSequence("EVIDENCE"); + static final ByteSequence EVIDENCE_RD = new ByteSequence("RD"); + static final ByteSequence EVIDENCE_SR = new ByteSequence("SR"); + static final ByteSequence EVIDENCE_PE = new ByteSequence("PE"); + static final ByteSequence EVIDENCE_BAF = new ByteSequence("BAF"); + + public static void main( final String[] args ) { + if ( args.length != 4 ) { + System.err.println("Usage: java StitchFragmentedCNVs PAD% MAXPAD OVRLAP% VCFFILE"); + System.err.println("E.g.: java StitchFragmentedCNVs .2 200000 .2 input.vcf.gz"); + System.err.println("Combines neighboring CNVs with matching genotypes into a larger event."); + 
System.err.println("Writes an uncompressed vcf to stdout."); + System.exit(1); + } + + initCommandLineArgs(args); + + // a map of IDs onto revised ENDs + final Map<ByteSequence, Integer> disposition = new HashMap<>(1000); + + // a push-back buffer of previous PaddedIntervals for stitchables that are still in range + final List<PaddedInterval> intervalList = new LinkedList<>(); + + try ( final VCFParser vcfParser = new VCFParser(args[3]) ) { + while ( vcfParser.hasMetadata() ) { + vcfParser.nextMetaData(); + } + ByteSequence currentChromosome = null; + while ( vcfParser.hasRecord() ) { + final Record record = vcfParser.nextRecord(); + if ( isStitchable(record) ) { + if ( !record.getChromosome().equals(currentChromosome) ) { + intervalList.clear(); + currentChromosome = record.getChromosome(); + } + final PaddedInterval currentInterval = new PaddedInterval(record); + final ListIterator<PaddedInterval> previousIntervals = intervalList.listIterator(); + boolean recordRemoved = false; + while ( previousIntervals.hasNext() ) { + final PaddedInterval previousInterval = previousIntervals.next(); + final PaddedInterval revisedInterval; + if ( previousInterval.doneStitching(currentInterval) ) { + previousIntervals.remove(); + } else if ( (revisedInterval = previousInterval.stitchTo(currentInterval)) != null ) { + previousIntervals.set(revisedInterval); + disposition.put(revisedInterval.getRecord().getID(), revisedInterval.getVCFEnd()); + disposition.put(record.getID(), ENDPOS_REMOVED_RECORD); + recordRemoved = true; + } + } + if ( !recordRemoved ) { + intervalList.add(currentInterval); + } + } + } + } + intervalList.clear(); + + try ( final OutputStream os = new BufferedOutputStream(new FileOutputStream(FileDescriptor.out)); + final VCFParser vcfParser = new VCFParser(args[3])) { + while ( vcfParser.hasMetadata() ) { + vcfParser.nextMetaData().write(os); + } + while ( vcfParser.hasRecord() ) { + final Record record = vcfParser.nextRecord(); + final Integer endPosObj = disposition.get(record.getID()); + if ( endPosObj == null ) { 
+ record.write(os); + } else { + final int endPos = endPosObj; + if ( endPos != ENDPOS_REMOVED_RECORD ) { + final InfoField infoField = record.getInfo(); + infoField.put(END, new ByteSequence(Integer.toString(endPos))); + final int svLength = endPos + 1 - record.getPosition(); + infoField.put(SVLEN, new ByteSequence(Integer.toString(svLength))); + record.write(os); + } + } + } + } catch ( final IOException ioe ) { + throw new MalformedVCFException("can't write revised vcf", ioe); + } + } + + private static void initCommandLineArgs( final String[] args ) { + try { + PAD_FACTOR = Double.parseDouble(args[0]); + } catch ( final NumberFormatException nfe ) { + System.err.println("Can't interpret 1st argument (padding fraction) as a floating point number."); + System.exit(2); + } + if ( PAD_FACTOR < 0.0 ) { + System.err.println("First argument should be a padding fraction >= 0."); + System.exit(2); + } + try { + MAX_PAD = Integer.parseInt(args[1]); + } catch ( final NumberFormatException nfe ) { + System.err.println("Can't interpret 2nd argument (maximum padding in bases) as an integer."); + System.exit(2); + } + if ( MAX_PAD < 0 ) { + System.err.println("Second argument must be a maximum padding in bases >= 0."); + System.exit(2); + } + try { + MAX_OVERLAP_FACTOR = Double.parseDouble(args[2]); + } catch ( final NumberFormatException nfe ) { + System.err.println("Can't interpret 3rd argument (maximum overlap fraction) as a floating point number."); + System.exit(2); + } + if ( MAX_OVERLAP_FACTOR < 0.0 || MAX_OVERLAP_FACTOR > 1.0 ) { + System.err.println("Third argument should be a maximum overlap fraction between 0 and 1."); + System.exit(2); + } + } + + // VisibleForTesting + static boolean isStitchable( final Record record ) { + final CompoundField filterField = record.getFilter(); + for ( final ByteSequence filter : filterField ) { + if ( MULTIALLELIC.equals(filter) ) { + return false; + } + } + + final InfoField infoField = record.getInfo(); + final ByteSequence 
svType = infoField.get(SVTYPE); + if ( !SVTYPE_DEL.equals(svType) && !SVTYPE_DUP.equals(svType) ) { + return false; + } + + // you can't be a stitchable if you don't have an "END" info field. + // code elsewhere assumes it can grab this value without checking for its existence + final ByteSequence endValue = infoField.get(END); + if ( endValue == null || endValue.asInt() == ByteSequence.MISSING_VALUE ) { + return false; + } + + final ByteSequence evidence = infoField.get(EVIDENCE); + if ( evidence == null ) { + return false; + } + return !evidence.contains(EVIDENCE_PE) && !evidence.contains(EVIDENCE_SR) && + (evidence.contains(EVIDENCE_RD) || evidence.contains(EVIDENCE_BAF)); + } + + /** A little helper class to do padding and overlap calculations + * Note: this class uses half-open intervals, unlike a vcf */ + final static class PaddedInterval { // visible for testing + private final Record record; + private final int start; + private final int end; + private final int padding; + private final int maxOverlap; + private final ByteSequence eventType; + + public PaddedInterval( final Record record ) { + this.record = record; + this.start = record.getPosition(); + this.end = record.getInfo().get(END).asInt() + 1; + final int length = end - start; + this.padding = Math.min(MAX_PAD, (int)(length * PAD_FACTOR)); + this.maxOverlap = (int)(length * MAX_OVERLAP_FACTOR); + this.eventType = record.getInfo().get(SVTYPE); + } + + private PaddedInterval( final PaddedInterval upstream, final PaddedInterval downstream ) { + this.record = upstream.record; + this.start = upstream.start; + this.end = downstream.end; + this.padding = Math.max(upstream.padding, downstream.padding); + this.maxOverlap = Math.max(upstream.maxOverlap, downstream.maxOverlap); + this.eventType = upstream.eventType; + } + + public int getPaddedStart() { return start - padding; } + public int getPaddedEnd() { return end + padding; } + public Record getRecord() { return record; } + + /** Returns true if we're 
done trying to stitch this interval. Criterion is that the + * padded end of this interval is more than MAX_PAD bases away from the start of the + * currentInterval. So this one is definitely disjoint (regardless of its length), and that + * will also be true of all subsequent intervals (since they're in sorted order by + * starting position). + */ + public boolean doneStitching( final PaddedInterval currentInterval ) { + return getPaddedEnd() < currentInterval.start - MAX_PAD; + } + + /** Returns an expanded interval if possible, otherwise null. */ + public PaddedInterval stitchTo( final PaddedInterval downstreamInterval ) { + if ( !eventType.equals(downstreamInterval.eventType) ) { + return null; + } + + // Check that the padded intervals overlap. + // Only have to check one end, because we know the downstream interval starts as late + // or later than this one. + if ( getPaddedEnd() <= downstreamInterval.getPaddedStart() ) { + return null; + } + + // But the unpadded intervals mustn't overlap too much. + // Note that the calculated overlap can be negative (they don't actually overlap), + // but that's OK. 
+ final int overlap = Math.min(end, downstreamInterval.end) - downstreamInterval.start; + if ( overlap > maxOverlap || overlap > downstreamInterval.maxOverlap ) { + return null; + } + + if ( !genotypesMatch(record.getGenotypes(), downstreamInterval.record.getGenotypes()) ) { + return null; + } + + return new PaddedInterval(this, downstreamInterval); + } + + public int getVCFEnd() { return end - 1; } + + private static boolean genotypesMatch( final List genotypes1, + final List genotypes2 ) { + final int nGTs = genotypes1.size(); + if ( genotypes2.size() != nGTs ) { + throw new IllegalStateException("records have a different number of genotypes"); + } + for ( int idx = 0; idx != nGTs; ++idx ) { + final ByteIterator itr1 = genotypes1.get(idx).getValue().iterator(); + final ByteIterator itr2 = genotypes2.get(idx).getValue().iterator(); + byte b1; + do { + b1 = itr1.hasNext() ? itr1.next() : (byte)':'; + final byte b2 = itr2.hasNext() ? itr2.next() : (byte)':'; + if ( b1 != b2 ) return false; + } while ( b1 != ':' ); + } + return true; + } + } +} diff --git a/src/sv-pipeline/java/org/broadinstitute/svpipeline/StitchFragmentedCNVsUnitTest.java b/src/sv-pipeline/java/org/broadinstitute/svpipeline/StitchFragmentedCNVsUnitTest.java new file mode 100644 index 000000000..1d9513fb6 --- /dev/null +++ b/src/sv-pipeline/java/org/broadinstitute/svpipeline/StitchFragmentedCNVsUnitTest.java @@ -0,0 +1,160 @@ +package org.broadinstitute.svpipeline; + +import java.io.ByteArrayInputStream; +import java.util.Arrays; + +import org.broadinstitute.svpipeline.StitchFragmentedCNVs.PaddedInterval; +import org.broadinstitute.svpipeline.VCFParser.*; + +public final class StitchFragmentedCNVsUnitTest { + public static void main( final String[] args ) { + testAsserts(); + testIsStitchable(); + testDoneStitching(); + testStitchTo(); + System.out.println("OK"); + } + + public static void testAsserts() { + boolean caughtIt = false; + try { + assert (false); + } catch ( final AssertionError ae ) { + 
caughtIt = true; + } + if ( !caughtIt ) { + throw new AssertionError("assertions aren't turned on (with -ea), so you're not testing anything."); + } + } + + public static void testIsStitchable() { + final String vcfLine = "chr1\t1000\tID1\tN\t\t60\tPASS\tEND=1999;SVTYPE=DEL;EVIDENCE=RD\tGT\t0/0\t0/1\n"; + final Record record = fromString(vcfLine); + assert(StitchFragmentedCNVs.isStitchable(record)); + + // not stitchable if there's a MULTIALLELIC filter component + final ByteSequence originalFilter = record.getFilter().getValue(); + record.setFilter(StitchFragmentedCNVs.MULTIALLELIC); + assert(!StitchFragmentedCNVs.isStitchable(record)); + final ByteSequence x = new ByteSequence("X"); + record.setFilter(Arrays.asList(x, StitchFragmentedCNVs.MULTIALLELIC)); + assert(!StitchFragmentedCNVs.isStitchable(record)); + final ByteSequence y = new ByteSequence("Y"); + record.setFilter(Arrays.asList(x, StitchFragmentedCNVs.MULTIALLELIC, y)); + assert(!StitchFragmentedCNVs.isStitchable(record)); + record.setFilter(originalFilter); + assert(StitchFragmentedCNVs.isStitchable(record)); + + // not stitchable if the the SVTYPE isn't DUP or DEL + final InfoField info = record.getInfo(); + final ByteSequence originalSVTYPE = info.get(StitchFragmentedCNVs.SVTYPE); + info.put(StitchFragmentedCNVs.SVTYPE, new ByteSequence("INS")); + assert(!StitchFragmentedCNVs.isStitchable(record)); + info.put(StitchFragmentedCNVs.SVTYPE, StitchFragmentedCNVs.SVTYPE_DEL); + assert(StitchFragmentedCNVs.isStitchable(record)); + info.put(StitchFragmentedCNVs.SVTYPE, StitchFragmentedCNVs.SVTYPE_DUP); + assert(StitchFragmentedCNVs.isStitchable(record)); + info.put(StitchFragmentedCNVs.SVTYPE, originalSVTYPE); + assert(StitchFragmentedCNVs.isStitchable(record)); + + // not stitchable if END is missing + final ByteSequence originalEnd = info.get(StitchFragmentedCNVs.END); + info.remove(StitchFragmentedCNVs.END); + assert(!StitchFragmentedCNVs.isStitchable(record)); + info.put(StitchFragmentedCNVs.END, 
originalEnd); + assert(StitchFragmentedCNVs.isStitchable(record)); + + // not stitchable if EVIDENCE includes PE or SR + final ByteSequence originalEVIDENCE = info.get(StitchFragmentedCNVs.EVIDENCE); + info.put(StitchFragmentedCNVs.EVIDENCE, StitchFragmentedCNVs.EVIDENCE_RD); + assert(StitchFragmentedCNVs.isStitchable(record)); + info.put(StitchFragmentedCNVs.EVIDENCE, StitchFragmentedCNVs.EVIDENCE_BAF); + assert(StitchFragmentedCNVs.isStitchable(record)); + info.put(StitchFragmentedCNVs.EVIDENCE, StitchFragmentedCNVs.EVIDENCE_SR); + assert(!StitchFragmentedCNVs.isStitchable(record)); + info.put(StitchFragmentedCNVs.EVIDENCE, StitchFragmentedCNVs.EVIDENCE_PE); + assert(!StitchFragmentedCNVs.isStitchable(record)); + final ByteSequence sep = new ByteSequence(","); + info.put(StitchFragmentedCNVs.EVIDENCE, + new ByteSequence(StitchFragmentedCNVs.EVIDENCE_BAF, sep, StitchFragmentedCNVs.EVIDENCE_SR)); + assert(!StitchFragmentedCNVs.isStitchable(record)); + info.put(StitchFragmentedCNVs.EVIDENCE, originalEVIDENCE); + assert(StitchFragmentedCNVs.isStitchable(record)); + } + + public static void testDoneStitching() { + final String vcfLine = "chr1\t1000\tID1\tN\t\t60\tPASS\tEND=1999;SVTYPE=DEL;EVIDENCE=RD\tGT\t0/0\t0/1\n"; + final Record upstreamRecord = fromString(vcfLine); + upstreamRecord.setPosition(1000); + upstreamRecord.getInfo().put(StitchFragmentedCNVs.END, new ByteSequence(Integer.toString(1999))); + final PaddedInterval upstreamInterval = new PaddedInterval(upstreamRecord); + + // we've got a record on chr1:1000-2000 with a 200-base pad. move MAX_PAD bases further downstream. 
+ final int startNotTooFar = upstreamInterval.getPaddedEnd() + StitchFragmentedCNVs.MAX_PAD; + final Record downstreamRecord = fromString(vcfLine); + downstreamRecord.setPosition(startNotTooFar); + downstreamRecord.getInfo().put(StitchFragmentedCNVs.END, new ByteSequence(Integer.toString(startNotTooFar + 1000))); + final PaddedInterval downstreamInterval = new PaddedInterval(downstreamRecord); + assert(!upstreamInterval.doneStitching(downstreamInterval)); + + // move one more base, and doneStitching should return true + downstreamRecord.setPosition(startNotTooFar + 1); + assert(upstreamInterval.doneStitching(new PaddedInterval(downstreamRecord))); + } + + public static void testStitchTo() { + final String vcfLine1 = "chr1\t1000\tID1\tN\t\t60\tPASS\tEND=1999;SVTYPE=DEL;EVIDENCE=RD\tGT\t0/0\t0/1\n"; + final Record upstreamRecord = fromString(vcfLine1); + assert(StitchFragmentedCNVs.isStitchable(upstreamRecord)); + final PaddedInterval upstreamInterval = new PaddedInterval(upstreamRecord); + final String vcfLine2 = "chr1\t2399\tID1\tN\t\t60\tPASS\tEND=3399;SVTYPE=DEL;EVIDENCE=RD\tGT\t0/0\t0/1\n"; + final Record downstreamRecord = fromString(vcfLine2); + assert(StitchFragmentedCNVs.isStitchable(downstreamRecord)); + final PaddedInterval stitched = upstreamInterval.stitchTo(new PaddedInterval(downstreamRecord)); + assert(stitched != null); + assert(stitched.getRecord().getPosition() == 1000); + assert(stitched.getVCFEnd() == 3399); + + // fails because no overlap (padded intervals are adjacent) + downstreamRecord.setPosition(2400); + assert(upstreamInterval.stitchTo(new PaddedInterval(downstreamRecord)) == null); + + // back to starting conditions + downstreamRecord.setPosition(2399); + assert(upstreamInterval.stitchTo(new PaddedInterval(downstreamRecord)) != null); + + // fails because event types don't match + downstreamRecord.getInfo().put(StitchFragmentedCNVs.SVTYPE, StitchFragmentedCNVs.SVTYPE_DUP); + assert(upstreamInterval.stitchTo(new 
PaddedInterval(downstreamRecord)) == null); + + // back to starting conditions + downstreamRecord.getInfo().put(StitchFragmentedCNVs.SVTYPE, StitchFragmentedCNVs.SVTYPE_DEL); + assert(upstreamInterval.stitchTo(new PaddedInterval(downstreamRecord)) != null); + + // overlaps upstream interval too much + downstreamRecord.setPosition(1799); + assert(upstreamInterval.stitchTo(new PaddedInterval(downstreamRecord)) == null); + + // back to starting conditions + downstreamRecord.setPosition(2399); + assert(upstreamInterval.stitchTo(new PaddedInterval(downstreamRecord)) != null); + + // overlaps downstream interval too much + downstreamRecord.setPosition(1899); + downstreamRecord.getInfo().put(StitchFragmentedCNVs.END, new ByteSequence(Integer.toString(2399))); + assert(upstreamInterval.stitchTo(new PaddedInterval(downstreamRecord)) == null); + + // back to starting conditions + downstreamRecord.setPosition(2399); + downstreamRecord.getInfo().put(StitchFragmentedCNVs.END, new ByteSequence(Integer.toString(3399))); + assert(upstreamInterval.stitchTo(new PaddedInterval(downstreamRecord)) != null); + + // genotypes don't match + downstreamRecord.getGenotypes().get(0).set(0, new ByteSequence("0/1")); + assert(upstreamInterval.stitchTo(new PaddedInterval(downstreamRecord)) == null); + } + + private static Record fromString( final String vcfLine ) { + return new VCFParser(new ByteArrayInputStream(vcfLine.getBytes())).nextRecord(); + } +} diff --git a/src/sv-pipeline/java/VCFParser.java b/src/sv-pipeline/java/org/broadinstitute/svpipeline/VCFParser.java similarity index 54% rename from src/sv-pipeline/java/VCFParser.java rename to src/sv-pipeline/java/org/broadinstitute/svpipeline/VCFParser.java index 6adb828e8..6c6fbbbf8 100644 --- a/src/sv-pipeline/java/VCFParser.java +++ b/src/sv-pipeline/java/org/broadinstitute/svpipeline/VCFParser.java @@ -1,3 +1,5 @@ +package org.broadinstitute.svpipeline; + import java.io.*; import java.util.*; import java.util.zip.GZIPInputStream; @@ 
-22,15 +24,13 @@ public class VCFParser implements Closeable { public VCFParser( final String pathName ) { if ( pathName == null || "-".equals(pathName) ) { this.pathName = "stdin"; - this.is = System.in instanceof BufferedInputStream ? - System.in : - new BufferedInputStream(System.in); + this.is = new BufferedInputStream(new FileInputStream(FileDescriptor.in), BUFFER_SIZE); } else { this.pathName = pathName; try { final BufferedInputStream bis = - new BufferedInputStream(new FileInputStream(pathName)); - this.is = pathName.endsWith(GZ) ? new GZIPInputStream(bis) : bis; + new BufferedInputStream(new FileInputStream(pathName), BUFFER_SIZE); + this.is = pathName.endsWith(GZ) ? new GZIPInputStream(bis, BUFFER_SIZE) : bis; } catch ( final IOException ioe ) { throw new MalformedVCFException("can't open " + pathName, ioe); } @@ -40,6 +40,14 @@ public VCFParser( final String pathName ) { } } + public VCFParser( final InputStream is ) { + this.pathName = "input VCF"; + this.is = is; + if ( !readBuffer() ) { + throw new MalformedVCFException("input VCF is empty"); + } + } + public void close() { try { is.close(); @@ -68,7 +76,7 @@ public Metadata nextMetaData() { // it's the only metadata line that doesn't start with "##" but goes: // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT (sample names) if ( bufferIterator.peek() != '#' ) { - return new Columns(captureColumns()); + return new ColumnHeaderMetadata(captureColumns()); } bufferIterator.skip(); // get the key part of the metadata, e.g., INFO in ##INFO=, or contig in ##contig= @@ -80,11 +88,11 @@ public Metadata nextMetaData() { if ( bufferIterator.peek() != '<' ) { // nope, simple value. just grab the rest of the line final ByteSequence value = capture('\n'); - return new KeyValue(key, value); + return new KeyValueMetadata(key, value); } bufferIterator.skip(); // yup. multiple values. tokenize them. 
- return new KeyAttributes(key, captureAttributes()); + return new KeyAttributesMetadata(key, captureAttributes()); } /** once we've had hasMetadata return false (line doesn't start with '#') @@ -128,18 +136,6 @@ private boolean readBuffer() { return bufferIterator.hasNext(); } - private void expect( final String expect ) { - final int expectLen = expect.length(); - for ( int iii = 0; iii < expectLen; ++iii ) { - needData(); - final byte nextByte = bufferIterator.next(); - if ( expect.charAt(iii) != nextByte ) { - throw new MalformedVCFException("expected " + expect + " but found " + - expect.substring(0, iii) + (char)nextByte + "..."); - } - } - } - /** grab the sequence of bytes up to the specified delimiter */ private ByteSequence capture( final char delim ) { ByteSequence prefix = null; @@ -191,7 +187,10 @@ private List captureAttributes() { (bs == null ? prefix : new ByteSequence(prefix, bs)); attributes.add(new KeyValue(key, value)); } while ( finalByte != '>' ); - expect("\n"); + needData(); + if ( bufferIterator.next() != '\n' ) { + throw new MalformedVCFException("unexpected characters at end of metadata line"); + } return attributes; } @@ -315,20 +314,67 @@ public ByteSequence( final ByteSequence seq1, final ByteSequence seq2 ) { end = buffer.length; } + public ByteSequence( final ByteSequence... 
seqs ) { + int totalLen = 0; + for ( final ByteSequence seq : seqs ) { + totalLen += seq.length(); + } + buffer = new byte[totalLen]; + start = 0; + end = totalLen; + int curLen = 0; + for ( final ByteSequence seq : seqs ) { + final int len = seq.length(); + System.arraycopy(seq.buffer, seq.start, buffer, curLen, len); + curLen += len; + } + } + + public ByteSequence( final List pieces, final char delim ) { + final int nPieces = pieces.size(); + int totalLen = 0; + if ( nPieces > 0 ) { + totalLen = nPieces - 1; // this many delimiters + for ( final ByteSequence piece : pieces ) { + totalLen += piece.length(); + } + } + buffer = new byte[totalLen]; + start = 0; + end = totalLen; + if ( nPieces > 0 ) { + ByteSequence piece = pieces.get(0); + int destIdx = piece.length(); + System.arraycopy(piece.buffer, piece.start, buffer, 0, destIdx); + for ( int pieceIdx = 1; pieceIdx < nPieces; ++pieceIdx ) { + buffer[destIdx++] = (byte)delim; + piece = pieces.get(pieceIdx); + int len = piece.length(); + System.arraycopy(piece.buffer, piece.start, buffer, destIdx, len); + destIdx += len; + } + } + } + public int length() { return end - start; } - public ByteSequence replace( final ByteSequence oldValue, final ByteSequence newValue ) { - if ( buffer != oldValue.buffer ) { - throw new IllegalStateException("oldValue not drawn from INFO field"); + public boolean contains( final ByteSequence subSeq ) { + final int len = subSeq.length(); + final int stop = end - len; + for ( int idx = start; idx <= stop; ++idx ) { + int idx1 = idx; + int idx2 = subSeq.start; + int nnn = len; + while ( nnn-- > 0 ) { + if ( buffer[idx1++] != subSeq.buffer[idx2++] ) { + break; + } + } + if ( nnn < 0 ) { + return true; + } } - final int length = length(); - final int newLen = newValue.length(); - final byte[] newBuf = new byte[length + newLen - oldValue.length()]; - final int len1 = oldValue.start - start; - System.arraycopy(buffer, start, newBuf, 0, len1); - System.arraycopy(newValue.buffer, 
newValue.start, newBuf, len1, newLen); - System.arraycopy(buffer, oldValue.end, newBuf, len1 + newLen, end - oldValue.end); - return new ByteSequence(newBuf, 0, newBuf.length); + return false; } public int asInt() { @@ -365,7 +411,7 @@ public List split( final char delim ) { mark = itr.mark(); } } - splits.add(itr.getSequenceNoDelim(mark)); + splits.add(itr.getSequence(mark)); return splits; } @@ -392,7 +438,7 @@ public void write( final OutputStream os ) throws IOException { } public boolean equals( final ByteSequence that ) { - if ( length() != that.length() ) return false; + if ( that == null || length() != that.length() ) return false; int idx2 = that.start; for ( int idx = start; idx < end; ++idx ) { if ( buffer[idx] != that.buffer[idx2++] ) return false; @@ -401,20 +447,13 @@ public boolean equals( final ByteSequence that ) { } } - enum MetadataType { - KeyValue, - KeyAttributes, - Columns - } - public interface Metadata { - MetadataType getType(); ByteSequence getKey(); Object getValue(); void write( OutputStream os ) throws IOException; } - public static final class KeyValue implements Metadata { + public static final class KeyValue { private final ByteSequence key; private final ByteSequence value; @@ -423,32 +462,49 @@ public KeyValue( final ByteSequence key, final ByteSequence value ) { this.value = value; } - @Override public MetadataType getType() { return MetadataType.KeyValue; } - @Override public ByteSequence getKey() { return key; } - @Override public ByteSequence getValue() { return value; } + public ByteSequence getKey() { return key; } + public ByteSequence getValue() { return value; } + + public void write( final OutputStream os ) throws IOException { + key.write(os); + if ( value != null ) { + os.write('='); + value.write(os); + } + } + + @Override public String toString() { return key + "=" + value; } + } + + public static final class KeyValueMetadata implements Metadata { + private final KeyValue keyValue; + + public KeyValueMetadata( final 
ByteSequence key, final ByteSequence value ) { + keyValue = new KeyValue(key, value); + } + + @Override public ByteSequence getKey() { return keyValue.getKey(); } + @Override public ByteSequence getValue() { return keyValue.getValue(); } @Override public void write( final OutputStream os ) throws IOException { os.write('#'); os.write('#'); - key.write(os); - os.write('='); - value.write(os); + keyValue.write(os); os.write('\n'); } - @Override public String toString() { return "##" + key + "=" + value; } + @Override public String toString() { return keyValue.toString(); } } - public static final class KeyAttributes implements Metadata { + public static final class KeyAttributesMetadata implements Metadata { private final ByteSequence key; private final List values; - public KeyAttributes( final ByteSequence key, final List values ) { + public KeyAttributesMetadata( final ByteSequence key, final List values ) { this.key = key; this.values = values; } - @Override public MetadataType getType() { return MetadataType.KeyAttributes; } @Override public ByteSequence getKey() { return key; } @Override public List getValue() { return values; } @@ -460,9 +516,7 @@ public KeyAttributes( final ByteSequence key, final List values ) { int prefix = '<'; for ( final KeyValue kv : values ) { os.write(prefix); - kv.getKey().write(os); - os.write('='); - kv.getValue().write(os); + kv.write(os); prefix = ','; } os.write('>'); @@ -471,7 +525,7 @@ public KeyAttributes( final ByteSequence key, final List values ) { @Override public String toString() { final StringBuilder sb = new StringBuilder(); - sb.append("##").append(key).append("="); + sb.append(key).append("="); char prefix = '<'; for ( final KeyValue kv : values ) { sb.append(prefix).append(kv.getKey()).append('=').append(kv.getValue()); @@ -481,14 +535,13 @@ public KeyAttributes( final ByteSequence key, final List values ) { } } - public static final class Columns implements Metadata { + public static final class 
ColumnHeaderMetadata implements Metadata { private final List columns; - public Columns( final List columns ) { + public ColumnHeaderMetadata( final List columns ) { this.columns = columns; } - @Override public MetadataType getType() { return MetadataType.Columns; } @Override public ByteSequence getKey() { return EMPTY_SEQUENCE; } @Override public List getValue() { return columns; } @@ -513,112 +566,315 @@ public Columns( final List columns ) { } } + /** a field like format and genotype with delimited subfields */ + public static final class CompoundField extends AbstractList { + private ByteSequence value; + private final char delim; + private List subFields; + + public CompoundField( final ByteSequence value, final char delim ) { + this.value = value; + this.delim = delim; + subFields = null; + } + + public CompoundField( final List vals, final char delim ) { + this.value = null; + this.delim = delim; + this.subFields = vals; + } + + public ByteSequence getValue() { + if ( value == null ) { + value = new ByteSequence(subFields, delim); + } + return value; + } + + public void write( final OutputStream os ) throws IOException { + if ( value != null ) value.write(os); + else { + int len = subFields.size(); + if ( len <= 0 ) { + os.write('.'); + } else { + subFields.get(0).write(os); + for ( int idx = 1; idx < len; ++idx ) { + os.write(delim); + subFields.get(idx).write(os); + } + } + } + } + + @Override public int size() { + populateSubFields(); + return subFields.size(); + } + + @Override public ByteSequence get( final int index ) { + populateSubFields(); + return subFields.get(index); + } + + @Override public ByteSequence set( final int index, final ByteSequence val ) { + populateSubFields(); + value = null; + return subFields.set(index, val); + } + + @Override public void add( final int index, final ByteSequence val ) { + populateSubFields(); + value = null; + subFields.add(index, val); + } + + @Override public ByteSequence remove( final int index ) { + 
populateSubFields(); + value = null; + return subFields.remove(index); + } + + @Override public boolean equals( final Object obj ) { + if ( this == obj ) return true; + if ( !(obj instanceof CompoundField) ) return false; + return getValue().equals(((CompoundField)obj).getValue()); + } + @Override public int hashCode() { + return getValue().hashCode(); + } + @Override public String toString() { return getValue().toString(); } + + private void populateSubFields() { + if ( subFields == null ) { + subFields = value.split(delim); + } + } + } + + /** the info subfields are semicolon delimited and contain key/value pairs */ + public static final class InfoField extends AbstractMap { + private ByteSequence value; + private LinkedHashMap subFields; + + public InfoField( final ByteSequence value ) { + this.value = value; + subFields = null; + } + + public ByteSequence getValue() { + if ( value == null ) { + final ByteArrayOutputStream os = new ByteArrayOutputStream(); + try { + write(os); + } catch ( final IOException ioe ) { + throw new IllegalStateException("IOException when writing to ByteArrayOutputStream!?"); + } + final byte[] buffer = os.toByteArray(); + value = new ByteSequence(buffer, 0, buffer.length); + } + return value; + } + + public void write( final OutputStream os ) throws IOException { + if ( value != null ) { + value.write(os); + } else if ( subFields.isEmpty() ) { + os.write('.'); + } else { + boolean needSep = false; + for ( final Map.Entry entry : subFields.entrySet() ) { + if ( needSep ) { + os.write(';'); + } + needSep = true; + entry.getKey().write(os); + final ByteSequence value = entry.getValue(); + if ( value != null ) { + os.write('='); + value.write(os); + } + } + } + } + + @Override public Set> entrySet() { + populateSubFields(); + return subFields.entrySet(); + } + + @Override public boolean containsKey( final Object key ) { + populateSubFields(); + return subFields.containsKey(key); + } + + @Override public ByteSequence get( final Object key 
) { + populateSubFields(); + return subFields.get(key); + } + + @Override public ByteSequence put( final ByteSequence key, final ByteSequence val ) { + populateSubFields(); + value = null; + return subFields.put(key, val); + } + + @Override public ByteSequence remove( final Object key ) { + populateSubFields(); + if ( containsKey(key) ) { + value = null; + } + return subFields.remove(key); + } + + private void populateSubFields() { + if ( subFields == null ) { + subFields = new LinkedHashMap<>(); + final ByteIterator itr = value.iterator(); + int mark = itr.mark(); + ByteSequence key = null; + while ( itr.hasNext() ) { + byte nextByte = itr.next(); + if ( nextByte == '=' ) { + key = itr.getSequenceNoDelim(mark); + mark = itr.mark(); + } else if ( nextByte == ';' ) { + if ( key == null ) { + subFields.put(itr.getSequenceNoDelim(mark), null); + } else { + subFields.put(key, itr.getSequenceNoDelim(mark)); + } + key = null; + mark = itr.mark(); + } + } + if ( key == null ) { + subFields.put(itr.getSequence(mark), null); + } else { + subFields.put(key, itr.getSequence(mark)); + } + } + } + } + /** a line of data from the VCF */ public static final class Record { private static final int UNINITIALIZED = -1; - private final List columns; - private List infoKeyValues = null; + private final List simpleFields; + private CompoundField filters; + private InfoField infos; + private CompoundField formats; + private final List genotypes; + private int position = UNINITIALIZED; private int quality = UNINITIALIZED; - public Record( final List columns ) { - this.columns = columns; + public Record( final List vals ) { + simpleFields = new ArrayList<>(vals.subList(0, 6)); + filters = new CompoundField(vals.get(6), ';'); + infos = new InfoField(vals.get(7)); + final int nVals = vals.size(); + formats = nVals > 8 ? 
new CompoundField(vals.get(8), ':') : null; + genotypes = new ArrayList<>(Math.max(0, nVals - 9)); + for ( int idx = 9; idx < nVals; ++idx ) { + genotypes.add(new CompoundField(vals.get(idx), ':')); + } } - public ByteSequence getChromosome() { return columns.get(0); } + public ByteSequence getChromosome() { return simpleFields.get(0); } + public void setChromosome( final ByteSequence val ) { simpleFields.set(0, val); } public int getPosition() { if ( position == UNINITIALIZED ) { - position = columns.get(1).asInt(); + position = simpleFields.get(1).asInt(); } return position; } + public void setPosition( final int pos ) { + setPosition(new ByteSequence(Integer.toString(pos))); + } + public void setPosition( final ByteSequence val ) { + simpleFields.set(1, val); + position = UNINITIALIZED; + } + + public ByteSequence getID() { return simpleFields.get(2); } + public void setID( final ByteSequence val ) { simpleFields.set(2, val); } + + public ByteSequence getRef() { return simpleFields.get(3); } + public void setRef( final ByteSequence val ) { simpleFields.set(3, val); } - public ByteSequence getID() { return columns.get(2); } - public ByteSequence getRef() { return columns.get(3); } - public ByteSequence getAlt() { return columns.get(4); } + public ByteSequence getAlt() { return simpleFields.get(4); } + public void setAlt( final ByteSequence val ) { simpleFields.set(4, val); } public int getQuality() { if ( quality == UNINITIALIZED ) { - quality = columns.get(5).asInt(); + quality = simpleFields.get(5).asInt(); } return quality; } - - public ByteSequence getFilter() { return columns.get(6); } - - public List getInfo() { - if ( infoKeyValues == null ) { - infoKeyValues = parseKVs(columns.get(7)); - } - return infoKeyValues; + public void setQuality( final ByteSequence val ) { + simpleFields.set(5, val); + quality = UNINITIALIZED; } - public ByteSequence getInfoField( final ByteSequence key ) { - final List infoKeyValues = getInfo(); - for ( final KeyValue kv : 
infoKeyValues ) { - if ( key.equals(kv.getKey()) ) return kv.getValue(); - } - return null; + public CompoundField getFilter() { return filters; } + public void setFilter( final ByteSequence val ) { + filters = new CompoundField(val, ';'); } - - public void setInfoField( final ByteSequence oldValue, final ByteSequence newValue ) { - infoKeyValues = null; - columns.set(7, columns.get(7).replace(oldValue, newValue)); + public void setFilter( final List vals ) { + filters = new CompoundField(vals, ';'); } - public Map getInfoAsMap() { - final List infoList = getInfo(); - final Map infoMap = new HashMap<>(infoList.size() * 2); - infoList.forEach(kv -> infoMap.put(kv.getKey(), kv.getValue())); - return infoMap; + public InfoField getInfo() { + return infos; } + public void setInfo( final ByteSequence val ) { infos = new InfoField(val); } - public ByteSequence getFormat() { return columns.size() > 8 ? columns.get(8) : null; } + public CompoundField getFormat() { return formats; } + public void setFormat( final ByteSequence val ) { + formats = new CompoundField(val, ':'); + } - public List getGenotypes() { - return columns.size() > 9 ? 
columns.subList(9, columns.size()) : Collections.emptyList(); + public List getGenotypes() { return genotypes; } + public void setGenotypes( final List vals ) { + genotypes.clear(); + for ( final ByteSequence val : vals ) { + genotypes.add(new CompoundField(val, ':')); + } } public void write( final OutputStream os ) throws IOException { - final int nCols = columns.size(); - columns.get(0).write(os); - for ( int iii = 1; iii < nCols; ++iii ) { + simpleFields.get(0).write(os); + for ( int idx = 1; idx < 6; ++idx ) { os.write('\t'); - columns.get(iii).write(os); + simpleFields.get(idx).write(os); + } + os.write('\t'); + filters.write(os); + os.write('\t'); + infos.write(os); + if ( formats != null ) { + os.write('\t'); + formats.write(os); + for ( final CompoundField genotype : genotypes ) { + os.write('\t'); + genotype.write(os); + } } os.write('\n'); } - @Override public String toString() { + @Override + public String toString() { final StringBuilder sb = new StringBuilder(); String prefix = ""; - for ( final ByteSequence col : columns ) { - sb.append(prefix).append(col); + for ( final ByteSequence field : simpleFields ) { + sb.append(prefix).append(field.toString()); prefix = "\t"; } return sb.toString(); } - - private static List parseKVs( final ByteSequence bs ) { - final List attributes = new ArrayList<>(); - final ByteIterator itr = bs.iterator(); - int mark = itr.mark(); - ByteSequence key = null; - while ( itr.hasNext() ) { - byte nextByte = itr.next(); - if ( nextByte == '=' ) { - key = itr.getSequenceNoDelim(mark); - mark = itr.mark(); - } else if ( nextByte == ';' ) { - attributes.add(new KeyValue(key, itr.getSequenceNoDelim(mark))); - key = null; - mark = itr.mark(); - } - } - attributes.add(new KeyValue(key, itr.getSequence(mark))); - return attributes; - } } } diff --git a/src/sv-pipeline/java/org/broadinstitute/svpipeline/VCFParserUnitTest.java b/src/sv-pipeline/java/org/broadinstitute/svpipeline/VCFParserUnitTest.java new file mode 100644 index 
000000000..08e3a2339 --- /dev/null +++ b/src/sv-pipeline/java/org/broadinstitute/svpipeline/VCFParserUnitTest.java @@ -0,0 +1,267 @@ +package org.broadinstitute.svpipeline; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import org.broadinstitute.svpipeline.VCFParser.*; + +public final class VCFParserUnitTest { + public static void main( final String[] args ) { + testAsserts(); + testEmptyFile(); + testFileFormatMetadata(); + testFilter(); + testColumnHeaders(); + testRecord(); + testRoundTrip(); + System.out.println("OK"); + } + + public static void testAsserts() { + boolean caughtIt = false; + try { + assert(false); + } catch ( final AssertionError ae ) { + caughtIt = true; + } + if ( !caughtIt ) { + throw new AssertionError("assertions aren't turned on (with -ea), so you're not testing anything."); + } + } + + public static void testEmptyFile() { + boolean caughtIt = false; + try ( final VCFParser parser = new VCFParser("/dev/null") ) { + assert(!parser.hasMetadata()); + } catch ( final MalformedVCFException emptyVCF ) { + caughtIt = true; + } + assert(caughtIt); + } + + public static void testFileFormatMetadata() { + final byte[] bytes = "##fileformat=VCFv4.2\n".getBytes(); + final VCFParser parser = new VCFParser(new ByteArrayInputStream(bytes)); + assert(parser.hasMetadata()); + final Metadata metadata = parser.nextMetaData(); + assert(metadata instanceof KeyValueMetadata); + final KeyValueMetadata kvMetadata = (KeyValueMetadata)metadata; + assert(kvMetadata.getKey().equals(new ByteSequence("fileformat"))); + assert(kvMetadata.getValue().equals(new ByteSequence("VCFv4.2"))); + assert(!parser.hasMetadata()); + assert(!parser.hasRecord()); + try ( final ByteArrayOutputStream os = new ByteArrayOutputStream() ) { + metadata.write(os); + assert(Arrays.equals(bytes, os.toByteArray())); + } catch ( final IOException ioe ) { 
+ throw new RuntimeException(ioe); + } + parser.close(); + } + + public static void testFilter() { + final byte[] bytes = "##FILTER=\n".getBytes(); + final VCFParser parser = new VCFParser(new ByteArrayInputStream(bytes)); + assert(parser.hasMetadata()); + final Metadata metadata = parser.nextMetaData(); + assert(metadata instanceof KeyAttributesMetadata); + final KeyAttributesMetadata kaMetadata = (KeyAttributesMetadata)metadata; + assert(kaMetadata.getKey().equals(new ByteSequence("FILTER"))); + final List kaValues = kaMetadata.getValue(); + assert(kaValues.size() == 2); + final KeyValue kv0 = kaValues.get(0); + assert(kv0.getKey().equals(new ByteSequence("ID"))); + assert(kv0.getValue().equals(new ByteSequence("PASS"))); + final KeyValue kv1 = kaValues.get(1); + assert(kv1.getKey().equals(new ByteSequence("Description"))); + assert(kv1.getValue().equals(new ByteSequence("\"All filters passed\""))); + assert(!parser.hasMetadata()); + assert(!parser.hasRecord()); + try ( final ByteArrayOutputStream os = new ByteArrayOutputStream() ) { + metadata.write(os); + assert(Arrays.equals(bytes, os.toByteArray())); + } catch ( final IOException ioe ) { + throw new RuntimeException(ioe); + } + } + + public static void testColumnHeaders() { + final String line = "CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE1\tSAMPLE2"; + final byte[] bytes = ("#" + line + "\n").getBytes(); + final VCFParser parser = new VCFParser(new ByteArrayInputStream(bytes)); + assert(parser.hasMetadata()); + final Metadata metadata = parser.nextMetaData(); + assert(metadata instanceof ColumnHeaderMetadata); + final ColumnHeaderMetadata columns = (ColumnHeaderMetadata)metadata; + final List cols = columns.getValue(); + final String[] splitLine = line.split("\t"); + assert(splitLine.length == cols.size()); + for ( int idx = 0; idx < splitLine.length; ++idx ) { + assert(cols.get(idx).equals(new ByteSequence(splitLine[idx]))); + } + assert(!parser.hasMetadata()); + 
assert(!parser.hasRecord()); + try ( final ByteArrayOutputStream os = new ByteArrayOutputStream() ) { + metadata.write(os); + assert(Arrays.equals(bytes, os.toByteArray())); + } catch ( final IOException ioe ) { + throw new RuntimeException(ioe); + } + } + + public static void testRecord() { + final String line = "chr1\t10000\tna19240_DUP_chr1_1\tN\t\t999\tPASS;BUT_FUNKY\t" + + "END=16000;SVTYPE=DUP;FLAG1;CHR2=chr1;SVLEN=6000;ALGORITHMS=depth;EVIDENCE=RD;FLAG2\t" + + "GT:GQ:RD_CN:RD_GQ:PE_GT:PE_GQ:SR_GT:SR_GQ:EV\t0/1:142:3:142:.:.:.:.:RD\t" + + "0/0:999:2:999:.:.:.:.:RD"; + final byte[] bytes = (line + "\n").getBytes(); + final VCFParser parser = new VCFParser(new ByteArrayInputStream(bytes)); + assert(!parser.hasMetadata()); + assert(parser.hasRecord()); + final Record record = parser.nextRecord(); + assert(!parser.hasMetadata()); + assert(!parser.hasRecord()); + final String[] cols = line.split("\t"); + + assert(record.getChromosome().equals(new ByteSequence(cols[0]))); + final ByteSequence newChr = new ByteSequence("chr1"); + record.setChromosome(newChr); + assert(record.getChromosome().equals(newChr)); + + final int curPosition = record.getPosition(); + assert(curPosition == Integer.parseInt(cols[1])); + final ByteSequence newPos = new ByteSequence("10001"); + record.setPosition(newPos); + final int newPosition = record.getPosition(); + assert(newPosition == 10001); + + assert(record.getID().equals(new ByteSequence(cols[2]))); + final ByteSequence newID = new ByteSequence("newID"); + record.setID(newID); + assert(record.getID().equals(newID)); + + assert(record.getRef().equals(new ByteSequence(cols[3]))); + final ByteSequence newRef = new ByteSequence("A"); + record.setRef(newRef); + assert(record.getRef().equals(newRef)); + + assert(record.getAlt().equals(new ByteSequence(cols[4]))); + final ByteSequence newAlt = new ByteSequence("C"); + record.setAlt(newAlt); + assert(record.getAlt().equals(newAlt)); + + final int curQuality = record.getQuality(); + 
assert(curQuality == Integer.parseInt(cols[5])); + final ByteSequence newQual = new ByteSequence("1"); + record.setQuality(newQual); + final int newQuality = record.getQuality(); + assert(newQuality == 1); + + final CompoundField filters = record.getFilter(); + final ByteSequence originalFilters = new ByteSequence(cols[6]); + final ByteSequence curFilters = filters.getValue(); + assert(curFilters.equals(originalFilters)); + assert(filters.size() == 2); + assert(filters.get(0).equals(new ByteSequence("PASS"))); + assert(filters.get(1).equals(new ByteSequence("BUT_FUNKY"))); + final ByteSequence failFilter = new ByteSequence("FAIL"); + filters.set(0, failFilter); + assert(filters.get(0).equals(failFilter)); + final ByteSequence newFilters = filters.getValue(); + assert(newFilters.equals(new ByteSequence("FAIL;BUT_FUNKY"))); + record.setFilter(originalFilters); + final CompoundField revisedFilters = record.getFilter(); + final ByteSequence newerFilters = revisedFilters.getValue(); + assert(newerFilters.equals(originalFilters)); + revisedFilters.add(revisedFilters.remove(0)); + final ByteSequence newestFilters = revisedFilters.getValue(); + assert(newestFilters.equals(new ByteSequence("BUT_FUNKY;PASS"))); + + final InfoField info = record.getInfo(); + final ByteSequence originalInfo = new ByteSequence(cols[7]); + final ByteSequence curInfo = info.getValue(); + assert(curInfo.equals(originalInfo)); + final String[] infoVals = cols[7].split(";"); + assert(info.size() == infoVals.length); + for ( final String val : infoVals ) { + final String[] kv = val.split("="); + final ByteSequence key = new ByteSequence(kv[0]); + assert(info.containsKey(key)); + if ( kv.length > 1 ) { + assert(info.get(key).equals(new ByteSequence(kv[1]))); + } else { + assert(info.get(key) == null); + } + } + final ByteSequence svLenKey = new ByteSequence("SVLEN"); + final ByteSequence newSVLen = new ByteSequence("6001"); + info.put(svLenKey, newSVLen); + assert(info.get(svLenKey).equals(newSVLen)); 
+ info.put(svLenKey, new ByteSequence("6000")); + final ByteSequence newInfoValue = info.getValue(); + assert(newInfoValue.equals(originalInfo)); + final ByteSequence flag1Key = new ByteSequence("FLAG1"); + info.remove(flag1Key); + assert(info.get(flag1Key) == null); + final ByteSequence flag2Key = new ByteSequence("FLAG2"); + record.setInfo(flag2Key); + final InfoField newInfo = record.getInfo(); + assert(!newInfo.containsKey(flag1Key)); + assert(newInfo.containsKey(flag2Key)); + + final CompoundField format = record.getFormat(); + final ByteSequence originalFormat = new ByteSequence(cols[8]); + final ByteSequence curFormat = format.getValue(); + assert(curFormat.equals(originalFormat)); + record.setFormat(new ByteSequence("GT")); + assert(record.getFormat().size() == 1); + + final List genotypes = record.getGenotypes(); + assert(genotypes.size() == 2); + final ByteSequence geno1 = genotypes.get(0).getValue(); + final ByteSequence geno1Value = new ByteSequence("0/1:142:3:142:.:.:.:.:RD"); + assert(geno1.equals(geno1Value)); + final ByteSequence geno2 = genotypes.get(1).getValue(); + final ByteSequence geno2Value = new ByteSequence("0/0:999:2:999:.:.:.:.:RD"); + assert(geno2.equals(geno2Value)); + record.setGenotypes(Collections.singletonList(geno2Value)); + final List newGenotypes = record.getGenotypes(); + assert(newGenotypes.size() == 1); + final ByteSequence newGeno2Value = newGenotypes.get(0).getValue(); + assert(newGeno2Value.equals(geno2Value)); + } + + public static void testRoundTrip() { + final StringBuilder sb = new StringBuilder(100000); + for ( int idx = 0; idx < 1000; ++idx ) { + buildLine(idx, sb); + } + final byte[] bytes = sb.toString().getBytes(); + final ByteArrayInputStream is = new ByteArrayInputStream(bytes); + final VCFParser parser = new VCFParser(is); + final ByteArrayOutputStream os = new ByteArrayOutputStream(100000); + while ( parser.hasRecord() ) { + final Record record = parser.nextRecord(); + try { + record.write(os); + } catch ( 
final IOException ioe ) { + throw new RuntimeException("unexpected IOException"); + } + } + parser.close(); + assert(Arrays.equals(os.toByteArray(),bytes)); + } + + private static void buildLine( final int idx, final StringBuilder sb ) { + final int pos = 10000 * idx; + sb.append("chr1\t").append(10000+100*idx).append('\t').append("Event").append(idx).append('\t'); + sb.append("N\t").append("\t").append("999\t").append("PASS\t"); + sb.append("END=").append(pos+999).append('\t'); + sb.append("SVTYPE=DUP;CHR2=chr1;SVLEN=1000;ALGORITHMS=depth;EVIDENCE=RD\t"); + sb.append("GT:GQ:RD_CN\t").append("0/1:999:2\t").append("0/0:999:1\n"); + } +} diff --git a/src/sv-pipeline/scripts/hailmerge.py b/src/sv-pipeline/scripts/hailmerge.py new file mode 100644 index 000000000..b984738e4 --- /dev/null +++ b/src/sv-pipeline/scripts/hailmerge.py @@ -0,0 +1,79 @@ +import hail as hl +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument('out_bucket') +parser.add_argument('cluster_name') +args = parser.parse_args() + +files = [f.rstrip() for f in open("files.list", "r").readlines()] + +# Define custom reference with only primary contigs, otherwise Hail adds all GRCh38 contigs, +# which may be problematic downstream + +contigs = [ + "chr1", + "chr2", + "chr3", + "chr4", + "chr5", + "chr6", + "chr7", + "chr8", + "chr9", + "chr10", + "chr11", + "chr12", + "chr13", + "chr14", + "chr15", + "chr16", + "chr17", + "chr18", + "chr19", + "chr20", + "chr21", + "chr22", + "chrX", + "chrY" +] + +lengths = { + "chr1": 248956422, + "chr2": 242193529, + "chr3": 198295559, + "chr4": 190214555, + "chr5": 181538259, + "chr6": 170805979, + "chr7": 159345973, + "chr8": 145138636, + "chr9": 138394717, + "chr10": 133797422, + "chr11": 135086622, + "chr12": 133275309, + "chr13": 114364328, + "chr14": 107043718, + "chr15": 101991189, + "chr16": 90338345, + "chr17": 83257441, + "chr18": 80373285, + "chr19": 58617616, + "chr20": 64444167, + "chr21": 46709983, + "chr22": 50818468, + "chrX": 
156040895, + "chrY": 57227415 +} + +ref = hl.ReferenceGenome(name="hg38", contigs=contigs, lengths=lengths, x_contigs="chrX", y_contigs="chrY") +all_datasets = hl.import_vcf(files, reference_genome=ref, force_bgz=True) + +# union_rows approach causes ClassTooLargeException +# mt = hl.MatrixTable.union_rows(*all_datasets) +mt = all_datasets +# reset the qual to missing because hail by default populates it with -1.00e+01 +merged_reset_qual = mt.annotate_rows(qual=hl.missing('float64')) + +hl.export_vcf(merged_reset_qual, + "gs://{}/{}/merged.vcf.bgz".format(args.out_bucket, args.cluster_name), + metadata=hl.get_vcf_metadata(files[0])) diff --git a/src/svtest/svtest/utils/VCFUtils.py b/src/svtest/svtest/utils/VCFUtils.py index 177b8224d..8d6fb16e1 100644 --- a/src/svtest/svtest/utils/VCFUtils.py +++ b/src/svtest/svtest/utils/VCFUtils.py @@ -7,8 +7,14 @@ def get_info_field(record, name): if name not in record.info: - raise ValueError("%s info field not found: %s" % - (name, record.info.keys())) + if name == 'SVLEN': + if record.info['SVTYPE'] in ['DEL', 'DUP', 'INV']: + record.info['SVLEN'] = record.stop - record.pos + else: + record.info['SVLEN'] = -1 + else: + raise ValueError("%s info field not found: %s" % + (name, record.info.keys())) return record.info[name] diff --git a/src/svtk/scripts/svtk b/src/svtk/scripts/svtk index c6f6054b7..f05169c6b 100755 --- a/src/svtk/scripts/svtk +++ b/src/svtk/scripts/svtk @@ -25,7 +25,6 @@ usage: svtk [-h] [options] rdtest* Calculate comparative coverage statistics at CNV sites. [ PE/SR analysis ] - collect-pesr Count clipped reads and extract discordant pairs genomewide. sr-test Calculate enrichment of clipped reads at SV breakpoints. pe-test Calculate enrichment of discordant pairs at SV breakpoints.
diff --git a/src/svtk/setup.py b/src/svtk/setup.py index 66ce3df43..b967e07d6 100755 --- a/src/svtk/setup.py +++ b/src/svtk/setup.py @@ -28,7 +28,6 @@ 'pybedtools', 'cython', 'natsort', - 'boto3<=1.9.224', 'pandas', ] ) diff --git a/src/svtk/svtk/cli/__init__.py b/src/svtk/svtk/cli/__init__.py index 24766f84b..6055c9903 100644 --- a/src/svtk/svtk/cli/__init__.py +++ b/src/svtk/svtk/cli/__init__.py @@ -5,7 +5,6 @@ from .bincov import main as bincov from .rdtest2vcf import main as rdtest2vcf from .resolve import main as resolve -from .collect_pesr import main as collect_pesr from .annotate import main as annotate from .utils import vcf2bed, remote_tabix from .pesr_test import pe_test, sr_test, count_pe, count_sr diff --git a/src/svtk/svtk/cli/bedcluster.py b/src/svtk/svtk/cli/bedcluster.py index 45439cc90..b220e6411 100644 --- a/src/svtk/svtk/cli/bedcluster.py +++ b/src/svtk/svtk/cli/bedcluster.py @@ -58,15 +58,11 @@ def bedcluster(bed, frac=0.8, intersection=None): ------- clusters : list of deque of pybedtools.Interval """ - - # Get list of unique variant IDs and initialize sparse graph - variant_IDs = [interval.fields[3] for interval in bed.intervals] - G = sparse.eye(len(variant_IDs), dtype=np.uint16, format='lil') - - # Map variant IDs to graph indices - variant_indexes = {} - for i, variant in enumerate(variant_IDs): - variant_indexes[variant.strip()] = i + # Get list of unique variant IDs and map to indices on sparse graph + variant_indices = {variant_id: index for index, variant_id in enumerate( + {interval.name for interval in bed.intervals}) + } + G = sparse.eye(len(variant_indices), dtype=np.uint16, format='lil') # Self-intersect the bed if intersection is None: @@ -81,19 +77,18 @@ def bedcluster(bed, frac=0.8, intersection=None): # Link the two calls from the current line if c2.chrom != '.' 
and c1.svtype == c2.svtype: - idx1 = variant_indexes[c1.name] - idx2 = variant_indexes[c2.name] + idx1 = variant_indices[c1.name] + idx2 = variant_indices[c2.name] G[idx1, idx2] = 1 # Cluster graph n_comp, cluster_labels = csgraph.connected_components(G, connection='weak') - # Build deques of clustered Intervals - clusters = [deque() for i in range(n_comp)] - for idx, interval in enumerate(bed.intervals): - label = cluster_labels[idx] + # Build lists of clustered Intervals + clusters = [[] for _ in range(n_comp)] + for interval in bed.intervals: + label = cluster_labels[variant_indices[interval.name]] clusters[label].append(interval) - return clusters diff --git a/src/svtk/svtk/cli/collect_pesr.py b/src/svtk/svtk/cli/collect_pesr.py deleted file mode 100644 index ec82401d0..000000000 --- a/src/svtk/svtk/cli/collect_pesr.py +++ /dev/null @@ -1,313 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# - -""" -Collect split read and discordant pair data from a bam alignment. - -Split reads: The tool counts the number of reads soft-clipped in each direction -(30S121M = left-clipped, 121M30S = right-clipped) at each position in the -genome. The position of a right-clipped read is shifted by the length of its -alignment. - -Discordant pairs: The tool reduces discordant pairs to (chrA, posA, strandA, -chrB, posB, strandB). - -Unmapped reads, reads with unmapped mates, secondary and supplementary -alignments, and duplicates are excluded (SAM flag 3340). - -Collection can be performed on an S3-hosted bam. The tool will attempt to find -a local copy of the bam index in the working directory, or the directory -specified with `--index-dir`, otherwise the index will be downloaded. 
-""" - -import argparse -import sys -from collections import defaultdict, deque -import numpy as np -import pysam -from natsort import natsorted -import svtk.utils as svu - - -class PESRCollection: - def __init__(self, bam, splitfile, discfile, sample='.', - max_split_dist=300): - self.bam = bam - self.splitfile = splitfile - self.discfile = discfile - self.sample = sample - - # SR evidence - self.right_split_counts = defaultdict(int) - self.left_split_counts = defaultdict(int) - self.prev_split_pos = None - self.curr_chrom = None - self.max_split_dist = max_split_dist - - # PE evidence - self.disc_pairs = deque() - self.observed_disc_names = {} - self.curr_disc_pos = -1 - - def collect_pesr(self): - """ - Collect PE and SR evidence from a BAM file. - - Excludes unmapped reads, reads with an unmapped mate, duplicate reads, - and secondary or supplementary alignments. Reads are considered split - if their CIGAR string contains a soft clip operation. - """ - - for read in self.bam: - # Restrict to unique primary alignments with a mapped mate - # Equivalent to `samtools view -F 3340` - if svu.is_excluded(read): - continue - - # Soft clip indicate a candidate split read - if svu.is_soft_clipped(read): - if self.splitfile is not None: - self.count_split(read) - - # After counting splits, evaluate discordant pairs - if not read.is_proper_pair: - if self.discfile is not None: - self.report_disc(read) - - self.flush_split_counts() - self.flush_disc_pairs() - - def report_disc(self, read): - """ - Report simplified discordant pair info. 
- - Parameters - ---------- - read : pysam.AlignedSegment - """ - - # Stack up all discordant pairs at a position, then sort - # and write out in chunks - if read.reference_start != self.curr_disc_pos: - self.flush_disc_pairs() - self.curr_disc_pos = read.reference_start - - # Avoid double-counting translocations by requiring chrA < chrB - if read.reference_id < read.next_reference_id: - self.disc_pairs.append(read) - - # If interchromosomal, rely on coordinate to not double count - elif read.reference_id == read.next_reference_id: - # Report if posA < posB - if read.reference_start < read.next_reference_start: - self.disc_pairs.append(read) - - # If posA == posB, check if we've seen the read before - elif read.reference_start == read.next_reference_start: - # If we have, delete the log to save memory and skip the read - if read.query_name in self.observed_disc_names: - del self.observed_disc_names[read.query_name] - - # Otherwise, report and log it - else: - self.disc_pairs.append(read) - self.observed_disc_names[read.query_name] = 1 - - def write_disc(self, read): - """ - Write discordant pair to file. - """ - strandA = '-' if read.is_reverse else '+' - strandB = '-' if read.mate_is_reverse else '+' - - self.discfile.write( - ('%s\t%d\t%s\t%s\t%d\t%s\t%s\n' % ( - read.reference_name, read.reference_start, strandA, - read.next_reference_name, read.next_reference_start, strandB, - self.sample) - ).encode('utf-8')) - - def flush_disc_pairs(self): - """ - Write all logged discordant reads to file. - """ - def _key(read): - return (read.reference_name, read.reference_start, - read.next_reference_name, read.next_reference_start) - - # Sort by chrA/posA and chrB/posB then write to disc - for read in natsorted(self.disc_pairs, key=_key): - self.write_disc(read) - - # Reset list of reads - self.disc_pairs = deque() - - def count_split(self, read): - """ - Count splits at each position. 
- - Parameters - ---------- - read : pysam.AlignedSegment - """ - - split_positions = get_split_positions(read) - # pos, side = get_split_positions(read) - - for (pos, side) in split_positions: - # Calculate distance to previous split and update position tracker - # Use abs to catch contig switches - if self.prev_split_pos is None: - dist = 0 - else: - dist = np.abs(pos - self.prev_split_pos) - self.prev_split_pos = pos - - if self.curr_chrom is None: - self.curr_chrom = read.reference_name - - # Flush aggregated split reads if we've moved beyond the max dist - if dist > self.max_split_dist: - self.flush_split_counts() - self.curr_chrom = read.reference_name - - # Tally the split at its corresponding position - if side == 'RIGHT': - self.right_split_counts[pos] += 1 - elif side == 'LEFT': - self.left_split_counts[pos] += 1 - - def flush_split_counts(self): - """ - Write current split counts to disk and reset dictionaries - """ - - # Compile counts collected so far - entries = deque() - for clip in 'left right'.split(): - df = getattr(self, '%s_split_counts' % clip) - - for pos, count in df.items(): - entries.append((self.curr_chrom, pos, clip, count, - self.sample)) - - # Sort in chunks as we go - entries = sorted(entries, key=lambda s: s[1]) - - # Flush to disk - fmt = '%s\t%d\t%s\t%d\t%s\n' - for entry in entries: - self.splitfile.write((fmt % entry).encode('utf-8')) - - # Reset split counts - self.right_split_counts = defaultdict(int) - self.left_split_counts = defaultdict(int) - - -def get_split_positions(read): - """ - Calculate split coordinate based on read alignment and CIGAR operations. - - Support is only present for reads soft-clipped on one side, e.g. 100M51S, - as the coordinate is calculated by shifting the alignment position by the - length of the flanking match operation. 
- - Parameters - ---------- - read : pysam.AlignedSegment - - Returns - ------- - pos : int - Adjusted split read coordinate - side : str [RIGHT,LEFT,MIDDLE] - Direction of soft clip - """ - - pos = read.pos - - split_positions = [] - - # Left soft clip - sequence is already aligned to split position - if is_left_clipped(read): - split_positions.append([pos, 'LEFT']) - - # Right soft clip - add length of aligned sequence - if is_right_clipped(read): - clip_pos = pos - for operation, length in read.cigartuples: - # Only shift based on matches, ignore DEL/INS/clips - if not is_clipping_operation(operation) and operation_consumes_ref_bases(operation): - clip_pos += length - split_positions.append([clip_pos, 'RIGHT']) - - return split_positions - - -def is_left_clipped(read): - return len(read.cigartuples) >= 1 and is_clipping_operation(read.cigartuples[0][0]) - - -def is_right_clipped(read): - return len(read.cigartuples) >= 1 and is_clipping_operation(read.cigartuples[-1][0]) - - -def is_clipping_operation(operation): - return operation == 4 or operation == 5 - - -def operation_consumes_ref_bases(operation): - """ - Returns true if this is a cigar operation that consumes reference bases - """ - return operation == 0 or operation == 2 or operation == 3 or operation == 7 - - -def main(argv): - parser = argparse.ArgumentParser( - description=__doc__, - prog='svtk collect-pesr', - formatter_class=argparse.RawDescriptionHelpFormatter) - - parser.add_argument('bam', help='Local or S3 path to bam') - parser.add_argument('sample', help='ID to append to each line of output ' - 'files.') - parser.add_argument('splitfile', - help='Output split counts.') - parser.add_argument('discfile', - help='Output discordant pairs.') - - parser.add_argument('--index-dir', default=None, - help='Directory of local BAM indexes if accessing ' - 'a remote S3 bam.') - parser.add_argument('-r', '--region', - help='Tabix-formatted region to parse') - parser.add_argument('-z', '--bgzip', 
default=False, action='store_true', - help='bgzip and tabix index output') - - # Print help if no arguments specified - if len(argv) == 0: - parser.print_help() - sys.exit(1) - args = parser.parse_args(argv) - - # Load bam from S3 if necessary - if args.bam.startswith('s3://'): - bam = svu.load_s3bam(args.bam, args.index_dir) - else: - bam = pysam.AlignmentFile(args.bam) - - # Restrict to region of interest - if args.region: - bam = bam.fetch(args.region.encode('utf-8')) - - # Collect data and save - with svu.BgzipFile(args.splitfile, args.bgzip) as splitfile: - with svu.BgzipFile(args.discfile, args.bgzip) as discfile: - PESRCollection(bam, splitfile, discfile, - args.sample).collect_pesr() - - -if __name__ == '__main__': - main(sys.argv[1:]) diff --git a/src/svtk/svtk/cli/resolve.py b/src/svtk/svtk/cli/resolve.py index 3ed275d4d..ad9578cc0 100644 --- a/src/svtk/svtk/cli/resolve.py +++ b/src/svtk/svtk/cli/resolve.py @@ -12,6 +12,8 @@ import numpy as np import string from collections import deque +from operator import attrgetter +import itertools import pysam import pybedtools as pbt import svtk.utils as svu @@ -91,25 +93,16 @@ def remove_CPX_from_INV(resolve_CPX, resolve_INV): return out +def multisort(xs, specs): + for key, reverse in reversed(specs): + xs.sort(key=attrgetter(key), reverse=reverse) + return xs + + def cluster_INV(independent_INV): - inv_hash = {} - for i in independent_INV: - if i.chrom not in inv_hash.keys(): - inv_hash[i.chrom] = {} - if i.pos not in inv_hash[i.chrom].keys(): - inv_hash[i.chrom][i.pos] = {} - if i.stop not in inv_hash[i.chrom][i.pos].keys(): - inv_hash[i.chrom][i.pos][i.stop] = i - list_INV = {} - for i in inv_hash.keys(): - list_INV[i] = [] - for j in sorted(inv_hash[i].keys()): - for k in sorted(inv_hash[i][j].keys()): - list_INV[i].append(inv_hash[i][j][k]) - out = [] - for i in list_INV.keys(): - out += _cluster_INV_list(list_INV[i]) - return out + list_INV = [multisort(list(group), (('pos', False), ('stop', False))) + for 
chrom, group in itertools.groupby(independent_INV, attrgetter('chrom'))] + return [x for group in list_INV for x in _cluster_INV_list(group)] def _cluster_INV_list(independent_INV): @@ -316,13 +309,12 @@ def cluster_cleanup(clusters_v2): return [clusters_v2[i] for i in cluster_pos] -def resolve_complex_sv_v2(resolve_CPX, resolve_INV, resolve_CNV, cytobands, disc_pairs, +def resolve_complex_sv_v2(resolve_INV, cytobands, disc_pairs, mei_bed, variant_prefix='CPX_', min_rescan_support=4, pe_blacklist=None, quiet=False, SR_only_cutoff=1000, random_resolved_id_length=10): - independent_INV = remove_CPX_from_INV(resolve_CPX, resolve_INV) - linked_INV = cluster_INV(independent_INV) - clusters_v2 = link_cpx_V2(linked_INV, resolve_CNV, cpx_dist=2000) + linked_INV = cluster_INV(resolve_INV) + clusters_v2 = link_cpx_V2(linked_INV, cpx_dist=2000) clusters_v2 = cluster_cleanup(clusters_v2) np.random.seed(0) # arbitrary fixed seed for reproducibility @@ -518,9 +510,7 @@ def main(argv): # RLC: As of Sept 19, 2018, only considering inversion single-enders in second-pass # due to too many errors in second-pass linking and variant reporting - resolve_CPX = [] - resolve_CNV = [] - cpx_records_v2 = resolve_complex_sv_v2(resolve_CPX, resolve_INV, resolve_CNV, + cpx_records_v2 = resolve_complex_sv_v2(resolve_INV, cytobands, disc_pairs, mei_bed, args.prefix, args.min_rescan_pe_support, blacklist, args.quiet) diff --git a/src/svtk/svtk/cli/vcfcluster.py b/src/svtk/svtk/cli/vcfcluster.py index e0d0cc1a9..b91c137d4 100644 --- a/src/svtk/svtk/cli/vcfcluster.py +++ b/src/svtk/svtk/cli/vcfcluster.py @@ -115,7 +115,11 @@ def main(argv): help='Do not merge clustered records. 
Adds CLUSTER info fields.') parser.add_argument('--merge-only', action='store_true', default=False, - help='When run on a vcf generated with --skip-merge, only merges records with identical CLUSTER fields.') + help='When run on a vcf generated with --skip-merge, only merges records ' + 'with identical CLUSTER fields.') + parser.add_argument('--single-end', action='store_true', + default=False, + help='Require only one end to be within the minimum distance.') # parser.add_argument('--cluster-bed', type=argparse.FileType('w'), # help='Bed of constituent calls in each cluster') @@ -145,7 +149,8 @@ def main(argv): sample_overlap=args.sample_overlap, preserve_header=args.preserve_header, do_cluster=do_cluster, - do_merge=do_merge) + do_merge=do_merge, + single_end=args.single_end) # Open new file if args.fout in '- stdout'.split(): diff --git a/src/svtk/svtk/cxsv/complex_sv.py b/src/svtk/svtk/cxsv/complex_sv.py index c3e0e562f..777c46255 100644 --- a/src/svtk/svtk/cxsv/complex_sv.py +++ b/src/svtk/svtk/cxsv/complex_sv.py @@ -179,6 +179,14 @@ def clean_record(self): if len(varGQs) > 0 and 'varGQ' in self.vcf_record.header.info.keys(): self.vcf_record.info['varGQ'] = max(varGQs) + # if resolved as a CNV, ensure RD_CN and RD_GQ are set + if len(self.records) > 1 and self.vcf_record.info['SVTYPE'] in ['DEL', 'DUP', 'CNV'] and len(self.cnvs) > 0: + cnv_record = self.cnvs[0] + if 'RD_CN' in cnv_record.format.keys() and 'RD_GQ' in cnv_record.format.keys(): + for sample in self.vcf_record.samples: + self.vcf_record.samples[sample]['RD_CN'] = cnv_record.samples[sample]['RD_CN'] + self.vcf_record.samples[sample]['RD_GQ'] = cnv_record.samples[sample]['RD_GQ'] + @property def record_ids(self): return [r.id for r in self.records] @@ -245,17 +253,8 @@ def resolve_inversion(self, SR_only_cutoff): is_mei = check_mei_overlap(self.vcf_record.chrom, source_start, source_end, self.mei_bed) - # then check for RdTest support - # is_dup = check_rdtest(self.vcf_record, source_start, 
source_end, - # self.rdtest) - if is_mei: self.cpx_type = 'MEI_' + self.cpx_type.split('/')[1] - # elif is_dup: - # self.svtype = 'CPX' - # self.cpx_type = 'INV_DISPERSED_DUP' - # else: - # self.cpx_type = self.cpx_type.split('/')[1] self.vcf_record.pos = sink_start self.vcf_record.stop = sink_end @@ -480,6 +479,8 @@ def report_simple_insertion(self): record = self.insertions[0] self.cpx_type = record.alts[0].strip('<>') self.svtype = 'INS' + self.vcf_record.pos = record.pos + self.vcf_record.stop = record.stop self.vcf_record.alts = record.alts self.vcf_record.id = record.id self.vcf_record.info['SVTYPE'] = self.svtype @@ -490,6 +491,8 @@ def report_simple_insertion(self): record = self.insertions[0] self.cpx_type = record.alts[0].strip('<>') self.svtype = 'INS' + self.vcf_record.pos = record.pos + self.vcf_record.stop = record.stop self.vcf_record.id = record.id self.vcf_record.alts = record.alts self.vcf_record.info['SVTYPE'] = self.svtype @@ -508,6 +511,8 @@ def report_insertion_strip_CNVs(self): and self.cnvs[0].info['SVTYPE'] == 'DUP': record = self.cnvs[0] self.svtype = 'DUP' + self.vcf_record.pos = record.pos + self.vcf_record.stop = record.stop self.vcf_record.id = record.id self.vcf_record.alts = record.alts self.vcf_record.info['SVTYPE'] = self.svtype @@ -519,75 +524,13 @@ def report_insertion_strip_CNVs(self): else: self.svtype = 'INS' - # if len(self.breakends) > 0 and len(self.cnvs) == 0: - # record = self.insertions[0] - # self.cpx_type = record.alts[0].strip('<>') - # self.svtype = 'INS' - # self.vcf_record.alts = record.alts - # self.vcf_record.info['SVTYPE'] = self.svtype - # self.vcf_record.info['CPX_TYPE'] = self.cpx_type - # self.vcf_record.info['CHR2'] = record.info['CHR2'] - # self.vcf_record.info['SVLEN'] = record.info['SVLEN'] - # elif len(self.cnvs) == 1 and len(self.breakends) == 0: - # if self.cnvs[0].info['SVTYPE'] == 'DUP': - # record = self.cnvs[0] - # self.svtype = 'DUP' - # self.vcf_record.alts = record.alts - # 
self.vcf_record.info['SVTYPE'] = self.svtype - # self.vcf_record.info['CHR2'] = record.info['CHR2'] - # self.vcf_record.info['SVLEN'] = record.info['SVLEN'] - # else: - # self.set_unresolved() - # else: - # self.set_unresolved() - # Where Manta calls two insertions flanking a duplication, report just the dup def report_manta_tandem_dup(self): record = self.dups[0] self.cpx_type = record.alts[0].strip('<>') self.svtype = 'DUP' - self.vcf_record.alts = record.alts - self.vcf_record.info['SVTYPE'] = self.svtype - self.vcf_record.info['CPX_TYPE'] = self.cpx_type - self.vcf_record.info['CHR2'] = record.info['CHR2'] - self.vcf_record.info['SVLEN'] = record.info['SVLEN'] - - def report_single_ender(self): - # if cluster contains a single duplication, report that - # otherwise, report the first insertion record and discard all others - if len(self.cnvs) == 1: - if self.cnvs[0].info['SVTYPE'] == 'DUP': - record = self.cnvs[0] - self.svtype = 'DUP' - self.vcf_record.alts = record.alts - self.vcf_record.info['SVTYPE'] = self.svtype - self.vcf_record.info['CHR2'] = record.info['CHR2'] - self.vcf_record.info['SVLEN'] = record.info['SVLEN'] - else: - record = self.insertions[0] - self.cpx_type = record.alts[0].strip('<>') - self.svtype = 'INS' - self.vcf_record.alts = record.alts - self.vcf_record.info['SVTYPE'] = self.svtype - self.vcf_record.info['CPX_TYPE'] = self.cpx_type - self.vcf_record.info['CHR2'] = record.info['CHR2'] - self.vcf_record.info['SVLEN'] = record.info['SVLEN'] - else: - record = self.insertions[0] - self.cpx_type = record.alts[0].strip('<>') - self.svtype = 'INS' - self.vcf_record.alts = record.alts - self.vcf_record.info['SVTYPE'] = self.svtype - self.vcf_record.info['CPX_TYPE'] = self.cpx_type - self.vcf_record.info['CHR2'] = record.info['CHR2'] - self.vcf_record.info['SVLEN'] = record.info['SVLEN'] - - # Where Manta calls two insertions flanking a duplication, report just the dup - - def report_manta_tandem_dup(self): - record = self.dups[0] - 
self.cpx_type = record.alts[0].strip('<>') - self.svtype = 'DUP' + self.vcf_record.pos = record.pos + self.vcf_record.stop = record.stop self.vcf_record.alts = record.alts self.vcf_record.info['SVTYPE'] = self.svtype self.vcf_record.info['CPX_TYPE'] = self.cpx_type @@ -751,19 +694,3 @@ def check_mei_overlap(chrom, start, end, mei_bed): cov = float(i.fields[6]) return cov >= 0.5 - - -def check_rdtest(record, start, end, rdtest): - """ - Check if putative insertion has depth support - """ - - rdtest_record = record.copy() - rdtest_record.pos = start - rdtest_record.stop = end - rdtest_record.info['SVTYPE'] = 'DUP' - - if end - start < 1000: - return rdtest.test_record(rdtest_record, cutoff_type='pesr_lt1kb') - else: - return rdtest.test_record(rdtest_record, cutoff_type='pesr_gt1kb') diff --git a/src/svtk/svtk/cxsv/cpx_link.py b/src/svtk/svtk/cxsv/cpx_link.py index c03b4d88e..aab48dc1e 100644 --- a/src/svtk/svtk/cxsv/cpx_link.py +++ b/src/svtk/svtk/cxsv/cpx_link.py @@ -11,14 +11,16 @@ import numpy as np import scipy.sparse as sps import natsort -import pysam import svtk.utils as svu -def samples_overlap_records(recA, recB, upper_thresh=0.5, lower_thresh=0.5): - samplesA = set(svu.get_called_samples(recA)) - samplesB = set(svu.get_called_samples(recB)) - return samples_overlap(samplesA, samplesB, upper_thresh=upper_thresh, lower_thresh=lower_thresh) +def samples_overlap_records(recA, recB, called_samples_dict, upper_thresh=0.5, lower_thresh=0.5): + if recA.id not in called_samples_dict: + called_samples_dict[recA.id] = set(svu.get_called_samples(recA)) + if recB.id not in called_samples_dict: + called_samples_dict[recB.id] = set(svu.get_called_samples(recB)) + return samples_overlap(called_samples_dict[recA.id], called_samples_dict[recB.id], + upper_thresh=upper_thresh, lower_thresh=lower_thresh) def samples_overlap(samplesA, samplesB, upper_thresh=0.5, lower_thresh=0.5): @@ -81,7 +83,7 @@ def extract_breakpoints(vcf, bkpt_idxs): return bkpts -def link_cpx(vcf, 
bkpt_window=300, cpx_dist=2000): +def link_cpx(vcf, bkpt_window=300): """ Parameters ---------- @@ -94,19 +96,12 @@ def link_cpx(vcf, bkpt_window=300, cpx_dist=2000): # Identify breakpoints which overlap within specified window overlap = bt.window(bt, w=bkpt_window).saveas() - # Exclude self-hits - # overlap = overlap.filter(lambda b: b.fields[3] != b.fields[9]).saveas() - # Exclude intersections where two DELs or two DUPs cluster together - # cnvtypes = 'DEL DUP'.split() overlap = overlap.filter(lambda b: not ( b.fields[4] == "DEL" and b.fields[10] == "DEL")).saveas() overlap = overlap.filter(lambda b: not ( b.fields[4] == "DUP" and b.fields[10] == "DUP")).saveas() - # # Exclude intersections with annotated mobile elements (rather than BNDs) - # overlap = overlap.filter(lambda b: b.fields[4] is not re.match(re.compile('INS\:ME\:*'), b.fields[4])).saveas() - # Get linked variant IDs links = [(b[3], b[9]) for b in overlap.intervals] linked_IDs = natsort.natsorted(set(itertools.chain.from_iterable(links))) @@ -138,14 +133,6 @@ def link_cpx(vcf, bkpt_window=300, cpx_dist=2000): for i, c_label in enumerate(comp_list): clusters[c_label].append(bkpts[i]) - # # Remove clusters of only CNV - leftover from shared sample filtering - # def _ok_cluster(cluster): - # ok = any([record.info['SVTYPE'] not in cnvtypes for record in cluster]) - # return ok - - # clusters = [c for c in clusters if _ok_cluster(c)] - # clusters = [c for c in clusters if len(c) > 1] - return clusters @@ -157,118 +144,35 @@ def unify_list(list): return out -def CNV_readin_from_resolved_vcf(resolved_name, inv_intervals): - resolved_f = pysam.VariantFile(resolved_name, 'r') - # rec_a = 0 - out = [] - for i in resolved_f: - for j in inv_intervals: - if i.chrom == j[0]: - if (i.pos - j[1]) * (i.pos - j[2]) < 0 or (i.stop - j[1]) * (i.stop - j[2]) < 0: - if i.info['SVTYPE'] in ['DEL', 'DUP']: - out.append(i) - resolved_f.close() - return out - - -def link_cpx_V2(linked_INV, resolve_CNV, cpx_dist=2000): - 
linked_INV_V2 = [] +def link_cpx_V2(linked_INV, cpx_dist=2000): + overlapping_inv = [] + called_samples_dict = {} for group in linked_INV: if len(group) > 1: - for i in group: - for j in group: - if ro_calu(i, j) > 0 and samples_overlap_records(i, j): - linked_INV_V2.append([i, j]) - else: - linked_INV_V2.append([group[0]]) - inv_intervals = [] - for i in linked_INV_V2: - if len(i) > 1: - tmp = [i[0].chrom] - for j in i: - tmp += [j.pos, j.stop] - inv_intervals.append( - [tmp[0], min(unify_list(tmp[1:])), max(unify_list(tmp[1:]))]) + for i, j in itertools.combinations(group, 2): + if records_overlap(i, j) and samples_overlap_records(i, j, called_samples_dict): + overlapping_inv.append([i, j]) else: - inv_intervals.append([i[0].chrom, i[0].pos, i[0].stop]) - inv_intervals = sorted(unify_list(inv_intervals)) - # out_rec = unify_list(CNV_readin_from_resolved_vcf(resolved_name,inv_intervals) + CNV_readin_from_resolved_vcf(unresolved_name,inv_intervals)) - out_rec = resolve_CNV + overlapping_inv.append(group) cluster = [] - for i in linked_INV_V2: - if len(i) > 1: - if abs(i[1].pos - i[0].pos) > cpx_dist and abs(i[1].stop - i[0].stop) > cpx_dist: - if 'STRANDS' in i[0].info.keys() and 'STRANDS' in i[1].info.keys(): - if sorted(unify_list([i[0].info['STRANDS'], i[1].info['STRANDS']])) == ['++', '--']: - if i[0].pos < i[1].pos < i[0].stop < i[1].stop or i[1].pos < i[0].pos < i[1].stop < i[0].stop: - cpx_intervals = [[i[0].chrom, sorted([i[0].pos, i[0].stop, i[1].pos, i[1].stop])[0], sorted([i[0].pos, i[0].stop, i[1].pos, i[1].stop])[1]], [ - i[0].chrom, sorted([i[0].pos, i[0].stop, i[1].pos, i[1].stop])[2], sorted([i[0].pos, i[0].stop, i[1].pos, i[1].stop])[3]]] - CNV_close = [j for j in out_rec if ro_calu_interval([j.chrom, j.pos, j.stop], cpx_intervals[0]) > .5 and abs( - j.pos - cpx_intervals[0][1]) < cpx_dist and abs(j.stop - cpx_intervals[0][2]) < cpx_dist] - CNV_close += [j for j in out_rec if ro_calu_interval([j.chrom, j.pos, j.stop], cpx_intervals[1]) > .5 and 
abs( - j.pos - cpx_intervals[1][1]) < cpx_dist and abs(j.stop - cpx_intervals[1][2]) < cpx_dist] - cluster.append(CNV_close + i) + for inv in overlapping_inv: + if len(inv) > 1: + if abs(inv[1].pos - inv[0].pos) > cpx_dist and abs(inv[1].stop - inv[0].stop) > cpx_dist: + if 'STRANDS' in inv[0].info.keys() and 'STRANDS' in inv[1].info.keys(): + if inv[0].info['STRANDS'] != inv[1].info['STRANDS']: + if inv[0].pos < inv[1].pos < inv[0].stop < inv[1].stop \ + or inv[1].pos < inv[0].pos < inv[1].stop < inv[0].stop: + cluster.append(inv) else: - cluster.append(i) + cluster.append(inv) return cluster -def link_inv(vcf, bkpt_window=300, cpx_dist=2000): - bt = svu.vcf2bedtool(vcf.filename, annotate_ins=False) - overlap = bt.window(bt, w=bkpt_window).saveas() - overlap = overlap.filter(lambda b: not ( - b.fields[4] == "DEL" and b.fields[10] == "DEL")).saveas() - overlap = overlap.filter(lambda b: not ( - b.fields[4] == "DUP" and b.fields[10] == "DUP")).saveas() - links = [(b[3], b[9]) for b in overlap.intervals] - linked_IDs = natsort.natsorted(set(itertools.chain.from_iterable(links))) - linked_IDs = np.array(linked_IDs) - bkpt_idxs = {ID: i for i, ID in enumerate(linked_IDs)} - indexed_links = np.array([(bkpt_idxs[a], bkpt_idxs[b]) for a, b in links]) - n_bkpts = len(linked_IDs) - bkpts = extract_breakpoints(vcf, bkpt_idxs) - # Exclude wildly disparate overlaps - G = sps.eye(n_bkpts, dtype=np.uint16, format='lil') - for i, j in indexed_links: - if (ro_calu(bkpts[i], bkpts[j]) > 0 and samples_overlap_records(bkpts[i], bkpts[j])): - G[i, j] = 1 - # Generate lists of clustered breakpoints - n_comp, comp_list = sps.csgraph.connected_components(G) - clusters = [deque() for x in range(n_comp)] - for i, c_label in enumerate(comp_list): - clusters[c_label].append(bkpts[i]) - return clusters - - def close_enough(r1, r2, cpx_dist=2000): distA = np.abs(r1.pos - r2.pos) distB = np.abs(r1.stop - r2.stop) return distA < cpx_dist or distB < cpx_dist -def ro_calu(r1, r2): - out = 0 - if 
not r1.chrom == r2.chrom: - out = 0 - elif r1.pos > r2.stop or r1.stop < r2.pos: - out = 0 - else: - maxval = max([r1.stop - r1.pos, r2.stop - r2.pos]) - if maxval > 0: - out = (sorted([r1.pos, r2.pos, r1.stop, r2.stop])[ - 2] - sorted([r1.pos, r2.pos, r1.stop, r2.stop])[1]) / maxval - else: - out = 0 - return out - - -def ro_calu_interval(r1, r2): - out = 0 - if not r1[0] == r2[0]: - out = 0 - elif r1[1] > r2[2] or r1[2] < r2[1]: - out = 0 - else: - out = (sorted(r1[1:] + r2[1:])[2] - sorted(r1[1:] + - r2[1:])[1]) / max([r1[2] - r1[1], r2[2] - r2[1]]) - return out +def records_overlap(r1, r2): + return r1.chrom == r2.chrom and not (r1.pos > r2.stop or r1.stop < r2.pos) diff --git a/src/svtk/svtk/genomeslink.py b/src/svtk/svtk/genomeslink.py index e325bdd1b..85fe7b426 100644 --- a/src/svtk/svtk/genomeslink.py +++ b/src/svtk/svtk/genomeslink.py @@ -131,7 +131,7 @@ def __str__(self): class GenomeSLINK(object): - def __init__(self, nodes, dist, size=1, blacklist=None): + def __init__(self, nodes, dist, size=1, blacklist=None, single_end=False): """ Graph-based single-linkage clustering of genomic coordinates. @@ -147,12 +147,15 @@ def __init__(self, nodes, dist, size=1, blacklist=None): blacklist : pysam.TabixFile, optional Regions to exclude from clustering. Any node with a coordinate inside an excluding region is omitted. (NOTE: not overlap-based.) + single_end : bool, optional + Require only one end to be within min dist. 
""" self.nodes = nodes self.dist = dist self.size = size self.blacklist = blacklist + self.single_end = single_end def is_clusterable_with(self, first, second): """ @@ -165,9 +168,15 @@ def clusters_with(self, first, second): """ Test if candidates meet cluster distance requirement on chrB, posB """ - return (first.chrB == second.chrB and - abs(first.posA - second.posA) < self.dist and - abs(first.posB - second.posB) < self.dist) + if first.chrB == second.chrB: + if self.single_end: + return abs(first.posA - second.posA) < self.dist or \ + abs(first.posB - second.posB) < self.dist + else: + return abs(first.posA - second.posA) < self.dist and \ + abs(first.posB - second.posB) < self.dist + else: + return False def filter_nodes(self): """ diff --git a/src/svtk/svtk/utils/__init__.py b/src/svtk/svtk/utils/__init__.py index 03ffa3fc7..5ca6d96ad 100644 --- a/src/svtk/svtk/utils/__init__.py +++ b/src/svtk/svtk/utils/__init__.py @@ -1,7 +1,6 @@ from .utils import * from .bgzipfile import BgzipFile -from .s3bam import load_s3bam -from .helpers import is_excluded, is_soft_clipped, reciprocal_overlap, overlap_frac +from .helpers import reciprocal_overlap, overlap_frac from .multi_tabixfile import MultiTabixFile from .genotype_merging import update_best_genotypes from .rdtest import RdTest diff --git a/src/svtk/svtk/utils/helpers.pyx b/src/svtk/svtk/utils/helpers.pyx index 9c26c9219..b9dc5334f 100644 --- a/src/svtk/svtk/utils/helpers.pyx +++ b/src/svtk/svtk/utils/helpers.pyx @@ -1,24 +1,10 @@ #cython: language_level=3 -from pysam.libcalignedsegment cimport AlignedSegment -from pysam.libcalignmentfile cimport AlignmentFile cdef inline int int_max(int a, int b): return a if a >= b else b cdef inline int int_min(int a, int b): return a if a <= b else b cdef inline float float_max(float a, float b): return a if a >= b else b cdef inline float float_min(float a, float b): return a if a <= b else b -cpdef bint is_excluded(AlignedSegment read): - cdef bint exclude = 
(read.is_unmapped or - read.mate_is_unmapped or - read.is_secondary or - read.is_duplicate or - read.is_supplementary) - return exclude - -cpdef bint is_soft_clipped(AlignedSegment read): - return (((read.cigartuples[0][0] == 4) & (read.cigartuples[-1][0] == 0)) | - ((read.cigartuples[-1][0] == 4) & (read.cigartuples[0][0] == 0))) - cpdef float reciprocal_overlap(int startA, int endA, int startB, int endB): """Calculate fraction of reciprocal overlap between two intervals""" diff --git a/src/svtk/svtk/utils/s3bam.py b/src/svtk/svtk/utils/s3bam.py deleted file mode 100644 index bfe2209d4..000000000 --- a/src/svtk/svtk/utils/s3bam.py +++ /dev/null @@ -1,44 +0,0 @@ -#! /usr/bin/env python -# -*- coding: utf-8 -*- -# vim:fenc=utf-8 -# - -""" -Load S3-hosted bam into pysam.AlignmentFile -""" - -import os -import boto3 -import pysam - - -def load_s3bam(bam_path, index_dir=None): - if not bam_path.startswith('s3://'): - raise Exception('Bam {0} is not a valid S3 path'.format(bam_path)) - - # Pysam doesn't accept explicit path to index file, expects index to be - # present in working directory. If a local copy of the index is available, - # move to its directory to use it. - # Otherwise, index is downloaded automatically - if index_dir is not None: - os.chdir(index_dir) - else: - msg = ('Local index directory not specified for {0}. 
Downloading ' - 'remote copy of index to working directory.') - raise Warning(msg.format(bam_path)) - - # Parse bucket and key from filepath - s3path = bam_path[5:] - bucket = s3path.split('/')[0] - bam_path = '/'.join(s3path.split('/')[1:]) - - # Create S3 client and get presigned URL - # Necessary to take advantage of pysam's https support until the library - # supports S3 paths directly - s3 = boto3.client('s3') - url = s3.generate_presigned_url( - ClientMethod='get_object', - Params={'Bucket': bucket, 'Key': bam_path}, - ExpiresIn=86400) - - return pysam.AlignmentFile(url) diff --git a/src/svtk/svtk/utils/utils.py b/src/svtk/svtk/utils/utils.py index ee9c9e53c..2eae72c16 100644 --- a/src/svtk/svtk/utils/utils.py +++ b/src/svtk/svtk/utils/utils.py @@ -302,7 +302,7 @@ def _converter(): chrom = record.chrom start = max([0, int(record.pos) - 1]) end = record.pos - entry.format(**locals()) + yield entry.format(**locals()) # elif (record.info.get('SVTYPE', None) == 'CPX' and # 'CPX_TYPE' in record.info.keys()): diff --git a/src/svtk/svtk/vcfcluster.py b/src/svtk/svtk/vcfcluster.py index 0d53cd4c7..0891a67e1 100644 --- a/src/svtk/svtk/vcfcluster.py +++ b/src/svtk/svtk/vcfcluster.py @@ -29,7 +29,8 @@ def __init__(self, vcfs, preserve_genotypes=False, sample_overlap=0.0, preserve_header=False, do_cluster=True, - do_merge=True): + do_merge=True, + single_end=False): """ Clustering of VCF records. @@ -71,6 +72,8 @@ def __init__(self, vcfs, specified, all svtypes will be clustered. sample_overlap : float, optional Minimum fraction of samples to overlap to cluster variants + single_end : bool, optional + Require only one end to be within min dist. 
""" if (not do_cluster) and (not do_merge): @@ -115,7 +118,7 @@ def __init__(self, vcfs, self.sources = sorted(sources) self.header = self.make_vcf_header() - super().__init__(nodes, dist, 1, blacklist) + super().__init__(nodes, dist, 1, blacklist, single_end) def clusters_with(self, first, second): """ diff --git a/test_input_templates/MakeCohortVcf/MakeCohortVcf.json.tmpl b/test_input_templates/MakeCohortVcf/MakeCohortVcf.json.tmpl index 6811de23b..5dccf27ab 100644 --- a/test_input_templates/MakeCohortVcf/MakeCohortVcf.json.tmpl +++ b/test_input_templates/MakeCohortVcf/MakeCohortVcf.json.tmpl @@ -16,18 +16,25 @@ "MakeCohortVcf.collins_2017_tarball": {{ reference_resources.collins_2017_tarball | tojson }}, "MakeCohortVcf.werling_2018_tarball": {{ reference_resources.werling_2018_tarball | tojson }}, + "MakeCohortVcf.chr_x": {{ reference_resources.chr_x | tojson }}, + "MakeCohortVcf.chr_y": {{ reference_resources.chr_y | tojson }}, + "MakeCohortVcf.min_sr_background_fail_batches": 0.5, + "MakeCohortVcf.max_shard_size_resolve" : 500, "MakeCohortVcf.max_shards_per_chrom_clean_vcf_step1": 200, "MakeCohortVcf.min_records_per_shard_clean_vcf_step1": 5000, + "MakeCohortVcf.clean_vcf1b_records_per_shard": 10000, "MakeCohortVcf.samples_per_clean_vcf_step2_shard": 100, + "MakeCohortVcf.clean_vcf5_records_per_shard": 5000, "MakeCohortVcf.random_seed": 0, - "MakeCohortVcf.max_shard_size_resolve": 500, "MakeCohortVcf.primary_contigs_list": {{ reference_resources.primary_contigs_list | tojson }}, "MakeCohortVcf.sv_pipeline_base_docker": {{ dockers.sv_pipeline_base_docker | tojson }}, "MakeCohortVcf.linux_docker": {{ dockers.linux_docker | tojson }}, "MakeCohortVcf.sv_pipeline_docker": {{ dockers.sv_pipeline_docker | tojson }}, + "MakeCohortVcf.sv_pipeline_hail_docker": {{ dockers.sv_pipeline_hail_docker | tojson }}, + "MakeCohortVcf.sv_pipeline_updates_docker": {{ dockers.sv_pipeline_updates_docker | tojson }}, "MakeCohortVcf.sv_base_mini_docker":{{ dockers.sv_base_mini_docker 
| tojson }}, "MakeCohortVcf.sv_pipeline_rdtest_docker": {{ dockers.sv_pipeline_rdtest_docker | tojson }}, "MakeCohortVcf.sv_pipeline_qc_docker": {{ dockers.sv_pipeline_qc_docker | tojson }}, diff --git a/test_input_templates/batch/GATKSVPipelineBatch.json.tmpl b/test_input_templates/batch/GATKSVPipelineBatch.json.tmpl index 87fcbb8eb..f55e4f99f 100644 --- a/test_input_templates/batch/GATKSVPipelineBatch.json.tmpl +++ b/test_input_templates/batch/GATKSVPipelineBatch.json.tmpl @@ -34,6 +34,8 @@ "GATKSVPipelineBatch.sv_pipeline_base_docker": {{ dockers.sv_pipeline_base_docker | tojson }}, "GATKSVPipelineBatch.sv_base_mini_docker":{{ dockers.sv_base_mini_docker | tojson }}, "GATKSVPipelineBatch.sv_pipeline_docker": {{ dockers.sv_pipeline_docker | tojson }}, + "GATKSVPipelineBatch.sv_pipeline_hail_docker": {{ dockers.sv_pipeline_hail_docker | tojson }}, + "GATKSVPipelineBatch.sv_pipeline_updates_docker": {{ dockers.sv_pipeline_updates_docker | tojson }}, "GATKSVPipelineBatch.sv_pipeline_qc_docker": {{ dockers.sv_pipeline_qc_docker | tojson }}, "GATKSVPipelineBatch.sv_pipeline_rdtest_docker": {{ dockers.sv_pipeline_rdtest_docker | tojson }}, "GATKSVPipelineBatch.wham_docker": {{ dockers.wham_docker | tojson }}, @@ -127,7 +129,11 @@ "GATKSVPipelineBatch.MakeCohortVcf.min_sr_background_fail_batches": 0.5, "GATKSVPipelineBatch.MakeCohortVcf.max_shards_per_chrom_clean_vcf_step1": 200, "GATKSVPipelineBatch.MakeCohortVcf.min_records_per_shard_clean_vcf_step1": 5000, + "GATKSVPipelineBatch.MakeCohortVcf.clean_vcf1b_records_per_shard": 10000, + "GATKSVPipelineBatch.MakeCohortVcf.clean_vcf5_records_per_shard": 5000, "GATKSVPipelineBatch.MakeCohortVcf.samples_per_clean_vcf_step2_shard": 100, "GATKSVPipelineBatch.MakeCohortVcf.random_seed": 0, - "GATKSVPipelineBatch.MakeCohortVcf.max_shard_size_resolve": 500 + "GATKSVPipelineBatch.MakeCohortVcf.max_shard_size_resolve": 500, + "GATKSVPipelineBatch.MakeCohortVcf.chr_x": {{ reference_resources.chr_x | tojson }}, + 
"GATKSVPipelineBatch.MakeCohortVcf.chr_y": {{ reference_resources.chr_y | tojson }} } diff --git a/test_input_templates/single-sample/GATKSVPipelineSingleSampleTest.json.tmpl b/test_input_templates/single-sample/GATKSVPipelineSingleSampleTest.json.tmpl index a31cbf18a..d6579168a 100644 --- a/test_input_templates/single-sample/GATKSVPipelineSingleSampleTest.json.tmpl +++ b/test_input_templates/single-sample/GATKSVPipelineSingleSampleTest.json.tmpl @@ -44,6 +44,8 @@ "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.sv_pipeline_base_docker": {{ dockers.sv_pipeline_base_docker | tojson }}, "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.sv_base_mini_docker":{{ dockers.sv_base_mini_docker | tojson }}, "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.sv_pipeline_docker": {{ dockers.sv_pipeline_docker | tojson }}, + "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.sv_pipeline_hail_docker": {{ dockers.sv_pipeline_hail_docker | tojson }}, + "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.sv_pipeline_updates_docker": {{ dockers.sv_pipeline_updates_docker | tojson }}, "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.sv_pipeline_qc_docker": {{ dockers.sv_pipeline_qc_docker | tojson }}, "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.sv_pipeline_rdtest_docker": {{ dockers.sv_pipeline_rdtest_docker | tojson }}, "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.wham_docker": {{ dockers.wham_docker | tojson }}, @@ -94,7 +96,9 @@ "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.max_shard_size_resolve" : 500, "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.clean_vcf_max_shards_per_chrom_clean_vcf_step1": 200, "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.clean_vcf_min_records_per_shard_clean_vcf_step1": 5000, + "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.clean_vcf1b_records_per_shard": 10000, 
"GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.clean_vcf_samples_per_clean_vcf_step2_shard": 100, + "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.clean_vcf5_records_per_shard": 5000, "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.clean_vcf_random_seed": 0, "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.run_vcf_qc" : false, "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.max_ref_panel_carrier_freq": 0.03, diff --git a/wdl/CalcAF.wdl b/wdl/CalcAF.wdl index 9bdbdfde9..1801918a7 100644 --- a/wdl/CalcAF.wdl +++ b/wdl/CalcAF.wdl @@ -1,15 +1,16 @@ version 1.0 import "Structs.wdl" +import "CleanVcf5.wdl" as cleanvcf5 workflow CalcAF { input{ File vcf File vcf_idx - String contig Int sv_per_shard String prefix String sv_pipeline_docker + String sv_pipeline_updates_docker File? sample_pop_assignments #Two-column file with sample ID & pop assignment. "." for pop will ignore sample File? famfile #Used for M/F AF calculations File? par_bed #Used for marking hemizygous males on X & Y @@ -20,23 +21,22 @@ workflow CalcAF { # Tabix to chromosome of interest, and shard input VCF for stats collection - call ShardVcf { + call cleanvcf5.ScatterVcf { input: vcf=vcf, - vcf_idx=vcf_idx, - contig=contig, - sv_pipeline_docker=sv_pipeline_docker, - sv_per_shard=sv_per_shard + prefix=prefix, + sv_pipeline_docker=sv_pipeline_updates_docker, + records_per_shard=sv_per_shard } # Scatter over VCF shards - scatter ( shard in ShardVcf.shard_vcfs ) { + scatter ( shard in ScatterVcf.shards ) { # Collect AF summary stats call ComputeShardAFs { input: vcf=shard, sv_pipeline_docker=sv_pipeline_docker, - prefix="~{prefix}.~{contig}", + prefix=prefix, sample_pop_assignments=sample_pop_assignments, famfile=famfile, par_bed=par_bed, @@ -49,7 +49,7 @@ workflow CalcAF { input: vcfs=ComputeShardAFs.shard_wAFs, sv_pipeline_docker=sv_pipeline_docker, - prefix="~{prefix}.~{contig}", + prefix=prefix, drop_empty_records=drop_empty_records } @@ -60,53 +60,6 
@@ workflow CalcAF { } } - -# Shard VCF into fixed size chunks -task ShardVcf { - input{ - File vcf - File vcf_idx - String contig - Int sv_per_shard - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 4, - disk_gb: 250, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - - command { - #Tabix chromosome of interest - tabix -h ~{vcf} ~{contig} | bgzip -c > ~{contig}.vcf.gz - #Then shard VCF - /opt/sv-pipeline/scripts/shard_VCF.sh \ - ~{contig}.vcf.gz \ - ~{sv_per_shard} \ - "vcf.shard." - } - - output { - Array[File] shard_vcfs = glob("vcf.shard.*.vcf.gz") - } - - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - docker: sv_pipeline_docker - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - } -} - - # Subset a vcf to a single chromosome, and add global AF information (no subpop) task ComputeShardAFs { input{ @@ -121,8 +74,8 @@ task ComputeShardAFs { } RuntimeAttr default_attr = object { cpu_cores: 1, - mem_gb: 8, - disk_gb: 20, + mem_gb: 1.5, + disk_gb: 20 + size(vcf, "GB") * 2, boot_disk_gb: 10, preemptible_tries: 3, max_retries: 1 diff --git a/wdl/CleanVcf.wdl b/wdl/CleanVcf.wdl index dce8ecb9e..e5d95713b 100644 --- a/wdl/CleanVcf.wdl +++ b/wdl/CleanVcf.wdl @@ -2,7 +2,7 @@ version 1.0 import "CleanVcfChromosome.wdl" as CleanVcfChromosome import "TasksMakeCohortVcf.wdl" as MiniTasks -import "Utils.wdl" as util +import "HailMerge.wdl" as HailMerge workflow CleanVcf { input { @@ -15,52 
+15,72 @@ workflow CleanVcf { File contig_list File allosome_fai - Int max_shards_per_chrom_clean_vcf_step1 - Int min_records_per_shard_clean_vcf_step1 - Int samples_per_clean_vcf_step2_shard + Int max_shards_per_chrom_step1 + Int min_records_per_shard_step1 + Int samples_per_step2_shard + Int? max_samples_per_shard_step3 + Int clean_vcf1b_records_per_shard + Int clean_vcf5_records_per_shard + + String chr_x + String chr_y File? outlier_samples_list + Boolean use_hail = false + String? gcs_project + String linux_docker String sv_base_mini_docker String sv_pipeline_docker + String sv_pipeline_hail_docker + String sv_pipeline_updates_docker # overrides for mini tasks + RuntimeAttr? runtime_override_preconcat_clean_final + RuntimeAttr? runtime_override_hail_merge_clean_final + RuntimeAttr? runtime_override_fix_header_clean_final RuntimeAttr? runtime_override_concat_cleaned_vcfs # overrides for CleanVcfContig RuntimeAttr? runtime_override_clean_vcf_1a - RuntimeAttr? runtime_override_clean_vcf_1b RuntimeAttr? runtime_override_clean_vcf_2 RuntimeAttr? runtime_override_clean_vcf_3 RuntimeAttr? runtime_override_clean_vcf_4 - RuntimeAttr? runtime_override_clean_vcf_5 + RuntimeAttr? runtime_override_clean_vcf_5_scatter + RuntimeAttr? runtime_override_clean_vcf_5_make_cleangq + RuntimeAttr? runtime_override_clean_vcf_5_find_redundant_multiallelics + RuntimeAttr? runtime_override_clean_vcf_5_polish RuntimeAttr? runtime_override_stitch_fragmented_cnvs RuntimeAttr? runtime_override_final_cleanup + + # Clean vcf 1b + RuntimeAttr? runtime_attr_override_subset_large_cnvs_1b + RuntimeAttr? runtime_attr_override_sort_bed_1b + RuntimeAttr? runtime_attr_override_intersect_bed_1b + RuntimeAttr? runtime_attr_override_build_dict_1b + RuntimeAttr? runtime_attr_override_scatter_1b + RuntimeAttr? runtime_attr_override_filter_vcf_1b + RuntimeAttr? runtime_override_concat_vcfs_1b + RuntimeAttr? runtime_override_cat_multi_cnvs_1b + + RuntimeAttr? runtime_override_preconcat_step1 + RuntimeAttr? 
runtime_override_hail_merge_step1 + RuntimeAttr? runtime_override_fix_header_step1 + + RuntimeAttr? runtime_override_preconcat_drc + RuntimeAttr? runtime_override_hail_merge_drc + RuntimeAttr? runtime_override_fix_header_drc + RuntimeAttr? runtime_override_split_vcf_to_clean - RuntimeAttr? runtime_override_combine_step_1_vcfs RuntimeAttr? runtime_override_combine_step_1_sex_chr_revisions RuntimeAttr? runtime_override_split_include_list RuntimeAttr? runtime_override_combine_clean_vcf_2 RuntimeAttr? runtime_override_combine_revised_4 RuntimeAttr? runtime_override_combine_multi_ids_4 - RuntimeAttr? runtime_attr_ids_from_vcf - RuntimeAttr? runtime_attr_subset_ped - } - - call util.GetSampleIdsFromVcf { - input: - vcf = complex_genotype_vcfs[0], - sv_base_mini_docker = sv_base_mini_docker, - runtime_attr_override = runtime_attr_ids_from_vcf - } - call util.SubsetPedFile { - input: - ped_file = merged_ped_file, - sample_list = GetSampleIdsFromVcf.out_file, - subset_name = "vcf_samples", - sv_base_mini_docker = sv_base_mini_docker, - runtime_attr_override = runtime_attr_subset_ped + RuntimeAttr? runtime_override_drop_redundant_cnvs + RuntimeAttr? runtime_override_combine_step_1_vcfs + RuntimeAttr? 
runtime_override_sort_drop_redundant_cnvs } #Scatter per chromosome @@ -73,47 +93,89 @@ workflow CleanVcf { vcf=complex_genotype_vcfs[i], contig=contig, background_list=complex_resolve_background_fail_lists[i], - ped_file=SubsetPedFile.ped_subset_file, + ped_file=merged_ped_file, bothsides_pass_list=complex_resolve_bothside_pass_lists[i], allosome_fai=allosome_fai, - prefix=cohort_name, - max_shards_per_chrom_step1=max_shards_per_chrom_clean_vcf_step1, - min_records_per_shard_step1=min_records_per_shard_clean_vcf_step1, - samples_per_step2_shard=samples_per_clean_vcf_step2_shard, + prefix="~{cohort_name}.~{contig}", + max_shards_per_chrom_step1=max_shards_per_chrom_step1, + min_records_per_shard_step1=min_records_per_shard_step1, + samples_per_step2_shard=samples_per_step2_shard, + max_samples_per_shard_step3=max_samples_per_shard_step3, outlier_samples_list=outlier_samples_list, + use_hail=use_hail, + gcs_project=gcs_project, + clean_vcf1b_records_per_shard=clean_vcf1b_records_per_shard, + clean_vcf5_records_per_shard=clean_vcf5_records_per_shard, + chr_x=chr_x, + chr_y=chr_y, linux_docker=linux_docker, sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_updates_docker=sv_pipeline_updates_docker, sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, runtime_override_clean_vcf_1a=runtime_override_clean_vcf_1a, - runtime_override_clean_vcf_1b=runtime_override_clean_vcf_1b, runtime_override_clean_vcf_2=runtime_override_clean_vcf_2, runtime_override_clean_vcf_3=runtime_override_clean_vcf_3, runtime_override_clean_vcf_4=runtime_override_clean_vcf_4, - runtime_override_clean_vcf_5=runtime_override_clean_vcf_5, + runtime_override_clean_vcf_5_scatter=runtime_override_clean_vcf_5_scatter, + runtime_override_clean_vcf_5_make_cleangq=runtime_override_clean_vcf_5_make_cleangq, + runtime_override_clean_vcf_5_find_redundant_multiallelics=runtime_override_clean_vcf_5_find_redundant_multiallelics, + 
runtime_override_clean_vcf_5_polish=runtime_override_clean_vcf_5_polish, runtime_override_stitch_fragmented_cnvs=runtime_override_stitch_fragmented_cnvs, runtime_override_final_cleanup=runtime_override_final_cleanup, runtime_override_split_vcf_to_clean=runtime_override_split_vcf_to_clean, - runtime_override_combine_step_1_vcfs=runtime_override_combine_step_1_vcfs, runtime_override_combine_step_1_sex_chr_revisions=runtime_override_combine_step_1_sex_chr_revisions, runtime_override_split_include_list=runtime_override_split_include_list, runtime_override_combine_clean_vcf_2=runtime_override_combine_clean_vcf_2, runtime_override_combine_revised_4=runtime_override_combine_revised_4, - runtime_override_combine_multi_ids_4=runtime_override_combine_multi_ids_4 + runtime_override_combine_multi_ids_4=runtime_override_combine_multi_ids_4, + runtime_override_preconcat_step1=runtime_override_preconcat_step1, + runtime_override_hail_merge_step1=runtime_override_hail_merge_step1, + runtime_override_fix_header_step1=runtime_override_fix_header_step1, + runtime_override_preconcat_drc=runtime_override_preconcat_drc, + runtime_override_hail_merge_drc=runtime_override_hail_merge_drc, + runtime_override_fix_header_drc=runtime_override_fix_header_drc, + runtime_override_drop_redundant_cnvs=runtime_override_drop_redundant_cnvs, + runtime_attr_override_subset_large_cnvs_1b=runtime_attr_override_subset_large_cnvs_1b, + runtime_attr_override_sort_bed_1b=runtime_attr_override_sort_bed_1b, + runtime_attr_override_intersect_bed_1b=runtime_attr_override_intersect_bed_1b, + runtime_attr_override_build_dict_1b=runtime_attr_override_build_dict_1b, + runtime_attr_override_scatter_1b=runtime_attr_override_scatter_1b, + runtime_attr_override_filter_vcf_1b=runtime_attr_override_filter_vcf_1b, + runtime_override_concat_vcfs_1b=runtime_override_concat_vcfs_1b, + runtime_override_cat_multi_cnvs_1b=runtime_override_cat_multi_cnvs_1b } } - call MiniTasks.ConcatVcfs as ConcatCleanedVcfs { - input: - 
vcfs=CleanVcfChromosome.out, - vcfs_idx=CleanVcfChromosome.out_idx, - allow_overlaps=true, - outfile_prefix="~{cohort_name}.cleaned", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_concat_cleaned_vcfs + if (use_hail) { + call HailMerge.HailMerge as ConcatVcfsHail { + input: + vcfs=CleanVcfChromosome.out, + prefix="~{cohort_name}.cleaned", + gcs_project=gcs_project, + reset_cnv_gts=true, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, + runtime_override_preconcat=runtime_override_preconcat_clean_final, + runtime_override_hail_merge=runtime_override_hail_merge_clean_final, + runtime_override_fix_header=runtime_override_fix_header_clean_final + } + } + if (!use_hail) { + call MiniTasks.ConcatVcfs as ConcatCleanedVcfs { + input: + vcfs=CleanVcfChromosome.out, + vcfs_idx=CleanVcfChromosome.out_idx, + allow_overlaps=true, + outfile_prefix="~{cohort_name}.cleaned", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_concat_cleaned_vcfs + } } output { - File cleaned_vcf = ConcatCleanedVcfs.concat_vcf - File cleaned_vcf_index = ConcatCleanedVcfs.concat_vcf_idx + File cleaned_vcf = select_first([ConcatCleanedVcfs.concat_vcf, ConcatVcfsHail.merged_vcf]) + File cleaned_vcf_index = select_first([ConcatCleanedVcfs.concat_vcf_idx, ConcatVcfsHail.merged_vcf_index]) } } \ No newline at end of file diff --git a/wdl/CleanVcf1.wdl b/wdl/CleanVcf1.wdl deleted file mode 100644 index 8e9dc8d8c..000000000 --- a/wdl/CleanVcf1.wdl +++ /dev/null @@ -1,1558 +0,0 @@ -version 1.0 - -import "Structs.wdl" - -workflow CleanVcf1 { - input { - File vcf - File background_list - File ped_file - String sv_pipeline_docker - String linux_docker - File bothsides_pass_list - File allosome_fai - RuntimeAttr? 
runtime_attr_override # TODO - } - - call CreateEmptyFile { - input: - linux_docker=linux_docker - } - - call CleanVcf1_1 { - input: - vcf=vcf, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1_2 { - input: - EV_update_vcf=CleanVcf1_1.EV_update_vcf, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1_3 { - input: - EV_update_vcf=CleanVcf1_1.EV_update_vcf, - vcf_convert_svtype=CleanVcf1_2.vcf_convert_svtype, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1_4 { - input: - convertsvtype_vcf=CleanVcf1_3.convertsvtype_vcf, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1_5 { - input: - convertsvtype_vcf=CleanVcf1_3.convertsvtype_vcf, - vargq_persample=CleanVcf1_4.vargq_persample, - sv_pipeline_docker=sv_pipeline_docker - } - - if (CleanVcf1_5.count_xy > 0) { - call CleanVcf1_6 { - input: - cleaninfo_vcf=CleanVcf1_5.cleaninfo_vcf, - cleaninfo_vcf_tbi=CleanVcf1_5.cleaninfo_vcf_tbi, - ped_file=ped_file, - sv_pipeline_docker=sv_pipeline_docker - } - - if (CleanVcf1_6.clean_bed_ids_count > 0) { - call CleanVcf1_7 { - input: - allosome_fai=allosome_fai, - cleaninfo_vcf=CleanVcf1_5.cleaninfo_vcf, - cleaninfo_vcf_tbi=CleanVcf1_5.cleaninfo_vcf_tbi, - clean_bed_ids=CleanVcf1_6.clean_bed_ids, - male=CleanVcf1_6.male, - sv_pipeline_docker=sv_pipeline_docker - } - call CleanVcf1_8 { - input: - allosome_fai=allosome_fai, - cleaninfo_vcf=CleanVcf1_5.cleaninfo_vcf, - cleaninfo_vcf_tbi=CleanVcf1_5.cleaninfo_vcf_tbi, - clean_bed_ids=CleanVcf1_6.clean_bed_ids, - female=CleanVcf1_6.female, - sv_pipeline_docker=sv_pipeline_docker - } - call CleanVcf1_9 { - input: - RD_CN_sexcheck_FORMAT_male=CleanVcf1_7.RD_CN_sexcheck_FORMAT_male, - sv_pipeline_docker=sv_pipeline_docker - } - call CleanVcf1_10 { - input: - RD_CN_sexcheck_FORMAT_female=CleanVcf1_8.RD_CN_sexcheck_FORMAT_female, - sv_pipeline_docker=sv_pipeline_docker - } - } - - call CleanVcf1_11 { - input: - cleaninfo_vcf=CleanVcf1_5.cleaninfo_vcf, - 
cleaninfo_vcf_tbi=CleanVcf1_5.cleaninfo_vcf_tbi, - clean_bed=CleanVcf1_6.clean_bed, - male_median_value_pervar=select_first([CleanVcf1_9.male_median_value_pervar, CreateEmptyFile.empty]), - female_median_value_pervar=select_first([CleanVcf1_10.female_median_value_pervar, CreateEmptyFile.empty]), - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1_12 { - input: - sexchr_revise_1=CleanVcf1_11.sexchr_revise_1, - cleaninfo_vcf=CleanVcf1_5.cleaninfo_vcf, - cleaninfo_vcf_tbi=CleanVcf1_5.cleaninfo_vcf_tbi, - clean_bed=CleanVcf1_6.clean_bed, - male_median_value_pervar=select_first([CleanVcf1_9.male_median_value_pervar, CreateEmptyFile.empty]), - female_median_value_pervar=select_first([CleanVcf1_10.female_median_value_pervar, CreateEmptyFile.empty]), - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1_13 { - input: - cleaninfo_vcf=CleanVcf1_5.cleaninfo_vcf, - cleaninfo_vcf_csi=CleanVcf1_5.cleaninfo_vcf_csi, - male=CleanVcf1_6.male, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1_14 { - input: - cleaninfo_vcf=CleanVcf1_5.cleaninfo_vcf, - cleaninfo_vcf_csi=CleanVcf1_5.cleaninfo_vcf_csi, - female=CleanVcf1_6.female, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1_15 { - input: - male_vcf=CleanVcf1_13.male_vcf, - sexchr_revise_2=CleanVcf1_12.sexchr_revise_2, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1_16 { - input: - male_vcf=CleanVcf1_13.male_vcf, - sexchr_revise_2=CleanVcf1_12.sexchr_revise_2, - sv_pipeline_docker=sv_pipeline_docker - } - - if ((CleanVcf1_15.count + CleanVcf1_16.count) > 0) { - call CleanVcf1_17 { - input: - male_vcf=CleanVcf1_13.male_vcf, - male_dup_revise_txt=CleanVcf1_16.male_dup_revise_txt, - male_del_revise_txt=CleanVcf1_15.male_del_revise_txt, - sv_pipeline_docker=sv_pipeline_docker - } - } - - if (CleanVcf1_5.count_y > 0) { - call CleanVcf1_18 { - input: - female_vcf=CleanVcf1_14.female_vcf, - sv_pipeline_docker=sv_pipeline_docker - } - call CleanVcf1_19 { - input: - 
female_vcf=CleanVcf1_14.female_vcf, - female_y_revise_txt=CleanVcf1_18.female_y_revise_txt, - sv_pipeline_docker=sv_pipeline_docker - } - } - - if (CleanVcf1_6.ped_file_count > 0) { - call CleanVcf1_20 { - input: - cleaninfo_vcf=CleanVcf1_5.cleaninfo_vcf, - cleaninfo_vcf_csi=CleanVcf1_5.cleaninfo_vcf_csi, - ped_file=ped_file, - sv_pipeline_docker=sv_pipeline_docker - } - call CleanVcf1_21 { - input: - other_vcf=CleanVcf1_20.other_vcf, - sv_pipeline_docker=sv_pipeline_docker - } - call CleanVcf1_22 { - input: - other_vcf=CleanVcf1_20.other_vcf, - other_revise_txt=CleanVcf1_21.other_revise_txt, - sv_pipeline_docker=sv_pipeline_docker - } - call CleanVcf1_23 { - input: - cleanmale_vcf=select_first([CleanVcf1_17.cleanmale_vcf, CleanVcf1_13.male_vcf]), - cleanfemale_vcf=select_first([CleanVcf1_19.cleanfemale_vcf, CleanVcf1_14.female_vcf]), - cleanother_vcf=CleanVcf1_22.cleanother_vcf, - sv_pipeline_docker=sv_pipeline_docker - } - } - if (CleanVcf1_6.ped_file_count == 0) { - call CleanVcf1_24 { - input: - cleanmale_vcf=select_first([CleanVcf1_17.cleanmale_vcf, CleanVcf1_13.male_vcf]), - cleanfemale_vcf=select_first([CleanVcf1_19.cleanfemale_vcf, CleanVcf1_14.female_vcf]), - sv_pipeline_docker=sv_pipeline_docker - } - } - - call CleanVcf1_25 { - input: - combinedsex_vcf=select_first([CleanVcf1_23.combinedsex_vcf, CleanVcf1_24.combinedsex_vcf]), - combinedsex_vcf_tbi=select_first([CleanVcf1_23.combinedsex_vcf_tbi, CleanVcf1_24.combinedsex_vcf_tbi]), - cleaninfo_vcf=CleanVcf1_5.cleaninfo_vcf, - cleaninfo_vcf_tbi=CleanVcf1_5.cleaninfo_vcf_tbi, - sv_pipeline_docker=sv_pipeline_docker - } - } - - call CleanVcf1_26 { - input: - background_list=background_list, - cleanallo_vcf=select_first([CleanVcf1_25.cleanallo_vcf, CleanVcf1_5.cleaninfo_vcf]), - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1_27 { - input: - int_vcf=CleanVcf1_26.int_vcf, - bothsides_pass_list=bothsides_pass_list, - sv_pipeline_docker=sv_pipeline_docker - } - - output { - File 
include_list=CleanVcf1_1.include_list - File sex=select_first([CleanVcf1_12.sexchr_revise_2, CreateEmptyFile.empty]) - File intermediate_vcf=CleanVcf1_27.intermediate_vcf - File intermediate_vcf_idx=CleanVcf1_27.intermediate_vcf_idx - } -} - - -task CleanVcf1_1 { - input { - File vcf - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - Float input_size = size(vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - ##get sampleids from VCF## - zcat ~{vcf} \ - |sed -n '1,1000p' \ - |egrep "^#" \ - |tail -n -1 \ - |cut -f10- \ - |tr '\t' '\n' \ - > includelist.txt - - ##convert EV integer back into string## - /opt/sv-pipeline/04_variant_resolution/scripts/replace_ev_numeric_code_with_string.py ~{vcf} - | bgzip -c > EV.update.vcf.gz - >>> - - output { - File include_list="includelist.txt" - File EV_update_vcf="EV.update.vcf.gz" - } -} - -task CleanVcf1_2 { - input { - File EV_update_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(EV_update_vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - ##convert all alt to svtype and alt to N## - svtk vcf2bed ~{EV_update_vcf} stdout -i SVTYPE \ - |awk -F"\t" '{ if ($5!~"ME")$5=$7; print $4"\t" "<"$5 ">"}' \ - |gzip \ - >vcf.convert.svtype.bed.gz - >>> - - output { - File vcf_convert_svtype="vcf.convert.svtype.bed.gz" - } -} -task CleanVcf1_3 { - input { - File EV_update_vcf - File vcf_convert_svtype - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([EV_update_vcf, vcf_convert_svtype], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - zcat ~{EV_update_vcf} \ - |awk -F'\t' -v OFS='\t' 'ARGIND==1{inFileA[$1]=$2; next} {if ($3 in inFileA && $1!~"#") $5=inFileA[$3]; print }' \ - <(zcat ~{vcf_convert_svtype}) - \ - |awk '{if ($1!~"#") $4="N"; print}' OFS='\t' \ - |bgzip \ - >convertsvtype.vcf.gz - >>> - - output { - File convertsvtype_vcf="convertsvtype.vcf.gz" - } -} - -task CleanVcf1_4 { - input { - File convertsvtype_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(convertsvtype_vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##get rid of multiallelic tage in INFO field and add varGQ to QUAL column and Members field## - svtk vcf2bed ~{convertsvtype_vcf} stdout -i varGQ \ - |awk -F"\t" '{print $4 "\t" $7}' \ - >vargq.persample - >>> - - output { - File vargq_persample="vargq.persample" - } -} - -task CleanVcf1_5 { - input { - File convertsvtype_vcf - File vargq_persample - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([convertsvtype_vcf, vargq_persample], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # use BCFTOOLS 1.9, installed in /usr/local/bin/bcftools in our docker - BCFTOOLS=/usr/local/bin/bcftools - - zcat ~{convertsvtype_vcf} \ - |sed 's/;MULTIALLELIC//g' \ - |sed 's/UNRESOLVED;//g' \ - |sed 's/;varGQ=[0-9]*//g' \ - |awk -F'\t' -v OFS='\t' 'ARGIND==1{inFileA[$1]=$2; next} {if ($3 in inFileA && $1!~"#") $6=inFileA[$3]; print }' ~{vargq_persample} - \ - |bgzip \ - >cleaninfo.vcf.gz - - tabix -p vcf cleaninfo.vcf.gz - ${BCFTOOLS} index cleaninfo.vcf.gz - - zcat cleaninfo.vcf.gz|awk '{if (($1~"X" || $1~"Y") && $1!~"#" ) print}'|wc -l > count_xy.txt - zcat cleaninfo.vcf.gz|awk '{if ($1~"Y" && $1!~"#") print}'|wc -l > count_y.txt - >>> - - output { - File cleaninfo_vcf="cleaninfo.vcf.gz" - File cleaninfo_vcf_tbi="cleaninfo.vcf.gz.tbi" - File cleaninfo_vcf_csi="cleaninfo.vcf.gz.csi" - Int count_xy = read_int("count_xy.txt") - Int count_y= read_int("count_y.txt") - } -} - -task CleanVcf1_6 { - input { - File cleaninfo_vcf - File cleaninfo_vcf_tbi - File ped_file - String sv_pipeline_docker - 
RuntimeAttr? runtime_attr_override - } - - Float input_size = size([cleaninfo_vcf, ped_file], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - svtk vcf2bed ~{cleaninfo_vcf} stdout \ - |awk '{if (($5=="DEL" || $5=="DUP") && $3-$2>=5000 && ($1~"X" || $1~"Y") && $1!~"#") print}' \ - >clean.bed - - awk '{print $4}' clean.bed>clean.bed.ids.txt - - ##male## - awk '{if ($5==1) print $2}' ~{ped_file} \ - |fgrep -wf <(zcat ~{cleaninfo_vcf}|head -n 1000|fgrep "CHROM"|fgrep POS|cut -f10-|tr '\t' '\n') >male.txt - - ##female## - awk '{if ($5==2) print $2}' ~{ped_file} \ - |fgrep -wf <(zcat ~{cleaninfo_vcf}|head -n 1000|fgrep "CHROM"|fgrep POS|cut -f10-|tr '\t' '\n') >female.txt - - cat clean.bed.ids.txt|wc -l > clean_bed_ids_count.txt - awk '{if ($5!=2 && $5!=1) print $2}' ~{ped_file}|wc -l > ped_file_count.txt - >>> - - output { - File clean_bed="clean.bed" - File clean_bed_ids="clean.bed.ids.txt" - File male="male.txt" - File female="female.txt" - Int clean_bed_ids_count=read_int("clean_bed_ids_count.txt") - Int ped_file_count=read_int("ped_file_count.txt") - } -} - -task CleanVcf1_7 { - input { - File allosome_fai - File cleaninfo_vcf - File 
cleaninfo_vcf_tbi - File clean_bed_ids - File male - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - Float input_size = size([cleaninfo_vcf, clean_bed_ids], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # use BCFTOOLS 1.9, installed in /usr/local/bin/bcftools in our docker - BCFTOOLS=/usr/local/bin/bcftools - - awk '{print $1"\t0\t"$2}' < ~{allosome_fai} > allosomes.list - ${BCFTOOLS} query -R allosomes.list -S ~{male} -i 'ID=@~{clean_bed_ids}' -f '[%ID\t%SAMPLE\t%RD_CN\n]' ~{cleaninfo_vcf} \ - | awk '{if ($3!=".") print}' \ - | gzip > RD_CN.sexcheck.FORMAT.male.gz - >>> - - output { - File RD_CN_sexcheck_FORMAT_male="RD_CN.sexcheck.FORMAT.male.gz" - } -} - -task CleanVcf1_8 { - input { - File cleaninfo_vcf - File cleaninfo_vcf_tbi - File allosome_fai - File clean_bed_ids - File female - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(clean_bed_ids, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # use BCFTOOLS 1.9, installed in /usr/local/bin/bcftools in our docker - BCFTOOLS=/usr/local/bin/bcftools - - awk '{print $1"\t0\t"$2}' < ~{allosome_fai} > allosomes.list - ${BCFTOOLS} query -R allosomes.list -S ~{female} -i 'ID=@~{clean_bed_ids}' -f '[%ID\t%SAMPLE\t%RD_CN\n]' ~{cleaninfo_vcf} \ - | awk '{if ($3!=".") print}' \ - | gzip > RD_CN.sexcheck.FORMAT.female.gz - >>> - - output { - File RD_CN_sexcheck_FORMAT_female="RD_CN.sexcheck.FORMAT.female.gz" - } -} - -task CleanVcf1_9 { - input { - File RD_CN_sexcheck_FORMAT_male - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(RD_CN_sexcheck_FORMAT_male, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - zcat ~{RD_CN_sexcheck_FORMAT_male}| Rscript -e 'd<-read.table("stdin")' \ - -e 'x<-tapply(d[,3],d[,1],median)' \ - -e 'write.table(x,"male.median.value.pervar.txt",col.names=FALSE,quote=FALSE,sep = "\t")' - >>> - - output { - File male_median_value_pervar="male.median.value.pervar.txt" - } -} - -task CleanVcf1_10 { - input { - File RD_CN_sexcheck_FORMAT_female - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(RD_CN_sexcheck_FORMAT_female, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - zcat ~{RD_CN_sexcheck_FORMAT_female}| Rscript -e 'd<-read.table("stdin")' \ - -e 'x<-tapply(d[,3],d[,1],median)' \ - -e 'write.table(x,"female.median.value.pervar.txt",col.names=FALSE,quote=FALSE,sep = "\t")' - >>> - - output { - File female_median_value_pervar="female.median.value.pervar.txt" - } -} - -task CreateEmptyFile { - input { - String linux_docker - RuntimeAttr? 
runtime_attr_override - } - RuntimeAttr runtime_default = object { - mem_gb: 1.0, - disk_gb: 10, - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: linux_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - touch empty.txt - >>> - - output { - File empty="empty.txt" - } -} - -task CleanVcf1_11 { - input { - File cleaninfo_vcf - File cleaninfo_vcf_tbi - File clean_bed - File? male_median_value_pervar - File? female_median_value_pervar - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([cleaninfo_vcf, clean_bed, male_median_value_pervar, female_median_value_pervar], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##Pull out ids where male copy state 1 to normal when female normal and on X## - echo "">sexchr.revise.1.txt - - if [ $(awk '{if (($5=="DEL" || $5=="DUP") && $3-$2>=5000) print }' ~{clean_bed}|awk '{if (($1~"X") && $1!~"#" ) print}'|wc -l) -gt 0 ] - then - awk '{if ($2==1) print $1}' ~{male_median_value_pervar} \ - |{ fgrep -wf <(awk '{if ($2==2) print $1}' ~{female_median_value_pervar}) || true; } \ - |{ fgrep -wf - <(zcat ~{cleaninfo_vcf}|awk '{if ($1~"X" && $1!~"#") print $3}') || true; } \ - >sexchr.revise.1.txt - fi - >>> - - output { - File sexchr_revise_1="sexchr.revise.1.txt" - } -} - -task CleanVcf1_12 { - input { - File sexchr_revise_1 - File cleaninfo_vcf - File cleaninfo_vcf_tbi - File clean_bed - File male_median_value_pervar - File female_median_value_pervar - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([sexchr_revise_1, cleaninfo_vcf, clean_bed, male_median_value_pervar, female_median_value_pervar], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - mv ~{sexchr_revise_1} sexchr.revise.2.txt - if [ $(awk '{if (($5=="DEL" || $5=="DUP") && $3-$2>=5000) print }' ~{clean_bed}|awk '{if (($1~"Y") && $1!~"#" ) print}'|wc -l) -gt 0 ] - then - awk '{if ($2==1) print $1}' ~{male_median_value_pervar} \ - |{ fgrep -wf <(awk '{if ($2==0) print $1}' ~{female_median_value_pervar}) || true; } \ - |{ fgrep -wf - <(zcat ~{cleaninfo_vcf}|awk '{if ($1~"Y" && $1!~"#") print $3}') || true; } \ - >>sexchr.revise.2.txt - fi - >>> - - output { - File sexchr_revise_2="sexchr.revise.2.txt" - } -} - - -task CleanVcf1_13 { - input { - File cleaninfo_vcf - File cleaninfo_vcf_csi - File male - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([cleaninfo_vcf, male], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # use BCFTOOLS 1.9, installed in /usr/local/bin/bcftools in our docker - BCFTOOLS=/usr/local/bin/bcftools - - ##Pull out male sex chr## - ${BCFTOOLS} view ~{cleaninfo_vcf} -S ~{male} -r chrX:1-1000000000,chrY:1-1000000000,X:1-1000000000,Y:1-1000000000 --no-update|bgzip>male.vcf.gz - ${BCFTOOLS} index male.vcf.gz - >>> - - output { - File male_vcf="male.vcf.gz" - File male_vcf_csi="male.vcf.gz.csi" - } -} - - -task CleanVcf1_14 { - input { - File cleaninfo_vcf - File cleaninfo_vcf_csi - File female - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([cleaninfo_vcf, female], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # use BCFTOOLS 1.9, installed in /usr/local/bin/bcftools in our docker - BCFTOOLS=/usr/local/bin/bcftools - - ##Pull out female sex chr## - ${BCFTOOLS} view ~{cleaninfo_vcf} -S ~{female} -r chrX:1-1000000000,chrY:1-1000000000,X:1-1000000000,Y:1-1000000000 --no-update|bgzip>female.vcf.gz - ${BCFTOOLS} index female.vcf.gz - >>> - - output { - File female_vcf="female.vcf.gz" - File female_vcf_csi="female.vcf.gz.csi" - } -} - - -task CleanVcf1_15 { - input { - File male_vcf - File sexchr_revise_2 - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([male_vcf, sexchr_revise_2], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - zcat ~{male_vcf}\ - |awk -F'\t' '{if ($5~"DEL" && $1!~"#") print $0 "\t" "ENDOFLINE"}' \ - |{ fgrep -wf ~{sexchr_revise_2} || true; } \ - |tr '\t' '\n' \ - |awk -F':' '{if ($3>=1 && NF>4 && $1!="GT") $1="0/0";else if ($3==0 && NF>4 && $1!="GT" ) $1="0/1"; if (NF>4 && $1!="GT") $3=$3+1;print}' OFS=":" \ - |tr '\n' '\t' \ - |sed 's/ENDOFLINE/\n/g' \ - |sed -e 's/^[ \t]*//' \ - |sed -e 's/[\t]$//g' \ - |bgzip \ - >male_del.revise.txt.gz - zcat male_del.revise.txt.gz|wc -l > count.txt - >>> - - output { - File male_del_revise_txt="male_del.revise.txt.gz" - Int count=read_int("count.txt") - } -} - - -task CleanVcf1_16 { - input { - File male_vcf - File sexchr_revise_2 - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([male_vcf, sexchr_revise_2], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - zcat ~{male_vcf}\ - |awk -F'\t' '{if ($5~"DUP" && $1!~"#") print $0 "\t" "ENDOFLINE"}' \ - |{ fgrep -wf ~{sexchr_revise_2} || true; } \ - |tr '\t' '\n' \ - |awk -F':' '{if ($3<=1 && NF>4 && $1!="GT") $1="0/0";else if ($3==2 && NF>4 && $1!="GT" ) $1="0/1";else if (NF>4 && $1!="GT" ) $1="1/1"; if (NF>4 && $1!="GT" ) $3=$3+1;print}' OFS=":" \ - |tr '\n' '\t' \ - |sed 's/ENDOFLINE/\n/g' \ - |sed -e 's/^[ \t]*//' \ - |sed -e 's/[\t]$//g' \ - |bgzip \ - >male_dup.revise.txt.gz - zcat male_dup.revise.txt.gz|wc -l > count.txt - >>> - - output { - File male_dup_revise_txt="male_dup.revise.txt.gz" - Int count=read_int("count.txt") - } -} - -task CleanVcf1_17 { - input { - File male_vcf - File male_dup_revise_txt - File male_del_revise_txt - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([male_vcf, male_dup_revise_txt, male_del_revise_txt], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 10.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # use BCFTOOLS 1.9, installed in /usr/local/bin/bcftools in our docker - BCFTOOLS=/usr/local/bin/bcftools - - cat <(zcat ~{male_vcf}|fgrep -wvf <(zcat ~{male_dup_revise_txt} ~{male_del_revise_txt}|awk '{print $3}' )) \ - <(zcat ~{male_del_revise_txt} ~{male_dup_revise_txt}|awk '{if ($1!="") print}'|tr ' ' '\t') \ - |vcf-sort \ - |bgzip \ - >cleanmale.vcf.gz - ${BCFTOOLS} index cleanmale.vcf.gz - >>> - - output { - File cleanmale_vcf="cleanmale.vcf.gz" - File cleanmale_vcf_csi="cleanmale.vcf.gz.csi" - } -} - - - -task CleanVcf1_18 { - input { - File female_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(female_vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - zcat ~{female_vcf}\ - |awk -F'\t' '{if ($1!~"#" && $1~"Y") print $0 "\t" "ENDOFLINE"}' \ - |tr '\t' '\n' \ - |awk -F':' '{ if (NF>4 && $1!="GT" ) $1="./." \ - ;if (NF>4 && $1!="GT" ) $2=$3=$4=$5=$6=$7=$8=$9=".";print}' OFS=":" \ - |tr '\n' '\t' \ - |sed 's/ENDOFLINE/\n/g' \ - |sed -e 's/^[ \t]*//' \ - |sed -e 's/[\t]$//g' \ - |bgzip \ - >female.y.revise.txt.gz - >>> - - output { - File female_y_revise_txt="female.y.revise.txt.gz" - } -} - - - -task CleanVcf1_19 { - input { - File female_vcf - File female_y_revise_txt - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([female_vcf, female_y_revise_txt], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # use BCFTOOLS 1.9, installed in /usr/local/bin/bcftools in our docker - BCFTOOLS=/usr/local/bin/bcftools - - cat <(zcat ~{female_vcf} \ - |fgrep -wvf <(zcat ~{female_y_revise_txt}|awk '{print $3}' )) \ - <(zcat ~{female_y_revise_txt}) \ - |vcf-sort \ - |bgzip \ - >cleanfemale.vcf.gz - ${BCFTOOLS} index cleanfemale.vcf.gz - >>> - - output { - File cleanfemale_vcf="cleanfemale.vcf.gz" - File cleanfemale_vcf_csi="cleanfemale.vcf.gz.csi" - } -} - - - -task CleanVcf1_20 { - input { - File cleaninfo_vcf - File cleaninfo_vcf_csi - File ped_file - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([cleaninfo_vcf, ped_file], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # use BCFTOOLS 1.9, installed in /usr/local/bin/bcftools in our docker - BCFTOOLS=/usr/local/bin/bcftools - - awk '{if ($5!=2 && $5!=1) print $2}' ~{ped_file}>other.txt - ${BCFTOOLS} view ~{cleaninfo_vcf} -S other.txt -r chrX:1-1000000000,chrY:1-1000000000,X:1-1000000000,Y:1-1000000000 --no-update|bgzip>other.vcf.gz - ${BCFTOOLS} index other.vcf.gz - >>> - - output { - File other_vcf="other.vcf.gz" - File other_vcf_csi="other.vcf.gz.csi" - } -} - -task CleanVcf1_21 { - input { - File other_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(other_vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - zcat ~{other_vcf}\ - |awk -F'\t' '{if ($1!~"#") print $0 "\t" "ENDOFLINE"}' \ - |tr '\t' '\n' \ - |awk -F':' '{ if (NF>4 && $1!="GT" ) $1="./.";print}' OFS=":" \ - |tr '\n' '\t' \ - |sed 's/ENDOFLINE/\n/g' \ - |sed -e 's/^[ \t]*//' \ - |sed -e 's/[\t]$//g' \ - |bgzip \ - >other.revise.txt.gz - >>> - - output { - File other_revise_txt="other.revise.txt.gz" - } -} - - - -task CleanVcf1_22 { - input { - File other_vcf - File other_revise_txt - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([other_vcf, other_revise_txt], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # use BCFTOOLS 1.9, installed in /usr/local/bin/bcftools in our docker - BCFTOOLS=/usr/local/bin/bcftools - - cat <(zcat ~{other_vcf} \ - |fgrep -wvf <(zcat ~{other_revise_txt}|awk '{print $3}' )) \ - <(zcat ~{other_revise_txt}) \ - |vcf-sort \ - |bgzip \ - >cleanother.vcf.gz - ${BCFTOOLS} index cleanother.vcf.gz - >>> - - output { - File cleanother_vcf="cleanother.vcf.gz" - File cleanother_vcf_csi="cleanother.vcf.gz.csi" - } -} - - - -task CleanVcf1_23 { - input { - File cleanmale_vcf - File cleanfemale_vcf - File cleanother_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([cleanmale_vcf, cleanfemale_vcf, cleanother_vcf], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - cat <(zcat ~{cleanmale_vcf}|egrep "##") \ - <(paste <(zcat ~{cleanmale_vcf}|egrep -v "##") <(zcat ~{cleanfemale_vcf}|cut -f10-|egrep -v "##") <(zcat ~{cleanother_vcf}|cut -f10-|egrep -v "##") ) \ - |bgzip \ - >combinedsex.vcf.gz - tabix -p vcf combinedsex.vcf.gz - >>> - - output { - File combinedsex_vcf="combinedsex.vcf.gz" - File combinedsex_vcf_tbi="combinedsex.vcf.gz.tbi" - } -} - -task CleanVcf1_24 { - input { - File cleanmale_vcf - File cleanfemale_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([cleanmale_vcf, cleanfemale_vcf], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - cat <(zcat ~{cleanmale_vcf}|egrep "##") \ - <(paste <(zcat ~{cleanmale_vcf}|egrep -v "##") <(zcat ~{cleanfemale_vcf}|cut -f10-|egrep -v "##")) \ - |bgzip \ - >combinedsex.vcf.gz - tabix -p vcf combinedsex.vcf.gz - >>> - - output { - File combinedsex_vcf="combinedsex.vcf.gz" - File combinedsex_vcf_tbi="combinedsex.vcf.gz.tbi" - } -} - -task CleanVcf1_25 { - input { - File combinedsex_vcf - File combinedsex_vcf_tbi - File cleaninfo_vcf - File cleaninfo_vcf_tbi - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([combinedsex_vcf, cleaninfo_vcf], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - zcat ~{combinedsex_vcf}|awk '{if ($1!~"#") print $3}'>modified.ids.txt - - ##shuffle sex ids backinto place to match original vcf and back to initial vcf## - vcf-shuffle-cols -t ~{cleaninfo_vcf} ~{combinedsex_vcf} \ - |awk '{if ($1!~"#") print}' \ - |cat <(zcat ~{cleaninfo_vcf}|fgrep -wvf modified.ids.txt ) - \ - |vcf-sort \ - |bgzip \ - >cleanallo.vcf.gz - >>> - - output { - File cleanallo_vcf="cleanallo.vcf.gz" - } -} - -task CleanVcf1_26 { - input { - File background_list - File cleanallo_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([background_list, cleanallo_vcf], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # the code below will not print any lines if the background list file is empty, so add a dummy sentinel record at the end - cat ~{background_list} <(echo "XXX_SENTINEL_XXX") > background_list_with_sentinel.list - - ##change tag for SR background failures and Unresolved## - zcat ~{cleanallo_vcf} \ - |awk -F'\t' -v OFS='\t' 'ARGIND==1{inFileA[$1]; next} {if ($3 in inFileA && $1!~"#") $7=$7";HIGH_SR_BACKGROUND"; print }' <(awk '{print $NF}' background_list_with_sentinel.list) - \ - |awk '{if (NR==2) print $0 "\n" "##FILTER=" ;else print}' \ - |awk '{if ($8~"UNRESOLVED") $7=$7";UNRESOLVED";print}' OFS='\t' \ - |awk '{if (NR==2) print $0 "\n" "##FILTER=" ;else print}' \ - |bgzip \ - >int.vcf.gz - >>> - - output { - File int_vcf="int.vcf.gz" - } -} - -task CleanVcf1_27 { - input { - File int_vcf - File bothsides_pass_list - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([int_vcf, bothsides_pass_list], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - /opt/sv-pipeline/04_variant_resolution/scripts/add_bothsides_support_filter.py \ - --bgzip \ - --outfile int.w_bothsides.vcf.gz \ - ~{int_vcf} \ - ~{bothsides_pass_list} - tabix int.w_bothsides.vcf.gz - >>> - - output { - File intermediate_vcf="int.w_bothsides.vcf.gz" - File intermediate_vcf_idx="int.w_bothsides.vcf.gz.tbi" - } -} diff --git a/wdl/CleanVcf1b.wdl b/wdl/CleanVcf1b.wdl index d9537921b..a93bce670 100644 --- a/wdl/CleanVcf1b.wdl +++ b/wdl/CleanVcf1b.wdl @@ -1,790 +1,354 @@ version 1.0 import "Structs.wdl" +import "CleanVcf5.wdl" as CleanVcf5 +import "TasksMakeCohortVcf.wdl" as MiniTasks workflow CleanVcf1b { - input { - File intermediate_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override # TODO - } - - call CleanVcf1b_1 { - input: - intermediate_vcf=intermediate_vcf, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1b_2 { - input: - int_bed=CleanVcf1b_1.int_bed, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1b_3 { - input: - int_vcf=intermediate_vcf, - normoverlap=CleanVcf1b_2.normoverlap, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1b_4 { - input: - int_vcf=intermediate_vcf, - normoverlap=CleanVcf1b_2.normoverlap, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1b_5 { - input: - normoverlap=CleanVcf1b_2.normoverlap, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1b_6 { - input: - overlap_test=CleanVcf1b_5.overlap_test, - rd_cn_normcheck=CleanVcf1b_3.rd_cn_normcheck, - ev_normcheck=CleanVcf1b_4.ev_normcheck, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1b_7 { - input: - geno_normal_revise=CleanVcf1b_6.geno_normal_revise, - int_vcf=intermediate_vcf, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1b_8 { - input: - subset_vcf=CleanVcf1b_7.subset_vcf, - geno_normal_revise=CleanVcf1b_6.geno_normal_revise, - col=CleanVcf1b_1.col, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1b_9 { - input: - normal_revise_vcf_lines=CleanVcf1b_8.normal_revise_vcf_lines, - int_vcf=intermediate_vcf, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1b_10 { - input: - normal_revise_vcf=CleanVcf1b_9.normal_revise_vcf, - normal_revise_vcf_csi=CleanVcf1b_9.normal_revise_vcf_csi, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1b_11 { - input: - copystate_rd_cn=CleanVcf1b_10.copystate_rd_cn, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1b_12 { - input: - int_bed=CleanVcf1b_1.int_bed, - copystate_per_variant=CleanVcf1b_11.copystate_per_variant, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1b_13 { - input: - int_bed=CleanVcf1b_1.int_bed, - multi_del=CleanVcf1b_12.multi_del, - 
copystate_per_variant=CleanVcf1b_11.copystate_per_variant, - sv_pipeline_docker=sv_pipeline_docker - } - - output { - File multi = CleanVcf1b_13.multi - File normal = CleanVcf1b_9.normal_revise_vcf - File vcftools_idx = CleanVcf1b_9.normal_revise_vcf_csi - } + input { + File intermediate_vcf + String prefix + Int records_per_shard + + String sv_pipeline_docker + String sv_base_mini_docker + String sv_pipeline_updates_docker + + RuntimeAttr? runtime_attr_override_subset_large_cnvs + RuntimeAttr? runtime_attr_override_sort_bed + RuntimeAttr? runtime_attr_override_intersect_bed + RuntimeAttr? runtime_attr_override_build_dict + RuntimeAttr? runtime_attr_override_scatter + RuntimeAttr? runtime_attr_override_filter_vcf + RuntimeAttr? runtime_override_concat_vcfs + RuntimeAttr? runtime_override_cat_multi_cnvs + } + + call SubsetLargeCNVs { + input: + vcf=intermediate_vcf, + prefix="~{prefix}.subset_large_cnvs", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_override_subset_large_cnvs + } + + call Vcf2Bed { + input: + vcf=SubsetLargeCNVs.out, + prefix="~{prefix}.subset_large_cnvs", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_override_subset_large_cnvs + } + + call SortBed { + input: + bed=Vcf2Bed.out, + prefix="~{prefix}.subset_large_cnvs.sorted", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_attr_override_sort_bed + } + + call BedtoolsIntersect { + input: + bed=SortBed.out, + prefix="~{prefix}.bedtools_intersect", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_attr_override_intersect_bed + } + + call BuildGenoNormalReviseDictionary { + input: + filtered_vcf=SubsetLargeCNVs.out, + intersect_bed=BedtoolsIntersect.out, + prefix="~{prefix}.geno_normal_revise", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_override_build_dict + } + + call MiniTasks.ScatterVcf { + input: + vcf=intermediate_vcf, + 
records_per_shard=records_per_shard, + prefix="~{prefix}.scatter_vcf", + sv_pipeline_docker=sv_pipeline_updates_docker, + runtime_attr_override=runtime_attr_override_scatter + } + + scatter ( i in range(length(ScatterVcf.shards)) ) { + call FilterVcf { + input: + intermediate_vcf=ScatterVcf.shards[i], + dictionary_json_gz=BuildGenoNormalReviseDictionary.out, + prefix="~{prefix}.filter_vcf.shard_~{i}", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_override_filter_vcf + } + } + + call MiniTasks.ConcatVcfs as ConcatCleanVcf1bShards { + input: + vcfs=FilterVcf.out, + naive=true, + sort_vcf_list=true, + outfile_prefix="~{prefix}.concat_vcfs", + sv_base_mini_docker=sv_pipeline_updates_docker, + runtime_attr_override=runtime_override_concat_vcfs + } + + call MiniTasks.CatUncompressedFiles as ConcatMultiCnvs { + input: + shards=FilterVcf.multi_cnvs, + outfile_name="~{prefix}.multi.cnvs.txt", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_cat_multi_cnvs + } + + output { + File normal = ConcatCleanVcf1bShards.concat_vcf + File multi = ConcatMultiCnvs.outfile + } } - -task CleanVcf1b_1 { - input { - File intermediate_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(intermediate_vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 2.0 + input_size * 1.5, - disk_gb: ceil(10.0 + input_size * 10.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##gzipped vcf from clean vcf part1.sh## - int_vcf_gz=~{intermediate_vcf} - - ##Remove CNVs that are improperly genotyped by depth because they are nested within a real CNV## - - ##Determine columns of VCF after header## - zcat $int_vcf_gz\ - |sed -n '1,1000p'\ - |egrep ^# \ - |tail -n 1 \ - |tr '\t' '\n' \ - |cat -n - \ - >col.txt - - ##Only affects CNV so pull those out## - zcat $int_vcf_gz \ - |awk '{if ($5~"DEL" || $5~"DUP" || $1~"#") print}' \ - |svtk vcf2bed stdin tmp.bed - awk -F"\t" '{if ($6=="") print $6="blanksample";print $0}' OFS='\t' tmp.bed \ - |gzip>int.bed.gz - >>> - - output { - File col="col.txt" - File int_bed="int.bed.gz" - } - -} - - -task CleanVcf1b_2 { - input { - File int_bed - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(int_bed, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 2.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##list of potenital overlaps with a normal copy state variant (>5kb variants require depth but nested events could be missed; i.e a duplication with a nest deletion will have a normal copy state for the deletion)## - ##flip bed intersect so largest is CNV is always first## - bedtools intersect -wa -wb -a <(zcat ~{int_bed}|awk '{if ($3-$2>=5000 ) print}') \ - -b <(zcat ~{int_bed}|awk '{if ($3-$2>=5000) print}') \ - |awk -F'\t' '{if ($4!=$10 && $3-$2>=$9-$8 && $5!=$11) print ;\ - else if ($4!=$10 && $5!=$11) print $7,$8,$9,$10,$11,$12,$1,$2,$3,$4,$5,$6}' OFS='\t' \ - |awk -F'\t' '{if ($6!="blanksample") print}' \ - |sort -u \ - >normaloverlap.txt - >>> - - output { - File normoverlap="normaloverlap.txt" - } - -} - - -task CleanVcf1b_3 { - input { - File int_vcf - File normoverlap - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([int_vcf, normoverlap], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 7.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##pull out the depth based copy number variant for each normal overlapping variant## - int_vcf_gz=~{int_vcf} - cat <(zcat $int_vcf_gz|awk -F"\t" '{if ($1~"#") print}') \ - <(awk '{print $4 "\n" $10}' ~{normoverlap}|sort -u|fgrep -wf - <(zcat $int_vcf_gz)) \ - |awk '{if ($1!~"#") $1=$3;print}' OFS="\t" \ - |awk '{if ($1~"#" || $5=="" || $5=="") print}' \ - |vcftools --vcf - --stdout --extract-FORMAT-info RD_CN \ - |awk -F"\t" 'NR==1{for (i=3;i<=NF;i++) header[i]=$i} NR>1{for(j=3;j<=NF;j++) print $1"@"header[j] "\t" $j }' \ - |sort -k1,1 \ - |gzip \ - >RD_CN.normalcheck.FORMAT.gz - >>> - - output { - File rd_cn_normcheck="RD_CN.normalcheck.FORMAT.gz" - } - -} - - - -task CleanVcf1b_4 { - input { - File int_vcf - File normoverlap - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([int_vcf, normoverlap], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 7.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##pull out evidence supporting each normal overlapping variant## - int_vcf_gz=~{int_vcf} - cat <(zcat $int_vcf_gz|awk -F"\t" '{if ($1~"#") print}') \ - <(awk '{print $4 "\n" $10}' ~{normoverlap}|sort -u|fgrep -wf - <(zcat $int_vcf_gz)) \ - |awk '{if ($1!~"#") $1=$3;print}' OFS="\t"\ - |vcftools --vcf - --stdout --extract-FORMAT-info EV \ - |awk -F"\t" 'NR==1{for (i=3;i<=NF;i++) header[i]=$i} NR>1{for(j=3;j<=NF;j++) print $1"@"header[j] "\t" $j }' \ - |sort -k1,1 \ - |gzip \ - >EV.normalcheck.FORMAT.gz - >>> - - output { - File ev_normcheck="EV.normalcheck.FORMAT.gz" - } - -} - - -task CleanVcf1b_5 { - input { - File normoverlap - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(normoverlap, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 3.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - - ##check if nested is incorrectly classified as normal## - touch overlap.test.txt - while read bed - do - echo $bed|tr ' ' '\t'|cut -f1-6 >large.bed - echo $bed|tr ' ' '\t'|cut -f7-12>small.bed - ##require at least 50% coverage to consider a variant overlapping## - overlap=$(bedtools coverage -a small.bed -b large.bed|awk '{if ($NF>=0.50) print "YES";else print "NO"}') - - if [ "$overlap" == "YES" ] - then - smallid=$(awk '{print $4}' small.bed) - - ##pull out variants that are called a variants for both the smaller and larger CNVs (don't have normal copy state to check for)## - if [ $(awk '{print $NF}' small.bed \ - |tr ',' '\n' \ - |fgrep -wvf - <(awk -F"[,\t]" -v var=$smallid '{for(i=6;i<=NF;i++) print var"@"$i "\t" $4"@"$i "\t" $5}' large.bed)|wc -l) -gt 0 ] - then - awk '{print $NF}' small.bed \ - |tr ',' '\n' \ - |fgrep -wvf - <(awk -F"[,\t]" -v var=$smallid '{for(i=6;i<=NF;i++) print var"@"$i "\t" $4"@"$i "\t" $5}' large.bed) \ - >>overlap.test.txt - fi - fi - done<~{normoverlap} - >>> - - output { - File 
overlap_test="overlap.test.txt" - } - -} - - -task CleanVcf1b_6 { - input { - File overlap_test - File rd_cn_normcheck - File ev_normcheck - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - Float input_size = size([overlap_test, rd_cn_normcheck, ev_normcheck], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##determine variants that need to be revised from a normal copy state into a CNV## - cat ~{overlap_test} \ - |sort -k1,1 \ - |join -j 1 - <(zcat ~{rd_cn_normcheck}) \ - |join -j 1 - <(zcat ~{ev_normcheck}) \ - |tr ' ' '\t' \ - |sort -k2,2 \ - |join -1 2 -2 1 - <(zcat ~{rd_cn_normcheck}) \ - |awk '{if ($3=="DUP" && $4==2 && $6==3) print $2 "\t" 1; else if ($3=="DEL" && $4==2 && $6==1) print $2 "\t" 3 }' \ - |tr '@' '\t'\ - >geno.normal.revise.txt - - >>> - - output { - File geno_normal_revise="geno.normal.revise.txt" - } - +task SubsetLargeCNVs { + input { + File vcf + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + + Float input_size = size(vcf, "GB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 2.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + bcftools view --no-version \ + -i '(INFO/SVTYPE=="DEL" || INFO/SVTYPE=="DUP") && INFO/SVLEN>=5000' \ + ~{vcf} \ + | bgzip \ + > ~{prefix}.vcf.gz + >>> + output { + File out = "~{prefix}.vcf.gz" + } } - -task CleanVcf1b_7 { - input { - File int_vcf - File geno_normal_revise - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([int_vcf, geno_normal_revise], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 3.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##Update genotypes## - { zfgrep -wf <(awk '{print $1}' ~{geno_normal_revise}|sort -u) ~{int_vcf} || true; }\ - |bgzip \ - >subset.vcf.gz - >>> - - output { - File subset_vcf="subset.vcf.gz" - } - +task Vcf2Bed { + input { + File vcf + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + + Float input_size = size(vcf, "GB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 2.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + svtk vcf2bed --no-header ~{vcf} stdout \ + | awk -F'\t' -v OFS='\t' '{if ($6=="") $6="blanksample";print $0}' \ + | gzip -1 \ + > ~{prefix}.bed.gz + >>> + output { + File out = "~{prefix}.bed.gz" + } } - -task CleanVcf1b_8 { - input { - File subset_vcf - File geno_normal_revise - File col - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([subset_vcf, geno_normal_revise, col], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 10.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - - python3 < normal.revise.vcf.lines.txt - import gzip - import sys - - VCF='~{subset_vcf}' - REVISE='~{geno_normal_revise}' - COL='~{col}' - - # Grab regenotyped samples of interest - sys.stderr.write("Reading {}...\n".format(REVISE)) - geno_dict = {} - with open(REVISE) as f: - for line in f: - tokens = line.strip().split('\t') - vid = tokens[0] - if vid not in geno_dict: - geno_dict[vid] = [] - geno_dict[vid].append(tokens[1]) # id.txt but only sample id - - # Column definitions - sys.stderr.write("Reading {}...\n".format(COL)) - sample_columns_dict = {} - with open(COL) as f: - for line in f: - tokens = line.strip().split('\t') - sample_columns_dict[tokens[1]] = int(tokens[0]) - 1 - - # Assign GT/GQ - sys.stderr.write("Reassigning genotypes...\n") - with gzip.open(VCF, 'rb') as f: - for lineb in f: - line = lineb.decode('utf-8').strip() - vid = line.split('\t', 3)[2] - if vid in geno_dict: - sample_ids = geno_dict[vid] - tokens = line.split('\t') - sample_indexes = [sample_columns_dict[s] for s 
in sample_ids] - for i in sample_indexes: - entry = tokens[i].split(':', 4) - entry[0] = "0/1" - entry[1] = entry[3] - tokens[i] = ":".join(entry) - sys.stdout.write("{}\t\n".format("\t".join(tokens))) - else: - sys.stdout.write("{}\t\n".format(line)) - CODE - >>> - - output { - File normal_revise_vcf_lines="normal.revise.vcf.lines.txt" - } - +task SortBed { + input { + File bed + String prefix + String sv_base_mini_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size(bed, "GB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 10.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_base_mini_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + mkdir tmp + zcat ~{bed} \ + | sort -T tmp -k1,1 -k2,2n \ + | gzip -1 \ + > ~{prefix}.bed.gz + >>> + output { + File out = "~{prefix}.bed.gz" + } } - - -task CleanVcf1b_9 { - input { - File int_vcf - File normal_revise_vcf_lines - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([int_vcf, normal_revise_vcf_lines], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 15, - disk_gb: ceil(10.0 + input_size * 50.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##rewrite vcf with updated genotypes## - awk '{print $3}' ~{normal_revise_vcf_lines}|sort -u > vids.list - cat <(zcat ~{int_vcf} | fgrep -wvf vids.list) \ - <(sed 's/\t$//' ~{normal_revise_vcf_lines}) \ - |vcf-sort \ - |bgzip \ - >normal.revise.vcf.gz - - bcftools index normal.revise.vcf.gz - >>> - - output { - File normal_revise_vcf="normal.revise.vcf.gz" - File normal_revise_vcf_csi="normal.revise.vcf.gz.csi" - } - +task BedtoolsIntersect { + input { + File bed + String prefix + String sv_base_mini_docker + RuntimeAttr? 
runtime_attr_override + } + + Float input_size = size(bed, "GB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 10.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_base_mini_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + bedtools intersect -sorted -wa -wb -a <(zcat ~{bed}) -b <(zcat ~{bed}) \ + | awk -F'\t' -v OFS='\t' '$4!=$10 && $5!=$11' \ + | gzip -1 \ + > ~{prefix}.bed.gz + >>> + output { + File out = "~{prefix}.bed.gz" + } } - -task CleanVcf1b_10 { - input { - File normal_revise_vcf - File normal_revise_vcf_csi - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(normal_revise_vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 15, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##get copy state per variant## - zcat ~{normal_revise_vcf} \ - |awk '{if ($1!~"#") $1=$3;print}' OFS="\t" \ - |vcftools --vcf - --stdout --extract-FORMAT-info RD_CN \ - |gzip \ - >copystate.RD_CN.FORMAT.gz - >>> - - output { - File copystate_rd_cn="copystate.RD_CN.FORMAT.gz" - } - +task BuildGenoNormalReviseDictionary { + input { + File filtered_vcf + File intersect_bed + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + + Float input_size = size([filtered_vcf, intersect_bed], "GB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 2.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + python /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_build_dict.py ~{filtered_vcf} ~{intersect_bed} \ + | gzip -1 \ + > ~{prefix}.json.gz + >>> + output { + File out = "~{prefix}.json.gz" + } } - -task CleanVcf1b_11 { - input { - File copystate_rd_cn - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(copystate_rd_cn, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 15, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##get copy state per variant## - zcat ~{copystate_rd_cn} \ - |awk 'NR>1{for(i=3;i<=NF;i++) lines[$1 "\t" $i]++ } END{for (x in lines) print x}' \ - |gzip \ - >copystate.per.variant.txt.gz - >>> - - output { - File copystate_per_variant="copystate.per.variant.txt.gz" - } - -} - - -task CleanVcf1b_12 { - input { - File int_bed - File copystate_per_variant - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([int_bed, copystate_per_variant], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 15, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##Find multi-allelic for del or dup ; CNV >1kb we trust depth ## - ##del## - zcat ~{copystate_per_variant} \ - |awk '{if ($2!="." && $2>3) print $1}' \ - |sort -u \ - |{ fgrep -wf <(zcat ~{int_bed}|awk -F"\t" '{if ($5=="DEL" && $3-$2>=1000) print $4}' ) || true; } \ - >multi.cnvs.del.txt - >>> - - output { - File multi_del="multi.cnvs.del.txt" - } - -} - - -task CleanVcf1b_13 { - input { - File int_bed - File multi_del - File copystate_per_variant - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([int_bed, multi_del, copystate_per_variant], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 15, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##dup## - mv ~{multi_del} multi.cnvs.txt - zcat ~{copystate_per_variant} \ - |awk '{if ($2!="." && ($2<1 || $2>4)) print $1}' \ - |sort -u \ - |{ fgrep -wf <(zcat ~{int_bed}|awk -F"\t" '{if ($5=="DUP" && $3-$2>=1000) print $4}' ) || true; } \ - >>multi.cnvs.txt - >>> - - output { - File multi="multi.cnvs.txt" - } - +task FilterVcf { + input { + File intermediate_vcf + File dictionary_json_gz + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + + Float input_size = size([intermediate_vcf, dictionary_json_gz], "GB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 2.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + python /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_filter.py ~{dictionary_json_gz} ~{intermediate_vcf} \ + | bgzip \ + > ~{prefix}.vcf.gz + mv multi.cnvs.txt ~{prefix}.multi.cnvs.txt + >>> + output { + File out = "~{prefix}.vcf.gz" + File multi_cnvs = "~{prefix}.multi.cnvs.txt" + } } diff --git a/wdl/CleanVcf5.wdl b/wdl/CleanVcf5.wdl index 7d8d7c47a..a1b75f839 100644 --- a/wdl/CleanVcf5.wdl +++ b/wdl/CleanVcf5.wdl @@ -1,1817 +1,262 @@ version 1.0 import "Structs.wdl" +import "TasksMakeCohortVcf.wdl" as tasks workflow CleanVcf5 { - input { - File revise_vcf_lines - File normal_revise_vcf - File ped_file - File sex_chr_revise - File multi_ids - File? outlier_samples_list - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override # TODO - } - - call CleanVcf5_1 { - input: - normal_revise_vcf=normal_revise_vcf, - revise_vcf_lines=revise_vcf_lines, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_2 { - input: - overlap_revise_vcf=CleanVcf5_1.overlap_revise_vcf, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_3 { - input: - overlap_revise_vcf=CleanVcf5_1.overlap_revise_vcf, - outlier_samples_list=outlier_samples_list, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_4 { - input: - overlap_revise_vcf=CleanVcf5_1.overlap_revise_vcf, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_5 { - input: - copystate_rd_cn_format=CleanVcf5_3.copystate_rd_cn_format, - overlap_revise_bed=CleanVcf5_2.overlap_revise_bed, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_6 { - input: - copystate_rd_cn_format=CleanVcf5_3.copystate_rd_cn_format, - overlap_revise_bed=CleanVcf5_2.overlap_revise_bed, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_7 { - input: - copystate_rd_cn_format=CleanVcf5_3.copystate_rd_cn_format, - overlap_revise_bed=CleanVcf5_2.overlap_revise_bed, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_8 { - input: - copystate_rd_cn_format=CleanVcf5_3.copystate_rd_cn_format, - overlap_revise_bed=CleanVcf5_2.overlap_revise_bed, - gt4copystate=CleanVcf5_7.gt4copystate, - multi_dup_ids_1=CleanVcf5_6.multi_dup_ids_1, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_9 { - input: - overlap_revise_vcf=CleanVcf5_1.overlap_revise_vcf, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_10 { - input: - genotype_gt_format=CleanVcf5_4.genotype_gt_format, - multi_dup_ids=CleanVcf5_8.multi_dup_ids, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_11 { - input: - genotype_gt_format=CleanVcf5_4.genotype_gt_format, - multi_del_ids=CleanVcf5_5.multi_del_ids, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_12 { - input: - 
multi_dup_ids=CleanVcf5_8.multi_dup_ids, - regeno_bed=CleanVcf5_9.regeno_bed, - gt5kb_dup_ids_1=CleanVcf5_10.gt5kb_dup_ids_1, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_13 { - input: - multi_del_ids=CleanVcf5_5.multi_del_ids, - gt5kb_del_ids_1=CleanVcf5_11.gt5kb_del_ids_1, - regeno_bed=CleanVcf5_9.regeno_bed, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_14 { - input: - overlap_revise_vcf=CleanVcf5_1.overlap_revise_vcf, - gt5kb_dup_ids=CleanVcf5_12.gt5kb_dup_ids, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_15 { - input: - overlap_revise_vcf=CleanVcf5_1.overlap_revise_vcf, - gt5kb_del_ids=CleanVcf5_13.gt5kb_del_ids, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_16 { - input: - del_int=CleanVcf5_15.del_int, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_17 { - input: - dup_int=CleanVcf5_14.dup_int, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_18 { - input: - overlap_revise_vcf=CleanVcf5_1.overlap_revise_vcf, - gt5kb_dup_ids=CleanVcf5_12.gt5kb_dup_ids, - gt5kb_del_ids=CleanVcf5_13.gt5kb_del_ids, - dup_revise=CleanVcf5_17.dup_revise, - del_revise=CleanVcf5_16.del_revise, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_19 { - input: - multi_geno_ids_txt=multi_ids, - multi_del_ids=CleanVcf5_5.multi_del_ids, - multi_dup_ids=CleanVcf5_8.multi_dup_ids, - newdepth_geno_vcf=CleanVcf5_18.newdepth_geno_vcf, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_20 { - input: - multi_dup_ids=CleanVcf5_8.multi_dup_ids, - multitagged_vcf=CleanVcf5_19.multitagged_vcf, - multitagged_vcf_tbi=CleanVcf5_19.multitagged_vcf_tbi, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_21 { - input: - multi_del_ids=CleanVcf5_5.multi_del_ids, - multitagged_vcf=CleanVcf5_19.multitagged_vcf, - multitagged_vcf_tbi=CleanVcf5_19.multitagged_vcf_tbi, - dup_multi_revise_vcf=CleanVcf5_20.dup_multi_revise_vcf, - 
all_multi_revised_list_1=CleanVcf5_20.all_multi_revised_list_1, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_22 { - input: - multitagged_vcf=CleanVcf5_19.multitagged_vcf, - multitagged_vcf_tbi=CleanVcf5_19.multitagged_vcf_tbi, - dup_multi_revise_vcf=CleanVcf5_20.dup_multi_revise_vcf, - del_multi_revise_vcf=CleanVcf5_21.del_multi_revise_vcf, - all_multi_revised_list_2=CleanVcf5_21.all_multi_revised_list_2, - new_header=CleanVcf5_21.new_header, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_23 { - input: - multitagged_vcf=CleanVcf5_19.multitagged_vcf, - multitagged_vcf_tbi=CleanVcf5_19.multitagged_vcf_tbi, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_24 { - input: - multi_bed=CleanVcf5_23.multi_bed, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_25 { - input: - multi_bed_overlap=CleanVcf5_24.multi_bed_overlap, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_26 { - input: - multitagged_geno_vcf=CleanVcf5_22.multitagged_geno_vcf, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_27 { - input: - multitagged_geno_vcf=CleanVcf5_22.multitagged_geno_vcf, - multi_remove=CleanVcf5_25.multi_remove, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_28 { - input: - cleantagandmulti_vcf=CleanVcf5_27.cleantagandmulti_vcf, - sv_pipeline_docker=sv_pipeline_docker - } - - if (CleanVcf5_28.out > 0) { - call CleanVcf5_29_TRUE_1 { - input: - famfile=ped_file, - cleantagandmulti_vcf=CleanVcf5_27.cleantagandmulti_vcf, - sv_pipeline_docker=sv_pipeline_docker + input { + File normal_revise_vcf + File revise_vcf_lines + File ped_file + File sex_chr_revise + File multi_ids + File? outlier_samples_list + + String prefix + String contig + Int records_per_shard + + File? make_clean_gq_script + File? find_redundant_sites_script + + String sv_base_mini_docker + String sv_pipeline_docker + + Int? threads_per_task + RuntimeAttr? runtime_attr_override_scatter + RuntimeAttr? 
runtime_attr_override_make_cleangq + RuntimeAttr? runtime_attr_override_find_redundant_multiallelics + RuntimeAttr? runtime_attr_override_polish } - call CleanVcf5_29_TRUE_2 { - input: - malecols=CleanVcf5_29_TRUE_1.malecols, - sex_chr_revise=sex_chr_revise, - cleantagandmulti_vcf=CleanVcf5_27.cleantagandmulti_vcf, - sv_pipeline_docker=sv_pipeline_docker + call tasks.ScatterVcf { + input: + vcf=normal_revise_vcf, + records_per_shard = records_per_shard, + prefix = "~{prefix}.scatter_vcf", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_override_scatter } - call CleanVcf5_29_TRUE_3 { - input: - cleantagandmulti_vcf=CleanVcf5_27.cleantagandmulti_vcf, - sexchr_backtoorig=CleanVcf5_29_TRUE_2.sexchr_backtoorig, - sv_pipeline_docker=sv_pipeline_docker + scatter ( i in range(length(ScatterVcf.shards)) ) { + call MakeCleanGQ { + input: + revise_vcf_lines=revise_vcf_lines, + normal_revise_vcf=ScatterVcf.shards[i], + ped_file=ped_file, + sex_chr_revise=sex_chr_revise, + multi_ids=multi_ids, + outlier_samples_list=outlier_samples_list, + make_clean_gq_script=make_clean_gq_script, + prefix="~{prefix}.make_clean_gq.shard_~{i}", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_override_make_cleangq + } } - } - - File cleansexcn_vcf_ = select_first([CleanVcf5_29_TRUE_3.cleansexcn_vcf, CleanVcf5_27.cleantagandmulti_vcf]) - call CleanVcf5_30 { - input: - cleansexcn_vcf=cleansexcn_vcf_, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_31 { - input: - cleansexcn_vcf=cleansexcn_vcf_, - blankcheck_ids=CleanVcf5_30.blankcheck_ids, - sv_pipeline_docker=sv_pipeline_docker - } - - output { - File polished = CleanVcf5_31.polished - } -} - -task CleanVcf5_1 { - input { - File normal_revise_vcf - File revise_vcf_lines - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([normal_revise_vcf, revise_vcf_lines], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 20.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - cat <(zcat ~{normal_revise_vcf}|fgrep -wvf <(zcat ~{revise_vcf_lines}|awk '{if ($1!="") print $3}'|sort -u)) \ - <(zcat ~{revise_vcf_lines}|awk '{if ($1!="") print}' |tr ' ' '\t') \ - |vcf-sort \ - |bgzip \ - >overlap.revise.vcf.gz - >>> - - output { - File overlap_revise_vcf="overlap.revise.vcf.gz" - } -} - -task CleanVcf5_2 { - input { - File overlap_revise_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(overlap_revise_vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##create bed of VCF## - svtk vcf2bed ~{overlap_revise_vcf} overlap.revise.bed - gzip overlap.revise.bed - >>> - - output { - File overlap_revise_bed="overlap.revise.bed.gz" - } -} - -task CleanVcf5_3 { - input { - File overlap_revise_vcf - File? outlier_samples_list - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([overlap_revise_vcf, overlap_revise_vcf], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ~{if defined(outlier_samples_list) then "ln ~{outlier_samples_list} outliers.txt" else "touch outliers.txt"} - ##multi check## - zcat ~{overlap_revise_vcf} \ - |awk '{if ($1!~"#") $1=$3;print}' OFS="\t" \ - |vcftools --vcf - --remove outliers.txt --stdout --extract-FORMAT-info RD_CN \ - |gzip \ - >copystate.RD_CN.FORMAT.gz - >>> - - output { - File copystate_rd_cn_format="copystate.RD_CN.FORMAT.gz" - } -} - -task CleanVcf5_4 { - input { - File overlap_revise_vcf - File? outlier_samples_list - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([overlap_revise_vcf, outlier_samples_list], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ~{if defined(outlier_samples_list) then "ln ~{outlier_samples_list} outliers.txt" else "touch outliers.txt"} - zcat ~{overlap_revise_vcf} \ - |awk '{if ($1!~"#") $1=$3;print}' OFS="\t" \ - |vcftools --vcf - --remove outliers.txt --stdout --extract-FORMAT-info GT \ - |gzip \ - >genotype.gt.FORMAT.gz - >>> - - output { - File genotype_gt_format="genotype.gt.FORMAT.gz" - } -} - -task CleanVcf5_5 { - input { - File copystate_rd_cn_format - File overlap_revise_bed - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([copystate_rd_cn_format, overlap_revise_bed], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##New method for determining copy state based on >1% of people having an multi-allelic copy state as define above## - vf_1=$(zcat ~{copystate_rd_cn_format}|awk 'NR==1{print (NF-2) * 0.01}'|awk '{if ($1<=1) print 2; else print }' ) - - zcat ~{copystate_rd_cn_format} \ - |{ fgrep -wf <(zcat ~{overlap_revise_bed} |awk -F"\t" '{if ($5=="DEL" && $3-$2>=1000) print $4}' ) || true; } \ - |awk 'NR>1{for(i=3;i<=NF;i++) if ($i!="." && $i>3) print $1 }' \ - |sort \ - |uniq -c \ - |awk -v vf_1=$vf_1 '{if ($1>vf_1)print $2}' \ - |gzip \ - >multi.del.ids.txt.gz - >>> - - output { - File multi_del_ids="multi.del.ids.txt.gz" - } -} - -task CleanVcf5_6 { - input { - File copystate_rd_cn_format - File overlap_revise_bed - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([copystate_rd_cn_format, overlap_revise_bed], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - vf_1=$(zcat ~{copystate_rd_cn_format}|awk 'NR==1{print (NF-2) * 0.01}'|awk '{if ($1<=1) print 2; else print }' ) - - zcat ~{copystate_rd_cn_format} \ - |{ fgrep -wf <(zcat ~{overlap_revise_bed}|awk -F"\t" '{if ($5=="DUP" && $3-$2>=1000) print $4}' ) || true; } \ - |awk 'NR>1{for(i=3;i<=NF;i++) if ($i!="." && $i>4) print $1 }' \ - |sort \ - |uniq -c \ - |awk -v vf_1=$vf_1 '{if ($1>vf_1)print $2}' \ - >multi.dup.ids.txt - >>> - - output { - File multi_dup_ids_1="multi.dup.ids.txt" - } -} - -task CleanVcf5_7 { - input { - File copystate_rd_cn_format - File overlap_revise_bed - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([copystate_rd_cn_format, overlap_revise_bed], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##Case with CN 0,1,2,3,4## - zcat ~{copystate_rd_cn_format} \ - |{ fgrep -wf <(zcat ~{overlap_revise_bed} | awk -F"\t" '{if ($5=="DUP" && $3-$2>=1000) print $4}') || true; } \ - |awk 'NR>1{for(i=3;i<=NF;i++) if ($i!="." && ($i<1 || $i>4)) print $1 "\t" $i }'\ - |sort -u \ - |awk '{print $1}' \ - |sort \ - |uniq -c \ - |awk '{if ($1>4) print $2}'>gt4copystate.txt - >>> - - output { - File gt4copystate="gt4copystate.txt" - } -} - -task CleanVcf5_8 { - input { - File copystate_rd_cn_format - File overlap_revise_bed - File gt4copystate - File multi_dup_ids_1 - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([copystate_rd_cn_format, overlap_revise_bed, gt4copystate, multi_dup_ids_1], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - vf_1=$(zcat ~{copystate_rd_cn_format}|awk 'NR==1{print (NF-2) * 0.01}'|awk '{if ($1<=1) print 2; else print }' ) - - mv ~{multi_dup_ids_1} multi.dup.ids.txt - zcat ~{copystate_rd_cn_format} \ - | awk -F'\t' -v OFS='\t' 'ARGIND==1{inFileA[$1]; next} {if ($1 in inFileA) print }' <(zcat ~{overlap_revise_bed}|awk -F"\t" '{if ($5=="DUP" && $3-$2>=1000) print $4}') - \ - | awk 'NR>1{for(i=3;i<=NF;i++) if ($i!="." && ($i<1 || $i>4)) print $1 }' \ - | sort \ - | uniq -c \ - | awk -F'\t' -v OFS='\t' 'ARGIND==1{inFileA[$1]; next} {if ($1 in inFileA) print }' ~{gt4copystate} - \ - | awk -v vf_1=$vf_1 '{if ($1>vf_1)print $2}' \ - >> multi.dup.ids.txt - - sort -u multi.dup.ids.txt |gzip >multi.dup.ids.txt.gz - >>> - - output { - File multi_dup_ids="multi.dup.ids.txt.gz" - } -} - -task CleanVcf5_9 { - input { - File overlap_revise_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(overlap_revise_vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##Regenotype to determine multiallelic; we just change copy state for some nested variants and we need to make sure we get proper genotype for these; also previous stages have different notaion for multiallelic and we need to make this uniform; this is a CN based regenotyping so restricted to >5kb ## - ##Genotype big dup## - svtk vcf2bed ~{overlap_revise_vcf} regeno.bed - gzip regeno.bed - >>> - - output { - File regeno_bed="regeno.bed.gz" - } -} - -task CleanVcf5_10 { - input { - File genotype_gt_format - File multi_dup_ids - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([genotype_gt_format, multi_dup_ids], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##add variants that are <5kb because clustering but have a mutliallelic genotype from before## - zcat ~{genotype_gt_format} \ - |awk '{if ($1~"DUP") print}' \ - |awk '{for (i = 3; i <= NF; ++i) print $1 "\t" $i}' \ - |awk '{if ($2!="1/1" && $2!="0/0" && $2!="0/1" && $2!="./.") print $1}' \ - |{ fgrep -wvf <(zcat ~{multi_dup_ids}) || true; } \ - |sort -u \ - >gt5kb.dup.ids.txt - >>> - - output { - File gt5kb_dup_ids_1="gt5kb.dup.ids.txt" - } -} - -task CleanVcf5_11 { - input { - File genotype_gt_format - File multi_del_ids - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([genotype_gt_format, multi_del_ids], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - zcat ~{genotype_gt_format} \ - |awk '{if ($1~"DEL") print}' \ - |awk '{for (i = 3; i <= NF; ++i) print $1 "\t" $i}' \ - |awk '{if ($2!="1/1" && $2!="0/0" && $2!="0/1" && $2!="./.") print $1}' \ - |{ fgrep -wvf <(zcat ~{multi_del_ids}) || true; } \ - |sort -u \ - >gt5kb.del.ids.txt - >>> - - output { - File gt5kb_del_ids_1="gt5kb.del.ids.txt" - } -} - -task CleanVcf5_12 { - input { - File multi_dup_ids - File regeno_bed - File gt5kb_dup_ids_1 - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([multi_dup_ids, regeno_bed, gt5kb_dup_ids_1], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - mv ~{gt5kb_dup_ids_1} gt5kb.dup.ids.txt - ##generate list## - ##CNV >5kb, split del and dup ## - if [ -f ~{multi_dup_ids} ] - then - zcat ~{regeno_bed} \ - |awk '{if ($3-$2>=5000 && $5=="DUP")print $4}' \ - |{ fgrep -wvf <(zcat ~{multi_dup_ids}) || true; } \ - >>gt5kb.dup.ids.txt - else - zcat ~{regeno_bed} \ - |awk '{if ($3-$2>=5000 && $5=="DUP")print $4}' \ - >>gt5kb.dup.ids.txt - fi - >>> - - output { - File gt5kb_dup_ids="gt5kb.dup.ids.txt" - } -} - -task CleanVcf5_13 { - input { - File multi_del_ids - File gt5kb_del_ids_1 - File regeno_bed - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([multi_del_ids, gt5kb_del_ids_1, regeno_bed], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - mv ~{gt5kb_del_ids_1} gt5kb.del.ids.txt - if [ -f ~{multi_del_ids} ] - then - zcat ~{regeno_bed} \ - |awk '{if ($3-$2>=5000 && $5=="DEL")print $4}' \ - |{ fgrep -wvf <(zcat ~{multi_del_ids}) || true; } \ - >>gt5kb.del.ids.txt - else - zcat ~{regeno_bed} \ - |awk '{if ($3-$2>=5000 && $5=="DEL")print $4}' \ - >>gt5kb.del.ids.txt - fi - - >>> - - output { - File gt5kb_del_ids="gt5kb.del.ids.txt" - } -} - -task CleanVcf5_14 { - input { - File overlap_revise_vcf - File gt5kb_dup_ids - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([overlap_revise_vcf, gt5kb_dup_ids], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - zcat ~{overlap_revise_vcf} \ - |fgrep -wf ~{gt5kb_dup_ids} \ - >dup.int.txt || true - >>> - - output { - File dup_int="dup.int.txt" - } -} - -task CleanVcf5_15 { - input { - File overlap_revise_vcf - File gt5kb_del_ids - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([overlap_revise_vcf, gt5kb_del_ids], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - zcat ~{overlap_revise_vcf} \ - |fgrep -wf ~{gt5kb_del_ids} \ - >>del.int.txt || true - >>> - output { - File del_int="del.int.txt" - } -} - -task CleanVcf5_16 { - input { - File del_int - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(del_int, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##regenotype VCF## - dellen=$(cat ~{del_int}|wc -l) - columnlen=$(less ~{del_int}|cut -f10-|tr '\t' '\n' |wc -l) - dellenchange=$(echo $dellen $columnlen|awk '{if ($1 == 0) { print "0" } else { print $2/$1}}') - - paste <(less ~{del_int}|cut -f1-9) <(less ~{del_int}|cut -f10-|tr '\t' '\n' \ - |awk -F':' '{if ($3>=2 && $1!="./.") $1="0/0"; \ - else if ($3==1 && $1!="./.") $1="0/1"; \ - else if ($1!="./.")$1="1/1";print}' OFS=":" \ - |awk -v lenchange=$dellenchange 'NR%lenchange {printf("%s\t", $0); next} \ - {print $0}')>del.revise.txt - >>> - - output { - File del_revise="del.revise.txt" - } -} - -task CleanVcf5_17 { - input { - File dup_int - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(dup_int, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - duplen=$(cat ~{dup_int}|wc -l) - columnlen=$(less ~{dup_int}|cut -f10-|tr '\t' '\n' |wc -l) - duplenchange=$(echo $duplen $columnlen|awk '{if ($1 == 0) { print "0" } else { print $2/$1}}') - - paste <(less ~{dup_int}|cut -f1-9) <(less ~{dup_int}|cut -f10-|tr '\t' '\n' \ - |awk -F':' '{if ($3<=2 && $1!="./.") $1="0/0"; \ - else if ($3==3 && $1!="./.") $1="0/1"; \ - else if ($1!="./.") $1="1/1";print}' OFS=":" \ - |awk -v lenchange=$duplenchange 'NR%lenchange {printf("%s\t", $0); next} \ - {print $0}') >dup.revise.txt - >>> - - output { - File dup_revise="dup.revise.txt" - } -} - -task CleanVcf5_18 { - input { - File overlap_revise_vcf - File gt5kb_dup_ids - File gt5kb_del_ids - File dup_revise - File del_revise - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([overlap_revise_vcf, gt5kb_dup_ids, gt5kb_del_ids, dup_revise, del_revise], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 10.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - cat <(zcat ~{overlap_revise_vcf}|fgrep -wvf <(cat ~{gt5kb_dup_ids} ~{gt5kb_del_ids})) \ - <(cat ~{dup_revise} ~{del_revise}) \ - |vcf-sort \ - |bgzip \ - >newdepth.geno.vcf.gz - >>> - - output { - File newdepth_geno_vcf="newdepth.geno.vcf.gz" - } -} - -task CleanVcf5_19 { - input { - File multi_geno_ids_txt - File multi_del_ids - File multi_dup_ids - File newdepth_geno_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([multi_geno_ids_txt, multi_del_ids, multi_dup_ids, newdepth_geno_vcf, newdepth_geno_vcf], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##Tag multi## - ##Add filters to header## - zcat ~{newdepth_geno_vcf} \ - |awk -F'\t' -v OFS='\t' 'ARGIND==1{inFileA[$1]; next} {if ($3 in inFileA && $1!~"#" && $7!~"PESR_GT_OVERDISPERSION") $7=$7";PESR_GT_OVERDISPERSION"; print }' \ - <(cat <(zcat ~{multi_geno_ids_txt}) <(printf "\n")) - \ - |awk -F'\t' -v OFS='\t' 'ARGIND==1{inFileA[$1]; next} {if ($3 in inFileA && $1!~"#") $7=$7";MULTIALLELIC"; print }' \ - <(cat <(zcat ~{multi_del_ids} ~{multi_dup_ids} |sort -u) <(printf "\n")) - \ - |sed 's\PASS;\\g' \ - |awk '{if (NR==2) print $0 "\n" "##FILTER=" ;else print}' \ - |awk '{if (NR==2) print $0 "\n" "##FILTER=" ;else print}' \ - |bgzip \ - >multitagged.vcf.gz - tabix multitagged.vcf.gz - >>> - - output { - File multitagged_vcf="multitagged.vcf.gz" - File multitagged_vcf_tbi="multitagged.vcf.gz.tbi" - } -} - -task CleanVcf5_20 { - input { - File multi_dup_ids - File multitagged_vcf - File multitagged_vcf_tbi - String sv_pipeline_docker - 
RuntimeAttr? runtime_attr_override - } - - Float input_size = size([multi_dup_ids, multitagged_vcf], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # use BCFTOOLS 1.9 - BCFTOOLS=/usr/local/bin/bcftools - - touch all.multi.revised.list - touch dup.multi.revise.vcf - if [ $(zcat ~{multi_dup_ids}|wc -l) -ge 1 ] - then - /opt/sv-pipeline/04_variant_resolution/scripts/reset_multiallelic_format_fields.py ~{multitagged_vcf} <(zcat ~{multi_dup_ids}) > dup.multi.revise.vcf - ${BCFTOOLS} query -f '%ID\n' dup.multi.revise.vcf >> all.multi.revised.list - fi - >>> - - output { - File dup_multi_revise_vcf="dup.multi.revise.vcf" - File all_multi_revised_list_1="all.multi.revised.list" - } -} - -task CleanVcf5_21 { - input { - File multi_del_ids - File multitagged_vcf - File multitagged_vcf_tbi - File dup_multi_revise_vcf - File all_multi_revised_list_1 - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([multi_del_ids, multitagged_vcf, dup_multi_revise_vcf, all_multi_revised_list_1], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # use BCFTOOLS 1.9 - BCFTOOLS=/usr/local/bin/bcftools - - mv ~{all_multi_revised_list_1} all.multi.revised.list - touch del.multi.revise.vcf - if [ $(zcat ~{multi_del_ids}|wc -l) -ge 1 ] - then - /opt/sv-pipeline/04_variant_resolution/scripts/reset_multiallelic_format_fields.py ~{multitagged_vcf} <(zcat ~{multi_del_ids}) > del.multi.revise.vcf - ${BCFTOOLS} query -f '%ID\n' del.multi.revise.vcf >> all.multi.revised.list - fi - - # make sure that the new header includes CN and CNQ format fields if we set any - if [ -s ~{dup_multi_revise_vcf} ] - then - grep '^#' ~{dup_multi_revise_vcf} > new_header.vcf - elif [ -s del.multi.revise.vcf ] - then - grep '^#' del.multi.revise.vcf > new_header.vcf - else - zcat ~{multitagged_vcf} | grep '^#' > new_header.vcf - fi - >>> - - output { - File del_multi_revise_vcf="del.multi.revise.vcf" - File all_multi_revised_list_2="all.multi.revised.list" - File new_header="new_header.vcf" - } -} - - -task CleanVcf5_22 { - 
input { - File multitagged_vcf - File multitagged_vcf_tbi - File dup_multi_revise_vcf - File del_multi_revise_vcf - File all_multi_revised_list_2 - File new_header - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - Float input_size = size([multitagged_vcf, dup_multi_revise_vcf, del_multi_revise_vcf, all_multi_revised_list_2, new_header], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 20.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # use BCFTOOLS 1.9 - BCFTOOLS=/usr/local/bin/bcftools - - # combine the revised variants with the unrevised variants, reheader, resort, and compress - cat <(zcat ~{multitagged_vcf} | fgrep -wvf ~{all_multi_revised_list_2}) \ - <(cat ~{del_multi_revise_vcf} ~{dup_multi_revise_vcf} | grep -v '^#' | awk '!seen[$3]++') \ - |${BCFTOOLS} reheader -h ~{new_header} \ - |vcf-sort \ - |bgzip \ - >multitagged.geno.vcf.gz - >>> - - output { - File multitagged_geno_vcf="multitagged.geno.vcf.gz" - } -} - -task CleanVcf5_23 { - input { - File multitagged_vcf - File multitagged_vcf_tbi - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(multitagged_vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##remove overlapping multi### - zcat ~{multitagged_vcf} \ - |awk -F'\t' '{if ($1~"#" || ($7~"MULTIALLELIC" && ($5=="" || $5==""))) print}' \ - |svtk vcf2bed stdin tmp.bed - cut -f1-5 tmp.bed \ - |gzip \ - >multi.bed.gz - >>> - - output { - File multi_bed="multi.bed.gz" - } -} - -task CleanVcf5_24 { - input { - File multi_bed - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(multi_bed, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##strip out overlapping multiallelics## - bedtools intersect -wa -wb -a ~{multi_bed} -b ~{multi_bed} \ - |awk -F'\t' '{if ($4!=$9 && $3-$2>=$8-$7) print $0; \ - else if ($4!=$9) print $6,$7,$8,$9,$10,$1,$2,$3,$4,$5}' OFS="\t" \ - |sort -u \ - |awk '{print $3-$2,$8-$7,$0}' OFS="\t" \ - |sort -nrk1,1 -k2,2nr \ - |cut -f3- \ - >multi.bed.overlap.txt - >>> - - output { - File multi_bed_overlap="multi.bed.overlap.txt" - } -} - -task CleanVcf5_25 { - input { - File multi_bed_overlap - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(multi_bed_overlap, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - echo "">multi.remove.txt - - while read bed - do - echo "$bed"|cut -d$'\t' -f1-5 >large.bed - echo "$bed"|cut -d$'\t' -f6-10>small.bed - overlap=$(bedtools coverage -a small.bed -b large.bed|awk '{if ($NF>0.50) print "YES";else print "NO"}') - echo $bed|awk '{print $4}' - if [ "$overlap" == "YES" ] && [ $(awk '{print $4}' large.bed|fgrep -wf - multi.remove.txt|wc -l) -eq 0 ] - then - awk '{print $4}' small.bed >>multi.remove.txt - fi - done< ~{multi_bed_overlap} - >>> - - output { - File multi_remove="multi.remove.txt" - } -} - -task CleanVcf5_26 { - input { - File multitagged_geno_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(multitagged_geno_vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # use BCFTOOLS 1.9 - BCFTOOLS=/usr/local/bin/bcftools - - ##get alt tag for multiallelics## - ## produces a file with a row for each distinct multialllic variant ID and copy number combination - ${BCFTOOLS} query -i 'FILTER = "MULTIALLELIC"' -f '[%ID\t%CN\n]' ~{multitagged_geno_vcf} \ - |sort -u >multi.cn.txt - >>> - - output { - File multi_cn="multi.cn.txt" - } -} - -task CleanVcf5_27 { - input { - File multitagged_geno_vcf - File multi_remove - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([multitagged_geno_vcf, multi_remove], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # use BCFTOOLS 1.9 - BCFTOOLS=/usr/local/bin/bcftools - - ##strip out variants with no genotypes and overlapping multiallelics## - ### Find missing genotype and then add multiallelics that need to be removed### - ##change multiallelics svtype into mCNV## - ##add CN information to ALT column## - zcat ~{multitagged_geno_vcf} \ - |${BCFTOOLS} view -e 'FILTER == "MULTIALLELIC"' \ - |svtk vcf2bed stdin tmp.bed - - awk -F'\t' '{if ($6=="") print $4}' tmp.bed \ - |cat - ~{multi_remove} \ - |sed '/^$/d' \ - |{ fgrep -wvf - <(zcat ~{multitagged_geno_vcf} ) || true; } \ - |awk -F';' '{if ($1~"MULTIALLELIC" && ( $2~"DEL" || $2~"DUP")) $2="SVTYPE=CNV"; print}' OFS=';' \ - |awk '{OFS="\t"; if ($8~"SVTYPE=CNV;") $5=""; print}' \ - |bgzip \ - >cleantagandmulti.vcf.gz - >>> - - output { - File cleantagandmulti_vcf="cleantagandmulti.vcf.gz" - } -} - -task CleanVcf5_28 { - input { - File cleantagandmulti_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(cleantagandmulti_vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - zcat ~{cleantagandmulti_vcf}|awk '{if (($1~"X" || $1~"Y") && $1!~"#") print}'|wc -l > out - >>> - - output { - Int out=read_int("out") - } -} - -task CleanVcf5_29_TRUE_1 { - input { - File famfile - File cleantagandmulti_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([famfile, cleantagandmulti_vcf], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##Determine columns male columns## - zcat ~{cleantagandmulti_vcf}\ - |egrep ^# \ - |tail -n 1 \ - |tr '\t' '\n' \ - |cat -n - \ - >col.txt - - awk '{if ($5==1) print $2}' ~{famfile} \ - |{ fgrep -wf - col.txt || true; } \ - >malecols.txt - >>> - - output { - File malecols="malecols.txt" - } -} - -task CleanVcf5_29_TRUE_2 { - input { - File sex_chr_revise - File cleantagandmulti_vcf - File malecols - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([sex_chr_revise, cleantagandmulti_vcf, malecols], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##regenotype male calls on sex chr and add 1 to copy state for multialleic check## - - python3 < sexchr.backtoorig.txt.gz - import pysam - import sys - - with open("~{malecols}") as f: - samples = [x.strip().split('\t')[1] for x in f.readlines() if x] - - with open("~{sex_chr_revise}") as f: - vids = set([x.strip() for x in f.readlines() if x]) - - vcf = pysam.VariantFile("~{cleantagandmulti_vcf}") - - for record in vcf: - if record.id not in vids: - continue - for i in samples: - g = record.samples[i] - if g['RD_CN'] is not None and g['RD_CN'] >= 1: - g['RD_CN'] = g['RD_CN'] - 1 - sys.stdout.write(str(record)) - - vcf.close() - CODE + call FindRedundantMultiallelics { + input: + multiallelic_vcfs=MakeCleanGQ.multiallelic_vcf, + find_redundant_sites_script=find_redundant_sites_script, + prefix="~{prefix}.find_redundant_multiallelics", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_override_find_redundant_multiallelics + } - >>> + call Polish { + input: + 
clean_gq_vcfs=MakeCleanGQ.clean_gq_vcf, + no_sample_lists=MakeCleanGQ.no_sample_list, + redundant_multiallelics_list=FindRedundantMultiallelics.redundant_multiallelics_list, + prefix="~{prefix}.polish", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_override_polish + } - output { - File sexchr_backtoorig="sexchr.backtoorig.txt.gz" - } + output { + File polished=Polish.polished + } } -task CleanVcf5_29_TRUE_3 { - input { - File cleantagandmulti_vcf - File sexchr_backtoorig - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - Float input_size = size([cleantagandmulti_vcf, sexchr_backtoorig], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 50.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } +task MakeCleanGQ { + input { + File revise_vcf_lines + File normal_revise_vcf + File ped_file + File sex_chr_revise + File multi_ids + File? outlier_samples_list + File? make_clean_gq_script + String prefix + Int? threads = 2 + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } - command <<< - set -euxo pipefail - cat <(zcat ~{cleantagandmulti_vcf}|fgrep -wvf <(zcat ~{sexchr_backtoorig}|awk '{print $3}' )) \ - <(zcat ~{sexchr_backtoorig} |awk '{if ($1!="") print}' |tr ' ' '\t') \ - |vcf-sort \ - |bgzip \ - >cleansexCN.vcf.gz - >>> + # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed + # generally assume working memory is ~3 * inputs + Float input_size = size( + select_all([revise_vcf_lines, normal_revise_vcf, ped_file, sex_chr_revise, multi_ids, outlier_samples_list]), + "GB") + Float base_disk_gb = 10.0 + + RuntimeAttr runtime_default = object { + mem_gb: 16, + disk_gb: ceil(base_disk_gb + input_size * 5.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } - output { - File cleansexcn_vcf="cleansexCN.vcf.gz" - } + command <<< + set -eu -o pipefail + + ~{if defined(outlier_samples_list) then "ln ~{outlier_samples_list} outliers.txt" else "touch outliers.txt"} + + # put the revise lines into a normal VCF format + bcftools view -h ~{normal_revise_vcf} > header.txt + cat header.txt <(zcat ~{revise_vcf_lines} | grep . 
| tr " " "\t") | bgzip -c > revise.vcf.lines.vcf.gz + + python3 ~{default="/opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py" make_clean_gq_script} \ + --threads_per_file ~{threads} \ + revise.vcf.lines.vcf.gz \ + ~{normal_revise_vcf} \ + ~{ped_file} \ + ~{sex_chr_revise} \ + ~{multi_ids} \ + outliers.txt \ + ~{prefix} + + bcftools view -G -O z ~{prefix}.multiallelic.vcf.gz > ~{prefix}.multiallelic.sites.vcf.gz + tabix ~{prefix}.cleanGQ.vcf.gz + >>> + + output { + File clean_gq_vcf=prefix + ".cleanGQ.vcf.gz" + File clean_gq_vcf_idx=prefix + ".cleanGQ.vcf.gz.tbi" + File multiallelic_vcf=prefix + ".multiallelic.sites.vcf.gz" + File no_sample_list = prefix + ".no_called_samples.list" + } } -task CleanVcf5_30 { - input { - File cleansexcn_vcf - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } +task FindRedundantMultiallelics { + input { + Array[File] multiallelic_vcfs + File? find_redundant_sites_script + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } - Float input_size = size(cleansexcn_vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } + # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed + # generally assume working memory is ~3 * inputs + Float input_size = size(multiallelic_vcfs, "GB") + Float base_disk_gb = 10.0 + + RuntimeAttr runtime_default = object { + mem_gb: 16, + disk_gb: ceil(base_disk_gb + input_size * 5.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, 
runtime_default.boot_disk_gb]) + } - command <<< - set -euxo pipefail + command <<< + set -euo pipefail + VCFS="~{write_lines(multiallelic_vcfs)}" + cat $VCFS | awk -F '/' '{print $NF"\t"$0}' | sort -k1,1V | awk '{print $2}' > vcfs_sorted.list + bcftools concat --no-version --output-type z --file-list vcfs_sorted.list --output multiallelic.vcf.gz - mv ~{cleansexcn_vcf} cleanGQ.vcf.gz - ##find blank variants with no samples## - svtk vcf2bed cleanGQ.vcf.gz tmp.bed + python3 ~{default="/opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_find_redundant_multiallelics.py" find_redundant_sites_script} \ + multiallelic.vcf.gz \ + ~{prefix}.list - awk -F'\t' '{if ($5!~"CN" && $6=="") print $4}' tmp.bed \ - >blankcheck.ids.txt - >>> + >>> - output { - File blankcheck_ids="blankcheck.ids.txt" - } + output { + File redundant_multiallelics_list="~{prefix}.list" + } } -task CleanVcf5_31 { - input { - File cleansexcn_vcf - File blankcheck_ids - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - Float input_size = size([cleansexcn_vcf, blankcheck_ids], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } +task Polish { + input { + Array[File] clean_gq_vcfs + 
Array[File] no_sample_lists + File redundant_multiallelics_list + String prefix + String sv_pipeline_docker + Int threads = 2 + RuntimeAttr? runtime_attr_override + } - command <<< - set -euxo pipefail - mv ~{cleansexcn_vcf} cleanGQ.vcf.gz - ##Fix header## - ##get header to clean## - ##add new filters## - zcat cleanGQ.vcf.gz \ - |awk '{if ($1~"##" && NR>1) print}' \ - |{ fgrep -v "MULTIALLELIC" || true; } \ - |awk '{if (NR==2) print $0 "\n" "##FILTER=" ;else print}' \ - |awk '{if (NR==2) print $0 "\n" "##ALT=" ;else print}' \ - |sort -k1,1 \ - |{ egrep -v "CIPOS|CIEND|RMSSTD|EVENT|INFO=polished.vcf.gz - >>> + # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed + # generally assume working memory is ~3 * inputs + Float input_size = size(clean_gq_vcfs, "GB") + Float base_disk_gb = 10.0 + + RuntimeAttr runtime_default = object { + mem_gb: 16, + disk_gb: ceil(base_disk_gb + input_size * 5.0), + cpu_cores: 4, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } - output { - File polished="polished.vcf.gz" - } + command <<< + set -euo pipefail + + VCFS="~{write_lines(clean_gq_vcfs)}" + cat $VCFS | awk -F '/' '{print $NF"\t"$0}' | sort -k1,1V | awk '{print $2}' > vcfs_sorted.list + cat ~{redundant_multiallelics_list} ~{sep=" " no_sample_lists} > 
ids_to_remove.list + /usr/local/bin/bcftools concat --no-version --output-type u --file-list vcfs_sorted.list | \ + /usr/local/bin/bcftools view --no-version \ + --exclude 'ID=@ids_to_remove.list' \ + --output-type z -o polished.need_reheader.vcf.gz --threads ~{threads} + + # do the last bit of header cleanup + bcftools view -h polished.need_reheader.vcf.gz | awk 'NR == 1' > new_header.vcf + bcftools view -h polished.need_reheader.vcf.gz \ + | awk 'NR > 1' \ + | egrep -v "CIPOS|CIEND|RMSSTD|EVENT|INFO=> new_header.vcf + bcftools reheader polished.need_reheader.vcf.gz -h new_header.vcf -o ~{prefix}.vcf.gz + >>> + + output { + File polished="~{prefix}.vcf.gz" + } } diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 47e02f193..2a1230da4 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -2,10 +2,9 @@ version 1.0 import "Structs.wdl" import "TasksMakeCohortVcf.wdl" as MiniTasks -import "CleanVcf1.wdl" as c1 import "CleanVcf1b.wdl" as c1b import "CleanVcf5.wdl" as c5 -import "DropRedundantCNVs.wdl" as drc +import "HailMerge.wdl" as HailMerge workflow CleanVcfChromosome { input { @@ -19,99 +18,163 @@ workflow CleanVcfChromosome { File bothsides_pass_list Int min_records_per_shard_step1 Int samples_per_step2_shard + Int clean_vcf1b_records_per_shard + Int clean_vcf5_records_per_shard + Int? clean_vcf5_threads_per_task File? outlier_samples_list + Int? max_samples_per_shard_step3 + + String chr_x + String chr_y + + Boolean use_hail + String? gcs_project String linux_docker String sv_base_mini_docker String sv_pipeline_docker + String sv_pipeline_hail_docker + String sv_pipeline_updates_docker # overrides for local tasks RuntimeAttr? runtime_override_clean_vcf_1a - RuntimeAttr? runtime_override_clean_vcf_1b RuntimeAttr? runtime_override_clean_vcf_2 RuntimeAttr? runtime_override_clean_vcf_3 RuntimeAttr? runtime_override_clean_vcf_4 - RuntimeAttr? runtime_override_clean_vcf_5 + RuntimeAttr? 
runtime_override_clean_vcf_5_scatter + RuntimeAttr? runtime_override_clean_vcf_5_make_cleangq + RuntimeAttr? runtime_override_clean_vcf_5_find_redundant_multiallelics + RuntimeAttr? runtime_override_clean_vcf_5_polish RuntimeAttr? runtime_override_stitch_fragmented_cnvs RuntimeAttr? runtime_override_final_cleanup + # Clean vcf 1b + RuntimeAttr? runtime_attr_override_subset_large_cnvs_1b + RuntimeAttr? runtime_attr_override_sort_bed_1b + RuntimeAttr? runtime_attr_override_intersect_bed_1b + RuntimeAttr? runtime_attr_override_build_dict_1b + RuntimeAttr? runtime_attr_override_scatter_1b + RuntimeAttr? runtime_attr_override_filter_vcf_1b + RuntimeAttr? runtime_override_concat_vcfs_1b + RuntimeAttr? runtime_override_cat_multi_cnvs_1b + + RuntimeAttr? runtime_override_preconcat_step1 + RuntimeAttr? runtime_override_hail_merge_step1 + RuntimeAttr? runtime_override_fix_header_step1 + + RuntimeAttr? runtime_override_preconcat_drc + RuntimeAttr? runtime_override_hail_merge_drc + RuntimeAttr? runtime_override_fix_header_drc + # overrides for MiniTasks RuntimeAttr? runtime_override_split_vcf_to_clean - RuntimeAttr? runtime_override_combine_step_1_vcfs RuntimeAttr? runtime_override_combine_step_1_sex_chr_revisions RuntimeAttr? runtime_override_split_include_list RuntimeAttr? runtime_override_combine_clean_vcf_2 RuntimeAttr? runtime_override_combine_revised_4 RuntimeAttr? runtime_override_combine_multi_ids_4 + RuntimeAttr? runtime_override_drop_redundant_cnvs + RuntimeAttr? runtime_override_combine_step_1_vcfs + RuntimeAttr? 
runtime_override_sort_drop_redundant_cnvs + } call MiniTasks.SplitVcf as SplitVcfToClean { input: vcf=vcf, contig=contig, - prefix="~{prefix}.~{contig}.shard_", + prefix="~{prefix}.shard_", n_shards=max_shards_per_chrom_step1, min_vars_per_shard=min_records_per_shard_step1, sv_base_mini_docker=sv_base_mini_docker, runtime_attr_override=runtime_override_split_vcf_to_clean } - scatter ( vcf_shard in SplitVcfToClean.vcf_shards ) { - call c1.CleanVcf1 as CleanVcf1a { + scatter ( i in range(length(SplitVcfToClean.vcf_shards)) ) { + call CleanVcf1a { input: - vcf=vcf_shard, - background_list=background_list, - ped_file=ped_file, - sv_pipeline_docker=sv_pipeline_docker, - linux_docker=linux_docker, + vcf=SplitVcfToClean.vcf_shards[i], + prefix="~{prefix}.clean_vcf_1.shard_~{i}", + background_fail_list=background_list, bothsides_pass_list=bothsides_pass_list, + ped_file=ped_file, allosome_fai=allosome_fai, + chr_x=chr_x, + chr_y=chr_y, + sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_clean_vcf_1a } } - call MiniTasks.ConcatVcfs as CombineStep1Vcfs { - input: - vcfs=CleanVcf1a.intermediate_vcf, - vcfs_idx=CleanVcf1a.intermediate_vcf_idx, - naive=true, - generate_index=false, - outfile_prefix=prefix + ".cleanVCF_step1.intermediate_vcf.merged", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_combine_step_1_vcfs + if (use_hail) { + call HailMerge.HailMerge as CombineStep1VcfsHail { + input: + vcfs=CleanVcf1a.intermediate_vcf, + prefix="~{prefix}.combine_step_1_vcfs", + gcs_project=gcs_project, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, + runtime_override_preconcat=runtime_override_preconcat_step1, + runtime_override_hail_merge=runtime_override_hail_merge_step1, + runtime_override_fix_header=runtime_override_fix_header_step1 + } + } + if (!use_hail) { + call MiniTasks.ConcatVcfs as CombineStep1Vcfs { + input: + 
vcfs=CleanVcf1a.intermediate_vcf, + vcfs_idx=CleanVcf1a.intermediate_vcf_idx, + naive=true, + generate_index=false, + outfile_prefix="~{prefix}.combine_step_1_vcfs", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_combine_step_1_vcfs + } } call MiniTasks.CatUncompressedFiles as CombineStep1SexChrRevisions { input: shards=CleanVcf1a.sex, - outfile_name=prefix + ".cleanVCF_step1.sexchr_revise.merged.txt", + outfile_name="~{prefix}.combine_step_1_sex_chr_revisions.txt", sv_base_mini_docker=sv_base_mini_docker, runtime_attr_override=runtime_override_combine_step_1_sex_chr_revisions } call c1b.CleanVcf1b { input: - intermediate_vcf=CombineStep1Vcfs.concat_vcf, + intermediate_vcf=select_first([CombineStep1Vcfs.concat_vcf, CombineStep1VcfsHail.merged_vcf]), + prefix="~{prefix}.clean_vcf_1b", + records_per_shard=clean_vcf1b_records_per_shard, sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_clean_vcf_1b + sv_pipeline_updates_docker=sv_pipeline_updates_docker, + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override_subset_large_cnvs=runtime_attr_override_subset_large_cnvs_1b, + runtime_attr_override_sort_bed=runtime_attr_override_sort_bed_1b, + runtime_attr_override_intersect_bed=runtime_attr_override_intersect_bed_1b, + runtime_attr_override_build_dict=runtime_attr_override_build_dict_1b, + runtime_attr_override_scatter=runtime_attr_override_scatter_1b, + runtime_attr_override_filter_vcf=runtime_attr_override_filter_vcf_1b, + runtime_override_concat_vcfs=runtime_override_concat_vcfs_1b, + runtime_override_cat_multi_cnvs=runtime_override_cat_multi_cnvs_1b } call MiniTasks.SplitUncompressed as SplitIncludeList { input: whole_file=CleanVcf1a.include_list[0], lines_per_shard=samples_per_step2_shard, - shard_prefix="includeexclude.", + shard_prefix="~{prefix}.split_include_list.", sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_split_include_list } - scatter ( 
included_interval in SplitIncludeList.shards ){ - call CleanVcf2{ + scatter ( i in range(length(SplitIncludeList.shards)) ){ + call CleanVcf2 { input: normal_revise_vcf=CleanVcf1b.normal, - include_list=included_interval, + prefix="~{prefix}.clean_vcf_2.shard_~{i}", + include_list=SplitIncludeList.shards[i], multi_cnvs=CleanVcf1b.multi, - vcftools_idx=CleanVcf1b.vcftools_idx, sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_clean_vcf_2 } @@ -120,6 +183,7 @@ workflow CleanVcfChromosome { call MiniTasks.CatUncompressedFiles as CombineCleanVcf2 { input: shards=CleanVcf2.out, + outfile_name="~{prefix}.combine_clean_vcf_2.txt", sv_base_mini_docker=sv_base_mini_docker, runtime_attr_override=runtime_override_combine_clean_vcf_2 } @@ -127,15 +191,17 @@ workflow CleanVcfChromosome { call CleanVcf3 { input: rd_cn_revise=CombineCleanVcf2.outfile, + max_samples_shard = max_samples_per_shard_step3, sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_clean_vcf_3 } - scatter ( rd_cn_revise in CleanVcf3.shards ){ + scatter ( i in range(length(CleanVcf3.shards)) ){ call CleanVcf4 { input: - rd_cn_revise=rd_cn_revise, + rd_cn_revise=CleanVcf3.shards[i], normal_revise_vcf=CleanVcf1b.normal, + prefix="~{prefix}.clean_vcf_4.shard_~{i}", sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_clean_vcf_4 } @@ -144,7 +210,7 @@ workflow CleanVcfChromosome { call MiniTasks.CatUncompressedFiles as CombineRevised4 { input: shards=CleanVcf4.out, - outfile_name="revise.vcf.lines.txt.gz", + outfile_name="~{prefix}.combine_revised_4.txt.gz", sv_base_mini_docker=sv_base_mini_docker, runtime_attr_override=runtime_override_combine_revised_4 } @@ -152,7 +218,7 @@ workflow CleanVcfChromosome { call MiniTasks.CatUncompressedFiles as CombineMultiIds4 { input: shards=CleanVcf4.multi_ids, - outfile_name="multi.geno.ids.txt.gz", + outfile_name="~{prefix}.combine_multi_ids_4.txt.gz", sv_base_mini_docker=sv_base_mini_docker, 
runtime_attr_override=runtime_override_combine_multi_ids_4 } @@ -165,22 +231,56 @@ workflow CleanVcfChromosome { sex_chr_revise=CombineStep1SexChrRevisions.outfile, multi_ids=CombineMultiIds4.outfile, outlier_samples_list=outlier_samples_list, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_clean_vcf_5 + contig=contig, + prefix="~{prefix}.clean_vcf_5", + records_per_shard=clean_vcf5_records_per_shard, + threads_per_task=clean_vcf5_threads_per_task, + sv_pipeline_docker=sv_pipeline_updates_docker, + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override_scatter=runtime_override_clean_vcf_5_scatter, + runtime_attr_override_make_cleangq=runtime_override_clean_vcf_5_make_cleangq, + runtime_attr_override_find_redundant_multiallelics=runtime_override_clean_vcf_5_find_redundant_multiallelics, + runtime_attr_override_polish=runtime_override_clean_vcf_5_polish } - call drc.DropRedundantCNVs { + call DropRedundantCnvs { input: vcf=CleanVcf5.polished, + prefix="~{prefix}.drop_redundant_cnvs", contig=contig, - sv_pipeline_docker=sv_pipeline_docker + sv_pipeline_docker=sv_pipeline_updates_docker, + runtime_attr_override=runtime_override_drop_redundant_cnvs + } + + if (use_hail) { + call HailMerge.HailMerge as SortDropRedundantCnvsHail { + input: + vcfs=[DropRedundantCnvs.out], + prefix="~{prefix}.drop_redundant_cnvs.sorted", + gcs_project=gcs_project, + reset_cnv_gts=true, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, + runtime_override_preconcat=runtime_override_preconcat_drc, + runtime_override_hail_merge=runtime_override_hail_merge_drc, + runtime_override_fix_header=runtime_override_fix_header_drc + } + } + if (!use_hail) { + call MiniTasks.SortVcf as SortDropRedundantCnvs { + input: + vcf=DropRedundantCnvs.out, + outfile_prefix="~{prefix}.drop_redundant_cnvs.sorted", + sv_base_mini_docker=sv_base_mini_docker, + 
runtime_attr_override=runtime_override_sort_drop_redundant_cnvs + } } call StitchFragmentedCnvs { input: - vcf=DropRedundantCNVs.cleaned_vcf_shard, - contig=contig, - prefix=prefix, + vcf=select_first([SortDropRedundantCnvs.out, SortDropRedundantCnvsHail.merged_vcf]), + prefix="~{prefix}.stitch_fragmented_cnvs", sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_stitch_fragmented_cnvs } @@ -189,7 +289,7 @@ workflow CleanVcfChromosome { input: vcf=StitchFragmentedCnvs.stitched_vcf_shard, contig=contig, - prefix=prefix, + prefix="~{prefix}.final_cleanup", sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_final_cleanup @@ -202,25 +302,89 @@ workflow CleanVcfChromosome { } +task CleanVcf1a { + input { + File vcf + String prefix + File background_fail_list + File bothsides_pass_list + File ped_file + File allosome_fai + String chr_x + String chr_y + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size([vcf, background_fail_list, bothsides_pass_list], "GB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + + touch ~{prefix}.includelist.txt + touch 
~{prefix}.sexchr.revise.txt + + # outputs + # includelist.txt: the names of all the samples in the input vcf + # sexchr.revise.txt: the names of the events where genotypes got tweaked on allosomes + # stdout: a revised vcf + java -jar $CLEAN_VCF_PART_1_JAR \ + ~{vcf} \ + ~{ped_file} \ + ~{chr_x} \ + ~{chr_y} \ + ~{background_fail_list} \ + ~{bothsides_pass_list} \ + ~{prefix}.includelist.txt \ + ~{prefix}.sexchr.revise.txt \ + | bgzip \ + > ~{prefix}.vcf.gz + tabix ~{prefix}.vcf.gz + >>> + + output { + File include_list="~{prefix}.includelist.txt" + File sex="~{prefix}.sexchr.revise.txt" + File intermediate_vcf="~{prefix}.vcf.gz" + File intermediate_vcf_idx="~{prefix}.vcf.gz.tbi" + } +} + task CleanVcf2 { input { File normal_revise_vcf + String prefix File include_list File multi_cnvs - File vcftools_idx String sv_pipeline_docker RuntimeAttr? runtime_attr_override } # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed # generally assume working memory is ~3 * inputs - Float input_size = size([normal_revise_vcf, include_list, multi_cnvs, vcftools_idx], "GB") + Float input_size = size([normal_revise_vcf, include_list, multi_cnvs], "GB") Float base_disk_gb = 10.0 - Float base_mem_gb = 4.0 - Float input_mem_scale = 3.0 - Float input_disk_scale = 5.0 + Float input_disk_scale = 3.0 RuntimeAttr runtime_default = object { - mem_gb: base_mem_gb + input_size * input_mem_scale, + mem_gb: 2.0, disk_gb: ceil(base_disk_gb + input_size * input_disk_scale), cpu_cores: 1, preemptible_tries: 3, @@ -241,36 +405,34 @@ task CleanVcf2 { command <<< set -eu -o pipefail + bcftools index ~{normal_revise_vcf} /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part2.sh \ ~{normal_revise_vcf} \ ~{include_list} \ ~{multi_cnvs} \ - "output.txt" + "~{prefix}.txt" >>> output { - File out="output.txt" + File out="~{prefix}.txt" } } -task CleanVcf3{ +task CleanVcf3 { input { File rd_cn_revise + Int? 
max_samples_shard String sv_pipeline_docker RuntimeAttr? runtime_attr_override } - + Int max_samples_shard_ = select_first([max_samples_shard, 7000]) # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed # generally assume working memory is ~3 * inputs Float input_size = size(rd_cn_revise, "GB") - Float base_disk_gb = 10.0 - Float base_mem_gb = 2.0 - Float input_mem_scale = 3.0 - Float input_disk_scale = 5.0 RuntimeAttr runtime_default = object { - mem_gb: base_mem_gb + input_size * input_mem_scale, - disk_gb: ceil(base_disk_gb + input_size * input_disk_scale), + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 2.0), cpu_cores: 1, preemptible_tries: 3, max_retries: 1, @@ -289,13 +451,9 @@ task CleanVcf3{ command <<< set -euo pipefail - - /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part3.sh ~{rd_cn_revise} - + python /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part3.py ~{rd_cn_revise} -s ~{max_samples_shard_} # Ensure there is at least one shard - if [ -z "$(ls -A shards/)" ]; then - touch shards/out.0_0.txt - fi + touch shards/out.0_0.txt >>> output { @@ -308,13 +466,14 @@ task CleanVcf4 { input { File rd_cn_revise File normal_revise_vcf + String prefix String sv_pipeline_docker RuntimeAttr? 
runtime_attr_override } Float input_size = size([rd_cn_revise, normal_revise_vcf], "GB") RuntimeAttr runtime_default = object { - mem_gb: 2.0 + input_size * 3.0, + mem_gb: 2.0, disk_gb: 50, cpu_cores: 1, preemptible_tries: 3, @@ -353,7 +512,7 @@ task CleanVcf4 { vid_sample_cn_map[vid].append(tuple(tokens[1:])) # Traverse VCF and replace genotypes - with open("revise.vcf.lines.txt", "w") as f: + with open("~{prefix}.revise_vcf_lines.txt", "w") as f: vcf = pysam.VariantFile(VCF_FILE) num_vcf_records = 0 for record in vcf: @@ -406,53 +565,46 @@ task CleanVcf4 { gt = s['SR_GT'] if gt > 2: num_gt_over_2 += 1 - if record.id == "gnomad-sv-v3-TEST-SMALL.chr22_BND_chr22_173": - print("{} {}".format(sid, num_gt_over_2)) - print("{} {} {} {}".format(s['PE_GT'], s['PE_GQ'], s['SR_GT'], s['SR_GQ'])) if num_gt_over_2 > max_vf: multi_geno_ids.add(record.id) vcf.close() multi_geno_ids = sorted(list(multi_geno_ids)) - with open("multi.geno.ids.txt", "w") as f: + with open("~{prefix}.multi_geno_ids.txt", "w") as f: for vid in multi_geno_ids: f.write(vid + "\n") CODE - bgzip revise.vcf.lines.txt - gzip multi.geno.ids.txt + bgzip ~{prefix}.revise_vcf_lines.txt + gzip ~{prefix}.multi_geno_ids.txt >>> output { - File out="revise.vcf.lines.txt.gz" - File multi_ids="multi.geno.ids.txt.gz" + File out="~{prefix}.revise_vcf_lines.txt.gz" + File multi_ids="~{prefix}.multi_geno_ids.txt.gz" } } -# Stitch fragmented RD-only calls found in 100% of the same samples -task StitchFragmentedCnvs { +# Remove CNVs that are redundant with CPX events or other CNVs +task DropRedundantCnvs { input { File vcf - String contig String prefix + String contig String sv_pipeline_docker RuntimeAttr? 
runtime_attr_override } - - String stitched_vcf_name = contig + ".shard.fragmented_CNVs_stitched.vcf.gz" - # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed - # generally assume working memory is ~3 * inputs - Float input_size = size(vcf, "GB") - Float base_disk_gb = 10.0 - Float base_mem_gb = 2.0 - Float input_mem_scale = 3.0 - Float input_disk_scale = 5.0 + Float input_size = size(vcf, "GiB") + # disk is cheap, read/write speed is proportional to disk size, and disk IO is a significant time factor: + # in tests on large VCFs, memory usage is ~1.0 * input VCF size + # the biggest disk usage is at the end of the task, with input + output VCF on disk + Int cpu_cores = 2 # speed up compression / decompression of VCFs RuntimeAttr runtime_default = object { - mem_gb: base_mem_gb + input_size * input_mem_scale, - disk_gb: ceil(base_disk_gb + input_size * input_disk_scale), - cpu_cores: 1, + mem_gb: 3.75 + input_size * 1.5, + disk_gb: ceil(100.0 + input_size * 2.0), + cpu_cores: cpu_cores, preemptible_tries: 3, max_retries: 1, boot_disk_gb: 10 @@ -469,19 +621,64 @@ task StitchFragmentedCnvs { } command <<< - set -eu -o pipefail - - /opt/sv-pipeline/04_variant_resolution/scripts/stitch_fragmented_CNVs.sh \ - ~{vcf} \ - "tmp_~{stitched_vcf_name}" - - /opt/sv-pipeline/04_variant_resolution/scripts/stitch_fragmented_CNVs.sh \ - "tmp_~{stitched_vcf_name}" \ - "~{stitched_vcf_name}" + set -euo pipefail + /opt/sv-pipeline/04_variant_resolution/scripts/resolve_cpx_cnv_redundancies.py \ + ~{vcf} ~{prefix}.vcf.gz --temp-dir ./tmp >>> output { - File stitched_vcf_shard = stitched_vcf_name + File out = "~{prefix}.vcf.gz" + } +} + + +# Stitch fragmented RD-only calls found in 100% of the same samples +task StitchFragmentedCnvs { + input { + File vcf + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + + Float input_size = size(vcf, "GB") + RuntimeAttr runtime_default = object { + mem_gb: 7.5, + disk_gb: ceil(10.0 + input_size * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + Float mem_gb = select_first([runtime_override.mem_gb, runtime_default.mem_gb]) + Int java_mem_mb = ceil(mem_gb * 1000 * 0.8) + + runtime { + memory: "~{mem_gb} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + echo "First pass..." + java -Xmx~{java_mem_mb}M -jar ${STITCH_JAR} 0.2 200000 0.2 ~{vcf} \ + | bgzip \ + > tmp.vcf.gz + rm ~{vcf} + echo "Second pass..." + java -Xmx~{java_mem_mb}M -jar ${STITCH_JAR} 0.2 200000 0.2 tmp.vcf.gz \ + | bgzip \ + > ~{prefix}.vcf.gz + >>> + + output { + File stitched_vcf_shard = "~{prefix}.vcf.gz" } } @@ -495,8 +692,6 @@ task FinalCleanup { String sv_pipeline_docker RuntimeAttr? runtime_attr_override } - - String cleaned_shard_name = prefix + "." 
+ contig + ".final_cleanup.vcf.gz" # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed # generally assume working memory is ~3 * inputs @@ -532,15 +727,14 @@ task FinalCleanup { --prefix ~{prefix} \ ~{vcf} stdout \ | fgrep -v "##INFO= "~{cleaned_shard_name}" - tabix ~{cleaned_shard_name} + > ~{prefix}.vcf.gz + tabix ~{prefix}.vcf.gz >>> output { - File final_cleaned_shard = cleaned_shard_name - File final_cleaned_shard_idx = cleaned_shard_name + ".tbi" + File final_cleaned_shard = "~{prefix}.vcf.gz" + File final_cleaned_shard_idx = "~{prefix}.vcf.gz.tbi" } } \ No newline at end of file diff --git a/wdl/ClusterSingleChromosome.wdl b/wdl/ClusterSingleChromosome.wdl index a1e18bc8b..4d8adcbe0 100644 --- a/wdl/ClusterSingleChromosome.wdl +++ b/wdl/ClusterSingleChromosome.wdl @@ -4,13 +4,17 @@ version 1.0 import "TasksMakeCohortVcf.wdl" as MiniTasks import "ShardedCluster.wdl" as ShardedCluster +import "HailMerge.wdl" as HailMerge # Workflow to perform sharding & clustering of a vcf for a single chromosome workflow ClusterSingleChrom { input { File vcf File vcf_index + Int num_samples String contig + String cohort_name + String evidence_type String prefix Int dist Float frac @@ -18,23 +22,34 @@ workflow ClusterSingleChrom { File? exclude_list Int sv_size Array[String] sv_types + File empty_file + + Boolean use_hail + String? gcs_project String sv_pipeline_docker + String sv_pipeline_hail_docker String sv_base_mini_docker - # overrides for local tasks - RuntimeAttr? runtime_override_concat_svtypes - # overrides for MiniTasks RuntimeAttr? runtime_override_subset_sv_type + RuntimeAttr? runtime_override_cat_vid_lists_chrom # overrides for ShardedCluster - RuntimeAttr? runtime_override_shard_vcf_precluster + RuntimeAttr? runtime_override_shard_clusters + RuntimeAttr? runtime_override_shard_vids RuntimeAttr? runtime_override_pull_vcf_shard RuntimeAttr? runtime_override_svtk_vcf_cluster RuntimeAttr? 
runtime_override_get_vcf_header_with_members_info_line RuntimeAttr? runtime_override_concat_svtypes RuntimeAttr? runtime_override_concat_sharded_cluster + RuntimeAttr? runtime_override_cat_vid_lists_sharded + RuntimeAttr? runtime_override_make_sites_only + RuntimeAttr? runtime_override_sort_merged_vcf + + RuntimeAttr? runtime_override_preconcat_sharded_cluster + RuntimeAttr? runtime_override_hail_merge_sharded_cluster + RuntimeAttr? runtime_override_fix_header_sharded_cluster } #Scatter over svtypes @@ -53,105 +68,45 @@ workflow ClusterSingleChrom { } #For each svtype, intelligently shard VCF for clustering - call ShardedCluster.ShardedCluster as ShardedCluster { + call ShardedCluster.ShardedCluster { input: vcf=SubsetSvType.filtered_vcf, + num_samples=num_samples, dist=dist, frac=frac, prefix="~{prefix}.~{sv_type}", + cohort_name=cohort_name, contig=contig, + evidence_type=evidence_type, sv_type=sv_type, sample_overlap=sample_overlap, exclude_list=exclude_list, sv_size=sv_size, sv_types=sv_types, + empty_file=empty_file, + use_hail=use_hail, + gcs_project=gcs_project, sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, sv_base_mini_docker=sv_base_mini_docker, - runtime_override_shard_vcf_precluster=runtime_override_shard_vcf_precluster, + runtime_override_shard_clusters=runtime_override_shard_clusters, + runtime_override_shard_vids=runtime_override_shard_vids, runtime_override_pull_vcf_shard=runtime_override_pull_vcf_shard, runtime_override_svtk_vcf_cluster=runtime_override_svtk_vcf_cluster, runtime_override_get_vcf_header_with_members_info_line=runtime_override_get_vcf_header_with_members_info_line, - runtime_override_concat_sharded_cluster=runtime_override_concat_sharded_cluster - } - call RenameVariants { - input: - vcf=ShardedCluster.clustered_vcf, - vcf_index=ShardedCluster.clustered_vcf_idx, - prefix=prefix, - contig=contig, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_concat_svtypes 
+ runtime_override_concat_sharded_cluster=runtime_override_concat_sharded_cluster, + runtime_override_cat_vid_lists_sharded=runtime_override_cat_vid_lists_sharded, + runtime_override_make_sites_only=runtime_override_make_sites_only, + runtime_override_sort_merged_vcf=runtime_override_sort_merged_vcf, + runtime_override_preconcat_sharded_cluster=runtime_override_preconcat_sharded_cluster, + runtime_override_hail_merge_sharded_cluster=runtime_override_hail_merge_sharded_cluster, + runtime_override_fix_header_sharded_cluster=runtime_override_fix_header_sharded_cluster } } - #Merge svtypes - call MiniTasks.ConcatVcfs as ConcatSvTypes { - input: - vcfs=RenameVariants.out, - vcfs_idx=RenameVariants.out_index, - allow_overlaps=true, - outfile_prefix="~{prefix}.~{contig}.precluster_concat", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_concat_svtypes - } - #Output clustered vcf output { - File clustered_vcf = ConcatSvTypes.concat_vcf - File clustered_vcf_idx = ConcatSvTypes.concat_vcf_idx - } -} - -task RenameVariants { - input { - File vcf - File vcf_index - String prefix - String contig - - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - String vcf_name = prefix + "." 
+ contig + ".renamed.vcf.gz" - - # when filtering/sorting/etc, memory usage will likely go up (much of the data will have to - # be held in memory or disk while working, potentially in a form that takes up more space) - Float input_size = size(vcf, "GiB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + 2.0 * input_size), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - - /opt/sv-pipeline/04_variant_resolution/scripts/rename_after_vcfcluster.py \ - --chrom ~{contig} \ - --prefix ~{prefix} \ - ~{vcf} - \ - | bgzip -c \ - > ~{vcf_name} - - tabix -p vcf -f ~{vcf_name} - >>> - - output { - File out = vcf_name - File out_index = vcf_name + ".tbi" + Array[File] clustered_vcfs = ShardedCluster.clustered_vcf + Array[File] clustered_vcf_indexes = ShardedCluster.clustered_vcf_idx } } diff --git a/wdl/CombineBatches.wdl b/wdl/CombineBatches.wdl index e42525ae3..66be86168 100644 --- a/wdl/CombineBatches.wdl +++ b/wdl/CombineBatches.wdl @@ -1,7 +1,12 @@ version 1.0 +import "CombineSRBothsidePass.wdl" as CombineSRBothsidePass import "VcfClusterSingleChromsome.wdl" as VcfClusterContig import "TasksMakeCohortVcf.wdl" as MiniTasks +import "HailMerge.wdl" as HailMerge +import "HarmonizeHeaders.wdl" as HarmonizeHeaders +import 
"MergePesrDepth.wdl" as MergePesrDepth +import "Utils.wdl" as Utils workflow CombineBatches { input { @@ -24,19 +29,31 @@ workflow CombineBatches { File empty_file + Boolean use_hail = false + String? gcs_project + String sv_base_mini_docker String sv_pipeline_docker + String sv_pipeline_hail_docker # overrides for local tasks RuntimeAttr? runtime_override_update_sr_list RuntimeAttr? runtime_override_merge_pesr_depth + RuntimeAttr? runtime_override_reheader + RuntimeAttr? runtime_override_pull_header # overrides for mini tasks - RuntimeAttr? runtime_override_clean_bothside_pass + RuntimeAttr? runtime_attr_get_non_ref_vids + RuntimeAttr? runtime_attr_calculate_support_frac RuntimeAttr? runtime_override_clean_background_fail RuntimeAttr? runtime_override_concat - RuntimeAttr? runtime_override_sort_pesr_depth_merged_vcf RuntimeAttr? runtime_override_concat_pesr_depth + RuntimeAttr? runtime_override_update_fix_pesr_header + RuntimeAttr? runtime_override_count_samples + RuntimeAttr? runtime_override_preconcat_pesr_depth + RuntimeAttr? runtime_override_hail_merge_pesr_depth + RuntimeAttr? runtime_override_fix_header_pesr_depth + RuntimeAttr? runtime_override_concat_large_pesr_depth # overrides for VcfClusterContig RuntimeAttr? runtime_override_localize_vcfs @@ -46,24 +63,53 @@ workflow CombineBatches { RuntimeAttr? runtime_override_subset_bothside_pass RuntimeAttr? runtime_override_subset_background_fail RuntimeAttr? runtime_override_subset_sv_type - RuntimeAttr? runtime_override_shard_vcf_precluster + RuntimeAttr? runtime_override_shard_clusters + RuntimeAttr? runtime_override_shard_vids RuntimeAttr? runtime_override_pull_vcf_shard RuntimeAttr? runtime_override_svtk_vcf_cluster RuntimeAttr? runtime_override_get_vcf_header_with_members_info_line RuntimeAttr? runtime_override_concat_vcf_cluster RuntimeAttr? runtime_override_concat_svtypes RuntimeAttr? runtime_override_concat_sharded_cluster + RuntimeAttr? runtime_override_make_sites_only + RuntimeAttr? 
runtime_override_sort_merged_vcf_cluster + RuntimeAttr? runtime_override_preconcat_sharded_cluster + RuntimeAttr? runtime_override_hail_merge_sharded_cluster + RuntimeAttr? runtime_override_fix_header_sharded_cluster + + # overerides for merge pesr depth + RuntimeAttr? runtime_override_shard_clusters_mpd + RuntimeAttr? runtime_override_shard_vids_mpd + RuntimeAttr? runtime_override_pull_vcf_shard_mpd + RuntimeAttr? runtime_override_merge_pesr_depth_mpd + + RuntimeAttr? runtime_override_sort_merged_vcf_mpd + RuntimeAttr? runtime_override_subset_small_mpd + RuntimeAttr? runtime_override_subset_large_mpd + RuntimeAttr? runtime_override_make_sites_only_mpd + RuntimeAttr? runtime_override_concat_large_pesr_depth_mpd + RuntimeAttr? runtime_override_concat_shards_mpd + + RuntimeAttr? runtime_override_preconcat_large_pesr_depth_mpd + RuntimeAttr? runtime_override_hail_merge_large_pesr_depth_mpd + RuntimeAttr? runtime_override_fix_header_large_pesr_depth_mpd + + RuntimeAttr? runtime_override_preconcat_pesr_depth_shards_mpd + RuntimeAttr? runtime_override_hail_merge_pesr_depth_shards_mpd + RuntimeAttr? 
runtime_override_fix_header_pesr_depth_shards_mpd + } # Preprocess some inputs - Int num_pass_lines=length(raw_sr_bothside_pass_files) - call MiniTasks.CatUncompressedFiles as CleanBothsidePass { + call CombineSRBothsidePass.CombineSRBothsidePass { input: - shards=raw_sr_bothside_pass_files, - filter_command="sort | uniq -c | awk -v OFS='\\t' '{print $1/~{num_pass_lines}, $2}'", - outfile_name="cohort_sr_genotyping_bothside_pass_list.txt", + pesr_vcfs=pesr_vcfs, + raw_sr_bothside_pass_files=raw_sr_bothside_pass_files, + prefix="~{cohort_name}.sr_bothside_pass", sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_clean_bothside_pass + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_get_non_ref_vids=runtime_attr_get_non_ref_vids, + runtime_attr_calculate_support_frac=runtime_attr_calculate_support_frac } Float min_background_fail_first_col = min_sr_background_fail_batches * length(raw_sr_background_fail_files) @@ -71,11 +117,18 @@ workflow CombineBatches { input: shards=raw_sr_background_fail_files, filter_command="sort | uniq -c | awk -v OFS='\\t' '{if($1 >= ~{min_background_fail_first_col}) print $2}'", - outfile_name="cohort_sr_genotyping_background_fail_list.txt", + outfile_name="~{cohort_name}.background_fail.txt", sv_base_mini_docker=sv_base_mini_docker, runtime_attr_override=runtime_override_clean_background_fail } + call Utils.CountSamples { + input: + vcf=depth_vcfs[0], + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_count_samples + } + #Scatter per chromosome Array[String] contigs = transpose(read_tsv(contig_list))[0] scatter ( contig in contigs ) { @@ -86,6 +139,7 @@ workflow CombineBatches { call VcfClusterContig.VcfClusterSingleChrom as ClusterPesr { input: vcfs=pesr_vcfs, + num_samples=CountSamples.num_samples, batches=batches, prefix="~{cohort_name}.~{contig}.pesr", dist=300, @@ -95,12 +149,17 @@ workflow CombineBatches { sv_size=50, sv_types=["DEL","DUP","INV","BND","INS"], 
contig=contig, + evidence_type="pesr", + cohort_name=cohort_name, localize_shard_size=localize_shard_size, subset_sr_lists=true, - bothside_pass=CleanBothsidePass.outfile, + bothside_pass=CombineSRBothsidePass.out, background_fail=CleanBackgroundFail.outfile, empty_file=empty_file, + use_hail=use_hail, + gcs_project=gcs_project, sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, sv_base_mini_docker=sv_base_mini_docker, runtime_override_localize_vcfs = runtime_override_localize_vcfs, runtime_override_join_vcfs = runtime_override_join_vcfs, @@ -109,19 +168,26 @@ workflow CombineBatches { runtime_override_subset_bothside_pass=runtime_override_subset_bothside_pass, runtime_override_subset_background_fail=runtime_override_subset_background_fail, runtime_override_subset_sv_type=runtime_override_subset_sv_type, - runtime_override_shard_vcf_precluster=runtime_override_shard_vcf_precluster, + runtime_override_shard_clusters=runtime_override_shard_clusters, + runtime_override_shard_vids=runtime_override_shard_vids, runtime_override_pull_vcf_shard=runtime_override_pull_vcf_shard, runtime_override_svtk_vcf_cluster=runtime_override_svtk_vcf_cluster, runtime_override_get_vcf_header_with_members_info_line=runtime_override_get_vcf_header_with_members_info_line, runtime_override_concat_vcf_cluster=runtime_override_concat_vcf_cluster, runtime_override_concat_svtypes=runtime_override_concat_svtypes, - runtime_override_concat_sharded_cluster=runtime_override_concat_sharded_cluster + runtime_override_concat_sharded_cluster=runtime_override_concat_sharded_cluster, + runtime_override_make_sites_only=runtime_override_make_sites_only, + runtime_override_sort_merged_vcf=runtime_override_sort_merged_vcf_cluster, + runtime_override_preconcat_sharded_cluster=runtime_override_preconcat_sharded_cluster, + runtime_override_hail_merge_sharded_cluster=runtime_override_hail_merge_sharded_cluster, + 
runtime_override_fix_header_sharded_cluster=runtime_override_fix_header_sharded_cluster } #Subset RD VCFs to single chromosome & cluster call VcfClusterContig.VcfClusterSingleChrom as ClusterDepth { input: vcfs=depth_vcfs, + num_samples=CountSamples.num_samples, batches=batches, prefix="~{cohort_name}.~{contig}.depth", dist=500000, @@ -131,12 +197,17 @@ workflow CombineBatches { sv_size=5000, sv_types=["DEL","DUP"], contig=contig, + evidence_type="depth", + cohort_name=cohort_name, localize_shard_size=localize_shard_size, subset_sr_lists=false, - bothside_pass=CleanBothsidePass.outfile, + bothside_pass=CombineSRBothsidePass.out, background_fail=CleanBackgroundFail.outfile, empty_file=empty_file, + use_hail=use_hail, + gcs_project=gcs_project, sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, sv_base_mini_docker=sv_base_mini_docker, runtime_override_localize_vcfs = runtime_override_localize_vcfs, runtime_override_join_vcfs = runtime_override_join_vcfs, @@ -145,18 +216,36 @@ workflow CombineBatches { runtime_override_subset_bothside_pass=runtime_override_subset_bothside_pass, runtime_override_subset_background_fail=runtime_override_subset_background_fail, runtime_override_subset_sv_type=runtime_override_subset_sv_type, - runtime_override_shard_vcf_precluster=runtime_override_shard_vcf_precluster, + runtime_override_shard_clusters=runtime_override_shard_clusters, + runtime_override_shard_vids=runtime_override_shard_vids, runtime_override_svtk_vcf_cluster=runtime_override_svtk_vcf_cluster, runtime_override_get_vcf_header_with_members_info_line=runtime_override_get_vcf_header_with_members_info_line, runtime_override_concat_vcf_cluster=runtime_override_concat_vcf_cluster, runtime_override_concat_svtypes=runtime_override_concat_svtypes, - runtime_override_concat_sharded_cluster=runtime_override_concat_sharded_cluster + runtime_override_concat_sharded_cluster=runtime_override_concat_sharded_cluster, + 
runtime_override_make_sites_only=runtime_override_make_sites_only, + runtime_override_sort_merged_vcf=runtime_override_sort_merged_vcf_cluster, + runtime_override_preconcat_sharded_cluster=runtime_override_preconcat_sharded_cluster, + runtime_override_hail_merge_sharded_cluster=runtime_override_hail_merge_sharded_cluster, + runtime_override_fix_header_sharded_cluster=runtime_override_fix_header_sharded_cluster + } + + call MiniTasks.ConcatVcfs as ConcatPesrSitesOnly { + input: + vcfs=ClusterPesr.clustered_vcfs, + vcfs_idx=ClusterPesr.clustered_vcf_indexes, + naive=true, + generate_index=false, + sites_only=true, + outfile_prefix="~{cohort_name}.clustered_pesr.sites_only", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_concat } #Update SR background fail & bothside pass files (1) call MiniTasks.UpdateSrList as UpdateBackgroundFailFirst { input: - vcf=ClusterPesr.clustered_vcf, + vcf=ConcatPesrSitesOnly.concat_vcf, original_list=ClusterPesr.filtered_background_fail, outfile="~{cohort_name}.~{contig}.sr_background_fail.updated.txt", sv_pipeline_docker=sv_pipeline_docker, @@ -164,44 +253,118 @@ workflow CombineBatches { } call MiniTasks.UpdateSrList as UpdateBothsidePassFirst { input: - vcf=ClusterPesr.clustered_vcf, + vcf=ConcatPesrSitesOnly.concat_vcf, original_list=ClusterPesr.filtered_bothside_pass, outfile="~{cohort_name}.~{contig}.sr_bothside_pass.updated.txt", sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_update_sr_list } - #Merge PESR & RD VCFs - call MiniTasks.ConcatVcfs as ConcatPesrDepth { + call HarmonizeHeaders.HarmonizeHeaders { input: - vcfs=[ClusterPesr.clustered_vcf, ClusterDepth.clustered_vcf], - vcfs_idx=[ClusterPesr.clustered_vcf_idx, ClusterDepth.clustered_vcf_idx], - allow_overlaps=true, - outfile_prefix="~{cohort_name}.~{contig}.concat_pesr_depth", + header_vcf=ClusterDepth.clustered_vcfs[0], + vcfs=ClusterPesr.clustered_vcfs, + 
prefix="~{cohort_name}.~{contig}.harmonize_headers", sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_concat_pesr_depth + runtime_override_reheader=runtime_override_reheader, + runtime_override_pull_header=runtime_override_pull_header } - call MergePesrDepth { + + call MergePesrDepth.MergePesrDepth as MergeDeletions { input: - vcf=ConcatPesrDepth.concat_vcf, - vcf_index=ConcatPesrDepth.concat_vcf_idx, + subtyped_pesr_vcf=HarmonizeHeaders.out[0], + subtyped_depth_vcf=ClusterDepth.clustered_vcfs[0], + svtype="DEL", + num_samples=CountSamples.num_samples, + prefix="~{cohort_name}.~{contig}.merge_del", + cohort_name=cohort_name, contig=contig, - prefix="~{cohort_name}.~{contig}.merge_pesr_depth", + use_hail=use_hail, + gcs_project=gcs_project, + sv_base_mini_docker=sv_base_mini_docker, sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_merge_pesr_depth + sv_pipeline_hail_docker=sv_pipeline_hail_docker, + runtime_override_shard_clusters=runtime_override_shard_clusters_mpd, + runtime_override_shard_vids=runtime_override_shard_vids_mpd, + runtime_override_pull_vcf_shard=runtime_override_pull_vcf_shard_mpd, + runtime_override_merge_pesr_depth=runtime_override_merge_pesr_depth_mpd, + runtime_override_sort_merged_vcf=runtime_override_sort_merged_vcf_mpd, + runtime_override_subset_small=runtime_override_subset_small_mpd, + runtime_override_subset_large=runtime_override_subset_large_mpd, + runtime_override_make_sites_only=runtime_override_make_sites_only_mpd, + runtime_override_concat_large_pesr_depth=runtime_override_concat_large_pesr_depth_mpd, + runtime_override_concat_shards=runtime_override_concat_shards_mpd, + runtime_override_preconcat_large_pesr_depth=runtime_override_preconcat_large_pesr_depth_mpd, + runtime_override_hail_merge_large_pesr_depth=runtime_override_hail_merge_large_pesr_depth_mpd, + runtime_override_fix_header_large_pesr_depth=runtime_override_fix_header_large_pesr_depth_mpd, + 
runtime_override_preconcat_pesr_depth_shards=runtime_override_preconcat_pesr_depth_shards_mpd, + runtime_override_hail_merge_pesr_depth_shards=runtime_override_hail_merge_pesr_depth_shards_mpd, + runtime_override_fix_header_pesr_depth_shards=runtime_override_fix_header_pesr_depth_shards_mpd } - call MiniTasks.SortVcf as SortMergePesrDepth { + + call MergePesrDepth.MergePesrDepth as MergeDuplications { input: - vcf = MergePesrDepth.merged_vcf, - outfile_prefix = "~{cohort_name}.~{contig}.sort_pesr_depth", - sv_base_mini_docker = sv_base_mini_docker, - runtime_attr_override = runtime_override_sort_pesr_depth_merged_vcf + subtyped_pesr_vcf=HarmonizeHeaders.out[1], + subtyped_depth_vcf=ClusterDepth.clustered_vcfs[1], + svtype="DUP", + num_samples=CountSamples.num_samples, + prefix="~{cohort_name}.~{contig}.merge_dup", + cohort_name=cohort_name, + contig=contig, + use_hail=use_hail, + gcs_project=gcs_project, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, + runtime_override_shard_clusters=runtime_override_shard_clusters_mpd, + runtime_override_shard_vids=runtime_override_shard_vids_mpd, + runtime_override_pull_vcf_shard=runtime_override_pull_vcf_shard_mpd, + runtime_override_merge_pesr_depth=runtime_override_merge_pesr_depth_mpd, + runtime_override_sort_merged_vcf=runtime_override_sort_merged_vcf_mpd, + runtime_override_subset_small=runtime_override_subset_small_mpd, + runtime_override_subset_large=runtime_override_subset_large_mpd, + runtime_override_make_sites_only=runtime_override_make_sites_only_mpd, + runtime_override_concat_large_pesr_depth=runtime_override_concat_large_pesr_depth_mpd, + runtime_override_concat_shards=runtime_override_concat_shards_mpd, + runtime_override_preconcat_large_pesr_depth=runtime_override_preconcat_large_pesr_depth_mpd, + runtime_override_hail_merge_large_pesr_depth=runtime_override_hail_merge_large_pesr_depth_mpd, + 
runtime_override_fix_header_large_pesr_depth=runtime_override_fix_header_large_pesr_depth_mpd, + runtime_override_preconcat_pesr_depth_shards=runtime_override_preconcat_pesr_depth_shards_mpd, + runtime_override_hail_merge_pesr_depth_shards=runtime_override_hail_merge_pesr_depth_shards_mpd, + runtime_override_fix_header_pesr_depth_shards=runtime_override_fix_header_pesr_depth_shards_mpd + } + + #Merge PESR & RD VCFs + if (use_hail) { + call HailMerge.HailMerge as ConcatPesrDepthHail { + input: + vcfs=[MergeDeletions.out, MergeDuplications.out, HarmonizeHeaders.out[2], HarmonizeHeaders.out[3], HarmonizeHeaders.out[4]], + prefix="~{cohort_name}.~{contig}.concat_pesr_depth", + gcs_project=gcs_project, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, + runtime_override_preconcat=runtime_override_preconcat_pesr_depth, + runtime_override_hail_merge=runtime_override_hail_merge_pesr_depth, + runtime_override_fix_header=runtime_override_fix_header_pesr_depth + } + } + if (!use_hail) { + call MiniTasks.ConcatVcfs as ConcatPesrDepth { + input: + vcfs=[MergeDeletions.out, MergeDuplications.out, HarmonizeHeaders.out[2], HarmonizeHeaders.out[3], HarmonizeHeaders.out[4]], + vcfs_idx=[MergeDeletions.out+".tbi", MergeDuplications.out+".tbi", HarmonizeHeaders.out[2]+".tbi", HarmonizeHeaders.out[3]+".tbi", HarmonizeHeaders.out[4]+".tbi"], + allow_overlaps=true, + outfile_prefix="~{cohort_name}.~{contig}.concat_pesr_depth", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_concat_large_pesr_depth + } } #Update SR background fail & bothside pass files (2) call MiniTasks.UpdateSrList as UpdateBackgroundFailSecond { input: - vcf=SortMergePesrDepth.out, + vcf=select_first([ConcatPesrDepth.concat_vcf, ConcatPesrDepthHail.merged_vcf]), original_list=UpdateBackgroundFailFirst.updated_list, outfile="~{cohort_name}.~{contig}.sr_background_fail.updated2.txt", 
sv_pipeline_docker=sv_pipeline_docker, @@ -209,20 +372,23 @@ workflow CombineBatches { } call MiniTasks.UpdateSrList as UpdateBothsidePassSecond { input: - vcf=SortMergePesrDepth.out, + vcf=select_first([ConcatPesrDepth.concat_vcf, ConcatPesrDepthHail.merged_vcf]), original_list=UpdateBothsidePassFirst.updated_list, outfile="~{cohort_name}.~{contig}.sr_bothside_pass.updated2.txt", sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_update_sr_list } + + File vcfs_out_ = select_first([ConcatPesrDepth.concat_vcf, ConcatPesrDepthHail.merged_vcf]) + File vcf_indexes_out_ = select_first([ConcatPesrDepth.concat_vcf_idx, ConcatPesrDepthHail.merged_vcf_index]) } #Merge resolved vcfs for QC if (merge_vcfs) { call MiniTasks.ConcatVcfs { input: - vcfs=SortMergePesrDepth.out, - vcfs_idx=SortMergePesrDepth.out_index, + vcfs=vcfs_out_, + vcfs_idx=vcf_indexes_out_, naive=true, outfile_prefix="~{cohort_name}.combine_batches", sv_base_mini_docker=sv_base_mini_docker, @@ -232,60 +398,11 @@ workflow CombineBatches { #Final outputs output { - Array[File] vcfs = SortMergePesrDepth.out - Array[File] vcf_indexes = SortMergePesrDepth.out_index + Array[File] vcfs = vcfs_out_ + Array[File] vcf_indexes = vcf_indexes_out_ Array[File] cluster_bothside_pass_lists = UpdateBothsidePassSecond.updated_list Array[File] cluster_background_fail_lists = UpdateBackgroundFailSecond.updated_list File? merged_vcf = ConcatVcfs.concat_vcf File? merged_vcf_index = ConcatVcfs.concat_vcf_idx } } - - -#Merge PESR + RD VCFs -task MergePesrDepth { - input { - File vcf - File vcf_index - String prefix - String contig - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - String output_file = prefix + ".vcf.gz" - - # when filtering/sorting/etc, memory usage will likely go up (much of the data will have to - # be held in memory or disk while working, potentially in a form that takes up more space) - Float input_size = size(vcf, "GiB") - RuntimeAttr runtime_default = object { - mem_gb: 2.0 + 0.6 * input_size, - disk_gb: ceil(10.0 + 4 * input_size), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - /opt/sv-pipeline/04_variant_resolution/scripts/merge_pesr_depth.py \ - --prefix pesr_depth_merged_~{contig} \ - ~{vcf} \ - ~{output_file} - >>> - - output { - File merged_vcf = output_file - } -} diff --git a/wdl/CombineSRBothsidePass.wdl b/wdl/CombineSRBothsidePass.wdl new file mode 100644 index 000000000..5713e8ac9 --- /dev/null +++ b/wdl/CombineSRBothsidePass.wdl @@ -0,0 +1,120 @@ +version 1.0 + +import "Structs.wdl" + +workflow CombineSRBothsidePass { + input { + Array[File] pesr_vcfs + Array[File] raw_sr_bothside_pass_files + String prefix + + String sv_base_mini_docker + String sv_pipeline_docker + + RuntimeAttr? runtime_attr_get_non_ref_vids + RuntimeAttr? 
runtime_attr_calculate_support_frac + } + + scatter (i in range(length(pesr_vcfs))) { + call GetNonRefVariantLists { + input: + vcf=pesr_vcfs[i], + prefix="~{prefix}.non_ref_vids.shard_~{i}", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_attr_get_non_ref_vids + } + } + + call CalculateBothsideSupportFraction { + input: + non_ref_vid_lists=GetNonRefVariantLists.out, + raw_sr_bothside_pass_files=raw_sr_bothside_pass_files, + prefix="~{prefix}.sr_bothside_support", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_calculate_support_frac + } + + output { + File out = CalculateBothsideSupportFraction.out + } +} + +task GetNonRefVariantLists { + input { + File vcf + String prefix + String sv_base_mini_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size(vcf, "GB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 2.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_base_mini_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + bcftools view -G -i 'SUM(AC)>0||SUM(FORMAT/SR_GT)>0' ~{vcf} | bcftools query -f '%ID\n' \ + > ~{prefix}.list + >>> + output { + File out = "~{prefix}.list" + } +} + + +task CalculateBothsideSupportFraction { + input { + Array[File] non_ref_vid_lists + 
Array[File] raw_sr_bothside_pass_files + String prefix + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size(non_ref_vid_lists, "GB") + size(raw_sr_bothside_pass_files, "GB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 2.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + python /opt/sv-pipeline/04_variant_resolution/scripts/calculate_sr_bothside_support.py \ + ~{write_lines(non_ref_vid_lists)} \ + ~{write_lines(raw_sr_bothside_pass_files)} \ + > ~{prefix}.txt + >>> + output { + File out = "~{prefix}.txt" + } +} \ No newline at end of file diff --git a/wdl/DropRedundantCNVs.wdl b/wdl/DropRedundantCNVs.wdl deleted file mode 100644 index 82ac75d3f..000000000 --- a/wdl/DropRedundantCNVs.wdl +++ /dev/null @@ -1,533 +0,0 @@ -version 1.0 - -import "Structs.wdl" - -workflow DropRedundantCNVs { - input { - File vcf - String contig - String sv_pipeline_docker - } - - call DropRedundantCNVs_1 { - input: - vcf=vcf, - sv_pipeline_docker=sv_pipeline_docker - } - - call DropRedundantCNVs_2 { - input: - intervals_preclustered_bed=DropRedundantCNVs_1.intervals_preclustered_bed, - sv_pipeline_docker=sv_pipeline_docker - } - - call DropRedundantCNVs_3 { - input: - 
intervals_preclustered_bed=DropRedundantCNVs_1.intervals_preclustered_bed, - sv_pipeline_docker=sv_pipeline_docker - } - - call DropRedundantCNVs_4 { - input: - intervals_preclustered_subset_bed=DropRedundantCNVs_2.intervals_preclustered_subset_bed, - step2_intervals_preclustered_subset_txt=DropRedundantCNVs_3.step2_intervals_preclustered_subset_txt, - samples_list=DropRedundantCNVs_1.samples_list, - sv_pipeline_docker=sv_pipeline_docker - } - - call DropRedundantCNVs_5 { - input: - vcf=vcf, - vids_to_remove_list_1=DropRedundantCNVs_4.vids_to_remove_list_1, - intervals_preclustered_bed=DropRedundantCNVs_1.intervals_preclustered_bed, - step2_variants_to_resolve_list=DropRedundantCNVs_4.step2_variants_to_resolve_list, - contig=contig, - sv_pipeline_docker=sv_pipeline_docker - } - - call DropRedundantCNVs_6 { - input: - unsorted_vcf=DropRedundantCNVs_5.unsorted_vcf, - contig=contig, - sv_pipeline_docker=sv_pipeline_docker - } - - output { - File cleaned_vcf_shard = DropRedundantCNVs_6.cleaned_vcf_shard - } -} - -task DropRedundantCNVs_1 { - input { - File vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - bcftools query --list-samples ~{vcf} > samples.list - - ###PREP FILES - #Convert full VCF to BED intervals - #Ignore CPX events with UNRESOLVED filter status - svtk vcf2bed --split-cpx --info SVTYPE \ - <(bcftools view -e 'INFO/SVTYPE == "CPX" && FILTER == "UNRESOLVED"' ~{vcf}) out.bed - grep -e '^#\|DEL\|DUP\|CNV\|CPX' out.bed \ - | awk -v OFS="\t" '{ if ($5=="CN0") print $1, $2, $3, $4, "DEL", $5"\n"$1, $2, $3, $4, "DUP", $5; \ - else if ($5=="DEL" || $5=="DUP") print $1, $2, $3, $4, $6, $5 }' \ - | sort -Vk1,1 -k2,2n -k3,3n -k4,4V \ - | bgzip -c \ - > intervals.preclustered.bed.gz - >>> - - output { - File intervals_preclustered_bed = "intervals.preclustered.bed.gz" - File samples_list = "samples.list" - } -} - -task DropRedundantCNVs_2 { - input { - File intervals_preclustered_bed - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(intervals_preclustered_bed, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - ###REMOVE CNVS REDUNDANT WITH COMPLEX EVENTS - #Subset to only variants that share some overlap (at least 10% recip) with at least one CPX variant - bedtools intersect -wa -r -f 0.1 \ - -a ~{intervals_preclustered_bed} \ - -b <( zcat ~{intervals_preclustered_bed} | fgrep "CPX" ) \ - | sort -Vk1,1 -k2,2n -k3,3n -k4,4V \ - | uniq \ - | bgzip -c \ - > intervals.preclustered.subset.bed.gz - >>> - - output { - File intervals_preclustered_subset_bed = "intervals.preclustered.subset.bed.gz" - } -} - - -task DropRedundantCNVs_3 { - input { - File intervals_preclustered_bed - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(intervals_preclustered_bed, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 20.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - ###FIND REMAINING REDUNDANT CNVS WITH STRONG (80%) OVERLAP IN SAMPLES AND SIZE - #Find CNV intervals that have 80% reciprocal overlap - bedtools intersect -wa -wb -r -f 0.8 \ - -a ~{intervals_preclustered_bed} \ - -b ~{intervals_preclustered_bed} \ - | awk -v FS="\t" '{ if ($4!=$10 && $6==$12) print $0 }' \ - | awk -v OFS="\t" '$4 ~ /DEL|DUP/ { print $0 }' \ - | awk -v OFS="\t" '$10 ~ /DEL|DUP/ { print $0 }' \ - | cut -f4,5,10,11 \ - | sort \ - | uniq \ - | gzip \ - > step2.intervals.preclustered.subset.txt.gz - >>> - - output { - File step2_intervals_preclustered_subset_txt = "step2.intervals.preclustered.subset.txt.gz" - } -} - - -task DropRedundantCNVs_4 { - input { - File step2_intervals_preclustered_subset_txt - File intervals_preclustered_subset_bed - File samples_list - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(step2_intervals_preclustered_subset_txt, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 60, - disk_gb: ceil(10.0 + input_size * 2.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - - python3 < out.vcf.gz - import sys - import gzip - from collections import namedtuple, defaultdict - - import pysam - import numpy as np - from scipy import sparse - from scipy.sparse import csgraph - - BedCall = namedtuple('BedCall', 'chrom start end name samples svtype'.split()) - - def reciprocal_overlap(a, b, frac): - if a.chrom != b.chrom: - return False - if a.start >= b.end or b.start >= a.end: - return False - ov = min(a.end, b.end) - max(a.start, b.start) - return (ov / float(max(a.end - a.start, b.end - b.start))) >= frac - - - def sample_overlap(samples_a, samples_b, denom, frac): - if len(samples_a) == 0 or len(samples_b) == 0: - return True - ov = len(samples_a.intersection(samples_b)) - return (ov / float(denom)) >= frac - - - def read_intervals(path, samples_dict): - intervals = [] - with gzip.open(path, "rb") as f: - for lineb in f: - tokens = lineb.decode('utf-8').strip().split('\t') - sample_indexes = set([samples_dict[s] for s in tokens[4].split(',')]) - 
intervals.append(BedCall(tokens[0], int(tokens[1]), int(tokens[2]), tokens[3], sample_indexes, tokens[5])) - return intervals - - # Save memory using sample id indexing - with open("~{samples_list}") as f: - samples_list = [line.strip() for line in f] - num_samples = len(samples_list) - samples_dict = {samples_list[i]: i for i in range(num_samples)} - - intervals = read_intervals("~{intervals_preclustered_subset_bed}", samples_dict) - num_intervals = len(intervals) - - # 50% RO and sample overlap in subsetted intervals - # Generate sparse graph for clustering - RO_FRAC = 0.5 - G = sparse.eye(len(intervals), dtype=np.uint8, format='lil') - for i in range(num_intervals): - ro_indexes = [j for j in range(i) if reciprocal_overlap(intervals[i], intervals[j], RO_FRAC)] - for j in ro_indexes: - G[i, j] = 1 - - # Compute clusters - n_comp, cluster_labels = csgraph.connected_components(G, connection='weak', directed=False) - clusters = defaultdict(list) - for i in range(len(cluster_labels)): - clusters[cluster_labels[i]].append(i) - - # Find CNVs in clusters containing at least one CPX - SAMPLE_FRAC = 0.5 - vids_to_remove = set([]) - for cluster in clusters.values(): - cnvs = [i for i in cluster if "DEL" in intervals[i].name or "DUP" in intervals[i].name] - cpx = [i for i in cluster if "CPX" in intervals[i].name] - for i in cnvs: - for j in cpx: - if sample_overlap(intervals[i].samples, intervals[j].samples, len(intervals[i].samples), SAMPLE_FRAC): - vids_to_remove.add(intervals[i].name + "\n") - break - - with open("VIDs_to_remove.list", 'w') as f: - f.writelines(sorted(list(vids_to_remove))) - - # Find clusters of CNVs only, using 80% overlap parameters - with gzip.open("~{step2_intervals_preclustered_subset_txt}") as f: - intervals2 = [] - for line in f: - tokens = line.decode('utf-8').strip().split('\t') - samples_a = set([samples_dict[s] for s in tokens[1].split(',')]) - samples_b = set([samples_dict[s] for s in tokens[3].split(',')]) - intervals2.append((tokens[0], 
samples_a, tokens[2], samples_b)) - - num_intervals2 = len(intervals2) - vids_to_resolve_list = [] - SAMPLE_FRAC2 = 0.8 - for interval in intervals2: - samples_a = interval[1] - samples_b = interval[3] - union = samples_a.union(samples_b) - if sample_overlap(samples_a, samples_b, len(union), SAMPLE_FRAC2): - vids_to_resolve_list.append("{}\n".format(",".join(sorted([interval[0], interval[2]])))) - - vids_to_resolve_list = sorted(list(set(vids_to_resolve_list))) - - with open("step2.variants_to_resolve.list", 'w') as f: - f.writelines(vids_to_resolve_list) - - CODE - >>> - - output { - File step2_variants_to_resolve_list = "step2.variants_to_resolve.list" - File vids_to_remove_list_1 = "VIDs_to_remove.list" - } -} - - -task DropRedundantCNVs_5 { - input { - File vcf - File vids_to_remove_list_1 - File intervals_preclustered_bed - File step2_variants_to_resolve_list - String contig - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - Float input_size = size([vcf, intervals_preclustered_bed, intervals_preclustered_bed, step2_variants_to_resolve_list], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 30, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - python3 < 
drop_redundant_cnvs_5.~{contig}.vcf.gz - import sys - import pysam - import gzip - - sys.stderr.write("Reading step2...\n") - with open("~{step2_variants_to_resolve_list}") as f: - vids_sets_to_resolve = [set(line.strip().split(',')) for line in f.readlines()] - vids_list = sorted(list(set([x for y in vids_sets_to_resolve for x in y]))) - - sys.stderr.write("Reading vids to remove...\n") - with open("~{vids_to_remove_list_1}") as f: - vids_to_remove = set([line.strip() for line in f.readlines()]) - - sys.stderr.write("Reading preclustered intervals...\n") - with gzip.open("~{intervals_preclustered_bed}") as f: - intervals = {} - for lineb in f: - tokens = lineb.decode('utf-8').strip().split('\t') - vid = tokens[3] - intervals[vid] = tokens - - sys.stderr.write("Finding partners...\n") - partners = {} - all_partners = set([]) - for vid in vids_list: - # get all other variants from clusters containing this variant - partners[vid] = set([p for vset in vids_sets_to_resolve if vid in vset for p in vset]) - all_partners.update(partners[vid]) - - vids_to_remove.update(all_partners) - #with open("vids_to_remove_2.list", 'w') as f: - # f.writelines(sorted([v+"\n" for v in vids_to_remove])) - - sys.stderr.write("Scanning vcf...\n") - vcf = pysam.VariantFile("~{vcf}") - records = {r.id: r for r in vcf if r.id in all_partners} - vcf.close() - - def count_gts(record): - result = [0, 0, 0] - num_samples = len(record.samples) - for g in [record.samples[i]['GT'] for i in range(num_samples)]: - if g == (0, 0): - result[1] += 1 - elif g == (None, None): - result[2] += 1 - else: - result[0] += 1 - return result - - def get_best_score_vid(scores): - return sorted(scores.items(), key=lambda x: x[1])[-1][0] - - sys.stderr.write("Generating records...\n") - records_to_add = [] - processed_vids = set([]) - for vid in vids_list: - if vid in processed_vids: - continue - vid_partners = partners[vid] - processed_vids.update(vid_partners) - partner_intervals = [intervals[p] for p in 
vid_partners] - most_samples_vid = sorted(partner_intervals, key=lambda x : len(x[4].split(',')))[-1][3] - x = sorted(partner_intervals, key=lambda x : len(x[4].split(','))) - best_genotype_vid = None - best_non_ref = -1 - best_ref = -1 - scores = {p: count_gts(records[p]) for p in vid_partners} - scores_non_ref = {p: scores[p][0] for p in vid_partners if scores[p][0] > 0} - scores_ref = {p: scores[p][1] for p in vid_partners if scores[p][1] > 0} - scores_no_call = {p: scores[p][2] for p in vid_partners if scores[p][2] > 0} - if len(scores_non_ref) > 0: - best_genotype_vid = get_best_score_vid(scores_non_ref) - elif len(scores_ref) > 0: - best_genotype_vid = get_best_score_vid(scores_ref) - else: - best_genotype_vid = get_best_score_vid(scores_no_call) - sys.stderr.write(most_samples_vid + "\n") - s1 = str(records[most_samples_vid]).split('\t')[0:9] - s2 = str(records[best_genotype_vid]).split('\t', 9) - records_to_add.append("\t".join(s1) + "\t" + s2[9]) - - sys.stderr.write("Writing vcf...\n") - vcf = pysam.VariantFile("~{vcf}") - sys.stdout.write(str(vcf.header)) - for record in vcf: - if record.id not in vids_to_remove: - sys.stdout.write(str(record)) - vcf.close() - - for record in records_to_add: - sys.stdout.write(record) - - CODE - - >>> - - output { - File unsorted_vcf = "drop_redundant_cnvs_5.~{contig}.vcf.gz" - } -} - - -task DropRedundantCNVs_6 { - input { - File unsorted_vcf - String contig - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - String outfile_name = contig + ".shard.no_CNV_redundancies.vcf.gz" - - Float input_size = size(unsorted_vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 20.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - ###CLEAN UP FINAL OUTPUT - zcat ~{unsorted_vcf} \ - | vcf-sort \ - | bgzip \ - > ~{outfile_name} - >>> - - output { - File cleaned_vcf_shard = outfile_name - } -} - diff --git a/wdl/GATKSVPipelineBatch.wdl b/wdl/GATKSVPipelineBatch.wdl index 207918c8a..47b8a2a46 100644 --- a/wdl/GATKSVPipelineBatch.wdl +++ b/wdl/GATKSVPipelineBatch.wdl @@ -86,6 +86,8 @@ workflow GATKSVPipelineBatch { String sv_base_mini_docker String sv_base_docker String sv_pipeline_docker + String sv_pipeline_hail_docker + String sv_pipeline_updates_docker String sv_pipeline_rdtest_docker String sv_pipeline_base_docker String sv_pipeline_qc_docker @@ -300,6 +302,8 @@ workflow GATKSVPipelineBatch { sv_pipeline_base_docker = sv_pipeline_base_docker, linux_docker=linux_docker, sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, + sv_pipeline_updates_docker=sv_pipeline_updates_docker, sv_pipeline_rdtest_docker=sv_pipeline_rdtest_docker, 
sv_pipeline_qc_docker=sv_pipeline_qc_docker, sv_base_mini_docker=sv_base_mini_docker diff --git a/wdl/GATKSVPipelineSingleSample.wdl b/wdl/GATKSVPipelineSingleSample.wdl index f914f8075..bd5554005 100644 --- a/wdl/GATKSVPipelineSingleSample.wdl +++ b/wdl/GATKSVPipelineSingleSample.wdl @@ -64,6 +64,8 @@ workflow GATKSVPipelineSingleSample { String sv_base_mini_docker String sv_base_docker String sv_pipeline_docker + String sv_pipeline_hail_docker + String sv_pipeline_updates_docker String sv_pipeline_rdtest_docker String sv_pipeline_base_docker String sv_pipeline_qc_docker @@ -292,7 +294,6 @@ workflow GATKSVPipelineSingleSample { File rmsk File segdups - String? chr_x Int? min_large_pesr_call_size_for_filtering Float? min_large_pesr_depth_overlap_fraction @@ -373,23 +374,178 @@ workflow GATKSVPipelineSingleSample { Int clean_vcf_max_shards_per_chrom_clean_vcf_step1 Int clean_vcf_min_records_per_shard_clean_vcf_step1 Int clean_vcf_samples_per_clean_vcf_step2_shard + Int clean_vcf5_records_per_shard + Int clean_vcf1b_records_per_shard + + String? chr_x + String? chr_y Int? clean_vcf_random_seed # Run MakeCohortVcf metrics - default is off for single sample pipeline Boolean? run_makecohortvcf_metrics = false + # overrides for local tasks + RuntimeAttr? runtime_overide_get_discfile_size RuntimeAttr? runtime_override_update_sr_list_cluster - RuntimeAttr? runtime_override_update_sr_list_pass - RuntimeAttr? runtime_override_update_sr_list_fail RuntimeAttr? runtime_override_merge_pesr_depth - RuntimeAttr? runtime_override_breakpoint_overlap_filter RuntimeAttr? runtime_override_integrate_resolved_vcfs RuntimeAttr? runtime_override_rename_variants + RuntimeAttr? runtime_override_rename_cleaned_samples - RuntimeAttr? runtime_override_clean_bothside_pass + RuntimeAttr? runtime_override_breakpoint_overlap_filter + + # overrides for mini tasks + RuntimeAttr? runtime_override_ids_from_vcf RuntimeAttr? runtime_override_clean_background_fail RuntimeAttr? 
runtime_override_make_cpx_cnv_input_file + RuntimeAttr? runtime_override_subset_inversions + RuntimeAttr? runtime_override_concat_merged_vcfs + RuntimeAttr? runtime_override_concat_cpx_vcfs + RuntimeAttr? runtime_override_concat_cleaned_vcfs + + # overrides for VcfClusterContig + RuntimeAttr? runtime_override_join_vcfs + RuntimeAttr? runtime_override_subset_bothside_pass + RuntimeAttr? runtime_override_subset_background_fail + RuntimeAttr? runtime_override_subset_sv_type + RuntimeAttr? runtime_override_shard_clusters + RuntimeAttr? runtime_override_shard_vids + RuntimeAttr? runtime_override_pull_vcf_shard + RuntimeAttr? runtime_override_svtk_vcf_cluster + RuntimeAttr? runtime_override_get_vcf_header_with_members_info_line + RuntimeAttr? runtime_override_cluster_merge + RuntimeAttr? runtime_override_concat_vcf_cluster + RuntimeAttr? runtime_override_concat_svtypes + RuntimeAttr? runtime_override_concat_sharded_cluster + RuntimeAttr? runtime_override_make_sites_only + RuntimeAttr? runtime_override_preconcat_sharded_cluster + RuntimeAttr? runtime_override_hail_merge_sharded_cluster + RuntimeAttr? runtime_override_fix_header_sharded_cluster + RuntimeAttr? runtime_override_concat_large_pesr_depth + + # overrides for ResolveComplexVariants + RuntimeAttr? runtime_override_update_sr_list_pass + RuntimeAttr? runtime_override_update_sr_list_fail + RuntimeAttr? runtime_override_integrate_resolved_vcfs + RuntimeAttr? runtime_override_rename_variants + RuntimeAttr? runtime_override_breakpoint_overlap_filter + RuntimeAttr? runtime_override_subset_inversions + RuntimeAttr? runtime_override_concat_resolve + + RuntimeAttr? runtime_override_get_se_cutoff + RuntimeAttr? runtime_override_shard_vcf_cpx + RuntimeAttr? runtime_override_shard_vids_resolve + RuntimeAttr? runtime_override_resolve_prep + RuntimeAttr? runtime_override_resolve_cpx_per_shard + RuntimeAttr? runtime_override_restore_unresolved_cnv_per_shard + RuntimeAttr? runtime_override_concat_resolved_per_shard + RuntimeAttr? 
runtime_override_pull_vcf_shard + RuntimeAttr? runtime_override_preconcat_resolve + RuntimeAttr? runtime_override_hail_merge_resolve + RuntimeAttr? runtime_override_fix_header_resolve + + RuntimeAttr? runtime_override_get_se_cutoff_inv + RuntimeAttr? runtime_override_shard_vcf_cpx_inv + RuntimeAttr? runtime_override_shard_vids_resolve_inv + RuntimeAttr? runtime_override_resolve_prep_inv + RuntimeAttr? runtime_override_resolve_cpx_per_shard_inv + RuntimeAttr? runtime_override_restore_unresolved_cnv_per_shard_inv + RuntimeAttr? runtime_override_concat_resolved_per_shard_inv + RuntimeAttr? runtime_override_pull_vcf_shard_inv + RuntimeAttr? runtime_override_preconcat_resolve_inv + RuntimeAttr? runtime_override_hail_merge_resolve_inv + RuntimeAttr? runtime_override_fix_header_resolve_inv + + # overrides for GenotypeComplexContig + RuntimeAttr? runtime_override_ids_from_median + RuntimeAttr? runtime_override_split_vcf_to_genotype + RuntimeAttr? runtime_override_concat_cpx_cnv_vcfs + RuntimeAttr? runtime_override_get_cpx_cnv_intervals + RuntimeAttr? runtime_override_parse_genotypes + RuntimeAttr? runtime_override_merge_melted_gts + RuntimeAttr? runtime_override_split_bed_by_size + RuntimeAttr? runtime_override_rd_genotype + RuntimeAttr? runtime_override_concat_melted_genotypes + RuntimeAttr? runtime_attr_ids_from_vcf_regeno + RuntimeAttr? runtime_attr_subset_ped_regeno + RuntimeAttr? runtime_override_preconcat_regeno + RuntimeAttr? runtime_override_hail_merge_regeno + RuntimeAttr? runtime_override_fix_header_regeno + + # overrides for CleanVcfContig + RuntimeAttr? runtime_attr_ids_from_vcf_clean + RuntimeAttr? runtime_attr_subset_ped_clean + RuntimeAttr? runtime_override_preconcat_clean_final + RuntimeAttr? runtime_override_hail_merge_clean_final + RuntimeAttr? runtime_override_fix_header_clean_final + RuntimeAttr? runtime_override_concat_cleaned_vcfs + + RuntimeAttr? runtime_override_clean_vcf_1a + RuntimeAttr? runtime_override_clean_vcf_2 + RuntimeAttr? 
runtime_override_clean_vcf_3 + RuntimeAttr? runtime_override_clean_vcf_4 + RuntimeAttr? runtime_override_clean_vcf_5_scatter + RuntimeAttr? runtime_override_clean_vcf_5_make_cleangq + RuntimeAttr? runtime_override_clean_vcf_5_find_redundant_multiallelics + RuntimeAttr? runtime_override_clean_vcf_5_polish + RuntimeAttr? runtime_override_stitch_fragmented_cnvs + RuntimeAttr? runtime_override_final_cleanup + + RuntimeAttr? runtime_attr_override_subset_large_cnvs_1b + RuntimeAttr? runtime_attr_override_sort_bed_1b + RuntimeAttr? runtime_attr_override_intersect_bed_1b + RuntimeAttr? runtime_attr_override_build_dict_1b + RuntimeAttr? runtime_attr_override_scatter_1b + RuntimeAttr? runtime_attr_override_filter_vcf_1b + RuntimeAttr? runtime_override_concat_vcfs_1b + RuntimeAttr? runtime_override_cat_multi_cnvs_1b + + RuntimeAttr? runtime_override_preconcat_step1 + RuntimeAttr? runtime_override_hail_merge_step1 + RuntimeAttr? runtime_override_fix_header_step1 + + RuntimeAttr? runtime_override_preconcat_drc + RuntimeAttr? runtime_override_hail_merge_drc + RuntimeAttr? runtime_override_fix_header_drc + + RuntimeAttr? runtime_override_split_vcf_to_clean + RuntimeAttr? runtime_override_combine_step_1_sex_chr_revisions + RuntimeAttr? runtime_override_split_include_list + RuntimeAttr? runtime_override_combine_clean_vcf_2 + RuntimeAttr? runtime_override_combine_revised_4 + RuntimeAttr? runtime_override_combine_multi_ids_4 + RuntimeAttr? runtime_override_drop_redundant_cnvs + RuntimeAttr? runtime_override_combine_step_1_vcfs + RuntimeAttr? runtime_override_sort_drop_redundant_cnvs + + # overrides for VcfQc + RuntimeAttr? runtime_override_plot_qc_vcf_wide + RuntimeAttr? runtime_override_thousand_g_benchmark + RuntimeAttr? runtime_override_thousand_g_plot + RuntimeAttr? runtime_override_asc_benchmark + RuntimeAttr? runtime_override_asc_plot + RuntimeAttr? runtime_override_hgsv_benchmark + RuntimeAttr? runtime_override_hgsv_plot + RuntimeAttr? 
runtime_override_plot_qc_per_sample + RuntimeAttr? runtime_override_plot_qc_per_family + RuntimeAttr? runtime_override_sanders_per_sample_plot + RuntimeAttr? runtime_override_collins_per_sample_plot + RuntimeAttr? runtime_override_werling_per_sample_plot + RuntimeAttr? runtime_override_sanitize_outputs + RuntimeAttr? runtime_override_merge_vcfwide_stat_shards + RuntimeAttr? runtime_override_merge_vcf_2_bed + RuntimeAttr? runtime_override_collect_sharded_vcf_stats + RuntimeAttr? runtime_override_svtk_vcf_2_bed + RuntimeAttr? runtime_override_split_vcf_to_qc + RuntimeAttr? runtime_override_merge_subvcf_stat_shards + RuntimeAttr? runtime_override_merge_svtk_vcf_2_bed + RuntimeAttr? runtime_override_collect_vids_per_sample + RuntimeAttr? runtime_override_split_samples_list + RuntimeAttr? runtime_override_tar_shard_vid_lists + RuntimeAttr? runtime_override_benchmark_samples + RuntimeAttr? runtime_override_split_shuffled_list + RuntimeAttr? runtime_override_merge_and_tar_shard_benchmarks ############################################################ ## AnnotateVcf @@ -928,6 +1084,11 @@ workflow GATKSVPipelineSingleSample { max_shards_per_chrom_clean_vcf_step1=clean_vcf_max_shards_per_chrom_clean_vcf_step1, min_records_per_shard_clean_vcf_step1=clean_vcf_min_records_per_shard_clean_vcf_step1, samples_per_clean_vcf_step2_shard=clean_vcf_samples_per_clean_vcf_step2_shard, + clean_vcf5_records_per_shard=clean_vcf5_records_per_shard, + clean_vcf1b_records_per_shard=clean_vcf1b_records_per_shard, + + chr_x=select_first([chr_x, "chrX"]), + chr_y=select_first([chr_y, "chrY"]), random_seed=clean_vcf_random_seed, @@ -935,21 +1096,152 @@ workflow GATKSVPipelineSingleSample { linux_docker=linux_docker, sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, + sv_pipeline_updates_docker=sv_pipeline_updates_docker, sv_pipeline_rdtest_docker=sv_pipeline_rdtest_docker, sv_pipeline_qc_docker=sv_pipeline_qc_docker, sv_base_mini_docker=sv_base_mini_docker, + 
runtime_overide_get_discfile_size=runtime_overide_get_discfile_size, runtime_override_update_sr_list_cluster=runtime_override_update_sr_list_cluster, - runtime_override_update_sr_list_pass=runtime_override_update_sr_list_pass, - runtime_override_update_sr_list_fail=runtime_override_update_sr_list_fail, runtime_override_merge_pesr_depth=runtime_override_merge_pesr_depth, - runtime_override_breakpoint_overlap_filter=runtime_override_breakpoint_overlap_filter, runtime_override_integrate_resolved_vcfs=runtime_override_integrate_resolved_vcfs, runtime_override_rename_variants=runtime_override_rename_variants, - - runtime_override_clean_bothside_pass=runtime_override_clean_bothside_pass, + runtime_override_rename_cleaned_samples=runtime_override_rename_cleaned_samples, + runtime_override_breakpoint_overlap_filter=runtime_override_breakpoint_overlap_filter, + runtime_override_ids_from_vcf=runtime_override_ids_from_vcf, runtime_override_clean_background_fail=runtime_override_clean_background_fail, - runtime_override_make_cpx_cnv_input_file=runtime_override_make_cpx_cnv_input_file + runtime_override_make_cpx_cnv_input_file=runtime_override_make_cpx_cnv_input_file, + runtime_override_subset_inversions=runtime_override_subset_inversions, + runtime_override_concat_merged_vcfs=runtime_override_concat_merged_vcfs, + runtime_override_concat_cpx_vcfs=runtime_override_concat_cpx_vcfs, + runtime_override_concat_cleaned_vcfs=runtime_override_concat_cleaned_vcfs, + runtime_override_join_vcfs=runtime_override_join_vcfs, + runtime_override_subset_bothside_pass=runtime_override_subset_bothside_pass, + runtime_override_subset_background_fail=runtime_override_subset_background_fail, + runtime_override_subset_sv_type=runtime_override_subset_sv_type, + runtime_override_shard_clusters=runtime_override_shard_clusters, + runtime_override_shard_vids=runtime_override_shard_vids, + runtime_override_pull_vcf_shard=runtime_override_pull_vcf_shard, + 
runtime_override_svtk_vcf_cluster=runtime_override_svtk_vcf_cluster, + runtime_override_get_vcf_header_with_members_info_line=runtime_override_get_vcf_header_with_members_info_line, + runtime_override_cluster_merge=runtime_override_cluster_merge, + runtime_override_concat_vcf_cluster=runtime_override_concat_vcf_cluster, + runtime_override_concat_svtypes=runtime_override_concat_svtypes, + runtime_override_concat_sharded_cluster=runtime_override_concat_sharded_cluster, + runtime_override_make_sites_only=runtime_override_make_sites_only, + runtime_override_preconcat_sharded_cluster=runtime_override_preconcat_sharded_cluster, + runtime_override_hail_merge_sharded_cluster=runtime_override_hail_merge_sharded_cluster, + runtime_override_fix_header_sharded_cluster=runtime_override_fix_header_sharded_cluster, + runtime_override_concat_large_pesr_depth=runtime_override_concat_large_pesr_depth, + runtime_override_update_sr_list_pass=runtime_override_update_sr_list_pass, + runtime_override_update_sr_list_fail=runtime_override_update_sr_list_fail, + runtime_override_integrate_resolved_vcfs=runtime_override_integrate_resolved_vcfs, + runtime_override_rename_variants=runtime_override_rename_variants, + runtime_override_breakpoint_overlap_filter=runtime_override_breakpoint_overlap_filter, + runtime_override_subset_inversions=runtime_override_subset_inversions, + runtime_override_concat_resolve=runtime_override_concat_resolve, + runtime_override_get_se_cutoff=runtime_override_get_se_cutoff, + runtime_override_shard_vcf_cpx=runtime_override_shard_vcf_cpx, + runtime_override_shard_vids_resolve=runtime_override_shard_vids_resolve, + runtime_override_resolve_prep=runtime_override_resolve_prep, + runtime_override_resolve_cpx_per_shard=runtime_override_resolve_cpx_per_shard, + runtime_override_restore_unresolved_cnv_per_shard=runtime_override_restore_unresolved_cnv_per_shard, + runtime_override_concat_resolved_per_shard=runtime_override_concat_resolved_per_shard, + 
runtime_override_pull_vcf_shard=runtime_override_pull_vcf_shard, + runtime_override_preconcat_resolve=runtime_override_preconcat_resolve, + runtime_override_hail_merge_resolve=runtime_override_hail_merge_resolve, + runtime_override_fix_header_resolve=runtime_override_fix_header_resolve, + runtime_override_get_se_cutoff_inv=runtime_override_get_se_cutoff_inv, + runtime_override_shard_vcf_cpx_inv=runtime_override_shard_vcf_cpx_inv, + runtime_override_shard_vids_resolve_inv=runtime_override_shard_vids_resolve_inv, + runtime_override_resolve_prep_inv=runtime_override_resolve_prep_inv, + runtime_override_resolve_cpx_per_shard_inv=runtime_override_resolve_cpx_per_shard_inv, + runtime_override_restore_unresolved_cnv_per_shard_inv=runtime_override_restore_unresolved_cnv_per_shard_inv, + runtime_override_concat_resolved_per_shard_inv=runtime_override_concat_resolved_per_shard_inv, + runtime_override_pull_vcf_shard_inv=runtime_override_pull_vcf_shard_inv, + runtime_override_preconcat_resolve_inv=runtime_override_preconcat_resolve_inv, + runtime_override_hail_merge_resolve_inv=runtime_override_hail_merge_resolve_inv, + runtime_override_fix_header_resolve_inv=runtime_override_fix_header_resolve_inv, + runtime_override_ids_from_median=runtime_override_ids_from_median, + runtime_override_split_vcf_to_genotype=runtime_override_split_vcf_to_genotype, + runtime_override_concat_cpx_cnv_vcfs=runtime_override_concat_cpx_cnv_vcfs, + runtime_override_get_cpx_cnv_intervals=runtime_override_get_cpx_cnv_intervals, + runtime_override_parse_genotypes=runtime_override_parse_genotypes, + runtime_override_merge_melted_gts=runtime_override_merge_melted_gts, + runtime_override_split_bed_by_size=runtime_override_split_bed_by_size, + runtime_override_rd_genotype=runtime_override_rd_genotype, + runtime_override_concat_melted_genotypes=runtime_override_concat_melted_genotypes, + runtime_attr_ids_from_vcf_regeno=runtime_attr_ids_from_vcf_regeno, + 
runtime_attr_subset_ped_regeno=runtime_attr_subset_ped_regeno, + runtime_override_preconcat_regeno=runtime_override_preconcat_regeno, + runtime_override_hail_merge_regeno=runtime_override_hail_merge_regeno, + runtime_override_fix_header_regeno=runtime_override_fix_header_regeno, + runtime_attr_ids_from_vcf_clean=runtime_attr_ids_from_vcf_clean, + runtime_attr_subset_ped_clean=runtime_attr_subset_ped_clean, + runtime_override_preconcat_clean_final=runtime_override_preconcat_clean_final, + runtime_override_hail_merge_clean_final=runtime_override_hail_merge_clean_final, + runtime_override_fix_header_clean_final=runtime_override_fix_header_clean_final, + runtime_override_concat_cleaned_vcfs=runtime_override_concat_cleaned_vcfs, + runtime_override_clean_vcf_1a=runtime_override_clean_vcf_1a, + runtime_override_clean_vcf_2=runtime_override_clean_vcf_2, + runtime_override_clean_vcf_3=runtime_override_clean_vcf_3, + runtime_override_clean_vcf_4=runtime_override_clean_vcf_4, + runtime_override_clean_vcf_5_scatter=runtime_override_clean_vcf_5_scatter, + runtime_override_clean_vcf_5_make_cleangq=runtime_override_clean_vcf_5_make_cleangq, + runtime_override_clean_vcf_5_find_redundant_multiallelics=runtime_override_clean_vcf_5_find_redundant_multiallelics, + runtime_override_clean_vcf_5_polish=runtime_override_clean_vcf_5_polish, + runtime_override_stitch_fragmented_cnvs=runtime_override_stitch_fragmented_cnvs, + runtime_override_final_cleanup=runtime_override_final_cleanup, + runtime_attr_override_subset_large_cnvs_1b=runtime_attr_override_subset_large_cnvs_1b, + runtime_attr_override_sort_bed_1b=runtime_attr_override_sort_bed_1b, + runtime_attr_override_intersect_bed_1b=runtime_attr_override_intersect_bed_1b, + runtime_attr_override_build_dict_1b=runtime_attr_override_build_dict_1b, + runtime_attr_override_scatter_1b=runtime_attr_override_scatter_1b, + runtime_attr_override_filter_vcf_1b=runtime_attr_override_filter_vcf_1b, + 
runtime_override_concat_vcfs_1b=runtime_override_concat_vcfs_1b, + runtime_override_cat_multi_cnvs_1b=runtime_override_cat_multi_cnvs_1b, + runtime_override_preconcat_step1=runtime_override_preconcat_step1, + runtime_override_hail_merge_step1=runtime_override_hail_merge_step1, + runtime_override_fix_header_step1=runtime_override_fix_header_step1, + runtime_override_preconcat_drc=runtime_override_preconcat_drc, + runtime_override_hail_merge_drc=runtime_override_hail_merge_drc, + runtime_override_fix_header_drc=runtime_override_fix_header_drc, + runtime_override_split_vcf_to_clean=runtime_override_split_vcf_to_clean, + runtime_override_combine_step_1_sex_chr_revisions=runtime_override_combine_step_1_sex_chr_revisions, + runtime_override_split_include_list=runtime_override_split_include_list, + runtime_override_combine_clean_vcf_2=runtime_override_combine_clean_vcf_2, + runtime_override_combine_revised_4=runtime_override_combine_revised_4, + runtime_override_combine_multi_ids_4=runtime_override_combine_multi_ids_4, + runtime_override_drop_redundant_cnvs=runtime_override_drop_redundant_cnvs, + runtime_override_combine_step_1_vcfs=runtime_override_combine_step_1_vcfs, + runtime_override_sort_drop_redundant_cnvs=runtime_override_sort_drop_redundant_cnvs, + runtime_override_plot_qc_vcf_wide=runtime_override_plot_qc_vcf_wide, + runtime_override_thousand_g_benchmark=runtime_override_thousand_g_benchmark, + runtime_override_thousand_g_plot=runtime_override_thousand_g_plot, + runtime_override_asc_benchmark=runtime_override_asc_benchmark, + runtime_override_asc_plot=runtime_override_asc_plot, + runtime_override_hgsv_benchmark=runtime_override_hgsv_benchmark, + runtime_override_hgsv_plot=runtime_override_hgsv_plot, + runtime_override_plot_qc_per_sample=runtime_override_plot_qc_per_sample, + runtime_override_plot_qc_per_family=runtime_override_plot_qc_per_family, + runtime_override_sanders_per_sample_plot=runtime_override_sanders_per_sample_plot, + 
runtime_override_collins_per_sample_plot=runtime_override_collins_per_sample_plot, + runtime_override_werling_per_sample_plot=runtime_override_werling_per_sample_plot, + runtime_override_sanitize_outputs=runtime_override_sanitize_outputs, + runtime_override_merge_vcfwide_stat_shards=runtime_override_merge_vcfwide_stat_shards, + runtime_override_merge_vcf_2_bed=runtime_override_merge_vcf_2_bed, + runtime_override_collect_sharded_vcf_stats=runtime_override_collect_sharded_vcf_stats, + runtime_override_svtk_vcf_2_bed=runtime_override_svtk_vcf_2_bed, + runtime_override_split_vcf_to_qc=runtime_override_split_vcf_to_qc, + runtime_override_merge_subvcf_stat_shards=runtime_override_merge_subvcf_stat_shards, + runtime_override_merge_svtk_vcf_2_bed=runtime_override_merge_svtk_vcf_2_bed, + runtime_override_collect_vids_per_sample=runtime_override_collect_vids_per_sample, + runtime_override_split_samples_list=runtime_override_split_samples_list, + runtime_override_tar_shard_vid_lists=runtime_override_tar_shard_vid_lists, + runtime_override_benchmark_samples=runtime_override_benchmark_samples, + runtime_override_split_shuffled_list=runtime_override_split_shuffled_list, + runtime_override_merge_and_tar_shard_benchmarks=runtime_override_merge_and_tar_shard_benchmarks } diff --git a/wdl/GenotypeComplexVariants.wdl b/wdl/GenotypeComplexVariants.wdl index 5f2aa0e41..d018b2704 100644 --- a/wdl/GenotypeComplexVariants.wdl +++ b/wdl/GenotypeComplexVariants.wdl @@ -12,6 +12,7 @@ workflow GenotypeComplexVariants { Array[File] depth_vcfs Boolean merge_vcfs = false + Int? records_per_shard Array[File] complex_resolve_vcfs Array[File] complex_resolve_vcf_indexes @@ -25,9 +26,14 @@ workflow GenotypeComplexVariants { File contig_list File ref_dict + Boolean use_hail = false + String? 
gcs_project + String linux_docker String sv_base_mini_docker + String sv_pipeline_updates_docker String sv_pipeline_docker + String sv_pipeline_hail_docker String sv_pipeline_rdtest_docker # overrides for mini tasks @@ -45,6 +51,9 @@ workflow GenotypeComplexVariants { RuntimeAttr? runtime_override_concat_melted_genotypes RuntimeAttr? runtime_attr_ids_from_vcf RuntimeAttr? runtime_attr_subset_ped + RuntimeAttr? runtime_override_preconcat + RuntimeAttr? runtime_override_hail_merge + RuntimeAttr? runtime_override_fix_header } scatter (i in range(length(batches))) { @@ -74,8 +83,7 @@ workflow GenotypeComplexVariants { input: bin_exclude=bin_exclude, vcf=complex_resolve_vcfs[i], - n_master_vcf_shards=200, - n_master_min_vars_per_vcf_shard=5000, + records_per_shard=select_first([records_per_shard, 50000]), batches=batches, coverage_files=bincov_files, rd_depth_sep_cutoff_files=depth_gt_rd_sep_files, @@ -84,13 +92,17 @@ workflow GenotypeComplexVariants { n_per_split_small=2500, n_per_split_large=250, n_rd_test_bins=100000, - prefix=cohort_name, + prefix="~{cohort_name}.~{contig}", contig=contig, ped_files=SubsetPedFile.ped_subset_file, ref_dict=ref_dict, + use_hail=use_hail, + gcs_project=gcs_project, linux_docker=linux_docker, + sv_pipeline_updates_docker=sv_pipeline_updates_docker, sv_base_mini_docker=sv_base_mini_docker, sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, sv_pipeline_rdtest_docker=sv_pipeline_rdtest_docker, runtime_override_ids_from_median=runtime_override_ids_from_median, runtime_override_split_vcf_to_genotype=runtime_override_split_vcf_to_genotype, @@ -100,7 +112,10 @@ workflow GenotypeComplexVariants { runtime_override_merge_melted_gts=runtime_override_merge_melted_gts, runtime_override_split_bed_by_size=runtime_override_split_bed_by_size, runtime_override_rd_genotype=runtime_override_rd_genotype, - runtime_override_concat_melted_genotypes=runtime_override_concat_melted_genotypes + 
runtime_override_concat_melted_genotypes=runtime_override_concat_melted_genotypes, + runtime_override_preconcat=runtime_override_preconcat, + runtime_override_hail_merge=runtime_override_hail_merge, + runtime_override_fix_header=runtime_override_fix_header } } diff --git a/wdl/HailMerge.wdl b/wdl/HailMerge.wdl new file mode 100644 index 000000000..abfab482e --- /dev/null +++ b/wdl/HailMerge.wdl @@ -0,0 +1,186 @@ +version 1.0 + +import "Structs.wdl" +import "TasksMakeCohortVcf.wdl" as MiniTasks + +workflow HailMerge { + input { + Array[File] vcfs + String prefix + String? gcs_project # REQUIRED + Boolean? reset_cnv_gts + String sv_base_mini_docker + String sv_pipeline_docker + String sv_pipeline_hail_docker + RuntimeAttr? runtime_override_preconcat + RuntimeAttr? runtime_override_hail_merge + RuntimeAttr? runtime_override_fix_header + } + + # Concatenate vcfs naively to prevent ClassTooLargeException in Hail + if (length(vcfs) > 1) { + call MiniTasks.ConcatVcfs as Preconcat { + input: + vcfs=vcfs, + naive=true, + generate_index=false, + outfile_prefix="~{prefix}.preconcat", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_preconcat + } + } + + call HailMerge { + input: + vcfs = [select_first([Preconcat.concat_vcf, vcfs[0]])], + prefix = prefix, + gcs_project = select_first([gcs_project]), + sv_pipeline_hail_docker=sv_pipeline_hail_docker, + runtime_attr_override=runtime_override_hail_merge + } + + call FixHeader { + input: + merged_vcf = HailMerge.merged_vcf, + example_vcf = vcfs[0], + prefix = prefix + ".reheadered", + reset_cnv_gts = select_first([reset_cnv_gts, false]), + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override=runtime_override_fix_header + } + + output { + File merged_vcf = FixHeader.out + File merged_vcf_index = FixHeader.out_index + } +} + +task HailMerge { + input { + Array[File] vcfs + String prefix + String gcs_project + String region = "us-central1" + String sv_pipeline_hail_docker + RuntimeAttr? 
runtime_attr_override + } + + parameter_meta { + vcfs: { + localization_optional: true + } + } + + String cluster_name_prefix="gatk-sv-cluster-" + + RuntimeAttr runtime_default = object { + mem_gb: 6.5, + disk_gb: 100, + cpu_cores: 1, + preemptible_tries: 0, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: select_first([runtime_override.mem_gb, runtime_default.mem_gb]) + " GB" + disks: "local-disk " + select_first([runtime_override.disk_gb, runtime_default.disk_gb]) + " SSD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_hail_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euxo pipefail + + cp ~{write_lines(vcfs)} "files.list" + + python <>> + + output { + File merged_vcf = "~{prefix}.vcf.gz" + File merged_vcf_index = "~{prefix}.vcf.gz.tbi" + } +} + +task FixHeader { + input { + File merged_vcf + File example_vcf + String prefix + Boolean reset_cnv_gts + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10 + size(merged_vcf, "GB") * 2 + size(example_vcf, "GB")), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: select_first([runtime_override.mem_gb, runtime_default.mem_gb]) + " GB" + disks: "local-disk " + select_first([runtime_override.disk_gb, runtime_default.disk_gb]) + " SSD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euxo pipefail + + # Reset to original header + bcftools view --no-version -h ~{merged_vcf} | grep -v ^#CHROM > header + bcftools view --no-version -h ~{example_vcf} | grep -e "^##source" -e "^##ALT" -e "^##CPX_TYPE" >> header + bcftools view --no-version -h ~{merged_vcf} | grep ^#CHROM >> header + bcftools reheader -h header ~{merged_vcf} \ + ~{if reset_cnv_gts then "| gunzip | python /opt/sv-pipeline/04_variant_resolution/scripts/reset_cnv_gts.py stdin stdout | bgzip" else ""} \ + > ~{prefix}.vcf.gz + tabix ~{prefix}.vcf.gz + >>> + + output { + File out = "~{prefix}.vcf.gz" + File out_index = "~{prefix}.vcf.gz.tbi" + } +} diff --git a/wdl/HarmonizeHeaders.wdl b/wdl/HarmonizeHeaders.wdl new file mode 100644 index 000000000..fe3746d96 --- /dev/null +++ b/wdl/HarmonizeHeaders.wdl @@ -0,0 +1,79 @@ +version 1.0 + +import "Structs.wdl" +import "TasksMakeCohortVcf.wdl" as MiniTasks + +# Reheader a list of vcfs with the header from another vcf + +workflow HarmonizeHeaders { + input { + File header_vcf # Vcf containing desired header + Array[File] 
vcfs # Vcfs to replace headers of + String prefix + String sv_base_mini_docker + RuntimeAttr? runtime_override_reheader + RuntimeAttr? runtime_override_pull_header + } + + call PullHeader { + input: + vcf=header_vcf, + prefix=prefix, + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_pull_header + } + + scatter (i in range(length(vcfs))) { + call MiniTasks.ReheaderVcf { + input: + vcf=vcfs[i], + vcf_index=vcfs[i] + ".tbi", + header=PullHeader.out, + prefix="~{prefix}.reheadered", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_reheader + } + } + + output { + Array[File] out = ReheaderVcf.out + Array[File] out_index = ReheaderVcf.out_index + } +} + +task PullHeader { + input { + File vcf + String prefix + String sv_base_mini_docker + RuntimeAttr? runtime_attr_override + } + + RuntimeAttr runtime_default = object { + mem_gb: 2.0, + disk_gb: ceil(10.0 + size(vcf, "GiB") ), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_base_mini_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + bcftools view --header-only ~{vcf} > ~{prefix}.header + >>> + + output { + File out = "~{prefix}.header" + } +} \ No newline at end of file diff --git a/wdl/MakeCohortVcf.wdl b/wdl/MakeCohortVcf.wdl index c25f51186..a6f28c55a 100644 --- 
a/wdl/MakeCohortVcf.wdl +++ b/wdl/MakeCohortVcf.wdl @@ -14,6 +14,7 @@ workflow MakeCohortVcf { File ped_file # cohort ped file # Merge contig vcfs at each stage for QC + # Not recommended for very large cohorts Boolean merge_cluster_vcfs = false Boolean merge_complex_resolve_vcfs = false Boolean merge_complex_genotype_vcfs = false @@ -29,6 +30,12 @@ workflow MakeCohortVcf { Array[File] median_coverage_files Array[File] rf_cutoff_files + # Enables use of Hail for merging and sorting VCFs + # Recommended for cohorts of 10,000 samples or more + # Requires that DataProc be enabled in the GCP project + Boolean use_hail = false + String? gcs_project + File bin_exclude File contig_list File allosome_fai @@ -41,8 +48,15 @@ workflow MakeCohortVcf { Int max_shard_size_resolve Int max_shards_per_chrom_clean_vcf_step1 Int min_records_per_shard_clean_vcf_step1 + Int clean_vcf1b_records_per_shard Int samples_per_clean_vcf_step2_shard + Int clean_vcf5_records_per_shard + Int? clean_vcf5_threads_per_task Float min_sr_background_fail_batches + Int? max_samples_per_shard_clean_vcf_step3 + + String chr_x + String chr_y File empty_file File? outlier_samples_list @@ -68,6 +82,8 @@ workflow MakeCohortVcf { String linux_docker String sv_base_mini_docker String sv_pipeline_docker + String sv_pipeline_hail_docker + String sv_pipeline_updates_docker String sv_pipeline_rdtest_docker String sv_pipeline_qc_docker @@ -82,8 +98,6 @@ workflow MakeCohortVcf { RuntimeAttr? runtime_override_breakpoint_overlap_filter # overrides for mini tasks - RuntimeAttr? runtime_override_ids_from_vcf - RuntimeAttr? runtime_override_clean_bothside_pass RuntimeAttr? runtime_override_clean_background_fail RuntimeAttr? runtime_override_make_cpx_cnv_input_file RuntimeAttr? runtime_override_subset_inversions @@ -96,7 +110,8 @@ workflow MakeCohortVcf { RuntimeAttr? runtime_override_subset_bothside_pass RuntimeAttr? runtime_override_subset_background_fail RuntimeAttr? runtime_override_subset_sv_type - RuntimeAttr? 
runtime_override_shard_vcf_precluster + RuntimeAttr? runtime_override_shard_clusters + RuntimeAttr? runtime_override_shard_vids RuntimeAttr? runtime_override_pull_vcf_shard RuntimeAttr? runtime_override_svtk_vcf_cluster RuntimeAttr? runtime_override_get_vcf_header_with_members_info_line @@ -104,27 +119,44 @@ workflow MakeCohortVcf { RuntimeAttr? runtime_override_concat_vcf_cluster RuntimeAttr? runtime_override_concat_svtypes RuntimeAttr? runtime_override_concat_sharded_cluster + RuntimeAttr? runtime_override_make_sites_only + RuntimeAttr? runtime_override_preconcat_sharded_cluster + RuntimeAttr? runtime_override_hail_merge_sharded_cluster + RuntimeAttr? runtime_override_fix_header_sharded_cluster + RuntimeAttr? runtime_override_concat_large_pesr_depth - # overrides for ResolveComplexContig + # overrides for ResolveComplexVariants RuntimeAttr? runtime_override_update_sr_list_pass RuntimeAttr? runtime_override_update_sr_list_fail + RuntimeAttr? runtime_override_integrate_resolved_vcfs + RuntimeAttr? runtime_override_rename_variants + RuntimeAttr? runtime_override_breakpoint_overlap_filter + RuntimeAttr? runtime_override_subset_inversions + RuntimeAttr? runtime_override_concat_resolve + RuntimeAttr? runtime_override_get_se_cutoff RuntimeAttr? runtime_override_shard_vcf_cpx + RuntimeAttr? runtime_override_shard_vids_resolve RuntimeAttr? runtime_override_resolve_prep RuntimeAttr? runtime_override_resolve_cpx_per_shard RuntimeAttr? runtime_override_restore_unresolved_cnv_per_shard RuntimeAttr? runtime_override_concat_resolved_per_shard - RuntimeAttr? runtime_override_complex_resolve_merge - RuntimeAttr? runtime_override_merge_resolve_inner + RuntimeAttr? runtime_override_pull_vcf_shard + RuntimeAttr? runtime_override_preconcat_resolve + RuntimeAttr? runtime_override_hail_merge_resolve + RuntimeAttr? runtime_override_fix_header_resolve RuntimeAttr? runtime_override_get_se_cutoff_inv RuntimeAttr? runtime_override_shard_vcf_cpx_inv - RuntimeAttr? 
runtime_override_shard_vids_inv + RuntimeAttr? runtime_override_shard_vids_resolve_inv RuntimeAttr? runtime_override_resolve_prep_inv RuntimeAttr? runtime_override_resolve_cpx_per_shard_inv RuntimeAttr? runtime_override_restore_unresolved_cnv_per_shard_inv RuntimeAttr? runtime_override_concat_resolved_per_shard_inv - RuntimeAttr? runtime_override_merge_resolve_inner_inv + RuntimeAttr? runtime_override_pull_vcf_shard_inv + RuntimeAttr? runtime_override_preconcat_resolve_inv + RuntimeAttr? runtime_override_hail_merge_resolve_inv + RuntimeAttr? runtime_override_fix_header_resolve_inv # overrides for GenotypeComplexContig RuntimeAttr? runtime_override_ids_from_median @@ -136,27 +168,55 @@ workflow MakeCohortVcf { RuntimeAttr? runtime_override_split_bed_by_size RuntimeAttr? runtime_override_rd_genotype RuntimeAttr? runtime_override_concat_melted_genotypes - RuntimeAttr? runtime_override_complex_genotype_merge - RuntimeAttr? runtime_attr_ids_from_vcf - RuntimeAttr? runtime_attr_subset_ped + RuntimeAttr? runtime_attr_ids_from_vcf_regeno + RuntimeAttr? runtime_attr_subset_ped_regeno + RuntimeAttr? runtime_override_preconcat_regeno + RuntimeAttr? runtime_override_hail_merge_regeno + RuntimeAttr? runtime_override_fix_header_regeno # overrides for CleanVcfContig + RuntimeAttr? runtime_override_preconcat_clean_final + RuntimeAttr? runtime_override_hail_merge_clean_final + RuntimeAttr? runtime_override_fix_header_clean_final + RuntimeAttr? runtime_override_concat_cleaned_vcfs + RuntimeAttr? runtime_override_clean_vcf_1a - RuntimeAttr? runtime_override_clean_vcf_1b RuntimeAttr? runtime_override_clean_vcf_2 RuntimeAttr? runtime_override_clean_vcf_3 RuntimeAttr? runtime_override_clean_vcf_4 - RuntimeAttr? runtime_override_clean_vcf_5 - RuntimeAttr? runtime_override_drop_redundant_cnvs + RuntimeAttr? runtime_override_clean_vcf_5_scatter + RuntimeAttr? runtime_override_clean_vcf_5_make_cleangq + RuntimeAttr? runtime_override_clean_vcf_5_find_redundant_multiallelics + RuntimeAttr? 
runtime_override_clean_vcf_5_polish RuntimeAttr? runtime_override_stitch_fragmented_cnvs RuntimeAttr? runtime_override_final_cleanup + + RuntimeAttr? runtime_attr_override_subset_large_cnvs_1b + RuntimeAttr? runtime_attr_override_sort_bed_1b + RuntimeAttr? runtime_attr_override_intersect_bed_1b + RuntimeAttr? runtime_attr_override_build_dict_1b + RuntimeAttr? runtime_attr_override_scatter_1b + RuntimeAttr? runtime_attr_override_filter_vcf_1b + RuntimeAttr? runtime_override_concat_vcfs_1b + RuntimeAttr? runtime_override_cat_multi_cnvs_1b + + RuntimeAttr? runtime_override_preconcat_step1 + RuntimeAttr? runtime_override_hail_merge_step1 + RuntimeAttr? runtime_override_fix_header_step1 + + RuntimeAttr? runtime_override_preconcat_drc + RuntimeAttr? runtime_override_hail_merge_drc + RuntimeAttr? runtime_override_fix_header_drc + RuntimeAttr? runtime_override_split_vcf_to_clean - RuntimeAttr? runtime_override_combine_step_1_vcfs RuntimeAttr? runtime_override_combine_step_1_sex_chr_revisions RuntimeAttr? runtime_override_split_include_list RuntimeAttr? runtime_override_combine_clean_vcf_2 RuntimeAttr? runtime_override_combine_revised_4 RuntimeAttr? runtime_override_combine_multi_ids_4 + RuntimeAttr? runtime_override_drop_redundant_cnvs + RuntimeAttr? runtime_override_combine_step_1_vcfs + RuntimeAttr? runtime_override_sort_drop_redundant_cnvs # overrides for VcfQc RuntimeAttr? 
runtime_override_plot_qc_vcf_wide @@ -202,24 +262,32 @@ workflow MakeCohortVcf { depth_exclude_list=depth_exclude_list, min_sr_background_fail_batches=min_sr_background_fail_batches, empty_file=empty_file, + use_hail=use_hail, + gcs_project=gcs_project, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, sv_base_mini_docker=sv_base_mini_docker, sv_pipeline_docker=sv_pipeline_docker, runtime_override_update_sr_list=runtime_override_update_sr_list_cluster, runtime_override_merge_pesr_depth=runtime_override_merge_pesr_depth, - runtime_override_clean_bothside_pass=runtime_override_clean_bothside_pass, runtime_override_clean_background_fail=runtime_override_clean_background_fail, runtime_override_concat=runtime_override_cluster_merge, runtime_override_join_vcfs=runtime_override_join_vcfs, runtime_override_subset_bothside_pass=runtime_override_subset_bothside_pass, runtime_override_subset_background_fail=runtime_override_subset_background_fail, runtime_override_subset_sv_type=runtime_override_subset_sv_type, - runtime_override_shard_vcf_precluster=runtime_override_shard_vcf_precluster, + runtime_override_shard_clusters=runtime_override_shard_clusters, + runtime_override_shard_vids=runtime_override_shard_vids, runtime_override_pull_vcf_shard=runtime_override_pull_vcf_shard, runtime_override_svtk_vcf_cluster=runtime_override_svtk_vcf_cluster, runtime_override_get_vcf_header_with_members_info_line=runtime_override_get_vcf_header_with_members_info_line, runtime_override_concat_vcf_cluster=runtime_override_concat_vcf_cluster, runtime_override_concat_svtypes=runtime_override_concat_svtypes, - runtime_override_concat_sharded_cluster=runtime_override_concat_sharded_cluster + runtime_override_concat_sharded_cluster=runtime_override_concat_sharded_cluster, + runtime_override_make_sites_only=runtime_override_make_sites_only, + runtime_override_preconcat_sharded_cluster=runtime_override_preconcat_sharded_cluster, + 
runtime_override_hail_merge_sharded_cluster=runtime_override_hail_merge_sharded_cluster, + runtime_override_fix_header_sharded_cluster=runtime_override_fix_header_sharded_cluster, + runtime_override_concat_large_pesr_depth=runtime_override_concat_large_pesr_depth } call ComplexResolve.ResolveComplexVariants { @@ -236,32 +304,43 @@ workflow MakeCohortVcf { mei_bed=mei_bed, pe_exclude_list=pe_exclude_list, ref_dict=ref_dict, + use_hail=use_hail, + gcs_project=gcs_project, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, max_shard_size=max_shard_size_resolve, sv_base_mini_docker=sv_base_mini_docker, sv_pipeline_docker=sv_pipeline_docker, runtime_override_update_sr_list_pass=runtime_override_update_sr_list_pass, runtime_override_update_sr_list_fail=runtime_override_update_sr_list_fail, - runtime_override_breakpoint_overlap_filter=runtime_override_breakpoint_overlap_filter, runtime_override_integrate_resolved_vcfs=runtime_override_integrate_resolved_vcfs, runtime_override_rename_variants=runtime_override_rename_variants, + runtime_override_breakpoint_overlap_filter=runtime_override_breakpoint_overlap_filter, runtime_override_subset_inversions=runtime_override_subset_inversions, - runtime_override_concat=runtime_override_complex_resolve_merge, + runtime_override_concat=runtime_override_concat_resolve, + runtime_override_get_se_cutoff=runtime_override_get_se_cutoff, runtime_override_shard_vcf_cpx=runtime_override_shard_vcf_cpx, + runtime_override_shard_vids=runtime_override_shard_vids_resolve, runtime_override_resolve_prep=runtime_override_resolve_prep, runtime_override_resolve_cpx_per_shard=runtime_override_resolve_cpx_per_shard, runtime_override_restore_unresolved_cnv_per_shard=runtime_override_restore_unresolved_cnv_per_shard, runtime_override_concat_resolved_per_shard=runtime_override_concat_resolved_per_shard, - runtime_override_merge_resolve_inner=runtime_override_merge_resolve_inner, - - runtime_override_get_se_cutoff_inv=runtime_override_get_se_cutoff_inv, - 
runtime_override_shard_vcf_cpx_inv=runtime_override_shard_vcf_cpx_inv,
-      runtime_override_shard_vids_inv=runtime_override_shard_vids_inv,
-      runtime_override_resolve_prep_inv=runtime_override_resolve_prep_inv,
-      runtime_override_resolve_cpx_per_shard_inv=runtime_override_resolve_cpx_per_shard_inv,
-      runtime_override_restore_unresolved_cnv_per_shard_inv=runtime_override_restore_unresolved_cnv_per_shard_inv,
-      runtime_override_concat_resolved_per_shard_inv=runtime_override_concat_resolved_per_shard_inv,
-      runtime_override_merge_resolve_inner_inv=runtime_override_merge_resolve_inner_inv
+      runtime_override_pull_vcf_shard=runtime_override_pull_vcf_shard,
+      runtime_override_preconcat=runtime_override_preconcat_resolve,
+      runtime_override_hail_merge=runtime_override_hail_merge_resolve,
+      runtime_override_fix_header=runtime_override_fix_header_resolve,
+
+      runtime_override_get_se_cutoff_inv=runtime_override_get_se_cutoff_inv,
+      runtime_override_shard_vcf_cpx_inv=runtime_override_shard_vcf_cpx_inv,
+      runtime_override_shard_vids_inv=runtime_override_shard_vids_resolve_inv,
+      runtime_override_resolve_prep_inv=runtime_override_resolve_prep_inv,
+      runtime_override_resolve_cpx_per_shard_inv=runtime_override_resolve_cpx_per_shard_inv,
+      runtime_override_restore_unresolved_cnv_per_shard_inv=runtime_override_restore_unresolved_cnv_per_shard_inv,
+      runtime_override_concat_resolved_per_shard_inv=runtime_override_concat_resolved_per_shard_inv,
+      runtime_override_pull_vcf_shard_inv=runtime_override_pull_vcf_shard_inv,
+      runtime_override_preconcat_inv=runtime_override_preconcat_resolve_inv,
+      runtime_override_hail_merge_inv=runtime_override_hail_merge_resolve_inv,
+      runtime_override_fix_header_inv=runtime_override_fix_header_resolve_inv
   }

   call ComplexGenotype.GenotypeComplexVariants {
@@ -281,9 +360,10 @@ workflow MakeCohortVcf {
       ref_dict=ref_dict,
       linux_docker=linux_docker,
       sv_base_mini_docker=sv_base_mini_docker,
+      sv_pipeline_hail_docker=sv_pipeline_hail_docker,
       sv_pipeline_docker=sv_pipeline_docker,
+
sv_pipeline_updates_docker=sv_pipeline_updates_docker, sv_pipeline_rdtest_docker=sv_pipeline_rdtest_docker, - runtime_override_concat=runtime_override_complex_genotype_merge, runtime_override_ids_from_median=runtime_override_ids_from_median, runtime_override_split_vcf_to_genotype=runtime_override_split_vcf_to_genotype, runtime_override_concat_cpx_cnv_vcfs=runtime_override_concat_cpx_cnv_vcfs, @@ -293,8 +373,11 @@ workflow MakeCohortVcf { runtime_override_split_bed_by_size=runtime_override_split_bed_by_size, runtime_override_rd_genotype=runtime_override_rd_genotype, runtime_override_concat_melted_genotypes=runtime_override_concat_melted_genotypes, - runtime_attr_ids_from_vcf=runtime_attr_ids_from_vcf, - runtime_attr_subset_ped=runtime_attr_subset_ped + runtime_attr_ids_from_vcf=runtime_attr_ids_from_vcf_regeno, + runtime_attr_subset_ped=runtime_attr_subset_ped_regeno, + runtime_override_preconcat=runtime_override_preconcat_regeno, + runtime_override_hail_merge=runtime_override_hail_merge_regeno, + runtime_override_fix_header=runtime_override_fix_header_regeno } call Clean.CleanVcf { @@ -306,31 +389,59 @@ workflow MakeCohortVcf { merged_ped_file=ped_file, contig_list=contig_list, allosome_fai=allosome_fai, - max_shards_per_chrom_clean_vcf_step1=max_shards_per_chrom_clean_vcf_step1, - min_records_per_shard_clean_vcf_step1=min_records_per_shard_clean_vcf_step1, - samples_per_clean_vcf_step2_shard=samples_per_clean_vcf_step2_shard, + chr_x=chr_x, + chr_y=chr_y, + max_shards_per_chrom_step1=max_shards_per_chrom_clean_vcf_step1, + min_records_per_shard_step1=min_records_per_shard_clean_vcf_step1, + clean_vcf1b_records_per_shard=clean_vcf1b_records_per_shard, + samples_per_step2_shard=samples_per_clean_vcf_step2_shard, + max_samples_per_shard_step3=max_samples_per_shard_clean_vcf_step3, + clean_vcf5_records_per_shard=clean_vcf5_records_per_shard, outlier_samples_list=outlier_samples_list, + use_hail=use_hail, + gcs_project=gcs_project, linux_docker=linux_docker, 
sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_updates_docker=sv_pipeline_updates_docker, + runtime_override_preconcat_clean_final=runtime_override_preconcat_clean_final, + runtime_override_hail_merge_clean_final=runtime_override_hail_merge_clean_final, + runtime_override_fix_header_clean_final=runtime_override_fix_header_clean_final, runtime_override_concat_cleaned_vcfs=runtime_override_concat_cleaned_vcfs, runtime_override_clean_vcf_1a=runtime_override_clean_vcf_1a, - runtime_override_clean_vcf_1b=runtime_override_clean_vcf_1b, runtime_override_clean_vcf_2=runtime_override_clean_vcf_2, runtime_override_clean_vcf_3=runtime_override_clean_vcf_3, runtime_override_clean_vcf_4=runtime_override_clean_vcf_4, - runtime_override_clean_vcf_5=runtime_override_clean_vcf_5, + runtime_override_clean_vcf_5_scatter=runtime_override_clean_vcf_5_scatter, + runtime_override_clean_vcf_5_make_cleangq=runtime_override_clean_vcf_5_make_cleangq, + runtime_override_clean_vcf_5_find_redundant_multiallelics=runtime_override_clean_vcf_5_find_redundant_multiallelics, + runtime_override_clean_vcf_5_polish=runtime_override_clean_vcf_5_polish, runtime_override_stitch_fragmented_cnvs=runtime_override_stitch_fragmented_cnvs, runtime_override_final_cleanup=runtime_override_final_cleanup, + runtime_attr_override_subset_large_cnvs_1b=runtime_attr_override_subset_large_cnvs_1b, + runtime_attr_override_sort_bed_1b=runtime_attr_override_sort_bed_1b, + runtime_attr_override_intersect_bed_1b=runtime_attr_override_intersect_bed_1b, + runtime_attr_override_build_dict_1b=runtime_attr_override_build_dict_1b, + runtime_attr_override_scatter_1b=runtime_attr_override_scatter_1b, + runtime_attr_override_filter_vcf_1b=runtime_attr_override_filter_vcf_1b, + runtime_override_concat_vcfs_1b=runtime_override_concat_vcfs_1b, + runtime_override_cat_multi_cnvs_1b=runtime_override_cat_multi_cnvs_1b, + 
runtime_override_preconcat_step1=runtime_override_preconcat_step1, + runtime_override_hail_merge_step1=runtime_override_hail_merge_step1, + runtime_override_fix_header_step1=runtime_override_fix_header_step1, + runtime_override_preconcat_drc=runtime_override_preconcat_drc, + runtime_override_hail_merge_drc=runtime_override_hail_merge_drc, + runtime_override_fix_header_drc=runtime_override_fix_header_drc, runtime_override_split_vcf_to_clean=runtime_override_split_vcf_to_clean, - runtime_override_combine_step_1_vcfs=runtime_override_combine_step_1_vcfs, runtime_override_combine_step_1_sex_chr_revisions=runtime_override_combine_step_1_sex_chr_revisions, runtime_override_split_include_list=runtime_override_split_include_list, runtime_override_combine_clean_vcf_2=runtime_override_combine_clean_vcf_2, runtime_override_combine_revised_4=runtime_override_combine_revised_4, runtime_override_combine_multi_ids_4=runtime_override_combine_multi_ids_4, - runtime_attr_ids_from_vcf=runtime_attr_ids_from_vcf, - runtime_attr_subset_ped=runtime_attr_subset_ped + runtime_override_drop_redundant_cnvs=runtime_override_drop_redundant_cnvs, + runtime_override_combine_step_1_vcfs=runtime_override_combine_step_1_vcfs, + runtime_override_sort_drop_redundant_cnvs=runtime_override_sort_drop_redundant_cnvs } Array[String] contigs = transpose(read_tsv(contig_list))[0] diff --git a/wdl/MergePesrDepth.wdl b/wdl/MergePesrDepth.wdl new file mode 100644 index 000000000..c7756bc28 --- /dev/null +++ b/wdl/MergePesrDepth.wdl @@ -0,0 +1,237 @@ +version 1.0 + +import "Structs.wdl" +import "TasksMakeCohortVcf.wdl" as MiniTasks +import "HailMerge.wdl" as HailMerge +import "ShardedCluster.wdl" as ShardedCluster +import "Utils.wdl" as utils + +workflow MergePesrDepth { + input { + File subtyped_pesr_vcf + File subtyped_depth_vcf + Int num_samples + + String prefix + String cohort_name + String svtype + String contig + Float merging_shard_scale_factor = 30000000 + + Boolean use_hail = false + String? 
gcs_project
+
+    String sv_pipeline_docker
+    String sv_pipeline_hail_docker
+    String sv_base_mini_docker
+
+    # overrides for local tasks
+    RuntimeAttr? runtime_override_shard_clusters
+    RuntimeAttr? runtime_override_shard_vids
+    RuntimeAttr? runtime_override_pull_vcf_shard
+    RuntimeAttr? runtime_override_merge_pesr_depth
+
+    # overrides for MiniTasks
+    RuntimeAttr? runtime_override_sort_merged_vcf
+    RuntimeAttr? runtime_override_subset_small
+    RuntimeAttr? runtime_override_subset_large
+    RuntimeAttr? runtime_override_make_sites_only
+    RuntimeAttr? runtime_override_concat_large_pesr_depth
+    RuntimeAttr? runtime_override_concat_shards
+
+    RuntimeAttr? runtime_override_preconcat_large_pesr_depth
+    RuntimeAttr? runtime_override_hail_merge_large_pesr_depth
+    RuntimeAttr? runtime_override_fix_header_large_pesr_depth
+
+    RuntimeAttr? runtime_override_preconcat_pesr_depth_shards
+    RuntimeAttr? runtime_override_hail_merge_pesr_depth_shards
+    RuntimeAttr? runtime_override_fix_header_pesr_depth_shards
+  }
+
+  # Pull out CNVs too small to cluster (less than reciprocal_overlap_fraction * min_depth_only_length)
+  call MiniTasks.FilterVcf as SubsetSmall {
+    input:
+      vcf=subtyped_pesr_vcf,
+      vcf_index=subtyped_pesr_vcf + ".tbi",
+      outfile_prefix="~{prefix}.subset_small",
+      records_filter='INFO/SVLEN<2500',
+      use_ssd=true,
+      sv_base_mini_docker=sv_base_mini_docker,
+      runtime_attr_override=runtime_override_subset_small
+  }
+
+  call MiniTasks.FilterVcf as SubsetLarge {
+    input:
+      vcf=subtyped_pesr_vcf,
+      vcf_index=subtyped_pesr_vcf + ".tbi",
+      outfile_prefix="~{prefix}.subset_large",
+      records_filter='INFO/SVLEN>=2500',
+      use_ssd=true,
+      sv_base_mini_docker=sv_base_mini_docker,
+      runtime_attr_override=runtime_override_subset_large
+  }
+
+  if (use_hail) {
+    call HailMerge.HailMerge as ConcatLargePesrDepthHail {
+      input:
+        vcfs=[SubsetLarge.filtered_vcf, subtyped_depth_vcf],
+        prefix="~{prefix}.large_pesr_depth",
+        gcs_project=gcs_project,
+        sv_base_mini_docker=sv_base_mini_docker,
+
sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, + runtime_override_preconcat=runtime_override_preconcat_large_pesr_depth, + runtime_override_hail_merge=runtime_override_hail_merge_large_pesr_depth, + runtime_override_fix_header=runtime_override_fix_header_large_pesr_depth + } + } + if (!use_hail) { + call MiniTasks.ConcatVcfs as ConcatLargePesrDepth { + input: + vcfs=[SubsetLarge.filtered_vcf, subtyped_depth_vcf], + vcfs_idx=[SubsetLarge.filtered_vcf + ".tbi", subtyped_depth_vcf + ".tbi"], + allow_overlaps=true, + outfile_prefix="~{prefix}.large_pesr_depth", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_concat_large_pesr_depth + } + } + + call MiniTasks.MakeSitesOnlyVcf { + input: + vcf=select_first([ConcatLargePesrDepth.concat_vcf, ConcatLargePesrDepthHail.merged_vcf]), + vcf_index=select_first([ConcatLargePesrDepth.concat_vcf_idx, ConcatLargePesrDepthHail.merged_vcf_index]), + prefix="~{prefix}.large_pesr_depth.sites_only", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_make_sites_only + } + + # Fast cluster without sample overlap linkage for sharding + Int merge_shard_size = ceil(merging_shard_scale_factor / num_samples) + call ShardedCluster.ShardClusters { + input: + vcf=MakeSitesOnlyVcf.out, + prefix="~{prefix}.shard_clusters", + dist=1000000000, + frac=0.5, + svsize=0, + sv_types=[svtype], + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_override_shard_clusters + } + + call MiniTasks.ShardVidsForClustering { + input: + clustered_vcf=ShardClusters.out, + prefix=prefix, + records_per_shard=merge_shard_size, + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_override_shard_vids + } + + scatter (i in range(length(ShardVidsForClustering.out))) { + call MiniTasks.PullVcfShard { + input: + vcf=select_first([ConcatLargePesrDepth.concat_vcf, ConcatLargePesrDepthHail.merged_vcf]), + 
vids=ShardVidsForClustering.out[i], + prefix="~{prefix}.unclustered.shard_${i}", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_pull_vcf_shard + } + call MergePesrDepthShard { + input: + vcf=PullVcfShard.out, + vcf_index=PullVcfShard.out_index, + prefix="~{prefix}.merge_pesr_depth.shard_~{i}", + vid_prefix="~{cohort_name}_~{contig}_mpd~{i}", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_override_merge_pesr_depth + } + call MiniTasks.SortVcf { + input: + vcf = MergePesrDepthShard.out, + outfile_prefix = "~{prefix}.sorted.shard_${i}", + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_override = runtime_override_sort_merged_vcf + } + } + + if (use_hail) { + call HailMerge.HailMerge as ConcatShardsHail { + input: + vcfs=flatten([[SubsetSmall.filtered_vcf], SortVcf.out]), + prefix="~{prefix}.concat_shards", + gcs_project=gcs_project, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, + runtime_override_preconcat=runtime_override_preconcat_pesr_depth_shards, + runtime_override_hail_merge=runtime_override_hail_merge_pesr_depth_shards, + runtime_override_fix_header=runtime_override_fix_header_pesr_depth_shards + } + } + if (!use_hail) { + call MiniTasks.ConcatVcfs as ConcatShards { + input: + vcfs=flatten([[SubsetSmall.filtered_vcf], SortVcf.out]), + vcfs_idx=flatten([[SubsetSmall.filtered_vcf_idx], SortVcf.out_index]), + allow_overlaps=true, + outfile_prefix="~{prefix}.concat_shards", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_concat_shards + } + } + + output { + File out = select_first([ConcatShards.concat_vcf, ConcatShardsHail.merged_vcf]) + File out_index = select_first([ConcatShards.concat_vcf_idx, ConcatShardsHail.merged_vcf_index]) + } +} + + +task MergePesrDepthShard { + input { + File vcf + File vcf_index + String prefix + String vid_prefix + String sv_pipeline_docker 
+ RuntimeAttr? runtime_attr_override + } + + String output_file = prefix + ".vcf.gz" + + # when filtering/sorting/etc, memory usage will likely go up (much of the data will have to + # be held in memory or disk while working, potentially in a form that takes up more space) + Float input_size = size(vcf, "GiB") + RuntimeAttr runtime_default = object { + mem_gb: 2.0 + 0.6 * input_size, + disk_gb: ceil(10.0 + 6 * input_size), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} SSD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + /opt/sv-pipeline/04_variant_resolution/scripts/merge_pesr_depth.py \ + --prefix ~{vid_prefix} \ + ~{vcf} \ + ~{output_file} + >>> + + output { + File out = output_file + } +} diff --git a/wdl/Module07MinGQ.wdl b/wdl/Module07MinGQ.wdl index e2bf9263e..c134d0f45 100644 --- a/wdl/Module07MinGQ.wdl +++ b/wdl/Module07MinGQ.wdl @@ -6,12 +6,11 @@ import "Structs.wdl" import "TasksMakeCohortVcf.wdl" as MiniTasks import "ReviseSVtypeINStoMEI.wdl" as ReviseSVtype - - workflow Module07MinGQ { - input{ + input { String sv_base_mini_docker String sv_pipeline_docker + String sv_pipeline_updates_docker File vcf File vcf_idx String prefix @@ -56,7 +55,7 @@ workflow Module07MinGQ { Array[Array[String]] contigs = read_tsv(contiglist) # Get svtype of MEI - call 
ReviseSVtype.ReviseSVtypeINStoMEI as ReviseSVtypeMEI{ + call ReviseSVtype.ReviseSVtypeINStoMEI as ReviseSVtypeMEI { input: vcf = vcf, vcf_idx = vcf_idx, @@ -87,20 +86,23 @@ workflow Module07MinGQ { input: vcf=ReviseSVtypeMEI.updated_vcf, vcf_idx=ReviseSVtypeMEI.updated_vcf_idx, - contig=contig[0], sv_per_shard=1000, - prefix=prefix, - sv_pipeline_docker=sv_pipeline_docker - } - call SplitPcrVcf { - input: - vcf=getAFs.vcf_wAFs, prefix="~{prefix}.~{contig[0]}", - pcrplus_samples_list=pcrplus_samples_list, - sv_base_mini_docker=sv_base_mini_docker + sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_updates_docker=sv_pipeline_updates_docker } + if (defined(pcrplus_samples_list)) { + call SplitPcrVcf { + input: + vcf=getAFs.vcf_wAFs, + prefix="~{prefix}.~{contig[0]}", + pcrplus_samples_list=pcrplus_samples_list, + sv_base_mini_docker=sv_base_mini_docker + } + } + File pcr_minus_vcf = select_first([SplitPcrVcf.PCRMINUS_vcf, getAFs.vcf_wAFs]) - # Dev note Feb 18 2021: the output from cat_AF_table_PCRMINUS is a required + # Dev note Feb 18 2021: the output from cat_AF_table_PCRMINUS is a required # input to Module07XfBatchEffect.wdl, so the subsequent three tasks always # need to be generated (even if passing a precomputed minGQ cutoff table) @@ -109,11 +111,11 @@ workflow Module07MinGQ { input: vcf=ReviseSVtypeMEI.updated_vcf, vcf_idx=ReviseSVtypeMEI.updated_vcf_idx, - contig=contig[0], sv_per_shard=1000, - prefix=prefix, + prefix="~{prefix}.~{contig[0]}", sample_pop_assignments=GetSampleLists.sample_PCR_labels, - sv_pipeline_docker=sv_pipeline_docker + sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_updates_docker=sv_pipeline_updates_docker } # Gather table of AC/AN/AF for PCRPLUS and PCRMINUS samples call GetAfTables { @@ -133,12 +135,12 @@ workflow Module07MinGQ { } - if (MingqTraining){ + if (MingqTraining) { ###PCRMINUS call SplitFamfile as SplitFamfile_PCRMINUS { input: - vcf=SplitPcrVcf.PCRMINUS_vcf[0], - vcf_idx=SplitPcrVcf.PCRMINUS_vcf_idx[0], + 
vcf=pcr_minus_vcf, + vcf_idx=pcr_minus_vcf + ".tbi", famfile=trios_famfile, fams_per_shard=1, prefix="~{prefix}.PCRMINUS", @@ -147,7 +149,7 @@ workflow Module07MinGQ { scatter ( fam in SplitFamfile_PCRMINUS.famfile_shards ) { call CollectTrioSVdat as CollectTrioSVdat_PCRMINUS { input: - vcf_shards=SplitPcrVcf.PCRMINUS_vcf, + vcf_shards=pcr_minus_vcf, famfile=fam, sv_pipeline_docker=sv_pipeline_docker } @@ -263,53 +265,6 @@ workflow Module07MinGQ { } } -# revise svtype of MEIs to SVTYPE=MEI -task ReviseSVtypeMEI{ - input{ - File vcf - File vcf_idx - String prefix - String sv_base_mini_docker - RuntimeAttr? runtime_attr_override - } - - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 3.75, - disk_gb: 100, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } - - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - - command <<< - zcat ~{vcf} | grep '#' > ~{prefix}.vcf - zcat ~{vcf} | grep -v '#' | grep "INS:ME" | sed -e "s/SVTYPE=INS/SVTYPE=MEI/" >> ~{prefix}.vcf - zcat ~{vcf} | grep -v '#' | grep -v "INS:ME" >> ~{prefix}.vcf - mkdir tmp - vcf-sort -t tmp/ ~{prefix}.vcf | bgzip > ~{prefix}.vcf.gz - tabix -p vcf ~{prefix}.vcf.gz - >>> - - output{ - File updated_vcf = "~{prefix}.vcf.gz" - File updated_vcf_idx = "~{prefix}.vcf.gz.tbi" - } - - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - docker: sv_base_mini_docker - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - } -} - - # Get lists of PCRPLUS and PCRMINUS samples present in input VCF task GetSampleLists { input{ @@ -324,7 +279,7 @@ task GetSampleLists { 
RuntimeAttr default_attr = object { cpu_cores: 1, mem_gb: 3.75, - disk_gb: 50, + disk_gb: ceil(10 + size(vcf, "GB")), boot_disk_gb: 10, preemptible_tries: 3, max_retries: 1 @@ -332,26 +287,17 @@ task GetSampleLists { RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) command <<< set -euo pipefail - tabix -H ~{vcf} | fgrep -v "##" | cut -f10- | sed 's/\t/\n/g' > all_samples.list - if [ ! -z "~{pcrplus_samples_list}" ];then - fgrep -wf ~{pcrplus_samples_list} all_samples.list > "~{prefix}.PCRPLUS.samples.list" || true - fgrep -wvf ~{pcrplus_samples_list} all_samples.list > "~{prefix}.PCRMINUS.samples.list" || true - cat \ - <( awk -v OFS="\t" '{ print $1, "PCRPLUS" }' "~{prefix}.PCRPLUS.samples.list" || true ) \ - <( awk -v OFS="\t" '{ print $1, "PCRMINUS" }' "~{prefix}.PCRMINUS.samples.list" || true ) \ - > "~{prefix}.PCR_status_assignments.txt" + bcftools query -l ~{vcf} > all_samples.list + if ~{defined(pcrplus_samples_list)}; then + awk -v OFS="\t" 'ARGIND==1{inFileA[$1]; next} {if($1 in inFileA){print $1,"PCRPLUS"}else{print $1,"PCRMINUS"}}' ~{pcrplus_samples_list} all_samples.list \ + > ~{prefix}.PCR_status_assignments.txt else - cp all_samples.list "~{prefix}.PCRMINUS.samples.list" - cat \ - <( awk -v OFS="\t" '{ print $1, "PCRMINUS" }' "~{prefix}.PCRMINUS.samples.list" || true ) \ - > "~{prefix}.PCR_status_assignments.txt" - touch ~{prefix}.PCRPLUS.samples.list + awk -v OFS="\t" '{ print $1, "PCRMINUS" }' all_samples.list \ + > ~{prefix}.PCR_status_assignments.txt fi >>> output { - File updated_pcrplus_samples_list = "~{prefix}.PCRPLUS.samples.list" - File updated_PCRMINUS_samples_list = "~{prefix}.PCRMINUS.samples.list" File sample_PCR_labels = "~{prefix}.PCR_status_assignments.txt" } @@ -372,14 +318,14 @@ task SplitPcrVcf { input{ File vcf String prefix - File? pcrplus_samples_list + File pcrplus_samples_list String sv_base_mini_docker RuntimeAttr? 
runtime_attr_override
  }

  RuntimeAttr default_attr = object {
    cpu_cores: 1,
    mem_gb: 3.75,
-    disk_gb: 50,
+    disk_gb: ceil(10 + size(vcf, "GB") * 2),
    boot_disk_gb: 10,
    preemptible_tries: 3,
    max_retries: 1
@@ -387,34 +333,14 @@ task SplitPcrVcf {
  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])

  command <<<
-    if [ ! -z "~{pcrplus_samples_list}" ] && [ $( cat "~{pcrplus_samples_list}" | wc -l ) -gt 0 ]; then
-      #Get index of PCR+ samples
-      PCRPLUS_idxs=$( zcat ~{vcf} | sed -n '1,500p' | fgrep "#" | fgrep -v "##" \
-        | sed 's/\t/\n/g' | awk -v OFS="\t" '{ print NR, $1 }' \
-        | fgrep -wf ~{pcrplus_samples_list} | cut -f1 | paste -s -d, )
-      #Get PCR+ VCF
-      zcat ~{vcf} \
-      | cut -f1-9,"$PCRPLUS_idxs" \
-      | bgzip -c \
-      > "~{prefix}.PCRPLUS.vcf.gz"
-      tabix -f -p vcf "~{prefix}.PCRPLUS.vcf.gz"
-      #Get PCR- VCF
-      zcat ~{vcf} \
-      | cut --complement -f"$PCRPLUS_idxs" \
-      | bgzip -c \
-      > "~{prefix}.PCRMINUS.vcf.gz"
-      tabix -f -p vcf "~{prefix}.PCRMINUS.vcf.gz"
-    else
-      cp ~{vcf} ~{prefix}.PCRMINUS.vcf.gz
-      tabix -f -p vcf "~{prefix}.PCRMINUS.vcf.gz"
-      touch ~{prefix}.PCRPLUS.vcf.gz
-      touch ~{prefix}.PCRPLUS.vcf.gz.tbi
-    fi
+    bcftools query -l ~{vcf} > all_samples.list
+    awk 'ARGIND==1{inFileA[$1]; next} !($1 in inFileA)' ~{pcrplus_samples_list} all_samples.list \
+      > pcrminus_samples.list
+    bcftools view -S pcrminus_samples.list ~{vcf} -Oz -o ~{prefix}.PCRMINUS.vcf.gz
+    tabix ~{prefix}.PCRMINUS.vcf.gz
  >>>

  output {
-    File PCRPLUS_vcf = "~{prefix}.PCRPLUS.vcf.gz"
-    File PCRPLUS_vcf_idx = "~{prefix}.PCRPLUS.vcf.gz.tbi"
    File PCRMINUS_vcf = "~{prefix}.PCRMINUS.vcf.gz"
    File PCRMINUS_vcf_idx = "~{prefix}.PCRMINUS.vcf.gz.tbi"
  }
@@ -444,7 +370,7 @@ task GetAfTables {
  RuntimeAttr default_attr = object {
    cpu_cores: 1,
    mem_gb: 3.75,
-    disk_gb: 50,
+    disk_gb: ceil(10 + size(vcf, "GB") * 3),
    boot_disk_gb: 10,
    preemptible_tries: 3,
    max_retries: 1
@@ -464,9 +390,9 @@
      | cut -f2 \
      | paste -s -d\, || true )
    cut -f"$idxs" "~{prefix}.vcf2bed.bed" \
-      | sed
's/^name/\#VID/g' \ - | gzip -c \ - > "~{prefix}.frequencies.preclean.txt.gz" + | sed 's/^name/\#VID/g' \ + | gzip -c \ + > "~{prefix}.frequencies.preclean.txt.gz" if [ ! -z "~{pcrplus_samples_list}" ]; then echo -e "dummy\tPCRMINUS\ndummy2\tPCRPLUS" > dummy.tsv else @@ -481,9 +407,9 @@ task GetAfTables { AC_idx=$( zcat "~{prefix}.frequencies.txt.gz" | sed -n '1p' | sed 's/\t/\n/g' | awk -v PCR="$PCR" '{ if ($1==PCR"_AC") print NR }' ) AN_idx=$( zcat "~{prefix}.frequencies.txt.gz" | sed -n '1p' | sed 's/\t/\n/g' | awk -v PCR="$PCR" '{ if ($1==PCR"_AN") print NR }' ) zcat "~{prefix}.frequencies.txt.gz" \ - | sed '1d' \ - | awk -v FS="\t" -v OFS="\t" -v AC="$AC_idx" -v AN="$AN_idx" \ - '{ print $1, $(AC), $(AN) }' \ + | sed '1d' \ + | awk -v FS="\t" -v OFS="\t" -v AC="$AC_idx" -v AN="$AN_idx" \ + '{ print $1, $(AC), $(AN) }' \ > ~{prefix}."$PCR".AF_preMinGQ.txt done if [ ! -z ~{prefix}.PCRPLUS.AF_preMinGQ.txt ]; then diff --git a/wdl/PatchSRBothsidePass.wdl b/wdl/PatchSRBothsidePass.wdl new file mode 100644 index 000000000..7b6ad66ec --- /dev/null +++ b/wdl/PatchSRBothsidePass.wdl @@ -0,0 +1,133 @@ +version 1.0 + +import "Utils.wdl" as utils +import "Structs.wdl" + +workflow PatchSRBothsidePass { + input { + Array[File] batch_sample_lists + File cohort_vcf + File updated_bothside_pass_list + String cohort_name + String contig + + File patch_script + + String sv_base_mini_docker + String sv_pipeline_docker + + RuntimeAttr? runtime_attr_get_non_ref_vids + RuntimeAttr? 
runtime_attr_calculate_support_frac + } + + scatter (i in range(length(batch_sample_lists))) { + call GetNonRefVariantLists { + input: + samples_list=batch_sample_lists[i], + cohort_vcf=cohort_vcf, + prefix="~{cohort_name}.~{contig}.non_ref_variants.shard_~{i}", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_attr_get_non_ref_vids + } + } + + call RecalculateBothsideSupportFractions { + input: + patch_script=patch_script, + non_ref_vid_lists=GetNonRefVariantLists.out, + updated_bothside_pass_list=updated_bothside_pass_list, + num_batches=length(batch_sample_lists), + prefix="~{cohort_name}.~{contig}.sr_bothside_support.patched", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_calculate_support_frac + } + + output { + File out = RecalculateBothsideSupportFractions.out + } +} + +task GetNonRefVariantLists { + input { + File samples_list + File cohort_vcf + String prefix + String sv_base_mini_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size(cohort_vcf, "GB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_base_mini_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + bcftools view --samples-file ~{samples_list} ~{cohort_vcf} \ + 
| bcftools view -G -i 'SUM(AC)>0||SUM(FORMAT/SR_GT)>0' \ + | bcftools query -f '%ID\n' \ + > ~{prefix}.list + >>> + output { + File out = "~{prefix}.list" + } +} + +task RecalculateBothsideSupportFractions { + input { + File patch_script + Array[File] non_ref_vid_lists + File updated_bothside_pass_list + Int num_batches + String prefix + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size(non_ref_vid_lists, "GB") + size(updated_bothside_pass_list, "GB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 2.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + python ~{patch_script} \ + ~{write_lines(non_ref_vid_lists)} \ + ~{updated_bothside_pass_list} \ + ~{num_batches} \ + > ~{prefix}.txt + >>> + output { + File out = "~{prefix}.txt" + } +} \ No newline at end of file diff --git a/wdl/PatchSRBothsidePassScatter.wdl b/wdl/PatchSRBothsidePassScatter.wdl new file mode 100644 index 000000000..ae4d77e6b --- /dev/null +++ b/wdl/PatchSRBothsidePassScatter.wdl @@ -0,0 +1,54 @@ +version 1.0 + +import "Utils.wdl" as utils +import "PatchSRBothsidePass.wdl" as patch +import "Structs.wdl" + +workflow PatchSRBothsidePassScatter { + input { + Array[File] batch_vcfs + 
Array[File] cohort_contig_vcfs + Array[File] updated_bothside_pass_lists + String cohort_name + File contig_list + + File patch_script + + String sv_base_mini_docker + String sv_pipeline_docker + + RuntimeAttr? runtime_attr_get_sample_ids + RuntimeAttr? runtime_attr_get_non_ref_vids + RuntimeAttr? runtime_attr_calculate_support_frac + } + + scatter (i in range(length(batch_vcfs))) { + call utils.GetSampleIdsFromVcf { + input: + vcf=batch_vcfs[i], + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_attr_get_sample_ids + } + } + + Array[String] contigs = transpose(read_tsv(contig_list))[0] + scatter ( i in range(length(contigs)) ) { + call patch.PatchSRBothsidePass { + input: + batch_sample_lists=GetSampleIdsFromVcf.out_file, + cohort_vcf=cohort_contig_vcfs[i], + updated_bothside_pass_list=updated_bothside_pass_lists[i], + cohort_name=cohort_name, + contig=contigs[i], + patch_script=patch_script, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_get_non_ref_vids=runtime_attr_get_non_ref_vids, + runtime_attr_calculate_support_frac=runtime_attr_calculate_support_frac + } + } + + output { + Array[File] out = PatchSRBothsidePass.out + } +} diff --git a/wdl/ResolveComplexVariants.wdl b/wdl/ResolveComplexVariants.wdl index 220607943..3ecd65740 100644 --- a/wdl/ResolveComplexVariants.wdl +++ b/wdl/ResolveComplexVariants.wdl @@ -23,18 +23,19 @@ workflow ResolveComplexVariants { File pe_exclude_list File ref_dict + Boolean use_hail = false + String? gcs_project + String sv_base_mini_docker String sv_pipeline_docker + String sv_pipeline_hail_docker # overrides for local tasks RuntimeAttr? runtime_override_update_sr_list_pass RuntimeAttr? runtime_override_update_sr_list_fail RuntimeAttr? runtime_override_integrate_resolved_vcfs RuntimeAttr? runtime_override_rename_variants - RuntimeAttr? runtime_override_breakpoint_overlap_filter - - # overrides for mini tasks RuntimeAttr? 
runtime_override_subset_inversions RuntimeAttr? runtime_override_concat @@ -46,7 +47,10 @@ workflow ResolveComplexVariants { RuntimeAttr? runtime_override_resolve_cpx_per_shard RuntimeAttr? runtime_override_restore_unresolved_cnv_per_shard RuntimeAttr? runtime_override_concat_resolved_per_shard - RuntimeAttr? runtime_override_merge_resolve_inner + RuntimeAttr? runtime_override_pull_vcf_shard + RuntimeAttr? runtime_override_preconcat + RuntimeAttr? runtime_override_hail_merge + RuntimeAttr? runtime_override_fix_header RuntimeAttr? runtime_override_get_se_cutoff_inv RuntimeAttr? runtime_override_shard_vcf_cpx_inv @@ -55,7 +59,10 @@ workflow ResolveComplexVariants { RuntimeAttr? runtime_override_resolve_cpx_per_shard_inv RuntimeAttr? runtime_override_restore_unresolved_cnv_per_shard_inv RuntimeAttr? runtime_override_concat_resolved_per_shard_inv - RuntimeAttr? runtime_override_merge_resolve_inner_inv + RuntimeAttr? runtime_override_pull_vcf_shard_inv + RuntimeAttr? runtime_override_preconcat_inv + RuntimeAttr? runtime_override_hail_merge_inv + RuntimeAttr? 
runtime_override_fix_header_inv } #Scatter per chromosome @@ -87,11 +94,13 @@ workflow ResolveComplexVariants { mei_bed=mei_bed, pe_exclude_list=pe_exclude_list, rf_cutoff_files=rf_cutoff_files, - inv_only=true, ref_dict=ref_dict, precluster_distance=50000000, precluster_overlap_frac=0.1, + use_hail=use_hail, + gcs_project=gcs_project, sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, sv_base_mini_docker=sv_base_mini_docker, runtime_override_get_se_cutoff=runtime_override_get_se_cutoff_inv, runtime_override_shard_vcf_cpx=runtime_override_shard_vcf_cpx_inv, @@ -100,7 +109,10 @@ workflow ResolveComplexVariants { runtime_override_resolve_cpx_per_shard=runtime_override_resolve_cpx_per_shard_inv, runtime_override_restore_unresolved_cnv_per_shard=runtime_override_restore_unresolved_cnv_per_shard_inv, runtime_override_concat_resolved_per_shard=runtime_override_concat_resolved_per_shard_inv, - runtime_override_merge_resolve_inner=runtime_override_merge_resolve_inner_inv + runtime_override_pull_vcf_shard=runtime_override_pull_vcf_shard_inv, + runtime_override_preconcat=runtime_override_preconcat_inv, + runtime_override_hail_merge=runtime_override_hail_merge_inv, + runtime_override_fix_header=runtime_override_fix_header_inv } #Run same-bp overlap filter on full vcf @@ -127,11 +139,13 @@ workflow ResolveComplexVariants { mei_bed=mei_bed, pe_exclude_list=pe_exclude_list, rf_cutoff_files=rf_cutoff_files, - inv_only=false, ref_dict=ref_dict, - precluster_distance=1000, - precluster_overlap_frac=0, + precluster_distance=2000, + precluster_overlap_frac=0.000000001, + use_hail=use_hail, + gcs_project=gcs_project, sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, sv_base_mini_docker=sv_base_mini_docker, runtime_override_get_se_cutoff=runtime_override_get_se_cutoff, runtime_override_shard_vcf_cpx=runtime_override_shard_vcf_cpx, @@ -140,7 +154,10 @@ workflow ResolveComplexVariants { 
runtime_override_resolve_cpx_per_shard=runtime_override_resolve_cpx_per_shard, runtime_override_restore_unresolved_cnv_per_shard=runtime_override_restore_unresolved_cnv_per_shard, runtime_override_concat_resolved_per_shard=runtime_override_concat_resolved_per_shard, - runtime_override_merge_resolve_inner=runtime_override_merge_resolve_inner, + runtime_override_pull_vcf_shard=runtime_override_pull_vcf_shard, + runtime_override_preconcat=runtime_override_preconcat, + runtime_override_hail_merge=runtime_override_hail_merge, + runtime_override_fix_header=runtime_override_fix_header } #Integrate inv-only and all-variants resolved VCFs @@ -158,6 +175,8 @@ workflow ResolveComplexVariants { input: vcf=IntegrateResolvedVcfs.integrated_vcf, prefix="~{cohort_name}.~{contig}.renamed", + chrom=contig, + vid_prefix=cohort_name, sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_rename_variants } @@ -167,7 +186,7 @@ workflow ResolveComplexVariants { input: vcf=RenameVariants.renamed_vcf, original_list=cluster_bothside_pass_lists[i], - outfile="sr_bothside_pass.~{contig}.updated3.txt", + outfile="~{cohort_name}.~{contig}.sr_bothside_pass.updated3.txt", sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_update_sr_list_pass } @@ -177,7 +196,7 @@ workflow ResolveComplexVariants { input: vcf=RenameVariants.renamed_vcf, original_list=cluster_background_fail_lists[i], - outfile="sr_background_fail.~{contig}.updated3.txt", + outfile="~{cohort_name}.~{contig}.sr_background_fail.updated3.txt", sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_update_sr_list_fail } @@ -202,6 +221,8 @@ workflow ResolveComplexVariants { Array[File] complex_resolve_vcf_indexes = RenameVariants.renamed_vcf_index Array[File] complex_resolve_bothside_pass_lists = UpdateBothsidePass.updated_list Array[File] complex_resolve_background_fail_lists = UpdateBackgroundFail.updated_list + Array[File] breakpoint_overlap_dropped_record_vcfs = 
BreakpointOverlap.dropped_record_vcf + Array[File] breakpoint_overlap_dropped_record_vcf_indexes = BreakpointOverlap.dropped_record_vcf_index File? merged_vcf = ConcatVcfs.concat_vcf File? merged_vcf_index = ConcatVcfs.concat_vcf_idx } @@ -219,10 +240,10 @@ task IntegrateResolvedVcfs { Float input_size = size([inv_res_vcf, all_res_vcf], "GiB") RuntimeAttr runtime_default = object { - mem_gb: 2.0, - disk_gb: ceil(10 + input_size * 10), + mem_gb: 3.75, + disk_gb: ceil(10 + input_size * 20), cpu_cores: 1, - preemptible_tries: 1, + preemptible_tries: 3, max_retries: 1, boot_disk_gb: 10 } @@ -256,23 +277,23 @@ task IntegrateResolvedVcfs { ##get unresolved variants from full vcf that are resolved in inversion resolved vcf### zcat ~{inv_res_vcf} \ - | fgrep -v "#" \ + |fgrep -v "#" \ |awk '{if ($8!~"UNRESOLVED") print}' \ - |fgrep -wvf <(awk '{if ($NF!="MEMBERS") print $NF}' all.resolved.inv.bed \ - |tr ',' '\n') \ - >add.vcf.lines.txt || true + |awk -F'\t' -v OFS='\t' 'ARGIND==1{inFileA[$1]; next} {if (!($3 in inFileA)) print }' \ + <(awk '{if ($NF!="MEMBERS") print $NF}' all.resolved.inv.bed | tr ',' '\n') - \ + >add.vcf.lines.txt ##get unresolved variants id from full vcf to strip since they are resolved in inversion resolved vcf### ##inversions that cluster were other variants (rare) are kept as unresolved though they will also be part of a resolved variant in add.vcf.lines.txt## awk '{if ($NF!="MEMBERS") print $NF}' inv.resolve.bed \ |tr ',' '\n'\ - |fgrep -wf - all.unresolved.inv.bed \ + |awk -F'\t' -v OFS='\t' 'ARGIND==1{inFileA[$4]; next} {if ($4 in inFileA) print }' all.resolved.inv.bed - \ |awk '{if ($NF!~",")print $4}' \ - >remove.unresolved.vcf.ids.txt || true + >remove.unresolved.vcf.ids.txt mkdir temp zcat ~{all_res_vcf} \ - |fgrep -wvf remove.unresolved.vcf.ids.txt \ + |awk -F'\t' -v OFS='\t' 'ARGIND==1{inFileA[$1]; next} {if (!($3 in inFileA)) print }' remove.unresolved.vcf.ids.txt - \ |cat - add.vcf.lines.txt \ |bcftools sort - -O z -T temp \ > 
~{prefix}.vcf.gz @@ -326,24 +347,28 @@ task BreakpointOverlap { ~{vcf} \ ~{bothside_pass_list} \ ~{background_fail_list} \ + ~{prefix}.dropped_records.vcf.gz \ | bgzip \ > ~{prefix}.vcf.gz tabix ~{prefix}.vcf.gz + tabix ~{prefix}.dropped_records.vcf.gz >>> output { File out = "~{prefix}.vcf.gz" File out_index = "~{prefix}.vcf.gz.tbi" + File dropped_record_vcf = "~{prefix}.dropped_records.vcf.gz" + File dropped_record_vcf_index = "~{prefix}.dropped_records.vcf.gz.tbi" } } - - # Rename variants in VCF task RenameVariants { input { File vcf + String vid_prefix String prefix + String chrom String sv_pipeline_docker RuntimeAttr? runtime_attr_override } @@ -372,7 +397,7 @@ task RenameVariants { command <<< set -euo pipefail - /opt/sv-pipeline/04_variant_resolution/scripts/rename.py --prefix ~{prefix} ~{vcf} - \ + /opt/sv-pipeline/04_variant_resolution/scripts/rename.py --chrom ~{chrom} --prefix ~{vid_prefix} ~{vcf} - \ | bgzip \ > ~{prefix}.vcf.gz tabix ~{prefix}.vcf.gz diff --git a/wdl/ResolveCpxSv.wdl b/wdl/ResolveCpxSv.wdl index 4630263f5..cd7b103a9 100644 --- a/wdl/ResolveCpxSv.wdl +++ b/wdl/ResolveCpxSv.wdl @@ -3,6 +3,7 @@ version 1.0 # Author: Ryan Collins import "TasksMakeCohortVcf.wdl" as MiniTasks +import "HailMerge.wdl" as HailMerge #Resolve complex SV for a single chromosome workflow ResolveComplexSv { @@ -16,13 +17,16 @@ workflow ResolveComplexSv { Array[File] disc_files Array[File] rf_cutoff_files File pe_exclude_list - Boolean inv_only File ref_dict Int precluster_distance Float precluster_overlap_frac + Boolean use_hail + String? gcs_project + String sv_pipeline_docker + String sv_pipeline_hail_docker String sv_base_mini_docker # overrides for local tasks @@ -32,10 +36,12 @@ workflow ResolveComplexSv { RuntimeAttr? runtime_override_resolve_prep RuntimeAttr? runtime_override_resolve_cpx_per_shard RuntimeAttr? runtime_override_restore_unresolved_cnv_per_shard - RuntimeAttr? runtime_override_merge_resolve_inner - - # overrides for MiniTasks RuntimeAttr? 
runtime_override_concat_resolved_per_shard + RuntimeAttr? runtime_override_pull_vcf_shard + + RuntimeAttr? runtime_override_preconcat + RuntimeAttr? runtime_override_hail_merge + RuntimeAttr? runtime_override_fix_header } File vcf_idx = vcf + ".tbi" @@ -45,11 +51,7 @@ workflow ResolveComplexSv { File disc_files_idx = disc_files[i] + ".tbi" } - # Get SR count cutoff from RF metrics to use in single-ender rescan procedure - #Shard vcf for complex resolution - #Note: as of Nov 2, 2018, return lists of variant IDs for each shard. This should - # dramatically improve sharding speed call ShardVcfCpx { input: vcf=vcf, @@ -61,7 +63,7 @@ workflow ResolveComplexSv { runtime_attr_override=runtime_override_shard_vcf_cpx } - call MiniTasks.ShardVids { + call MiniTasks.ShardVidsForClustering { input: clustered_vcf=ShardVcfCpx.out, prefix=prefix, @@ -70,8 +72,9 @@ workflow ResolveComplexSv { runtime_attr_override=runtime_override_shard_vids } - if (length(ShardVids.out) > 0) { + if (length(ShardVidsForClustering.out) > 0) { + # Get SR count cutoff from RF metrics to use in single-ender rescan procedure call GetSeCutoff { input: rf_cutoffs=rf_cutoff_files, @@ -80,13 +83,21 @@ workflow ResolveComplexSv { } #Scatter over shards and resolve variants per shard - scatter ( i in range(length(ShardVids.out)) ) { + scatter ( i in range(length(ShardVidsForClustering.out)) ) { + + call MiniTasks.PullVcfShard { + input: + vcf=vcf, + vids=ShardVidsForClustering.out[i], + prefix="~{prefix}.shard_${i}", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_pull_vcf_shard + } #Prep files for svtk resolve using bucket streaming call ResolvePrep { input: - vcf=vcf, - VIDs_list=ShardVids.out[i], + vcf=PullVcfShard.out, chrom=contig, disc_files=disc_files, disc_files_index=disc_files_idx, @@ -98,7 +109,7 @@ workflow ResolveComplexSv { #Run svtk resolve call SvtkResolve { input: - noref_vcf=ResolvePrep.noref_vcf, + vcf=PullVcfShard.out, 
prefix="~{prefix}.svtk_resolve.shard_~{i}", chrom=contig, cytobands=cytobands, @@ -113,20 +124,10 @@ workflow ResolveComplexSv { runtime_attr_override=runtime_override_resolve_cpx_per_shard } - call MergeResolve { - input: - full_vcf=ResolvePrep.subsetted_vcf, - resolved_vcf=SvtkResolve.rs_vcf, - prefix="~{prefix}.merge_resolve.shard_~{i}", - noref_vids=ResolvePrep.noref_vids, - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_merge_resolve_inner - } - #Add unresolved variants back into resolved VCF - call RestoreUnresolvedCnv as RestoreUnresolvedCnvPerShard { + call RestoreUnresolvedCnv { input: - resolved_vcf=MergeResolve.out, + resolved_vcf=SvtkResolve.rs_vcf, unresolved_vcf=SvtkResolve.un_vcf, prefix="~{prefix}.restore_unresolved.shard_~{i}", sv_pipeline_docker=sv_pipeline_docker, @@ -135,20 +136,36 @@ workflow ResolveComplexSv { } #Merge across shards - call MiniTasks.ConcatVcfs as ConcatResolvedPerShard { - input: - vcfs=RestoreUnresolvedCnvPerShard.res, - vcfs_idx=RestoreUnresolvedCnvPerShard.res_idx, - allow_overlaps=true, - outfile_prefix=prefix + ".resolved", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_concat_resolved_per_shard + if (use_hail) { + call HailMerge.HailMerge as ConcatResolvedPerShardHail { + input: + vcfs=RestoreUnresolvedCnv.res, + prefix="~{prefix}.resolved", + gcs_project=gcs_project, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, + runtime_override_preconcat=runtime_override_preconcat, + runtime_override_hail_merge=runtime_override_hail_merge, + runtime_override_fix_header=runtime_override_fix_header + } + } + if (!use_hail) { + call MiniTasks.ConcatVcfs as ConcatResolvedPerShard { + input: + vcfs=RestoreUnresolvedCnv.res, + vcfs_idx=RestoreUnresolvedCnv.res_idx, + allow_overlaps=true, + outfile_prefix="~{prefix}.resolved", + sv_base_mini_docker=sv_base_mini_docker, + 
runtime_attr_override=runtime_override_concat_resolved_per_shard + } } } output { - File resolved_vcf_merged = select_first([ConcatResolvedPerShard.concat_vcf, vcf]) - File resolved_vcf_merged_idx = select_first([ConcatResolvedPerShard.concat_vcf_idx, vcf_idx]) + File resolved_vcf_merged = select_first([ConcatResolvedPerShard.concat_vcf, ConcatResolvedPerShardHail.merged_vcf, vcf]) + File resolved_vcf_merged_idx = select_first([ConcatResolvedPerShard.concat_vcf_idx, ConcatResolvedPerShardHail.merged_vcf_index, vcf_idx]) } } @@ -202,7 +219,6 @@ task GetSeCutoff { } } - task ShardVcfCpx { input { File vcf @@ -242,6 +258,7 @@ task ShardVcfCpx { svtk vcfcluster <(echo "sites_only.vcf.gz") ~{prefix}.vcf \ -d ~{dist} \ -f ~{frac} \ + --single-end \ -p candidate_complex_clusters \ --svtypes DEL,DUP,INS,INV,BND \ --ignore-svtypes \ @@ -261,7 +278,6 @@ task ShardVcfCpx { task ResolvePrep { input { File vcf - File VIDs_list File ref_dict String chrom Array[File] disc_files @@ -281,18 +297,9 @@ task ResolvePrep { # sections of disc_files are straemed, but the every operation in this task is record-by-record except # bedtools merge, which should only need to keep a few records in memory at a time. 
- # assuming memory overhead is fixed - # assuming disk overhead is input size (accounting for compression) + sum(size of disc_files) - # (this is an over-estimate because we only take chunks overlapping VIDs from vcf, but the disk files are not *THAT* - # big and disk is cheap) - Float compressed_input_size = size(vcf, "GiB") - Float uncompressed_input_size = size([VIDs_list], "GiB") - Float compression_factor = 30.0 - Float base_disk_gb = 10.0 - Float base_mem_gb = 2.0 RuntimeAttr runtime_default = object { - mem_gb: base_mem_gb, - disk_gb: ceil(base_disk_gb + uncompressed_input_size + compression_factor * compressed_input_size), + mem_gb: 2.0, + disk_gb: ceil(20.0 + 200.0 * size(vcf, "GiB")), cpu_cores: 1, preemptible_tries: 3, max_retries: 1, @@ -315,53 +322,12 @@ task ResolvePrep { command <<< set -euxo pipefail - - # First, subset VCF to variants of interest - # -uncompress vcf - zcat "~{vcf}" > uncompressed.vcf - # -Extract vcf header: - # search for first line not starting with '#', stop immediately, - # take everything up to that point, then remove last line - ONLY_HEADER=false - grep -B9999999999 -m1 -Ev "^#" uncompressed.vcf | sed '$ d' > header.vcf \ - || ONLY_HEADER=true - - if $ONLY_HEADER; then - # filter is trivial, just copy the vcf - mv "~{vcf}" input.vcf.gz - else - rm -f "~{vcf}" - N_HEADER=$(wc -l < header.vcf) - # filter records, concatenate and zip - tail -n+$((N_HEADER+1)) uncompressed.vcf \ - | { fgrep -wf ~{VIDs_list} || true; } \ - | cat header.vcf - \ - | bgzip -c \ - > input.vcf.gz - rm -f uncompressed.vcf - fi - - #Second, extract all-ref variants from VCF. 
These break svtk resolve with - # remote tabixing enabled - svtk vcf2bed input.vcf.gz input.bed - { grep -Ev "^#" input.bed || true ; } \ - | awk -v FS="\t" '{ if ($6!="") print $4 }' \ - > noref.VIDs.list - - { - cat header.vcf; - zcat input.vcf.gz | fgrep -wf noref.VIDs.list || true; - } \ - | vcf-sort \ - | bgzip -c \ - > noref.vcf.gz - rm -f header.vcf - - #Third, use GATK to pull down the discfile chunks within ±2kb of all + #Use GATK to pull down the discfile chunks within ±2kb of all # INVERSION breakpoints, and bgzip / tabix echo "Forming regions.bed" - { grep -Ev "^#" input.bed || true; } \ + svtk vcf2bed ~{vcf} input.bed --no-samples --no-header + cat input.bed \ | (fgrep INV || printf "") \ | awk -v OFS="\t" -v buffer=2000 \ '{ print $1, $2-buffer, $2+buffer"\n"$1, $3-buffer, $3+buffer }' \ @@ -395,7 +361,7 @@ task ResolvePrep { rm ${SLICE}.PE.txt done < ~{write_lines(disc_files)} - #Fourth, merge PE files and add one artificial pair corresponding to the chromosome of interest + #Merge PE files and add one artificial pair corresponding to the chromosome of interest #This makes it so that svtk doesn't break downstream echo "Merging PE files" { @@ -423,9 +389,6 @@ task ResolvePrep { >>> output { - File subsetted_vcf = "input.vcf.gz" - File noref_vcf = "noref.vcf.gz" - File noref_vids = "noref.VIDs.list" File merged_discfile = "discfile.PE.txt.gz" File merged_discfile_idx = "discfile.PE.txt.gz.tbi" } @@ -434,7 +397,7 @@ task ResolvePrep { #Resolve complex SV task SvtkResolve { input { - File noref_vcf + File vcf String prefix String chrom File cytobands @@ -454,13 +417,15 @@ task SvtkResolve { # when filtering/sorting/etc, memory usage will likely go up (much of the data will have to # be held in memory or disk while working, potentially in a form that takes up more space) - Float input_size = size( - [noref_vcf, cytobands, mei_bed, pe_exclude_list, pe_exclude_list_idx,merged_discfile], "GiB") + Float input_size = size([vcf, merged_discfile], "GiB") + Float 
scaled_mem_gib = 3 + size(vcf, "GiB") * 40 + size(merged_discfile, "GiB") * 80 + # Cap memory at largest N2 VM size of 512 GB (476.8 GiB) + Float default_mem_gib = if (scaled_mem_gib < 476.0) then scaled_mem_gib else 476.0 RuntimeAttr runtime_default = object { - mem_gb: 3 + input_size * 10, + mem_gb: default_mem_gib, disk_gb: ceil(10 + input_size * 12), cpu_cores: 1, - preemptible_tries: 1, + preemptible_tries: 3, max_retries: 1, boot_disk_gb: 10 } @@ -480,7 +445,7 @@ task SvtkResolve { #Run svtk resolve on variants after all-ref exclusion svtk resolve \ - ~{noref_vcf} \ + ~{vcf} \ ~{resolved_vcf} \ -p AllBatches_CPX_~{chrom} \ -u ~{unresolved_vcf} \ @@ -500,60 +465,6 @@ task SvtkResolve { } } -task MergeResolve { - input { - File full_vcf - File resolved_vcf - String prefix - File noref_vids - String sv_base_mini_docker - RuntimeAttr? runtime_attr_override - } - - String out_vcf = "~{prefix}.resolved.vcf.gz" - - # when filtering/sorting/etc, memory usage will likely go up (much of the data will have to - # be held in memory or disk while working, potentially in a form that takes up more space) - Float input_size = size([full_vcf, resolved_vcf, noref_vids], "GiB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10 + input_size * 15), - cpu_cores: 1, - preemptible_tries: 1, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_base_mini_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, 
runtime_default.boot_disk_gb]) - } - - command <<< - set -eu -o pipefail - #Add all-ref variants back into resolved VCF - #Note: requires modifying the INFO field with sed & awk given pysam C bug - zcat ~{full_vcf} \ - | grep -Ev "^#" \ - | awk 'ARGIND==1{inFileA[$1]; next} {if (!($3 in inFileA)) print }' ~{noref_vids} - OFS='\t' \ - | sed -e 's/;MEMBERS=[^\t]*\t/\t/g' \ - | awk -v OFS="\t" '{ $8=$8";MEMBERS="$3; print }' \ - | cat <(zcat ~{resolved_vcf}) - \ - | vcf-sort \ - | bgzip \ - > ~{out_vcf} - >>> - - output { - File out = out_vcf - } -} - #Restore unresolved CNVs to resolved VCF task RestoreUnresolvedCnv { input { @@ -589,47 +500,47 @@ task RestoreUnresolvedCnv { } command <<< - set -eu -o pipefail + set -euo pipefail # get unresolved records - zcat ~{unresolved_vcf} \ - | (grep -v "^#" || printf "") \ - > unresolved_records.vcf + bcftools view --no-header ~{unresolved_vcf} -Oz -o unresolved_records.vcf.gz rm "~{unresolved_vcf}" # avoid possible obliteration of input file during later processing by writing # to temporary file (and postCPX_cleanup.py writing final result to output name) - zcat ~{resolved_vcf} > ~{resolved_plus_cnv}.tmp - rm ~{resolved_vcf} + mv ~{resolved_vcf} ~{resolved_plus_cnv}.tmp.gz #Add unresolved CNVs to resolved VCF and wipe unresolved status - cat unresolved_records.vcf \ + zcat unresolved_records.vcf.gz \ | (fgrep -e "" -e "" -e "SVTYPE=DEL" -e "SVTYPE=DUP" -e "SVTYPE=CNV" -e "SVTYPE=MCNV" || printf "") \ | sed -r -e 's/;EVENT=[^;]*;/;/' -e 's/;UNRESOLVED[^;]*;/;/g' \ | sed -r -e 's/;UNRESOLVED_TYPE[^;]*;/;/g' -e 's/;UNRESOLVED_TYPE[^\t]*\t/\t/g' \ - >> ~{resolved_plus_cnv}.tmp + | bgzip \ + >> ~{resolved_plus_cnv}.tmp.gz #Add other unresolved variants & retain unresolved status (except for inversion single enders) - cat unresolved_records.vcf \ + zcat unresolved_records.vcf.gz \ | (fgrep -v -e "" -e "" -e "SVTYPE=DEL" -e "SVTYPE=DUP" -e "SVTYPE=CNV" -e "SVTYPE=MCNV" \ -e "INVERSION_SINGLE_ENDER" || printf "") \ - >> 
~{resolved_plus_cnv}.tmp + | bgzip \ + >> ~{resolved_plus_cnv}.tmp.gz #Add inversion single enders as SVTYPE=BND - cat unresolved_records.vcf \ + zcat unresolved_records.vcf.gz \ | (fgrep -v -e "" -e "" -e "SVTYPE=DEL" -e "SVTYPE=DUP" -e "SVTYPE=CNV" -e "SVTYPE=MCNV" || printf "") \ | (fgrep -e "INVERSION_SINGLE_ENDER" || printf "") \ | sed -e 's/SVTYPE=INV/SVTYPE=BND/g' \ | sed -e 's/END=\([0-9]*\)/END=\1;END2=\1/' \ - >> ~{resolved_plus_cnv}.tmp - rm unresolved_records.vcf + | bgzip \ + >> ~{resolved_plus_cnv}.tmp.gz + rm unresolved_records.vcf.gz #Sort, clean, and compress - cat ~{resolved_plus_cnv}.tmp \ + zcat ~{resolved_plus_cnv}.tmp.gz \ | vcf-sort -c \ | /opt/sv-pipeline/04_variant_resolution/scripts/postCPX_cleanup.py \ /dev/stdin /dev/stdout \ - | bgzip -c \ + | bgzip \ > ~{resolved_plus_cnv} tabix ~{resolved_plus_cnv} >>> diff --git a/wdl/ScatterCpxGenotyping.wdl b/wdl/ScatterCpxGenotyping.wdl index 01be4ce5a..af21479ea 100644 --- a/wdl/ScatterCpxGenotyping.wdl +++ b/wdl/ScatterCpxGenotyping.wdl @@ -4,6 +4,7 @@ version 1.0 import "GenotypeCpxCnvs.wdl" as GenotypeCpx import "TasksMakeCohortVcf.wdl" as MiniTasks +import "HailMerge.wdl" as HailMerge # Workflow to perform depth-based genotyping for a single vcf shard scattered # across batches on predicted CPX CNVs @@ -11,8 +12,7 @@ workflow ScatterCpxGenotyping { input { File bin_exclude File vcf - Int n_master_vcf_shards - Int n_master_min_vars_per_vcf_shard + Int records_per_shard Array[String] batches Array[File] coverage_files Array[File] rd_depth_sep_cutoff_files @@ -26,16 +26,24 @@ workflow ScatterCpxGenotyping { String contig File ref_dict + Boolean use_hail + String? gcs_project + String linux_docker String sv_base_mini_docker + String sv_pipeline_updates_docker String sv_pipeline_docker + String sv_pipeline_hail_docker String sv_pipeline_rdtest_docker # overrides for MiniTasks - RuntimeAttr? runtime_override_ids_from_vcf RuntimeAttr? runtime_override_split_vcf_to_genotype RuntimeAttr? 
runtime_override_concat_cpx_cnv_vcfs + RuntimeAttr? runtime_override_preconcat + RuntimeAttr? runtime_override_hail_merge + RuntimeAttr? runtime_override_fix_header + # overrides for GenotypeCpx RuntimeAttr? runtime_override_ids_from_median RuntimeAttr? runtime_override_get_cpx_cnv_intervals @@ -49,18 +57,17 @@ workflow ScatterCpxGenotyping { String contig_prefix = prefix + "." + contig # Shard VCF into even slices - call MiniTasks.SplitVcf as SplitVcfToGenotype { + call MiniTasks.ScatterVcf as SplitVcfToGenotype { input: vcf=vcf, - prefix=contig_prefix + ".shard_", - n_shards=n_master_vcf_shards, - min_vars_per_shard=n_master_min_vars_per_vcf_shard, - sv_base_mini_docker=sv_base_mini_docker, + prefix=contig_prefix, + records_per_shard=records_per_shard, + sv_pipeline_docker=sv_pipeline_updates_docker, runtime_attr_override=runtime_override_split_vcf_to_genotype } # Scatter genotyping over shards - scatter ( shard in SplitVcfToGenotype.vcf_shards ) { + scatter ( shard in SplitVcfToGenotype.shards ) { # Run genotyping call GenotypeCpx.GenotypeCpxCnvs as GenotypeShard { input: @@ -92,20 +99,35 @@ workflow ScatterCpxGenotyping { } } - # Merge VCF shards - call MiniTasks.ConcatVcfs as ConcatCpxCnvVcfs { - input: - vcfs=GenotypeShard.cpx_depth_gt_resolved_vcf, - vcfs_idx=GenotypeShard.cpx_depth_gt_resolved_vcf_idx, - allow_overlaps=true, - outfile_prefix=contig_prefix + ".regenotyped", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_concat_cpx_cnv_vcfs + if (use_hail) { + call HailMerge.HailMerge as ConcatCpxCnvVcfsHail { + input: + vcfs=GenotypeShard.cpx_depth_gt_resolved_vcf, + prefix="~{prefix}.regenotyped", + gcs_project=gcs_project, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, + runtime_override_preconcat=runtime_override_preconcat, + runtime_override_hail_merge=runtime_override_hail_merge, + 
runtime_override_fix_header=runtime_override_fix_header + } + } + if (!use_hail) { + call MiniTasks.ConcatVcfs as ConcatCpxCnvVcfs { + input: + vcfs=GenotypeShard.cpx_depth_gt_resolved_vcf, + vcfs_idx=GenotypeShard.cpx_depth_gt_resolved_vcf_idx, + naive=true, + outfile_prefix="~{prefix}.regenotyped", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_concat_cpx_cnv_vcfs + } } # Output merged VCF output { - File cpx_depth_gt_resolved_vcf = ConcatCpxCnvVcfs.concat_vcf - File cpx_depth_gt_resolved_vcf_idx = ConcatCpxCnvVcfs.concat_vcf_idx + File cpx_depth_gt_resolved_vcf = select_first([ConcatCpxCnvVcfs.concat_vcf, ConcatCpxCnvVcfsHail.merged_vcf]) + File cpx_depth_gt_resolved_vcf_idx = select_first([ConcatCpxCnvVcfs.concat_vcf_idx, ConcatCpxCnvVcfsHail.merged_vcf_index]) } } diff --git a/wdl/ShardedCluster.wdl b/wdl/ShardedCluster.wdl index 62009d67a..8c3990f43 100644 --- a/wdl/ShardedCluster.wdl +++ b/wdl/ShardedCluster.wdl @@ -4,35 +4,45 @@ version 1.0 import "Structs.wdl" import "TasksMakeCohortVcf.wdl" as MiniTasks -import "Utils.wdl" as utils +import "HailMerge.wdl" as HailMerge # Workflow to shard a filtered vcf & run vcfcluster (sub-sub-sub workflow) workflow ShardedCluster { input { File vcf + Int num_samples Int dist Float frac String prefix String contig + String cohort_name + String evidence_type String sv_type Float sample_overlap File? exclude_list + File empty_file Int sv_size Array[String] sv_types Float merging_shard_scale_factor = 30000000 + Boolean use_hail + String? gcs_project + String sv_pipeline_docker + String sv_pipeline_hail_docker String sv_base_mini_docker - # Do not use - File? NONE_FILE_ - # overrides for local tasks - RuntimeAttr? runtime_override_shard_vcf_precluster + RuntimeAttr? runtime_override_shard_clusters + RuntimeAttr? runtime_override_shard_vids RuntimeAttr? runtime_override_pull_vcf_shard RuntimeAttr? runtime_override_svtk_vcf_cluster RuntimeAttr? 
runtime_override_get_vcf_header_with_members_info_line + RuntimeAttr? runtime_override_preconcat_sharded_cluster + RuntimeAttr? runtime_override_hail_merge_sharded_cluster + RuntimeAttr? runtime_override_fix_header_sharded_cluster + # overrides for merge subworkflow RuntimeAttr? runtime_override_merge_clusters RuntimeAttr? runtime_override_concat_inner_shards @@ -41,25 +51,32 @@ workflow ShardedCluster { RuntimeAttr? runtime_override_concat_sharded_cluster RuntimeAttr? runtime_override_sort_merged_vcf RuntimeAttr? runtime_override_count_samples + RuntimeAttr? runtime_override_get_vids + RuntimeAttr? runtime_override_cat_vid_lists_sharded + RuntimeAttr? runtime_override_make_sites_only } + File vcf_idx = vcf + ".tbi" if (defined(exclude_list)) { File exclude_list_idx = exclude_list + ".tbi" } - call utils.CountSamples { + call MiniTasks.MakeSitesOnlyVcf { input: - vcf=vcf, - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_count_samples + vcf=vcf, + vcf_index=vcf + ".tbi", + prefix="~{prefix}.sites_only", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_make_sites_only } - Int merge_shard_size = ceil(merging_shard_scale_factor / CountSamples.num_samples) + + Int merge_shard_size = ceil(merging_shard_scale_factor / num_samples) call ShardClusters { input: - vcf=vcf, - prefix="~{prefix}.shard_clusters", + vcf=MakeSitesOnlyVcf.out, + prefix="~{prefix}.sites_only.shard_clusters", dist=dist, frac=frac, exclude_list=exclude_list, @@ -67,24 +84,24 @@ workflow ShardedCluster { svsize=sv_size, sv_types=sv_types, sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_svtk_vcf_cluster + runtime_attr_override=runtime_override_shard_clusters } - call MiniTasks.ShardVids { + call MiniTasks.ShardVidsForClustering { input: clustered_vcf=ShardClusters.out, - prefix=prefix, + prefix="~{prefix}.sites_only.clustered", records_per_shard=merge_shard_size, sv_pipeline_docker=sv_pipeline_docker, - 
runtime_attr_override=runtime_override_svtk_vcf_cluster + runtime_attr_override=runtime_override_shard_vids } #Run vcfcluster per shard - scatter (i in range(length(ShardVids.out))) { - call PullVcfShard { + scatter (i in range(length(ShardVidsForClustering.out))) { + call MiniTasks.PullVcfShard { input: vcf=vcf, - vids=ShardVids.out[i], + vids=ShardVidsForClustering.out[i], prefix="~{prefix}.unclustered.shard_${i}", sv_base_mini_docker=sv_base_mini_docker, runtime_attr_override=runtime_override_pull_vcf_shard @@ -92,10 +109,10 @@ workflow ShardedCluster { call SvtkVcfCluster { input: vcf=PullVcfShard.out, - num_samples=CountSamples.num_samples, + num_samples=num_samples, num_vids=PullVcfShard.count, prefix="~{prefix}.clustered.shard_${i}", - vid_prefix="~{prefix}_~{contig}_~{sv_type}_~{i}", + vid_prefix="~{cohort_name}_~{contig}_~{evidence_type}_~{sv_type}_~{i}", dist=dist, frac=frac, exclude_list=exclude_list, @@ -119,27 +136,43 @@ workflow ShardedCluster { call GetVcfHeaderWithMembersInfoLine { input: vcf_gz=vcf, - prefix="~{prefix}.members", + prefix="~{prefix}.clustered", sv_base_mini_docker=sv_base_mini_docker, runtime_attr_override=runtime_override_get_vcf_header_with_members_info_line } } if (length(SvtkVcfCluster.out) > 0) { - call MiniTasks.ConcatVcfs { - input: - vcfs=SortVcf.out, - vcfs_idx=SortVcf.out_index, - allow_overlaps=true, - outfile_prefix="~{prefix}.clustered", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_concat_sharded_cluster + if (use_hail) { + call HailMerge.HailMerge as ConcatVcfsHail { + input: + vcfs=SortVcf.out, + prefix="~{prefix}.clustered", + gcs_project=gcs_project, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, + runtime_override_preconcat=runtime_override_preconcat_sharded_cluster, + runtime_override_hail_merge=runtime_override_hail_merge_sharded_cluster, + 
runtime_override_fix_header=runtime_override_fix_header_sharded_cluster + } + } + if (!use_hail) { + call MiniTasks.ConcatVcfs as ConcatVcfs { + input: + vcfs=SortVcf.out, + vcfs_idx=SortVcf.out_index, + allow_overlaps=true, + outfile_prefix="~{prefix}.clustered", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_concat_sharded_cluster + } } } #Output output { - File clustered_vcf = select_first([GetVcfHeaderWithMembersInfoLine.out, ConcatVcfs.concat_vcf]) - File clustered_vcf_idx = select_first([GetVcfHeaderWithMembersInfoLine.out_idx, ConcatVcfs.concat_vcf_idx]) + File clustered_vcf = select_first([GetVcfHeaderWithMembersInfoLine.out, ConcatVcfs.concat_vcf, ConcatVcfsHail.merged_vcf]) + File clustered_vcf_idx = select_first([GetVcfHeaderWithMembersInfoLine.out_idx, ConcatVcfs.concat_vcf_idx, ConcatVcfsHail.merged_vcf_index]) } } @@ -186,7 +219,7 @@ task GetVcfHeaderWithMembersInfoLine { } } -#Do fast cluster on sites-only vcf (sample_overlap = 0) to generate shards +#Do fast cluster on vcf (sample_overlap = 0) to generate shards task ShardClusters { input { File vcf @@ -225,11 +258,10 @@ task ShardClusters { command <<< set -euo pipefail - bcftools view -G ~{vcf} -Oz -o sites_only.vcf.gz ~{if defined(exclude_list) && !defined(exclude_list_idx) then "tabix -p bed ~{exclude_list}" else ""} #Run clustering - svtk vcfcluster <(echo "sites_only.vcf.gz") ~{prefix}.vcf.gz \ + svtk vcfcluster <(echo "~{vcf}") ~{prefix}.vcf.gz \ -d ~{dist} \ -f ~{frac} \ ~{if defined(exclude_list) then "-x ~{exclude_list}" else ""} \ @@ -247,47 +279,6 @@ task ShardClusters { } } -task PullVcfShard { - input { - File vcf - File vids - String prefix - String sv_base_mini_docker - RuntimeAttr? 
runtime_attr_override - } - - String output_prefix = "~{prefix}" - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + size(vcf, "GiB") * 2.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_base_mini_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - bcftools view --no-version --include ID=@~{vids} ~{vcf} -O z -o ~{output_prefix}.vcf.gz - wc -l < ~{vids} > count.txt - >>> - - output { - File out = "~{output_prefix}.vcf.gz" - Int count = read_int("count.txt") - } -} - task SvtkVcfCluster { input { File vcf @@ -307,7 +298,6 @@ task SvtkVcfCluster { } Float default_mem_gb = 3.75 + (120.0 * (num_vids / 19000.0) * (num_samples / 140000.0)) - String output_prefix = "~{prefix}" RuntimeAttr runtime_default = object { mem_gb: default_mem_gb, disk_gb: ceil(10.0 + size(vcf, "GiB") * 2.0), @@ -330,6 +320,7 @@ task SvtkVcfCluster { command <<< set -euo pipefail ~{if defined(exclude_list) && !defined(exclude_list_idx) then "tabix -p bed ~{exclude_list}" else ""} + #Run clustering svtk vcfcluster <(echo "~{vcf}") - \ -d ~{dist} \ @@ -342,10 +333,10 @@ task SvtkVcfCluster { --preserve-ids \ --preserve-genotypes \ --preserve-header \ - | gzip > ~{output_prefix}.vcf.gz + | gzip > ~{prefix}.vcf.gz >>> output { - File out = "~{output_prefix}.vcf.gz" + File out = "~{prefix}.vcf.gz" } } \ 
No newline at end of file diff --git a/wdl/TasksMakeCohortVcf.wdl b/wdl/TasksMakeCohortVcf.wdl index 9bc2e6a2d..a6a49cca3 100644 --- a/wdl/TasksMakeCohortVcf.wdl +++ b/wdl/TasksMakeCohortVcf.wdl @@ -132,7 +132,7 @@ task SortVcf { RuntimeAttr runtime_default = object { mem_gb: 3.75, - disk_gb: ceil(10.0 + size(vcf, "GB") * 20), + disk_gb: ceil(10.0 + size(vcf, "GB") * 40), cpu_cores: 1, preemptible_tries: 3, max_retries: 1, @@ -176,15 +176,13 @@ task ConcatVcfs { Boolean allow_overlaps = false Boolean naive = false Boolean generate_index = true + Boolean sites_only = false + Boolean sort_vcf_list = false String? outfile_prefix String sv_base_mini_docker RuntimeAttr? runtime_attr_override } - String outfile_name = outfile_prefix + ".vcf.gz" - String allow_overlaps_flag = if allow_overlaps then "--allow-overlaps" else "" - String naive_flag = if naive then "--naive" else "" - # when filtering/sorting/etc, memory usage will likely go up (much of the data will have to # be held in memory or disk while working, potentially in a form that takes up more space) RuntimeAttr runtime_default = object { @@ -206,18 +204,25 @@ task ConcatVcfs { bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) } + String outfile_name = outfile_prefix + ".vcf.gz" + String allow_overlaps_flag = if allow_overlaps then "--allow-overlaps" else "" + String naive_flag = if naive then "--naive" else "" + String concat_output_type = if (sites_only) then "v" else "z" + String sites_only_command = if (sites_only) then "| bcftools view --no-version -G -Oz" else "" + String generate_index_command = if (generate_index) then "tabix ~{outfile_name}" else "touch ~{outfile_name}.tbi" + command <<< set -euo pipefail VCFS="~{write_lines(vcfs)}" - if ~{!defined(vcfs_idx)}; then - cat ${VCFS} | xargs -n1 tabix - fi - bcftools concat --no-version ~{allow_overlaps_flag} ~{naive_flag} --output-type z --file-list ${VCFS} --output "~{outfile_name}" - if ~{generate_index}; then - 
tabix "~{outfile_name}" + if ~{sort_vcf_list}; then + cat $VCFS | awk -F '/' '{print $NF"\t"$0}' | sort -k1,1V | awk '{print $2}' > vcfs.list else - touch ~{outfile_name}.tbi + cp $VCFS vcfs.list fi + bcftools concat --no-version ~{allow_overlaps_flag} ~{naive_flag} -O~{concat_output_type} --file-list vcfs.list \ + ~{sites_only_command} \ + > ~{outfile_name} + ~{generate_index_command} >>> output { @@ -444,53 +449,6 @@ task FilterVcf { } } -# Find intersection of Variant IDs from vid_list with those present in vcf, return as filtered_vid_list -task SubsetVariantList { - input { - File vid_list - File vcf - String outfile_name - String sv_base_mini_docker - RuntimeAttr? runtime_attr_override - } - - # when filtering/sorting/etc, memory usage will likely go up (much of the data will have to - # be held in memory or disk while working, potentially in a form that takes up more space) - Float vid_list_size = size(vid_list, "GB") - Float vcf_size = size(vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 2.0, - disk_gb: ceil(10.0 + vcf_size + vid_list_size * 2.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: select_first([runtime_override.mem_gb, runtime_default.mem_gb]) + " GB" - disks: "local-disk " + select_first([runtime_override.disk_gb, runtime_default.disk_gb]) + " HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_base_mini_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -eu -o pipefail - #Get list of variant IDs present in VCF - zcat ~{vcf} | (grep -vE "^#" || printf "") | cut -f3 > valid_vids.list - 
#Restrict input variant ID list to valid VIDs - (fgrep -wf valid_vids.list ~{vid_list} || printf "") > "~{outfile_name}" - >>> - - output { - File filtered_vid_list = outfile_name - } -} - - # evenly split text file into even chunks # if shuffle_file is set to true, shuffle the file before splitting (default = false) task SplitUncompressed { @@ -687,15 +645,10 @@ task UpdateSrList { RuntimeAttr? runtime_attr_override } - # when filtering/sorting/etc, memory usage will likely go up (much of the data will have to - # be held in memory or disk while working, potentially in a form that takes up more space) Float input_size = size([vcf, original_list], "GiB") - Float compression_factor = 5.0 - Float base_disk_gb = 10.0 - Float base_mem_gb = 2.0 RuntimeAttr runtime_default = object { mem_gb: 3.75, - disk_gb: ceil(base_disk_gb + input_size * (2.0 + 2.0 * compression_factor)), + disk_gb: ceil(10.0 + size(original_list, "GiB") * 3 + size(vcf, "GiB")), cpu_cores: 1, preemptible_tries: 3, max_retries: 1, @@ -715,17 +668,19 @@ task UpdateSrList { command <<< set -euxo pipefail - ##append new ids to original list## - svtk vcf2bed ~{vcf} int.bed -i MEMBERS - - ##remove header and match id one per line## - awk '{if (NR>1) print $4 "\t" $NF}' int.bed \ - | awk -F'[,\t]' '{for(i=2; i<=NF; ++i) print $i "\t" $1 }' \ - | sort -k1,1\ - > newidlist.txt - - join -j 1 -t $'\t' <(awk '{print $NF "\t" $0}' ~{original_list} | sort -k1,1) newidlist.txt \ - | cut -f2- \ + # append new ids to original list + svtk vcf2bed ~{vcf} int.bed -i MEMBERS --no-samples --no-header + + # match id one per line + # if an id is not found in the vcf, use previous id (in case vcf is a shard/subset) + # also sort by first column, which is support fraction for a bothside pass list + awk -F'[,\t]' -v OFS='\t' \ + '{ \ + if (ARGIND==1) for(i=6; i<=NF; ++i) MAP[$i]=$4; \ + else if ($NF in MAP) print $0,MAP[$NF]; \ + else print $0,$NF; \ + }' int.bed ~{original_list} \ + | sort -k1,1n \ > ~{outfile} >>> @@ -735,7 
+690,7 @@ task UpdateSrList { } -task ShardVids { +task ShardVidsForClustering { input { File clustered_vcf String prefix @@ -785,7 +740,7 @@ task ShardVids { print("empty vcf - no shards will be produced") sys.exit(0) vcf.reset() - + current_cluster = None current_cluster_vids = [] current_shard = 0 @@ -832,4 +787,232 @@ task ShardVids { output { Array[File] out = glob("~{prefix}.vids.shard_*.list") } +} + +task MakeSitesOnlyVcf { + input { + File vcf + File vcf_index + String prefix + String sv_base_mini_docker + RuntimeAttr? runtime_attr_override + } + + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GiB") * 1.2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_base_mini_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euxo pipefail + bcftools view --no-version -G ~{vcf} -Oz -o ~{prefix}.vcf.gz + tabix ~{prefix}.vcf.gz + >>> + + output { + File out = "~{prefix}.vcf.gz" + File out_index = "~{prefix}.vcf.gz.tbi" + } +} + + +task ReheaderVcf { + input { + File vcf + File vcf_index + File header + String prefix + String sv_base_mini_docker + RuntimeAttr? 
runtime_attr_override + } + + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GiB") * 2.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_base_mini_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euxo pipefail + bcftools reheader -h ~{header} ~{vcf} > ~{prefix}.vcf.gz + tabix ~{prefix}.vcf.gz + >>> + + output { + File out = "~{prefix}.vcf.gz" + File out_index = "~{prefix}.vcf.gz.tbi" + } +} + +task PullVcfShard { + input { + File vcf + File vids + String prefix + String sv_base_mini_docker + RuntimeAttr? 
runtime_attr_override + } + + String output_prefix = "~{prefix}" + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GiB") * 2.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_base_mini_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + bcftools view --no-version --include ID=@~{vids} ~{vcf} -O z -o ~{output_prefix}.vcf.gz + tabix ~{output_prefix}.vcf.gz + wc -l < ~{vids} > count.txt + >>> + + output { + File out = "~{output_prefix}.vcf.gz" + File out_index = "~{output_prefix}.vcf.gz.tbi" + Int count = read_int("count.txt") + } +} + +task RenameVariantIds { + input { + File vcf + File? vcf_index + String vid_prefix + String file_prefix + Boolean? use_ssd + String sv_base_mini_docker + RuntimeAttr? 
runtime_attr_override + } + + String disk_type = if (defined(use_ssd) && select_first([use_ssd])) then "SSD" else "HDD" + Float input_size = size(vcf, "GiB") + RuntimeAttr runtime_default = object { + mem_gb: 2.0, + disk_gb: ceil(10.0 + input_size * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} ~{disk_type}" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_base_mini_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + zcat ~{vcf} \ + | awk -F'\t' -v OFS='\t' -v i=0 '{if ($0~/^#/) {print; next} $3="prefix_"(i++); print}' \ + | bgzip \ + > ~{file_prefix}.vcf.gz + if ~{defined(vcf_index)}; then + tabix ~{file_prefix}.vcf.gz + else + touch ~{file_prefix}.vcf.gz + fi + >>> + + output { + File out = "~{file_prefix}.vcf.gz" + File out_index = "~{file_prefix}.vcf.gz.tbi" + } +} + +# Note: requires docker with updated bcftools +task ScatterVcf { + input { + File vcf + String prefix + Int records_per_shard + Int? threads = 1 + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + + Float input_size = size(vcf, "GB") + Float base_disk_gb = 10.0 + + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(base_disk_gb + input_size * 5.0), + cpu_cores: 2, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + # in case the file is empty create an empty shard + bcftools view -h ~{vcf} | bgzip -c > ~{prefix}.0.vcf.gz + bcftools +scatter ~{vcf} -o . -O z -p ~{prefix}. 
--threads ~{threads} -n ~{records_per_shard} + + ls ~{prefix}.*.vcf.gz | sort -k1,1V > vcfs.list + i=0 + while read vcf; do + shard_no=`printf %06d $i` + mv ${vcf} ~{prefix}.shard_${shard_no}.vcf.gz + i=$((i+1)) + done < vcfs.list + >>> + output { + Array[File] shards = glob("~{prefix}.shard_*.vcf.gz") + } } \ No newline at end of file diff --git a/wdl/VcfClusterSingleChromsome.wdl b/wdl/VcfClusterSingleChromsome.wdl index 16e380a49..a40ea800b 100644 --- a/wdl/VcfClusterSingleChromsome.wdl +++ b/wdl/VcfClusterSingleChromsome.wdl @@ -10,7 +10,10 @@ import "ClusterSingleChromosome.wdl" as VcfClusterTasks workflow VcfClusterSingleChrom { input { Array[File] vcfs + Int num_samples String prefix + String evidence_type + String cohort_name Int dist Float frac Float sample_overlap @@ -25,7 +28,11 @@ workflow VcfClusterSingleChrom { File background_fail File empty_file + Boolean use_hail + String? gcs_project + String sv_pipeline_docker + String sv_pipeline_hail_docker String sv_base_mini_docker # overrides for local tasks @@ -39,6 +46,8 @@ workflow VcfClusterSingleChrom { RuntimeAttr? runtime_override_subset_background_fail # overrides for VcfClusterTasks + RuntimeAttr? runtime_override_shard_clusters + RuntimeAttr? runtime_override_shard_vids RuntimeAttr? runtime_override_subset_sv_type RuntimeAttr? runtime_override_shard_vcf_precluster RuntimeAttr? runtime_override_pull_vcf_shard @@ -47,6 +56,12 @@ workflow VcfClusterSingleChrom { RuntimeAttr? runtime_override_concat_vcf_cluster RuntimeAttr? runtime_override_concat_svtypes RuntimeAttr? runtime_override_concat_sharded_cluster + RuntimeAttr? runtime_override_make_sites_only + RuntimeAttr? runtime_override_sort_merged_vcf + + RuntimeAttr? runtime_override_preconcat_sharded_cluster + RuntimeAttr? runtime_override_hail_merge_sharded_cluster + RuntimeAttr? 
runtime_override_fix_header_sharded_cluster } scatter (i in range(length(vcfs))) { @@ -102,11 +117,14 @@ workflow VcfClusterSingleChrom { } #Run vcfcluster per chromosome - call VcfClusterTasks.ClusterSingleChrom as ClusterSingleChrom { + call VcfClusterTasks.ClusterSingleChrom { input: vcf=ConcatVcfs.concat_vcf, vcf_index=ConcatVcfs.concat_vcf_idx, + num_samples=num_samples, contig=contig, + cohort_name=cohort_name, + evidence_type=evidence_type, prefix=prefix, dist=dist, frac=frac, @@ -114,30 +132,42 @@ workflow VcfClusterSingleChrom { exclude_list=exclude_list, sv_size=sv_size, sv_types=sv_types, + empty_file=empty_file, + use_hail=use_hail, + gcs_project=gcs_project, sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, sv_base_mini_docker=sv_base_mini_docker, runtime_override_subset_sv_type=runtime_override_subset_sv_type, - runtime_override_shard_vcf_precluster=runtime_override_shard_vcf_precluster, + runtime_override_shard_clusters=runtime_override_shard_clusters, + runtime_override_shard_vids=runtime_override_shard_vids, runtime_override_pull_vcf_shard=runtime_override_pull_vcf_shard, runtime_override_svtk_vcf_cluster=runtime_override_svtk_vcf_cluster, runtime_override_get_vcf_header_with_members_info_line=runtime_override_get_vcf_header_with_members_info_line, runtime_override_concat_svtypes=runtime_override_concat_svtypes, - runtime_override_concat_sharded_cluster=runtime_override_concat_sharded_cluster + runtime_override_concat_sharded_cluster=runtime_override_concat_sharded_cluster, + runtime_override_make_sites_only=runtime_override_make_sites_only, + runtime_override_sort_merged_vcf=runtime_override_sort_merged_vcf, + runtime_override_preconcat_sharded_cluster=runtime_override_preconcat_sharded_cluster, + runtime_override_hail_merge_sharded_cluster=runtime_override_hail_merge_sharded_cluster, + runtime_override_fix_header_sharded_cluster=runtime_override_fix_header_sharded_cluster } if(subset_sr_lists) { #Subset 
bothside_pass & background_fail to chromosome of interest - call MiniTasks.SubsetVariantList as SubsetBothsidePass { + call SubsetVariantList as SubsetBothsidePass { input: vid_list=bothside_pass, + vid_col=2, vcf=ConcatVcfs.concat_vcf, outfile_name="~{prefix}.pass.VIDs.list", sv_base_mini_docker=sv_base_mini_docker, runtime_attr_override=runtime_override_subset_bothside_pass } - call MiniTasks.SubsetVariantList as SubsetBackgroundFail { + call SubsetVariantList as SubsetBackgroundFail { input: vid_list=background_fail, + vid_col=1, vcf=ConcatVcfs.concat_vcf, outfile_name="~{prefix}.fail.VIDs.list", sv_base_mini_docker=sv_base_mini_docker, @@ -146,8 +176,8 @@ workflow VcfClusterSingleChrom { } output { - File clustered_vcf = ClusterSingleChrom.clustered_vcf - File clustered_vcf_idx = ClusterSingleChrom.clustered_vcf_idx + Array[File] clustered_vcfs = ClusterSingleChrom.clustered_vcfs + Array[File] clustered_vcf_indexes = ClusterSingleChrom.clustered_vcf_indexes File filtered_bothside_pass = select_first([SubsetBothsidePass.filtered_vid_list, empty_file]) File filtered_background_fail = select_first([SubsetBackgroundFail.filtered_vid_list, empty_file]) } @@ -393,3 +423,47 @@ task FixEvidenceTags { File out_index = "~{prefix}.~{contig}.unclustered.vcf.gz.tbi" } } + +# Find intersection of Variant IDs from vid_list with those present in vcf, return as filtered_vid_list +task SubsetVariantList { + input { + File vid_list + Int vid_col + File vcf + String outfile_name + String sv_base_mini_docker + RuntimeAttr? 
runtime_attr_override + } + + # when filtering/sorting/etc, memory usage will likely go up (much of the data will have to + # be held in memory or disk while working, potentially in a form that takes up more space) + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vid_list, "GB") * 2.0 + size(vcf, "GB")), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: select_first([runtime_override.mem_gb, runtime_default.mem_gb]) + " GB" + disks: "local-disk " + select_first([runtime_override.disk_gb, runtime_default.disk_gb]) + " HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_base_mini_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + zgrep -v "^#" ~{vcf} | cut -f3 > valid_vids.list + awk -F'\t' -v OFS='\t' 'ARGIND==1{inFileA[$1]; next} {if ($~{vid_col} in inFileA) print }' valid_vids.list ~{vid_list} \ + > ~{outfile_name} + >>> + + output { + File filtered_vid_list = outfile_name + } +} \ No newline at end of file