diff --git a/dockerfiles/sv-pipeline-base/Dockerfile b/dockerfiles/sv-pipeline-base/Dockerfile index 92daae96d..cd4ecaad6 100644 --- a/dockerfiles/sv-pipeline-base/Dockerfile +++ b/dockerfiles/sv-pipeline-base/Dockerfile @@ -49,7 +49,7 @@ ARG CONDA_DEP_TRANSIENT="make git wget" ARG CONDA_DEP="software-properties-common zlib1g-dev libbz2-dev liblzma-dev libcurl4-openssl-dev libssl-dev libblas-dev liblapack-dev libatlas-base-dev g++ gfortran ${CONDA_DEP_TRANSIENT}" # versions of bedtools > 2.27.0 seem to have lost the ability to read gzipped files # pandas 1.0.0 causes problem with bedtools in aggregate.py -ARG PYTHON_PKGS="setuptools=52.0.0 wheel=0.34.2 bzip2=1.0.8 cython=0.29.14 numpy=1.18.1 pandas=0.25.3 scikit-learn=0.22.1 scipy=1.4.1 intervaltree=3.0.2 matplotlib=3.1.3 natsort=7.0.1 bedtools=2.27.0 pybedtools=0.8.1 pysam=0.14.1=py36_htslib1.7_0" +ARG PYTHON_PKGS="pip=21.2.2 setuptools=52.0.0 wheel=0.34.2 bzip2=1.0.8 cython=0.29.14 numpy=1.18.1 pandas=0.25.3 scikit-learn=0.22.1 scipy=1.4.1 intervaltree=3.0.2 matplotlib=3.1.3 natsort=7.0.1 bedtools=2.27.0 pybedtools=0.8.1 pysam=0.14.1=py36_htslib1.7_0" ENV LANG=C.UTF-8 ENV LC_ALL=C.UTF-8 ARG CONDA_INSTALL_DIR="/opt/conda" diff --git a/dockerfiles/sv-pipeline-hail/Dockerfile b/dockerfiles/sv-pipeline-hail/Dockerfile new file mode 100644 index 000000000..5f19f7b4e --- /dev/null +++ b/dockerfiles/sv-pipeline-hail/Dockerfile @@ -0,0 +1,9 @@ +# GATK SV Pipeline Hail dockerfile + +# IMPORTANT: these arguments must be specified at the beginning to take advantage of multi-stage build AND runtime specification of base images +ARG SV_PIPELINE_IMAGE=gatksv/sv-pipeline:latest +FROM ${SV_PIPELINE_IMAGE} + +# Dependencies for creating a Hail cluster on GCP Dataproc +RUN pip3 --no-cache-dir install hail==0.2.71 && \ +    pip3 --no-cache-dir install google-cloud-dataproc diff --git a/dockerfiles/sv-pipeline/Dockerfile b/dockerfiles/sv-pipeline/Dockerfile index 341d80af5..5e64eb469 100644 --- a/dockerfiles/sv-pipeline/Dockerfile +++
b/dockerfiles/sv-pipeline/Dockerfile @@ -48,3 +48,51 @@ RUN apt-get -qqy update --fix-missing && \ /usr/share/man/?? \ /usr/share/man/??_* ENV PATH="/opt/:${PATH}" + +# Compile StitchFragmentedCNVs Java program +ENV STITCH_JAR="/opt/sv-pipeline/java/build/StitchFragmentedCNVs.jar" +RUN cd /opt/sv-pipeline/java && \ + mkdir -p build/classes && \ + javac -d build/classes org/broadinstitute/svpipeline/StitchFragmentedCNVs.java org/broadinstitute/svpipeline/VCFParser.java && \ + jar cfe build/StitchFragmentedCNVs.jar "org.broadinstitute.svpipeline.StitchFragmentedCNVs" -C build/classes . && \ + rm -r build/classes + +# Compile StitchFragmentedCNVs unit tests +ENV STITCH_UNIT_TEST_JAR="/opt/sv-pipeline/java/build/StitchFragmentedCNVsUnitTest.jar" +RUN cd /opt/sv-pipeline/java && \ + mkdir -p build/classes && \ + javac -d build/classes org/broadinstitute/svpipeline/StitchFragmentedCNVsUnitTest.java org/broadinstitute/svpipeline/StitchFragmentedCNVs.java org/broadinstitute/svpipeline/VCFParser.java && \ + jar cfe build/StitchFragmentedCNVsUnitTest.jar "org.broadinstitute.svpipeline.StitchFragmentedCNVsUnitTest" -C build/classes . && \ + echo "Running StitchFragmentedCNVsUnitTest..." && \ + java -enableassertions -jar $STITCH_UNIT_TEST_JAR && \ + rm -r build/classes $STITCH_UNIT_TEST_JAR + +# Compile VCFParser unit tests +ENV VCF_PARSER_UNIT_TEST_JAR="/opt/sv-pipeline/java/build/VCFParserUnitTest.jar" +RUN cd /opt/sv-pipeline/java && \ + mkdir -p build/classes && \ + javac -d build/classes org/broadinstitute/svpipeline/VCFParserUnitTest.java org/broadinstitute/svpipeline/VCFParser.java && \ + jar cfe build/VCFParserUnitTest.jar "org.broadinstitute.svpipeline.VCFParserUnitTest" -C build/classes . && \ + echo "Running VCFParserUnitTest..." 
&& \ + java -enableassertions -jar $VCF_PARSER_UNIT_TEST_JAR && \ + rm -r build/classes $VCF_PARSER_UNIT_TEST_JAR + +# Compile CleanVCFPart1 Java program +ENV CLEAN_VCF_PART_1_JAR="/opt/sv-pipeline/java/build/CleanVCFPart1.jar" +RUN cd /opt/sv-pipeline/java && \ + mkdir -p build/classes && \ + javac -d build/classes org/broadinstitute/svpipeline/CleanVCFPart1.java org/broadinstitute/svpipeline/VCFParser.java && \ + jar cfe build/CleanVCFPart1.jar "org.broadinstitute.svpipeline.CleanVCFPart1" -C build/classes . && \ + rm -r build/classes + +# Compile and run CleanVCFPart1 unit tests +ENV CLEAN_VCF_PART_1_UNIT_TEST_JAR="/opt/sv-pipeline/java/build/CleanVCFPart1UnitTest.jar" +RUN cd /opt/sv-pipeline/java && \ + mkdir -p build/classes && \ + javac -d build/classes org/broadinstitute/svpipeline/CleanVCFPart1UnitTest.java org/broadinstitute/svpipeline/CleanVCFPart1.java org/broadinstitute/svpipeline/VCFParser.java && \ + jar cfe build/CleanVCFPart1UnitTest.jar "org.broadinstitute.svpipeline.CleanVCFPart1UnitTest" -C build/classes . && \ + echo "Running CleanVCFPart1UnitTest..." 
&& \ + java -enableassertions -jar $CLEAN_VCF_PART_1_UNIT_TEST_JAR && \ + rm -r build/classes $CLEAN_VCF_PART_1_UNIT_TEST_JAR && \ + rm -rf /tmp/* /var/tmp/* + diff --git a/input_templates/GATKSVPipelineBatch.ref_panel_1kg.json.tmpl b/input_templates/GATKSVPipelineBatch.ref_panel_1kg.json.tmpl index ff2a0a725..0fc97f0a9 100644 --- a/input_templates/GATKSVPipelineBatch.ref_panel_1kg.json.tmpl +++ b/input_templates/GATKSVPipelineBatch.ref_panel_1kg.json.tmpl @@ -28,6 +28,8 @@ "GATKSVPipelineBatch.sv_pipeline_base_docker": {{ dockers.sv_pipeline_base_docker | tojson }}, "GATKSVPipelineBatch.sv_base_mini_docker": {{ dockers.sv_base_mini_docker | tojson }}, "GATKSVPipelineBatch.sv_pipeline_docker": {{ dockers.sv_pipeline_docker | tojson }}, + "GATKSVPipelineBatch.sv_pipeline_hail_docker": {{ dockers.sv_pipeline_hail_docker | tojson }}, + "GATKSVPipelineBatch.sv_pipeline_updates_docker": {{ dockers.sv_pipeline_updates_docker | tojson }}, "GATKSVPipelineBatch.sv_pipeline_qc_docker": {{ dockers.sv_pipeline_qc_docker | tojson }}, "GATKSVPipelineBatch.sv_pipeline_rdtest_docker": {{ dockers.sv_pipeline_rdtest_docker | tojson }}, "GATKSVPipelineBatch.samtools_cloud_docker": {{ dockers.samtools_cloud_docker | tojson }}, @@ -117,6 +119,9 @@ "GATKSVPipelineBatch.RegenotypeCNVs.n_RdTest_bins": "100000", "GATKSVPipelineBatch.RegenotypeCNVs.n_per_split": "5000", + "GATKSVPipelineBatch.MakeCohortVcf.chr_x": {{ reference_resources.chr_x | tojson }}, + "GATKSVPipelineBatch.MakeCohortVcf.chr_y": {{ reference_resources.chr_y | tojson }}, + "GATKSVPipelineBatch.MakeCohortVcf.bin_exclude": {{ reference_resources.bin_exclude | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.empty_file" : {{ reference_resources.empty_file | tojson }}, "GATKSVPipelineBatch.MakeCohortVcf.cytobands": {{ reference_resources.cytobands | tojson }}, @@ -126,7 +131,9 @@ "GATKSVPipelineBatch.MakeCohortVcf.min_sr_background_fail_batches": 0.5, "GATKSVPipelineBatch.MakeCohortVcf.max_shards_per_chrom_clean_vcf_step1": 
200, "GATKSVPipelineBatch.MakeCohortVcf.min_records_per_shard_clean_vcf_step1": 5000, + "GATKSVPipelineBatch.MakeCohortVcf.clean_vcf1b_records_per_shard": 10000, "GATKSVPipelineBatch.MakeCohortVcf.samples_per_clean_vcf_step2_shard": 100, + "GATKSVPipelineBatch.MakeCohortVcf.clean_vcf5_records_per_shard": 5000, "GATKSVPipelineBatch.MakeCohortVcf.random_seed": 0, "GATKSVPipelineBatch.MakeCohortVcf.max_shard_size_resolve": 500, diff --git a/input_templates/GATKSVPipelineSingleSample.ref_panel_1kg.na12878.json.tmpl b/input_templates/GATKSVPipelineSingleSample.ref_panel_1kg.na12878.json.tmpl index 09fec86b2..efd5c5fc1 100644 --- a/input_templates/GATKSVPipelineSingleSample.ref_panel_1kg.na12878.json.tmpl +++ b/input_templates/GATKSVPipelineSingleSample.ref_panel_1kg.na12878.json.tmpl @@ -34,6 +34,8 @@ "GATKSVPipelineSingleSample.sv_pipeline_base_docker": {{ dockers.sv_pipeline_base_docker | tojson }}, "GATKSVPipelineSingleSample.sv_base_mini_docker": {{ dockers.sv_base_mini_docker | tojson }}, "GATKSVPipelineSingleSample.sv_pipeline_docker": {{ dockers.sv_pipeline_docker | tojson }}, + "GATKSVPipelineSingleSample.sv_pipeline_hail_docker": {{ dockers.sv_pipeline_hail_docker | tojson }}, + "GATKSVPipelineSingleSample.sv_pipeline_updates_docker": {{ dockers.sv_pipeline_updates_docker | tojson }}, "GATKSVPipelineSingleSample.sv_pipeline_qc_docker": {{ dockers.sv_pipeline_qc_docker | tojson }}, "GATKSVPipelineSingleSample.sv_pipeline_rdtest_docker": {{ dockers.sv_pipeline_rdtest_docker | tojson }}, "GATKSVPipelineSingleSample.wham_docker": {{ dockers.wham_docker | tojson }}, @@ -84,7 +86,9 @@ "GATKSVPipelineSingleSample.max_shard_size_resolve" : 500, "GATKSVPipelineSingleSample.clean_vcf_max_shards_per_chrom_clean_vcf_step1": 200, "GATKSVPipelineSingleSample.clean_vcf_min_records_per_shard_clean_vcf_step1": 5000, + "GATKSVPipelineSingleSample.clean_vcf1b_records_per_shard": 10000, "GATKSVPipelineSingleSample.clean_vcf_samples_per_clean_vcf_step2_shard": 100, + 
"GATKSVPipelineSingleSample.clean_vcf5_records_per_shard": 5000, "GATKSVPipelineSingleSample.clean_vcf_random_seed": 0, "GATKSVPipelineSingleSample.run_vcf_qc" : false, diff --git a/input_templates/GATKSVPipelineSingleSample.ref_panel_1kg.na12878.no_melt.json.tmpl b/input_templates/GATKSVPipelineSingleSample.ref_panel_1kg.na12878.no_melt.json.tmpl index 54fd6767d..5b341f079 100644 --- a/input_templates/GATKSVPipelineSingleSample.ref_panel_1kg.na12878.no_melt.json.tmpl +++ b/input_templates/GATKSVPipelineSingleSample.ref_panel_1kg.na12878.no_melt.json.tmpl @@ -36,6 +36,8 @@ "GATKSVPipelineSingleSample.sv_pipeline_base_docker": {{ dockers.sv_pipeline_base_docker | tojson }}, "GATKSVPipelineSingleSample.sv_base_mini_docker": {{ dockers.sv_base_mini_docker | tojson }}, "GATKSVPipelineSingleSample.sv_pipeline_docker": {{ dockers.sv_pipeline_docker | tojson }}, + "GATKSVPipelineSingleSample.sv_pipeline_hail_docker": {{ dockers.sv_pipeline_hail_docker | tojson }}, + "GATKSVPipelineSingleSample.sv_pipeline_updates_docker": {{ dockers.sv_pipeline_updates_docker | tojson }}, "GATKSVPipelineSingleSample.sv_pipeline_qc_docker": {{ dockers.sv_pipeline_qc_docker | tojson }}, "GATKSVPipelineSingleSample.sv_pipeline_rdtest_docker": {{ dockers.sv_pipeline_rdtest_docker | tojson }}, "GATKSVPipelineSingleSample.wham_docker": {{ dockers.wham_docker | tojson }}, @@ -85,8 +87,10 @@ "GATKSVPipelineSingleSample.max_shard_size_resolve" : 500, "GATKSVPipelineSingleSample.clean_vcf_max_shards_per_chrom_clean_vcf_step1": 200, "GATKSVPipelineSingleSample.clean_vcf_min_records_per_shard_clean_vcf_step1": 5000, + "GATKSVPipelineSingleSample.clean_vcf1b_records_per_shard": 10000, "GATKSVPipelineSingleSample.clean_vcf_samples_per_clean_vcf_step2_shard": 100, "GATKSVPipelineSingleSample.clean_vcf_random_seed": 0, + "GATKSVPipelineSingleSample.clean_vcf5_records_per_shard": 5000, "GATKSVPipelineSingleSample.run_vcf_qc" : false, "GATKSVPipelineSingleSample.protein_coding_gtf" : {{ 
reference_resources.protein_coding_gtf | tojson }}, diff --git a/input_templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.SingleBatch.json.tmpl b/input_templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.SingleBatch.json.tmpl index 73be24740..a02554989 100644 --- a/input_templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.SingleBatch.json.tmpl +++ b/input_templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.SingleBatch.json.tmpl @@ -14,12 +14,16 @@ "MakeCohortVcf.min_sr_background_fail_batches": 0.5, "MakeCohortVcf.max_shards_per_chrom_clean_vcf_step1": 200, "MakeCohortVcf.min_records_per_shard_clean_vcf_step1": 5000, + "MakeCohortVcf.clean_vcf1b_records_per_shard": 10000, + "MakeCohortVcf.clean_vcf5_records_per_shard": 5000, "MakeCohortVcf.samples_per_clean_vcf_step2_shard": 100, "MakeCohortVcf.random_seed": 0, "MakeCohortVcf.max_shard_size_resolve": 500, "MakeCohortVcf.linux_docker": "${workspace.linux_docker}", "MakeCohortVcf.sv_pipeline_docker": "${workspace.sv_pipeline_docker}", + "MakeCohortVcf.sv_pipeline_hail_docker": "${workspace.sv_pipeline_hail_docker}", + "MakeCohortVcf.sv_pipeline_updates_docker": "${workspace.sv_pipeline_updates_docker}", "MakeCohortVcf.sv_base_mini_docker": "${workspace.sv_base_mini_docker}", "MakeCohortVcf.sv_pipeline_rdtest_docker": "${workspace.sv_pipeline_rdtest_docker}", "MakeCohortVcf.sv_pipeline_qc_docker": "${workspace.sv_pipeline_qc_docker}", @@ -27,6 +31,9 @@ "MakeCohortVcf.primary_contigs_list": "${workspace.primary_contigs_list}", "MakeCohortVcf.sv_pipeline_base_docker": "${workspace.sv_pipeline_base_docker}", + "MakeCohortVcf.chr_x": "${workspace.chr_x}", + "MakeCohortVcf.chr_y": "${workspace.chr_y}", + "MakeCohortVcf.cohort_name": "${this.sample_set_id}", "MakeCohortVcf.batches": "${this.sample_set_id}", "MakeCohortVcf.ped_file": "${workspace.cohort_ped_file}", diff --git 
a/input_templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.json.tmpl b/input_templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.json.tmpl index 450fa879b..c19ab5a4e 100644 --- a/input_templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.json.tmpl +++ b/input_templates/terra_workspaces/cohort_mode/workflow_configurations/MakeCohortVcf.json.tmpl @@ -14,12 +14,16 @@ "MakeCohortVcf.min_sr_background_fail_batches": 0.5, "MakeCohortVcf.max_shards_per_chrom_clean_vcf_step1": 200, "MakeCohortVcf.min_records_per_shard_clean_vcf_step1": 5000, + "MakeCohortVcf.clean_vcf1b_records_per_shard": 10000, + "MakeCohortVcf.clean_vcf5_records_per_shard": 5000, "MakeCohortVcf.samples_per_clean_vcf_step2_shard": 100, "MakeCohortVcf.random_seed": 0, "MakeCohortVcf.max_shard_size_resolve": 500, "MakeCohortVcf.linux_docker": "${workspace.linux_docker}", "MakeCohortVcf.sv_pipeline_docker": "${workspace.sv_pipeline_docker}", + "MakeCohortVcf.sv_pipeline_hail_docker": "${workspace.sv_pipeline_hail_docker}", + "MakeCohortVcf.sv_pipeline_updates_docker": "${workspace.sv_pipeline_updates_docker}", "MakeCohortVcf.sv_base_mini_docker": "${workspace.sv_base_mini_docker}", "MakeCohortVcf.sv_pipeline_rdtest_docker": "${workspace.sv_pipeline_rdtest_docker}", "MakeCohortVcf.sv_pipeline_qc_docker": "${workspace.sv_pipeline_qc_docker}", @@ -27,6 +31,9 @@ "MakeCohortVcf.primary_contigs_list": "${workspace.primary_contigs_list}", "MakeCohortVcf.sv_pipeline_base_docker": "${workspace.sv_pipeline_base_docker}", + "MakeCohortVcf.chr_x": "${workspace.chr_x}", + "MakeCohortVcf.chr_y": "${workspace.chr_y}", + "MakeCohortVcf.cohort_name": "${this.sample_set_set_id}", "MakeCohortVcf.batches": "${this.sample_sets.sample_set_id}", "MakeCohortVcf.ped_file": "${workspace.cohort_ped_file}", diff --git a/input_templates/terra_workspaces/cohort_mode/workspace.tsv.tmpl b/input_templates/terra_workspaces/cohort_mode/workspace.tsv.tmpl index 
456361948..a8e6fe515 100644 --- a/input_templates/terra_workspaces/cohort_mode/workspace.tsv.tmpl +++ b/input_templates/terra_workspaces/cohort_mode/workspace.tsv.tmpl @@ -1,2 +1,2 @@ -workspace:cloud_sdk_docker cnmops_docker condense_counts_docker gatk_docker gatk_docker_pesr_override gcnv_gatk_docker genomes_in_the_cloud_docker linux_docker manta_docker samtools_cloud_docker sv_base_docker sv_base_mini_docker sv_pipeline_base_docker sv_pipeline_docker sv_pipeline_qc_docker sv_pipeline_rdtest_docker wham_docker allosome_file autosome_file bin_exclude cnmops_exclude_list cohort_ped_file contig_ploidy_priors copy_number_autosomal_contigs cytobands dbsnp_vcf delly_exclude_intervals_file depth_exclude_list empty_file exclude_intervals_for_gcnv_filter_intervals external_af_ref_bed external_af_ref_bed_prefix genome_file inclusion_bed linc_rna_gtf manta_region_bed mei_bed melt_standard_vcf_header noncoding_bed pesr_exclude_list preprocessed_intervals primary_contigs_fai primary_contigs_list promoter_bed protein_coding_gtf reference_build reference_dict reference_fasta reference_index reference_version rmsk segdups seed_cutoffs unpadded_intervals_file wgd_scoring_mask wham_include_list_bed_file -{{ dockers.cloud_sdk_docker }} {{ dockers.cnmops_docker }} {{ dockers.condense_counts_docker }} {{ dockers.gatk_docker }} {{ dockers.gatk_docker_pesr_override }} {{ dockers.gatk_docker }} {{ dockers.genomes_in_the_cloud_docker }} {{ dockers.linux_docker }} {{ dockers.manta_docker }} {{ dockers.samtools_cloud_docker }} {{ dockers.sv_base_docker }} {{ dockers.sv_base_mini_docker }} {{ dockers.sv_pipeline_base_docker }} {{ dockers.sv_pipeline_docker }} {{ dockers.sv_pipeline_qc_docker }} {{ dockers.sv_pipeline_rdtest_docker }} {{ dockers.wham_docker }} {{ reference_resources.allosome_file }} {{ reference_resources.autosome_file }} {{ reference_resources.bin_exclude }} {{ reference_resources.cnmops_exclude_list }} gs://broad-dsde-methods-eph/ped_1kgp_all.ped {{ 
reference_resources.contig_ploidy_priors }} {{ reference_resources.copy_number_autosomal_contigs }} {{ reference_resources.cytobands }} {{ reference_resources.dbsnp_vcf }} {{ reference_resources.delly_exclude_intervals_file }} {{ reference_resources.depth_exclude_list }} {{ reference_resources.empty_file }} {{ reference_resources.exclude_intervals_for_gcnv_filter_intervals }} {{ reference_resources.external_af_ref_bed | tojson }} {{ reference_resources.external_af_ref_bed_prefix | tojson }} {{ reference_resources.genome_file }} {{ reference_resources.inclusion_bed }} {{ reference_resources.linc_rna_gtf | tojson }} {{ reference_resources.manta_region_bed }} {{ reference_resources.mei_bed }} {{ reference_resources.melt_std_vcf_header }} {{ reference_resources.noncoding_bed | tojson }} {{ reference_resources.pesr_exclude_list }} {{ reference_resources.preprocessed_intervals }} {{ reference_resources.primary_contigs_fai }} {{ reference_resources.primary_contigs_list }} {{ reference_resources.promoter_bed | tojson }} {{ reference_resources.protein_coding_gtf | tojson }} {{ reference_resources.reference_build }} {{ reference_resources.reference_dict }} {{ reference_resources.reference_fasta }} {{ reference_resources.reference_index }} {{ reference_resources.reference_version }} {{ reference_resources.rmsk }} {{ reference_resources.segdups }} {{ reference_resources.seed_cutoffs }} {{ reference_resources.unpadded_intervals_file }} {{ reference_resources.wgd_scoring_mask }} {{ reference_resources.wham_include_list_bed_file }} +workspace:cloud_sdk_docker cnmops_docker condense_counts_docker gatk_docker gatk_docker_pesr_override gcnv_gatk_docker genomes_in_the_cloud_docker linux_docker manta_docker samtools_cloud_docker sv_base_docker sv_base_mini_docker sv_pipeline_base_docker sv_pipeline_docker sv_pipeline_hail_docker sv_pipeline_updates_docker sv_pipeline_qc_docker sv_pipeline_rdtest_docker wham_docker allosome_file autosome_file bin_exclude cnmops_exclude_list 
cohort_ped_file contig_ploidy_priors copy_number_autosomal_contigs cytobands dbsnp_vcf delly_exclude_intervals_file depth_exclude_list empty_file exclude_intervals_for_gcnv_filter_intervals external_af_ref_bed external_af_ref_bed_prefix genome_file inclusion_bed linc_rna_gtf manta_region_bed mei_bed melt_standard_vcf_header noncoding_bed pesr_exclude_list preprocessed_intervals primary_contigs_fai primary_contigs_list promoter_bed protein_coding_gtf reference_build reference_dict reference_fasta reference_index reference_version rmsk segdups seed_cutoffs unpadded_intervals_file wgd_scoring_mask wham_include_list_bed_file chr_x chr_y +{{ dockers.cloud_sdk_docker }} {{ dockers.cnmops_docker }} {{ dockers.condense_counts_docker }} {{ dockers.gatk_docker }} {{ dockers.gatk_docker_pesr_override }} {{ dockers.gatk_docker }} {{ dockers.genomes_in_the_cloud_docker }} {{ dockers.linux_docker }} {{ dockers.manta_docker }} {{ dockers.samtools_cloud_docker }} {{ dockers.sv_base_docker }} {{ dockers.sv_base_mini_docker }} {{ dockers.sv_pipeline_base_docker }} {{ dockers.sv_pipeline_docker }} {{ dockers.sv_pipeline_hail_docker }} {{ dockers.sv_pipeline_updates_docker }} {{ dockers.sv_pipeline_qc_docker }} {{ dockers.sv_pipeline_rdtest_docker }} {{ dockers.wham_docker }} {{ reference_resources.allosome_file }} {{ reference_resources.autosome_file }} {{ reference_resources.bin_exclude }} {{ reference_resources.cnmops_exclude_list }} gs://broad-dsde-methods-eph/ped_1kgp_all.ped {{ reference_resources.contig_ploidy_priors }} {{ reference_resources.copy_number_autosomal_contigs }} {{ reference_resources.cytobands }} {{ reference_resources.dbsnp_vcf }} {{ reference_resources.delly_exclude_intervals_file }} {{ reference_resources.depth_exclude_list }} {{ reference_resources.empty_file }} {{ reference_resources.exclude_intervals_for_gcnv_filter_intervals }} {{ reference_resources.external_af_ref_bed | tojson }} {{ reference_resources.external_af_ref_bed_prefix | tojson }} {{ 
reference_resources.genome_file }} {{ reference_resources.inclusion_bed }} {{ reference_resources.linc_rna_gtf | tojson }} {{ reference_resources.manta_region_bed }} {{ reference_resources.mei_bed }} {{ reference_resources.melt_std_vcf_header }} {{ reference_resources.noncoding_bed | tojson }} {{ reference_resources.pesr_exclude_list }} {{ reference_resources.preprocessed_intervals }} {{ reference_resources.primary_contigs_fai }} {{ reference_resources.primary_contigs_list }} {{ reference_resources.promoter_bed | tojson }} {{ reference_resources.protein_coding_gtf | tojson }} {{ reference_resources.reference_build }} {{ reference_resources.reference_dict }} {{ reference_resources.reference_fasta }} {{ reference_resources.reference_index }} {{ reference_resources.reference_version }} {{ reference_resources.rmsk }} {{ reference_resources.segdups }} {{ reference_resources.seed_cutoffs }} {{ reference_resources.unpadded_intervals_file }} {{ reference_resources.wgd_scoring_mask }} {{ reference_resources.wham_include_list_bed_file }} {{ reference_resources.chr_x }} {{ reference_resources.chr_y }} diff --git a/input_templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.ref_panel_1kg.na12878.no_melt.json.tmpl b/input_templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.ref_panel_1kg.na12878.no_melt.json.tmpl index ce930bc57..5853242cc 100644 --- a/input_templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.ref_panel_1kg.na12878.no_melt.json.tmpl +++ b/input_templates/terra_workspaces/single_sample/GATKSVPipelineSingleSample.ref_panel_1kg.na12878.no_melt.json.tmpl @@ -36,6 +36,8 @@ "GATKSVPipelineSingleSample.sv_pipeline_base_docker": "${workspace.sv_pipeline_base_docker}", "GATKSVPipelineSingleSample.sv_base_mini_docker": "${workspace.sv_base_mini_docker}", "GATKSVPipelineSingleSample.sv_pipeline_docker": "${workspace.sv_pipeline_docker}", + "GATKSVPipelineSingleSample.sv_pipeline_hail_docker": 
"${workspace.sv_pipeline_hail_docker}", + "GATKSVPipelineSingleSample.sv_pipeline_updates_docker": "${workspace.sv_pipeline_updates_docker}", "GATKSVPipelineSingleSample.sv_pipeline_qc_docker": "${workspace.sv_pipeline_qc_docker}", "GATKSVPipelineSingleSample.sv_pipeline_rdtest_docker": "${workspace.sv_pipeline_rdtest_docker}", "GATKSVPipelineSingleSample.wham_docker": "${workspace.wham_docker}", @@ -84,7 +86,9 @@ "GATKSVPipelineSingleSample.max_shard_size_resolve" : 500, "GATKSVPipelineSingleSample.clean_vcf_max_shards_per_chrom_clean_vcf_step1": 200, "GATKSVPipelineSingleSample.clean_vcf_min_records_per_shard_clean_vcf_step1": 5000, + "GATKSVPipelineSingleSample.clean_vcf1b_records_per_shard": 10000, "GATKSVPipelineSingleSample.clean_vcf_samples_per_clean_vcf_step2_shard": 100, + "GATKSVPipelineSingleSample.clean_vcf5_records_per_shard": 5000, "GATKSVPipelineSingleSample.clean_vcf_random_seed": 0, "GATKSVPipelineSingleSample.run_vcf_qc" : false, diff --git a/input_templates/terra_workspaces/single_sample/workspace.tsv.tmpl b/input_templates/terra_workspaces/single_sample/workspace.tsv.tmpl index f338efa17..ba6e6ff4f 100644 --- a/input_templates/terra_workspaces/single_sample/workspace.tsv.tmpl +++ b/input_templates/terra_workspaces/single_sample/workspace.tsv.tmpl @@ -1,2 +1,2 @@ -workspace:cnmops_docker condense_counts_docker gatk_docker gatk_docker_pesr_override genomes_in_the_cloud_docker linux_docker manta_docker samtools_cloud_docker sv_base_docker sv_base_mini_docker sv_pipeline_base_docker sv_pipeline_docker sv_pipeline_qc_docker sv_pipeline_rdtest_docker wham_docker ref_panel_name ref_panel_bincov_matrix ref_panel_contig_ploidy_model_tar ref_panel_cutoffs ref_panel_del_bed ref_panel_dup_bed ref_panel_genotype_pesr_pesr_sepcutoff ref_panel_genotype_pesr_depth_sepcutoff ref_panel_genotype_depth_pesr_sepcutoff ref_panel_genotype_depth_depth_sepcutoff ref_panel_ped_file ref_panel_PE_metrics ref_panel_qc_definitions ref_panel_requester_pays_crams 
ref_panel_SR_metrics ref_panel_vcf reference_name reference_allosome_file reference_autosome_file reference_bin_exclude reference_cnmops_exclude_list reference_contig_ploidy_priors reference_copy_number_autosomal_contigs reference_cytobands reference_dbsnp_vcf reference_delly_exclude_intervals_file reference_depth_exclude_list reference_empty_file reference_exclude_intervals_for_gcnv_filter_intervals reference_external_af_ref_bed reference_external_af_ref_bed_prefix reference_genome_file reference_inclusion_bed reference_linc_rna_gtf reference_manta_region_bed reference_mei_bed reference_melt_std_vcf_header reference_noncoding_bed reference_pesr_exclude_list reference_preprocessed_intervals reference_primary_contigs_list reference_primary_contigs_fai reference_promoter_bed reference_protein_coding_gtf reference_dict reference_fasta reference_index reference_version reference_rmsk reference_segdups reference_seed_cutoffs reference_unpadded_intervals_file reference_wgd_scoring_mask reference_wham_include_list_bed_file -{{ dockers.cnmops_docker }} {{ dockers.condense_counts_docker }} {{ dockers.gatk_docker }} {{ dockers.gatk_docker_pesr_override }} {{ dockers.genomes_in_the_cloud_docker }} {{ dockers.linux_docker }} {{ dockers.manta_docker }} {{ dockers.samtools_cloud_docker }} {{ dockers.sv_base_docker }} {{ dockers.sv_base_mini_docker }} {{ dockers.sv_pipeline_base_docker }} {{ dockers.sv_pipeline_docker }} {{ dockers.sv_pipeline_qc_docker }} {{ dockers.sv_pipeline_rdtest_docker }} {{ dockers.wham_docker }} {{ ref_panel.name }} {{ ref_panel.bincov_matrix }} {{ ref_panel.contig_ploidy_model_tar }} {{ ref_panel.cutoffs }} {{ ref_panel.del_bed }} {{ ref_panel.dup_bed }} {{ ref_panel.genotype_pesr_pesr_sepcutoff }} {{ ref_panel.genotype_pesr_depth_sepcutoff }} {{ ref_panel.genotype_depth_pesr_sepcutoff }} {{ ref_panel.genotype_depth_depth_sepcutoff }} {{ ref_panel.ped_file }} {{ ref_panel.PE_metrics }} {{ ref_panel.qc_definitions }} {{ ref_panel.requester_pays_crams }} 
{{ ref_panel.SR_metrics }} {{ ref_panel.vcf }} {{ reference_resources.name }} {{ reference_resources.allosome_file }} {{ reference_resources.autosome_file }} {{ reference_resources.bin_exclude }} {{ reference_resources.cnmops_exclude_list }} {{ reference_resources.contig_ploidy_priors }} {{ reference_resources.copy_number_autosomal_contigs }} {{ reference_resources.cytobands }} {{ reference_resources.dbsnp_vcf }} {{ reference_resources.delly_exclude_intervals_file }} {{ reference_resources.depth_exclude_list }} {{ reference_resources.empty_file }} {{ reference_resources.exclude_intervals_for_gcnv_filter_intervals }} {{ reference_resources.external_af_ref_bed }} {{ reference_resources.external_af_ref_bed_prefix }} {{ reference_resources.genome_file }} {{ reference_resources.inclusion_bed }} {{ reference_resources.linc_rna_gtf }} {{ reference_resources.manta_region_bed }} {{ reference_resources.mei_bed }} {{ reference_resources.melt_std_vcf_header }} {{ reference_resources.noncoding_bed }} {{ reference_resources.pesr_exclude_list }} {{ reference_resources.preprocessed_intervals }} {{ reference_resources.primary_contigs_list }} {{ reference_resources.primary_contigs_fai }} {{ reference_resources.promoter_bed }} {{ reference_resources.protein_coding_gtf }} {{ reference_resources.reference_dict }} {{ reference_resources.reference_fasta }} {{ reference_resources.reference_index }} {{ reference_resources.reference_version }} {{ reference_resources.rmsk }} {{ reference_resources.segdups }} {{ reference_resources.seed_cutoffs }} {{ reference_resources.unpadded_intervals_file }} {{ reference_resources.wgd_scoring_mask }} {{ reference_resources.wham_include_list_bed_file }} +workspace:cnmops_docker condense_counts_docker gatk_docker gatk_docker_pesr_override genomes_in_the_cloud_docker linux_docker manta_docker samtools_cloud_docker sv_base_docker sv_base_mini_docker sv_pipeline_base_docker sv_pipeline_docker sv_pipeline_hail_docker sv_pipeline_updates_docker 
sv_pipeline_qc_docker sv_pipeline_rdtest_docker wham_docker ref_panel_name ref_panel_bincov_matrix ref_panel_contig_ploidy_model_tar ref_panel_cutoffs ref_panel_del_bed ref_panel_dup_bed ref_panel_genotype_pesr_pesr_sepcutoff ref_panel_genotype_pesr_depth_sepcutoff ref_panel_genotype_depth_pesr_sepcutoff ref_panel_genotype_depth_depth_sepcutoff ref_panel_ped_file ref_panel_PE_metrics ref_panel_qc_definitions ref_panel_requester_pays_crams ref_panel_SR_metrics ref_panel_vcf reference_name reference_allosome_file reference_autosome_file reference_bin_exclude reference_cnmops_exclude_list reference_contig_ploidy_priors reference_copy_number_autosomal_contigs reference_cytobands reference_dbsnp_vcf reference_delly_exclude_intervals_file reference_depth_exclude_list reference_empty_file reference_exclude_intervals_for_gcnv_filter_intervals reference_external_af_ref_bed reference_external_af_ref_bed_prefix reference_genome_file reference_inclusion_bed reference_linc_rna_gtf reference_manta_region_bed reference_mei_bed reference_melt_std_vcf_header reference_noncoding_bed reference_pesr_exclude_list reference_preprocessed_intervals reference_primary_contigs_list reference_primary_contigs_fai reference_promoter_bed reference_protein_coding_gtf reference_dict reference_fasta reference_index reference_version reference_rmsk reference_segdups reference_seed_cutoffs reference_unpadded_intervals_file reference_wgd_scoring_mask reference_wham_include_list_bed_file +{{ dockers.cnmops_docker }} {{ dockers.condense_counts_docker }} {{ dockers.gatk_docker }} {{ dockers.gatk_docker_pesr_override }} {{ dockers.genomes_in_the_cloud_docker }} {{ dockers.linux_docker }} {{ dockers.manta_docker }} {{ dockers.samtools_cloud_docker }} {{ dockers.sv_base_docker }} {{ dockers.sv_base_mini_docker }} {{ dockers.sv_pipeline_base_docker }} {{ dockers.sv_pipeline_docker }} {{ dockers.sv_pipeline_hail_docker }} {{ dockers.sv_pipeline_updates_docker }} {{ dockers.sv_pipeline_qc_docker }} {{ 
dockers.sv_pipeline_rdtest_docker }} {{ dockers.wham_docker }} {{ ref_panel.name }} {{ ref_panel.bincov_matrix }} {{ ref_panel.contig_ploidy_model_tar }} {{ ref_panel.cutoffs }} {{ ref_panel.del_bed }} {{ ref_panel.dup_bed }} {{ ref_panel.genotype_pesr_pesr_sepcutoff }} {{ ref_panel.genotype_pesr_depth_sepcutoff }} {{ ref_panel.genotype_depth_pesr_sepcutoff }} {{ ref_panel.genotype_depth_depth_sepcutoff }} {{ ref_panel.ped_file }} {{ ref_panel.PE_metrics }} {{ ref_panel.qc_definitions }} {{ ref_panel.requester_pays_crams }} {{ ref_panel.SR_metrics }} {{ ref_panel.vcf }} {{ reference_resources.name }} {{ reference_resources.allosome_file }} {{ reference_resources.autosome_file }} {{ reference_resources.bin_exclude }} {{ reference_resources.cnmops_exclude_list }} {{ reference_resources.contig_ploidy_priors }} {{ reference_resources.copy_number_autosomal_contigs }} {{ reference_resources.cytobands }} {{ reference_resources.dbsnp_vcf }} {{ reference_resources.delly_exclude_intervals_file }} {{ reference_resources.depth_exclude_list }} {{ reference_resources.empty_file }} {{ reference_resources.exclude_intervals_for_gcnv_filter_intervals }} {{ reference_resources.external_af_ref_bed }} {{ reference_resources.external_af_ref_bed_prefix }} {{ reference_resources.genome_file }} {{ reference_resources.inclusion_bed }} {{ reference_resources.linc_rna_gtf }} {{ reference_resources.manta_region_bed }} {{ reference_resources.mei_bed }} {{ reference_resources.melt_std_vcf_header }} {{ reference_resources.noncoding_bed }} {{ reference_resources.pesr_exclude_list }} {{ reference_resources.preprocessed_intervals }} {{ reference_resources.primary_contigs_list }} {{ reference_resources.primary_contigs_fai }} {{ reference_resources.promoter_bed }} {{ reference_resources.protein_coding_gtf }} {{ reference_resources.reference_dict }} {{ reference_resources.reference_fasta }} {{ reference_resources.reference_index }} {{ reference_resources.reference_version }} {{ reference_resources.rmsk 
}} {{ reference_resources.segdups }} {{ reference_resources.seed_cutoffs }} {{ reference_resources.unpadded_intervals_file }} {{ reference_resources.wgd_scoring_mask }} {{ reference_resources.wham_include_list_bed_file }} diff --git a/input_values/dockers.json b/input_values/dockers.json index 747e515cb..f2ec7fff9 100644 --- a/input_values/dockers.json +++ b/input_values/dockers.json @@ -10,10 +10,12 @@ "manta_docker" : "us.gcr.io/broad-dsde-methods/manta:8645aa", "melt_docker" : "us.gcr.io/talkowski-sv-gnomad/melt:vj-4ff9de9f", "samtools_cloud_docker" : "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:mw-gnomad-02-6a66c96", - "sv_base_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-base:mw-gnomad-0506-pr-087d4df", + "sv_base_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-base:mw-gnomad-0506-pr-2-6d104d7", "sv_base_mini_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-base-mini:mw-gnomad-0506-pr-087d4df", - "sv_pipeline_base_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-pipeline-base:mw-gnomad-0506-pr-087d4df", - "sv_pipeline_docker" : "us.gcr.io/broad-dsde-methods/eph/sv-pipeline:eph_hotfix_no_evidence-1f461ed", + "sv_pipeline_base_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-pipeline-base:mw-gnomad-0506-pr-2-6d104d7", + "sv_pipeline_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-pipeline:mw-gnomad-0506-pr-2-6d104d7", + "sv_pipeline_hail_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-pipeline-hail:mw-gnomad-0506-pr-2-b7988f0", + "sv_pipeline_updates_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-pipeline-updates:mw-gnomad-0506-superscale-dev-304ffa1", "sv_pipeline_qc_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-pipeline-qc:mw-gnomad-0506-pr-087d4df", "sv_pipeline_rdtest_docker" : "us.gcr.io/broad-dsde-methods/markw/sv-pipeline-rdtest:mw-gnomad-0506-pr-087d4df", "wham_docker" : "us.gcr.io/broad-dsde-methods/wham:8645aa", diff --git a/input_values/resources_hg38.json b/input_values/resources_hg38.json index eb727d077..dddcec7bf 100644 --- 
a/input_values/resources_hg38.json +++ b/input_values/resources_hg38.json @@ -2,6 +2,8 @@ "name" : "resources_hg38", "allosome_file" : "gs://gcp-public-data--broad-references/hg38/v0/sv-resources/resources/v1/allosome.fai", "allosomal_contigs" : ["chrX", "chrY"], + "chr_x" : "chrX", + "chr_y" : "chrY", "asc_tarballs" : [ "gs://gatk-sv-resources-secure/resources/hg38_benchmarking/ASC_Werling/ASC_Werling.SV.ALL.bed.gz", "gs://gatk-sv-resources-secure/resources/hg38_benchmarking/ASC_Werling/ASC_Werling.SV.EUR.bed.gz", diff --git a/scripts/docker/build_docker.py b/scripts/docker/build_docker.py index 0b048ab00..e3ea2b892 100755 --- a/scripts/docker/build_docker.py +++ b/scripts/docker/build_docker.py @@ -32,6 +32,8 @@ class to track dependencies, control build and push of entire job 'sv-pipeline-base': {'sv-base': "SVBASE_IMAGE"}, 'sv-pipeline': {'sv-pipeline-base': "SV_PIPELINE_BASE_IMAGE"}, 'sv-pipeline-children-r': {'sv-pipeline-base': "SV_PIPELINE_BASE_IMAGE"}, + 'sv-pipeline-hail': {'sv-pipeline': "SV_PIPELINE_IMAGE"}, + 'sv-pipeline-updates': {'sv-pipeline': "SV_PIPELINE_IMAGE"}, 'sv-pipeline-rdtest': {'sv-pipeline-children-r': "SV_PIPELINE_BASE_R_IMAGE"}, 'sv-pipeline-qc': {'sv-pipeline-children-r': "SV_PIPELINE_BASE_R_IMAGE"} } diff --git a/src/sv-pipeline/04_variant_resolution/scripts/calculate_sr_bothside_support.py b/src/sv-pipeline/04_variant_resolution/scripts/calculate_sr_bothside_support.py new file mode 100644 index 000000000..58656aab9 --- /dev/null +++ b/src/sv-pipeline/04_variant_resolution/scripts/calculate_sr_bothside_support.py @@ -0,0 +1,30 @@ +#!/bin/python + +import sys +from collections import defaultdict + + +def count_vids(list_path): + counts = defaultdict(lambda: 0) + with open(list_path, 'r') as f_list: + for path in f_list: + with open(path.strip(), 'r') as f: + for vid in f: + counts[vid.strip()] += 1 + return counts + + +NON_REF_VIDS_LIST = sys.argv[1] +BOTHSIDE_PASS_LIST = sys.argv[2] + +non_ref_counts = count_vids(NON_REF_VIDS_LIST) 
+bothside_pass_counts = count_vids(BOTHSIDE_PASS_LIST) + +for vid, bothside_pass_count in bothside_pass_counts.items(): + if bothside_pass_count == 0: + continue + non_ref_count = non_ref_counts[vid] + if non_ref_count == 0: + continue + fraction_support = min(1., bothside_pass_count / float(non_ref_count)) + sys.stdout.write("{}\t{}\n".format(fraction_support, vid)) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_VCF_script.sh b/src/sv-pipeline/04_variant_resolution/scripts/clean_VCF_script.sh deleted file mode 100755 index 03155de8c..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_VCF_script.sh +++ /dev/null @@ -1,682 +0,0 @@ -#!/bin/bash -# -# clean_VCF.sh -# - -##requires >= vcftools/0.1.15 ## - -set -e - -##gzipped vcf## -vcf=$1 -backgroundlist=$2 - - -##get sampleids from VCF## -zcat $vcf \ - |egrep "^#" \ - |tail -n -1 \ - |cut -f10- \ - |tr '\t' '\n' \ - >whitelist.txt - -##convert EV integer back into string## -zcat $vcf \ - | awk '{print $0 "\t"}' \ - | sed -e 's/:7'"\t"'/:RD,PE,SR'"\t"'/g' \ - | sed -e 's/:6'"\t"'/:PE,SR'"\t"'/g' \ - | sed -e 's/:5'"\t"'/:RD,SR'"\t"'/g' \ - | sed -e 's/:3'"\t"'/:RD,PE'"\t"'/g' \ - | sed -e 's/:2'"\t"'/:PE'"\t"'/g' \ - -e 's/:4'"\t"'/:SR'"\t"'/g' \ - -e 's/:1'"\t"'/:RD'"\t"'/g' \ - |sed 's/'"\t"'$//g' \ - |sed 's/ID=EV,Number=1,Type=Integer/ID=EV,Number=1,Type=String/g' \ - | bgzip > EV.update.vcf.gz - -##convert all alt to svtype and alt to N## -svtk vcf2bed EV.update.vcf.gz stdout -i SVTYPE \ - |awk '{ if ($5!~"ME")$5=$7; print $4"\t" "<"$5 ">"}' \ - |gzip \ - >vcf.convert.svtype.gz - -zcat EV.update.vcf.gz \ - |awk 'NR==FNR{inFileA[$1]=$2; next} {if ($3 in inFileA && $1!~"#") $5=inFileA[$3]; print }' OFS='\t' \ - <(zcat vcf.convert.svtype.gz) - \ - |awk '{if ($1!~"#") $4="N"; print}' OFS='\t' \ - |bgzip \ - >convertsvtype.vcf.gz - -##get rid of multiallelic tage in INFO field and add varGQ to QUAL column## -svtk vcf2bed convertsvtype.vcf.gz stdout -i varGQ \ - |awk '{print $4 "\t" 
$7}' \ - >vargq.persample - -zcat convertsvtype.vcf.gz \ - |sed 's/;MULTIALLELIC//g' \ - |sed 's/;varGQ=[0-9]*//g' \ - |awk 'NR==FNR{inFileA[$1]=$2; next} {if ($3 in inFileA && $1!~"#") $6=inFileA[$3]; print }' OFS='\t' vargq.persample - \ - |bgzip \ - >cleaninfo.vcf.gz - - -##change tag for SR background failures## -zcat cleaninfo.vcf.gz \ - |awk 'NR==FNR{inFileA[$1]; next} {if ($3 in inFileA && $1!~"#") sub($7,"HIGH_SR_BACKGROUND"); print }' $backgroundlist - \ - |awk '{if (NR==2) print $0 "\n" "##FILTER=" ;else print}' \ - |bgzip \ - >int.vcf.gz - - -##Remove CNVs that are improperly genotyped by depth because they are nested within a real CNV## - -##Only affects CNV so pull those out## -svtk vcf2bed int.vcf.gz stdout \ - |awk '{if ($5=="DEL" || $5=="DUP") print}' \ - |gzip>int.bed.gz - -##list of potenital overlaps with a normal copy state variant (>5kb variants require depth but nested events could be missed; i.e a duplication with a nest deletion will have a normal copy state for the deletion)## -bedtools intersect -wa -wb -a <(zcat int.bed.gz|awk '{if ($3-$2>=5000 ) print}') \ --b <(zcat int.bed.gz|awk '{if ($3-$2>=5000) print}') \ - |awk -F"\t" '{if ($4!=$10 && $3-$2>=$9-$8 && $5!=$11) print ;\ - else if ($4!=$10 && $5!=$11) print $7,$8,$9,$10,$11,$12,$1,$2,$3,$4,$5,$6}' OFS='\t' \ - |awk -F'\t' '{if ($6!="") print}' \ - |sort -u \ - >normaloverlap.txt - - -##pull out the depth based copy number variant for each normal overlapping variant## -zcat int.vcf.gz \ - |awk '{if ($1!~"#") sub($1,$3);print}' \ - |awk '{if ($1~"#" || $5=="" || $5=="") print}' \ - |vcftools --vcf - --stdout --extract-FORMAT-info RD_CN \ - |awk -F"\t" 'NR==1{for (i=3;i<=NF;i++) header[i]=$i} NR>1{for(j=3;j<=NF;j++) print $1"@"header[j] "\t" $j }' \ - |sort -k1,1 \ - |gzip \ - >RD_CN.normalcheck.FORMAT.gz - - -##pull out evidence supporting each normal overlapping variant## -cat <(zcat int.vcf.gz|awk -F"\t" '{if ($1~"#") print}') \ - <(awk '{print $4 "\n" $10}' normaloverlap.txt|sort 
-u|fgrep -wf - <(zcat int.vcf.gz))\ - |awk '{if ($1!~"#") sub($1,$3);print}' \ - |vcftools --vcf - --stdout --extract-FORMAT-info EV \ - |awk -F"\t" 'NR==1{for (i=3;i<=NF;i++) header[i]=$i} NR>1{for(j=3;j<=NF;j++) print $1"@"header[j] "\t" $j }' \ - |sort -k1,1 \ - |gzip \ - >EV.normalcheck.FORMAT.gz - -##check if nested is incorrectly classified as normal## - -while read bed -do - echo $bed|tr ' ' '\t'|cut -f1-6 >large.bed - echo $bed|tr ' ' '\t'|cut -f7-12>small.bed - ##require at least 50% coverage to consider a variant overlapping## - overlap=$(bedtools coverage -a small.bed -b large.bed|awk '{if ($NF>=0.50) print "YES";else print "NO"}') - - if [ "$overlap" == "YES" ] - then - smallid=$(awk '{print $4}' small.bed) - - ##pull out variants that are called a variants for both the smaller and larger CNVs (don't have normal copy state to check for)## - awk '{print $NF}' small.bed \ - |tr ',' '\n' \ - |fgrep -wvf - <(awk -F"[,\t]" -v var=$smallid '{for(i=6;i<=NF;i++) print var"@"$i "\t" $4"@"$i "\t" $5}' large.bed) \ - >>overlap.test.txt - fi -donegeno.normal.revise.txt - -##Update genotypes## - -##Determine columns of VCF after header## -zcat int.vcf.gz \ - |egrep ^# \ - |tail -n 1 \ - |tr '\t' '\n' \ - |cat -n - \ - >col.txt - - -##seed the vcf lines file which will provide the revisions to vcf file## -echo "">normal.revise.vcf.lines.txt - - -##pull out and revise vcf line that needs to be edited## -while read line -do - id=$(echo $line|awk '{print $2}' ) - col=$(fgrep -w $id col.txt|awk '{print $1}') - variant=$(echo $line|awk '{print $1}') - cn=$(echo $line|awk '{print $3}') - - zcat int.vcf.gz |fgrep -w $variant >line.txt - - ##Updated genotype and rebuild Format field ## - GT=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $1}') - GQ=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $2}') - RD_CN=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $3}') - RD_GQ=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $4}') - 
PE_GT=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $5}') - PE_GQ=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $6}') - SR_GT=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $7}') - SR_GQ=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $8}') - EV=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $9}') - - if [ $(cat normal.revise.vcf.lines.txt|fgrep -w $variant|wc -l) -gt 0 ] - then - cat normal.revise.vcf.lines.txt \ - |awk -v col=$col -v var=$variant -v GT=$GT -v GQ=$GQ -v RD_CN=$cn -v RD_GQ=$RD_GQ -v PE_GT=$PE_GT -v PE_GQ=$PE_GQ -v SR_GT=$SR_GT -v SR_GQ=$SR_GT -v EV=$EV '{if ($3==var ) $col="0/1:"GQ":"RD_CN":"RD_GQ":"PE_GT":"PE_GQ":"SR_GT":"SR_GQ":"EV ;print}' \ - >int.lines.txt - - cat int.lines.txt > normal.revise.vcf.lines.txt - - else - cat line.txt \ - |awk -v col=$col -v var=$variant -v GT=$GT -v GQ=$GQ -v RD_CN=$cn -v RD_GQ=$RD_GQ -v PE_GT=$PE_GT -v PE_GQ=$PE_GQ -v SR_GT=$SR_GT -v SR_GQ=$SR_GT -v EV=$EV '{if ($3==var ) $col="0/1:"GQ":"RD_CN":"RD_GQ":"PE_GT":"PE_GQ":"SR_GT":"SR_GQ":"EV ;print}' \ - >>normal.revise.vcf.lines.txt - fi - -donenormal.revise.vcf.gz - -##create new bed with updated genotypes### -svtk vcf2bed normal.revise.vcf.gz stdout \ - |awk '{if ($5=="DEL" || $5=="DUP") print}' \ - |sort -k4,4 \ - |gzip \ - >int.afternormalfix.bed.gz - - -###Find overlapping depth based variants and reassign depth based; note this is necessary because depth call >5kb genotypes are 100% driven by depth ## - -## generate a sample list based on depth for depth overlap check below. 
Necessary because genotype is capped at 1/1 and by direction (i.e no dels in dups)## -zcat normal.revise.vcf.gz \ - |awk '{if ($1~"#" || ($5=="" || $5=="")) print}'\ - |awk '{if ($1!~"#") sub($1,$3);print}' \ - |vcftools --vcf - --stdout --extract-FORMAT-info RD_CN \ - |awk -F"\t" 'NR==1{for (i=3;i<=NF;i++) header[i]=$i} NR>1{for(j=3;j<=NF;j++) print $1 "\t" header[j] "\t" $j }' \ - |sort -k1,1 \ - |gzip \ - >RD_CN.afternormalfix.FORMAT.gz - -##grab all samples per variant with a non normal copy state## -zcat RD_CN.afternormalfix.FORMAT.gz \ - |awk '{if ($3!="2") print $1 "\t" $2}' \ - |awk '{a[$1]=a[$1]?a[$1]","$2:$2;}END{for (i in a)print i "\t" a[i];}' \ - |sort -k1,1 \ - >afternormal.combined.RD_CN.list.txt - -#overlapping## -zcat int.afternormalfix.bed.gz \ - |cut -f1-5 \ - |join -1 4 -2 1 -t $'\t' - afternormal.combined.RD_CN.list.txt \ - |awk -F"\t" '{if ($6!="") print }' \ - |awk -F'[,\t]' '{for(i=6;i<=NF;i++) print $2"_"$i,$3,$4,$1,$5,$i,$1"@"$i}' \ - |tr ' ' '\t' \ - |gzip \ - >all.bed.gz - - -##intersect variants and always set larger to left## -bedtools intersect -wa -wb -a all.bed.gz -b all.bed.gz \ - |awk '{if ($4!=$11 && $3-$2>=$10-$9) print $0;else if ($4!=$11) print $8,$9,$10,$11,$12,$13,$14,$1,$2,$3,$4,$5,$6,$7}' \ - |tr ' ' '\t' \ - |sort -u \ - |sort -k7,7 \ - |gzip \ - >bed.overlap.txt.gz - - -##pull out per variant metrics from the INFO field## - -for var in EV RD_CN PE_GT SR_GT PE_GQ SR_GQ -do - cat <(zcat normal.revise.vcf.gz|awk -F"\t" '{if ($1~"#") print}') \ - <(zcat bed.overlap.txt.gz|awk '{print $4 "\n" $11}' |sort -u|fgrep -wf - <(zcat normal.revise.vcf.gz)) \ - |awk '{if ($1!~"#") sub($1,$3);print}' \ - |vcftools --vcf - --stdout --extract-FORMAT-info ${var} \ - |awk -F"\t" 'NR==1{for (i=3;i<=NF;i++) header[i]=$i} NR>1{for(j=3;j<=NF;j++) print $1"@"header[j] "\t" $j }' \ - |sort -k1,1 \ - |gzip \ - >${var}.FORMAT.gz -done - -##Append info field to bed file## -join -1 7 -2 1 <(zcat bed.overlap.txt.gz) \ - <(zcat EV.FORMAT.gz)|join -j 1 
- <(zcat RD_CN.FORMAT.gz) \ - |join -j 1 - <(zcat PE_GT.FORMAT.gz) \ - |join -j 1 - <(zcat PE_GQ.FORMAT.gz) \ - |join -j 1 - <(zcat SR_GT.FORMAT.gz) \ - |join -j 1 - <(zcat SR_GQ.FORMAT.gz) \ - |sort -k14,14 \ - |join -1 14 -2 1 - <(zcat EV.FORMAT.gz) \ - |join -j 1 - <(zcat RD_CN.FORMAT.gz) \ - |join -j 1 - <(zcat PE_GT.FORMAT.gz) \ - |join -j 1 - <(zcat PE_GQ.FORMAT.gz) \ - |join -j 1 - <(zcat SR_GT.FORMAT.gz) \ - |join -j 1 - <(zcat SR_GQ.FORMAT.gz) \ - |tr ' ' '\t' \ - |cut -f3- \ - |awk '{print $3-$2,$10-$9,$0}' \ - |tr ' ' '\t' \ - |sort -nrk1,1 -k2,2nr \ - |cut -f3- \ - |gzip \ - >all.combined.bed.gz - - -####If Multi-allelic is driving depth difference ignore### - -##get copy state per variant## -zcat normal.revise.vcf.gz \ - |awk '{if ($1!~"#") sub($1,$3);print}' \ - |vcftools --vcf - --stdout --extract-FORMAT-info RD_CN \ - |gzip \ - >copystate.RD_CN.FORMAT.gz - -##get copy state per variant## -zcat copystate.RD_CN.FORMAT.gz \ - |awk 'NR>1{for(i=3;i<=NF;i++) lines[$1 "\t" $i]++ } END{for (x in lines) print x}' \ - |gzip \ - >copystate.per.variant.txt.gz - -##Find multi-allelic for del or dup ; CNV >1kb we trust depth ## -##del## -zcat copystate.per.variant.txt.gz \ - |awk '{if ($2!="." && $2>2) print $1}' \ - |sort -u \ - |fgrep -wf <(zcat int.bed.gz|awk -F"\t" '{if ($5=="DEL" && $3-$2>=1000) print $4}' ) \ - >multi.cnvs.txt - -##dup## -zcat copystate.per.variant.txt.gz \ - |awk '{if ($2!="." 
&& ($2<2 || $2>4)) print $1}' \ - |sort -u \ - |fgrep -wf <(zcat int.bed.gz|awk -F"\t" '{if ($5=="DUP" && $3-$2>=1000) print $4}' ) \ - >>multi.cnvs.txt - - -##update copy state which will lead to a new genotype when genotyping is rerun towards end of script ## -echo "">RD_CN.revise.txt - -while read id -do - echo $id - zcat all.combined.bed.gz \ - |awk -v id=$id '{if ($6==id) print $0 "\t" $4"@"$10}' \ - >overlap.bed.ids.txt - - while read bed - do - compareID=$(echo $bed |awk '{print $NF}') - id1=$(echo $bed |awk '{print $4"@"$6}') - id2=$(echo $bed |awk '{print $10"@"$12}') - vID1=$(echo $bed |awk '{print $4}') - vID2=$(echo $bed |awk '{print $10}') - svtype1=$(echo $bed |awk '{print $5}') - svtype2=$(echo $bed |awk '{print $11}') - support1=$(echo $bed |awk '{print $13}') - support2=$(echo $bed |awk '{print $19}') - length1=$(echo $bed|awk '{print $3-$2}') - length2=$(echo $bed|awk '{print $9-$8}') - RD_CN1=$(echo $bed|awk '{print $14}') - RD_CN2=$(echo $bed|awk '{print $20}') - PE_GT1=$(echo $bed|awk '{print $15}') - PE_GT2=$(echo $bed|awk '{print $21}') - PE_GQ1=$(echo $bed|awk '{print $16}') - PE_GQ2=$(echo $bed|awk '{print $22}') - SR_GT1=$(echo $bed|awk '{print $17}') - SR_GT2=$(echo $bed|awk '{print $23}') - SR_GQ1=$(echo $bed|awk '{print $18}') - SR_GQ2=$(echo $bed|awk '{print $24}') - - echo $bed|tr ' ' '\t'|cut -f1-6 >large.bed - echo $bed|tr ' ' '\t'|cut -f7-12>small.bed - overlap=$(bedtools coverage -a small.bed -b large.bed|awk '{if ($NF>0.50) print "YES";else print "NO"}') - - ##remove any large CNV comparisons that have been revised to normal copy state of 2## - awk '{if ($2==2) print $1}' RD_CN.revise.txt>depthnormal.exclude.txt - - if [ $(fgrep -w $id1 depthnormal.exclude.txt |wc -l) -eq 0 ] - then - ##classification## - ##Call where smaller depth call is being driven by larger## - if [[ $support1 =~ "RD" ]] && [[ $support2 = "RD" ]] && [ "$overlap" == "YES" ] && [[ $support1 != "RD" ]] && [ $(fgrep -w $vID1 multi.cnvs.txt |wc -l) -eq 0 ] - then 
- echo $bed \ - |awk -v id2=$id2 -v svtype1=$svtype1 -v RD_CN1=$RD_CN1 -v RD_CN2=$RD_CN2 '{if (RD_CN1==1) print id2 "\t" RD_CN2+RD_CN1 ; \ - else if(RD_CN1>1) print id2 "\t" RD_CN2-(RD_CN1-2) }' \ - >>RD_CN.revise.txt - ##Smaller CNV driving larger CNV genotype## - elif [[ $support1 = "RD" ]] && [[ $support2 =~ "RD" ]] && [ "$overlap" == "YES" ] && [[ $support2 != "RD" ]] && [ $(fgrep -w $vID2 multi.cnvs.txt |wc -l) -eq 0 ] - then - echo $bed \ - |awk -v id1=$id1 -v svtype1=$svtype1 -v RD_CN1=$RD_CN1 -v RD_CN2=$RD_CN2 '{if (RD_CN2==1) print id1 "\t" RD_CN1+RD_CN2 ; \ - else if(RD_CN2>1) print id1 "\t" RD_CN1-(RD_CN2-2) }' \ - >>RD_CN.revise.txt - ##Depth only calls where smaller call is being driven by larger## - elif [[ $support1 = "RD" ]] && [[ $support2 = "RD" ]] && [ "$overlap" == "YES" ] && [ "$svtype1" == "$svtype2" ] && [ $(fgrep -w $vID1 multi.cnvs.txt |wc -l) -eq 0 ] - then - echo $bed \ - |awk -v id2=$id2 -v svtype1=$svtype1 -v RD_CN1=$RD_CN1 -v RD_CN2=$RD_CN2 '{if (RD_CN1==1 && RD_CN1>RD_CN2 ) print id2 "\t" 1; \ - else if (RD_CN1>1 && RD_CN1>RD_CN.revise.txt - ##Any other time a larger call is driving a smaller call## - elif [[ $support1 =~ "RD" ]] && [ "$overlap" == "YES" ] && [ $length2 -gt 5000 ] && [ $(fgrep -w $vID1 multi.cnvs.txt |wc -l) -eq 0 ] - then - echo $bed \ - |awk -v id2=$id2 -v svtype1=$svtype1 -v RD_CN1=$RD_CN1 -v RD_CN2=$RD_CN2 '{if (RD_CN1==1) print id2 "\t" RD_CN2+RD_CN1 ; \ - else if(RD_CN1>1) print id2 "\t" RD_CN2-(RD_CN1-2) }' \ - >>RD_CN.revise.txt - fi - fi -doneRD_CN.revise.forgeno.txt - -##Determine columns of VCF after header## -zcat normal.revise.vcf.gz\ - |egrep ^# \ - |tail -n 1 \ - |tr '\t' '\n' \ - |cat -n - \ - >col.txt - - -##seed the vcf lines file which will provide the revisions to vcf file## -echo "">revise.vcf.lines.txt - - -##pull out and revise vcf line that needs to be edited## -while read line -do - id=$(echo $line|awk '{print $2}' ) - col=$(fgrep -w $id col.txt|awk '{print $1}') - variant=$(echo $line|awk 
'{print $1}') - cn=$(echo $line|awk '{print $3}') - - zcat normal.revise.vcf.gz |fgrep -w $variant >line.txt - - echo $variant $id - ##Updated genotype and rebuild Format field ## - GT=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $1}') - GQ=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $2}') - RD_CN=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $3}') - RD_GQ=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $4}') - PE_GT=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $5}') - PE_GQ=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $6}') - SR_GT=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $7}') - SR_GQ=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $8}') - EV=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $9}') - - if [ $(cat revise.vcf.lines.txt|fgrep -w $variant|wc -l) -gt 0 ] - then - cat revise.vcf.lines.txt \ - |awk -v col=$col -v var=$variant -v GT=$GT -v GQ=$GQ -v RD_CN=$cn -v RD_GQ=$RD_GQ -v PE_GT=$PE_GT -v PE_GQ=$PE_GQ -v SR_GT=$SR_GT -v SR_GQ=$SR_GT -v EV=$EV '{if ($3==var ) $col="0/1:"GQ":"RD_CN":"RD_GQ":"PE_GT":"PE_GQ":"SR_GT":"SR_GQ":"EV ;print}' \ - >int.lines.txt - - cat int.lines.txt > revise.vcf.lines.txt - - else - cat line.txt \ - |awk -v col=$col -v var=$variant -v GT=$GT -v GQ=$GQ -v RD_CN=$cn -v RD_GQ=$RD_GQ -v PE_GT=$PE_GT -v PE_GQ=$PE_GQ -v SR_GT=$SR_GT -v SR_GQ=$SR_GT -v EV=$EV '{if ($3==var ) $col="0/1:"GQ":"RD_CN":"RD_GQ":"PE_GT":"PE_GQ":"SR_GT":"SR_GQ":"EV ;print}' \ - >>revise.vcf.lines.txt - fi - -doneoverlap.revise.vcf.gz - -##multi check## -zcat overlap.revise.vcf.gz \ - |awk '{if ($1!~"#") sub($1,$3);print}' \ - |vcftools --vcf - --stdout --extract-FORMAT-info RD_CN \ - |gzip \ - >copystate.RD_CN.FORMAT.gz - -zcat copystate.RD_CN.FORMAT.gz \ - |awk 'NR>1{for(i=3;i<=NF;i++) lines[$1 "\t" $i]++ } END{for (x in lines) print x}' \ - |gzip \ - >copystate.per.variant.txt.gz - -##Copy state just del and dup ; CNV >1kb we 
trust depth ## -zcat copystate.per.variant.txt.gz \ - |awk '{if ($2!="." && $2>2) print $1}' \ - |sort -u \ - |fgrep -wf <(zcat int.bed.gz|awk -F"\t" '{if ($5=="DEL" && $3-$2>=1000) print $4}' ) \ - |gzip \ - >multi.del.ids.txt.gz - -zcat copystate.per.variant.txt.gz \ - |awk '{if ($2!="." && ($2<2 || $2>4)) print $1}' \ - |sort -u \ - |fgrep -wf <(zcat int.bed.gz|awk -F"\t" '{if ($5=="DUP" && $3-$2>=1000) print $4}' ) \ - |gzip \ - >multi.dup.ids.txt.gz - -##Regenotype to determine multiallelic## -##Genotype big dup## -svtk vcf2bed overlap.revise.vcf.gz stdout \ - |gzip>regeno.bed.gz - -##generate list## -##CNV >5kb, split del and dup ## -## ## -zcat regeno.bed.gz \ - |awk '{if ($3-$2>=5000 && $5=="DUP")print $4}' \ - |fgrep -wvf <(zcat multi.dup.ids.txt.gz) \ - >gt5kb.dup.ids.txt - -zcat regeno.bed.gz \ - |awk '{if ($3-$2>=5000 && $5=="DEL")print $4}' \ - |fgrep -wvf <(zcat multi.del.ids.txt.gz) \ - >gt5kb.del.ids.txt - -end=$(zcat overlap.revise.vcf.gz|awk '{if ($1!~"#") print}'|head -n 1 |awk -F'[:\t]' '{print NF}' ) - -zcat overlap.revise.vcf.gz \ - |fgrep -wf gt5kb.dup.ids.txt \ - >dup.int.txt - -zcat overlap.revise.vcf.gz \ - |fgrep -wf gt5kb.del.ids.txt \ - >del.int.txt - -##regenotype VCF## -for ((i=18;i<=$end;i+=9)) -do - echo $i - cat dup.int.txt \ - |awk -F'[:\t]' -v i=$i '{if ($(i+2)==2) sub($i,"0/0"); \ - else if ($(i+2)==3) sub($i,"0/1"); \ - else sub($i,"1/1");print}' \ - >dup.revise.txt - - cat del.int.txt \ - |awk -F'[:\t]' -v i=$i '{if ($(i+2)==2) sub($i,"0/0"); \ - else if ($(i+2)==1) sub($i,"0/1"); \ - else sub($i,"1/1");print}' \ - >del.revise.txt - - cat dup.revise.txt>dup.int.txt - cat del.revise.txt>del.int.txt -done - -cat <(zcat overlap.revise.vcf.gz|fgrep -wvf <(cat gt5kb.dup.ids.txt gt5kb.del.ids.txt)) \ - <(cat dup.revise.txt del.revise.txt) \ - |vcf-sort \ - |bgzip \ - >newdepth.geno.vcf.gz - - -##Tag VCF## -##find individual level metrics to determine multi allelic by PE/SR genotypes## - -for var in PE_GT SR_GT PE_GQ SR_GQ -do - zcat 
newdepth.geno.vcf.gz \ - |awk '{if ($1!~"#") sub($1,$3);print}' \ - |vcftools --vcf - --stdout --extract-FORMAT-info ${var} \ - |awk -F"\t" 'NR==1{for (i=3;i<=NF;i++) header[i]=$i} NR>1{for(j=3;j<=NF;j++) print $1"@"header[j] "\t" $j }' \ - |sort -k1,1 \ - |gzip \ - >multicheck.${var}.FORMAT.gz -done - -##concatenate metrics## -join -j 1 <(zcat multicheck.PE_GT.FORMAT.gz) \ - <(zcat multicheck.PE_GQ.FORMAT.gz) \ - |join -j 1 - <(zcat multicheck.SR_GT.FORMAT.gz) \ - |join -j 1 - <(zcat multicheck.SR_GQ.FORMAT.gz) \ - |tr ' ' '\t' \ - |gzip \ - >multi.combined.format.gz - - -##check by genotype## -zcat multi.combined.format.gz \ - |awk '{if ($2>0 && $4==0) print $1"\t" $2; \ - else if ($2==0) print $1 "\t" $4; \ - else if ($3>=$5)print $1"\t" $2; \ - else print $1"\t" $4 }' \ - |tr '@' '\t' \ - |awk '{if ($3>2 && $2!=".") print $1}' \ - |sort -u \ - |gzip \ - >multi.geno.ids.txt.gz - -##Tag multi## -zcat newdepth.geno.vcf.gz \ - |awk 'NR==FNR{inFileA[$1]; next} {if ($3 in inFileA && $1!~"#") sub($7,"MULTIALLELIC"); print }' OFS='\t' \ - <(zcat multi.del.ids.txt.gz multi.dup.ids.txt.gz multi.geno.ids.txt.gz|sort -u) - \ - |bgzip \ - >multitagged.vcf.gz - -###genotype multiallelics## -##pull out multiallelic lines of vcf### -zcat multitagged.vcf.gz \ - |fgrep -wf <(zcat multi.geno.ids.txt.gz) \ - >multi.gt.int.txt - -zcat multitagged.vcf.gz \ - |fgrep -wf <(zcat multi.dup.ids.txt.gz) \ - >multi.dup.int.txt - -zcat multitagged.vcf.gz \ - |fgrep -wf <(zcat multi.del.ids.txt.gz) \ - >multi.del.int.txt - -end=$(zcat multitagged.vcf.gz|awk '{if ($1!~"#") print}'|head -n 1 |awk -F'[:\t]' '{print NF}' ) - -for ((i=18;i<=$end;i+=9)) -do - echo $i - cat multi.dup.int.txt \ - |awk -F'[:\t]' -v i=$i '{sub($i,"./"$(i+2));print}' \ - >dup.multi.revise.txt - - cat multi.del.int.txt \ - |awk -F'[:\t]' -v i=$i '{sub($i,"./"$(i+2));print}' \ - >del.multi.revise.txt - - cat multi.gt.int.txt \ - |awk -F'[:\t]' -v i=$i '{if ($(i+4)>0 && $(i+6)==0) sub($i,"./"$(i+4)); \ - else if 
($(i+4)==0) sub($i,"./"$(i+6)); \ - else if ($(i+5)>=$(i+7)) sub($i,"./"$(i+4)); \ - else sub($i,"./"$(i+6)) ;print }' \ - >gt.multi.revise.txt - - cat dup.multi.revise.txt>multi.dup.int.txt - cat del.multi.revise.txt>multi.del.int.txt - cat gt.multi.revise.txt>multi.gt.int.txt -done - -##remove overlapping multi### -zcat multitagged.vcf.gz \ - |awk '{if ($1~"#" || ($7=="MULTIALLELIC" && ($5=="" || $5==""))) print}' \ - |svtk vcf2bed stdin stdout \ - |gzip \ - >multi.bed.gz - -##strip out overlapping multiallelics## -bedtools intersect -wa -wb -a multi.bed.gz -b multi.bed.gz \ - |awk '{if ($4!=$10 && $3-$2>=$9-$8) print $0; \ - else if ($4!=$10) print $7,$8,$9,$10,$11,$12,$1,$2,$3,$4,$5,$6}' \ - |tr ' ' '\t' \ - |sort -u \ - |awk '{print $3-$2,$9-$8,$0}' \ - |tr ' ' '\t' \ - |sort -nrk1,1 -k2,2nr \ - |cut -f3- \ - |gzip \ - >multi.bed.overlap.txt.gz - -echo "">multi.remove.txt - -while read bed -do - echo $bed|tr ' ' '\t'|cut -f1-6 >large.bed - echo $bed|tr ' ' '\t'|cut -f7-12>small.bed - overlap=$(bedtools coverage -a small.bed -b large.bed|awk '{if ($NF>0.50) print "YES";else print "NO"}') - - if [ "$overlap" == "YES" ] && [ $(awk '{print $4}' large.bed|fgrep -wf - multi.remove.txt|wc -l) -eq 0 ] - then - awk '{print $4}' small.bed >>multi.remove.txt - fi -done< <(zcat multi.bed.overlap.txt.gz) - - -##strip out variants with no genotypes and overlapping multiallelics## -### Find missing genotype and then add multiallelics that need to be removed### - -svtk vcf2bed multitagged.vcf.gz stdout \ - |awk -F'\t' '{if ($6=="") print $4}' \ - |cat - multi.remove.txt \ - |sed '/^$/d' \ - |fgrep -wvf - <(zcat multitagged.vcf.gz) \ - |gzip \ - >cleantagandmulti.vcf.gz - -##Fix header## -##get header to clean## -##add new filters## -zcat cleantagandmulti.vcf.gz \ - |awk '{if (NR==2) print $0 "\n" "##FILTER=" ;else print}' \ - |awk '{if ($1~"##" && NR>1) print}' \ - |sort -k1,1 \ - |egrep -v 
"CIPOS|CIEND|RMSSTD|MEMBERS|UNRESOLVED|source|MULTIALLELIC|varGQ|bcftools|ALT=polished.vcf.gz - diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1.sh b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1.sh deleted file mode 100755 index c1798b378..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1.sh +++ /dev/null @@ -1,268 +0,0 @@ -#!/bin/bash -# -# clean_VCF.sh -# - -##requires >= vcftools/0.1.15 ## -##requires >= bcftools/1.9 ## - -set -euxo pipefail - -# use BCFTOOLS 1.9, installed in /usr/local/bin/bcftools in our docker -BCFTOOLS=/usr/local/bin/bcftools - -##gzipped vcf## -vcf=$1 -backgroundlist=$2 -famfile=$3 -allosome_fai=$4 - -##get sampleids from VCF## -zcat $vcf \ - |sed -n '1,1000p' \ - |egrep "^#" \ - |tail -n -1 \ - |cut -f10- \ - |tr '\t' '\n' \ - > includelist.txt - -##convert EV integer back into string## -/opt/sv-pipeline/04_variant_resolution/scripts/replace_ev_numeric_code_with_string.py ${vcf} - | bgzip -c > EV.update.vcf.gz -rm $vcf - -##convert all alt to svtype and alt to N## -svtk vcf2bed EV.update.vcf.gz stdout -i SVTYPE \ - |awk -F"\t" '{ if ($5!~"ME")$5=$7; print $4"\t" "<"$5 ">"}' \ - |gzip \ - >vcf.convert.svtype.gz - -zcat EV.update.vcf.gz \ - |awk 'NR==FNR{inFileA[$1]=$2; next} {if ($3 in inFileA && $1!~"#") $5=inFileA[$3]; print }' OFS='\t' \ - <(zcat vcf.convert.svtype.gz) - \ - |awk '{if ($1!~"#") $4="N"; print}' OFS='\t' \ - |bgzip \ - >convertsvtype.vcf.gz - -##get rid of multiallelic tage in INFO field and add varGQ to QUAL column and Members field## -svtk vcf2bed convertsvtype.vcf.gz stdout -i varGQ \ - |awk -F"\t" '{print $4 "\t" $7}' \ - >vargq.persample - -zcat convertsvtype.vcf.gz \ - |sed 's/;MULTIALLELIC//g' \ - |sed 's/UNRESOLVED;//g' \ - |sed 's/;varGQ=[0-9]*//g' \ - |awk 'NR==FNR{inFileA[$1]=$2; next} {if ($3 in inFileA && $1!~"#") $6=inFileA[$3]; print }' OFS='\t' vargq.persample - \ - |bgzip \ - >cleaninfo.vcf.gz - tabix -p vcf cleaninfo.vcf.gz - - -##fix 
sex chr if necessary## -if [ $(zcat cleaninfo.vcf.gz|awk '{if (($1~"X" || $1~"Y") && $1!~"#" ) print}'|wc -l) -gt 0 ] -then - - -svtk vcf2bed cleaninfo.vcf.gz stdout \ - |awk '{if (($5=="DEL" || $5=="DUP") && $3-$2>=5000 && ($1~"X" || $1~"Y") && $1!~"#") print}' \ - >clean.bed || true - -awk '{print $4}' clean.bed>clean.bed.ids.txt - - -##male## -awk '{if ($5==1) print $2}' $famfile \ - |fgrep -wf <(zcat cleaninfo.vcf.gz|head -n 1000|fgrep "CHROM"|fgrep POS|cut -f10-|tr '\t' '\n') >male.txt - -##female## -awk '{if ($5==2) print $2}' $famfile \ - |fgrep -wf <(zcat cleaninfo.vcf.gz|head -n 1000|fgrep "CHROM"|fgrep POS|cut -f10-|tr '\t' '\n') >female.txt - - if [ $(cat clean.bed.ids.txt|wc -l) -gt 0 ] - then - - awk '{print $1"\t0\t"$2}' < ${allosome_fai} > allosomes.list - ${BCFTOOLS} query -R allosomes.list -S male.txt -i 'ID=@clean.bed.ids.txt' -f '[%ID\t%SAMPLE\t%RD_CN\n]' cleaninfo.vcf.gz \ - | awk '{if ($3!=".") print}' \ - | gzip > RD_CN.sexcheck.FORMAT.male.gz - - ${BCFTOOLS} query -R allosomes.list -S female.txt -i 'ID=@clean.bed.ids.txt' -f '[%ID\t%SAMPLE\t%RD_CN\n]' cleaninfo.vcf.gz \ - | awk '{if ($3!=".") print}' \ - | gzip > RD_CN.sexcheck.FORMAT.female.gz - - zcat RD_CN.sexcheck.FORMAT.male.gz| Rscript -e 'd<-read.table("stdin")' \ - -e 'x<-tapply(d[,3],d[,1],median)' \ - -e 'write.table(x,"male.median.value.pervar.txt",col.names=FALSE,quote=FALSE,sep = "\t")' - - zcat RD_CN.sexcheck.FORMAT.female.gz| Rscript -e 'd<-read.table("stdin")' \ - -e 'x<-tapply(d[,3],d[,1],median)' \ - -e 'write.table(x,"female.median.value.pervar.txt",col.names=FALSE,quote=FALSE,sep = "\t")' - fi -##Pull out ids where male copy state 1 to normal when female normal and on X## - echo "">sexchr.revise.txt - - if [ $(awk '{if (($5=="DEL" || $5=="DUP") && $3-$2>=5000) print }' clean.bed|awk '{if (($1~"X") && $1!~"#" ) print}'|wc -l) -gt 0 ] - then - awk '{if ($2==1) print $1}' male.median.value.pervar.txt \ - |fgrep -wf <(awk '{if ($2==2) print $1}' female.median.value.pervar.txt) 
\ - |fgrep -wf - <(zcat cleaninfo.vcf.gz|awk '{if ($1~"X" && $1!~"#") print $3}') \ - >sexchr.revise.txt || true - fi - - if [ $(awk '{if (($5=="DEL" || $5=="DUP") && $3-$2>=5000) print }' clean.bed|awk '{if (($1~"Y") && $1!~"#" ) print}'|wc -l) -gt 0 ] - then - awk '{if ($2==1) print $1}' male.median.value.pervar.txt \ - |fgrep -wf <(awk '{if ($2==0) print $1}' female.median.value.pervar.txt) \ - |fgrep -wf - <(zcat cleaninfo.vcf.gz|awk '{if ($1~"Y" && $1!~"#") print $3}') \ - >>sexchr.revise.txt || true - fi - - -${BCFTOOLS} index cleaninfo.vcf.gz - -##Pull out male and females sex chr## -${BCFTOOLS} view cleaninfo.vcf.gz -S male.txt -r chrX:1-1000000000,chrY:1-1000000000,X:1-1000000000,Y:1-1000000000 --no-update|bgzip>male.vcf.gz -${BCFTOOLS} view cleaninfo.vcf.gz -S female.txt -r chrX:1-1000000000,chrY:1-1000000000,X:1-1000000000,Y:1-1000000000 --no-update|bgzip>female.vcf.gz - -${BCFTOOLS} index male.vcf.gz -${BCFTOOLS} index female.vcf.gz - -zcat male.vcf.gz\ - |awk -F'\t' '{if ($5~"DEL" && $1!~"#") print $0 "\t" "ENDOFLINE"}' \ - |fgrep -wf sexchr.revise.txt \ - |tr '\t' '\n' \ - |awk -F':' '{if ($3>=1 && NF>4 && $1!="GT") $1="0/0";else if ($3==0 && NF>4 && $1!="GT" ) $1="0/1"; if (NF>4 && $1!="GT") $3=$3+1;print}' OFS=":" \ - |tr '\n' '\t' \ - |sed 's/ENDOFLINE/\n/g' \ - |sed -e 's/^[ \t]*//' \ - |sed -e 's/[\t]$//g' \ - |bgzip \ - >male_del.revise.txt.gz ||true - - -zcat male.vcf.gz\ - |awk -F'\t' '{if ($5~"DUP" && $1!~"#") print $0 "\t" "ENDOFLINE"}' \ - |fgrep -wf sexchr.revise.txt \ - |tr '\t' '\n' \ - |awk -F':' '{if ($3<=1 && NF>4 && $1!="GT") $1="0/0";else if ($3==2 && NF>4 && $1!="GT" ) $1="0/1";else if (NF>4 && $1!="GT" ) $1="1/1"; if (NF>4 && $1!="GT" ) $3=$3+1;print}' OFS=":" \ - |tr '\n' '\t' \ - |sed 's/ENDOFLINE/\n/g' \ - |sed -e 's/^[ \t]*//' \ - |sed -e 's/[\t]$//g' \ - |bgzip \ - >male_dup.revise.txt.gz ||true - - if [ $(cat male_dup.revise.txt.gz male_del.revise.txt.gz|wc -l) -gt 0 ] - then - cat <(zcat male.vcf.gz|fgrep -wvf <(zcat 
male_dup.revise.txt.gz male_del.revise.txt.gz|awk '{print $3}' )) \ - <(zcat male_del.revise.txt.gz male_dup.revise.txt.gz|awk '{if ($1!="") print}'|tr ' ' '\t') \ - |vcf-sort \ - |bgzip \ - >cleanmale.vcf.gz - else - cp male.vcf.gz cleanmale.vcf.gz - fi - - ${BCFTOOLS} index cleanmale.vcf.gz - - ##Modify female only for chrY### - if [ $(zcat cleaninfo.vcf.gz |awk '{if ($1~"Y" && $1!~"#") print}'|wc -l) -gt 0 ] - then - zcat female.vcf.gz\ - |awk -F'\t' '{if ($1!~"#" && $1~"Y") print $0 "\t" "ENDOFLINE"}' \ - |tr '\t' '\n' \ - |awk -F':' '{ if (NF>4 && $1!="GT" ) $1="./." \ - ;if (NF>4 && $1!="GT" ) $2=$3=$4=$5=$6=$7=$8=$9=".";print}' OFS=":" \ - |tr '\n' '\t' \ - |sed 's/ENDOFLINE/\n/g' \ - |sed -e 's/^[ \t]*//' \ - |sed -e 's/[\t]$//g' \ - |bgzip \ - >female.y.revise.txt.gz - - cat <(zcat female.vcf.gz \ - |fgrep -wvf <(zcat female.y.revise.txt.gz|awk '{print $3}' )) \ - <(zcat female.y.revise.txt.gz) \ - |vcf-sort \ - |bgzip \ - >cleanfemale.vcf.gz - - ${BCFTOOLS} index cleanfemale.vcf.gz - - else - cp female.vcf.gz cleanfemale.vcf.gz - ${BCFTOOLS} index cleanfemale.vcf.gz - fi - - - ##replace genotype to ./. 
for other sex calls## - ##sex anueplodies ## - - if [ $(awk '{if ($5!=2 && $5!=1) print $2}' $famfile|wc -l) -gt 0 ] - then - awk '{if ($5!=2 && $5!=1) print $2}' $famfile>other.txt - ${BCFTOOLS} view cleaninfo.vcf.gz -S other.txt -r chrX:1-1000000000,chrY:1-1000000000,X:1-1000000000,Y:1-1000000000 --no-update|bgzip>other.vcf.gz - ${BCFTOOLS} index other.vcf.gz - - zcat other.vcf.gz\ - |awk -F'\t' '{if ($1!~"#") print $0 "\t" "ENDOFLINE"}' \ - |tr '\t' '\n' \ - |awk -F':' '{ if (NF>4 && $1!="GT" ) $1="./.";print}' OFS=":" \ - |tr '\n' '\t' \ - |sed 's/ENDOFLINE/\n/g' \ - |sed -e 's/^[ \t]*//' \ - |sed -e 's/[\t]$//g' \ - |bgzip \ - >other.revise.txt.gz - - cat <(zcat other.vcf.gz \ - |fgrep -wvf <(zcat other.revise.txt.gz|awk '{print $3}' )) \ - <(zcat other.revise.txt.gz) \ - |vcf-sort \ - |bgzip \ - >cleanother.vcf.gz - - ${BCFTOOLS} index cleanother.vcf.gz - - cat <(zcat cleanmale.vcf.gz|egrep "##") \ - <(paste <(zcat cleanmale.vcf.gz|egrep -v "##") <(zcat cleanfemale.vcf.gz|cut -f10-|egrep -v "##") <(zcat cleanother.vcf.gz|cut -f10-|egrep -v "##") ) \ - |bgzip \ - >combinedsex.vcf.gz - -else - cat <(zcat cleanmale.vcf.gz|egrep "##") \ - <(paste <(zcat cleanmale.vcf.gz|egrep -v "##") <(zcat cleanfemale.vcf.gz|cut -f10-|egrep -v "##")) \ - |bgzip \ - >combinedsex.vcf.gz -fi - - - - tabix -p vcf combinedsex.vcf.gz - -zcat combinedsex.vcf.gz|awk '{if ($1!~"#") print $3}'>modified.ids.txt - -##shuffle sex ids backinto place to match original vcf and back to initial vcf## - vcf-shuffle-cols -t cleaninfo.vcf.gz combinedsex.vcf.gz \ - |awk '{if ($1!~"#") print}' \ - |cat <(zcat cleaninfo.vcf.gz|fgrep -wvf modified.ids.txt ) - \ - |vcf-sort \ - |bgzip \ - >cleanallo.vcf.gz - -else - cp cleaninfo.vcf.gz cleanallo.vcf.gz - echo "">sexchr.revise.txt -fi - -# the code below will not print any lines if the background list file is empty, so add a dummy sentinel record at the end -cat $backgroundlist <(echo "XXX_SENTINEL_XXX") > background_list_with_sentinel.list - -##change 
tag for SR background failures and Unresolved## -zcat cleanallo.vcf.gz\ - |awk 'NR==FNR{inFileA[$1]; next} {if ($3 in inFileA && $1!~"#") $7=$7";HIGH_SR_BACKGROUND"; print }' OFS='\t' <(awk '{print $NF}' background_list_with_sentinel.list) - \ - |awk '{if (NR==2) print $0 "\n" "##FILTER=" ;else print}' \ - |awk '{if ($8~"UNRESOLVED") $7=$7";UNRESOLVED";print}' OFS='\t' \ - |awk '{if (NR==2) print $0 "\n" "##FILTER=" ;else print}' \ - |bgzip \ - >int.vcf.gz diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b.sh b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b.sh deleted file mode 100755 index 40df71ea7..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b.sh +++ /dev/null @@ -1,166 +0,0 @@ -#!/bin/bash -# -# clean_vcf_part1b.sh -# - -set -euxo pipefail - -##gzipped vcf from clean vcf part1.sh## -int_vcf_gz=$1 - -##Remove CNVs that are improperly genotyped by depth because they are nested within a real CNV## - -##Determine columns of VCF after header## -zcat $int_vcf_gz\ - |sed -n '1,1000p'\ - |egrep ^# \ - |tail -n 1 \ - |tr '\t' '\n' \ - |cat -n - \ - >col.txt - -##Only affects CNV so pull those out## -zcat $int_vcf_gz \ - |awk '{if ($5~"DEL" || $5~"DUP" || $1~"#") print}' \ - |svtk vcf2bed stdin stdout \ - |awk -F"\t" '{if ($6=="") print $6="blanksample";print $0}' OFS='\t' \ - |gzip>int.bed.gz - -##list of potenital overlaps with a normal copy state variant (>5kb variants require depth but nested events could be missed; i.e a duplication with a nest deletion will have a normal copy state for the deletion)## -##flip bed intersect so largest is CNV is always first## -bedtools intersect -wa -wb -a <(zcat int.bed.gz|awk '{if ($3-$2>=5000 ) print}') \ --b <(zcat int.bed.gz|awk '{if ($3-$2>=5000) print}') \ - |awk -F'\t' '{if ($4!=$10 && $3-$2>=$9-$8 && $5!=$11) print ;\ - else if ($4!=$10 && $5!=$11) print $7,$8,$9,$10,$11,$12,$1,$2,$3,$4,$5,$6}' OFS='\t' \ - |awk -F'\t' '{if ($6!="blanksample") print}' 
\ - |sort -u \ - >normaloverlap.txt - - -##pull out the depth based copy number variant for each normal overlapping variant## -{ cat <(zcat $int_vcf_gz|awk -F"\t" '{if ($1~"#") print}') \ - <(awk '{print $4 "\n" $10}' normaloverlap.txt|sort -u|fgrep -wf - <(zcat $int_vcf_gz)) || true; }\ - |awk '{if ($1!~"#") $1=$3;print}' OFS="\t" \ - |awk '{if ($1~"#" || $5=="" || $5=="") print}' \ - |vcftools --vcf - --stdout --extract-FORMAT-info RD_CN \ - |awk -F"\t" 'NR==1{for (i=3;i<=NF;i++) header[i]=$i} NR>1{for(j=3;j<=NF;j++) print $1"@"header[j] "\t" $j }' \ - |sort -k1,1 \ - |gzip \ - >RD_CN.normalcheck.FORMAT.gz - - -##pull out evidence supporting each normal overlapping variant## -{ cat <(zcat $int_vcf_gz|awk -F"\t" '{if ($1~"#") print}') \ - <(awk '{print $4 "\n" $10}' normaloverlap.txt|sort -u|fgrep -wf - <(zcat $int_vcf_gz)) || true; }\ - |awk '{if ($1!~"#") $1=$3;print}' OFS="\t"\ - |vcftools --vcf - --stdout --extract-FORMAT-info EV \ - |awk -F"\t" 'NR==1{for (i=3;i<=NF;i++) header[i]=$i} NR>1{for(j=3;j<=NF;j++) print $1"@"header[j] "\t" $j }' \ - |sort -k1,1 \ - |gzip \ - >EV.normalcheck.FORMAT.gz - - -##check if nested is incorrectly classified as normal## -touch overlap.test.txt -while read bed -do - echo $bed|tr ' ' '\t'|cut -f1-6 >large.bed - echo $bed|tr ' ' '\t'|cut -f7-12>small.bed - ##require at least 50% coverage to consider a variant overlapping## - overlap=$(bedtools coverage -a small.bed -b large.bed|awk '{if ($NF>=0.50) print "YES";else print "NO"}') - - if [ "$overlap" == "YES" ] - then - smallid=$(awk '{print $4}' small.bed) - - ##pull out variants that are called a variants for both the smaller and larger CNVs (don't have normal copy state to check for)## - if [ $(awk '{print $NF}' small.bed \ - |tr ',' '\n' \ - |fgrep -wvf - <(awk -F"[,\t]" -v var=$smallid '{for(i=6;i<=NF;i++) print var"@"$i "\t" $4"@"$i "\t" $5}' large.bed)|wc -l) -gt 0 ] - then - awk '{print $NF}' small.bed \ - |tr ',' '\n' \ - |fgrep -wvf - <(awk -F"[,\t]" -v var=$smallid 
'{for(i=6;i<=NF;i++) print var"@"$i "\t" $4"@"$i "\t" $5}' large.bed) \ - >>overlap.test.txt - fi - fi -donegeno.normal.revise.txt - -##Update genotypes## -{ zfgrep -wf <(awk '{print $1}' geno.normal.revise.txt|sort -u) $int_vcf_gz || true; }\ - |bgzip \ - >subset.vcf.gz || true - -##pull out and revise vcf line that needs to be edited## -while read variant -do - - echo $variant - #note no longer change depth from id.txt (column 2)## - { fgrep $variant geno.normal.revise.txt || true; }|awk '{print $2 "\t" $3}'>id.txt - zcat subset.vcf.gz |{ fgrep -w $variant || true; }>line.txt - - cat line.txt \ - |tr '\t' '\n' \ - |paste col.txt - \ - |tr ':' '\t' \ - |awk 'NR==FNR{inFileA[$1]=$2; next} {if ($2 in inFileA ) $3="0/1"; print }' OFS='\t' id.txt - \ - |awk 'NR==FNR{inFileA[$1]=$2; next} {if ($2 in inFileA ) $4=$6; print }' OFS='\t' id.txt - \ - |cut -f3-|tr '\t' ':' \ - |tr '\n' '\t' \ - |awk '{print $0}' \ - >>normal.revise.vcf.lines.txt - -done< <(awk '{print $1}' geno.normal.revise.txt|sort -u) - - -##rewrite vcf with updated genotypes## - -cat <(zcat $int_vcf_gz|fgrep -wvf <(awk '{print $3}' normal.revise.vcf.lines.txt|sort -u)) \ - <(sed 's/\t$//' normal.revise.vcf.lines.txt) \ - |vcf-sort \ - |bgzip \ - >normal.revise.vcf.gz || true - - bcftools index normal.revise.vcf.gz - -##get copy state per variant## -zcat normal.revise.vcf.gz \ - |awk '{if ($1!~"#") $1=$3;print}' OFS="\t" \ - |vcftools --vcf - --stdout --extract-FORMAT-info RD_CN \ - |gzip \ - >copystate.RD_CN.FORMAT.gz - -##get copy state per variant## -zcat copystate.RD_CN.FORMAT.gz \ - |awk 'NR>1{for(i=3;i<=NF;i++) lines[$1 "\t" $i]++ } END{for (x in lines) print x}' \ - |gzip \ - >copystate.per.variant.txt.gz - -##Find multi-allelic for del or dup ; CNV >1kb we trust depth ## -##del## -zcat copystate.per.variant.txt.gz \ - |awk '{if ($2!="." 
&& $2>3) print $1}' \ - |sort -u \ - |fgrep -wf <(zcat int.bed.gz|awk -F"\t" '{if ($5=="DEL" && $3-$2>=1000) print $4}' ) \ - >multi.cnvs.txt || true - -##dup## -zcat copystate.per.variant.txt.gz \ - |awk '{if ($2!="." && ($2<1 || $2>4)) print $1}' \ - |sort -u \ - |fgrep -wf <(zcat int.bed.gz|awk -F"\t" '{if ($5=="DUP" && $3-$2>=1000) print $4}' ) \ - >>multi.cnvs.txt || true diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_build_dict.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_build_dict.py new file mode 100644 index 000000000..b7da153cb --- /dev/null +++ b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_build_dict.py @@ -0,0 +1,154 @@ +""" +Remove CNVs that are improperly genotyped by depth because they are nested +within a real CNV +""" + +import logging +import pybedtools +import pysam +import sys +import json + +from collections import defaultdict + +SVTYPE = "SVTYPE" +BLANK_SAMPLES = "blanksample" + + +class SVType: + DUP = "DUP" + DEL = "DEL" + + +class VariantFormatTypes: + # Predicted copy state + RD_CN = "RD_CN" + # Classes of evidence supporting final genotype + EV = "EV" + + +class VCFReviser: + def __init__(self): + self.rd_cn = {} + self.sample_indices_dict = {} + self.sample_list = [] + + def _update_rd_cn(self, variant, sample_indices): + self.rd_cn[variant.id] = {s: variant.samples[s][VariantFormatTypes.RD_CN] for s in sample_indices} + + @staticmethod + def get_wider(f): + # f[1] : first interval start + # f[2] : first interval end + # f[7] : second interval start + # f[8] : second interval end + if int(f[2]) - int(f[1]) >= int(f[8]) - int(f[7]): + return f[0:6], f[6:12] + else: + return f[6:12], f[0:6] + + @staticmethod + def get_coverage(wider, narrower): + n_start = int(narrower[1]) + n_stop = int(narrower[2]) + w_start = int(wider[1]) + w_stop = int(wider[2]) + + coverage = 0 + if w_start <= n_stop and n_start <= w_stop: + intersection_size = min(n_stop, w_stop) - max(n_start, 
w_start) + coverage = intersection_size / (n_stop - n_start) + return coverage + + def get_geno_normal_revise(self, vcf_file, bed_file): + overlap_test_text = defaultdict(dict) + with pysam.VariantFile(vcf_file, "r") as f: + header = f.header + i = -1 + for sample in header.samples: + i += 1 + self.sample_indices_dict[sample] = i + self.sample_list.append(sample) + + logging.info("Filtering intersect results") + bed = pybedtools.BedTool(bed_file) + for interval in bed.intervals: + wider, narrower = self.get_wider(interval.fields) + # wider and narrower are lists/tuples with the following fields: + # [0] : contig + # [1] : start position + # [2] : end position + # [3] : variant ID + # [4] : SV type + # [5] : comma-delimited sample lists, or BLANK_SAMPLES if none + if wider[5] == BLANK_SAMPLES: + continue + + coverage = self.get_coverage(wider, narrower) + if coverage >= 0.5: + wider_samples = set(wider[5].split(",")) + narrower_samples = set(narrower[5].split(",")) + non_common_samples = [self.sample_indices_dict[s] for s in wider_samples - narrower_samples] + for x in non_common_samples: + vid = narrower[3] + overlap_test_text[vid][x] = (wider[3], wider[4]) + + # Determine for which vid/sample pairs we need RD_CN + # Substantially reduces memory + logging.info('Getting revised variant IDs') + revise_vids = defaultdict(set) + for var_id, samples_dict in overlap_test_text.items(): + for sample_index, v in samples_dict.items(): + # v[0] : variant ID + # v[1] : SV type + if v[1] == SVType.DUP or v[1] == SVType.DEL: + revise_vids[var_id].add(sample_index) + revise_vids[v[0]].add(sample_index) + + logging.info('Getting RD_CN/EV') + for variant in f: + if variant.id in revise_vids: + sample_indices = revise_vids[variant.id] + self._update_rd_cn(variant, sample_indices) + + logging.info('Generating geno_normal_revise_dict') + geno_normal_revise_dict = {} + for var_id, samples_dict in overlap_test_text.items(): + for sample_index, v in samples_dict.items(): + # v[0] : 
variant ID + # v[1] : SV type + new_val = None + if sample_index not in revise_vids[v[0]]: + sys.stderr.write("{} {}\n".format(sample_index, v[0])) + if v[1] == SVType.DUP and \ + self.rd_cn[var_id][sample_index] == 2 and \ + self.rd_cn[v[0]][sample_index] == 3: + new_val = 1 + elif v[1] == SVType.DEL and \ + self.rd_cn[var_id][sample_index] == 2 \ + and self.rd_cn[v[0]][sample_index] == 1: + new_val = 3 + + if new_val: + if var_id not in geno_normal_revise_dict: + geno_normal_revise_dict[var_id] = {} + sample_id = self.sample_list[sample_index] + geno_normal_revise_dict[var_id][sample_id] = new_val + + return geno_normal_revise_dict + + +def main(args): + logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) + logging.info('Starting script') + reviser = VCFReviser() + filtered_vcf = args[1] + intersected_bed = args[2] + geno_normal_revise_dict = reviser.get_geno_normal_revise(filtered_vcf, intersected_bed) + logging.info('Dumping dictionary') + sys.stdout.write(json.dumps(geno_normal_revise_dict)) + logging.info('Done') + + +if __name__ == '__main__': + main(sys.argv) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_filter.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_filter.py new file mode 100644 index 000000000..e63b890cd --- /dev/null +++ b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_filter.py @@ -0,0 +1,82 @@ +""" +Remove CNVs that are improperly genotyped by depth because they are nested +within a real CNV +""" + +import os +import logging +import pysam +import sys +from pathlib import Path +import json +import gzip + +SVTYPE = "SVTYPE" +BLANK_SAMPLES = "B" + + +class SVType: + DUP = "DUP" + DEL = "DEL" + + +class VariantFormatTypes: + # Predicted copy state + RD_CN = "RD_CN" + # Classes of evidence supporting final genotype + EV = "EV" + + +def modify_variants(dict_file_gz, vcf, multi_cnvs): + logging.info('Loading dictionary') + with gzip.open(dict_file_gz, 'rt') as 
f: + geno_normal_revise_dict = json.load(f) + + logging.info('Filtering variants') + with pysam.VariantFile(vcf, "r") as f_in: + header = f_in.header + sys.stdout.write(str(header)) + with open(multi_cnvs, "w") as multi_cnvs_f: + variants = f_in.fetch() + for variant in variants: + if variant.id in geno_normal_revise_dict: + for sample_id in geno_normal_revise_dict[variant.id]: + o = variant.samples[sample_id] + o.update({"GT": (0, 1)}) + o.update({"GQ": o["RD_GQ"]}) + + if variant.stop - variant.start >= 1000: + if variant.info[SVTYPE] in [SVType.DEL, SVType.DUP]: + is_del = variant.info[SVTYPE] == SVType.DEL + for k, v in variant.samples.items(): + rd_cn = v[VariantFormatTypes.RD_CN] + if rd_cn is None: + continue + if (is_del and rd_cn > 3) or \ + (not is_del and (rd_cn < 1 or rd_cn > 4)): + multi_cnvs_f.write(variant.id + "\n") + break + + sys.stdout.write(str(variant)) + + +def ensure_file(filename): + filename = os.path.join(".", filename) + filename = Path(filename) + if filename.exists(): + os.remove(filename) + return filename.name + + +def main(args): + logging.basicConfig(format='%(asctime)s - %(message)s', level=logging.INFO) + logging.info('Starting script') + multi_cnvs_filename = ensure_file("multi.cnvs.txt") + dict_file_gz = args[1] + vcf_file = args[2] + modify_variants(dict_file_gz, vcf_file, multi_cnvs_filename) + logging.info('Done') + + +if __name__ == '__main__': + main(sys.argv) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part3.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part3.py new file mode 100644 index 000000000..86e869e46 --- /dev/null +++ b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part3.py @@ -0,0 +1,58 @@ +#!/bin/python + +import argparse +from collections import defaultdict +from os import mkdir, path + + +def count_variants(infile): + variant_counts = defaultdict(int) + with open(infile, 'r') as IN: + for line in IN: + var_id = line.strip().split('\t')[0] + 
variant_counts[var_id] += 1 + return dict(sorted(variant_counts.items(), key=lambda item: item[1], reverse=True)) + + +def assign_shards(variant_counts, max_samples): + shard_assignments = {} + shard_number = 0 + sample_counter = 0 + first = True + for variant in variant_counts.keys(): + if not first and (sample_counter + variant_counts[variant] > max_samples): + shard_number += 1 + sample_counter = 0 + shard_assignments[variant] = shard_number + sample_counter += variant_counts[variant] + first = False + return shard_number, shard_assignments + + +def create_shards(infile, shard_assignments, num_shards): + if not path.isdir("./shards"): + mkdir("./shards") + with open(infile, 'r') as IN: + for line in IN: + var_id = line.strip().split('\t')[0] + shard = shard_assignments[var_id] + shard_file = f"shards/out.{shard}_{num_shards}.txt" + with open(shard_file, 'a') as OUT: + OUT.write(line) + + +def main(): + parser = argparse.ArgumentParser() + parser.add_argument("combined_file", help="rd_cn_revise file with variant ID, sample ID, and CN columns") + parser.add_argument("-s", "--max-samples", + help="Maximum number of variant x sample entries in a shard (default = 7,000)", + default=7000, type=int) + args = parser.parse_args() + + variant_counts = count_variants(args.combined_file) + num_shards, shard_assignments = assign_shards(variant_counts, args.max_samples) + create_shards(args.combined_file, shard_assignments, num_shards) + + +if __name__ == "__main__": + main() diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part3.sh b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part3.sh deleted file mode 100755 index d2defd28b..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part3.sh +++ /dev/null @@ -1,52 +0,0 @@ -#!/bin/bash -# -# clean_VCF_part3.sh -# - -set -euo pipefail - -combined_file=$1 - -awk '{print $1}' $combined_file \ - | sort \ - | uniq -c \ - | sort -nrk1,1 \ - > variant.count.txt - -final=0 -prev=0 -var=0 
- -while read line -do - i=$(echo $line|awk -v prev=$prev '{print $1+prev}' ) - let "var=$var+1" - if [ $i -gt 5000 ] || [ $var -gt 100 ] - then - final=$(echo $final|awk '{print $1+1}') - prev=0 - var=0 - else - prev=$i - fi -done < variant.count.txt - -j=0 -prev=0 -mkdir shards - -while read line -do - i=$(echo $line|awk -v prev=$prev '{print $1+prev}' ) - let "var=$var+1" - if [ $i -gt 5000 ] || [ $var -gt 100 ] - then - j=$(echo $j|awk '{print $1+1}') - prev=0 - var=0 - else - prev=$i - fi - out=$(echo $j"_"$final) - echo $line|awk '{print $2}'|fgrep -wf - $combined_file >>shards/out.$out.txt || true -done < variant.count.txt diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part4.sh b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part4.sh deleted file mode 100755 index 0b29ae8cc..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part4.sh +++ /dev/null @@ -1,132 +0,0 @@ -#!/bin/bash -# -# clean_VCF_part4.sh -# - -set -euxo pipefail - -##gzipped combined bed file## -##combined output file from clean_vcf_part2.sh## -RD_CN_revise_forgeno=$1 -normal_revise_vcf=$2 - - -##seed the vcf lines file which will provide the revisions to vcf file## -echo "">revise.vcf.lines.txt - - -##reduce vcf to lines for given shard## - cat <(zcat $normal_revise_vcf|sed -n '1,1000p' |egrep ^# ) \ - <(zcat $normal_revise_vcf |fgrep -wf <(awk '{print $1}' $RD_CN_revise_forgeno|sort -u)) \ - |bgzip \ - >int.vcf.gz || true - - -##get column ids## -zcat $normal_revise_vcf \ - |sed -n '1,1000p' \ - |egrep ^# \ - |tail -n 1 \ - |tr '\t' '\n' \ - |cat -n - \ - >col.txt - - -##pull out and revise vcf line that needs to be edited## -while read line -do - id=$(echo $line|awk '{print $2}' ) - col=$(awk -v id=$id '{if($2==id) print $1}' col.txt) - variant=$(echo $line|awk '{print $1}') - cn=$(echo $line|awk '{print $3}') - - zcat int.vcf.gz \ - |{ fgrep -w $variant || true; } \ - >line.txt - - echo $variant $id - ##Updated genotype and rebuild 
Format field ## - GT=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $1}') - GQ=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $2}') - RD_CN=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $3}') - RD_GQ=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $4}') - PE_GT=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $5}') - PE_GQ=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $6}') - SR_GT=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $7}') - SR_GQ=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $8}') - EV=$(awk -v col=$col '{print $col}' line.txt|awk -F":" '{print $9}') - - if [ $(cat revise.vcf.lines.txt|fgrep -w $variant|wc -l) -gt 0 ] - then - cat revise.vcf.lines.txt \ - |awk -v col=$col -v var=$variant -v GT=$GT -v GQ=$GQ -v RD_CN=$cn -v RD_GQ=$RD_GQ -v PE_GT=$PE_GT -v PE_GQ=$PE_GQ -v SR_GT=$SR_GT -v SR_GQ=$SR_GQ -v EV=$EV '{if ($3==var ) $col="0/1:"GQ":"RD_CN":"RD_GQ":"PE_GT":"PE_GQ":"SR_GT":"SR_GQ":"EV ;print}' \ - >int.lines.txt - - cat int.lines.txt > revise.vcf.lines.txt - - else - cat line.txt \ - |awk -v col=$col -v var=$variant -v GT=$GT -v GQ=$GQ -v RD_CN=$cn -v RD_GQ=$RD_GQ -v PE_GT=$PE_GT -v PE_GQ=$PE_GQ -v SR_GT=$SR_GT -v SR_GQ=$SR_GQ -v EV=$EV '{if ($3==var ) $col="0/1:"GQ":"RD_CN":"RD_GQ":"PE_GT":"PE_GQ":"SR_GT":"SR_GQ":"EV ;print}' \ - >>revise.vcf.lines.txt - fi - -done<$RD_CN_revise_forgeno - - -bgzip revise.vcf.lines.txt - - -##get multilallelic genotypes## -##pull out lines for normal vcf for given batch## -total_lines=$(zcat $normal_revise_vcf|egrep -v "^#"|wc -l) -batch=$(ls $RD_CN_revise_forgeno|awk -F'/' '{print $NF}'|awk -F'[._]' '{print $2}'|awk '{if ($1==0) print 1; else print}') -total_batch=$(ls $RD_CN_revise_forgeno|awk -F'/' '{print $NF}'|awk -F'[._]' '{print $3}'|awk '{if ($1==0) print 1; else print}') - -segments=$(echo $total_batch $total_lines|awk '{print $2/$1}') - - cat <(zcat $normal_revise_vcf|sed -n '1,1000p' |egrep ^# ) \ - 
<(zcat $normal_revise_vcf |egrep -v "^#"|awk -v batch=$batch -v segments=$segments '{if (NR<=batch*segments && NR>=((batch-1)*segments) ) print }') \ - |bgzip \ - >split.vcf.gz - - -for var in PE_GT SR_GT PE_GQ SR_GQ -do - zcat split.vcf.gz\ - |awk -F'\t' '{if ($1!~"#") $1=$3;print}' OFS="\t" \ - |vcftools --vcf - --stdout --extract-FORMAT-info ${var} \ - |awk -F"\t" 'NR==1{for (i=3;i<=NF;i++) header[i]=$i} NR>1{for(j=3;j<=NF;j++) print $1"@"header[j] "\t" $j }' \ - |sort -k1,1 \ - |gzip \ - >multicheck.${var}.FORMAT.gz -done - -##concatenate metrics## -# get each line formatted as SITE@SAMPLE PE_GT PE_GQ SR_GT SR_GQ -join -j 1 <(zcat multicheck.PE_GT.FORMAT.gz) \ - <(zcat multicheck.PE_GQ.FORMAT.gz) \ - |join -j 1 - <(zcat multicheck.SR_GT.FORMAT.gz) \ - |join -j 1 - <(zcat multicheck.SR_GQ.FORMAT.gz) \ - |tr ' ' '\t' \ - |gzip \ - >multi.combined.format.gz - -# Set the maximum allowable number of samples with a PE or SR GT > 3 to be 1% or 2, whichever is greater -vf_1=$(zcat split.vcf.gz |sed -n '1,1000p' |egrep -v "^##"|cut -f10-|awk 'NR==1{print (NF) * 0.01}' |awk '{if ($1 <= 2) {$1 = 2}; print $1}') - -# Choose the best of PE and SR genotypes for each site / sample -# Count the number of samples with a GT over 3 for each site -# Add site IDs with sample counts over $vf_1 to the multi.geno.ids.txt.gz file -zcat multi.combined.format.gz \ - |awk '{if ($2>0 && $4==0) print $1"\t" $2; \ - else if ($2==0) print $1 "\t" $4; \ - else if ($3>=$5)print $1"\t" $2; \ - else print $1"\t" $4 }' \ - |tr '@' '\t' \ - |awk '{if ($3>2 && $2!=".") print $1}' \ - |sort \ - |uniq -c \ - |awk -v vf_1=$vf_1 '{if ($1>vf_1)print $2}' \ - |gzip \ - >multi.geno.ids.txt.gz - diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5.sh b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5.sh deleted file mode 100755 index 3704e086d..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5.sh +++ /dev/null @@ -1,332 +0,0 @@ -#!/bin/bash -# 
-# clean_VCF_part5.sh -# - -set -euo pipefail - -##gzipped combined bed file## -##combined output file from clean_vcf_part2.sh## -revise_vcf_lines=$1 -normal_revise_vcf=$2 -famfile=$3 -sexchr_revise=$4 -multi_geno_ids_txt=$5 -outliers_samples_list=$6 - -# use BCFTOOLS 1.9 -BCFTOOLS=/usr/local/bin/bcftools - -cat <(zcat $normal_revise_vcf|fgrep -wvf <(zcat $revise_vcf_lines|awk '{if ($1!="") print $3}'|sort -u)) \ - <(zcat $revise_vcf_lines|awk '{if ($1!="") print}' |tr ' ' '\t') \ - |vcf-sort \ - |bgzip \ - >overlap.revise.vcf.gz || true - -##create bed of VCF## -svtk vcf2bed overlap.revise.vcf.gz stdout|gzip> overlap.revise.bed.gz - -##multi check## -zcat overlap.revise.vcf.gz \ - |awk '{if ($1!~"#") $1=$3;print}' OFS="\t" \ - |vcftools --vcf - --remove $outliers_samples_list --stdout --extract-FORMAT-info RD_CN \ - |gzip \ - >copystate.RD_CN.FORMAT.gz - -zcat overlap.revise.vcf.gz \ - |awk '{if ($1!~"#") $1=$3;print}' OFS="\t" \ - |vcftools --vcf - --remove $outliers_samples_list --stdout --extract-FORMAT-info GT \ - |gzip \ - >genotype.gt.FORMAT.gz - -##New method for determining copy state based on >1% of people having an multi-allelic copy state as define above## -vf_1=$(zcat copystate.RD_CN.FORMAT.gz|awk 'NR==1{print (NF-2) * 0.01}'|awk '{if ($1<=1) print 2; else print }' ) - -zcat copystate.RD_CN.FORMAT.gz \ - |fgrep -wf <(zcat overlap.revise.bed.gz|awk -F"\t" '{if ($5=="DEL" && $3-$2>=1000) print $4}' ) \ - |awk 'NR>1{for(i=3;i<=NF;i++) if ($i!="." && $i>3) print $1 }' \ - |sort \ - |uniq -c \ - |awk -v vf_1=$vf_1 '{if ($1>vf_1)print $2}' \ - |gzip \ - >multi.del.ids.txt.gz || true - -zcat copystate.RD_CN.FORMAT.gz \ - |fgrep -wf <(zcat overlap.revise.bed.gz|awk -F"\t" '{if ($5=="DUP" && $3-$2>=1000) print $4}' ) \ - |awk 'NR>1{for(i=3;i<=NF;i++) if ($i!="." 
&& $i>4) print $1 }' \ - |sort \ - |uniq -c \ - |awk -v vf_1=$vf_1 '{if ($1>vf_1)print $2}' \ - >multi.dup.ids.txt || true -##Case with CN 0,1,2,3,4## -zcat copystate.RD_CN.FORMAT.gz \ - |fgrep -wf <(zcat overlap.revise.bed.gz \ - |awk -F"\t" '{if ($5=="DUP" && $3-$2>=1000) print $4}') \ - |awk 'NR>1{for(i=3;i<=NF;i++) if ($i!="." && ($i<1 || $i>4)) print $1 "\t" $i }'\ - |sort -u \ - |awk '{print $1}' \ - |sort \ - |uniq -c \ - |awk '{if ($1>4) print $2}'>gt4copystate.txt ||true -zcat copystate.RD_CN.FORMAT.gz \ - |fgrep -wf <(zcat overlap.revise.bed.gz|awk -F"\t" '{if ($5=="DUP" && $3-$2>=1000) print $4}' ) \ - |awk 'NR>1{for(i=3;i<=NF;i++) if ($i!="." && ($i<1 || $i>4)) print $1 }' \ - |sort \ - |uniq -c \ - |fgrep -wf gt4copystate.txt \ - |awk -v vf_1=$vf_1 '{if ($1>vf_1)print $2}' \ - >>multi.dup.ids.txt || true -sort -u multi.dup.ids.txt|gzip >multi.dup.ids.txt.gz||true - -##Regenotype to determine multiallelic; we just change copy state for some nested variants and we need to make sure we get proper genotype for these; also previous stages have different notaion for multiallelic and we need to make this uniform; this is a CN based regenotyping so restricted to >5kb ## -##Genotype big dup## -svtk vcf2bed overlap.revise.vcf.gz stdout \ - |gzip>regeno.bed.gz - -##add variants that are <5kb because clustering but have a mutliallelic genotype from before## -zcat genotype.gt.FORMAT.gz \ - |awk '{if ($1~"DUP") print}' \ - |awk '{for (i = 3; i <= NF; ++i) print $1 "\t" $i}' \ - |awk '{if ($2!="1/1" && $2!="0/0" && $2!="0/1" && $2!="./.") print $1}' \ - |fgrep -wvf <(zcat multi.dup.ids.txt.gz) \ - |sort -u>gt5kb.dup.ids.txt || true - -zcat genotype.gt.FORMAT.gz \ - |awk '{if ($1~"DEL") print}' \ - |awk '{for (i = 3; i <= NF; ++i) print $1 "\t" $i}' \ - |awk '{if ($2!="1/1" && $2!="0/0" && $2!="0/1" && $2!="./.") print $1}' \ - |fgrep -wvf <(zcat multi.del.ids.txt.gz) \ - |sort -u>gt5kb.del.ids.txt || true - -##generate list## -##CNV >5kb, split del and dup ## -if [ 
-f multi.dup.ids.txt.gz ] -then - zcat regeno.bed.gz \ - |awk '{if ($3-$2>=5000 && $5=="DUP")print $4}' \ - |fgrep -wvf <(zcat multi.dup.ids.txt.gz) \ - >>gt5kb.dup.ids.txt || true -else - zcat regeno.bed.gz \ - |awk '{if ($3-$2>=5000 && $5=="DUP")print $4}' \ - >>gt5kb.dup.ids.txt -fi - -if [ -f multi.del.ids.txt.gz ] -then - zcat regeno.bed.gz \ - |awk '{if ($3-$2>=5000 && $5=="DEL")print $4}' \ - |fgrep -wvf <(zcat multi.del.ids.txt.gz) \ - >>gt5kb.del.ids.txt || true -else - zcat regeno.bed.gz \ - |awk '{if ($3-$2>=5000 && $5=="DEL")print $4}' \ - >>gt5kb.del.ids.txt -fi - - -zcat overlap.revise.vcf.gz \ - |fgrep -wf gt5kb.dup.ids.txt \ - >>dup.int.txt || true - -zcat overlap.revise.vcf.gz \ - |fgrep -wf gt5kb.del.ids.txt \ - >>del.int.txt || true - -##regenotype VCF## -dellen=$(cat del.int.txt|wc -l) -columnlen=$(less del.int.txt|cut -f10-|tr '\t' '\n' |wc -l) -dellenchange=$(echo $dellen $columnlen|awk '{if ($1 == 0) { print "0" } else { print $2/$1}}') - -paste <(less del.int.txt|cut -f1-9) <(less del.int.txt|cut -f10-|tr '\t' '\n' \ - |awk -F':' '{if ($3>=2 && $1!="./.") $1="0/0"; \ - else if ($3==1 && $1!="./.") $1="0/1"; \ - else if ($1!="./.")$1="1/1";print}' OFS=":" \ - |awk -v lenchange=$dellenchange 'NR%lenchange {printf("%s\t", $0); next} \ - {print $0}')>del.revise.txt - -duplen=$(cat dup.int.txt|wc -l) -columnlen=$(less dup.int.txt|cut -f10-|tr '\t' '\n' |wc -l) -duplenchange=$(echo $duplen $columnlen|awk '{if ($1 == 0) { print "0" } else { print $2/$1}}') - - -paste <(less dup.int.txt|cut -f1-9) <(less dup.int.txt|cut -f10-|tr '\t' '\n' \ - |awk -F':' '{if ($3<=2 && $1!="./.") $1="0/0"; \ - else if ($3==3 && $1!="./.") $1="0/1"; \ - else if ($1!="./.") $1="1/1";print}' OFS=":" \ - |awk -v lenchange=$duplenchange 'NR%lenchange {printf("%s\t", $0); next} \ - {print $0}') >dup.revise.txt - - -cat <(zcat overlap.revise.vcf.gz|fgrep -wvf <(cat gt5kb.dup.ids.txt gt5kb.del.ids.txt)) \ - <(cat dup.revise.txt del.revise.txt) \ - |vcf-sort \ - |bgzip \ - 
>newdepth.geno.vcf.gz || true - - -##Tag multi## -##Add filters to header## -zcat newdepth.geno.vcf.gz \ - |awk -F'\t' 'NR==FNR{inFileA[$1]; next} {if ($3 in inFileA && $1!~"#" && $7!~"PESR_GT_OVERDISPERSION") $7=$7";PESR_GT_OVERDISPERSION"; print }' OFS='\t' <(cat <(zcat $multi_geno_ids_txt) <(printf "\n")) - \ - |awk -F'\t' 'NR==FNR{inFileA[$1]; next} {if ($3 in inFileA && $1!~"#") $7=$7";MULTIALLELIC"; print }' OFS='\t' \ - <(cat <(zcat multi.del.ids.txt.gz multi.dup.ids.txt.gz |sort -u) <(printf "\n")) - \ - |sed 's\PASS;\\g' \ - |awk '{if (NR==2) print $0 "\n" "##FILTER=" ;else print}' \ - |awk '{if (NR==2) print $0 "\n" "##FILTER=" ;else print}' \ - |bgzip \ - >multitagged.vcf.gz -tabix multitagged.vcf.gz - -touch all.multi.revised.list - -touch dup.multi.revise.vcf -if [ $(zcat multi.dup.ids.txt.gz|wc -l) -ge 1 ] -then - /opt/sv-pipeline/04_variant_resolution/scripts/reset_multiallelic_format_fields.py multitagged.vcf.gz <(zcat multi.dup.ids.txt.gz) > dup.multi.revise.vcf - ${BCFTOOLS} query -f '%ID\n' dup.multi.revise.vcf >> all.multi.revised.list -fi - -touch del.multi.revise.vcf -if [ $(zcat multi.del.ids.txt.gz|wc -l) -ge 1 ] -then - /opt/sv-pipeline/04_variant_resolution/scripts/reset_multiallelic_format_fields.py multitagged.vcf.gz <(zcat multi.del.ids.txt.gz) > del.multi.revise.vcf - ${BCFTOOLS} query -f '%ID\n' del.multi.revise.vcf >> all.multi.revised.list -fi - -# make sure that the new header includes CN and CNQ format fields if we set any -if [ -s dup.multi.revise.vcf ] -then - grep '^#' dup.multi.revise.vcf > new_header.vcf -elif [ -s del.multi.revise.vcf ] -then - grep '^#' del.multi.revise.vcf > new_header.vcf -else - zcat multitagged.vcf.gz | grep '^#' > new_header.vcf -fi - -# combine the revised variants with the unrevised variants, reheader, resort, and compress - cat <(zcat multitagged.vcf.gz| \ - fgrep -wvf all.multi.revised.list) \ - <(cat del.multi.revise.vcf dup.multi.revise.vcf \ - | grep -v '^#' \ - |awk '!seen[$3]++') \ - 
|${BCFTOOLS} reheader -h new_header.vcf \ - |vcf-sort \ - |bgzip \ - >multitagged.geno.vcf.gz || true - -##remove overlapping multi### -zcat multitagged.vcf.gz \ - |awk -F'\t' '{if ($1~"#" || ($7~"MULTIALLELIC" && ($5=="" || $5==""))) print}' \ - |svtk vcf2bed stdin stdout \ - |cut -f1-5 \ - |gzip \ - >multi.bed.gz - -##strip out overlapping multiallelics## -bedtools intersect -wa -wb -a multi.bed.gz -b multi.bed.gz \ - |awk -F'\t' '{if ($4!=$9 && $3-$2>=$8-$7) print $0; \ - else if ($4!=$9) print $6,$7,$8,$9,$10,$1,$2,$3,$4,$5}' OFS="\t" \ - |sort -u \ - |awk '{print $3-$2,$8-$7,$0}' OFS="\t" \ - |sort -nrk1,1 -k2,2nr \ - |cut -f3- \ - >multi.bed.overlap.txt - -echo "">multi.remove.txt - -while read bed -do - echo "$bed"|cut -d$'\t' -f1-5 >large.bed - echo "$bed"|cut -d$'\t' -f6-10>small.bed - overlap=$(bedtools coverage -a small.bed -b large.bed|awk '{if ($NF>0.50) print "YES";else print "NO"}') - echo $bed|awk '{print $4}' - if [ "$overlap" == "YES" ] && [ $(awk '{print $4}' large.bed|fgrep -wf - multi.remove.txt|wc -l) -eq 0 ] - then - awk '{print $4}' small.bed >>multi.remove.txt - fi -done< multi.bed.overlap.txt - -##get alt tag for multiallelics## -## produces a file with a row for each distinct multialllic variant ID and copy number combination -${BCFTOOLS} query -i 'FILTER = "MULTIALLELIC"' -f '[%ID\t%CN\n]' multitagged.geno.vcf.gz \ - |sort -u >multi.cn.txt - -##strip out variants with no genotypes and overlapping multiallelics## -### Find missing genotype and then add multiallelics that need to be removed### -##change multiallelics svtype into mCNV## -##add CN information to ALT column## -zcat multitagged.geno.vcf.gz \ - |${BCFTOOLS} view -e 'FILTER == "MULTIALLELIC"' \ - |svtk vcf2bed stdin stdout \ - |awk -F'\t' '{if ($6=="") print $4}' \ - |cat - multi.remove.txt \ - |sed '/^$/d' \ - |fgrep -wvf - <(zcat multitagged.geno.vcf.gz ) \ - |awk -F';' '{if ($1~"MULTIALLELIC" && ( $2~"DEL" || $2~"DUP")) $2="SVTYPE=CNV"; print}' OFS=';' \ - |awk '{OFS="\t"; if 
($8~"SVTYPE=CNV;") $5=""; print}' \ - |bgzip \ - >cleantagandmulti.vcf.gz || true - -##add back original CN for sex variants which had to be changed for multiallelic## - -if [ $(zcat cleantagandmulti.vcf.gz|awk '{if (($1~"X" || $1~"Y") && $1!~"#") print}'|wc -l) -gt 0 ] -then -##Determine columns male columns## -zcat cleantagandmulti.vcf.gz\ - |egrep ^# \ - |tail -n 1 \ - |tr '\t' '\n' \ - |cat -n - \ - >col.txt - -awk '{if ($5==1) print $2}' $famfile \ - |fgrep -wf - col.txt \ - >malecols.txt || true - -##regenotype male calls on sex chr and add 1 to copy state for multialleic check## -zcat cleantagandmulti.vcf.gz \ - |fgrep -wf <(grep . $sexchr_revise || true) \ - |awk -v OFS='\t' 'NR == FNR {list[$1]; next} { for (col in list) $col="MALE"$col; print $0 }' malecols.txt - \ - |awk '{print $0 "\t" "ENDOFLINE"}' \ - |tr '\t' '\n' \ - |awk -F':' '{ if ($0!~"SVTYPE" && NF>4 && $1~"MALE" && $1!="GT" && $3-1>=0 && $3!=".") $3=$3-1;print}' OFS=":" \ - |sed 's/^MALE//g' \ - |tr '\n' '\t' \ - |sed 's/ENDOFLINE/\n/g' \ - |sed -e 's/^[ \t]*//' \ - |sed -e 's/[\t]$//g' \ - |bgzip \ - >sexchr.backtoorig.txt.gz || true - -cat <(zcat cleantagandmulti.vcf.gz|fgrep -wvf <(zcat sexchr.backtoorig.txt.gz|awk '{print $3}' )) \ - <(zcat sexchr.backtoorig.txt.gz |awk '{if ($1!="") print}' |tr ' ' '\t') \ - |vcf-sort \ - |bgzip \ - >cleansexCN.vcf.gz || true - -else -cp cleantagandmulti.vcf.gz cleansexCN.vcf.gz - -fi - -mv cleansexCN.vcf.gz cleanGQ.vcf.gz - -##find blank variants with no samples## -svtk vcf2bed cleanGQ.vcf.gz stdout \ - |awk -F'\t' '{if ($5!~"CN" && $6=="") print $4}' \ - >blankcheck.ids.txt - -##Fix header## -##get header to clean## -##add new filters## -zcat cleanGQ.vcf.gz \ - |awk '{if ($1~"##" && NR>1) print}' \ - |fgrep -v "MULTIALLELIC" \ - |awk '{if (NR==2) print $0 "\n" "##FILTER=" ;else print}' \ - |awk '{if (NR==2) print $0 "\n" "##ALT=" ;else print}' \ - |sort -k1,1 \ - |egrep -v "CIPOS|CIEND|RMSSTD|EVENT|INFO=polished.vcf.gz || true diff --git 
a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_find_redundant_multiallelics.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_find_redundant_multiallelics.py new file mode 100755 index 000000000..ad2b744a5 --- /dev/null +++ b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_find_redundant_multiallelics.py @@ -0,0 +1,60 @@ +#!/usr/bin/env python + +import argparse +import sys +import svtk.utils as svu + + +def process_features_for_size1(features_for_size1, redundant_multiallelics): + for intersection in sorted(features_for_size1, key=lambda x: int(x[9]) - int(x[8]), reverse=True): + b_len = int(intersection.fields[9]) - int(intersection.fields[8]) + overlap = int(intersection.fields[14]) + small_coverage = overlap / b_len + if small_coverage > 0.50: + if intersection.fields[3] not in redundant_multiallelics: + redundant_multiallelics.add(intersection.fields[10]) + + +def main(): + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument('multiallelic_filename') + parser.add_argument('fout') + args = parser.parse_args() + + print("finding redundant overlapping sites", file=sys.stderr) + multiallelic_bed = svu.vcf2bedtool(args.multiallelic_filename, include_filters=True) + + redundant_multiallelics = set() + # feature fields: + # [1] : first interval start + # [2] : first interval end + # [3] : first interval variant ID + # [8] : second interval start + # [9] : second interval end + # [10] : second interval variant ID + self_inter = multiallelic_bed.intersect(multiallelic_bed, wo=True)\ + .filter(lambda feature: feature[3] != feature[10]) \ + .filter(lambda feature: (int(feature[2]) - int(feature[1])) >= (int(feature[9]) - int(feature[8]))) \ + .sort(sizeD=True) + current_size1 = -1 + features_for_size1 = [] + for feature in self_inter: + size1 = int(feature[2]) - int(feature[1]) + if size1 != current_size1: + 
process_features_for_size1(features_for_size1, redundant_multiallelics) + features_for_size1 = [] + + current_size1 = size1 + features_for_size1.append(feature) + + process_features_for_size1(features_for_size1, redundant_multiallelics) + print("identified {} redundant multiallelic sites".format(len(redundant_multiallelics)), file=sys.stderr) + with open(args.fout, "w") as list_file: + for vid in redundant_multiallelics: + print(vid, file=list_file) + + +if __name__ == '__main__': + main() diff --git a/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py new file mode 100755 index 000000000..1e28b90af --- /dev/null +++ b/src/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py @@ -0,0 +1,181 @@ +#!/usr/bin/env python + +import argparse +from collections import Counter +import gzip +import pysam +import sys +import svtk.utils as svu + + +def main(): + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument('revise_vcf_lines', type=argparse.FileType('r')) + parser.add_argument('normal_revise_vcf') + parser.add_argument('famfile', type=argparse.FileType('r')) + parser.add_argument('sexchr_revise') + parser.add_argument('multi_geno_ids_txt') + parser.add_argument('outlier_samples_list', type=argparse.FileType('r')) + parser.add_argument('out_prefix') + parser.add_argument('--threads_per_file', required=False, default=2, type=int) + args = parser.parse_args() + + # load the revised lines and index by ID + revised_lines_by_id = {} + with pysam.VariantFile(args.revise_vcf_lines, threads=args.threads_per_file) as revise_vcf: + header2 = revise_vcf.header + revised_lines_by_id = {record.id: record for record in revise_vcf} + print("loaded {} revised lines".format(len(revised_lines_by_id)), file=sys.stderr) + + outlier_samples = set([line.rstrip() for line in 
args.outlier_samples_list if not line.isspace()]) + print("loaded {} outlier samples".format(len(outlier_samples)), file=sys.stderr) + + male_samples = set() + for line in args.famfile: + if line.isspace(): + continue + fields = line.rstrip().split("\t") + if fields[4] == '1': + male_samples.add(fields[1]) + print("identified {} male samples".format(len(male_samples)), file=sys.stderr) + + if args.sexchr_revise.endswith(".gz"): + sexchr_revise = {line.rstrip() for line in gzip.open(args.sexchr_revise, 'rt')} + else: + sexchr_revise = {line.rstrip() for line in open(args.sexchr_revise, 'rt')} + print("{} sites to revise on sex chromosomes".format(len(sexchr_revise)), file=sys.stderr) + + if args.multi_geno_ids_txt.endswith(".gz"): + multi_geno_ids = {line.rstrip() for line in gzip.open(args.multi_geno_ids_txt, 'rt')} + else: + multi_geno_ids = {line.rstrip() for line in open(args.multi_geno_ids_txt, 'rt')} + print("{} multiallelic sites".format(len(multi_geno_ids)), file=sys.stderr) + + NEW_HEADER_LINES = ['##ALT=', + '##FORMAT=', + '##FORMAT=', + '##FILTER=', + '##FILTER='] + + with pysam.VariantFile(args.normal_revise_vcf) as normal_vcf: + + # # Add metadata lines for annotations + header1 = normal_vcf.header + + for f in NEW_HEADER_LINES: + header1.add_line(f) + header2.add_line(f) + + non_outlier_samples = {s for s in header1.samples if s not in outlier_samples} + vf_1 = max(len(non_outlier_samples) * 0.01, 2) + + biallelic_gts = {(1, 1), (0, 0), (0, 1), (None, None)} + + print("reformatting records", file=sys.stderr) + cleangq_filename = args.out_prefix + ".cleanGQ.vcf.gz" + multiallelic_filename = args.out_prefix + ".multiallelic.vcf.gz" + no_variant_samples_list_file = args.out_prefix + ".no_called_samples.list" + + with pysam.VariantFile(cleangq_filename, 'w', header=normal_vcf.header, threads=args.threads_per_file) as cleanqg_out, \ + pysam.VariantFile(multiallelic_filename, 'w', header=normal_vcf.header) as multiallelic_out, \ + 
open(no_variant_samples_list_file, 'w') as no_variant_samples_out: + for idx, record in enumerate(normal_vcf): + multi_del = False + multi_dup = False + gt4_copystate = False + gt5kb_dup = False + gt5kb_del = False + if (idx - 1) % 1000 == 0: + print("processed {} records".format(idx), file=sys.stderr) + if record.id in revised_lines_by_id: + record = revised_lines_by_id[record.id] + if record.info.get('SVTYPE', None) == 'DEL': + if abs(record.stop - record.pos) >= 1000: + sample_cn_map = {s: record.samples[s]['RD_CN'] for s in non_outlier_samples} + if len([s for s in sample_cn_map if (sample_cn_map[s] is not None and sample_cn_map[s] > 3)]) > vf_1: + multi_del = True + gts = [record.samples[s]['GT'] for s in non_outlier_samples] + if any(gt not in biallelic_gts for gt in gts): + gt5kb_del = True + if abs(record.stop - record.pos) >= 5000: + if not multi_del: + gt5kb_del = True + + if record.info.get('SVTYPE', None) == 'DUP': + if abs(record.stop - record.pos) >= 1000: + sample_cn_map = {s: record.samples[s]['RD_CN'] for s in non_outlier_samples} + if sum(1 for s in sample_cn_map if sample_cn_map[s] is not None and sample_cn_map[s] > 4) > vf_1: + multi_dup = True + if sum(1 for x in Counter(sample_cn_map.values()) if x is not None and (x < 1 or x > 4)) > 4: + gt4_copystate = True + if sum(1 for s in sample_cn_map if sample_cn_map[s] is not None and + (sample_cn_map[s] < 1 or sample_cn_map[s] > 4) and + gt4_copystate) > vf_1: + multi_dup = True + gts = [record.samples[s]['GT'] for s in non_outlier_samples] + if any(gt not in biallelic_gts for gt in gts): + gt5kb_dup = True + if abs(record.stop - record.pos) >= 5000: + if not multi_dup: + gt5kb_dup = True + + if gt5kb_del: + for sample_obj in record.samples.itervalues(): + if not sample_obj['GQ'] is None and sample_obj['RD_CN'] >= 2: + sample_obj['GT'] = (0, 0) + elif not sample_obj['GQ'] is None and sample_obj['RD_CN'] == 1: + sample_obj['GT'] = (0, 1) + elif not sample_obj['GQ'] is None: + sample_obj['GT'] = (1, 
1) # RD_CN 0 DEL + + if gt5kb_dup: + for sample_obj in record.samples.itervalues(): + if not sample_obj['GQ'] is None and sample_obj['RD_CN'] <= 2: + sample_obj['GT'] = (0, 0) + elif not sample_obj['GQ'] is None and sample_obj['RD_CN'] == 3: + sample_obj['GT'] = (0, 1) + elif not sample_obj['GQ'] is None: + sample_obj['GT'] = (1, 1) # RD_CN > 3 DUP + + if record.id in multi_geno_ids: + record.filter.add('PESR_GT_OVERDISPERSION') + + if multi_del or multi_dup: + record.filter.add('MULTIALLELIC') + for j, sample in enumerate(record.samples): + record.samples[sample]['GT'] = None + record.samples[sample]['GQ'] = None + record.samples[sample]['CN'] = record.samples[sample]['RD_CN'] + record.samples[sample]['CNQ'] = record.samples[sample]['RD_GQ'] + + if len(record.filter) > 1 and 'PASS' in record.filter: + del record.filter['PASS'] + + if 'MULTIALLELIC' in record.filter and ('' in record.alts or '' in record.alts): + record.alts = ('',) + record.info['SVTYPE'] = 'CNV' + + if record.id in sexchr_revise: + for sample in record.samples: + if sample in male_samples: + cn = int(record.samples[sample]['RD_CN']) + if cn is not None and cn > 0: + record.samples[sample]['RD_CN'] = cn - 1 + if 'CN' in record.samples[sample]: + record.samples[sample]['CN'] = cn - 1 # the old script didn't do this but I think it should + + cleanqg_out.write(record) + + if 'MULTIALLELIC' in record.filter: + multiallelic_out.write(record) + + if len(svu.get_called_samples(record)) == 0: + print(record.id, file=no_variant_samples_out) + + print("done", file=sys.stderr) + + +if __name__ == '__main__': + main() diff --git a/src/sv-pipeline/04_variant_resolution/scripts/merge_pesr_depth.py b/src/sv-pipeline/04_variant_resolution/scripts/merge_pesr_depth.py index f5c830eba..f380cd18e 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/merge_pesr_depth.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/merge_pesr_depth.py @@ -8,10 +8,10 @@ import svtk.utils as svu -def merge_pesr_depth(vcf, 
fout, prefix, frac=0.5, sample_overlap=0.5): +def merge_pesr_depth(vcf, fout, prefix, frac, sample_overlap, min_depth_only_size): - sample_overlap_cache = {} - sample_id_to_index_dict = {s: i for i, s in enumerate(vcf.header.samples)} + def _get_shard_path(base_path, index): + return "{}.shard_{}.vcf.gz".format(base_path, index) # Given one pesr record and one depth record, merge depth attributes into the pesr record def _merge_pair(record_a, record_b): @@ -89,6 +89,8 @@ def _flush_sample_overlap_cache(): sample_overlap_cache.clear() def _sample_overlap(record_a, record_b): + if sample_overlap == 0: + return True _cache_sample_overlap(record_a) _cache_sample_overlap(record_b) return svu.samples_overlap(sample_overlap_cache[record_a.id], sample_overlap_cache[record_b.id], @@ -106,6 +108,11 @@ def _get_base_record(vcf): vcf.reset() return record + sample_overlap_cache = {} + sample_id_to_index_dict = {s: i for i, s in enumerate(vcf.header.samples)} + cnv_types = ['DEL', 'DUP'] + min_svlen = min_depth_only_size * frac + base_record = _get_base_record(vcf) if base_record is None: raise ValueError("No PESR records were found") @@ -118,9 +125,19 @@ def _get_base_record(vcf): count = 0 for record in vcf.fetch(): + if count > 0 and count % 1000 == 0: + sys.stderr.write("Traversed {} records; {} active records; {} record sample sets cached\n" + .format(count, len(active_records), len(sample_overlap_cache))) + count += 1 + # Seed MEMBERS info with original VID record.info['MEMBERS'] = (record.id,) + if record.info['SVTYPE'] not in cnv_types \ + or record.info['SVLEN'] < min_svlen: + _write_record(record, False) + continue + # Write all-ref sites as "salvaged" samples = _cache_sample_overlap(record) if len(samples) == 0: @@ -150,9 +167,6 @@ def _get_base_record(vcf): clustered_depth_ids.add(ar.id) active_records.append(record) active_records = [r for r in active_records if r.id not in finalized_record_ids] - if count % 1000 == 0: - sys.stderr.write("{}: {}\n".format(count, 
len(sample_overlap_cache))) - count += 1 _flush_active_records() @@ -191,19 +205,23 @@ def main(): description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter) parser.add_argument('vcf', help='Combined but unmerged VCF of PE/SR calls') - parser.add_argument('fout', help='Output VCF (unsorted!), can be "-" or "stdout"') + parser.add_argument('fout', help='Output VCF (unsorted!)') + parser.add_argument('--interval-overlap', help='Interval reciprocal overlap fraction', + type=float, default=0.5) + parser.add_argument('--sample-overlap', help='Sample overlap fraction', + type=float, default=0.5) + parser.add_argument('--min-depth-only-size', help='Smallest depth only call SVLEN', + type=int, default=5000) parser.add_argument('--prefix', default='pesr_rd_merged') args = parser.parse_args() vcf = pysam.VariantFile(args.vcf) check_header(vcf) - - if args.fout in '- stdout'.split(): - fout = pysam.VariantFile(sys.stdout, 'w', header=vcf.header) - else: - fout = pysam.VariantFile(args.fout, 'w', header=vcf.header) - - merge_pesr_depth(vcf, fout, args.prefix) + fout = pysam.VariantFile(args.fout, 'w', header=vcf.header) + merge_pesr_depth(vcf, fout=fout, prefix=args.prefix, + frac=args.interval_overlap, + sample_overlap=args.sample_overlap, + min_depth_only_size=args.min_depth_only_size) fout.close() diff --git a/src/sv-pipeline/04_variant_resolution/scripts/overlap_breakpoint_filter.py b/src/sv-pipeline/04_variant_resolution/scripts/overlap_breakpoint_filter.py index 18ffd5271..ec1748747 100644 --- a/src/sv-pipeline/04_variant_resolution/scripts/overlap_breakpoint_filter.py +++ b/src/sv-pipeline/04_variant_resolution/scripts/overlap_breakpoint_filter.py @@ -12,6 +12,11 @@ VCF_PATH = sys.argv[1] BOTHSIDE_PASS_PATH = sys.argv[2] BACKGROUND_FAIL_PATH = sys.argv[3] +DROPPED_RECORD_OUTPUT_VCF_PATH = sys.argv[4] +if len(sys.argv) >= 6: + DEBUG_OUTPUT_PATH = sys.argv[5] +else: + DEBUG_OUTPUT_PATH = None # Sorts list xs by specified attributes @@ -25,7 +30,10 @@ def 
multisort(xs, specs): class RecordData: def __init__(self, record): self.id = record.id - ev = set(record.info['EVIDENCE']) + if 'EVIDENCE' in record.info: + ev = set(record.info['EVIDENCE']) + else: + ev = set() if 'PE' in ev and 'SR' in ev and 'RD' in ev: self.level_of_support = 1 elif 'PE' in ev and 'RD' in ev: @@ -55,9 +63,12 @@ def __init__(self, record): self.freq = len(self.called_samples) self.length = record.info['SVLEN'] self.gt_50bp = self.length >= 50 + self.is_mei = 'melt' in record.info['ALGORITHMS'] - def __repr__(self): - return repr((self.level_of_support, -self.both_end_support, self.sr_fail, self.is_bnd, -self.vargq, -self.freq, self.gt_50bp, self.length, self.id)) + def __str__(self): + return ",".join(str(x) for x in + (self.is_bnd, self.level_of_support, self.is_mei, self.both_end_support, + self.sr_fail, self.vargq, self.freq, self.gt_50bp, self.length, self.id)) vcf = pysam.VariantFile(VCF_PATH) @@ -117,10 +128,11 @@ def __repr__(self): # This is how we sort record pairs to determine which one gets filtered sort_spec = [ + ('is_bnd', False), ('level_of_support', False), + ('is_mei', True), ('both_end_support', True), ('sr_fail', False), - ('is_bnd', False), ('vargq', True), ('freq', True), ('gt_50bp', False), @@ -129,7 +141,10 @@ def __repr__(self): ] # Iterate through record pairs and generate list of record ids to filter out -ids_to_remove = set([]) +ids_to_remove_dict = dict() +if DEBUG_OUTPUT_PATH is not None: + debug = open(DEBUG_OUTPUT_PATH, 'w') + debug.write("#record_kept\trecord_dropped\n") for data_list in pairwise_record_data: # Check for 50% sample overlap sample_intersection = set(data_list[0].called_samples).intersection(data_list[1].called_samples) @@ -138,13 +153,29 @@ def __repr__(self): continue # Determine which to filter sorted_data_list = multisort(list(data_list), sort_spec) - ids_to_remove.add(sorted_data_list[1].id) + ids_to_remove_dict[sorted_data_list[1].id] = sorted_data_list[0].id + if DEBUG_OUTPUT_PATH is not 
None: + debug.write("\t".join(str(x) for x in sorted_data_list) + "\n") +if DEBUG_OUTPUT_PATH is not None: + debug.close() # Perform filtering -sys.stderr.write("Filtering {} records\n".format(len(ids_to_remove))) +sys.stderr.write("Filtering {} records\n".format(len(ids_to_remove_dict))) vcf = pysam.VariantFile(VCF_PATH) -sys.stdout.write(str(vcf.header)) +header = vcf.header +sys.stdout.write(str(header)) + +# Create +header.add_line( + '##INFO=') +dropped_record_vcf = pysam.VariantFile(DROPPED_RECORD_OUTPUT_VCF_PATH, 'w', header=header) + for record in vcf: - if record.id not in ids_to_remove: + if record.id in ids_to_remove_dict: + record.info['BPID'] = ids_to_remove_dict[record.id] + dropped_record_vcf.write(record) + else: sys.stdout.write(str(record)) vcf.close() +dropped_record_vcf.close() diff --git a/src/sv-pipeline/04_variant_resolution/scripts/overlapbpchange.sh b/src/sv-pipeline/04_variant_resolution/scripts/overlapbpchange.sh deleted file mode 100755 index 41b7692a1..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/overlapbpchange.sh +++ /dev/null @@ -1,111 +0,0 @@ -#!/bin/bash -# -# overlapbpchange.sh -# - -set -euo pipefail - -##Inputs## -vcf=$1 -##sr fail## -backgroundlist=$2 -##sr support on both sides## -bothendSR=$3 - -##clean out variants that overlap at one site## -##pull out variants with duplicate bp that are not driven by depth which will be integrated in the clean vcf## -##make sure to flip bed as well so second bp location can be compared with first from other variants## -svtk vcf2bed $vcf stdout -i CHR2 -i STRANDS -i SVLEN -i varGQ -i END -i EVIDENCE -i SVTYPE --split-bnd \ - | sed "s/+-/+ "$'\t -/g' \ - | sed "s/-+/- "$'\t +/g' \ - | sed "s/++/+ "$'\t +/g' \ - | sed "s/--/- "$'\t -/g' | \ - ##Convert back to 1-based positions## - awk -v OFS='\t' '{$2=$2+1; print $0}' \ - | awk -v OFS='\t' \ - '{if (!(($NF=="DEL" || $NF=="DUP") && $10>=5000)) print $0 "\n" $7,$12,$2,$4,$5,$6,$1,$9,$8,$10,$11,$2,$13,$14 }' | \ - ###Find 
duplicated variants that overlap at same bp one side## - awk 'cnt[$1"_"$2"_"$8]++{if (cnt[$1"_"$2"_"$8]==2) print prev[$1"_"$2"_"$8] "\t" $1"_"$2"_"$8 \ - ; print $0 "\t" $1"_"$2"_"$8} {prev[$1"_"$2"_"$8]=$0}' \ - | awk '!seen[$4"_"$NF]++' \ - | awk 'cnt[$NF]++{if (cnt[$NF]==2) print prev[$NF] \ - ; print $0 } {prev[$NF]=$0}' \ - >dupside1.bed - - -##Find 50% overlap between samples for overlaps## -join -j 2 <(awk '{print $NF "\t" $6}' dupside1.bed \ - | awk -F'[,\t]' '{for (i=2;i<=NF;i++) print $1 "\t" $i}' \ - | sort \ - | uniq -D \ - | awk '{print $1}'|sort|uniq -c ) \ - <(awk '{print $NF "\t" $6}' dupside1.bed \ - | awk -F'[,\t]' '{for (i=2;i<=NF;i++) print $1 "\t" $i}' \ - | awk '{print $1}' \ - | sort \ - | uniq -c) \ - | awk '{if ($2 >= 0.5 * $3) print $1}' \ - | (fgrep -wf - dupside1.bed || printf "") \ - > dupside1.freq50.txt - -##Add SRfail### -{ fgrep -wf <(awk '{print $NF}' $backgroundlist) dupside1.freq50.txt || true; } \ - | awk '{print $0 "\t" 0}' \ - > dupside1.passSR.txt - -{ fgrep -wvf <(awk '{print $NF}' $backgroundlist) dupside1.freq50.txt || true; } \ - | awk '{print $0 "\t" 1}' \ - >> dupside1.passSR.txt - -##Attach the % of variants that show SR support at bothends## -join -1 4 -2 1 -e "0" -a 1 -o 1.1 1.2 1.3 1.4 1.5 1.6 1.7 1.8 1.9 1.10 1.11 1.12 1.13 1.14 1.15 1.16 2.2 \ - <(sort -k4,4 dupside1.passSR.txt) \ - <(awk '{print $NF "\t" $1}' $bothendSR | sort -k1,1) \ - | tr ' ' '\t' \ - > dupside1.bothpassfilter.txt -rm dupside1.passSR.txt - -##count number of samples and indiciate if size gt 50bp## -join -1 4 -2 1 dupside1.bothpassfilter.txt \ - <(awk '{print $4 "\t" $6}' dupside1.bed \ - | awk -F'[,\t]' '{print $1 "\t" NF-1}' \ - | sort -k1,1) \ - | tr ' ' '\t' \ - | awk '{if ($10>=50) print $0 "\t" 1;else print $0 "\t" 0}' \ - > dupside1.samplecountfilter.txt -rm dupside1.bed dupside1.bothpassfilter.txt - -##Convert Evidence column into Integers for scoring and ## -##RD,PE,SR-1,RD,PE-2,PE,SR-3,RD,SR-4,PE-5,RD-6,SR-7## -sed 's/BAF,//g' 
dupside1.samplecountfilter.txt \ - | awk -v OFS='\t' ' - { - if ($13=="PE,RD,SR") print $0 "\t" 1 - else if ($13=="PE,RD") print $0 "\t" 2 - else if ($13=="PE,SR") print $0 "\t" 3 - else if ($13=="RD,SR") print $0 "\t" 4 - else if ($13=="PE") print $0 "\t" 5 - else if ($13=="RD") print $0 "\t" 6 - else if ($13=="SR") print $0 "\t" 7 - }' | \ - ##assign BND to bottom - awk '{if ($14=="BND") print $0 "\t" 0;else print $0 "\t" 1}' \ - > dupside1.allfilter.txt -rm dupside1.samplecountfilter.txt -###DO THIS##### -## - - -##sort file with overlapping samples LevelofSupport->BothEndsupport->SRfail-> Not BND->Higher varq-> Higher Freq -> Smallest size if gt 5kb## -sort -k20,20n -k17,17nr -nrk16,16 -k21,21nr -k11,11nr -k18,18nr -k19,19nr -k10,10n dupside1.allfilter.txt \ - | awk '!seen[$15]++' \ - | awk '{print $1}' \ - | (fgrep -wvf - dupside1.freq50.txt || printf "") \ - | awk '{print $4}' \ - > remove.side1.var.txt -rm dupside1.freq50.txt dupside1.allfilter.txt - -##remove variants with samebp## -(zgrep -wvf remove.side1.var.txt $vcf || printf "") \ - | bgzip \ - > non_redundant.vcf.gz diff --git a/src/sv-pipeline/04_variant_resolution/scripts/patch_sr_bothside_support.py b/src/sv-pipeline/04_variant_resolution/scripts/patch_sr_bothside_support.py new file mode 100644 index 000000000..ac7e2b879 --- /dev/null +++ b/src/sv-pipeline/04_variant_resolution/scripts/patch_sr_bothside_support.py @@ -0,0 +1,46 @@ +#!/bin/python + +import sys +from collections import defaultdict + + +def count_vids(list_path): + counts = defaultdict(lambda: 0) + with open(list_path, 'r') as f_list: + for path in f_list: + with open(path.strip(), 'r') as f: + for vid in f: + counts[vid.strip()] += 1 + return counts + + +def count_sr_pass(path, n): + counts = defaultdict(lambda: 0) + with open(path, 'r') as f: + for line in f: + tokens = line.strip().split('\t') + n_support = round(float(tokens[0]) * n) + vid = tokens[-1] + counts[vid] = n_support + return counts + + +NON_REF_VIDS_LIST = sys.argv[1] 
+BOTHSIDE_PASS_FILE = sys.argv[2] +NUM_BATCHES = int(sys.argv[3]) + +non_ref_counts = count_vids(NON_REF_VIDS_LIST) +bothside_pass_counts = count_sr_pass(BOTHSIDE_PASS_FILE, NUM_BATCHES) + +with open(BOTHSIDE_PASS_FILE, 'r') as f: + for line in f: + tokens = line.strip().split('\t') + vid = tokens[-1] + bothside_pass_count = bothside_pass_counts[vid] + if bothside_pass_count == 0: + continue + non_ref_count = non_ref_counts[vid] + if non_ref_count == 0: + continue + fraction_support = min(1., bothside_pass_count / float(non_ref_count)) + sys.stdout.write("{}\t{}\n".format(fraction_support, "\t".join(tokens[1:]))) diff --git a/src/sv-pipeline/04_variant_resolution/scripts/process_posthoc_cpx_depth_regenotyping.sh b/src/sv-pipeline/04_variant_resolution/scripts/process_posthoc_cpx_depth_regenotyping.sh index e533f07d7..20751103b 100755 --- a/src/sv-pipeline/04_variant_resolution/scripts/process_posthoc_cpx_depth_regenotyping.sh +++ b/src/sv-pipeline/04_variant_resolution/scripts/process_posthoc_cpx_depth_regenotyping.sh @@ -2,7 +2,7 @@ # Reassign variant labels based on depth regenotyping in mod04b -set -exo pipefail +set -eo pipefail ###USAGE usage(){ cat < Minimum insertion site size (in bp) to be considered for + -D Minimum insertion site size (in bp) to be considered for distinguishing insertion site deletions [default: 150 bp] -T Minimum size (in bp) at which to prioritize an inverted dDUP classification over a dupINV or INVdup classification [default: 1000000 bp] - -R Path to table containing the final reclassification - decision made per variant. [default: no table output] + -R Path to table containing the final reclassification + decision made per variant. [default: no table output] -G Path to table containing the raw genotype counts table - per interval per variant. [default: no table output] + per interval per variant. 
[default: no table output] Notes: @@ -221,9 +221,9 @@ while read chr start end VID samps trash; do unset medCN ###Get list of samples & reference CN to consider, dependent on chr of call - #ChrX: use only diploid females if possible, otherwise use haploid males + #ChrX: use only diploid females if possible, otherwise use haploid males if [ ${chr} == "X" ] || [ ${chr} == "chrX" ]; then - + #Try to get female carrier samples echo -e "${samps}" | sed 's/,/\n/g' \ | fgrep -wf - ${GTDIR}/female.samples.list \ @@ -323,7 +323,7 @@ while read chr start end VID samps trash; do fi - #For predicted carriers, count number of genotypes lower than, + #For predicted carriers, count number of genotypes lower than, # equal to, and greater than the overall median if [ $( cat ${GTDIR}/carrier_samples.tmp | wc -l ) -gt 0 ]; then fgrep -wf ${GTDIR}/carrier_samples.tmp \ @@ -338,8 +338,8 @@ while read chr start end VID samps trash; do else echo -e "0\n0\n0" fi - - #For predicted non-carriers, count number of genotypes lower than, + + #For predicted non-carriers, count number of genotypes lower than, # equal to, and greater than the overall median if [ $( cat ${GTDIR}/control_samples.tmp | wc -l ) -gt 0 ]; then fgrep -wf ${GTDIR}/control_samples.tmp \ @@ -432,7 +432,7 @@ awk -v ENDidx=${ENDidx} -v OFS="\t" '{ $3=$ENDidx; print }' \ ${GTDIR}/inv_se_vcf2bed.precut.bed \ | fgrep -v "#" || true \ >> ${GTDIR}/variants_to_reclassify.vcf2bed.bed - + ###MAKE FINAL ASSESSMENT FOR EACH VARIANT #Print header @@ -994,14 +994,25 @@ while read VID MOD REASON svtype cpxtype cpxintervals SVLEN SOURCE START END; do #Modify info as needed INFO=$( fgrep -w ${VID} ${GTDIR}/variants_to_be_reassessed.vcf \ | cut -f8 \ - | sed -r -e "s/END=[^;]*;/END=$END;/" \ + | sed -r -e "s/^END=[^;]*;/END=$END;/" \ + | sed -r -e "s/;END=[^;]*;/;END=$END;/" \ + | sed -r -e "s/;END=[^;]*$/;END=$END/" \ + | sed -r -e "s/^SVTYPE=[^;]*;/SVTYPE=$svtype;/" \ | sed -r -e "s/;SVTYPE=[^;]*;/;SVTYPE=$svtype;/" \ + | sed -r -e 
"s/;SVTYPE=[^;]*$/;SVTYPE=$svtype/" \ + | sed -r -e "s/^SVLEN=[^;]*;/SVLEN=$SVLEN;/" \ | sed -r -e "s/;SVLEN=[^;]*;/;SVLEN=$SVLEN;/" \ - | sed -r -e "s/;CPX_TYPE=[^;]*$/;CPX_TYPE=${cpxtype}/" \ + | sed -r -e "s/;SVLEN=[^;]*$/;SVLEN=$SVLEN/" \ + | sed -r -e "s/^CPX_TYPE=[^;]*;/CPX_TYPE=${cpxtype};/" \ | sed -r -e "s/;CPX_TYPE=[^;]*;/;CPX_TYPE=${cpxtype};/" \ + | sed -r -e "s/;CPX_TYPE=[^;]*$/;CPX_TYPE=${cpxtype}/" \ + | sed -r -e 's/^UNRESOLVED;//' \ | sed -r -e 's/;UNRESOLVED;/;/' \ + | sed -r -e 's/;UNRESOLVED$//' \ + | sed -r -e 's/^UNRESOLVED_TYPE=[^;]*;//' \ | sed -r -e 's/;UNRESOLVED_TYPE=[^;]*;/;/' \ | sed -r -e 's/;UNRESOLVED_TYPE=[^;]*$//' \ + | sed -r -e 's/^EVENT=[^;]*;//' \ | sed -r -e 's/;EVENT=[^;]*;/;/' \ | sed -r -e 's/;EVENT=[^;]*$//' ) #Add/remove/modify CPX_TYPE, if needed diff --git a/src/sv-pipeline/04_variant_resolution/scripts/reset_cnv_gts.py b/src/sv-pipeline/04_variant_resolution/scripts/reset_cnv_gts.py new file mode 100644 index 000000000..7c1073f6a --- /dev/null +++ b/src/sv-pipeline/04_variant_resolution/scripts/reset_cnv_gts.py @@ -0,0 +1,50 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- +# + +""" +Sets CNV GT fields to "." 
+This is needed following any HailMerge step for VCFs containing CNVs +""" + +import argparse +import sys +import pysam + + +def reset_cnv_gts(vcf, fout): + + for record in vcf: + if record.info['SVTYPE'] == 'CNV': + for sample in record.samples: + record.samples[sample]['GT'] = (None,) + fout.write(record) + + +def main(): + + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter) + parser.add_argument('vcf') + parser.add_argument('fout') + + args = parser.parse_args() + + if args.vcf in '- stdin'.split(): + vcf = pysam.VariantFile(sys.stdin) + else: + vcf = pysam.VariantFile(args.vcf) + + header = vcf.header + + if args.fout in '- stdout'.split(): + fout = pysam.VariantFile(sys.stdout, 'w', header=header) + else: + fout = pysam.VariantFile(args.fout, 'w', header=header) + + reset_cnv_gts(vcf, fout) + + +if __name__ == '__main__': + main() diff --git a/src/sv-pipeline/04_variant_resolution/scripts/resolve_CPX_CNV_redundancies.sh b/src/sv-pipeline/04_variant_resolution/scripts/resolve_CPX_CNV_redundancies.sh deleted file mode 100755 index 73bc405a5..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/resolve_CPX_CNV_redundancies.sh +++ /dev/null @@ -1,238 +0,0 @@ -#!/bin/bash - -# Resolve redundancies between simple CNVs and unbalanced complex SV in mod04b - -# TODO : Missing pipefail -set -e - -###USAGE -usage(){ -cat < ${PROCDIR}/intervals.preclustered.bed.gz - - -###REMOVE CNVS REDUNDANT WITH COMPLEX EVENTS -#Subset to only variants that share some overlap (at least 10% recip) with at least one CPX variant -bedtools intersect -wa -r -f 0.1 \ - -a ${PROCDIR}/intervals.preclustered.bed.gz \ - -b <( zcat ${PROCDIR}/intervals.preclustered.bed.gz | fgrep "CPX" ) \ - | sort -Vk1,1 -k2,2n -k3,3n -k4,4V \ - | uniq \ - | bgzip -c \ - > ${PROCDIR}/intervals.preclustered.subset.bed.gz -#Melt subsetted variants -while read chr start end VID samples CNV; do - echo -e "${samples}" \ - | sed 's/,/\n/g' \ - | 
awk -v OFS="\t" -v chr=${chr} -v start=${start} -v end=${end} -v VID=${VID} -v CNV=${CNV} \ - '{ print chr, start, end, VID, $1, CNV }' -done < <( zcat ${PROCDIR}/intervals.preclustered.subset.bed.gz ) \ - | bgzip -c \ - > ${PROCDIR}/intervals.preclustered.subset.melted.bed.gz -#Cluster BED intervals (50% RO) -svtk bedcluster -f 0.5 \ - ${PROCDIR}/intervals.preclustered.subset.melted.bed.gz - \ - | bgzip -c > \ - ${PROCDIR}/intervals.clustered.bed.gz -#Get list of all variants that cluster with a complex variant, -# evaluate sample overlap from original intervals file, -# and, if overlap >50%, write that ID to be stripped from the output VCF -while read VIDs; do - #Get nonredundant list of sample IDs involved in any clustered variant - echo -e "${VIDs}" | sed 's/,/\n/g' \ - | fgrep -wf - <( zcat ${PROCDIR}/intervals.preclustered.bed.gz ) \ - | cut -f5 | sort | uniq \ - > ${PROCDIR}/nonredundant_samples.list - #Iterate over VIDs and print non-CPX VID if sample overlap >50% - while read VID samples; do - #Get list of samples in variant - echo -e "${samples}" | sed 's/,/\n/g' \ - | sort | uniq > ${PROCDIR}/query_samples.list - nsamp=$( cat ${PROCDIR}/query_samples.list | wc -l ) - - #Compare - frac=$( fgrep -wf ${PROCDIR}/query_samples.list \ - ${PROCDIR}/nonredundant_samples.list | wc -l \ - | awk -v nsamp=${nsamp} '{ print 100*($1/nsamp) }' \ - | cut -f1 -d\. 
) - if [ ${frac} -ge 50 ]; then - echo "${VID}" - fi - - #Clean up - rm ${PROCDIR}/query_samples.list - - done < <( echo -e "${VIDs}" | sed 's/,/\n/g' \ - | fgrep -wf - <( zcat ${PROCDIR}/intervals.preclustered.bed.gz ) \ - | cut -f4,5 | sort | uniq | fgrep -v "CPX" ) - - #Clean up - rm ${PROCDIR}/nonredundant_samples.list - -done < <( zcat ${PROCDIR}/intervals.clustered.bed.gz \ - | cut -f7 | fgrep "CPX" | grep -e "DEL\|DUP" ) \ - | sort -V | uniq \ - > ${PROCDIR}/VIDs_to_remove.list - - -###FIND REMAINING REDUNDANT CNVS WITH STRONG (80%) OVERLAP IN SAMPLES AND SIZE -#Find CNV intervals that have 80% reciprocal overlap -bedtools intersect -wa -wb -r -f 0.8 \ - -a ${PROCDIR}/intervals.preclustered.bed.gz \ - -b ${PROCDIR}/intervals.preclustered.bed.gz \ - | awk -v FS="\t" '{ if ($4!=$10 && $6==$12) print $0 }' \ - | awk -v OFS="\t" '$4 ~ /DEL|DUP/ { print $0 }' \ - | awk -v OFS="\t" '$10 ~ /DEL|DUP/ { print $0 }' \ - | bgzip -c \ - > ${PROCDIR}/step2.intervals.preclustered.subset.bed.gz -#Determine which events share 80% sample overlap -while read VIDa sa VIDb sb; do - na=$( echo -e "${sa}" | sed 's/,/\n/g' | sort | uniq | wc -l ) - nb=$( echo -e "${sb}" | sed 's/,/\n/g' | sort | uniq | wc -l ) - denom=$( echo -e "${sa},${sb}" | sed 's/,/\n/g' | sort | uniq | wc -l ) - numer=$( echo -e "${sa}" | sed 's/,/\n/g' | fgrep -wf - \ - <( echo -e "${sb}" | sed 's/,/\n/g' ) \ - | sort | uniq | wc -l ) - if [ ${denom} -gt 0 ]; then - ovr=$(( 100 * ${numer} / ${denom} )) - fi - if [ -z ${ovr} ]; then - ovr=0 - fi - if [ ${ovr} -ge 80 ]; then - echo -e "${VIDa}\n${VIDb}" \ - | sort | uniq | paste -s -d, - fi -done < <( zcat ${PROCDIR}/step2.intervals.preclustered.subset.bed.gz \ - | cut -f4,5,10,11 ) \ - | sort | uniq \ - > ${PROCDIR}/step2.variants_to_resolve.list -#Iterate over variants, pick info & coords from variant with largest N, -# and consolidate genotypes -sed 's/,/\n/g' ${PROCDIR}/step2.variants_to_resolve.list \ - | sort | uniq \ - > 
${PROCDIR}/step2.variants_to_resolve.melted.list -if [ -e ${PROCDIR}/records_to_add.vcf ]; then - rm ${PROCDIR}/records_to_add.vcf -fi -until [ $( cat ${PROCDIR}/step2.variants_to_resolve.melted.list | wc -l ) -eq 0 ]; do - #get next variant - VID=$( head -n1 ${PROCDIR}/step2.variants_to_resolve.melted.list ) - #get all other variants from clusters containing this variant - fgrep -w ${VID} ${PROCDIR}/step2.variants_to_resolve.list \ - | sed 's/,/\n/g' | sort | uniq \ - > ${PROCDIR}/step2.partners.tmp - #Print all genotypes to tmp file - zcat ${INVCF} | fgrep -v "#" \ - | fgrep -wf ${PROCDIR}/step2.partners.tmp | cut -f10- \ - > ${PROCDIR}/gts.tmp - #Select best genotypes to keep - ${BIN}/selectBestGT.R ${PROCDIR}/gts.tmp ${PROCDIR}/gts.best.tmp - #Select record with greatest total number of samples - bVID=$( zcat ${PROCDIR}/intervals.preclustered.bed.gz \ - | fgrep -wf ${PROCDIR}/step2.partners.tmp \ - | cut -f4-5 | sed 's/,/\t/g' \ - | awk -v OFS="\t" '{ print $1, NF }' \ - | sort -nrk2,2 \ - | cut -f1 \ - | head -n1 ) - #Add new record to final append tmp file - paste <( zcat ${INVCF} | fgrep -w ${bVID} | cut -f1-9 ) \ - ${PROCDIR}/gts.best.tmp \ - >> ${PROCDIR}/records_to_add.vcf - #Write list of variants to exclude from original VCF - cat ${PROCDIR}/step2.partners.tmp >> ${PROCDIR}/VIDs_to_remove.list - #Exclude variants from list of VIDs to resolve - fgrep -wvf ${PROCDIR}/step2.partners.tmp \ - ${PROCDIR}/step2.variants_to_resolve.melted.list \ - > ${PROCDIR}/step2.variants_to_resolve.melted.list2 \ - || true - mv ${PROCDIR}/step2.variants_to_resolve.melted.list2 \ - ${PROCDIR}/step2.variants_to_resolve.melted.list -done - - -###CLEAN UP FINAL OUTPUT -zcat ${INVCF} \ - | fgrep -wvf ${PROCDIR}/VIDs_to_remove.list \ - | cat - ${PROCDIR}/records_to_add.vcf \ - | vcf-sort \ - | bgzip -c \ - > ${OUTVCF} - - -###CLEAN UP -rm -rf ${PROCDIR} - - diff --git a/src/sv-pipeline/04_variant_resolution/scripts/resolve_cpx_cnv_redundancies.py 
#!/usr/bin/env python

import sys
import os
import pybedtools
import pysam
import numpy
import scipy.sparse
# "import scipy.sparse" does not load the csgraph submodule; it must be
# imported explicitly or scipy.sparse.csgraph.connected_components raises
# AttributeError on scipy versions without lazy submodule loading
import scipy.sparse.csgraph
import argparse
from typing import List, Text, Optional, Iterable, Iterator, Tuple, Set, Dict, Mapping
from types import MappingProxyType
import multiprocessing


class Keys:  # static class with re-used strings (to avoid typo errors, allow easy refactoring)
    svtype = "SVTYPE"
    ins = "INS"
    deletion = "DEL"
    dup = "DUP"
    cpx = "CPX"
    cnv = "CNV"
    unresolved = "UNRESOLVED"
    cpx_intervals = "CPX_INTERVALS"
    cpx_type = "CPX_TYPE"


class Default:  # static class with default values for kwargs
    min_cpx_reciprocal_overlap = 0.1
    cnv_cpx_reciprocal_overlap = 0.5
    cnv_cpx_sample_overlap = 0.5
    cnv_cnv_reciprocal_overlap = 0.8
    cnv_cnv_sample_overlap = 0.8
    clusterable_sv_types = frozenset({Keys.deletion, Keys.dup, Keys.cnv})
    cpx_ins_classes = frozenset({"dDUP", "dDUP_iDEL", "INS_iDEL"})
    temp_dir = "/tmp"
    num_threads = multiprocessing.cpu_count()


# 0-based field indices into the 6-column bed tuples produced below:
# (contig, start, end, variant_id, sv_type, is_cpx)
name_field = 3
sv_type_field = 4
is_cpx_field = 5
ref_ploidy = 2  # note, even for autosome, VCFs always have ploidy=2 calls
ref_gt = (0, 0)
non_carrier_gts = {None, (None, None), (0, 0), (0, None), (None, 0)}


def _fix_coords(start: int, end: int) -> Tuple[int, int]:
    """ ensure that start precedes end, and is >= 0 """
    start, end = (start, end) if (start <= end) else (end, start)  # ensure in sorted order
    return max(start - 1, 0), end  # convert from VCF to bed format


def _get_carrier_status(
        record: pysam.VariantRecord
) -> Tuple[numpy.ndarray, numpy.ndarray]:
    """
    Get boolean numpy arrays detailing carrier status for each sample
    Parameters
    ----------
    record: VariantRecord
        pysam record for this variant
    Returns
    -------
    is_carrier: numpy.ndarray
        boolean array that is True for samples called non-ref for this Variant, and False otherwise (including no-call)
    is_ref: numpy.ndarray
        boolean array that is True for samples called ref for this Variant, and False otherwise (including no-call)
    """
    if record.info.get(Keys.svtype, None) == Keys.cnv:  # genotype is always no-call, check info.CN
        copy_numbers = [sample_rec.get("CN") for sample_rec in record.samples.itervalues()]
        is_carrier = numpy.fromiter(
            (copy_number is not None and copy_number != ref_ploidy for copy_number in copy_numbers),
            dtype=bool, count=len(copy_numbers)
        )
        is_ref = numpy.fromiter(
            (copy_number == ref_ploidy for copy_number in copy_numbers),
            dtype=bool, count=len(copy_numbers)
        )
    else:
        genotypes = [sample_rec.get("GT") for sample_rec in record.samples.itervalues()]
        is_carrier = numpy.fromiter(
            (genotype not in non_carrier_gts for genotype in genotypes), dtype=bool, count=len(genotypes)
        )
        is_ref = numpy.fromiter(
            (genotype == ref_gt for genotype in genotypes), dtype=bool, count=len(genotypes)
        )
    return is_carrier, is_ref


def _unfiltered_vcf_records_to_bed_intervals(
        vcf_records: Iterable[pysam.VariantRecord],
        is_carrier: Dict[Text, numpy.ndarray],
        is_ref: Dict[Text, numpy.ndarray],
        clusterable_sv_types: Set[Text] = Default.clusterable_sv_types,
        cpx_ins_classes: Set[Text] = Default.cpx_ins_classes
) -> Iterator[Tuple]:
    f"""
    Iterate over input VCF, yielding records that may be redundant. Also gather is_carrier and is_ref mappings.
    Parameters
    ----------
    vcf_records: Iterable[VariantRecord]
        Iterable with pysam records from input VCF file.
    is_carrier: Dict[Text, numpy.ndarray]
        Dict from variant ID to boolean array that is True for samples called non-ref for this Variant, and False
        otherwise (including no-call). NOTE: this function *updates* is_carrier in place.
    is_ref: Dict[Text, numpy.ndarray]
        Dict from variant ID to boolean array that is True for samples called ref for this Variant, and False
        otherwise (including no-call). NOTE: this function *updates* is_ref in place.
    clusterable_sv_types: Set[Text] (default={Default.clusterable_sv_types})
        SV types that may be redundant (or needed for clustering with redundant SVs).
    cpx_ins_classes: Set[Text] (default={Default.cpx_ins_classes})
        CPX SV types that should produce an INS sink (modeled as a DEL)
    Yields
    -------
    bed_tuple: Tuple
        successive records for bed object, with fields: contig, start, end, variant_id, sv_type, is_cpx
    """
    for record in vcf_records:
        sv_type = record.info[Keys.svtype]
        if sv_type == Keys.cpx:
            if Keys.unresolved in record.filter:
                continue
            # If complex, all constituent intervals are in CPX_INTERVALS
            variant_id = record.id
            is_carrier[variant_id], is_ref[variant_id] = _get_carrier_status(record)
            for cpx_interval in record.info[Keys.cpx_intervals]:
                # CPX_INTERVALS entries look like "DEL_chr1:100-200"
                sv_type, region = cpx_interval.split('_', 1)
                contig, coords = region.split(':', 1)
                start, end = _fix_coords(*(int(c) for c in coords.split('-', 1)))
                yield contig, start, end, variant_id, sv_type, 1
            if record.info.get(Keys.cpx_type, None) in cpx_ins_classes:
                # If complex insertion, return insertion point as 1bp DEL
                sv_type = Keys.deletion
                contig = record.contig
                end = record.pos
                start = max(0, end - 1)
                yield contig, start, end, variant_id, sv_type, 1
        elif sv_type in clusterable_sv_types:
            start, end = _fix_coords(record.pos, record.stop)
            variant_id = record.id
            is_carrier[variant_id], is_ref[variant_id] = _get_carrier_status(record)
            yield record.contig, start, end, variant_id, sv_type, 0


def _vcf_records_to_bed_intervals(
        vcf_records: Iterable[pysam.VariantRecord],
        is_carrier: Dict[Text, numpy.ndarray],
        is_ref: Dict[Text, numpy.ndarray],
        clusterable_sv_types: Set[Text] = Default.clusterable_sv_types,
        cpx_ins_classes: Set[Text] = Default.cpx_ins_classes
) -> Iterator[Tuple]:
    f"""
    Iterate over input VCF, yielding records that may be redundant. Also gather is_carrier and is_ref mappings.
    This function mainly passes results from _unfiltered_vcf_records_to_bed_intervals, but potentially filters out
    unneeded SV intervals that originated in CPX events, and duplicates SVTYPE=CNV into one DUP and one DEL.
    Parameters
    ----------
    vcf_records: Iterable[VariantRecord]
        Iterable with pysam records from input VCF file.
    is_carrier: Dict[Text, numpy.ndarray]
        Dict from variant ID to boolean array that is True for samples called non-ref for this Variant, and False
        otherwise (including no-call). NOTE: this function *updates* is_carrier in place.
    is_ref: Dict[Text, numpy.ndarray]
        Dict from variant ID to boolean array that is True for samples called ref for this Variant, and False
        otherwise (including no-call). NOTE: this function *updates* is_ref in place.
    clusterable_sv_types: Set[Text] (default={Default.clusterable_sv_types})
        SV types that may be redundant (or needed for clustering with redundant SVs).
    cpx_ins_classes: Set[Text] (default={Default.cpx_ins_classes})
        CPX SV types that should produce an INS sink (modeled as a DEL)
    Yields
    -------
    bed_tuple: Tuple
        successive records for bed object, with fields: contig, start, end, variant_id, sv_type, is_cpx
    """
    for contig, start, end, variant_id, sv_type, is_cpx in _unfiltered_vcf_records_to_bed_intervals(
            vcf_records, is_carrier, is_ref, clusterable_sv_types=clusterable_sv_types,
            cpx_ins_classes=cpx_ins_classes
    ):
        if sv_type in clusterable_sv_types:
            # store sv_type in interval.score, is_cpx in interval.strand
            if sv_type == Keys.cnv:  # ensure CNVs cluster with both insertions and deletions
                yield contig, start, end, variant_id, Keys.deletion, is_cpx
                yield contig, start, end, variant_id, Keys.dup, is_cpx
            else:  # yield this interval normally
                yield contig, start, end, variant_id, sv_type, is_cpx


def jaccard_index(is_carrier_a: numpy.ndarray, is_carrier_b: numpy.ndarray) -> float:
    """ return Jaccard index of carrier samples based on two boolean arrays of carrier status """
    return numpy.logical_and(is_carrier_a, is_carrier_b).sum() / numpy.logical_or(is_carrier_a, is_carrier_b).sum()


def _iter_pairwise_connections(
        clusterable_bedtool: pybedtools.BedTool,
        min_reciprocal_overlap: float,
        min_sample_overlap: float = 0,
        is_carrier: Mapping[Text, numpy.ndarray] = MappingProxyType({})
) -> Iterator[Tuple[Text, Text]]:
    """
    Iterate over pairs of variant intervals that meet minimum requirement for reciprocal overlap. Exclude
    self-overlaps. Optionally impose requirement of minimum Jaccard index for carrier samples.
    Parameters
    ----------
    clusterable_bedtool: BedTool
        bed object with intervals that may overlap each other
    min_reciprocal_overlap: float
        minimum reciprocal overlap for two intervals to be connected
    min_sample_overlap: float (default=0)
        minimum Jaccard index of carrier samples for two intervals to be connected
    is_carrier: Mapping[Text, numpy.ndarray]
        map from variant ID to carrier status (array boolean True/False for each sample)
    Yields
    -------
    variant_id_1, variant_id_2: Tuple[Text, Text]
        successive pairs of variant IDs that meet the overlap requirements
    """
    # Cluster intervals based on reciprocal overlap
    if len(clusterable_bedtool) == 0:
        return
    overlap_bedtool = clusterable_bedtool.intersect(clusterable_bedtool, f=min_reciprocal_overlap, r=True, wa=True,
                                                    wb=True, sorted=True, nonamecheck=True)
    # the -wa -wb output concatenates the two intervals' fields; compute the
    # offsets of the name/sv_type fields for each side of the pair
    num_1_fields = clusterable_bedtool.field_count()
    name_1_field = name_field
    sv_type_1_field = sv_type_field
    name_2_field = num_1_fields + name_field
    sv_type_2_field = num_1_fields + sv_type_field

    if min_sample_overlap > 0:
        for overlap in overlap_bedtool:
            fields = overlap.fields
            if fields[sv_type_1_field] != fields[sv_type_2_field]:
                continue  # only cluster same sv_type
            name_1 = fields[name_1_field]
            name_2 = fields[name_2_field]
            if name_1 != name_2 and jaccard_index(is_carrier[name_1], is_carrier[name_2]) >= min_sample_overlap:
                yield name_1, name_2
    else:
        for overlap in overlap_bedtool:
            fields = overlap.fields
            if fields[sv_type_1_field] != fields[sv_type_2_field]:
                continue  # only cluster same sv_type
            name_1 = fields[name_1_field]
            name_2 = fields[name_2_field]
            if name_1 != name_2:
                yield name_1, name_2


def _get_clusters(
        clusterable_bedtool: pybedtools.BedTool,
        min_reciprocal_overlap: float,
        min_sample_overlap: float = 0,
        is_carrier: Mapping[Text, numpy.ndarray] = MappingProxyType({})
) -> List[numpy.ndarray]:
    """
    Perform single-linkage clustering of variant intervals based on reciprocal overlap. Potentially impose a
    clustering requirement of high Jaccard index for carrier samples.
    Parameters
    ----------
    clusterable_bedtool: BedTool
        bed object with intervals that may cluster with each other
    min_reciprocal_overlap: float
        minimum reciprocal overlap for two intervals to be placed in a cluster
    min_sample_overlap: float (default=0)
        minimum Jaccard index of carrier samples for two intervals to be placed in a cluster
    is_carrier: Mapping[Text, numpy.ndarray]
        map from variant ID to carrier status (array boolean True/False for each sample)
    Returns
    -------
    clusters: List[numpy.ndarray]
        each element is an object numpy array of intervals that are in a cluster
    """
    # form map from variant IDs to unique indices for this clustering
    name_to_index = {name: index for index, name in enumerate({interval.name for interval in clusterable_bedtool})}
    num_vertices = len(name_to_index)
    sparse_connections = scipy.sparse.eye(num_vertices, dtype=numpy.uint8, format="lil")
    for name_1, name_2 in _iter_pairwise_connections(
            clusterable_bedtool, min_reciprocal_overlap=min_reciprocal_overlap, min_sample_overlap=min_sample_overlap,
            is_carrier=is_carrier
    ):
        sparse_connections[(name_to_index[name_1], name_to_index[name_2])] = 1

    # Cluster graph. Use "weak" connection because bedtools will list "A overlaps B" and "B overlaps A"
    num_clusters, cluster_labels = scipy.sparse.csgraph.connected_components(sparse_connections, connection="weak")

    # Build lists of clustered Intervals
    clusters = [[] for _ in range(num_clusters)]
    for interval in clusterable_bedtool:
        cluster_label = cluster_labels[name_to_index[interval.name]]
        clusters[cluster_label].append(interval)

    # convert lists to numpy object arrays for faster indexing
    def _to_numpy_array(_cluster: List[pybedtools.Interval]) -> numpy.ndarray:
        # use builtin "object" dtype: the "numpy.object" alias is deprecated
        # and removed in numpy >= 1.24
        _cluster_array = numpy.empty((len(_cluster),), dtype=object)
        _cluster_array[:] = _cluster
        return _cluster_array

    return [_to_numpy_array(cluster) for cluster in clusters]


def _is_cpx(interval: pybedtools.Interval) -> int:
    """ returns 1 if this interval originated as a CPX interval, 0 otherwise """
    return int(interval.strand)  # is_cpx is stored in strand


def _is_not_cpx(interval: pybedtools.Interval) -> bool:
    """ returns False if this interval originated as a CPX interval, True otherwise """
    return int(interval.strand) == 0  # is_cpx is stored in strand


def _get_redundant_cluster_cnv_cpx_vids(
        cluster: numpy.ndarray,
        is_carrier: Mapping[Text, numpy.ndarray],
        cnv_cpx_sample_overlap: float
) -> Iterator[Text]:
    """
    Find CNVs that are redundant with CPX events
    for each sample that participates in this interval-cluster:
        join every variant ID that the sample participates in into a sample-cluster
        if the sample-cluster contains >= 1 CPX and >= 1 non-CPX:
            find logical-or carrier status over variant IDs in sample-cluster
            for every non-CPX variant ID in sample_cluster:
                if its Jaccard index is >= cnv_cpx_sample_overlap, it's redundant
    Parameters
    ----------
    cluster: numpy.ndarray
        numpy object array of pybedtools.Interval holding intervals that cluster together
    is_carrier: Mapping[Text, numpy.ndarray]
        Map from variant ID to boolean array that is True for samples called non-ref for this Variant, and False
        otherwise (including no-call).
    cnv_cpx_sample_overlap: float
        Minimum Jaccard index for variant interval to have with sample cluster in order for it to be redundant.
    Yields
    -------
    redundant_variant_id: str
        Successive redundant variant IDs
    """
    interval_is_cpx = numpy.fromiter((_is_cpx(interval) for interval in cluster), dtype=bool, count=len(cluster))

    def _is_valid_sample_cluster(indices_in_sample_cluster: numpy.ndarray) -> bool:
        # valid sample clusters have some CPX intervals and some non-CPX intervals
        return 0 < interval_is_cpx.take(indices_in_sample_cluster).sum() < len(indices_in_sample_cluster)

    if not _is_valid_sample_cluster(numpy.arange(len(cluster))):
        # no hope of finding valid sample-clusters if the whole thing won't work
        return

    # loop over unique combinations of intervals that are all non-ref for a single sample
    is_carrier_matrix = numpy.concatenate(
        [is_carrier[interval.name].reshape(1, -1) for interval in cluster], axis=0
    )

    for interval_in_potential_cluster in numpy.unique(is_carrier_matrix, axis=1).transpose():
        indices_in_potential_cluster = numpy.nonzero(interval_in_potential_cluster)[0]
        if not _is_valid_sample_cluster(indices_in_potential_cluster):
            continue  # not a valid cluster, skip it
        # can check jaccard index a little more quickly because each interval is in the cluster, so the intersection
        # is equal to the carrier status of the interval
        num_cluster_carriers = numpy.logical_or.reduce(
            is_carrier_matrix.take(indices_in_potential_cluster, axis=0), axis=0
        ).sum()
        for index in indices_in_potential_cluster:
            if not interval_is_cpx.take(index) and \
                    is_carrier_matrix.take(index, axis=0).sum() / num_cluster_carriers >= cnv_cpx_sample_overlap:
                yield cluster.take(index).name


def _find_cnv_cpx_redundancies(
        potentially_clusterable: pybedtools.BedTool,
        is_carrier: Mapping[Text, numpy.ndarray],
        min_cpx_reciprocal_overlap: float,
        cnv_cpx_reciprocal_overlap: float,
        cnv_cpx_sample_overlap: float
) -> Set[Text]:
    """
    Subset potentially clusterable intervals to those that meet required minimum overlap with a CPX event.
    Then find clusters, and remove redundant CNVs from those clusters.
    Parameters
    ----------
    potentially_clusterable: BedTool
        bed object with intervals that could potentially be used for clustering
    is_carrier: Mapping[Text, numpy.ndarray]
        Map from variant ID to boolean array that is True for samples called non-ref for this Variant, and False
        otherwise (including no-call).
    min_cpx_reciprocal_overlap: float
        Minimum reciprocal overlap with a CPX interval for a CNV interval to be clusterable.
    cnv_cpx_reciprocal_overlap: float
        Minimum reciprocal overlap between two intervals to be part of a cluster.
    cnv_cpx_sample_overlap: float
        Minimum Jaccard index for variant interval to have with sample cluster in order for it to be redundant.
    Returns
    -------
    vids_to_remove: Set[Text]
        Set of variant IDs that are redundant and should be removed from the output VCF.
    """
    # find all potentially clusterable intervals that meet required minimum overlap with CPX
    precluster_subset = potentially_clusterable.intersect(
        potentially_clusterable.filter(_is_cpx), u=True, f=min_cpx_reciprocal_overlap, r=True, sorted=True,
        nonamecheck=True
    )

    # find clusters of intervals with high reciprocal overlap, then check each cluster for redundant variant IDs
    return {
        variant_id
        for cluster in _get_clusters(precluster_subset, min_reciprocal_overlap=cnv_cpx_reciprocal_overlap)
        for variant_id in _get_redundant_cluster_cnv_cpx_vids(cluster, is_carrier,
                                                              cnv_cpx_sample_overlap=cnv_cpx_sample_overlap)
    }


def _update_cnv_cnv_redundances(
        vids_to_remove: Set[Text],
        potentially_clusterable: pybedtools.BedTool,
        is_carrier: Mapping[Text, numpy.ndarray],
        is_ref: Mapping[Text, numpy.ndarray],
        cnv_cnv_reciprocal_overlap: float,
        cnv_cnv_sample_overlap: float
):
    """
    Update vids_to_remove by finding CNVs that are redundant with other CNVs (as opposed to CPX)
        -Find CNVs with very high reciprocal overlap, and very high carrier sample Jaccard index
        -For each CNV that is connected to any other CNVs
            Add that CNV and all its connections to vids_to_remove
            Find the "best" CNV: the maximum choosing 1st by number of carriers, 2nd by number of called refs
            Add the best CNV to set of vids that will be put back in (no matter what, even if previously or
            subsequently "removed")
        -Update vids_to_remove by removing the "best" variant IDs

    Parameters
    ----------
    vids_to_remove: Set[Text]
        set of variant IDs that are redundant and should be removed. NOTE: this function updates this set in place.
    potentially_clusterable: BedTool
        bed object with intervals that could potentially be used for clustering
    is_carrier: Mapping[Text, numpy.ndarray]
        Map from variant ID to boolean array that is True for samples called non-ref for this Variant, and False
        otherwise (including no-call).
    is_ref: Mapping[Text, numpy.ndarray]
        Map from variant ID to boolean array that is True for samples called ref for this Variant, and False
        otherwise (including no-call).
    cnv_cnv_reciprocal_overlap: float
        minimum reciprocal overlap for two CNVs to be connected
    cnv_cnv_sample_overlap: float
        minimum carrier samples Jaccard index for two CNVs to be connected
    """
    # for each non-CPX interval, find all non-CPX intervals it has sufficient reciprocal overlap and sample overlap
    # with
    variant_pairwise_connections = {}

    non_cpx_potentially_clusterable = potentially_clusterable.filter(_is_not_cpx).saveas()
    for name_1, name_2 in _iter_pairwise_connections(
            non_cpx_potentially_clusterable, min_reciprocal_overlap=cnv_cnv_reciprocal_overlap,
            min_sample_overlap=cnv_cnv_sample_overlap, is_carrier=is_carrier
    ):
        variant_pairwise_connections[name_1] = variant_pairwise_connections.get(name_1, (name_1,)) + (name_2,)

    vids_to_remove.update(variant_pairwise_connections.keys())  # set all the clustered variants to be removed

    # for each of these variant and its direct connections
    #   - choose one "best" variant to represent it, with priority given to most carriers, followed by most ref calls
    #   - keep the "best" variant (even if it's previously or subsequently "removed") and remove all others
    num_carrier = {variant_id: variant_is_carrier.sum() for variant_id, variant_is_carrier in is_carrier.items()}
    num_ref = {variant_id: variant_is_ref.sum() for variant_id, variant_is_ref in is_ref.items()}

    def _best_variant_id(variant_id: Text) -> Tuple[int, int, Text]:
        # sort key: carriers, then ref calls, then variant ID as tie-breaker
        return num_carrier[variant_id], num_ref[variant_id], variant_id
    # then remove the best ones
    vids_to_remove.difference_update(
        max(variant_id_cluster, key=_best_variant_id) for variant_id_cluster in variant_pairwise_connections.values()
    )


def resolve_cpx_cnv_redundancies(
        input_vcf: Text,
        output_vcf: Text,
        min_cpx_reciprocal_overlap: float = Default.min_cpx_reciprocal_overlap,
        cnv_cpx_reciprocal_overlap: float = Default.cnv_cpx_reciprocal_overlap,
        cnv_cpx_sample_overlap: float = Default.cnv_cpx_sample_overlap,
        cnv_cnv_reciprocal_overlap: float = Default.cnv_cnv_reciprocal_overlap,
        cnv_cnv_sample_overlap: float = Default.cnv_cnv_sample_overlap,
        clusterable_sv_types: Set[Text] = Default.clusterable_sv_types,
        cpx_ins_classes: Set[Text] = Default.cpx_ins_classes,
        temp_dir: str = Default.temp_dir,
        num_threads: int = Default.num_threads
):
    f"""
    From input VCF, find redundant CNVs:
        CNVs that have sufficient reciprocal overlap and carrier sample Jaccard index with a CPX
        CNVs that have sufficient reciprocal overlap and carrier sample Jaccard index with another CNV
    Write new VCF without redundant CNVs.
    Parameters
    ----------
    input_vcf: Text
        path to input vcf
    output_vcf: Text
        path to write output vcf
    min_cpx_reciprocal_overlap: float (default={Default.min_cpx_reciprocal_overlap})
        Minimum reciprocal overlap with a CPX interval for a CNV interval to be clusterable.
    cnv_cpx_reciprocal_overlap: float (default={Default.cnv_cpx_reciprocal_overlap})
        Minimum reciprocal overlap between two intervals to be part of a cluster.
    cnv_cpx_sample_overlap: float (default={Default.cnv_cpx_sample_overlap})
        Minimum Jaccard index for variant interval to have with sample cluster in order for it to be redundant.
    cnv_cnv_reciprocal_overlap: float (default={Default.cnv_cnv_reciprocal_overlap})
        Minimum reciprocal overlap for two CNVs to be connected
    cnv_cnv_sample_overlap: float (default={Default.cnv_cnv_sample_overlap})
        Minimum carrier samples Jaccard index for two CNVs to be connected
    clusterable_sv_types: Set[Text] (default={Default.clusterable_sv_types})
        SV types that may be redundant (or needed for clustering with redundant SVs).
    cpx_ins_classes: Set[Text] (default={Default.cpx_ins_classes})
        CPX SV types that should produce an INS sink (modeled as a DEL)
    temp_dir: str (default={Default.temp_dir})
        Base folder to create new temp folder in.
    num_threads: int (default={Default.num_threads})
        Number of threads to use for compression/decompression of VCF files.
    """
    temp_dir = os.path.abspath(os.path.expanduser(temp_dir))
    os.makedirs(temp_dir, exist_ok=True)
    pybedtools.set_tempdir(temp_dir)
    is_carrier, is_ref = {}, {}
    with pysam.VariantFile(input_vcf, 'r', threads=num_threads) as f_in:
        header = f_in.header
        potentially_clusterable = pybedtools.BedTool(
            _vcf_records_to_bed_intervals(f_in.fetch(), is_carrier, is_ref,
                                          clusterable_sv_types=clusterable_sv_types,
                                          cpx_ins_classes=cpx_ins_classes)
        ).saveas().sort()

    # get all the potentially clusterable intervals
    vids_to_remove = _find_cnv_cpx_redundancies(
        potentially_clusterable, is_carrier, min_cpx_reciprocal_overlap=min_cpx_reciprocal_overlap,
        cnv_cpx_reciprocal_overlap=cnv_cpx_reciprocal_overlap, cnv_cpx_sample_overlap=cnv_cpx_sample_overlap
    )
    _update_cnv_cnv_redundances(
        vids_to_remove, potentially_clusterable, is_carrier, is_ref,
        cnv_cnv_reciprocal_overlap=cnv_cnv_reciprocal_overlap, cnv_cnv_sample_overlap=cnv_cnv_sample_overlap
    )

    output_folder = os.path.dirname(os.path.abspath(os.path.expanduser(output_vcf)))
    os.makedirs(output_folder, exist_ok=True)
    with pysam.VariantFile(input_vcf, 'r', threads=num_threads) as f_in, \
            pysam.VariantFile(output_vcf, 'w', header=header, threads=num_threads) as f_out:
        for record in f_in.fetch():
            if record.id not in vids_to_remove:
                f_out.write(record)


def __parse_arguments(argv: List[Text]) -> argparse.Namespace:
    # noinspection PyTypeChecker
    parser = argparse.ArgumentParser(
        description="Remove CNVs that are redundant with CPX variants, or each other",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter
    )
    parser.add_argument("input_vcf", type=str, help="VCF with potentially redundant CNVs")
    parser.add_argument("output_vcf", type=str, help="VCF with redundant CNVs removed")
    parser.add_argument("--min-cpx-reciprocal-overlap", type=float, default=Default.min_cpx_reciprocal_overlap,
                        help="Minimum reciprocal overlap with a CPX for an interval to be possibly redundant")
    parser.add_argument("--cnv-cpx-reciprocal-overlap", type=float, default=Default.cnv_cpx_reciprocal_overlap,
                        help="Minimum reciprocal interval overlap for clustering CNV with CPX")
    parser.add_argument("--cnv-cpx-sample-overlap", type=float, default=Default.cnv_cpx_sample_overlap,
                        help="Minimum Jaccard index (intersection/union) of samples for clustering CNV with CPX")
    parser.add_argument("--cnv-cnv-reciprocal-overlap", type=float, default=Default.cnv_cnv_reciprocal_overlap,
                        help="Minimum reciprocal interval overlap for clustering CNV with other CNV")
    parser.add_argument("--cnv-cnv-sample-overlap", type=float, default=Default.cnv_cnv_sample_overlap,
                        help="Minimum Jaccard index (intersection/union) of samples for clustering CNV with other CNV")
    parser.add_argument("--temp-dir", "-t", type=str, default=Default.temp_dir, help="directory for temp files")
    parser.add_argument("--num-threads", type=int, default=Default.num_threads,
                        help="number of threads for compressing/decompressing bgzipped files")

    if len(argv) <= 1:
        parser.parse_args(["--help"])
        sys.exit(0)
    parsed_arguments = parser.parse_args(argv[1:])
    if parsed_arguments.input_vcf is None:
        raise ValueError("Must supply input-vcf")
    if parsed_arguments.output_vcf is None:
        raise ValueError("Must supply output-vcf")

    return parsed_arguments


def main(argv: Optional[List[Text]] = None):
    if argv is None:
        argv = sys.argv
    arguments = __parse_arguments(argv)
    resolve_cpx_cnv_redundancies(**vars(arguments))


if __name__ == "__main__":
    main()
a/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_backend_part1.sh b/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_backend_part1.sh deleted file mode 100755 index bf9d4994e..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_backend_part1.sh +++ /dev/null @@ -1,419 +0,0 @@ -#!/bin/bash - -# Intelligently shards a VCF prior to complex resolution (for parallelization) - -# Subsetted to first half, to just output lists of which variants should go in each shard - -set -Eeu -o pipefail - -# ARGS defaults and hard-coded values -DIST=${DEFAULT_DIST:-1000} -RECIP=${DEFAULT_RECIP:-0.1} -MIN_LINES_PER_SHARD=${DEFAULT_MIN_LINES_PER_SHARD:-10} -MAX_SHARDS=${DEFAULT_MAX_SHARDS:-100} -NONCLUSTER_SHARDS=${DEFAULT_NONCLUSTER_SHARDS:-30} -PREFIX=${DEFAULT_PREFIX:-"vcf_shard"} -BREAKPOINT_PADDING=${DEFAULT_BREAKPOINT_PADDING:-5000} -IGNORE_SV_TYPES=${DEFAULT_IGNORE_SV_TYPES:-false} -ADD_SINGLE_REC=${DEFAULT_ADD_SINGLE_REC:-false} -SHARD_LARGE_CLUSTERS=${DEFAULT_SHARD_LARGE_CLUSTERS:-true} -SCRIPT_NAME=${SCRIPT_NAME:-$(basename "${BASH_SOURCE[0]}")} -#Set path to execution directory -BIN=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd ) -#Set default output folder -OUTDIR=`pwd` - -###USAGE -usage(){ -cat < ${SHARD_VCF_TMP}/single_sample_input.vcf.gz -#Identify all candidate complex variant clusters (generous 1kb clustering) -svtk vcfcluster \ - -d ${DIST} \ - -f ${RECIP} \ - -p candidate_complex_clusters \ - ${IGNORE_SV_TYPES_ARG} \ - -o 0 \ - --preserve-ids \ - <( echo "${SHARD_VCF_TMP}/single_sample_input.vcf.gz" ) \ - ${SHARD_VCF_TMP}/input_vcf.clustered.vcf - -#Convert clustered variants to bed -svtk vcf2bed \ - --no-samples \ - --info ALL \ - ${SHARD_VCF_TMP}/input_vcf.clustered.vcf \ - ${SHARD_VCF_TMP}/input_vcf.clustered.bed - -#Write list of clusters with >1 constituent variant -mem_idx=$( head -n1 ${SHARD_VCF_TMP}/input_vcf.clustered.bed \ - | sed 's/\t/\n/g' | awk '{ if ($1=="MEMBERS") print NR }' ) -awk -v idx=${mem_idx} -v OFS="\t" 
'$idx ~ /,/ { print $1, $2, $3, $idx }' \ - ${SHARD_VCF_TMP}/input_vcf.clustered.bed \ - | (grep -v "^#" || printf "") \ - > ${SHARD_VCF_TMP}/candidate_complex_clusters.bed -if ADD_SINGLE_REC; then - #Add all non-CNV single-record variants - class_idx=$( head -n1 ${SHARD_VCF_TMP}/input_vcf.clustered.bed | \ - sed 's/\t/\n/g' | awk '{ if ($1=="SVTYPE") print NR }' ) - awk -v idx=${class_idx} -v mem_idx=${mem_idx} -v OFS="\t" \ - '$mem_idx !~ /,/ { print $1, $2, $3, $idx, $mem_idx }' \ - ${SHARD_VCF_TMP}/input_vcf.clustered.bed \ - | awk -v OFS="\t" '$4 !~ /DEL|DUP|CNV|MCNV|mCNV/ { print $1, $2, $3, $5 }' \ - | (grep -v "^#" || printf "") \ - >> ${SHARD_VCF_TMP}/candidate_complex_clusters.bed -fi - -#Get min/max coordinates of all variants in list of VIDs -{ - zcat ${VCF} \ - | (grep "^#" || printf "") \ - | cut -f1-10; - cut -f4 ${SHARD_VCF_TMP}/candidate_complex_clusters.bed \ - | sed 's/\,/\n/g' \ - | sort -Vk1,1 \ - | uniq \ - | (fgrep -wf - <(zcat ${VCF}) || printf "") \ - | cut -f1-10; -} \ - | svtk vcf2bed --no-samples /dev/stdin /dev/stdout \ - > ${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed - -###ONLY PERFORM CLUSTER-BASED SHARDING IF ANY VARIANTS PREDICTED TO CLUSTER -if grep -vq "^#" ${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed; then - #Split into breakpoints and pad all breakpoints by ±BREAKPOINT_PADDING - ###DETERMINE SET OF NONREDUNDANT INTERVALS FOR ALL CLUSTERS - grep -v "^#" ${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed \ - | awk -v OFS="\t" -v buffer=$BREAKPOINT_PADDING \ - '{ print $1, $2-buffer, $2+buffer, $4; - print $1, $3-buffer, $3+buffer, $4 }' \ - | awk -v OFS="\t" '{ if ($2<1) $2=1; print $1, $2, $3, $4 }' \ - | sort -Vk1,1 -k2,2n -k3,3n \ - | bedtools merge -i - -c 4 -o distinct \ - > ${SHARD_VCF_TMP}/breakpoint_intervals.bed - #Iterate over breakpoint intervals and write list of maximum nonredundant intervals - in_cluster=`mktemp` - remaining=`mktemp` - cp 
${SHARD_VCF_TMP}/breakpoint_intervals.bed ${remaining} - while read chr start end VIDs; do - #Get all lines associated with current VIDs - echo -e "${VIDs}" \ - | sed 's/,/\n/g' \ - | (fgrep -wf - ${remaining} || printf "") \ - > ${in_cluster} - #Only run if at least one line added to ${in_cluster} - if [ -s "${in_cluster}" ]; then - #Exclude all lines in ${in_cluster} from ${remaining} - bedtools intersect -v -a ${remaining} -b ${in_cluster} > ${remaining}2 - mv ${remaining}2 ${remaining} - #Iterate until no more related VIDs are present in ${remaining} - until ! cut -f4 ${in_cluster} | sed 's/\,/\n/g' | fgrep -qwf - ${remaining}; do - #Add new lines to ${in_cluster} - cut -f4 ${in_cluster} \ - | sed 's/\,/\n/g' \ - | (fgrep -wf - ${remaining} || printf "") \ - >> ${in_cluster} - #Exclude all lines in ${in_cluster} from ${remaining} - bedtools intersect -v -a ${remaining} -b ${in_cluster} > ${remaining}2 - mv ${remaining}2 ${remaining} - done - #Write out final interval - for wrapper in 1; do - #Print list of coordinates - cut -f1-3 ${in_cluster} \ - | sort -Vk1,1 -k2,2 -k3,3 \ - | bedtools merge -i - \ - | awk '{ print $1":"$2"-"$3 }' \ - | paste -s -d\; - #Print list of involved VIDs - cut -f4 ${in_cluster} \ - | sed 's/,/\n/g' \ - | sort \ - | uniq \ - | paste -s -d, - done | paste -s - fi - done < ${SHARD_VCF_TMP}/breakpoint_intervals.bed \ - > ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt - - if SHARD_LARGE_CLUSTERS; then - #Pull out exceptionally large clusters to the side to be placed in their own shards - while read ints VIDs; do - if [ $( echo ${VIDs} | sed 's/,/\n/g' | wc -l ) -ge ${MIN_LINES_PER_SHARD} ]; then - echo -e "${ints}\t${VIDs}" - fi - done < ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - > ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt - if [ -s "${SHARD_VCF_TMP}/large_intervals_to_test.final.txt" ]; then - cut -f2 ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt \ - | sed 's/,/\n/g' \ - | (fgrep -wvf - 
${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt || printf "") \ - > ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt2 - mv ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt2 \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt - fi - fi - - ###DETERMINE COORDINATES FOR EACH SHARD - #Split variants into shards based on number of variants - #If total number of intervals/MAX_SHARDS < MIN_LINES_PER_SHARD, evenly split into MIN_LINES_PER_SHARD sites per shard - if [ $(( $( wc -l < "${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt" ) / ${MAX_SHARDS} )) -lt ${MIN_LINES_PER_SHARD} ]; then - ${BIN}/evenSplitter.R \ - -L ${MIN_LINES_PER_SHARD} \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_ - #Otherwise, split into MAX_SHARDS evenly-sized shards - else - ${BIN}/evenSplitter.R \ - -S ${MAX_SHARDS} \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_ - fi - #Determine number of shards - n_shards=$( find ${SHARD_VCF_TMP} -name "${PREFIX}.shard_intervals_*" | wc -l ) - #Writes exceptionally large clusters to their own shards - if [ -s "${SHARD_VCF_TMP}/large_intervals_to_test.final.txt" ]; then - while read ints VIDs; do - ((++n_shards)) - echo -e "${ints}\t${VIDs}" > ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${n_shards} - done < ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt - fi - #Reformat interval shards - for i in $( seq 1 ${n_shards} ); do - cut -f1 ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i} \ - | sed -e 's/\;/\n/g' -e 's/\:/\t/g' -e 's/\-/\t/g' \ - | sort -Vk1,1 -k2,2n -k3,3n \ - | bedtools merge -i - \ - > ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i}.bed - rm ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i} - done - - - ###SHARD CLUSTERABLE VCF - #Convert full, original VCF to BED - svtk vcf2bed --no-samples ${VCF} int.bed - #Harrison's patch for sharding - awk '{ if ($1!~"#") { print $1,$2,$2+1,$4,$5; - print $1,$3-1,$3,$4,$5 } - 
else print }' OFS='\t' int.bed \ - > ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed - rm int.bed - - #Create exclusion list of VIDs already used in earlier shards - touch ${SHARD_VCF_TMP}/used_VIDs.tmp - #Iterate over all sharded intervals - for i in $( seq 1 ${n_shards} ); do - if [ ${i} -gt 1 ]; then - cat ${OUTDIR}/${PREFIX}.shard_*.VIDs.list \ - | sort \ - | uniq \ - > ${SHARD_VCF_TMP}/used_VIDs.tmp - else - touch ${SHARD_VCF_TMP}/used_VIDs.tmp - fi - #Get list of IDs to be used in shard - bedtools intersect \ - -a ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed \ - -b ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i}.bed \ - | cut -f4 \ - | (fgrep -wvf ${SHARD_VCF_TMP}/used_VIDs.tmp || printf "")\ - | sort \ - | uniq \ - > ${OUTDIR}/${PREFIX}.shard_${i}.VIDs.list - - #Update exclusion list of VIDs already used in earlier shards - cat ${OUTDIR}/${PREFIX}.shard_*.VIDs.list \ - | sort \ - | uniq \ - > ${SHARD_VCF_TMP}/used_VIDs.tmp - done - - #Write list of eligible VIDs - (grep -v "^#" ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed || printf "") \ - | cut -f4 \ - | (fgrep -wvf ${SHARD_VCF_TMP}/used_VIDs.tmp || printf "") \ - | sort \ - | uniq \ - > ${SHARD_VCF_TMP}/remaining_VIDs.list -else - n_shards=0 - zcat ${VCF} \ - | (grep -v "^#" || printf "") \ - | cut -f3 \ - | sort \ - | uniq \ - > ${SHARD_VCF_TMP}/remaining_VIDs.list -fi - - -###SHARD NONCLUSTERABLE VCF -#Shard remainder intervals into no more than $NONCLUSTER_SHARDS shards -#If total number of variants/NONCLUSTER_SHARDS < MIN_LINES_PER_SHARD, evenly split into MIN_LINES_PER_SHARD sites per shard -if [ -s "${SHARD_VCF_TMP}/remaining_VIDs.list" ]; then - if [ $(( $(wc -l < ${SHARD_VCF_TMP}/remaining_VIDs.list) / ${NONCLUSTER_SHARDS} )) -lt ${MIN_LINES_PER_SHARD} ]; then - ${BIN}/evenSplitter.R \ - -L ${MIN_LINES_PER_SHARD} \ - ${SHARD_VCF_TMP}/remaining_VIDs.list \ - ${SHARD_VCF_TMP}/${PREFIX}.remaining_variants_ - #Otherwise, split into MAX_SHARDS evenly-sized shards - else - ${BIN}/evenSplitter.R \ - -S ${NONCLUSTER_SHARDS} \ 
- ${SHARD_VCF_TMP}/remaining_VIDs.list \ - ${SHARD_VCF_TMP}/${PREFIX}.remaining_variants_ - fi - n_nonclusterable_shards=$( find ${SHARD_VCF_TMP} -name "${PREFIX}.remaining_variants_*" | wc -l ) - #Iterate over all sharded variant lists - for i in $( seq 1 ${n_nonclusterable_shards} ); do - idx=$(( ${n_shards} + ${i} )) - mv ${SHARD_VCF_TMP}/${PREFIX}.remaining_variants_${i} \ - ${OUTDIR}/${PREFIX}.shard_${idx}.VIDs.list - done -fi - -###CLEAN UP -rm -rf ${SHARD_VCF_TMP} - -1>&2 echo "$(basename $0): Success" diff --git a/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preClustering.sh b/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preClustering.sh deleted file mode 100755 index 2d3062daa..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preClustering.sh +++ /dev/null @@ -1,328 +0,0 @@ -#!/bin/bash - -# Intelligently shards a VCF prior to complex resolution (for parallelization) - -set -e - -###USAGE -usage(){ -cat < ${SHARD_VCF_TMP}/single_sample_input.vcf.gz -#Identify all candidate complex variant clusters (generous 1kb clustering) -svtk vcfcluster \ - -d ${DIST} \ - -f ${RECIP} \ - -p candidate_complex_clusters \ - -o 0 \ - --preserve-ids \ - <( echo "${SHARD_VCF_TMP}/single_sample_input.vcf.gz" ) \ - ${SHARD_VCF_TMP}/input_vcf.clustered.vcf -#Convert clustered variants to bed -svtk vcf2bed \ - --no-samples \ - --info ALL \ - ${SHARD_VCF_TMP}/input_vcf.clustered.vcf \ - ${SHARD_VCF_TMP}/input_vcf.clustered.bed -#Write list of clusters with >1 constituent variant -mem_idx=$( head -n1 ${SHARD_VCF_TMP}/input_vcf.clustered.bed | \ - sed 's/\t/\n/g' | awk '{ if ($1=="MEMBERS") print NR }' ) -awk -v idx=${mem_idx} -v OFS="\t" \ -'$idx ~ /,/ { print $1, $2, $3, $idx }' \ -${SHARD_VCF_TMP}/input_vcf.clustered.bed | fgrep -v "#" > \ -${SHARD_VCF_TMP}/candidate_complex_clusters.bed -#Get min/max coordinates of all variants in list of VIDs -cat <( zcat ${VCF} | fgrep "#" | cut -f1-10 ) \ -<( cut -f4 
${SHARD_VCF_TMP}/candidate_complex_clusters.bed | \ - sed 's/\,/\n/g' | sort -Vk1,1 | uniq | fgrep -wf - \ - <( zcat ${VCF} ) | cut -f1-10 ) | \ -svtk vcf2bed --no-samples /dev/stdin /dev/stdout > \ -${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed - - -###DETERMINE SET OF NONREDUNDANT INTERVALS FOR ALL CLUSTERS -#Split into breakpoints and pad all breakpoints by ±5kb -fgrep -v "#" ${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed | \ -awk -v OFS="\t" -v buffer=5000 \ -'{ print $1, $2-buffer, $2+buffer, $4"\n"$1, $3-buffer, $3+buffer, $4 }' | \ -awk -v OFS="\t" '{ if ($2<1) $2=1; print $1, $2, $3, $4 }' | \ -sort -Vk1,1 -k2,2n -k3,3n | bedtools merge -i - -c 4 -o distinct > \ -${SHARD_VCF_TMP}/breakpoint_intervals.bed -#Iterate over breakpoint intervals and write list of maximum nonredundant intervals -in_cluster=`mktemp` -remaining=`mktemp` -cp ${SHARD_VCF_TMP}/breakpoint_intervals.bed ${remaining} -while read chr start end VIDs; do - #Get all lines associated with current VIDs - echo -e "${VIDs}" | sed 's/,/\n/g' | fgrep -wf - \ - ${remaining} > ${in_cluster} - #Only run if at least one line added to ${in_cluster} - if [ $( cat ${in_cluster} | wc -l ) -gt 0 ]; then - #Exclude all lines in ${in_cluster} from ${remaining} - bedtools intersect -v -a ${remaining} -b ${in_cluster} > ${remaining}2 - mv ${remaining}2 ${remaining} - #Iterate until no more related VIDs are present in ${remaining} - until [ $( cut -f4 ${in_cluster} | sed 's/\,/\n/g' | fgrep -wf - ${remaining} | wc -l ) -eq 0 ]; do - #Add new lines to ${in_cluster} - cut -f4 ${in_cluster} | sed 's/\,/\n/g' | fgrep -wf - ${remaining} >> ${in_cluster} - #Exclude all lines in ${in_cluster} from ${remaining} - bedtools intersect -v -a ${remaining} -b ${in_cluster} > ${remaining}2 - mv ${remaining}2 ${remaining} - done - #Write out final interval - for wrapper in 1; do - #Print list of coordinates - cut -f1-3 ${in_cluster} | sort -Vk1,1 -k2,2 -k3,3 | bedtools merge -i - | \ - 
awk '{ print $1":"$2"-"$3 }' | paste -s -d\; - #Print list of involved VIDs - cut -f4 ${in_cluster} | sed 's/,/\n/g' | sort | uniq | paste -s -d, - done | paste -s - fi -done < ${SHARD_VCF_TMP}/breakpoint_intervals.bed > \ -${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt - - -###DETERMINE COORDINATES FOR EACH SHARD -#Split variants into shards based on number of variants -#If total number of intervals/MAX_SHARDS < MIN_LINES_PER_SHARD, evenly split into MIN_LINES_PER_SHARD sites per shard -if [ $(( $( cat ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt | wc -l ) / ${MAX_SHARDS} )) -lt ${MIN_LINES_PER_SHARD} ]; then - ${BIN}/evenSplitter.R \ - -L ${MIN_LINES_PER_SHARD} \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_ -#Otherwise, split into MAX_SHARDS evenly-sized shards -else - ${BIN}/evenSplitter.R \ - -S ${MAX_SHARDS} \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_ -fi -#Determine number of shards -n_shards=$( find ${SHARD_VCF_TMP} -name "${PREFIX}*" | wc -l ) -#Reformat interval shards -for i in $( seq 1 ${n_shards} ); do - cut -f1 ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i} | \ - sed -e 's/\;/\n/g' -e 's/\:/\t/g' -e 's/\-/\t/g' | \ - sort -Vk1,1 -k2,2n -k3,3n | bedtools merge -i - > \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i}.bed - rm ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i} -done - - -###SHARD CLUSTERABLE VCF -#Convert full, original VCF to BED -svtk vcf2bed --no-samples \ - ${VCF} int.bed -#Harrison's patch for sharding -awk '{if ($1!~"#") print $1,$2,$2+1,$4,$5 \ - "\n" $1,$3-1,$3,$4,$5;else print}' OFS='\t' int.bed \ - > ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed -rm int.bed -#Iterate over all sharded intervals -for i in $( seq 1 ${n_shards} ); do - #Write exclusion list of VIDs already used in earlier shards - touch ${SHARD_VCF_TMP}/used_VIDs.tmp - if [ ${i} -gt 1 ]; then - for j in $( seq 1 $(( ${i} - 1 )) ); do - cat 
${SHARD_VCF_TMP}/${PREFIX}.shard_${j}.VIDs.list - done | sort | uniq > ${SHARD_VCF_TMP}/used_VIDs.tmp - fi - #Get list of IDs to be used in shard - bedtools intersect \ - -a ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed \ - -b ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i}.bed | \ - cut -f4 | fgrep -wvf ${SHARD_VCF_TMP}/used_VIDs.tmp > \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_${i}.VIDs.list - #Print header - zcat ${VCF} | head -n1000 | fgrep "#" > \ - ${OUTDIR}/${PREFIX}.shard_${i}.vcf - #Shard based on VIDs (slower than tabix, but avoids omitting variants) - zcat ${VCF} | fgrep -wf ${SHARD_VCF_TMP}/${PREFIX}.shard_${i}.VIDs.list >> \ - ${OUTDIR}/${PREFIX}.shard_${i}.vcf - #Sanity check shard - if [ $( fgrep -v "#" ${OUTDIR}/${PREFIX}.shard_${i}.vcf | wc -l ) -gt 0 ]; then - #Bgzip & tabix shard - bgzip -f ${OUTDIR}/${PREFIX}.shard_${i}.vcf - tabix -f ${OUTDIR}/${PREFIX}.shard_${i}.vcf.gz - else - rm ${OUTDIR}/${PREFIX}.shard_${i}.vcf - fi - #Clean up used VID list - rm ${SHARD_VCF_TMP}/used_VIDs.tmp -done - - -###SHARD NONCLUSTERABLE VCF -#Get list of variant IDs not present in any previous shard -vcf-concat ${OUTDIR}/${PREFIX}.shard_*.vcf.gz \ - | fgrep -v "#" | cut -f3 \ - > ${SHARD_VCF_TMP}/used_VIDs.tmp -#Get list of eligible variant IDs -zcat ${VCF} | fgrep -v "#" | cut -f3 \ - | fgrep -wvf ${SHARD_VCF_TMP}/used_VIDs.tmp \ - > ${SHARD_VCF_TMP}/remaining_VIDs.tmp -#Shard remainder intervals into no more than $NONCLUSTER_SHARDS shards -#If total number of variants/NONCLUSTER_SHARDS < MIN_LINES_PER_SHARD, evenly split into MIN_LINES_PER_SHARD sites per shard -if [ $(( $( cat ${SHARD_VCF_TMP}/remaining_VIDs.tmp | wc -l ) / ${NONCLUSTER_SHARDS} )) -lt ${MIN_LINES_PER_SHARD} ]; then - ${BIN}/evenSplitter.R \ - -L ${MIN_LINES_PER_SHARD} \ - ${SHARD_VCF_TMP}/remaining_VIDs.tmp \ - ${SHARD_VCF_TMP}/${PREFIX}.remaining_variants_ -#Otherwise, split into MAX_SHARDS evenly-sized shards -else - ${BIN}/evenSplitter.R \ - -S ${NONCLUSTER_SHARDS} \ - ${SHARD_VCF_TMP}/remaining_VIDs.tmp 
\ - ${SHARD_VCF_TMP}/${PREFIX}.remaining_variants_ -fi -n_nonclusterable_shards=$( find ${SHARD_VCF_TMP} -name "${PREFIX}.remaining_variants_*" | wc -l ) -#Iterate over all sharded variant lists -for i in $( seq 1 ${n_nonclusterable_shards} ); do - #Print header - zcat ${VCF} | head -n1000 | fgrep "#" > \ - ${OUTDIR}/${PREFIX}.shard_$(( ${n_shards} + ${i} )).vcf - #Shard based on VIDs (slower than tabix, but avoids omitting variants) - zcat ${VCF} | fgrep -wf ${SHARD_VCF_TMP}/${PREFIX}.remaining_variants_${i} >> \ - ${OUTDIR}/${PREFIX}.shard_$(( ${n_shards} + ${i} )).vcf - #Sanity check shard - if [ $( fgrep -v "#" ${OUTDIR}/${PREFIX}.shard_$(( ${n_shards} + ${i} )).vcf | wc -l ) -gt 0 ]; then - #Bgzip & tabix shard - bgzip -f ${OUTDIR}/${PREFIX}.shard_$(( ${n_shards} + ${i} )).vcf - tabix -f ${OUTDIR}/${PREFIX}.shard_$(( ${n_shards} + ${i} )).vcf.gz - else - rm ${OUTDIR}/${PREFIX}.shard_$(( ${n_shards} + ${i} )).vcf - fi -done - - -###CLEAN UP -rm -rf ${SHARD_VCF_TMP} diff --git a/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preClustering_part1.sh b/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preClustering_part1.sh deleted file mode 100755 index 7434b0eaa..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preClustering_part1.sh +++ /dev/null @@ -1,24 +0,0 @@ -#!/bin/bash - -# Intelligently shards a VCF prior to complex resolution (for parallelization) - -# Subsetted to first half, to just output lists of which variants should go in each shard - -set -Eeu -o pipefail - -# ARGS defaults and hard-coded values -# generous 1kb clustering, 10% RO clustering -DEFAULT_DIST=1000 -DEFAULT_RECIP=0.1 -DEFAULT_MIN_LINES_PER_SHARD=10 -DEFAULT_MAX_SHARDS=100 -DEFAULT_NONCLUSTER_SHARDS=30 -DEFAULT_PREFIX="vcf_shard" -DEFAULT_BREAKPOINT_PADDING=5000 -DEFAULT_IGNORE_SV_TYPES=false -DEFAULT_ADD_SINGLE_REC=false -DEFAULT_SHARD_LARGE_CLUSTERS=true -SCRIPT_NAME=$(basename "${BASH_SOURCE[0]}") - -BIN=$( cd "$( dirname "${BASH_SOURCE[0]}" )" && 
pwd ) -source $BIN/shardVCF_backend_part1.sh diff --git a/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preResolveCPX.sh b/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preResolveCPX.sh deleted file mode 100755 index 0793f75c7..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preResolveCPX.sh +++ /dev/null @@ -1,368 +0,0 @@ -#!/bin/bash - -# Intelligently shards a VCF prior to complex resolution (for parallelization) - -set -euo pipefail - -###USAGE -usage(){ -cat < ${SHARD_VCF_TMP}/single_sample_input.vcf.gz -#Identify all candidate complex variant clusters (generous 1kb clustering) -svtk vcfcluster \ - -d 1000 \ - -f 0 \ - -p candidate_complex_clusters \ - --ignore-svtypes \ - -o 0 \ - --preserve-ids \ - <( echo "${SHARD_VCF_TMP}/single_sample_input.vcf.gz" ) \ - ${SHARD_VCF_TMP}/input_vcf.clustered.vcf -#Convert clustered variants to bed -svtk vcf2bed \ - --no-samples \ - --info ALL \ - ${SHARD_VCF_TMP}/input_vcf.clustered.vcf \ - ${SHARD_VCF_TMP}/input_vcf.clustered.bed -#Write list of clusters with >1 constituent variant -mem_idx=$( head -n1 ${SHARD_VCF_TMP}/input_vcf.clustered.bed | \ - sed 's/\t/\n/g' | awk '{ if ($1=="MEMBERS") print NR }' ) -awk -v idx=${mem_idx} -v OFS="\t" \ -'$idx ~ /,/ { print $1, $2, $3, $idx }' \ -${SHARD_VCF_TMP}/input_vcf.clustered.bed | fgrep -v "#" > \ -${SHARD_VCF_TMP}/candidate_complex_clusters.bed -#Add all non-CNV single-record variants -class_idx=$( head -n1 ${SHARD_VCF_TMP}/input_vcf.clustered.bed | \ - sed 's/\t/\n/g' | awk '{ if ($1=="SVTYPE") print NR }' ) -awk -v idx=${class_idx} -v mem_idx=${mem_idx} -v OFS="\t" \ -'$mem_idx !~ /,/ { print $1, $2, $3, $idx, $mem_idx }' \ -${SHARD_VCF_TMP}/input_vcf.clustered.bed | \ -awk -v OFS="\t" '$4 !~ /DEL|DUP|CNV|MCNV|mCNV/ { print $1, $2, $3, $5 }' | fgrep -v "#" >> \ -${SHARD_VCF_TMP}/candidate_complex_clusters.bed -#Get min/max coordinates of all variants in list of VIDs -cat <( zcat ${VCF} | fgrep "#" | cut -f1-10 ) \ -<( cut -f4 
${SHARD_VCF_TMP}/candidate_complex_clusters.bed | \ - sed 's/\,/\n/g' | sort -Vk1,1 | uniq | { fgrep -wf - <( zcat ${VCF} ) || true; } \ - | cut -f1-10 ) | \ -svtk vcf2bed --no-samples /dev/stdin /dev/stdout > \ -${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed - - -#Split into breakpoints and pad all breakpoints by ±0kb -#DEV NOTE: padding breakpoints for large chromosomes & many samples was causing -# issues where tens of thousands of breakpoints would end up in the same shard -# and take >36h to resolve, defeating the purpose of sharding -fgrep -v "#" ${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed | \ -awk -v OFS="\t" -v buffer=0 \ -'{ print $1, $2-buffer, $2+buffer, $4"\n"$1, $3-buffer, $3+buffer, $4 }' | \ -awk -v OFS="\t" '{ if ($2<1) $2=1; print $1, $2, $3, $4 }' | \ -sort -Vk1,1 -k2,2n -k3,3n | bedtools merge -i - -c 4 -o distinct > \ -${SHARD_VCF_TMP}/breakpoint_intervals.bed -#Iterate over breakpoint intervals and write list of maximum nonredundant intervals -in_cluster=`mktemp` -remaining=`mktemp` -cp ${SHARD_VCF_TMP}/breakpoint_intervals.bed ${remaining} -while read chr start end VIDs; do - #Get all lines associated with current VIDs - echo -e "${VIDs}" | sed 's/,/\n/g' | { fgrep -wf - ${remaining} || true; } > ${in_cluster} - #Only run if at least one line added to ${in_cluster} - if [ $( cat ${in_cluster} | wc -l ) -gt 0 ]; then - #Exclude all lines in ${in_cluster} from ${remaining} - bedtools intersect -v -a ${remaining} -b ${in_cluster} > ${remaining}2 - mv ${remaining}2 ${remaining} - #Iterate until no more related VIDs are present in ${remaining} - until [ $( cut -f4 ${in_cluster} | sed 's/\,/\n/g' | { fgrep -wf - ${remaining} || true; } | wc -l ) -eq 0 ]; do - #Add new lines to ${in_cluster} - cut -f4 ${in_cluster} | sed 's/\,/\n/g' | { fgrep -wf - ${remaining} || true; } >> ${in_cluster} - #Exclude all lines in ${in_cluster} from ${remaining} - bedtools intersect -v -a ${remaining} -b ${in_cluster} > 
${remaining}2 - mv ${remaining}2 ${remaining} - done - #Write out final interval - for wrapper in 1; do - #Print list of coordinates - cut -f1-3 ${in_cluster} | sort -Vk1,1 -k2,2 -k3,3 | bedtools merge -i - | \ - awk '{ print $1":"$2"-"$3 }' | paste -s -d\; - #Print list of involved VIDs - cut -f4 ${in_cluster} | sed 's/,/\n/g' | sort | uniq | paste -s -d, - done | paste -s - fi -done < ${SHARD_VCF_TMP}/breakpoint_intervals.bed > \ -${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt -#Pull out exceptionally large clusters to the side to be placed in their own shards -while read ints VIDs; do - if [ $( echo ${VIDs} | sed 's/,/\n/g' | wc -l ) -ge ${MIN_LINES_PER_SHARD} ]; then - echo -e "${ints}\t${VIDs}" - fi -done < ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - > ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt -if [ $( cat ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt | wc -l ) -gt 0 ]; then - cut -f2 ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt \ - | sed 's/,/\n/g' \ - | { fgrep -wvf - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt || true; } \ - > ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt2 - mv ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt2 \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt -fi - - -###DETERMINE COORDINATES FOR EACH SHARD -#Split variants into shards based on number of variants -#If total number of intervals/MAX_SHARDS < MIN_LINES_PER_SHARD, evenly split into MIN_LINES_PER_SHARD sites per shard -if [ $(( $( cat ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt | wc -l ) / ${MAX_SHARDS} )) -lt ${MIN_LINES_PER_SHARD} ]; then - ${BIN}/evenSplitter.R \ - -L ${MIN_LINES_PER_SHARD} \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_ -#Otherwise, split into MAX_SHARDS evenly-sized shards -else - ${BIN}/evenSplitter.R \ - -S ${MAX_SHARDS} \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_ 
-fi -#Writes exceptionally large clusters to their own shards -n_shards=$( find ${SHARD_VCF_TMP} -name "${PREFIX}*" | wc -l ) -if [ $( cat ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt | wc -l ) -gt 0 ]; then - while read ints VIDs; do - n_shards=$(( ${n_shards} + 1 )) - echo -e "${ints}\t${VIDs}" > ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${n_shards} - done < ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt -fi -#Reformat interval shards -for i in $( seq 1 ${n_shards} ); do - cut -f1 ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i} | \ - sed -e 's/\;/\n/g' -e 's/\:/\t/g' -e 's/\-/\t/g' | \ - sort -Vk1,1 -k2,2n -k3,3n | bedtools merge -i - > \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i}.bed - rm ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i} -done - - -###SHARD VCF -#Convert full, original VCF to BED -svtk vcf2bed --no-samples \ - ${VCF} int.bed -#Harrison's patch for sharding -awk '{if ($1!~"#") print $1,$2,$2+1,$4,$5 \ - "\n" $1,$3-1,$3,$4,$5;else print}' OFS='\t' int.bed \ - > ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed -rm int.bed -#Iterate over all sharded intervals -for i in $( seq 1 $(( ${n_shards} )) ); do - #Write exclusion list of VIDs already used in earlier shards - touch ${SHARD_VCF_TMP}/used_VIDs.tmp - if [ ${i} -gt 1 ]; then - for j in $( seq 1 $(( ${i} - 1 )) ); do - cat ${SHARD_VCF_TMP}/${PREFIX}.shard_${j}.VIDs.list - done | sort | uniq > ${SHARD_VCF_TMP}/used_VIDs.tmp - fi - #Get list of IDs to be used in shard - bedtools intersect \ - -a ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed \ - -b ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i}.bed | \ - cut -f4 | { fgrep -wvf ${SHARD_VCF_TMP}/used_VIDs.tmp || true; } > \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_${i}.VIDs.list - #Print header - zcat ${VCF} | head -n1000 | fgrep "#" > \ - ${OUTDIR}/${PREFIX}.shard_${i}.vcf - #Shard based on VIDs (slower than tabix, but avoids omitting variants) - zcat ${VCF} | { fgrep -wf ${SHARD_VCF_TMP}/${PREFIX}.shard_${i}.VIDs.list || true; } >> \ - 
${OUTDIR}/${PREFIX}.shard_${i}.vcf - #Sanity check shard - if [ $( { fgrep -v "#" ${OUTDIR}/${PREFIX}.shard_${i}.vcf || true; } | wc -l ) -gt 0 ]; then - #Bgzip & tabix shard - bgzip -f ${OUTDIR}/${PREFIX}.shard_${i}.vcf - tabix -f ${OUTDIR}/${PREFIX}.shard_${i}.vcf.gz - else - rm ${OUTDIR}/${PREFIX}.shard_${i}.vcf - fi - #Clean up used VID list - rm ${SHARD_VCF_TMP}/used_VIDs.tmp -done -#Write list of all VIDs used in cluster shards -zcat ${OUTDIR}/${PREFIX}.shard_*.vcf.gz \ - | cut -f1-3 | fgrep -v "#" | cut -f3 \ - > ${SHARD_VCF_TMP}/used_VIDs.cluster_shards.list -#Write list of eligible VIDs -fgrep -v "#" ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed \ - | cut -f4 \ - | { fgrep -wvf ${SHARD_VCF_TMP}/used_VIDs.cluster_shards.list || true; } \ - > ${SHARD_VCF_TMP}/remaining_VIDs.list -#Shard remaining records into no more than $NONCLUSTER_SHARDS shards -#If total number of records/NONCLUSTER_SHARDS < MIN_LINES_PER_SHARD, evenly split into MIN_LINES_PER_SHARD sites per shard -if [ $(( $( cat ${SHARD_VCF_TMP}/remaining_VIDs.list | wc -l ) / ${NONCLUSTER_SHARDS} )) -lt ${MIN_LINES_PER_SHARD} ]; then - ${BIN}/evenSplitter.R \ - -L ${MIN_LINES_PER_SHARD} \ - ${SHARD_VCF_TMP}/remaining_VIDs.list \ - ${SHARD_VCF_TMP}/${PREFIX}.remainder_VIDs_ -#Otherwise, split into MAX_SHARDS evenly-sized shards -else - ${BIN}/evenSplitter.R \ - -S ${NONCLUSTER_SHARDS} \ - ${SHARD_VCF_TMP}/remaining_VIDs.list \ - ${SHARD_VCF_TMP}/${PREFIX}.remainder_VIDs_ -fi -#Iterate over all non-cluster shards and generate VCF shards -n_noncluster_shards=$( find ${SHARD_VCF_TMP} -name "${PREFIX}.remainder_VIDs_*" | wc -l ) -for i in $( seq 1 ${n_noncluster_shards} ); do - idx=$(( ${n_shards} + ${i} )) - #Print header - zcat ${VCF} | head -n1000 | fgrep "#" > \ - ${OUTDIR}/${PREFIX}.shard_${idx}.vcf - #Shard based on VIDs (slower than tabix, but avoids omitting variants) - zcat ${VCF} | { fgrep -wf ${SHARD_VCF_TMP}/${PREFIX}.remainder_VIDs_${i} || true; } >> \ - ${OUTDIR}/${PREFIX}.shard_${idx}.vcf - 
#Sanity check shard - if [ $( { fgrep -v "#" ${OUTDIR}/${PREFIX}.shard_${idx}.vcf || true; } | wc -l ) -gt 0 ]; then - #Bgzip & tabix shard - bgzip -f ${OUTDIR}/${PREFIX}.shard_${idx}.vcf - tabix -f ${OUTDIR}/${PREFIX}.shard_${idx}.vcf.gz - else - rm ${OUTDIR}/${PREFIX}.shard_${idx}.vcf - fi -done - - -###SANITY CHECK SHARDS -while read shard; do - zcat ${shard} | cut -f1 | { fgrep -v "#" || true; } | wc -l -done < <( find ${OUTDIR} -name "${PREFIX}.shard_*.vcf.gz" ) \ - | sort -nrk1,1 \ - > ${SHARD_VCF_TMP}/vars_per_shard.txt -echo -e "FINISHED SHARDING VCF. RESULTING RECORDS PER SHARD FOR LARGEST 100 SHARDS:" -head -n100 ${SHARD_VCF_TMP}/vars_per_shard.txt | paste -s -d',' -#If shard with most variants is >10-fold more than next-largest shard, exit with code 1 -if [ $( find ${OUTDIR} -name "${PREFIX}.shard_*.vcf.gz" | wc -l ) -gt 1 ]; then - first=$( sed -n '1p' ${SHARD_VCF_TMP}/vars_per_shard.txt ) - second=$( sed -n '2p' ${SHARD_VCF_TMP}/vars_per_shard.txt ) - if [ ! -z ${second} ] && [ ${second} -gt 0 ]; then - if [ $(( ${first} / ${second} )) -ge 10 ]; then - echo -e "CRITICAL WARNING: LARGEST SHARD IS AT LEAST 10 TIMES LARGER THAN SECOND-LARGEST SHARD" - exit 1 - fi - fi -fi - - -###CLEAN UP -rm -rf ${SHARD_VCF_TMP} diff --git a/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preResolveCPX_invOnly.sh b/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preResolveCPX_invOnly.sh deleted file mode 100755 index e8d6f18bd..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preResolveCPX_invOnly.sh +++ /dev/null @@ -1,369 +0,0 @@ -#!/bin/bash - -# Intelligently shards a VCF prior to complex resolution (for parallelization) - -# Version modified for inv-only sharding (RO 10%, no restriction on breakpoint distance) - -set -euo pipefail - -###USAGE -usage(){ -cat < ${SHARD_VCF_TMP}/single_sample_input.vcf.gz -#Identify all candidate complex variant clusters (generous 50Mb, 10% RO clustering) -svtk vcfcluster \ - -d 50000000 \ - -f 0.10 
\ - -p candidate_complex_clusters \ - --ignore-svtypes \ - -o 0 \ - --preserve-ids \ - <( echo "${SHARD_VCF_TMP}/single_sample_input.vcf.gz" ) \ - ${SHARD_VCF_TMP}/input_vcf.clustered.vcf -#Convert clustered variants to bed -svtk vcf2bed \ - --no-samples \ - --info ALL \ - ${SHARD_VCF_TMP}/input_vcf.clustered.vcf \ - ${SHARD_VCF_TMP}/input_vcf.clustered.bed -#Write list of clusters with >1 constituent variant -mem_idx=$( head -n1 ${SHARD_VCF_TMP}/input_vcf.clustered.bed | \ - sed 's/\t/\n/g' | awk '{ if ($1=="MEMBERS") print NR }' ) -awk -v idx=${mem_idx} -v OFS="\t" \ -'$idx ~ /,/ { print $1, $2, $3, $idx }' \ -${SHARD_VCF_TMP}/input_vcf.clustered.bed | fgrep -v "#" > \ -${SHARD_VCF_TMP}/candidate_complex_clusters.bed -#Add all non-CNV single-record variants -class_idx=$( head -n1 ${SHARD_VCF_TMP}/input_vcf.clustered.bed | \ - sed 's/\t/\n/g' | awk '{ if ($1=="SVTYPE") print NR }' ) -awk -v idx=${class_idx} -v mem_idx=${mem_idx} -v OFS="\t" \ -'$mem_idx !~ /,/ { print $1, $2, $3, $idx, $mem_idx }' \ -${SHARD_VCF_TMP}/input_vcf.clustered.bed | \ -awk -v OFS="\t" '$4 !~ /DEL|DUP|CNV|MCNV|mCNV/ { print $1, $2, $3, $5 }' | fgrep -v "#" >> \ -${SHARD_VCF_TMP}/candidate_complex_clusters.bed -#Get min/max coordinates of all variants in list of VIDs -cat <( zcat ${VCF} | fgrep "#" | cut -f1-10 ) \ -<( cut -f4 ${SHARD_VCF_TMP}/candidate_complex_clusters.bed | \ - sed 's/\,/\n/g' | sort -Vk1,1 | uniq | { fgrep -wf - <( zcat ${VCF} ) || true; } | \ - cut -f1-10 ) | \ -svtk vcf2bed --no-samples /dev/stdin /dev/stdout > \ -${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed - - -#Split into breakpoints and pad all breakpoints by ±0kb -#DEV NOTE: padding breakpoints for large chromosomes & many samples was causing -# issues where tens of thousands of breakpoints would end up in the same shard -# and take >36h to resolve, defeating the purpose of sharding -fgrep -v "#" ${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed | \ -awk -v OFS="\t" -v 
buffer=0 \ -'{ print $1, $2-buffer, $2+buffer, $4"\n"$1, $3-buffer, $3+buffer, $4 }' | \ -awk -v OFS="\t" '{ if ($2<1) $2=1; print $1, $2, $3, $4 }' | \ -sort -Vk1,1 -k2,2n -k3,3n | bedtools merge -i - -c 4 -o distinct > \ -${SHARD_VCF_TMP}/breakpoint_intervals.bed -#Iterate over breakpoint intervals and write list of maximum nonredundant intervals -in_cluster=`mktemp` -remaining=`mktemp` -cp ${SHARD_VCF_TMP}/breakpoint_intervals.bed ${remaining} -while read chr start end VIDs; do - #Get all lines associated with current VIDs - echo -e "${VIDs}" | sed 's/,/\n/g' | { fgrep -wf - ${remaining} || true; } \ - > ${in_cluster} - #Only run if at least one line added to ${in_cluster} - if [ $( cat ${in_cluster} | wc -l ) -gt 0 ]; then - #Exclude all lines in ${in_cluster} from ${remaining} - bedtools intersect -v -a ${remaining} -b ${in_cluster} > ${remaining}2 - mv ${remaining}2 ${remaining} - #Iterate until no more related VIDs are present in ${remaining} - until [ $( cut -f4 ${in_cluster} | sed 's/\,/\n/g' | { fgrep -wf - ${remaining} || true; } | wc -l ) -eq 0 ]; do - #Add new lines to ${in_cluster} - cut -f4 ${in_cluster} | sed 's/\,/\n/g' | { fgrep -wf - ${remaining} || true; } >> ${in_cluster} - #Exclude all lines in ${in_cluster} from ${remaining} - bedtools intersect -v -a ${remaining} -b ${in_cluster} > ${remaining}2 - mv ${remaining}2 ${remaining} - done - #Write out final interval - for wrapper in 1; do - #Print list of coordinates - cut -f1-3 ${in_cluster} | sort -Vk1,1 -k2,2 -k3,3 | bedtools merge -i - | \ - awk '{ print $1":"$2"-"$3 }' | paste -s -d\; - #Print list of involved VIDs - cut -f4 ${in_cluster} | sed 's/,/\n/g' | sort | uniq | paste -s -d, - done | paste -s - fi -done < ${SHARD_VCF_TMP}/breakpoint_intervals.bed > \ -${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt -#Pull out exceptionally large clusters to the side to be placed in their own shards -while read ints VIDs; do - if [ $( echo ${VIDs} | sed 's/,/\n/g' | wc -l ) -ge 
${MIN_LINES_PER_SHARD} ]; then - echo -e "${ints}\t${VIDs}" - fi -done < ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - > ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt -cut -f2 ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt \ -| sed 's/,/\n/g' \ -| { fgrep -wvf - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt || true; } \ -> ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt2 -mv ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt2 \ -${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt - - -###DETERMINE COORDINATES FOR EACH SHARD -#Split variants into shards based on number of variants -#If total number of intervals/MAX_SHARDS < MIN_LINES_PER_SHARD, evenly split into MIN_LINES_PER_SHARD sites per shard -if [ $(( $( cat ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt | wc -l ) / ${MAX_SHARDS} )) -lt ${MIN_LINES_PER_SHARD} ]; then - ${BIN}/evenSplitter.R \ - -L ${MIN_LINES_PER_SHARD} \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_ -#Otherwise, split into MAX_SHARDS evenly-sized shards -else - ${BIN}/evenSplitter.R \ - -S ${MAX_SHARDS} \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_ -fi -#Writes exceptionally large clusters to their own shards -n_shards=$( find ${SHARD_VCF_TMP} -name "${PREFIX}*" | wc -l ) -if [ $( cat ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt | wc -l ) -gt 0 ]; then - while read ints VIDs; do - n_shards=$(( ${n_shards} + 1 )) - echo -e "${ints}\t${VIDs}" > ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${n_shards} - done < ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt -fi -#Reformat interval shards -for i in $( seq 1 ${n_shards} ); do - cut -f1 ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i} | \ - sed -e 's/\;/\n/g' -e 's/\:/\t/g' -e 's/\-/\t/g' | \ - sort -Vk1,1 -k2,2n -k3,3n | bedtools merge -i - > \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i}.bed - rm 
${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i} -done - - -###SHARD VCF -#Convert full, original VCF to BED -svtk vcf2bed --no-samples \ - ${VCF} int.bed -#Harrison's patch for sharding -awk '{if ($1!~"#") print $1,$2,$2+1,$4,$5 \ - "\n" $1,$3-1,$3,$4,$5;else print}' OFS='\t' int.bed \ - > ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed -rm int.bed -#Iterate over all sharded intervals -for i in $( seq 1 $(( ${n_shards} )) ); do - #Write exclusion list of VIDs already used in earlier shards - touch ${SHARD_VCF_TMP}/used_VIDs.tmp - if [ ${i} -gt 1 ]; then - for j in $( seq 1 $(( ${i} - 1 )) ); do - cat ${SHARD_VCF_TMP}/${PREFIX}.shard_${j}.VIDs.list - done | sort | uniq > ${SHARD_VCF_TMP}/used_VIDs.tmp - fi - #Get list of IDs to be used in shard - bedtools intersect \ - -a ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed \ - -b ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i}.bed | \ - cut -f4 | { fgrep -wvf ${SHARD_VCF_TMP}/used_VIDs.tmp || true; } > \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_${i}.VIDs.list - #Print header - zcat ${VCF} | head -n1000 | fgrep "#" > \ - ${OUTDIR}/${PREFIX}.shard_${i}.vcf - #Shard based on VIDs (slower than tabix, but avoids omitting variants) - zcat ${VCF} | { fgrep -wf ${SHARD_VCF_TMP}/${PREFIX}.shard_${i}.VIDs.list || true; } >> \ - ${OUTDIR}/${PREFIX}.shard_${i}.vcf - #Sanity check shard - if [ $( fgrep -v "#" ${OUTDIR}/${PREFIX}.shard_${i}.vcf | wc -l ) -gt 0 ]; then - #Bgzip & tabix shard - bgzip -f ${OUTDIR}/${PREFIX}.shard_${i}.vcf - tabix -f ${OUTDIR}/${PREFIX}.shard_${i}.vcf.gz - else - rm ${OUTDIR}/${PREFIX}.shard_${i}.vcf - fi - #Clean up used VID list - rm ${SHARD_VCF_TMP}/used_VIDs.tmp -done -#Write list of all VIDs used in cluster shards -zcat ${OUTDIR}/${PREFIX}.shard_*.vcf.gz \ - | cut -f1-3 | fgrep -v "#" | cut -f3 \ - > ${SHARD_VCF_TMP}/used_VIDs.cluster_shards.list -#Write list of eligible VIDs -fgrep -v "#" ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed \ - | cut -f4 \ - | { fgrep -wvf ${SHARD_VCF_TMP}/used_VIDs.cluster_shards.list || true; } \ - > 
${SHARD_VCF_TMP}/remaining_VIDs.list -#Shard remaining records into no more than $NONCLUSTER_SHARDS shards -#If total number of records/NONCLUSTER_SHARDS < MIN_LINES_PER_SHARD, evenly split into MIN_LINES_PER_SHARD sites per shard -if [ $(( $( cat ${SHARD_VCF_TMP}/remaining_VIDs.list | wc -l ) / ${NONCLUSTER_SHARDS} )) -lt ${MIN_LINES_PER_SHARD} ]; then - ${BIN}/evenSplitter.R \ - -L ${MIN_LINES_PER_SHARD} \ - ${SHARD_VCF_TMP}/remaining_VIDs.list \ - ${SHARD_VCF_TMP}/${PREFIX}.remainder_VIDs_ -#Otherwise, split into MAX_SHARDS evenly-sized shards -else - ${BIN}/evenSplitter.R \ - -S ${NONCLUSTER_SHARDS} \ - ${SHARD_VCF_TMP}/remaining_VIDs.list \ - ${SHARD_VCF_TMP}/${PREFIX}.remainder_VIDs_ -fi -#Iterate over all non-cluster shards and generate VCF shards -n_noncluster_shards=$( find ${SHARD_VCF_TMP} -name "${PREFIX}.remainder_VIDs_*" | wc -l ) -for i in $( seq 1 ${n_noncluster_shards} ); do - idx=$(( ${n_shards} + ${i} )) - #Print header - zcat ${VCF} | head -n1000 | fgrep "#" > \ - ${OUTDIR}/${PREFIX}.shard_${idx}.vcf - #Shard based on VIDs (slower than tabix, but avoids omitting variants) - zcat ${VCF} | { fgrep -wf ${SHARD_VCF_TMP}/${PREFIX}.remainder_VIDs_${i} || true; } >> \ - ${OUTDIR}/${PREFIX}.shard_${idx}.vcf - #Sanity check shard - if [ $( { fgrep -v "#" ${OUTDIR}/${PREFIX}.shard_${idx}.vcf || true; } | wc -l ) -gt 0 ]; then - #Bgzip & tabix shard - bgzip -f ${OUTDIR}/${PREFIX}.shard_${idx}.vcf - tabix -f ${OUTDIR}/${PREFIX}.shard_${idx}.vcf.gz - else - rm ${OUTDIR}/${PREFIX}.shard_${idx}.vcf - fi -done - - -###SANITY CHECK SHARDS -while read shard; do - zcat ${shard} | cut -f1 | { fgrep -v "#" || true; } | wc -l -done < <( find ${OUTDIR} -name "${PREFIX}.shard_*.vcf.gz" ) \ - | sort -nrk1,1 \ - > ${SHARD_VCF_TMP}/vars_per_shard.txt -echo -e "FINISHED SHARDING VCF. 
RESULTING RECORDS PER SHARD FOR LARGEST 100 SHARDS:" -head -n100 ${SHARD_VCF_TMP}/vars_per_shard.txt | paste -s -d',' -#If shard with most variants is >10-fold more than next-largest shard, exit with code 1 -if [ $( find ${OUTDIR} -name "${PREFIX}.shard_*.vcf.gz" | wc -l ) -gt 1 ]; then - first=$( sed -n '1p' ${SHARD_VCF_TMP}/vars_per_shard.txt ) - second=$( sed -n '2p' ${SHARD_VCF_TMP}/vars_per_shard.txt ) - if [ ! -z ${second} ] && [ ${second} -gt 0 ]; then - if [ $(( ${first} / ${second} )) -ge 10 ]; then - echo -e "CRITICAL WARNING: LARGEST SHARD IS AT LEAST 10 TIMES LARGER THAN SECOND-LARGEST SHARD" - exit 1 - fi - fi -fi - - -###CLEAN UP -rm -rf ${SHARD_VCF_TMP} diff --git a/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preResolveCPX_invOnly_part1.sh b/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preResolveCPX_invOnly_part1.sh deleted file mode 100755 index 6551fc60f..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preResolveCPX_invOnly_part1.sh +++ /dev/null @@ -1,337 +0,0 @@ -#!/bin/bash - -# Intelligently shards a VCF prior to complex resolution (for parallelization) - -# Version modified for inv-only sharding (RO 10%, no restriction on breakpoint distance) - -# Subsetted to first half, to just output lists of which variants should go in each shard - -set -euo pipefail - -###USAGE -usage(){ -cat < ${SHARD_VCF_TMP}/single_sample_input.vcf.gz -#Identify all candidate complex variant clusters (generous 50Mb, 10% RO clustering) -svtk vcfcluster \ - -d 50000000 \ - -f 0.10 \ - -p candidate_complex_clusters \ - --ignore-svtypes \ - -o 0 \ - --preserve-ids \ - <( echo "${SHARD_VCF_TMP}/single_sample_input.vcf.gz" ) \ - ${SHARD_VCF_TMP}/input_vcf.clustered.vcf -#Convert clustered variants to bed -svtk vcf2bed \ - --no-samples \ - --info ALL \ - ${SHARD_VCF_TMP}/input_vcf.clustered.vcf \ - ${SHARD_VCF_TMP}/input_vcf.clustered.bed -#Write list of clusters with >1 constituent variant -mem_idx=$( head -n1 
${SHARD_VCF_TMP}/input_vcf.clustered.bed | \ - sed 's/\t/\n/g' | awk '{ if ($1=="MEMBERS") print NR }' ) -awk -v idx=${mem_idx} -v OFS="\t" \ -'$idx ~ /,/ { print $1, $2, $3, $idx }' \ -${SHARD_VCF_TMP}/input_vcf.clustered.bed | (fgrep -v "#" || printf "") > \ -${SHARD_VCF_TMP}/candidate_complex_clusters.bed -#Add all non-CNV single-record variants -class_idx=$( head -n1 ${SHARD_VCF_TMP}/input_vcf.clustered.bed | \ - sed 's/\t/\n/g' | awk '{ if ($1=="SVTYPE") print NR }' ) -awk -v idx=${class_idx} -v mem_idx=${mem_idx} -v OFS="\t" \ -'$mem_idx !~ /,/ { print $1, $2, $3, $idx, $mem_idx }' \ -${SHARD_VCF_TMP}/input_vcf.clustered.bed | \ -awk -v OFS="\t" '$4 !~ /DEL|DUP|CNV|MCNV|mCNV/ { print $1, $2, $3, $5 }' | (fgrep -v "#" || printf "") >> \ -${SHARD_VCF_TMP}/candidate_complex_clusters.bed -#Get min/max coordinates of all variants in list of VIDs -cat <( zcat ${VCF} | fgrep "#" | cut -f1-10 ) \ -<( cut -f4 ${SHARD_VCF_TMP}/candidate_complex_clusters.bed | \ - sed 's/\,/\n/g' | sort -Vk1,1 | uniq | fgrep -wf - \ - <( zcat ${VCF} ) | cut -f1-10 ) | \ -svtk vcf2bed --no-samples /dev/stdin /dev/stdout > \ -${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed - -if [ $( cat ${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed | (fgrep -v "#" || printf "") | wc -l ) -gt 0 ]; then - #Split into breakpoints and pad all breakpoints by ±1bp - #DEV NOTE: padding breakpoints for large chromosomes & many samples was causing - # issues where tens of thousands of breakpoints would end up in the same shard - # and take >36h to resolve, defeating the purpose of sharding - (fgrep -v "#" ${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed || printf "") | \ - awk -v OFS="\t" -v buffer=1 \ - '{ print $1, $2-buffer, $2+buffer, $4"\n"$1, $3-buffer, $3+buffer, $4 }' | \ - awk -v OFS="\t" '{ if ($2<0) $2=0; print $1, $2, $3, $4 }' | \ - sort -Vk1,1 -k2,2n -k3,3n | bedtools merge -i - -c 4 -o distinct > \ - 
${SHARD_VCF_TMP}/breakpoint_intervals.bed - #Iterate over breakpoint intervals and write list of maximum nonredundant intervals - in_cluster=`mktemp` - remaining=`mktemp` - cp ${SHARD_VCF_TMP}/breakpoint_intervals.bed ${remaining} - while read chr start end VIDs; do - #Get all lines associated with current VIDs - echo -e "${VIDs}" | sed 's/,/\n/g' | (fgrep -wf - ${remaining} || printf "") \ - > ${in_cluster} || true - #Only run if at least one line added to ${in_cluster} - if [ $( cat ${in_cluster} | wc -l ) -gt 0 ]; then - #Exclude all lines in ${in_cluster} from ${remaining} - bedtools intersect -v -a ${remaining} -b ${in_cluster} > ${remaining}2 - mv ${remaining}2 ${remaining} - #Iterate until no more related VIDs are present in ${remaining} - until [ $( cut -f4 ${in_cluster} | sed 's/\,/\n/g' | (fgrep -wf - ${remaining} || printf "") | wc -l || true ) -eq 0 ]; do - #Add new lines to ${in_cluster} - cut -f4 ${in_cluster} | sed 's/\,/\n/g' | (fgrep -wf - ${remaining} || printf "") >> ${in_cluster} || true - #Exclude all lines in ${in_cluster} from ${remaining} - bedtools intersect -v -a ${remaining} -b ${in_cluster} > ${remaining}2 - mv ${remaining}2 ${remaining} - done - #Write out final interval - for wrapper in 1; do - #Print list of coordinates - cut -f1-3 ${in_cluster} | sort -Vk1,1 -k2,2 -k3,3 | bedtools merge -i - | \ - awk '{ print $1":"$2"-"$3 }' | paste -s -d\; - #Print list of involved VIDs - cut -f4 ${in_cluster} | sed 's/,/\n/g' | sort | uniq | paste -s -d, - done | paste -s - fi - done < ${SHARD_VCF_TMP}/breakpoint_intervals.bed > \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt - #Pull out exceptionally large clusters to the side to be placed in their own shards - while read ints VIDs; do - if [ $( echo ${VIDs} | sed 's/,/\n/g' | wc -l ) -ge ${MIN_LINES_PER_SHARD} ]; then - echo -e "${ints}\t${VIDs}" - fi - done < ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - > ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt - if [ $( cat 
${SHARD_VCF_TMP}/large_intervals_to_test.final.txt | wc -l ) -gt 0 ]; then - cut -f2 ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt \ - | sed 's/,/\n/g' \ - | { fgrep -wvf - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt || true; } \ - > ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt2 - mv ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt2 \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt - fi - - - ###DETERMINE COORDINATES FOR EACH SHARD - #Split variants into shards based on number of variants - #If total number of intervals/MAX_SHARDS < MIN_LINES_PER_SHARD, evenly split into MIN_LINES_PER_SHARD sites per shard - if [ $(( $( cat ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt | wc -l ) / ${MAX_SHARDS} )) -lt ${MIN_LINES_PER_SHARD} ]; then - ${BIN}/evenSplitter.R \ - -L ${MIN_LINES_PER_SHARD} \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_ - #Otherwise, split into MAX_SHARDS evenly-sized shards - else - ${BIN}/evenSplitter.R \ - -S ${MAX_SHARDS} \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_ - fi - #Writes exceptionally large clusters to their own shards - n_shards=$( find ${SHARD_VCF_TMP} -name "${PREFIX}*" | wc -l ) - if [ $( cat ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt | wc -l ) -gt 0 ]; then - while read ints VIDs; do - n_shards=$(( ${n_shards} + 1 )) - echo -e "${ints}\t${VIDs}" > ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${n_shards} - done < ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt - fi - #Reformat interval shards - for i in $( seq 1 ${n_shards} ); do - cut -f1 ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i} | \ - sed -e 's/\;/\n/g' -e 's/\:/\t/g' -e 's/\-/\t/g' | \ - sort -Vk1,1 -k2,2n -k3,3n | bedtools merge -i - > \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i}.bed - rm ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i} - done - - - ###SHARD VCF - #Convert full, original VCF to BED - svtk 
vcf2bed --no-samples \ - ${VCF} int.bed - #Harrison's patch for sharding - awk '{if ($1!~"#") print $1,$2,$2+1,$4,$5 \ - "\n" $1,$3-1,$3,$4,$5;else print}' OFS='\t' int.bed \ - > ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed - rm int.bed - #Iterate over all sharded intervals - for i in $( seq 1 $(( ${n_shards} )) ); do - #Write exclusion list of VIDs already used in earlier shards - touch ${SHARD_VCF_TMP}/used_VIDs.tmp - if [ ${i} -gt 1 ]; then - for j in $( seq 1 $(( ${i} - 1 )) ); do - cat ${OUTDIR}/${PREFIX}.shard_${j}.VIDs.list - done | sort | uniq > ${SHARD_VCF_TMP}/used_VIDs.tmp - fi - #Get list of IDs to be used in shard - bedtools intersect \ - -a ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed \ - -b ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i}.bed | \ - cut -f4 | (fgrep -wvf ${SHARD_VCF_TMP}/used_VIDs.tmp || printf "") \ - | sort | uniq > \ - ${OUTDIR}/${PREFIX}.shard_${i}.VIDs.list || true - #Clean up used VID list - rm ${SHARD_VCF_TMP}/used_VIDs.tmp - done - #Write list of all VIDs used in cluster shards - cat ${OUTDIR}/${PREFIX}.shard_*.VIDs.list \ - | sort | uniq \ - > ${SHARD_VCF_TMP}/used_VIDs.cluster_shards.list || true - #Write list of eligible VIDs - (fgrep -v "#" ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed || printf "") \ - | cut -f4 \ - | (fgrep -wvf ${SHARD_VCF_TMP}/used_VIDs.cluster_shards.list || printf "") \ - | sort | uniq \ - > ${SHARD_VCF_TMP}/remaining_VIDs.list || true -else - n_shards=0 - zcat ${VCF} | cut -f1-3 | (fgrep -v "#" || printf "") | cut -f3 | sort | uniq \ - > ${SHARD_VCF_TMP}/remaining_VIDs.list -fi - - -if [ $( cat ${SHARD_VCF_TMP}/remaining_VIDs.list | wc -l ) -gt 0 ]; then - #Shard remaining records into no more than $NONCLUSTER_SHARDS shards - #If total number of records/NONCLUSTER_SHARDS < MIN_LINES_PER_SHARD, evenly split into MIN_LINES_PER_SHARD sites per shard - if [ $(( $( cat ${SHARD_VCF_TMP}/remaining_VIDs.list | wc -l ) / ${NONCLUSTER_SHARDS} )) -lt ${MIN_LINES_PER_SHARD} ]; then - ${BIN}/evenSplitter.R \ - -L 
${MIN_LINES_PER_SHARD} \ - ${SHARD_VCF_TMP}/remaining_VIDs.list \ - ${SHARD_VCF_TMP}/${PREFIX}.remainder_VIDs_ - #Otherwise, split into MAX_SHARDS evenly-sized shards - else - ${BIN}/evenSplitter.R \ - -S ${NONCLUSTER_SHARDS} \ - ${SHARD_VCF_TMP}/remaining_VIDs.list \ - ${SHARD_VCF_TMP}/${PREFIX}.remainder_VIDs_ - fi - #Iterate over all non-cluster shards and generate VCF shards - n_noncluster_shards=$( find ${SHARD_VCF_TMP} -name "${PREFIX}.remainder_VIDs_*" | wc -l ) - for i in $( seq 1 ${n_noncluster_shards} ); do - idx=$(( ${n_shards} + ${i} )) - mv ${SHARD_VCF_TMP}/${PREFIX}.remainder_VIDs_${i} \ - ${OUTDIR}/${PREFIX}.shard_${idx}.VIDs.list - done -fi - - -###CLEAN UP -rm -rf ${SHARD_VCF_TMP} diff --git a/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preResolveCPX_part1.sh b/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preResolveCPX_part1.sh deleted file mode 100755 index d3422745f..000000000 --- a/src/sv-pipeline/04_variant_resolution/scripts/shardVCF_preResolveCPX_part1.sh +++ /dev/null @@ -1,335 +0,0 @@ -#!/bin/bash - -# Intelligently shards a VCF prior to complex resolution (for parallelization) - -# Subsetted to first half, to just output lists of which variants should go in each shard - -set -euo pipefail - -###USAGE -usage(){ -cat < ${SHARD_VCF_TMP}/single_sample_input.vcf.gz -#Identify all candidate complex variant clusters (generous 1kb clustering) -svtk vcfcluster \ - -d 1000 \ - -f 0 \ - -p candidate_complex_clusters \ - --ignore-svtypes \ - -o 0 \ - --preserve-ids \ - <( echo "${SHARD_VCF_TMP}/single_sample_input.vcf.gz" ) \ - ${SHARD_VCF_TMP}/input_vcf.clustered.vcf -#Convert clustered variants to bed -svtk vcf2bed \ - --no-samples \ - --info ALL \ - ${SHARD_VCF_TMP}/input_vcf.clustered.vcf \ - ${SHARD_VCF_TMP}/input_vcf.clustered.bed -#Write list of clusters with >1 constituent variant -mem_idx=$( head -n1 ${SHARD_VCF_TMP}/input_vcf.clustered.bed | \ - sed 's/\t/\n/g' | awk '{ if ($1=="MEMBERS") print NR }' ) -awk -v 
idx=${mem_idx} -v OFS="\t" \ -'$idx ~ /,/ { print $1, $2, $3, $idx }' \ -${SHARD_VCF_TMP}/input_vcf.clustered.bed | (fgrep -v "#" || printf "") > \ -${SHARD_VCF_TMP}/candidate_complex_clusters.bed -#Add all non-CNV single-record variants -class_idx=$( head -n1 ${SHARD_VCF_TMP}/input_vcf.clustered.bed | \ - sed 's/\t/\n/g' | awk '{ if ($1=="SVTYPE") print NR }' ) -awk -v idx=${class_idx} -v mem_idx=${mem_idx} -v OFS="\t" \ -'$mem_idx !~ /,/ { print $1, $2, $3, $idx, $mem_idx }' \ -${SHARD_VCF_TMP}/input_vcf.clustered.bed | \ -awk -v OFS="\t" '$4 !~ /DEL|DUP|CNV|MCNV|mCNV/ { print $1, $2, $3, $5 }' | (fgrep -v "#" || printf "") >> \ -${SHARD_VCF_TMP}/candidate_complex_clusters.bed -#Get min/max coordinates of all variants in list of VIDs -cat <( zcat ${VCF} | fgrep "#" | cut -f1-10 ) \ -<( cut -f4 ${SHARD_VCF_TMP}/candidate_complex_clusters.bed | \ - sed 's/\,/\n/g' | sort -Vk1,1 | uniq | fgrep -wf - \ - <( zcat ${VCF} ) | cut -f1-10 ) | \ -svtk vcf2bed --no-samples /dev/stdin /dev/stdout > \ -${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed - - -###ONLY RUN IF ANY CANDIDATE COMPLEX CLUSTERS ARE IDENTIFIED -if [ $( cat ${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed | (fgrep -v "#" || printf "") | wc -l ) -gt 0 ]; then - #Split into breakpoints and pad all breakpoints by ±1bp - #DEV NOTE: padding breakpoints for large chromosomes & many samples was causing - # issues where tens of thousands of breakpoints would end up in the same shard - # and take >36h to resolve, defeating the purpose of sharding - (fgrep -v "#" ${SHARD_VCF_TMP}/candidate_complex_clusters.variant_coordinates.bed || printf "") | \ - awk -v OFS="\t" -v buffer=1 \ - '{ print $1, $2-buffer, $2+buffer, $4"\n"$1, $3-buffer, $3+buffer, $4 }' | \ - awk -v OFS="\t" '{ if ($2<0) $2=0; print $1, $2, $3, $4 }' | \ - sort -Vk1,1 -k2,2n -k3,3n | bedtools merge -i - -c 4 -o distinct > \ - ${SHARD_VCF_TMP}/breakpoint_intervals.bed - #Iterate over breakpoint intervals and 
write list of maximum nonredundant intervals - in_cluster=`mktemp` - remaining=`mktemp` - cp ${SHARD_VCF_TMP}/breakpoint_intervals.bed ${remaining} - while read chr start end VIDs; do - #Get all lines associated with current VIDs - echo -e "${VIDs}" | sed 's/,/\n/g' | (fgrep -wf - ${remaining} || printf "") \ - > ${in_cluster} || true - #Only run if at least one line added to ${in_cluster} - if [ $( cat ${in_cluster} | wc -l ) -gt 0 ]; then - #Exclude all lines in ${in_cluster} from ${remaining} - bedtools intersect -v -a ${remaining} -b ${in_cluster} > ${remaining}2 - mv ${remaining}2 ${remaining} - #Iterate until no more related VIDs are present in ${remaining} - until [ $( cut -f4 ${in_cluster} | sed 's/\,/\n/g' | (fgrep -wf - ${remaining} || printf "") | wc -l || true ) -eq 0 ]; do - #Add new lines to ${in_cluster} - cut -f4 ${in_cluster} | sed 's/\,/\n/g' | (fgrep -wf - ${remaining} || printf "") >> ${in_cluster} || true - #Exclude all lines in ${in_cluster} from ${remaining} - bedtools intersect -v -a ${remaining} -b ${in_cluster} > ${remaining}2 - mv ${remaining}2 ${remaining} - done - #Write out final interval - for wrapper in 1; do - #Print list of coordinates - cut -f1-3 ${in_cluster} | sort -Vk1,1 -k2,2 -k3,3 | bedtools merge -i - | \ - awk '{ print $1":"$2"-"$3 }' | paste -s -d\; - #Print list of involved VIDs - cut -f4 ${in_cluster} | sed 's/,/\n/g' | sort | uniq | paste -s -d, - done | paste -s - fi - done < ${SHARD_VCF_TMP}/breakpoint_intervals.bed > \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt - #Pull out exceptionally large clusters to the side to be placed in their own shards - while read ints VIDs; do - if [ $( echo ${VIDs} | sed 's/,/\n/g' | wc -l ) -ge ${MIN_LINES_PER_SHARD} ]; then - echo -e "${ints}\t${VIDs}" - fi - done < ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - > ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt - if [ $( cat ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt | wc -l ) -gt 0 ]; then - cut -f2 
${SHARD_VCF_TMP}/large_intervals_to_test.final.txt \ - | sed 's/,/\n/g' \ - | { fgrep -wvf - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt || true; } \ - > ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt2 - mv ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt2 \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt - fi - - - ###DETERMINE COORDINATES FOR EACH SHARD - #Split variants into shards based on number of variants - #If total number of intervals/MAX_SHARDS < MIN_LINES_PER_SHARD, evenly split into MIN_LINES_PER_SHARD sites per shard - if [ $(( $( cat ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt | wc -l ) / ${MAX_SHARDS} )) -lt ${MIN_LINES_PER_SHARD} ]; then - ${BIN}/evenSplitter.R \ - -L ${MIN_LINES_PER_SHARD} \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_ - #Otherwise, split into MAX_SHARDS evenly-sized shards - else - ${BIN}/evenSplitter.R \ - -S ${MAX_SHARDS} \ - ${SHARD_VCF_TMP}/complex_intervals_to_test.final.txt \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_ - fi - #Writes exceptionally large clusters to their own shards - n_shards=$( find ${SHARD_VCF_TMP} -name "${PREFIX}*" | wc -l ) - if [ $( cat ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt | wc -l ) -gt 0 ]; then - while read ints VIDs; do - n_shards=$(( ${n_shards} + 1 )) - echo -e "${ints}\t${VIDs}" > ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${n_shards} - done < ${SHARD_VCF_TMP}/large_intervals_to_test.final.txt - fi - #Reformat interval shards - for i in $( seq 1 ${n_shards} ); do - cut -f1 ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i} | \ - sed -e 's/\;/\n/g' -e 's/\:/\t/g' -e 's/\-/\t/g' | \ - sort -Vk1,1 -k2,2n -k3,3n | bedtools merge -i - > \ - ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i}.bed - rm ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i} - done - - - ###SHARD VCF - #Convert full, original VCF to BED - svtk vcf2bed --no-samples \ - ${VCF} int.bed - #Harrison's patch for sharding - awk '{if 
($1!~"#") print $1,$2,$2+1,$4,$5 \ - "\n" $1,$3-1,$3,$4,$5;else print}' OFS='\t' int.bed \ - > ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed - rm int.bed - #Iterate over all sharded intervals - for i in $( seq 1 $(( ${n_shards} )) ); do - #Write exclusion list of VIDs already used in earlier shards - touch ${SHARD_VCF_TMP}/used_VIDs.tmp - if [ ${i} -gt 1 ]; then - for j in $( seq 1 $(( ${i} - 1 )) ); do - cat ${OUTDIR}/${PREFIX}.shard_${j}.VIDs.list - done | sort | uniq > ${SHARD_VCF_TMP}/used_VIDs.tmp - fi - #Get list of IDs to be used in shard - bedtools intersect \ - -a ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed \ - -b ${SHARD_VCF_TMP}/${PREFIX}.shard_intervals_${i}.bed | \ - cut -f4 | (fgrep -wvf ${SHARD_VCF_TMP}/used_VIDs.tmp || printf "") \ - | sort | uniq > \ - ${OUTDIR}/${PREFIX}.shard_${i}.VIDs.list - #Clean up used VID list - rm ${SHARD_VCF_TMP}/used_VIDs.tmp - done - #Write list of all VIDs used in cluster shards - cat ${OUTDIR}/${PREFIX}.shard_*.VIDs.list \ - | sort | uniq \ - > ${SHARD_VCF_TMP}/used_VIDs.cluster_shards.list || true - #Write list of eligible VIDs - (fgrep -v "#" ${SHARD_VCF_TMP}/input_vcf.vcf2bed.bed || printf "") \ - | cut -f4 \ - | (fgrep -wvf ${SHARD_VCF_TMP}/used_VIDs.cluster_shards.list || printf "") \ - | sort | uniq \ - > ${SHARD_VCF_TMP}/remaining_VIDs.list || true -else - n_shards=0 - zcat ${VCF} | cut -f1-3 | (fgrep -v "#" || printf "") | cut -f3 | sort | uniq \ - > ${SHARD_VCF_TMP}/remaining_VIDs.list -fi - - -#Shard remaining records into no more than $NONCLUSTER_SHARDS shards -#If total number of records/NONCLUSTER_SHARDS < MIN_LINES_PER_SHARD, evenly split into MIN_LINES_PER_SHARD sites per shard -if [ $(( $( cat ${SHARD_VCF_TMP}/remaining_VIDs.list | wc -l ) / ${NONCLUSTER_SHARDS} )) -lt ${MIN_LINES_PER_SHARD} ]; then - ${BIN}/evenSplitter.R \ - -L ${MIN_LINES_PER_SHARD} \ - ${SHARD_VCF_TMP}/remaining_VIDs.list \ - ${SHARD_VCF_TMP}/${PREFIX}.remainder_VIDs_ -#Otherwise, split into MAX_SHARDS evenly-sized shards -else - 
${BIN}/evenSplitter.R \ - -S ${NONCLUSTER_SHARDS} \ - ${SHARD_VCF_TMP}/remaining_VIDs.list \ - ${SHARD_VCF_TMP}/${PREFIX}.remainder_VIDs_ -fi -#Iterate over all non-cluster shards and generate VCF shards -n_noncluster_shards=$( find ${SHARD_VCF_TMP} -name "${PREFIX}.remainder_VIDs_*" | wc -l ) -for i in $( seq 1 ${n_noncluster_shards} ); do - idx=$(( ${n_shards} + ${i} )) - mv ${SHARD_VCF_TMP}/${PREFIX}.remainder_VIDs_${i} \ - ${OUTDIR}/${PREFIX}.shard_${idx}.VIDs.list -done - - -###CLEAN UP -rm -rf ${SHARD_VCF_TMP} diff --git a/src/sv-pipeline/05_annotation/scripts/compute_AFs.py b/src/sv-pipeline/05_annotation/scripts/compute_AFs.py index 93a4196c9..31b5a0c0a 100755 --- a/src/sv-pipeline/05_annotation/scripts/compute_AFs.py +++ b/src/sv-pipeline/05_annotation/scripts/compute_AFs.py @@ -82,7 +82,7 @@ def update_sex_freqs(record, pop=None): return record -def gather_allele_freqs(record, all_samples, males, females, parbt, pop_dict, pops, +def gather_allele_freqs(record, samples, males_set, females_set, parbt, pop_dict, pops, sex_chroms, no_combos=False): """ Wrapper to compute allele frequencies for all sex & population pairings @@ -99,40 +99,40 @@ def gather_allele_freqs(record, all_samples, males, females, parbt, pop_dict, po rec_in_par = False # Get allele frequencies for all populations - calc_allele_freq(record, all_samples) - if len(males) > 0: + calc_allele_freq(record, samples) + if len(males_set) > 0: if record.chrom in sex_chroms and not rec_in_par: - calc_allele_freq(record, males, prefix='MALE', hemi=True) + calc_allele_freq(record, males_set, prefix='MALE', hemi=True) else: - calc_allele_freq(record, males, prefix='MALE') - if len(females) > 0: - calc_allele_freq(record, females, prefix='FEMALE') + calc_allele_freq(record, males_set, prefix='MALE') + if len(females_set) > 0: + calc_allele_freq(record, females_set, prefix='FEMALE') # Adjust global allele frequencies on sex chromosomes, if famfile provided if record.chrom in sex_chroms and not rec_in_par 
\ - and svu.is_biallelic(record) and len(males) + len(females) > 0: + and svu.is_biallelic(record) and len(males_set) + len(females_set) > 0: update_sex_freqs(record) # Get allele frequencies per population if len(pops) > 0: for pop in pops: pop_samps = [ - s for s in all_samples if pop_dict.get(s, None) == pop] + s for s in samples if pop_dict.get(s, None) == pop] calc_allele_freq(record, pop_samps, prefix=pop) - if len(males) > 0 and not no_combos: + if len(males_set) > 0 and not no_combos: if record.chrom in sex_chroms and not rec_in_par: - calc_allele_freq(record, [s for s in pop_samps if s in males], + calc_allele_freq(record, list([s for s in pop_samps if s in males_set]), prefix=pop + '_MALE', hemi=True) else: - calc_allele_freq(record, [s for s in pop_samps if s in males], + calc_allele_freq(record, list([s for s in pop_samps if s in males_set]), prefix=pop + '_MALE') - if len(females) > 0 and not no_combos: - calc_allele_freq(record, [s for s in pop_samps if s in females], + if len(females_set) > 0 and not no_combos: + calc_allele_freq(record, list([s for s in pop_samps if s in females_set]), prefix=pop + '_FEMALE') # Adjust per-pop allele frequencies on sex chromosomes, if famfile provided if record.chrom in sex_chroms and not rec_in_par \ - and svu.is_biallelic(record) and len(males) + len(females) > 0: + and svu.is_biallelic(record) and len(males_set) + len(females_set) > 0: update_sex_freqs(record, pop=pop) # Get POPMAX AF biallelic sites only @@ -154,7 +154,7 @@ def calc_allele_freq(record, samples, prefix=None, hemi=False): if svu.is_biallelic(record): # Get all sample GTs - GTs = [s['GT'] for s in record.samples.values() if s.name in samples] + GTs = [record.samples[s]['GT'] for s in samples] # Count alleles & genotypes AC = 0 @@ -237,8 +237,7 @@ def calc_allele_freq(record, samples, prefix=None, hemi=False): else: # Get all sample CNs and remove Nones - CNs_wNones = [s['CN'] - for s in record.samples.values() if s.name in samples] + CNs_wNones = 
[record.samples[s]['CN'] for s in samples] CNs = [c for c in CNs_wNones if c is not None and c not in '. NA'.split()] if len(CNs) == 0: @@ -306,24 +305,26 @@ def main(): vcf = pysam.VariantFile(args.vcf) # Get list of all samples in vcf - all_samples = list(vcf.header.samples) + samples_list = list(vcf.header.samples) # Get lists of males and females parbt = pbt.BedTool('', from_string=True) if args.famfile is not None: famfile = [line.rstrip('\n') for line in open(args.famfile)] - males = [line.split('\t')[1] - for line in famfile if line.split('\t')[4] == '1'] - females = [line.split('\t')[1] - for line in famfile if line.split('\t')[4] == '2'] + males_set = set([line.split('\t')[1] + for line in famfile if line.split('\t')[4] == '1']) + males_set = set(s for s in samples_list if s in males_set) + females_set = set([line.split('\t')[1] + for line in famfile if line.split('\t')[4] == '2']) + females_set = set(s for s in samples_list if s in females_set) sexes = 'MALE FEMALE'.split() if args.par is not None: parbt = pbt.BedTool(args.par) else: - males = [] - females = [] - sexes = [] + males_set = set() + females_set = set() + sexes = list() # Get dictionary of populations if args.popfile is not None: @@ -335,6 +336,7 @@ def main(): pop_dict = {} pops = [] + # Get list of sex chromosomes, if optioned if args.allosomes_list is not None: sex_chroms = [l.split('\t')[0] @@ -491,7 +493,7 @@ def main(): # Get allele frequencies for each record & write to new VCF for r in vcf.fetch(): - newrec = gather_allele_freqs(r, all_samples, males, females, parbt, pop_dict, + newrec = gather_allele_freqs(r, samples_list, males_set, females_set, parbt, pop_dict, pops, sex_chroms, args.no_combos) fout.write(newrec) diff --git a/src/sv-pipeline/java/StitchFragmentedCNVs.java b/src/sv-pipeline/java/StitchFragmentedCNVs.java deleted file mode 100644 index e7e9e519e..000000000 --- a/src/sv-pipeline/java/StitchFragmentedCNVs.java +++ /dev/null @@ -1,332 +0,0 @@ -import java.io.*; -import 
java.util.*; - -/** Read a VCF, and try to stitch together adjacent copy-number variations. - * Eligible Records (which we call "stitchable") must meet certain criteria as specified by the - * isStitchable method of the StitchableIterator. - * If two stitchables overlap appropriately, and all their samples have identical genotypes, we can - * replace the first one by adding on the interval covered by the second one. - */ -public class StitchFragmentedCNVs { - private static final VCFParser.ByteSequence END = new VCFParser.ByteSequence("END"); - private static final VCFParser.ByteSequence SVLEN = new VCFParser.ByteSequence("SVLEN"); - - // These 3 values will always be overwritten, but are initialized to reasonable defaults as documentation - private static double PAD_FACTOR = .2; - private static int MAX_PAD = 200000; - private static double MAX_OVERLAP_FACTOR = .2; - - public static void main( final String[] args ) { - if ( args.length != 4 ) { - System.err.println("Usage: java StitchFragmentedCNVs PAD% MAXPAD OVRLAP% VCFFILE"); - System.err.println("E.g.: java StitchFragmentedCNVs .2 200000 .2 input.vcf.gz"); - System.err.println("Combines neighboring CNVs with matching genotypes into a larger event."); - System.err.println("Writes an uncompressed vcf to stdout."); - System.exit(1); - } - - initCommandLineArgs(args); - - try ( final OutputStream os - = new BufferedOutputStream(new FileOutputStream(FileDescriptor.out)) ) { - try ( final VCFParser vcfParser = new VCFParser(args[3]) ) { - while ( vcfParser.hasMetadata() ) { - vcfParser.nextMetaData().write(os); - } - final StitchableIterator sItr = new StitchableIterator(vcfParser); - VCFParser.Record stitchableRecord; - while ( (stitchableRecord = sItr.nextSubject(os)) != null ) { - findExtension(stitchableRecord, sItr); - stitchableRecord.write(os); - } - } - } catch ( final IOException ioe ) { - throw new VCFParser.MalformedVCFException("can't write to stdout", ioe); - } - } - - /** Look for a stitchable 
downstream of the subject that can be joined to it to - * make a larger event. */ - private static void findExtension( final VCFParser.Record stitchable, - final StitchableIterator sItr ) throws IOException { - final PaddedInterval originalPaddedInterval = new PaddedInterval(stitchable); - PaddedInterval paddedInterval = originalPaddedInterval; - - // sItr.hasNext returns false at EOF, or when the next record is too far away to - // overlap the subject - while ( sItr.hasNext() ) { - final VCFParser.Record record = sItr.next(); - final PaddedInterval paddedInterval2 = new PaddedInterval(record); - if ( paddedInterval.canCoalesceWith(paddedInterval2) && - genotypesMatch(stitchable, record) ) { - paddedInterval = paddedInterval2; - sItr.remove(); - } - } - - if ( paddedInterval != originalPaddedInterval ) { - final int endPos = paddedInterval.getVCFEnd(); - // this won't be null -- it was checked in isStitchable - final VCFParser.ByteSequence endField = stitchable.getInfoField(END); - stitchable.setInfoField(endField, new VCFParser.ByteSequence(Integer.toString(endPos))); - final VCFParser.ByteSequence svLenField = stitchable.getInfoField(SVLEN); - if ( svLenField == null ) { - throw new VCFParser.MalformedVCFException(stitchable.getID().toString() + " has no SVLEN field"); - } - final int svLength = endPos + 1 - stitchable.getPosition(); - final VCFParser.ByteSequence svLenValue = - new VCFParser.ByteSequence(Integer.toString(svLength)); - stitchable.setInfoField(svLenField, svLenValue); - } - } - - private static boolean genotypesMatch( final VCFParser.Record rec1, final VCFParser.Record rec2 ) { - final List gt1 = rec1.getGenotypes(); - final List gt2 = rec2.getGenotypes(); - final int nGTs = gt1.size(); - if ( gt2.size() != nGTs ) { - throw new IllegalStateException("two records have a different number of genotypes"); - } - for ( int idx = 0; idx != nGTs; ++idx ) { - final VCFParser.ByteIterator itr1 = gt1.get(idx).iterator(); - final VCFParser.ByteIterator itr2 = 
gt2.get(idx).iterator(); - byte b1; - do { - b1 = itr1.hasNext() ? itr1.next() : -1; - final byte b2 = itr2.hasNext() ? itr2.next() : -1; - if ( b1 != b2 ) return false; - } while ( b1 != ':' ); - } - return true; - } - - private static void initCommandLineArgs( final String[] args ) { - try { - PAD_FACTOR = Double.parseDouble(args[0]); - } catch ( final NumberFormatException nfe ) { - System.err.println("Can't interpret 1st argument (padding fraction) as a floating point number."); - System.exit(2); - } - if ( PAD_FACTOR < 0.0 ) { - System.err.println("First argument should be a padding fraction >= 0."); - System.exit(2); - } - try { - MAX_PAD = Integer.parseInt(args[1]); - } catch ( final NumberFormatException nfe ) { - System.err.println("Can't interpret 2nd argument (maximum padding in bases) as an integer."); - System.exit(2); - } - if ( MAX_PAD < 0 ) { - System.err.println("Second argument must be a maximum padding in bases >= 0."); - System.exit(2); - } - try { - MAX_OVERLAP_FACTOR = Double.parseDouble(args[0]); - } catch ( final NumberFormatException nfe ) { - System.err.println("Can't interpret 3rd argument (maximum overlap fraction) as a floating point number."); - System.exit(2); - } - if ( MAX_OVERLAP_FACTOR < 0.0 || MAX_OVERLAP_FACTOR > 1.0 ) { - System.err.println("Third argument should be a maximum overlap fraction between 0 and 1."); - System.exit(2); - } - } - - /** A little helper class to do padding and overlap calculations - * Note: this class uses half-open intervals, unlike a vcf */ - private final static class PaddedInterval { - private final int start; - private final int end; - private final int padding; - private final int maxOverlap; - - public PaddedInterval( final VCFParser.Record record ) { - this.start = record.getPosition(); - // getInfoField can't return null -- it's been checked in isStitchable - // + 1 because vcf has closed intervals, we use half-open - this.end = record.getInfoField(END).asInt() + 1; - final int length = end - 
start; - this.padding = Math.min(MAX_PAD, (int)(length * PAD_FACTOR)); - this.maxOverlap = (int)(length * MAX_OVERLAP_FACTOR); - } - - public boolean canCoalesceWith( final PaddedInterval downstreamInterval ) { - // Check that the padded intervals overlap. - // Only have to check one end, because we know the downstream interval starts as late - // or later than this one. - if ( end + padding <= downstreamInterval.start - downstreamInterval.padding ) { - return false; - } - // but the unpadded intervals mustn't overlap too much - final int overlap = Math.min(end, downstreamInterval.end) - downstreamInterval.start; - return overlap < maxOverlap && overlap < downstreamInterval.maxOverlap; - } - - public int getVCFEnd() { return end - 1; } - } - - /** As we go through the VCF we create a new Chunk whenever we encounter a stitchable record. - * So, a Chunk consists of a mess of non-stitchables, and a trailing stitchable. */ - private final static class Chunk { - private final List nonStitchables; - private final VCFParser.Record stitchable; - - public Chunk( final List nonStitchables, - final VCFParser.Record stitchable ) { - this.nonStitchables = nonStitchables; - this.stitchable = stitchable; - } - - public List getNonStitchables() { return nonStitchables; } - public VCFParser.Record getStitchable() { return stitchable; } - } - - /** Maintains a list of chunks so that the client just sees the stitchables, while making sure - * that the record ordering of the input file is maintained. - * This is a kind of double iterator: At the outer level, calling nextSubject repeatedly - * until it returns null lets you simply iterate over each stitchable record in the input file. - * Each time you do so, the inner iterator (hasNext/next) gets reset to iterate over the - * stitchable records downstream of the subject. 
The inner iterator is smart enough to quit - * (i.e., hasNext will return false) when we've read so far ahead that we can't possibly find - * a stitchable that can be joined to the subject. - */ - private final static class StitchableIterator implements Iterator { - private final VCFParser vcfParser; - private final List chunks; - private int subjectIndex; - private VCFParser.ByteSequence subjectChromosome; - private int subjectMinNoOverlapPosition; // far enough downstream that MAX_PAD will ensure there's no overlap - private int iterationIndex; - private VCFParser.Record nextRecord; // this is a pushback for a record that's too far downstream - - private static final VCFParser.ByteSequence MULTIALLELIC = new VCFParser.ByteSequence("MULTIALLELIC"); - private static final VCFParser.ByteSequence SVTYPE = new VCFParser.ByteSequence("SVTYPE"); - private static final VCFParser.ByteSequence SVTYPE_DEL = new VCFParser.ByteSequence("DEL"); - private static final VCFParser.ByteSequence SVTYPE_DUP = new VCFParser.ByteSequence("DUP"); - private static final VCFParser.ByteSequence EVIDENCE = new VCFParser.ByteSequence("EVIDENCE"); - private static final String EVIDENCE_RD = "RD"; - private static final String EVIDENCE_SR = "SR"; - private static final String EVIDENCE_PE = "PE"; - private static final String EVIDENCE_BAF = "BAF"; - - public StitchableIterator( final VCFParser vcfParser ) { - this.vcfParser = vcfParser; - this.chunks = new ArrayList<>(); - } - - /** write the non-stitchables that precede the first stitchable, - * and return the next stitchable */ - public VCFParser.Record nextSubject( final OutputStream os ) throws IOException { - final int nChunks = chunks.size(); - while ( subjectIndex < nChunks ) { - final Chunk chunk = chunks.get(subjectIndex); - // clean out the chunks as we use them: in coordinate-dense vcfs the chunks array - // can get quite large, and, especially in vcfs with lots of sample, a chunk can - // occupy quite a large amount of memory. 
we want to release the chunks for garbage - // collection ASAP to control memory use. - chunks.set(subjectIndex, null); - iterationIndex = ++subjectIndex; - for ( final VCFParser.Record rec : chunk.getNonStitchables() ) { - rec.write(os); - } - final VCFParser.Record stitchable = chunk.getStitchable(); - if ( stitchable != null ) { - return setSubject(stitchable); - } - } - - // there are no more chunks to serve as subjects, reset the queue - chunks.clear(); - subjectIndex = iterationIndex = 0; - - while ( nextRecord != null || vcfParser.hasRecord() ) { - final VCFParser.Record record = nextRecord != null ? nextRecord: vcfParser.nextRecord(); - nextRecord = null; - if ( isStitchable(record) ) { - return setSubject(record); - } - record.write(os); - } - return null; - } - - /** Is there another stitchable downstream of the subject that is within joining range? **/ - @Override public boolean hasNext() { - final int nChunks = chunks.size(); - while ( iterationIndex < nChunks ) { - final VCFParser.Record stitchable = chunks.get(iterationIndex).getStitchable(); - if ( stitchable != null ) { - return true; - } - ++iterationIndex; - } - if ( nextRecord != null || vcfParser.hasRecord() ) { - List nonStitchables = null; - do { - final VCFParser.Record record = - nextRecord != null ? 
nextRecord : vcfParser.nextRecord(); - nextRecord = null; - if ( !record.getChromosome().equals(subjectChromosome) || - record.getPosition() >= subjectMinNoOverlapPosition ) { - nextRecord = record; - if ( nonStitchables != null ) { - chunks.add(new Chunk(nonStitchables, null)); - } - return false; - } - if ( isStitchable(record) ) { - if ( nonStitchables == null ) { - nonStitchables = Collections.emptyList(); - } - chunks.add(new Chunk(nonStitchables, record)); - return true; - } - if ( nonStitchables == null ) { - nonStitchables = new ArrayList<>(); - } - nonStitchables.add(record); - } while ( vcfParser.hasRecord() ); - chunks.add(new Chunk(nonStitchables, null)); - } - return false; - } - - @Override public VCFParser.Record next() { - if ( !hasNext() ) { - throw new NoSuchElementException(); - } - return chunks.get(iterationIndex++).getStitchable(); - } - - @Override public void remove() { - final int idx = iterationIndex - 1; - chunks.set(idx, new Chunk(chunks.get(idx).getNonStitchables(), null)); - } - - private static boolean isStitchable( final VCFParser.Record record ) { - if ( MULTIALLELIC.equals(record.getFilter()) ) return false; - final Map infoMap = record.getInfoAsMap(); - final VCFParser.ByteSequence svType = infoMap.get(SVTYPE); - if ( !SVTYPE_DEL.equals(svType) && !SVTYPE_DUP.equals(svType) ) return false; - // you can't be a stitchable if you don't have an "END" info field. 
- // code elsewhere assumes it can grab this value without checking for its existence - final VCFParser.ByteSequence end = infoMap.get(END); - if ( end == null || end.asInt() == VCFParser.ByteSequence.MISSING_VALUE ) return false; - final VCFParser.ByteSequence evidence = infoMap.get(EVIDENCE); - if ( evidence == null ) return false; - final String evStr = evidence.toString(); - return !evStr.contains(EVIDENCE_PE) && !evStr.contains(EVIDENCE_SR) && - (evStr.contains(EVIDENCE_RD) || evStr.contains(EVIDENCE_BAF)); - } - - private VCFParser.Record setSubject( final VCFParser.Record record ) { - subjectChromosome = record.getChromosome(); - final int start = record.getPosition(); - final int end = record.getInfoField(END).asInt(); // can't be null -- checked in isStitchable - subjectMinNoOverlapPosition = - end + Math.min(MAX_PAD, (int)(PAD_FACTOR * (end - start))) + MAX_PAD; - return record; - } - } -} diff --git a/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1.java b/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1.java new file mode 100644 index 000000000..1633ed010 --- /dev/null +++ b/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1.java @@ -0,0 +1,320 @@ +package org.broadinstitute.svpipeline; + +import java.io.*; +import java.nio.charset.StandardCharsets; +import java.util.*; +import java.util.regex.Pattern; +import org.broadinstitute.svpipeline.VCFParser.*; + +public class CleanVCFPart1 { + private static final ByteSequence[] EV_VALS = { + null, + new ByteSequence("RD"), + new ByteSequence("PE"), + new ByteSequence("RD,PE"), + new ByteSequence("SR"), + new ByteSequence("RD,SR"), + new ByteSequence("PE,SR"), + new ByteSequence("RD,PE,SR") + }; + private static final ByteSequence FORMAT_LINE = new ByteSequence("FORMAT"); + private static final ByteSequence ID_KEY = new ByteSequence("ID"); + private static final ByteSequence EV_VALUE = new ByteSequence("EV"); + private static final ByteSequence TYPE_KEY = new 
ByteSequence("Type"); + private static final ByteSequence STRING_VALUE = new ByteSequence("String"); + private static final ByteSequence NUMBER_KEY = new ByteSequence("Number"); + private static final ByteSequence SVTYPE_KEY = new ByteSequence("SVTYPE"); + private static final ByteSequence ME_VALUE = new ByteSequence(":ME"); + private static final ByteSequence LT_VALUE = new ByteSequence("<"); + private static final ByteSequence GT_VALUE = new ByteSequence(">"); + private static final ByteSequence N_VALUE = new ByteSequence("N"); + private static final ByteSequence END_KEY = new ByteSequence("END"); + private static final ByteSequence VARGQ_KEY = new ByteSequence("varGQ"); + private static final ByteSequence MULTIALLELIC_KEY = new ByteSequence("MULTIALLELIC"); + private static final ByteSequence UNRESOLVED_KEY = new ByteSequence("UNRESOLVED"); + private static final ByteSequence HIGH_SR_BACKGROUND = new ByteSequence("HIGH_SR_BACKGROUND"); + private static final ByteSequence PASS_VALUE = new ByteSequence("PASS"); + private static final ByteSequence BOTHSIDES_VALUE = new ByteSequence("BOTHSIDES_SUPPORT"); + private static final ByteSequence DEL_VALUE = new ByteSequence("DEL"); + private static final ByteSequence DUP_VALUE = new ByteSequence("DUP"); + private static final ByteSequence RDCN_VALUE = new ByteSequence("RD_CN"); + private static final ByteSequence MISSING_VALUE = new ByteSequence("."); + private static final ByteSequence MISSING_GENOTYPE = new ByteSequence("./."); + private static final ByteSequence GT_REF_REF = new ByteSequence("0/0"); + private static final ByteSequence GT_REF_ALT = new ByteSequence("0/1"); + private static final ByteSequence GT_ALT_ALT = new ByteSequence("1/1"); + + private static final int MIN_ALLOSOME_EVENT_SIZE = 5000; + + public static void main( final String[] args ) { + if ( args.length != 8 ) { + System.err.println("Usage: java org.broadinstitute.svpipeline.CleanVCFPart1 " + + "INPUTVCFFILE PEDIGREES XCHR YCHR NOISYEVENTS 
BOTHSIDES SAMPLESOUT REVISEDEVENTSOUT"); + System.exit(1); + } + final VCFParser parser = new VCFParser(args[0]); + final ByteSequence xChrName = new ByteSequence(args[2]); + final ByteSequence yChrName = new ByteSequence(args[3]); + final Set noisyEvents = readLastColumn(args[4]); + final Set bothsidesSupportEvents = readLastColumn(args[5]); + try ( final OutputStream os + = new BufferedOutputStream(new FileOutputStream(FileDescriptor.out)); + final OutputStream osSamples = new BufferedOutputStream(new FileOutputStream(args[6])); + final OutputStream osRevEvents = new BufferedOutputStream(new FileOutputStream(args[7])) ) { + int[] sexForSample = null; + while ( parser.hasMetadata() ) { + final Metadata metadata = parser.nextMetaData(); + if ( metadata instanceof ColumnHeaderMetadata ) { + final ColumnHeaderMetadata cols = ((ColumnHeaderMetadata)metadata); + final List colNames = cols.getValue(); + final int nCols = colNames.size(); + for ( int idx = 9; idx < nCols; ++idx ) { + colNames.get(idx).write(osSamples); + osSamples.write('\n'); + } + sexForSample = readPedFile(args[1], cols.getValue()); + os.write(("##FILTER=\n") + .getBytes(StandardCharsets.UTF_8)); + os.write("##FILTER=\n" + .getBytes(StandardCharsets.UTF_8)); + os.write(("##FILTER=\n") + .getBytes(StandardCharsets.UTF_8)); + } else if ( metadata instanceof KeyAttributesMetadata ) { + final KeyAttributesMetadata keyAttrs = (KeyAttributesMetadata)metadata; + if ( keyAttrs.getKey().equals(FORMAT_LINE) ) { + final List kvs = keyAttrs.getValue(); + final int nKVs = kvs.size(); + if ( nKVs > 2 ) { + final KeyValue kv0 = kvs.get(0); + final KeyValue kv1 = kvs.get(1); + final KeyValue kv2 = kvs.get(2); + if ( kv0.getKey().equals(ID_KEY) && kv0.getValue().equals(EV_VALUE) ) { + if ( kv1.getKey().equals(NUMBER_KEY) ) { + kvs.set(1, new KeyValue(NUMBER_KEY, MISSING_VALUE)); + } + if ( kv2.getKey().equals(TYPE_KEY) ) { + kvs.set(2, new KeyValue(TYPE_KEY, STRING_VALUE)); + } + } + } + } + } + metadata.write(os); + 
} + if ( sexForSample == null ) { + throw new RuntimeException("header line with sample names is missing."); + } + while ( parser.hasRecord() ) { + final Record record = parser.nextRecord(); + + // replace the numeric EV value with a text value + final int evIdx = record.getFormat().indexOf(EV_VALUE); + if ( evIdx >= 0 ) { + for ( final CompoundField genotypeVals : record.getGenotypes() ) { + genotypeVals.set(evIdx, EV_VALS[genotypeVals.get(evIdx).asInt()]); + } + } + + // move the SVTYPE to the ALT field (except for MEs) + final InfoField info = record.getInfo(); + final ByteSequence svType = info.get(SVTYPE_KEY); + if ( !record.getAlt().contains(ME_VALUE) ) { + if ( svType != null ) { + record.setAlt(new ByteSequence(LT_VALUE, svType, GT_VALUE)); + } + } + record.setRef(N_VALUE); + + // move varGQ info field to quality column + final ByteSequence varGQ = info.get(VARGQ_KEY); + if ( varGQ != null ) { + record.setQuality(varGQ); + info.remove(VARGQ_KEY); + } + + // remove MULTIALLELIC flag, if present + info.remove(MULTIALLELIC_KEY); + + // remove UNRESOLVED flag and add it as a filter + if ( info.containsKey(UNRESOLVED_KEY) ) { + record.getFilter().add(UNRESOLVED_KEY); + info.remove(UNRESOLVED_KEY); + } + + // mark noisy events + if ( noisyEvents.contains(record.getID()) ) { + record.getFilter().add(HIGH_SR_BACKGROUND); + } + + // mark bothsides support + if ( bothsidesSupportEvents.contains(record.getID()) ) { + final CompoundField filters = record.getFilter(); + if ( filters.size() == 1 && filters.get(0).equals(PASS_VALUE) ) { + record.setFilter(BOTHSIDES_VALUE); + } else { + filters.add(BOTHSIDES_VALUE); + } + } + + // fix genotypes on allosomes + final boolean isY; + if ( (isY = yChrName.equals(record.getChromosome())) || + xChrName.equals(record.getChromosome())) { + final List genotypes = record.getGenotypes(); + final int rdCNIndex = record.getFormat().indexOf(RDCN_VALUE); + final ByteSequence end = info.get(END_KEY); + boolean adjustMale = false; + final 
boolean isDel; + if ( ((isDel = DEL_VALUE.equals(svType)) || DUP_VALUE.equals(svType)) && rdCNIndex >= 0 && end != null && + end.asInt() + 1 - record.getPosition() > MIN_ALLOSOME_EVENT_SIZE ) { + adjustMale = isRevisableEvent(genotypes, rdCNIndex, sexForSample, isY); + if ( adjustMale ) { + record.getID().write(osRevEvents); + osRevEvents.write('\n'); + } + } + CompoundField emptyGenotype = null; + final int nSamples = genotypes.size(); + for ( int sampleIdx = 0; sampleIdx < nSamples; ++sampleIdx ) { + final int sampleSex = sexForSample[sampleIdx]; + final CompoundField genotype = genotypes.get(sampleIdx); + if ( sampleSex == 1 ) { + if ( adjustMale ) { + final ByteSequence rdCN = genotype.get(rdCNIndex); + if ( rdCN.equals(MISSING_VALUE) ) { + continue; + } + final int rdCNVal = rdCN.asInt(); + genotype.set(rdCNIndex, new ByteSequence(Integer.toString(rdCNVal + 1))); + if ( isDel ) { + if ( rdCNVal >= 1 ) genotype.set(0, GT_REF_REF); + else if ( rdCNVal == 0 ) genotype.set(0, GT_REF_ALT); + } else { + if ( rdCNVal <= 1 ) genotype.set(0, GT_REF_REF); + else if ( rdCNVal == 2 ) genotype.set(0, GT_REF_ALT); + else genotype.set(0, GT_ALT_ALT); + } + } + } else if ( sampleSex == 2 ) { + if ( isY ) { + if ( emptyGenotype == null ) { + emptyGenotype = new CompoundField(MISSING_GENOTYPE, ':'); + int nFields = genotype.size(); + while ( --nFields > 0 ) { + emptyGenotype.add(MISSING_VALUE); + } + emptyGenotype.getValue(); // performance hack to put the pieces together + } + genotypes.set(sampleIdx, emptyGenotype); + } + } else { + genotype.set(0, MISSING_GENOTYPE); + } + } + } + + record.write(os); + } + } catch ( final IOException ioe ) { + throw new RuntimeException("Can't write to stdout", ioe); + } + } + + private static boolean isRevisableEvent( final List genotypes, + final int rdCNIndex, + final int[] sexForColumn, + final boolean isY ) { + // We're going to calculate the median rdCN values for males and females. 
+ // We only care if the median is 0, 1, 2, or something larger, so we'll use 4 bins to + // sum up the counts: all values >2 go into the last bucket. + final int[] maleCounts = new int[4]; + final int[] femaleCounts = new int[4]; + final int nSamples = genotypes.size(); + for ( int sampleIdx = 0; sampleIdx < nSamples; ++sampleIdx ) { + final ByteSequence rdCN = genotypes.get(sampleIdx).get(rdCNIndex); + if ( MISSING_VALUE.equals(rdCN) ) { + continue; + } + int rdCNVal = rdCN.asInt(); + if ( rdCNVal > 2 ) { + rdCNVal = 3; + } + final int sampleSex = sexForColumn[sampleIdx]; + if ( sampleSex == 1 ) { + maleCounts[rdCNVal] += 1; + } else if ( sampleSex == 2 ) { + femaleCounts[rdCNVal] += 1; + } + } + final double maleMedian = calcMedian(maleCounts); + double femaleMedian = calcMedian(femaleCounts); + return maleMedian == 1. && (isY ? femaleMedian == 0. : femaleMedian == 2.); + } + + // visible for testing + static double calcMedian( final int[] counts ) { + final double target = (counts[0] + counts[1] + counts[2] + counts[3]) / 2.; + if ( target == 0. 
) { + return Double.NaN; + } + int total = 0; + for ( int iii = 0; iii < 4; ++iii ) { + total += counts[iii]; + if ( total == target ) { + return iii + .5; + } else if ( total > target ) { + return (double)iii; + } + } + throw new IllegalStateException("we should never reach this statement"); + } + + private static Set readLastColumn( final String filename ) { + final Set values = new HashSet<>(); + try { + final BufferedReader neRdr = + new BufferedReader(new InputStreamReader(new FileInputStream(filename))); + String line; + while ( (line = neRdr.readLine()) != null ) { + final String lastCol = line.substring(line.lastIndexOf('\t') + 1); + values.add(new ByteSequence(lastCol)); + } + } catch ( final IOException ioe ) { + throw new RuntimeException("can't read table file " + filename, ioe); + } + return values; + } + + private static int[] readPedFile( final String pedFilename, List sampleNames ) { + final int nCols = sampleNames.size() - 9; + final Map sexForSampleMap = new HashMap<>(2*nCols); + final int[] sexForSample = new int[nCols]; + try { + final BufferedReader pedRdr = + new BufferedReader(new InputStreamReader(new FileInputStream(pedFilename))); + final Pattern tabPattern = Pattern.compile("\\t"); + String line; + while ( (line = pedRdr.readLine()) != null ) { + final Scanner scanner = new Scanner(line).useDelimiter(tabPattern); + scanner.next(); // family ignored + final String sampleName = scanner.next(); + scanner.next(); // dad ignored + scanner.next(); // mom ignored + final int sex = scanner.nextInt(); + sexForSampleMap.put(new ByteSequence(sampleName), sex); + } + } catch ( final IOException ioe ) { + throw new RuntimeException("can't read " + pedFilename, ioe); + } + for ( int col = 0; col < nCols; ++col ) { + final Integer sex = sexForSampleMap.get(sampleNames.get(col + 9)); + if ( sex == null ) { + throw new RuntimeException("can't determine sex for sample " + sampleNames.get(col + 9)); + } + sexForSample[col] = sex; + } + return sexForSample; + } +}
diff --git a/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1UnitTest.java b/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1UnitTest.java new file mode 100644 index 000000000..77a6b5658 --- /dev/null +++ b/src/sv-pipeline/java/org/broadinstitute/svpipeline/CleanVCFPart1UnitTest.java @@ -0,0 +1,40 @@ +package org.broadinstitute.svpipeline; + +public class CleanVCFPart1UnitTest { + public static void main( final String[] args ) { + testAsserts(); + testMedianCalculation(); + System.out.println("OK"); + } + + public static void testAsserts() { + boolean caughtIt = false; + try { + assert(false); + } catch ( final AssertionError ae ) { + caughtIt = true; + } + if ( !caughtIt ) { + throw new AssertionError("assertions aren't turned on, so you're not testing anything."); + } + } + + public static void testMedianCalculation() { + final int[] counts = new int[4]; + assert(Double.isNaN(CleanVCFPart1.calcMedian(counts))); + counts[0] = 1; + assert(CleanVCFPart1.calcMedian(counts) == 0.0); + counts[1] = 1; + assert(CleanVCFPart1.calcMedian(counts) == 0.5); + counts[2] = 1; + assert(CleanVCFPart1.calcMedian(counts) == 1.0); + counts[3] = 1; + assert(CleanVCFPart1.calcMedian(counts) == 1.5); + counts[2] = 2; + assert(CleanVCFPart1.calcMedian(counts) == 2.0); + counts[3] = 4; + assert(CleanVCFPart1.calcMedian(counts) == 2.5); + counts[3] = 5; + assert(CleanVCFPart1.calcMedian(counts) == 3.0); + } +} diff --git a/src/sv-pipeline/java/org/broadinstitute/svpipeline/StitchFragmentedCNVs.java b/src/sv-pipeline/java/org/broadinstitute/svpipeline/StitchFragmentedCNVs.java new file mode 100644 index 000000000..a41a508a5 --- /dev/null +++ b/src/sv-pipeline/java/org/broadinstitute/svpipeline/StitchFragmentedCNVs.java @@ -0,0 +1,269 @@ +package org.broadinstitute.svpipeline; + +import java.io.*; +import java.util.*; +import org.broadinstitute.svpipeline.VCFParser.*; + +/** Read a VCF, and try to stitch together adjacent copy-number variations. 
+ * Eligible Records (which we call "stitchable") must meet certain criteria as specified by the + * isStitchable method of the StitchableIterator. + * If two stitchables overlap appropriately, and all their samples have identical genotypes, we can + * replace the first one by adding on the interval covered by the second one. + */ +public class StitchFragmentedCNVs { + // These 3 values will always be overwritten, but are initialized to reasonable defaults as documentation + private static double PAD_FACTOR = .2; + static int MAX_PAD = 200000; // visible for testing + private static double MAX_OVERLAP_FACTOR = .2; + + // A new "end position" for the disposition map signalling that the record is to be removed because it was combined + // with another record. + private static final int ENDPOS_REMOVED_RECORD = -1; + + // relevant INFO field keys and values--these are visible for testing + static final ByteSequence END = new ByteSequence("END"); + static final ByteSequence SVLEN = new ByteSequence("SVLEN"); + static final ByteSequence SVTYPE = new ByteSequence("SVTYPE"); + static final ByteSequence SVTYPE_DEL = new ByteSequence("DEL"); + static final ByteSequence SVTYPE_DUP = new ByteSequence("DUP"); + static final ByteSequence MULTIALLELIC = new ByteSequence("MULTIALLELIC"); + static final ByteSequence EVIDENCE = new ByteSequence("EVIDENCE"); + static final ByteSequence EVIDENCE_RD = new ByteSequence("RD"); + static final ByteSequence EVIDENCE_SR = new ByteSequence("SR"); + static final ByteSequence EVIDENCE_PE = new ByteSequence("PE"); + static final ByteSequence EVIDENCE_BAF = new ByteSequence("BAF"); + + public static void main( final String[] args ) { + if ( args.length != 4 ) { + System.err.println("Usage: java StitchFragmentedCNVs PAD% MAXPAD OVRLAP% VCFFILE"); + System.err.println("E.g.: java StitchFragmentedCNVs .2 200000 .2 input.vcf.gz"); + System.err.println("Combines neighboring CNVs with matching genotypes into a larger event."); + 
System.err.println("Writes an uncompressed vcf to stdout."); + System.exit(1); + } + + initCommandLineArgs(args); + + // a map of IDs onto revised ENDs + final Map<ByteSequence, Integer> disposition = new HashMap<>(1000); + + // a push-back buffer of previous PaddedIntervals for stitchables that are still in range + final List<PaddedInterval> intervalList = new LinkedList<>(); + + try ( final VCFParser vcfParser = new VCFParser(args[3]) ) { + while ( vcfParser.hasMetadata() ) { + vcfParser.nextMetaData(); + } + ByteSequence currentChromosome = null; + while ( vcfParser.hasRecord() ) { + final Record record = vcfParser.nextRecord(); + if ( isStitchable(record) ) { + if ( !record.getChromosome().equals(currentChromosome) ) { + intervalList.clear(); + currentChromosome = record.getChromosome(); + } + final PaddedInterval currentInterval = new PaddedInterval(record); + final ListIterator<PaddedInterval> previousIntervals = intervalList.listIterator(); + boolean recordRemoved = false; + while ( previousIntervals.hasNext() ) { + final PaddedInterval previousInterval = previousIntervals.next(); + final PaddedInterval revisedInterval; + if ( previousInterval.doneStitching(currentInterval) ) { + previousIntervals.remove(); + } else if ( (revisedInterval = previousInterval.stitchTo(currentInterval)) != null ) { + previousIntervals.set(revisedInterval); + disposition.put(revisedInterval.getRecord().getID(), revisedInterval.getVCFEnd()); + disposition.put(record.getID(), ENDPOS_REMOVED_RECORD); + recordRemoved = true; + } + } + if ( !recordRemoved ) { + intervalList.add(currentInterval); + } + } + } + } + intervalList.clear(); + + try ( final OutputStream os = new BufferedOutputStream(new FileOutputStream(FileDescriptor.out)); + final VCFParser vcfParser = new VCFParser(args[3])) { + while ( vcfParser.hasMetadata() ) { + vcfParser.nextMetaData().write(os); + } + while ( vcfParser.hasRecord() ) { + final Record record = vcfParser.nextRecord(); + final Integer endPosObj = disposition.get(record.getID()); + if ( endPosObj == null ) { 
+ record.write(os); + } else { + final int endPos = endPosObj; + if ( endPos != ENDPOS_REMOVED_RECORD ) { + final InfoField infoField = record.getInfo(); + infoField.put(END, new ByteSequence(Integer.toString(endPos))); + final int svLength = endPos + 1 - record.getPosition(); + infoField.put(SVLEN, new ByteSequence(Integer.toString(svLength))); + record.write(os); + } + } + } + } catch ( final IOException ioe ) { + throw new MalformedVCFException("can't write revised vcf", ioe); + } + } + + private static void initCommandLineArgs( final String[] args ) { + try { + PAD_FACTOR = Double.parseDouble(args[0]); + } catch ( final NumberFormatException nfe ) { + System.err.println("Can't interpret 1st argument (padding fraction) as a floating point number."); + System.exit(2); + } + if ( PAD_FACTOR < 0.0 ) { + System.err.println("First argument should be a padding fraction >= 0."); + System.exit(2); + } + try { + MAX_PAD = Integer.parseInt(args[1]); + } catch ( final NumberFormatException nfe ) { + System.err.println("Can't interpret 2nd argument (maximum padding in bases) as an integer."); + System.exit(2); + } + if ( MAX_PAD < 0 ) { + System.err.println("Second argument must be a maximum padding in bases >= 0."); + System.exit(2); + } + try { + MAX_OVERLAP_FACTOR = Double.parseDouble(args[2]); + } catch ( final NumberFormatException nfe ) { + System.err.println("Can't interpret 3rd argument (maximum overlap fraction) as a floating point number."); + System.exit(2); + } + if ( MAX_OVERLAP_FACTOR < 0.0 || MAX_OVERLAP_FACTOR > 1.0 ) { + System.err.println("Third argument should be a maximum overlap fraction between 0 and 1."); + System.exit(2); + } + } + + // VisibleForTesting + static boolean isStitchable( final Record record ) { + final CompoundField filterField = record.getFilter(); + for ( final ByteSequence filter : filterField ) { + if ( MULTIALLELIC.equals(filter) ) { + return false; + } + } + + final InfoField infoField = record.getInfo(); + final ByteSequence 
svType = infoField.get(SVTYPE); + if ( !SVTYPE_DEL.equals(svType) && !SVTYPE_DUP.equals(svType) ) { + return false; + } + + // you can't be a stitchable if you don't have an "END" info field. + // code elsewhere assumes it can grab this value without checking for its existence + final ByteSequence endValue = infoField.get(END); + if ( endValue == null || endValue.asInt() == ByteSequence.MISSING_VALUE ) { + return false; + } + + final ByteSequence evidence = infoField.get(EVIDENCE); + if ( evidence == null ) { + return false; + } + return !evidence.contains(EVIDENCE_PE) && !evidence.contains(EVIDENCE_SR) && + (evidence.contains(EVIDENCE_RD) || evidence.contains(EVIDENCE_BAF)); + } + + /** A little helper class to do padding and overlap calculations + * Note: this class uses half-open intervals, unlike a vcf */ + final static class PaddedInterval { // visible for testing + private final Record record; + private final int start; + private final int end; + private final int padding; + private final int maxOverlap; + private final ByteSequence eventType; + + public PaddedInterval( final Record record ) { + this.record = record; + this.start = record.getPosition(); + this.end = record.getInfo().get(END).asInt() + 1; + final int length = end - start; + this.padding = Math.min(MAX_PAD, (int)(length * PAD_FACTOR)); + this.maxOverlap = (int)(length * MAX_OVERLAP_FACTOR); + this.eventType = record.getInfo().get(SVTYPE); + } + + private PaddedInterval( final PaddedInterval upstream, final PaddedInterval downstream ) { + this.record = upstream.record; + this.start = upstream.start; + this.end = downstream.end; + this.padding = Math.max(upstream.padding, downstream.padding); + this.maxOverlap = Math.max(upstream.maxOverlap, downstream.maxOverlap); + this.eventType = upstream.eventType; + } + + public int getPaddedStart() { return start - padding; } + public int getPaddedEnd() { return end + padding; } + public Record getRecord() { return record; } + + /** Returns true if we're 
done trying to stitch this interval. Criterion is that the + * padded end of this interval is more than MAX_PAD bases away from the start of the + * currentInterval. So this one is definitely disjoint (regardless of its length), and that + * will also be true of all subsequent intervals (since they're in sorted order by + * starting position). + */ + public boolean doneStitching( final PaddedInterval currentInterval ) { + return getPaddedEnd() < currentInterval.start - MAX_PAD; + } + + /** Returns an expanded interval if possible, otherwise null. */ + public PaddedInterval stitchTo( final PaddedInterval downstreamInterval ) { + if ( !eventType.equals(downstreamInterval.eventType) ) { + return null; + } + + // Check that the padded intervals overlap. + // Only have to check one end, because we know the downstream interval starts as late + // or later than this one. + if ( getPaddedEnd() <= downstreamInterval.getPaddedStart() ) { + return null; + } + + // But the unpadded intervals mustn't overlap too much. + // Note that the calculated overlap can be negative (they don't actually overlap), + // but that's OK. 
+ final int overlap = Math.min(end, downstreamInterval.end) - downstreamInterval.start; + if ( overlap > maxOverlap || overlap > downstreamInterval.maxOverlap ) { + return null; + } + + if ( !genotypesMatch(record.getGenotypes(), downstreamInterval.record.getGenotypes()) ) { + return null; + } + + return new PaddedInterval(this, downstreamInterval); + } + + public int getVCFEnd() { return end - 1; } + + private static boolean genotypesMatch( final List genotypes1, + final List genotypes2 ) { + final int nGTs = genotypes1.size(); + if ( genotypes2.size() != nGTs ) { + throw new IllegalStateException("records have a different number of genotypes"); + } + for ( int idx = 0; idx != nGTs; ++idx ) { + final ByteIterator itr1 = genotypes1.get(idx).getValue().iterator(); + final ByteIterator itr2 = genotypes2.get(idx).getValue().iterator(); + byte b1; + do { + b1 = itr1.hasNext() ? itr1.next() : (byte)':'; + final byte b2 = itr2.hasNext() ? itr2.next() : (byte)':'; + if ( b1 != b2 ) return false; + } while ( b1 != ':' ); + } + return true; + } + } +} diff --git a/src/sv-pipeline/java/org/broadinstitute/svpipeline/StitchFragmentedCNVsUnitTest.java b/src/sv-pipeline/java/org/broadinstitute/svpipeline/StitchFragmentedCNVsUnitTest.java new file mode 100644 index 000000000..1d9513fb6 --- /dev/null +++ b/src/sv-pipeline/java/org/broadinstitute/svpipeline/StitchFragmentedCNVsUnitTest.java @@ -0,0 +1,160 @@ +package org.broadinstitute.svpipeline; + +import java.io.ByteArrayInputStream; +import java.util.Arrays; + +import org.broadinstitute.svpipeline.StitchFragmentedCNVs.PaddedInterval; +import org.broadinstitute.svpipeline.VCFParser.*; + +public final class StitchFragmentedCNVsUnitTest { + public static void main( final String[] args ) { + testAsserts(); + testIsStitchable(); + testDoneStitching(); + testStitchTo(); + System.out.println("OK"); + } + + public static void testAsserts() { + boolean caughtIt = false; + try { + assert (false); + } catch ( final AssertionError ae ) { + 
caughtIt = true; + } + if ( !caughtIt ) { + throw new AssertionError("assertions aren't turned on (with -ea), so you're not testing anything."); + } + } + + public static void testIsStitchable() { + final String vcfLine = "chr1\t1000\tID1\tN\t\t60\tPASS\tEND=1999;SVTYPE=DEL;EVIDENCE=RD\tGT\t0/0\t0/1\n"; + final Record record = fromString(vcfLine); + assert(StitchFragmentedCNVs.isStitchable(record)); + + // not stitchable if there's a MULTIALLELIC filter component + final ByteSequence originalFilter = record.getFilter().getValue(); + record.setFilter(StitchFragmentedCNVs.MULTIALLELIC); + assert(!StitchFragmentedCNVs.isStitchable(record)); + final ByteSequence x = new ByteSequence("X"); + record.setFilter(Arrays.asList(x, StitchFragmentedCNVs.MULTIALLELIC)); + assert(!StitchFragmentedCNVs.isStitchable(record)); + final ByteSequence y = new ByteSequence("Y"); + record.setFilter(Arrays.asList(x, StitchFragmentedCNVs.MULTIALLELIC, y)); + assert(!StitchFragmentedCNVs.isStitchable(record)); + record.setFilter(originalFilter); + assert(StitchFragmentedCNVs.isStitchable(record)); + + // not stitchable if the the SVTYPE isn't DUP or DEL + final InfoField info = record.getInfo(); + final ByteSequence originalSVTYPE = info.get(StitchFragmentedCNVs.SVTYPE); + info.put(StitchFragmentedCNVs.SVTYPE, new ByteSequence("INS")); + assert(!StitchFragmentedCNVs.isStitchable(record)); + info.put(StitchFragmentedCNVs.SVTYPE, StitchFragmentedCNVs.SVTYPE_DEL); + assert(StitchFragmentedCNVs.isStitchable(record)); + info.put(StitchFragmentedCNVs.SVTYPE, StitchFragmentedCNVs.SVTYPE_DUP); + assert(StitchFragmentedCNVs.isStitchable(record)); + info.put(StitchFragmentedCNVs.SVTYPE, originalSVTYPE); + assert(StitchFragmentedCNVs.isStitchable(record)); + + // not stitchable if END is missing + final ByteSequence originalEnd = info.get(StitchFragmentedCNVs.END); + info.remove(StitchFragmentedCNVs.END); + assert(!StitchFragmentedCNVs.isStitchable(record)); + info.put(StitchFragmentedCNVs.END, 
originalEnd); + assert(StitchFragmentedCNVs.isStitchable(record)); + + // not stitchable if EVIDENCE includes PE or SR + final ByteSequence originalEVIDENCE = info.get(StitchFragmentedCNVs.EVIDENCE); + info.put(StitchFragmentedCNVs.EVIDENCE, StitchFragmentedCNVs.EVIDENCE_RD); + assert(StitchFragmentedCNVs.isStitchable(record)); + info.put(StitchFragmentedCNVs.EVIDENCE, StitchFragmentedCNVs.EVIDENCE_BAF); + assert(StitchFragmentedCNVs.isStitchable(record)); + info.put(StitchFragmentedCNVs.EVIDENCE, StitchFragmentedCNVs.EVIDENCE_SR); + assert(!StitchFragmentedCNVs.isStitchable(record)); + info.put(StitchFragmentedCNVs.EVIDENCE, StitchFragmentedCNVs.EVIDENCE_PE); + assert(!StitchFragmentedCNVs.isStitchable(record)); + final ByteSequence sep = new ByteSequence(","); + info.put(StitchFragmentedCNVs.EVIDENCE, + new ByteSequence(StitchFragmentedCNVs.EVIDENCE_BAF, sep, StitchFragmentedCNVs.EVIDENCE_SR)); + assert(!StitchFragmentedCNVs.isStitchable(record)); + info.put(StitchFragmentedCNVs.EVIDENCE, originalEVIDENCE); + assert(StitchFragmentedCNVs.isStitchable(record)); + } + + public static void testDoneStitching() { + final String vcfLine = "chr1\t1000\tID1\tN\t\t60\tPASS\tEND=1999;SVTYPE=DEL;EVIDENCE=RD\tGT\t0/0\t0/1\n"; + final Record upstreamRecord = fromString(vcfLine); + upstreamRecord.setPosition(1000); + upstreamRecord.getInfo().put(StitchFragmentedCNVs.END, new ByteSequence(Integer.toString(1999))); + final PaddedInterval upstreamInterval = new PaddedInterval(upstreamRecord); + + // we've got a record on chr1:1000-2000 with a 200-base pad. move MAX_PAD bases further downstream. 
+ final int startNotTooFar = upstreamInterval.getPaddedEnd() + StitchFragmentedCNVs.MAX_PAD; + final Record downstreamRecord = fromString(vcfLine); + downstreamRecord.setPosition(startNotTooFar); + downstreamRecord.getInfo().put(StitchFragmentedCNVs.END, new ByteSequence(Integer.toString(startNotTooFar + 1000))); + final PaddedInterval downstreamInterval = new PaddedInterval(downstreamRecord); + assert(!upstreamInterval.doneStitching(downstreamInterval)); + + // move one more base, and doneStitching should return true + downstreamRecord.setPosition(startNotTooFar + 1); + assert(upstreamInterval.doneStitching(new PaddedInterval(downstreamRecord))); + } + + public static void testStitchTo() { + final String vcfLine1 = "chr1\t1000\tID1\tN\t\t60\tPASS\tEND=1999;SVTYPE=DEL;EVIDENCE=RD\tGT\t0/0\t0/1\n"; + final Record upstreamRecord = fromString(vcfLine1); + assert(StitchFragmentedCNVs.isStitchable(upstreamRecord)); + final PaddedInterval upstreamInterval = new PaddedInterval(upstreamRecord); + final String vcfLine2 = "chr1\t2399\tID1\tN\t\t60\tPASS\tEND=3399;SVTYPE=DEL;EVIDENCE=RD\tGT\t0/0\t0/1\n"; + final Record downstreamRecord = fromString(vcfLine2); + assert(StitchFragmentedCNVs.isStitchable(downstreamRecord)); + final PaddedInterval stitched = upstreamInterval.stitchTo(new PaddedInterval(downstreamRecord)); + assert(stitched != null); + assert(stitched.getRecord().getPosition() == 1000); + assert(stitched.getVCFEnd() == 3399); + + // fails because no overlap (padded intervals are adjacent) + downstreamRecord.setPosition(2400); + assert(upstreamInterval.stitchTo(new PaddedInterval(downstreamRecord)) == null); + + // back to starting conditions + downstreamRecord.setPosition(2399); + assert(upstreamInterval.stitchTo(new PaddedInterval(downstreamRecord)) != null); + + // fails because event types don't match + downstreamRecord.getInfo().put(StitchFragmentedCNVs.SVTYPE, StitchFragmentedCNVs.SVTYPE_DUP); + assert(upstreamInterval.stitchTo(new 
PaddedInterval(downstreamRecord)) == null); + + // back to starting conditions + downstreamRecord.getInfo().put(StitchFragmentedCNVs.SVTYPE, StitchFragmentedCNVs.SVTYPE_DEL); + assert(upstreamInterval.stitchTo(new PaddedInterval(downstreamRecord)) != null); + + // overlaps upstream interval too much + downstreamRecord.setPosition(1799); + assert(upstreamInterval.stitchTo(new PaddedInterval(downstreamRecord)) == null); + + // back to starting conditions + downstreamRecord.setPosition(2399); + assert(upstreamInterval.stitchTo(new PaddedInterval(downstreamRecord)) != null); + + // overlaps downstream interval too much + downstreamRecord.setPosition(1899); + downstreamRecord.getInfo().put(StitchFragmentedCNVs.END, new ByteSequence(Integer.toString(2399))); + assert(upstreamInterval.stitchTo(new PaddedInterval(downstreamRecord)) == null); + + // back to starting conditions + downstreamRecord.setPosition(2399); + downstreamRecord.getInfo().put(StitchFragmentedCNVs.END, new ByteSequence(Integer.toString(3399))); + assert(upstreamInterval.stitchTo(new PaddedInterval(downstreamRecord)) != null); + + // genotypes don't match + downstreamRecord.getGenotypes().get(0).set(0, new ByteSequence("0/1")); + assert(upstreamInterval.stitchTo(new PaddedInterval(downstreamRecord)) == null); + } + + private static Record fromString( final String vcfLine ) { + return new VCFParser(new ByteArrayInputStream(vcfLine.getBytes())).nextRecord(); + } +} diff --git a/src/sv-pipeline/java/VCFParser.java b/src/sv-pipeline/java/org/broadinstitute/svpipeline/VCFParser.java similarity index 54% rename from src/sv-pipeline/java/VCFParser.java rename to src/sv-pipeline/java/org/broadinstitute/svpipeline/VCFParser.java index 6adb828e8..6c6fbbbf8 100644 --- a/src/sv-pipeline/java/VCFParser.java +++ b/src/sv-pipeline/java/org/broadinstitute/svpipeline/VCFParser.java @@ -1,3 +1,5 @@ +package org.broadinstitute.svpipeline; + import java.io.*; import java.util.*; import java.util.zip.GZIPInputStream; @@ 
-22,15 +24,13 @@ public class VCFParser implements Closeable { public VCFParser( final String pathName ) { if ( pathName == null || "-".equals(pathName) ) { this.pathName = "stdin"; - this.is = System.in instanceof BufferedInputStream ? - System.in : - new BufferedInputStream(System.in); + this.is = new BufferedInputStream(new FileInputStream(FileDescriptor.in), BUFFER_SIZE); } else { this.pathName = pathName; try { final BufferedInputStream bis = - new BufferedInputStream(new FileInputStream(pathName)); - this.is = pathName.endsWith(GZ) ? new GZIPInputStream(bis) : bis; + new BufferedInputStream(new FileInputStream(pathName), BUFFER_SIZE); + this.is = pathName.endsWith(GZ) ? new GZIPInputStream(bis, BUFFER_SIZE) : bis; } catch ( final IOException ioe ) { throw new MalformedVCFException("can't open " + pathName, ioe); } @@ -40,6 +40,14 @@ public VCFParser( final String pathName ) { } } + public VCFParser( final InputStream is ) { + this.pathName = "input VCF"; + this.is = is; + if ( !readBuffer() ) { + throw new MalformedVCFException("input VCF is empty"); + } + } + public void close() { try { is.close(); @@ -68,7 +76,7 @@ public Metadata nextMetaData() { // it's the only metadata line that doesn't start with "##" but goes: // #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT (sample names) if ( bufferIterator.peek() != '#' ) { - return new Columns(captureColumns()); + return new ColumnHeaderMetadata(captureColumns()); } bufferIterator.skip(); // get the key part of the metadata, e.g., INFO in ##INFO=, or contig in ##contig= @@ -80,11 +88,11 @@ public Metadata nextMetaData() { if ( bufferIterator.peek() != '<' ) { // nope, simple value. just grab the rest of the line final ByteSequence value = capture('\n'); - return new KeyValue(key, value); + return new KeyValueMetadata(key, value); } bufferIterator.skip(); // yup. multiple values. tokenize them. 
- return new KeyAttributes(key, captureAttributes()); + return new KeyAttributesMetadata(key, captureAttributes()); } /** once we've had hasMetadata return false (line doesn't start with '#') @@ -128,18 +136,6 @@ private boolean readBuffer() { return bufferIterator.hasNext(); } - private void expect( final String expect ) { - final int expectLen = expect.length(); - for ( int iii = 0; iii < expectLen; ++iii ) { - needData(); - final byte nextByte = bufferIterator.next(); - if ( expect.charAt(iii) != nextByte ) { - throw new MalformedVCFException("expected " + expect + " but found " + - expect.substring(0, iii) + (char)nextByte + "..."); - } - } - } - /** grab the sequence of bytes up to the specified delimiter */ private ByteSequence capture( final char delim ) { ByteSequence prefix = null; @@ -191,7 +187,10 @@ private List captureAttributes() { (bs == null ? prefix : new ByteSequence(prefix, bs)); attributes.add(new KeyValue(key, value)); } while ( finalByte != '>' ); - expect("\n"); + needData(); + if ( bufferIterator.next() != '\n' ) { + throw new MalformedVCFException("unexpected characters at end of metadata line"); + } return attributes; } @@ -315,20 +314,67 @@ public ByteSequence( final ByteSequence seq1, final ByteSequence seq2 ) { end = buffer.length; } + public ByteSequence( final ByteSequence... 
seqs ) { + int totalLen = 0; + for ( final ByteSequence seq : seqs ) { + totalLen += seq.length(); + } + buffer = new byte[totalLen]; + start = 0; + end = totalLen; + int curLen = 0; + for ( final ByteSequence seq : seqs ) { + final int len = seq.length(); + System.arraycopy(seq.buffer, seq.start, buffer, curLen, len); + curLen += len; + } + } + + public ByteSequence( final List pieces, final char delim ) { + final int nPieces = pieces.size(); + int totalLen = 0; + if ( nPieces > 0 ) { + totalLen = nPieces - 1; // this many delimiters + for ( final ByteSequence piece : pieces ) { + totalLen += piece.length(); + } + } + buffer = new byte[totalLen]; + start = 0; + end = totalLen; + if ( nPieces > 0 ) { + ByteSequence piece = pieces.get(0); + int destIdx = piece.length(); + System.arraycopy(piece.buffer, piece.start, buffer, 0, destIdx); + for ( int pieceIdx = 1; pieceIdx < nPieces; ++pieceIdx ) { + buffer[destIdx++] = (byte)delim; + piece = pieces.get(pieceIdx); + int len = piece.length(); + System.arraycopy(piece.buffer, piece.start, buffer, destIdx, len); + destIdx += len; + } + } + } + public int length() { return end - start; } - public ByteSequence replace( final ByteSequence oldValue, final ByteSequence newValue ) { - if ( buffer != oldValue.buffer ) { - throw new IllegalStateException("oldValue not drawn from INFO field"); + public boolean contains( final ByteSequence subSeq ) { + final int len = subSeq.length(); + final int stop = end - len; + for ( int idx = start; idx <= stop; ++idx ) { + int idx1 = idx; + int idx2 = subSeq.start; + int nnn = len; + while ( nnn-- > 0 ) { + if ( buffer[idx1++] != subSeq.buffer[idx2++] ) { + break; + } + } + if ( nnn < 0 ) { + return true; + } } - final int length = length(); - final int newLen = newValue.length(); - final byte[] newBuf = new byte[length + newLen - oldValue.length()]; - final int len1 = oldValue.start - start; - System.arraycopy(buffer, start, newBuf, 0, len1); - System.arraycopy(newValue.buffer, 
newValue.start, newBuf, len1, newLen); - System.arraycopy(buffer, oldValue.end, newBuf, len1 + newLen, end - oldValue.end); - return new ByteSequence(newBuf, 0, newBuf.length); + return false; } public int asInt() { @@ -365,7 +411,7 @@ public List split( final char delim ) { mark = itr.mark(); } } - splits.add(itr.getSequenceNoDelim(mark)); + splits.add(itr.getSequence(mark)); return splits; } @@ -392,7 +438,7 @@ public void write( final OutputStream os ) throws IOException { } public boolean equals( final ByteSequence that ) { - if ( length() != that.length() ) return false; + if ( that == null || length() != that.length() ) return false; int idx2 = that.start; for ( int idx = start; idx < end; ++idx ) { if ( buffer[idx] != that.buffer[idx2++] ) return false; @@ -401,20 +447,13 @@ public boolean equals( final ByteSequence that ) { } } - enum MetadataType { - KeyValue, - KeyAttributes, - Columns - } - public interface Metadata { - MetadataType getType(); ByteSequence getKey(); Object getValue(); void write( OutputStream os ) throws IOException; } - public static final class KeyValue implements Metadata { + public static final class KeyValue { private final ByteSequence key; private final ByteSequence value; @@ -423,32 +462,49 @@ public KeyValue( final ByteSequence key, final ByteSequence value ) { this.value = value; } - @Override public MetadataType getType() { return MetadataType.KeyValue; } - @Override public ByteSequence getKey() { return key; } - @Override public ByteSequence getValue() { return value; } + public ByteSequence getKey() { return key; } + public ByteSequence getValue() { return value; } + + public void write( final OutputStream os ) throws IOException { + key.write(os); + if ( value != null ) { + os.write('='); + value.write(os); + } + } + + @Override public String toString() { return key + "=" + value; } + } + + public static final class KeyValueMetadata implements Metadata { + private final KeyValue keyValue; + + public KeyValueMetadata( final 
ByteSequence key, final ByteSequence value ) { + keyValue = new KeyValue(key, value); + } + + @Override public ByteSequence getKey() { return keyValue.getKey(); } + @Override public ByteSequence getValue() { return keyValue.getValue(); } @Override public void write( final OutputStream os ) throws IOException { os.write('#'); os.write('#'); - key.write(os); - os.write('='); - value.write(os); + keyValue.write(os); os.write('\n'); } - @Override public String toString() { return "##" + key + "=" + value; } + @Override public String toString() { return keyValue.toString(); } } - public static final class KeyAttributes implements Metadata { + public static final class KeyAttributesMetadata implements Metadata { private final ByteSequence key; private final List values; - public KeyAttributes( final ByteSequence key, final List values ) { + public KeyAttributesMetadata( final ByteSequence key, final List values ) { this.key = key; this.values = values; } - @Override public MetadataType getType() { return MetadataType.KeyAttributes; } @Override public ByteSequence getKey() { return key; } @Override public List getValue() { return values; } @@ -460,9 +516,7 @@ public KeyAttributes( final ByteSequence key, final List values ) { int prefix = '<'; for ( final KeyValue kv : values ) { os.write(prefix); - kv.getKey().write(os); - os.write('='); - kv.getValue().write(os); + kv.write(os); prefix = ','; } os.write('>'); @@ -471,7 +525,7 @@ public KeyAttributes( final ByteSequence key, final List values ) { @Override public String toString() { final StringBuilder sb = new StringBuilder(); - sb.append("##").append(key).append("="); + sb.append(key).append("="); char prefix = '<'; for ( final KeyValue kv : values ) { sb.append(prefix).append(kv.getKey()).append('=').append(kv.getValue()); @@ -481,14 +535,13 @@ public KeyAttributes( final ByteSequence key, final List values ) { } } - public static final class Columns implements Metadata { + public static final class 
ColumnHeaderMetadata implements Metadata { private final List columns; - public Columns( final List columns ) { + public ColumnHeaderMetadata( final List columns ) { this.columns = columns; } - @Override public MetadataType getType() { return MetadataType.Columns; } @Override public ByteSequence getKey() { return EMPTY_SEQUENCE; } @Override public List getValue() { return columns; } @@ -513,112 +566,315 @@ public Columns( final List columns ) { } } + /** a field like format and genotype with delimited subfields */ + public static final class CompoundField extends AbstractList { + private ByteSequence value; + private final char delim; + private List subFields; + + public CompoundField( final ByteSequence value, final char delim ) { + this.value = value; + this.delim = delim; + subFields = null; + } + + public CompoundField( final List vals, final char delim ) { + this.value = null; + this.delim = delim; + this.subFields = vals; + } + + public ByteSequence getValue() { + if ( value == null ) { + value = new ByteSequence(subFields, delim); + } + return value; + } + + public void write( final OutputStream os ) throws IOException { + if ( value != null ) value.write(os); + else { + int len = subFields.size(); + if ( len <= 0 ) { + os.write('.'); + } else { + subFields.get(0).write(os); + for ( int idx = 1; idx < len; ++idx ) { + os.write(delim); + subFields.get(idx).write(os); + } + } + } + } + + @Override public int size() { + populateSubFields(); + return subFields.size(); + } + + @Override public ByteSequence get( final int index ) { + populateSubFields(); + return subFields.get(index); + } + + @Override public ByteSequence set( final int index, final ByteSequence val ) { + populateSubFields(); + value = null; + return subFields.set(index, val); + } + + @Override public void add( final int index, final ByteSequence val ) { + populateSubFields(); + value = null; + subFields.add(index, val); + } + + @Override public ByteSequence remove( final int index ) { + 
populateSubFields(); + value = null; + return subFields.remove(index); + } + + @Override public boolean equals( final Object obj ) { + if ( this == obj ) return true; + if ( !(obj instanceof CompoundField) ) return false; + return getValue().equals(((CompoundField)obj).getValue()); + } + @Override public int hashCode() { + return getValue().hashCode(); + } + @Override public String toString() { return getValue().toString(); } + + private void populateSubFields() { + if ( subFields == null ) { + subFields = value.split(delim); + } + } + } + + /** the info subfields are semicolon delimited and contain key/value pairs */ + public static final class InfoField extends AbstractMap { + private ByteSequence value; + private LinkedHashMap subFields; + + public InfoField( final ByteSequence value ) { + this.value = value; + subFields = null; + } + + public ByteSequence getValue() { + if ( value == null ) { + final ByteArrayOutputStream os = new ByteArrayOutputStream(); + try { + write(os); + } catch ( final IOException ioe ) { + throw new IllegalStateException("IOException when writing to ByteArrayOutputStream!?"); + } + final byte[] buffer = os.toByteArray(); + value = new ByteSequence(buffer, 0, buffer.length); + } + return value; + } + + public void write( final OutputStream os ) throws IOException { + if ( value != null ) { + value.write(os); + } else if ( subFields.isEmpty() ) { + os.write('.'); + } else { + boolean needSep = false; + for ( final Map.Entry entry : subFields.entrySet() ) { + if ( needSep ) { + os.write(';'); + } + needSep = true; + entry.getKey().write(os); + final ByteSequence value = entry.getValue(); + if ( value != null ) { + os.write('='); + value.write(os); + } + } + } + } + + @Override public Set> entrySet() { + populateSubFields(); + return subFields.entrySet(); + } + + @Override public boolean containsKey( final Object key ) { + populateSubFields(); + return subFields.containsKey(key); + } + + @Override public ByteSequence get( final Object key 
) { + populateSubFields(); + return subFields.get(key); + } + + @Override public ByteSequence put( final ByteSequence key, final ByteSequence val ) { + populateSubFields(); + value = null; + return subFields.put(key, val); + } + + @Override public ByteSequence remove( final Object key ) { + populateSubFields(); + if ( containsKey(key) ) { + value = null; + } + return subFields.remove(key); + } + + private void populateSubFields() { + if ( subFields == null ) { + subFields = new LinkedHashMap<>(); + final ByteIterator itr = value.iterator(); + int mark = itr.mark(); + ByteSequence key = null; + while ( itr.hasNext() ) { + byte nextByte = itr.next(); + if ( nextByte == '=' ) { + key = itr.getSequenceNoDelim(mark); + mark = itr.mark(); + } else if ( nextByte == ';' ) { + if ( key == null ) { + subFields.put(itr.getSequenceNoDelim(mark), null); + } else { + subFields.put(key, itr.getSequenceNoDelim(mark)); + } + key = null; + mark = itr.mark(); + } + } + if ( key == null ) { + subFields.put(itr.getSequence(mark), null); + } else { + subFields.put(key, itr.getSequence(mark)); + } + } + } + } + /** a line of data from the VCF */ public static final class Record { private static final int UNINITIALIZED = -1; - private final List columns; - private List infoKeyValues = null; + private final List simpleFields; + private CompoundField filters; + private InfoField infos; + private CompoundField formats; + private final List genotypes; + private int position = UNINITIALIZED; private int quality = UNINITIALIZED; - public Record( final List columns ) { - this.columns = columns; + public Record( final List vals ) { + simpleFields = new ArrayList<>(vals.subList(0, 6)); + filters = new CompoundField(vals.get(6), ';'); + infos = new InfoField(vals.get(7)); + final int nVals = vals.size(); + formats = nVals > 8 ? 
new CompoundField(vals.get(8), ':') : null; + genotypes = new ArrayList<>(Math.max(0, nVals - 9)); + for ( int idx = 9; idx < nVals; ++idx ) { + genotypes.add(new CompoundField(vals.get(idx), ':')); + } } - public ByteSequence getChromosome() { return columns.get(0); } + public ByteSequence getChromosome() { return simpleFields.get(0); } + public void setChromosome( final ByteSequence val ) { simpleFields.set(0, val); } public int getPosition() { if ( position == UNINITIALIZED ) { - position = columns.get(1).asInt(); + position = simpleFields.get(1).asInt(); } return position; } + public void setPosition( final int pos ) { + setPosition(new ByteSequence(Integer.toString(pos))); + } + public void setPosition( final ByteSequence val ) { + simpleFields.set(1, val); + position = UNINITIALIZED; + } + + public ByteSequence getID() { return simpleFields.get(2); } + public void setID( final ByteSequence val ) { simpleFields.set(2, val); } + + public ByteSequence getRef() { return simpleFields.get(3); } + public void setRef( final ByteSequence val ) { simpleFields.set(3, val); } - public ByteSequence getID() { return columns.get(2); } - public ByteSequence getRef() { return columns.get(3); } - public ByteSequence getAlt() { return columns.get(4); } + public ByteSequence getAlt() { return simpleFields.get(4); } + public void setAlt( final ByteSequence val ) { simpleFields.set(4, val); } public int getQuality() { if ( quality == UNINITIALIZED ) { - quality = columns.get(5).asInt(); + quality = simpleFields.get(5).asInt(); } return quality; } - - public ByteSequence getFilter() { return columns.get(6); } - - public List getInfo() { - if ( infoKeyValues == null ) { - infoKeyValues = parseKVs(columns.get(7)); - } - return infoKeyValues; + public void setQuality( final ByteSequence val ) { + simpleFields.set(5, val); + quality = UNINITIALIZED; } - public ByteSequence getInfoField( final ByteSequence key ) { - final List infoKeyValues = getInfo(); - for ( final KeyValue kv : 
infoKeyValues ) { - if ( key.equals(kv.getKey()) ) return kv.getValue(); - } - return null; + public CompoundField getFilter() { return filters; } + public void setFilter( final ByteSequence val ) { + filters = new CompoundField(val, ';'); } - - public void setInfoField( final ByteSequence oldValue, final ByteSequence newValue ) { - infoKeyValues = null; - columns.set(7, columns.get(7).replace(oldValue, newValue)); + public void setFilter( final List vals ) { + filters = new CompoundField(vals, ';'); } - public Map getInfoAsMap() { - final List infoList = getInfo(); - final Map infoMap = new HashMap<>(infoList.size() * 2); - infoList.forEach(kv -> infoMap.put(kv.getKey(), kv.getValue())); - return infoMap; + public InfoField getInfo() { + return infos; } + public void setInfo( final ByteSequence val ) { infos = new InfoField(val); } - public ByteSequence getFormat() { return columns.size() > 8 ? columns.get(8) : null; } + public CompoundField getFormat() { return formats; } + public void setFormat( final ByteSequence val ) { + formats = new CompoundField(val, ':'); + } - public List getGenotypes() { - return columns.size() > 9 ? 
columns.subList(9, columns.size()) : Collections.emptyList(); + public List getGenotypes() { return genotypes; } + public void setGenotypes( final List vals ) { + genotypes.clear(); + for ( final ByteSequence val : vals ) { + genotypes.add(new CompoundField(val, ':')); + } } public void write( final OutputStream os ) throws IOException { - final int nCols = columns.size(); - columns.get(0).write(os); - for ( int iii = 1; iii < nCols; ++iii ) { + simpleFields.get(0).write(os); + for ( int idx = 1; idx < 6; ++idx ) { os.write('\t'); - columns.get(iii).write(os); + simpleFields.get(idx).write(os); + } + os.write('\t'); + filters.write(os); + os.write('\t'); + infos.write(os); + if ( formats != null ) { + os.write('\t'); + formats.write(os); + for ( final CompoundField genotype : genotypes ) { + os.write('\t'); + genotype.write(os); + } } os.write('\n'); } - @Override public String toString() { + @Override + public String toString() { final StringBuilder sb = new StringBuilder(); String prefix = ""; - for ( final ByteSequence col : columns ) { - sb.append(prefix).append(col); + for ( final ByteSequence field : simpleFields ) { + sb.append(prefix).append(field.toString()); prefix = "\t"; } return sb.toString(); } - - private static List parseKVs( final ByteSequence bs ) { - final List attributes = new ArrayList<>(); - final ByteIterator itr = bs.iterator(); - int mark = itr.mark(); - ByteSequence key = null; - while ( itr.hasNext() ) { - byte nextByte = itr.next(); - if ( nextByte == '=' ) { - key = itr.getSequenceNoDelim(mark); - mark = itr.mark(); - } else if ( nextByte == ';' ) { - attributes.add(new KeyValue(key, itr.getSequenceNoDelim(mark))); - key = null; - mark = itr.mark(); - } - } - attributes.add(new KeyValue(key, itr.getSequence(mark))); - return attributes; - } } } diff --git a/src/sv-pipeline/java/org/broadinstitute/svpipeline/VCFParserUnitTest.java b/src/sv-pipeline/java/org/broadinstitute/svpipeline/VCFParserUnitTest.java new file mode 100644 index 
000000000..08e3a2339 --- /dev/null +++ b/src/sv-pipeline/java/org/broadinstitute/svpipeline/VCFParserUnitTest.java @@ -0,0 +1,267 @@ +package org.broadinstitute.svpipeline; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.util.Arrays; +import java.util.Collections; +import java.util.List; + +import org.broadinstitute.svpipeline.VCFParser.*; + +public final class VCFParserUnitTest { + public static void main( final String[] args ) { + testAsserts(); + testEmptyFile(); + testFileFormatMetadata(); + testFilter(); + testColumnHeaders(); + testRecord(); + testRoundTrip(); + System.out.println("OK"); + } + + public static void testAsserts() { + boolean caughtIt = false; + try { + assert(false); + } catch ( final AssertionError ae ) { + caughtIt = true; + } + if ( !caughtIt ) { + throw new AssertionError("assertions aren't turned on (with -ea), so you're not testing anything."); + } + } + + public static void testEmptyFile() { + boolean caughtIt = false; + try ( final VCFParser parser = new VCFParser("/dev/null") ) { + assert(!parser.hasMetadata()); + } catch ( final MalformedVCFException emptyVCF ) { + caughtIt = true; + } + assert(caughtIt); + } + + public static void testFileFormatMetadata() { + final byte[] bytes = "##fileformat=VCFv4.2\n".getBytes(); + final VCFParser parser = new VCFParser(new ByteArrayInputStream(bytes)); + assert(parser.hasMetadata()); + final Metadata metadata = parser.nextMetaData(); + assert(metadata instanceof KeyValueMetadata); + final KeyValueMetadata kvMetadata = (KeyValueMetadata)metadata; + assert(kvMetadata.getKey().equals(new ByteSequence("fileformat"))); + assert(kvMetadata.getValue().equals(new ByteSequence("VCFv4.2"))); + assert(!parser.hasMetadata()); + assert(!parser.hasRecord()); + try ( final ByteArrayOutputStream os = new ByteArrayOutputStream() ) { + metadata.write(os); + assert(Arrays.equals(bytes, os.toByteArray())); + } catch ( final IOException ioe ) { 
+ throw new RuntimeException(ioe); + } + parser.close(); + } + + public static void testFilter() { + final byte[] bytes = "##FILTER=\n".getBytes(); + final VCFParser parser = new VCFParser(new ByteArrayInputStream(bytes)); + assert(parser.hasMetadata()); + final Metadata metadata = parser.nextMetaData(); + assert(metadata instanceof KeyAttributesMetadata); + final KeyAttributesMetadata kaMetadata = (KeyAttributesMetadata)metadata; + assert(kaMetadata.getKey().equals(new ByteSequence("FILTER"))); + final List kaValues = kaMetadata.getValue(); + assert(kaValues.size() == 2); + final KeyValue kv0 = kaValues.get(0); + assert(kv0.getKey().equals(new ByteSequence("ID"))); + assert(kv0.getValue().equals(new ByteSequence("PASS"))); + final KeyValue kv1 = kaValues.get(1); + assert(kv1.getKey().equals(new ByteSequence("Description"))); + assert(kv1.getValue().equals(new ByteSequence("\"All filters passed\""))); + assert(!parser.hasMetadata()); + assert(!parser.hasRecord()); + try ( final ByteArrayOutputStream os = new ByteArrayOutputStream() ) { + metadata.write(os); + assert(Arrays.equals(bytes, os.toByteArray())); + } catch ( final IOException ioe ) { + throw new RuntimeException(ioe); + } + } + + public static void testColumnHeaders() { + final String line = "CHROM\tPOS\tID\tREF\tALT\tQUAL\tFILTER\tINFO\tFORMAT\tSAMPLE1\tSAMPLE2"; + final byte[] bytes = ("#" + line + "\n").getBytes(); + final VCFParser parser = new VCFParser(new ByteArrayInputStream(bytes)); + assert(parser.hasMetadata()); + final Metadata metadata = parser.nextMetaData(); + assert(metadata instanceof ColumnHeaderMetadata); + final ColumnHeaderMetadata columns = (ColumnHeaderMetadata)metadata; + final List cols = columns.getValue(); + final String[] splitLine = line.split("\t"); + assert(splitLine.length == cols.size()); + for ( int idx = 0; idx < splitLine.length; ++idx ) { + assert(cols.get(idx).equals(new ByteSequence(splitLine[idx]))); + } + assert(!parser.hasMetadata()); + 
assert(!parser.hasRecord()); + try ( final ByteArrayOutputStream os = new ByteArrayOutputStream() ) { + metadata.write(os); + assert(Arrays.equals(bytes, os.toByteArray())); + } catch ( final IOException ioe ) { + throw new RuntimeException(ioe); + } + } + + public static void testRecord() { + final String line = "chr1\t10000\tna19240_DUP_chr1_1\tN\t\t999\tPASS;BUT_FUNKY\t" + + "END=16000;SVTYPE=DUP;FLAG1;CHR2=chr1;SVLEN=6000;ALGORITHMS=depth;EVIDENCE=RD;FLAG2\t" + + "GT:GQ:RD_CN:RD_GQ:PE_GT:PE_GQ:SR_GT:SR_GQ:EV\t0/1:142:3:142:.:.:.:.:RD\t" + + "0/0:999:2:999:.:.:.:.:RD"; + final byte[] bytes = (line + "\n").getBytes(); + final VCFParser parser = new VCFParser(new ByteArrayInputStream(bytes)); + assert(!parser.hasMetadata()); + assert(parser.hasRecord()); + final Record record = parser.nextRecord(); + assert(!parser.hasMetadata()); + assert(!parser.hasRecord()); + final String[] cols = line.split("\t"); + + assert(record.getChromosome().equals(new ByteSequence(cols[0]))); + final ByteSequence newChr = new ByteSequence("chr1"); + record.setChromosome(newChr); + assert(record.getChromosome().equals(newChr)); + + final int curPosition = record.getPosition(); + assert(curPosition == Integer.parseInt(cols[1])); + final ByteSequence newPos = new ByteSequence("10001"); + record.setPosition(newPos); + final int newPosition = record.getPosition(); + assert(newPosition == 10001); + + assert(record.getID().equals(new ByteSequence(cols[2]))); + final ByteSequence newID = new ByteSequence("newID"); + record.setID(newID); + assert(record.getID().equals(newID)); + + assert(record.getRef().equals(new ByteSequence(cols[3]))); + final ByteSequence newRef = new ByteSequence("A"); + record.setRef(newRef); + assert(record.getRef().equals(newRef)); + + assert(record.getAlt().equals(new ByteSequence(cols[4]))); + final ByteSequence newAlt = new ByteSequence("C"); + record.setAlt(newAlt); + assert(record.getAlt().equals(newAlt)); + + final int curQuality = record.getQuality(); + 
assert(curQuality == Integer.parseInt(cols[5])); + final ByteSequence newQual = new ByteSequence("1"); + record.setQuality(newQual); + final int newQuality = record.getQuality(); + assert(newQuality == 1); + + final CompoundField filters = record.getFilter(); + final ByteSequence originalFilters = new ByteSequence(cols[6]); + final ByteSequence curFilters = filters.getValue(); + assert(curFilters.equals(originalFilters)); + assert(filters.size() == 2); + assert(filters.get(0).equals(new ByteSequence("PASS"))); + assert(filters.get(1).equals(new ByteSequence("BUT_FUNKY"))); + final ByteSequence failFilter = new ByteSequence("FAIL"); + filters.set(0, failFilter); + assert(filters.get(0).equals(failFilter)); + final ByteSequence newFilters = filters.getValue(); + assert(newFilters.equals(new ByteSequence("FAIL;BUT_FUNKY"))); + record.setFilter(originalFilters); + final CompoundField revisedFilters = record.getFilter(); + final ByteSequence newerFilters = revisedFilters.getValue(); + assert(newerFilters.equals(originalFilters)); + revisedFilters.add(revisedFilters.remove(0)); + final ByteSequence newestFilters = revisedFilters.getValue(); + assert(newestFilters.equals(new ByteSequence("BUT_FUNKY;PASS"))); + + final InfoField info = record.getInfo(); + final ByteSequence originalInfo = new ByteSequence(cols[7]); + final ByteSequence curInfo = info.getValue(); + assert(curInfo.equals(originalInfo)); + final String[] infoVals = cols[7].split(";"); + assert(info.size() == infoVals.length); + for ( final String val : infoVals ) { + final String[] kv = val.split("="); + final ByteSequence key = new ByteSequence(kv[0]); + assert(info.containsKey(key)); + if ( kv.length > 1 ) { + assert(info.get(key).equals(new ByteSequence(kv[1]))); + } else { + assert(info.get(key) == null); + } + } + final ByteSequence svLenKey = new ByteSequence("SVLEN"); + final ByteSequence newSVLen = new ByteSequence("6001"); + info.put(svLenKey, newSVLen); + assert(info.get(svLenKey).equals(newSVLen)); 
+ info.put(svLenKey, new ByteSequence("6000")); + final ByteSequence newInfoValue = info.getValue(); + assert(newInfoValue.equals(originalInfo)); + final ByteSequence flag1Key = new ByteSequence("FLAG1"); + info.remove(flag1Key); + assert(info.get(flag1Key) == null); + final ByteSequence flag2Key = new ByteSequence("FLAG2"); + record.setInfo(flag2Key); + final InfoField newInfo = record.getInfo(); + assert(!newInfo.containsKey(flag1Key)); + assert(newInfo.containsKey(flag2Key)); + + final CompoundField format = record.getFormat(); + final ByteSequence originalFormat = new ByteSequence(cols[8]); + final ByteSequence curFormat = format.getValue(); + assert(curFormat.equals(originalFormat)); + record.setFormat(new ByteSequence("GT")); + assert(record.getFormat().size() == 1); + + final List genotypes = record.getGenotypes(); + assert(genotypes.size() == 2); + final ByteSequence geno1 = genotypes.get(0).getValue(); + final ByteSequence geno1Value = new ByteSequence("0/1:142:3:142:.:.:.:.:RD"); + assert(geno1.equals(geno1Value)); + final ByteSequence geno2 = genotypes.get(1).getValue(); + final ByteSequence geno2Value = new ByteSequence("0/0:999:2:999:.:.:.:.:RD"); + assert(geno2.equals(geno2Value)); + record.setGenotypes(Collections.singletonList(geno2Value)); + final List newGenotypes = record.getGenotypes(); + assert(newGenotypes.size() == 1); + final ByteSequence newGeno2Value = newGenotypes.get(0).getValue(); + assert(newGeno2Value.equals(geno2Value)); + } + + public static void testRoundTrip() { + final StringBuilder sb = new StringBuilder(100000); + for ( int idx = 0; idx < 1000; ++idx ) { + buildLine(idx, sb); + } + final byte[] bytes = sb.toString().getBytes(); + final ByteArrayInputStream is = new ByteArrayInputStream(bytes); + final VCFParser parser = new VCFParser(is); + final ByteArrayOutputStream os = new ByteArrayOutputStream(100000); + while ( parser.hasRecord() ) { + final Record record = parser.nextRecord(); + try { + record.write(os); + } catch ( 
final IOException ioe ) { + throw new RuntimeException("unexpected IOException"); + } + } + parser.close(); + assert(Arrays.equals(os.toByteArray(),bytes)); + } + + private static void buildLine( final int idx, final StringBuilder sb ) { + final int pos = 10000 * idx; + sb.append("chr1\t").append(10000+100*idx).append('\t').append("Event").append(idx).append('\t'); + sb.append("N\t").append("\t").append("999\t").append("PASS\t"); + sb.append("END=").append(pos+999).append('\t'); + sb.append("SVTYPE=DUP;CHR2=chr1;SVLEN=1000;ALGORITHMS=depth;EVIDENCE=RD\t"); + sb.append("GT:GQ:RD_CN\t").append("0/1:999:2\t").append("0/0:999:1\n"); + } +} diff --git a/src/sv-pipeline/scripts/hailmerge.py b/src/sv-pipeline/scripts/hailmerge.py new file mode 100644 index 000000000..b984738e4 --- /dev/null +++ b/src/sv-pipeline/scripts/hailmerge.py @@ -0,0 +1,79 @@ +import hail as hl +import argparse + +parser = argparse.ArgumentParser() +parser.add_argument('out_bucket') +parser.add_argument('cluster_name') +args = parser.parse_args() + +files = [f.rstrip() for f in open("files.list", "r").readlines()] + +# Define custom reference with only primary contigs, otherwise Hail adds all GRCh38 contigs, +# which may be problematic downstream + +contigs = [ + "chr1", + "chr2", + "chr3", + "chr4", + "chr5", + "chr6", + "chr7", + "chr8", + "chr9", + "chr10", + "chr11", + "chr12", + "chr13", + "chr14", + "chr15", + "chr16", + "chr17", + "chr18", + "chr19", + "chr20", + "chr21", + "chr22", + "chrX", + "chrY" +] + +lengths = { + "chr1": 248956422, + "chr2": 242193529, + "chr3": 198295559, + "chr4": 190214555, + "chr5": 181538259, + "chr6": 170805979, + "chr7": 159345973, + "chr8": 145138636, + "chr9": 138394717, + "chr10": 133797422, + "chr11": 135086622, + "chr12": 133275309, + "chr13": 114364328, + "chr14": 107043718, + "chr15": 101991189, + "chr16": 90338345, + "chr17": 83257441, + "chr18": 80373285, + "chr19": 58617616, + "chr20": 64444167, + "chr21": 46709983, + "chr22": 50818468, + "chrX": 
156040895, + "chrY": 57227415 +} + +ref = hl.ReferenceGenome(name="hg38", contigs=contigs, lengths=lengths, x_contigs="chrX", y_contigs="chrY") +all_datasets = hl.import_vcf(files, reference_genome=ref, force_bgz=True) + +# union_rows approach causes ClassTooLargeException +# mt = hl.MatrixTable.union_rows(*all_datasets) +mt = all_datasets +# reset the qual to missing because hail by default populates it with -1.00e+01 +merged_reset_qual = mt.annotate_rows(qual=hl.missing('float64')) + +hl.export_vcf(merged_reset_qual, + "gs://{}/{}/merged.vcf.bgz".format(args.out_bucket, args.cluster_name), + metadata=hl.get_vcf_metadata(files[0])) diff --git a/src/svtest/svtest/utils/VCFUtils.py b/src/svtest/svtest/utils/VCFUtils.py index 177b8224d..8d6fb16e1 100644 --- a/src/svtest/svtest/utils/VCFUtils.py +++ b/src/svtest/svtest/utils/VCFUtils.py @@ -7,8 +7,14 @@ def get_info_field(record, name): if name not in record.info: - raise ValueError("%s info field not found: %s" % - (name, record.info.keys())) + if name == 'SVLEN': + if record.info['SVTYPE'] in ['DEL', 'DUP', 'INV']: + record.info['SVLEN'] = record.stop - record.pos + else: + record.info['SVLEN'] = -1 + else: + raise ValueError("%s info field not found: %s" % + (name, record.info.keys())) return record.info[name] diff --git a/src/svtk/scripts/svtk b/src/svtk/scripts/svtk index c6f6054b7..f05169c6b 100755 --- a/src/svtk/scripts/svtk +++ b/src/svtk/scripts/svtk @@ -25,7 +25,6 @@ usage: svtk [-h] [options] rdtest* Calculate comparative coverage statistics at CNV sites. [ PE/SR analysis ] - collect-pesr Count clipped reads and extract discordant pairs genomewide. sr-test Calculate enrichment of clipped reads at SV breakpoints. pe-test Calculate enrichment of discordant pairs at SV breakpoints.
diff --git a/src/svtk/setup.py b/src/svtk/setup.py index 66ce3df43..b967e07d6 100755 --- a/src/svtk/setup.py +++ b/src/svtk/setup.py @@ -28,7 +28,6 @@ 'pybedtools', 'cython', 'natsort', - 'boto3<=1.9.224', 'pandas', ] ) diff --git a/src/svtk/svtk/cli/__init__.py b/src/svtk/svtk/cli/__init__.py index 24766f84b..6055c9903 100644 --- a/src/svtk/svtk/cli/__init__.py +++ b/src/svtk/svtk/cli/__init__.py @@ -5,7 +5,6 @@ from .bincov import main as bincov from .rdtest2vcf import main as rdtest2vcf from .resolve import main as resolve -from .collect_pesr import main as collect_pesr from .annotate import main as annotate from .utils import vcf2bed, remote_tabix from .pesr_test import pe_test, sr_test, count_pe, count_sr diff --git a/src/svtk/svtk/cli/bedcluster.py b/src/svtk/svtk/cli/bedcluster.py index 45439cc90..b220e6411 100644 --- a/src/svtk/svtk/cli/bedcluster.py +++ b/src/svtk/svtk/cli/bedcluster.py @@ -58,15 +58,11 @@ def bedcluster(bed, frac=0.8, intersection=None): ------- clusters : list of deque of pybedtools.Interval """ - - # Get list of unique variant IDs and initialize sparse graph - variant_IDs = [interval.fields[3] for interval in bed.intervals] - G = sparse.eye(len(variant_IDs), dtype=np.uint16, format='lil') - - # Map variant IDs to graph indices - variant_indexes = {} - for i, variant in enumerate(variant_IDs): - variant_indexes[variant.strip()] = i + # Get list of unique variant IDs and map to indices on sparse graph + variant_indices = {variant_id: index for index, variant_id in enumerate( + {interval.name for interval in bed.intervals}) + } + G = sparse.eye(len(variant_indices), dtype=np.uint16, format='lil') # Self-intersect the bed if intersection is None: @@ -81,19 +77,18 @@ def bedcluster(bed, frac=0.8, intersection=None): # Link the two calls from the current line if c2.chrom != '.' 
and c1.svtype == c2.svtype: - idx1 = variant_indexes[c1.name] - idx2 = variant_indexes[c2.name] + idx1 = variant_indices[c1.name] + idx2 = variant_indices[c2.name] G[idx1, idx2] = 1 # Cluster graph n_comp, cluster_labels = csgraph.connected_components(G, connection='weak') - # Build deques of clustered Intervals - clusters = [deque() for i in range(n_comp)] - for idx, interval in enumerate(bed.intervals): - label = cluster_labels[idx] + # Build lists of clustered Intervals + clusters = [[] for _ in range(n_comp)] + for interval in bed.intervals: + label = cluster_labels[variant_indices[interval.name]] clusters[label].append(interval) - return clusters diff --git a/src/svtk/svtk/cli/collect_pesr.py b/src/svtk/svtk/cli/collect_pesr.py deleted file mode 100644 index ec82401d0..000000000 --- a/src/svtk/svtk/cli/collect_pesr.py +++ /dev/null @@ -1,313 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- -# - -""" -Collect split read and discordant pair data from a bam alignment. - -Split reads: The tool counts the number of reads soft-clipped in each direction -(30S121M = left-clipped, 121M30S = right-clipped) at each position in the -genome. The position of a right-clipped read is shifted by the length of its -alignment. - -Discordant pairs: The tool reduces discordant pairs to (chrA, posA, strandA, -chrB, posB, strandB). - -Unmapped reads, reads with unmapped mates, secondary and supplementary -alignments, and duplicates are excluded (SAM flag 3340). - -Collection can be performed on an S3-hosted bam. The tool will attempt to find -a local copy of the bam index in the working directory, or the directory -specified with `--index-dir`, otherwise the index will be downloaded. 
-""" - -import argparse -import sys -from collections import defaultdict, deque -import numpy as np -import pysam -from natsort import natsorted -import svtk.utils as svu - - -class PESRCollection: - def __init__(self, bam, splitfile, discfile, sample='.', - max_split_dist=300): - self.bam = bam - self.splitfile = splitfile - self.discfile = discfile - self.sample = sample - - # SR evidence - self.right_split_counts = defaultdict(int) - self.left_split_counts = defaultdict(int) - self.prev_split_pos = None - self.curr_chrom = None - self.max_split_dist = max_split_dist - - # PE evidence - self.disc_pairs = deque() - self.observed_disc_names = {} - self.curr_disc_pos = -1 - - def collect_pesr(self): - """ - Collect PE and SR evidence from a BAM file. - - Excludes unmapped reads, reads with an unmapped mate, duplicate reads, - and secondary or supplementary alignments. Reads are considered split - if their CIGAR string contains a soft clip operation. - """ - - for read in self.bam: - # Restrict to unique primary alignments with a mapped mate - # Equivalent to `samtools view -F 3340` - if svu.is_excluded(read): - continue - - # Soft clip indicate a candidate split read - if svu.is_soft_clipped(read): - if self.splitfile is not None: - self.count_split(read) - - # After counting splits, evaluate discordant pairs - if not read.is_proper_pair: - if self.discfile is not None: - self.report_disc(read) - - self.flush_split_counts() - self.flush_disc_pairs() - - def report_disc(self, read): - """ - Report simplified discordant pair info. 
- - Parameters - ---------- - read : pysam.AlignedSegment - """ - - # Stack up all discordant pairs at a position, then sort - # and write out in chunks - if read.reference_start != self.curr_disc_pos: - self.flush_disc_pairs() - self.curr_disc_pos = read.reference_start - - # Avoid double-counting translocations by requiring chrA < chrB - if read.reference_id < read.next_reference_id: - self.disc_pairs.append(read) - - # If interchromosomal, rely on coordinate to not double count - elif read.reference_id == read.next_reference_id: - # Report if posA < posB - if read.reference_start < read.next_reference_start: - self.disc_pairs.append(read) - - # If posA == posB, check if we've seen the read before - elif read.reference_start == read.next_reference_start: - # If we have, delete the log to save memory and skip the read - if read.query_name in self.observed_disc_names: - del self.observed_disc_names[read.query_name] - - # Otherwise, report and log it - else: - self.disc_pairs.append(read) - self.observed_disc_names[read.query_name] = 1 - - def write_disc(self, read): - """ - Write discordant pair to file. - """ - strandA = '-' if read.is_reverse else '+' - strandB = '-' if read.mate_is_reverse else '+' - - self.discfile.write( - ('%s\t%d\t%s\t%s\t%d\t%s\t%s\n' % ( - read.reference_name, read.reference_start, strandA, - read.next_reference_name, read.next_reference_start, strandB, - self.sample) - ).encode('utf-8')) - - def flush_disc_pairs(self): - """ - Write all logged discordant reads to file. - """ - def _key(read): - return (read.reference_name, read.reference_start, - read.next_reference_name, read.next_reference_start) - - # Sort by chrA/posA and chrB/posB then write to disc - for read in natsorted(self.disc_pairs, key=_key): - self.write_disc(read) - - # Reset list of reads - self.disc_pairs = deque() - - def count_split(self, read): - """ - Count splits at each position. 
- - Parameters - ---------- - read : pysam.AlignedSegment - """ - - split_positions = get_split_positions(read) - # pos, side = get_split_positions(read) - - for (pos, side) in split_positions: - # Calculate distance to previous split and update position tracker - # Use abs to catch contig switches - if self.prev_split_pos is None: - dist = 0 - else: - dist = np.abs(pos - self.prev_split_pos) - self.prev_split_pos = pos - - if self.curr_chrom is None: - self.curr_chrom = read.reference_name - - # Flush aggregated split reads if we've moved beyond the max dist - if dist > self.max_split_dist: - self.flush_split_counts() - self.curr_chrom = read.reference_name - - # Tally the split at its corresponding position - if side == 'RIGHT': - self.right_split_counts[pos] += 1 - elif side == 'LEFT': - self.left_split_counts[pos] += 1 - - def flush_split_counts(self): - """ - Write current split counts to disk and reset dictionaries - """ - - # Compile counts collected so far - entries = deque() - for clip in 'left right'.split(): - df = getattr(self, '%s_split_counts' % clip) - - for pos, count in df.items(): - entries.append((self.curr_chrom, pos, clip, count, - self.sample)) - - # Sort in chunks as we go - entries = sorted(entries, key=lambda s: s[1]) - - # Flush to disk - fmt = '%s\t%d\t%s\t%d\t%s\n' - for entry in entries: - self.splitfile.write((fmt % entry).encode('utf-8')) - - # Reset split counts - self.right_split_counts = defaultdict(int) - self.left_split_counts = defaultdict(int) - - -def get_split_positions(read): - """ - Calculate split coordinate based on read alignment and CIGAR operations. - - Support is only present for reads soft-clipped on one side, e.g. 100M51S, - as the coordinate is calculated by shifting the alignment position by the - length of the flanking match operation. 
- - Parameters - ---------- - read : pysam.AlignedSegment - - Returns - ------- - pos : int - Adjusted split read coordinate - side : str [RIGHT,LEFT,MIDDLE] - Direction of soft clip - """ - - pos = read.pos - - split_positions = [] - - # Left soft clip - sequence is already aligned to split position - if is_left_clipped(read): - split_positions.append([pos, 'LEFT']) - - # Right soft clip - add length of aligned sequence - if is_right_clipped(read): - clip_pos = pos - for operation, length in read.cigartuples: - # Only shift based on matches, ignore DEL/INS/clips - if not is_clipping_operation(operation) and operation_consumes_ref_bases(operation): - clip_pos += length - split_positions.append([clip_pos, 'RIGHT']) - - return split_positions - - -def is_left_clipped(read): - return len(read.cigartuples) >= 1 and is_clipping_operation(read.cigartuples[0][0]) - - -def is_right_clipped(read): - return len(read.cigartuples) >= 1 and is_clipping_operation(read.cigartuples[-1][0]) - - -def is_clipping_operation(operation): - return operation == 4 or operation == 5 - - -def operation_consumes_ref_bases(operation): - """ - Returns true if this is a cigar operation that consumes reference bases - """ - return operation == 0 or operation == 2 or operation == 3 or operation == 7 - - -def main(argv): - parser = argparse.ArgumentParser( - description=__doc__, - prog='svtk collect-pesr', - formatter_class=argparse.RawDescriptionHelpFormatter) - - parser.add_argument('bam', help='Local or S3 path to bam') - parser.add_argument('sample', help='ID to append to each line of output ' - 'files.') - parser.add_argument('splitfile', - help='Output split counts.') - parser.add_argument('discfile', - help='Output discordant pairs.') - - parser.add_argument('--index-dir', default=None, - help='Directory of local BAM indexes if accessing ' - 'a remote S3 bam.') - parser.add_argument('-r', '--region', - help='Tabix-formatted region to parse') - parser.add_argument('-z', '--bgzip', 
default=False, action='store_true', - help='bgzip and tabix index output') - - # Print help if no arguments specified - if len(argv) == 0: - parser.print_help() - sys.exit(1) - args = parser.parse_args(argv) - - # Load bam from S3 if necessary - if args.bam.startswith('s3://'): - bam = svu.load_s3bam(args.bam, args.index_dir) - else: - bam = pysam.AlignmentFile(args.bam) - - # Restrict to region of interest - if args.region: - bam = bam.fetch(args.region.encode('utf-8')) - - # Collect data and save - with svu.BgzipFile(args.splitfile, args.bgzip) as splitfile: - with svu.BgzipFile(args.discfile, args.bgzip) as discfile: - PESRCollection(bam, splitfile, discfile, - args.sample).collect_pesr() - - -if __name__ == '__main__': - main(sys.argv[1:]) diff --git a/src/svtk/svtk/cli/resolve.py b/src/svtk/svtk/cli/resolve.py index 3ed275d4d..ad9578cc0 100644 --- a/src/svtk/svtk/cli/resolve.py +++ b/src/svtk/svtk/cli/resolve.py @@ -12,6 +12,8 @@ import numpy as np import string from collections import deque +from operator import attrgetter +import itertools import pysam import pybedtools as pbt import svtk.utils as svu @@ -91,25 +93,16 @@ def remove_CPX_from_INV(resolve_CPX, resolve_INV): return out +def multisort(xs, specs): + for key, reverse in reversed(specs): + xs.sort(key=attrgetter(key), reverse=reverse) + return xs + + def cluster_INV(independent_INV): - inv_hash = {} - for i in independent_INV: - if i.chrom not in inv_hash.keys(): - inv_hash[i.chrom] = {} - if i.pos not in inv_hash[i.chrom].keys(): - inv_hash[i.chrom][i.pos] = {} - if i.stop not in inv_hash[i.chrom][i.pos].keys(): - inv_hash[i.chrom][i.pos][i.stop] = i - list_INV = {} - for i in inv_hash.keys(): - list_INV[i] = [] - for j in sorted(inv_hash[i].keys()): - for k in sorted(inv_hash[i][j].keys()): - list_INV[i].append(inv_hash[i][j][k]) - out = [] - for i in list_INV.keys(): - out += _cluster_INV_list(list_INV[i]) - return out + list_INV = [multisort(list(group), (('pos', False), ('stop', False))) + for 
chrom, group in itertools.groupby(independent_INV, attrgetter('chrom'))] + return [x for group in list_INV for x in _cluster_INV_list(group)] def _cluster_INV_list(independent_INV): @@ -316,13 +309,12 @@ def cluster_cleanup(clusters_v2): return [clusters_v2[i] for i in cluster_pos] -def resolve_complex_sv_v2(resolve_CPX, resolve_INV, resolve_CNV, cytobands, disc_pairs, +def resolve_complex_sv_v2(resolve_INV, cytobands, disc_pairs, mei_bed, variant_prefix='CPX_', min_rescan_support=4, pe_blacklist=None, quiet=False, SR_only_cutoff=1000, random_resolved_id_length=10): - independent_INV = remove_CPX_from_INV(resolve_CPX, resolve_INV) - linked_INV = cluster_INV(independent_INV) - clusters_v2 = link_cpx_V2(linked_INV, resolve_CNV, cpx_dist=2000) + linked_INV = cluster_INV(resolve_INV) + clusters_v2 = link_cpx_V2(linked_INV, cpx_dist=2000) clusters_v2 = cluster_cleanup(clusters_v2) np.random.seed(0) # arbitrary fixed seed for reproducibility @@ -518,9 +510,7 @@ def main(argv): # RLC: As of Sept 19, 2018, only considering inversion single-enders in second-pass # due to too many errors in second-pass linking and variant reporting - resolve_CPX = [] - resolve_CNV = [] - cpx_records_v2 = resolve_complex_sv_v2(resolve_CPX, resolve_INV, resolve_CNV, + cpx_records_v2 = resolve_complex_sv_v2(resolve_INV, cytobands, disc_pairs, mei_bed, args.prefix, args.min_rescan_pe_support, blacklist, args.quiet) diff --git a/src/svtk/svtk/cli/vcfcluster.py b/src/svtk/svtk/cli/vcfcluster.py index e0d0cc1a9..b91c137d4 100644 --- a/src/svtk/svtk/cli/vcfcluster.py +++ b/src/svtk/svtk/cli/vcfcluster.py @@ -115,7 +115,11 @@ def main(argv): help='Do not merge clustered records. 
Adds CLUSTER info fields.') parser.add_argument('--merge-only', action='store_true', default=False, - help='When run on a vcf generated with --skip-merge, only merges records with identical CLUSTER fields.') + help='When run on a vcf generated with --skip-merge, only merges records ' + 'with identical CLUSTER fields.') + parser.add_argument('--single-end', action='store_true', + default=False, + help='Require only one end to be within the minimum distance.') # parser.add_argument('--cluster-bed', type=argparse.FileType('w'), # help='Bed of constituent calls in each cluster') @@ -145,7 +149,8 @@ def main(argv): sample_overlap=args.sample_overlap, preserve_header=args.preserve_header, do_cluster=do_cluster, - do_merge=do_merge) + do_merge=do_merge, + single_end=args.single_end) # Open new file if args.fout in '- stdout'.split(): diff --git a/src/svtk/svtk/cxsv/complex_sv.py b/src/svtk/svtk/cxsv/complex_sv.py index c3e0e562f..777c46255 100644 --- a/src/svtk/svtk/cxsv/complex_sv.py +++ b/src/svtk/svtk/cxsv/complex_sv.py @@ -179,6 +179,14 @@ def clean_record(self): if len(varGQs) > 0 and 'varGQ' in self.vcf_record.header.info.keys(): self.vcf_record.info['varGQ'] = max(varGQs) + # if resolved as a CNV, ensure RD_CN and RD_GQ are set + if len(self.records) > 1 and self.vcf_record.info['SVTYPE'] in ['DEL', 'DUP', 'CNV'] and len(self.cnvs) > 0: + cnv_record = self.cnvs[0] + if 'RD_CN' in cnv_record.format.keys() and 'RD_GQ' in cnv_record.format.keys(): + for sample in self.vcf_record.samples: + self.vcf_record.samples[sample]['RD_CN'] = cnv_record.samples[sample]['RD_CN'] + self.vcf_record.samples[sample]['RD_GQ'] = cnv_record.samples[sample]['RD_GQ'] + @property def record_ids(self): return [r.id for r in self.records] @@ -245,17 +253,8 @@ def resolve_inversion(self, SR_only_cutoff): is_mei = check_mei_overlap(self.vcf_record.chrom, source_start, source_end, self.mei_bed) - # then check for RdTest support - # is_dup = check_rdtest(self.vcf_record, source_start, 
source_end, - # self.rdtest) - if is_mei: self.cpx_type = 'MEI_' + self.cpx_type.split('/')[1] - # elif is_dup: - # self.svtype = 'CPX' - # self.cpx_type = 'INV_DISPERSED_DUP' - # else: - # self.cpx_type = self.cpx_type.split('/')[1] self.vcf_record.pos = sink_start self.vcf_record.stop = sink_end @@ -480,6 +479,8 @@ def report_simple_insertion(self): record = self.insertions[0] self.cpx_type = record.alts[0].strip('<>') self.svtype = 'INS' + self.vcf_record.pos = record.pos + self.vcf_record.stop = record.stop self.vcf_record.alts = record.alts self.vcf_record.id = record.id self.vcf_record.info['SVTYPE'] = self.svtype @@ -490,6 +491,8 @@ def report_simple_insertion(self): record = self.insertions[0] self.cpx_type = record.alts[0].strip('<>') self.svtype = 'INS' + self.vcf_record.pos = record.pos + self.vcf_record.stop = record.stop self.vcf_record.id = record.id self.vcf_record.alts = record.alts self.vcf_record.info['SVTYPE'] = self.svtype @@ -508,6 +511,8 @@ def report_insertion_strip_CNVs(self): and self.cnvs[0].info['SVTYPE'] == 'DUP': record = self.cnvs[0] self.svtype = 'DUP' + self.vcf_record.pos = record.pos + self.vcf_record.stop = record.stop self.vcf_record.id = record.id self.vcf_record.alts = record.alts self.vcf_record.info['SVTYPE'] = self.svtype @@ -519,75 +524,13 @@ def report_insertion_strip_CNVs(self): else: self.svtype = 'INS' - # if len(self.breakends) > 0 and len(self.cnvs) == 0: - # record = self.insertions[0] - # self.cpx_type = record.alts[0].strip('<>') - # self.svtype = 'INS' - # self.vcf_record.alts = record.alts - # self.vcf_record.info['SVTYPE'] = self.svtype - # self.vcf_record.info['CPX_TYPE'] = self.cpx_type - # self.vcf_record.info['CHR2'] = record.info['CHR2'] - # self.vcf_record.info['SVLEN'] = record.info['SVLEN'] - # elif len(self.cnvs) == 1 and len(self.breakends) == 0: - # if self.cnvs[0].info['SVTYPE'] == 'DUP': - # record = self.cnvs[0] - # self.svtype = 'DUP' - # self.vcf_record.alts = record.alts - # 
self.vcf_record.info['SVTYPE'] = self.svtype - # self.vcf_record.info['CHR2'] = record.info['CHR2'] - # self.vcf_record.info['SVLEN'] = record.info['SVLEN'] - # else: - # self.set_unresolved() - # else: - # self.set_unresolved() - # Where Manta calls two insertions flanking a duplication, report just the dup def report_manta_tandem_dup(self): record = self.dups[0] self.cpx_type = record.alts[0].strip('<>') self.svtype = 'DUP' - self.vcf_record.alts = record.alts - self.vcf_record.info['SVTYPE'] = self.svtype - self.vcf_record.info['CPX_TYPE'] = self.cpx_type - self.vcf_record.info['CHR2'] = record.info['CHR2'] - self.vcf_record.info['SVLEN'] = record.info['SVLEN'] - - def report_single_ender(self): - # if cluster contains a single duplication, report that - # otherwise, report the first insertion record and discard all others - if len(self.cnvs) == 1: - if self.cnvs[0].info['SVTYPE'] == 'DUP': - record = self.cnvs[0] - self.svtype = 'DUP' - self.vcf_record.alts = record.alts - self.vcf_record.info['SVTYPE'] = self.svtype - self.vcf_record.info['CHR2'] = record.info['CHR2'] - self.vcf_record.info['SVLEN'] = record.info['SVLEN'] - else: - record = self.insertions[0] - self.cpx_type = record.alts[0].strip('<>') - self.svtype = 'INS' - self.vcf_record.alts = record.alts - self.vcf_record.info['SVTYPE'] = self.svtype - self.vcf_record.info['CPX_TYPE'] = self.cpx_type - self.vcf_record.info['CHR2'] = record.info['CHR2'] - self.vcf_record.info['SVLEN'] = record.info['SVLEN'] - else: - record = self.insertions[0] - self.cpx_type = record.alts[0].strip('<>') - self.svtype = 'INS' - self.vcf_record.alts = record.alts - self.vcf_record.info['SVTYPE'] = self.svtype - self.vcf_record.info['CPX_TYPE'] = self.cpx_type - self.vcf_record.info['CHR2'] = record.info['CHR2'] - self.vcf_record.info['SVLEN'] = record.info['SVLEN'] - - # Where Manta calls two insertions flanking a duplication, report just the dup - - def report_manta_tandem_dup(self): - record = self.dups[0] - 
self.cpx_type = record.alts[0].strip('<>') - self.svtype = 'DUP' + self.vcf_record.pos = record.pos + self.vcf_record.stop = record.stop self.vcf_record.alts = record.alts self.vcf_record.info['SVTYPE'] = self.svtype self.vcf_record.info['CPX_TYPE'] = self.cpx_type @@ -751,19 +694,3 @@ def check_mei_overlap(chrom, start, end, mei_bed): cov = float(i.fields[6]) return cov >= 0.5 - - -def check_rdtest(record, start, end, rdtest): - """ - Check if putative insertion has depth support - """ - - rdtest_record = record.copy() - rdtest_record.pos = start - rdtest_record.stop = end - rdtest_record.info['SVTYPE'] = 'DUP' - - if end - start < 1000: - return rdtest.test_record(rdtest_record, cutoff_type='pesr_lt1kb') - else: - return rdtest.test_record(rdtest_record, cutoff_type='pesr_gt1kb') diff --git a/src/svtk/svtk/cxsv/cpx_link.py b/src/svtk/svtk/cxsv/cpx_link.py index c03b4d88e..aab48dc1e 100644 --- a/src/svtk/svtk/cxsv/cpx_link.py +++ b/src/svtk/svtk/cxsv/cpx_link.py @@ -11,14 +11,16 @@ import numpy as np import scipy.sparse as sps import natsort -import pysam import svtk.utils as svu -def samples_overlap_records(recA, recB, upper_thresh=0.5, lower_thresh=0.5): - samplesA = set(svu.get_called_samples(recA)) - samplesB = set(svu.get_called_samples(recB)) - return samples_overlap(samplesA, samplesB, upper_thresh=upper_thresh, lower_thresh=lower_thresh) +def samples_overlap_records(recA, recB, called_samples_dict, upper_thresh=0.5, lower_thresh=0.5): + if recA.id not in called_samples_dict: + called_samples_dict[recA.id] = set(svu.get_called_samples(recA)) + if recB.id not in called_samples_dict: + called_samples_dict[recB.id] = set(svu.get_called_samples(recB)) + return samples_overlap(called_samples_dict[recA.id], called_samples_dict[recB.id], + upper_thresh=upper_thresh, lower_thresh=lower_thresh) def samples_overlap(samplesA, samplesB, upper_thresh=0.5, lower_thresh=0.5): @@ -81,7 +83,7 @@ def extract_breakpoints(vcf, bkpt_idxs): return bkpts -def link_cpx(vcf, 
bkpt_window=300, cpx_dist=2000): +def link_cpx(vcf, bkpt_window=300): """ Parameters ---------- @@ -94,19 +96,12 @@ def link_cpx(vcf, bkpt_window=300, cpx_dist=2000): # Identify breakpoints which overlap within specified window overlap = bt.window(bt, w=bkpt_window).saveas() - # Exclude self-hits - # overlap = overlap.filter(lambda b: b.fields[3] != b.fields[9]).saveas() - # Exclude intersections where two DELs or two DUPs cluster together - # cnvtypes = 'DEL DUP'.split() overlap = overlap.filter(lambda b: not ( b.fields[4] == "DEL" and b.fields[10] == "DEL")).saveas() overlap = overlap.filter(lambda b: not ( b.fields[4] == "DUP" and b.fields[10] == "DUP")).saveas() - # # Exclude intersections with annotated mobile elements (rather than BNDs) - # overlap = overlap.filter(lambda b: b.fields[4] is not re.match(re.compile('INS\:ME\:*'), b.fields[4])).saveas() - # Get linked variant IDs links = [(b[3], b[9]) for b in overlap.intervals] linked_IDs = natsort.natsorted(set(itertools.chain.from_iterable(links))) @@ -138,14 +133,6 @@ def link_cpx(vcf, bkpt_window=300, cpx_dist=2000): for i, c_label in enumerate(comp_list): clusters[c_label].append(bkpts[i]) - # # Remove clusters of only CNV - leftover from shared sample filtering - # def _ok_cluster(cluster): - # ok = any([record.info['SVTYPE'] not in cnvtypes for record in cluster]) - # return ok - - # clusters = [c for c in clusters if _ok_cluster(c)] - # clusters = [c for c in clusters if len(c) > 1] - return clusters @@ -157,118 +144,35 @@ def unify_list(list): return out -def CNV_readin_from_resolved_vcf(resolved_name, inv_intervals): - resolved_f = pysam.VariantFile(resolved_name, 'r') - # rec_a = 0 - out = [] - for i in resolved_f: - for j in inv_intervals: - if i.chrom == j[0]: - if (i.pos - j[1]) * (i.pos - j[2]) < 0 or (i.stop - j[1]) * (i.stop - j[2]) < 0: - if i.info['SVTYPE'] in ['DEL', 'DUP']: - out.append(i) - resolved_f.close() - return out - - -def link_cpx_V2(linked_INV, resolve_CNV, cpx_dist=2000): - 
linked_INV_V2 = [] +def link_cpx_V2(linked_INV, cpx_dist=2000): + overlapping_inv = [] + called_samples_dict = {} for group in linked_INV: if len(group) > 1: - for i in group: - for j in group: - if ro_calu(i, j) > 0 and samples_overlap_records(i, j): - linked_INV_V2.append([i, j]) - else: - linked_INV_V2.append([group[0]]) - inv_intervals = [] - for i in linked_INV_V2: - if len(i) > 1: - tmp = [i[0].chrom] - for j in i: - tmp += [j.pos, j.stop] - inv_intervals.append( - [tmp[0], min(unify_list(tmp[1:])), max(unify_list(tmp[1:]))]) + for i, j in itertools.combinations(group, 2): + if records_overlap(i, j) and samples_overlap_records(i, j, called_samples_dict): + overlapping_inv.append([i, j]) else: - inv_intervals.append([i[0].chrom, i[0].pos, i[0].stop]) - inv_intervals = sorted(unify_list(inv_intervals)) - # out_rec = unify_list(CNV_readin_from_resolved_vcf(resolved_name,inv_intervals) + CNV_readin_from_resolved_vcf(unresolved_name,inv_intervals)) - out_rec = resolve_CNV + overlapping_inv.append(group) cluster = [] - for i in linked_INV_V2: - if len(i) > 1: - if abs(i[1].pos - i[0].pos) > cpx_dist and abs(i[1].stop - i[0].stop) > cpx_dist: - if 'STRANDS' in i[0].info.keys() and 'STRANDS' in i[1].info.keys(): - if sorted(unify_list([i[0].info['STRANDS'], i[1].info['STRANDS']])) == ['++', '--']: - if i[0].pos < i[1].pos < i[0].stop < i[1].stop or i[1].pos < i[0].pos < i[1].stop < i[0].stop: - cpx_intervals = [[i[0].chrom, sorted([i[0].pos, i[0].stop, i[1].pos, i[1].stop])[0], sorted([i[0].pos, i[0].stop, i[1].pos, i[1].stop])[1]], [ - i[0].chrom, sorted([i[0].pos, i[0].stop, i[1].pos, i[1].stop])[2], sorted([i[0].pos, i[0].stop, i[1].pos, i[1].stop])[3]]] - CNV_close = [j for j in out_rec if ro_calu_interval([j.chrom, j.pos, j.stop], cpx_intervals[0]) > .5 and abs( - j.pos - cpx_intervals[0][1]) < cpx_dist and abs(j.stop - cpx_intervals[0][2]) < cpx_dist] - CNV_close += [j for j in out_rec if ro_calu_interval([j.chrom, j.pos, j.stop], cpx_intervals[1]) > .5 and 
abs( - j.pos - cpx_intervals[1][1]) < cpx_dist and abs(j.stop - cpx_intervals[1][2]) < cpx_dist] - cluster.append(CNV_close + i) + for inv in overlapping_inv: + if len(inv) > 1: + if abs(inv[1].pos - inv[0].pos) > cpx_dist and abs(inv[1].stop - inv[0].stop) > cpx_dist: + if 'STRANDS' in inv[0].info.keys() and 'STRANDS' in inv[1].info.keys(): + if inv[0].info['STRANDS'] != inv[1].info['STRANDS']: + if inv[0].pos < inv[1].pos < inv[0].stop < inv[1].stop \ + or inv[1].pos < inv[0].pos < inv[1].stop < inv[0].stop: + cluster.append(inv) else: - cluster.append(i) + cluster.append(inv) return cluster -def link_inv(vcf, bkpt_window=300, cpx_dist=2000): - bt = svu.vcf2bedtool(vcf.filename, annotate_ins=False) - overlap = bt.window(bt, w=bkpt_window).saveas() - overlap = overlap.filter(lambda b: not ( - b.fields[4] == "DEL" and b.fields[10] == "DEL")).saveas() - overlap = overlap.filter(lambda b: not ( - b.fields[4] == "DUP" and b.fields[10] == "DUP")).saveas() - links = [(b[3], b[9]) for b in overlap.intervals] - linked_IDs = natsort.natsorted(set(itertools.chain.from_iterable(links))) - linked_IDs = np.array(linked_IDs) - bkpt_idxs = {ID: i for i, ID in enumerate(linked_IDs)} - indexed_links = np.array([(bkpt_idxs[a], bkpt_idxs[b]) for a, b in links]) - n_bkpts = len(linked_IDs) - bkpts = extract_breakpoints(vcf, bkpt_idxs) - # Exclude wildly disparate overlaps - G = sps.eye(n_bkpts, dtype=np.uint16, format='lil') - for i, j in indexed_links: - if (ro_calu(bkpts[i], bkpts[j]) > 0 and samples_overlap_records(bkpts[i], bkpts[j])): - G[i, j] = 1 - # Generate lists of clustered breakpoints - n_comp, comp_list = sps.csgraph.connected_components(G) - clusters = [deque() for x in range(n_comp)] - for i, c_label in enumerate(comp_list): - clusters[c_label].append(bkpts[i]) - return clusters - - def close_enough(r1, r2, cpx_dist=2000): distA = np.abs(r1.pos - r2.pos) distB = np.abs(r1.stop - r2.stop) return distA < cpx_dist or distB < cpx_dist -def ro_calu(r1, r2): - out = 0 - if 
not r1.chrom == r2.chrom: - out = 0 - elif r1.pos > r2.stop or r1.stop < r2.pos: - out = 0 - else: - maxval = max([r1.stop - r1.pos, r2.stop - r2.pos]) - if maxval > 0: - out = (sorted([r1.pos, r2.pos, r1.stop, r2.stop])[ - 2] - sorted([r1.pos, r2.pos, r1.stop, r2.stop])[1]) / maxval - else: - out = 0 - return out - - -def ro_calu_interval(r1, r2): - out = 0 - if not r1[0] == r2[0]: - out = 0 - elif r1[1] > r2[2] or r1[2] < r2[1]: - out = 0 - else: - out = (sorted(r1[1:] + r2[1:])[2] - sorted(r1[1:] + - r2[1:])[1]) / max([r1[2] - r1[1], r2[2] - r2[1]]) - return out +def records_overlap(r1, r2): + return r1.chrom == r2.chrom and not (r1.pos > r2.stop or r1.stop < r2.pos) diff --git a/src/svtk/svtk/genomeslink.py b/src/svtk/svtk/genomeslink.py index e325bdd1b..85fe7b426 100644 --- a/src/svtk/svtk/genomeslink.py +++ b/src/svtk/svtk/genomeslink.py @@ -131,7 +131,7 @@ def __str__(self): class GenomeSLINK(object): - def __init__(self, nodes, dist, size=1, blacklist=None): + def __init__(self, nodes, dist, size=1, blacklist=None, single_end=False): """ Graph-based single-linkage clustering of genomic coordinates. @@ -147,12 +147,15 @@ def __init__(self, nodes, dist, size=1, blacklist=None): blacklist : pysam.TabixFile, optional Regions to exclude from clustering. Any node with a coordinate inside an excluding region is omitted. (NOTE: not overlap-based.) + single_end : bool, optional + Require only one end to be within min dist. 
""" self.nodes = nodes self.dist = dist self.size = size self.blacklist = blacklist + self.single_end = single_end def is_clusterable_with(self, first, second): """ @@ -165,9 +168,15 @@ def clusters_with(self, first, second): """ Test if candidates meet cluster distance requirement on chrB, posB """ - return (first.chrB == second.chrB and - abs(first.posA - second.posA) < self.dist and - abs(first.posB - second.posB) < self.dist) + if first.chrB == second.chrB: + if self.single_end: + return abs(first.posA - second.posA) < self.dist or \ + abs(first.posB - second.posB) < self.dist + else: + return abs(first.posA - second.posA) < self.dist and \ + abs(first.posB - second.posB) < self.dist + else: + return False def filter_nodes(self): """ diff --git a/src/svtk/svtk/utils/__init__.py b/src/svtk/svtk/utils/__init__.py index 03ffa3fc7..5ca6d96ad 100644 --- a/src/svtk/svtk/utils/__init__.py +++ b/src/svtk/svtk/utils/__init__.py @@ -1,7 +1,6 @@ from .utils import * from .bgzipfile import BgzipFile -from .s3bam import load_s3bam -from .helpers import is_excluded, is_soft_clipped, reciprocal_overlap, overlap_frac +from .helpers import reciprocal_overlap, overlap_frac from .multi_tabixfile import MultiTabixFile from .genotype_merging import update_best_genotypes from .rdtest import RdTest diff --git a/src/svtk/svtk/utils/helpers.pyx b/src/svtk/svtk/utils/helpers.pyx index 9c26c9219..b9dc5334f 100644 --- a/src/svtk/svtk/utils/helpers.pyx +++ b/src/svtk/svtk/utils/helpers.pyx @@ -1,24 +1,10 @@ #cython: language_level=3 -from pysam.libcalignedsegment cimport AlignedSegment -from pysam.libcalignmentfile cimport AlignmentFile cdef inline int int_max(int a, int b): return a if a >= b else b cdef inline int int_min(int a, int b): return a if a <= b else b cdef inline float float_max(float a, float b): return a if a >= b else b cdef inline float float_min(float a, float b): return a if a <= b else b -cpdef bint is_excluded(AlignedSegment read): - cdef bint exclude = 
(read.is_unmapped or - read.mate_is_unmapped or - read.is_secondary or - read.is_duplicate or - read.is_supplementary) - return exclude - -cpdef bint is_soft_clipped(AlignedSegment read): - return (((read.cigartuples[0][0] == 4) & (read.cigartuples[-1][0] == 0)) | - ((read.cigartuples[-1][0] == 4) & (read.cigartuples[0][0] == 0))) - cpdef float reciprocal_overlap(int startA, int endA, int startB, int endB): """Calculate fraction of reciprocal overlap between two intervals""" diff --git a/src/svtk/svtk/utils/s3bam.py b/src/svtk/svtk/utils/s3bam.py deleted file mode 100644 index bfe2209d4..000000000 --- a/src/svtk/svtk/utils/s3bam.py +++ /dev/null @@ -1,44 +0,0 @@ -#! /usr/bin/env python -# -*- coding: utf-8 -*- -# vim:fenc=utf-8 -# - -""" -Load S3-hosted bam into pysam.AlignmentFile -""" - -import os -import boto3 -import pysam - - -def load_s3bam(bam_path, index_dir=None): - if not bam_path.startswith('s3://'): - raise Exception('Bam {0} is not a valid S3 path'.format(bam_path)) - - # Pysam doesn't accept explicit path to index file, expects index to be - # present in working directory. If a local copy of the index is available, - # move to its directory to use it. - # Otherwise, index is downloaded automatically - if index_dir is not None: - os.chdir(index_dir) - else: - msg = ('Local index directory not specified for {0}. 
Downloading ' - 'remote copy of index to working directory.') - raise Warning(msg.format(bam_path)) - - # Parse bucket and key from filepath - s3path = bam_path[5:] - bucket = s3path.split('/')[0] - bam_path = '/'.join(s3path.split('/')[1:]) - - # Create S3 client and get presigned URL - # Necessary to take advantage of pysam's https support until the library - # supports S3 paths directly - s3 = boto3.client('s3') - url = s3.generate_presigned_url( - ClientMethod='get_object', - Params={'Bucket': bucket, 'Key': bam_path}, - ExpiresIn=86400) - - return pysam.AlignmentFile(url) diff --git a/src/svtk/svtk/utils/utils.py b/src/svtk/svtk/utils/utils.py index ee9c9e53c..2eae72c16 100644 --- a/src/svtk/svtk/utils/utils.py +++ b/src/svtk/svtk/utils/utils.py @@ -302,7 +302,7 @@ def _converter(): chrom = record.chrom start = max([0, int(record.pos) - 1]) end = record.pos - entry.format(**locals()) + yield entry.format(**locals()) # elif (record.info.get('SVTYPE', None) == 'CPX' and # 'CPX_TYPE' in record.info.keys()): diff --git a/src/svtk/svtk/vcfcluster.py b/src/svtk/svtk/vcfcluster.py index 0d53cd4c7..0891a67e1 100644 --- a/src/svtk/svtk/vcfcluster.py +++ b/src/svtk/svtk/vcfcluster.py @@ -29,7 +29,8 @@ def __init__(self, vcfs, preserve_genotypes=False, sample_overlap=0.0, preserve_header=False, do_cluster=True, - do_merge=True): + do_merge=True, + single_end=False): """ Clustering of VCF records. @@ -71,6 +72,8 @@ def __init__(self, vcfs, specified, all svtypes will be clustered. sample_overlap : float, optional Minimum fraction of samples to overlap to cluster variants + single_end : bool, optional + Require only one end to be within min dist. 
""" if (not do_cluster) and (not do_merge): @@ -115,7 +118,7 @@ def __init__(self, vcfs, self.sources = sorted(sources) self.header = self.make_vcf_header() - super().__init__(nodes, dist, 1, blacklist) + super().__init__(nodes, dist, 1, blacklist, single_end) def clusters_with(self, first, second): """ diff --git a/test_input_templates/MakeCohortVcf/MakeCohortVcf.json.tmpl b/test_input_templates/MakeCohortVcf/MakeCohortVcf.json.tmpl index 6811de23b..5dccf27ab 100644 --- a/test_input_templates/MakeCohortVcf/MakeCohortVcf.json.tmpl +++ b/test_input_templates/MakeCohortVcf/MakeCohortVcf.json.tmpl @@ -16,18 +16,25 @@ "MakeCohortVcf.collins_2017_tarball": {{ reference_resources.collins_2017_tarball | tojson }}, "MakeCohortVcf.werling_2018_tarball": {{ reference_resources.werling_2018_tarball | tojson }}, + "MakeCohortVcf.chr_x": {{ reference_resources.chr_x | tojson }}, + "MakeCohortVcf.chr_y": {{ reference_resources.chr_y | tojson }}, + "MakeCohortVcf.min_sr_background_fail_batches": 0.5, + "MakeCohortVcf.max_shard_size_resolve" : 500, "MakeCohortVcf.max_shards_per_chrom_clean_vcf_step1": 200, "MakeCohortVcf.min_records_per_shard_clean_vcf_step1": 5000, + "MakeCohortVcf.clean_vcf1b_records_per_shard": 10000, "MakeCohortVcf.samples_per_clean_vcf_step2_shard": 100, + "MakeCohortVcf.clean_vcf5_records_per_shard": 5000, "MakeCohortVcf.random_seed": 0, - "MakeCohortVcf.max_shard_size_resolve": 500, "MakeCohortVcf.primary_contigs_list": {{ reference_resources.primary_contigs_list | tojson }}, "MakeCohortVcf.sv_pipeline_base_docker": {{ dockers.sv_pipeline_base_docker | tojson }}, "MakeCohortVcf.linux_docker": {{ dockers.linux_docker | tojson }}, "MakeCohortVcf.sv_pipeline_docker": {{ dockers.sv_pipeline_docker | tojson }}, + "MakeCohortVcf.sv_pipeline_hail_docker": {{ dockers.sv_pipeline_hail_docker | tojson }}, + "MakeCohortVcf.sv_pipeline_updates_docker": {{ dockers.sv_pipeline_updates_docker | tojson }}, "MakeCohortVcf.sv_base_mini_docker":{{ dockers.sv_base_mini_docker 
| tojson }}, "MakeCohortVcf.sv_pipeline_rdtest_docker": {{ dockers.sv_pipeline_rdtest_docker | tojson }}, "MakeCohortVcf.sv_pipeline_qc_docker": {{ dockers.sv_pipeline_qc_docker | tojson }}, diff --git a/test_input_templates/batch/GATKSVPipelineBatch.json.tmpl b/test_input_templates/batch/GATKSVPipelineBatch.json.tmpl index 87fcbb8eb..f55e4f99f 100644 --- a/test_input_templates/batch/GATKSVPipelineBatch.json.tmpl +++ b/test_input_templates/batch/GATKSVPipelineBatch.json.tmpl @@ -34,6 +34,8 @@ "GATKSVPipelineBatch.sv_pipeline_base_docker": {{ dockers.sv_pipeline_base_docker | tojson }}, "GATKSVPipelineBatch.sv_base_mini_docker":{{ dockers.sv_base_mini_docker | tojson }}, "GATKSVPipelineBatch.sv_pipeline_docker": {{ dockers.sv_pipeline_docker | tojson }}, + "GATKSVPipelineBatch.sv_pipeline_hail_docker": {{ dockers.sv_pipeline_hail_docker | tojson }}, + "GATKSVPipelineBatch.sv_pipeline_updates_docker": {{ dockers.sv_pipeline_updates_docker | tojson }}, "GATKSVPipelineBatch.sv_pipeline_qc_docker": {{ dockers.sv_pipeline_qc_docker | tojson }}, "GATKSVPipelineBatch.sv_pipeline_rdtest_docker": {{ dockers.sv_pipeline_rdtest_docker | tojson }}, "GATKSVPipelineBatch.wham_docker": {{ dockers.wham_docker | tojson }}, @@ -127,7 +129,11 @@ "GATKSVPipelineBatch.MakeCohortVcf.min_sr_background_fail_batches": 0.5, "GATKSVPipelineBatch.MakeCohortVcf.max_shards_per_chrom_clean_vcf_step1": 200, "GATKSVPipelineBatch.MakeCohortVcf.min_records_per_shard_clean_vcf_step1": 5000, + "GATKSVPipelineBatch.MakeCohortVcf.clean_vcf1b_records_per_shard": 10000, + "GATKSVPipelineBatch.MakeCohortVcf.clean_vcf5_records_per_shard": 5000, "GATKSVPipelineBatch.MakeCohortVcf.samples_per_clean_vcf_step2_shard": 100, "GATKSVPipelineBatch.MakeCohortVcf.random_seed": 0, - "GATKSVPipelineBatch.MakeCohortVcf.max_shard_size_resolve": 500 + "GATKSVPipelineBatch.MakeCohortVcf.max_shard_size_resolve": 500, + "GATKSVPipelineBatch.MakeCohortVcf.chr_x": {{ reference_resources.chr_x | tojson }}, + 
"GATKSVPipelineBatch.MakeCohortVcf.chr_y": {{ reference_resources.chr_y | tojson }} } diff --git a/test_input_templates/single-sample/GATKSVPipelineSingleSampleTest.json.tmpl b/test_input_templates/single-sample/GATKSVPipelineSingleSampleTest.json.tmpl index a31cbf18a..d6579168a 100644 --- a/test_input_templates/single-sample/GATKSVPipelineSingleSampleTest.json.tmpl +++ b/test_input_templates/single-sample/GATKSVPipelineSingleSampleTest.json.tmpl @@ -44,6 +44,8 @@ "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.sv_pipeline_base_docker": {{ dockers.sv_pipeline_base_docker | tojson }}, "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.sv_base_mini_docker":{{ dockers.sv_base_mini_docker | tojson }}, "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.sv_pipeline_docker": {{ dockers.sv_pipeline_docker | tojson }}, + "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.sv_pipeline_hail_docker": {{ dockers.sv_pipeline_hail_docker | tojson }}, + "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.sv_pipeline_updates_docker": {{ dockers.sv_pipeline_updates_docker | tojson }}, "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.sv_pipeline_qc_docker": {{ dockers.sv_pipeline_qc_docker | tojson }}, "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.sv_pipeline_rdtest_docker": {{ dockers.sv_pipeline_rdtest_docker | tojson }}, "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.wham_docker": {{ dockers.wham_docker | tojson }}, @@ -94,7 +96,9 @@ "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.max_shard_size_resolve" : 500, "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.clean_vcf_max_shards_per_chrom_clean_vcf_step1": 200, "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.clean_vcf_min_records_per_shard_clean_vcf_step1": 5000, + "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.clean_vcf1b_records_per_shard": 10000, 
"GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.clean_vcf_samples_per_clean_vcf_step2_shard": 100, + "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.clean_vcf5_records_per_shard": 5000, "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.clean_vcf_random_seed": 0, "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.run_vcf_qc" : false, "GATKSVPipelineSingleSampleTest.GATKSVPipelineSingleSample.max_ref_panel_carrier_freq": 0.03, diff --git a/wdl/CalcAF.wdl b/wdl/CalcAF.wdl index 9bdbdfde9..1801918a7 100644 --- a/wdl/CalcAF.wdl +++ b/wdl/CalcAF.wdl @@ -1,15 +1,16 @@ version 1.0 import "Structs.wdl" +import "CleanVcf5.wdl" as cleanvcf5 workflow CalcAF { input{ File vcf File vcf_idx - String contig Int sv_per_shard String prefix String sv_pipeline_docker + String sv_pipeline_updates_docker File? sample_pop_assignments #Two-column file with sample ID & pop assignment. "." for pop will ignore sample File? famfile #Used for M/F AF calculations File? par_bed #Used for marking hemizygous males on X & Y @@ -20,23 +21,22 @@ workflow CalcAF { # Tabix to chromosome of interest, and shard input VCF for stats collection - call ShardVcf { + call cleanvcf5.ScatterVcf { input: vcf=vcf, - vcf_idx=vcf_idx, - contig=contig, - sv_pipeline_docker=sv_pipeline_docker, - sv_per_shard=sv_per_shard + prefix=prefix, + sv_pipeline_docker=sv_pipeline_updates_docker, + records_per_shard=sv_per_shard } # Scatter over VCF shards - scatter ( shard in ShardVcf.shard_vcfs ) { + scatter ( shard in ScatterVcf.shards ) { # Collect AF summary stats call ComputeShardAFs { input: vcf=shard, sv_pipeline_docker=sv_pipeline_docker, - prefix="~{prefix}.~{contig}", + prefix=prefix, sample_pop_assignments=sample_pop_assignments, famfile=famfile, par_bed=par_bed, @@ -49,7 +49,7 @@ workflow CalcAF { input: vcfs=ComputeShardAFs.shard_wAFs, sv_pipeline_docker=sv_pipeline_docker, - prefix="~{prefix}.~{contig}", + prefix=prefix, drop_empty_records=drop_empty_records } @@ -60,53 +60,6 
@@ workflow CalcAF { } } - -# Shard VCF into fixed size chunks -task ShardVcf { - input{ - File vcf - File vcf_idx - String contig - Int sv_per_shard - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 4, - disk_gb: 250, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - - command { - #Tabix chromosome of interest - tabix -h ~{vcf} ~{contig} | bgzip -c > ~{contig}.vcf.gz - #Then shard VCF - /opt/sv-pipeline/scripts/shard_VCF.sh \ - ~{contig}.vcf.gz \ - ~{sv_per_shard} \ - "vcf.shard." - } - - output { - Array[File] shard_vcfs = glob("vcf.shard.*.vcf.gz") - } - - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - docker: sv_pipeline_docker - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - } -} - - # Subset a vcf to a single chromosome, and add global AF information (no subpop) task ComputeShardAFs { input{ @@ -121,8 +74,8 @@ task ComputeShardAFs { } RuntimeAttr default_attr = object { cpu_cores: 1, - mem_gb: 8, - disk_gb: 20, + mem_gb: 1.5, + disk_gb: 20 + size(vcf, "GB") * 2, boot_disk_gb: 10, preemptible_tries: 3, max_retries: 1 diff --git a/wdl/CleanVcf.wdl b/wdl/CleanVcf.wdl index dce8ecb9e..e5d95713b 100644 --- a/wdl/CleanVcf.wdl +++ b/wdl/CleanVcf.wdl @@ -2,7 +2,7 @@ version 1.0 import "CleanVcfChromosome.wdl" as CleanVcfChromosome import "TasksMakeCohortVcf.wdl" as MiniTasks -import "Utils.wdl" as util +import "HailMerge.wdl" as HailMerge workflow CleanVcf { input { @@ -15,52 
+15,72 @@ workflow CleanVcf { File contig_list File allosome_fai - Int max_shards_per_chrom_clean_vcf_step1 - Int min_records_per_shard_clean_vcf_step1 - Int samples_per_clean_vcf_step2_shard + Int max_shards_per_chrom_step1 + Int min_records_per_shard_step1 + Int samples_per_step2_shard + Int? max_samples_per_shard_step3 + Int clean_vcf1b_records_per_shard + Int clean_vcf5_records_per_shard + + String chr_x + String chr_y File? outlier_samples_list + Boolean use_hail = false + String? gcs_project + String linux_docker String sv_base_mini_docker String sv_pipeline_docker + String sv_pipeline_hail_docker + String sv_pipeline_updates_docker # overrides for mini tasks + RuntimeAttr? runtime_override_preconcat_clean_final + RuntimeAttr? runtime_override_hail_merge_clean_final + RuntimeAttr? runtime_override_fix_header_clean_final RuntimeAttr? runtime_override_concat_cleaned_vcfs # overrides for CleanVcfContig RuntimeAttr? runtime_override_clean_vcf_1a - RuntimeAttr? runtime_override_clean_vcf_1b RuntimeAttr? runtime_override_clean_vcf_2 RuntimeAttr? runtime_override_clean_vcf_3 RuntimeAttr? runtime_override_clean_vcf_4 - RuntimeAttr? runtime_override_clean_vcf_5 + RuntimeAttr? runtime_override_clean_vcf_5_scatter + RuntimeAttr? runtime_override_clean_vcf_5_make_cleangq + RuntimeAttr? runtime_override_clean_vcf_5_find_redundant_multiallelics + RuntimeAttr? runtime_override_clean_vcf_5_polish RuntimeAttr? runtime_override_stitch_fragmented_cnvs RuntimeAttr? runtime_override_final_cleanup + + # Clean vcf 1b + RuntimeAttr? runtime_attr_override_subset_large_cnvs_1b + RuntimeAttr? runtime_attr_override_sort_bed_1b + RuntimeAttr? runtime_attr_override_intersect_bed_1b + RuntimeAttr? runtime_attr_override_build_dict_1b + RuntimeAttr? runtime_attr_override_scatter_1b + RuntimeAttr? runtime_attr_override_filter_vcf_1b + RuntimeAttr? runtime_override_concat_vcfs_1b + RuntimeAttr? runtime_override_cat_multi_cnvs_1b + + RuntimeAttr? runtime_override_preconcat_step1 + RuntimeAttr? 
runtime_override_hail_merge_step1 + RuntimeAttr? runtime_override_fix_header_step1 + + RuntimeAttr? runtime_override_preconcat_drc + RuntimeAttr? runtime_override_hail_merge_drc + RuntimeAttr? runtime_override_fix_header_drc + RuntimeAttr? runtime_override_split_vcf_to_clean - RuntimeAttr? runtime_override_combine_step_1_vcfs RuntimeAttr? runtime_override_combine_step_1_sex_chr_revisions RuntimeAttr? runtime_override_split_include_list RuntimeAttr? runtime_override_combine_clean_vcf_2 RuntimeAttr? runtime_override_combine_revised_4 RuntimeAttr? runtime_override_combine_multi_ids_4 - RuntimeAttr? runtime_attr_ids_from_vcf - RuntimeAttr? runtime_attr_subset_ped - } - - call util.GetSampleIdsFromVcf { - input: - vcf = complex_genotype_vcfs[0], - sv_base_mini_docker = sv_base_mini_docker, - runtime_attr_override = runtime_attr_ids_from_vcf - } - call util.SubsetPedFile { - input: - ped_file = merged_ped_file, - sample_list = GetSampleIdsFromVcf.out_file, - subset_name = "vcf_samples", - sv_base_mini_docker = sv_base_mini_docker, - runtime_attr_override = runtime_attr_subset_ped + RuntimeAttr? runtime_override_drop_redundant_cnvs + RuntimeAttr? runtime_override_combine_step_1_vcfs + RuntimeAttr? 
runtime_override_sort_drop_redundant_cnvs } #Scatter per chromosome @@ -73,47 +93,89 @@ workflow CleanVcf { vcf=complex_genotype_vcfs[i], contig=contig, background_list=complex_resolve_background_fail_lists[i], - ped_file=SubsetPedFile.ped_subset_file, + ped_file=merged_ped_file, bothsides_pass_list=complex_resolve_bothside_pass_lists[i], allosome_fai=allosome_fai, - prefix=cohort_name, - max_shards_per_chrom_step1=max_shards_per_chrom_clean_vcf_step1, - min_records_per_shard_step1=min_records_per_shard_clean_vcf_step1, - samples_per_step2_shard=samples_per_clean_vcf_step2_shard, + prefix="~{cohort_name}.~{contig}", + max_shards_per_chrom_step1=max_shards_per_chrom_step1, + min_records_per_shard_step1=min_records_per_shard_step1, + samples_per_step2_shard=samples_per_step2_shard, + max_samples_per_shard_step3=max_samples_per_shard_step3, outlier_samples_list=outlier_samples_list, + use_hail=use_hail, + gcs_project=gcs_project, + clean_vcf1b_records_per_shard=clean_vcf1b_records_per_shard, + clean_vcf5_records_per_shard=clean_vcf5_records_per_shard, + chr_x=chr_x, + chr_y=chr_y, linux_docker=linux_docker, sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_updates_docker=sv_pipeline_updates_docker, sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, runtime_override_clean_vcf_1a=runtime_override_clean_vcf_1a, - runtime_override_clean_vcf_1b=runtime_override_clean_vcf_1b, runtime_override_clean_vcf_2=runtime_override_clean_vcf_2, runtime_override_clean_vcf_3=runtime_override_clean_vcf_3, runtime_override_clean_vcf_4=runtime_override_clean_vcf_4, - runtime_override_clean_vcf_5=runtime_override_clean_vcf_5, + runtime_override_clean_vcf_5_scatter=runtime_override_clean_vcf_5_scatter, + runtime_override_clean_vcf_5_make_cleangq=runtime_override_clean_vcf_5_make_cleangq, + runtime_override_clean_vcf_5_find_redundant_multiallelics=runtime_override_clean_vcf_5_find_redundant_multiallelics, + 
runtime_override_clean_vcf_5_polish=runtime_override_clean_vcf_5_polish, runtime_override_stitch_fragmented_cnvs=runtime_override_stitch_fragmented_cnvs, runtime_override_final_cleanup=runtime_override_final_cleanup, runtime_override_split_vcf_to_clean=runtime_override_split_vcf_to_clean, - runtime_override_combine_step_1_vcfs=runtime_override_combine_step_1_vcfs, runtime_override_combine_step_1_sex_chr_revisions=runtime_override_combine_step_1_sex_chr_revisions, runtime_override_split_include_list=runtime_override_split_include_list, runtime_override_combine_clean_vcf_2=runtime_override_combine_clean_vcf_2, runtime_override_combine_revised_4=runtime_override_combine_revised_4, - runtime_override_combine_multi_ids_4=runtime_override_combine_multi_ids_4 + runtime_override_combine_multi_ids_4=runtime_override_combine_multi_ids_4, + runtime_override_preconcat_step1=runtime_override_preconcat_step1, + runtime_override_hail_merge_step1=runtime_override_hail_merge_step1, + runtime_override_fix_header_step1=runtime_override_fix_header_step1, + runtime_override_preconcat_drc=runtime_override_preconcat_drc, + runtime_override_hail_merge_drc=runtime_override_hail_merge_drc, + runtime_override_fix_header_drc=runtime_override_fix_header_drc, + runtime_override_drop_redundant_cnvs=runtime_override_drop_redundant_cnvs, + runtime_attr_override_subset_large_cnvs_1b=runtime_attr_override_subset_large_cnvs_1b, + runtime_attr_override_sort_bed_1b=runtime_attr_override_sort_bed_1b, + runtime_attr_override_intersect_bed_1b=runtime_attr_override_intersect_bed_1b, + runtime_attr_override_build_dict_1b=runtime_attr_override_build_dict_1b, + runtime_attr_override_scatter_1b=runtime_attr_override_scatter_1b, + runtime_attr_override_filter_vcf_1b=runtime_attr_override_filter_vcf_1b, + runtime_override_concat_vcfs_1b=runtime_override_concat_vcfs_1b, + runtime_override_cat_multi_cnvs_1b=runtime_override_cat_multi_cnvs_1b } } - call MiniTasks.ConcatVcfs as ConcatCleanedVcfs { - input: - 
vcfs=CleanVcfChromosome.out, - vcfs_idx=CleanVcfChromosome.out_idx, - allow_overlaps=true, - outfile_prefix="~{cohort_name}.cleaned", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_concat_cleaned_vcfs + if (use_hail) { + call HailMerge.HailMerge as ConcatVcfsHail { + input: + vcfs=CleanVcfChromosome.out, + prefix="~{cohort_name}.cleaned", + gcs_project=gcs_project, + reset_cnv_gts=true, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, + runtime_override_preconcat=runtime_override_preconcat_clean_final, + runtime_override_hail_merge=runtime_override_hail_merge_clean_final, + runtime_override_fix_header=runtime_override_fix_header_clean_final + } + } + if (!use_hail) { + call MiniTasks.ConcatVcfs as ConcatCleanedVcfs { + input: + vcfs=CleanVcfChromosome.out, + vcfs_idx=CleanVcfChromosome.out_idx, + allow_overlaps=true, + outfile_prefix="~{cohort_name}.cleaned", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_concat_cleaned_vcfs + } } output { - File cleaned_vcf = ConcatCleanedVcfs.concat_vcf - File cleaned_vcf_index = ConcatCleanedVcfs.concat_vcf_idx + File cleaned_vcf = select_first([ConcatCleanedVcfs.concat_vcf, ConcatVcfsHail.merged_vcf]) + File cleaned_vcf_index = select_first([ConcatCleanedVcfs.concat_vcf_idx, ConcatVcfsHail.merged_vcf_index]) } } \ No newline at end of file diff --git a/wdl/CleanVcf1.wdl b/wdl/CleanVcf1.wdl deleted file mode 100644 index 8e9dc8d8c..000000000 --- a/wdl/CleanVcf1.wdl +++ /dev/null @@ -1,1558 +0,0 @@ -version 1.0 - -import "Structs.wdl" - -workflow CleanVcf1 { - input { - File vcf - File background_list - File ped_file - String sv_pipeline_docker - String linux_docker - File bothsides_pass_list - File allosome_fai - RuntimeAttr? 
runtime_attr_override # TODO - } - - call CreateEmptyFile { - input: - linux_docker=linux_docker - } - - call CleanVcf1_1 { - input: - vcf=vcf, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1_2 { - input: - EV_update_vcf=CleanVcf1_1.EV_update_vcf, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1_3 { - input: - EV_update_vcf=CleanVcf1_1.EV_update_vcf, - vcf_convert_svtype=CleanVcf1_2.vcf_convert_svtype, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1_4 { - input: - convertsvtype_vcf=CleanVcf1_3.convertsvtype_vcf, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1_5 { - input: - convertsvtype_vcf=CleanVcf1_3.convertsvtype_vcf, - vargq_persample=CleanVcf1_4.vargq_persample, - sv_pipeline_docker=sv_pipeline_docker - } - - if (CleanVcf1_5.count_xy > 0) { - call CleanVcf1_6 { - input: - cleaninfo_vcf=CleanVcf1_5.cleaninfo_vcf, - cleaninfo_vcf_tbi=CleanVcf1_5.cleaninfo_vcf_tbi, - ped_file=ped_file, - sv_pipeline_docker=sv_pipeline_docker - } - - if (CleanVcf1_6.clean_bed_ids_count > 0) { - call CleanVcf1_7 { - input: - allosome_fai=allosome_fai, - cleaninfo_vcf=CleanVcf1_5.cleaninfo_vcf, - cleaninfo_vcf_tbi=CleanVcf1_5.cleaninfo_vcf_tbi, - clean_bed_ids=CleanVcf1_6.clean_bed_ids, - male=CleanVcf1_6.male, - sv_pipeline_docker=sv_pipeline_docker - } - call CleanVcf1_8 { - input: - allosome_fai=allosome_fai, - cleaninfo_vcf=CleanVcf1_5.cleaninfo_vcf, - cleaninfo_vcf_tbi=CleanVcf1_5.cleaninfo_vcf_tbi, - clean_bed_ids=CleanVcf1_6.clean_bed_ids, - female=CleanVcf1_6.female, - sv_pipeline_docker=sv_pipeline_docker - } - call CleanVcf1_9 { - input: - RD_CN_sexcheck_FORMAT_male=CleanVcf1_7.RD_CN_sexcheck_FORMAT_male, - sv_pipeline_docker=sv_pipeline_docker - } - call CleanVcf1_10 { - input: - RD_CN_sexcheck_FORMAT_female=CleanVcf1_8.RD_CN_sexcheck_FORMAT_female, - sv_pipeline_docker=sv_pipeline_docker - } - } - - call CleanVcf1_11 { - input: - cleaninfo_vcf=CleanVcf1_5.cleaninfo_vcf, - 
cleaninfo_vcf_tbi=CleanVcf1_5.cleaninfo_vcf_tbi, - clean_bed=CleanVcf1_6.clean_bed, - male_median_value_pervar=select_first([CleanVcf1_9.male_median_value_pervar, CreateEmptyFile.empty]), - female_median_value_pervar=select_first([CleanVcf1_10.female_median_value_pervar, CreateEmptyFile.empty]), - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1_12 { - input: - sexchr_revise_1=CleanVcf1_11.sexchr_revise_1, - cleaninfo_vcf=CleanVcf1_5.cleaninfo_vcf, - cleaninfo_vcf_tbi=CleanVcf1_5.cleaninfo_vcf_tbi, - clean_bed=CleanVcf1_6.clean_bed, - male_median_value_pervar=select_first([CleanVcf1_9.male_median_value_pervar, CreateEmptyFile.empty]), - female_median_value_pervar=select_first([CleanVcf1_10.female_median_value_pervar, CreateEmptyFile.empty]), - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1_13 { - input: - cleaninfo_vcf=CleanVcf1_5.cleaninfo_vcf, - cleaninfo_vcf_csi=CleanVcf1_5.cleaninfo_vcf_csi, - male=CleanVcf1_6.male, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1_14 { - input: - cleaninfo_vcf=CleanVcf1_5.cleaninfo_vcf, - cleaninfo_vcf_csi=CleanVcf1_5.cleaninfo_vcf_csi, - female=CleanVcf1_6.female, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1_15 { - input: - male_vcf=CleanVcf1_13.male_vcf, - sexchr_revise_2=CleanVcf1_12.sexchr_revise_2, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1_16 { - input: - male_vcf=CleanVcf1_13.male_vcf, - sexchr_revise_2=CleanVcf1_12.sexchr_revise_2, - sv_pipeline_docker=sv_pipeline_docker - } - - if ((CleanVcf1_15.count + CleanVcf1_16.count) > 0) { - call CleanVcf1_17 { - input: - male_vcf=CleanVcf1_13.male_vcf, - male_dup_revise_txt=CleanVcf1_16.male_dup_revise_txt, - male_del_revise_txt=CleanVcf1_15.male_del_revise_txt, - sv_pipeline_docker=sv_pipeline_docker - } - } - - if (CleanVcf1_5.count_y > 0) { - call CleanVcf1_18 { - input: - female_vcf=CleanVcf1_14.female_vcf, - sv_pipeline_docker=sv_pipeline_docker - } - call CleanVcf1_19 { - input: - 
female_vcf=CleanVcf1_14.female_vcf, - female_y_revise_txt=CleanVcf1_18.female_y_revise_txt, - sv_pipeline_docker=sv_pipeline_docker - } - } - - if (CleanVcf1_6.ped_file_count > 0) { - call CleanVcf1_20 { - input: - cleaninfo_vcf=CleanVcf1_5.cleaninfo_vcf, - cleaninfo_vcf_csi=CleanVcf1_5.cleaninfo_vcf_csi, - ped_file=ped_file, - sv_pipeline_docker=sv_pipeline_docker - } - call CleanVcf1_21 { - input: - other_vcf=CleanVcf1_20.other_vcf, - sv_pipeline_docker=sv_pipeline_docker - } - call CleanVcf1_22 { - input: - other_vcf=CleanVcf1_20.other_vcf, - other_revise_txt=CleanVcf1_21.other_revise_txt, - sv_pipeline_docker=sv_pipeline_docker - } - call CleanVcf1_23 { - input: - cleanmale_vcf=select_first([CleanVcf1_17.cleanmale_vcf, CleanVcf1_13.male_vcf]), - cleanfemale_vcf=select_first([CleanVcf1_19.cleanfemale_vcf, CleanVcf1_14.female_vcf]), - cleanother_vcf=CleanVcf1_22.cleanother_vcf, - sv_pipeline_docker=sv_pipeline_docker - } - } - if (CleanVcf1_6.ped_file_count == 0) { - call CleanVcf1_24 { - input: - cleanmale_vcf=select_first([CleanVcf1_17.cleanmale_vcf, CleanVcf1_13.male_vcf]), - cleanfemale_vcf=select_first([CleanVcf1_19.cleanfemale_vcf, CleanVcf1_14.female_vcf]), - sv_pipeline_docker=sv_pipeline_docker - } - } - - call CleanVcf1_25 { - input: - combinedsex_vcf=select_first([CleanVcf1_23.combinedsex_vcf, CleanVcf1_24.combinedsex_vcf]), - combinedsex_vcf_tbi=select_first([CleanVcf1_23.combinedsex_vcf_tbi, CleanVcf1_24.combinedsex_vcf_tbi]), - cleaninfo_vcf=CleanVcf1_5.cleaninfo_vcf, - cleaninfo_vcf_tbi=CleanVcf1_5.cleaninfo_vcf_tbi, - sv_pipeline_docker=sv_pipeline_docker - } - } - - call CleanVcf1_26 { - input: - background_list=background_list, - cleanallo_vcf=select_first([CleanVcf1_25.cleanallo_vcf, CleanVcf1_5.cleaninfo_vcf]), - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1_27 { - input: - int_vcf=CleanVcf1_26.int_vcf, - bothsides_pass_list=bothsides_pass_list, - sv_pipeline_docker=sv_pipeline_docker - } - - output { - File 
include_list=CleanVcf1_1.include_list - File sex=select_first([CleanVcf1_12.sexchr_revise_2, CreateEmptyFile.empty]) - File intermediate_vcf=CleanVcf1_27.intermediate_vcf - File intermediate_vcf_idx=CleanVcf1_27.intermediate_vcf_idx - } -} - - -task CleanVcf1_1 { - input { - File vcf - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - Float input_size = size(vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - ##get sampleids from VCF## - zcat ~{vcf} \ - |sed -n '1,1000p' \ - |egrep "^#" \ - |tail -n -1 \ - |cut -f10- \ - |tr '\t' '\n' \ - > includelist.txt - - ##convert EV integer back into string## - /opt/sv-pipeline/04_variant_resolution/scripts/replace_ev_numeric_code_with_string.py ~{vcf} - | bgzip -c > EV.update.vcf.gz - >>> - - output { - File include_list="includelist.txt" - File EV_update_vcf="EV.update.vcf.gz" - } -} - -task CleanVcf1_2 { - input { - File EV_update_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(EV_update_vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - ##convert all alt to svtype and alt to N## - svtk vcf2bed ~{EV_update_vcf} stdout -i SVTYPE \ - |awk -F"\t" '{ if ($5!~"ME")$5=$7; print $4"\t" "<"$5 ">"}' \ - |gzip \ - >vcf.convert.svtype.bed.gz - >>> - - output { - File vcf_convert_svtype="vcf.convert.svtype.bed.gz" - } -} -task CleanVcf1_3 { - input { - File EV_update_vcf - File vcf_convert_svtype - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([EV_update_vcf, vcf_convert_svtype], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - zcat ~{EV_update_vcf} \ - |awk -F'\t' -v OFS='\t' 'ARGIND==1{inFileA[$1]=$2; next} {if ($3 in inFileA && $1!~"#") $5=inFileA[$3]; print }' \ - <(zcat ~{vcf_convert_svtype}) - \ - |awk '{if ($1!~"#") $4="N"; print}' OFS='\t' \ - |bgzip \ - >convertsvtype.vcf.gz - >>> - - output { - File convertsvtype_vcf="convertsvtype.vcf.gz" - } -} - -task CleanVcf1_4 { - input { - File convertsvtype_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(convertsvtype_vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##get rid of multiallelic tage in INFO field and add varGQ to QUAL column and Members field## - svtk vcf2bed ~{convertsvtype_vcf} stdout -i varGQ \ - |awk -F"\t" '{print $4 "\t" $7}' \ - >vargq.persample - >>> - - output { - File vargq_persample="vargq.persample" - } -} - -task CleanVcf1_5 { - input { - File convertsvtype_vcf - File vargq_persample - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([convertsvtype_vcf, vargq_persample], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # use BCFTOOLS 1.9, installed in /usr/local/bin/bcftools in our docker - BCFTOOLS=/usr/local/bin/bcftools - - zcat ~{convertsvtype_vcf} \ - |sed 's/;MULTIALLELIC//g' \ - |sed 's/UNRESOLVED;//g' \ - |sed 's/;varGQ=[0-9]*//g' \ - |awk -F'\t' -v OFS='\t' 'ARGIND==1{inFileA[$1]=$2; next} {if ($3 in inFileA && $1!~"#") $6=inFileA[$3]; print }' ~{vargq_persample} - \ - |bgzip \ - >cleaninfo.vcf.gz - - tabix -p vcf cleaninfo.vcf.gz - ${BCFTOOLS} index cleaninfo.vcf.gz - - zcat cleaninfo.vcf.gz|awk '{if (($1~"X" || $1~"Y") && $1!~"#" ) print}'|wc -l > count_xy.txt - zcat cleaninfo.vcf.gz|awk '{if ($1~"Y" && $1!~"#") print}'|wc -l > count_y.txt - >>> - - output { - File cleaninfo_vcf="cleaninfo.vcf.gz" - File cleaninfo_vcf_tbi="cleaninfo.vcf.gz.tbi" - File cleaninfo_vcf_csi="cleaninfo.vcf.gz.csi" - Int count_xy = read_int("count_xy.txt") - Int count_y= read_int("count_y.txt") - } -} - -task CleanVcf1_6 { - input { - File cleaninfo_vcf - File cleaninfo_vcf_tbi - File ped_file - String sv_pipeline_docker - 
RuntimeAttr? runtime_attr_override - } - - Float input_size = size([cleaninfo_vcf, ped_file], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - svtk vcf2bed ~{cleaninfo_vcf} stdout \ - |awk '{if (($5=="DEL" || $5=="DUP") && $3-$2>=5000 && ($1~"X" || $1~"Y") && $1!~"#") print}' \ - >clean.bed - - awk '{print $4}' clean.bed>clean.bed.ids.txt - - ##male## - awk '{if ($5==1) print $2}' ~{ped_file} \ - |fgrep -wf <(zcat ~{cleaninfo_vcf}|head -n 1000|fgrep "CHROM"|fgrep POS|cut -f10-|tr '\t' '\n') >male.txt - - ##female## - awk '{if ($5==2) print $2}' ~{ped_file} \ - |fgrep -wf <(zcat ~{cleaninfo_vcf}|head -n 1000|fgrep "CHROM"|fgrep POS|cut -f10-|tr '\t' '\n') >female.txt - - cat clean.bed.ids.txt|wc -l > clean_bed_ids_count.txt - awk '{if ($5!=2 && $5!=1) print $2}' ~{ped_file}|wc -l > ped_file_count.txt - >>> - - output { - File clean_bed="clean.bed" - File clean_bed_ids="clean.bed.ids.txt" - File male="male.txt" - File female="female.txt" - Int clean_bed_ids_count=read_int("clean_bed_ids_count.txt") - Int ped_file_count=read_int("ped_file_count.txt") - } -} - -task CleanVcf1_7 { - input { - File allosome_fai - File cleaninfo_vcf - File 
cleaninfo_vcf_tbi - File clean_bed_ids - File male - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - Float input_size = size([cleaninfo_vcf, clean_bed_ids], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # use BCFTOOLS 1.9, installed in /usr/local/bin/bcftools in our docker - BCFTOOLS=/usr/local/bin/bcftools - - awk '{print $1"\t0\t"$2}' < ~{allosome_fai} > allosomes.list - ${BCFTOOLS} query -R allosomes.list -S ~{male} -i 'ID=@~{clean_bed_ids}' -f '[%ID\t%SAMPLE\t%RD_CN\n]' ~{cleaninfo_vcf} \ - | awk '{if ($3!=".") print}' \ - | gzip > RD_CN.sexcheck.FORMAT.male.gz - >>> - - output { - File RD_CN_sexcheck_FORMAT_male="RD_CN.sexcheck.FORMAT.male.gz" - } -} - -task CleanVcf1_8 { - input { - File cleaninfo_vcf - File cleaninfo_vcf_tbi - File allosome_fai - File clean_bed_ids - File female - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(clean_bed_ids, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # use BCFTOOLS 1.9, installed in /usr/local/bin/bcftools in our docker - BCFTOOLS=/usr/local/bin/bcftools - - awk '{print $1"\t0\t"$2}' < ~{allosome_fai} > allosomes.list - ${BCFTOOLS} query -R allosomes.list -S ~{female} -i 'ID=@~{clean_bed_ids}' -f '[%ID\t%SAMPLE\t%RD_CN\n]' ~{cleaninfo_vcf} \ - | awk '{if ($3!=".") print}' \ - | gzip > RD_CN.sexcheck.FORMAT.female.gz - >>> - - output { - File RD_CN_sexcheck_FORMAT_female="RD_CN.sexcheck.FORMAT.female.gz" - } -} - -task CleanVcf1_9 { - input { - File RD_CN_sexcheck_FORMAT_male - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(RD_CN_sexcheck_FORMAT_male, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - zcat ~{RD_CN_sexcheck_FORMAT_male}| Rscript -e 'd<-read.table("stdin")' \ - -e 'x<-tapply(d[,3],d[,1],median)' \ - -e 'write.table(x,"male.median.value.pervar.txt",col.names=FALSE,quote=FALSE,sep = "\t")' - >>> - - output { - File male_median_value_pervar="male.median.value.pervar.txt" - } -} - -task CleanVcf1_10 { - input { - File RD_CN_sexcheck_FORMAT_female - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(RD_CN_sexcheck_FORMAT_female, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - zcat ~{RD_CN_sexcheck_FORMAT_female}| Rscript -e 'd<-read.table("stdin")' \ - -e 'x<-tapply(d[,3],d[,1],median)' \ - -e 'write.table(x,"female.median.value.pervar.txt",col.names=FALSE,quote=FALSE,sep = "\t")' - >>> - - output { - File female_median_value_pervar="female.median.value.pervar.txt" - } -} - -task CreateEmptyFile { - input { - String linux_docker - RuntimeAttr? 
runtime_attr_override - } - RuntimeAttr runtime_default = object { - mem_gb: 1.0, - disk_gb: 10, - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: linux_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - touch empty.txt - >>> - - output { - File empty="empty.txt" - } -} - -task CleanVcf1_11 { - input { - File cleaninfo_vcf - File cleaninfo_vcf_tbi - File clean_bed - File? male_median_value_pervar - File? female_median_value_pervar - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([cleaninfo_vcf, clean_bed, male_median_value_pervar, female_median_value_pervar], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##Pull out ids where male copy state 1 to normal when female normal and on X## - echo "">sexchr.revise.1.txt - - if [ $(awk '{if (($5=="DEL" || $5=="DUP") && $3-$2>=5000) print }' ~{clean_bed}|awk '{if (($1~"X") && $1!~"#" ) print}'|wc -l) -gt 0 ] - then - awk '{if ($2==1) print $1}' ~{male_median_value_pervar} \ - |{ fgrep -wf <(awk '{if ($2==2) print $1}' ~{female_median_value_pervar}) || true; } \ - |{ fgrep -wf - <(zcat ~{cleaninfo_vcf}|awk '{if ($1~"X" && $1!~"#") print $3}') || true; } \ - >sexchr.revise.1.txt - fi - >>> - - output { - File sexchr_revise_1="sexchr.revise.1.txt" - } -} - -task CleanVcf1_12 { - input { - File sexchr_revise_1 - File cleaninfo_vcf - File cleaninfo_vcf_tbi - File clean_bed - File male_median_value_pervar - File female_median_value_pervar - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([sexchr_revise_1, cleaninfo_vcf, clean_bed, male_median_value_pervar, female_median_value_pervar], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - mv ~{sexchr_revise_1} sexchr.revise.2.txt - if [ $(awk '{if (($5=="DEL" || $5=="DUP") && $3-$2>=5000) print }' ~{clean_bed}|awk '{if (($1~"Y") && $1!~"#" ) print}'|wc -l) -gt 0 ] - then - awk '{if ($2==1) print $1}' ~{male_median_value_pervar} \ - |{ fgrep -wf <(awk '{if ($2==0) print $1}' ~{female_median_value_pervar}) || true; } \ - |{ fgrep -wf - <(zcat ~{cleaninfo_vcf}|awk '{if ($1~"Y" && $1!~"#") print $3}') || true; } \ - >>sexchr.revise.2.txt - fi - >>> - - output { - File sexchr_revise_2="sexchr.revise.2.txt" - } -} - - -task CleanVcf1_13 { - input { - File cleaninfo_vcf - File cleaninfo_vcf_csi - File male - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([cleaninfo_vcf, male], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # use BCFTOOLS 1.9, installed in /usr/local/bin/bcftools in our docker - BCFTOOLS=/usr/local/bin/bcftools - - ##Pull out male sex chr## - ${BCFTOOLS} view ~{cleaninfo_vcf} -S ~{male} -r chrX:1-1000000000,chrY:1-1000000000,X:1-1000000000,Y:1-1000000000 --no-update|bgzip>male.vcf.gz - ${BCFTOOLS} index male.vcf.gz - >>> - - output { - File male_vcf="male.vcf.gz" - File male_vcf_csi="male.vcf.gz.csi" - } -} - - -task CleanVcf1_14 { - input { - File cleaninfo_vcf - File cleaninfo_vcf_csi - File female - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([cleaninfo_vcf, female], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # use BCFTOOLS 1.9, installed in /usr/local/bin/bcftools in our docker - BCFTOOLS=/usr/local/bin/bcftools - - ##Pull out female sex chr## - ${BCFTOOLS} view ~{cleaninfo_vcf} -S ~{female} -r chrX:1-1000000000,chrY:1-1000000000,X:1-1000000000,Y:1-1000000000 --no-update|bgzip>female.vcf.gz - ${BCFTOOLS} index female.vcf.gz - >>> - - output { - File female_vcf="female.vcf.gz" - File female_vcf_csi="female.vcf.gz.csi" - } -} - - -task CleanVcf1_15 { - input { - File male_vcf - File sexchr_revise_2 - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([male_vcf, sexchr_revise_2], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - zcat ~{male_vcf}\ - |awk -F'\t' '{if ($5~"DEL" && $1!~"#") print $0 "\t" "ENDOFLINE"}' \ - |{ fgrep -wf ~{sexchr_revise_2} || true; } \ - |tr '\t' '\n' \ - |awk -F':' '{if ($3>=1 && NF>4 && $1!="GT") $1="0/0";else if ($3==0 && NF>4 && $1!="GT" ) $1="0/1"; if (NF>4 && $1!="GT") $3=$3+1;print}' OFS=":" \ - |tr '\n' '\t' \ - |sed 's/ENDOFLINE/\n/g' \ - |sed -e 's/^[ \t]*//' \ - |sed -e 's/[\t]$//g' \ - |bgzip \ - >male_del.revise.txt.gz - zcat male_del.revise.txt.gz|wc -l > count.txt - >>> - - output { - File male_del_revise_txt="male_del.revise.txt.gz" - Int count=read_int("count.txt") - } -} - - -task CleanVcf1_16 { - input { - File male_vcf - File sexchr_revise_2 - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([male_vcf, sexchr_revise_2], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - zcat ~{male_vcf}\ - |awk -F'\t' '{if ($5~"DUP" && $1!~"#") print $0 "\t" "ENDOFLINE"}' \ - |{ fgrep -wf ~{sexchr_revise_2} || true; } \ - |tr '\t' '\n' \ - |awk -F':' '{if ($3<=1 && NF>4 && $1!="GT") $1="0/0";else if ($3==2 && NF>4 && $1!="GT" ) $1="0/1";else if (NF>4 && $1!="GT" ) $1="1/1"; if (NF>4 && $1!="GT" ) $3=$3+1;print}' OFS=":" \ - |tr '\n' '\t' \ - |sed 's/ENDOFLINE/\n/g' \ - |sed -e 's/^[ \t]*//' \ - |sed -e 's/[\t]$//g' \ - |bgzip \ - >male_dup.revise.txt.gz - zcat male_dup.revise.txt.gz|wc -l > count.txt - >>> - - output { - File male_dup_revise_txt="male_dup.revise.txt.gz" - Int count=read_int("count.txt") - } -} - -task CleanVcf1_17 { - input { - File male_vcf - File male_dup_revise_txt - File male_del_revise_txt - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([male_vcf, male_dup_revise_txt, male_del_revise_txt], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 10.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # use BCFTOOLS 1.9, installed in /usr/local/bin/bcftools in our docker - BCFTOOLS=/usr/local/bin/bcftools - - cat <(zcat ~{male_vcf}|fgrep -wvf <(zcat ~{male_dup_revise_txt} ~{male_del_revise_txt}|awk '{print $3}' )) \ - <(zcat ~{male_del_revise_txt} ~{male_dup_revise_txt}|awk '{if ($1!="") print}'|tr ' ' '\t') \ - |vcf-sort \ - |bgzip \ - >cleanmale.vcf.gz - ${BCFTOOLS} index cleanmale.vcf.gz - >>> - - output { - File cleanmale_vcf="cleanmale.vcf.gz" - File cleanmale_vcf_csi="cleanmale.vcf.gz.csi" - } -} - - - -task CleanVcf1_18 { - input { - File female_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(female_vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - zcat ~{female_vcf}\ - |awk -F'\t' '{if ($1!~"#" && $1~"Y") print $0 "\t" "ENDOFLINE"}' \ - |tr '\t' '\n' \ - |awk -F':' '{ if (NF>4 && $1!="GT" ) $1="./." \ - ;if (NF>4 && $1!="GT" ) $2=$3=$4=$5=$6=$7=$8=$9=".";print}' OFS=":" \ - |tr '\n' '\t' \ - |sed 's/ENDOFLINE/\n/g' \ - |sed -e 's/^[ \t]*//' \ - |sed -e 's/[\t]$//g' \ - |bgzip \ - >female.y.revise.txt.gz - >>> - - output { - File female_y_revise_txt="female.y.revise.txt.gz" - } -} - - - -task CleanVcf1_19 { - input { - File female_vcf - File female_y_revise_txt - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([female_vcf, female_y_revise_txt], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # use BCFTOOLS 1.9, installed in /usr/local/bin/bcftools in our docker - BCFTOOLS=/usr/local/bin/bcftools - - cat <(zcat ~{female_vcf} \ - |fgrep -wvf <(zcat ~{female_y_revise_txt}|awk '{print $3}' )) \ - <(zcat ~{female_y_revise_txt}) \ - |vcf-sort \ - |bgzip \ - >cleanfemale.vcf.gz - ${BCFTOOLS} index cleanfemale.vcf.gz - >>> - - output { - File cleanfemale_vcf="cleanfemale.vcf.gz" - File cleanfemale_vcf_csi="cleanfemale.vcf.gz.csi" - } -} - - - -task CleanVcf1_20 { - input { - File cleaninfo_vcf - File cleaninfo_vcf_csi - File ped_file - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([cleaninfo_vcf, ped_file], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # use BCFTOOLS 1.9, installed in /usr/local/bin/bcftools in our docker - BCFTOOLS=/usr/local/bin/bcftools - - awk '{if ($5!=2 && $5!=1) print $2}' ~{ped_file}>other.txt - ${BCFTOOLS} view ~{cleaninfo_vcf} -S other.txt -r chrX:1-1000000000,chrY:1-1000000000,X:1-1000000000,Y:1-1000000000 --no-update|bgzip>other.vcf.gz - ${BCFTOOLS} index other.vcf.gz - >>> - - output { - File other_vcf="other.vcf.gz" - File other_vcf_csi="other.vcf.gz.csi" - } -} - -task CleanVcf1_21 { - input { - File other_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(other_vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - zcat ~{other_vcf}\ - |awk -F'\t' '{if ($1!~"#") print $0 "\t" "ENDOFLINE"}' \ - |tr '\t' '\n' \ - |awk -F':' '{ if (NF>4 && $1!="GT" ) $1="./.";print}' OFS=":" \ - |tr '\n' '\t' \ - |sed 's/ENDOFLINE/\n/g' \ - |sed -e 's/^[ \t]*//' \ - |sed -e 's/[\t]$//g' \ - |bgzip \ - >other.revise.txt.gz - >>> - - output { - File other_revise_txt="other.revise.txt.gz" - } -} - - - -task CleanVcf1_22 { - input { - File other_vcf - File other_revise_txt - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([other_vcf, other_revise_txt], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # use BCFTOOLS 1.9, installed in /usr/local/bin/bcftools in our docker - BCFTOOLS=/usr/local/bin/bcftools - - cat <(zcat ~{other_vcf} \ - |fgrep -wvf <(zcat ~{other_revise_txt}|awk '{print $3}' )) \ - <(zcat ~{other_revise_txt}) \ - |vcf-sort \ - |bgzip \ - >cleanother.vcf.gz - ${BCFTOOLS} index cleanother.vcf.gz - >>> - - output { - File cleanother_vcf="cleanother.vcf.gz" - File cleanother_vcf_csi="cleanother.vcf.gz.csi" - } -} - - - -task CleanVcf1_23 { - input { - File cleanmale_vcf - File cleanfemale_vcf - File cleanother_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([cleanmale_vcf, cleanfemale_vcf, cleanother_vcf], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - cat <(zcat ~{cleanmale_vcf}|egrep "##") \ - <(paste <(zcat ~{cleanmale_vcf}|egrep -v "##") <(zcat ~{cleanfemale_vcf}|cut -f10-|egrep -v "##") <(zcat ~{cleanother_vcf}|cut -f10-|egrep -v "##") ) \ - |bgzip \ - >combinedsex.vcf.gz - tabix -p vcf combinedsex.vcf.gz - >>> - - output { - File combinedsex_vcf="combinedsex.vcf.gz" - File combinedsex_vcf_tbi="combinedsex.vcf.gz.tbi" - } -} - -task CleanVcf1_24 { - input { - File cleanmale_vcf - File cleanfemale_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([cleanmale_vcf, cleanfemale_vcf], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - cat <(zcat ~{cleanmale_vcf}|egrep "##") \ - <(paste <(zcat ~{cleanmale_vcf}|egrep -v "##") <(zcat ~{cleanfemale_vcf}|cut -f10-|egrep -v "##")) \ - |bgzip \ - >combinedsex.vcf.gz - tabix -p vcf combinedsex.vcf.gz - >>> - - output { - File combinedsex_vcf="combinedsex.vcf.gz" - File combinedsex_vcf_tbi="combinedsex.vcf.gz.tbi" - } -} - -task CleanVcf1_25 { - input { - File combinedsex_vcf - File combinedsex_vcf_tbi - File cleaninfo_vcf - File cleaninfo_vcf_tbi - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([combinedsex_vcf, cleaninfo_vcf], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - zcat ~{combinedsex_vcf}|awk '{if ($1!~"#") print $3}'>modified.ids.txt - - ##shuffle sex ids backinto place to match original vcf and back to initial vcf## - vcf-shuffle-cols -t ~{cleaninfo_vcf} ~{combinedsex_vcf} \ - |awk '{if ($1!~"#") print}' \ - |cat <(zcat ~{cleaninfo_vcf}|fgrep -wvf modified.ids.txt ) - \ - |vcf-sort \ - |bgzip \ - >cleanallo.vcf.gz - >>> - - output { - File cleanallo_vcf="cleanallo.vcf.gz" - } -} - -task CleanVcf1_26 { - input { - File background_list - File cleanallo_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([background_list, cleanallo_vcf], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # the code below will not print any lines if the background list file is empty, so add a dummy sentinel record at the end - cat ~{background_list} <(echo "XXX_SENTINEL_XXX") > background_list_with_sentinel.list - - ##change tag for SR background failures and Unresolved## - zcat ~{cleanallo_vcf} \ - |awk -F'\t' -v OFS='\t' 'ARGIND==1{inFileA[$1]; next} {if ($3 in inFileA && $1!~"#") $7=$7";HIGH_SR_BACKGROUND"; print }' <(awk '{print $NF}' background_list_with_sentinel.list) - \ - |awk '{if (NR==2) print $0 "\n" "##FILTER=" ;else print}' \ - |awk '{if ($8~"UNRESOLVED") $7=$7";UNRESOLVED";print}' OFS='\t' \ - |awk '{if (NR==2) print $0 "\n" "##FILTER=" ;else print}' \ - |bgzip \ - >int.vcf.gz - >>> - - output { - File int_vcf="int.vcf.gz" - } -} - -task CleanVcf1_27 { - input { - File int_vcf - File bothsides_pass_list - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([int_vcf, bothsides_pass_list], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - /opt/sv-pipeline/04_variant_resolution/scripts/add_bothsides_support_filter.py \ - --bgzip \ - --outfile int.w_bothsides.vcf.gz \ - ~{int_vcf} \ - ~{bothsides_pass_list} - tabix int.w_bothsides.vcf.gz - >>> - - output { - File intermediate_vcf="int.w_bothsides.vcf.gz" - File intermediate_vcf_idx="int.w_bothsides.vcf.gz.tbi" - } -} diff --git a/wdl/CleanVcf1b.wdl b/wdl/CleanVcf1b.wdl index d9537921b..a93bce670 100644 --- a/wdl/CleanVcf1b.wdl +++ b/wdl/CleanVcf1b.wdl @@ -1,790 +1,354 @@ version 1.0 import "Structs.wdl" +import "CleanVcf5.wdl" as CleanVcf5 +import "TasksMakeCohortVcf.wdl" as MiniTasks workflow CleanVcf1b { - input { - File intermediate_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override # TODO - } - - call CleanVcf1b_1 { - input: - intermediate_vcf=intermediate_vcf, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1b_2 { - input: - int_bed=CleanVcf1b_1.int_bed, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1b_3 { - input: - int_vcf=intermediate_vcf, - normoverlap=CleanVcf1b_2.normoverlap, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1b_4 { - input: - int_vcf=intermediate_vcf, - normoverlap=CleanVcf1b_2.normoverlap, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1b_5 { - input: - normoverlap=CleanVcf1b_2.normoverlap, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1b_6 { - input: - overlap_test=CleanVcf1b_5.overlap_test, - rd_cn_normcheck=CleanVcf1b_3.rd_cn_normcheck, - ev_normcheck=CleanVcf1b_4.ev_normcheck, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1b_7 { - input: - geno_normal_revise=CleanVcf1b_6.geno_normal_revise, - int_vcf=intermediate_vcf, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1b_8 { - input: - subset_vcf=CleanVcf1b_7.subset_vcf, - geno_normal_revise=CleanVcf1b_6.geno_normal_revise, - col=CleanVcf1b_1.col, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1b_9 { - input: - normal_revise_vcf_lines=CleanVcf1b_8.normal_revise_vcf_lines, - int_vcf=intermediate_vcf, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1b_10 { - input: - normal_revise_vcf=CleanVcf1b_9.normal_revise_vcf, - normal_revise_vcf_csi=CleanVcf1b_9.normal_revise_vcf_csi, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1b_11 { - input: - copystate_rd_cn=CleanVcf1b_10.copystate_rd_cn, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1b_12 { - input: - int_bed=CleanVcf1b_1.int_bed, - copystate_per_variant=CleanVcf1b_11.copystate_per_variant, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf1b_13 { - input: - int_bed=CleanVcf1b_1.int_bed, - multi_del=CleanVcf1b_12.multi_del, - 
copystate_per_variant=CleanVcf1b_11.copystate_per_variant, - sv_pipeline_docker=sv_pipeline_docker - } - - output { - File multi = CleanVcf1b_13.multi - File normal = CleanVcf1b_9.normal_revise_vcf - File vcftools_idx = CleanVcf1b_9.normal_revise_vcf_csi - } + input { + File intermediate_vcf + String prefix + Int records_per_shard + + String sv_pipeline_docker + String sv_base_mini_docker + String sv_pipeline_updates_docker + + RuntimeAttr? runtime_attr_override_subset_large_cnvs + RuntimeAttr? runtime_attr_override_sort_bed + RuntimeAttr? runtime_attr_override_intersect_bed + RuntimeAttr? runtime_attr_override_build_dict + RuntimeAttr? runtime_attr_override_scatter + RuntimeAttr? runtime_attr_override_filter_vcf + RuntimeAttr? runtime_override_concat_vcfs + RuntimeAttr? runtime_override_cat_multi_cnvs + } + + call SubsetLargeCNVs { + input: + vcf=intermediate_vcf, + prefix="~{prefix}.subset_large_cnvs", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_override_subset_large_cnvs + } + + call Vcf2Bed { + input: + vcf=SubsetLargeCNVs.out, + prefix="~{prefix}.subset_large_cnvs", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_override_subset_large_cnvs + } + + call SortBed { + input: + bed=Vcf2Bed.out, + prefix="~{prefix}.subset_large_cnvs.sorted", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_attr_override_sort_bed + } + + call BedtoolsIntersect { + input: + bed=SortBed.out, + prefix="~{prefix}.bedtools_intersect", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_attr_override_intersect_bed + } + + call BuildGenoNormalReviseDictionary { + input: + filtered_vcf=SubsetLargeCNVs.out, + intersect_bed=BedtoolsIntersect.out, + prefix="~{prefix}.geno_normal_revise", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_override_build_dict + } + + call MiniTasks.ScatterVcf { + input: + vcf=intermediate_vcf, + 
records_per_shard=records_per_shard, + prefix="~{prefix}.scatter_vcf", + sv_pipeline_docker=sv_pipeline_updates_docker, + runtime_attr_override=runtime_attr_override_scatter + } + + scatter ( i in range(length(ScatterVcf.shards)) ) { + call FilterVcf { + input: + intermediate_vcf=ScatterVcf.shards[i], + dictionary_json_gz=BuildGenoNormalReviseDictionary.out, + prefix="~{prefix}.filter_vcf.shard_~{i}", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_override_filter_vcf + } + } + + call MiniTasks.ConcatVcfs as ConcatCleanVcf1bShards { + input: + vcfs=FilterVcf.out, + naive=true, + sort_vcf_list=true, + outfile_prefix="~{prefix}.concat_vcfs", + sv_base_mini_docker=sv_pipeline_updates_docker, + runtime_attr_override=runtime_override_concat_vcfs + } + + call MiniTasks.CatUncompressedFiles as ConcatMultiCnvs { + input: + shards=FilterVcf.multi_cnvs, + outfile_name="~{prefix}.multi.cnvs.txt", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_cat_multi_cnvs + } + + output { + File normal = ConcatCleanVcf1bShards.concat_vcf + File multi = ConcatMultiCnvs.outfile + } } - -task CleanVcf1b_1 { - input { - File intermediate_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(intermediate_vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 2.0 + input_size * 1.5, - disk_gb: ceil(10.0 + input_size * 10.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##gzipped vcf from clean vcf part1.sh## - int_vcf_gz=~{intermediate_vcf} - - ##Remove CNVs that are improperly genotyped by depth because they are nested within a real CNV## - - ##Determine columns of VCF after header## - zcat $int_vcf_gz\ - |sed -n '1,1000p'\ - |egrep ^# \ - |tail -n 1 \ - |tr '\t' '\n' \ - |cat -n - \ - >col.txt - - ##Only affects CNV so pull those out## - zcat $int_vcf_gz \ - |awk '{if ($5~"DEL" || $5~"DUP" || $1~"#") print}' \ - |svtk vcf2bed stdin tmp.bed - awk -F"\t" '{if ($6=="") print $6="blanksample";print $0}' OFS='\t' tmp.bed \ - |gzip>int.bed.gz - >>> - - output { - File col="col.txt" - File int_bed="int.bed.gz" - } - -} - - -task CleanVcf1b_2 { - input { - File int_bed - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(int_bed, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 2.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##list of potenital overlaps with a normal copy state variant (>5kb variants require depth but nested events could be missed; i.e a duplication with a nest deletion will have a normal copy state for the deletion)## - ##flip bed intersect so largest is CNV is always first## - bedtools intersect -wa -wb -a <(zcat ~{int_bed}|awk '{if ($3-$2>=5000 ) print}') \ - -b <(zcat ~{int_bed}|awk '{if ($3-$2>=5000) print}') \ - |awk -F'\t' '{if ($4!=$10 && $3-$2>=$9-$8 && $5!=$11) print ;\ - else if ($4!=$10 && $5!=$11) print $7,$8,$9,$10,$11,$12,$1,$2,$3,$4,$5,$6}' OFS='\t' \ - |awk -F'\t' '{if ($6!="blanksample") print}' \ - |sort -u \ - >normaloverlap.txt - >>> - - output { - File normoverlap="normaloverlap.txt" - } - -} - - -task CleanVcf1b_3 { - input { - File int_vcf - File normoverlap - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([int_vcf, normoverlap], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 7.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##pull out the depth based copy number variant for each normal overlapping variant## - int_vcf_gz=~{int_vcf} - cat <(zcat $int_vcf_gz|awk -F"\t" '{if ($1~"#") print}') \ - <(awk '{print $4 "\n" $10}' ~{normoverlap}|sort -u|fgrep -wf - <(zcat $int_vcf_gz)) \ - |awk '{if ($1!~"#") $1=$3;print}' OFS="\t" \ - |awk '{if ($1~"#" || $5=="" || $5=="") print}' \ - |vcftools --vcf - --stdout --extract-FORMAT-info RD_CN \ - |awk -F"\t" 'NR==1{for (i=3;i<=NF;i++) header[i]=$i} NR>1{for(j=3;j<=NF;j++) print $1"@"header[j] "\t" $j }' \ - |sort -k1,1 \ - |gzip \ - >RD_CN.normalcheck.FORMAT.gz - >>> - - output { - File rd_cn_normcheck="RD_CN.normalcheck.FORMAT.gz" - } - -} - - - -task CleanVcf1b_4 { - input { - File int_vcf - File normoverlap - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([int_vcf, normoverlap], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 7.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##pull out evidence supporting each normal overlapping variant## - int_vcf_gz=~{int_vcf} - cat <(zcat $int_vcf_gz|awk -F"\t" '{if ($1~"#") print}') \ - <(awk '{print $4 "\n" $10}' ~{normoverlap}|sort -u|fgrep -wf - <(zcat $int_vcf_gz)) \ - |awk '{if ($1!~"#") $1=$3;print}' OFS="\t"\ - |vcftools --vcf - --stdout --extract-FORMAT-info EV \ - |awk -F"\t" 'NR==1{for (i=3;i<=NF;i++) header[i]=$i} NR>1{for(j=3;j<=NF;j++) print $1"@"header[j] "\t" $j }' \ - |sort -k1,1 \ - |gzip \ - >EV.normalcheck.FORMAT.gz - >>> - - output { - File ev_normcheck="EV.normalcheck.FORMAT.gz" - } - -} - - -task CleanVcf1b_5 { - input { - File normoverlap - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(normoverlap, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 3.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - - ##check if nested is incorrectly classified as normal## - touch overlap.test.txt - while read bed - do - echo $bed|tr ' ' '\t'|cut -f1-6 >large.bed - echo $bed|tr ' ' '\t'|cut -f7-12>small.bed - ##require at least 50% coverage to consider a variant overlapping## - overlap=$(bedtools coverage -a small.bed -b large.bed|awk '{if ($NF>=0.50) print "YES";else print "NO"}') - - if [ "$overlap" == "YES" ] - then - smallid=$(awk '{print $4}' small.bed) - - ##pull out variants that are called a variants for both the smaller and larger CNVs (don't have normal copy state to check for)## - if [ $(awk '{print $NF}' small.bed \ - |tr ',' '\n' \ - |fgrep -wvf - <(awk -F"[,\t]" -v var=$smallid '{for(i=6;i<=NF;i++) print var"@"$i "\t" $4"@"$i "\t" $5}' large.bed)|wc -l) -gt 0 ] - then - awk '{print $NF}' small.bed \ - |tr ',' '\n' \ - |fgrep -wvf - <(awk -F"[,\t]" -v var=$smallid '{for(i=6;i<=NF;i++) print var"@"$i "\t" $4"@"$i "\t" $5}' large.bed) \ - >>overlap.test.txt - fi - fi - done<~{normoverlap} - >>> - - output { - File 
overlap_test="overlap.test.txt" - } - -} - - -task CleanVcf1b_6 { - input { - File overlap_test - File rd_cn_normcheck - File ev_normcheck - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - Float input_size = size([overlap_test, rd_cn_normcheck, ev_normcheck], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 4.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##determine variants that need to be revised from a normal copy state into a CNV## - cat ~{overlap_test} \ - |sort -k1,1 \ - |join -j 1 - <(zcat ~{rd_cn_normcheck}) \ - |join -j 1 - <(zcat ~{ev_normcheck}) \ - |tr ' ' '\t' \ - |sort -k2,2 \ - |join -1 2 -2 1 - <(zcat ~{rd_cn_normcheck}) \ - |awk '{if ($3=="DUP" && $4==2 && $6==3) print $2 "\t" 1; else if ($3=="DEL" && $4==2 && $6==1) print $2 "\t" 3 }' \ - |tr '@' '\t'\ - >geno.normal.revise.txt - - >>> - - output { - File geno_normal_revise="geno.normal.revise.txt" - } - +task SubsetLargeCNVs { + input { + File vcf + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + + Float input_size = size(vcf, "GB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 2.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + bcftools view --no-version \ + -i '(INFO/SVTYPE=="DEL" || INFO/SVTYPE=="DUP") && INFO/SVLEN>=5000' \ + ~{vcf} \ + | bgzip \ + > ~{prefix}.vcf.gz + >>> + output { + File out = "~{prefix}.vcf.gz" + } } - -task CleanVcf1b_7 { - input { - File int_vcf - File geno_normal_revise - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([int_vcf, geno_normal_revise], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + input_size * 3.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##Update genotypes## - { zfgrep -wf <(awk '{print $1}' ~{geno_normal_revise}|sort -u) ~{int_vcf} || true; }\ - |bgzip \ - >subset.vcf.gz - >>> - - output { - File subset_vcf="subset.vcf.gz" - } - +task Vcf2Bed { + input { + File vcf + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + + Float input_size = size(vcf, "GB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 2.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + svtk vcf2bed --no-header ~{vcf} stdout \ + | awk -F'\t' -v OFS='\t' '{if ($6=="") $6="blanksample";print $0}' \ + | gzip -1 \ + > ~{prefix}.bed.gz + >>> + output { + File out = "~{prefix}.bed.gz" + } } - -task CleanVcf1b_8 { - input { - File subset_vcf - File geno_normal_revise - File col - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([subset_vcf, geno_normal_revise, col], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 10.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - - python3 < normal.revise.vcf.lines.txt - import gzip - import sys - - VCF='~{subset_vcf}' - REVISE='~{geno_normal_revise}' - COL='~{col}' - - # Grab regenotyped samples of interest - sys.stderr.write("Reading {}...\n".format(REVISE)) - geno_dict = {} - with open(REVISE) as f: - for line in f: - tokens = line.strip().split('\t') - vid = tokens[0] - if vid not in geno_dict: - geno_dict[vid] = [] - geno_dict[vid].append(tokens[1]) # id.txt but only sample id - - # Column definitions - sys.stderr.write("Reading {}...\n".format(COL)) - sample_columns_dict = {} - with open(COL) as f: - for line in f: - tokens = line.strip().split('\t') - sample_columns_dict[tokens[1]] = int(tokens[0]) - 1 - - # Assign GT/GQ - sys.stderr.write("Reassigning genotypes...\n") - with gzip.open(VCF, 'rb') as f: - for lineb in f: - line = lineb.decode('utf-8').strip() - vid = line.split('\t', 3)[2] - if vid in geno_dict: - sample_ids = geno_dict[vid] - tokens = line.split('\t') - sample_indexes = [sample_columns_dict[s] for s 
in sample_ids] - for i in sample_indexes: - entry = tokens[i].split(':', 4) - entry[0] = "0/1" - entry[1] = entry[3] - tokens[i] = ":".join(entry) - sys.stdout.write("{}\t\n".format("\t".join(tokens))) - else: - sys.stdout.write("{}\t\n".format(line)) - CODE - >>> - - output { - File normal_revise_vcf_lines="normal.revise.vcf.lines.txt" - } - +task SortBed { + input { + File bed + String prefix + String sv_base_mini_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size(bed, "GB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 10.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_base_mini_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + mkdir tmp + zcat ~{bed} \ + | sort -T tmp -k1,1 -k2,2n \ + | gzip -1 \ + > ~{prefix}.bed.gz + >>> + output { + File out = "~{prefix}.bed.gz" + } } - - -task CleanVcf1b_9 { - input { - File int_vcf - File normal_revise_vcf_lines - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([int_vcf, normal_revise_vcf_lines], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 15, - disk_gb: ceil(10.0 + input_size * 50.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##rewrite vcf with updated genotypes## - awk '{print $3}' ~{normal_revise_vcf_lines}|sort -u > vids.list - cat <(zcat ~{int_vcf} | fgrep -wvf vids.list) \ - <(sed 's/\t$//' ~{normal_revise_vcf_lines}) \ - |vcf-sort \ - |bgzip \ - >normal.revise.vcf.gz - - bcftools index normal.revise.vcf.gz - >>> - - output { - File normal_revise_vcf="normal.revise.vcf.gz" - File normal_revise_vcf_csi="normal.revise.vcf.gz.csi" - } - +task BedtoolsIntersect { + input { + File bed + String prefix + String sv_base_mini_docker + RuntimeAttr? 
runtime_attr_override + } + + Float input_size = size(bed, "GB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 10.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_base_mini_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + bedtools intersect -sorted -wa -wb -a <(zcat ~{bed}) -b <(zcat ~{bed}) \ + | awk -F'\t' -v OFS='\t' '$4!=$10 && $5!=$11' \ + | gzip -1 \ + > ~{prefix}.bed.gz + >>> + output { + File out = "~{prefix}.bed.gz" + } } - -task CleanVcf1b_10 { - input { - File normal_revise_vcf - File normal_revise_vcf_csi - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(normal_revise_vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 15, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##get copy state per variant## - zcat ~{normal_revise_vcf} \ - |awk '{if ($1!~"#") $1=$3;print}' OFS="\t" \ - |vcftools --vcf - --stdout --extract-FORMAT-info RD_CN \ - |gzip \ - >copystate.RD_CN.FORMAT.gz - >>> - - output { - File copystate_rd_cn="copystate.RD_CN.FORMAT.gz" - } - +task BuildGenoNormalReviseDictionary { + input { + File filtered_vcf + File intersect_bed + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + + Float input_size = size([filtered_vcf, intersect_bed], "GB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 2.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + python /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_build_dict.py ~{filtered_vcf} ~{intersect_bed} \ + | gzip -1 \ + > ~{prefix}.json.gz + >>> + output { + File out = "~{prefix}.json.gz" + } } - -task CleanVcf1b_11 { - input { - File copystate_rd_cn - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(copystate_rd_cn, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 15, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##get copy state per variant## - zcat ~{copystate_rd_cn} \ - |awk 'NR>1{for(i=3;i<=NF;i++) lines[$1 "\t" $i]++ } END{for (x in lines) print x}' \ - |gzip \ - >copystate.per.variant.txt.gz - >>> - - output { - File copystate_per_variant="copystate.per.variant.txt.gz" - } - -} - - -task CleanVcf1b_12 { - input { - File int_bed - File copystate_per_variant - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([int_bed, copystate_per_variant], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 15, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##Find multi-allelic for del or dup ; CNV >1kb we trust depth ## - ##del## - zcat ~{copystate_per_variant} \ - |awk '{if ($2!="." && $2>3) print $1}' \ - |sort -u \ - |{ fgrep -wf <(zcat ~{int_bed}|awk -F"\t" '{if ($5=="DEL" && $3-$2>=1000) print $4}' ) || true; } \ - >multi.cnvs.del.txt - >>> - - output { - File multi_del="multi.cnvs.del.txt" - } - -} - - -task CleanVcf1b_13 { - input { - File int_bed - File multi_del - File copystate_per_variant - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([int_bed, multi_del, copystate_per_variant], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 15, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##dup## - mv ~{multi_del} multi.cnvs.txt - zcat ~{copystate_per_variant} \ - |awk '{if ($2!="." && ($2<1 || $2>4)) print $1}' \ - |sort -u \ - |{ fgrep -wf <(zcat ~{int_bed}|awk -F"\t" '{if ($5=="DUP" && $3-$2>=1000) print $4}' ) || true; } \ - >>multi.cnvs.txt - >>> - - output { - File multi="multi.cnvs.txt" - } - +task FilterVcf { + input { + File intermediate_vcf + File dictionary_json_gz + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + + Float input_size = size([intermediate_vcf, dictionary_json_gz], "GB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 2.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + python /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part1b_filter.py ~{dictionary_json_gz} ~{intermediate_vcf} \ + | bgzip \ + > ~{prefix}.vcf.gz + mv multi.cnvs.txt ~{prefix}.multi.cnvs.txt + >>> + output { + File out = "~{prefix}.vcf.gz" + File multi_cnvs = "~{prefix}.multi.cnvs.txt" + } } diff --git a/wdl/CleanVcf5.wdl b/wdl/CleanVcf5.wdl index 7d8d7c47a..a1b75f839 100644 --- a/wdl/CleanVcf5.wdl +++ b/wdl/CleanVcf5.wdl @@ -1,1817 +1,262 @@ version 1.0 import "Structs.wdl" +import "TasksMakeCohortVcf.wdl" as tasks workflow CleanVcf5 { - input { - File revise_vcf_lines - File normal_revise_vcf - File ped_file - File sex_chr_revise - File multi_ids - File? outlier_samples_list - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override # TODO - } - - call CleanVcf5_1 { - input: - normal_revise_vcf=normal_revise_vcf, - revise_vcf_lines=revise_vcf_lines, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_2 { - input: - overlap_revise_vcf=CleanVcf5_1.overlap_revise_vcf, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_3 { - input: - overlap_revise_vcf=CleanVcf5_1.overlap_revise_vcf, - outlier_samples_list=outlier_samples_list, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_4 { - input: - overlap_revise_vcf=CleanVcf5_1.overlap_revise_vcf, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_5 { - input: - copystate_rd_cn_format=CleanVcf5_3.copystate_rd_cn_format, - overlap_revise_bed=CleanVcf5_2.overlap_revise_bed, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_6 { - input: - copystate_rd_cn_format=CleanVcf5_3.copystate_rd_cn_format, - overlap_revise_bed=CleanVcf5_2.overlap_revise_bed, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_7 { - input: - copystate_rd_cn_format=CleanVcf5_3.copystate_rd_cn_format, - overlap_revise_bed=CleanVcf5_2.overlap_revise_bed, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_8 { - input: - copystate_rd_cn_format=CleanVcf5_3.copystate_rd_cn_format, - overlap_revise_bed=CleanVcf5_2.overlap_revise_bed, - gt4copystate=CleanVcf5_7.gt4copystate, - multi_dup_ids_1=CleanVcf5_6.multi_dup_ids_1, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_9 { - input: - overlap_revise_vcf=CleanVcf5_1.overlap_revise_vcf, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_10 { - input: - genotype_gt_format=CleanVcf5_4.genotype_gt_format, - multi_dup_ids=CleanVcf5_8.multi_dup_ids, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_11 { - input: - genotype_gt_format=CleanVcf5_4.genotype_gt_format, - multi_del_ids=CleanVcf5_5.multi_del_ids, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_12 { - input: - 
multi_dup_ids=CleanVcf5_8.multi_dup_ids, - regeno_bed=CleanVcf5_9.regeno_bed, - gt5kb_dup_ids_1=CleanVcf5_10.gt5kb_dup_ids_1, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_13 { - input: - multi_del_ids=CleanVcf5_5.multi_del_ids, - gt5kb_del_ids_1=CleanVcf5_11.gt5kb_del_ids_1, - regeno_bed=CleanVcf5_9.regeno_bed, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_14 { - input: - overlap_revise_vcf=CleanVcf5_1.overlap_revise_vcf, - gt5kb_dup_ids=CleanVcf5_12.gt5kb_dup_ids, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_15 { - input: - overlap_revise_vcf=CleanVcf5_1.overlap_revise_vcf, - gt5kb_del_ids=CleanVcf5_13.gt5kb_del_ids, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_16 { - input: - del_int=CleanVcf5_15.del_int, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_17 { - input: - dup_int=CleanVcf5_14.dup_int, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_18 { - input: - overlap_revise_vcf=CleanVcf5_1.overlap_revise_vcf, - gt5kb_dup_ids=CleanVcf5_12.gt5kb_dup_ids, - gt5kb_del_ids=CleanVcf5_13.gt5kb_del_ids, - dup_revise=CleanVcf5_17.dup_revise, - del_revise=CleanVcf5_16.del_revise, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_19 { - input: - multi_geno_ids_txt=multi_ids, - multi_del_ids=CleanVcf5_5.multi_del_ids, - multi_dup_ids=CleanVcf5_8.multi_dup_ids, - newdepth_geno_vcf=CleanVcf5_18.newdepth_geno_vcf, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_20 { - input: - multi_dup_ids=CleanVcf5_8.multi_dup_ids, - multitagged_vcf=CleanVcf5_19.multitagged_vcf, - multitagged_vcf_tbi=CleanVcf5_19.multitagged_vcf_tbi, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_21 { - input: - multi_del_ids=CleanVcf5_5.multi_del_ids, - multitagged_vcf=CleanVcf5_19.multitagged_vcf, - multitagged_vcf_tbi=CleanVcf5_19.multitagged_vcf_tbi, - dup_multi_revise_vcf=CleanVcf5_20.dup_multi_revise_vcf, - 
all_multi_revised_list_1=CleanVcf5_20.all_multi_revised_list_1, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_22 { - input: - multitagged_vcf=CleanVcf5_19.multitagged_vcf, - multitagged_vcf_tbi=CleanVcf5_19.multitagged_vcf_tbi, - dup_multi_revise_vcf=CleanVcf5_20.dup_multi_revise_vcf, - del_multi_revise_vcf=CleanVcf5_21.del_multi_revise_vcf, - all_multi_revised_list_2=CleanVcf5_21.all_multi_revised_list_2, - new_header=CleanVcf5_21.new_header, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_23 { - input: - multitagged_vcf=CleanVcf5_19.multitagged_vcf, - multitagged_vcf_tbi=CleanVcf5_19.multitagged_vcf_tbi, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_24 { - input: - multi_bed=CleanVcf5_23.multi_bed, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_25 { - input: - multi_bed_overlap=CleanVcf5_24.multi_bed_overlap, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_26 { - input: - multitagged_geno_vcf=CleanVcf5_22.multitagged_geno_vcf, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_27 { - input: - multitagged_geno_vcf=CleanVcf5_22.multitagged_geno_vcf, - multi_remove=CleanVcf5_25.multi_remove, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_28 { - input: - cleantagandmulti_vcf=CleanVcf5_27.cleantagandmulti_vcf, - sv_pipeline_docker=sv_pipeline_docker - } - - if (CleanVcf5_28.out > 0) { - call CleanVcf5_29_TRUE_1 { - input: - famfile=ped_file, - cleantagandmulti_vcf=CleanVcf5_27.cleantagandmulti_vcf, - sv_pipeline_docker=sv_pipeline_docker + input { + File normal_revise_vcf + File revise_vcf_lines + File ped_file + File sex_chr_revise + File multi_ids + File? outlier_samples_list + + String prefix + String contig + Int records_per_shard + + File? make_clean_gq_script + File? find_redundant_sites_script + + String sv_base_mini_docker + String sv_pipeline_docker + + Int? threads_per_task + RuntimeAttr? runtime_attr_override_scatter + RuntimeAttr? 
runtime_attr_override_make_cleangq + RuntimeAttr? runtime_attr_override_find_redundant_multiallelics + RuntimeAttr? runtime_attr_override_polish } - call CleanVcf5_29_TRUE_2 { - input: - malecols=CleanVcf5_29_TRUE_1.malecols, - sex_chr_revise=sex_chr_revise, - cleantagandmulti_vcf=CleanVcf5_27.cleantagandmulti_vcf, - sv_pipeline_docker=sv_pipeline_docker + call tasks.ScatterVcf { + input: + vcf=normal_revise_vcf, + records_per_shard = records_per_shard, + prefix = "~{prefix}.scatter_vcf", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_override_scatter } - call CleanVcf5_29_TRUE_3 { - input: - cleantagandmulti_vcf=CleanVcf5_27.cleantagandmulti_vcf, - sexchr_backtoorig=CleanVcf5_29_TRUE_2.sexchr_backtoorig, - sv_pipeline_docker=sv_pipeline_docker + scatter ( i in range(length(ScatterVcf.shards)) ) { + call MakeCleanGQ { + input: + revise_vcf_lines=revise_vcf_lines, + normal_revise_vcf=ScatterVcf.shards[i], + ped_file=ped_file, + sex_chr_revise=sex_chr_revise, + multi_ids=multi_ids, + outlier_samples_list=outlier_samples_list, + make_clean_gq_script=make_clean_gq_script, + prefix="~{prefix}.make_clean_gq.shard_~{i}", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_override_make_cleangq + } } - } - - File cleansexcn_vcf_ = select_first([CleanVcf5_29_TRUE_3.cleansexcn_vcf, CleanVcf5_27.cleantagandmulti_vcf]) - call CleanVcf5_30 { - input: - cleansexcn_vcf=cleansexcn_vcf_, - sv_pipeline_docker=sv_pipeline_docker - } - - call CleanVcf5_31 { - input: - cleansexcn_vcf=cleansexcn_vcf_, - blankcheck_ids=CleanVcf5_30.blankcheck_ids, - sv_pipeline_docker=sv_pipeline_docker - } - - output { - File polished = CleanVcf5_31.polished - } -} - -task CleanVcf5_1 { - input { - File normal_revise_vcf - File revise_vcf_lines - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([normal_revise_vcf, revise_vcf_lines], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 20.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - cat <(zcat ~{normal_revise_vcf}|fgrep -wvf <(zcat ~{revise_vcf_lines}|awk '{if ($1!="") print $3}'|sort -u)) \ - <(zcat ~{revise_vcf_lines}|awk '{if ($1!="") print}' |tr ' ' '\t') \ - |vcf-sort \ - |bgzip \ - >overlap.revise.vcf.gz - >>> - - output { - File overlap_revise_vcf="overlap.revise.vcf.gz" - } -} - -task CleanVcf5_2 { - input { - File overlap_revise_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(overlap_revise_vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##create bed of VCF## - svtk vcf2bed ~{overlap_revise_vcf} overlap.revise.bed - gzip overlap.revise.bed - >>> - - output { - File overlap_revise_bed="overlap.revise.bed.gz" - } -} - -task CleanVcf5_3 { - input { - File overlap_revise_vcf - File? outlier_samples_list - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([overlap_revise_vcf, overlap_revise_vcf], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ~{if defined(outlier_samples_list) then "ln ~{outlier_samples_list} outliers.txt" else "touch outliers.txt"} - ##multi check## - zcat ~{overlap_revise_vcf} \ - |awk '{if ($1!~"#") $1=$3;print}' OFS="\t" \ - |vcftools --vcf - --remove outliers.txt --stdout --extract-FORMAT-info RD_CN \ - |gzip \ - >copystate.RD_CN.FORMAT.gz - >>> - - output { - File copystate_rd_cn_format="copystate.RD_CN.FORMAT.gz" - } -} - -task CleanVcf5_4 { - input { - File overlap_revise_vcf - File? outlier_samples_list - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([overlap_revise_vcf, outlier_samples_list], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ~{if defined(outlier_samples_list) then "ln ~{outlier_samples_list} outliers.txt" else "touch outliers.txt"} - zcat ~{overlap_revise_vcf} \ - |awk '{if ($1!~"#") $1=$3;print}' OFS="\t" \ - |vcftools --vcf - --remove outliers.txt --stdout --extract-FORMAT-info GT \ - |gzip \ - >genotype.gt.FORMAT.gz - >>> - - output { - File genotype_gt_format="genotype.gt.FORMAT.gz" - } -} - -task CleanVcf5_5 { - input { - File copystate_rd_cn_format - File overlap_revise_bed - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([copystate_rd_cn_format, overlap_revise_bed], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##New method for determining copy state based on >1% of people having an multi-allelic copy state as define above## - vf_1=$(zcat ~{copystate_rd_cn_format}|awk 'NR==1{print (NF-2) * 0.01}'|awk '{if ($1<=1) print 2; else print }' ) - - zcat ~{copystate_rd_cn_format} \ - |{ fgrep -wf <(zcat ~{overlap_revise_bed} |awk -F"\t" '{if ($5=="DEL" && $3-$2>=1000) print $4}' ) || true; } \ - |awk 'NR>1{for(i=3;i<=NF;i++) if ($i!="." && $i>3) print $1 }' \ - |sort \ - |uniq -c \ - |awk -v vf_1=$vf_1 '{if ($1>vf_1)print $2}' \ - |gzip \ - >multi.del.ids.txt.gz - >>> - - output { - File multi_del_ids="multi.del.ids.txt.gz" - } -} - -task CleanVcf5_6 { - input { - File copystate_rd_cn_format - File overlap_revise_bed - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([copystate_rd_cn_format, overlap_revise_bed], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - vf_1=$(zcat ~{copystate_rd_cn_format}|awk 'NR==1{print (NF-2) * 0.01}'|awk '{if ($1<=1) print 2; else print }' ) - - zcat ~{copystate_rd_cn_format} \ - |{ fgrep -wf <(zcat ~{overlap_revise_bed}|awk -F"\t" '{if ($5=="DUP" && $3-$2>=1000) print $4}' ) || true; } \ - |awk 'NR>1{for(i=3;i<=NF;i++) if ($i!="." && $i>4) print $1 }' \ - |sort \ - |uniq -c \ - |awk -v vf_1=$vf_1 '{if ($1>vf_1)print $2}' \ - >multi.dup.ids.txt - >>> - - output { - File multi_dup_ids_1="multi.dup.ids.txt" - } -} - -task CleanVcf5_7 { - input { - File copystate_rd_cn_format - File overlap_revise_bed - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([copystate_rd_cn_format, overlap_revise_bed], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##Case with CN 0,1,2,3,4## - zcat ~{copystate_rd_cn_format} \ - |{ fgrep -wf <(zcat ~{overlap_revise_bed} | awk -F"\t" '{if ($5=="DUP" && $3-$2>=1000) print $4}') || true; } \ - |awk 'NR>1{for(i=3;i<=NF;i++) if ($i!="." && ($i<1 || $i>4)) print $1 "\t" $i }'\ - |sort -u \ - |awk '{print $1}' \ - |sort \ - |uniq -c \ - |awk '{if ($1>4) print $2}'>gt4copystate.txt - >>> - - output { - File gt4copystate="gt4copystate.txt" - } -} - -task CleanVcf5_8 { - input { - File copystate_rd_cn_format - File overlap_revise_bed - File gt4copystate - File multi_dup_ids_1 - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([copystate_rd_cn_format, overlap_revise_bed, gt4copystate, multi_dup_ids_1], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - vf_1=$(zcat ~{copystate_rd_cn_format}|awk 'NR==1{print (NF-2) * 0.01}'|awk '{if ($1<=1) print 2; else print }' ) - - mv ~{multi_dup_ids_1} multi.dup.ids.txt - zcat ~{copystate_rd_cn_format} \ - | awk -F'\t' -v OFS='\t' 'ARGIND==1{inFileA[$1]; next} {if ($1 in inFileA) print }' <(zcat ~{overlap_revise_bed}|awk -F"\t" '{if ($5=="DUP" && $3-$2>=1000) print $4}') - \ - | awk 'NR>1{for(i=3;i<=NF;i++) if ($i!="." && ($i<1 || $i>4)) print $1 }' \ - | sort \ - | uniq -c \ - | awk -F'\t' -v OFS='\t' 'ARGIND==1{inFileA[$1]; next} {if ($1 in inFileA) print }' ~{gt4copystate} - \ - | awk -v vf_1=$vf_1 '{if ($1>vf_1)print $2}' \ - >> multi.dup.ids.txt - - sort -u multi.dup.ids.txt |gzip >multi.dup.ids.txt.gz - >>> - - output { - File multi_dup_ids="multi.dup.ids.txt.gz" - } -} - -task CleanVcf5_9 { - input { - File overlap_revise_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(overlap_revise_vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##Regenotype to determine multiallelic; we just change copy state for some nested variants and we need to make sure we get proper genotype for these; also previous stages have different notaion for multiallelic and we need to make this uniform; this is a CN based regenotyping so restricted to >5kb ## - ##Genotype big dup## - svtk vcf2bed ~{overlap_revise_vcf} regeno.bed - gzip regeno.bed - >>> - - output { - File regeno_bed="regeno.bed.gz" - } -} - -task CleanVcf5_10 { - input { - File genotype_gt_format - File multi_dup_ids - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([genotype_gt_format, multi_dup_ids], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##add variants that are <5kb because clustering but have a mutliallelic genotype from before## - zcat ~{genotype_gt_format} \ - |awk '{if ($1~"DUP") print}' \ - |awk '{for (i = 3; i <= NF; ++i) print $1 "\t" $i}' \ - |awk '{if ($2!="1/1" && $2!="0/0" && $2!="0/1" && $2!="./.") print $1}' \ - |{ fgrep -wvf <(zcat ~{multi_dup_ids}) || true; } \ - |sort -u \ - >gt5kb.dup.ids.txt - >>> - - output { - File gt5kb_dup_ids_1="gt5kb.dup.ids.txt" - } -} - -task CleanVcf5_11 { - input { - File genotype_gt_format - File multi_del_ids - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([genotype_gt_format, multi_del_ids], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - zcat ~{genotype_gt_format} \ - |awk '{if ($1~"DEL") print}' \ - |awk '{for (i = 3; i <= NF; ++i) print $1 "\t" $i}' \ - |awk '{if ($2!="1/1" && $2!="0/0" && $2!="0/1" && $2!="./.") print $1}' \ - |{ fgrep -wvf <(zcat ~{multi_del_ids}) || true; } \ - |sort -u \ - >gt5kb.del.ids.txt - >>> - - output { - File gt5kb_del_ids_1="gt5kb.del.ids.txt" - } -} - -task CleanVcf5_12 { - input { - File multi_dup_ids - File regeno_bed - File gt5kb_dup_ids_1 - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([multi_dup_ids, regeno_bed, gt5kb_dup_ids_1], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - mv ~{gt5kb_dup_ids_1} gt5kb.dup.ids.txt - ##generate list## - ##CNV >5kb, split del and dup ## - if [ -f ~{multi_dup_ids} ] - then - zcat ~{regeno_bed} \ - |awk '{if ($3-$2>=5000 && $5=="DUP")print $4}' \ - |{ fgrep -wvf <(zcat ~{multi_dup_ids}) || true; } \ - >>gt5kb.dup.ids.txt - else - zcat ~{regeno_bed} \ - |awk '{if ($3-$2>=5000 && $5=="DUP")print $4}' \ - >>gt5kb.dup.ids.txt - fi - >>> - - output { - File gt5kb_dup_ids="gt5kb.dup.ids.txt" - } -} - -task CleanVcf5_13 { - input { - File multi_del_ids - File gt5kb_del_ids_1 - File regeno_bed - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([multi_del_ids, gt5kb_del_ids_1, regeno_bed], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - mv ~{gt5kb_del_ids_1} gt5kb.del.ids.txt - if [ -f ~{multi_del_ids} ] - then - zcat ~{regeno_bed} \ - |awk '{if ($3-$2>=5000 && $5=="DEL")print $4}' \ - |{ fgrep -wvf <(zcat ~{multi_del_ids}) || true; } \ - >>gt5kb.del.ids.txt - else - zcat ~{regeno_bed} \ - |awk '{if ($3-$2>=5000 && $5=="DEL")print $4}' \ - >>gt5kb.del.ids.txt - fi - - >>> - - output { - File gt5kb_del_ids="gt5kb.del.ids.txt" - } -} - -task CleanVcf5_14 { - input { - File overlap_revise_vcf - File gt5kb_dup_ids - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([overlap_revise_vcf, gt5kb_dup_ids], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - zcat ~{overlap_revise_vcf} \ - |fgrep -wf ~{gt5kb_dup_ids} \ - >dup.int.txt || true - >>> - - output { - File dup_int="dup.int.txt" - } -} - -task CleanVcf5_15 { - input { - File overlap_revise_vcf - File gt5kb_del_ids - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([overlap_revise_vcf, gt5kb_del_ids], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - zcat ~{overlap_revise_vcf} \ - |fgrep -wf ~{gt5kb_del_ids} \ - >>del.int.txt || true - >>> - output { - File del_int="del.int.txt" - } -} - -task CleanVcf5_16 { - input { - File del_int - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(del_int, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##regenotype VCF## - dellen=$(cat ~{del_int}|wc -l) - columnlen=$(less ~{del_int}|cut -f10-|tr '\t' '\n' |wc -l) - dellenchange=$(echo $dellen $columnlen|awk '{if ($1 == 0) { print "0" } else { print $2/$1}}') - - paste <(less ~{del_int}|cut -f1-9) <(less ~{del_int}|cut -f10-|tr '\t' '\n' \ - |awk -F':' '{if ($3>=2 && $1!="./.") $1="0/0"; \ - else if ($3==1 && $1!="./.") $1="0/1"; \ - else if ($1!="./.")$1="1/1";print}' OFS=":" \ - |awk -v lenchange=$dellenchange 'NR%lenchange {printf("%s\t", $0); next} \ - {print $0}')>del.revise.txt - >>> - - output { - File del_revise="del.revise.txt" - } -} - -task CleanVcf5_17 { - input { - File dup_int - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(dup_int, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - duplen=$(cat ~{dup_int}|wc -l) - columnlen=$(less ~{dup_int}|cut -f10-|tr '\t' '\n' |wc -l) - duplenchange=$(echo $duplen $columnlen|awk '{if ($1 == 0) { print "0" } else { print $2/$1}}') - - paste <(less ~{dup_int}|cut -f1-9) <(less ~{dup_int}|cut -f10-|tr '\t' '\n' \ - |awk -F':' '{if ($3<=2 && $1!="./.") $1="0/0"; \ - else if ($3==3 && $1!="./.") $1="0/1"; \ - else if ($1!="./.") $1="1/1";print}' OFS=":" \ - |awk -v lenchange=$duplenchange 'NR%lenchange {printf("%s\t", $0); next} \ - {print $0}') >dup.revise.txt - >>> - - output { - File dup_revise="dup.revise.txt" - } -} - -task CleanVcf5_18 { - input { - File overlap_revise_vcf - File gt5kb_dup_ids - File gt5kb_del_ids - File dup_revise - File del_revise - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([overlap_revise_vcf, gt5kb_dup_ids, gt5kb_del_ids, dup_revise, del_revise], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 10.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - cat <(zcat ~{overlap_revise_vcf}|fgrep -wvf <(cat ~{gt5kb_dup_ids} ~{gt5kb_del_ids})) \ - <(cat ~{dup_revise} ~{del_revise}) \ - |vcf-sort \ - |bgzip \ - >newdepth.geno.vcf.gz - >>> - - output { - File newdepth_geno_vcf="newdepth.geno.vcf.gz" - } -} - -task CleanVcf5_19 { - input { - File multi_geno_ids_txt - File multi_del_ids - File multi_dup_ids - File newdepth_geno_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([multi_geno_ids_txt, multi_del_ids, multi_dup_ids, newdepth_geno_vcf, newdepth_geno_vcf], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##Tag multi## - ##Add filters to header## - zcat ~{newdepth_geno_vcf} \ - |awk -F'\t' -v OFS='\t' 'ARGIND==1{inFileA[$1]; next} {if ($3 in inFileA && $1!~"#" && $7!~"PESR_GT_OVERDISPERSION") $7=$7";PESR_GT_OVERDISPERSION"; print }' \ - <(cat <(zcat ~{multi_geno_ids_txt}) <(printf "\n")) - \ - |awk -F'\t' -v OFS='\t' 'ARGIND==1{inFileA[$1]; next} {if ($3 in inFileA && $1!~"#") $7=$7";MULTIALLELIC"; print }' \ - <(cat <(zcat ~{multi_del_ids} ~{multi_dup_ids} |sort -u) <(printf "\n")) - \ - |sed 's\PASS;\\g' \ - |awk '{if (NR==2) print $0 "\n" "##FILTER=" ;else print}' \ - |awk '{if (NR==2) print $0 "\n" "##FILTER=" ;else print}' \ - |bgzip \ - >multitagged.vcf.gz - tabix multitagged.vcf.gz - >>> - - output { - File multitagged_vcf="multitagged.vcf.gz" - File multitagged_vcf_tbi="multitagged.vcf.gz.tbi" - } -} - -task CleanVcf5_20 { - input { - File multi_dup_ids - File multitagged_vcf - File multitagged_vcf_tbi - String sv_pipeline_docker - 
RuntimeAttr? runtime_attr_override - } - - Float input_size = size([multi_dup_ids, multitagged_vcf], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # use BCFTOOLS 1.9 - BCFTOOLS=/usr/local/bin/bcftools - - touch all.multi.revised.list - touch dup.multi.revise.vcf - if [ $(zcat ~{multi_dup_ids}|wc -l) -ge 1 ] - then - /opt/sv-pipeline/04_variant_resolution/scripts/reset_multiallelic_format_fields.py ~{multitagged_vcf} <(zcat ~{multi_dup_ids}) > dup.multi.revise.vcf - ${BCFTOOLS} query -f '%ID\n' dup.multi.revise.vcf >> all.multi.revised.list - fi - >>> - - output { - File dup_multi_revise_vcf="dup.multi.revise.vcf" - File all_multi_revised_list_1="all.multi.revised.list" - } -} - -task CleanVcf5_21 { - input { - File multi_del_ids - File multitagged_vcf - File multitagged_vcf_tbi - File dup_multi_revise_vcf - File all_multi_revised_list_1 - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([multi_del_ids, multitagged_vcf, dup_multi_revise_vcf, all_multi_revised_list_1], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # use BCFTOOLS 1.9 - BCFTOOLS=/usr/local/bin/bcftools - - mv ~{all_multi_revised_list_1} all.multi.revised.list - touch del.multi.revise.vcf - if [ $(zcat ~{multi_del_ids}|wc -l) -ge 1 ] - then - /opt/sv-pipeline/04_variant_resolution/scripts/reset_multiallelic_format_fields.py ~{multitagged_vcf} <(zcat ~{multi_del_ids}) > del.multi.revise.vcf - ${BCFTOOLS} query -f '%ID\n' del.multi.revise.vcf >> all.multi.revised.list - fi - - # make sure that the new header includes CN and CNQ format fields if we set any - if [ -s ~{dup_multi_revise_vcf} ] - then - grep '^#' ~{dup_multi_revise_vcf} > new_header.vcf - elif [ -s del.multi.revise.vcf ] - then - grep '^#' del.multi.revise.vcf > new_header.vcf - else - zcat ~{multitagged_vcf} | grep '^#' > new_header.vcf - fi - >>> - - output { - File del_multi_revise_vcf="del.multi.revise.vcf" - File all_multi_revised_list_2="all.multi.revised.list" - File new_header="new_header.vcf" - } -} - - -task CleanVcf5_22 { - 
input { - File multitagged_vcf - File multitagged_vcf_tbi - File dup_multi_revise_vcf - File del_multi_revise_vcf - File all_multi_revised_list_2 - File new_header - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - Float input_size = size([multitagged_vcf, dup_multi_revise_vcf, del_multi_revise_vcf, all_multi_revised_list_2, new_header], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 20.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # use BCFTOOLS 1.9 - BCFTOOLS=/usr/local/bin/bcftools - - # combine the revised variants with the unrevised variants, reheader, resort, and compress - cat <(zcat ~{multitagged_vcf} | fgrep -wvf ~{all_multi_revised_list_2}) \ - <(cat ~{del_multi_revise_vcf} ~{dup_multi_revise_vcf} | grep -v '^#' | awk '!seen[$3]++') \ - |${BCFTOOLS} reheader -h ~{new_header} \ - |vcf-sort \ - |bgzip \ - >multitagged.geno.vcf.gz - >>> - - output { - File multitagged_geno_vcf="multitagged.geno.vcf.gz" - } -} - -task CleanVcf5_23 { - input { - File multitagged_vcf - File multitagged_vcf_tbi - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(multitagged_vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##remove overlapping multi### - zcat ~{multitagged_vcf} \ - |awk -F'\t' '{if ($1~"#" || ($7~"MULTIALLELIC" && ($5=="" || $5==""))) print}' \ - |svtk vcf2bed stdin tmp.bed - cut -f1-5 tmp.bed \ - |gzip \ - >multi.bed.gz - >>> - - output { - File multi_bed="multi.bed.gz" - } -} - -task CleanVcf5_24 { - input { - File multi_bed - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(multi_bed, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##strip out overlapping multiallelics## - bedtools intersect -wa -wb -a ~{multi_bed} -b ~{multi_bed} \ - |awk -F'\t' '{if ($4!=$9 && $3-$2>=$8-$7) print $0; \ - else if ($4!=$9) print $6,$7,$8,$9,$10,$1,$2,$3,$4,$5}' OFS="\t" \ - |sort -u \ - |awk '{print $3-$2,$8-$7,$0}' OFS="\t" \ - |sort -nrk1,1 -k2,2nr \ - |cut -f3- \ - >multi.bed.overlap.txt - >>> - - output { - File multi_bed_overlap="multi.bed.overlap.txt" - } -} - -task CleanVcf5_25 { - input { - File multi_bed_overlap - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(multi_bed_overlap, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - echo "">multi.remove.txt - - while read bed - do - echo "$bed"|cut -d$'\t' -f1-5 >large.bed - echo "$bed"|cut -d$'\t' -f6-10>small.bed - overlap=$(bedtools coverage -a small.bed -b large.bed|awk '{if ($NF>0.50) print "YES";else print "NO"}') - echo $bed|awk '{print $4}' - if [ "$overlap" == "YES" ] && [ $(awk '{print $4}' large.bed|fgrep -wf - multi.remove.txt|wc -l) -eq 0 ] - then - awk '{print $4}' small.bed >>multi.remove.txt - fi - done< ~{multi_bed_overlap} - >>> - - output { - File multi_remove="multi.remove.txt" - } -} - -task CleanVcf5_26 { - input { - File multitagged_geno_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(multitagged_geno_vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # use BCFTOOLS 1.9 - BCFTOOLS=/usr/local/bin/bcftools - - ##get alt tag for multiallelics## - ## produces a file with a row for each distinct multialllic variant ID and copy number combination - ${BCFTOOLS} query -i 'FILTER = "MULTIALLELIC"' -f '[%ID\t%CN\n]' ~{multitagged_geno_vcf} \ - |sort -u >multi.cn.txt - >>> - - output { - File multi_cn="multi.cn.txt" - } -} - -task CleanVcf5_27 { - input { - File multitagged_geno_vcf - File multi_remove - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([multitagged_geno_vcf, multi_remove], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - # use BCFTOOLS 1.9 - BCFTOOLS=/usr/local/bin/bcftools - - ##strip out variants with no genotypes and overlapping multiallelics## - ### Find missing genotype and then add multiallelics that need to be removed### - ##change multiallelics svtype into mCNV## - ##add CN information to ALT column## - zcat ~{multitagged_geno_vcf} \ - |${BCFTOOLS} view -e 'FILTER == "MULTIALLELIC"' \ - |svtk vcf2bed stdin tmp.bed - - awk -F'\t' '{if ($6=="") print $4}' tmp.bed \ - |cat - ~{multi_remove} \ - |sed '/^$/d' \ - |{ fgrep -wvf - <(zcat ~{multitagged_geno_vcf} ) || true; } \ - |awk -F';' '{if ($1~"MULTIALLELIC" && ( $2~"DEL" || $2~"DUP")) $2="SVTYPE=CNV"; print}' OFS=';' \ - |awk '{OFS="\t"; if ($8~"SVTYPE=CNV;") $5=""; print}' \ - |bgzip \ - >cleantagandmulti.vcf.gz - >>> - - output { - File cleantagandmulti_vcf="cleantagandmulti.vcf.gz" - } -} - -task CleanVcf5_28 { - input { - File cleantagandmulti_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(cleantagandmulti_vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - zcat ~{cleantagandmulti_vcf}|awk '{if (($1~"X" || $1~"Y") && $1!~"#") print}'|wc -l > out - >>> - - output { - Int out=read_int("out") - } -} - -task CleanVcf5_29_TRUE_1 { - input { - File famfile - File cleantagandmulti_vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([famfile, cleantagandmulti_vcf], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##Determine columns male columns## - zcat ~{cleantagandmulti_vcf}\ - |egrep ^# \ - |tail -n 1 \ - |tr '\t' '\n' \ - |cat -n - \ - >col.txt - - awk '{if ($5==1) print $2}' ~{famfile} \ - |{ fgrep -wf - col.txt || true; } \ - >malecols.txt - >>> - - output { - File malecols="malecols.txt" - } -} - -task CleanVcf5_29_TRUE_2 { - input { - File sex_chr_revise - File cleantagandmulti_vcf - File malecols - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size([sex_chr_revise, cleantagandmulti_vcf, malecols], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - ##regenotype male calls on sex chr and add 1 to copy state for multialleic check## - - python3 < sexchr.backtoorig.txt.gz - import pysam - import sys - - with open("~{malecols}") as f: - samples = [x.strip().split('\t')[1] for x in f.readlines() if x] - - with open("~{sex_chr_revise}") as f: - vids = set([x.strip() for x in f.readlines() if x]) - - vcf = pysam.VariantFile("~{cleantagandmulti_vcf}") - - for record in vcf: - if record.id not in vids: - continue - for i in samples: - g = record.samples[i] - if g['RD_CN'] is not None and g['RD_CN'] >= 1: - g['RD_CN'] = g['RD_CN'] - 1 - sys.stdout.write(str(record)) - - vcf.close() - CODE + call FindRedundantMultiallelics { + input: + multiallelic_vcfs=MakeCleanGQ.multiallelic_vcf, + find_redundant_sites_script=find_redundant_sites_script, + prefix="~{prefix}.find_redundant_multiallelics", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_override_find_redundant_multiallelics + } - >>> + call Polish { + input: + 
clean_gq_vcfs=MakeCleanGQ.clean_gq_vcf, + no_sample_lists=MakeCleanGQ.no_sample_list, + redundant_multiallelics_list=FindRedundantMultiallelics.redundant_multiallelics_list, + prefix="~{prefix}.polish", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_override_polish + } - output { - File sexchr_backtoorig="sexchr.backtoorig.txt.gz" - } + output { + File polished=Polish.polished + } } -task CleanVcf5_29_TRUE_3 { - input { - File cleantagandmulti_vcf - File sexchr_backtoorig - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - Float input_size = size([cleantagandmulti_vcf, sexchr_backtoorig], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 50.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } +task MakeCleanGQ { + input { + File revise_vcf_lines + File normal_revise_vcf + File ped_file + File sex_chr_revise + File multi_ids + File? outlier_samples_list + File? make_clean_gq_script + String prefix + Int? threads = 2 + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } - command <<< - set -euxo pipefail - cat <(zcat ~{cleantagandmulti_vcf}|fgrep -wvf <(zcat ~{sexchr_backtoorig}|awk '{print $3}' )) \ - <(zcat ~{sexchr_backtoorig} |awk '{if ($1!="") print}' |tr ' ' '\t') \ - |vcf-sort \ - |bgzip \ - >cleansexCN.vcf.gz - >>> + # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed + # generally assume working memory is ~3 * inputs + Float input_size = size( + select_all([revise_vcf_lines, normal_revise_vcf, ped_file, sex_chr_revise, multi_ids, outlier_samples_list]), + "GB") + Float base_disk_gb = 10.0 + + RuntimeAttr runtime_default = object { + mem_gb: 16, + disk_gb: ceil(base_disk_gb + input_size * 5.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } - output { - File cleansexcn_vcf="cleansexCN.vcf.gz" - } + command <<< + set -eu -o pipefail + + ~{if defined(outlier_samples_list) then "ln ~{outlier_samples_list} outliers.txt" else "touch outliers.txt"} + + # put the revise lines into a normal VCF format + bcftools view -h ~{normal_revise_vcf} > header.txt + cat header.txt <(zcat ~{revise_vcf_lines} | grep . 
| tr " " "\t") | bgzip -c > revise.vcf.lines.vcf.gz + + python3 ~{default="/opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_update_records.py" make_clean_gq_script} \ + --threads_per_file ~{threads} \ + revise.vcf.lines.vcf.gz \ + ~{normal_revise_vcf} \ + ~{ped_file} \ + ~{sex_chr_revise} \ + ~{multi_ids} \ + outliers.txt \ + ~{prefix} + + bcftools view -G -O z ~{prefix}.multiallelic.vcf.gz > ~{prefix}.multiallelic.sites.vcf.gz + tabix ~{prefix}.cleanGQ.vcf.gz + >>> + + output { + File clean_gq_vcf=prefix + ".cleanGQ.vcf.gz" + File clean_gq_vcf_idx=prefix + ".cleanGQ.vcf.gz.tbi" + File multiallelic_vcf=prefix + ".multiallelic.sites.vcf.gz" + File no_sample_list = prefix + ".no_called_samples.list" + } } -task CleanVcf5_30 { - input { - File cleansexcn_vcf - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } +task FindRedundantMultiallelics { + input { + Array[File] multiallelic_vcfs + File? find_redundant_sites_script + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } - Float input_size = size(cleansexcn_vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } + # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed + # generally assume working memory is ~3 * inputs + Float input_size = size(multiallelic_vcfs, "GB") + Float base_disk_gb = 10.0 + + RuntimeAttr runtime_default = object { + mem_gb: 16, + disk_gb: ceil(base_disk_gb + input_size * 5.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, 
runtime_default.boot_disk_gb]) + } - command <<< - set -euxo pipefail + command <<< + set -euo pipefail + VCFS="~{write_lines(multiallelic_vcfs)}" + cat $VCFS | awk -F '/' '{print $NF"\t"$0}' | sort -k1,1V | awk '{print $2}' > vcfs_sorted.list + bcftools concat --no-version --output-type z --file-list vcfs_sorted.list --output multiallelic.vcf.gz - mv ~{cleansexcn_vcf} cleanGQ.vcf.gz - ##find blank variants with no samples## - svtk vcf2bed cleanGQ.vcf.gz tmp.bed + python3 ~{default="/opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part5_find_redundant_multiallelics.py" find_redundant_sites_script} \ + multiallelic.vcf.gz \ + ~{prefix}.list - awk -F'\t' '{if ($5!~"CN" && $6=="") print $4}' tmp.bed \ - >blankcheck.ids.txt - >>> + >>> - output { - File blankcheck_ids="blankcheck.ids.txt" - } + output { + File redundant_multiallelics_list="~{prefix}.list" + } } -task CleanVcf5_31 { - input { - File cleansexcn_vcf - File blankcheck_ids - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - Float input_size = size([cleansexcn_vcf, blankcheck_ids], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } +task Polish { + input { + Array[File] clean_gq_vcfs + 
Array[File] no_sample_lists + File redundant_multiallelics_list + String prefix + String sv_pipeline_docker + Int threads = 2 + RuntimeAttr? runtime_attr_override + } - command <<< - set -euxo pipefail - mv ~{cleansexcn_vcf} cleanGQ.vcf.gz - ##Fix header## - ##get header to clean## - ##add new filters## - zcat cleanGQ.vcf.gz \ - |awk '{if ($1~"##" && NR>1) print}' \ - |{ fgrep -v "MULTIALLELIC" || true; } \ - |awk '{if (NR==2) print $0 "\n" "##FILTER=" ;else print}' \ - |awk '{if (NR==2) print $0 "\n" "##ALT=" ;else print}' \ - |sort -k1,1 \ - |{ egrep -v "CIPOS|CIEND|RMSSTD|EVENT|INFO=polished.vcf.gz - >>> + # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed + # generally assume working memory is ~3 * inputs + Float input_size = size(clean_gq_vcfs, "GB") + Float base_disk_gb = 10.0 + + RuntimeAttr runtime_default = object { + mem_gb: 16, + disk_gb: ceil(base_disk_gb + input_size * 5.0), + cpu_cores: 4, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } - output { - File polished="polished.vcf.gz" - } + command <<< + set -euo pipefail + + VCFS="~{write_lines(clean_gq_vcfs)}" + cat $VCFS | awk -F '/' '{print $NF"\t"$0}' | sort -k1,1V | awk '{print $2}' > vcfs_sorted.list + cat ~{redundant_multiallelics_list} ~{sep=" " no_sample_lists} > 
ids_to_remove.list + /usr/local/bin/bcftools concat --no-version --output-type u --file-list vcfs_sorted.list | \ + /usr/local/bin/bcftools view --no-version \ + --exclude 'ID=@ids_to_remove.list' \ + --output-type z -o polished.need_reheader.vcf.gz --threads ~{threads} + + # do the last bit of header cleanup + bcftools view -h polished.need_reheader.vcf.gz | awk 'NR == 1' > new_header.vcf + bcftools view -h polished.need_reheader.vcf.gz \ + | awk 'NR > 1' \ + | egrep -v "CIPOS|CIEND|RMSSTD|EVENT|INFO=> new_header.vcf + bcftools reheader polished.need_reheader.vcf.gz -h new_header.vcf -o ~{prefix}.vcf.gz + >>> + + output { + File polished="~{prefix}.vcf.gz" + } } diff --git a/wdl/CleanVcfChromosome.wdl b/wdl/CleanVcfChromosome.wdl index 47e02f193..2a1230da4 100644 --- a/wdl/CleanVcfChromosome.wdl +++ b/wdl/CleanVcfChromosome.wdl @@ -2,10 +2,9 @@ version 1.0 import "Structs.wdl" import "TasksMakeCohortVcf.wdl" as MiniTasks -import "CleanVcf1.wdl" as c1 import "CleanVcf1b.wdl" as c1b import "CleanVcf5.wdl" as c5 -import "DropRedundantCNVs.wdl" as drc +import "HailMerge.wdl" as HailMerge workflow CleanVcfChromosome { input { @@ -19,99 +18,163 @@ workflow CleanVcfChromosome { File bothsides_pass_list Int min_records_per_shard_step1 Int samples_per_step2_shard + Int clean_vcf1b_records_per_shard + Int clean_vcf5_records_per_shard + Int? clean_vcf5_threads_per_task File? outlier_samples_list + Int? max_samples_per_shard_step3 + + String chr_x + String chr_y + + Boolean use_hail + String? gcs_project String linux_docker String sv_base_mini_docker String sv_pipeline_docker + String sv_pipeline_hail_docker + String sv_pipeline_updates_docker # overrides for local tasks RuntimeAttr? runtime_override_clean_vcf_1a - RuntimeAttr? runtime_override_clean_vcf_1b RuntimeAttr? runtime_override_clean_vcf_2 RuntimeAttr? runtime_override_clean_vcf_3 RuntimeAttr? runtime_override_clean_vcf_4 - RuntimeAttr? runtime_override_clean_vcf_5 + RuntimeAttr? 
runtime_override_clean_vcf_5_scatter + RuntimeAttr? runtime_override_clean_vcf_5_make_cleangq + RuntimeAttr? runtime_override_clean_vcf_5_find_redundant_multiallelics + RuntimeAttr? runtime_override_clean_vcf_5_polish RuntimeAttr? runtime_override_stitch_fragmented_cnvs RuntimeAttr? runtime_override_final_cleanup + # Clean vcf 1b + RuntimeAttr? runtime_attr_override_subset_large_cnvs_1b + RuntimeAttr? runtime_attr_override_sort_bed_1b + RuntimeAttr? runtime_attr_override_intersect_bed_1b + RuntimeAttr? runtime_attr_override_build_dict_1b + RuntimeAttr? runtime_attr_override_scatter_1b + RuntimeAttr? runtime_attr_override_filter_vcf_1b + RuntimeAttr? runtime_override_concat_vcfs_1b + RuntimeAttr? runtime_override_cat_multi_cnvs_1b + + RuntimeAttr? runtime_override_preconcat_step1 + RuntimeAttr? runtime_override_hail_merge_step1 + RuntimeAttr? runtime_override_fix_header_step1 + + RuntimeAttr? runtime_override_preconcat_drc + RuntimeAttr? runtime_override_hail_merge_drc + RuntimeAttr? runtime_override_fix_header_drc + # overrides for MiniTasks RuntimeAttr? runtime_override_split_vcf_to_clean - RuntimeAttr? runtime_override_combine_step_1_vcfs RuntimeAttr? runtime_override_combine_step_1_sex_chr_revisions RuntimeAttr? runtime_override_split_include_list RuntimeAttr? runtime_override_combine_clean_vcf_2 RuntimeAttr? runtime_override_combine_revised_4 RuntimeAttr? runtime_override_combine_multi_ids_4 + RuntimeAttr? runtime_override_drop_redundant_cnvs + RuntimeAttr? runtime_override_combine_step_1_vcfs + RuntimeAttr? 
runtime_override_sort_drop_redundant_cnvs + } call MiniTasks.SplitVcf as SplitVcfToClean { input: vcf=vcf, contig=contig, - prefix="~{prefix}.~{contig}.shard_", + prefix="~{prefix}.shard_", n_shards=max_shards_per_chrom_step1, min_vars_per_shard=min_records_per_shard_step1, sv_base_mini_docker=sv_base_mini_docker, runtime_attr_override=runtime_override_split_vcf_to_clean } - scatter ( vcf_shard in SplitVcfToClean.vcf_shards ) { - call c1.CleanVcf1 as CleanVcf1a { + scatter ( i in range(length(SplitVcfToClean.vcf_shards)) ) { + call CleanVcf1a { input: - vcf=vcf_shard, - background_list=background_list, - ped_file=ped_file, - sv_pipeline_docker=sv_pipeline_docker, - linux_docker=linux_docker, + vcf=SplitVcfToClean.vcf_shards[i], + prefix="~{prefix}.clean_vcf_1.shard_~{i}", + background_fail_list=background_list, bothsides_pass_list=bothsides_pass_list, + ped_file=ped_file, allosome_fai=allosome_fai, + chr_x=chr_x, + chr_y=chr_y, + sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_clean_vcf_1a } } - call MiniTasks.ConcatVcfs as CombineStep1Vcfs { - input: - vcfs=CleanVcf1a.intermediate_vcf, - vcfs_idx=CleanVcf1a.intermediate_vcf_idx, - naive=true, - generate_index=false, - outfile_prefix=prefix + ".cleanVCF_step1.intermediate_vcf.merged", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_combine_step_1_vcfs + if (use_hail) { + call HailMerge.HailMerge as CombineStep1VcfsHail { + input: + vcfs=CleanVcf1a.intermediate_vcf, + prefix="~{prefix}.combine_step_1_vcfs", + gcs_project=gcs_project, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, + runtime_override_preconcat=runtime_override_preconcat_step1, + runtime_override_hail_merge=runtime_override_hail_merge_step1, + runtime_override_fix_header=runtime_override_fix_header_step1 + } + } + if (!use_hail) { + call MiniTasks.ConcatVcfs as CombineStep1Vcfs { + input: + 
vcfs=CleanVcf1a.intermediate_vcf, + vcfs_idx=CleanVcf1a.intermediate_vcf_idx, + naive=true, + generate_index=false, + outfile_prefix="~{prefix}.combine_step_1_vcfs", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_combine_step_1_vcfs + } } call MiniTasks.CatUncompressedFiles as CombineStep1SexChrRevisions { input: shards=CleanVcf1a.sex, - outfile_name=prefix + ".cleanVCF_step1.sexchr_revise.merged.txt", + outfile_name="~{prefix}.combine_step_1_sex_chr_revisions.txt", sv_base_mini_docker=sv_base_mini_docker, runtime_attr_override=runtime_override_combine_step_1_sex_chr_revisions } call c1b.CleanVcf1b { input: - intermediate_vcf=CombineStep1Vcfs.concat_vcf, + intermediate_vcf=select_first([CombineStep1Vcfs.concat_vcf, CombineStep1VcfsHail.merged_vcf]), + prefix="~{prefix}.clean_vcf_1b", + records_per_shard=clean_vcf1b_records_per_shard, sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_clean_vcf_1b + sv_pipeline_updates_docker=sv_pipeline_updates_docker, + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override_subset_large_cnvs=runtime_attr_override_subset_large_cnvs_1b, + runtime_attr_override_sort_bed=runtime_attr_override_sort_bed_1b, + runtime_attr_override_intersect_bed=runtime_attr_override_intersect_bed_1b, + runtime_attr_override_build_dict=runtime_attr_override_build_dict_1b, + runtime_attr_override_scatter=runtime_attr_override_scatter_1b, + runtime_attr_override_filter_vcf=runtime_attr_override_filter_vcf_1b, + runtime_override_concat_vcfs=runtime_override_concat_vcfs_1b, + runtime_override_cat_multi_cnvs=runtime_override_cat_multi_cnvs_1b } call MiniTasks.SplitUncompressed as SplitIncludeList { input: whole_file=CleanVcf1a.include_list[0], lines_per_shard=samples_per_step2_shard, - shard_prefix="includeexclude.", + shard_prefix="~{prefix}.split_include_list.", sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_split_include_list } - scatter ( 
included_interval in SplitIncludeList.shards ){ - call CleanVcf2{ + scatter ( i in range(length(SplitIncludeList.shards)) ){ + call CleanVcf2 { input: normal_revise_vcf=CleanVcf1b.normal, - include_list=included_interval, + prefix="~{prefix}.clean_vcf_2.shard_~{i}", + include_list=SplitIncludeList.shards[i], multi_cnvs=CleanVcf1b.multi, - vcftools_idx=CleanVcf1b.vcftools_idx, sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_clean_vcf_2 } @@ -120,6 +183,7 @@ workflow CleanVcfChromosome { call MiniTasks.CatUncompressedFiles as CombineCleanVcf2 { input: shards=CleanVcf2.out, + outfile_name="~{prefix}.combine_clean_vcf_2.txt", sv_base_mini_docker=sv_base_mini_docker, runtime_attr_override=runtime_override_combine_clean_vcf_2 } @@ -127,15 +191,17 @@ workflow CleanVcfChromosome { call CleanVcf3 { input: rd_cn_revise=CombineCleanVcf2.outfile, + max_samples_shard = max_samples_per_shard_step3, sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_clean_vcf_3 } - scatter ( rd_cn_revise in CleanVcf3.shards ){ + scatter ( i in range(length(CleanVcf3.shards)) ){ call CleanVcf4 { input: - rd_cn_revise=rd_cn_revise, + rd_cn_revise=CleanVcf3.shards[i], normal_revise_vcf=CleanVcf1b.normal, + prefix="~{prefix}.clean_vcf_4.shard_~{i}", sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_clean_vcf_4 } @@ -144,7 +210,7 @@ workflow CleanVcfChromosome { call MiniTasks.CatUncompressedFiles as CombineRevised4 { input: shards=CleanVcf4.out, - outfile_name="revise.vcf.lines.txt.gz", + outfile_name="~{prefix}.combine_revised_4.txt.gz", sv_base_mini_docker=sv_base_mini_docker, runtime_attr_override=runtime_override_combine_revised_4 } @@ -152,7 +218,7 @@ workflow CleanVcfChromosome { call MiniTasks.CatUncompressedFiles as CombineMultiIds4 { input: shards=CleanVcf4.multi_ids, - outfile_name="multi.geno.ids.txt.gz", + outfile_name="~{prefix}.combine_multi_ids_4.txt.gz", sv_base_mini_docker=sv_base_mini_docker, 
runtime_attr_override=runtime_override_combine_multi_ids_4 } @@ -165,22 +231,56 @@ workflow CleanVcfChromosome { sex_chr_revise=CombineStep1SexChrRevisions.outfile, multi_ids=CombineMultiIds4.outfile, outlier_samples_list=outlier_samples_list, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_clean_vcf_5 + contig=contig, + prefix="~{prefix}.clean_vcf_5", + records_per_shard=clean_vcf5_records_per_shard, + threads_per_task=clean_vcf5_threads_per_task, + sv_pipeline_docker=sv_pipeline_updates_docker, + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override_scatter=runtime_override_clean_vcf_5_scatter, + runtime_attr_override_make_cleangq=runtime_override_clean_vcf_5_make_cleangq, + runtime_attr_override_find_redundant_multiallelics=runtime_override_clean_vcf_5_find_redundant_multiallelics, + runtime_attr_override_polish=runtime_override_clean_vcf_5_polish } - call drc.DropRedundantCNVs { + call DropRedundantCnvs { input: vcf=CleanVcf5.polished, + prefix="~{prefix}.drop_redundant_cnvs", contig=contig, - sv_pipeline_docker=sv_pipeline_docker + sv_pipeline_docker=sv_pipeline_updates_docker, + runtime_attr_override=runtime_override_drop_redundant_cnvs + } + + if (use_hail) { + call HailMerge.HailMerge as SortDropRedundantCnvsHail { + input: + vcfs=[DropRedundantCnvs.out], + prefix="~{prefix}.drop_redundant_cnvs.sorted", + gcs_project=gcs_project, + reset_cnv_gts=true, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, + runtime_override_preconcat=runtime_override_preconcat_drc, + runtime_override_hail_merge=runtime_override_hail_merge_drc, + runtime_override_fix_header=runtime_override_fix_header_drc + } + } + if (!use_hail) { + call MiniTasks.SortVcf as SortDropRedundantCnvs { + input: + vcf=DropRedundantCnvs.out, + outfile_prefix="~{prefix}.drop_redundant_cnvs.sorted", + sv_base_mini_docker=sv_base_mini_docker, + 
runtime_attr_override=runtime_override_sort_drop_redundant_cnvs + } } call StitchFragmentedCnvs { input: - vcf=DropRedundantCNVs.cleaned_vcf_shard, - contig=contig, - prefix=prefix, + vcf=select_first([SortDropRedundantCnvs.out, SortDropRedundantCnvsHail.merged_vcf]), + prefix="~{prefix}.stitch_fragmented_cnvs", sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_stitch_fragmented_cnvs } @@ -189,7 +289,7 @@ workflow CleanVcfChromosome { input: vcf=StitchFragmentedCnvs.stitched_vcf_shard, contig=contig, - prefix=prefix, + prefix="~{prefix}.final_cleanup", sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_final_cleanup @@ -202,25 +302,89 @@ workflow CleanVcfChromosome { } +task CleanVcf1a { + input { + File vcf + String prefix + File background_fail_list + File bothsides_pass_list + File ped_file + File allosome_fai + String chr_x + String chr_y + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size([vcf, background_fail_list, bothsides_pass_list], "GB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + + touch ~{prefix}.includelist.txt + touch 
~{prefix}.sexchr.revise.txt + + # outputs + # includelist.txt: the names of all the samples in the input vcf + # sexchr.revise.txt: the names of the events where genotypes got tweaked on allosomes + # stdout: a revised vcf + java -jar $CLEAN_VCF_PART_1_JAR \ + ~{vcf} \ + ~{ped_file} \ + ~{chr_x} \ + ~{chr_y} \ + ~{background_fail_list} \ + ~{bothsides_pass_list} \ + ~{prefix}.includelist.txt \ + ~{prefix}.sexchr.revise.txt \ + | bgzip \ + > ~{prefix}.vcf.gz + tabix ~{prefix}.vcf.gz + >>> + + output { + File include_list="~{prefix}.includelist.txt" + File sex="~{prefix}.sexchr.revise.txt" + File intermediate_vcf="~{prefix}.vcf.gz" + File intermediate_vcf_idx="~{prefix}.vcf.gz.tbi" + } +} + task CleanVcf2 { input { File normal_revise_vcf + String prefix File include_list File multi_cnvs - File vcftools_idx String sv_pipeline_docker RuntimeAttr? runtime_attr_override } # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed # generally assume working memory is ~3 * inputs - Float input_size = size([normal_revise_vcf, include_list, multi_cnvs, vcftools_idx], "GB") + Float input_size = size([normal_revise_vcf, include_list, multi_cnvs], "GB") Float base_disk_gb = 10.0 - Float base_mem_gb = 4.0 - Float input_mem_scale = 3.0 - Float input_disk_scale = 5.0 + Float input_disk_scale = 3.0 RuntimeAttr runtime_default = object { - mem_gb: base_mem_gb + input_size * input_mem_scale, + mem_gb: 2.0, disk_gb: ceil(base_disk_gb + input_size * input_disk_scale), cpu_cores: 1, preemptible_tries: 3, @@ -241,36 +405,34 @@ task CleanVcf2 { command <<< set -eu -o pipefail + bcftools index ~{normal_revise_vcf} /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part2.sh \ ~{normal_revise_vcf} \ ~{include_list} \ ~{multi_cnvs} \ - "output.txt" + "~{prefix}.txt" >>> output { - File out="output.txt" + File out="~{prefix}.txt" } } -task CleanVcf3{ +task CleanVcf3 { input { File rd_cn_revise + Int? 
max_samples_shard String sv_pipeline_docker RuntimeAttr? runtime_attr_override } - + Int max_samples_shard_ = select_first([max_samples_shard, 7000]) # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed # generally assume working memory is ~3 * inputs Float input_size = size(rd_cn_revise, "GB") - Float base_disk_gb = 10.0 - Float base_mem_gb = 2.0 - Float input_mem_scale = 3.0 - Float input_disk_scale = 5.0 RuntimeAttr runtime_default = object { - mem_gb: base_mem_gb + input_size * input_mem_scale, - disk_gb: ceil(base_disk_gb + input_size * input_disk_scale), + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 2.0), cpu_cores: 1, preemptible_tries: 3, max_retries: 1, @@ -289,13 +451,9 @@ task CleanVcf3{ command <<< set -euo pipefail - - /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part3.sh ~{rd_cn_revise} - + python /opt/sv-pipeline/04_variant_resolution/scripts/clean_vcf_part3.py ~{rd_cn_revise} -s ~{max_samples_shard_} # Ensure there is at least one shard - if [ -z "$(ls -A shards/)" ]; then - touch shards/out.0_0.txt - fi + touch shards/out.0_0.txt >>> output { @@ -308,13 +466,14 @@ task CleanVcf4 { input { File rd_cn_revise File normal_revise_vcf + String prefix String sv_pipeline_docker RuntimeAttr? 
runtime_attr_override } Float input_size = size([rd_cn_revise, normal_revise_vcf], "GB") RuntimeAttr runtime_default = object { - mem_gb: 2.0 + input_size * 3.0, + mem_gb: 2.0, disk_gb: 50, cpu_cores: 1, preemptible_tries: 3, @@ -353,7 +512,7 @@ task CleanVcf4 { vid_sample_cn_map[vid].append(tuple(tokens[1:])) # Traverse VCF and replace genotypes - with open("revise.vcf.lines.txt", "w") as f: + with open("~{prefix}.revise_vcf_lines.txt", "w") as f: vcf = pysam.VariantFile(VCF_FILE) num_vcf_records = 0 for record in vcf: @@ -406,53 +565,46 @@ task CleanVcf4 { gt = s['SR_GT'] if gt > 2: num_gt_over_2 += 1 - if record.id == "gnomad-sv-v3-TEST-SMALL.chr22_BND_chr22_173": - print("{} {}".format(sid, num_gt_over_2)) - print("{} {} {} {}".format(s['PE_GT'], s['PE_GQ'], s['SR_GT'], s['SR_GQ'])) if num_gt_over_2 > max_vf: multi_geno_ids.add(record.id) vcf.close() multi_geno_ids = sorted(list(multi_geno_ids)) - with open("multi.geno.ids.txt", "w") as f: + with open("~{prefix}.multi_geno_ids.txt", "w") as f: for vid in multi_geno_ids: f.write(vid + "\n") CODE - bgzip revise.vcf.lines.txt - gzip multi.geno.ids.txt + bgzip ~{prefix}.revise_vcf_lines.txt + gzip ~{prefix}.multi_geno_ids.txt >>> output { - File out="revise.vcf.lines.txt.gz" - File multi_ids="multi.geno.ids.txt.gz" + File out="~{prefix}.revise_vcf_lines.txt.gz" + File multi_ids="~{prefix}.multi_geno_ids.txt.gz" } } -# Stitch fragmented RD-only calls found in 100% of the same samples -task StitchFragmentedCnvs { +# Remove CNVs that are redundant with CPX events or other CNVs +task DropRedundantCnvs { input { File vcf - String contig String prefix + String contig String sv_pipeline_docker RuntimeAttr? 
runtime_attr_override } - - String stitched_vcf_name = contig + ".shard.fragmented_CNVs_stitched.vcf.gz" - # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed - # generally assume working memory is ~3 * inputs - Float input_size = size(vcf, "GB") - Float base_disk_gb = 10.0 - Float base_mem_gb = 2.0 - Float input_mem_scale = 3.0 - Float input_disk_scale = 5.0 + Float input_size = size(vcf, "GiB") + # disk is cheap, read/write speed is proportional to disk size, and disk IO is a significant time factor: + # in tests on large VCFs, memory usage is ~1.0 * input VCF size + # the biggest disk usage is at the end of the task, with input + output VCF on disk + Int cpu_cores = 2 # speed up compression / decompression of VCFs RuntimeAttr runtime_default = object { - mem_gb: base_mem_gb + input_size * input_mem_scale, - disk_gb: ceil(base_disk_gb + input_size * input_disk_scale), - cpu_cores: 1, + mem_gb: 3.75 + input_size * 1.5, + disk_gb: ceil(100.0 + input_size * 2.0), + cpu_cores: cpu_cores, preemptible_tries: 3, max_retries: 1, boot_disk_gb: 10 @@ -469,19 +621,64 @@ task StitchFragmentedCnvs { } command <<< - set -eu -o pipefail - - /opt/sv-pipeline/04_variant_resolution/scripts/stitch_fragmented_CNVs.sh \ - ~{vcf} \ - "tmp_~{stitched_vcf_name}" - - /opt/sv-pipeline/04_variant_resolution/scripts/stitch_fragmented_CNVs.sh \ - "tmp_~{stitched_vcf_name}" \ - "~{stitched_vcf_name}" + set -euo pipefail + /opt/sv-pipeline/04_variant_resolution/scripts/resolve_cpx_cnv_redundancies.py \ + ~{vcf} ~{prefix}.vcf.gz --temp-dir ./tmp >>> output { - File stitched_vcf_shard = stitched_vcf_name + File out = "~{prefix}.vcf.gz" + } +} + + +# Stitch fragmented RD-only calls found in 100% of the same samples +task StitchFragmentedCnvs { + input { + File vcf + String prefix + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + + Float input_size = size(vcf, "GB") + RuntimeAttr runtime_default = object { + mem_gb: 7.5, + disk_gb: ceil(10.0 + input_size * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + Float mem_gb = select_first([runtime_override.mem_gb, runtime_default.mem_gb]) + Int java_mem_mb = ceil(mem_gb * 1000 * 0.8) + + runtime { + memory: "~{mem_gb} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + echo "First pass..." + java -Xmx~{java_mem_mb}M -jar ${STITCH_JAR} 0.2 200000 0.2 ~{vcf} \ + | bgzip \ + > tmp.vcf.gz + rm ~{vcf} + echo "Second pass..." + java -Xmx~{java_mem_mb}M -jar ${STITCH_JAR} 0.2 200000 0.2 tmp.vcf.gz \ + | bgzip \ + > ~{prefix}.vcf.gz + >>> + + output { + File stitched_vcf_shard = "~{prefix}.vcf.gz" } } @@ -495,8 +692,6 @@ task FinalCleanup { String sv_pipeline_docker RuntimeAttr? runtime_attr_override } - - String cleaned_shard_name = prefix + "." 
+ contig + ".final_cleanup.vcf.gz" # generally assume working disk size is ~2 * inputs, and outputs are ~2 *inputs, and inputs are not removed # generally assume working memory is ~3 * inputs @@ -532,15 +727,14 @@ task FinalCleanup { --prefix ~{prefix} \ ~{vcf} stdout \ | fgrep -v "##INFO= "~{cleaned_shard_name}" - tabix ~{cleaned_shard_name} + > ~{prefix}.vcf.gz + tabix ~{prefix}.vcf.gz >>> output { - File final_cleaned_shard = cleaned_shard_name - File final_cleaned_shard_idx = cleaned_shard_name + ".tbi" + File final_cleaned_shard = "~{prefix}.vcf.gz" + File final_cleaned_shard_idx = "~{prefix}.vcf.gz.tbi" } } \ No newline at end of file diff --git a/wdl/ClusterSingleChromosome.wdl b/wdl/ClusterSingleChromosome.wdl index a1e18bc8b..4d8adcbe0 100644 --- a/wdl/ClusterSingleChromosome.wdl +++ b/wdl/ClusterSingleChromosome.wdl @@ -4,13 +4,17 @@ version 1.0 import "TasksMakeCohortVcf.wdl" as MiniTasks import "ShardedCluster.wdl" as ShardedCluster +import "HailMerge.wdl" as HailMerge # Workflow to perform sharding & clustering of a vcf for a single chromosome workflow ClusterSingleChrom { input { File vcf File vcf_index + Int num_samples String contig + String cohort_name + String evidence_type String prefix Int dist Float frac @@ -18,23 +22,34 @@ workflow ClusterSingleChrom { File? exclude_list Int sv_size Array[String] sv_types + File empty_file + + Boolean use_hail + String? gcs_project String sv_pipeline_docker + String sv_pipeline_hail_docker String sv_base_mini_docker - # overrides for local tasks - RuntimeAttr? runtime_override_concat_svtypes - # overrides for MiniTasks RuntimeAttr? runtime_override_subset_sv_type + RuntimeAttr? runtime_override_cat_vid_lists_chrom # overrides for ShardedCluster - RuntimeAttr? runtime_override_shard_vcf_precluster + RuntimeAttr? runtime_override_shard_clusters + RuntimeAttr? runtime_override_shard_vids RuntimeAttr? runtime_override_pull_vcf_shard RuntimeAttr? runtime_override_svtk_vcf_cluster RuntimeAttr? 
runtime_override_get_vcf_header_with_members_info_line RuntimeAttr? runtime_override_concat_svtypes RuntimeAttr? runtime_override_concat_sharded_cluster + RuntimeAttr? runtime_override_cat_vid_lists_sharded + RuntimeAttr? runtime_override_make_sites_only + RuntimeAttr? runtime_override_sort_merged_vcf + + RuntimeAttr? runtime_override_preconcat_sharded_cluster + RuntimeAttr? runtime_override_hail_merge_sharded_cluster + RuntimeAttr? runtime_override_fix_header_sharded_cluster } #Scatter over svtypes @@ -53,105 +68,45 @@ workflow ClusterSingleChrom { } #For each svtype, intelligently shard VCF for clustering - call ShardedCluster.ShardedCluster as ShardedCluster { + call ShardedCluster.ShardedCluster { input: vcf=SubsetSvType.filtered_vcf, + num_samples=num_samples, dist=dist, frac=frac, prefix="~{prefix}.~{sv_type}", + cohort_name=cohort_name, contig=contig, + evidence_type=evidence_type, sv_type=sv_type, sample_overlap=sample_overlap, exclude_list=exclude_list, sv_size=sv_size, sv_types=sv_types, + empty_file=empty_file, + use_hail=use_hail, + gcs_project=gcs_project, sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, sv_base_mini_docker=sv_base_mini_docker, - runtime_override_shard_vcf_precluster=runtime_override_shard_vcf_precluster, + runtime_override_shard_clusters=runtime_override_shard_clusters, + runtime_override_shard_vids=runtime_override_shard_vids, runtime_override_pull_vcf_shard=runtime_override_pull_vcf_shard, runtime_override_svtk_vcf_cluster=runtime_override_svtk_vcf_cluster, runtime_override_get_vcf_header_with_members_info_line=runtime_override_get_vcf_header_with_members_info_line, - runtime_override_concat_sharded_cluster=runtime_override_concat_sharded_cluster - } - call RenameVariants { - input: - vcf=ShardedCluster.clustered_vcf, - vcf_index=ShardedCluster.clustered_vcf_idx, - prefix=prefix, - contig=contig, - sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_concat_svtypes 
+ runtime_override_concat_sharded_cluster=runtime_override_concat_sharded_cluster, + runtime_override_cat_vid_lists_sharded=runtime_override_cat_vid_lists_sharded, + runtime_override_make_sites_only=runtime_override_make_sites_only, + runtime_override_sort_merged_vcf=runtime_override_sort_merged_vcf, + runtime_override_preconcat_sharded_cluster=runtime_override_preconcat_sharded_cluster, + runtime_override_hail_merge_sharded_cluster=runtime_override_hail_merge_sharded_cluster, + runtime_override_fix_header_sharded_cluster=runtime_override_fix_header_sharded_cluster } } - #Merge svtypes - call MiniTasks.ConcatVcfs as ConcatSvTypes { - input: - vcfs=RenameVariants.out, - vcfs_idx=RenameVariants.out_index, - allow_overlaps=true, - outfile_prefix="~{prefix}.~{contig}.precluster_concat", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_concat_svtypes - } - #Output clustered vcf output { - File clustered_vcf = ConcatSvTypes.concat_vcf - File clustered_vcf_idx = ConcatSvTypes.concat_vcf_idx - } -} - -task RenameVariants { - input { - File vcf - File vcf_index - String prefix - String contig - - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - String vcf_name = prefix + "." 
+ contig + ".renamed.vcf.gz" - - # when filtering/sorting/etc, memory usage will likely go up (much of the data will have to - # be held in memory or disk while working, potentially in a form that takes up more space) - Float input_size = size(vcf, "GiB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + 2.0 * input_size), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - - /opt/sv-pipeline/04_variant_resolution/scripts/rename_after_vcfcluster.py \ - --chrom ~{contig} \ - --prefix ~{prefix} \ - ~{vcf} - \ - | bgzip -c \ - > ~{vcf_name} - - tabix -p vcf -f ~{vcf_name} - >>> - - output { - File out = vcf_name - File out_index = vcf_name + ".tbi" + Array[File] clustered_vcfs = ShardedCluster.clustered_vcf + Array[File] clustered_vcf_indexes = ShardedCluster.clustered_vcf_idx } } diff --git a/wdl/CombineBatches.wdl b/wdl/CombineBatches.wdl index e42525ae3..66be86168 100644 --- a/wdl/CombineBatches.wdl +++ b/wdl/CombineBatches.wdl @@ -1,7 +1,12 @@ version 1.0 +import "CombineSRBothsidePass.wdl" as CombineSRBothsidePass import "VcfClusterSingleChromsome.wdl" as VcfClusterContig import "TasksMakeCohortVcf.wdl" as MiniTasks +import "HailMerge.wdl" as HailMerge +import "HarmonizeHeaders.wdl" as HarmonizeHeaders +import 
"MergePesrDepth.wdl" as MergePesrDepth +import "Utils.wdl" as Utils workflow CombineBatches { input { @@ -24,19 +29,31 @@ workflow CombineBatches { File empty_file + Boolean use_hail = false + String? gcs_project + String sv_base_mini_docker String sv_pipeline_docker + String sv_pipeline_hail_docker # overrides for local tasks RuntimeAttr? runtime_override_update_sr_list RuntimeAttr? runtime_override_merge_pesr_depth + RuntimeAttr? runtime_override_reheader + RuntimeAttr? runtime_override_pull_header # overrides for mini tasks - RuntimeAttr? runtime_override_clean_bothside_pass + RuntimeAttr? runtime_attr_get_non_ref_vids + RuntimeAttr? runtime_attr_calculate_support_frac RuntimeAttr? runtime_override_clean_background_fail RuntimeAttr? runtime_override_concat - RuntimeAttr? runtime_override_sort_pesr_depth_merged_vcf RuntimeAttr? runtime_override_concat_pesr_depth + RuntimeAttr? runtime_override_update_fix_pesr_header + RuntimeAttr? runtime_override_count_samples + RuntimeAttr? runtime_override_preconcat_pesr_depth + RuntimeAttr? runtime_override_hail_merge_pesr_depth + RuntimeAttr? runtime_override_fix_header_pesr_depth + RuntimeAttr? runtime_override_concat_large_pesr_depth # overrides for VcfClusterContig RuntimeAttr? runtime_override_localize_vcfs @@ -46,24 +63,53 @@ workflow CombineBatches { RuntimeAttr? runtime_override_subset_bothside_pass RuntimeAttr? runtime_override_subset_background_fail RuntimeAttr? runtime_override_subset_sv_type - RuntimeAttr? runtime_override_shard_vcf_precluster + RuntimeAttr? runtime_override_shard_clusters + RuntimeAttr? runtime_override_shard_vids RuntimeAttr? runtime_override_pull_vcf_shard RuntimeAttr? runtime_override_svtk_vcf_cluster RuntimeAttr? runtime_override_get_vcf_header_with_members_info_line RuntimeAttr? runtime_override_concat_vcf_cluster RuntimeAttr? runtime_override_concat_svtypes RuntimeAttr? runtime_override_concat_sharded_cluster + RuntimeAttr? runtime_override_make_sites_only + RuntimeAttr? 
runtime_override_sort_merged_vcf_cluster + RuntimeAttr? runtime_override_preconcat_sharded_cluster + RuntimeAttr? runtime_override_hail_merge_sharded_cluster + RuntimeAttr? runtime_override_fix_header_sharded_cluster + + # overerides for merge pesr depth + RuntimeAttr? runtime_override_shard_clusters_mpd + RuntimeAttr? runtime_override_shard_vids_mpd + RuntimeAttr? runtime_override_pull_vcf_shard_mpd + RuntimeAttr? runtime_override_merge_pesr_depth_mpd + + RuntimeAttr? runtime_override_sort_merged_vcf_mpd + RuntimeAttr? runtime_override_subset_small_mpd + RuntimeAttr? runtime_override_subset_large_mpd + RuntimeAttr? runtime_override_make_sites_only_mpd + RuntimeAttr? runtime_override_concat_large_pesr_depth_mpd + RuntimeAttr? runtime_override_concat_shards_mpd + + RuntimeAttr? runtime_override_preconcat_large_pesr_depth_mpd + RuntimeAttr? runtime_override_hail_merge_large_pesr_depth_mpd + RuntimeAttr? runtime_override_fix_header_large_pesr_depth_mpd + + RuntimeAttr? runtime_override_preconcat_pesr_depth_shards_mpd + RuntimeAttr? runtime_override_hail_merge_pesr_depth_shards_mpd + RuntimeAttr? 
runtime_override_fix_header_pesr_depth_shards_mpd + } # Preprocess some inputs - Int num_pass_lines=length(raw_sr_bothside_pass_files) - call MiniTasks.CatUncompressedFiles as CleanBothsidePass { + call CombineSRBothsidePass.CombineSRBothsidePass { input: - shards=raw_sr_bothside_pass_files, - filter_command="sort | uniq -c | awk -v OFS='\\t' '{print $1/~{num_pass_lines}, $2}'", - outfile_name="cohort_sr_genotyping_bothside_pass_list.txt", + pesr_vcfs=pesr_vcfs, + raw_sr_bothside_pass_files=raw_sr_bothside_pass_files, + prefix="~{cohort_name}.sr_bothside_pass", sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_clean_bothside_pass + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_get_non_ref_vids=runtime_attr_get_non_ref_vids, + runtime_attr_calculate_support_frac=runtime_attr_calculate_support_frac } Float min_background_fail_first_col = min_sr_background_fail_batches * length(raw_sr_background_fail_files) @@ -71,11 +117,18 @@ workflow CombineBatches { input: shards=raw_sr_background_fail_files, filter_command="sort | uniq -c | awk -v OFS='\\t' '{if($1 >= ~{min_background_fail_first_col}) print $2}'", - outfile_name="cohort_sr_genotyping_background_fail_list.txt", + outfile_name="~{cohort_name}.background_fail.txt", sv_base_mini_docker=sv_base_mini_docker, runtime_attr_override=runtime_override_clean_background_fail } + call Utils.CountSamples { + input: + vcf=depth_vcfs[0], + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_count_samples + } + #Scatter per chromosome Array[String] contigs = transpose(read_tsv(contig_list))[0] scatter ( contig in contigs ) { @@ -86,6 +139,7 @@ workflow CombineBatches { call VcfClusterContig.VcfClusterSingleChrom as ClusterPesr { input: vcfs=pesr_vcfs, + num_samples=CountSamples.num_samples, batches=batches, prefix="~{cohort_name}.~{contig}.pesr", dist=300, @@ -95,12 +149,17 @@ workflow CombineBatches { sv_size=50, sv_types=["DEL","DUP","INV","BND","INS"], 
contig=contig, + evidence_type="pesr", + cohort_name=cohort_name, localize_shard_size=localize_shard_size, subset_sr_lists=true, - bothside_pass=CleanBothsidePass.outfile, + bothside_pass=CombineSRBothsidePass.out, background_fail=CleanBackgroundFail.outfile, empty_file=empty_file, + use_hail=use_hail, + gcs_project=gcs_project, sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, sv_base_mini_docker=sv_base_mini_docker, runtime_override_localize_vcfs = runtime_override_localize_vcfs, runtime_override_join_vcfs = runtime_override_join_vcfs, @@ -109,19 +168,26 @@ workflow CombineBatches { runtime_override_subset_bothside_pass=runtime_override_subset_bothside_pass, runtime_override_subset_background_fail=runtime_override_subset_background_fail, runtime_override_subset_sv_type=runtime_override_subset_sv_type, - runtime_override_shard_vcf_precluster=runtime_override_shard_vcf_precluster, + runtime_override_shard_clusters=runtime_override_shard_clusters, + runtime_override_shard_vids=runtime_override_shard_vids, runtime_override_pull_vcf_shard=runtime_override_pull_vcf_shard, runtime_override_svtk_vcf_cluster=runtime_override_svtk_vcf_cluster, runtime_override_get_vcf_header_with_members_info_line=runtime_override_get_vcf_header_with_members_info_line, runtime_override_concat_vcf_cluster=runtime_override_concat_vcf_cluster, runtime_override_concat_svtypes=runtime_override_concat_svtypes, - runtime_override_concat_sharded_cluster=runtime_override_concat_sharded_cluster + runtime_override_concat_sharded_cluster=runtime_override_concat_sharded_cluster, + runtime_override_make_sites_only=runtime_override_make_sites_only, + runtime_override_sort_merged_vcf=runtime_override_sort_merged_vcf_cluster, + runtime_override_preconcat_sharded_cluster=runtime_override_preconcat_sharded_cluster, + runtime_override_hail_merge_sharded_cluster=runtime_override_hail_merge_sharded_cluster, + 
runtime_override_fix_header_sharded_cluster=runtime_override_fix_header_sharded_cluster } #Subset RD VCFs to single chromosome & cluster call VcfClusterContig.VcfClusterSingleChrom as ClusterDepth { input: vcfs=depth_vcfs, + num_samples=CountSamples.num_samples, batches=batches, prefix="~{cohort_name}.~{contig}.depth", dist=500000, @@ -131,12 +197,17 @@ workflow CombineBatches { sv_size=5000, sv_types=["DEL","DUP"], contig=contig, + evidence_type="depth", + cohort_name=cohort_name, localize_shard_size=localize_shard_size, subset_sr_lists=false, - bothside_pass=CleanBothsidePass.outfile, + bothside_pass=CombineSRBothsidePass.out, background_fail=CleanBackgroundFail.outfile, empty_file=empty_file, + use_hail=use_hail, + gcs_project=gcs_project, sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, sv_base_mini_docker=sv_base_mini_docker, runtime_override_localize_vcfs = runtime_override_localize_vcfs, runtime_override_join_vcfs = runtime_override_join_vcfs, @@ -145,18 +216,36 @@ workflow CombineBatches { runtime_override_subset_bothside_pass=runtime_override_subset_bothside_pass, runtime_override_subset_background_fail=runtime_override_subset_background_fail, runtime_override_subset_sv_type=runtime_override_subset_sv_type, - runtime_override_shard_vcf_precluster=runtime_override_shard_vcf_precluster, + runtime_override_shard_clusters=runtime_override_shard_clusters, + runtime_override_shard_vids=runtime_override_shard_vids, runtime_override_svtk_vcf_cluster=runtime_override_svtk_vcf_cluster, runtime_override_get_vcf_header_with_members_info_line=runtime_override_get_vcf_header_with_members_info_line, runtime_override_concat_vcf_cluster=runtime_override_concat_vcf_cluster, runtime_override_concat_svtypes=runtime_override_concat_svtypes, - runtime_override_concat_sharded_cluster=runtime_override_concat_sharded_cluster + runtime_override_concat_sharded_cluster=runtime_override_concat_sharded_cluster, + 
runtime_override_make_sites_only=runtime_override_make_sites_only, + runtime_override_sort_merged_vcf=runtime_override_sort_merged_vcf_cluster, + runtime_override_preconcat_sharded_cluster=runtime_override_preconcat_sharded_cluster, + runtime_override_hail_merge_sharded_cluster=runtime_override_hail_merge_sharded_cluster, + runtime_override_fix_header_sharded_cluster=runtime_override_fix_header_sharded_cluster + } + + call MiniTasks.ConcatVcfs as ConcatPesrSitesOnly { + input: + vcfs=ClusterPesr.clustered_vcfs, + vcfs_idx=ClusterPesr.clustered_vcf_indexes, + naive=true, + generate_index=false, + sites_only=true, + outfile_prefix="~{cohort_name}.clustered_pesr.sites_only", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_concat } #Update SR background fail & bothside pass files (1) call MiniTasks.UpdateSrList as UpdateBackgroundFailFirst { input: - vcf=ClusterPesr.clustered_vcf, + vcf=ConcatPesrSitesOnly.concat_vcf, original_list=ClusterPesr.filtered_background_fail, outfile="~{cohort_name}.~{contig}.sr_background_fail.updated.txt", sv_pipeline_docker=sv_pipeline_docker, @@ -164,44 +253,118 @@ workflow CombineBatches { } call MiniTasks.UpdateSrList as UpdateBothsidePassFirst { input: - vcf=ClusterPesr.clustered_vcf, + vcf=ConcatPesrSitesOnly.concat_vcf, original_list=ClusterPesr.filtered_bothside_pass, outfile="~{cohort_name}.~{contig}.sr_bothside_pass.updated.txt", sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_update_sr_list } - #Merge PESR & RD VCFs - call MiniTasks.ConcatVcfs as ConcatPesrDepth { + call HarmonizeHeaders.HarmonizeHeaders { input: - vcfs=[ClusterPesr.clustered_vcf, ClusterDepth.clustered_vcf], - vcfs_idx=[ClusterPesr.clustered_vcf_idx, ClusterDepth.clustered_vcf_idx], - allow_overlaps=true, - outfile_prefix="~{cohort_name}.~{contig}.concat_pesr_depth", + header_vcf=ClusterDepth.clustered_vcfs[0], + vcfs=ClusterPesr.clustered_vcfs, + 
prefix="~{cohort_name}.~{contig}.harmonize_headers", sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_concat_pesr_depth + runtime_override_reheader=runtime_override_reheader, + runtime_override_pull_header=runtime_override_pull_header } - call MergePesrDepth { + + call MergePesrDepth.MergePesrDepth as MergeDeletions { input: - vcf=ConcatPesrDepth.concat_vcf, - vcf_index=ConcatPesrDepth.concat_vcf_idx, + subtyped_pesr_vcf=HarmonizeHeaders.out[0], + subtyped_depth_vcf=ClusterDepth.clustered_vcfs[0], + svtype="DEL", + num_samples=CountSamples.num_samples, + prefix="~{cohort_name}.~{contig}.merge_del", + cohort_name=cohort_name, contig=contig, - prefix="~{cohort_name}.~{contig}.merge_pesr_depth", + use_hail=use_hail, + gcs_project=gcs_project, + sv_base_mini_docker=sv_base_mini_docker, sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_merge_pesr_depth + sv_pipeline_hail_docker=sv_pipeline_hail_docker, + runtime_override_shard_clusters=runtime_override_shard_clusters_mpd, + runtime_override_shard_vids=runtime_override_shard_vids_mpd, + runtime_override_pull_vcf_shard=runtime_override_pull_vcf_shard_mpd, + runtime_override_merge_pesr_depth=runtime_override_merge_pesr_depth_mpd, + runtime_override_sort_merged_vcf=runtime_override_sort_merged_vcf_mpd, + runtime_override_subset_small=runtime_override_subset_small_mpd, + runtime_override_subset_large=runtime_override_subset_large_mpd, + runtime_override_make_sites_only=runtime_override_make_sites_only_mpd, + runtime_override_concat_large_pesr_depth=runtime_override_concat_large_pesr_depth_mpd, + runtime_override_concat_shards=runtime_override_concat_shards_mpd, + runtime_override_preconcat_large_pesr_depth=runtime_override_preconcat_large_pesr_depth_mpd, + runtime_override_hail_merge_large_pesr_depth=runtime_override_hail_merge_large_pesr_depth_mpd, + runtime_override_fix_header_large_pesr_depth=runtime_override_fix_header_large_pesr_depth_mpd, + 
runtime_override_preconcat_pesr_depth_shards=runtime_override_preconcat_pesr_depth_shards_mpd, + runtime_override_hail_merge_pesr_depth_shards=runtime_override_hail_merge_pesr_depth_shards_mpd, + runtime_override_fix_header_pesr_depth_shards=runtime_override_fix_header_pesr_depth_shards_mpd } - call MiniTasks.SortVcf as SortMergePesrDepth { + + call MergePesrDepth.MergePesrDepth as MergeDuplications { input: - vcf = MergePesrDepth.merged_vcf, - outfile_prefix = "~{cohort_name}.~{contig}.sort_pesr_depth", - sv_base_mini_docker = sv_base_mini_docker, - runtime_attr_override = runtime_override_sort_pesr_depth_merged_vcf + subtyped_pesr_vcf=HarmonizeHeaders.out[1], + subtyped_depth_vcf=ClusterDepth.clustered_vcfs[1], + svtype="DUP", + num_samples=CountSamples.num_samples, + prefix="~{cohort_name}.~{contig}.merge_dup", + cohort_name=cohort_name, + contig=contig, + use_hail=use_hail, + gcs_project=gcs_project, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, + runtime_override_shard_clusters=runtime_override_shard_clusters_mpd, + runtime_override_shard_vids=runtime_override_shard_vids_mpd, + runtime_override_pull_vcf_shard=runtime_override_pull_vcf_shard_mpd, + runtime_override_merge_pesr_depth=runtime_override_merge_pesr_depth_mpd, + runtime_override_sort_merged_vcf=runtime_override_sort_merged_vcf_mpd, + runtime_override_subset_small=runtime_override_subset_small_mpd, + runtime_override_subset_large=runtime_override_subset_large_mpd, + runtime_override_make_sites_only=runtime_override_make_sites_only_mpd, + runtime_override_concat_large_pesr_depth=runtime_override_concat_large_pesr_depth_mpd, + runtime_override_concat_shards=runtime_override_concat_shards_mpd, + runtime_override_preconcat_large_pesr_depth=runtime_override_preconcat_large_pesr_depth_mpd, + runtime_override_hail_merge_large_pesr_depth=runtime_override_hail_merge_large_pesr_depth_mpd, + 
runtime_override_fix_header_large_pesr_depth=runtime_override_fix_header_large_pesr_depth_mpd, + runtime_override_preconcat_pesr_depth_shards=runtime_override_preconcat_pesr_depth_shards_mpd, + runtime_override_hail_merge_pesr_depth_shards=runtime_override_hail_merge_pesr_depth_shards_mpd, + runtime_override_fix_header_pesr_depth_shards=runtime_override_fix_header_pesr_depth_shards_mpd + } + + #Merge PESR & RD VCFs + if (use_hail) { + call HailMerge.HailMerge as ConcatPesrDepthHail { + input: + vcfs=[MergeDeletions.out, MergeDuplications.out, HarmonizeHeaders.out[2], HarmonizeHeaders.out[3], HarmonizeHeaders.out[4]], + prefix="~{cohort_name}.~{contig}.concat_pesr_depth", + gcs_project=gcs_project, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, + runtime_override_preconcat=runtime_override_preconcat_pesr_depth, + runtime_override_hail_merge=runtime_override_hail_merge_pesr_depth, + runtime_override_fix_header=runtime_override_fix_header_pesr_depth + } + } + if (!use_hail) { + call MiniTasks.ConcatVcfs as ConcatPesrDepth { + input: + vcfs=[MergeDeletions.out, MergeDuplications.out, HarmonizeHeaders.out[2], HarmonizeHeaders.out[3], HarmonizeHeaders.out[4]], + vcfs_idx=[MergeDeletions.out+".tbi", MergeDuplications.out+".tbi", HarmonizeHeaders.out[2]+".tbi", HarmonizeHeaders.out[3]+".tbi", HarmonizeHeaders.out[4]+".tbi"], + allow_overlaps=true, + outfile_prefix="~{cohort_name}.~{contig}.concat_pesr_depth", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_concat_large_pesr_depth + } } #Update SR background fail & bothside pass files (2) call MiniTasks.UpdateSrList as UpdateBackgroundFailSecond { input: - vcf=SortMergePesrDepth.out, + vcf=select_first([ConcatPesrDepth.concat_vcf, ConcatPesrDepthHail.merged_vcf]), original_list=UpdateBackgroundFailFirst.updated_list, outfile="~{cohort_name}.~{contig}.sr_background_fail.updated2.txt", 
sv_pipeline_docker=sv_pipeline_docker, @@ -209,20 +372,23 @@ workflow CombineBatches { } call MiniTasks.UpdateSrList as UpdateBothsidePassSecond { input: - vcf=SortMergePesrDepth.out, + vcf=select_first([ConcatPesrDepth.concat_vcf, ConcatPesrDepthHail.merged_vcf]), original_list=UpdateBothsidePassFirst.updated_list, outfile="~{cohort_name}.~{contig}.sr_bothside_pass.updated2.txt", sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_update_sr_list } + + File vcfs_out_ = select_first([ConcatPesrDepth.concat_vcf, ConcatPesrDepthHail.merged_vcf]) + File vcf_indexes_out_ = select_first([ConcatPesrDepth.concat_vcf_idx, ConcatPesrDepthHail.merged_vcf_index]) } #Merge resolved vcfs for QC if (merge_vcfs) { call MiniTasks.ConcatVcfs { input: - vcfs=SortMergePesrDepth.out, - vcfs_idx=SortMergePesrDepth.out_index, + vcfs=vcfs_out_, + vcfs_idx=vcf_indexes_out_, naive=true, outfile_prefix="~{cohort_name}.combine_batches", sv_base_mini_docker=sv_base_mini_docker, @@ -232,60 +398,11 @@ workflow CombineBatches { #Final outputs output { - Array[File] vcfs = SortMergePesrDepth.out - Array[File] vcf_indexes = SortMergePesrDepth.out_index + Array[File] vcfs = vcfs_out_ + Array[File] vcf_indexes = vcf_indexes_out_ Array[File] cluster_bothside_pass_lists = UpdateBothsidePassSecond.updated_list Array[File] cluster_background_fail_lists = UpdateBackgroundFailSecond.updated_list File? merged_vcf = ConcatVcfs.concat_vcf File? merged_vcf_index = ConcatVcfs.concat_vcf_idx } } - - -#Merge PESR + RD VCFs -task MergePesrDepth { - input { - File vcf - File vcf_index - String prefix - String contig - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - String output_file = prefix + ".vcf.gz" - - # when filtering/sorting/etc, memory usage will likely go up (much of the data will have to - # be held in memory or disk while working, potentially in a form that takes up more space) - Float input_size = size(vcf, "GiB") - RuntimeAttr runtime_default = object { - mem_gb: 2.0 + 0.6 * input_size, - disk_gb: ceil(10.0 + 4 * input_size), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - /opt/sv-pipeline/04_variant_resolution/scripts/merge_pesr_depth.py \ - --prefix pesr_depth_merged_~{contig} \ - ~{vcf} \ - ~{output_file} - >>> - - output { - File merged_vcf = output_file - } -} diff --git a/wdl/CombineSRBothsidePass.wdl b/wdl/CombineSRBothsidePass.wdl new file mode 100644 index 000000000..5713e8ac9 --- /dev/null +++ b/wdl/CombineSRBothsidePass.wdl @@ -0,0 +1,120 @@ +version 1.0 + +import "Structs.wdl" + +workflow CombineSRBothsidePass { + input { + Array[File] pesr_vcfs + Array[File] raw_sr_bothside_pass_files + String prefix + + String sv_base_mini_docker + String sv_pipeline_docker + + RuntimeAttr? runtime_attr_get_non_ref_vids + RuntimeAttr? 
runtime_attr_calculate_support_frac + } + + scatter (i in range(length(pesr_vcfs))) { + call GetNonRefVariantLists { + input: + vcf=pesr_vcfs[i], + prefix="~{prefix}.non_ref_vids.shard_~{i}", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_attr_get_non_ref_vids + } + } + + call CalculateBothsideSupportFraction { + input: + non_ref_vid_lists=GetNonRefVariantLists.out, + raw_sr_bothside_pass_files=raw_sr_bothside_pass_files, + prefix="~{prefix}.sr_bothside_support", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_calculate_support_frac + } + + output { + File out = CalculateBothsideSupportFraction.out + } +} + +task GetNonRefVariantLists { + input { + File vcf + String prefix + String sv_base_mini_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size(vcf, "GB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 2.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_base_mini_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + bcftools view -G -i 'SUM(AC)>0||SUM(FORMAT/SR_GT)>0' ~{vcf} | bcftools query -f '%ID\n' \ + > ~{prefix}.list + >>> + output { + File out = "~{prefix}.list" + } +} + + +task CalculateBothsideSupportFraction { + input { + Array[File] non_ref_vid_lists + 
Array[File] raw_sr_bothside_pass_files + String prefix + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size(non_ref_vid_lists, "GB") + size(raw_sr_bothside_pass_files, "GB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 2.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + python /opt/sv-pipeline/04_variant_resolution/scripts/calculate_sr_bothside_support.py \ + ~{write_lines(non_ref_vid_lists)} \ + ~{write_lines(raw_sr_bothside_pass_files)} \ + > ~{prefix}.txt + >>> + output { + File out = "~{prefix}.txt" + } +} \ No newline at end of file diff --git a/wdl/DropRedundantCNVs.wdl b/wdl/DropRedundantCNVs.wdl deleted file mode 100644 index 82ac75d3f..000000000 --- a/wdl/DropRedundantCNVs.wdl +++ /dev/null @@ -1,533 +0,0 @@ -version 1.0 - -import "Structs.wdl" - -workflow DropRedundantCNVs { - input { - File vcf - String contig - String sv_pipeline_docker - } - - call DropRedundantCNVs_1 { - input: - vcf=vcf, - sv_pipeline_docker=sv_pipeline_docker - } - - call DropRedundantCNVs_2 { - input: - intervals_preclustered_bed=DropRedundantCNVs_1.intervals_preclustered_bed, - sv_pipeline_docker=sv_pipeline_docker - } - - call DropRedundantCNVs_3 { - input: - 
intervals_preclustered_bed=DropRedundantCNVs_1.intervals_preclustered_bed, - sv_pipeline_docker=sv_pipeline_docker - } - - call DropRedundantCNVs_4 { - input: - intervals_preclustered_subset_bed=DropRedundantCNVs_2.intervals_preclustered_subset_bed, - step2_intervals_preclustered_subset_txt=DropRedundantCNVs_3.step2_intervals_preclustered_subset_txt, - samples_list=DropRedundantCNVs_1.samples_list, - sv_pipeline_docker=sv_pipeline_docker - } - - call DropRedundantCNVs_5 { - input: - vcf=vcf, - vids_to_remove_list_1=DropRedundantCNVs_4.vids_to_remove_list_1, - intervals_preclustered_bed=DropRedundantCNVs_1.intervals_preclustered_bed, - step2_variants_to_resolve_list=DropRedundantCNVs_4.step2_variants_to_resolve_list, - contig=contig, - sv_pipeline_docker=sv_pipeline_docker - } - - call DropRedundantCNVs_6 { - input: - unsorted_vcf=DropRedundantCNVs_5.unsorted_vcf, - contig=contig, - sv_pipeline_docker=sv_pipeline_docker - } - - output { - File cleaned_vcf_shard = DropRedundantCNVs_6.cleaned_vcf_shard - } -} - -task DropRedundantCNVs_1 { - input { - File vcf - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - bcftools query --list-samples ~{vcf} > samples.list - - ###PREP FILES - #Convert full VCF to BED intervals - #Ignore CPX events with UNRESOLVED filter status - svtk vcf2bed --split-cpx --info SVTYPE \ - <(bcftools view -e 'INFO/SVTYPE == "CPX" && FILTER == "UNRESOLVED"' ~{vcf}) out.bed - grep -e '^#\|DEL\|DUP\|CNV\|CPX' out.bed \ - | awk -v OFS="\t" '{ if ($5=="CN0") print $1, $2, $3, $4, "DEL", $5"\n"$1, $2, $3, $4, "DUP", $5; \ - else if ($5=="DEL" || $5=="DUP") print $1, $2, $3, $4, $6, $5 }' \ - | sort -Vk1,1 -k2,2n -k3,3n -k4,4V \ - | bgzip -c \ - > intervals.preclustered.bed.gz - >>> - - output { - File intervals_preclustered_bed = "intervals.preclustered.bed.gz" - File samples_list = "samples.list" - } -} - -task DropRedundantCNVs_2 { - input { - File intervals_preclustered_bed - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(intervals_preclustered_bed, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - ###REMOVE CNVS REDUNDANT WITH COMPLEX EVENTS - #Subset to only variants that share some overlap (at least 10% recip) with at least one CPX variant - bedtools intersect -wa -r -f 0.1 \ - -a ~{intervals_preclustered_bed} \ - -b <( zcat ~{intervals_preclustered_bed} | fgrep "CPX" ) \ - | sort -Vk1,1 -k2,2n -k3,3n -k4,4V \ - | uniq \ - | bgzip -c \ - > intervals.preclustered.subset.bed.gz - >>> - - output { - File intervals_preclustered_subset_bed = "intervals.preclustered.subset.bed.gz" - } -} - - -task DropRedundantCNVs_3 { - input { - File intervals_preclustered_bed - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(intervals_preclustered_bed, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 20.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - ###FIND REMAINING REDUNDANT CNVS WITH STRONG (80%) OVERLAP IN SAMPLES AND SIZE - #Find CNV intervals that have 80% reciprocal overlap - bedtools intersect -wa -wb -r -f 0.8 \ - -a ~{intervals_preclustered_bed} \ - -b ~{intervals_preclustered_bed} \ - | awk -v FS="\t" '{ if ($4!=$10 && $6==$12) print $0 }' \ - | awk -v OFS="\t" '$4 ~ /DEL|DUP/ { print $0 }' \ - | awk -v OFS="\t" '$10 ~ /DEL|DUP/ { print $0 }' \ - | cut -f4,5,10,11 \ - | sort \ - | uniq \ - | gzip \ - > step2.intervals.preclustered.subset.txt.gz - >>> - - output { - File step2_intervals_preclustered_subset_txt = "step2.intervals.preclustered.subset.txt.gz" - } -} - - -task DropRedundantCNVs_4 { - input { - File step2_intervals_preclustered_subset_txt - File intervals_preclustered_subset_bed - File samples_list - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - Float input_size = size(step2_intervals_preclustered_subset_txt, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 60, - disk_gb: ceil(10.0 + input_size * 2.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - - python3 < out.vcf.gz - import sys - import gzip - from collections import namedtuple, defaultdict - - import pysam - import numpy as np - from scipy import sparse - from scipy.sparse import csgraph - - BedCall = namedtuple('BedCall', 'chrom start end name samples svtype'.split()) - - def reciprocal_overlap(a, b, frac): - if a.chrom != b.chrom: - return False - if a.start >= b.end or b.start >= a.end: - return False - ov = min(a.end, b.end) - max(a.start, b.start) - return (ov / float(max(a.end - a.start, b.end - b.start))) >= frac - - - def sample_overlap(samples_a, samples_b, denom, frac): - if len(samples_a) == 0 or len(samples_b) == 0: - return True - ov = len(samples_a.intersection(samples_b)) - return (ov / float(denom)) >= frac - - - def read_intervals(path, samples_dict): - intervals = [] - with gzip.open(path, "rb") as f: - for lineb in f: - tokens = lineb.decode('utf-8').strip().split('\t') - sample_indexes = set([samples_dict[s] for s in tokens[4].split(',')]) - 
intervals.append(BedCall(tokens[0], int(tokens[1]), int(tokens[2]), tokens[3], sample_indexes, tokens[5])) - return intervals - - # Save memory using sample id indexing - with open("~{samples_list}") as f: - samples_list = [line.strip() for line in f] - num_samples = len(samples_list) - samples_dict = {samples_list[i]: i for i in range(num_samples)} - - intervals = read_intervals("~{intervals_preclustered_subset_bed}", samples_dict) - num_intervals = len(intervals) - - # 50% RO and sample overlap in subsetted intervals - # Generate sparse graph for clustering - RO_FRAC = 0.5 - G = sparse.eye(len(intervals), dtype=np.uint8, format='lil') - for i in range(num_intervals): - ro_indexes = [j for j in range(i) if reciprocal_overlap(intervals[i], intervals[j], RO_FRAC)] - for j in ro_indexes: - G[i, j] = 1 - - # Compute clusters - n_comp, cluster_labels = csgraph.connected_components(G, connection='weak', directed=False) - clusters = defaultdict(list) - for i in range(len(cluster_labels)): - clusters[cluster_labels[i]].append(i) - - # Find CNVs in clusters containing at least one CPX - SAMPLE_FRAC = 0.5 - vids_to_remove = set([]) - for cluster in clusters.values(): - cnvs = [i for i in cluster if "DEL" in intervals[i].name or "DUP" in intervals[i].name] - cpx = [i for i in cluster if "CPX" in intervals[i].name] - for i in cnvs: - for j in cpx: - if sample_overlap(intervals[i].samples, intervals[j].samples, len(intervals[i].samples), SAMPLE_FRAC): - vids_to_remove.add(intervals[i].name + "\n") - break - - with open("VIDs_to_remove.list", 'w') as f: - f.writelines(sorted(list(vids_to_remove))) - - # Find clusters of CNVs only, using 80% overlap parameters - with gzip.open("~{step2_intervals_preclustered_subset_txt}") as f: - intervals2 = [] - for line in f: - tokens = line.decode('utf-8').strip().split('\t') - samples_a = set([samples_dict[s] for s in tokens[1].split(',')]) - samples_b = set([samples_dict[s] for s in tokens[3].split(',')]) - intervals2.append((tokens[0], 
samples_a, tokens[2], samples_b)) - - num_intervals2 = len(intervals2) - vids_to_resolve_list = [] - SAMPLE_FRAC2 = 0.8 - for interval in intervals2: - samples_a = interval[1] - samples_b = interval[3] - union = samples_a.union(samples_b) - if sample_overlap(samples_a, samples_b, len(union), SAMPLE_FRAC2): - vids_to_resolve_list.append("{}\n".format(",".join(sorted([interval[0], interval[2]])))) - - vids_to_resolve_list = sorted(list(set(vids_to_resolve_list))) - - with open("step2.variants_to_resolve.list", 'w') as f: - f.writelines(vids_to_resolve_list) - - CODE - >>> - - output { - File step2_variants_to_resolve_list = "step2.variants_to_resolve.list" - File vids_to_remove_list_1 = "VIDs_to_remove.list" - } -} - - -task DropRedundantCNVs_5 { - input { - File vcf - File vids_to_remove_list_1 - File intervals_preclustered_bed - File step2_variants_to_resolve_list - String contig - String sv_pipeline_docker - RuntimeAttr? runtime_attr_override - } - - Float input_size = size([vcf, intervals_preclustered_bed, intervals_preclustered_bed, step2_variants_to_resolve_list], "GB") - RuntimeAttr runtime_default = object { - mem_gb: 30, - disk_gb: ceil(10.0 + input_size * 5.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - python3 < 
drop_redundant_cnvs_5.~{contig}.vcf.gz - import sys - import pysam - import gzip - - sys.stderr.write("Reading step2...\n") - with open("~{step2_variants_to_resolve_list}") as f: - vids_sets_to_resolve = [set(line.strip().split(',')) for line in f.readlines()] - vids_list = sorted(list(set([x for y in vids_sets_to_resolve for x in y]))) - - sys.stderr.write("Reading vids to remove...\n") - with open("~{vids_to_remove_list_1}") as f: - vids_to_remove = set([line.strip() for line in f.readlines()]) - - sys.stderr.write("Reading preclustered intervals...\n") - with gzip.open("~{intervals_preclustered_bed}") as f: - intervals = {} - for lineb in f: - tokens = lineb.decode('utf-8').strip().split('\t') - vid = tokens[3] - intervals[vid] = tokens - - sys.stderr.write("Finding partners...\n") - partners = {} - all_partners = set([]) - for vid in vids_list: - # get all other variants from clusters containing this variant - partners[vid] = set([p for vset in vids_sets_to_resolve if vid in vset for p in vset]) - all_partners.update(partners[vid]) - - vids_to_remove.update(all_partners) - #with open("vids_to_remove_2.list", 'w') as f: - # f.writelines(sorted([v+"\n" for v in vids_to_remove])) - - sys.stderr.write("Scanning vcf...\n") - vcf = pysam.VariantFile("~{vcf}") - records = {r.id: r for r in vcf if r.id in all_partners} - vcf.close() - - def count_gts(record): - result = [0, 0, 0] - num_samples = len(record.samples) - for g in [record.samples[i]['GT'] for i in range(num_samples)]: - if g == (0, 0): - result[1] += 1 - elif g == (None, None): - result[2] += 1 - else: - result[0] += 1 - return result - - def get_best_score_vid(scores): - return sorted(scores.items(), key=lambda x: x[1])[-1][0] - - sys.stderr.write("Generating records...\n") - records_to_add = [] - processed_vids = set([]) - for vid in vids_list: - if vid in processed_vids: - continue - vid_partners = partners[vid] - processed_vids.update(vid_partners) - partner_intervals = [intervals[p] for p in 
vid_partners] - most_samples_vid = sorted(partner_intervals, key=lambda x : len(x[4].split(',')))[-1][3] - x = sorted(partner_intervals, key=lambda x : len(x[4].split(','))) - best_genotype_vid = None - best_non_ref = -1 - best_ref = -1 - scores = {p: count_gts(records[p]) for p in vid_partners} - scores_non_ref = {p: scores[p][0] for p in vid_partners if scores[p][0] > 0} - scores_ref = {p: scores[p][1] for p in vid_partners if scores[p][1] > 0} - scores_no_call = {p: scores[p][2] for p in vid_partners if scores[p][2] > 0} - if len(scores_non_ref) > 0: - best_genotype_vid = get_best_score_vid(scores_non_ref) - elif len(scores_ref) > 0: - best_genotype_vid = get_best_score_vid(scores_ref) - else: - best_genotype_vid = get_best_score_vid(scores_no_call) - sys.stderr.write(most_samples_vid + "\n") - s1 = str(records[most_samples_vid]).split('\t')[0:9] - s2 = str(records[best_genotype_vid]).split('\t', 9) - records_to_add.append("\t".join(s1) + "\t" + s2[9]) - - sys.stderr.write("Writing vcf...\n") - vcf = pysam.VariantFile("~{vcf}") - sys.stdout.write(str(vcf.header)) - for record in vcf: - if record.id not in vids_to_remove: - sys.stdout.write(str(record)) - vcf.close() - - for record in records_to_add: - sys.stdout.write(record) - - CODE - - >>> - - output { - File unsorted_vcf = "drop_redundant_cnvs_5.~{contig}.vcf.gz" - } -} - - -task DropRedundantCNVs_6 { - input { - File unsorted_vcf - String contig - String sv_pipeline_docker - RuntimeAttr? 
runtime_attr_override - } - - String outfile_name = contig + ".shard.no_CNV_redundancies.vcf.gz" - - Float input_size = size(unsorted_vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 7.5, - disk_gb: ceil(10.0 + input_size * 20.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_pipeline_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euxo pipefail - - ###CLEAN UP FINAL OUTPUT - zcat ~{unsorted_vcf} \ - | vcf-sort \ - | bgzip \ - > ~{outfile_name} - >>> - - output { - File cleaned_vcf_shard = outfile_name - } -} - diff --git a/wdl/GATKSVPipelineBatch.wdl b/wdl/GATKSVPipelineBatch.wdl index 207918c8a..47b8a2a46 100644 --- a/wdl/GATKSVPipelineBatch.wdl +++ b/wdl/GATKSVPipelineBatch.wdl @@ -86,6 +86,8 @@ workflow GATKSVPipelineBatch { String sv_base_mini_docker String sv_base_docker String sv_pipeline_docker + String sv_pipeline_hail_docker + String sv_pipeline_updates_docker String sv_pipeline_rdtest_docker String sv_pipeline_base_docker String sv_pipeline_qc_docker @@ -300,6 +302,8 @@ workflow GATKSVPipelineBatch { sv_pipeline_base_docker = sv_pipeline_base_docker, linux_docker=linux_docker, sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, + sv_pipeline_updates_docker=sv_pipeline_updates_docker, sv_pipeline_rdtest_docker=sv_pipeline_rdtest_docker, 
sv_pipeline_qc_docker=sv_pipeline_qc_docker, sv_base_mini_docker=sv_base_mini_docker diff --git a/wdl/GATKSVPipelineSingleSample.wdl b/wdl/GATKSVPipelineSingleSample.wdl index f914f8075..bd5554005 100644 --- a/wdl/GATKSVPipelineSingleSample.wdl +++ b/wdl/GATKSVPipelineSingleSample.wdl @@ -64,6 +64,8 @@ workflow GATKSVPipelineSingleSample { String sv_base_mini_docker String sv_base_docker String sv_pipeline_docker + String sv_pipeline_hail_docker + String sv_pipeline_updates_docker String sv_pipeline_rdtest_docker String sv_pipeline_base_docker String sv_pipeline_qc_docker @@ -292,7 +294,6 @@ workflow GATKSVPipelineSingleSample { File rmsk File segdups - String? chr_x Int? min_large_pesr_call_size_for_filtering Float? min_large_pesr_depth_overlap_fraction @@ -373,23 +374,178 @@ workflow GATKSVPipelineSingleSample { Int clean_vcf_max_shards_per_chrom_clean_vcf_step1 Int clean_vcf_min_records_per_shard_clean_vcf_step1 Int clean_vcf_samples_per_clean_vcf_step2_shard + Int clean_vcf5_records_per_shard + Int clean_vcf1b_records_per_shard + + String? chr_x + String? chr_y Int? clean_vcf_random_seed # Run MakeCohortVcf metrics - default is off for single sample pipeline Boolean? run_makecohortvcf_metrics = false + # overrides for local tasks + RuntimeAttr? runtime_overide_get_discfile_size RuntimeAttr? runtime_override_update_sr_list_cluster - RuntimeAttr? runtime_override_update_sr_list_pass - RuntimeAttr? runtime_override_update_sr_list_fail RuntimeAttr? runtime_override_merge_pesr_depth - RuntimeAttr? runtime_override_breakpoint_overlap_filter RuntimeAttr? runtime_override_integrate_resolved_vcfs RuntimeAttr? runtime_override_rename_variants + RuntimeAttr? runtime_override_rename_cleaned_samples - RuntimeAttr? runtime_override_clean_bothside_pass + RuntimeAttr? runtime_override_breakpoint_overlap_filter + + # overrides for mini tasks + RuntimeAttr? runtime_override_ids_from_vcf RuntimeAttr? runtime_override_clean_background_fail RuntimeAttr? 
runtime_override_make_cpx_cnv_input_file + RuntimeAttr? runtime_override_subset_inversions + RuntimeAttr? runtime_override_concat_merged_vcfs + RuntimeAttr? runtime_override_concat_cpx_vcfs + RuntimeAttr? runtime_override_concat_cleaned_vcfs + + # overrides for VcfClusterContig + RuntimeAttr? runtime_override_join_vcfs + RuntimeAttr? runtime_override_subset_bothside_pass + RuntimeAttr? runtime_override_subset_background_fail + RuntimeAttr? runtime_override_subset_sv_type + RuntimeAttr? runtime_override_shard_clusters + RuntimeAttr? runtime_override_shard_vids + RuntimeAttr? runtime_override_pull_vcf_shard + RuntimeAttr? runtime_override_svtk_vcf_cluster + RuntimeAttr? runtime_override_get_vcf_header_with_members_info_line + RuntimeAttr? runtime_override_cluster_merge + RuntimeAttr? runtime_override_concat_vcf_cluster + RuntimeAttr? runtime_override_concat_svtypes + RuntimeAttr? runtime_override_concat_sharded_cluster + RuntimeAttr? runtime_override_make_sites_only + RuntimeAttr? runtime_override_preconcat_sharded_cluster + RuntimeAttr? runtime_override_hail_merge_sharded_cluster + RuntimeAttr? runtime_override_fix_header_sharded_cluster + RuntimeAttr? runtime_override_concat_large_pesr_depth + + # overrides for ResolveComplexVariants + RuntimeAttr? runtime_override_update_sr_list_pass + RuntimeAttr? runtime_override_update_sr_list_fail + RuntimeAttr? runtime_override_integrate_resolved_vcfs + RuntimeAttr? runtime_override_rename_variants + RuntimeAttr? runtime_override_breakpoint_overlap_filter + RuntimeAttr? runtime_override_subset_inversions + RuntimeAttr? runtime_override_concat_resolve + + RuntimeAttr? runtime_override_get_se_cutoff + RuntimeAttr? runtime_override_shard_vcf_cpx + RuntimeAttr? runtime_override_shard_vids_resolve + RuntimeAttr? runtime_override_resolve_prep + RuntimeAttr? runtime_override_resolve_cpx_per_shard + RuntimeAttr? runtime_override_restore_unresolved_cnv_per_shard + RuntimeAttr? runtime_override_concat_resolved_per_shard + RuntimeAttr? 
runtime_override_pull_vcf_shard + RuntimeAttr? runtime_override_preconcat_resolve + RuntimeAttr? runtime_override_hail_merge_resolve + RuntimeAttr? runtime_override_fix_header_resolve + + RuntimeAttr? runtime_override_get_se_cutoff_inv + RuntimeAttr? runtime_override_shard_vcf_cpx_inv + RuntimeAttr? runtime_override_shard_vids_resolve_inv + RuntimeAttr? runtime_override_resolve_prep_inv + RuntimeAttr? runtime_override_resolve_cpx_per_shard_inv + RuntimeAttr? runtime_override_restore_unresolved_cnv_per_shard_inv + RuntimeAttr? runtime_override_concat_resolved_per_shard_inv + RuntimeAttr? runtime_override_pull_vcf_shard_inv + RuntimeAttr? runtime_override_preconcat_resolve_inv + RuntimeAttr? runtime_override_hail_merge_resolve_inv + RuntimeAttr? runtime_override_fix_header_resolve_inv + + # overrides for GenotypeComplexContig + RuntimeAttr? runtime_override_ids_from_median + RuntimeAttr? runtime_override_split_vcf_to_genotype + RuntimeAttr? runtime_override_concat_cpx_cnv_vcfs + RuntimeAttr? runtime_override_get_cpx_cnv_intervals + RuntimeAttr? runtime_override_parse_genotypes + RuntimeAttr? runtime_override_merge_melted_gts + RuntimeAttr? runtime_override_split_bed_by_size + RuntimeAttr? runtime_override_rd_genotype + RuntimeAttr? runtime_override_concat_melted_genotypes + RuntimeAttr? runtime_attr_ids_from_vcf_regeno + RuntimeAttr? runtime_attr_subset_ped_regeno + RuntimeAttr? runtime_override_preconcat_regeno + RuntimeAttr? runtime_override_hail_merge_regeno + RuntimeAttr? runtime_override_fix_header_regeno + + # overrides for CleanVcfContig + RuntimeAttr? runtime_attr_ids_from_vcf_clean + RuntimeAttr? runtime_attr_subset_ped_clean + RuntimeAttr? runtime_override_preconcat_clean_final + RuntimeAttr? runtime_override_hail_merge_clean_final + RuntimeAttr? runtime_override_fix_header_clean_final + RuntimeAttr? runtime_override_concat_cleaned_vcfs + + RuntimeAttr? runtime_override_clean_vcf_1a + RuntimeAttr? runtime_override_clean_vcf_2 + RuntimeAttr? 
runtime_override_clean_vcf_3 + RuntimeAttr? runtime_override_clean_vcf_4 + RuntimeAttr? runtime_override_clean_vcf_5_scatter + RuntimeAttr? runtime_override_clean_vcf_5_make_cleangq + RuntimeAttr? runtime_override_clean_vcf_5_find_redundant_multiallelics + RuntimeAttr? runtime_override_clean_vcf_5_polish + RuntimeAttr? runtime_override_stitch_fragmented_cnvs + RuntimeAttr? runtime_override_final_cleanup + + RuntimeAttr? runtime_attr_override_subset_large_cnvs_1b + RuntimeAttr? runtime_attr_override_sort_bed_1b + RuntimeAttr? runtime_attr_override_intersect_bed_1b + RuntimeAttr? runtime_attr_override_build_dict_1b + RuntimeAttr? runtime_attr_override_scatter_1b + RuntimeAttr? runtime_attr_override_filter_vcf_1b + RuntimeAttr? runtime_override_concat_vcfs_1b + RuntimeAttr? runtime_override_cat_multi_cnvs_1b + + RuntimeAttr? runtime_override_preconcat_step1 + RuntimeAttr? runtime_override_hail_merge_step1 + RuntimeAttr? runtime_override_fix_header_step1 + + RuntimeAttr? runtime_override_preconcat_drc + RuntimeAttr? runtime_override_hail_merge_drc + RuntimeAttr? runtime_override_fix_header_drc + + RuntimeAttr? runtime_override_split_vcf_to_clean + RuntimeAttr? runtime_override_combine_step_1_sex_chr_revisions + RuntimeAttr? runtime_override_split_include_list + RuntimeAttr? runtime_override_combine_clean_vcf_2 + RuntimeAttr? runtime_override_combine_revised_4 + RuntimeAttr? runtime_override_combine_multi_ids_4 + RuntimeAttr? runtime_override_drop_redundant_cnvs + RuntimeAttr? runtime_override_combine_step_1_vcfs + RuntimeAttr? runtime_override_sort_drop_redundant_cnvs + + # overrides for VcfQc + RuntimeAttr? runtime_override_plot_qc_vcf_wide + RuntimeAttr? runtime_override_thousand_g_benchmark + RuntimeAttr? runtime_override_thousand_g_plot + RuntimeAttr? runtime_override_asc_benchmark + RuntimeAttr? runtime_override_asc_plot + RuntimeAttr? runtime_override_hgsv_benchmark + RuntimeAttr? runtime_override_hgsv_plot + RuntimeAttr? 
runtime_override_plot_qc_per_sample + RuntimeAttr? runtime_override_plot_qc_per_family + RuntimeAttr? runtime_override_sanders_per_sample_plot + RuntimeAttr? runtime_override_collins_per_sample_plot + RuntimeAttr? runtime_override_werling_per_sample_plot + RuntimeAttr? runtime_override_sanitize_outputs + RuntimeAttr? runtime_override_merge_vcfwide_stat_shards + RuntimeAttr? runtime_override_merge_vcf_2_bed + RuntimeAttr? runtime_override_collect_sharded_vcf_stats + RuntimeAttr? runtime_override_svtk_vcf_2_bed + RuntimeAttr? runtime_override_split_vcf_to_qc + RuntimeAttr? runtime_override_merge_subvcf_stat_shards + RuntimeAttr? runtime_override_merge_svtk_vcf_2_bed + RuntimeAttr? runtime_override_collect_vids_per_sample + RuntimeAttr? runtime_override_split_samples_list + RuntimeAttr? runtime_override_tar_shard_vid_lists + RuntimeAttr? runtime_override_benchmark_samples + RuntimeAttr? runtime_override_split_shuffled_list + RuntimeAttr? runtime_override_merge_and_tar_shard_benchmarks ############################################################ ## AnnotateVcf @@ -928,6 +1084,11 @@ workflow GATKSVPipelineSingleSample { max_shards_per_chrom_clean_vcf_step1=clean_vcf_max_shards_per_chrom_clean_vcf_step1, min_records_per_shard_clean_vcf_step1=clean_vcf_min_records_per_shard_clean_vcf_step1, samples_per_clean_vcf_step2_shard=clean_vcf_samples_per_clean_vcf_step2_shard, + clean_vcf5_records_per_shard=clean_vcf5_records_per_shard, + clean_vcf1b_records_per_shard=clean_vcf1b_records_per_shard, + + chr_x=select_first([chr_x, "chrX"]), + chr_y=select_first([chr_y, "chrY"]), random_seed=clean_vcf_random_seed, @@ -935,21 +1096,152 @@ workflow GATKSVPipelineSingleSample { linux_docker=linux_docker, sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, + sv_pipeline_updates_docker=sv_pipeline_updates_docker, sv_pipeline_rdtest_docker=sv_pipeline_rdtest_docker, sv_pipeline_qc_docker=sv_pipeline_qc_docker, sv_base_mini_docker=sv_base_mini_docker, + 
runtime_overide_get_discfile_size=runtime_overide_get_discfile_size, runtime_override_update_sr_list_cluster=runtime_override_update_sr_list_cluster, - runtime_override_update_sr_list_pass=runtime_override_update_sr_list_pass, - runtime_override_update_sr_list_fail=runtime_override_update_sr_list_fail, runtime_override_merge_pesr_depth=runtime_override_merge_pesr_depth, - runtime_override_breakpoint_overlap_filter=runtime_override_breakpoint_overlap_filter, runtime_override_integrate_resolved_vcfs=runtime_override_integrate_resolved_vcfs, runtime_override_rename_variants=runtime_override_rename_variants, - - runtime_override_clean_bothside_pass=runtime_override_clean_bothside_pass, + runtime_override_rename_cleaned_samples=runtime_override_rename_cleaned_samples, + runtime_override_breakpoint_overlap_filter=runtime_override_breakpoint_overlap_filter, + runtime_override_ids_from_vcf=runtime_override_ids_from_vcf, runtime_override_clean_background_fail=runtime_override_clean_background_fail, - runtime_override_make_cpx_cnv_input_file=runtime_override_make_cpx_cnv_input_file + runtime_override_make_cpx_cnv_input_file=runtime_override_make_cpx_cnv_input_file, + runtime_override_subset_inversions=runtime_override_subset_inversions, + runtime_override_concat_merged_vcfs=runtime_override_concat_merged_vcfs, + runtime_override_concat_cpx_vcfs=runtime_override_concat_cpx_vcfs, + runtime_override_concat_cleaned_vcfs=runtime_override_concat_cleaned_vcfs, + runtime_override_join_vcfs=runtime_override_join_vcfs, + runtime_override_subset_bothside_pass=runtime_override_subset_bothside_pass, + runtime_override_subset_background_fail=runtime_override_subset_background_fail, + runtime_override_subset_sv_type=runtime_override_subset_sv_type, + runtime_override_shard_clusters=runtime_override_shard_clusters, + runtime_override_shard_vids=runtime_override_shard_vids, + runtime_override_pull_vcf_shard=runtime_override_pull_vcf_shard, + 
runtime_override_svtk_vcf_cluster=runtime_override_svtk_vcf_cluster, + runtime_override_get_vcf_header_with_members_info_line=runtime_override_get_vcf_header_with_members_info_line, + runtime_override_cluster_merge=runtime_override_cluster_merge, + runtime_override_concat_vcf_cluster=runtime_override_concat_vcf_cluster, + runtime_override_concat_svtypes=runtime_override_concat_svtypes, + runtime_override_concat_sharded_cluster=runtime_override_concat_sharded_cluster, + runtime_override_make_sites_only=runtime_override_make_sites_only, + runtime_override_preconcat_sharded_cluster=runtime_override_preconcat_sharded_cluster, + runtime_override_hail_merge_sharded_cluster=runtime_override_hail_merge_sharded_cluster, + runtime_override_fix_header_sharded_cluster=runtime_override_fix_header_sharded_cluster, + runtime_override_concat_large_pesr_depth=runtime_override_concat_large_pesr_depth, + runtime_override_update_sr_list_pass=runtime_override_update_sr_list_pass, + runtime_override_update_sr_list_fail=runtime_override_update_sr_list_fail, + runtime_override_integrate_resolved_vcfs=runtime_override_integrate_resolved_vcfs, + runtime_override_rename_variants=runtime_override_rename_variants, + runtime_override_breakpoint_overlap_filter=runtime_override_breakpoint_overlap_filter, + runtime_override_subset_inversions=runtime_override_subset_inversions, + runtime_override_concat_resolve=runtime_override_concat_resolve, + runtime_override_get_se_cutoff=runtime_override_get_se_cutoff, + runtime_override_shard_vcf_cpx=runtime_override_shard_vcf_cpx, + runtime_override_shard_vids_resolve=runtime_override_shard_vids_resolve, + runtime_override_resolve_prep=runtime_override_resolve_prep, + runtime_override_resolve_cpx_per_shard=runtime_override_resolve_cpx_per_shard, + runtime_override_restore_unresolved_cnv_per_shard=runtime_override_restore_unresolved_cnv_per_shard, + runtime_override_concat_resolved_per_shard=runtime_override_concat_resolved_per_shard, + 
runtime_override_pull_vcf_shard=runtime_override_pull_vcf_shard, + runtime_override_preconcat_resolve=runtime_override_preconcat_resolve, + runtime_override_hail_merge_resolve=runtime_override_hail_merge_resolve, + runtime_override_fix_header_resolve=runtime_override_fix_header_resolve, + runtime_override_get_se_cutoff_inv=runtime_override_get_se_cutoff_inv, + runtime_override_shard_vcf_cpx_inv=runtime_override_shard_vcf_cpx_inv, + runtime_override_shard_vids_resolve_inv=runtime_override_shard_vids_resolve_inv, + runtime_override_resolve_prep_inv=runtime_override_resolve_prep_inv, + runtime_override_resolve_cpx_per_shard_inv=runtime_override_resolve_cpx_per_shard_inv, + runtime_override_restore_unresolved_cnv_per_shard_inv=runtime_override_restore_unresolved_cnv_per_shard_inv, + runtime_override_concat_resolved_per_shard_inv=runtime_override_concat_resolved_per_shard_inv, + runtime_override_pull_vcf_shard_inv=runtime_override_pull_vcf_shard_inv, + runtime_override_preconcat_resolve_inv=runtime_override_preconcat_resolve_inv, + runtime_override_hail_merge_resolve_inv=runtime_override_hail_merge_resolve_inv, + runtime_override_fix_header_resolve_inv=runtime_override_fix_header_resolve_inv, + runtime_override_ids_from_median=runtime_override_ids_from_median, + runtime_override_split_vcf_to_genotype=runtime_override_split_vcf_to_genotype, + runtime_override_concat_cpx_cnv_vcfs=runtime_override_concat_cpx_cnv_vcfs, + runtime_override_get_cpx_cnv_intervals=runtime_override_get_cpx_cnv_intervals, + runtime_override_parse_genotypes=runtime_override_parse_genotypes, + runtime_override_merge_melted_gts=runtime_override_merge_melted_gts, + runtime_override_split_bed_by_size=runtime_override_split_bed_by_size, + runtime_override_rd_genotype=runtime_override_rd_genotype, + runtime_override_concat_melted_genotypes=runtime_override_concat_melted_genotypes, + runtime_attr_ids_from_vcf_regeno=runtime_attr_ids_from_vcf_regeno, + 
runtime_attr_subset_ped_regeno=runtime_attr_subset_ped_regeno, + runtime_override_preconcat_regeno=runtime_override_preconcat_regeno, + runtime_override_hail_merge_regeno=runtime_override_hail_merge_regeno, + runtime_override_fix_header_regeno=runtime_override_fix_header_regeno, + runtime_attr_ids_from_vcf_clean=runtime_attr_ids_from_vcf_clean, + runtime_attr_subset_ped_clean=runtime_attr_subset_ped_clean, + runtime_override_preconcat_clean_final=runtime_override_preconcat_clean_final, + runtime_override_hail_merge_clean_final=runtime_override_hail_merge_clean_final, + runtime_override_fix_header_clean_final=runtime_override_fix_header_clean_final, + runtime_override_concat_cleaned_vcfs=runtime_override_concat_cleaned_vcfs, + runtime_override_clean_vcf_1a=runtime_override_clean_vcf_1a, + runtime_override_clean_vcf_2=runtime_override_clean_vcf_2, + runtime_override_clean_vcf_3=runtime_override_clean_vcf_3, + runtime_override_clean_vcf_4=runtime_override_clean_vcf_4, + runtime_override_clean_vcf_5_scatter=runtime_override_clean_vcf_5_scatter, + runtime_override_clean_vcf_5_make_cleangq=runtime_override_clean_vcf_5_make_cleangq, + runtime_override_clean_vcf_5_find_redundant_multiallelics=runtime_override_clean_vcf_5_find_redundant_multiallelics, + runtime_override_clean_vcf_5_polish=runtime_override_clean_vcf_5_polish, + runtime_override_stitch_fragmented_cnvs=runtime_override_stitch_fragmented_cnvs, + runtime_override_final_cleanup=runtime_override_final_cleanup, + runtime_attr_override_subset_large_cnvs_1b=runtime_attr_override_subset_large_cnvs_1b, + runtime_attr_override_sort_bed_1b=runtime_attr_override_sort_bed_1b, + runtime_attr_override_intersect_bed_1b=runtime_attr_override_intersect_bed_1b, + runtime_attr_override_build_dict_1b=runtime_attr_override_build_dict_1b, + runtime_attr_override_scatter_1b=runtime_attr_override_scatter_1b, + runtime_attr_override_filter_vcf_1b=runtime_attr_override_filter_vcf_1b, + 
runtime_override_concat_vcfs_1b=runtime_override_concat_vcfs_1b, + runtime_override_cat_multi_cnvs_1b=runtime_override_cat_multi_cnvs_1b, + runtime_override_preconcat_step1=runtime_override_preconcat_step1, + runtime_override_hail_merge_step1=runtime_override_hail_merge_step1, + runtime_override_fix_header_step1=runtime_override_fix_header_step1, + runtime_override_preconcat_drc=runtime_override_preconcat_drc, + runtime_override_hail_merge_drc=runtime_override_hail_merge_drc, + runtime_override_fix_header_drc=runtime_override_fix_header_drc, + runtime_override_split_vcf_to_clean=runtime_override_split_vcf_to_clean, + runtime_override_combine_step_1_sex_chr_revisions=runtime_override_combine_step_1_sex_chr_revisions, + runtime_override_split_include_list=runtime_override_split_include_list, + runtime_override_combine_clean_vcf_2=runtime_override_combine_clean_vcf_2, + runtime_override_combine_revised_4=runtime_override_combine_revised_4, + runtime_override_combine_multi_ids_4=runtime_override_combine_multi_ids_4, + runtime_override_drop_redundant_cnvs=runtime_override_drop_redundant_cnvs, + runtime_override_combine_step_1_vcfs=runtime_override_combine_step_1_vcfs, + runtime_override_sort_drop_redundant_cnvs=runtime_override_sort_drop_redundant_cnvs, + runtime_override_plot_qc_vcf_wide=runtime_override_plot_qc_vcf_wide, + runtime_override_thousand_g_benchmark=runtime_override_thousand_g_benchmark, + runtime_override_thousand_g_plot=runtime_override_thousand_g_plot, + runtime_override_asc_benchmark=runtime_override_asc_benchmark, + runtime_override_asc_plot=runtime_override_asc_plot, + runtime_override_hgsv_benchmark=runtime_override_hgsv_benchmark, + runtime_override_hgsv_plot=runtime_override_hgsv_plot, + runtime_override_plot_qc_per_sample=runtime_override_plot_qc_per_sample, + runtime_override_plot_qc_per_family=runtime_override_plot_qc_per_family, + runtime_override_sanders_per_sample_plot=runtime_override_sanders_per_sample_plot, + 
runtime_override_collins_per_sample_plot=runtime_override_collins_per_sample_plot, + runtime_override_werling_per_sample_plot=runtime_override_werling_per_sample_plot, + runtime_override_sanitize_outputs=runtime_override_sanitize_outputs, + runtime_override_merge_vcfwide_stat_shards=runtime_override_merge_vcfwide_stat_shards, + runtime_override_merge_vcf_2_bed=runtime_override_merge_vcf_2_bed, + runtime_override_collect_sharded_vcf_stats=runtime_override_collect_sharded_vcf_stats, + runtime_override_svtk_vcf_2_bed=runtime_override_svtk_vcf_2_bed, + runtime_override_split_vcf_to_qc=runtime_override_split_vcf_to_qc, + runtime_override_merge_subvcf_stat_shards=runtime_override_merge_subvcf_stat_shards, + runtime_override_merge_svtk_vcf_2_bed=runtime_override_merge_svtk_vcf_2_bed, + runtime_override_collect_vids_per_sample=runtime_override_collect_vids_per_sample, + runtime_override_split_samples_list=runtime_override_split_samples_list, + runtime_override_tar_shard_vid_lists=runtime_override_tar_shard_vid_lists, + runtime_override_benchmark_samples=runtime_override_benchmark_samples, + runtime_override_split_shuffled_list=runtime_override_split_shuffled_list, + runtime_override_merge_and_tar_shard_benchmarks=runtime_override_merge_and_tar_shard_benchmarks } diff --git a/wdl/GenotypeComplexVariants.wdl b/wdl/GenotypeComplexVariants.wdl index 5f2aa0e41..d018b2704 100644 --- a/wdl/GenotypeComplexVariants.wdl +++ b/wdl/GenotypeComplexVariants.wdl @@ -12,6 +12,7 @@ workflow GenotypeComplexVariants { Array[File] depth_vcfs Boolean merge_vcfs = false + Int? records_per_shard Array[File] complex_resolve_vcfs Array[File] complex_resolve_vcf_indexes @@ -25,9 +26,14 @@ workflow GenotypeComplexVariants { File contig_list File ref_dict + Boolean use_hail = false + String? 
gcs_project + String linux_docker String sv_base_mini_docker + String sv_pipeline_updates_docker String sv_pipeline_docker + String sv_pipeline_hail_docker String sv_pipeline_rdtest_docker # overrides for mini tasks @@ -45,6 +51,9 @@ workflow GenotypeComplexVariants { RuntimeAttr? runtime_override_concat_melted_genotypes RuntimeAttr? runtime_attr_ids_from_vcf RuntimeAttr? runtime_attr_subset_ped + RuntimeAttr? runtime_override_preconcat + RuntimeAttr? runtime_override_hail_merge + RuntimeAttr? runtime_override_fix_header } scatter (i in range(length(batches))) { @@ -74,8 +83,7 @@ workflow GenotypeComplexVariants { input: bin_exclude=bin_exclude, vcf=complex_resolve_vcfs[i], - n_master_vcf_shards=200, - n_master_min_vars_per_vcf_shard=5000, + records_per_shard=select_first([records_per_shard, 50000]), batches=batches, coverage_files=bincov_files, rd_depth_sep_cutoff_files=depth_gt_rd_sep_files, @@ -84,13 +92,17 @@ workflow GenotypeComplexVariants { n_per_split_small=2500, n_per_split_large=250, n_rd_test_bins=100000, - prefix=cohort_name, + prefix="~{cohort_name}.~{contig}", contig=contig, ped_files=SubsetPedFile.ped_subset_file, ref_dict=ref_dict, + use_hail=use_hail, + gcs_project=gcs_project, linux_docker=linux_docker, + sv_pipeline_updates_docker=sv_pipeline_updates_docker, sv_base_mini_docker=sv_base_mini_docker, sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, sv_pipeline_rdtest_docker=sv_pipeline_rdtest_docker, runtime_override_ids_from_median=runtime_override_ids_from_median, runtime_override_split_vcf_to_genotype=runtime_override_split_vcf_to_genotype, @@ -100,7 +112,10 @@ workflow GenotypeComplexVariants { runtime_override_merge_melted_gts=runtime_override_merge_melted_gts, runtime_override_split_bed_by_size=runtime_override_split_bed_by_size, runtime_override_rd_genotype=runtime_override_rd_genotype, - runtime_override_concat_melted_genotypes=runtime_override_concat_melted_genotypes + 
runtime_override_concat_melted_genotypes=runtime_override_concat_melted_genotypes, + runtime_override_preconcat=runtime_override_preconcat, + runtime_override_hail_merge=runtime_override_hail_merge, + runtime_override_fix_header=runtime_override_fix_header } } diff --git a/wdl/HailMerge.wdl b/wdl/HailMerge.wdl new file mode 100644 index 000000000..abfab482e --- /dev/null +++ b/wdl/HailMerge.wdl @@ -0,0 +1,186 @@ +version 1.0 + +import "Structs.wdl" +import "TasksMakeCohortVcf.wdl" as MiniTasks + +workflow HailMerge { + input { + Array[File] vcfs + String prefix + String? gcs_project # REQUIRED + Boolean? reset_cnv_gts + String sv_base_mini_docker + String sv_pipeline_docker + String sv_pipeline_hail_docker + RuntimeAttr? runtime_override_preconcat + RuntimeAttr? runtime_override_hail_merge + RuntimeAttr? runtime_override_fix_header + } + + # Concatenate vcfs naively to prevent ClassTooLargeException in Hail + if (length(vcfs) > 1) { + call MiniTasks.ConcatVcfs as Preconcat { + input: + vcfs=vcfs, + naive=true, + generate_index=false, + outfile_prefix="~{prefix}.preconcat", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_preconcat + } + } + + call HailMerge { + input: + vcfs = [select_first([Preconcat.concat_vcf, vcfs[0]])], + prefix = prefix, + gcs_project = select_first([gcs_project]), + sv_pipeline_hail_docker=sv_pipeline_hail_docker, + runtime_attr_override=runtime_override_hail_merge + } + + call FixHeader { + input: + merged_vcf = HailMerge.merged_vcf, + example_vcf = vcfs[0], + prefix = prefix + ".reheadered", + reset_cnv_gts = select_first([reset_cnv_gts, false]), + sv_pipeline_docker = sv_pipeline_docker, + runtime_attr_override=runtime_override_fix_header + } + + output { + File merged_vcf = FixHeader.out + File merged_vcf_index = FixHeader.out_index + } +} + +task HailMerge { + input { + Array[File] vcfs + String prefix + String gcs_project + String region = "us-central1" + String sv_pipeline_hail_docker + RuntimeAttr? 
runtime_attr_override + } + + parameter_meta { + vcfs: { + localization_optional: true + } + } + + String cluster_name_prefix="gatk-sv-cluster-" + + RuntimeAttr runtime_default = object { + mem_gb: 6.5, + disk_gb: 100, + cpu_cores: 1, + preemptible_tries: 0, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: select_first([runtime_override.mem_gb, runtime_default.mem_gb]) + " GB" + disks: "local-disk " + select_first([runtime_override.disk_gb, runtime_default.disk_gb]) + " SSD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_hail_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euxo pipefail + + cp ~{write_lines(vcfs)} "files.list" + + python <>> + + output { + File merged_vcf = "~{prefix}.vcf.gz" + File merged_vcf_index = "~{prefix}.vcf.gz.tbi" + } +} + +task FixHeader { + input { + File merged_vcf + File example_vcf + String prefix + Boolean reset_cnv_gts + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10 + size(merged_vcf, "GB") * 2 + size(example_vcf, "GB")), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: select_first([runtime_override.mem_gb, runtime_default.mem_gb]) + " GB" + disks: "local-disk " + select_first([runtime_override.disk_gb, runtime_default.disk_gb]) + " SSD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euxo pipefail + + # Reset to original header + bcftools view --no-version -h ~{merged_vcf} | grep -v ^#CHROM > header + bcftools view --no-version -h ~{example_vcf} | grep -e "^##source" -e "^##ALT" -e "^##CPX_TYPE" >> header + bcftools view --no-version -h ~{merged_vcf} | grep ^#CHROM >> header + bcftools reheader -h header ~{merged_vcf} \ + ~{if reset_cnv_gts then "| gunzip | python /opt/sv-pipeline/04_variant_resolution/scripts/reset_cnv_gts.py stdin stdout | bgzip" else ""} \ + > ~{prefix}.vcf.gz + tabix ~{prefix}.vcf.gz + >>> + + output { + File out = "~{prefix}.vcf.gz" + File out_index = "~{prefix}.vcf.gz.tbi" + } +} diff --git a/wdl/HarmonizeHeaders.wdl b/wdl/HarmonizeHeaders.wdl new file mode 100644 index 000000000..fe3746d96 --- /dev/null +++ b/wdl/HarmonizeHeaders.wdl @@ -0,0 +1,79 @@ +version 1.0 + +import "Structs.wdl" +import "TasksMakeCohortVcf.wdl" as MiniTasks + +# Reheader a list of vcfs with the header from another vcf + +workflow HarmonizeHeaders { + input { + File header_vcf # Vcf containing desired header + Array[File] 
vcfs # Vcfs to replace headers of + String prefix + String sv_base_mini_docker + RuntimeAttr? runtime_override_reheader + RuntimeAttr? runtime_override_pull_header + } + + call PullHeader { + input: + vcf=header_vcf, + prefix=prefix, + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_pull_header + } + + scatter (i in range(length(vcfs))) { + call MiniTasks.ReheaderVcf { + input: + vcf=vcfs[i], + vcf_index=vcfs[i] + ".tbi", + header=PullHeader.out, + prefix="~{prefix}.reheadered", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_reheader + } + } + + output { + Array[File] out = ReheaderVcf.out + Array[File] out_index = ReheaderVcf.out_index + } +} + +task PullHeader { + input { + File vcf + String prefix + String sv_base_mini_docker + RuntimeAttr? runtime_attr_override + } + + RuntimeAttr runtime_default = object { + mem_gb: 2.0, + disk_gb: ceil(10.0 + size(vcf, "GiB") ), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_base_mini_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + bcftools view --header-only ~{vcf} > ~{prefix}.header + >>> + + output { + File out = "~{prefix}.header" + } +} \ No newline at end of file diff --git a/wdl/MakeCohortVcf.wdl b/wdl/MakeCohortVcf.wdl index c25f51186..a6f28c55a 100644 --- 
a/wdl/MakeCohortVcf.wdl +++ b/wdl/MakeCohortVcf.wdl @@ -14,6 +14,7 @@ workflow MakeCohortVcf { File ped_file # cohort ped file # Merge contig vcfs at each stage for QC + # Not recommended for very large cohorts Boolean merge_cluster_vcfs = false Boolean merge_complex_resolve_vcfs = false Boolean merge_complex_genotype_vcfs = false @@ -29,6 +30,12 @@ workflow MakeCohortVcf { Array[File] median_coverage_files Array[File] rf_cutoff_files + # Enables use of Hail for merging and sorting VCFs + # Recommended for cohorts of 10,000 samples or more + # Requires that DataProc be enabled in the GCP project + Boolean use_hail = false + String? gcs_project + File bin_exclude File contig_list File allosome_fai @@ -41,8 +48,15 @@ workflow MakeCohortVcf { Int max_shard_size_resolve Int max_shards_per_chrom_clean_vcf_step1 Int min_records_per_shard_clean_vcf_step1 + Int clean_vcf1b_records_per_shard Int samples_per_clean_vcf_step2_shard + Int clean_vcf5_records_per_shard + Int? clean_vcf5_threads_per_task Float min_sr_background_fail_batches + Int? max_samples_per_shard_clean_vcf_step3 + + String chr_x + String chr_y File empty_file File? outlier_samples_list @@ -68,6 +82,8 @@ workflow MakeCohortVcf { String linux_docker String sv_base_mini_docker String sv_pipeline_docker + String sv_pipeline_hail_docker + String sv_pipeline_updates_docker String sv_pipeline_rdtest_docker String sv_pipeline_qc_docker @@ -82,8 +98,6 @@ workflow MakeCohortVcf { RuntimeAttr? runtime_override_breakpoint_overlap_filter # overrides for mini tasks - RuntimeAttr? runtime_override_ids_from_vcf - RuntimeAttr? runtime_override_clean_bothside_pass RuntimeAttr? runtime_override_clean_background_fail RuntimeAttr? runtime_override_make_cpx_cnv_input_file RuntimeAttr? runtime_override_subset_inversions @@ -96,7 +110,8 @@ workflow MakeCohortVcf { RuntimeAttr? runtime_override_subset_bothside_pass RuntimeAttr? runtime_override_subset_background_fail RuntimeAttr? runtime_override_subset_sv_type - RuntimeAttr? 
runtime_override_shard_vcf_precluster + RuntimeAttr? runtime_override_shard_clusters + RuntimeAttr? runtime_override_shard_vids RuntimeAttr? runtime_override_pull_vcf_shard RuntimeAttr? runtime_override_svtk_vcf_cluster RuntimeAttr? runtime_override_get_vcf_header_with_members_info_line @@ -104,27 +119,44 @@ workflow MakeCohortVcf { RuntimeAttr? runtime_override_concat_vcf_cluster RuntimeAttr? runtime_override_concat_svtypes RuntimeAttr? runtime_override_concat_sharded_cluster + RuntimeAttr? runtime_override_make_sites_only + RuntimeAttr? runtime_override_preconcat_sharded_cluster + RuntimeAttr? runtime_override_hail_merge_sharded_cluster + RuntimeAttr? runtime_override_fix_header_sharded_cluster + RuntimeAttr? runtime_override_concat_large_pesr_depth - # overrides for ResolveComplexContig + # overrides for ResolveComplexVariants RuntimeAttr? runtime_override_update_sr_list_pass RuntimeAttr? runtime_override_update_sr_list_fail + RuntimeAttr? runtime_override_integrate_resolved_vcfs + RuntimeAttr? runtime_override_rename_variants + RuntimeAttr? runtime_override_breakpoint_overlap_filter + RuntimeAttr? runtime_override_subset_inversions + RuntimeAttr? runtime_override_concat_resolve + RuntimeAttr? runtime_override_get_se_cutoff RuntimeAttr? runtime_override_shard_vcf_cpx + RuntimeAttr? runtime_override_shard_vids_resolve RuntimeAttr? runtime_override_resolve_prep RuntimeAttr? runtime_override_resolve_cpx_per_shard RuntimeAttr? runtime_override_restore_unresolved_cnv_per_shard RuntimeAttr? runtime_override_concat_resolved_per_shard - RuntimeAttr? runtime_override_complex_resolve_merge - RuntimeAttr? runtime_override_merge_resolve_inner + RuntimeAttr? runtime_override_pull_vcf_shard + RuntimeAttr? runtime_override_preconcat_resolve + RuntimeAttr? runtime_override_hail_merge_resolve + RuntimeAttr? runtime_override_fix_header_resolve RuntimeAttr? runtime_override_get_se_cutoff_inv RuntimeAttr? runtime_override_shard_vcf_cpx_inv - RuntimeAttr? 
runtime_override_shard_vids_inv + RuntimeAttr? runtime_override_shard_vids_resolve_inv RuntimeAttr? runtime_override_resolve_prep_inv RuntimeAttr? runtime_override_resolve_cpx_per_shard_inv RuntimeAttr? runtime_override_restore_unresolved_cnv_per_shard_inv RuntimeAttr? runtime_override_concat_resolved_per_shard_inv - RuntimeAttr? runtime_override_merge_resolve_inner_inv + RuntimeAttr? runtime_override_pull_vcf_shard_inv + RuntimeAttr? runtime_override_preconcat_resolve_inv + RuntimeAttr? runtime_override_hail_merge_resolve_inv + RuntimeAttr? runtime_override_fix_header_resolve_inv # overrides for GenotypeComplexContig RuntimeAttr? runtime_override_ids_from_median @@ -136,27 +168,55 @@ workflow MakeCohortVcf { RuntimeAttr? runtime_override_split_bed_by_size RuntimeAttr? runtime_override_rd_genotype RuntimeAttr? runtime_override_concat_melted_genotypes - RuntimeAttr? runtime_override_complex_genotype_merge - RuntimeAttr? runtime_attr_ids_from_vcf - RuntimeAttr? runtime_attr_subset_ped + RuntimeAttr? runtime_attr_ids_from_vcf_regeno + RuntimeAttr? runtime_attr_subset_ped_regeno + RuntimeAttr? runtime_override_preconcat_regeno + RuntimeAttr? runtime_override_hail_merge_regeno + RuntimeAttr? runtime_override_fix_header_regeno # overrides for CleanVcfContig + RuntimeAttr? runtime_override_preconcat_clean_final + RuntimeAttr? runtime_override_hail_merge_clean_final + RuntimeAttr? runtime_override_fix_header_clean_final + RuntimeAttr? runtime_override_concat_cleaned_vcfs + RuntimeAttr? runtime_override_clean_vcf_1a - RuntimeAttr? runtime_override_clean_vcf_1b RuntimeAttr? runtime_override_clean_vcf_2 RuntimeAttr? runtime_override_clean_vcf_3 RuntimeAttr? runtime_override_clean_vcf_4 - RuntimeAttr? runtime_override_clean_vcf_5 - RuntimeAttr? runtime_override_drop_redundant_cnvs + RuntimeAttr? runtime_override_clean_vcf_5_scatter + RuntimeAttr? runtime_override_clean_vcf_5_make_cleangq + RuntimeAttr? runtime_override_clean_vcf_5_find_redundant_multiallelics + RuntimeAttr? 
runtime_override_clean_vcf_5_polish RuntimeAttr? runtime_override_stitch_fragmented_cnvs RuntimeAttr? runtime_override_final_cleanup + + RuntimeAttr? runtime_attr_override_subset_large_cnvs_1b + RuntimeAttr? runtime_attr_override_sort_bed_1b + RuntimeAttr? runtime_attr_override_intersect_bed_1b + RuntimeAttr? runtime_attr_override_build_dict_1b + RuntimeAttr? runtime_attr_override_scatter_1b + RuntimeAttr? runtime_attr_override_filter_vcf_1b + RuntimeAttr? runtime_override_concat_vcfs_1b + RuntimeAttr? runtime_override_cat_multi_cnvs_1b + + RuntimeAttr? runtime_override_preconcat_step1 + RuntimeAttr? runtime_override_hail_merge_step1 + RuntimeAttr? runtime_override_fix_header_step1 + + RuntimeAttr? runtime_override_preconcat_drc + RuntimeAttr? runtime_override_hail_merge_drc + RuntimeAttr? runtime_override_fix_header_drc + RuntimeAttr? runtime_override_split_vcf_to_clean - RuntimeAttr? runtime_override_combine_step_1_vcfs RuntimeAttr? runtime_override_combine_step_1_sex_chr_revisions RuntimeAttr? runtime_override_split_include_list RuntimeAttr? runtime_override_combine_clean_vcf_2 RuntimeAttr? runtime_override_combine_revised_4 RuntimeAttr? runtime_override_combine_multi_ids_4 + RuntimeAttr? runtime_override_drop_redundant_cnvs + RuntimeAttr? runtime_override_combine_step_1_vcfs + RuntimeAttr? runtime_override_sort_drop_redundant_cnvs # overrides for VcfQc RuntimeAttr? 
runtime_override_plot_qc_vcf_wide @@ -202,24 +262,32 @@ workflow MakeCohortVcf { depth_exclude_list=depth_exclude_list, min_sr_background_fail_batches=min_sr_background_fail_batches, empty_file=empty_file, + use_hail=use_hail, + gcs_project=gcs_project, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, sv_base_mini_docker=sv_base_mini_docker, sv_pipeline_docker=sv_pipeline_docker, runtime_override_update_sr_list=runtime_override_update_sr_list_cluster, runtime_override_merge_pesr_depth=runtime_override_merge_pesr_depth, - runtime_override_clean_bothside_pass=runtime_override_clean_bothside_pass, runtime_override_clean_background_fail=runtime_override_clean_background_fail, runtime_override_concat=runtime_override_cluster_merge, runtime_override_join_vcfs=runtime_override_join_vcfs, runtime_override_subset_bothside_pass=runtime_override_subset_bothside_pass, runtime_override_subset_background_fail=runtime_override_subset_background_fail, runtime_override_subset_sv_type=runtime_override_subset_sv_type, - runtime_override_shard_vcf_precluster=runtime_override_shard_vcf_precluster, + runtime_override_shard_clusters=runtime_override_shard_clusters, + runtime_override_shard_vids=runtime_override_shard_vids, runtime_override_pull_vcf_shard=runtime_override_pull_vcf_shard, runtime_override_svtk_vcf_cluster=runtime_override_svtk_vcf_cluster, runtime_override_get_vcf_header_with_members_info_line=runtime_override_get_vcf_header_with_members_info_line, runtime_override_concat_vcf_cluster=runtime_override_concat_vcf_cluster, runtime_override_concat_svtypes=runtime_override_concat_svtypes, - runtime_override_concat_sharded_cluster=runtime_override_concat_sharded_cluster + runtime_override_concat_sharded_cluster=runtime_override_concat_sharded_cluster, + runtime_override_make_sites_only=runtime_override_make_sites_only, + runtime_override_preconcat_sharded_cluster=runtime_override_preconcat_sharded_cluster, + 
runtime_override_hail_merge_sharded_cluster=runtime_override_hail_merge_sharded_cluster, + runtime_override_fix_header_sharded_cluster=runtime_override_fix_header_sharded_cluster, + runtime_override_concat_large_pesr_depth=runtime_override_concat_large_pesr_depth } call ComplexResolve.ResolveComplexVariants { @@ -236,32 +304,43 @@ workflow MakeCohortVcf { mei_bed=mei_bed, pe_exclude_list=pe_exclude_list, ref_dict=ref_dict, + use_hail=use_hail, + gcs_project=gcs_project, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, max_shard_size=max_shard_size_resolve, sv_base_mini_docker=sv_base_mini_docker, sv_pipeline_docker=sv_pipeline_docker, runtime_override_update_sr_list_pass=runtime_override_update_sr_list_pass, runtime_override_update_sr_list_fail=runtime_override_update_sr_list_fail, - runtime_override_breakpoint_overlap_filter=runtime_override_breakpoint_overlap_filter, runtime_override_integrate_resolved_vcfs=runtime_override_integrate_resolved_vcfs, runtime_override_rename_variants=runtime_override_rename_variants, + runtime_override_breakpoint_overlap_filter=runtime_override_breakpoint_overlap_filter, runtime_override_subset_inversions=runtime_override_subset_inversions, - runtime_override_concat=runtime_override_complex_resolve_merge, + runtime_override_concat=runtime_override_concat_resolve, + runtime_override_get_se_cutoff=runtime_override_get_se_cutoff, runtime_override_shard_vcf_cpx=runtime_override_shard_vcf_cpx, + runtime_override_shard_vids=runtime_override_shard_vids_resolve, runtime_override_resolve_prep=runtime_override_resolve_prep, runtime_override_resolve_cpx_per_shard=runtime_override_resolve_cpx_per_shard, runtime_override_restore_unresolved_cnv_per_shard=runtime_override_restore_unresolved_cnv_per_shard, runtime_override_concat_resolved_per_shard=runtime_override_concat_resolved_per_shard, - runtime_override_merge_resolve_inner=runtime_override_merge_resolve_inner, - - runtime_override_get_se_cutoff_inv=runtime_override_get_se_cutoff_inv, - 
runtime_override_shard_vcf_cpx_inv=runtime_override_shard_vcf_cpx_inv,
-      runtime_override_shard_vids_inv=runtime_override_shard_vids_inv,
-      runtime_override_resolve_prep_inv=runtime_override_resolve_prep_inv,
-      runtime_override_resolve_cpx_per_shard_inv=runtime_override_resolve_cpx_per_shard_inv,
-      runtime_override_restore_unresolved_cnv_per_shard_inv=runtime_override_restore_unresolved_cnv_per_shard_inv,
-      runtime_override_concat_resolved_per_shard_inv=runtime_override_concat_resolved_per_shard_inv,
-      runtime_override_merge_resolve_inner_inv=runtime_override_merge_resolve_inner_inv
+      runtime_override_pull_vcf_shard=runtime_override_pull_vcf_shard,
+      runtime_override_preconcat=runtime_override_preconcat_resolve,
+      runtime_override_hail_merge=runtime_override_hail_merge_resolve,
+      runtime_override_fix_header=runtime_override_fix_header_resolve,
+
+      runtime_override_get_se_cutoff_inv=runtime_override_get_se_cutoff_inv,
+      runtime_override_shard_vcf_cpx_inv=runtime_override_shard_vcf_cpx_inv,
+      runtime_override_shard_vids_inv=runtime_override_shard_vids_resolve_inv,
+      runtime_override_resolve_prep_inv=runtime_override_resolve_prep_inv,
+      runtime_override_resolve_cpx_per_shard_inv=runtime_override_resolve_cpx_per_shard_inv,
+      runtime_override_restore_unresolved_cnv_per_shard_inv=runtime_override_restore_unresolved_cnv_per_shard_inv,
+      runtime_override_concat_resolved_per_shard_inv=runtime_override_concat_resolved_per_shard_inv,
+      runtime_override_pull_vcf_shard_inv=runtime_override_pull_vcf_shard_inv,
+      runtime_override_preconcat_inv=runtime_override_preconcat_resolve_inv,
+      runtime_override_hail_merge_inv=runtime_override_hail_merge_resolve_inv,
+      runtime_override_fix_header_inv=runtime_override_fix_header_resolve_inv
   }

   call ComplexGenotype.GenotypeComplexVariants {
@@ -281,9 +360,10 @@ workflow MakeCohortVcf {
       ref_dict=ref_dict,
       linux_docker=linux_docker,
       sv_base_mini_docker=sv_base_mini_docker,
+      sv_pipeline_hail_docker=sv_pipeline_hail_docker,
       sv_pipeline_docker=sv_pipeline_docker,
+
sv_pipeline_updates_docker=sv_pipeline_updates_docker, sv_pipeline_rdtest_docker=sv_pipeline_rdtest_docker, - runtime_override_concat=runtime_override_complex_genotype_merge, runtime_override_ids_from_median=runtime_override_ids_from_median, runtime_override_split_vcf_to_genotype=runtime_override_split_vcf_to_genotype, runtime_override_concat_cpx_cnv_vcfs=runtime_override_concat_cpx_cnv_vcfs, @@ -293,8 +373,11 @@ workflow MakeCohortVcf { runtime_override_split_bed_by_size=runtime_override_split_bed_by_size, runtime_override_rd_genotype=runtime_override_rd_genotype, runtime_override_concat_melted_genotypes=runtime_override_concat_melted_genotypes, - runtime_attr_ids_from_vcf=runtime_attr_ids_from_vcf, - runtime_attr_subset_ped=runtime_attr_subset_ped + runtime_attr_ids_from_vcf=runtime_attr_ids_from_vcf_regeno, + runtime_attr_subset_ped=runtime_attr_subset_ped_regeno, + runtime_override_preconcat=runtime_override_preconcat_regeno, + runtime_override_hail_merge=runtime_override_hail_merge_regeno, + runtime_override_fix_header=runtime_override_fix_header_regeno } call Clean.CleanVcf { @@ -306,31 +389,59 @@ workflow MakeCohortVcf { merged_ped_file=ped_file, contig_list=contig_list, allosome_fai=allosome_fai, - max_shards_per_chrom_clean_vcf_step1=max_shards_per_chrom_clean_vcf_step1, - min_records_per_shard_clean_vcf_step1=min_records_per_shard_clean_vcf_step1, - samples_per_clean_vcf_step2_shard=samples_per_clean_vcf_step2_shard, + chr_x=chr_x, + chr_y=chr_y, + max_shards_per_chrom_step1=max_shards_per_chrom_clean_vcf_step1, + min_records_per_shard_step1=min_records_per_shard_clean_vcf_step1, + clean_vcf1b_records_per_shard=clean_vcf1b_records_per_shard, + samples_per_step2_shard=samples_per_clean_vcf_step2_shard, + max_samples_per_shard_step3=max_samples_per_shard_clean_vcf_step3, + clean_vcf5_records_per_shard=clean_vcf5_records_per_shard, outlier_samples_list=outlier_samples_list, + use_hail=use_hail, + gcs_project=gcs_project, linux_docker=linux_docker, 
sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_updates_docker=sv_pipeline_updates_docker, + runtime_override_preconcat_clean_final=runtime_override_preconcat_clean_final, + runtime_override_hail_merge_clean_final=runtime_override_hail_merge_clean_final, + runtime_override_fix_header_clean_final=runtime_override_fix_header_clean_final, runtime_override_concat_cleaned_vcfs=runtime_override_concat_cleaned_vcfs, runtime_override_clean_vcf_1a=runtime_override_clean_vcf_1a, - runtime_override_clean_vcf_1b=runtime_override_clean_vcf_1b, runtime_override_clean_vcf_2=runtime_override_clean_vcf_2, runtime_override_clean_vcf_3=runtime_override_clean_vcf_3, runtime_override_clean_vcf_4=runtime_override_clean_vcf_4, - runtime_override_clean_vcf_5=runtime_override_clean_vcf_5, + runtime_override_clean_vcf_5_scatter=runtime_override_clean_vcf_5_scatter, + runtime_override_clean_vcf_5_make_cleangq=runtime_override_clean_vcf_5_make_cleangq, + runtime_override_clean_vcf_5_find_redundant_multiallelics=runtime_override_clean_vcf_5_find_redundant_multiallelics, + runtime_override_clean_vcf_5_polish=runtime_override_clean_vcf_5_polish, runtime_override_stitch_fragmented_cnvs=runtime_override_stitch_fragmented_cnvs, runtime_override_final_cleanup=runtime_override_final_cleanup, + runtime_attr_override_subset_large_cnvs_1b=runtime_attr_override_subset_large_cnvs_1b, + runtime_attr_override_sort_bed_1b=runtime_attr_override_sort_bed_1b, + runtime_attr_override_intersect_bed_1b=runtime_attr_override_intersect_bed_1b, + runtime_attr_override_build_dict_1b=runtime_attr_override_build_dict_1b, + runtime_attr_override_scatter_1b=runtime_attr_override_scatter_1b, + runtime_attr_override_filter_vcf_1b=runtime_attr_override_filter_vcf_1b, + runtime_override_concat_vcfs_1b=runtime_override_concat_vcfs_1b, + runtime_override_cat_multi_cnvs_1b=runtime_override_cat_multi_cnvs_1b, + 
runtime_override_preconcat_step1=runtime_override_preconcat_step1, + runtime_override_hail_merge_step1=runtime_override_hail_merge_step1, + runtime_override_fix_header_step1=runtime_override_fix_header_step1, + runtime_override_preconcat_drc=runtime_override_preconcat_drc, + runtime_override_hail_merge_drc=runtime_override_hail_merge_drc, + runtime_override_fix_header_drc=runtime_override_fix_header_drc, runtime_override_split_vcf_to_clean=runtime_override_split_vcf_to_clean, - runtime_override_combine_step_1_vcfs=runtime_override_combine_step_1_vcfs, runtime_override_combine_step_1_sex_chr_revisions=runtime_override_combine_step_1_sex_chr_revisions, runtime_override_split_include_list=runtime_override_split_include_list, runtime_override_combine_clean_vcf_2=runtime_override_combine_clean_vcf_2, runtime_override_combine_revised_4=runtime_override_combine_revised_4, runtime_override_combine_multi_ids_4=runtime_override_combine_multi_ids_4, - runtime_attr_ids_from_vcf=runtime_attr_ids_from_vcf, - runtime_attr_subset_ped=runtime_attr_subset_ped + runtime_override_drop_redundant_cnvs=runtime_override_drop_redundant_cnvs, + runtime_override_combine_step_1_vcfs=runtime_override_combine_step_1_vcfs, + runtime_override_sort_drop_redundant_cnvs=runtime_override_sort_drop_redundant_cnvs } Array[String] contigs = transpose(read_tsv(contig_list))[0] diff --git a/wdl/MergePesrDepth.wdl b/wdl/MergePesrDepth.wdl new file mode 100644 index 000000000..c7756bc28 --- /dev/null +++ b/wdl/MergePesrDepth.wdl @@ -0,0 +1,237 @@ +version 1.0 + +import "Structs.wdl" +import "TasksMakeCohortVcf.wdl" as MiniTasks +import "HailMerge.wdl" as HailMerge +import "ShardedCluster.wdl" as ShardedCluster +import "Utils.wdl" as utils + +workflow MergePesrDepth { + input { + File subtyped_pesr_vcf + File subtyped_depth_vcf + Int num_samples + + String prefix + String cohort_name + String svtype + String contig + Float merging_shard_scale_factor = 30000000 + + Boolean use_hail = false + String? 
gcs_project
+
+    String sv_pipeline_docker
+    String sv_pipeline_hail_docker
+    String sv_base_mini_docker
+
+    # overrides for local tasks
+    RuntimeAttr? runtime_override_shard_clusters
+    RuntimeAttr? runtime_override_shard_vids
+    RuntimeAttr? runtime_override_pull_vcf_shard
+    RuntimeAttr? runtime_override_merge_pesr_depth
+
+    # overrides for MiniTasks
+    RuntimeAttr? runtime_override_sort_merged_vcf
+    RuntimeAttr? runtime_override_subset_small
+    RuntimeAttr? runtime_override_subset_large
+    RuntimeAttr? runtime_override_make_sites_only
+    RuntimeAttr? runtime_override_concat_large_pesr_depth
+    RuntimeAttr? runtime_override_concat_shards
+
+    RuntimeAttr? runtime_override_preconcat_large_pesr_depth
+    RuntimeAttr? runtime_override_hail_merge_large_pesr_depth
+    RuntimeAttr? runtime_override_fix_header_large_pesr_depth
+
+    RuntimeAttr? runtime_override_preconcat_pesr_depth_shards
+    RuntimeAttr? runtime_override_hail_merge_pesr_depth_shards
+    RuntimeAttr? runtime_override_fix_header_pesr_depth_shards
+  }
+
+  # Pull out CNVs too small to cluster (less than reciprocal_overlap_fraction * min_depth_only_length)
+  call MiniTasks.FilterVcf as SubsetSmall {
+    input:
+      vcf=subtyped_pesr_vcf,
+      vcf_index=subtyped_pesr_vcf + ".tbi",
+      outfile_prefix="~{prefix}.subset_small",
+      records_filter='INFO/SVLEN<2500',
+      use_ssd=true,
+      sv_base_mini_docker=sv_base_mini_docker,
+      runtime_attr_override=runtime_override_subset_small
+  }
+
+  call MiniTasks.FilterVcf as SubsetLarge {
+    input:
+      vcf=subtyped_pesr_vcf,
+      vcf_index=subtyped_pesr_vcf + ".tbi",
+      outfile_prefix="~{prefix}.subset_large",
+      records_filter='INFO/SVLEN>=2500',
+      use_ssd=true,
+      sv_base_mini_docker=sv_base_mini_docker,
+      runtime_attr_override=runtime_override_subset_large
+  }
+
+  if (use_hail) {
+    call HailMerge.HailMerge as ConcatLargePesrDepthHail {
+      input:
+        vcfs=[SubsetLarge.filtered_vcf, subtyped_depth_vcf],
+        prefix="~{prefix}.large_pesr_depth",
+        gcs_project=gcs_project,
+        sv_base_mini_docker=sv_base_mini_docker,
+
sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, + runtime_override_preconcat=runtime_override_preconcat_large_pesr_depth, + runtime_override_hail_merge=runtime_override_hail_merge_large_pesr_depth, + runtime_override_fix_header=runtime_override_fix_header_large_pesr_depth + } + } + if (!use_hail) { + call MiniTasks.ConcatVcfs as ConcatLargePesrDepth { + input: + vcfs=[SubsetLarge.filtered_vcf, subtyped_depth_vcf], + vcfs_idx=[SubsetLarge.filtered_vcf + ".tbi", subtyped_depth_vcf + ".tbi"], + allow_overlaps=true, + outfile_prefix="~{prefix}.large_pesr_depth", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_concat_large_pesr_depth + } + } + + call MiniTasks.MakeSitesOnlyVcf { + input: + vcf=select_first([ConcatLargePesrDepth.concat_vcf, ConcatLargePesrDepthHail.merged_vcf]), + vcf_index=select_first([ConcatLargePesrDepth.concat_vcf_idx, ConcatLargePesrDepthHail.merged_vcf_index]), + prefix="~{prefix}.large_pesr_depth.sites_only", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_make_sites_only + } + + # Fast cluster without sample overlap linkage for sharding + Int merge_shard_size = ceil(merging_shard_scale_factor / num_samples) + call ShardedCluster.ShardClusters { + input: + vcf=MakeSitesOnlyVcf.out, + prefix="~{prefix}.shard_clusters", + dist=1000000000, + frac=0.5, + svsize=0, + sv_types=[svtype], + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_override_shard_clusters + } + + call MiniTasks.ShardVidsForClustering { + input: + clustered_vcf=ShardClusters.out, + prefix=prefix, + records_per_shard=merge_shard_size, + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_override_shard_vids + } + + scatter (i in range(length(ShardVidsForClustering.out))) { + call MiniTasks.PullVcfShard { + input: + vcf=select_first([ConcatLargePesrDepth.concat_vcf, ConcatLargePesrDepthHail.merged_vcf]), + 
vids=ShardVidsForClustering.out[i], + prefix="~{prefix}.unclustered.shard_${i}", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_pull_vcf_shard + } + call MergePesrDepthShard { + input: + vcf=PullVcfShard.out, + vcf_index=PullVcfShard.out_index, + prefix="~{prefix}.merge_pesr_depth.shard_~{i}", + vid_prefix="~{cohort_name}_~{contig}_mpd~{i}", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_override_merge_pesr_depth + } + call MiniTasks.SortVcf { + input: + vcf = MergePesrDepthShard.out, + outfile_prefix = "~{prefix}.sorted.shard_${i}", + sv_base_mini_docker = sv_base_mini_docker, + runtime_attr_override = runtime_override_sort_merged_vcf + } + } + + if (use_hail) { + call HailMerge.HailMerge as ConcatShardsHail { + input: + vcfs=flatten([[SubsetSmall.filtered_vcf], SortVcf.out]), + prefix="~{prefix}.concat_shards", + gcs_project=gcs_project, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, + runtime_override_preconcat=runtime_override_preconcat_pesr_depth_shards, + runtime_override_hail_merge=runtime_override_hail_merge_pesr_depth_shards, + runtime_override_fix_header=runtime_override_fix_header_pesr_depth_shards + } + } + if (!use_hail) { + call MiniTasks.ConcatVcfs as ConcatShards { + input: + vcfs=flatten([[SubsetSmall.filtered_vcf], SortVcf.out]), + vcfs_idx=flatten([[SubsetSmall.filtered_vcf_idx], SortVcf.out_index]), + allow_overlaps=true, + outfile_prefix="~{prefix}.concat_shards", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_concat_shards + } + } + + output { + File out = select_first([ConcatShards.concat_vcf, ConcatShardsHail.merged_vcf]) + File out_index = select_first([ConcatShards.concat_vcf_idx, ConcatShardsHail.merged_vcf_index]) + } +} + + +task MergePesrDepthShard { + input { + File vcf + File vcf_index + String prefix + String vid_prefix + String sv_pipeline_docker 
+ RuntimeAttr? runtime_attr_override + } + + String output_file = prefix + ".vcf.gz" + + # when filtering/sorting/etc, memory usage will likely go up (much of the data will have to + # be held in memory or disk while working, potentially in a form that takes up more space) + Float input_size = size(vcf, "GiB") + RuntimeAttr runtime_default = object { + mem_gb: 2.0 + 0.6 * input_size, + disk_gb: ceil(10.0 + 6 * input_size), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} SSD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + /opt/sv-pipeline/04_variant_resolution/scripts/merge_pesr_depth.py \ + --prefix ~{vid_prefix} \ + ~{vcf} \ + ~{output_file} + >>> + + output { + File out = output_file + } +} diff --git a/wdl/Module07MinGQ.wdl b/wdl/Module07MinGQ.wdl index e2bf9263e..c134d0f45 100644 --- a/wdl/Module07MinGQ.wdl +++ b/wdl/Module07MinGQ.wdl @@ -6,12 +6,11 @@ import "Structs.wdl" import "TasksMakeCohortVcf.wdl" as MiniTasks import "ReviseSVtypeINStoMEI.wdl" as ReviseSVtype - - workflow Module07MinGQ { - input{ + input { String sv_base_mini_docker String sv_pipeline_docker + String sv_pipeline_updates_docker File vcf File vcf_idx String prefix @@ -56,7 +55,7 @@ workflow Module07MinGQ { Array[Array[String]] contigs = read_tsv(contiglist) # Get svtype of MEI - call 
ReviseSVtype.ReviseSVtypeINStoMEI as ReviseSVtypeMEI{ + call ReviseSVtype.ReviseSVtypeINStoMEI as ReviseSVtypeMEI { input: vcf = vcf, vcf_idx = vcf_idx, @@ -87,20 +86,23 @@ workflow Module07MinGQ { input: vcf=ReviseSVtypeMEI.updated_vcf, vcf_idx=ReviseSVtypeMEI.updated_vcf_idx, - contig=contig[0], sv_per_shard=1000, - prefix=prefix, - sv_pipeline_docker=sv_pipeline_docker - } - call SplitPcrVcf { - input: - vcf=getAFs.vcf_wAFs, prefix="~{prefix}.~{contig[0]}", - pcrplus_samples_list=pcrplus_samples_list, - sv_base_mini_docker=sv_base_mini_docker + sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_updates_docker=sv_pipeline_updates_docker } + if (defined(pcrplus_samples_list)) { + call SplitPcrVcf { + input: + vcf=getAFs.vcf_wAFs, + prefix="~{prefix}.~{contig[0]}", + pcrplus_samples_list=pcrplus_samples_list, + sv_base_mini_docker=sv_base_mini_docker + } + } + File pcr_minus_vcf = select_first([SplitPcrVcf.PCRMINUS_vcf, getAFs.vcf_wAFs]) - # Dev note Feb 18 2021: the output from cat_AF_table_PCRMINUS is a required + # Dev note Feb 18 2021: the output from cat_AF_table_PCRMINUS is a required # input to Module07XfBatchEffect.wdl, so the subsequent three tasks always # need to be generated (even if passing a precomputed minGQ cutoff table) @@ -109,11 +111,11 @@ workflow Module07MinGQ { input: vcf=ReviseSVtypeMEI.updated_vcf, vcf_idx=ReviseSVtypeMEI.updated_vcf_idx, - contig=contig[0], sv_per_shard=1000, - prefix=prefix, + prefix="~{prefix}.~{contig[0]}", sample_pop_assignments=GetSampleLists.sample_PCR_labels, - sv_pipeline_docker=sv_pipeline_docker + sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_updates_docker=sv_pipeline_updates_docker } # Gather table of AC/AN/AF for PCRPLUS and PCRMINUS samples call GetAfTables { @@ -133,12 +135,12 @@ workflow Module07MinGQ { } - if (MingqTraining){ + if (MingqTraining) { ###PCRMINUS call SplitFamfile as SplitFamfile_PCRMINUS { input: - vcf=SplitPcrVcf.PCRMINUS_vcf[0], - vcf_idx=SplitPcrVcf.PCRMINUS_vcf_idx[0], + 
vcf=pcr_minus_vcf, + vcf_idx=pcr_minus_vcf + ".tbi", famfile=trios_famfile, fams_per_shard=1, prefix="~{prefix}.PCRMINUS", @@ -147,7 +149,7 @@ workflow Module07MinGQ { scatter ( fam in SplitFamfile_PCRMINUS.famfile_shards ) { call CollectTrioSVdat as CollectTrioSVdat_PCRMINUS { input: - vcf_shards=SplitPcrVcf.PCRMINUS_vcf, + vcf_shards=pcr_minus_vcf, famfile=fam, sv_pipeline_docker=sv_pipeline_docker } @@ -263,53 +265,6 @@ workflow Module07MinGQ { } } -# revise svtype of MEIs to SVTYPE=MEI -task ReviseSVtypeMEI{ - input{ - File vcf - File vcf_idx - String prefix - String sv_base_mini_docker - RuntimeAttr? runtime_attr_override - } - - RuntimeAttr default_attr = object { - cpu_cores: 1, - mem_gb: 3.75, - disk_gb: 100, - boot_disk_gb: 10, - preemptible_tries: 3, - max_retries: 1 - } - - RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) - - command <<< - zcat ~{vcf} | grep '#' > ~{prefix}.vcf - zcat ~{vcf} | grep -v '#' | grep "INS:ME" | sed -e "s/SVTYPE=INS/SVTYPE=MEI/" >> ~{prefix}.vcf - zcat ~{vcf} | grep -v '#' | grep -v "INS:ME" >> ~{prefix}.vcf - mkdir tmp - vcf-sort -t tmp/ ~{prefix}.vcf | bgzip > ~{prefix}.vcf.gz - tabix -p vcf ~{prefix}.vcf.gz - >>> - - output{ - File updated_vcf = "~{prefix}.vcf.gz" - File updated_vcf_idx = "~{prefix}.vcf.gz.tbi" - } - - runtime { - cpu: select_first([runtime_attr.cpu_cores, default_attr.cpu_cores]) - memory: select_first([runtime_attr.mem_gb, default_attr.mem_gb]) + " GiB" - disks: "local-disk " + select_first([runtime_attr.disk_gb, default_attr.disk_gb]) + " HDD" - bootDiskSizeGb: select_first([runtime_attr.boot_disk_gb, default_attr.boot_disk_gb]) - docker: sv_base_mini_docker - preemptible: select_first([runtime_attr.preemptible_tries, default_attr.preemptible_tries]) - maxRetries: select_first([runtime_attr.max_retries, default_attr.max_retries]) - } -} - - # Get lists of PCRPLUS and PCRMINUS samples present in input VCF task GetSampleLists { input{ @@ -324,7 +279,7 @@ task GetSampleLists { 
RuntimeAttr default_attr = object { cpu_cores: 1, mem_gb: 3.75, - disk_gb: 50, + disk_gb: ceil(10 + size(vcf, "GB")), boot_disk_gb: 10, preemptible_tries: 3, max_retries: 1 @@ -332,26 +287,17 @@ task GetSampleLists { RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr]) command <<< set -euo pipefail - tabix -H ~{vcf} | fgrep -v "##" | cut -f10- | sed 's/\t/\n/g' > all_samples.list - if [ ! -z "~{pcrplus_samples_list}" ];then - fgrep -wf ~{pcrplus_samples_list} all_samples.list > "~{prefix}.PCRPLUS.samples.list" || true - fgrep -wvf ~{pcrplus_samples_list} all_samples.list > "~{prefix}.PCRMINUS.samples.list" || true - cat \ - <( awk -v OFS="\t" '{ print $1, "PCRPLUS" }' "~{prefix}.PCRPLUS.samples.list" || true ) \ - <( awk -v OFS="\t" '{ print $1, "PCRMINUS" }' "~{prefix}.PCRMINUS.samples.list" || true ) \ - > "~{prefix}.PCR_status_assignments.txt" + bcftools query -l ~{vcf} > all_samples.list + if ~{defined(pcrplus_samples_list)}; then + awk -v OFS="\t" 'ARGIND==1{inFileA[$1]; next} {if($1 in inFileA){print $1,"PCRPLUS"}else{print $1,"PCRMINUS"}}' ~{pcrplus_samples_list} all_samples.list \ + > ~{prefix}.PCR_status_assignments.txt else - cp all_samples.list "~{prefix}.PCRMINUS.samples.list" - cat \ - <( awk -v OFS="\t" '{ print $1, "PCRMINUS" }' "~{prefix}.PCRMINUS.samples.list" || true ) \ - > "~{prefix}.PCR_status_assignments.txt" - touch ~{prefix}.PCRPLUS.samples.list + awk -v OFS="\t" '{ print $1, "PCRMINUS" }' all_samples.list \ + > ~{prefix}.PCR_status_assignments.txt fi >>> output { - File updated_pcrplus_samples_list = "~{prefix}.PCRPLUS.samples.list" - File updated_PCRMINUS_samples_list = "~{prefix}.PCRMINUS.samples.list" File sample_PCR_labels = "~{prefix}.PCR_status_assignments.txt" } @@ -372,14 +318,14 @@ task SplitPcrVcf { input{ File vcf String prefix - File? pcrplus_samples_list + File pcrplus_samples_list String sv_base_mini_docker RuntimeAttr? 
runtime_attr_override
  }

  RuntimeAttr default_attr = object {
    cpu_cores: 1,
    mem_gb: 3.75,
-    disk_gb: 50,
+    disk_gb: ceil(10 + size(vcf, "GB") * 2),
    boot_disk_gb: 10,
    preemptible_tries: 3,
    max_retries: 1
@@ -387,34 +333,14 @@ task SplitPcrVcf {
  RuntimeAttr runtime_attr = select_first([runtime_attr_override, default_attr])

  command <<<
-    if [ ! -z "~{pcrplus_samples_list}" ] && [ $( cat "~{pcrplus_samples_list}" | wc -l ) -gt 0 ]; then
-      #Get index of PCR+ samples
-      PCRPLUS_idxs=$( zcat ~{vcf} | sed -n '1,500p' | fgrep "#" | fgrep -v "##" \
-        | sed 's/\t/\n/g' | awk -v OFS="\t" '{ print NR, $1 }' \
-        | fgrep -wf ~{pcrplus_samples_list} | cut -f1 | paste -s -d, )
-      #Get PCR+ VCF
-      zcat ~{vcf} \
-      | cut -f1-9,"$PCRPLUS_idxs" \
-      | bgzip -c \
-      > "~{prefix}.PCRPLUS.vcf.gz"
-      tabix -f -p vcf "~{prefix}.PCRPLUS.vcf.gz"
-      #Get PCR- VCF
-      zcat ~{vcf} \
-      | cut --complement -f"$PCRPLUS_idxs" \
-      | bgzip -c \
-      > "~{prefix}.PCRMINUS.vcf.gz"
-      tabix -f -p vcf "~{prefix}.PCRMINUS.vcf.gz"
-    else
-      cp ~{vcf} ~{prefix}.PCRMINUS.vcf.gz
-      tabix -f -p vcf "~{prefix}.PCRMINUS.vcf.gz"
-      touch ~{prefix}.PCRPLUS.vcf.gz
-      touch ~{prefix}.PCRPLUS.vcf.gz.tbi
-    fi
+    bcftools query -l ~{vcf} > all_samples.list
+    awk 'ARGIND==1{inFileA[$1]; next} !($1 in inFileA)' ~{pcrplus_samples_list} all_samples.list \
+      > pcrminus_samples.list
+    bcftools view -S pcrminus_samples.list ~{vcf} -Oz -o ~{prefix}.PCRMINUS.vcf.gz
+    tabix ~{prefix}.PCRMINUS.vcf.gz
  >>>

  output {
-    File PCRPLUS_vcf = "~{prefix}.PCRPLUS.vcf.gz"
-    File PCRPLUS_vcf_idx = "~{prefix}.PCRPLUS.vcf.gz.tbi"
    File PCRMINUS_vcf = "~{prefix}.PCRMINUS.vcf.gz"
    File PCRMINUS_vcf_idx = "~{prefix}.PCRMINUS.vcf.gz.tbi"
  }
@@ -444,7 +370,7 @@ task GetAfTables {
  RuntimeAttr default_attr = object {
    cpu_cores: 1,
    mem_gb: 3.75,
-    disk_gb: 50,
+    disk_gb: ceil(10 + size(vcf, "GB") * 3),
    boot_disk_gb: 10,
    preemptible_tries: 3,
    max_retries: 1
@@ -464,9 +390,9 @@
      | cut -f2 \
      | paste -s -d\, || true )
    cut -f"$idxs" "~{prefix}.vcf2bed.bed" \
-      | sed
's/^name/\#VID/g' \ - | gzip -c \ - > "~{prefix}.frequencies.preclean.txt.gz" + | sed 's/^name/\#VID/g' \ + | gzip -c \ + > "~{prefix}.frequencies.preclean.txt.gz" if [ ! -z "~{pcrplus_samples_list}" ]; then echo -e "dummy\tPCRMINUS\ndummy2\tPCRPLUS" > dummy.tsv else @@ -481,9 +407,9 @@ task GetAfTables { AC_idx=$( zcat "~{prefix}.frequencies.txt.gz" | sed -n '1p' | sed 's/\t/\n/g' | awk -v PCR="$PCR" '{ if ($1==PCR"_AC") print NR }' ) AN_idx=$( zcat "~{prefix}.frequencies.txt.gz" | sed -n '1p' | sed 's/\t/\n/g' | awk -v PCR="$PCR" '{ if ($1==PCR"_AN") print NR }' ) zcat "~{prefix}.frequencies.txt.gz" \ - | sed '1d' \ - | awk -v FS="\t" -v OFS="\t" -v AC="$AC_idx" -v AN="$AN_idx" \ - '{ print $1, $(AC), $(AN) }' \ + | sed '1d' \ + | awk -v FS="\t" -v OFS="\t" -v AC="$AC_idx" -v AN="$AN_idx" \ + '{ print $1, $(AC), $(AN) }' \ > ~{prefix}."$PCR".AF_preMinGQ.txt done if [ ! -z ~{prefix}.PCRPLUS.AF_preMinGQ.txt ]; then diff --git a/wdl/PatchSRBothsidePass.wdl b/wdl/PatchSRBothsidePass.wdl new file mode 100644 index 000000000..7b6ad66ec --- /dev/null +++ b/wdl/PatchSRBothsidePass.wdl @@ -0,0 +1,133 @@ +version 1.0 + +import "Utils.wdl" as utils +import "Structs.wdl" + +workflow PatchSRBothsidePass { + input { + Array[File] batch_sample_lists + File cohort_vcf + File updated_bothside_pass_list + String cohort_name + String contig + + File patch_script + + String sv_base_mini_docker + String sv_pipeline_docker + + RuntimeAttr? runtime_attr_get_non_ref_vids + RuntimeAttr? 
runtime_attr_calculate_support_frac + } + + scatter (i in range(length(batch_sample_lists))) { + call GetNonRefVariantLists { + input: + samples_list=batch_sample_lists[i], + cohort_vcf=cohort_vcf, + prefix="~{cohort_name}.~{contig}.non_ref_variants.shard_~{i}", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_attr_get_non_ref_vids + } + } + + call RecalculateBothsideSupportFractions { + input: + patch_script=patch_script, + non_ref_vid_lists=GetNonRefVariantLists.out, + updated_bothside_pass_list=updated_bothside_pass_list, + num_batches=length(batch_sample_lists), + prefix="~{cohort_name}.~{contig}.sr_bothside_support.patched", + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_override=runtime_attr_calculate_support_frac + } + + output { + File out = RecalculateBothsideSupportFractions.out + } +} + +task GetNonRefVariantLists { + input { + File samples_list + File cohort_vcf + String prefix + String sv_base_mini_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size(cohort_vcf, "GB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_base_mini_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + bcftools view --samples-file ~{samples_list} ~{cohort_vcf} \ + 
| bcftools view -G -i 'SUM(AC)>0||SUM(FORMAT/SR_GT)>0' \ + | bcftools query -f '%ID\n' \ + > ~{prefix}.list + >>> + output { + File out = "~{prefix}.list" + } +} + +task RecalculateBothsideSupportFractions { + input { + File patch_script + Array[File] non_ref_vid_lists + File updated_bothside_pass_list + Int num_batches + String prefix + String sv_pipeline_docker + RuntimeAttr? runtime_attr_override + } + + Float input_size = size(non_ref_vid_lists, "GB") + size(updated_bothside_pass_list, "GB") + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + input_size * 2.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + python ~{patch_script} \ + ~{write_lines(non_ref_vid_lists)} \ + ~{updated_bothside_pass_list} \ + ~{num_batches} \ + > ~{prefix}.txt + >>> + output { + File out = "~{prefix}.txt" + } +} \ No newline at end of file diff --git a/wdl/PatchSRBothsidePassScatter.wdl b/wdl/PatchSRBothsidePassScatter.wdl new file mode 100644 index 000000000..ae4d77e6b --- /dev/null +++ b/wdl/PatchSRBothsidePassScatter.wdl @@ -0,0 +1,54 @@ +version 1.0 + +import "Utils.wdl" as utils +import "PatchSRBothsidePass.wdl" as patch +import "Structs.wdl" + +workflow PatchSRBothsidePassScatter { + input { + Array[File] batch_vcfs + 
Array[File] cohort_contig_vcfs + Array[File] updated_bothside_pass_lists + String cohort_name + File contig_list + + File patch_script + + String sv_base_mini_docker + String sv_pipeline_docker + + RuntimeAttr? runtime_attr_get_sample_ids + RuntimeAttr? runtime_attr_get_non_ref_vids + RuntimeAttr? runtime_attr_calculate_support_frac + } + + scatter (i in range(length(batch_vcfs))) { + call utils.GetSampleIdsFromVcf { + input: + vcf=batch_vcfs[i], + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_attr_get_sample_ids + } + } + + Array[String] contigs = transpose(read_tsv(contig_list))[0] + scatter ( i in range(length(contigs)) ) { + call patch.PatchSRBothsidePass { + input: + batch_sample_lists=GetSampleIdsFromVcf.out_file, + cohort_vcf=cohort_contig_vcfs[i], + updated_bothside_pass_list=updated_bothside_pass_lists[i], + cohort_name=cohort_name, + contig=contigs[i], + patch_script=patch_script, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + runtime_attr_get_non_ref_vids=runtime_attr_get_non_ref_vids, + runtime_attr_calculate_support_frac=runtime_attr_calculate_support_frac + } + } + + output { + Array[File] out = PatchSRBothsidePass.out + } +} diff --git a/wdl/ResolveComplexVariants.wdl b/wdl/ResolveComplexVariants.wdl index 220607943..3ecd65740 100644 --- a/wdl/ResolveComplexVariants.wdl +++ b/wdl/ResolveComplexVariants.wdl @@ -23,18 +23,19 @@ workflow ResolveComplexVariants { File pe_exclude_list File ref_dict + Boolean use_hail = false + String? gcs_project + String sv_base_mini_docker String sv_pipeline_docker + String sv_pipeline_hail_docker # overrides for local tasks RuntimeAttr? runtime_override_update_sr_list_pass RuntimeAttr? runtime_override_update_sr_list_fail RuntimeAttr? runtime_override_integrate_resolved_vcfs RuntimeAttr? runtime_override_rename_variants - RuntimeAttr? runtime_override_breakpoint_overlap_filter - - # overrides for mini tasks RuntimeAttr? 
runtime_override_subset_inversions RuntimeAttr? runtime_override_concat @@ -46,7 +47,10 @@ workflow ResolveComplexVariants { RuntimeAttr? runtime_override_resolve_cpx_per_shard RuntimeAttr? runtime_override_restore_unresolved_cnv_per_shard RuntimeAttr? runtime_override_concat_resolved_per_shard - RuntimeAttr? runtime_override_merge_resolve_inner + RuntimeAttr? runtime_override_pull_vcf_shard + RuntimeAttr? runtime_override_preconcat + RuntimeAttr? runtime_override_hail_merge + RuntimeAttr? runtime_override_fix_header RuntimeAttr? runtime_override_get_se_cutoff_inv RuntimeAttr? runtime_override_shard_vcf_cpx_inv @@ -55,7 +59,10 @@ workflow ResolveComplexVariants { RuntimeAttr? runtime_override_resolve_cpx_per_shard_inv RuntimeAttr? runtime_override_restore_unresolved_cnv_per_shard_inv RuntimeAttr? runtime_override_concat_resolved_per_shard_inv - RuntimeAttr? runtime_override_merge_resolve_inner_inv + RuntimeAttr? runtime_override_pull_vcf_shard_inv + RuntimeAttr? runtime_override_preconcat_inv + RuntimeAttr? runtime_override_hail_merge_inv + RuntimeAttr? 
runtime_override_fix_header_inv } #Scatter per chromosome @@ -87,11 +94,13 @@ workflow ResolveComplexVariants { mei_bed=mei_bed, pe_exclude_list=pe_exclude_list, rf_cutoff_files=rf_cutoff_files, - inv_only=true, ref_dict=ref_dict, precluster_distance=50000000, precluster_overlap_frac=0.1, + use_hail=use_hail, + gcs_project=gcs_project, sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, sv_base_mini_docker=sv_base_mini_docker, runtime_override_get_se_cutoff=runtime_override_get_se_cutoff_inv, runtime_override_shard_vcf_cpx=runtime_override_shard_vcf_cpx_inv, @@ -100,7 +109,10 @@ workflow ResolveComplexVariants { runtime_override_resolve_cpx_per_shard=runtime_override_resolve_cpx_per_shard_inv, runtime_override_restore_unresolved_cnv_per_shard=runtime_override_restore_unresolved_cnv_per_shard_inv, runtime_override_concat_resolved_per_shard=runtime_override_concat_resolved_per_shard_inv, - runtime_override_merge_resolve_inner=runtime_override_merge_resolve_inner_inv + runtime_override_pull_vcf_shard=runtime_override_pull_vcf_shard_inv, + runtime_override_preconcat=runtime_override_preconcat_inv, + runtime_override_hail_merge=runtime_override_hail_merge_inv, + runtime_override_fix_header=runtime_override_fix_header_inv } #Run same-bp overlap filter on full vcf @@ -127,11 +139,13 @@ workflow ResolveComplexVariants { mei_bed=mei_bed, pe_exclude_list=pe_exclude_list, rf_cutoff_files=rf_cutoff_files, - inv_only=false, ref_dict=ref_dict, - precluster_distance=1000, - precluster_overlap_frac=0, + precluster_distance=2000, + precluster_overlap_frac=0.000000001, + use_hail=use_hail, + gcs_project=gcs_project, sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, sv_base_mini_docker=sv_base_mini_docker, runtime_override_get_se_cutoff=runtime_override_get_se_cutoff, runtime_override_shard_vcf_cpx=runtime_override_shard_vcf_cpx, @@ -140,7 +154,10 @@ workflow ResolveComplexVariants { 
runtime_override_resolve_cpx_per_shard=runtime_override_resolve_cpx_per_shard, runtime_override_restore_unresolved_cnv_per_shard=runtime_override_restore_unresolved_cnv_per_shard, runtime_override_concat_resolved_per_shard=runtime_override_concat_resolved_per_shard, - runtime_override_merge_resolve_inner=runtime_override_merge_resolve_inner, + runtime_override_pull_vcf_shard=runtime_override_pull_vcf_shard, + runtime_override_preconcat=runtime_override_preconcat, + runtime_override_hail_merge=runtime_override_hail_merge, + runtime_override_fix_header=runtime_override_fix_header } #Integrate inv-only and all-variants resolved VCFs @@ -158,6 +175,8 @@ workflow ResolveComplexVariants { input: vcf=IntegrateResolvedVcfs.integrated_vcf, prefix="~{cohort_name}.~{contig}.renamed", + chrom=contig, + vid_prefix=cohort_name, sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_rename_variants } @@ -167,7 +186,7 @@ workflow ResolveComplexVariants { input: vcf=RenameVariants.renamed_vcf, original_list=cluster_bothside_pass_lists[i], - outfile="sr_bothside_pass.~{contig}.updated3.txt", + outfile="~{cohort_name}.~{contig}.sr_bothside_pass.updated3.txt", sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_update_sr_list_pass } @@ -177,7 +196,7 @@ workflow ResolveComplexVariants { input: vcf=RenameVariants.renamed_vcf, original_list=cluster_background_fail_lists[i], - outfile="sr_background_fail.~{contig}.updated3.txt", + outfile="~{cohort_name}.~{contig}.sr_background_fail.updated3.txt", sv_pipeline_docker=sv_pipeline_docker, runtime_attr_override=runtime_override_update_sr_list_fail } @@ -202,6 +221,8 @@ workflow ResolveComplexVariants { Array[File] complex_resolve_vcf_indexes = RenameVariants.renamed_vcf_index Array[File] complex_resolve_bothside_pass_lists = UpdateBothsidePass.updated_list Array[File] complex_resolve_background_fail_lists = UpdateBackgroundFail.updated_list + Array[File] breakpoint_overlap_dropped_record_vcfs = 
BreakpointOverlap.dropped_record_vcf + Array[File] breakpoint_overlap_dropped_record_vcf_indexes = BreakpointOverlap.dropped_record_vcf_index File? merged_vcf = ConcatVcfs.concat_vcf File? merged_vcf_index = ConcatVcfs.concat_vcf_idx } @@ -219,10 +240,10 @@ task IntegrateResolvedVcfs { Float input_size = size([inv_res_vcf, all_res_vcf], "GiB") RuntimeAttr runtime_default = object { - mem_gb: 2.0, - disk_gb: ceil(10 + input_size * 10), + mem_gb: 3.75, + disk_gb: ceil(10 + input_size * 20), cpu_cores: 1, - preemptible_tries: 1, + preemptible_tries: 3, max_retries: 1, boot_disk_gb: 10 } @@ -256,23 +277,23 @@ task IntegrateResolvedVcfs { ##get unresolved variants from full vcf that are resolved in inversion resolved vcf### zcat ~{inv_res_vcf} \ - | fgrep -v "#" \ + |fgrep -v "#" \ |awk '{if ($8!~"UNRESOLVED") print}' \ - |fgrep -wvf <(awk '{if ($NF!="MEMBERS") print $NF}' all.resolved.inv.bed \ - |tr ',' '\n') \ - >add.vcf.lines.txt || true + |awk -F'\t' -v OFS='\t' 'ARGIND==1{inFileA[$1]; next} {if (!($3 in inFileA)) print }' \ + <(awk '{if ($NF!="MEMBERS") print $NF}' all.resolved.inv.bed | tr ',' '\n') - \ + >add.vcf.lines.txt ##get unresolved variants id from full vcf to strip since they are resolved in inversion resolved vcf### ##inversions that cluster were other variants (rare) are kept as unresolved though they will also be part of a resolved variant in add.vcf.lines.txt## awk '{if ($NF!="MEMBERS") print $NF}' inv.resolve.bed \ |tr ',' '\n'\ - |fgrep -wf - all.unresolved.inv.bed \ + |awk -F'\t' -v OFS='\t' 'ARGIND==1{inFileA[$4]; next} {if ($4 in inFileA) print }' all.resolved.inv.bed - \ |awk '{if ($NF!~",")print $4}' \ - >remove.unresolved.vcf.ids.txt || true + >remove.unresolved.vcf.ids.txt mkdir temp zcat ~{all_res_vcf} \ - |fgrep -wvf remove.unresolved.vcf.ids.txt \ + |awk -F'\t' -v OFS='\t' 'ARGIND==1{inFileA[$1]; next} {if (!($3 in inFileA)) print }' remove.unresolved.vcf.ids.txt - \ |cat - add.vcf.lines.txt \ |bcftools sort - -O z -T temp \ > 
~{prefix}.vcf.gz @@ -326,24 +347,28 @@ task BreakpointOverlap { ~{vcf} \ ~{bothside_pass_list} \ ~{background_fail_list} \ + ~{prefix}.dropped_records.vcf.gz \ | bgzip \ > ~{prefix}.vcf.gz tabix ~{prefix}.vcf.gz + tabix ~{prefix}.dropped_records.vcf.gz >>> output { File out = "~{prefix}.vcf.gz" File out_index = "~{prefix}.vcf.gz.tbi" + File dropped_record_vcf = "~{prefix}.dropped_records.vcf.gz" + File dropped_record_vcf_index = "~{prefix}.dropped_records.vcf.gz.tbi" } } - - # Rename variants in VCF task RenameVariants { input { File vcf + String vid_prefix String prefix + String chrom String sv_pipeline_docker RuntimeAttr? runtime_attr_override } @@ -372,7 +397,7 @@ task RenameVariants { command <<< set -euo pipefail - /opt/sv-pipeline/04_variant_resolution/scripts/rename.py --prefix ~{prefix} ~{vcf} - \ + /opt/sv-pipeline/04_variant_resolution/scripts/rename.py --chrom ~{chrom} --prefix ~{vid_prefix} ~{vcf} - \ | bgzip \ > ~{prefix}.vcf.gz tabix ~{prefix}.vcf.gz diff --git a/wdl/ResolveCpxSv.wdl b/wdl/ResolveCpxSv.wdl index 4630263f5..cd7b103a9 100644 --- a/wdl/ResolveCpxSv.wdl +++ b/wdl/ResolveCpxSv.wdl @@ -3,6 +3,7 @@ version 1.0 # Author: Ryan Collins import "TasksMakeCohortVcf.wdl" as MiniTasks +import "HailMerge.wdl" as HailMerge #Resolve complex SV for a single chromosome workflow ResolveComplexSv { @@ -16,13 +17,16 @@ workflow ResolveComplexSv { Array[File] disc_files Array[File] rf_cutoff_files File pe_exclude_list - Boolean inv_only File ref_dict Int precluster_distance Float precluster_overlap_frac + Boolean use_hail + String? gcs_project + String sv_pipeline_docker + String sv_pipeline_hail_docker String sv_base_mini_docker # overrides for local tasks @@ -32,10 +36,12 @@ workflow ResolveComplexSv { RuntimeAttr? runtime_override_resolve_prep RuntimeAttr? runtime_override_resolve_cpx_per_shard RuntimeAttr? runtime_override_restore_unresolved_cnv_per_shard - RuntimeAttr? runtime_override_merge_resolve_inner - - # overrides for MiniTasks RuntimeAttr? 
runtime_override_concat_resolved_per_shard + RuntimeAttr? runtime_override_pull_vcf_shard + + RuntimeAttr? runtime_override_preconcat + RuntimeAttr? runtime_override_hail_merge + RuntimeAttr? runtime_override_fix_header } File vcf_idx = vcf + ".tbi" @@ -45,11 +51,7 @@ workflow ResolveComplexSv { File disc_files_idx = disc_files[i] + ".tbi" } - # Get SR count cutoff from RF metrics to use in single-ender rescan procedure - #Shard vcf for complex resolution - #Note: as of Nov 2, 2018, return lists of variant IDs for each shard. This should - # dramatically improve sharding speed call ShardVcfCpx { input: vcf=vcf, @@ -61,7 +63,7 @@ workflow ResolveComplexSv { runtime_attr_override=runtime_override_shard_vcf_cpx } - call MiniTasks.ShardVids { + call MiniTasks.ShardVidsForClustering { input: clustered_vcf=ShardVcfCpx.out, prefix=prefix, @@ -70,8 +72,9 @@ workflow ResolveComplexSv { runtime_attr_override=runtime_override_shard_vids } - if (length(ShardVids.out) > 0) { + if (length(ShardVidsForClustering.out) > 0) { + # Get SR count cutoff from RF metrics to use in single-ender rescan procedure call GetSeCutoff { input: rf_cutoffs=rf_cutoff_files, @@ -80,13 +83,21 @@ workflow ResolveComplexSv { } #Scatter over shards and resolve variants per shard - scatter ( i in range(length(ShardVids.out)) ) { + scatter ( i in range(length(ShardVidsForClustering.out)) ) { + + call MiniTasks.PullVcfShard { + input: + vcf=vcf, + vids=ShardVidsForClustering.out[i], + prefix="~{prefix}.shard_${i}", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_pull_vcf_shard + } #Prep files for svtk resolve using bucket streaming call ResolvePrep { input: - vcf=vcf, - VIDs_list=ShardVids.out[i], + vcf=PullVcfShard.out, chrom=contig, disc_files=disc_files, disc_files_index=disc_files_idx, @@ -98,7 +109,7 @@ workflow ResolveComplexSv { #Run svtk resolve call SvtkResolve { input: - noref_vcf=ResolvePrep.noref_vcf, + vcf=PullVcfShard.out, 
prefix="~{prefix}.svtk_resolve.shard_~{i}", chrom=contig, cytobands=cytobands, @@ -113,20 +124,10 @@ workflow ResolveComplexSv { runtime_attr_override=runtime_override_resolve_cpx_per_shard } - call MergeResolve { - input: - full_vcf=ResolvePrep.subsetted_vcf, - resolved_vcf=SvtkResolve.rs_vcf, - prefix="~{prefix}.merge_resolve.shard_~{i}", - noref_vids=ResolvePrep.noref_vids, - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_merge_resolve_inner - } - #Add unresolved variants back into resolved VCF - call RestoreUnresolvedCnv as RestoreUnresolvedCnvPerShard { + call RestoreUnresolvedCnv { input: - resolved_vcf=MergeResolve.out, + resolved_vcf=SvtkResolve.rs_vcf, unresolved_vcf=SvtkResolve.un_vcf, prefix="~{prefix}.restore_unresolved.shard_~{i}", sv_pipeline_docker=sv_pipeline_docker, @@ -135,20 +136,36 @@ workflow ResolveComplexSv { } #Merge across shards - call MiniTasks.ConcatVcfs as ConcatResolvedPerShard { - input: - vcfs=RestoreUnresolvedCnvPerShard.res, - vcfs_idx=RestoreUnresolvedCnvPerShard.res_idx, - allow_overlaps=true, - outfile_prefix=prefix + ".resolved", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_concat_resolved_per_shard + if (use_hail) { + call HailMerge.HailMerge as ConcatResolvedPerShardHail { + input: + vcfs=RestoreUnresolvedCnv.res, + prefix="~{prefix}.resolved", + gcs_project=gcs_project, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, + runtime_override_preconcat=runtime_override_preconcat, + runtime_override_hail_merge=runtime_override_hail_merge, + runtime_override_fix_header=runtime_override_fix_header + } + } + if (!use_hail) { + call MiniTasks.ConcatVcfs as ConcatResolvedPerShard { + input: + vcfs=RestoreUnresolvedCnv.res, + vcfs_idx=RestoreUnresolvedCnv.res_idx, + allow_overlaps=true, + outfile_prefix="~{prefix}.resolved", + sv_base_mini_docker=sv_base_mini_docker, + 
runtime_attr_override=runtime_override_concat_resolved_per_shard + } } } output { - File resolved_vcf_merged = select_first([ConcatResolvedPerShard.concat_vcf, vcf]) - File resolved_vcf_merged_idx = select_first([ConcatResolvedPerShard.concat_vcf_idx, vcf_idx]) + File resolved_vcf_merged = select_first([ConcatResolvedPerShard.concat_vcf, ConcatResolvedPerShardHail.merged_vcf, vcf]) + File resolved_vcf_merged_idx = select_first([ConcatResolvedPerShard.concat_vcf_idx, ConcatResolvedPerShardHail.merged_vcf_index, vcf_idx]) } } @@ -202,7 +219,6 @@ task GetSeCutoff { } } - task ShardVcfCpx { input { File vcf @@ -242,6 +258,7 @@ task ShardVcfCpx { svtk vcfcluster <(echo "sites_only.vcf.gz") ~{prefix}.vcf \ -d ~{dist} \ -f ~{frac} \ + --single-end \ -p candidate_complex_clusters \ --svtypes DEL,DUP,INS,INV,BND \ --ignore-svtypes \ @@ -261,7 +278,6 @@ task ShardVcfCpx { task ResolvePrep { input { File vcf - File VIDs_list File ref_dict String chrom Array[File] disc_files @@ -281,18 +297,9 @@ task ResolvePrep { # sections of disc_files are straemed, but the every operation in this task is record-by-record except # bedtools merge, which should only need to keep a few records in memory at a time. 
- # assuming memory overhead is fixed - # assuming disk overhead is input size (accounting for compression) + sum(size of disc_files) - # (this is an over-estimate because we only take chunks overlapping VIDs from vcf, but the disk files are not *THAT* - # big and disk is cheap) - Float compressed_input_size = size(vcf, "GiB") - Float uncompressed_input_size = size([VIDs_list], "GiB") - Float compression_factor = 30.0 - Float base_disk_gb = 10.0 - Float base_mem_gb = 2.0 RuntimeAttr runtime_default = object { - mem_gb: base_mem_gb, - disk_gb: ceil(base_disk_gb + uncompressed_input_size + compression_factor * compressed_input_size), + mem_gb: 2.0, + disk_gb: ceil(20.0 + 200.0 * size(vcf, "GiB")), cpu_cores: 1, preemptible_tries: 3, max_retries: 1, @@ -315,53 +322,12 @@ task ResolvePrep { command <<< set -euxo pipefail - - # First, subset VCF to variants of interest - # -uncompress vcf - zcat "~{vcf}" > uncompressed.vcf - # -Extract vcf header: - # search for first line not starting with '#', stop immediately, - # take everything up to that point, then remove last line - ONLY_HEADER=false - grep -B9999999999 -m1 -Ev "^#" uncompressed.vcf | sed '$ d' > header.vcf \ - || ONLY_HEADER=true - - if $ONLY_HEADER; then - # filter is trivial, just copy the vcf - mv "~{vcf}" input.vcf.gz - else - rm -f "~{vcf}" - N_HEADER=$(wc -l < header.vcf) - # filter records, concatenate and zip - tail -n+$((N_HEADER+1)) uncompressed.vcf \ - | { fgrep -wf ~{VIDs_list} || true; } \ - | cat header.vcf - \ - | bgzip -c \ - > input.vcf.gz - rm -f uncompressed.vcf - fi - - #Second, extract all-ref variants from VCF. 
These break svtk resolve with - # remote tabixing enabled - svtk vcf2bed input.vcf.gz input.bed - { grep -Ev "^#" input.bed || true ; } \ - | awk -v FS="\t" '{ if ($6!="") print $4 }' \ - > noref.VIDs.list - - { - cat header.vcf; - zcat input.vcf.gz | fgrep -wf noref.VIDs.list || true; - } \ - | vcf-sort \ - | bgzip -c \ - > noref.vcf.gz - rm -f header.vcf - - #Third, use GATK to pull down the discfile chunks within ±2kb of all + #Use GATK to pull down the discfile chunks within ±2kb of all # INVERSION breakpoints, and bgzip / tabix echo "Forming regions.bed" - { grep -Ev "^#" input.bed || true; } \ + svtk vcf2bed ~{vcf} input.bed --no-samples --no-header + cat input.bed \ | (fgrep INV || printf "") \ | awk -v OFS="\t" -v buffer=2000 \ '{ print $1, $2-buffer, $2+buffer"\n"$1, $3-buffer, $3+buffer }' \ @@ -395,7 +361,7 @@ task ResolvePrep { rm ${SLICE}.PE.txt done < ~{write_lines(disc_files)} - #Fourth, merge PE files and add one artificial pair corresponding to the chromosome of interest + #Merge PE files and add one artificial pair corresponding to the chromosome of interest #This makes it so that svtk doesn't break downstream echo "Merging PE files" { @@ -423,9 +389,6 @@ task ResolvePrep { >>> output { - File subsetted_vcf = "input.vcf.gz" - File noref_vcf = "noref.vcf.gz" - File noref_vids = "noref.VIDs.list" File merged_discfile = "discfile.PE.txt.gz" File merged_discfile_idx = "discfile.PE.txt.gz.tbi" } @@ -434,7 +397,7 @@ task ResolvePrep { #Resolve complex SV task SvtkResolve { input { - File noref_vcf + File vcf String prefix String chrom File cytobands @@ -454,13 +417,15 @@ task SvtkResolve { # when filtering/sorting/etc, memory usage will likely go up (much of the data will have to # be held in memory or disk while working, potentially in a form that takes up more space) - Float input_size = size( - [noref_vcf, cytobands, mei_bed, pe_exclude_list, pe_exclude_list_idx,merged_discfile], "GiB") + Float input_size = size([vcf, merged_discfile], "GiB") + Float 
scaled_mem_gib = 3 + size(vcf, "GiB") * 40 + size(merged_discfile, "GiB") * 80 + # Cap memory at largest N2 VM size of 512 GB (476.8 GiB) + Float default_mem_gib = if (scaled_mem_gib < 476.0) then scaled_mem_gib else 476.0 RuntimeAttr runtime_default = object { - mem_gb: 3 + input_size * 10, + mem_gb: default_mem_gib, disk_gb: ceil(10 + input_size * 12), cpu_cores: 1, - preemptible_tries: 1, + preemptible_tries: 3, max_retries: 1, boot_disk_gb: 10 } @@ -480,7 +445,7 @@ task SvtkResolve { #Run svtk resolve on variants after all-ref exclusion svtk resolve \ - ~{noref_vcf} \ + ~{vcf} \ ~{resolved_vcf} \ -p AllBatches_CPX_~{chrom} \ -u ~{unresolved_vcf} \ @@ -500,60 +465,6 @@ task SvtkResolve { } } -task MergeResolve { - input { - File full_vcf - File resolved_vcf - String prefix - File noref_vids - String sv_base_mini_docker - RuntimeAttr? runtime_attr_override - } - - String out_vcf = "~{prefix}.resolved.vcf.gz" - - # when filtering/sorting/etc, memory usage will likely go up (much of the data will have to - # be held in memory or disk while working, potentially in a form that takes up more space) - Float input_size = size([full_vcf, resolved_vcf, noref_vids], "GiB") - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10 + input_size * 15), - cpu_cores: 1, - preemptible_tries: 1, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_base_mini_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, 
runtime_default.boot_disk_gb]) - } - - command <<< - set -eu -o pipefail - #Add all-ref variants back into resolved VCF - #Note: requires modifying the INFO field with sed & awk given pysam C bug - zcat ~{full_vcf} \ - | grep -Ev "^#" \ - | awk 'ARGIND==1{inFileA[$1]; next} {if (!($3 in inFileA)) print }' ~{noref_vids} - OFS='\t' \ - | sed -e 's/;MEMBERS=[^\t]*\t/\t/g' \ - | awk -v OFS="\t" '{ $8=$8";MEMBERS="$3; print }' \ - | cat <(zcat ~{resolved_vcf}) - \ - | vcf-sort \ - | bgzip \ - > ~{out_vcf} - >>> - - output { - File out = out_vcf - } -} - #Restore unresolved CNVs to resolved VCF task RestoreUnresolvedCnv { input { @@ -589,47 +500,47 @@ task RestoreUnresolvedCnv { } command <<< - set -eu -o pipefail + set -euo pipefail # get unresolved records - zcat ~{unresolved_vcf} \ - | (grep -v "^#" || printf "") \ - > unresolved_records.vcf + bcftools view --no-header ~{unresolved_vcf} -Oz -o unresolved_records.vcf.gz rm "~{unresolved_vcf}" # avoid possible obliteration of input file during later processing by writing # to temporary file (and postCPX_cleanup.py writing final result to output name) - zcat ~{resolved_vcf} > ~{resolved_plus_cnv}.tmp - rm ~{resolved_vcf} + mv ~{resolved_vcf} ~{resolved_plus_cnv}.tmp.gz #Add unresolved CNVs to resolved VCF and wipe unresolved status - cat unresolved_records.vcf \ + zcat unresolved_records.vcf.gz \ | (fgrep -e "" -e "" -e "SVTYPE=DEL" -e "SVTYPE=DUP" -e "SVTYPE=CNV" -e "SVTYPE=MCNV" || printf "") \ | sed -r -e 's/;EVENT=[^;]*;/;/' -e 's/;UNRESOLVED[^;]*;/;/g' \ | sed -r -e 's/;UNRESOLVED_TYPE[^;]*;/;/g' -e 's/;UNRESOLVED_TYPE[^\t]*\t/\t/g' \ - >> ~{resolved_plus_cnv}.tmp + | bgzip \ + >> ~{resolved_plus_cnv}.tmp.gz #Add other unresolved variants & retain unresolved status (except for inversion single enders) - cat unresolved_records.vcf \ + zcat unresolved_records.vcf.gz \ | (fgrep -v -e "" -e "" -e "SVTYPE=DEL" -e "SVTYPE=DUP" -e "SVTYPE=CNV" -e "SVTYPE=MCNV" \ -e "INVERSION_SINGLE_ENDER" || printf "") \ - >> 
~{resolved_plus_cnv}.tmp + | bgzip \ + >> ~{resolved_plus_cnv}.tmp.gz #Add inversion single enders as SVTYPE=BND - cat unresolved_records.vcf \ + zcat unresolved_records.vcf.gz \ | (fgrep -v -e "" -e "" -e "SVTYPE=DEL" -e "SVTYPE=DUP" -e "SVTYPE=CNV" -e "SVTYPE=MCNV" || printf "") \ | (fgrep -e "INVERSION_SINGLE_ENDER" || printf "") \ | sed -e 's/SVTYPE=INV/SVTYPE=BND/g' \ | sed -e 's/END=\([0-9]*\)/END=\1;END2=\1/' \ - >> ~{resolved_plus_cnv}.tmp - rm unresolved_records.vcf + | bgzip \ + >> ~{resolved_plus_cnv}.tmp.gz + rm unresolved_records.vcf.gz #Sort, clean, and compress - cat ~{resolved_plus_cnv}.tmp \ + zcat ~{resolved_plus_cnv}.tmp.gz \ | vcf-sort -c \ | /opt/sv-pipeline/04_variant_resolution/scripts/postCPX_cleanup.py \ /dev/stdin /dev/stdout \ - | bgzip -c \ + | bgzip \ > ~{resolved_plus_cnv} tabix ~{resolved_plus_cnv} >>> diff --git a/wdl/ScatterCpxGenotyping.wdl b/wdl/ScatterCpxGenotyping.wdl index 01be4ce5a..af21479ea 100644 --- a/wdl/ScatterCpxGenotyping.wdl +++ b/wdl/ScatterCpxGenotyping.wdl @@ -4,6 +4,7 @@ version 1.0 import "GenotypeCpxCnvs.wdl" as GenotypeCpx import "TasksMakeCohortVcf.wdl" as MiniTasks +import "HailMerge.wdl" as HailMerge # Workflow to perform depth-based genotyping for a single vcf shard scattered # across batches on predicted CPX CNVs @@ -11,8 +12,7 @@ workflow ScatterCpxGenotyping { input { File bin_exclude File vcf - Int n_master_vcf_shards - Int n_master_min_vars_per_vcf_shard + Int records_per_shard Array[String] batches Array[File] coverage_files Array[File] rd_depth_sep_cutoff_files @@ -26,16 +26,24 @@ workflow ScatterCpxGenotyping { String contig File ref_dict + Boolean use_hail + String? gcs_project + String linux_docker String sv_base_mini_docker + String sv_pipeline_updates_docker String sv_pipeline_docker + String sv_pipeline_hail_docker String sv_pipeline_rdtest_docker # overrides for MiniTasks - RuntimeAttr? runtime_override_ids_from_vcf RuntimeAttr? runtime_override_split_vcf_to_genotype RuntimeAttr? 
runtime_override_concat_cpx_cnv_vcfs + RuntimeAttr? runtime_override_preconcat + RuntimeAttr? runtime_override_hail_merge + RuntimeAttr? runtime_override_fix_header + # overrides for GenotypeCpx RuntimeAttr? runtime_override_ids_from_median RuntimeAttr? runtime_override_get_cpx_cnv_intervals @@ -49,18 +57,17 @@ workflow ScatterCpxGenotyping { String contig_prefix = prefix + "." + contig # Shard VCF into even slices - call MiniTasks.SplitVcf as SplitVcfToGenotype { + call MiniTasks.ScatterVcf as SplitVcfToGenotype { input: vcf=vcf, - prefix=contig_prefix + ".shard_", - n_shards=n_master_vcf_shards, - min_vars_per_shard=n_master_min_vars_per_vcf_shard, - sv_base_mini_docker=sv_base_mini_docker, + prefix=contig_prefix, + records_per_shard=records_per_shard, + sv_pipeline_docker=sv_pipeline_updates_docker, runtime_attr_override=runtime_override_split_vcf_to_genotype } # Scatter genotyping over shards - scatter ( shard in SplitVcfToGenotype.vcf_shards ) { + scatter ( shard in SplitVcfToGenotype.shards ) { # Run genotyping call GenotypeCpx.GenotypeCpxCnvs as GenotypeShard { input: @@ -92,20 +99,35 @@ workflow ScatterCpxGenotyping { } } - # Merge VCF shards - call MiniTasks.ConcatVcfs as ConcatCpxCnvVcfs { - input: - vcfs=GenotypeShard.cpx_depth_gt_resolved_vcf, - vcfs_idx=GenotypeShard.cpx_depth_gt_resolved_vcf_idx, - allow_overlaps=true, - outfile_prefix=contig_prefix + ".regenotyped", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_concat_cpx_cnv_vcfs + if (use_hail) { + call HailMerge.HailMerge as ConcatCpxCnvVcfsHail { + input: + vcfs=GenotypeShard.cpx_depth_gt_resolved_vcf, + prefix="~{prefix}.regenotyped", + gcs_project=gcs_project, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, + runtime_override_preconcat=runtime_override_preconcat, + runtime_override_hail_merge=runtime_override_hail_merge, + 
runtime_override_fix_header=runtime_override_fix_header + } + } + if (!use_hail) { + call MiniTasks.ConcatVcfs as ConcatCpxCnvVcfs { + input: + vcfs=GenotypeShard.cpx_depth_gt_resolved_vcf, + vcfs_idx=GenotypeShard.cpx_depth_gt_resolved_vcf_idx, + naive=true, + outfile_prefix="~{prefix}.regenotyped", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_concat_cpx_cnv_vcfs + } } # Output merged VCF output { - File cpx_depth_gt_resolved_vcf = ConcatCpxCnvVcfs.concat_vcf - File cpx_depth_gt_resolved_vcf_idx = ConcatCpxCnvVcfs.concat_vcf_idx + File cpx_depth_gt_resolved_vcf = select_first([ConcatCpxCnvVcfs.concat_vcf, ConcatCpxCnvVcfsHail.merged_vcf]) + File cpx_depth_gt_resolved_vcf_idx = select_first([ConcatCpxCnvVcfs.concat_vcf_idx, ConcatCpxCnvVcfsHail.merged_vcf_index]) } } diff --git a/wdl/ShardedCluster.wdl b/wdl/ShardedCluster.wdl index 62009d67a..8c3990f43 100644 --- a/wdl/ShardedCluster.wdl +++ b/wdl/ShardedCluster.wdl @@ -4,35 +4,45 @@ version 1.0 import "Structs.wdl" import "TasksMakeCohortVcf.wdl" as MiniTasks -import "Utils.wdl" as utils +import "HailMerge.wdl" as HailMerge # Workflow to shard a filtered vcf & run vcfcluster (sub-sub-sub workflow) workflow ShardedCluster { input { File vcf + Int num_samples Int dist Float frac String prefix String contig + String cohort_name + String evidence_type String sv_type Float sample_overlap File? exclude_list + File empty_file Int sv_size Array[String] sv_types Float merging_shard_scale_factor = 30000000 + Boolean use_hail + String? gcs_project + String sv_pipeline_docker + String sv_pipeline_hail_docker String sv_base_mini_docker - # Do not use - File? NONE_FILE_ - # overrides for local tasks - RuntimeAttr? runtime_override_shard_vcf_precluster + RuntimeAttr? runtime_override_shard_clusters + RuntimeAttr? runtime_override_shard_vids RuntimeAttr? runtime_override_pull_vcf_shard RuntimeAttr? runtime_override_svtk_vcf_cluster RuntimeAttr? 
runtime_override_get_vcf_header_with_members_info_line + RuntimeAttr? runtime_override_preconcat_sharded_cluster + RuntimeAttr? runtime_override_hail_merge_sharded_cluster + RuntimeAttr? runtime_override_fix_header_sharded_cluster + # overrides for merge subworkflow RuntimeAttr? runtime_override_merge_clusters RuntimeAttr? runtime_override_concat_inner_shards @@ -41,25 +51,32 @@ workflow ShardedCluster { RuntimeAttr? runtime_override_concat_sharded_cluster RuntimeAttr? runtime_override_sort_merged_vcf RuntimeAttr? runtime_override_count_samples + RuntimeAttr? runtime_override_get_vids + RuntimeAttr? runtime_override_cat_vid_lists_sharded + RuntimeAttr? runtime_override_make_sites_only } + File vcf_idx = vcf + ".tbi" if (defined(exclude_list)) { File exclude_list_idx = exclude_list + ".tbi" } - call utils.CountSamples { + call MiniTasks.MakeSitesOnlyVcf { input: - vcf=vcf, - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_count_samples + vcf=vcf, + vcf_index=vcf + ".tbi", + prefix="~{prefix}.sites_only", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_make_sites_only } - Int merge_shard_size = ceil(merging_shard_scale_factor / CountSamples.num_samples) + + Int merge_shard_size = ceil(merging_shard_scale_factor / num_samples) call ShardClusters { input: - vcf=vcf, - prefix="~{prefix}.shard_clusters", + vcf=MakeSitesOnlyVcf.out, + prefix="~{prefix}.sites_only.shard_clusters", dist=dist, frac=frac, exclude_list=exclude_list, @@ -67,24 +84,24 @@ workflow ShardedCluster { svsize=sv_size, sv_types=sv_types, sv_pipeline_docker=sv_pipeline_docker, - runtime_attr_override=runtime_override_svtk_vcf_cluster + runtime_attr_override=runtime_override_shard_clusters } - call MiniTasks.ShardVids { + call MiniTasks.ShardVidsForClustering { input: clustered_vcf=ShardClusters.out, - prefix=prefix, + prefix="~{prefix}.sites_only.clustered", records_per_shard=merge_shard_size, sv_pipeline_docker=sv_pipeline_docker, - 
runtime_attr_override=runtime_override_svtk_vcf_cluster + runtime_attr_override=runtime_override_shard_vids } #Run vcfcluster per shard - scatter (i in range(length(ShardVids.out))) { - call PullVcfShard { + scatter (i in range(length(ShardVidsForClustering.out))) { + call MiniTasks.PullVcfShard { input: vcf=vcf, - vids=ShardVids.out[i], + vids=ShardVidsForClustering.out[i], prefix="~{prefix}.unclustered.shard_${i}", sv_base_mini_docker=sv_base_mini_docker, runtime_attr_override=runtime_override_pull_vcf_shard @@ -92,10 +109,10 @@ workflow ShardedCluster { call SvtkVcfCluster { input: vcf=PullVcfShard.out, - num_samples=CountSamples.num_samples, + num_samples=num_samples, num_vids=PullVcfShard.count, prefix="~{prefix}.clustered.shard_${i}", - vid_prefix="~{prefix}_~{contig}_~{sv_type}_~{i}", + vid_prefix="~{cohort_name}_~{contig}_~{evidence_type}_~{sv_type}_~{i}", dist=dist, frac=frac, exclude_list=exclude_list, @@ -119,27 +136,43 @@ workflow ShardedCluster { call GetVcfHeaderWithMembersInfoLine { input: vcf_gz=vcf, - prefix="~{prefix}.members", + prefix="~{prefix}.clustered", sv_base_mini_docker=sv_base_mini_docker, runtime_attr_override=runtime_override_get_vcf_header_with_members_info_line } } if (length(SvtkVcfCluster.out) > 0) { - call MiniTasks.ConcatVcfs { - input: - vcfs=SortVcf.out, - vcfs_idx=SortVcf.out_index, - allow_overlaps=true, - outfile_prefix="~{prefix}.clustered", - sv_base_mini_docker=sv_base_mini_docker, - runtime_attr_override=runtime_override_concat_sharded_cluster + if (use_hail) { + call HailMerge.HailMerge as ConcatVcfsHail { + input: + vcfs=SortVcf.out, + prefix="~{prefix}.clustered", + gcs_project=gcs_project, + sv_base_mini_docker=sv_base_mini_docker, + sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, + runtime_override_preconcat=runtime_override_preconcat_sharded_cluster, + runtime_override_hail_merge=runtime_override_hail_merge_sharded_cluster, + 
runtime_override_fix_header=runtime_override_fix_header_sharded_cluster + } + } + if (!use_hail) { + call MiniTasks.ConcatVcfs as ConcatVcfs { + input: + vcfs=SortVcf.out, + vcfs_idx=SortVcf.out_index, + allow_overlaps=true, + outfile_prefix="~{prefix}.clustered", + sv_base_mini_docker=sv_base_mini_docker, + runtime_attr_override=runtime_override_concat_sharded_cluster + } } } #Output output { - File clustered_vcf = select_first([GetVcfHeaderWithMembersInfoLine.out, ConcatVcfs.concat_vcf]) - File clustered_vcf_idx = select_first([GetVcfHeaderWithMembersInfoLine.out_idx, ConcatVcfs.concat_vcf_idx]) + File clustered_vcf = select_first([GetVcfHeaderWithMembersInfoLine.out, ConcatVcfs.concat_vcf, ConcatVcfsHail.merged_vcf]) + File clustered_vcf_idx = select_first([GetVcfHeaderWithMembersInfoLine.out_idx, ConcatVcfs.concat_vcf_idx, ConcatVcfsHail.merged_vcf_index]) } } @@ -186,7 +219,7 @@ task GetVcfHeaderWithMembersInfoLine { } } -#Do fast cluster on sites-only vcf (sample_overlap = 0) to generate shards +#Do fast cluster on vcf (sample_overlap = 0) to generate shards task ShardClusters { input { File vcf @@ -225,11 +258,10 @@ task ShardClusters { command <<< set -euo pipefail - bcftools view -G ~{vcf} -Oz -o sites_only.vcf.gz ~{if defined(exclude_list) && !defined(exclude_list_idx) then "tabix -p bed ~{exclude_list}" else ""} #Run clustering - svtk vcfcluster <(echo "sites_only.vcf.gz") ~{prefix}.vcf.gz \ + svtk vcfcluster <(echo "~{vcf}") ~{prefix}.vcf.gz \ -d ~{dist} \ -f ~{frac} \ ~{if defined(exclude_list) then "-x ~{exclude_list}" else ""} \ @@ -247,47 +279,6 @@ task ShardClusters { } } -task PullVcfShard { - input { - File vcf - File vids - String prefix - String sv_base_mini_docker - RuntimeAttr? 
runtime_attr_override - } - - String output_prefix = "~{prefix}" - RuntimeAttr runtime_default = object { - mem_gb: 3.75, - disk_gb: ceil(10.0 + size(vcf, "GiB") * 2.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" - disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_base_mini_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -euo pipefail - bcftools view --no-version --include ID=@~{vids} ~{vcf} -O z -o ~{output_prefix}.vcf.gz - wc -l < ~{vids} > count.txt - >>> - - output { - File out = "~{output_prefix}.vcf.gz" - Int count = read_int("count.txt") - } -} - task SvtkVcfCluster { input { File vcf @@ -307,7 +298,6 @@ task SvtkVcfCluster { } Float default_mem_gb = 3.75 + (120.0 * (num_vids / 19000.0) * (num_samples / 140000.0)) - String output_prefix = "~{prefix}" RuntimeAttr runtime_default = object { mem_gb: default_mem_gb, disk_gb: ceil(10.0 + size(vcf, "GiB") * 2.0), @@ -330,6 +320,7 @@ task SvtkVcfCluster { command <<< set -euo pipefail ~{if defined(exclude_list) && !defined(exclude_list_idx) then "tabix -p bed ~{exclude_list}" else ""} + #Run clustering svtk vcfcluster <(echo "~{vcf}") - \ -d ~{dist} \ @@ -342,10 +333,10 @@ task SvtkVcfCluster { --preserve-ids \ --preserve-genotypes \ --preserve-header \ - | gzip > ~{output_prefix}.vcf.gz + | gzip > ~{prefix}.vcf.gz >>> output { - File out = "~{output_prefix}.vcf.gz" + File out = "~{prefix}.vcf.gz" } } \ 
No newline at end of file diff --git a/wdl/TasksMakeCohortVcf.wdl b/wdl/TasksMakeCohortVcf.wdl index 9bc2e6a2d..a6a49cca3 100644 --- a/wdl/TasksMakeCohortVcf.wdl +++ b/wdl/TasksMakeCohortVcf.wdl @@ -132,7 +132,7 @@ task SortVcf { RuntimeAttr runtime_default = object { mem_gb: 3.75, - disk_gb: ceil(10.0 + size(vcf, "GB") * 20), + disk_gb: ceil(10.0 + size(vcf, "GB") * 40), cpu_cores: 1, preemptible_tries: 3, max_retries: 1, @@ -176,15 +176,13 @@ task ConcatVcfs { Boolean allow_overlaps = false Boolean naive = false Boolean generate_index = true + Boolean sites_only = false + Boolean sort_vcf_list = false String? outfile_prefix String sv_base_mini_docker RuntimeAttr? runtime_attr_override } - String outfile_name = outfile_prefix + ".vcf.gz" - String allow_overlaps_flag = if allow_overlaps then "--allow-overlaps" else "" - String naive_flag = if naive then "--naive" else "" - # when filtering/sorting/etc, memory usage will likely go up (much of the data will have to # be held in memory or disk while working, potentially in a form that takes up more space) RuntimeAttr runtime_default = object { @@ -206,18 +204,25 @@ task ConcatVcfs { bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) } + String outfile_name = outfile_prefix + ".vcf.gz" + String allow_overlaps_flag = if allow_overlaps then "--allow-overlaps" else "" + String naive_flag = if naive then "--naive" else "" + String concat_output_type = if (sites_only) then "v" else "z" + String sites_only_command = if (sites_only) then "| bcftools view --no-version -G -Oz" else "" + String generate_index_command = if (generate_index) then "tabix ~{outfile_name}" else "touch ~{outfile_name}.tbi" + command <<< set -euo pipefail VCFS="~{write_lines(vcfs)}" - if ~{!defined(vcfs_idx)}; then - cat ${VCFS} | xargs -n1 tabix - fi - bcftools concat --no-version ~{allow_overlaps_flag} ~{naive_flag} --output-type z --file-list ${VCFS} --output "~{outfile_name}" - if ~{generate_index}; then - 
tabix "~{outfile_name}" + if ~{sort_vcf_list}; then + cat $VCFS | awk -F '/' '{print $NF"\t"$0}' | sort -k1,1V | awk '{print $2}' > vcfs.list else - touch ~{outfile_name}.tbi + cp $VCFS vcfs.list fi + bcftools concat --no-version ~{allow_overlaps_flag} ~{naive_flag} -O~{concat_output_type} --file-list vcfs.list \ + ~{sites_only_command} \ + > ~{outfile_name} + ~{generate_index_command} >>> output { @@ -444,53 +449,6 @@ task FilterVcf { } } -# Find intersection of Variant IDs from vid_list with those present in vcf, return as filtered_vid_list -task SubsetVariantList { - input { - File vid_list - File vcf - String outfile_name - String sv_base_mini_docker - RuntimeAttr? runtime_attr_override - } - - # when filtering/sorting/etc, memory usage will likely go up (much of the data will have to - # be held in memory or disk while working, potentially in a form that takes up more space) - Float vid_list_size = size(vid_list, "GB") - Float vcf_size = size(vcf, "GB") - RuntimeAttr runtime_default = object { - mem_gb: 2.0, - disk_gb: ceil(10.0 + vcf_size + vid_list_size * 2.0), - cpu_cores: 1, - preemptible_tries: 3, - max_retries: 1, - boot_disk_gb: 10 - } - RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) - runtime { - memory: select_first([runtime_override.mem_gb, runtime_default.mem_gb]) + " GB" - disks: "local-disk " + select_first([runtime_override.disk_gb, runtime_default.disk_gb]) + " HDD" - cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) - preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) - maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) - docker: sv_base_mini_docker - bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) - } - - command <<< - set -eu -o pipefail - #Get list of variant IDs present in VCF - zcat ~{vcf} | (grep -vE "^#" || printf "") | cut -f3 > valid_vids.list - 
#Restrict input variant ID list to valid VIDs - (fgrep -wf valid_vids.list ~{vid_list} || printf "") > "~{outfile_name}" - >>> - - output { - File filtered_vid_list = outfile_name - } -} - - # evenly split text file into even chunks # if shuffle_file is set to true, shuffle the file before splitting (default = false) task SplitUncompressed { @@ -687,15 +645,10 @@ task UpdateSrList { RuntimeAttr? runtime_attr_override } - # when filtering/sorting/etc, memory usage will likely go up (much of the data will have to - # be held in memory or disk while working, potentially in a form that takes up more space) Float input_size = size([vcf, original_list], "GiB") - Float compression_factor = 5.0 - Float base_disk_gb = 10.0 - Float base_mem_gb = 2.0 RuntimeAttr runtime_default = object { mem_gb: 3.75, - disk_gb: ceil(base_disk_gb + input_size * (2.0 + 2.0 * compression_factor)), + disk_gb: ceil(10.0 + size(original_list, "GiB") * 3 + size(vcf, "GiB")), cpu_cores: 1, preemptible_tries: 3, max_retries: 1, @@ -715,17 +668,19 @@ task UpdateSrList { command <<< set -euxo pipefail - ##append new ids to original list## - svtk vcf2bed ~{vcf} int.bed -i MEMBERS - - ##remove header and match id one per line## - awk '{if (NR>1) print $4 "\t" $NF}' int.bed \ - | awk -F'[,\t]' '{for(i=2; i<=NF; ++i) print $i "\t" $1 }' \ - | sort -k1,1\ - > newidlist.txt - - join -j 1 -t $'\t' <(awk '{print $NF "\t" $0}' ~{original_list} | sort -k1,1) newidlist.txt \ - | cut -f2- \ + # append new ids to original list + svtk vcf2bed ~{vcf} int.bed -i MEMBERS --no-samples --no-header + + # match id one per line + # if an id is not found in the vcf, use previous id (in case vcf is a shard/subset) + # also sort by first column, which is support fraction for a bothside pass list + awk -F'[,\t]' -v OFS='\t' \ + '{ \ + if (ARGIND==1) for(i=6; i<=NF; ++i) MAP[$i]=$4; \ + else if ($NF in MAP) print $0,MAP[$NF]; \ + else print $0,$NF; \ + }' int.bed ~{original_list} \ + | sort -k1,1n \ > ~{outfile} >>> @@ -735,7 
+690,7 @@ task UpdateSrList { } -task ShardVids { +task ShardVidsForClustering { input { File clustered_vcf String prefix @@ -785,7 +740,7 @@ task ShardVids { print("empty vcf - no shards will be produced") sys.exit(0) vcf.reset() - + current_cluster = None current_cluster_vids = [] current_shard = 0 @@ -832,4 +787,232 @@ task ShardVids { output { Array[File] out = glob("~{prefix}.vids.shard_*.list") } +} + +task MakeSitesOnlyVcf { + input { + File vcf + File vcf_index + String prefix + String sv_base_mini_docker + RuntimeAttr? runtime_attr_override + } + + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GiB") * 1.2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_base_mini_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euxo pipefail + bcftools view --no-version -G ~{vcf} -Oz -o ~{prefix}.vcf.gz + tabix ~{prefix}.vcf.gz + >>> + + output { + File out = "~{prefix}.vcf.gz" + File out_index = "~{prefix}.vcf.gz.tbi" + } +} + + +task ReheaderVcf { + input { + File vcf + File vcf_index + File header + String prefix + String sv_base_mini_docker + RuntimeAttr? 
runtime_attr_override + } + + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GiB") * 2.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_base_mini_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euxo pipefail + bcftools reheader -h ~{header} ~{vcf} > ~{prefix}.vcf.gz + tabix ~{prefix}.vcf.gz + >>> + + output { + File out = "~{prefix}.vcf.gz" + File out_index = "~{prefix}.vcf.gz.tbi" + } +} + +task PullVcfShard { + input { + File vcf + File vids + String prefix + String sv_base_mini_docker + RuntimeAttr? 
runtime_attr_override + } + + String output_prefix = "~{prefix}" + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vcf, "GiB") * 2.0), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_base_mini_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + bcftools view --no-version --include ID=@~{vids} ~{vcf} -O z -o ~{output_prefix}.vcf.gz + tabix ~{output_prefix}.vcf.gz + wc -l < ~{vids} > count.txt + >>> + + output { + File out = "~{output_prefix}.vcf.gz" + File out_index = "~{output_prefix}.vcf.gz.tbi" + Int count = read_int("count.txt") + } +} + +task RenameVariantIds { + input { + File vcf + File? vcf_index + String vid_prefix + String file_prefix + Boolean? use_ssd + String sv_base_mini_docker + RuntimeAttr? 
runtime_attr_override + } + + String disk_type = if (defined(use_ssd) && select_first([use_ssd])) then "SSD" else "HDD" + Float input_size = size(vcf, "GiB") + RuntimeAttr runtime_default = object { + mem_gb: 2.0, + disk_gb: ceil(10.0 + input_size * 2), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GiB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} ~{disk_type}" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_base_mini_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + zcat ~{vcf} \ + | awk -F'\t' -v OFS='\t' -v i=0 '{if ($0~/^#/) {print; next} $3="prefix_"(i++); print}' \ + | bgzip \ + > ~{file_prefix}.vcf.gz + if ~{defined(vcf_index)}; then + tabix ~{file_prefix}.vcf.gz + else + touch ~{file_prefix}.vcf.gz + fi + >>> + + output { + File out = "~{file_prefix}.vcf.gz" + File out_index = "~{file_prefix}.vcf.gz.tbi" + } +} + +# Note: requires docker with updated bcftools +task ScatterVcf { + input { + File vcf + String prefix + Int records_per_shard + Int? threads = 1 + String sv_pipeline_docker + RuntimeAttr? 
runtime_attr_override + } + + Float input_size = size(vcf, "GB") + Float base_disk_gb = 10.0 + + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(base_disk_gb + input_size * 5.0), + cpu_cores: 2, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: "~{select_first([runtime_override.mem_gb, runtime_default.mem_gb])} GB" + disks: "local-disk ~{select_first([runtime_override.disk_gb, runtime_default.disk_gb])} HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_pipeline_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + # in case the file is empty create an empty shard + bcftools view -h ~{vcf} | bgzip -c > ~{prefix}.0.vcf.gz + bcftools +scatter ~{vcf} -o . -O z -p ~{prefix}. 
--threads ~{threads} -n ~{records_per_shard} + + ls ~{prefix}.*.vcf.gz | sort -k1,1V > vcfs.list + i=0 + while read vcf; do + shard_no=`printf %06d $i` + mv ${vcf} ~{prefix}.shard_${shard_no}.vcf.gz + i=$((i+1)) + done < vcfs.list + >>> + output { + Array[File] shards = glob("~{prefix}.shard_*.vcf.gz") + } } \ No newline at end of file diff --git a/wdl/VcfClusterSingleChromsome.wdl b/wdl/VcfClusterSingleChromsome.wdl index 16e380a49..a40ea800b 100644 --- a/wdl/VcfClusterSingleChromsome.wdl +++ b/wdl/VcfClusterSingleChromsome.wdl @@ -10,7 +10,10 @@ import "ClusterSingleChromosome.wdl" as VcfClusterTasks workflow VcfClusterSingleChrom { input { Array[File] vcfs + Int num_samples String prefix + String evidence_type + String cohort_name Int dist Float frac Float sample_overlap @@ -25,7 +28,11 @@ workflow VcfClusterSingleChrom { File background_fail File empty_file + Boolean use_hail + String? gcs_project + String sv_pipeline_docker + String sv_pipeline_hail_docker String sv_base_mini_docker # overrides for local tasks @@ -39,6 +46,8 @@ workflow VcfClusterSingleChrom { RuntimeAttr? runtime_override_subset_background_fail # overrides for VcfClusterTasks + RuntimeAttr? runtime_override_shard_clusters + RuntimeAttr? runtime_override_shard_vids RuntimeAttr? runtime_override_subset_sv_type RuntimeAttr? runtime_override_shard_vcf_precluster RuntimeAttr? runtime_override_pull_vcf_shard @@ -47,6 +56,12 @@ workflow VcfClusterSingleChrom { RuntimeAttr? runtime_override_concat_vcf_cluster RuntimeAttr? runtime_override_concat_svtypes RuntimeAttr? runtime_override_concat_sharded_cluster + RuntimeAttr? runtime_override_make_sites_only + RuntimeAttr? runtime_override_sort_merged_vcf + + RuntimeAttr? runtime_override_preconcat_sharded_cluster + RuntimeAttr? runtime_override_hail_merge_sharded_cluster + RuntimeAttr? 
runtime_override_fix_header_sharded_cluster } scatter (i in range(length(vcfs))) { @@ -102,11 +117,14 @@ workflow VcfClusterSingleChrom { } #Run vcfcluster per chromosome - call VcfClusterTasks.ClusterSingleChrom as ClusterSingleChrom { + call VcfClusterTasks.ClusterSingleChrom { input: vcf=ConcatVcfs.concat_vcf, vcf_index=ConcatVcfs.concat_vcf_idx, + num_samples=num_samples, contig=contig, + cohort_name=cohort_name, + evidence_type=evidence_type, prefix=prefix, dist=dist, frac=frac, @@ -114,30 +132,42 @@ workflow VcfClusterSingleChrom { exclude_list=exclude_list, sv_size=sv_size, sv_types=sv_types, + empty_file=empty_file, + use_hail=use_hail, + gcs_project=gcs_project, sv_pipeline_docker=sv_pipeline_docker, + sv_pipeline_hail_docker=sv_pipeline_hail_docker, sv_base_mini_docker=sv_base_mini_docker, runtime_override_subset_sv_type=runtime_override_subset_sv_type, - runtime_override_shard_vcf_precluster=runtime_override_shard_vcf_precluster, + runtime_override_shard_clusters=runtime_override_shard_clusters, + runtime_override_shard_vids=runtime_override_shard_vids, runtime_override_pull_vcf_shard=runtime_override_pull_vcf_shard, runtime_override_svtk_vcf_cluster=runtime_override_svtk_vcf_cluster, runtime_override_get_vcf_header_with_members_info_line=runtime_override_get_vcf_header_with_members_info_line, runtime_override_concat_svtypes=runtime_override_concat_svtypes, - runtime_override_concat_sharded_cluster=runtime_override_concat_sharded_cluster + runtime_override_concat_sharded_cluster=runtime_override_concat_sharded_cluster, + runtime_override_make_sites_only=runtime_override_make_sites_only, + runtime_override_sort_merged_vcf=runtime_override_sort_merged_vcf, + runtime_override_preconcat_sharded_cluster=runtime_override_preconcat_sharded_cluster, + runtime_override_hail_merge_sharded_cluster=runtime_override_hail_merge_sharded_cluster, + runtime_override_fix_header_sharded_cluster=runtime_override_fix_header_sharded_cluster } if(subset_sr_lists) { #Subset 
bothside_pass & background_fail to chromosome of interest - call MiniTasks.SubsetVariantList as SubsetBothsidePass { + call SubsetVariantList as SubsetBothsidePass { input: vid_list=bothside_pass, + vid_col=2, vcf=ConcatVcfs.concat_vcf, outfile_name="~{prefix}.pass.VIDs.list", sv_base_mini_docker=sv_base_mini_docker, runtime_attr_override=runtime_override_subset_bothside_pass } - call MiniTasks.SubsetVariantList as SubsetBackgroundFail { + call SubsetVariantList as SubsetBackgroundFail { input: vid_list=background_fail, + vid_col=1, vcf=ConcatVcfs.concat_vcf, outfile_name="~{prefix}.fail.VIDs.list", sv_base_mini_docker=sv_base_mini_docker, @@ -146,8 +176,8 @@ workflow VcfClusterSingleChrom { } output { - File clustered_vcf = ClusterSingleChrom.clustered_vcf - File clustered_vcf_idx = ClusterSingleChrom.clustered_vcf_idx + Array[File] clustered_vcfs = ClusterSingleChrom.clustered_vcfs + Array[File] clustered_vcf_indexes = ClusterSingleChrom.clustered_vcf_indexes File filtered_bothside_pass = select_first([SubsetBothsidePass.filtered_vid_list, empty_file]) File filtered_background_fail = select_first([SubsetBackgroundFail.filtered_vid_list, empty_file]) } @@ -393,3 +423,47 @@ task FixEvidenceTags { File out_index = "~{prefix}.~{contig}.unclustered.vcf.gz.tbi" } } + +# Find intersection of Variant IDs from vid_list with those present in vcf, return as filtered_vid_list +task SubsetVariantList { + input { + File vid_list + Int vid_col + File vcf + String outfile_name + String sv_base_mini_docker + RuntimeAttr? 
runtime_attr_override + } + + # when filtering/sorting/etc, memory usage will likely go up (much of the data will have to + # be held in memory or disk while working, potentially in a form that takes up more space) + RuntimeAttr runtime_default = object { + mem_gb: 3.75, + disk_gb: ceil(10.0 + size(vid_list, "GB") * 2.0 + size(vcf, "GB")), + cpu_cores: 1, + preemptible_tries: 3, + max_retries: 1, + boot_disk_gb: 10 + } + RuntimeAttr runtime_override = select_first([runtime_attr_override, runtime_default]) + runtime { + memory: select_first([runtime_override.mem_gb, runtime_default.mem_gb]) + " GB" + disks: "local-disk " + select_first([runtime_override.disk_gb, runtime_default.disk_gb]) + " HDD" + cpu: select_first([runtime_override.cpu_cores, runtime_default.cpu_cores]) + preemptible: select_first([runtime_override.preemptible_tries, runtime_default.preemptible_tries]) + maxRetries: select_first([runtime_override.max_retries, runtime_default.max_retries]) + docker: sv_base_mini_docker + bootDiskSizeGb: select_first([runtime_override.boot_disk_gb, runtime_default.boot_disk_gb]) + } + + command <<< + set -euo pipefail + zgrep -v "^#" ~{vcf} | cut -f3 > valid_vids.list + awk -F'\t' -v OFS='\t' 'ARGIND==1{inFileA[$1]; next} {if ($~{vid_col} in inFileA) print }' valid_vids.list ~{vid_list} \ + > ~{outfile_name} + >>> + + output { + File filtered_vid_list = outfile_name + } +} \ No newline at end of file