Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update batch effect workflows #435

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 6 additions & 6 deletions inputs/values/dockers.json
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,12 @@
"samtools_cloud_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/samtools-cloud:2022-06-10-v0.23-beta-9c6fbf56",
"sv_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base:2022-06-10-v0.23-beta-9c6fbf56",
"sv_base_mini_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-base-mini:2022-06-10-v0.23-beta-9c6fbf56",
"sv_pipeline_base_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2022-09-27-v0.26.2-beta-1f1a3b3a",
"sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2022-09-27-v0.26.2-beta-1f1a3b3a",
"sv_pipeline_hail_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2022-09-27-v0.26.2-beta-1f1a3b3a",
"sv_pipeline_updates_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2022-09-27-v0.26.2-beta-1f1a3b3a",
"sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2022-09-27-v0.26.2-beta-1f1a3b3a",
"sv_pipeline_rdtest_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/sv-pipeline:2022-09-27-v0.26.2-beta-1f1a3b3a",
"sv_pipeline_base_docker": "us.gcr.io/broad-dsde-methods/markw/sv-pipeline:mw-xbatch-fx-e546726",
"sv_pipeline_docker": "us.gcr.io/broad-dsde-methods/markw/sv-pipeline:mw-xbatch-fx-e546726",
"sv_pipeline_hail_docker": "us.gcr.io/broad-dsde-methods/markw/sv-pipeline:mw-xbatch-fx-e546726",
"sv_pipeline_updates_docker": "us.gcr.io/broad-dsde-methods/markw/sv-pipeline:mw-xbatch-fx-e546726",
"sv_pipeline_qc_docker": "us.gcr.io/broad-dsde-methods/markw/sv-pipeline:mw-xbatch-fx-e546726",
"sv_pipeline_rdtest_docker": "us.gcr.io/broad-dsde-methods/markw/sv-pipeline:mw-xbatch-fx-e546726",
"wham_docker": "us.gcr.io/broad-dsde-methods/wham:8645aa",
"igv_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/igv:mw-xz-fixes-2-b1be6a9",
"duphold_docker": "us.gcr.io/broad-dsde-methods/gatk-sv/duphold:mw-xz-fixes-2-b1be6a9",
Expand Down
96 changes: 96 additions & 0 deletions src/sv-pipeline/scripts/benchmark/annotate_vcf_with_BatchEffect.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,96 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Annotate sample-level Boost scores directly into an input VCF
"""

import argparse
import pysam
import sys


def svid_info_readin(svid_info):
fin = open(svid_info)
out = {}
for line in fin:
pin = line.strip().split()
out[pin[0]] = pin[1]
fin.close()
return out


def samp_batch_readin(sample_batch):
fin = open(sample_batch)
out = {}
for line in fin:
pin = line.strip().split()
if not pin[1] in out.keys():
out[pin[1]] = []
out[pin[1]].append(pin[0])
fin.close()
return out


def main():
"""
Command-line main block
"""

# Parse command line arguments and options
parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
parser.add_argument('--vcf', required=True, help='VCF to be annotated. Required.')
parser.add_argument('--SVID_info', required=True,
help='two-column .tsv of file including the SVID that failed batch effect')
parser.add_argument('--SVID_filter', required=True,
help='two-column .tsv of file including the SVID to have filter column changed')
parser.add_argument('--SVID_format', required=True,
help='two-column .tsv of file including the SVID to have format column changed')
parser.add_argument('--sample_batch', required=True,
help='two-column .tsv of file including the sample ID and corresponding batches')
parser.add_argument('-o', '--outfile', default='stdout', help='Path to output VCF')
args = parser.parse_args()

# Read in batch effect results
svid_info = svid_info_readin(args.SVID_info)
svid_filter = svid_info_readin(args.SVID_filter)
svid_format = svid_info_readin(args.SVID_format)
samp_batch = samp_batch_readin(args.sample_batch)

# Open connection to input VCF
if args.vcf in '- stdin'.split():
vcf = pysam.VariantFile(sys.stdin)
else:
vcf = pysam.VariantFile(args.vcf)

# Add new INFO line to VCF header
vcf.header.add_meta('INFO', items=[('ID', "BATCH_EFFECT"), ('Number', "1"), ('Type', "String"),
('Description', "Batch effect results")])

vcf.header.add_meta('FILTER', items=[('ID', "PCR_AF_BIAS"),
('Description', "SVs showed significant bias between PCR+ and PCR- batches")])

vcf.header.add_meta('FILTER', items=[('ID', "UNSTABLE_AF_ESTIMATE_PCRMINUS"),
('Description', "SVs showed significant bias between pairs of PCR- batches")])

fout = pysam.VariantFile(args.outfile, 'w', header=vcf.header)
for record in vcf:
if record.id in svid_info.keys():
record.info['BATCH_EFFECT'] = svid_info[record.id]
if record.id in svid_filter.keys():
record.filter.add(svid_filter[record.id])
if record.id in svid_format.keys():
batch_key = svid_format[record.id].split(',')
sample_list = []
for batch in batch_key:
sample_list += samp_batch['gnomAD-SV_v3_' + batch]
for sample in sample_list:
if sample in record.samples.keys():
record.samples[sample]['GT'] = (None, None)
fout.write(record)
vcf.close()
fout.close()


if __name__ == '__main__':
main()
141 changes: 141 additions & 0 deletions src/sv-pipeline/scripts/benchmark/identify_SVs_failed_BatchEffect.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
#!R
#!Rscript to integrate batch effect comparison statistics
library("optparse")

option_list = list(
make_option(c( "--stat_plus_vs_minus"), type="character", default=NULL, help="list of all samples", metavar="character"),
make_option(c( "--stat_pre_vs_post"), type="character", default=NULL, help="list of all samples", metavar="character"),
make_option(c( "--stat_pairwise_minus"), type="character", default=NULL, help="loose union of all filters", metavar="character"),
make_option(c( "--stat_pairwise_plus"), type="character", default=NULL, help="loose union of all filters", metavar="character"),
make_option(c( "--stat_one_vs_all_minus"), type="character", default=NULL, help="loose union of all filters", metavar="character"),
make_option(c( "--stat_one_vs_all_plus"), type="character", default=NULL, help="loose union of all filters", metavar="character"),
make_option(c( "--out_fail_info"), type="character", default=NULL, help="loose union of all filters", metavar="character"),
make_option(c( "--out_fail_filter"), type="character", default=NULL, help="loose union of all filters", metavar="character"),
make_option(c( "--out_fail_format"), type="character", default=NULL, help="loose union of all filters", metavar="character")
);
opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);

add_bonferroni_correction<-function(stat1, pv_cff = .05){
correction_fac = nrow(stat1[!is.na(stat1[,2]),])
colnames(stat1)[1] = 'VID'
fail_info = stat1[!is.na(stat1[,2]) & stat1[,2]< pv_cff/correction_fac , ]
fail_filter = stat1[!is.na(stat1[,2]) & stat1[,2]< pv_cff/(correction_fac*5) , ]

stat1[,3] = 'pass'
stat1[,4] = 'pass'
if (length(fail_info > 0)) {
stat1[stat1[,1]%in%fail_info[,1],3] = 'fail'
stat1[stat1[,1]%in%fail_filter[,1],4] = 'fail'
}
colnames(stat1)[3] = 'info'
colnames(stat1)[4] = 'filter'

return(stat1)
}

add_bonferroni_per_batch<-function(stat4){
out_metrics = data.frame(stat4[,c(1,2)])
out_metrics = add_bonferroni_correction(out_metrics)
colnames(out_metrics)[3] = gsub('chisq_p_','',colnames(out_metrics)[2])
out_metrics = out_metrics[,c(1,3)]
out_metrics[out_metrics[,2]=='fail',][,2] = colnames(out_metrics)[2]
for(i in c(3:ncol(stat4))){
tmp_metrics = data.frame(stat4[,c(1,i)])
tmp_metrics = add_bonferroni_correction(tmp_metrics)
colnames(tmp_metrics)[3] = gsub('chisq_p_','',colnames(tmp_metrics)[2])
tmp_metrics = tmp_metrics[,c(1,3)]
failed = tmp_metrics[,2]=='fail'
if (any(failed)) {
tmp_metrics[failed,][,2] = colnames(tmp_metrics)[2]
}

out_metrics = cbind(out_metrics,tmp_metrics[,2])
colnames(out_metrics)[ncol(out_metrics)] = colnames(tmp_metrics)[ncol(tmp_metrics)]
}
return(out_metrics)
}

#read in batch effect comparison results as a 2-column file with SVID and p-value
print("readin PCR+ vs. PCR- comparison statistics ... ")
stat1_pre=read.table(opt$stat_plus_vs_minus, header=T, comment.char="")

print("readin pre- vs. post- filtering comparison statistics ... ")
stat2=read.table(opt$stat_pre_vs_post, header=T, comment.char="")
stat2a_pre = stat2[,c('X.VID','p.PCRMINUS')]
stat2b_pre = stat2[,c('X.VID','p.PCRPLUS')]

print("readin pairwise comparison statistics ... ")
stat3a_pre = read.table(opt$stat_pairwise_minus, header=T, comment.char="")
stat3b_pre = read.table(opt$stat_pairwise_plus, header=T, comment.char="")

print("readin one vs. all comparison statistics ... ")
stat4a_pre = read.table(opt$stat_one_vs_all_minus, header=T, comment.char="")
stat4b_pre = read.table(opt$stat_one_vs_all_plus, header=T, comment.char="")

#seek for SVs with significant P value after bonferroni correction
print("identify SVs with significant bonferroni corrected p-values ... ")
stat1 = add_bonferroni_correction(stat1_pre)
stat2a = add_bonferroni_correction(stat2a_pre)
stat2b = add_bonferroni_correction(stat2b_pre)
stat3a = add_bonferroni_correction(stat3a_pre)
stat3b = add_bonferroni_correction(stat3b_pre)
stat4a = add_bonferroni_per_batch(stat4a_pre)
stat4b = add_bonferroni_per_batch(stat4b_pre)

#collect failed SVID
fail_SVID_plus_vs_minus_info = data.frame(stat1[stat1$info=='fail',][,c(1,2)])
fail_SVID_pre_vs_post_minus_info = stat2a[stat2a$info=='fail',][,c(1,2)]
fail_SVID_pre_vs_post_plus_info = stat2b[stat2b$info=='fail',][,c(1,2)]
fail_SVID_pairwise_minus_info = stat3a[stat3a$info=='fail',][,c(1,2)]
fail_SVID_pairwise_plus_info = stat3b[stat3b$info=='fail',][,c(1,2)]

#label SVID with the reason for failure
fail_SVID_plus_vs_minus_info[,2] = 'PCR_AF_BIAS'
fail_SVID_pre_vs_post_minus_info[,2] = 'UNSTABLE_AF_ESTIMATE_PCRMINUS_pre_post'
fail_SVID_pre_vs_post_plus_info[,2] = 'UNSTABLE_AF_ESTIMATE_PCRPLUS_pre_post'
fail_SVID_pairwise_minus_info[,2] = 'UNSTABLE_AF_ESTIMATE_PCRMINUS_pairwise'
fail_SVID_pairwise_plus_info[,2] = 'UNSTABLE_AF_ESTIMATE_PCRPLUS_pairwise'

#integrate and print significant results to output file
print("write SVs with significant bonferroni corrected p-values ... ")
info_out = merge(fail_SVID_plus_vs_minus_info,fail_SVID_pre_vs_post_minus_info,by='VID', all=T)
info_out = merge(info_out,fail_SVID_pre_vs_post_plus_info,by='VID', all=T)
info_out = merge(info_out,fail_SVID_pairwise_minus_info,by='VID', all=T)
info_out = merge(info_out,fail_SVID_pairwise_plus_info,by='VID', all=T)
info_out[,ncol(info_out)+1] = apply(info_out[,c(2,5,6)],1,function(x){paste(x[!is.na(x)],collapse = ',')})
colnames(info_out)[ncol(info_out)] = 'info_wo_pre_post'
info_out[,ncol(info_out)+1] = apply(info_out[,c(2:6)],1,function(x){paste(x[!is.na(x)],collapse = ',')})
colnames(info_out)[ncol(info_out)] = 'info_with_pre_post'
info_out_a = info_out[,c('VID','info_wo_pre_post')]
info_out_b = info_out[,c('VID','info_with_pre_post')]
info_out_a = info_out_a[info_out_a[,2]!="",]
info_out_b = info_out_b[info_out_b[,2]!="",]
write.table(info_out_a, opt$out_fail_info, quote=F, sep='\t',col.names=F, row.names=F)

#integrate SVID that failed batch effect and need filter column to be revised
print("write SVs with significant bias between batches ... ")
fail_SVID_plus_vs_minus_filter = stat1[stat1$filter=='fail',][,c(1,2)]
fail_SVID_pre_vs_post_minus_filter = stat2a[stat2a$filter=='fail',][,c(1,2)]
fail_SVID_pre_vs_post_plus_filter = stat2b[stat2b$filter=='fail',][,c(1,2)]
fail_SVID_pairwise_minus_filter = stat3a[stat3a$filter=='fail',][,c(1,2)]
fail_SVID_pairwise_plus_filter = stat3b[stat3b$filter=='fail',][,c(1,2)]
fail_filter = unique(c(fail_SVID_plus_vs_minus_filter, fail_SVID_pre_vs_post_minus_filter,fail_SVID_pairwise_minus_filter))
fail_SVID_plus_vs_minus_filter[,2] = 'PCR_AF_BIAS'
fail_SVID_pairwise_minus_filter[,2] = 'UNSTABLE_AF_ESTIMATE_PCRMINUS'
filter_out = merge(fail_SVID_plus_vs_minus_filter, fail_SVID_pairwise_minus_filter, by='VID', all=T)
filter_out[,4] = apply(filter_out[,c(2,3)],1,function(x){paste(x[!is.na(x)],collapse = ',')})
filter_out[,5] = apply(filter_out,1,function(x){strsplit(as.character(x[4]),',')[[1]][1]})

write.table(filter_out[,c(1,5)], opt$out_fail_filter, quote=F, sep='\t',col.names=F, row.names=F)

#integrate one vs. all comparison results to re-label format columns
print("write SVs with significant bias from one vs. all comparisons... ")
stat4_out = merge(stat4a, stat4b, by='VID', all=T)
stat4_out[is.na(stat4_out)] = 'pass'
stat4_out[,ncol(stat4_out)+1] = apply(stat4_out[,c(2:ncol(stat4_out))],1, function(x){paste(x[x!="pass"], collapse = ',')})
stat4_out_2 = stat4_out[,c(1,ncol(stat4_out))]
stat4_out_2 = stat4_out_2[stat4_out_2[,2]!="",]
write.table(stat4_out_2, opt$out_fail_format, quote=F, sep='\t',col.names=F, row.names=F)


Original file line number Diff line number Diff line change
@@ -0,0 +1,122 @@
#!/usr/bin/env Rscript

# Copyright (c) 2022 Talkowski Laboratory
# Contact: Ryan Collins <[email protected]>
# Distributed under terms of the MIT license.

# Talkowski SV pipeline downstream analysis helper script

# Test for differences in AFs between PCR+ and PCR- batches


###Set global parameters
options(stringsAsFactors=F, scipen=1000)


###################
###HELPER FUNCTIONS
###################
# Sum AC and AN data per population for PCR+ and PCR- samples
collapse.data.by.pcr.status <- function(dat, minus.batches, plus.batches, allpops){
minus.df <- do.call(cbind, lapply(allpops, function(pop){
do.call(cbind, lapply(c("AC", "AN"), function(x){
apply(dat[, paste(pop, "_", x, ".", minus.batches, sep="")], 1, sum)
}))
}))
colnames(minus.df) <- unlist(sapply(allpops, function(pop){
paste(pop, "_", c("AC", "AN"), ".minus", sep="")}))
plus.df <- do.call(cbind, lapply(allpops, function(pop){
do.call(cbind, lapply(c("AC", "AN"), function(x){
apply(dat[, paste(pop, "_", x, ".", plus.batches, sep="")], 1, sum)
}))
}))
colnames(plus.df) <- unlist(sapply(allpops, function(pop){
paste(pop, "_", c("AC", "AN"), ".plus", sep="")}))
cbind(dat[, 1:3], minus.df, plus.df)
}

# Find best population for AF comparisons for a single variant and run chi-squared test
compare.pcr.afs <- function(vals, allpops){
# Clean data
VID <- as.character(vals[1])
vals <- vals[-c(1:3)]
vnames <- names(vals)
vals <- as.numeric(vals)
names(vals) <- vnames

# Find population where AC > 0 that has largest minimum AN between PCR+ and PCR-
minus.ACs <- vals[paste(allpops, "AC.minus", sep="_")]
plus.ACs <- vals[paste(allpops, "AC.plus", sep="_")]
elig.pops <- allpops[which(apply(cbind(minus.ACs, plus.ACs), 1, sum) > 0)]
if(length(elig.pops) > 0){
minus.ANs <- vals[paste(elig.pops, "AN.minus", sep="_")]
plus.ANs <- vals[paste(elig.pops, "AN.plus", sep="_")]
min.ANs <- apply(cbind(minus.ANs, plus.ANs), 1, min)
best.pop <- head(elig.pops[which(min.ANs == max(min.ANs, na.rm=T))], 1)

# Extra data and run chi-squared test
minus.AN <- as.numeric(vals[paste(best.pop, "AN.minus", sep="_")])
minus.AC <- as.numeric(vals[paste(best.pop, "AC.minus", sep="_")])
minus.AF <- minus.AC / minus.AN
plus.AN <- as.numeric(vals[paste(best.pop, "AN.plus", sep="_")])
plus.AC <- as.numeric(vals[paste(best.pop, "AC.plus", sep="_")])
plus.AF <- plus.AC / plus.AN
if(minus.AN > 0 & plus.AN > 0){
chisq.p <- chisq.test(matrix(c(minus.AN-minus.AC, minus.AC,
plus.AN-plus.AC, plus.AC),
nrow=2, byrow=F))$p.value
}else{
chisq.p <- NA
}
}else{
best.pop <- NA
plus.AF <- minus.AF <- 0
chisq.p <- 1
}

# Return values
data.frame("VID"=VID, "pop"=best.pop, "minus.AF"=minus.AF,
"plus.AF"=plus.AF, "chisq.p"=chisq.p)
}


### Read command-line arguments
args <- commandArgs(trailingOnly=T)
infile <- as.character(args[1])
minus.batches.in <- as.character(args[2])
plus.batches.in <- as.character(args[3])
OUTFILE <- as.character(args[4])

# #Dev parameters:
# infile <- "~/scratch/gnomAD-SV-v3.1.10pct_NCR.no_outliers.batch_fx.chr22.merged_AF_table.shard_41.txt.gz"
# minus.batches.in <- "~/scratch/gnomAD-SV-v3.1.PCRMINUS.batches.list"
# plus.batches.in <- "~/scratch/gnomAD-SV-v3.1.PCRPLUS.batches.list"
# OUTFILE <- "~/scratch/test_batchFx.tsv"

### Process input AF data
dat <- read.table(infile, header=T, sep="\t", comment.char="", check.names=F)
allpops <- unique(sapply(colnames(dat)[4:ncol(dat)],
function(x){strsplit(as.character(x),'_')[[1]][1]}))
batches.in.header <- unique(sapply(colnames(dat)[grep("_AN.", colnames(dat), fixed=T)],
function(x){unlist(strsplit(x, split="_AN.", fixed=T))[2]}))

### Load PCR+ and PCR- batches and match with AF data header
minus.batches <- intersect(unique(read.table(minus.batches.in, header=F)[, 1]),
batches.in.header)
plus.batches <- intersect(unique(read.table(plus.batches.in, header=F)[, 1]),
batches.in.header)

# Subset data to batches in either PCR+ or PCR- lists
batch.suffixes <- sapply(colnames(dat)[-c(1:3)],
function(x){unlist(strsplit(x, split="_A[NC]\\." ))[2]})
dat <- dat[, c(1:3, which(batch.suffixes %in% c(minus.batches, plus.batches))+3)]

# Collapse AC/AN data by PCR status
dat <- collapse.data.by.pcr.status(dat, minus.batches, plus.batches, allpops)

### Compare AFs between PCR+ and PCR- batches per variant
res <- do.call(rbind, apply(dat, 1, compare.pcr.afs, allpops=allpops))
colnames(res) <- c("VID", "pop", "minus.AF", "plus.AF", "chisq.p")

### Write to outfile
write.table(res, OUTFILE, col.names=T, row.names=F, sep="\t", quote=F)
Loading