Skip to content

Commit

Permalink
Use all MRNs for query_DB task
Browse files Browse the repository at this point in the history
  • Loading branch information
dufeiyu committed Jun 16, 2022
1 parent 969a877 commit a10540e
Show file tree
Hide file tree
Showing 6 changed files with 65 additions and 47 deletions.
27 changes: 14 additions & 13 deletions MyeloseqHD.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ workflow MyeloseqHD {

File SampleSheet
# sample sheet has this structure:
# index name RG_ID RG_FLOWCELL RG_LANE RG_LIB RG_SAMPLE MRN ACCESSION DOB SEX EXCEPTION [R1] [R2]
# index name RG_ID RG_FLOWCELL RG_LANE RG_LIB RG_SAMPLE MRN ALL_MRNs ACCESSION DOB SEX EXCEPTION [R1] [R2]
File? DemuxSampleSheet

Expand Down Expand Up @@ -69,13 +69,13 @@ workflow MyeloseqHD {

Array[Array[String]] inputData = read_tsv(select_first([prepare_samples.sample_sheet,SampleSheet]))

# the inputdata should be: index name RG_ID RG_FLOWCELL RG_LANE RG_LIB RG_SAMPLE MRN ACCESSION DOB SEX EXCEPTION read1path read2path
# the inputdata should be: index name RG_ID RG_FLOWCELL RG_LANE RG_LIB RG_SAMPLE MRN ALL_MRNs ACCESSION DOB SEX EXCEPTION read1path read2path
scatter (samples in inputData){

if(!defined(DemuxSampleSheet)){
call trim_reads {
input: Read1=samples[12],
Read2=samples[13],
input: Read1=samples[13],
Read2=samples[14],
Adapters=Adapters,
Name=samples[1],
queue=Queue,
Expand All @@ -86,8 +86,8 @@ workflow MyeloseqHD {
call dragen_align {
input: DragenRef=DragenReference,
DragenHotspot=DragenHotspot,
fastq1=select_first([trim_reads.read1,samples[12]]),
fastq2=select_first([trim_reads.read2,samples[13]]),
fastq1=select_first([trim_reads.read1,samples[13]]),
fastq2=select_first([trim_reads.read2,samples[14]]),
Name=samples[1],
RG=samples[3] + '.' + samples[4] + '.' + samples[0],
SM=samples[6],
Expand All @@ -112,20 +112,21 @@ workflow MyeloseqHD {
ReferenceDict=ReferenceDict,
Name=samples[1],
mrn=samples[7],
accession=samples[8],
DOB=samples[9],
sex=samples[10],
exception=samples[11],
all_mrn=samples[8],
accession=samples[9],
DOB=samples[10],
sex=samples[11],
exception=samples[12],
RunInfoString=RunInfoString,
VariantDB=VariantDB,
Vepcache=VEP,
AmpliconBed=AmpliconBed,
CustomAnnotationVcf=CustomAnnotationVcf,
CustomAnnotationIndex=CustomAnnotationIndex,
CustomAnnotationParameters=CustomAnnotationParameters,
GenotypeVcf=GenotypeVcf,
MinReads=MinReads,
MinVaf=MinVaf,
GenotypeVcf=GenotypeVcf,
MinReads=MinReads,
MinVaf=MinVaf,
HaplotectBed=HaplotectBed,
QcMetrics=QcMetrics,
OutputDir=OutputDir,
Expand Down
1 change: 1 addition & 0 deletions MyeloseqHDAnalysis.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"MyeloseqHDAnalysis.OutputDir":"PATH",
"MyeloseqHDAnalysis.SubDir":"PATH",
"MyeloseqHDAnalysis.mrn":"NAME",
"MyeloseqHDAnalysis.all_mrn":"NAME",
"MyeloseqHDAnalysis.accession":"NAME",
"MyeloseqHDAnalysis.DOB":"NAME",
"MyeloseqHDAnalysis.sex":"NAME",
Expand Down
5 changes: 3 additions & 2 deletions MyeloseqHDAnalysis.wdl
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ workflow MyeloseqHDAnalysis {
String QcMetrics

String mrn
String all_mrn
String accession
String DOB
String sex
Expand All @@ -39,7 +40,7 @@ workflow MyeloseqHDAnalysis {


call query_DB {
input: mrn=mrn,
input: mrn=all_mrn,
accession=accession,
VariantDB=VariantDB,
queue=Queue,
Expand Down Expand Up @@ -502,6 +503,6 @@ task query_DB {
job_group: jobGroup
}
output {
File query_vcf = "${mrn}_${accession}_query.vcf"
File query_vcf = "${accession}_query.vcf"
}
}
25 changes: 15 additions & 10 deletions dockerfiles/docker-myeloseq/variantDB.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,7 +42,7 @@ def get_transcript_list(bed):
parser.add_argument('-d',"--db",type=str,help='Path to MyeloseqHD varaint sqlite3 DB file')
parser.add_argument('-i',"--mrn_input",type=str,help='Input MRN number for DB upload')
parser.add_argument('-j',"--accession_input",type=str,help='Input accession id for DB upload')
parser.add_argument('-m',"--mrn_query",type=str,help='Query DB with sample MRN number')
parser.add_argument('-m',"--mrn_query",type=str,help='Query DB with sample MRN number, comma separated if multiple')
parser.add_argument('-p',"--path",type=str,help='Path to vcf files')
parser.add_argument('-v',"--vcf",type=str,help='Path to vcf file with variants')

Expand Down Expand Up @@ -208,22 +208,27 @@ def get_transcript_list(bed):
con.commit()

elif mrn and acn: #query db and output a vcf file
uniq_acn = cur.execute("select distinct accession from myeloseqhd_variants where mrn = ? and accession != ?", (mrn, acn)).fetchall()
acn_list = []
for row_acn in uniq_acn:
acn_list.append(row_acn[0])
all_uniq_acn, all_var_rows = [], []
for mrn_str in mrn.split(','):
uniq_acn = cur.execute("select distinct accession from myeloseqhd_variants where mrn = ? and accession != ?", (mrn_str, acn)).fetchall()
acn_list = []
for row_acn in uniq_acn:
acn_list.append(row_acn[0])
var_rows = cur.execute("select * from myeloseqhd_variants where mrn = ? and accession != ?", (mrn_str, acn)).fetchall()
all_uniq_acn.extend(acn_list)
all_var_rows.extend(var_rows)

var_rows = cur.execute("select * from myeloseqhd_variants where mrn = ? and accession != ?", (mrn, acn)).fetchall()
desc_list = [i[0] for i in cur.description]
desc_list = [e for e in desc_list if e not in ('id', 'chromosome', 'position', 'reference', 'variant', 'amps')]

db_version = 'v1'
outvcf_file = mrn + '_' + acn + '_query.vcf'
outvcf_file = acn + '_query.vcf'
vcf_writer = Writer.from_string(outvcf_file, "##fileformat=VCFv4.2\n#" + "\t".join(['CHROM','POS','ID','REF','ALT','QUAL','FILTER','INFO','FORMAT','MYELOSEQHDDB']),mode="w")
vcf_writer.add_to_header('##MyeloSeqHDDB_VERSION=' + db_version)

if acn_list:
vcf_writer.add_to_header('##MyeloSeqHDDB_ACCESSIONLIST=' + ",".join(acn_list))
if all_uniq_acn:
all_uniq_acn=list(set(all_uniq_acn))
vcf_writer.add_to_header('##MyeloSeqHDDB_ACCESSIONLIST=' + ",".join(all_uniq_acn))

vcf_writer.add_to_header('##MyeloSeqHDDB_FIELDS=' + "|".join(desc_list))
vcf_writer.add_to_header('##contig=<ID=chr1,length=248956422>')
Expand Down Expand Up @@ -256,7 +261,7 @@ def get_transcript_list(bed):
vcf_writer.write_header()

var_dict = defaultdict(list)
for vr in var_rows:
for vr in all_var_rows:
lookup_key = "_".join([vr[5], str(vr[6]), vr[7], vr[8]])
var_dict[lookup_key].append(vr)

Expand Down
Binary file modified imports.zip
Binary file not shown.
54 changes: 32 additions & 22 deletions scripts/launcher.pl
Original file line number Diff line number Diff line change
Expand Up @@ -79,10 +79,12 @@
my %hash;

while (my $l = $dump_fh->getline) {
next if $l =~ /^Accession/;
next if $l =~ /^(Accession|,,,,)/;

my @columns = split /,/, $l;
my $id = $columns[2].'_'.$columns[0];
my $all_MRNs = $columns[8];
$all_MRNs =~ s/\|/,/g;
my $sex = $columns[4];

if ($sex eq 'M') {
Expand All @@ -99,6 +101,7 @@
$hash{$id} = {
DOB => $columns[5],
sex => $sex,
all_MRNs => $all_MRNs,
};
}
$dump_fh->close;
Expand All @@ -120,59 +123,63 @@
}
my ($lane, $flowcell, $lib, $index_str, $exception) = @$row;

$lib =~ s/\s+//g;
my ($sample, $mrn, $accession) = $lib =~ /^([A-Z]{4}\-(\d+)\-([A-Z]\d+\-\d+)\-[A-Z0-9]+)\-lib/;

unless ($mrn and $accession) {
die "Library name: $lib must contain MRN, accession id and specimen type";
if ($exception and $exception =~ /NOTRANSFER/ and $exception =~ /RESEQ/) {
die "$lib has both NOTRANSFER and RESEQ as exception and only one is allowed.";
}

unless ((length($mrn) == 10 and $mrn =~ /^99/) or (length($mrn) == 9 and $mrn =~ /^(1|2)/)) {
die "$lib has invalid MRN: $1 MRN must be either a 10-digit number starting with 99 or a 9-digit number starting with 1i or 2";
}
$lib =~ s/\s+//g;
my ($sample, $mrn, $accession) = $lib =~ /^([A-Z]{4}\-(\d+)\-([A-Z]\d+\-\d+)\-[A-Z0-9]+)\-lib/;
my $id = $lib =~ /^H_/ ? 'NONE' : $mrn.'_'.$accession;
($sample) = $lib =~ /^(H_\S+)\-lib/ if $lib =~ /^H_/;

if ($exception =~ /NOTRANSFER/ and $exception =~ /RESEQ/) {
die "$lib has both NOTRANSFER and RESEQ as exception and only one is allowed.";
}
unless ($lib =~ /^H_/) {
unless ($mrn and $accession) {
die "Library name: $lib must contain MRN, accession id and specimen type";
}

my $id = $mrn.'_'.$accession;
unless (exists $hash{$id}) {
if ($exception and $exception =~ /NOTRANSFER|RESEQ/) {
print "$id is NOTRANSFER or RESEQ and no-matching of CoPath dump is ok\n";
unless ((length($mrn) == 10 and $mrn =~ /^99/) or (length($mrn) == 9 and $mrn =~ /^(1|2)/)) {
die "$lib has invalid MRN: $1 MRN must be either a 10-digit number starting with 99 or a 9-digit number starting with 1i or 2";
}
else {
die "FAIL to find matching $mrn and $accession from CoPath dump for library: $lib";

unless (exists $hash{$id}) {
if ($exception and $exception =~ /NOTRANSFER|RESEQ/) {
print "$id is NOTRANSFER or RESEQ and no-matching of CoPath dump is ok\n";
}
else {
die "FAIL to find matching $mrn and $accession from CoPath dump for library: $lib";
}
}
}

my ($index) = $index_str =~ /([ATGC]{8})AT\-AAAAAAAAAA/;

if ($exception) {
push @cases_excluded, $lib.'_'.$index if $exception =~ /NOTRANSFER/;
if ($exception =~ /NOTRANSFER|RESEQ/) {
$exception =~ s/,?(NOTRANSFER|RESEQ),?//;
$exception = 'NONE' if $exception =~ /^\s*$/;
}
push @cases_excluded, $lib.'_'.$index if $exception =~ /NOTRANSFER/;
}
else {
$exception = 'NONE';
}

my ($sex, $DOB);
my ($sex, $DOB, $all_MRNs);
if ($hash{$id}) {
$sex = $hash{$id}->{sex};
$DOB = $hash{$id}->{DOB};
$all_MRNs = $hash{$id}->{all_MRNs};
unless ($sex eq 'male' or $sex eq 'female') {
die "Unknown gender: $sex for library: $lib";
}
}
else { #NOTRANSFER RESEQ
($mrn, $accession, $sex, $DOB) = ('NONE') x 4;
($mrn, $accession, $sex, $DOB, $all_MRNs) = ('NONE') x 5;
}

$ds_str .= join ',', $lane, $lib, $lib, '', $index, '';
$ds_str .= "\n";
$si_str .= join "\t", $index, $lib, $seq_id, $flowcell, $lane, $lib, $sample, $mrn, $accession, $DOB, $sex, $exception;
$si_str .= join "\t", $index, $lib, $seq_id, $flowcell, $lane, $lib, $sample, $mrn, $all_MRNs, $accession, $DOB, $sex, $exception;
$si_str .= "\n";

$seq_id++;
Expand Down Expand Up @@ -249,6 +256,9 @@
if (@cases_excluded and $inputs->{'MyeloseqHD.DataTransfer'} eq 'true') {
$inputs->{'MyeloseqHD.CasesExcluded'} = join ',', @cases_excluded;
}
else {
delete $inputs->{'MyeloseqHD.CasesExcluded'};
}

my $input_json = File::Spec->join($out_dir, 'MyeloseqHD.json');
my $json_fh = IO::File->new(">$input_json") or die "fail to write to $input_json";
Expand Down

0 comments on commit a10540e

Please sign in to comment.