-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSnake_hmm_all
175 lines (157 loc) · 6.72 KB
/
Snake_hmm_all
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
# Find the name of runs and samples
import glob, os
import numpy as np
configfile: "config_hmm_all.yaml"
work_dir = config["working_folder"]
out_dir = config["output_folder"]
spades_folder = config["spades_folder"]
reads_folder = config["reads_folder"]
model_folder = config["model_folder"]
fastq_folder = config["fastq_folder"]
sample_run_file = glob.glob(reads_folder+"*.txt", recursive=True)
runs_sizes = [os.stat(x).st_size for x in sample_run_file]
sample_run_file_np = np.array(sample_run_file)
runs_sizes_np = np.array(runs_sizes)
# to start with small files:
#sample_run_file_2 = sample_run_file_np[runs_sizes_np <= np.percentile(runs_sizes_np, 30)].tolist()
sample_run_file_2 = sample_run_file_np.copy()
runs = [f.split("/")[-1].split(".")[-2].split("_")[-2] for f in sample_run_file_2 ]
if not os.path.exists(out_dir + "hmm"):
os.makedirs(out_dir + "hmm")
rule all:
input:
reads_out = expand(out_dir+"genes/{runs}/{class_ab}.fasta", runs=runs, class_ab=config["class_name"])
rule extract_fasta:
input:
reads_class1 = reads_folder + "{runs}_class.txt",
output:
reads1 = out_dir+"fasta/{runs}/{runs}_{class_ab}_positive_1.fastq",
reads2 = out_dir+"fasta/{runs}/{runs}_{class_ab}_positive_2.fastq",
reads3 = out_dir+"fasta/{runs}/{runs}_{class_ab}_out.txt"
conda:
"fargene"
shadow: "minimal"
params:
num_of_processes = 1,
work_dir = work_dir,
name_code = lambda wildcards: config["class_ab"][wildcards.class_ab]["out_tisarg"],
fastq_folder = fastq_folder
shell:
"""
cat {input.reads_class1} |grep {params.name_code} |awk -F "," '{{print $1}}' |awk -F "_" '{{print $1}}' |sort |uniq > tmp_class_read.txt ||true
if [ -s tmp_class_read.txt ]; then
seqtk subseq {params.fastq_folder}{wildcards.runs}/{wildcards.runs}_1_QC.fastq tmp_class_read.txt > {output.reads1}
seqtk subseq {params.fastq_folder}{wildcards.runs}/{wildcards.runs}_2_QC.fastq tmp_class_read.txt > {output.reads2}
echo "done {wildcards.runs}" > {output.reads3}
else
touch {output.reads1}
touch {output.reads2}
echo "no reads, done {wildcards.runs} {wildcards.class_ab}" > {output.reads3}
fi
"""
rule make_assembly:
input:
reads1 = out_dir+"fasta/{runs}/{runs}_{class_ab}_positive_1.fastq",
reads2 = out_dir+"fasta/{runs}/{runs}_{class_ab}_positive_2.fastq",
reads3 = out_dir+"fasta/{runs}/{runs}_{class_ab}_out.txt"
output:
reads1 = out_dir+"assembly/{runs}_{class_ab}/{runs}_{class_ab}-contigs.fasta",
reads2 = out_dir+"assembly/{runs}_{class_ab}/{runs}_{class_ab}-amino-contigs.fasta",
reads3 = out_dir+"assembly/{runs}_{class_ab}/{runs}_{class_ab}_out.txt"
conda:
"fargene"
shadow: "minimal"
params:
num_of_processes = 1,
work_dir = work_dir,
spades = spades_folder,
assembly_dir = lambda wildcards: out_dir + "assembly/" + wildcards.runs + "_" + wildcards.class_ab + "/"
shell:
"""
echo "assembly {wildcards.runs}"
echo "input files"
cat {input.reads3}
echo "continue to assembly"
if [ -s {input.reads1} ]; then
echo "yes reads forward"
if [ -s {input.reads2} ]; then
echo "yes reads reverse"
echo "doing spades"
{params.spades} --meta -1 {input.reads1} -2 {input.reads2} -o {params.assembly_dir} > {wildcards.runs}_summary.txt
echo "done spades"
else
echo " no reads reverse"
fi
else
"making empty files"
touch {output.reads1}
touch {output.reads2}
fi
echo "assembly done, continue to check if contig file exists"
if [ -f {params.assembly_dir}contigs.fasta ]; then
if [ -s {params.assembly_dir}contigs.fasta ]; then
mv {params.assembly_dir}contigs.fasta {output.reads1}
transeq {output.reads1} {output.reads2} -frame=6 -table=11 sformat=pearson
else
touch {output.reads1}
touch {output.reads2}
fi
else
touch {output.reads1}
touch {output.reads2}
fi
echo "done with assembly {wildcards.runs}" > {output.reads3}
"""
rule run_hmm:
input:
reads1 = out_dir + "assembly/{runs}_{class_ab}/{runs}_{class_ab}-amino-contigs.fasta",
reads2 = out_dir + "assembly/{runs}_{class_ab}/{runs}_{class_ab}_out.txt"
output:
genes_out = out_dir + "genes/{runs}/{class_ab}.fasta",
genes_list_out = out_dir + "hmm_filter/{runs}/{class_ab}.csv",
hmm_out = out_dir + "hmm/{runs}_{class_ab}_hmmsearched.out"
shadow: "minimal"
params:
num_of_processes = 1,
model = lambda wildcards: model_folder + config["class_ab"][wildcards.class_ab]["model"] + ".hmm",
threshold = lambda wildcards: config["class_ab"][wildcards.class_ab]["score"]
conda:
"fargene"
shell:
"""
echo "starting HMM"
echo "hmm {wildcards.runs}"
echo "running hmm"
if [ -s {input.reads1} ]; then
hmmsearch --domtblout {output.hmm_out} -E 1000 --domE 1000 {params.model} {input.reads1} > temp_out.txt
echo "hmm done, goint to do filter"
cat {output.hmm_out} |grep -v '^#' > {wildcards.runs}_{wildcards.class_ab}_temp_out1.txt ||true
echo "filter hmm started"
if [ -s {wildcards.runs}_{wildcards.class_ab}_temp_out1.txt ]; then
echo "there are some matches to be evaluated"
grep -v '^#' {wildcards.runs}_{wildcards.class_ab}_temp_out1.txt | awk '$14>{params.threshold} {{print $1,$3,$20,$21}}' > temp_out2.csv
echo "matches and metadata created"
grep -v '^#' {wildcards.runs}_{wildcards.class_ab}_temp_out1.txt | awk '$14>{params.threshold} {{print $1}}' > tmptlist.txt
echo "matches created"
else
echo "empty file hmm"
touch tmptlist.txt
fi
if [ -s tmptlist.txt ]; then
cat temp_out2.csv > {output.genes_list_out}
seqtk subseq {input.reads1} tmptlist.txt > tmp.fasta
sed '/^>/s/$/@@@{wildcards.class_ab}/' tmp.fasta > {output.genes_out}
else
echo " empty file "
echo " empty file "
echo " empty file "
echo " empty file "
touch {output.genes_list_out}
touch {output.genes_out}
fi
else
touch {output.genes_list_out}
touch {output.genes_out}
touch {output.hmm_out}
fi
"""