-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathanonymizer.py
193 lines (171 loc) · 5.83 KB
/
anonymizer.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
__author__ = 'cami'
__version__ = '0.0.5'
import sys
import io
import math
import argparse
from Bio import SeqIO
from scripts.Validator.sequencevalidator import SequenceValidator
class Anonymizer(SequenceValidator):
    """
    Replace the identifiers of streamed sequence records with anonymous ones.

    Records are parsed with Bio.SeqIO in either 'fasta' or 'fastq' format.
    Every record gets a new id built from an optional prefix and a running
    counter, its description is cleared, and the record is written to the
    output stream in the same format it was read in.
    For every record a line "<old id><TAB><new id>" is written to a mapping stream.

    Example of a fastq input holding one read pair:
        @readid/1
        sequence
        +
        quality
        @readid/2
        sequence
        +
        quality

    Two modes are available:
        - single records (anonymize_sequences): ids are <prefix><n>
        - pairs (anonymize_sequence_pairs): two consecutive records share
          one number, ids are <prefix><n>/1 and <prefix><n>/2

    The prefix is optional (e.g. A|S1|R for reads from sample 1 in dataset A).
    """

    _label = "Anonymizer"
    _legal_formats = ["fastq", "fasta"]

    def anonymize_sequences(
            self, mapping, input_stream=sys.stdin, output_stream=sys.stdout,
            sequence_prefix='', file_format="fasta"):
        """
        Anonymize sequences and write a mapping file

        New ids are "<sequence_prefix><n>", with n counting records from 0.

        @attention: only 'fasta' and 'fastq' supported

        @param mapping: output stream for sequence id mapping
        @type mapping: file | io.FileIO | StringIO.StringIO
        @param input_stream: Input stream of fasta/fastq format data
        @type input_stream: file | io.FileIO | StringIO.StringIO
        @param output_stream: Output stream of anonymous fasta/fastq format data
        @type output_stream: file | io.FileIO | StringIO.StringIO
        @param sequence_prefix: Prefix of the anonymous sequence id.
        @type sequence_prefix: str
        @param file_format: Format of input and output. Either 'fasta' or 'fastq' (case insensitive).
        @type file_format: str

        @return: None
        @rtype: None
        """
        self._anonymize_records(
            mapping, input_stream, output_stream, sequence_prefix, file_format, False)

    def anonymize_sequence_pairs(
            self, mapping, input_stream=sys.stdin, output_stream=sys.stdout,
            sequence_prefix='', file_format="fasta"):
        """
        Anonymize pairs of sequences and write a mapping file

        Two consecutive records are treated as one pair. New ids are
        "<sequence_prefix><n>/1" and "<sequence_prefix><n>/2", with n counting
        pairs from 0. If the input holds an odd number of records, the last
        record gets a "/1" id without a matching "/2" record.

        @attention: only 'fasta' and 'fastq' supported

        @param mapping: output stream for sequence id mapping
        @type mapping: file | io.FileIO | StringIO.StringIO
        @param input_stream: Input stream of fasta/fastq format data
        @type input_stream: file | io.FileIO | StringIO.StringIO
        @param output_stream: Output stream of anonymous fasta/fastq format data
        @type output_stream: file | io.FileIO | StringIO.StringIO
        @param sequence_prefix: Prefix of the anonymous sequence id.
        @type sequence_prefix: str
        @param file_format: Format of input and output. Either 'fasta' or 'fastq' (case insensitive).
        @type file_format: str

        @return: None
        @rtype: None
        """
        self._anonymize_records(
            mapping, input_stream, output_stream, sequence_prefix, file_format, True)

    def _anonymize_records(
            self, mapping, input_stream, output_stream, sequence_prefix, file_format, paired):
        """
        Shared implementation of anonymize_sequences and anonymize_sequence_pairs

        @param mapping: output stream for sequence id mapping
        @type mapping: file | io.FileIO | StringIO.StringIO
        @param input_stream: Input stream of fasta/fastq format data
        @type input_stream: file | io.FileIO | StringIO.StringIO
        @param output_stream: Output stream of anonymous fasta/fastq format data
        @type output_stream: file | io.FileIO | StringIO.StringIO
        @param sequence_prefix: Prefix of the anonymous sequence id.
        @type sequence_prefix: str
        @param file_format: Either 'fasta' or 'fastq' (case insensitive).
        @type file_format: str
        @param paired: If True, number consecutive records as mates of a pair.
        @type paired: bool

        @return: None
        @rtype: None
        """
        # NOTE: validation is kept as assert statements so existing callers
        # still see AssertionError; be aware that they vanish under `python -O`.
        # is_stream is not defined in this class; presumably it is inherited
        # from SequenceValidator.
        assert self.is_stream(input_stream)
        assert self.is_stream(output_stream)
        assert self.is_stream(mapping)
        assert isinstance(sequence_prefix, str)
        assert isinstance(file_format, str)
        file_format = file_format.lower()
        assert file_format in self._legal_formats

        for record_index, seq_record in enumerate(SeqIO.parse(input_stream, file_format)):
            if paired:
                # Integer division keeps the pair number exact; the remainder
                # selects mate 1 or mate 2.
                anonymous_id = "{pre}{ct}/{pair}".format(
                    pre=sequence_prefix,
                    ct=record_index // 2,
                    pair=record_index % 2 + 1)
            else:
                anonymous_id = "{pre}{ct}".format(pre=sequence_prefix, ct=record_index)

            # The mapping line must be written before the old id is overwritten.
            mapping.write("{oldid}\t{newid}\n".format(oldid=seq_record.id, newid=anonymous_id))
            seq_record.id = anonymous_id
            # Clear the description so it is not written after the new id.
            seq_record.description = ''
            output_stream.write(seq_record.format(file_format))
if __name__ == "__main__":
    # Command line entry point: parse the options, anonymize the input either
    # as single records (-s) or as pairs (default), then close the streams
    # that were opened by argparse.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "-log",
        help="log file",
        action='store',
        type=argparse.FileType('a'),
        default=None)
    parser.add_argument(
        "-input",
        help="input file (e.g. shuffled reads), reads from std.in by default",
        action='store',
        type=argparse.FileType('r'),
        default=sys.stdin)
    parser.add_argument(
        "-out",
        help="output file, actually same as input file but with anonymous ids",
        action='store',
        type=argparse.FileType('w'),
        default=sys.stdout)
    # The mapping stream is written to and closed unconditionally, so running
    # without it could only end in a traceback. Requiring it makes argparse
    # report a usage error instead.
    parser.add_argument(
        "-map",
        help="mapping file: old IDs -> anonymous IDs",
        action='store',
        type=argparse.FileType('w'),
        required=True)
    parser.add_argument(
        "-prefix",
        help="prefix for the anonymous ids",
        action='store',
        default="")
    parser.add_argument(
        "-s",
        help="process single blocks instead of pairs (e.g. single reads instead of paired end)",
        action="store_true",
        default=False)
    parser.add_argument(
        "-format",
        help="format of the input file fasta or fastq",
        action='store',
        choices=["fasta", "fastq"],
        default="fasta")
    options = parser.parse_args()

    input_file_stream = options.input
    map_file_stream = options.map
    output_file_stream = options.out
    log_file_stream = options.log

    # NOTE(review): the log stream is handed to Anonymizer and never closed
    # here - presumably the validator/logger owns it; confirm.
    anonymizer = Anonymizer(logfile=log_file_stream)

    # Both methods share the same signature, only the id scheme differs.
    if options.s:
        anonymize = anonymizer.anonymize_sequences
    else:
        anonymize = anonymizer.anonymize_sequence_pairs

    try:
        anonymize(
            mapping=map_file_stream,
            input_stream=input_file_stream,
            output_stream=output_file_stream,
            sequence_prefix=options.prefix,
            file_format=options.format)
    finally:
        # Close what argparse opened even if anonymizing failed, but leave the
        # interpreter's standard streams alone ("-" maps to stdin/stdout).
        if map_file_stream is not sys.stdout:
            map_file_stream.close()
        if output_file_stream is not sys.stdout:
            output_file_stream.close()
        if input_file_stream is not sys.stdin:
            input_file_stream.close()