-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathreadpw.py
executable file
·431 lines (369 loc) · 12.9 KB
/
readpw.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
#!/usr/bin/env python3
from __future__ import print_function
import sys, os
import bz2, re
import itertools
import operator
import marisa_trie
import numpy as np
from os.path import (expanduser)
from math import sqrt
# opens file checking whether it is bz2 compressed or not.
import tarfile
"""A simple password library. Has function to put passwords into nice data
structure for fast look up.
The file creates a cache databse file in home folder with the name .passwords.
In Unix you can find it using `~/.pypasswords`.
Run with Python 3 your life will be much easier. """
MAX_INT = 2**64-1
DEBUG = True
home = expanduser("~")
pass_dir = os.path.join(home, '.pypasswords')
ROCKYOU_TOTAL_CNT = 32603388.0
def sample_following_dist(handle_iter, n, totalf):
"""Samples n passwords following the distribution from the handle
@handle_iter is an iterator that gives (pw,f) @n is the total
number of samle asked for @totalf is the total number of users,
which is euqal to sum(f for pw,f in handle_iter)
As, handle_iterator is an iterator and can only traverse once, @totalf
needs to be supplied to the funciton.
Returns, an array of @n tuples (id, pw) sampled from @handle_iter.
"""
multiplier = 1.0
if totalf == 1.0:
multiplier = 1e8
# print "WARNING!! I don't except probabilities"
totalf = totalf * multiplier
print("# Population Size", totalf)
A = np.sort(np.unique(np.random.randint(0, totalf, size=n*2))[:n])
A = A[::-1]
# Uniqueness check, non necessarily required, but not very
# computationally intensive
assert len(A) == n, "Not enough randomnumbers generated"\
"Requried {}, generated only {}".format(n, len(A))
j = 0
sampled = 0
val = A.pop()
# print handle_iter
for _,w,f in handle_iter:
j += f*multiplier
if not A: break
while val<j:
sampled += 1
if sampled %5000 == 0:
print ("Sampled:",sampled)
yield (val, w)
if A:
val = A.pop()
else:
break
print ("# Stopped at:", w, f, j, '\n')
while A and val<j:
yield (val, w)
if A:
i, val = A.pop()
else:
break
def MILLION(n):
return n*10e6
def sort_dict(D):
# sort the dictionary by keys and returns a tuple list
return sorted(D.items(), key=operator.itemgetter(1))
# returns the type of file.
def file_type(filename):
magic_dict = {
b"\x1f\x8b\x08": "gz",
b"\x42\x5a\x68": "bz2",
b"\x50\x4b\x03\x04": "zip"
}
max_len = max(len(x) for x in magic_dict)
with open(filename, 'rb') as f:
file_start = f.read(max_len)
for magic, filetype in magic_dict.items():
if file_start.startswith(magic):
return filetype
return "no match"
def open_(filename, mode='r'):
"""Replace with tarfile.open in future, and ignore Python2"""
if mode == 'w':
type_ = filename.split('.')[-1]
else:
type_ = file_type(filename)
if type_ == "bz2":
f = bz2.open(filename, mode + 't', errors='replace')
elif type_ == "gz":
f = tarfile.open(filename, mode)
else:
f = open(filename, mode)
return f
def getallgroups(arr, k=-1):
"""
returns all the subset of @arr of size less than equalto @k
the return array will be of size sum_{i=1}^k nCi, n = len(arr)
"""
if k<0:
k = len(arr)
return itertools.chain.from_iterable(
itertools.combinations(set(arr), j)
for j in range(1,k+1)
)
def is_asciistring(s):
try:
s.decode('ascii')
return True
except (UnicodeDecodeError, UnicodeEncodeError) as e:
# warning("UnicodeError:", s, str(e))
return False
def get_line(file_object, limit=-1, pw_filter=lambda x: True):
regex = re.compile(r'\s*([0-9]+) (.*)$')
i = 0
for l in file_object:
if limit>0 and limit<=i:
break
m = regex.match(l)
if not m:
warning ("REGEX FAIL: ", l)
c, w = m.groups()
c = int(c)
w = w.replace('\x00', '\\x00')
# try:
# w = w.decode('utf-8', errors='replace')
# except UnicodeDecodeError:
# #try with latin1
# warning("Error in decoding: ({} {}). Line: {}. Ignoring!"\
# .format(w, c, l))
# continue
if w and pw_filter(w) and c>0:
i += 1
yield w,c
else:
pass
#warning ("Filter Failed or malformed string: ", w, c)
def open_get_line(filename, limit=-1, **kwargs):
"""Opens the password file named @filename and reads first @limit
passwords. @kwargs are passed to get_line for further processing.
For example, pw_filter etc.
@fielname: string
@limit: integer
"""
with open_(filename) as f:
for w,c in get_line(f, limit, **kwargs):
yield w, c
regex = r'([A-Za-z_]+)|([0-9]+)|(\W+)'
def print_err( *args ):
if DEBUG:
sys.stderr.write(' '.join([str(a) for a in args])+'\n')
def tokens(w):
T = []
while w:
m = re.match(regex, w)
T.append(m.group(0))
w = w[len(T[-1]):]
return T
def whatchar(c):
return 'L' if c.isalpha() else \
'D' if c.isdigit else 'Y'
def mean_sd(arr):
s = sum(arr)
s2 = sum([x * x for x in arr])
n = len(arr)
m = s / float(n)
sd = sqrt(float(s2) / n - m * m)
return m, sd
def convert2group(t, totalC):
"""
What is this?
"""
return t + np.random.randint(0, (MAX_INT-t)/totalC) * totalC
def warning(*objs):
if DEBUG:
print("WARNING: ", *objs, file=sys.stderr)
# assumes last element in the array(A) is the sum of all elements
def getIndex(p, A):
p %= A[-1]
i = 0
for i, v in enumerate(A):
p -= v
if p < 0: break
return i
class Passwords(object):
"""Its a class to efficiently store and read large password file. It
creates two files for each password in the under the directory
'eff_data/' in home+.pypassword directory (~/.pypasswords). First file
is a trie, which just stores all the password in efficient prefix
trie format using "marisa_trie" module. The second is a numy large
array, containing the indicies. This is what I found the most
memory and compute efficient way of accessing passwords in Python.
@pass_file: the path of the file you want to process. The file
should countain freq and the password similar to the output of
unix "uniq -c" command.
@max_pass_len, min_pass_len defines the
range of password to consider. Note, this filtering does not
effect the totalf, and only changes the iterpws() function.
"""
def __init__(self, pass_file, max_pass_len=40, min_pass_len=1):
self.fbasename = os.path.basename(pass_file).split('.',1)[0]
_dirname = '{}/eff_data/'.format(pass_dir)
if not os.path.exists(_dirname):
os.makedirs(_dirname)
self._max_pass_len = max_pass_len
self._min_pass_len = min_pass_len
self._file_trie = os.path.join(_dirname, self.fbasename + '.trie')
self._file_freq = os.path.join(_dirname, self.fbasename + '.npz')
self._T, self._freq_list, self._totalf = None, None, None
if os.path.exists(self._file_trie) and os.path.exists(self._file_freq):
self.load_data()
else:
self.create_data_structure(pass_file)
assert self._T, "Could not initialize the trie."
def create_data_structure(self, pass_file):
# Record trie, Slow, and not memory efficient
# self._T = marisa_trie.RecordTrie(
# '<II', ((unicode(w), (c,))
# for i, (w,c) in
# enumerate(passwords.open_get_line(pass_file)))
# )
tmp_dict = {w: c for w,c in open_get_line(pass_file)}
self._T = marisa_trie.Trie(tmp_dict.keys())
self._freq_list = np.zeros(len(self._T), dtype=int)
for k in self._T.iterkeys():
self._freq_list[self._T.key_id(k)] = tmp_dict[k]
self._T.save(self._file_trie)
self._totalf = self._freq_list.sum()
np.savez_compressed(
self._file_freq,
freq=self._freq_list,
fsum=self._totalf
)
def sample_pws(self, n, asperdist=True):
"""Returns n passwords sampled from this password dataset. if
asperdist is True, then returns the password sampled according
the password histogram distribution (with
replacement). Passwords are always sampled with replacement.
TODO: The sample users, instead of passwords perse.
"""
if asperdist:
sample = np.random.choice(
self._freq_list.shape[0], size=n, p=self._freq_list/self._totalf
)
else:
sample = np.random.choice(len(self._T), size=n)
return (self._T.restore_key(i) for i in sample)
def load_data(self):
self._T = marisa_trie.Trie()
self._T.load(self._file_trie)
np_f = np.load(self._file_freq)
self._freq_list, self._totalf = np_f['freq'], np_f['fsum']
def totalf(self):
return self._totalf
def pw2id(self, pw):
try:
return self._T.key_id(pw)
except KeyError:
return -1
except UnicodeDecodeError as e:
print(repr(pw), e)
raise ValueError(e)
def id2pw(self, _id):
try:
return self._T.restore_key(_id)
except KeyError:
return ''
def prob(self, pw):
try:
return self.__getitem__(pw)/self._totalf
except ValueError:
return 0.0
def prob_(self, pw):
p = self.prob(pw)
if p > 0: return p
w = pw
p = 1
minp = 1e-2 # Some random probability of each character
while len(w) > 0:
w_prefix_arr = self._T.prefixes(w)
if not w_prefix_arr:
w = w[1:]
p *= minp
else:
w = w[len(w_prefix_arr[-1]):]
p *= self.prob(w_prefix_arr[-1])
return p
def pw2freq(self, pw):
try:
return self._freq_list[self._T.key_id(pw)]
# return self._T.get(unicode(pw), 0)
except KeyError:
return 0
def id2freq(self, _id):
_id = int(_id)
try:
return self._freq_list[_id]
except ValueError:
return 0
def sumvalues(self, q=0):
"""Sum of top q passowrd frequencies
"""
if q == 0:
return self._totalf
else:
return -np.partition(-self._freq_list, q)[:q].sum()
def iterpws(self, n):
"""
Returns passwords in order of their frequencies.
@n: The numebr of passwords to return
Return: pwid, password, frequency
Every password is assigned an uniq id, for efficient access.
"""
for _id in np.argsort(self._freq_list)[::-1][:n]:
pw = self._T.restore_key(_id)
if self._min_pass_len <= len(pw) <= self._max_pass_len:
yield _id, pw, self._freq_list[_id]
def justiter(self):
for w, _id in self._T.iteritems():
yield _id, w, self._freq_list[_id]
def keys(self):
return self._T.iterkeys()
def values(self):
return self._freq_list
def __iter__(self):
for _id in np.argsort(self._freq_list)[::-1]:
yield _id, self._freq_list[_id]
def __getitem__(self, k):
if not isinstance(k, (str, int)):
raise TypeError(
"_id is wrong type ({}) expects str or int"
.format(type(k)))
if isinstance(k, str):
k = self.pw2id(k)
if k > self._freq_list.shape[0] or k < 0:
raise ValueError("Index out of range, must between [0, {}]".format(len(self)))
return self._freq_list[k]
def __contains__(self, k):
return self.pw2id(k) >= 0
def __len__(self):
return self._freq_list.shape[0]
import unittest
class TestPasswords(unittest.TestCase):
def test_pw2freq(self):
passwords = Passwords(
os.path.expanduser('~/passwords/rockyou-withcount.txt.bz2')
)
for pw, f in {'michelle': 12714, 'george': 4749,
'familia': 1975, 'honeybunny': 242,
'asdfasdf2wg': 0, ' 234 adsf': 0}.items():
pw = pw
self.assertEqual(passwords.pw2freq(pw), f)
def test_getallgroups(self):
for inp, res in [(
[1,2,3], set([
(1,), (2,), (3,), (1,2), (2,3), (1,3), (1,2,3)])
)]:
res1 = set(getallgroups(inp))
self.assertEqual(res1, res)
if __name__ == "__main__":
# print(list(getallgroups([1,2,3,4,5,6,7,8,9], 5)))
# unittest.main()
pws = Passwords('/home/rahul/passwords/rockyou-withcount.txt.gz')
for s in [" ", "password", " password", "asdfasd a sdfasf "]:
print(s, pws.prob_(s))