-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathhelper.py
executable file
·222 lines (190 loc) · 5.68 KB
/
helper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
#!/usr/bin/python
# Module-wide imports and configuration shared by the helpers in this file.
import bz2
import gzip
import json
import os
import re
import struct
import sys
from os.path import (expanduser, basename)

# Make modules located in the current working directory importable.
BASE_DIR = os.getcwd()
sys.path.append(BASE_DIR)

# Largest value representable as an unsigned 32-bit integer.
MAX_INT = int(2**32-1)

# Switches read by the helpers in this file.
# DEBUG used to be assigned False and then re-assigned True a few lines later;
# only the last assignment ever took effect, so it is now stated once.
DEBUG = True
PRODUCTION = False

# Home directory of the current user.
home = expanduser("~")
# Directory that contains this source file.
pwd = os.path.dirname(os.path.abspath(__file__))
# Alternation of three groups: a run of letters/underscores, a run of digits,
# or a run of non-word characters.
regex = r'([A-Za-z_]+)|([0-9]+)|(\W+)'
class random:
    """
    Random-integer helpers built on raw 32-bit words.

    Note that inside this module the name ``random`` refers to this class,
    not to the standard-library module of the same name.
    """

    # Standard-library generator used when DEBUG is off; created on first use.
    _seeded_rng = None

    @staticmethod
    def randints(s, e, n=1):
        """
        returns n random integers from [s, e] (including both ends).

        Each value is a 32-bit word reduced modulo the size of the range, so
        the distribution is only approximately uniform and ranges wider than
        2**32 are not fully covered.

        :param s: lower bound (inclusive)
        :param e: upper bound (inclusive), must satisfy e >= s
        :param n: how many numbers to draw; values below 1 are treated as 1
        :return: list of n integers
        """
        assert e>=s, "Wrong range: [{}, {}]".format(s, e)
        n = max(1, n)
        # "+ 1" makes the upper bound reachable and keeps the modulus non-zero
        # when s == e (previously `a % (e - s)` divided by zero in that case).
        span = e - s + 1
        if DEBUG:
            raw = struct.unpack('<%dL' % n, os.urandom(4 * n))
        else:
            # The former code called seed()/random() on this very class and
            # could not run. Use a stdlib generator seeded with 0 instead, so
            # that non-DEBUG runs are reproducible.
            if random._seeded_rng is None:
                import random as _stdlib_random
                random._seeded_rng = _stdlib_random.Random(0)
            rng = random._seeded_rng
            raw = [rng.getrandbits(32) for _ in range(n)]
        return [s + a % span for a in raw]

    @staticmethod
    def randint(s, e):
        """
        returns one random integer between s and e (both inclusive). Try using
        @randints in case you need multiple random integers; it draws all the
        random bytes in a single call.
        """
        return random.randints(s, e, 1)[0]

    @staticmethod
    def choice(arr):
        """
        returns one randomly chosen element of arr. An empty arr trips the
        range assertion in @randints.
        """
        i = random.randint(0, len(arr) - 1)
        assert i<len(arr), "Length exceeded by somehow! Should be < {}, but it is {}"\
            .format(len(arr), i)
        return arr[i]

    @staticmethod
    def sample(arr, n):
        """
        returns n elements of arr picked independently, so the same position
        may be picked more than once. At least one element is always returned
        because @randints treats n < 1 as 1.
        """
        return [arr[i] for i in random.randints(0, len(arr) - 1, n)]
# returns the type of file.
def file_type(filename, param='r'):
    """
    Guess the compression format of a file.

    This replaces two consecutive definitions of ``file_type``: the second one
    silently shadowed the first. Both call styles, ``file_type(name)`` and
    ``file_type(name, mode)``, are accepted.

    :param filename: path of the file
    :param param: the mode the caller intends to open the file with. For a
        mode starting with 'w' the file is not read; the text after the last
        '.' of filename is returned instead.
    :return: "gz", "bz2" or "zip" when the leading bytes match the respective
        magic number, "no match" otherwise (or the extension, see above)
    """
    if param.startswith('w'):
        return filename.split('.')[-1]
    magic_dict = {
        b"\x1f\x8b\x08": "gz",
        b"\x42\x5a\x68": "bz2",
        b"\x50\x4b\x03\x04": "zip"
    }
    max_len = max(len(magic) for magic in magic_dict)
    # Always read raw bytes: magic numbers are binary and need not decode as
    # text.
    with open(filename, 'rb') as f:
        file_start = f.read(max_len)
    for magic, filetype in magic_dict.items():
        if file_start.startswith(magic):
            return filetype
    return "no match"
def open_(filename, mode='r'):
    """
    Open a plain, bz2- or gzip-compressed file with one call.

    Replace with tarfile.open in future, and ignore Python2.

    When writing (mode starts with 'w') the format is taken from the text
    after the last '.' in filename; otherwise it is detected by file_type().
    Compressed files are opened as text with errors='replace' unless the mode
    asks for binary ('b'), so gz and bz2 files behave alike.

    :param filename: path of the file
    :param mode: mode string as accepted by the builtin open()
    :return: an open file object; the caller is responsible for closing it
    """
    if mode.startswith('w'):
        # The file may not exist yet, so its content cannot be inspected.
        type_ = filename.split('.')[-1]
    else:
        type_ = file_type(filename)

    if type_ == "bz2":
        opener = bz2.open
    elif type_ == "gz":
        opener = gzip.open
    else:
        return open(filename, mode)

    if 'b' in mode:
        # errors= is only valid for text mode.
        return opener(filename, mode)
    # bz2/gzip default to binary; request text explicitly.
    text_mode = mode if 't' in mode else mode + 't'
    return opener(filename, text_mode, errors='replace')
def print_err(*args):
    """
    Write the arguments, converted with str() and separated by single spaces,
    as one line to stderr. Nothing is written unless DEBUG compares equal to
    True.
    """
    # Equality rather than truthiness is kept: only True (or 1) enables output.
    if not (DEBUG == True):
        return
    line = ' '.join(map(str, args))
    sys.stderr.write(line + '\n')
def print_production(*args):
    """
    Write the arguments, converted with str() and separated by single spaces,
    as one line to stderr. Nothing is written unless PRODUCTION compares equal
    to True.
    """
    # Equality rather than truthiness is kept: only True (or 1) enables output.
    if not (PRODUCTION == True):
        return
    line = ' '.join(map(str, args))
    sys.stderr.write(line + '\n')
# Hashes of the argument tuples that print_once() has already printed.
printed_once_dict = {}


def print_once(*args):
    """
    Print the argument tuple to stdout the first time it is seen; later calls
    with arguments that hash to the same value print nothing.

    All arguments must be hashable, otherwise hash() raises TypeError.
    """
    h = hash(args)
    if h not in printed_once_dict:
        printed_once_dict[h] = True
        # Parenthesised form: valid on Python 3 and prints the same tuple
        # representation the former Python 2 print statement produced.
        print(args)
def whatchar(c):
    """
    Classify the string c: 'W' when c.isalpha() holds, 'D' when c.isdigit()
    holds, and 'Y' for everything else (including the empty string).
    """
    if c.isalpha():
        return 'W'
    return 'D' if c.isdigit() else 'Y'
from math import sqrt
#A = [('asdf',12), ('swer', 213)..]
#p = 15 --> res swer
def bin_search(A, p, s, e):
    """Search p in A.

    A is a sequence of (item, value) pairs whose values must be in
    non-decreasing order for the result to be meaningful. Returns the item of
    the first pair in A[s:e] whose value is strictly greater than p.

    :param A: sequence of (item, value) pairs
    :param p: the value to look up
    :param s: first index of the search range (inclusive)
    :param e: end index of the search range (exclusive)
    :return: the item of the matching pair
    :raises ValueError: when no pair in A[s:e] has a value greater than p
        (the former recursive version never terminated in that case)
    """
    lo, hi = s, e
    while lo < hi:
        # Floor division keeps the index an int on Python 3 as well.
        mid = (lo + hi) // 2
        if A[mid][1] > p:
            hi = mid          # A[mid] qualifies; an earlier pair might too
        else:
            lo = mid + 1      # A[mid] is too small, skip past it
    if lo >= e:
        raise ValueError("No entry with value greater than {}".format(p))
    return A[lo][0]
def mean_sd(arr):
    """
    Return the mean and the standard deviation of the numbers in arr.

    The deviation is computed as sqrt(|n * sum(x^2) - sum(x)^2|) / n, i.e. the
    population form that divides by n.

    :param arr: a sized collection of numbers (it is iterated twice)
    :return: tuple (mean, standard deviation)
    :raises ZeroDivisionError: when arr is empty
    """
    s = sum(arr)
    s2 = sum(x * x for x in arr)
    n = len(arr)
    m = s / float(n)
    # Presumably abs() guards against a slightly negative value caused by
    # floating-point rounding.
    scaled_var = s2 * n - s * s
    try:
        sd = sqrt(abs(scaled_var)) / n
    except ValueError:
        # A single pre-formatted string prints identically on Python 2 and 3.
        print("In mean_sd: {} {}".format(arr, scaled_var))
        # Re-raise the original error instead of a new, message-less one.
        raise
    return m, sd
def convert2group(t, totalC):
    """
    Return t plus a random multiple of totalC.

    The multiplier is drawn with random.randint from 0 up to
    (MAX_INT - t) // totalC, so the result is congruent to t modulo totalC and
    does not exceed MAX_INT.

    :param t: the base value
    :param totalC: the step between candidate results
    :return: t + k * totalC for a random k
    """
    # Floor division: "/" would hand a float bound to randint on Python 3.
    max_multiplier = (MAX_INT - t) // totalC
    return t + random.randint(0, max_multiplier) * totalC
# assumes last element in the array(A) is the sum of all elements
def getIndex(p, A):
    """
    Map p to an index of A.

    p is first reduced modulo A[-1]; the elements of A are then subtracted one
    after another, and the index at which the remainder drops below zero is
    returned. If that never happens, the last index is returned.
    An empty A raises IndexError when A[-1] is read.
    """
    remainder = p % A[-1]
    idx = 0
    for idx, weight in enumerate(A):
        remainder -= weight
        if remainder < 0:
            return idx
    return idx
from multiprocessing import Pool
import itertools
def wrap_func(args):
    """
    Apply a function to every item of a collection.

    :param args: a pair (func, data); taking both in a single argument lets
        the pair be handed over as one item of an iterable
    :return: list with func(d) for each d in data, in order
    """
    func, data = args
    results = list(map(func, data))
    print_err('done', len(results))
    return results
def ProcessParallel(func, data, func_load=10):
    """
    its a wrapper over multiprocess.Pool

    Applies func to every item of data and returns the results in input
    order. When len(data) // func_load exceeds 10, data is cut into about
    func_load chunks which are handled by wrap_func: one after another in this
    process when DEBUG is set, in a multiprocessing Pool otherwise. Smaller
    inputs are processed directly in this process.

    :param func: function of one argument; it is sent to the worker processes
        when a Pool is used
    :param data: a sized, sliceable collection of items
    :param func_load: the number of chunks to aim for
    :return: list of func(d) for every d in data
    """
    total = len(data)
    # Floor division keeps the per-chunk size an int on Python 3.
    if total // func_load <= 10:
        return wrap_func((func, data))

    # Round the chunk size up and step through the whole input. The previous
    # fixed "10 chunks of len(data)/func_load items" dropped the trailing
    # items whenever the length was not an exact multiple.
    chunk = -(-total // func_load)
    split_data = [(func, data[i:i + chunk]) for i in range(0, total, chunk)]
    if DEBUG:
        parts = [wrap_func(job) for job in split_data]
    else:
        pool = Pool()
        try:
            parts = pool.map(func=wrap_func, iterable=split_data)
        finally:
            # Release the worker processes even if a job failed.
            pool.close()
            pool.join()
    return list(itertools.chain.from_iterable(parts))
def diff(oldG, newG):
    """
    returns the difference of the two grammars.

    This is a generator. If either argument is not a dict, the pair
    (oldG, newG) is yielded as is. Otherwise, for every key of oldG:
      * a key missing from newG is yielded by itself;
      * a key whose values differ is compared recursively, and whatever that
        comparison yields is passed on.
    Keys that exist only in newG are not reported.
    """
    if not (isinstance(oldG, dict) and isinstance(newG, dict)):
        yield (oldG, newG)
        return
    for k in oldG:
        if k not in newG:
            yield k
            continue
        vold, vnew = oldG[k], newG[k]
        if vold != vnew:
            # The nested generator has to be iterated; merely calling diff()
            # (as the code did before) produced nothing.
            for nested in diff(vold, vnew):
                yield nested
if __name__=='__main__':
    # Script mode: read the file named by the first command-line argument and
    # store its entries in an IntCompletionDAWG saved next to it as "*.dawg".
    import dawg

    src = sys.argv[1]
    # NOTE(review): get_line is not defined in this file; it has to be
    # provided before this script can run. `lim` was previously given to
    # dict(), which added a bogus 'lim' entry to the data; it is presumably
    # meant for get_line -- confirm against get_line's signature.
    with open_(src) as src_file:
        pws = dict(get_line(src_file, lim=1e7))

    new_fname = src.replace('.tar.bz', '.dawg')
    if new_fname == src:
        # Cut the extension(s) off the file name only, so that a dot in a
        # directory name (e.g. "./data/pw.txt") does not truncate the path.
        directory, base = os.path.split(src)
        stem = base.split('.', 1)[0] or base
        new_fname = os.path.join(directory, stem + '.dawg')
    assert new_fname != src, "Give a better name to your original file."

    T = dawg.IntCompletionDAWG(pws.items())
    T.save(new_fname)