forked from facelessuser/BracketHighlighter
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathure.py
261 lines (217 loc) · 6.95 KB
/
ure.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
"""
ure - unicode re
A simple script that wraps the re interface with methods to handle unicode properties.
Patterns will all have re.UNICODE enabled and unicode property formats will be replaced
with the unicode characters in that category.
Example:
r"\p{Ll}\p{Lu}"
Licensed under MIT
Copyright (c) 2013 Isaac Muse <[email protected]>
"""
import re
import sys
from os import unlink
from os.path import exists, join

try:
    import unicodedata
except ImportError:
    # Fallback kept from the original: retry with the interpreter's own
    # directory on the search path (presumably for embedded hosts that do
    # not expose unicodedata by default -- confirm against target hosts).
    from os.path import dirname
    sys.path.append(dirname(sys.executable))
    import unicodedata

try:
    # Python 2's C-accelerated pickler. Module names are case sensitive, so
    # the previous "cpickle" spelling could never be imported.
    import cPickle as pickle
except ImportError:
    import pickle
# True when running on Python 3 or newer.
PY3 = sys.version_info[0] >= 3
# Code point -> one-character text string on either major Python version.
uchr = chr if PY3 else unichr

# Re-export the flags and helpers of ``re`` under the same names.
DEBUG = re.DEBUG
I = IGNORECASE = re.IGNORECASE
L = LOCALE = re.LOCALE
M = MULTILINE = re.MULTILINE
S = DOTALL = re.DOTALL
U = UNICODE = re.UNICODE
X = VERBOSE = re.VERBOSE
escape = re.escape
purge = re.purge

# Lazily populated state: the category table, the compiled property matcher
# and a flag telling whether both have been prepared.
_unicode_properties = _unicode_key_pattern = None
_loaded = False

# Cache settings are only given defaults when the name is not already
# present in the module globals (presumably so that re-executing the module
# keeps a previously configured cache directory).
if "_use_cache" not in globals():
    _use_cache = None
    _cache_prefix = ""
def set_cache_directory(pth, prefix=""):
    """
    Record the directory and file-name prefix used for the on-disk cache.

    The settings are only stored when `pth` exists; otherwise the call
    leaves the current cache configuration untouched.
    """
    global _use_cache, _cache_prefix

    if not exists(pth):
        return

    _use_cache, _cache_prefix = pth, prefix
def _build_unicode_property_table(unicode_range):
    """
    Build property table for unicode range.

    `unicode_range` holds the arguments handed to `range()`, e.g.
    `(start, stop)`; note that `stop` itself is excluded.

    Returns a two level dict: the first letter of the general category maps
    to a dict keyed by the second letter, whose values are strings made of
    every character of the range in that category, in code point order.
    Code points the interpreter cannot turn into a character are skipped.
    """
    table = {}
    for i in range(*unicode_range):
        try:
            c = uchr(i)
        except (ValueError, OverflowError):
            # Code point not representable on this interpreter (out of
            # range, or above the BMP on a narrow Python 2 build). Only
            # these errors are ignored so Ctrl-C still interrupts the scan.
            continue
        major, minor = unicodedata.category(c)
        table.setdefault(major, {}).setdefault(minor, []).append(c)
    # Join as one string
    for minors in table.values():
        for minor in list(minors):
            minors[minor] = ''.join(minors[minor])
    return table
def _build_unicode_key_pattern():
    """
    Build regex key pattern

    The returned pattern finds `p{Xx}` / `p{X}` property names that are
    directly preceded by a backslash. The backslash is checked with a
    lookbehind, so a match starts at the `p` and group 1 holds the category
    name. The category letters come from the keys of `_unicode_properties`.
    """
    unicode_keys = []
    for major in sorted(_unicode_properties):
        minors = "|".join(sorted(_unicode_properties[major].keys()))
        # The sub-category letter is optional so the one-letter form
        # (e.g. \p{L}) is recognised as well.
        unicode_keys.append("%s(?:%s)?" % (major, minors))
    # A bare "\p" in a regex is an invalid escape on current Python versions
    # (and merely matched a plain "p" on old ones), so the backslash is
    # required explicitly instead.
    unicode_prop = r"(?<=\\)p\{(%s)\}"
    return re.compile(unicode_prop % "|".join(unicode_keys), re.UNICODE)
def _discard_cache_file(path):
    """
    Remove a cache file if present; failing to remove it is not fatal.
    """
    try:
        if exists(path):
            unlink(path)
    except OSError:
        # The cache is best effort only: a file that cannot be deleted
        # must not prevent the tables from being used.
        pass


def _init_unicode():
    """
    Prepare unicode property tables and key pattern

    When a cache directory is configured the tables are restored from
    "<prefix>_unicode_properties.cache" inside it. If that file is missing
    or cannot be read, the tables are rebuilt and written back; a cache
    file that fails to load or to save is deleted.
    """
    global _loaded
    global _unicode_properties
    global _unicode_key_pattern

    cache_file = None
    if _use_cache is not None:
        cache_file = join(_use_cache, "%s_unicode_properties.cache" % _cache_prefix)

    restored = False
    if cache_file is not None and exists(cache_file):
        try:
            # NOTE(security): unpickling can execute arbitrary code, so the
            # cache directory must only be writable by trusted users.
            with open(cache_file, 'rb') as f:
                key_pattern = pickle.load(f)
                properties = pickle.load(f)
        except Exception:
            # Unpickling a damaged or incompatible file can raise almost
            # anything; in every case drop the file and rebuild below.
            _discard_cache_file(cache_file)
        else:
            _unicode_key_pattern = key_pattern
            _unicode_properties = properties
            restored = True

    if not restored:
        # range() excludes its end, so 0x110000 is needed to cover U+10FFFF.
        _unicode_properties = _build_unicode_property_table((0x0000, 0x110000))
        # Reads the table assigned on the previous line.
        _unicode_key_pattern = _build_unicode_key_pattern()
        if cache_file is not None:
            try:
                with open(cache_file, 'wb') as f:
                    pickle.dump(_unicode_key_pattern, f)
                    pickle.dump(_unicode_properties, f)
            except Exception:
                # Never leave a partially written cache behind.
                _discard_cache_file(cache_file)

    _loaded = True
def find_char_groups(s):
    """
    Find character groups

    Scans the regex source `s` for `[...]` sets and returns a list of
    `(open_index, close_index)` tuples, one per set, in order of appearance.
    Brackets preceded by an escaping backslash are ignored, and a set that
    is never closed is not reported.
    """
    groups = []
    escaped = False
    first = None  # index of the "[" of the set being scanned, if any
    for pos, c in enumerate(s):
        if c == "\\":
            escaped = not escaped
        elif escaped:
            escaped = False
        elif first is None:
            if c == "[":
                first = pos
        elif c == "]":
            # A "]" placed first in a set (optionally after the negating
            # "^") is a literal member of the set, not its terminator.
            if pos == first + 1 or (pos == first + 2 and s[first + 1] == "^"):
                continue
            groups.append((first, pos))
            # Close the set so the next "[" starts a new group instead of
            # every later "]" being paired with the first "[".
            first = None
    return groups
def get_unicode_category(prop):
    """
    Retrieve the unicode category from the table

    `prop` is a category name: one letter (e.g. "L") selects every
    sub-category stored under that letter, two letters (e.g. "Ll") select a
    single sub-category. Only the first two characters are looked at.

    Returns the characters of the category as one string, or None when
    `prop` is empty or names a category that is not in the table.
    """
    # Allow direct use before any pattern has been parsed.
    if not _loaded:
        _init_unicode()
    if not prop:
        return None
    minors = _unicode_properties.get(prop[0])
    if minors is None:
        return None
    if len(prop) == 1:
        return ''.join(minors.values())
    return minors.get(prop[1])
def parse_unicode_properties(re_pattern):
    """
    Replaces regex property notation with unicode values

    Every `\\p{..}` reference found by the key pattern is replaced with the
    characters of that category. Outside of a `[...]` set the characters are
    wrapped in their own set; inside an existing set they are inserted as
    is. Characters that are special inside a set are backslash-escaped.
    References introduced by an escaped backslash (`\\\\p{..}`) and unknown
    categories are left untouched. Returns the expanded pattern string.
    """
    # Init unicode table if it has not already been initialized
    if not _loaded:
        _init_unicode()
    char_groups = find_char_groups(re_pattern)
    ure_pattern = re_pattern
    # Work backwards so the offsets of earlier matches stay valid while the
    # tail of the pattern is being rewritten.
    for p in reversed(list(_unicode_key_pattern.finditer(re_pattern))):
        # The match starts at the "p"; the introducing backslash precedes it.
        start = p.start(0) - 1
        # An odd run of backslashes means the last one escapes the "p"; an
        # even run (including none) means this is not a property reference.
        slashes = 0
        while start - slashes >= 0 and re_pattern[start - slashes] == "\\":
            slashes += 1
        if slashes % 2 == 0:
            continue
        v = get_unicode_category(p.group(1))
        if v is None:
            continue
        # Categories contain "\", "[", "]", "^" and "-", which would change
        # the meaning of the generated set if inserted unescaped.
        v = re.sub(r"([\\\[\]^-])", r"\\\1", v)
        inside_group = any(
            p.start(0) >= g[0] and p.end(0) <= g[1] for g in char_groups
        )
        if not inside_group:
            v = "[" + v + "]"
        ure_pattern = ure_pattern[:start] + v + ure_pattern[p.end(0):]
    return ure_pattern
def compile(pattern, flags=0):
    """
    Expand the unicode properties of `pattern`, then compile the result
    with re.UNICODE added to `flags`.
    """
    expanded = parse_unicode_properties(pattern)
    return re.compile(expanded, flags | re.UNICODE)
def search(pattern, string, flags=0):
    """
    search after parsing unicode properties and set flag to unicode

    Returns whatever `re.search` returns for the expanded pattern: a match
    object, or None when nothing matches.
    """
    # The result must be handed back; without `return` callers always got None.
    return re.search(parse_unicode_properties(pattern), string, flags | re.UNICODE)
def match(pattern, string, flags=0):
    """
    match after parsing unicode properties and set flag to unicode

    Returns whatever `re.match` returns for the expanded pattern: a match
    object, or None when the start of `string` does not match.
    """
    # The result must be handed back; without `return` callers always got None.
    return re.match(parse_unicode_properties(pattern), string, flags | re.UNICODE)
def split(pattern, string, maxsplit=0, flags=0):
    """
    split after parsing unicode properties and set flag to unicode

    Returns the list produced by `re.split` for the expanded pattern.
    """
    # Return the result (it was previously discarded) and pass the optional
    # arguments by keyword, as passing them positionally is deprecated.
    return re.split(
        parse_unicode_properties(pattern),
        string,
        maxsplit=maxsplit,
        flags=flags | re.UNICODE
    )
def findall(pattern, string, flags=0):
    """
    findall after parsing unicode properties and set flag to unicode

    Returns the list produced by `re.findall` for the expanded pattern.
    """
    # The result must be handed back; without `return` callers always got None.
    return re.findall(parse_unicode_properties(pattern), string, flags | re.UNICODE)
def finditer(pattern, string, flags=0):
    """
    finditer after parsing unicode properties and set flag to unicode

    Returns the iterator of match objects produced by `re.finditer` for the
    expanded pattern.
    """
    # The result must be handed back; without `return` callers always got None.
    return re.finditer(parse_unicode_properties(pattern), string, flags | re.UNICODE)
def sub(pattern, repl, string, count=0, flags=0):
    """
    sub after parsing unicode properties and set flag to unicode

    Returns the string produced by `re.sub` for the expanded pattern.
    """
    # Return the result (it was previously discarded) and pass the optional
    # arguments by keyword, as passing them positionally is deprecated.
    return re.sub(
        parse_unicode_properties(pattern),
        repl,
        string,
        count=count,
        flags=flags | re.UNICODE
    )
def subn(pattern, repl, string, count=0, flags=0):
    """
    subn after parsing unicode properties and set flag to unicode

    Returns the `(new_string, number_of_substitutions)` tuple produced by
    `re.subn` for the expanded pattern.
    """
    # Previously the result was discarded, `count` was ignored and the flags
    # value was passed in the position of `count`; keywords keep each value
    # in its own parameter.
    return re.subn(
        parse_unicode_properties(pattern),
        repl,
        string,
        count=count,
        flags=flags | re.UNICODE
    )
# _init_unicode()
# Manual smoke test: cache next to this file and print one expanded property.
if __name__ == "__main__":
    from os.path import dirname, abspath

    print(__file__)
    _script_dir = dirname(abspath(__file__))
    set_cache_directory(_script_dir, "test")
    print("Testing ure's unicode properties replacement")
    _expanded = parse_unicode_properties(r"\p{Ll}")
    print(_expanded)