-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathutils.py
84 lines (71 loc) · 2.69 KB
/
utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
#-*- coding:utf8 -*-
import sys, re
import cgi
def thai2arabic(number):
    """Return *number* with every Thai digit replaced by its Arabic digit.

    Characters that are not Thai digits are copied through unchanged, so the
    input may be any text that merely contains digits.

    Args:
        number: a text string (``unicode`` on Python 2, ``str`` on Python 3).

    Returns:
        A new text string of the same length as *number*.

    Raises:
        AssertionError: if *number* is not a text string.
    """
    try:
        text_type = unicode  # noqa: F821 -- Python 2 text type
    except NameError:
        text_type = str  # Python 3: every str is already unicode text

    if not isinstance(number, text_type):
        # Raised explicitly instead of via ``assert`` so the check survives
        # ``python -O``; the one-element tuple keeps the message formatting
        # valid even when the offending argument is itself a tuple.
        raise AssertionError("%r is not unicode" % (number,))

    d_tha = u'๐๑๒๓๔๕๖๗๘๙'
    d_arb = u'0123456789'
    # One translation table lookup per character instead of a linear
    # ``find`` plus string concatenation for each one.
    return number.translate(dict(zip(map(ord, d_tha), d_arb)))
def arabic2thai(number):
    """Return *number* with every Arabic digit replaced by its Thai digit.

    Characters that are not ASCII digits ``0``-``9`` are copied through
    unchanged, so the input may be any text that merely contains digits.

    Args:
        number: a text string (``unicode`` on Python 2, ``str`` on Python 3).

    Returns:
        A new text string of the same length as *number*.

    Raises:
        AssertionError: if *number* is not a text string.
    """
    try:
        text_type = unicode  # noqa: F821 -- Python 2 text type
    except NameError:
        text_type = str  # Python 3: every str is already unicode text

    if not isinstance(number, text_type):
        # Raised explicitly instead of via ``assert`` so the check survives
        # ``python -O``; the one-element tuple keeps the message formatting
        # valid even when the offending argument is itself a tuple.
        raise AssertionError("%r is not unicode" % (number,))

    d_tha = u'๐๑๒๓๔๕๖๗๘๙'
    d_arb = u'0123456789'
    # One translation table lookup per character instead of a linear
    # ``find`` plus string concatenation for each one.
    return number.translate(dict(zip(map(ord, d_arb), d_tha)))
def parse(data):
    """Extract the body text and heading fields from one HTML page.

    The page is expected to contain, in this order of use:

    * ``<FONT class=head1>...</font>`` -- a heading of at least two
      CRLF-separated lines.  On the first line the whitespace-separated
      words at index 2, 3 and 5 are read as volume number, book name and
      sub-volume number; the second line is the scroll name.
    * ``<TABLE width=90`` followed by ``<DIV class=e>...</DIV>`` -- the block
      holding the text (between ``<pre>`` and ``</center><br>``) and an
      underlined ``<u>...</u>`` metadata line.

    Args:
        data: the page source, either UTF-8 encoded bytes or already
            decoded text.

    Returns:
        dict with text values under the keys ``content``, ``volumn``,
        ``book_name``, ``sub_volumn``, ``scroll_name``, ``lines`` and
        ``pages``.  Numeric fields have Thai digits converted to Arabic.

    Raises:
        IndexError: if an expected element is missing from the page.
    """
    # Decode once at the boundary.  ``thai2arabic`` only accepts text, so
    # slicing fields out of raw bytes and decoding at the very end (as this
    # function used to) could never work for the numeric fields.
    if isinstance(data, bytes):
        data = data.decode('utf8', 'ignore')

    title = re.findall(u'<FONT class=head1>(.+?)</font>', data, re.S)[0]

    table_start = data.find(u'<TABLE width=90')
    body = re.findall(u'<DIV class=e>(.+?)</DIV>',
                      data[table_start:], re.S)[0]

    # Anything other than exactly one text section is treated as "no content".
    sections = re.findall(u'<pre>(.+?)</center><br>', body, re.S)
    content = sections[0] if len(sections) == 1 else u''
    # Strip <a>, <center> and <u> tags (opening and closing).  The previous
    # pattern used the character class ``[a|center|u]``, which matched any tag
    # whose name merely starts with one of those letters (e.g. <table>, <tr>).
    content = re.sub(u'</?(?:a|center|u)\\b[^>]*>', u'', content).strip()

    meta = re.findall(u'<u>(.+?)</u>', body, re.S)[0].strip()

    title_lines = title.strip().split(u'\r\n')
    words = title_lines[0].split()
    volumn = thai2arabic(words[2])
    book_name = words[3]
    sub_volumn = thai2arabic(words[5])
    scroll_name = title_lines[1]

    # The Thai labels below read roughly "line no." and "page no."; the value
    # runs up to the next full stop.
    lines = thai2arabic(
        re.findall(u'บรรทัดที่ (.+?)\\.', meta, re.S)[0].strip())
    pages = thai2arabic(
        re.findall(u'หน้าที่ (.+?)\\.', meta, re.S)[0].strip())

    return dict(content=content, volumn=volumn, book_name=book_name,
                sub_volumn=sub_volumn, scroll_name=scroll_name,
                lines=lines, pages=pages)
# Tokenizer for plaintext2html.  Alternatives are tried in this order:
#   htmlchars - one of the HTML-special characters < & >
#   space     - a run of spaces/tabs at the start of a line
#   lineend   - one line break (CRLF, CR or LF)
#   protocal  - optional leading whitespace (group 5) plus an http/ftp URL
#               (group 6), followed by the terminator (group 8).
# The terminator tries '\r\n' first so that a CRLF after a URL is consumed as
# a single line break instead of '\r' here and '\n' by ``lineend``.
re_string = re.compile(
    r'(?P<htmlchars>[<&>])'
    r'|(?P<space>^[ \t]+)'
    r'|(?P<lineend>\r\n|\r|\n)'
    r'|(?P<protocal>(^|\s)((http|ftp)://.*?))(\r\n|\s|$)',
    re.S | re.M | re.I)


def _escape_html(text):
    """Escape ``&``, ``<``, ``>`` and ``"`` for use in HTML text/attributes."""
    # '&' must be replaced first so the entities added below stay intact.
    return (text.replace('&', '&amp;')
                .replace('<', '&lt;')
                .replace('>', '&gt;')
                .replace('"', '&quot;'))


def plaintext2html(text, tabstop=4):
    """Convert plain text to an HTML fragment.

    * ``<``, ``&`` and ``>`` are replaced by their entities.
    * Every line break becomes ``<br>``.
    * Leading indentation is preserved as ``&nbsp;`` (tabs count as
      *tabstop* spaces) so browsers do not collapse it.
    * ``http://`` / ``ftp://`` URLs that start a line or follow whitespace
      become ``<a href=...>`` links.

    Args:
        text: the plain text to convert.
        tabstop: number of non-breaking spaces substituted for a leading tab.

    Returns:
        The HTML fragment as a string.
    """
    def do_sub(m):
        if m.group('htmlchars'):
            return _escape_html(m.group('htmlchars'))
        if m.group('lineend'):
            return '<br>'
        if m.group('space'):
            indent = m.group('space').replace('\t', '&nbsp;' * tabstop)
            return indent.replace(' ', '&nbsp;')

        # URL branch.  Read the pieces from their own groups so any kind of
        # leading whitespace (not only a plain space) stays outside the link.
        prefix, url, last = m.group(5), m.group(6), m.group(8)
        if last in ('\n', '\r', '\r\n'):
            last = '<br>'
        # The URL comes straight from the input text: escape it before it is
        # placed inside the attribute and the link text.
        url = _escape_html(url)
        return '%s<a href="%s">%s</a>%s' % (prefix, url, url, last)

    return re_string.sub(do_sub, text)