-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy patheg_url.py
58 lines (44 loc) · 1.8 KB
/
eg_url.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
"""
Based on https://www.w3.org/Addressing/URL/5_BNF.html
because I'm a lazy bastard; it's clearly not up to date
(as shown by 'right=wrong' below).
"""
from parson import Grammar
# Parson grammar text for the two URL forms this example handles: 'http'
# addresses and 'mailto' addresses (see the `url` rule).
#
# Reading notes:
#   * It is a raw string, so the backslashes in the /\d/-style regex
#     literals reach Parson unchanged.
#   * Each {...} group is followed by a :'label' string; the expected outputs
#     in the examples at the bottom of the file show results as alternating
#     captured-text / label items, e.g. ('http', 'protocol', ...).
#   * The first rule is `url :end`; presumably `:end` only matches at end of
#     input, so trailing junk makes the parse fail -- confirm in Parson docs.
#   * None of the character classes feeding `xalpha` contains '=', which is
#     why the module docstring notes that 'right=wrong' is not accepted as a
#     search string.
#   * NOTE(review): `safe` already contains '+', so the extra '+' alternative
#     in `xpalpha` looks redundant -- confirm before relying on it.
grammar = r""" url :end.
url : httpaddress | mailtoaddress.
mailtoaddress : {'mailto'} ':' :'protocol'
{(!'@' xalpha)+} :'user'
'@' {hostname} :'host'.
httpaddress : {'http'} '://' :'protocol' hostport ('/' path)? ('?' search)? ('#' fragment)?.
hostport : host (':' port)?.
host : {hostname | hostnumber} :'host'.
hostname : ialpha ++ '.'.
hostnumber : digits '.' digits '.' digits '.' digits.
port : {digits} :'port'.
path : {(segment '/')* segment?} :'path'.
segment : xpalpha+.
search : {(xalpha+) ++ '+'} :'search'.
fragment : {xalpha+} :'fragment'.
xalpha : alpha | digit | safe | extra | escape.
xpalpha : xalpha | '+'.
ialpha : alpha xalpha*.
alpha : /[a-zA-Z]/.
digit : /\d/.
digits : /\d+/.
safe : /[$_@.&+-]/.
extra : /[!*"'(),]/.
escape : '%' hex hex.
hex : /[\dA-Fa-f]/.
"""
# Build the parser: Grammar(grammar) is called, and its result is immediately
# called with no arguments. The examples below use the resulting `g` both as
# g(text) and as g.attempt(text).
# NOTE(review): presumably the empty call means "no extra semantic actions"
# -- confirm against the Parson API.
g = Grammar(grammar)()
## g.attempt('true')
## g('mailto:coyote@acme.com')
#. ('mailto', 'protocol', 'coyote', 'user', 'acme.com', 'host')
## g('http://google.com')
#. ('http', 'protocol', 'google.com', 'host')
## g.attempt('http://google.com//')
## g('http://en.wikipedia.org/wiki/Uniform_resource_locator')
#. ('http', 'protocol', 'en.wikipedia.org', 'host', 'wiki/Uniform_resource_locator', 'path')
## g.attempt('http://wry.me/fun/toys/yes.html?right=wrong#fraggle')
## g( 'http://wry.me/fun/toys/yes.html?rightwrong#fraggle')
#. ('http', 'protocol', 'wry.me', 'host', 'fun/toys/yes.html', 'path', 'rightwrong', 'search', 'fraggle', 'fragment')