-
Notifications
You must be signed in to change notification settings - Fork 24
/
Copy pathrfc2html.py
415 lines (341 loc) · 23 KB
/
rfc2html.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
# Copyright 2017-2021 Henrik Levkowtez Killi and the IETF Trust, All Rights Reserved
# -*- indent-with-tabs: 0 -*-
# pylint: disable=C0301,C0103,C0303,C0209,R1705
from __future__ import unicode_literals, print_function, division
import re
try:
from html import escape
except ImportError:
from cgi import escape
import urllib
__version__ = '2.0.2'

# Code point of U+FEFF (byte order mark); markup() strips it from the input.
BOM_CODE = 65279


def markup(text, path=".", script="", extra="", name=None):
    """Convert the plain text of an RFC or Internet-Draft into HTML.

    The text is normalized, HTML-escaped, wrapped in ``<pre>`` and then run
    through an ordered series of regex substitutions that add links and
    anchors.  Later substitutions rely on the markup produced by earlier
    ones, so the order of the steps matters.

    Args:
        text: The document text.  An empty string is accepted.
        path: When truthy, generated ``<script>?rfc=N`` / ``?bcp=N`` /
            ``?std=N`` / ``?draft=NAME`` links are rewritten at the end to
            ``<path>/rfcN`` ... ``<path>/NAME``.  The rewrite only matches
            links in which the parameter directly follows the ``?``, i.e.
            when ``extra`` is empty.
        script: Text placed before the ``?`` in generated document links.
        extra: Text placed between the ``?`` and the ``rfc=`` / ``bcp=`` /
            ``draft=`` parameter in generated links.
        name: Document name; only used to apply a character fix for one
            specific draft.

    Returns:
        The marked-up document as a string starting with ``<pre>``.
    """
    # ``import urllib`` alone does not guarantee that the ``urllib.parse``
    # submodule is loaded, so bind quote_plus explicitly (with a fallback
    # for Python 2, which the module still supports).
    try:
        from urllib.parse import quote_plus
    except ImportError:
        from urllib import quote_plus

    # ------------------------------------------------------------------------
    # Start of markup handling

    # Strip BOM if present (guarded so that empty input does not raise)
    if text and ord(text[0]) == BOM_CODE:
        text = text[1:]

    # Convert \r which is not followed or preceded by a \n to \n
    # (in case this is a mac document)
    text = re.sub(r"([^\n])\r([^\n])", r"\g<1>\n\g<2>", text)

    # Strip control characters with the exception of \t, \b and \n (we'll
    # deal with \b later).  The stripped range also contains \r and \f; page
    # breaks are re-inserted after "[Page nn]" footers further down.
    text = re.sub(r'[\x00-\x07\x0b-\x1f]', '', text)

    # -------------
    # Normalization

    # Remove whitespace at the end of lines
    text = re.sub(r"[\t ]+\n", "\n", text)

    # Remove whitespace (including formfeeds) at the end of the document.
    # (Trailing formfeeds will result in trailing blank pages.)
    text = re.sub(r"[\t \r\n\f]+$", "\n", text)

    text = text.expandtabs()

    # Remove extra blank lines at the start of the document
    text = re.sub(r"^\n*", "", text, count=1)

    # Fix up page breaks:
    # \f should always be preceded and followed by \n
    text = re.sub(r"([^\n])\f", r"\g<1>\n\f", text)
    text = re.sub(r"\f([^\n])", r"\f\n\g<1>", text)

    # Limit the number of blank lines after page break
    text = re.sub(r"\f\n+", "\f\n", text)

    # [Page nn] should be followed by \n\f\n
    text = re.sub(r"(?i)(\[Page [0-9ivxlc]+\])[\n\f\t ]*(\n *[^\n\f\t ])", r"\g<1>\n\f\g<2>", text)

    # Normalize indentation: remove the smallest indentation found on any
    # non-blank line (capped at 72) from every line that follows a newline.
    # It is added back at the very end.
    linestarts = re.findall(r"(?m)^([ ]*)\S", text)
    prefixlen = min([72] + [len(start) for start in linestarts])
    if prefixlen:
        text = re.sub(r"\n"+(" "*prefixlen), "\n", text)

    # reference name tag markup
    # Maps a reference tag to the (escaped) title used as link tooltip.
    reference = {}

    ## Locate the References section: it starts at the first heading that
    ## names references and stops at the next numbered/appendix heading that
    ## does not mention references.
    ref_beg = re.search(r"(?im)^(\d+(\.\d+)*)(\.?[ ]+)(References?|Normative References?|Informative References?)", text)
    ref_text = text[ref_beg.end():] if ref_beg else text
    ref_stop_list = re.findall(r'(?im)^(?:Appendix\s+[A-Z]\.|[0-9]+\.)(?:[0-9.]+)?\s+\S.+$', ref_text)
    ref_stop_text = [t for t in ref_stop_list if 'reference' not in t.lower()][:1]
    if ref_stop_text:
        ref_end = ref_text.index(ref_stop_text[0])
        ref_text = ref_text[:ref_end]
    ref_defs = re.findall(r"(?sm)^( *\n *)\[([-\w.]+?)\]( +)(.*?)(\n *)$", ref_text)
    for ref_def in ref_defs:
        ref_tag, ref_body = ref_def[1], ref_def[3]
        # The title is either the first quoted string, or the text between
        # the first two commas.
        title_match = re.search(r'(?sm)^(.*?("[^"]+?").+?|.*?(,[^,]+?,)[^,]+?)$', ref_body)
        if title_match:
            reftitle = title_match.group(2) or title_match.group(3).strip("[ ,]+")
            # Get rid of page break information inside the title
            reftitle = re.sub(r"(?s)\n\n\S+.*\n\n", "", reftitle)
            # The title ends up inside a title="..." attribute, so quotes
            # must be escaped here.
            reftitle = escape(reftitle, quote=True)
            reftitle = re.sub(r"[\n\t ]+", " ", reftitle) # Remove newlines and tabs
            # Titles mentioning page/section/appendix are replaced by an
            # empty tooltip.
            reference[ref_tag] = reftitle if not re.search(r'(?i)(page|section|appendix)[- ]', reftitle) else ''

    # -------------
    # escape any html significant characters
    # quote=False keeps '"' literal: the URL and Work-in-Progress patterns
    # below look for a literal double quote.  This is what cgi.escape() did
    # by default; html.escape() would otherwise turn quotes into entities.
    text = escape(text, quote=False)

    # -------------
    # Adding markup

    text = "<pre>"+text+"</pre>"

    # Typewriter-style underline:
    text = re.sub(r"_[\b](.)", r"<u>\g<1></u>", text)

    # Strip remaining instances of \b
    text = text.replace('\b', '')

    # Document-specific fixes
    if name and name == "draft-ietf-dnsop-interim-signed-root-01":
        text = text.replace("F\x84ltstr\xF7m", "F\u00e4ltstr\u00f6m")
        text = text.replace("Ihr\x89n", "Ihr\u00e9n")

    # Obsoletes: ... markup
    def rfclist_replace(keyword, text):
        """Link the RFC numbers of the first '<keyword>: ...' header line."""
        def replacement(match):
            group = list(match.groups(""))
            # group[3]: numbers on the keyword line; group[8]: numbers on
            # the continuation line, if any.
            group[3] = re.sub(r"\d+", r'<a href="%s?%srfc=\g<0>">\g<0></a>' % (script, extra), group[3])
            if group[8]:
                group[8] = re.sub(r"\d+", r'<a href="%s?%srfc=\g<0>">\g<0></a>' % (script, extra), group[8])
            else:
                group[8] = ""
            return "\n%s%s%s\n%s%s" % (group[0], group[3], group[5], group[7], group[8])
        text = re.sub(r"\n(%s( RFCs| RFC)?: ?( RFCs| RFC)?)(( \d+,| \d+)+)(.*)\n(( *)((\d+, )*(\d+)))*" % keyword, replacement, text, count=1)
        return text

    text = rfclist_replace("Obsoletes", text)
    text = rfclist_replace("Updates", text)

    # Title markup is only attempted within the first 28 lines.
    lines = text.splitlines(True)
    head = "".join(lines[:28])
    rest = "".join(lines[28:])

    # title markup
    head = re.sub(r'(?im)(([12][0-9][0-9][0-9]|^Obsoletes.*|^Category: (Standards Track|Informational|Experimental|Best Current Practice)) *\n\n+ +)([A-Z][^\n]+)$', r'\g<1><h1>\g<4></h1>', head, count=1)
    # Up to two further title lines directly below the first one
    for _ in range(2):
        head = re.sub(r'(?i)(<h1.+</h1>)(\n +)([^<\n]+)\n', r'\g<1>\g<2><h1>\g<3></h1>\n', head, count=1)

    text = head + rest

    # http link markup

    # link crossing a line.  Not permitting ":" after the line break will
    # result in some URLs broken across lines not being recognized, but
    # will on the other hand correctly handle a series of URL listed line
    # by line, one on each line.
    #
    # The document text is already escaped here, so URLs written as <URL>
    # appear as &lt;URL&gt;; the bracketed patterns accept both spellings.

    # Link crossing a line, where the continuation contains '.' or '/'
    text = re.sub(r'(?im)(\s|^|[^=]"|\()((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?)(\n +)([A-Za-z0-9_./@%&?#~=-]+[./][A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])([.,)"\s]|$)',
                  r'\g<1><a href="\g<2>\g<6>">\g<2></a>\g<5><a href="\g<2>\g<6>">\g<6></a>\g<7>', text)
    text = re.sub(r"(?im)(&lt;|<)((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?)(\n +)([A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])(&gt;|>)",
                  r'\g<1><a href="\g<2>\g<6>">\g<2></a>\g<5><a href="\g<2>\g<6>">\g<6></a>\g<7>', text)

    # Link crossing a line, where first line ends in '-' or '/'
    text = re.sub(r'(?im)(\s|^|[^=]"|\()((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?[-/])(\n +)([A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])([.,)"\s]|$)',
                  r'\g<1><a href="\g<2>\g<6>">\g<2></a>\g<5><a href="\g<2>\g<6>">\g<6></a>\g<7>', text)

    # link crossing a line, enclosed in "<" ... ">"
    # NOTE(review): this pattern (and the two-line variant below) anchors on
    # a bare '<', which cannot directly precede a URL once the text has been
    # escaped, and its template repeats \g<1>.  Left byte-identical.
    text = re.sub(r"(?im)<((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?)(\n +)([A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])>",
                  r'<\g<1><a href="\g<1>\g<5>">\g<1></a>\g<4><a href="\g<1>\g<5>">\g<5></a>>', text)
    # Same, but the continuation may contain ';' (from escaped '&')
    text = re.sub(r"(?im)(&lt;|<)((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?)(\n +)([A-Za-z0-9_./@%&;?#~=-]+[A-Za-z0-9_/@%&;?#~=-])(&gt;|>)",
                  r'\g<1><a href="\g<2>\g<6>">\g<2></a>\g<5><a href="\g<2>\g<6>">\g<6></a>\g<7>', text)

    # link crossing two lines, enclosed in "<" ... ">"
    text = re.sub(r"(?im)<((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?)(\n +)([A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])(\n +)([A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])>",
                  r'<\g<1><a href="\g<1>\g<5>\g<7>">\g<1></a>\g<4><a href="\g<1>\g<5>\g<7>">\g<5></a>\g<6><a href="\g<1>\g<5>\g<7>">\g<7></a>>', text)
    text = re.sub(r"(?im)(&lt;|<)((http|https|ftp)://([:A-Za-z0-9_./@%&?#~=-]+)?)(\n +)([A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])(\n +)([A-Za-z0-9_./@%&;?#~=-]+[A-Za-z0-9_/@%&;?#~=-])(&gt;|>)",
                  r'\g<1><a href="\g<2>\g<6>\g<8>">\g<2></a>\g<5><a href="\g<2>\g<6>\g<8>">\g<6></a>\g<7><a href="\g<2>\g<6>\g<8>">\g<8></a>\g<9>', text)

    # link on a single line
    text = re.sub(r'(?im)(\s|^|[^=]"|&lt;|<|\()((http|https|ftp)://[:A-Za-z0-9_./@%&?#~=-]+[A-Za-z0-9_/@%&?#~=-])([.,)"\s]|&gt;|>|$)',
                  r'\g<1><a href="\g<2>">\g<2></a>\g<4>', text)

    # undo markup if RFC2606 domain
    text = re.sub(r'(?i)<a href="[a-z]*?://([a-z0-9_-]+?\.)?example(\.(com|org|net))?(/.*?)?">(.*?)</a>', r"\g<5>", text)

    # draft markup
    # draft name crossing line break
    text = re.sub(r"([^/#=\?\w-])(draft-([-a-zA-Z0-9]+-)?)((?: {3,}\S.*)?\n +)([-a-zA-Z0-9]+[a-zA-Z0-9](.txt)?)",
                  r'\g<1><a href="%s?%sdraft=\g<2>\g<5>">\g<2></a>\g<4><a href="%s?%sdraft=\g<2>\g<5>">\g<5></a>' % (script, extra, script, extra), text)
    # draft name on one line (but don't mess with what we just did above)
    text = re.sub(r"([^/#=\?\w>=-])(draft-[-a-zA-Z0-9]+[a-zA-Z0-9](.txt)?)",
                  r'\g<1><a href="%s?%sdraft=\g<2>">\g<2></a>' % (script, extra), text)

    # rfc markup
    # rfc and number on the same line
    text = re.sub(r'(?i)([^[/>\w-])(rfc([- ]?))([0-9]+)(\W)',
                  r'\g<1><a href="%s?%srfc=\g<4>">\g<2>\g<4></a>\g<5>' % (script, extra), text)
    # rfc and number on separate lines
    text = re.sub(r"(?i)([^[/>\w-])(rfc([-]?))(\n +)([0-9]+)(\W)",
                  r'\g<1><a href="%s?%srfc=\g<5>">\g<2></a>\g<4><a href="%s?%srfc=\g<5>">\g<5></a>\g<6>' % (script, extra, script, extra), text)
    # spelled out Request For Comments markup
    text = re.sub(r"(?i)(\s)(Request\s+For\s+Comments\s+\([^)]+\)\s+)([0-9]+)",
                  r'\g<1>\g<2><a href="%s?%srfc=\g<3>">\g<3></a>' % (script, extra), text)
    # bcp markup
    text = re.sub(r"(?i)([^[/>\w.-])(bcp([- ]?))([0-9]+)(\W)",
                  r'\g<1><a href="%s?%sbcp=\g<4>">\g<2>\g<4></a>\g<5>' % (script, extra), text)
    text = re.sub(r"(?i)([^[/>\w.-])(bcp([-]?))(\n +)([0-9]+)(\W)",
                  r'\g<1><a href="%s?%sbcp=\g<5>">\g<2></a>\g<4><a href="%s?%sbcp=\g<5>">\g<5></a>\g<6>' % (script, extra, script, extra), text)

    def workinprogress_replacement(match):
        """Anchor a Work-in-Progress reference and link its title to a search."""
        g1, g2, g3, g4, g5 = match.groups()
        # eliminate embedded hyperlinks in text we'll use as anchor text
        g4 = re.sub(r"<a.+?>(.+?)</a>", r"\g<1>", g4)
        g4url = quote_plus(g4)
        return '%s[<a id="ref-%s">%s</a>]%s<a style="text-decoration: none" href="https://www.google.com/search?sitesearch=datatracker.ietf.org%%2Fdoc%%2Fhtml%%2F&q=inurl:draft-+%s">%s</a>%s' % (g1, g2, g2, g3, g4url, g4, g5)

    # Reference definitions: "[tag]" at the start of a paragraph gets an
    # id="ref-<tag>" anchor.
    text = re.sub(r'(\n *\n *)\[([-\w.]+)\](\s+.*?)(".+")(,\s+Work\s+in\s+Progress.)', workinprogress_replacement, text)
    text = re.sub(r'(\n *\n *)\[([-\w.]+)\](\s)', r'\g<1>[<a id="ref-\g<2>">\g<2></a>]\g<3>', text)
    text = re.sub(r"(\n *\n *)\[(RFC [-\w.]+)\](\s)", r'\g<1>[<a id="ref-\g<2>">\g<2></a>]\g<3>', text)

    # Only used for membership tests below
    ref_targets = set(re.findall(r'<a id="ref-(.*?)"', text))

    # reference link markup
    def reference_replacement(match):
        """Link one reference tag inside a "[tag, tag, ...]" citation."""
        pre = match.group(1)
        beg = match.group(2)
        tag = match.group(3)
        end = match.group(4)
        isrfc = re.match(r"(?i)^rfc[ -]?([0-9]+)$", tag)
        if isrfc:
            # RFC tags link to the RFC itself rather than to the reference
            rfcnum = isrfc.group(1)
            if tag in reference:
                return '%s%s<a href="%s?%srfc=%s" title="%s">%s</a>%s' % (pre, beg, script, extra, rfcnum, reference[tag], tag, end)
            return '%s%s<a href="%s?%srfc=%s">%s</a>%s' % (pre, beg, script, extra, rfcnum, tag, end)
        if tag not in ref_targets:
            # No matching reference definition: leave the text alone
            return match.group(0)
        if tag in reference:
            return '%s%s<a href="#ref-%s" title="%s">%s</a>%s' % (pre, beg, tag, reference[tag], tag, end)
        return '%s%s<a href="#ref-%s">%s</a>%s' % (pre, beg, tag, tag, end)

    # Groups: 1 = preceding character, 2 = "[" (plus already linked tags),
    #         3 = tag to link, 4 = remaining tags and "]"
    text = re.sub(r"(\W)(\[)([-\w.]+)((, ?[-\w.]+)*\])", reference_replacement, text)
    text = re.sub(r"(\W)(\[)(RFC [0-9]+)((, ?RFC [0-9]+)*\])", reference_replacement, text)

    # Each pass links the next not yet linked tag of a multi-tag citation;
    # repeat until nothing changes.
    while True:
        old = text
        text = re.sub(r"(\W)(\[(?:<a.*?>.*?</a>, ?)+)([-\w.]+)((, ?[-\w.]+)*\])", reference_replacement, text)
        if text == old:
            break
    while True:
        old = text
        text = re.sub(r"(\W)(\[(?:<a.*?>.*?</a>, ?)+)(RFC [-\w.]+)((, ?RFC [-\w.]+)*\])", reference_replacement, text)
        if text == old:
            break

    # greying out the page headers and footers
    text = re.sub(r"\n(.+\[Page \w+\])\n\f\n(.+)\n", r'\n<span class="grey">\g<1></span>\n\f<span class="grey">\g<2></span>\n', text)

    # contents link markup: section links
    # Groups: 1 = indentation, 2 = section number, 4 = separator, 5 = title,
    #         6 = start of dot leader, 7 = rest of line (ends in a digit)
    text = re.sub(r"(?m)^(\s*)(\d+(\.\d+)*)(\.?[ ]+)(.*[^ .])( *\. ?\.)(.*[0-9])$", r'\g<1><a href="#section-\g<2>">\g<2></a>\g<4>\g<5>\g<6>\g<7>', text)
    text = re.sub(r"(?m)^(\s*)(Appendix |)([A-Z](\.\d+)*)(\.?[ ]+)(.*[^ .])( *\. ?\.)(.*[0-9])$", r'\g<1><a href="#appendix-\g<3>">\g<2>\g<3></a>\g<5>\g<6>\g<7>\g<8>', text)

    # page number markup
    multidoc_separator = r"========================================================================"
    if re.search(multidoc_separator, text):
        # Several documents in one text: page anchors get a per-document
        # prefix (page-1-..., page-2-..., ...) to keep ids distinct.
        parts = re.split(multidoc_separator, text)
        for i, part in enumerate(parts):
            page = {"page": "page-%s" % (i+1)}
            part = re.sub(r"(?si)(\f)([^\f]*\[Page (\w+)\])", r'\g<1><span id="%(page)s-\g<3>" ></span>\g<2>' % page, part)
            part = re.sub(r"(?i)(\. ?\. +|\. \. \.|\.\.\. *)([0-9ivxlc]+)( *\n)", r'\g<1><a href="#%(page)s-\g<2>">\g<2></a>\g<3>' % page, part)
            parts[i] = part
        text = multidoc_separator.join(parts)
    else:
        # page name tag markup
        text = re.sub(r"(?si)(\f)([^\f]*\[Page (\w+)\])", r'\g<1><hr class="noprint" id="page-\g<3>">\g<2>', text)
        # contents link markup: page numbers
        text = re.sub(r"(?i)(\. ?\. +|\. \. \.|\.\.\. *)([0-9ivxlc]+)( *\n)", r'\g<1><a href="#page-\g<2>">\g<2></a>\g<3>', text)

    # section number tag markup
    def section_anchor_replacement(match):
        """Turn a numbered heading into an <hN> element with a self link."""
        # exclude TOC entries
        # NOTE(review): the needles are raw strings, so they contain literal
        # backslashes and do not match plain dot leaders.  Left unchanged to
        # preserve existing output.
        mstring = match.group(0)
        if r" \. \. " in mstring or r"\.\.\." in mstring:
            return mstring
        # Heading level follows the depth of the section number, max <h6>
        level = len(re.findall(r"[^\.]+", match.group(1)))+1
        level = min(level, 6)
        html = '<h%s><a class="selflink" id="section-%s" href="#section-%s">%s</a>%s</h%s>' % (level, match.group(1), match.group(1), match.group(1), match.group(3), level)
        # A heading wrapped onto a second line becomes two heading elements
        html = html.replace("\n", '</h%s>\n<h%s>' % (level, level))
        return html

    # The title runs to a double space or to the end of the line (optionally
    # including one indented continuation line).  A single-space alternative
    # here would end every heading after its first word.
    text = re.sub(r"(?im)^(\d+(\.\d+)*)(\.?[ ]+\S.*?(\n +\w+.*)?(  |$))", section_anchor_replacement, text)

    # section number link markup
    text = re.sub(r"(?i)(section\s)(\d+(\.\d+)*)", r'<a href="#section-\g<2>">\g<1>\g<2></a>', text)
    text = re.sub(r"(?i)(section)\n(\s+)(\d+(\.\d+)*)", r'<a href="#section-\g<3>">\g<1></a>\n\g<2><a href="#section-\g<3>">\g<3></a>', text)

    # Special cases for licensing boilerplate
    text = text.replace('<a href="#section-4">Section 4</a>.e of the Trust Legal Provisions',
                        'Section 4.e of the <a href="https://trustee.ietf.org/license-info">Trust Legal Provisions</a>')

    # "Sections 1, 2 and 3": each pass links the next bare number of the list
    while True:
        old = text
        text = re.sub(r"(?i)(sections\s(<a.*?>.*?</a>(,\s|\s?-\s?|\sthrough\s|\sor\s|\sto\s|,?\sand\s))*)(\d+(\.\d+)*)", r'\g<1><a href="#section-\g<4>">\g<4></a>', text)
        if text == old:
            break

    # appendix number tag markup
    def appendix_replacement(match):
        """Turn an appendix heading into an <hN> element with a self link."""
        # exclude TOC entries
        # NOTE(review): same literal-backslash needles as in
        # section_anchor_replacement; left unchanged.
        mstring = match.group(0)
        if r" \. \. " in mstring or r"\.\.\." in mstring:
            return mstring
        txt = match.group(4)
        num = match.group(2).rstrip('.')
        if num != match.group(2):
            # Put back the trailing dot that was stripped from the number
            txt = "." + txt
        level = len(re.findall(r"[^\.]+", num))+1
        level = min(level, 6)
        return '<h%s><a class="selflink" id="appendix-%s" href="#appendix-%s">%s%s</a>%s</h%s>' % (level, num, num, match.group(1), num, txt, level)

    text = re.sub(r"(?m)^(Appendix |)([A-Z](\.|\.\d+)+)(\.?[ ].*)$", appendix_replacement, text)

    # appendix number link markup
    text = re.sub(r" ([Aa]ppendix\s)([A-Z](\.\d+)*)", r' <a href="#appendix-\g<2>">\g<1>\g<2></a>', text)
    text = re.sub(r" ([Aa]ppendix)\n(\s+)([A-Z](\.\d+)*)", r' <a href="#appendix-\g<3>">\g<1></a>\n\g<2><a href="#appendix-\g<3>">\g<3></a>', text)

    # Merge a section/appendix link with an adjacent document link so that
    # the combined link points into the other document (document URL
    # followed by the "#section-..." fragment).
    for n in ['rfc', 'bcp', 'fyi', 'std']:
        # Replacement templates shared by the section and appendix variants
        part_of_doc = r'<a href="%s?%s%s=\g<9>\g<1>">\g<2> \g<3>\g<5>\g<8>\g<9></a>' % (script, extra, n)
        split_part_of_doc = r'<a href="%s?%s%s=\g<10>\g<1>">\g<2></a>\g<3><a href="%s?%s%s=\g<10>\g<1>">\g<4>\g<6>\g<9>\g<10></a>' % (script, extra, n, script, extra, n)
        doc_then_part = r'<a href="%s?%s%s=\g<3>\g<5>">\g<2>\g<3>\g<4>\g<6> \g<7></a>' % (script, extra, n)
        part_of_ref = r'<a href="%s?%s%s=\g<9>\g<1>">\g<2> \g<3>\g<5>[\g<8>\g<9>]</a>' % (script, extra, n)
        split_part_of_ref = r'<a href="%s?%s%s=\g<10>\g<1>">\g<2></a>\g<3><a href="%s?%s%s=\g<10>\g<1>">\g<4>\g<6>[\g<9>\g<10>]</a>' % (script, extra, n, script, extra, n)
        ref_then_part = r'<a href="%s?%s%s=\g<3>\g<5>">[\g<2>\g<3>]\g<4>\g<6> \g<7></a>' % (script, extra, n)

        # section x of rfc y markup
        text = re.sub(r'(?i)<a href="([^"]*)"[^>]*>(section)\s(\d+(\.\d+)*)</a>(\.?\s+(of|in)\s+)<a href="([^"]*)"[^>]*>(%s[- ]?)([0-9]+)</a>'%n,
                      part_of_doc, text)
        text = re.sub(r'(?i)<a href="([^"]*)"[^>]*>(section)</a>(\n\s+)<a href="(?:[^"]*)"[^>]*>(\d+(\.\d+)*)</a>(\.?\s+(of|in)\s+)<a href="([^"]*)"[^>]*>(%s[- ]?)([0-9]+)</a>'%n,
                      split_part_of_doc, text)
        # appendix x of rfc y markup
        text = re.sub(r'(?i)<a href="([^"]*)"[^>]*>(appendix)\s([A-Z](\.\d+)*)</a>(\.?\s+(of|in)\s+)<a href="([^"]*)"[^>]*>(%s[- ]?)([0-9]+)</a>'%n,
                      part_of_doc, text)
        text = re.sub(r'(?i)<a href="([^"]*)"[^>]*>(appendix)</a>(\n\s+)<a href="(?:[^"]*)"[^>]*>([A-Z]+(\.\d+)*)</a>(\.?\s+(of|in)\s+)<a href="([^"]*)"[^>]*>(%s[- ]?)([0-9]+)</a>'%n,
                      split_part_of_doc, text)
        # rfc y, section x markup
        text = re.sub(r'(?i)<a href="([^"]*)"[^>]*>(%s[- ]?)([0-9]+)</a>(,?\s+)<a href="([^"]*)"[^>]*>(section)\s?(([^<]*))</a>'%n,
                      doc_then_part, text)
        # rfc y, appendix x markup
        text = re.sub(r'(?i)<a href="([^"]*)"[^>]*>(%s[- ]?)([0-9]+)</a>(,?\s+)<a href="([^"]*)"[^>]*>(appendix)\s?(([^<]*))</a>'%n,
                      doc_then_part, text)
        # section x of? [rfc y] markup
        text = re.sub(r'(?i)<a href="([^"]*)"[^>]*>(section)\s(\d+(\.\d+)*)</a>(\.?\s+(of\s+|in\s+)?)\[<a href="([^"]*)"[^>]*>(%s[- ]?)([0-9]+)</a>\]'%n,
                      part_of_ref, text)
        text = re.sub(r'(?i)<a href="([^"]*)"[^>]*>(section)</a>(\n\s+)<a href="(?:[^"]*)"[^>]*>(\d+(\.\d+)*)</a>(\.?\s+(of\s+|in\s+)?)\[<a href="([^"]*)"[^>]*>(%s[- ]?)([0-9]+)</a>\]'%n,
                      split_part_of_ref, text)
        # appendix x of? [rfc y] markup
        text = re.sub(r'(?i)<a href="([^"]*)"[^>]*>(appendix)\s([A-Z](\.\d+)*)</a>(\.?\s+(of\s+|in\s+)?)\[<a href="([^"]*)"[^>]*>(%s[- ]?)([0-9]+)</a>\]'%n,
                      part_of_ref, text)
        text = re.sub(r'(?i)<a href="([^"]*)"[^>]*>(appendix)</a>(\n\s+)<a href="(?:[^"]*)"[^>]*>([A-Z](\.\d+)*)</a>(\.?\s+(of\s+|in\s+)?)\[<a href="([^"]*)"[^>]*>(%s[- ]?)([0-9]+)</a>\]'%n,
                      split_part_of_ref, text)
        # [rfc y], section x markup
        text = re.sub(r'(?i)\[<a href="([^>"]+)"[^>]*>(%s[- ]?)([0-9]+)</a>\](,?\s+)<a href="([^>"]*)"[^>]*>(section)\s(\d+(\.\d+)*)</a>'%n,
                      ref_then_part, text)
        # [rfc y], appendix x markup
        text = re.sub(r'(?i)\[<a href="([^>"]+)"[^>]*>(%s[- ]?)([0-9]+)</a>\](,?\s+)<a href="([^>"]*)"[^>]*>(appendix)\s([A-Z](\.\d+)*)</a>'%n,
                      ref_then_part, text)

    # remove section link for section x.x (of|in) <something else>
    text = re.sub(r'(?i)<a href="[^"]*"[^>]*>(section\s)(\d+(\.\d+)*)</a>(\.?[a-z]*\s+(of|in)\s+)(\[?)<a href="([^"]*)"([^>]*)>(.*)</a>(\]?)',
                  r'\g<1>\g<2>\g<4>\g<6><a href="\g<7>"\g<8>>\g<9></a>\g<10>', text)
    text = re.sub(r'(?i)(\[?)<a href="([^"]*#ref[^"]*)"([^>]*)>(.*?)</a>(\]?,\s+)<a href="[^"]*"[^>]*>(section\s)(\d+(\.\d+)*)</a>',
                  r'\g<1><a href="\g<2>"\g<3>>\g<4></a>\g<5>\g<6>\g<7>', text)

    # Special fix for referring to the trust legal provisions in
    # boilerplate text:
    text = re.sub(r'(?i)<a href="[^"]*"[^>]*>(section\s)(\d+(\.\d+)*)</a>(\.?[a-z]*\s+(of|in)\s*\n\s*the Trust Legal Provisions)',
                  r'\g<1>\g<2>\g<4>', text)

    # Each page break starts a new <pre> block
    text = re.sub(r"\n?\f\n?", '</pre>\n<pre class="newpage">', text)

    # restore indentation
    if prefixlen:
        text = re.sub(r"\n", "\n"+(" "*prefixlen), text)

    if path:
        # Rewrite query-style links to path-style links.  ``script`` is
        # escaped so that regex metacharacters in it match literally, and
        # the replacements are callables so that ``path`` is inserted
        # verbatim rather than being parsed as a replacement template.
        text = re.sub(r"%s\?(rfc|bcp|std)=" % re.escape(script),
                      lambda m: "%s/%s" % (path, m.group(1)), text)
        text = re.sub(r"%s\?draft=" % re.escape(script),
                      lambda m: "%s/" % path, text)

    return text