-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathalgo.py
485 lines (417 loc) · 17.4 KB
/
algo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
"""Algorithms for manipulating BibTeX data.
This module implements various algorithms supplied by BibTeX to style
files, as well as some algorithms to make BibTeX data more accessible
to Python.
"""
__all__ = ('Name parse_names ' +
'parse_month ' +
'title_case ' +
'TeXProcessor TeXToUnicode tex_to_unicode').split()
import re
import collections
import unicodedata
import string
import messages
# Control sequences (defined as "control_seq_ilk" in bibtex) and their
# Unicode translations. This is similar to, but slightly different
# from the TeX definitions (of course).
_CONTROL_SEQS = {
'\\i': 'ı', '\\j': 'ȷ', '\\oe': 'œ', '\\OE': 'Œ',
'\\ae': 'æ', '\\AE': 'Æ', '\\aa': 'å', '\\AA': 'Å',
'\\o': 'ø', '\\O': 'Ø', '\\l': 'ł', '\\L': 'Ł', '\\ss': 'ß'
}
class NameParser:
def __init__(self):
pass
def __depth(self, data):
depth, depths = 0, [0] * len(data)
for pos, ch in enumerate(data):
depths[pos] = depth
if ch == '{':
depth += 1
depths[pos] = depth
elif ch == '}':
depth -= 1
return depths
def __split_depth0(self, regexp, data, flags=0):
regexp = re.compile(regexp, flags=flags)
depths = self.__depth(data)
parts, last = [], 0
for m in regexp.finditer(data):
if depths[m.start()] == 0:
parts.append(data[last:m.start()])
last = m.end()
if regexp.groups:
parts.extend(m.groups())
parts.append(data[last:])
return parts
def _first_char(self, data):
"""Return the first character of data (in bibtex's sense)."""
# XXX Should this be pulled out as some generic algorithm?
pos = 0
depths = self.__depth(data)
while True:
if pos == len(data):
return ''
elif data[pos].isalpha():
return data[pos]
elif data.startswith('{\\', pos):
# Special character
pos += 1
m = re.compile(r'\\[a-zA-Z]+').match(data, pos)
if m and m.group() in _CONTROL_SEQS:
# Known bibtex control sequence
return _CONTROL_SEQS[m.group()]
# Scan for the first alphabetic character
while pos < len(data) and depths[pos]:
if data[pos].isalpha():
return data[pos]
pos += 1
elif data[pos] == '{':
# Skip brace group
while pos < len(data) and depths[pos]:
pos += 1
else:
pos += 1
def __split_von_last(self, toks):
# See von_name_ends_and_last_name_starts_stuff
for von_end in range(len(toks) - 1, 1, -2):
if self._first_char(toks[von_end - 2]).islower():
return (toks[:von_end-1], toks[von_end:])
return ([], toks)
def parse(self, string, pos):
"""Parse a BibTeX name list.
Returns a list of Name objects. Raises InputError if there is
a syntax error.
"""
# See x_format_name
# Split names (see name_scan_for_and)
name_strings = [n.strip() for n in self.__split_depth0(
'[ \t]and(?=[ \t])', string, flags=re.IGNORECASE)]
# Process each name
names = []
for name_string in name_strings:
# Remove leading and trailing white space, ~, and -, and
# trailing commas.
name_string = name_trailing = name_string.lstrip('-~ \t')
name_string = name_string.rstrip('-~ \t,')
if ',' in name_trailing[len(name_string):]:
# BibTeX warns about this because it often indicates a
# bigger syntax problem
pos.warn('trailing comma after name `{}\''.format(name_string))
# Split on depth-0 commas and further split tokens in each
# part, keeping only the first connector between each
# token.
parts = [self.__split_depth0('([-~ \t])[-~ \t]*', part.strip())
for part in self.__split_depth0(',', name_string)]
# Process name depending on how many commas there were
first = von = last = jr = []
if len(parts) == 1:
# "First von Last"
toks = parts[0]
# The von tokens start with the first lower-case token
# (but cannot start at the last token)
for von_start in range(0, len(toks) - 2, 2):
if self._first_char(toks[von_start]).islower():
# Found beginning; now find the end
first = toks[:max(0, von_start-1)]
von, last = self.__split_von_last(toks[von_start:])
break
else:
# No von tokens. Find hyphen-connected last name
# tokens.
for last_start in range(len(toks) - 1, -1, -2):
if last_start and toks[last_start-1] != '-':
break
first = toks[:max(0, last_start-1)]
last = toks[last_start:]
elif 2 <= len(parts) <= 3:
# "von Last, First[, Jr]"
von, last = self.__split_von_last(parts[0])
first = parts[1]
if len(parts) == 3:
jr = parts[2]
else:
pos.raise_error(
'too many commas in name `{}\''.format(name_string))
names.append(Name(''.join(first), ''.join(von),
''.join(last), ''.join(jr)))
return names
class Name(collections.namedtuple('Name', 'first von last jr')):
"""A parsed name.
The name is parsed in to first name, "von", last name, and the
complement (or "jr"). Each component is in uninterpreted form
(e.g., TeX syntax). Missing components are set to the empty
string.
"""
def is_others(self):
return self.first == '' and self.von == '' and \
self.last == 'others' and self.jr == ''
def pretty(self, template='{first} {von} {last} {jr}'):
"""Pretty-print author according to template.
The template is a 'format' template with the added feature
that literal text surrounding fields that expand to empty
strings is prioritized, rather than concatenated.
Specifically, of the literal text snippets between two
non-null fields, only the first of the highest priority is
kept, where non-white space outranks white space outranks the
empty string. Literal text before and after the first and
last fields is always kept.
Hence, if the template is '{von} {last}, {first}, {jr}' and
the name has a last and a jr not no von or first, then the
first comma will be kept and the space and second dropped. If
the name has only a von and a last, then both commas will be
dropped. If the name has only a last, then all separators
will be dropped.
"""
# XXX BibTeX's own format.name$ templates are more
# sophisticated than this, and it's not clear these are easier
# to use. These do have the (dubious) benefit of having
# access to the usual format machinery.
def priority(string):
if not string:
return 0
elif string.isspace():
return 1
return 2
fields = {'first': self.first, 'von': self.von,
'last': self.last, 'jr': self.jr}
f = string.Formatter()
pieces = ['']
first_field, last_field = 0, -1
leading = trailing = ''
for i, (literal_text, field_name, format_spec, conv) in \
enumerate(f.parse(template)):
if i == 0:
# Always keep leading text
leading = literal_text
elif field_name is None:
# Always keep trailing test
trailing = literal_text
elif priority(literal_text) > priority(pieces[-1]):
# Overrides previous piece
pieces[-1] = literal_text
if field_name is not None:
obj, _ = f.get_field(field_name, (), fields)
if not obj:
continue
obj = f.convert_field(obj, conv)
if first_field == 0:
first_field = len(pieces)
last_field = len(pieces)
pieces.extend([f.format_field(obj, format_spec), ''])
# Only keep the pieces between non-null fields
pieces = pieces[first_field:last_field + 1]
return leading + ''.join(pieces) + trailing
def parse_names(string, pos=messages.Pos.unknown):
"""Parse a BibTeX name list (e.g., an author or editor field).
Returns a list of Name objects. The parsing is equivalent to
BibTeX's built-in "format.name$" function. Raises InputError if
there is a syntax error.
"""
return NameParser().parse(string, pos)
_MONTHS = 'January February March April May June July August September October November December'.lower().split()
def parse_month(string, pos=messages.Pos.unknown):
"""Parse a BibTeX month field.
This performs fairly fuzzy parsing that supports all standard
month macro styles (and then some).
Raises InputError if the field cannot be parsed.
"""
val = string.strip().rstrip('.').lower()
for i, name in enumerate(_MONTHS):
if name.startswith(val) and len(val) >= 3:
return i + 1
pos.raise_error('invalid month `{}\''.format(string))
CS_RE = re.compile(r'\\[a-zA-Z]+')
def title_case(string, pos=messages.Pos.unknown):
"""Convert to title case (like BibTeX's built-in "change.case$").
Raises InputError if the title string contains syntax errors.
"""
# See "@<Perform the case conversion@>"
out = []
level, prev_colon, pos = 0, False, 0
while pos < len(string):
keep = (pos == 0 or (prev_colon and string[pos-1] in ' \t\n'))
if level == 0 and string.startswith('{\\', pos) and not keep:
# Special character
out.append(string[pos])
pos += 1
level += 1
while level and pos < len(string):
if string[pos] == '\\':
m = CS_RE.match(string, pos)
if m:
if m.group() in _CONTROL_SEQS:
# Lower case control sequence
out.append(m.group().lower())
else:
# Unknown control sequence, keep case
out.append(m.group())
pos = m.end()
continue
elif string[pos] == '{':
level += 1
elif string[pos] == '}':
level -= 1
# Lower-case non-control sequence
out.append(string[pos].lower())
pos += 1
prev_colon = False
continue
# Handle braces
char = string[pos]
if char == '{':
level += 1
elif char == '}':
if level == 0:
pos.raise_error('unexpected }')
level -= 1
# Handle colon state
if char == ':':
prev_colon = True
elif char not in ' \t\n':
prev_colon = False
# Change case of a regular character
if level > 0 or keep:
out.append(string[pos])
else:
out.append(string[pos].lower())
pos += 1
return ''.join(out)
# A TeX control sequence is
#
# 1) an active character (subsequent white space is NOT ignored) or,
# 2) a \ followed by either
# 2.1) a sequence of letter-category characters (subsequent white
# space is ignored), or
# 2.2) a single space-category character (subsequent white space is
# ignored), or
# 2.3) a single other character (subsequent white space is NOT
# ignored).
#
# This regexp assumes plain TeX's initial category codes. Technically
# only ~ and \f are active characters, but we include several other
# special characters that we want to abort on.
tex_cs_re = re.compile(
r'([~\f$&#^_]|(\\[a-zA-Z]+|\\[ \t\r\n])|\\.)(?(2)[ \t\r\n]*)')
class TeXProcessor:
"""Base class for simple TeX macro processors.
This assumes the initial category codes set up by plain.tex (and,
likewise, LaTeX).
"""
def process(self, string, pos):
"""Expand active characters and macros in string.
Raises InputError if it encounters an active character or
macro it doesn't recognize.
"""
self.__data = string
self.__off = 0
self.__pos = pos
# Process macros
while True:
m = tex_cs_re.search(self.__data, self.__off)
if not m:
break
self.__off = m.end()
macro = m.group(1)
nval = self._expand(macro)
if nval is None:
if macro.startswith('\\'):
pos.raise_error('unknown macro `{}\''.format(macro))
pos.raise_error(
'unknown special character `{}\''.format(macro))
self.__data = self.__data[:m.start()] + nval + \
self.__data[self.__off:]
self.__off = m.start() + len(nval)
return self.__data
def _scan_argument(self):
"""Scan an return a macro argument."""
if self.__off >= len(self.__data):
self.__pos.raise_error('macro argument expected')
if self.__data[self.__off] == '{':
start = self.__off
depth = 0
while depth or self.__off == start:
if self.__data[self.__off] == '{':
depth += 1
elif self.__data[self.__off] == '}':
depth -= 1
self.__off += 1
return self.__data[start + 1:self.__off - 1]
elif self.__data[self.__off] == '\\':
m = tex_cs_re.match(self.__data, self.__off)
self.__off = m.end()
return m.group(1)
else:
arg = self.__data[self.__off]
self.__off += 1
return arg
def _expand(self, cs):
"""Return the expansion of an active character or control sequence.
Returns None if the sequence is unknown. This should be
overridden by sub-classes.
"""
return None
class TeXToUnicode(TeXProcessor):
"""A simple TeX-to-unicode converter.
This interprets accents and other special tokens like '--' and
eliminates braces.
"""
# Simple TeX-to-Unicode replacements
_SIMPLE = {
# Active characters
'~': '\u00A0',
# chardefs from plain.tex
'\\%': '%', '\\&': '&', '\\#': '#', '\\$': '$', '\\ss': 'ß',
'\\ae': 'æ', '\\oe': 'œ', '\\o': 'ø',
'\\AE': 'Æ', '\\OE': 'Œ', '\\O': 'Ø',
'\\i': 'ı', '\\j': 'ȷ',
'\\aa': 'å', '\\AA': 'Å', '\\l': 'ł', '\\L': 'Ł',
# Other defs from plain.tex
'\\_': '_', '\\dag': '†', '\\ddag': '‡', '\\S': '§', '\\P': '¶',
}
# TeX accent control sequences to Unicode combining characters
_ACCENTS = {
# Accents defined in plain.tex
'\\`': '\u0300', "\\'": '\u0301', '\\v': '\u030C', '\\u': '\u0306',
'\\=': '\u0304', '\\^': '\u0302', '\\.': '\u0307', '\\H': '\u030B',
'\\~': '\u0303', '\\"': '\u0308',
'\\d': '\u0323', '\\b': '\u0331', '\\c': '\u0327',
# Other accents that seem to be standard, but I can't find
# their definitions
'\\r': '\u030A', '\\k': '\u0328'
}
def process(self, string, pos):
string = super().process(string, pos)
# Handle ligatures that are unique to TeX. This must be done
# after macro expansion, but before brace removal because
# braces inhibit ligatures.
string = string.replace('---', '\u2014').replace('--', '\u2013')
# Remove braces
return string.replace('{', '').replace('}', '')
def _expand(self, cs):
if cs in self._SIMPLE:
return self._SIMPLE[cs]
if cs in self._ACCENTS:
arg = self._scan_argument()
if len(arg) == 0:
seq, rest = ' ' + self._ACCENTS[cs], ''
elif arg.startswith('\\i') or arg.startswith('\\j'):
# Unicode combining marks should be applied to the
# regular i, not the dotless i.
seq, rest = arg[1] + self._ACCENTS[cs], arg[2:]
else:
seq, rest = arg[0] + self._ACCENTS[cs], arg[1:]
return unicodedata.normalize('NFC', seq) + rest
return None
def tex_to_unicode(string, pos=messages.Pos.unknown):
"""Convert a BibTeX field value written in TeX to Unicode.
This interprets accents and other special tokens like '--' and
eliminates braces. Raises InputError if it encounters a macro it
doesn't understand.
Note that BibTeX's internal understanding of accented characters
(e.g., purify$ and change.case$) is much more limited than TeX's.
This implements something closer to TeX on the assumption that the
goal is to display the string.
"""
return TeXToUnicode().process(string, pos)