forked from stedt-project/printutils
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathFascicleXetexUtil.pm
287 lines (244 loc) · 8.17 KB
/
FascicleXetexUtil.pm
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
package FascicleXetexUtil;
use strict;
use Exporter;
our @ISA = qw(Exporter);
our @EXPORT = qw(bold_protoform prettify_protoform escape_tex xml2tex
eq_reflexes merge_glosses src_concat);
use utf8;
=head1 NAME
FascicleXetexUtil
=head1 SYNOPSIS
Some utility functions for generating LaTeX code for the fascicle.
Not everything function in here is strictly TeX, of course, (like
record-combining ones, at the end), but they're only useful in the
context of generating a print volume where you need to save space
and make things pretty.
=head1 USAGE
You can just call the functions, they're all exported.
BUT you have to set the $FascicleXetexUtil::tag2info sub ref manually.
This is a subroutine that should accept a tag number (and an optional alternate gloss)
and return a string that is valid XeTeX and will be inserted inline into notes.
=head1 AUTHOR
by Dominic Yu
=head1 VERSION
0.1 - 2008.02.23
=over
=item *
work in progress
=back
=cut
my @italicize_abbrevs =
qw|GSR GSTC STC HPTB LTSR TSR AHD VSTB TBT HCT LTBA BSOAS CSDPN TIL OED|;
our $tag2info; # sub ref only used inside xml2tex, set from the outside
sub bold_protoform { # pass in something already escape_tex'd
my $s = shift;
for ($s) {
s/⪤} +/⪤} */g;
s/\\textasciitilde{} +/\\textasciitilde{} */g;
s/ = +/ = */g;
s/ or +/ or */g;
$_ = '*' . $_;
s/(\*\S+)/\\textbf{$1}/g; # bold only the protoform, not allofam or "or"
}
return $s;
}
sub prettify_protoform {
my $s = shift;
$s =~ s#\(?(.)/(.)(?:/(.))?(?:/(.))?\)?#_tabularify($1,$2,$3,$4)#ge;
return $s;
}
sub _tabularify {
my $s = "\\begin{tabular}[c]{c}";
foreach (@_) {
$s .= "$_\\\\" if $_;
}
$s .= "\\end{tabular}";
return $s;
}
sub escape_tex {
my $s = shift;
my $ignore_curly_braces = shift; # second argument means "Don't escape curly braces"
my $ignore_dollar_signs = shift; # third argument means "Don't escape dollar signs"
$s =~ s/{/\\{/g unless $ignore_curly_braces;
$s =~ s/}/\\}/g unless $ignore_curly_braces;
$s =~ s/#/\\#/g;
$s =~ s/&/\\&/g;
$s =~ s/\$/\\\$/g unless $ignore_dollar_signs; # for dollar signs in Na data
$s =~ s/~/\\textasciitilde{}/g;
# $s =~ s/</\\textless\\ /g;
# $s =~ s/>/\\textgreater\\ /g;
$s =~ s/([ⓁⓋⓒⒸⓈ˯˰⪤↮↭])/\\STEDTU{$1}/g;
# this marks special symbols not really in unicode as STEDTU font
# VL, VD, checked, tone C, stopped tone, low open, low stopped, allofam symbols
$s =~ s/◦/\\,/g; # STEDT delimiter, not in Charis SIL, can be del'd (\, is a mini-space in TeX)
$s =~ s/\|//g; # STEDT overriding delimiter, can be safely ignored
return $s;
}
sub _qtd {
my $s = $_[0];
$s =~ s/'/'/g;
$s =~ s/"/"/g;
return $s;
}
sub _nonbreak_hyphens {
my $s = $_[0];
$s =~ s/-/‑/g;
return $s;
}
sub xml2tex { # for the notes
local $_ = $_[0];
s/{/\\{/g unless $_[1]; # skip curly braces
s/}/\\}/g unless $_[1];
s/\$/\\\$/g; # escape dollar signs
s|^<par>||;
s|</par>$||;
s|</par><par>|\n\n|g;
s|<br />|\\\\{}\n|g;
s|<sup>(.*?)</sup>|\$^\\mathrm{$1}\$|g;
s|<sub>(.*?)</sub>|\$_\\mathrm{$1}\$|g;
s|<emph>(.*?)</emph>|\\textit{$1}|g;
s|<strong>(.*?)</strong>|"\\textbf{" . _qtd($1). "}"|ge;
s|<gloss>(.*?)</gloss>|$1|g; # no formatting?
s|<reconstruction>\*(.*?)</reconstruction>|"\\textbf{*" . _nonbreak_hyphens($1) . "}"|ge;
# s|<xref ref="(\d+)">#\1(.*?)</xref>|#$1$2|g;
s|<xref ref="(\d+)">#\1(.*?)</xref>|_tag2info($1,$2)|ge;
s|<a href="(.*?)">(.*?)</a>|\\href{$1}{$2}|g; #convert hyperlinks
s|<footnote>(.*?)</footnote>|\\footnote{$1}|g;
s|<hanform>(.*?)</hanform>|\\TC{$1}|g;
s|<unicode>(.*?)</unicode>|\\TC{\\char"\U$1}|g; # assume hex unicode codepoint is a CJK character (note that \char" needs uppercase)
s|<latinform>(.*?)</latinform>|"\\textbf{" . _nonbreak_hyphens(_qtd($1)) . "}"|ge; # exception to smart quote
s|<plainlatinform>(.*?)</plainlatinform>|_qtd($1)|ge; # not used...
s/&/&/g;
s/</</g;
s/>/>/g;
# some smart-quote code lifted from the internet somewhere, here for reference
# # left_single
# sub { $_[0] =~ s/(\s|\A)'/$1‘/g;
# $_[0] =~ s/(?<!\w)'(?=\w)/‘/g;
# },
# # right_single
# sub { $_[0] =~ s/(?<!\s)'/’/g;
# $_[0] =~ s/'(?=\s|\z)/’/g;
s/(\S)'/$1’/g; # smart quotes
# this formulation doesn't account for single quotes right
# after opening-type contexts, like open paren, brackets, etc.
# it's practically impossible to find good code online to
# "educate" straight quotes
s/'/‘/g;
s/"(?=[\w'])/“/g;
s/"/”/g; # or $_[0] =~ s/(?<!\s)"/”/g; $_[0] =~ s/(\A|\s)"/$1“/g;
s/(cf\.) /$1\\ /ig; # no extra spacing after cf., e.g., etc.
s/(e\. ?g\.) /$1\\ /g;
s/(i\. ?e\.) /$1\\ /g;
s/(pp?\.) (?=\d)/$1\\ /g;
s/(vs\.) /$1\\ /g;
s/(\bn\.) /$1\\ /g; # means "footnote"
s/(\bMand\.) /$1\\ /g;
# italicize certain abbreviations
for my $abbrev (@italicize_abbrevs) {
s/\b($abbrev)\b/\\textit{$1}/g;
}
$_ = escape_tex($_, 1, 1); # pass 1, 1 to mean don't escape curly braces or dollar signs, since we did that already
s/<-+>/\$\\longleftrightarrow\$/g; # convert double-headed arrows
s/-+>/\$\\rightarrow\$/g; # convert right arrows
s/<-+/\$\\leftarrow\$/g; # convert left arrows
s/< /<~/g; # no break after "comes from" sign
return $_;
}
sub _tag2info {
&$tag2info;
}
# dead code? is this subroutine ever accessed?
sub xml2html {
my @footnotes;
my $i = 1;
local $_ = $_[0];
s|<par>|<p>|g;
s|</par>|</p>|g;
s|<emph>|<i>|g;
s|</emph>|</i>|g;
s|<gloss>(.*?)</gloss>|$1|g; # no formatting?
s|<reconstruction>\*(.*?)</reconstruction>|"<b>*" . _nonbreak_hyphens($1) . "</b>"|ge;
s|<xref ref="(\d+)">#\1(.*?)</xref>|_tag2info($1,$2)|ge;
s|<footnote>(.*?)</footnote>|push @footnotes, $1; "<sup>" . $i++ . "</sup>"|ge;
s|<hanform>(.*?)</hanform>|$1|g;
s|<latinform>(.*?)</latinform>|"<b>" . _nonbreak_hyphens($1) . "</b>"|ge;
s|<plainlatinform>(.*?)</plainlatinform>|$1|g;
s/(\S)'/$1’/g; # smart quotes
s/'/‘/g;
s/"(?=[\w'])/“/g;
s/"/”/g; # or $_[0] =~ s/(?<!\s)"/”/g; $_[0] =~ s/(\A|\s)"/$1“/g;
# italicize certain abbreviations
for my $abbrev (@italicize_abbrevs) {
s|\b($abbrev)\b|<i>$1</i>|g;
}
### specify STEDTU here?
s/<-+>/⟷/g; # convert arrows
s/< /< /g; # no-break space after "comes from" sign
$i = 1;
for my $f (@footnotes) { $_ .= '<p class="footnote">' . $i++ . ". $f</p>" }
return $_;
}
# special functions to combine similar records
sub eq_reflexes {
my ($a, $b) = @_;
$a =~ tr/-+ .,;~◦⪤=\|//d; # remove spaces and delimiters
$b =~ tr/-+ .,;~◦⪤=\|//d;
$a =~ s/ː/:/g; # normalize vowel length to ASCII colon
$b =~ s/ː/:/g;
return $a eq $b;
}
sub _magic_gloss_compare {
my $z = $_[0] eq $_[1];
return 1 if $z; # dummy case
return $z unless $_[0] =~ /\("/ || $_[1] =~ /\("/;
my ($a, $b) = @_; # copy values
$a =~ s/ +\(".*?"\)//g;
$b =~ s/ +\(".*?"\)//g;
return 0 if $a ne $b;
# save the longer string to the first value passed in
# as a side effect (see sub below)
$_[0] = $_[1] if length($_[1]) > length($_[0]);
return 1;
}
sub merge_glosses {
my ($a, $b) = @_;
return $a if $a eq $b; # dummy case, save some time?
$a =~ s| / |;|g; # make slashes equivalent to semicolons
$b =~ s| / |;|g;
my @a = split / *; */, $a;
my @b = split / *; */, $b;
my ($longer, $shorter); # array refs
if (@a >= @b) { # greater-or-equal, so if they're the same you concatenate left-to-right
($longer, $shorter) = \(@a,@b);
} else {
($longer, $shorter) = \(@b,@a);
}
foreach my $s (@$shorter) {
# add each gloss from the shorter set of glosses
# as long as you can't find it in the longer set
push @$longer, $s
unless grep {_magic_gloss_compare($_,$s)} @$longer;
}
return join '; ', @$longer;
}
sub src_concat {
my @abbrs = split /;/, $_[0];
my @ids = split /;/, $_[1];
#my $result = "\\mbox{$abbrs[0]}";
my $result = "{$abbrs[0]}";
$result .= ":" . escape_tex($ids[0]) if $ids[0]; # escape the pound symbols in the srcid
my $lastabbr = $abbrs[0];
for my $i (1..$#abbrs) {
if ($abbrs[$i] eq $lastabbr) {
$result .= "," . escape_tex($ids[$i]) if $ids[$i];
} else {
#$result .= "; \\mbox{$abbrs[$i]}";
$result .= "; {$abbrs[$i]}";
$result .= ":" . escape_tex($ids[$i]) if $ids[$i];
$lastabbr = $abbrs[$i];
}
}
return $result;
}
1;