-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathutf8.h
132 lines (125 loc) · 3.63 KB
/
utf8.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
// Size of the scratch buffer that holds one encoded character (see repeatutf8).
// utf8thischar writes at most 5 bytes (4 sequence bytes + NUL), so 32 leaves slack.
#define MAXUTF8CHARLEN 32
// Unicode code points for box-drawing and small geometric glyphs.
// Each primary name is position-based (tl = top-left, tbl = top-bottom-left, ...);
// the alias that follows names the directions in which the glyph's arms extend
// (u = up, d = down, l = left, r = right).
#define Uboxtl 0x250C // U+250C: corner used at the top-left of a box
#define Uboxdr Uboxtl // down-right at top-left
// Old mapping, kept for reference only: U+251C has arms up, down and RIGHT,
// so the name Uboxudl did not match it; Uboxudl is now the alias of Uboxtbr below.
//#define Uboxudl 0x251C
#define Uboxtbl 0x251C // U+251C: tee used on the left edge
#define Uboxudr Uboxtbl // up-down-right at middle-left
#define Uboxtbr 0x2524 // U+2524: tee used on the right edge
#define Uboxudl Uboxtbr // up-down-left at middle-right
#define Uboxtr 0x2510 // U+2510: corner used at the top-right of a box
#define Uboxdl Uboxtr // down-left at top-right
#define Uboxbl 0x2514 // U+2514: corner used at the bottom-left of a box
#define Uboxur Uboxbl // up-right at bottom-left
#define Uboxbr 0x2518 // U+2518: corner used at the bottom-right of a box
#define Uboxul Uboxbr // up-left at bottom-right
#define Uboxhz 0x2500 // U+2500: horizontal line
#define Uboxlr Uboxhz // left-right is horizontal
#define Uboxvt 0x2502 // U+2502: vertical line
#define Uboxud Uboxvt // up-down is vertical
#define Uboxudlr 0x253C // up-down-left-right
#define Uboxdiamond 0x25C7 // U+25C7: white diamond
#define Uboxtinysquare 0x25AB // U+25AB: white small square
#define Uboxlozenge 0x25CA // U+25CA: lozenge
// still need middle-top and middle-bottom
// (TODO: presumably U+252C, down-left-right, and U+2534, up-left-right)
// nextutf8char: decode the UTF-8 sequence that starts at str.
//
//   str        NUL-terminated UTF-8 text, positioned on a lead byte.
//   codepoint  receives the decoded code point on success; untouched on failure.
//
// Returns a pointer to the byte following the decoded sequence, or NULL when
// str/codepoint is NULL, str is at the terminating NUL, the lead byte is not
// a valid 1..4-byte lead, a continuation byte is missing or malformed, or
// the decoded value exceeds U+10FFFF.
//
// Every continuation byte is checked before the next one is read. A NUL is
// never a valid continuation byte, so a truncated sequence stops at the
// terminator: nothing past it is read and the returned pointer never skips
// over it. Overlong encodings are NOT rejected.
UCS nextutf8char(UCS str, INP codepoint) {
  IN seqlen;
  IN value;
  IN ci;
  IF (str == NULL OR codepoint == NULL OR str[0] EQNUL)
    { RT NULL; }
  IF (!(str[0] & 0x80)) {
    // 0xxxxxxx
    value = str[0];
    seqlen = 1;
  } EF ((str[0] & 0xE0) == 0xC0) {
    // 110xxxxx 10xxxxxx
    value = str[0] & 0x1F;
    seqlen = 2;
  } EF ((str[0] & 0xF0) == 0xE0) {
    // 1110xxxx 10xxxxxx 10xxxxxx
    value = str[0] & 0x0F;
    seqlen = 3;
  } EF ((str[0] & 0xF8) == 0xF0) {
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx (matches what utf8thischar emits)
    value = str[0] & 0x07;
    seqlen = 4;
  } EL { RT NULL; } // stray continuation byte or invalid lead byte
  ci = 1;
  WI (ci LT seqlen) {
    // each trailing byte must look like 10xxxxxx; this also catches the NUL
    IF ((str[ci] & 0xC0) != 0x80)
      { RT NULL; }
    value = (value << 6) | (str[ci] & 0x3F);
    INC ci;
  }
  IF (value > 0x10FFFF)
    { RT NULL; } // beyond the Unicode code space
  *codepoint = value;
  RT str + seqlen;
}
// utf8thischar: encode one Unicode code point as a NUL-terminated UTF-8 sequence.
//
//   codepoint   value to encode; valid inputs are U+0000..U+10FFFF excluding
//               the surrogate range U+D800..U+DFFF.
//   utf8buffer  destination; must have room for at least 5 bytes
//               (up to 4 sequence bytes plus the terminating NUL).
//
// Returns the number of sequence bytes written (1..4, not counting the NUL),
// or 0 when codepoint is not encodable, in which case utf8buffer is left
// untouched.
//
// INRANGE is assumed to be inclusive at both ends, as the contiguous ranges
// below require.
IN utf8thischar(IN codepoint, UCS utf8buffer) {
  IF INRANGE(codepoint, 0, 0x7F) {
    // 0xxxxxxx
    utf8buffer[0] = codepoint;
    utf8buffer[1] = NUL;
    RT 1;
  } EF INRANGE(codepoint, 0x80, 0x7FF) {
    // 110xxxxx 10xxxxxx
    utf8buffer[0] = (codepoint >> 6) + 0xC0;
    utf8buffer[1] = (codepoint & 0x3F) + 0x80;
    utf8buffer[2] = NUL;
    RT 2;
  } EF INRANGE(codepoint, 0xD800, 0xDFFF) {
    // UTF-16 surrogates are not characters and must not appear in UTF-8
    RT 0;
  } EF INRANGE(codepoint, 0x800, 0xFFFF) {
    // 1110xxxx 10xxxxxx 10xxxxxx
    utf8buffer[0] = (codepoint >> 12) + 0xE0;
    utf8buffer[1] = ((codepoint >> 6) & 0x3F) + 0x80;
    utf8buffer[2] = (codepoint & 0x3F) + 0x80;
    utf8buffer[3] = NUL;
    RT 3;
  } EF INRANGE(codepoint, 0x10000, 0x10FFFF) {
    // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
    // The upper bound is the last code point, U+10FFFF (1114111); the former
    // bound of 1112064 is the COUNT of scalar values and rejected valid input.
    utf8buffer[0] = (codepoint >> 18) + 0xF0;
    utf8buffer[1] = ((codepoint >> 12) & 0x3F) + 0x80;
    utf8buffer[2] = ((codepoint >> 6) & 0x3F) + 0x80;
    utf8buffer[3] = (codepoint & 0x3F) + 0x80;
    utf8buffer[4] = NUL;
    RT 4;
  } EL { RT 0; } // negative or beyond U+10FFFF
}
// repeatutf8: print the UTF-8 encoding of codepoint to stdout numtimes times.
//
//   codepoint  code point to print; encoded with utf8thischar.
//   numtimes   repetition count; zero or negative prints nothing.
//
// Returns the total number of encoded bytes (per-character length times the
// number of repetitions), or 0 when codepoint cannot be encoded.
IN repeatutf8(IN codepoint, IN numtimes) {
  CH charbuffer[MAXUTF8CHARLEN];
  IN charlen;
  IN totallen = 0;
  IN ni = -1;
  charbuffer[0] = NUL; // never hand printf an uninitialised buffer
  charlen = utf8thischar(codepoint, charbuffer);
  // utf8thischar returns 0 and leaves the buffer untouched for an
  // unencodable code point; printing it would be undefined behaviour.
  IF (charlen LT 1)
    { RT 0; }
  WI (INC ni LT numtimes) {
    printf("%s", charbuffer);
    totallen ADDS charlen;
  }
  RT totallen;
}
// U1(cp): shorthand for repeatutf8(cp, 1), i.e. print code point cp once.
#define U1(cp) repeatutf8(cp, 1)
// Un(cp, nt): shorthand for repeatutf8(cp, nt), i.e. print code point cp nt times.
#define Un(cp, nt) repeatutf8(cp, nt)
// copyfromhex: copy source into target, decoding %XX escapes along the way.
//
//   target  destination buffer; must hold at least strlen(source) + 1 bytes
//           (the output is never longer than the input).
//   source  NUL-terminated input text.
//
// A '%' followed by two characters accepted by ISHEXPAIR is replaced by the
// single byte HEXPAIRVALUE yields; any other '%' is copied literally. The
// result is always NUL-terminated.
// NOTE(review): an escape that decodes to 0 (presumably "%00") writes an
// embedded NUL into target, which truncates it as a C string; confirm that
// this is acceptable to callers.
VD copyfromhex(CS target, CS source) {
  CS tgch = target; // write to
  CS srch = source; // read from
  WI (*srch NQNUL) {
    IF (*srch EQ '%') { // expect 2 more chars
      CH hexch1 = *(srch + 1);
      // Read the second digit only if the first is not the terminator;
      // otherwise srch + 2 would lie beyond the end of the string.
      CH hexch2 = (hexch1 NQNUL) ? *(srch + 2) : NUL;
      IF ISHEXPAIR(hexch1, hexch2) {
        IN hexval = HEXPAIRVALUE(hexch1, hexch2);
        *tgch = hexval; // write decoded byte
        INC tgch;
        srch += 3; // skip the whole %XX escape
      } EL { // not a valid escape, so the % is a literal %
        *tgch = *srch;
        INC tgch;
        INC srch;
      }
    } EL { // normal char, copy as is
      *tgch = *srch;
      INC tgch;
      INC srch;
    }
  }
  *tgch = NUL; // null-terminate!
}
// copynopath: copy source into target with every '/' replaced by '-'.
// target must hold at least strlen(source) + 1 bytes; the result is
// NUL-terminated and has the same length as source.
VD copynopath(CS target, CS source) {
  CS wr = target; // next byte to write
  CS rd = source; // next byte to read
  WI (*rd NQNUL) {
    // path separators become dashes, everything else passes through
    *wr = (*rd EQ '/') ? '-' : *rd;
    INC wr;
    INC rd;
  }
  *wr = NUL;
}