Annotation of src/usr.bin/tmux/utf8.c, Revision 1.25
1.25 ! nicm 1: /* $OpenBSD: utf8.c,v 1.24 2015/11/14 12:03:23 nicm Exp $ */
1.1 nicm 2:
3: /*
4: * Copyright (c) 2008 Nicholas Marriott <nicm@users.sourceforge.net>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15: * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16: * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18:
19: #include <sys/types.h>
20:
1.11 nicm 21: #include <stdlib.h>
1.1 nicm 22: #include <string.h>
1.9 nicm 23: #include <vis.h>
1.1 nicm 24:
25: #include "tmux.h"
26:
/*
 * One inclusive range of code points [first, last] that share a display
 * width. The same structure doubles as a node of the binary search tree that
 * utf8_build() assembles and utf8_width() walks.
 */
27: struct utf8_width_entry {
28: u_int first; /* lowest code point in the range */
29: u_int last; /* highest code point in the range */
30:
31: int width; /* value utf8_width() returns for the range */
32:
33: struct utf8_width_entry *left; /* ranges lying entirely below first */
34: struct utf8_width_entry *right; /* ranges lying entirely above last */
35: };
36:
1.14 schwarze 37: /* Sorted, then repeatedly split in the middle to balance the tree. */
/*
 * Each row is { first, last, width, left, right }. utf8_build() inserts the
 * rows into a binary search tree in array order, so the order of the rows
 * below fixes the shape of the tree. left and right start out NULL and are
 * filled in by utf8_build(). Code points not covered by any row get width 1
 * from utf8_width().
 */
1.17 nicm 38: static struct utf8_width_entry utf8_width_table[] = {
1.14 schwarze 39: { 0x00b41, 0x00b44, 0, NULL, NULL },
40: { 0x008e4, 0x00902, 0, NULL, NULL },
41: { 0x006d6, 0x006dd, 0, NULL, NULL },
42: { 0x005c4, 0x005c5, 0, NULL, NULL },
1.1 nicm 43: { 0x00591, 0x005bd, 0, NULL, NULL },
1.14 schwarze 44: { 0x00300, 0x0036f, 0, NULL, NULL },
45: { 0x00483, 0x00489, 0, NULL, NULL },
1.1 nicm 46: { 0x005bf, 0x005bf, 0, NULL, NULL },
1.14 schwarze 47: { 0x005c1, 0x005c2, 0, NULL, NULL },
48: { 0x00610, 0x0061a, 0, NULL, NULL },
49: { 0x00600, 0x00605, 0, NULL, NULL },
50: { 0x005c7, 0x005c7, 0, NULL, NULL },
51: { 0x0064b, 0x0065f, 0, NULL, NULL },
52: { 0x0061c, 0x0061c, 0, NULL, NULL },
53: { 0x00670, 0x00670, 0, NULL, NULL },
54: { 0x007a6, 0x007b0, 0, NULL, NULL },
1.1 nicm 55: { 0x006ea, 0x006ed, 0, NULL, NULL },
1.14 schwarze 56: { 0x006df, 0x006e4, 0, NULL, NULL },
57: { 0x006e7, 0x006e8, 0, NULL, NULL },
58: { 0x00711, 0x00711, 0, NULL, NULL },
59: { 0x0070f, 0x0070f, 0, NULL, NULL },
60: { 0x00730, 0x0074a, 0, NULL, NULL },
61: { 0x0081b, 0x00823, 0, NULL, NULL },
62: { 0x007eb, 0x007f3, 0, NULL, NULL },
63: { 0x00816, 0x00819, 0, NULL, NULL },
64: { 0x00829, 0x0082d, 0, NULL, NULL },
65: { 0x00825, 0x00827, 0, NULL, NULL },
66: { 0x00859, 0x0085b, 0, NULL, NULL },
67: { 0x00a41, 0x00a42, 0, NULL, NULL },
68: { 0x00981, 0x00981, 0, NULL, NULL },
69: { 0x00941, 0x00948, 0, NULL, NULL },
70: { 0x0093a, 0x0093a, 0, NULL, NULL },
71: { 0x0093c, 0x0093c, 0, NULL, NULL },
72: { 0x00951, 0x00957, 0, NULL, NULL },
73: { 0x0094d, 0x0094d, 0, NULL, NULL },
1.1 nicm 74: { 0x00962, 0x00963, 0, NULL, NULL },
1.14 schwarze 75: { 0x009e2, 0x009e3, 0, NULL, NULL },
76: { 0x009c1, 0x009c4, 0, NULL, NULL },
77: { 0x009bc, 0x009bc, 0, NULL, NULL },
78: { 0x009cd, 0x009cd, 0, NULL, NULL },
79: { 0x00a01, 0x00a02, 0, NULL, NULL },
80: { 0x00a3c, 0x00a3c, 0, NULL, NULL },
81: { 0x00ac1, 0x00ac5, 0, NULL, NULL },
82: { 0x00a70, 0x00a71, 0, NULL, NULL },
83: { 0x00a4b, 0x00a4d, 0, NULL, NULL },
84: { 0x00a47, 0x00a48, 0, NULL, NULL },
85: { 0x00a51, 0x00a51, 0, NULL, NULL },
86: { 0x00a81, 0x00a82, 0, NULL, NULL },
87: { 0x00a75, 0x00a75, 0, NULL, NULL },
88: { 0x00abc, 0x00abc, 0, NULL, NULL },
89: { 0x00ae2, 0x00ae3, 0, NULL, NULL },
90: { 0x00ac7, 0x00ac8, 0, NULL, NULL },
91: { 0x00acd, 0x00acd, 0, NULL, NULL },
92: { 0x00b3c, 0x00b3c, 0, NULL, NULL },
1.1 nicm 93: { 0x00b01, 0x00b01, 0, NULL, NULL },
1.14 schwarze 94: { 0x00b3f, 0x00b3f, 0, NULL, NULL },
95: { 0x03190, 0x031ba, 2, NULL, NULL },
96: { 0x017c9, 0x017d3, 0, NULL, NULL },
97: { 0x00ec8, 0x00ecd, 0, NULL, NULL },
98: { 0x00cc6, 0x00cc6, 0, NULL, NULL },
99: { 0x00c3e, 0x00c40, 0, NULL, NULL },
100: { 0x00b82, 0x00b82, 0, NULL, NULL },
101: { 0x00b56, 0x00b56, 0, NULL, NULL },
102: { 0x00b4d, 0x00b4d, 0, NULL, NULL },
103: { 0x00b62, 0x00b63, 0, NULL, NULL },
104: { 0x00bcd, 0x00bcd, 0, NULL, NULL },
105: { 0x00bc0, 0x00bc0, 0, NULL, NULL },
106: { 0x00c00, 0x00c00, 0, NULL, NULL },
107: { 0x00c62, 0x00c63, 0, NULL, NULL },
1.1 nicm 108: { 0x00c4a, 0x00c4d, 0, NULL, NULL },
1.14 schwarze 109: { 0x00c46, 0x00c48, 0, NULL, NULL },
110: { 0x00c55, 0x00c56, 0, NULL, NULL },
111: { 0x00cbc, 0x00cbc, 0, NULL, NULL },
112: { 0x00c81, 0x00c81, 0, NULL, NULL },
1.1 nicm 113: { 0x00cbf, 0x00cbf, 0, NULL, NULL },
114: { 0x00dd2, 0x00dd4, 0, NULL, NULL },
1.14 schwarze 115: { 0x00d41, 0x00d44, 0, NULL, NULL },
116: { 0x00ce2, 0x00ce3, 0, NULL, NULL },
117: { 0x00ccc, 0x00ccd, 0, NULL, NULL },
118: { 0x00d01, 0x00d01, 0, NULL, NULL },
119: { 0x00d62, 0x00d63, 0, NULL, NULL },
120: { 0x00d4d, 0x00d4d, 0, NULL, NULL },
121: { 0x00dca, 0x00dca, 0, NULL, NULL },
122: { 0x00e47, 0x00e4e, 0, NULL, NULL },
123: { 0x00e31, 0x00e31, 0, NULL, NULL },
124: { 0x00dd6, 0x00dd6, 0, NULL, NULL },
125: { 0x00e34, 0x00e3a, 0, NULL, NULL },
126: { 0x00eb4, 0x00eb9, 0, NULL, NULL },
127: { 0x00eb1, 0x00eb1, 0, NULL, NULL },
128: { 0x00ebb, 0x00ebc, 0, NULL, NULL },
129: { 0x0105e, 0x01060, 0, NULL, NULL },
130: { 0x00f8d, 0x00f97, 0, NULL, NULL },
131: { 0x00f39, 0x00f39, 0, NULL, NULL },
132: { 0x00f35, 0x00f35, 0, NULL, NULL },
133: { 0x00f18, 0x00f19, 0, NULL, NULL },
134: { 0x00f37, 0x00f37, 0, NULL, NULL },
135: { 0x00f80, 0x00f84, 0, NULL, NULL },
1.1 nicm 136: { 0x00f71, 0x00f7e, 0, NULL, NULL },
1.14 schwarze 137: { 0x00f86, 0x00f87, 0, NULL, NULL },
138: { 0x01032, 0x01037, 0, NULL, NULL },
139: { 0x00fc6, 0x00fc6, 0, NULL, NULL },
140: { 0x00f99, 0x00fbc, 0, NULL, NULL },
141: { 0x0102d, 0x01030, 0, NULL, NULL },
142: { 0x0103d, 0x0103e, 0, NULL, NULL },
143: { 0x01039, 0x0103a, 0, NULL, NULL },
144: { 0x01058, 0x01059, 0, NULL, NULL },
145: { 0x0135d, 0x0135f, 0, NULL, NULL },
146: { 0x01085, 0x01086, 0, NULL, NULL },
147: { 0x01071, 0x01074, 0, NULL, NULL },
148: { 0x01082, 0x01082, 0, NULL, NULL },
149: { 0x0109d, 0x0109d, 0, NULL, NULL },
150: { 0x0108d, 0x0108d, 0, NULL, NULL },
151: { 0x01100, 0x011ff, 2, NULL, NULL },
152: { 0x01772, 0x01773, 0, NULL, NULL },
153: { 0x01732, 0x01734, 0, NULL, NULL },
154: { 0x01712, 0x01714, 0, NULL, NULL },
1.1 nicm 155: { 0x01752, 0x01753, 0, NULL, NULL },
1.14 schwarze 156: { 0x017b7, 0x017bd, 0, NULL, NULL },
157: { 0x017b4, 0x017b5, 0, NULL, NULL },
158: { 0x017c6, 0x017c6, 0, NULL, NULL },
159: { 0x01c2c, 0x01c33, 0, NULL, NULL },
160: { 0x01a7f, 0x01a7f, 0, NULL, NULL },
161: { 0x01a17, 0x01a18, 0, NULL, NULL },
162: { 0x01920, 0x01922, 0, NULL, NULL },
163: { 0x0180b, 0x0180e, 0, NULL, NULL },
1.1 nicm 164: { 0x017dd, 0x017dd, 0, NULL, NULL },
1.14 schwarze 165: { 0x018a9, 0x018a9, 0, NULL, NULL },
166: { 0x01932, 0x01932, 0, NULL, NULL },
1.1 nicm 167: { 0x01927, 0x01928, 0, NULL, NULL },
168: { 0x01939, 0x0193b, 0, NULL, NULL },
1.14 schwarze 169: { 0x01a60, 0x01a60, 0, NULL, NULL },
170: { 0x01a56, 0x01a56, 0, NULL, NULL },
171: { 0x01a1b, 0x01a1b, 0, NULL, NULL },
172: { 0x01a58, 0x01a5e, 0, NULL, NULL },
173: { 0x01a65, 0x01a6c, 0, NULL, NULL },
174: { 0x01a62, 0x01a62, 0, NULL, NULL },
175: { 0x01a73, 0x01a7c, 0, NULL, NULL },
176: { 0x01b80, 0x01b81, 0, NULL, NULL },
177: { 0x01b36, 0x01b3a, 0, NULL, NULL },
178: { 0x01b00, 0x01b03, 0, NULL, NULL },
179: { 0x01ab0, 0x01abe, 0, NULL, NULL },
180: { 0x01b34, 0x01b34, 0, NULL, NULL },
181: { 0x01b42, 0x01b42, 0, NULL, NULL },
1.1 nicm 182: { 0x01b3c, 0x01b3c, 0, NULL, NULL },
1.14 schwarze 183: { 0x01b6b, 0x01b73, 0, NULL, NULL },
184: { 0x01be6, 0x01be6, 0, NULL, NULL },
185: { 0x01ba8, 0x01ba9, 0, NULL, NULL },
186: { 0x01ba2, 0x01ba5, 0, NULL, NULL },
187: { 0x01bab, 0x01bad, 0, NULL, NULL },
188: { 0x01bed, 0x01bed, 0, NULL, NULL },
189: { 0x01be8, 0x01be9, 0, NULL, NULL },
190: { 0x01bef, 0x01bf1, 0, NULL, NULL },
191: { 0x02329, 0x0232a, 2, NULL, NULL },
192: { 0x01dc0, 0x01df5, 0, NULL, NULL },
193: { 0x01ce2, 0x01ce8, 0, NULL, NULL },
194: { 0x01cd0, 0x01cd2, 0, NULL, NULL },
195: { 0x01c36, 0x01c37, 0, NULL, NULL },
196: { 0x01cd4, 0x01ce0, 0, NULL, NULL },
197: { 0x01cf4, 0x01cf4, 0, NULL, NULL },
198: { 0x01ced, 0x01ced, 0, NULL, NULL },
199: { 0x01cf8, 0x01cf9, 0, NULL, NULL },
200: { 0x02060, 0x02064, 0, NULL, NULL },
201: { 0x0200b, 0x0200f, 0, NULL, NULL },
202: { 0x01dfc, 0x01dff, 0, NULL, NULL },
203: { 0x0202a, 0x0202e, 0, NULL, NULL },
204: { 0x02066, 0x0206f, 0, NULL, NULL },
205: { 0x020d0, 0x020f0, 0, NULL, NULL },
206: { 0x03001, 0x03029, 2, NULL, NULL },
207: { 0x02e80, 0x02e99, 2, NULL, NULL },
208: { 0x02d7f, 0x02d7f, 0, NULL, NULL },
209: { 0x02cef, 0x02cf1, 0, NULL, NULL },
210: { 0x02de0, 0x02dff, 0, NULL, NULL },
211: { 0x02f00, 0x02fd5, 2, NULL, NULL },
212: { 0x02e9b, 0x02ef3, 2, NULL, NULL },
213: { 0x02ff0, 0x02ffb, 2, NULL, NULL },
1.1 nicm 214: { 0x03099, 0x0309a, 0, NULL, NULL },
1.14 schwarze 215: { 0x0302e, 0x0303e, 2, NULL, NULL },
216: { 0x0302a, 0x0302d, 0, NULL, NULL },
217: { 0x03041, 0x03096, 2, NULL, NULL },
218: { 0x03105, 0x0312d, 2, NULL, NULL },
219: { 0x0309b, 0x030ff, 2, NULL, NULL },
220: { 0x03131, 0x0318e, 2, NULL, NULL },
1.1 nicm 221: { 0x10a3f, 0x10a3f, 0, NULL, NULL },
1.14 schwarze 222: { 0x0aa4c, 0x0aa4c, 0, NULL, NULL },
1.1 nicm 223: { 0x0a825, 0x0a826, 0, NULL, NULL },
1.14 schwarze 224: { 0x0a490, 0x0a4c6, 2, NULL, NULL },
225: { 0x03250, 0x032fe, 2, NULL, NULL },
226: { 0x031f0, 0x0321e, 2, NULL, NULL },
227: { 0x031c0, 0x031e3, 2, NULL, NULL },
228: { 0x03220, 0x03247, 2, NULL, NULL },
229: { 0x04e00, 0x09fcc, 2, NULL, NULL },
230: { 0x03300, 0x04db5, 2, NULL, NULL },
231: { 0x0a000, 0x0a48c, 2, NULL, NULL },
232: { 0x0a6f0, 0x0a6f1, 0, NULL, NULL },
233: { 0x0a674, 0x0a67d, 0, NULL, NULL },
234: { 0x0a66f, 0x0a672, 0, NULL, NULL },
235: { 0x0a69f, 0x0a69f, 0, NULL, NULL },
236: { 0x0a806, 0x0a806, 0, NULL, NULL },
237: { 0x0a802, 0x0a802, 0, NULL, NULL },
238: { 0x0a80b, 0x0a80b, 0, NULL, NULL },
239: { 0x0a9b6, 0x0a9b9, 0, NULL, NULL },
240: { 0x0a947, 0x0a951, 0, NULL, NULL },
241: { 0x0a8e0, 0x0a8f1, 0, NULL, NULL },
242: { 0x0a8c4, 0x0a8c4, 0, NULL, NULL },
243: { 0x0a926, 0x0a92d, 0, NULL, NULL },
244: { 0x0a980, 0x0a982, 0, NULL, NULL },
245: { 0x0a960, 0x0a97c, 2, NULL, NULL },
246: { 0x0a9b3, 0x0a9b3, 0, NULL, NULL },
247: { 0x0aa29, 0x0aa2e, 0, NULL, NULL },
248: { 0x0a9bc, 0x0a9bc, 0, NULL, NULL },
249: { 0x0a9e5, 0x0a9e5, 0, NULL, NULL },
250: { 0x0aa35, 0x0aa36, 0, NULL, NULL },
251: { 0x0aa31, 0x0aa32, 0, NULL, NULL },
252: { 0x0aa43, 0x0aa43, 0, NULL, NULL },
253: { 0x0fb1e, 0x0fb1e, 0, NULL, NULL },
254: { 0x0aaf6, 0x0aaf6, 0, NULL, NULL },
255: { 0x0aab7, 0x0aab8, 0, NULL, NULL },
256: { 0x0aab0, 0x0aab0, 0, NULL, NULL },
257: { 0x0aa7c, 0x0aa7c, 0, NULL, NULL },
258: { 0x0aab2, 0x0aab4, 0, NULL, NULL },
259: { 0x0aac1, 0x0aac1, 0, NULL, NULL },
260: { 0x0aabe, 0x0aabf, 0, NULL, NULL },
261: { 0x0aaec, 0x0aaed, 0, NULL, NULL },
262: { 0x0ac00, 0x0d7a3, 2, NULL, NULL },
263: { 0x0abe8, 0x0abe8, 0, NULL, NULL },
264: { 0x0abe5, 0x0abe5, 0, NULL, NULL },
265: { 0x0abed, 0x0abed, 0, NULL, NULL },
266: { 0x0f900, 0x0fa6d, 2, NULL, NULL },
1.24 nicm 267: { 0x0d800, 0x0dfff, 0, NULL, NULL },
1.14 schwarze 268: { 0x0fa70, 0x0fad9, 2, NULL, NULL },
269: { 0x0fff9, 0x0fffb, 0, NULL, NULL },
270: { 0x0fe30, 0x0fe52, 2, NULL, NULL },
1.1 nicm 271: { 0x0fe10, 0x0fe19, 2, NULL, NULL },
1.14 schwarze 272: { 0x0fe00, 0x0fe0f, 0, NULL, NULL },
273: { 0x0fe20, 0x0fe2d, 0, NULL, NULL },
274: { 0x0fe68, 0x0fe6b, 2, NULL, NULL },
275: { 0x0fe54, 0x0fe66, 2, NULL, NULL },
276: { 0x0feff, 0x0feff, 0, NULL, NULL },
277: { 0x10a01, 0x10a03, 0, NULL, NULL },
278: { 0x102e0, 0x102e0, 0, NULL, NULL },
279: { 0x101fd, 0x101fd, 0, NULL, NULL },
280: { 0x10376, 0x1037a, 0, NULL, NULL },
281: { 0x10a0c, 0x10a0f, 0, NULL, NULL },
1.1 nicm 282: { 0x10a05, 0x10a06, 0, NULL, NULL },
1.14 schwarze 283: { 0x10a38, 0x10a3a, 0, NULL, NULL },
284: { 0x11633, 0x1163a, 0, NULL, NULL },
285: { 0x11236, 0x11237, 0, NULL, NULL },
286: { 0x11100, 0x11102, 0, NULL, NULL },
287: { 0x1107f, 0x11081, 0, NULL, NULL },
288: { 0x11001, 0x11001, 0, NULL, NULL },
289: { 0x10ae5, 0x10ae6, 0, NULL, NULL },
290: { 0x11038, 0x11046, 0, NULL, NULL },
291: { 0x110b9, 0x110ba, 0, NULL, NULL },
292: { 0x110b3, 0x110b6, 0, NULL, NULL },
293: { 0x110bd, 0x110bd, 0, NULL, NULL },
294: { 0x11180, 0x11181, 0, NULL, NULL },
295: { 0x1112d, 0x11134, 0, NULL, NULL },
296: { 0x11127, 0x1112b, 0, NULL, NULL },
297: { 0x11173, 0x11173, 0, NULL, NULL },
298: { 0x1122f, 0x11231, 0, NULL, NULL },
299: { 0x111b6, 0x111be, 0, NULL, NULL },
300: { 0x11234, 0x11234, 0, NULL, NULL },
301: { 0x11370, 0x11374, 0, NULL, NULL },
302: { 0x11301, 0x11301, 0, NULL, NULL },
303: { 0x112df, 0x112df, 0, NULL, NULL },
304: { 0x112e3, 0x112ea, 0, NULL, NULL },
305: { 0x11340, 0x11340, 0, NULL, NULL },
306: { 0x1133c, 0x1133c, 0, NULL, NULL },
307: { 0x11366, 0x1136c, 0, NULL, NULL },
308: { 0x114c2, 0x114c3, 0, NULL, NULL },
309: { 0x114ba, 0x114ba, 0, NULL, NULL },
310: { 0x114b3, 0x114b8, 0, NULL, NULL },
311: { 0x114bf, 0x114c0, 0, NULL, NULL },
312: { 0x115bc, 0x115bd, 0, NULL, NULL },
313: { 0x115b2, 0x115b5, 0, NULL, NULL },
314: { 0x115bf, 0x115c0, 0, NULL, NULL },
315: { 0x1d1aa, 0x1d1ad, 0, NULL, NULL },
316: { 0x16b30, 0x16b36, 0, NULL, NULL },
317: { 0x116ad, 0x116ad, 0, NULL, NULL },
318: { 0x1163f, 0x11640, 0, NULL, NULL },
319: { 0x1163d, 0x1163d, 0, NULL, NULL },
320: { 0x116ab, 0x116ab, 0, NULL, NULL },
321: { 0x116b7, 0x116b7, 0, NULL, NULL },
322: { 0x116b0, 0x116b5, 0, NULL, NULL },
323: { 0x16af0, 0x16af4, 0, NULL, NULL },
324: { 0x1bca0, 0x1bca3, 0, NULL, NULL },
325: { 0x1b000, 0x1b001, 2, NULL, NULL },
326: { 0x16f8f, 0x16f92, 0, NULL, NULL },
327: { 0x1bc9d, 0x1bc9e, 0, NULL, NULL },
328: { 0x1d173, 0x1d182, 0, NULL, NULL },
329: { 0x1d167, 0x1d169, 0, NULL, NULL },
330: { 0x1d185, 0x1d18b, 0, NULL, NULL },
331: { 0x2a700, 0x2b734, 2, NULL, NULL },
332: { 0x1f210, 0x1f23a, 2, NULL, NULL },
333: { 0x1e8d0, 0x1e8d6, 0, NULL, NULL },
334: { 0x1d242, 0x1d244, 0, NULL, NULL },
335: { 0x1f200, 0x1f202, 2, NULL, NULL },
336: { 0x1f250, 0x1f251, 2, NULL, NULL },
337: { 0x1f240, 0x1f248, 2, NULL, NULL },
338: { 0x20000, 0x2a6d6, 2, NULL, NULL },
339: { 0xe0020, 0xe007f, 0, NULL, NULL },
340: { 0x2f800, 0x2fa1d, 2, NULL, NULL },
341: { 0x2b740, 0x2b81d, 2, NULL, NULL },
342: { 0xe0001, 0xe0001, 0, NULL, NULL },
343: { 0xf0000, 0xffffd, 0, NULL, NULL },
344: { 0xe0100, 0xe01ef, 0, NULL, NULL },
345: { 0x100000, 0x10fffd, 0, NULL, NULL },
1.1 nicm 346: };
/* Root of the width lookup tree; stays NULL until utf8_build() has run. */
1.17 nicm 347: static struct utf8_width_entry *utf8_width_root = NULL;
1.1 nicm 348:
/* Links utf8_width_table into a tree; utf8_width() calls it on first use. */
1.17 nicm 349: static void utf8_build(void);
1.1 nicm 350:
1.11 nicm 351: /* Set a single character. */
352: void
1.19 nicm 353: utf8_set(struct utf8_data *ud, u_char ch)
1.11 nicm 354: {
1.20 nicm 355: u_int i;
356:
1.19 nicm 357: *ud->data = ch;
1.25 ! nicm 358: ud->have = 1;
1.19 nicm 359: ud->size = 1;
1.11 nicm 360:
1.19 nicm 361: ud->width = 1;
1.20 nicm 362:
363: for (i = ud->size; i < sizeof ud->data; i++)
364: ud->data[i] = '\0';
365: }
366:
367: /* Copy UTF-8 character. */
368: void
369: utf8_copy(struct utf8_data *to, const struct utf8_data *from)
370: {
371: u_int i;
372:
373: memcpy(to, from, sizeof *to);
374:
375: for (i = to->size; i < sizeof to->data; i++)
376: to->data[i] = '\0';
1.11 nicm 377: }
378:
1.4 nicm 379: /*
380: * Open UTF-8 sequence.
381: *
382: * 11000010-11011111 C2-DF start of 2-byte sequence
383: * 11100000-11101111 E0-EF start of 3-byte sequence
384: * 11110000-11110100 F0-F4 start of 4-byte sequence
385: */
1.23 nicm 386: enum utf8_state
1.19 nicm 387: utf8_open(struct utf8_data *ud, u_char ch)
1.4 nicm 388: {
1.19 nicm 389: memset(ud, 0, sizeof *ud);
1.4 nicm 390: if (ch >= 0xc2 && ch <= 0xdf)
1.19 nicm 391: ud->size = 2;
1.4 nicm 392: else if (ch >= 0xe0 && ch <= 0xef)
1.19 nicm 393: ud->size = 3;
1.4 nicm 394: else if (ch >= 0xf0 && ch <= 0xf4)
1.19 nicm 395: ud->size = 4;
1.4 nicm 396: else
1.23 nicm 397: return (UTF8_ERROR);
1.19 nicm 398: utf8_append(ud, ch);
1.23 nicm 399: return (UTF8_MORE);
1.4 nicm 400: }
401:
1.23 nicm 402: /* Append character to UTF-8, closing if finished. */
403: enum utf8_state
1.19 nicm 404: utf8_append(struct utf8_data *ud, u_char ch)
1.4 nicm 405: {
1.19 nicm 406: if (ud->have >= ud->size)
1.4 nicm 407: fatalx("UTF-8 character overflow");
1.19 nicm 408: if (ud->size > sizeof ud->data)
1.4 nicm 409: fatalx("UTF-8 character size too large");
410:
1.21 nicm 411: if (ud->have != 0 && (ch & 0xc0) != 0x80)
412: ud->width = 0xff;
413:
1.19 nicm 414: ud->data[ud->have++] = ch;
415: if (ud->have != ud->size)
1.23 nicm 416: return (UTF8_MORE);
1.4 nicm 417:
1.21 nicm 418: if (ud->width == 0xff)
1.23 nicm 419: return (UTF8_ERROR);
1.19 nicm 420: ud->width = utf8_width(utf8_combine(ud));
1.23 nicm 421: return (UTF8_DONE);
1.1 nicm 422: }
423:
1.4 nicm 424: /* Build UTF-8 width tree. */
1.17 nicm 425: static void
1.1 nicm 426: utf8_build(void)
427: {
428: struct utf8_width_entry **ptr, *item, *node;
1.17 nicm 429: u_int i;
1.1 nicm 430:
431: for (i = 0; i < nitems(utf8_width_table); i++) {
432: item = &utf8_width_table[i];
433:
434: ptr = &utf8_width_root;
435: while (*ptr != NULL) {
436: node = *ptr;
437: if (item->last < node->first)
1.13 nicm 438: ptr = &node->left;
1.1 nicm 439: else if (item->first > node->last)
1.13 nicm 440: ptr = &node->right;
1.1 nicm 441: }
442: *ptr = item;
443: }
444: }
445:
1.17 nicm 446: /* Lookup width of UTF-8 data in tree. */
447: u_int
448: utf8_width(u_int uc)
449: {
450: struct utf8_width_entry *item;
451:
452: if (utf8_width_root == NULL)
453: utf8_build();
454:
455: item = utf8_width_root;
456: while (item != NULL) {
457: if (uc < item->first)
458: item = item->left;
459: else if (uc > item->last)
460: item = item->right;
461: else
462: return (item->width);
463: }
464: return (1);
465: }
466:
1.4 nicm 467: /* Combine UTF-8 into 32-bit Unicode. */
1.1 nicm 468: u_int
1.19 nicm 469: utf8_combine(const struct utf8_data *ud)
1.1 nicm 470: {
1.22 nicm 471: u_int uc;
1.1 nicm 472:
1.22 nicm 473: uc = 0xfffd;
1.19 nicm 474: switch (ud->size) {
1.4 nicm 475: case 1:
1.22 nicm 476: uc = ud->data[0];
1.4 nicm 477: break;
478: case 2:
1.22 nicm 479: uc = ud->data[1] & 0x3f;
480: uc |= (ud->data[0] & 0x1f) << 6;
1.4 nicm 481: break;
482: case 3:
1.22 nicm 483: uc = ud->data[2] & 0x3f;
484: uc |= (ud->data[1] & 0x3f) << 6;
485: uc |= (ud->data[0] & 0xf) << 12;
1.4 nicm 486: break;
487: case 4:
1.22 nicm 488: uc = ud->data[3] & 0x3f;
489: uc |= (ud->data[2] & 0x3f) << 6;
490: uc |= (ud->data[1] & 0x3f) << 12;
491: uc |= (ud->data[0] & 0x7) << 18;
1.4 nicm 492: break;
1.1 nicm 493: }
1.22 nicm 494: return (uc);
1.15 nicm 495: }
496:
1.17 nicm 497: /* Split 32-bit Unicode into UTF-8. */
1.23 nicm 498: enum utf8_state
1.19 nicm 499: utf8_split(u_int uc, struct utf8_data *ud)
1.15 nicm 500: {
501: if (uc < 0x7f) {
1.19 nicm 502: ud->size = 1;
503: ud->data[0] = uc;
1.15 nicm 504: } else if (uc < 0x7ff) {
1.19 nicm 505: ud->size = 2;
506: ud->data[0] = 0xc0 | ((uc >> 6) & 0x1f);
507: ud->data[1] = 0x80 | (uc & 0x3f);
1.15 nicm 508: } else if (uc < 0xffff) {
1.19 nicm 509: ud->size = 3;
510: ud->data[0] = 0xe0 | ((uc >> 12) & 0xf);
511: ud->data[1] = 0x80 | ((uc >> 6) & 0x3f);
512: ud->data[2] = 0x80 | (uc & 0x3f);
1.15 nicm 513: } else if (uc < 0x1fffff) {
1.19 nicm 514: ud->size = 4;
515: ud->data[0] = 0xf0 | ((uc >> 18) & 0x7);
516: ud->data[1] = 0x80 | ((uc >> 12) & 0x3f);
517: ud->data[2] = 0x80 | ((uc >> 6) & 0x3f);
518: ud->data[3] = 0x80 | (uc & 0x3f);
1.15 nicm 519: } else
1.23 nicm 520: return (UTF8_ERROR);
1.19 nicm 521: ud->width = utf8_width(uc);
1.23 nicm 522: return (UTF8_DONE);
1.6 nicm 523: }
524:
/*
 * Write uc to ptr as UTF-8 using at most two bytes and return the number of
 * bytes stored: one for values up to 0x7f, two otherwise. No check is made
 * that uc actually fits in two bytes.
 */
u_int
utf8_split2(u_int uc, u_char *ptr)
{
	if (uc <= 0x7f) {
		ptr[0] = uc;
		return (1);
	}

	ptr[0] = 0xc0 | (uc >> 6);
	ptr[1] = 0x80 | (uc & 0x3f);
	return (2);
}
537:
538: /*
539: * Encode len characters from src into dst, which is guaranteed to have four
540: * bytes available for each character from src (for \abc or UTF-8) plus space
541: * for \0.
542: */
543: int
544: utf8_strvis(char *dst, const char *src, size_t len, int flag)
545: {
1.19 nicm 546: struct utf8_data ud;
1.9 nicm 547: const char *start, *end;
1.23 nicm 548: enum utf8_state more;
1.9 nicm 549: size_t i;
550:
551: start = dst;
552: end = src + len;
553:
554: while (src < end) {
1.23 nicm 555: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
556: while (++src < end && more == UTF8_MORE)
1.19 nicm 557: more = utf8_append(&ud, *src);
1.23 nicm 558: if (more == UTF8_DONE) {
1.9 nicm 559: /* UTF-8 character finished. */
1.19 nicm 560: for (i = 0; i < ud.size; i++)
561: *dst++ = ud.data[i];
1.9 nicm 562: continue;
563: }
1.23 nicm 564: /* Not a complete, valid UTF-8 character. */
565: src -= ud.have;
1.9 nicm 566: }
567: if (src < end - 1)
568: dst = vis(dst, src[0], flag, src[1]);
569: else if (src < end)
570: dst = vis(dst, src[0], flag, '\0');
571: src++;
572: }
573:
574: *dst = '\0';
575: return (dst - start);
1.16 nicm 576: }
577:
578: /*
579: * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
580: * the returned string. Anything not valid printable ASCII or UTF-8 is
581: * stripped.
582: */
583: char *
584: utf8_sanitize(const char *src)
585: {
586: char *dst;
587: size_t n;
1.23 nicm 588: enum utf8_state more;
1.19 nicm 589: struct utf8_data ud;
1.16 nicm 590: u_int i;
591:
592: dst = NULL;
593:
594: n = 0;
595: while (*src != '\0') {
596: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 597: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
598: while (*++src != '\0' && more == UTF8_MORE)
1.19 nicm 599: more = utf8_append(&ud, *src);
1.23 nicm 600: if (more == UTF8_DONE) {
1.19 nicm 601: dst = xreallocarray(dst, n + ud.width,
1.16 nicm 602: sizeof *dst);
1.19 nicm 603: for (i = 0; i < ud.width; i++)
1.16 nicm 604: dst[n++] = '_';
605: continue;
606: }
1.19 nicm 607: src -= ud.have;
1.16 nicm 608: }
609: if (*src > 0x1f && *src < 0x7f)
1.21 nicm 610: dst[n++] = *src;
1.23 nicm 611: else
612: dst[n++] = '_';
1.16 nicm 613: src++;
614: }
615:
616: dst = xreallocarray(dst, n + 1, sizeof *dst);
617: dst[n] = '\0';
618: return (dst);
1.11 nicm 619: }
620:
621: /*
622: * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
623: * Caller frees.
624: */
625: struct utf8_data *
626: utf8_fromcstr(const char *src)
627: {
628: struct utf8_data *dst;
629: size_t n;
1.23 nicm 630: enum utf8_state more;
1.11 nicm 631:
632: dst = NULL;
633:
634: n = 0;
635: while (*src != '\0') {
1.12 nicm 636: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 637: if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
638: while (*++src != '\0' && more == UTF8_MORE)
1.11 nicm 639: more = utf8_append(&dst[n], *src);
1.23 nicm 640: if (more == UTF8_DONE) {
1.11 nicm 641: n++;
642: continue;
643: }
644: src -= dst[n].have;
645: }
1.23 nicm 646: utf8_set(&dst[n], *src);
647: n++;
1.11 nicm 648: src++;
649: }
650:
1.12 nicm 651: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.11 nicm 652: dst[n].size = 0;
653: return (dst);
654: }
655:
656: /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
657: char *
658: utf8_tocstr(struct utf8_data *src)
659: {
660: char *dst;
661: size_t n;
662:
663: dst = NULL;
664:
665: n = 0;
666: for(; src->size != 0; src++) {
1.12 nicm 667: dst = xreallocarray(dst, n + src->size, 1);
1.11 nicm 668: memcpy(dst + n, src->data, src->size);
669: n += src->size;
670: }
671:
1.12 nicm 672: dst = xreallocarray(dst, n + 1, 1);
1.11 nicm 673: dst[n] = '\0';
674: return (dst);
675: }
676:
677: /* Get width of UTF-8 string. */
678: u_int
679: utf8_cstrwidth(const char *s)
680: {
681: struct utf8_data tmp;
682: u_int width;
1.23 nicm 683: enum utf8_state more;
1.11 nicm 684:
685: width = 0;
686: while (*s != '\0') {
1.23 nicm 687: if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
688: while (*++s != '\0' && more == UTF8_MORE)
1.11 nicm 689: more = utf8_append(&tmp, *s);
1.23 nicm 690: if (more == UTF8_DONE) {
1.11 nicm 691: width += tmp.width;
692: continue;
693: }
694: s -= tmp.have;
695: }
1.23 nicm 696: if (*s > 0x1f && *s != 0x7f)
1.21 nicm 697: width++;
1.11 nicm 698: s++;
699: }
700: return (width);
701: }
702:
703: /* Trim UTF-8 string to width. Caller frees. */
704: char *
705: utf8_trimcstr(const char *s, u_int width)
706: {
707: struct utf8_data *tmp, *next;
708: char *out;
709: u_int at;
710:
711: tmp = utf8_fromcstr(s);
712:
713: at = 0;
714: for (next = tmp; next->size != 0; next++) {
715: if (at + next->width > width) {
716: next->size = 0;
717: break;
718: }
719: at += next->width;
720: }
721:
722: out = utf8_tocstr(tmp);
723: free(tmp);
1.18 nicm 724: return (out);
725: }
726:
727: /* Pad UTF-8 string to width. Caller frees. */
728: char *
729: utf8_padcstr(const char *s, u_int width)
730: {
731: size_t slen;
732: char *out;
733: u_int n, i;
734:
735: n = utf8_cstrwidth(s);
736: if (n >= width)
737: return (xstrdup(s));
738:
739: slen = strlen(s);
740: out = xmalloc(slen + 1 + (width - n));
741: memcpy(out, s, slen);
742: for (i = n; i < width; i++)
743: out[slen++] = ' ';
744: out[slen] = '\0';
1.11 nicm 745: return (out);
1.1 nicm 746: }