Annotation of src/usr.bin/tmux/utf8.c, Revision 1.18
1.18 ! nicm 1: /* $OpenBSD: utf8.c,v 1.17 2015/11/12 12:19:57 nicm Exp $ */
1.1 nicm 2:
3: /*
4: * Copyright (c) 2008 Nicholas Marriott <nicm@users.sourceforge.net>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15: * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16: * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18:
19: #include <sys/types.h>
20:
1.11 nicm 21: #include <stdlib.h>
1.1 nicm 22: #include <string.h>
1.9 nicm 23: #include <vis.h>
1.1 nicm 24:
25: #include "tmux.h"
26:
/*
 * One node of the character width tree: an inclusive range of Unicode code
 * points, the width reported for that range, and the two child links.
 */
27: struct utf8_width_entry {
28: u_int first; /* lowest code point covered by this entry (inclusive) */
29: u_int last; /* highest code point covered by this entry (inclusive) */
30:
31: int width; /* value utf8_width() returns for code points in the range */
32:
33: struct utf8_width_entry *left; /* subtree of ranges below first */
34: struct utf8_width_entry *right; /* subtree of ranges above last */
35: };
36:
/*
 * Width table. Each entry is { first, last, width, left, right }; the two
 * links start out NULL and are filled in by utf8_build(), which inserts the
 * entries into a binary search tree in array order. The order below is
 * therefore significant: it determines the shape of the tree. Code points not
 * covered by any entry get the default width of 1 from utf8_width().
 */
1.14 schwarze 37: /* Sorted, then repeatedly split in the middle to balance the tree. */
1.17 nicm 38: static struct utf8_width_entry utf8_width_table[] = {
1.14 schwarze 39: { 0x00b41, 0x00b44, 0, NULL, NULL },
40: { 0x008e4, 0x00902, 0, NULL, NULL },
41: { 0x006d6, 0x006dd, 0, NULL, NULL },
42: { 0x005c4, 0x005c5, 0, NULL, NULL },
1.1 nicm 43: { 0x00591, 0x005bd, 0, NULL, NULL },
1.14 schwarze 44: { 0x00300, 0x0036f, 0, NULL, NULL },
45: { 0x00483, 0x00489, 0, NULL, NULL },
1.1 nicm 46: { 0x005bf, 0x005bf, 0, NULL, NULL },
1.14 schwarze 47: { 0x005c1, 0x005c2, 0, NULL, NULL },
48: { 0x00610, 0x0061a, 0, NULL, NULL },
49: { 0x00600, 0x00605, 0, NULL, NULL },
50: { 0x005c7, 0x005c7, 0, NULL, NULL },
51: { 0x0064b, 0x0065f, 0, NULL, NULL },
52: { 0x0061c, 0x0061c, 0, NULL, NULL },
53: { 0x00670, 0x00670, 0, NULL, NULL },
54: { 0x007a6, 0x007b0, 0, NULL, NULL },
1.1 nicm 55: { 0x006ea, 0x006ed, 0, NULL, NULL },
1.14 schwarze 56: { 0x006df, 0x006e4, 0, NULL, NULL },
57: { 0x006e7, 0x006e8, 0, NULL, NULL },
58: { 0x00711, 0x00711, 0, NULL, NULL },
59: { 0x0070f, 0x0070f, 0, NULL, NULL },
60: { 0x00730, 0x0074a, 0, NULL, NULL },
61: { 0x0081b, 0x00823, 0, NULL, NULL },
62: { 0x007eb, 0x007f3, 0, NULL, NULL },
63: { 0x00816, 0x00819, 0, NULL, NULL },
64: { 0x00829, 0x0082d, 0, NULL, NULL },
65: { 0x00825, 0x00827, 0, NULL, NULL },
66: { 0x00859, 0x0085b, 0, NULL, NULL },
67: { 0x00a41, 0x00a42, 0, NULL, NULL },
68: { 0x00981, 0x00981, 0, NULL, NULL },
69: { 0x00941, 0x00948, 0, NULL, NULL },
70: { 0x0093a, 0x0093a, 0, NULL, NULL },
71: { 0x0093c, 0x0093c, 0, NULL, NULL },
72: { 0x00951, 0x00957, 0, NULL, NULL },
73: { 0x0094d, 0x0094d, 0, NULL, NULL },
1.1 nicm 74: { 0x00962, 0x00963, 0, NULL, NULL },
1.14 schwarze 75: { 0x009e2, 0x009e3, 0, NULL, NULL },
76: { 0x009c1, 0x009c4, 0, NULL, NULL },
77: { 0x009bc, 0x009bc, 0, NULL, NULL },
78: { 0x009cd, 0x009cd, 0, NULL, NULL },
79: { 0x00a01, 0x00a02, 0, NULL, NULL },
80: { 0x00a3c, 0x00a3c, 0, NULL, NULL },
81: { 0x00ac1, 0x00ac5, 0, NULL, NULL },
82: { 0x00a70, 0x00a71, 0, NULL, NULL },
83: { 0x00a4b, 0x00a4d, 0, NULL, NULL },
84: { 0x00a47, 0x00a48, 0, NULL, NULL },
85: { 0x00a51, 0x00a51, 0, NULL, NULL },
86: { 0x00a81, 0x00a82, 0, NULL, NULL },
87: { 0x00a75, 0x00a75, 0, NULL, NULL },
88: { 0x00abc, 0x00abc, 0, NULL, NULL },
89: { 0x00ae2, 0x00ae3, 0, NULL, NULL },
90: { 0x00ac7, 0x00ac8, 0, NULL, NULL },
91: { 0x00acd, 0x00acd, 0, NULL, NULL },
92: { 0x00b3c, 0x00b3c, 0, NULL, NULL },
1.1 nicm 93: { 0x00b01, 0x00b01, 0, NULL, NULL },
1.14 schwarze 94: { 0x00b3f, 0x00b3f, 0, NULL, NULL },
95: { 0x03190, 0x031ba, 2, NULL, NULL },
96: { 0x017c9, 0x017d3, 0, NULL, NULL },
97: { 0x00ec8, 0x00ecd, 0, NULL, NULL },
98: { 0x00cc6, 0x00cc6, 0, NULL, NULL },
99: { 0x00c3e, 0x00c40, 0, NULL, NULL },
100: { 0x00b82, 0x00b82, 0, NULL, NULL },
101: { 0x00b56, 0x00b56, 0, NULL, NULL },
102: { 0x00b4d, 0x00b4d, 0, NULL, NULL },
103: { 0x00b62, 0x00b63, 0, NULL, NULL },
104: { 0x00bcd, 0x00bcd, 0, NULL, NULL },
105: { 0x00bc0, 0x00bc0, 0, NULL, NULL },
106: { 0x00c00, 0x00c00, 0, NULL, NULL },
107: { 0x00c62, 0x00c63, 0, NULL, NULL },
1.1 nicm 108: { 0x00c4a, 0x00c4d, 0, NULL, NULL },
1.14 schwarze 109: { 0x00c46, 0x00c48, 0, NULL, NULL },
110: { 0x00c55, 0x00c56, 0, NULL, NULL },
111: { 0x00cbc, 0x00cbc, 0, NULL, NULL },
112: { 0x00c81, 0x00c81, 0, NULL, NULL },
1.1 nicm 113: { 0x00cbf, 0x00cbf, 0, NULL, NULL },
114: { 0x00dd2, 0x00dd4, 0, NULL, NULL },
1.14 schwarze 115: { 0x00d41, 0x00d44, 0, NULL, NULL },
116: { 0x00ce2, 0x00ce3, 0, NULL, NULL },
117: { 0x00ccc, 0x00ccd, 0, NULL, NULL },
118: { 0x00d01, 0x00d01, 0, NULL, NULL },
119: { 0x00d62, 0x00d63, 0, NULL, NULL },
120: { 0x00d4d, 0x00d4d, 0, NULL, NULL },
121: { 0x00dca, 0x00dca, 0, NULL, NULL },
122: { 0x00e47, 0x00e4e, 0, NULL, NULL },
123: { 0x00e31, 0x00e31, 0, NULL, NULL },
124: { 0x00dd6, 0x00dd6, 0, NULL, NULL },
125: { 0x00e34, 0x00e3a, 0, NULL, NULL },
126: { 0x00eb4, 0x00eb9, 0, NULL, NULL },
127: { 0x00eb1, 0x00eb1, 0, NULL, NULL },
128: { 0x00ebb, 0x00ebc, 0, NULL, NULL },
129: { 0x0105e, 0x01060, 0, NULL, NULL },
130: { 0x00f8d, 0x00f97, 0, NULL, NULL },
131: { 0x00f39, 0x00f39, 0, NULL, NULL },
132: { 0x00f35, 0x00f35, 0, NULL, NULL },
133: { 0x00f18, 0x00f19, 0, NULL, NULL },
134: { 0x00f37, 0x00f37, 0, NULL, NULL },
135: { 0x00f80, 0x00f84, 0, NULL, NULL },
1.1 nicm 136: { 0x00f71, 0x00f7e, 0, NULL, NULL },
1.14 schwarze 137: { 0x00f86, 0x00f87, 0, NULL, NULL },
138: { 0x01032, 0x01037, 0, NULL, NULL },
139: { 0x00fc6, 0x00fc6, 0, NULL, NULL },
140: { 0x00f99, 0x00fbc, 0, NULL, NULL },
141: { 0x0102d, 0x01030, 0, NULL, NULL },
142: { 0x0103d, 0x0103e, 0, NULL, NULL },
143: { 0x01039, 0x0103a, 0, NULL, NULL },
144: { 0x01058, 0x01059, 0, NULL, NULL },
145: { 0x0135d, 0x0135f, 0, NULL, NULL },
146: { 0x01085, 0x01086, 0, NULL, NULL },
147: { 0x01071, 0x01074, 0, NULL, NULL },
148: { 0x01082, 0x01082, 0, NULL, NULL },
149: { 0x0109d, 0x0109d, 0, NULL, NULL },
150: { 0x0108d, 0x0108d, 0, NULL, NULL },
151: { 0x01100, 0x011ff, 2, NULL, NULL },
152: { 0x01772, 0x01773, 0, NULL, NULL },
153: { 0x01732, 0x01734, 0, NULL, NULL },
154: { 0x01712, 0x01714, 0, NULL, NULL },
1.1 nicm 155: { 0x01752, 0x01753, 0, NULL, NULL },
1.14 schwarze 156: { 0x017b7, 0x017bd, 0, NULL, NULL },
157: { 0x017b4, 0x017b5, 0, NULL, NULL },
158: { 0x017c6, 0x017c6, 0, NULL, NULL },
159: { 0x01c2c, 0x01c33, 0, NULL, NULL },
160: { 0x01a7f, 0x01a7f, 0, NULL, NULL },
161: { 0x01a17, 0x01a18, 0, NULL, NULL },
162: { 0x01920, 0x01922, 0, NULL, NULL },
163: { 0x0180b, 0x0180e, 0, NULL, NULL },
1.1 nicm 164: { 0x017dd, 0x017dd, 0, NULL, NULL },
1.14 schwarze 165: { 0x018a9, 0x018a9, 0, NULL, NULL },
166: { 0x01932, 0x01932, 0, NULL, NULL },
1.1 nicm 167: { 0x01927, 0x01928, 0, NULL, NULL },
168: { 0x01939, 0x0193b, 0, NULL, NULL },
1.14 schwarze 169: { 0x01a60, 0x01a60, 0, NULL, NULL },
170: { 0x01a56, 0x01a56, 0, NULL, NULL },
171: { 0x01a1b, 0x01a1b, 0, NULL, NULL },
172: { 0x01a58, 0x01a5e, 0, NULL, NULL },
173: { 0x01a65, 0x01a6c, 0, NULL, NULL },
174: { 0x01a62, 0x01a62, 0, NULL, NULL },
175: { 0x01a73, 0x01a7c, 0, NULL, NULL },
176: { 0x01b80, 0x01b81, 0, NULL, NULL },
177: { 0x01b36, 0x01b3a, 0, NULL, NULL },
178: { 0x01b00, 0x01b03, 0, NULL, NULL },
179: { 0x01ab0, 0x01abe, 0, NULL, NULL },
180: { 0x01b34, 0x01b34, 0, NULL, NULL },
181: { 0x01b42, 0x01b42, 0, NULL, NULL },
1.1 nicm 182: { 0x01b3c, 0x01b3c, 0, NULL, NULL },
1.14 schwarze 183: { 0x01b6b, 0x01b73, 0, NULL, NULL },
184: { 0x01be6, 0x01be6, 0, NULL, NULL },
185: { 0x01ba8, 0x01ba9, 0, NULL, NULL },
186: { 0x01ba2, 0x01ba5, 0, NULL, NULL },
187: { 0x01bab, 0x01bad, 0, NULL, NULL },
188: { 0x01bed, 0x01bed, 0, NULL, NULL },
189: { 0x01be8, 0x01be9, 0, NULL, NULL },
190: { 0x01bef, 0x01bf1, 0, NULL, NULL },
191: { 0x02329, 0x0232a, 2, NULL, NULL },
192: { 0x01dc0, 0x01df5, 0, NULL, NULL },
193: { 0x01ce2, 0x01ce8, 0, NULL, NULL },
194: { 0x01cd0, 0x01cd2, 0, NULL, NULL },
195: { 0x01c36, 0x01c37, 0, NULL, NULL },
196: { 0x01cd4, 0x01ce0, 0, NULL, NULL },
197: { 0x01cf4, 0x01cf4, 0, NULL, NULL },
198: { 0x01ced, 0x01ced, 0, NULL, NULL },
199: { 0x01cf8, 0x01cf9, 0, NULL, NULL },
200: { 0x02060, 0x02064, 0, NULL, NULL },
201: { 0x0200b, 0x0200f, 0, NULL, NULL },
202: { 0x01dfc, 0x01dff, 0, NULL, NULL },
203: { 0x0202a, 0x0202e, 0, NULL, NULL },
204: { 0x02066, 0x0206f, 0, NULL, NULL },
205: { 0x020d0, 0x020f0, 0, NULL, NULL },
206: { 0x03001, 0x03029, 2, NULL, NULL },
207: { 0x02e80, 0x02e99, 2, NULL, NULL },
208: { 0x02d7f, 0x02d7f, 0, NULL, NULL },
209: { 0x02cef, 0x02cf1, 0, NULL, NULL },
210: { 0x02de0, 0x02dff, 0, NULL, NULL },
211: { 0x02f00, 0x02fd5, 2, NULL, NULL },
212: { 0x02e9b, 0x02ef3, 2, NULL, NULL },
213: { 0x02ff0, 0x02ffb, 2, NULL, NULL },
1.1 nicm 214: { 0x03099, 0x0309a, 0, NULL, NULL },
1.14 schwarze 215: { 0x0302e, 0x0303e, 2, NULL, NULL },
216: { 0x0302a, 0x0302d, 0, NULL, NULL },
217: { 0x03041, 0x03096, 2, NULL, NULL },
218: { 0x03105, 0x0312d, 2, NULL, NULL },
219: { 0x0309b, 0x030ff, 2, NULL, NULL },
220: { 0x03131, 0x0318e, 2, NULL, NULL },
1.1 nicm 221: { 0x10a3f, 0x10a3f, 0, NULL, NULL },
1.14 schwarze 222: { 0x0aa4c, 0x0aa4c, 0, NULL, NULL },
1.1 nicm 223: { 0x0a825, 0x0a826, 0, NULL, NULL },
1.14 schwarze 224: { 0x0a490, 0x0a4c6, 2, NULL, NULL },
225: { 0x03250, 0x032fe, 2, NULL, NULL },
226: { 0x031f0, 0x0321e, 2, NULL, NULL },
227: { 0x031c0, 0x031e3, 2, NULL, NULL },
228: { 0x03220, 0x03247, 2, NULL, NULL },
229: { 0x04e00, 0x09fcc, 2, NULL, NULL },
230: { 0x03300, 0x04db5, 2, NULL, NULL },
231: { 0x0a000, 0x0a48c, 2, NULL, NULL },
232: { 0x0a6f0, 0x0a6f1, 0, NULL, NULL },
233: { 0x0a674, 0x0a67d, 0, NULL, NULL },
234: { 0x0a66f, 0x0a672, 0, NULL, NULL },
235: { 0x0a69f, 0x0a69f, 0, NULL, NULL },
236: { 0x0a806, 0x0a806, 0, NULL, NULL },
237: { 0x0a802, 0x0a802, 0, NULL, NULL },
238: { 0x0a80b, 0x0a80b, 0, NULL, NULL },
239: { 0x0a9b6, 0x0a9b9, 0, NULL, NULL },
240: { 0x0a947, 0x0a951, 0, NULL, NULL },
241: { 0x0a8e0, 0x0a8f1, 0, NULL, NULL },
242: { 0x0a8c4, 0x0a8c4, 0, NULL, NULL },
243: { 0x0a926, 0x0a92d, 0, NULL, NULL },
244: { 0x0a980, 0x0a982, 0, NULL, NULL },
245: { 0x0a960, 0x0a97c, 2, NULL, NULL },
246: { 0x0a9b3, 0x0a9b3, 0, NULL, NULL },
247: { 0x0aa29, 0x0aa2e, 0, NULL, NULL },
248: { 0x0a9bc, 0x0a9bc, 0, NULL, NULL },
249: { 0x0a9e5, 0x0a9e5, 0, NULL, NULL },
250: { 0x0aa35, 0x0aa36, 0, NULL, NULL },
251: { 0x0aa31, 0x0aa32, 0, NULL, NULL },
252: { 0x0aa43, 0x0aa43, 0, NULL, NULL },
253: { 0x0fb1e, 0x0fb1e, 0, NULL, NULL },
254: { 0x0aaf6, 0x0aaf6, 0, NULL, NULL },
255: { 0x0aab7, 0x0aab8, 0, NULL, NULL },
256: { 0x0aab0, 0x0aab0, 0, NULL, NULL },
257: { 0x0aa7c, 0x0aa7c, 0, NULL, NULL },
258: { 0x0aab2, 0x0aab4, 0, NULL, NULL },
259: { 0x0aac1, 0x0aac1, 0, NULL, NULL },
260: { 0x0aabe, 0x0aabf, 0, NULL, NULL },
261: { 0x0aaec, 0x0aaed, 0, NULL, NULL },
262: { 0x0ac00, 0x0d7a3, 2, NULL, NULL },
263: { 0x0abe8, 0x0abe8, 0, NULL, NULL },
264: { 0x0abe5, 0x0abe5, 0, NULL, NULL },
265: { 0x0abed, 0x0abed, 0, NULL, NULL },
266: { 0x0f900, 0x0fa6d, 2, NULL, NULL },
267: { 0x0d800, 0x0f8ff, 0, NULL, NULL },
268: { 0x0fa70, 0x0fad9, 2, NULL, NULL },
269: { 0x0fff9, 0x0fffb, 0, NULL, NULL },
270: { 0x0fe30, 0x0fe52, 2, NULL, NULL },
1.1 nicm 271: { 0x0fe10, 0x0fe19, 2, NULL, NULL },
1.14 schwarze 272: { 0x0fe00, 0x0fe0f, 0, NULL, NULL },
273: { 0x0fe20, 0x0fe2d, 0, NULL, NULL },
274: { 0x0fe68, 0x0fe6b, 2, NULL, NULL },
275: { 0x0fe54, 0x0fe66, 2, NULL, NULL },
276: { 0x0feff, 0x0feff, 0, NULL, NULL },
277: { 0x10a01, 0x10a03, 0, NULL, NULL },
278: { 0x102e0, 0x102e0, 0, NULL, NULL },
279: { 0x101fd, 0x101fd, 0, NULL, NULL },
280: { 0x10376, 0x1037a, 0, NULL, NULL },
281: { 0x10a0c, 0x10a0f, 0, NULL, NULL },
1.1 nicm 282: { 0x10a05, 0x10a06, 0, NULL, NULL },
1.14 schwarze 283: { 0x10a38, 0x10a3a, 0, NULL, NULL },
284: { 0x11633, 0x1163a, 0, NULL, NULL },
285: { 0x11236, 0x11237, 0, NULL, NULL },
286: { 0x11100, 0x11102, 0, NULL, NULL },
287: { 0x1107f, 0x11081, 0, NULL, NULL },
288: { 0x11001, 0x11001, 0, NULL, NULL },
289: { 0x10ae5, 0x10ae6, 0, NULL, NULL },
290: { 0x11038, 0x11046, 0, NULL, NULL },
291: { 0x110b9, 0x110ba, 0, NULL, NULL },
292: { 0x110b3, 0x110b6, 0, NULL, NULL },
293: { 0x110bd, 0x110bd, 0, NULL, NULL },
294: { 0x11180, 0x11181, 0, NULL, NULL },
295: { 0x1112d, 0x11134, 0, NULL, NULL },
296: { 0x11127, 0x1112b, 0, NULL, NULL },
297: { 0x11173, 0x11173, 0, NULL, NULL },
298: { 0x1122f, 0x11231, 0, NULL, NULL },
299: { 0x111b6, 0x111be, 0, NULL, NULL },
300: { 0x11234, 0x11234, 0, NULL, NULL },
301: { 0x11370, 0x11374, 0, NULL, NULL },
302: { 0x11301, 0x11301, 0, NULL, NULL },
303: { 0x112df, 0x112df, 0, NULL, NULL },
304: { 0x112e3, 0x112ea, 0, NULL, NULL },
305: { 0x11340, 0x11340, 0, NULL, NULL },
306: { 0x1133c, 0x1133c, 0, NULL, NULL },
307: { 0x11366, 0x1136c, 0, NULL, NULL },
308: { 0x114c2, 0x114c3, 0, NULL, NULL },
309: { 0x114ba, 0x114ba, 0, NULL, NULL },
310: { 0x114b3, 0x114b8, 0, NULL, NULL },
311: { 0x114bf, 0x114c0, 0, NULL, NULL },
312: { 0x115bc, 0x115bd, 0, NULL, NULL },
313: { 0x115b2, 0x115b5, 0, NULL, NULL },
314: { 0x115bf, 0x115c0, 0, NULL, NULL },
315: { 0x1d1aa, 0x1d1ad, 0, NULL, NULL },
316: { 0x16b30, 0x16b36, 0, NULL, NULL },
317: { 0x116ad, 0x116ad, 0, NULL, NULL },
318: { 0x1163f, 0x11640, 0, NULL, NULL },
319: { 0x1163d, 0x1163d, 0, NULL, NULL },
320: { 0x116ab, 0x116ab, 0, NULL, NULL },
321: { 0x116b7, 0x116b7, 0, NULL, NULL },
322: { 0x116b0, 0x116b5, 0, NULL, NULL },
323: { 0x16af0, 0x16af4, 0, NULL, NULL },
324: { 0x1bca0, 0x1bca3, 0, NULL, NULL },
325: { 0x1b000, 0x1b001, 2, NULL, NULL },
326: { 0x16f8f, 0x16f92, 0, NULL, NULL },
327: { 0x1bc9d, 0x1bc9e, 0, NULL, NULL },
328: { 0x1d173, 0x1d182, 0, NULL, NULL },
329: { 0x1d167, 0x1d169, 0, NULL, NULL },
330: { 0x1d185, 0x1d18b, 0, NULL, NULL },
331: { 0x2a700, 0x2b734, 2, NULL, NULL },
332: { 0x1f210, 0x1f23a, 2, NULL, NULL },
333: { 0x1e8d0, 0x1e8d6, 0, NULL, NULL },
334: { 0x1d242, 0x1d244, 0, NULL, NULL },
335: { 0x1f200, 0x1f202, 2, NULL, NULL },
336: { 0x1f250, 0x1f251, 2, NULL, NULL },
337: { 0x1f240, 0x1f248, 2, NULL, NULL },
338: { 0x20000, 0x2a6d6, 2, NULL, NULL },
339: { 0xe0020, 0xe007f, 0, NULL, NULL },
340: { 0x2f800, 0x2fa1d, 2, NULL, NULL },
341: { 0x2b740, 0x2b81d, 2, NULL, NULL },
342: { 0xe0001, 0xe0001, 0, NULL, NULL },
343: { 0xf0000, 0xffffd, 0, NULL, NULL },
344: { 0xe0100, 0xe01ef, 0, NULL, NULL },
345: { 0x100000, 0x10fffd, 0, NULL, NULL },
1.1 nicm 346: };
/* Root of the width tree; stays NULL until utf8_width() first calls utf8_build(). */
1.17 nicm 347: static struct utf8_width_entry *utf8_width_root = NULL;
1.1 nicm 348:
/* Links every utf8_width_table entry into the tree rooted at utf8_width_root. */
1.17 nicm 349: static void utf8_build(void);
1.1 nicm 350:
1.11 nicm 351: /* Set a single character. */
352: void
353: utf8_set(struct utf8_data *utf8data, u_char ch)
354: {
355: *utf8data->data = ch;
356: utf8data->size = 1;
357:
358: utf8data->width = 1;
359: }
360:
1.4 nicm 361: /*
362: * Open UTF-8 sequence.
363: *
364: * 11000010-11011111 C2-DF start of 2-byte sequence
365: * 11100000-11101111 E0-EF start of 3-byte sequence
366: * 11110000-11110100 F0-F4 start of 4-byte sequence
367: *
368: * Returns 1 if more UTF-8 to come, 0 if not UTF-8.
369: */
370: int
371: utf8_open(struct utf8_data *utf8data, u_char ch)
372: {
373: memset(utf8data, 0, sizeof *utf8data);
374: if (ch >= 0xc2 && ch <= 0xdf)
375: utf8data->size = 2;
376: else if (ch >= 0xe0 && ch <= 0xef)
377: utf8data->size = 3;
378: else if (ch >= 0xf0 && ch <= 0xf4)
379: utf8data->size = 4;
380: else
381: return (0);
382: utf8_append(utf8data, ch);
383: return (1);
384: }
385:
386: /*
387: * Append character to UTF-8, closing if finished.
388: *
1.5 nicm 389: * Returns 1 if more UTF-8 data to come, 0 if finished.
1.4 nicm 390: */
391: int
392: utf8_append(struct utf8_data *utf8data, u_char ch)
393: {
1.15 nicm 394: /* XXX this should do validity checks too! */
395:
1.4 nicm 396: if (utf8data->have >= utf8data->size)
397: fatalx("UTF-8 character overflow");
398: if (utf8data->size > sizeof utf8data->data)
399: fatalx("UTF-8 character size too large");
400:
401: utf8data->data[utf8data->have++] = ch;
402: if (utf8data->have != utf8data->size)
403: return (1);
404:
1.17 nicm 405: utf8data->width = utf8_width(utf8_combine(utf8data));
1.1 nicm 406: return (0);
407: }
408:
1.4 nicm 409: /* Build UTF-8 width tree. */
1.17 nicm 410: static void
1.1 nicm 411: utf8_build(void)
412: {
413: struct utf8_width_entry **ptr, *item, *node;
1.17 nicm 414: u_int i;
1.1 nicm 415:
416: for (i = 0; i < nitems(utf8_width_table); i++) {
417: item = &utf8_width_table[i];
418:
419: ptr = &utf8_width_root;
420: while (*ptr != NULL) {
421: node = *ptr;
422: if (item->last < node->first)
1.13 nicm 423: ptr = &node->left;
1.1 nicm 424: else if (item->first > node->last)
1.13 nicm 425: ptr = &node->right;
1.1 nicm 426: }
427: *ptr = item;
428: }
429: }
430:
1.17 nicm 431: /* Lookup width of UTF-8 data in tree. */
432: u_int
433: utf8_width(u_int uc)
434: {
435: struct utf8_width_entry *item;
436:
437: if (utf8_width_root == NULL)
438: utf8_build();
439:
440: item = utf8_width_root;
441: while (item != NULL) {
442: if (uc < item->first)
443: item = item->left;
444: else if (uc > item->last)
445: item = item->right;
446: else
447: return (item->width);
448: }
449: return (1);
450: }
451:
1.4 nicm 452: /* Combine UTF-8 into 32-bit Unicode. */
1.1 nicm 453: u_int
1.4 nicm 454: utf8_combine(const struct utf8_data *utf8data)
1.1 nicm 455: {
1.4 nicm 456: u_int value;
1.1 nicm 457:
1.4 nicm 458: value = 0xff;
459: switch (utf8data->size) {
460: case 1:
461: value = utf8data->data[0];
462: break;
463: case 2:
464: value = utf8data->data[1] & 0x3f;
465: value |= (utf8data->data[0] & 0x1f) << 6;
466: break;
467: case 3:
468: value = utf8data->data[2] & 0x3f;
469: value |= (utf8data->data[1] & 0x3f) << 6;
1.15 nicm 470: value |= (utf8data->data[0] & 0xf) << 12;
1.4 nicm 471: break;
472: case 4:
473: value = utf8data->data[3] & 0x3f;
474: value |= (utf8data->data[2] & 0x3f) << 6;
475: value |= (utf8data->data[1] & 0x3f) << 12;
1.15 nicm 476: value |= (utf8data->data[0] & 0x7) << 18;
1.4 nicm 477: break;
1.1 nicm 478: }
1.4 nicm 479: return (value);
1.15 nicm 480: }
481:
1.17 nicm 482: /* Split 32-bit Unicode into UTF-8. */
1.15 nicm 483: int
484: utf8_split(u_int uc, struct utf8_data *utf8data)
485: {
486: if (uc < 0x7f) {
487: utf8data->size = 1;
488: utf8data->data[0] = uc;
489: } else if (uc < 0x7ff) {
490: utf8data->size = 2;
491: utf8data->data[0] = 0xc0 | ((uc >> 6) & 0x1f);
492: utf8data->data[1] = 0x80 | (uc & 0x3f);
493: } else if (uc < 0xffff) {
494: utf8data->size = 3;
495: utf8data->data[0] = 0xe0 | ((uc >> 12) & 0xf);
496: utf8data->data[1] = 0x80 | ((uc >> 6) & 0x3f);
497: utf8data->data[2] = 0x80 | (uc & 0x3f);
498: } else if (uc < 0x1fffff) {
499: utf8data->size = 4;
500: utf8data->data[0] = 0xf0 | ((uc >> 18) & 0x7);
501: utf8data->data[1] = 0x80 | ((uc >> 12) & 0x3f);
502: utf8data->data[2] = 0x80 | ((uc >> 6) & 0x3f);
503: utf8data->data[3] = 0x80 | (uc & 0x3f);
504: } else
505: return (-1);
1.17 nicm 506: utf8data->width = utf8_width(uc);
1.15 nicm 507: return (0);
1.6 nicm 508: }
509:
/*
 * Encode a value as at most two UTF-8 bytes, written to ptr. Values up to
 * 0x7f are stored as a single byte; anything larger is written as a lead byte
 * and one continuation byte. Returns the number of bytes written (1 or 2).
 */
u_int
utf8_split2(u_int uc, u_char *ptr)
{
	if (uc <= 0x7f) {
		ptr[0] = uc;
		return (1);
	}

	ptr[0] = 0xc0 | (uc >> 6);
	ptr[1] = 0x80 | (uc & 0x3f);
	return (2);
}
522:
523: /*
524: * Encode len characters from src into dst, which is guaranteed to have four
525: * bytes available for each character from src (for \abc or UTF-8) plus space
526: * for \0.
527: */
528: int
529: utf8_strvis(char *dst, const char *src, size_t len, int flag)
530: {
531: struct utf8_data utf8data;
532: const char *start, *end;
533: int more;
534: size_t i;
535:
536: start = dst;
537: end = src + len;
538:
539: while (src < end) {
540: if (utf8_open(&utf8data, *src)) {
541: more = 1;
542: while (++src < end && more)
543: more = utf8_append(&utf8data, *src);
544: if (!more) {
545: /* UTF-8 character finished. */
546: for (i = 0; i < utf8data.size; i++)
547: *dst++ = utf8data.data[i];
548: continue;
549: } else if (utf8data.have > 0) {
550: /* Not a complete UTF-8 character. */
551: src -= utf8data.have;
552: }
553: }
554: if (src < end - 1)
555: dst = vis(dst, src[0], flag, src[1]);
556: else if (src < end)
557: dst = vis(dst, src[0], flag, '\0');
558: src++;
559: }
560:
561: *dst = '\0';
562: return (dst - start);
1.16 nicm 563: }
564:
565: /*
566: * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
567: * the returned string. Anything not valid printable ASCII or UTF-8 is
568: * stripped.
569: */
570: char *
571: utf8_sanitize(const char *src)
572: {
573: char *dst;
574: size_t n;
575: int more;
576: struct utf8_data utf8data;
577: u_int i;
578:
579: dst = NULL;
580:
581: n = 0;
582: while (*src != '\0') {
583: dst = xreallocarray(dst, n + 1, sizeof *dst);
584: if (utf8_open(&utf8data, *src)) {
585: more = 1;
586: while (*++src != '\0' && more)
587: more = utf8_append(&utf8data, *src);
588: if (!more) {
589: dst = xreallocarray(dst, n + utf8data.width,
590: sizeof *dst);
591: for (i = 0; i < utf8data.width; i++)
592: dst[n++] = '_';
593: continue;
594: }
595: src -= utf8data.have;
596: }
597: if (*src > 0x1f && *src < 0x7f)
598: dst[n] = *src;
599: src++;
600:
601: n++;
602: }
603:
604: dst = xreallocarray(dst, n + 1, sizeof *dst);
605: dst[n] = '\0';
606: return (dst);
1.11 nicm 607: }
608:
609: /*
610: * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
611: * Caller frees.
612: */
613: struct utf8_data *
614: utf8_fromcstr(const char *src)
615: {
616: struct utf8_data *dst;
617: size_t n;
618: int more;
619:
620: dst = NULL;
621:
622: n = 0;
623: while (*src != '\0') {
1.12 nicm 624: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.11 nicm 625: if (utf8_open(&dst[n], *src)) {
626: more = 1;
627: while (*++src != '\0' && more)
628: more = utf8_append(&dst[n], *src);
629: if (!more) {
630: n++;
631: continue;
632: }
633: src -= dst[n].have;
634: }
635: utf8_set(&dst[n], *src);
636: src++;
637:
638: n++;
639: }
640:
1.12 nicm 641: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.11 nicm 642: dst[n].size = 0;
643: return (dst);
644: }
645:
646: /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
647: char *
648: utf8_tocstr(struct utf8_data *src)
649: {
650: char *dst;
651: size_t n;
652:
653: dst = NULL;
654:
655: n = 0;
656: for(; src->size != 0; src++) {
1.12 nicm 657: dst = xreallocarray(dst, n + src->size, 1);
1.11 nicm 658: memcpy(dst + n, src->data, src->size);
659: n += src->size;
660: }
661:
1.12 nicm 662: dst = xreallocarray(dst, n + 1, 1);
1.11 nicm 663: dst[n] = '\0';
664: return (dst);
665: }
666:
667: /* Get width of UTF-8 string. */
668: u_int
669: utf8_cstrwidth(const char *s)
670: {
671: struct utf8_data tmp;
672: u_int width;
673: int more;
674:
675: width = 0;
676: while (*s != '\0') {
677: if (utf8_open(&tmp, *s)) {
678: more = 1;
679: while (*++s != '\0' && more)
680: more = utf8_append(&tmp, *s);
681: if (!more) {
682: width += tmp.width;
683: continue;
684: }
685: s -= tmp.have;
686: }
687: width++;
688: s++;
689: }
690: return (width);
691: }
692:
693: /* Trim UTF-8 string to width. Caller frees. */
694: char *
695: utf8_trimcstr(const char *s, u_int width)
696: {
697: struct utf8_data *tmp, *next;
698: char *out;
699: u_int at;
700:
701: tmp = utf8_fromcstr(s);
702:
703: at = 0;
704: for (next = tmp; next->size != 0; next++) {
705: if (at + next->width > width) {
706: next->size = 0;
707: break;
708: }
709: at += next->width;
710: }
711:
712: out = utf8_tocstr(tmp);
713: free(tmp);
1.18 ! nicm 714: return (out);
! 715: }
! 716:
! 717: /* Pad UTF-8 string to width. Caller frees. */
! 718: char *
! 719: utf8_padcstr(const char *s, u_int width)
! 720: {
! 721: size_t slen;
! 722: char *out;
! 723: u_int n, i;
! 724:
! 725: n = utf8_cstrwidth(s);
! 726: if (n >= width)
! 727: return (xstrdup(s));
! 728:
! 729: slen = strlen(s);
! 730: out = xmalloc(slen + 1 + (width - n));
! 731: memcpy(out, s, slen);
! 732: for (i = n; i < width; i++)
! 733: out[slen++] = ' ';
! 734: out[slen] = '\0';
1.11 nicm 735: return (out);
1.1 nicm 736: }