Annotation of src/usr.bin/tmux/utf8.c, Revision 1.16
1.16 ! nicm 1: /* $OpenBSD: utf8.c,v 1.15 2015/11/12 11:05:34 nicm Exp $ */
1.1 nicm 2:
3: /*
4: * Copyright (c) 2008 Nicholas Marriott <nicm@users.sourceforge.net>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15: * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16: * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18:
19: #include <sys/types.h>
20:
1.11 nicm 21: #include <stdlib.h>
1.1 nicm 22: #include <string.h>
1.9 nicm 23: #include <vis.h>
1.1 nicm 24:
25: #include "tmux.h"
26:
/*
 * One node of the width lookup tree: an inclusive range of code points
 * [first, last], the width reported for every code point in that range, and
 * the links to the child nodes.
 */
struct utf8_width_entry {
	u_int			 first, last;

	int			 width;

	struct utf8_width_entry	*left, *right;
};
36:
1.14 schwarze 37: /*
 * Code point ranges whose width is not 1, written as
 * { first, last, width, left, right }. Every entry here has width 0 or 2;
 * utf8_width() reports 1 for any code point that no entry covers. The
 * left/right links start out NULL and are filled in by utf8_build().
 *
 * Sorted, then repeatedly split in the middle to balance the tree: the array
 * order is the order in which utf8_build() inserts the entries, so it decides
 * the shape of the tree and has to be kept when ranges are added or changed.
 */
1.1 nicm 38: struct utf8_width_entry utf8_width_table[] = {
1.14 schwarze 39: { 0x00b41, 0x00b44, 0, NULL, NULL },
40: { 0x008e4, 0x00902, 0, NULL, NULL },
41: { 0x006d6, 0x006dd, 0, NULL, NULL },
42: { 0x005c4, 0x005c5, 0, NULL, NULL },
1.1 nicm 43: { 0x00591, 0x005bd, 0, NULL, NULL },
1.14 schwarze 44: { 0x00300, 0x0036f, 0, NULL, NULL },
45: { 0x00483, 0x00489, 0, NULL, NULL },
1.1 nicm 46: { 0x005bf, 0x005bf, 0, NULL, NULL },
1.14 schwarze 47: { 0x005c1, 0x005c2, 0, NULL, NULL },
48: { 0x00610, 0x0061a, 0, NULL, NULL },
49: { 0x00600, 0x00605, 0, NULL, NULL },
50: { 0x005c7, 0x005c7, 0, NULL, NULL },
51: { 0x0064b, 0x0065f, 0, NULL, NULL },
52: { 0x0061c, 0x0061c, 0, NULL, NULL },
53: { 0x00670, 0x00670, 0, NULL, NULL },
54: { 0x007a6, 0x007b0, 0, NULL, NULL },
1.1 nicm 55: { 0x006ea, 0x006ed, 0, NULL, NULL },
1.14 schwarze 56: { 0x006df, 0x006e4, 0, NULL, NULL },
57: { 0x006e7, 0x006e8, 0, NULL, NULL },
58: { 0x00711, 0x00711, 0, NULL, NULL },
59: { 0x0070f, 0x0070f, 0, NULL, NULL },
60: { 0x00730, 0x0074a, 0, NULL, NULL },
61: { 0x0081b, 0x00823, 0, NULL, NULL },
62: { 0x007eb, 0x007f3, 0, NULL, NULL },
63: { 0x00816, 0x00819, 0, NULL, NULL },
64: { 0x00829, 0x0082d, 0, NULL, NULL },
65: { 0x00825, 0x00827, 0, NULL, NULL },
66: { 0x00859, 0x0085b, 0, NULL, NULL },
67: { 0x00a41, 0x00a42, 0, NULL, NULL },
68: { 0x00981, 0x00981, 0, NULL, NULL },
69: { 0x00941, 0x00948, 0, NULL, NULL },
70: { 0x0093a, 0x0093a, 0, NULL, NULL },
71: { 0x0093c, 0x0093c, 0, NULL, NULL },
72: { 0x00951, 0x00957, 0, NULL, NULL },
73: { 0x0094d, 0x0094d, 0, NULL, NULL },
1.1 nicm 74: { 0x00962, 0x00963, 0, NULL, NULL },
1.14 schwarze 75: { 0x009e2, 0x009e3, 0, NULL, NULL },
76: { 0x009c1, 0x009c4, 0, NULL, NULL },
77: { 0x009bc, 0x009bc, 0, NULL, NULL },
78: { 0x009cd, 0x009cd, 0, NULL, NULL },
79: { 0x00a01, 0x00a02, 0, NULL, NULL },
80: { 0x00a3c, 0x00a3c, 0, NULL, NULL },
81: { 0x00ac1, 0x00ac5, 0, NULL, NULL },
82: { 0x00a70, 0x00a71, 0, NULL, NULL },
83: { 0x00a4b, 0x00a4d, 0, NULL, NULL },
84: { 0x00a47, 0x00a48, 0, NULL, NULL },
85: { 0x00a51, 0x00a51, 0, NULL, NULL },
86: { 0x00a81, 0x00a82, 0, NULL, NULL },
87: { 0x00a75, 0x00a75, 0, NULL, NULL },
88: { 0x00abc, 0x00abc, 0, NULL, NULL },
89: { 0x00ae2, 0x00ae3, 0, NULL, NULL },
90: { 0x00ac7, 0x00ac8, 0, NULL, NULL },
91: { 0x00acd, 0x00acd, 0, NULL, NULL },
92: { 0x00b3c, 0x00b3c, 0, NULL, NULL },
1.1 nicm 93: { 0x00b01, 0x00b01, 0, NULL, NULL },
1.14 schwarze 94: { 0x00b3f, 0x00b3f, 0, NULL, NULL },
95: { 0x03190, 0x031ba, 2, NULL, NULL },
96: { 0x017c9, 0x017d3, 0, NULL, NULL },
97: { 0x00ec8, 0x00ecd, 0, NULL, NULL },
98: { 0x00cc6, 0x00cc6, 0, NULL, NULL },
99: { 0x00c3e, 0x00c40, 0, NULL, NULL },
100: { 0x00b82, 0x00b82, 0, NULL, NULL },
101: { 0x00b56, 0x00b56, 0, NULL, NULL },
102: { 0x00b4d, 0x00b4d, 0, NULL, NULL },
103: { 0x00b62, 0x00b63, 0, NULL, NULL },
104: { 0x00bcd, 0x00bcd, 0, NULL, NULL },
105: { 0x00bc0, 0x00bc0, 0, NULL, NULL },
106: { 0x00c00, 0x00c00, 0, NULL, NULL },
107: { 0x00c62, 0x00c63, 0, NULL, NULL },
1.1 nicm 108: { 0x00c4a, 0x00c4d, 0, NULL, NULL },
1.14 schwarze 109: { 0x00c46, 0x00c48, 0, NULL, NULL },
110: { 0x00c55, 0x00c56, 0, NULL, NULL },
111: { 0x00cbc, 0x00cbc, 0, NULL, NULL },
112: { 0x00c81, 0x00c81, 0, NULL, NULL },
1.1 nicm 113: { 0x00cbf, 0x00cbf, 0, NULL, NULL },
114: { 0x00dd2, 0x00dd4, 0, NULL, NULL },
1.14 schwarze 115: { 0x00d41, 0x00d44, 0, NULL, NULL },
116: { 0x00ce2, 0x00ce3, 0, NULL, NULL },
117: { 0x00ccc, 0x00ccd, 0, NULL, NULL },
118: { 0x00d01, 0x00d01, 0, NULL, NULL },
119: { 0x00d62, 0x00d63, 0, NULL, NULL },
120: { 0x00d4d, 0x00d4d, 0, NULL, NULL },
121: { 0x00dca, 0x00dca, 0, NULL, NULL },
122: { 0x00e47, 0x00e4e, 0, NULL, NULL },
123: { 0x00e31, 0x00e31, 0, NULL, NULL },
124: { 0x00dd6, 0x00dd6, 0, NULL, NULL },
125: { 0x00e34, 0x00e3a, 0, NULL, NULL },
126: { 0x00eb4, 0x00eb9, 0, NULL, NULL },
127: { 0x00eb1, 0x00eb1, 0, NULL, NULL },
128: { 0x00ebb, 0x00ebc, 0, NULL, NULL },
129: { 0x0105e, 0x01060, 0, NULL, NULL },
130: { 0x00f8d, 0x00f97, 0, NULL, NULL },
131: { 0x00f39, 0x00f39, 0, NULL, NULL },
132: { 0x00f35, 0x00f35, 0, NULL, NULL },
133: { 0x00f18, 0x00f19, 0, NULL, NULL },
134: { 0x00f37, 0x00f37, 0, NULL, NULL },
135: { 0x00f80, 0x00f84, 0, NULL, NULL },
1.1 nicm 136: { 0x00f71, 0x00f7e, 0, NULL, NULL },
1.14 schwarze 137: { 0x00f86, 0x00f87, 0, NULL, NULL },
138: { 0x01032, 0x01037, 0, NULL, NULL },
139: { 0x00fc6, 0x00fc6, 0, NULL, NULL },
140: { 0x00f99, 0x00fbc, 0, NULL, NULL },
141: { 0x0102d, 0x01030, 0, NULL, NULL },
142: { 0x0103d, 0x0103e, 0, NULL, NULL },
143: { 0x01039, 0x0103a, 0, NULL, NULL },
144: { 0x01058, 0x01059, 0, NULL, NULL },
145: { 0x0135d, 0x0135f, 0, NULL, NULL },
146: { 0x01085, 0x01086, 0, NULL, NULL },
147: { 0x01071, 0x01074, 0, NULL, NULL },
148: { 0x01082, 0x01082, 0, NULL, NULL },
149: { 0x0109d, 0x0109d, 0, NULL, NULL },
150: { 0x0108d, 0x0108d, 0, NULL, NULL },
151: { 0x01100, 0x011ff, 2, NULL, NULL },
152: { 0x01772, 0x01773, 0, NULL, NULL },
153: { 0x01732, 0x01734, 0, NULL, NULL },
154: { 0x01712, 0x01714, 0, NULL, NULL },
1.1 nicm 155: { 0x01752, 0x01753, 0, NULL, NULL },
1.14 schwarze 156: { 0x017b7, 0x017bd, 0, NULL, NULL },
157: { 0x017b4, 0x017b5, 0, NULL, NULL },
158: { 0x017c6, 0x017c6, 0, NULL, NULL },
159: { 0x01c2c, 0x01c33, 0, NULL, NULL },
160: { 0x01a7f, 0x01a7f, 0, NULL, NULL },
161: { 0x01a17, 0x01a18, 0, NULL, NULL },
162: { 0x01920, 0x01922, 0, NULL, NULL },
163: { 0x0180b, 0x0180e, 0, NULL, NULL },
1.1 nicm 164: { 0x017dd, 0x017dd, 0, NULL, NULL },
1.14 schwarze 165: { 0x018a9, 0x018a9, 0, NULL, NULL },
166: { 0x01932, 0x01932, 0, NULL, NULL },
1.1 nicm 167: { 0x01927, 0x01928, 0, NULL, NULL },
168: { 0x01939, 0x0193b, 0, NULL, NULL },
1.14 schwarze 169: { 0x01a60, 0x01a60, 0, NULL, NULL },
170: { 0x01a56, 0x01a56, 0, NULL, NULL },
171: { 0x01a1b, 0x01a1b, 0, NULL, NULL },
172: { 0x01a58, 0x01a5e, 0, NULL, NULL },
173: { 0x01a65, 0x01a6c, 0, NULL, NULL },
174: { 0x01a62, 0x01a62, 0, NULL, NULL },
175: { 0x01a73, 0x01a7c, 0, NULL, NULL },
176: { 0x01b80, 0x01b81, 0, NULL, NULL },
177: { 0x01b36, 0x01b3a, 0, NULL, NULL },
178: { 0x01b00, 0x01b03, 0, NULL, NULL },
179: { 0x01ab0, 0x01abe, 0, NULL, NULL },
180: { 0x01b34, 0x01b34, 0, NULL, NULL },
181: { 0x01b42, 0x01b42, 0, NULL, NULL },
1.1 nicm 182: { 0x01b3c, 0x01b3c, 0, NULL, NULL },
1.14 schwarze 183: { 0x01b6b, 0x01b73, 0, NULL, NULL },
184: { 0x01be6, 0x01be6, 0, NULL, NULL },
185: { 0x01ba8, 0x01ba9, 0, NULL, NULL },
186: { 0x01ba2, 0x01ba5, 0, NULL, NULL },
187: { 0x01bab, 0x01bad, 0, NULL, NULL },
188: { 0x01bed, 0x01bed, 0, NULL, NULL },
189: { 0x01be8, 0x01be9, 0, NULL, NULL },
190: { 0x01bef, 0x01bf1, 0, NULL, NULL },
191: { 0x02329, 0x0232a, 2, NULL, NULL },
192: { 0x01dc0, 0x01df5, 0, NULL, NULL },
193: { 0x01ce2, 0x01ce8, 0, NULL, NULL },
194: { 0x01cd0, 0x01cd2, 0, NULL, NULL },
195: { 0x01c36, 0x01c37, 0, NULL, NULL },
196: { 0x01cd4, 0x01ce0, 0, NULL, NULL },
197: { 0x01cf4, 0x01cf4, 0, NULL, NULL },
198: { 0x01ced, 0x01ced, 0, NULL, NULL },
199: { 0x01cf8, 0x01cf9, 0, NULL, NULL },
200: { 0x02060, 0x02064, 0, NULL, NULL },
201: { 0x0200b, 0x0200f, 0, NULL, NULL },
202: { 0x01dfc, 0x01dff, 0, NULL, NULL },
203: { 0x0202a, 0x0202e, 0, NULL, NULL },
204: { 0x02066, 0x0206f, 0, NULL, NULL },
205: { 0x020d0, 0x020f0, 0, NULL, NULL },
206: { 0x03001, 0x03029, 2, NULL, NULL },
207: { 0x02e80, 0x02e99, 2, NULL, NULL },
208: { 0x02d7f, 0x02d7f, 0, NULL, NULL },
209: { 0x02cef, 0x02cf1, 0, NULL, NULL },
210: { 0x02de0, 0x02dff, 0, NULL, NULL },
211: { 0x02f00, 0x02fd5, 2, NULL, NULL },
212: { 0x02e9b, 0x02ef3, 2, NULL, NULL },
213: { 0x02ff0, 0x02ffb, 2, NULL, NULL },
1.1 nicm 214: { 0x03099, 0x0309a, 0, NULL, NULL },
1.14 schwarze 215: { 0x0302e, 0x0303e, 2, NULL, NULL },
216: { 0x0302a, 0x0302d, 0, NULL, NULL },
217: { 0x03041, 0x03096, 2, NULL, NULL },
218: { 0x03105, 0x0312d, 2, NULL, NULL },
219: { 0x0309b, 0x030ff, 2, NULL, NULL },
220: { 0x03131, 0x0318e, 2, NULL, NULL },
1.1 nicm 221: { 0x10a3f, 0x10a3f, 0, NULL, NULL },
1.14 schwarze 222: { 0x0aa4c, 0x0aa4c, 0, NULL, NULL },
1.1 nicm 223: { 0x0a825, 0x0a826, 0, NULL, NULL },
1.14 schwarze 224: { 0x0a490, 0x0a4c6, 2, NULL, NULL },
225: { 0x03250, 0x032fe, 2, NULL, NULL },
226: { 0x031f0, 0x0321e, 2, NULL, NULL },
227: { 0x031c0, 0x031e3, 2, NULL, NULL },
228: { 0x03220, 0x03247, 2, NULL, NULL },
229: { 0x04e00, 0x09fcc, 2, NULL, NULL },
230: { 0x03300, 0x04db5, 2, NULL, NULL },
231: { 0x0a000, 0x0a48c, 2, NULL, NULL },
232: { 0x0a6f0, 0x0a6f1, 0, NULL, NULL },
233: { 0x0a674, 0x0a67d, 0, NULL, NULL },
234: { 0x0a66f, 0x0a672, 0, NULL, NULL },
235: { 0x0a69f, 0x0a69f, 0, NULL, NULL },
236: { 0x0a806, 0x0a806, 0, NULL, NULL },
237: { 0x0a802, 0x0a802, 0, NULL, NULL },
238: { 0x0a80b, 0x0a80b, 0, NULL, NULL },
239: { 0x0a9b6, 0x0a9b9, 0, NULL, NULL },
240: { 0x0a947, 0x0a951, 0, NULL, NULL },
241: { 0x0a8e0, 0x0a8f1, 0, NULL, NULL },
242: { 0x0a8c4, 0x0a8c4, 0, NULL, NULL },
243: { 0x0a926, 0x0a92d, 0, NULL, NULL },
244: { 0x0a980, 0x0a982, 0, NULL, NULL },
245: { 0x0a960, 0x0a97c, 2, NULL, NULL },
246: { 0x0a9b3, 0x0a9b3, 0, NULL, NULL },
247: { 0x0aa29, 0x0aa2e, 0, NULL, NULL },
248: { 0x0a9bc, 0x0a9bc, 0, NULL, NULL },
249: { 0x0a9e5, 0x0a9e5, 0, NULL, NULL },
250: { 0x0aa35, 0x0aa36, 0, NULL, NULL },
251: { 0x0aa31, 0x0aa32, 0, NULL, NULL },
252: { 0x0aa43, 0x0aa43, 0, NULL, NULL },
253: { 0x0fb1e, 0x0fb1e, 0, NULL, NULL },
254: { 0x0aaf6, 0x0aaf6, 0, NULL, NULL },
255: { 0x0aab7, 0x0aab8, 0, NULL, NULL },
256: { 0x0aab0, 0x0aab0, 0, NULL, NULL },
257: { 0x0aa7c, 0x0aa7c, 0, NULL, NULL },
258: { 0x0aab2, 0x0aab4, 0, NULL, NULL },
259: { 0x0aac1, 0x0aac1, 0, NULL, NULL },
260: { 0x0aabe, 0x0aabf, 0, NULL, NULL },
261: { 0x0aaec, 0x0aaed, 0, NULL, NULL },
262: { 0x0ac00, 0x0d7a3, 2, NULL, NULL },
263: { 0x0abe8, 0x0abe8, 0, NULL, NULL },
264: { 0x0abe5, 0x0abe5, 0, NULL, NULL },
265: { 0x0abed, 0x0abed, 0, NULL, NULL },
266: { 0x0f900, 0x0fa6d, 2, NULL, NULL },
267: { 0x0d800, 0x0f8ff, 0, NULL, NULL },
268: { 0x0fa70, 0x0fad9, 2, NULL, NULL },
269: { 0x0fff9, 0x0fffb, 0, NULL, NULL },
270: { 0x0fe30, 0x0fe52, 2, NULL, NULL },
1.1 nicm 271: { 0x0fe10, 0x0fe19, 2, NULL, NULL },
1.14 schwarze 272: { 0x0fe00, 0x0fe0f, 0, NULL, NULL },
273: { 0x0fe20, 0x0fe2d, 0, NULL, NULL },
274: { 0x0fe68, 0x0fe6b, 2, NULL, NULL },
275: { 0x0fe54, 0x0fe66, 2, NULL, NULL },
276: { 0x0feff, 0x0feff, 0, NULL, NULL },
277: { 0x10a01, 0x10a03, 0, NULL, NULL },
278: { 0x102e0, 0x102e0, 0, NULL, NULL },
279: { 0x101fd, 0x101fd, 0, NULL, NULL },
280: { 0x10376, 0x1037a, 0, NULL, NULL },
281: { 0x10a0c, 0x10a0f, 0, NULL, NULL },
1.1 nicm 282: { 0x10a05, 0x10a06, 0, NULL, NULL },
1.14 schwarze 283: { 0x10a38, 0x10a3a, 0, NULL, NULL },
284: { 0x11633, 0x1163a, 0, NULL, NULL },
285: { 0x11236, 0x11237, 0, NULL, NULL },
286: { 0x11100, 0x11102, 0, NULL, NULL },
287: { 0x1107f, 0x11081, 0, NULL, NULL },
288: { 0x11001, 0x11001, 0, NULL, NULL },
289: { 0x10ae5, 0x10ae6, 0, NULL, NULL },
290: { 0x11038, 0x11046, 0, NULL, NULL },
291: { 0x110b9, 0x110ba, 0, NULL, NULL },
292: { 0x110b3, 0x110b6, 0, NULL, NULL },
293: { 0x110bd, 0x110bd, 0, NULL, NULL },
294: { 0x11180, 0x11181, 0, NULL, NULL },
295: { 0x1112d, 0x11134, 0, NULL, NULL },
296: { 0x11127, 0x1112b, 0, NULL, NULL },
297: { 0x11173, 0x11173, 0, NULL, NULL },
298: { 0x1122f, 0x11231, 0, NULL, NULL },
299: { 0x111b6, 0x111be, 0, NULL, NULL },
300: { 0x11234, 0x11234, 0, NULL, NULL },
301: { 0x11370, 0x11374, 0, NULL, NULL },
302: { 0x11301, 0x11301, 0, NULL, NULL },
303: { 0x112df, 0x112df, 0, NULL, NULL },
304: { 0x112e3, 0x112ea, 0, NULL, NULL },
305: { 0x11340, 0x11340, 0, NULL, NULL },
306: { 0x1133c, 0x1133c, 0, NULL, NULL },
307: { 0x11366, 0x1136c, 0, NULL, NULL },
308: { 0x114c2, 0x114c3, 0, NULL, NULL },
309: { 0x114ba, 0x114ba, 0, NULL, NULL },
310: { 0x114b3, 0x114b8, 0, NULL, NULL },
311: { 0x114bf, 0x114c0, 0, NULL, NULL },
312: { 0x115bc, 0x115bd, 0, NULL, NULL },
313: { 0x115b2, 0x115b5, 0, NULL, NULL },
314: { 0x115bf, 0x115c0, 0, NULL, NULL },
315: { 0x1d1aa, 0x1d1ad, 0, NULL, NULL },
316: { 0x16b30, 0x16b36, 0, NULL, NULL },
317: { 0x116ad, 0x116ad, 0, NULL, NULL },
318: { 0x1163f, 0x11640, 0, NULL, NULL },
319: { 0x1163d, 0x1163d, 0, NULL, NULL },
320: { 0x116ab, 0x116ab, 0, NULL, NULL },
321: { 0x116b7, 0x116b7, 0, NULL, NULL },
322: { 0x116b0, 0x116b5, 0, NULL, NULL },
323: { 0x16af0, 0x16af4, 0, NULL, NULL },
324: { 0x1bca0, 0x1bca3, 0, NULL, NULL },
325: { 0x1b000, 0x1b001, 2, NULL, NULL },
326: { 0x16f8f, 0x16f92, 0, NULL, NULL },
327: { 0x1bc9d, 0x1bc9e, 0, NULL, NULL },
328: { 0x1d173, 0x1d182, 0, NULL, NULL },
329: { 0x1d167, 0x1d169, 0, NULL, NULL },
330: { 0x1d185, 0x1d18b, 0, NULL, NULL },
331: { 0x2a700, 0x2b734, 2, NULL, NULL },
332: { 0x1f210, 0x1f23a, 2, NULL, NULL },
333: { 0x1e8d0, 0x1e8d6, 0, NULL, NULL },
334: { 0x1d242, 0x1d244, 0, NULL, NULL },
335: { 0x1f200, 0x1f202, 2, NULL, NULL },
336: { 0x1f250, 0x1f251, 2, NULL, NULL },
337: { 0x1f240, 0x1f248, 2, NULL, NULL },
338: { 0x20000, 0x2a6d6, 2, NULL, NULL },
339: { 0xe0020, 0xe007f, 0, NULL, NULL },
340: { 0x2f800, 0x2fa1d, 2, NULL, NULL },
341: { 0x2b740, 0x2b81d, 2, NULL, NULL },
342: { 0xe0001, 0xe0001, 0, NULL, NULL },
343: { 0xf0000, 0xffffd, 0, NULL, NULL },
344: { 0xe0100, 0xe01ef, 0, NULL, NULL },
345: { 0x100000, 0x10fffd, 0, NULL, NULL },
1.1 nicm 346: };
347:
348: struct utf8_width_entry *utf8_width_root = NULL;
349:
350: int utf8_overlap(struct utf8_width_entry *, struct utf8_width_entry *);
1.4 nicm 351: u_int utf8_combine(const struct utf8_data *);
352: u_int utf8_width(const struct utf8_data *);
1.1 nicm 353:
1.11 nicm 354: /* Set a single character. */
355: void
356: utf8_set(struct utf8_data *utf8data, u_char ch)
357: {
358: *utf8data->data = ch;
359: utf8data->size = 1;
360:
361: utf8data->width = 1;
362: }
363:
1.4 nicm 364: /*
365: * Open UTF-8 sequence.
366: *
367: * 11000010-11011111 C2-DF start of 2-byte sequence
368: * 11100000-11101111 E0-EF start of 3-byte sequence
369: * 11110000-11110100 F0-F4 start of 4-byte sequence
370: *
371: * Returns 1 if more UTF-8 to come, 0 if not UTF-8.
372: */
373: int
374: utf8_open(struct utf8_data *utf8data, u_char ch)
375: {
376: memset(utf8data, 0, sizeof *utf8data);
377: if (ch >= 0xc2 && ch <= 0xdf)
378: utf8data->size = 2;
379: else if (ch >= 0xe0 && ch <= 0xef)
380: utf8data->size = 3;
381: else if (ch >= 0xf0 && ch <= 0xf4)
382: utf8data->size = 4;
383: else
384: return (0);
385: utf8_append(utf8data, ch);
386: return (1);
387: }
388:
389: /*
390: * Append character to UTF-8, closing if finished.
391: *
1.5 nicm 392: * Returns 1 if more UTF-8 data to come, 0 if finished.
1.4 nicm 393: */
394: int
395: utf8_append(struct utf8_data *utf8data, u_char ch)
396: {
1.15 nicm 397: /* XXX this should do validity checks too! */
398:
1.4 nicm 399: if (utf8data->have >= utf8data->size)
400: fatalx("UTF-8 character overflow");
401: if (utf8data->size > sizeof utf8data->data)
402: fatalx("UTF-8 character size too large");
403:
404: utf8data->data[utf8data->have++] = ch;
405: if (utf8data->have != utf8data->size)
406: return (1);
407:
408: utf8data->width = utf8_width(utf8data);
409: return (0);
410: }
411:
412: /* Check if two width tree entries overlap. */
1.1 nicm 413: int
1.10 nicm 414: utf8_overlap(struct utf8_width_entry *item1, struct utf8_width_entry *item2)
1.1 nicm 415: {
416: if (item1->first >= item2->first && item1->first <= item2->last)
417: return (1);
418: if (item1->last >= item2->first && item1->last <= item2->last)
419: return (1);
420: if (item2->first >= item1->first && item2->first <= item1->last)
421: return (1);
422: if (item2->last >= item1->first && item2->last <= item1->last)
423: return (1);
424: return (0);
425: }
426:
1.4 nicm 427: /* Build UTF-8 width tree. */
1.1 nicm 428: void
429: utf8_build(void)
430: {
431: struct utf8_width_entry **ptr, *item, *node;
432: u_int i, j;
433:
434: for (i = 0; i < nitems(utf8_width_table); i++) {
435: item = &utf8_width_table[i];
436:
437: for (j = 0; j < nitems(utf8_width_table); j++) {
438: if (i != j && utf8_overlap(item, &utf8_width_table[j]))
439: log_fatalx("utf8 overlap: %u %u", i, j);
440: }
441:
442: ptr = &utf8_width_root;
443: while (*ptr != NULL) {
444: node = *ptr;
445: if (item->last < node->first)
1.13 nicm 446: ptr = &node->left;
1.1 nicm 447: else if (item->first > node->last)
1.13 nicm 448: ptr = &node->right;
1.1 nicm 449: }
450: *ptr = item;
451: }
452: }
453:
1.4 nicm 454: /* Combine UTF-8 into 32-bit Unicode. */
1.1 nicm 455: u_int
1.4 nicm 456: utf8_combine(const struct utf8_data *utf8data)
1.1 nicm 457: {
1.4 nicm 458: u_int value;
1.1 nicm 459:
1.4 nicm 460: value = 0xff;
461: switch (utf8data->size) {
462: case 1:
463: value = utf8data->data[0];
464: break;
465: case 2:
466: value = utf8data->data[1] & 0x3f;
467: value |= (utf8data->data[0] & 0x1f) << 6;
468: break;
469: case 3:
470: value = utf8data->data[2] & 0x3f;
471: value |= (utf8data->data[1] & 0x3f) << 6;
1.15 nicm 472: value |= (utf8data->data[0] & 0xf) << 12;
1.4 nicm 473: break;
474: case 4:
475: value = utf8data->data[3] & 0x3f;
476: value |= (utf8data->data[2] & 0x3f) << 6;
477: value |= (utf8data->data[1] & 0x3f) << 12;
1.15 nicm 478: value |= (utf8data->data[0] & 0x7) << 18;
1.4 nicm 479: break;
1.1 nicm 480: }
1.4 nicm 481: return (value);
1.15 nicm 482: }
483:
484: /* Split a UTF-8 character. */
485: int
486: utf8_split(u_int uc, struct utf8_data *utf8data)
487: {
488: if (uc < 0x7f) {
489: utf8data->size = 1;
490: utf8data->data[0] = uc;
491: } else if (uc < 0x7ff) {
492: utf8data->size = 2;
493: utf8data->data[0] = 0xc0 | ((uc >> 6) & 0x1f);
494: utf8data->data[1] = 0x80 | (uc & 0x3f);
495: } else if (uc < 0xffff) {
496: utf8data->size = 3;
497: utf8data->data[0] = 0xe0 | ((uc >> 12) & 0xf);
498: utf8data->data[1] = 0x80 | ((uc >> 6) & 0x3f);
499: utf8data->data[2] = 0x80 | (uc & 0x3f);
500: } else if (uc < 0x1fffff) {
501: utf8data->size = 4;
502: utf8data->data[0] = 0xf0 | ((uc >> 18) & 0x7);
503: utf8data->data[1] = 0x80 | ((uc >> 12) & 0x3f);
504: utf8data->data[2] = 0x80 | ((uc >> 6) & 0x3f);
505: utf8data->data[3] = 0x80 | (uc & 0x3f);
506: } else
507: return (-1);
508: utf8data->width = utf8_width(utf8data);
509: return (0);
1.6 nicm 510: }
511:
/*
 * Split a two-byte UTF-8 character.
 *
 * Values up to 0x7f are written to ptr as a single byte; anything larger is
 * written as a lead byte followed by one continuation byte. Returns the
 * number of bytes written (1 or 2).
 */
u_int
utf8_split2(u_int uc, u_char *ptr)
{
	if (uc <= 0x7f) {
		ptr[0] = uc;
		return (1);
	}

	ptr[0] = 0xc0 | (uc >> 6);
	ptr[1] = 0x80 | (uc & 0x3f);
	return (2);
}
524:
1.4 nicm 525: /* Lookup width of UTF-8 data in tree. */
526: u_int
527: utf8_width(const struct utf8_data *utf8data)
1.1 nicm 528: {
529: struct utf8_width_entry *item;
1.4 nicm 530: u_int value;
1.1 nicm 531:
1.4 nicm 532: value = utf8_combine(utf8data);
1.1 nicm 533:
534: item = utf8_width_root;
535: while (item != NULL) {
1.4 nicm 536: if (value < item->first)
1.1 nicm 537: item = item->left;
1.4 nicm 538: else if (value > item->last)
1.1 nicm 539: item = item->right;
540: else
541: return (item->width);
542: }
543: return (1);
1.9 nicm 544: }
545:
546: /*
547: * Encode len characters from src into dst, which is guaranteed to have four
548: * bytes available for each character from src (for \abc or UTF-8) plus space
549: * for \0.
550: */
551: int
552: utf8_strvis(char *dst, const char *src, size_t len, int flag)
553: {
554: struct utf8_data utf8data;
555: const char *start, *end;
556: int more;
557: size_t i;
558:
559: start = dst;
560: end = src + len;
561:
562: while (src < end) {
563: if (utf8_open(&utf8data, *src)) {
564: more = 1;
565: while (++src < end && more)
566: more = utf8_append(&utf8data, *src);
567: if (!more) {
568: /* UTF-8 character finished. */
569: for (i = 0; i < utf8data.size; i++)
570: *dst++ = utf8data.data[i];
571: continue;
572: } else if (utf8data.have > 0) {
573: /* Not a complete UTF-8 character. */
574: src -= utf8data.have;
575: }
576: }
577: if (src < end - 1)
578: dst = vis(dst, src[0], flag, src[1]);
579: else if (src < end)
580: dst = vis(dst, src[0], flag, '\0');
581: src++;
582: }
583:
584: *dst = '\0';
585: return (dst - start);
1.16 ! nicm 586: }
! 587:
! 588: /*
! 589: * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
! 590: * the returned string. Anything not valid printable ASCII or UTF-8 is
! 591: * stripped.
! 592: */
! 593: char *
! 594: utf8_sanitize(const char *src)
! 595: {
! 596: char *dst;
! 597: size_t n;
! 598: int more;
! 599: struct utf8_data utf8data;
! 600: u_int i;
! 601:
! 602: dst = NULL;
! 603:
! 604: n = 0;
! 605: while (*src != '\0') {
! 606: dst = xreallocarray(dst, n + 1, sizeof *dst);
! 607: if (utf8_open(&utf8data, *src)) {
! 608: more = 1;
! 609: while (*++src != '\0' && more)
! 610: more = utf8_append(&utf8data, *src);
! 611: if (!more) {
! 612: dst = xreallocarray(dst, n + utf8data.width,
! 613: sizeof *dst);
! 614: for (i = 0; i < utf8data.width; i++)
! 615: dst[n++] = '_';
! 616: continue;
! 617: }
! 618: src -= utf8data.have;
! 619: }
! 620: if (*src > 0x1f && *src < 0x7f)
! 621: dst[n] = *src;
! 622: src++;
! 623:
! 624: n++;
! 625: }
! 626:
! 627: dst = xreallocarray(dst, n + 1, sizeof *dst);
! 628: dst[n] = '\0';
! 629: return (dst);
1.11 nicm 630: }
631:
632: /*
633: * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
634: * Caller frees.
635: */
636: struct utf8_data *
637: utf8_fromcstr(const char *src)
638: {
639: struct utf8_data *dst;
640: size_t n;
641: int more;
642:
643: dst = NULL;
644:
645: n = 0;
646: while (*src != '\0') {
1.12 nicm 647: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.11 nicm 648: if (utf8_open(&dst[n], *src)) {
649: more = 1;
650: while (*++src != '\0' && more)
651: more = utf8_append(&dst[n], *src);
652: if (!more) {
653: n++;
654: continue;
655: }
656: src -= dst[n].have;
657: }
658: utf8_set(&dst[n], *src);
659: src++;
660:
661: n++;
662: }
663:
1.12 nicm 664: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.11 nicm 665: dst[n].size = 0;
666: return (dst);
667: }
668:
669: /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
670: char *
671: utf8_tocstr(struct utf8_data *src)
672: {
673: char *dst;
674: size_t n;
675:
676: dst = NULL;
677:
678: n = 0;
679: for(; src->size != 0; src++) {
1.12 nicm 680: dst = xreallocarray(dst, n + src->size, 1);
1.11 nicm 681: memcpy(dst + n, src->data, src->size);
682: n += src->size;
683: }
684:
1.12 nicm 685: dst = xreallocarray(dst, n + 1, 1);
1.11 nicm 686: dst[n] = '\0';
687: return (dst);
688: }
689:
690: /* Get width of UTF-8 string. */
691: u_int
692: utf8_cstrwidth(const char *s)
693: {
694: struct utf8_data tmp;
695: u_int width;
696: int more;
697:
698: width = 0;
699: while (*s != '\0') {
700: if (utf8_open(&tmp, *s)) {
701: more = 1;
702: while (*++s != '\0' && more)
703: more = utf8_append(&tmp, *s);
704: if (!more) {
705: width += tmp.width;
706: continue;
707: }
708: s -= tmp.have;
709: }
710: width++;
711: s++;
712: }
713: return (width);
714: }
715:
716: /* Trim UTF-8 string to width. Caller frees. */
717: char *
718: utf8_trimcstr(const char *s, u_int width)
719: {
720: struct utf8_data *tmp, *next;
721: char *out;
722: u_int at;
723:
724: tmp = utf8_fromcstr(s);
725:
726: at = 0;
727: for (next = tmp; next->size != 0; next++) {
728: if (at + next->width > width) {
729: next->size = 0;
730: break;
731: }
732: at += next->width;
733: }
734:
735: out = utf8_tocstr(tmp);
736: free(tmp);
737: return (out);
1.1 nicm 738: }