Annotation of src/usr.bin/tmux/utf8.c, Revision 1.46
1.46 ! nicm 1: /* $OpenBSD: utf8.c,v 1.45 2020/05/25 09:32:10 nicm Exp $ */
1.1 nicm 2:
3: /*
1.26 nicm 4: * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
1.1 nicm 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15: * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16: * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18:
19: #include <sys/types.h>
20:
1.41 nicm 21: #include <ctype.h>
1.30 nicm 22: #include <errno.h>
1.11 nicm 23: #include <stdlib.h>
1.1 nicm 24: #include <string.h>
1.9 nicm 25: #include <vis.h>
1.28 nicm 26: #include <wchar.h>
1.1 nicm 27:
28: #include "tmux.h"
29:
1.29 nicm 30: static int utf8_width(wchar_t);
1.45 nicm 31:
32: struct utf8_big_item {
33: u_int index;
34: RB_ENTRY(utf8_big_item) entry;
35:
36: char data[UTF8_SIZE];
37: u_char size;
38: };
39: RB_HEAD(utf8_big_tree, utf8_big_item);
40:
41: static int
42: utf8_big_cmp(struct utf8_big_item *bi1, struct utf8_big_item *bi2)
43: {
44: if (bi1->size < bi2->size)
45: return (-1);
46: if (bi1->size > bi2->size)
47: return (1);
48: return (memcmp(bi1->data, bi2->data, bi1->size));
49: }
50: RB_GENERATE_STATIC(utf8_big_tree, utf8_big_item, entry, utf8_big_cmp);
51: static struct utf8_big_tree utf8_big_tree = RB_INITIALIZER(utf8_big_tree);
52:
53: static struct utf8_big_item *utf8_big_list;
54: static u_int utf8_big_list_size;
55: static u_int utf8_big_list_used;
56:
57: union utf8_big_map {
1.46 ! nicm 58: u_int value;
1.45 nicm 59: struct {
60: u_char flags;
61: #define UTF8_BIG_SIZE 0x1f
62: #define UTF8_BIG_WIDTH2 0x20
63:
64: u_char data[3];
65: };
66: } __packed;
67:
68: static const union utf8_big_map utf8_big_space1 = {
69: .flags = 1,
70: .data = " "
71: };
72: static const union utf8_big_map utf8_big_space2 = {
73: .flags = UTF8_BIG_WIDTH2|2,
74: .data = " "
75: };
76:
77: /* Get a big item by index. */
78: static struct utf8_big_item *
79: utf8_get_big_item(const char *data, size_t size)
80: {
81: struct utf8_big_item bi;
82:
83: memcpy(bi.data, data, size);
84: bi.size = size;
85:
86: return (RB_FIND(utf8_big_tree, &utf8_big_tree, &bi));
87: }
88:
89: /* Add a big item. */
90: static int
91: utf8_put_big_item(const char *data, size_t size, u_int *index)
92: {
93: struct utf8_big_item *bi;
94:
95: bi = utf8_get_big_item(data, size);
96: if (bi != NULL) {
97: *index = bi->index;
98: log_debug("%s: have %.*s at %u", __func__, (int)size, data,
99: *index);
100: return (0);
101: }
102:
103: if (utf8_big_list_used == utf8_big_list_size) {
104: if (utf8_big_list_size == 0xffffff)
105: return (-1);
106: if (utf8_big_list_size == 0)
107: utf8_big_list_size = 256;
108: else if (utf8_big_list_size > 0x7fffff)
109: utf8_big_list_size = 0xffffff;
110: else
111: utf8_big_list_size *= 2;
112: utf8_big_list = xreallocarray(utf8_big_list, utf8_big_list_size,
113: sizeof *utf8_big_list);
114: }
115: *index = utf8_big_list_used++;
116:
117: bi = &utf8_big_list[*index];
118: bi->index = *index;
119: memcpy(bi->data, data, size);
120: bi->size = size;
121: RB_INSERT(utf8_big_tree, &utf8_big_tree, bi);
122:
123: log_debug("%s: added %.*s at %u", __func__, (int)size, data, *index);
124: return (0);
125: }
126:
127: /* Get UTF-8 as index into buffer. */
1.46 ! nicm 128: u_int
1.45 nicm 129: utf8_map_big(const struct utf8_data *ud)
130: {
131: union utf8_big_map m = { .value = 0 };
132: u_int o;
133: const char *data = ud->data;
134: size_t size = ud->size;
135:
136: if (ud->width != 1 && ud->width != 2)
137: return (utf8_big_space1.value);
138:
139: if (size > UTF8_BIG_SIZE)
140: goto fail;
141: if (size == 1)
142: return (utf8_set_big(data[0], 1));
143:
144: m.flags = size;
145: if (ud->width == 2)
146: m.flags |= UTF8_BIG_WIDTH2;
147:
148: if (size <= 3) {
149: memcpy(&m.data, data, size);
150: return (m.value);
151: }
152:
153: if (utf8_put_big_item(data, size, &o) != 0)
154: goto fail;
155: m.data[0] = (o & 0xff);
156: m.data[1] = (o >> 8) & 0xff;
157: m.data[2] = (o >> 16);
158: return (m.value);
159:
160: fail:
161: if (ud->width == 1)
162: return (utf8_big_space1.value);
163: return (utf8_big_space2.value);
164: }
165:
166: /* Get UTF-8 from index into buffer. */
167: void
1.46 ! nicm 168: utf8_get_big(u_int v, struct utf8_data *ud)
1.45 nicm 169: {
170: union utf8_big_map m = { .value = v };
171: struct utf8_big_item *bi;
172: u_int o;
173:
174: memset(ud, 0, sizeof *ud);
175: ud->size = ud->have = (m.flags & UTF8_BIG_SIZE);
176: if (m.flags & UTF8_BIG_WIDTH2)
177: ud->width = 2;
178: else
179: ud->width = 1;
180:
181: if (ud->size <= 3) {
182: memcpy(ud->data, m.data, ud->size);
183: return;
184: }
185:
1.46 ! nicm 186: o = ((u_int)m.data[2] << 16)|((u_int)m.data[1] << 8)|m.data[0];
1.45 nicm 187: if (o >= utf8_big_list_used)
188: memset(ud->data, ' ', ud->size);
189: else {
190: bi = &utf8_big_list[o];
191: memcpy(ud->data, bi->data, ud->size);
192: }
193: }
194:
195: /* Get big value for UTF-8 single character. */
1.46 ! nicm 196: u_int
1.45 nicm 197: utf8_set_big(char c, u_int width)
198: {
199: union utf8_big_map m = { .flags = 1, .data[0] = c };
200:
201: if (width == 2)
202: m.flags |= UTF8_BIG_WIDTH2;
203: return (m.value);
204: }
1.29 nicm 205:
1.11 nicm 206: /* Set a single character. */
207: void
1.19 nicm 208: utf8_set(struct utf8_data *ud, u_char ch)
1.11 nicm 209: {
1.33 nicm 210: static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
1.20 nicm 211:
1.33 nicm 212: memcpy(ud, &empty, sizeof *ud);
1.19 nicm 213: *ud->data = ch;
1.20 nicm 214: }
215:
216: /* Copy UTF-8 character. */
217: void
218: utf8_copy(struct utf8_data *to, const struct utf8_data *from)
219: {
220: u_int i;
221:
222: memcpy(to, from, sizeof *to);
223:
224: for (i = to->size; i < sizeof to->data; i++)
225: to->data[i] = '\0';
1.11 nicm 226: }
227:
1.4 nicm 228: /*
229: * Open UTF-8 sequence.
230: *
231: * 11000010-11011111 C2-DF start of 2-byte sequence
232: * 11100000-11101111 E0-EF start of 3-byte sequence
233: * 11110000-11110100 F0-F4 start of 4-byte sequence
234: */
1.23 nicm 235: enum utf8_state
1.19 nicm 236: utf8_open(struct utf8_data *ud, u_char ch)
1.4 nicm 237: {
1.19 nicm 238: memset(ud, 0, sizeof *ud);
1.4 nicm 239: if (ch >= 0xc2 && ch <= 0xdf)
1.19 nicm 240: ud->size = 2;
1.4 nicm 241: else if (ch >= 0xe0 && ch <= 0xef)
1.19 nicm 242: ud->size = 3;
1.4 nicm 243: else if (ch >= 0xf0 && ch <= 0xf4)
1.19 nicm 244: ud->size = 4;
1.4 nicm 245: else
1.23 nicm 246: return (UTF8_ERROR);
1.19 nicm 247: utf8_append(ud, ch);
1.23 nicm 248: return (UTF8_MORE);
1.4 nicm 249: }
250:
1.23 nicm 251: /* Append character to UTF-8, closing if finished. */
252: enum utf8_state
1.19 nicm 253: utf8_append(struct utf8_data *ud, u_char ch)
1.4 nicm 254: {
1.29 nicm 255: wchar_t wc;
256: int width;
257:
1.19 nicm 258: if (ud->have >= ud->size)
1.4 nicm 259: fatalx("UTF-8 character overflow");
1.19 nicm 260: if (ud->size > sizeof ud->data)
1.4 nicm 261: fatalx("UTF-8 character size too large");
262:
1.21 nicm 263: if (ud->have != 0 && (ch & 0xc0) != 0x80)
264: ud->width = 0xff;
265:
1.19 nicm 266: ud->data[ud->have++] = ch;
267: if (ud->have != ud->size)
1.23 nicm 268: return (UTF8_MORE);
1.4 nicm 269:
1.21 nicm 270: if (ud->width == 0xff)
1.23 nicm 271: return (UTF8_ERROR);
1.29 nicm 272:
273: if (utf8_combine(ud, &wc) != UTF8_DONE)
274: return (UTF8_ERROR);
275: if ((width = utf8_width(wc)) < 0)
276: return (UTF8_ERROR);
277: ud->width = width;
278:
1.23 nicm 279: return (UTF8_DONE);
1.1 nicm 280: }
281:
/*
 * Get the width of a Unicode character from wcwidth(). Returns -1, after
 * logging, if wcwidth() gives a negative value or one above 0xff.
 */
static int
utf8_width(wchar_t wc)
{
	int	cells;

	cells = wcwidth(wc);
	if (cells >= 0 && cells <= 0xff)
		return (cells);

	log_debug("Unicode %04lx, wcwidth() %d", (long)wc, cells);
	return (-1);
}
295:
1.28 nicm 296: /* Combine UTF-8 into Unicode. */
1.29 nicm 297: enum utf8_state
298: utf8_combine(const struct utf8_data *ud, wchar_t *wc)
1.1 nicm 299: {
1.29 nicm 300: switch (mbtowc(wc, ud->data, ud->size)) {
301: case -1:
1.30 nicm 302: log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
303: errno);
1.29 nicm 304: mbtowc(NULL, NULL, MB_CUR_MAX);
305: return (UTF8_ERROR);
306: case 0:
307: return (UTF8_ERROR);
308: default:
309: return (UTF8_DONE);
310: }
1.15 nicm 311: }
312:
1.28 nicm 313: /* Split Unicode into UTF-8. */
1.23 nicm 314: enum utf8_state
1.28 nicm 315: utf8_split(wchar_t wc, struct utf8_data *ud)
1.15 nicm 316: {
1.29 nicm 317: char s[MB_LEN_MAX];
318: int slen;
1.28 nicm 319:
320: slen = wctomb(s, wc);
321: if (slen <= 0 || slen > (int)sizeof ud->data)
1.23 nicm 322: return (UTF8_ERROR);
1.28 nicm 323:
324: memcpy(ud->data, s, slen);
325: ud->size = slen;
326:
327: ud->width = utf8_width(wc);
1.23 nicm 328: return (UTF8_DONE);
1.9 nicm 329: }
330:
331: /*
332: * Encode len characters from src into dst, which is guaranteed to have four
333: * bytes available for each character from src (for \abc or UTF-8) plus space
334: * for \0.
335: */
336: int
337: utf8_strvis(char *dst, const char *src, size_t len, int flag)
338: {
1.19 nicm 339: struct utf8_data ud;
1.9 nicm 340: const char *start, *end;
1.23 nicm 341: enum utf8_state more;
1.9 nicm 342: size_t i;
343:
344: start = dst;
345: end = src + len;
346:
347: while (src < end) {
1.23 nicm 348: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
349: while (++src < end && more == UTF8_MORE)
1.19 nicm 350: more = utf8_append(&ud, *src);
1.23 nicm 351: if (more == UTF8_DONE) {
1.9 nicm 352: /* UTF-8 character finished. */
1.19 nicm 353: for (i = 0; i < ud.size; i++)
354: *dst++ = ud.data[i];
1.9 nicm 355: continue;
356: }
1.23 nicm 357: /* Not a complete, valid UTF-8 character. */
358: src -= ud.have;
1.9 nicm 359: }
1.41 nicm 360: if (src[0] == '$' && src < end - 1) {
1.42 nicm 361: if (isalpha((u_char)src[1]) ||
362: src[1] == '_' ||
363: src[1] == '{')
1.41 nicm 364: *dst++ = '\\';
365: *dst++ = '$';
366: } else if (src < end - 1)
1.9 nicm 367: dst = vis(dst, src[0], flag, src[1]);
368: else if (src < end)
369: dst = vis(dst, src[0], flag, '\0');
370: src++;
371: }
372:
373: *dst = '\0';
374: return (dst - start);
1.35 nicm 375: }
376:
/*
 * Same as utf8_strvis but allocate the buffer. The result is stored in *dst,
 * resized to fit, and its length is returned.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen = strlen(src);
	char	*out;
	int	 outlen;

	/* Four bytes for every source byte, and for the terminator. */
	out = xreallocarray(NULL, 4, srclen + 1);
	outlen = utf8_strvis(out, src, srclen, flag);

	*dst = xrealloc(out, outlen + 1);
	return (outlen);
}
390:
391: /* Does this string contain anything that isn't valid UTF-8? */
392: int
393: utf8_isvalid(const char *s)
394: {
395: struct utf8_data ud;
396: const char *end;
397: enum utf8_state more;
398:
399: end = s + strlen(s);
400: while (s < end) {
401: if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
402: while (++s < end && more == UTF8_MORE)
403: more = utf8_append(&ud, *s);
404: if (more == UTF8_DONE)
405: continue;
406: return (0);
407: }
408: if (*s < 0x20 || *s > 0x7e)
409: return (0);
410: s++;
411: }
412: return (1);
1.16 nicm 413: }
414:
415: /*
416: * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
417: * the returned string. Anything not valid printable ASCII or UTF-8 is
418: * stripped.
419: */
420: char *
421: utf8_sanitize(const char *src)
422: {
423: char *dst;
424: size_t n;
1.23 nicm 425: enum utf8_state more;
1.19 nicm 426: struct utf8_data ud;
1.16 nicm 427: u_int i;
428:
429: dst = NULL;
430:
431: n = 0;
432: while (*src != '\0') {
433: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 434: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
435: while (*++src != '\0' && more == UTF8_MORE)
1.19 nicm 436: more = utf8_append(&ud, *src);
1.23 nicm 437: if (more == UTF8_DONE) {
1.19 nicm 438: dst = xreallocarray(dst, n + ud.width,
1.16 nicm 439: sizeof *dst);
1.19 nicm 440: for (i = 0; i < ud.width; i++)
1.16 nicm 441: dst[n++] = '_';
442: continue;
443: }
1.19 nicm 444: src -= ud.have;
1.16 nicm 445: }
446: if (*src > 0x1f && *src < 0x7f)
1.21 nicm 447: dst[n++] = *src;
1.23 nicm 448: else
449: dst[n++] = '_';
1.16 nicm 450: src++;
451: }
452:
453: dst = xreallocarray(dst, n + 1, sizeof *dst);
454: dst[n] = '\0';
455: return (dst);
1.34 nicm 456: }
457:
458: /* Get UTF-8 buffer length. */
459: size_t
460: utf8_strlen(const struct utf8_data *s)
461: {
462: size_t i;
463:
464: for (i = 0; s[i].size != 0; i++)
465: /* nothing */;
466: return (i);
467: }
468:
469: /* Get UTF-8 string width. */
470: u_int
471: utf8_strwidth(const struct utf8_data *s, ssize_t n)
472: {
473: ssize_t i;
474: u_int width;
475:
476: width = 0;
477: for (i = 0; s[i].size != 0; i++) {
478: if (n != -1 && n == i)
479: break;
480: width += s[i].width;
481: }
482: return (width);
1.11 nicm 483: }
484:
485: /*
486: * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
487: * Caller frees.
488: */
489: struct utf8_data *
490: utf8_fromcstr(const char *src)
491: {
492: struct utf8_data *dst;
493: size_t n;
1.23 nicm 494: enum utf8_state more;
1.11 nicm 495:
496: dst = NULL;
497:
498: n = 0;
499: while (*src != '\0') {
1.12 nicm 500: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 501: if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
502: while (*++src != '\0' && more == UTF8_MORE)
1.11 nicm 503: more = utf8_append(&dst[n], *src);
1.23 nicm 504: if (more == UTF8_DONE) {
1.11 nicm 505: n++;
506: continue;
507: }
508: src -= dst[n].have;
509: }
1.23 nicm 510: utf8_set(&dst[n], *src);
511: n++;
1.11 nicm 512: src++;
513: }
514:
1.12 nicm 515: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.11 nicm 516: dst[n].size = 0;
517: return (dst);
518: }
519:
520: /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
521: char *
522: utf8_tocstr(struct utf8_data *src)
523: {
524: char *dst;
525: size_t n;
526:
527: dst = NULL;
528:
529: n = 0;
530: for(; src->size != 0; src++) {
1.12 nicm 531: dst = xreallocarray(dst, n + src->size, 1);
1.11 nicm 532: memcpy(dst + n, src->data, src->size);
533: n += src->size;
534: }
535:
1.12 nicm 536: dst = xreallocarray(dst, n + 1, 1);
1.11 nicm 537: dst[n] = '\0';
538: return (dst);
539: }
540:
541: /* Get width of UTF-8 string. */
542: u_int
543: utf8_cstrwidth(const char *s)
544: {
545: struct utf8_data tmp;
546: u_int width;
1.23 nicm 547: enum utf8_state more;
1.11 nicm 548:
549: width = 0;
550: while (*s != '\0') {
1.23 nicm 551: if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
552: while (*++s != '\0' && more == UTF8_MORE)
1.11 nicm 553: more = utf8_append(&tmp, *s);
1.23 nicm 554: if (more == UTF8_DONE) {
1.11 nicm 555: width += tmp.width;
556: continue;
557: }
558: s -= tmp.have;
559: }
1.23 nicm 560: if (*s > 0x1f && *s != 0x7f)
1.21 nicm 561: width++;
1.11 nicm 562: s++;
563: }
564: return (width);
1.18 nicm 565: }
566:
1.44 nicm 567: /* Pad UTF-8 string to width on the left. Caller frees. */
1.18 nicm 568: char *
569: utf8_padcstr(const char *s, u_int width)
570: {
571: size_t slen;
572: char *out;
573: u_int n, i;
574:
575: n = utf8_cstrwidth(s);
576: if (n >= width)
577: return (xstrdup(s));
578:
579: slen = strlen(s);
580: out = xmalloc(slen + 1 + (width - n));
581: memcpy(out, s, slen);
582: for (i = n; i < width; i++)
583: out[slen++] = ' ';
584: out[slen] = '\0';
1.44 nicm 585: return (out);
586: }
587:
588: /* Pad UTF-8 string to width on the right. Caller frees. */
589: char *
590: utf8_rpadcstr(const char *s, u_int width)
591: {
592: size_t slen;
593: char *out;
594: u_int n, i;
595:
596: n = utf8_cstrwidth(s);
597: if (n >= width)
598: return (xstrdup(s));
599:
600: slen = strlen(s);
601: out = xmalloc(slen + 1 + (width - n));
602: for (i = 0; i < width - n; i++)
603: out[i] = ' ';
604: memcpy(out + i, s, slen);
605: out[i + slen] = '\0';
1.11 nicm 606: return (out);
1.43 nicm 607: }
608:
609: int
610: utf8_cstrhas(const char *s, const struct utf8_data *ud)
611: {
612: struct utf8_data *copy, *loop;
613: int found = 0;
614:
615: copy = utf8_fromcstr(s);
616: for (loop = copy; loop->size != 0; loop++) {
617: if (loop->size != ud->size)
618: continue;
619: if (memcmp(loop->data, ud->data, loop->size) == 0) {
620: found = 1;
621: break;
622: }
623: }
624: free(copy);
625:
626: return (found);
1.1 nicm 627: }