Annotation of src/usr.bin/tmux/utf8.c, Revision 1.58
1.58 ! nicm 1: /* $OpenBSD: utf8.c,v 1.57 2020/09/16 18:37:55 nicm Exp $ */
1.1 nicm 2:
3: /*
1.26 nicm 4: * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
1.1 nicm 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15: * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16: * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18:
19: #include <sys/types.h>
20:
1.41 nicm 21: #include <ctype.h>
1.30 nicm 22: #include <errno.h>
1.11 nicm 23: #include <stdlib.h>
1.1 nicm 24: #include <string.h>
1.9 nicm 25: #include <vis.h>
1.28 nicm 26: #include <wchar.h>
1.1 nicm 27:
28: #include "tmux.h"
29:
1.47 nicm 30: struct utf8_item {
1.54 nicm 31: RB_ENTRY(utf8_item) index_entry;
32: u_int index;
1.45 nicm 33:
1.54 nicm 34: RB_ENTRY(utf8_item) data_entry;
1.45 nicm 35: char data[UTF8_SIZE];
36: u_char size;
37: };
38:
39: static int
1.54 nicm 40: utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
1.45 nicm 41: {
1.47 nicm 42: if (ui1->size < ui2->size)
1.45 nicm 43: return (-1);
1.47 nicm 44: if (ui1->size > ui2->size)
1.45 nicm 45: return (1);
1.47 nicm 46: return (memcmp(ui1->data, ui2->data, ui1->size));
1.45 nicm 47: }
1.54 nicm 48: RB_HEAD(utf8_data_tree, utf8_item);
49: RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp);
50: static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree);
1.45 nicm 51:
1.54 nicm 52: static int
53: utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
54: {
55: if (ui1->index < ui2->index)
56: return (-1);
57: if (ui1->index > ui2->index)
58: return (1);
59: return (0);
60: }
61: RB_HEAD(utf8_index_tree, utf8_item);
62: RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp);
63: static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree);
64:
65: static u_int utf8_next_index;
1.45 nicm 66:
/*
 * Accessors for the fields packed into the top of a utf8_char: the size is
 * kept in the five bits starting at bit 24 and the width, stored as
 * width + 1, in the bits from bit 29 upwards.
 */
#define UTF8_GET_SIZE(c) (((c) >> 24) & 0x1f)
#define UTF8_GET_WIDTH(c) (((c) >> 29) - 1)

#define UTF8_SET_SIZE(n) (((utf8_char)(n)) << 24)
#define UTF8_SET_WIDTH(w) ((((utf8_char)(w)) + 1) << 29)
1.45 nicm 72:
1.54 nicm 73: /* Get a UTF-8 item from data. */
1.47 nicm 74: static struct utf8_item *
1.54 nicm 75: utf8_item_by_data(const char *data, size_t size)
1.45 nicm 76: {
1.47 nicm 77: struct utf8_item ui;
78:
79: memcpy(ui.data, data, size);
80: ui.size = size;
1.45 nicm 81:
1.54 nicm 82: return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui));
1.47 nicm 83: }
1.45 nicm 84:
1.54 nicm 85: /* Get a UTF-8 item from data. */
86: static struct utf8_item *
87: utf8_item_by_index(u_int index)
1.47 nicm 88: {
1.54 nicm 89: struct utf8_item ui;
90:
91: ui.index = index;
92:
93: return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui));
1.45 nicm 94: }
95:
1.47 nicm 96: /* Add a UTF-8 item. */
1.45 nicm 97: static int
1.54 nicm 98: utf8_put_item(const char *data, size_t size, u_int *index)
1.45 nicm 99: {
1.47 nicm 100: struct utf8_item *ui;
1.45 nicm 101:
1.54 nicm 102: ui = utf8_item_by_data(data, size);
1.47 nicm 103: if (ui != NULL) {
1.57 nicm 104: *index = ui->index;
1.54 nicm 105: log_debug("%s: found %.*s = %u", __func__, (int)size, data,
106: *index);
1.45 nicm 107: return (0);
108: }
109:
1.54 nicm 110: if (utf8_next_index == 0xffffff + 1)
1.47 nicm 111: return (-1);
112:
1.54 nicm 113: ui = xcalloc(1, sizeof *ui);
114: ui->index = utf8_next_index++;
115: RB_INSERT(utf8_index_tree, &utf8_index_tree, ui);
116:
1.47 nicm 117: memcpy(ui->data, data, size);
118: ui->size = size;
1.54 nicm 119: RB_INSERT(utf8_data_tree, &utf8_data_tree, ui);
1.45 nicm 120:
1.57 nicm 121: *index = ui->index;
1.54 nicm 122: log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index);
1.45 nicm 123: return (0);
124: }
125:
1.47 nicm 126: /* Get UTF-8 character from data. */
127: enum utf8_state
128: utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
1.45 nicm 129: {
1.54 nicm 130: u_int index;
1.45 nicm 131:
1.52 nicm 132: if (ud->width > 2)
1.55 nicm 133: fatalx("invalid UTF-8 width: %u", ud->width);
1.45 nicm 134:
1.52 nicm 135: if (ud->size > UTF8_SIZE)
1.45 nicm 136: goto fail;
1.53 nicm 137: if (ud->size <= 3) {
1.54 nicm 138: index = (((utf8_char)ud->data[2] << 16)|
1.53 nicm 139: ((utf8_char)ud->data[1] << 8)|
140: ((utf8_char)ud->data[0]));
1.54 nicm 141: } else if (utf8_put_item(ud->data, ud->size, &index) != 0)
1.53 nicm 142: goto fail;
1.54 nicm 143: *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index;
1.53 nicm 144: log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size,
145: (int)ud->size, ud->data, *uc);
1.47 nicm 146: return (UTF8_DONE);
1.45 nicm 147:
148: fail:
1.52 nicm 149: if (ud->width == 0)
1.53 nicm 150: *uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0);
1.52 nicm 151: else if (ud->width == 1)
1.53 nicm 152: *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20;
1.47 nicm 153: else
1.53 nicm 154: *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020;
1.47 nicm 155: return (UTF8_ERROR);
1.45 nicm 156: }
157:
1.47 nicm 158: /* Get UTF-8 data from character. */
1.45 nicm 159: void
1.47 nicm 160: utf8_to_data(utf8_char uc, struct utf8_data *ud)
1.45 nicm 161: {
1.47 nicm 162: struct utf8_item *ui;
1.54 nicm 163: u_int index;
1.45 nicm 164:
165: memset(ud, 0, sizeof *ud);
1.53 nicm 166: ud->size = ud->have = UTF8_GET_SIZE(uc);
167: ud->width = UTF8_GET_WIDTH(uc);
1.45 nicm 168:
169: if (ud->size <= 3) {
1.53 nicm 170: ud->data[2] = (uc >> 16);
171: ud->data[1] = ((uc >> 8) & 0xff);
172: ud->data[0] = (uc & 0xff);
173: } else {
1.54 nicm 174: index = (uc & 0xffffff);
175: if ((ui = utf8_item_by_index(index)) == NULL)
1.53 nicm 176: memset(ud->data, ' ', ud->size);
1.54 nicm 177: else
1.53 nicm 178: memcpy(ud->data, ui->data, ud->size);
1.45 nicm 179: }
180:
1.53 nicm 181: log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size,
182: (int)ud->size, ud->data);
1.45 nicm 183: }
184:
1.47 nicm 185: /* Get UTF-8 character from a single ASCII character. */
1.46 nicm 186: u_int
1.52 nicm 187: utf8_build_one(u_char ch)
1.45 nicm 188: {
1.53 nicm 189: return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch);
1.45 nicm 190: }
1.29 nicm 191:
1.11 nicm 192: /* Set a single character. */
193: void
1.19 nicm 194: utf8_set(struct utf8_data *ud, u_char ch)
1.11 nicm 195: {
1.33 nicm 196: static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
1.20 nicm 197:
1.33 nicm 198: memcpy(ud, &empty, sizeof *ud);
1.19 nicm 199: *ud->data = ch;
1.20 nicm 200: }
201:
202: /* Copy UTF-8 character. */
203: void
204: utf8_copy(struct utf8_data *to, const struct utf8_data *from)
205: {
206: u_int i;
207:
208: memcpy(to, from, sizeof *to);
209:
210: for (i = to->size; i < sizeof to->data; i++)
211: to->data[i] = '\0';
1.11 nicm 212: }
213:
1.47 nicm 214: /* Get width of Unicode character. */
1.48 nicm 215: static enum utf8_state
216: utf8_width(struct utf8_data *ud, int *width)
1.47 nicm 217: {
1.48 nicm 218: wchar_t wc;
1.47 nicm 219:
1.48 nicm 220: switch (mbtowc(&wc, ud->data, ud->size)) {
221: case -1:
222: log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
223: errno);
224: mbtowc(NULL, NULL, MB_CUR_MAX);
225: return (UTF8_ERROR);
226: case 0:
227: return (UTF8_ERROR);
228: }
229: *width = wcwidth(wc);
230: if (*width < 0 || *width > 0xff) {
231: log_debug("UTF-8 %.*s, wcwidth() %d", (int)ud->size, ud->data,
232: *width);
233: return (UTF8_ERROR);
1.47 nicm 234: }
1.48 nicm 235: return (UTF8_DONE);
1.47 nicm 236: }
237:
1.4 nicm 238: /*
239: * Open UTF-8 sequence.
240: *
241: * 11000010-11011111 C2-DF start of 2-byte sequence
242: * 11100000-11101111 E0-EF start of 3-byte sequence
243: * 11110000-11110100 F0-F4 start of 4-byte sequence
244: */
1.23 nicm 245: enum utf8_state
1.19 nicm 246: utf8_open(struct utf8_data *ud, u_char ch)
1.4 nicm 247: {
1.19 nicm 248: memset(ud, 0, sizeof *ud);
1.4 nicm 249: if (ch >= 0xc2 && ch <= 0xdf)
1.19 nicm 250: ud->size = 2;
1.4 nicm 251: else if (ch >= 0xe0 && ch <= 0xef)
1.19 nicm 252: ud->size = 3;
1.4 nicm 253: else if (ch >= 0xf0 && ch <= 0xf4)
1.19 nicm 254: ud->size = 4;
1.4 nicm 255: else
1.23 nicm 256: return (UTF8_ERROR);
1.19 nicm 257: utf8_append(ud, ch);
1.23 nicm 258: return (UTF8_MORE);
1.4 nicm 259: }
260:
1.23 nicm 261: /* Append character to UTF-8, closing if finished. */
262: enum utf8_state
1.19 nicm 263: utf8_append(struct utf8_data *ud, u_char ch)
1.4 nicm 264: {
1.29 nicm 265: int width;
266:
1.19 nicm 267: if (ud->have >= ud->size)
1.4 nicm 268: fatalx("UTF-8 character overflow");
1.19 nicm 269: if (ud->size > sizeof ud->data)
1.4 nicm 270: fatalx("UTF-8 character size too large");
271:
1.21 nicm 272: if (ud->have != 0 && (ch & 0xc0) != 0x80)
273: ud->width = 0xff;
274:
1.19 nicm 275: ud->data[ud->have++] = ch;
276: if (ud->have != ud->size)
1.23 nicm 277: return (UTF8_MORE);
1.4 nicm 278:
1.21 nicm 279: if (ud->width == 0xff)
1.23 nicm 280: return (UTF8_ERROR);
1.48 nicm 281: if (utf8_width(ud, &width) != UTF8_DONE)
1.29 nicm 282: return (UTF8_ERROR);
283: ud->width = width;
284:
1.23 nicm 285: return (UTF8_DONE);
1.9 nicm 286: }
287:
288: /*
289: * Encode len characters from src into dst, which is guaranteed to have four
290: * bytes available for each character from src (for \abc or UTF-8) plus space
291: * for \0.
292: */
293: int
294: utf8_strvis(char *dst, const char *src, size_t len, int flag)
295: {
1.19 nicm 296: struct utf8_data ud;
1.47 nicm 297: const char *start = dst, *end = src + len;
1.23 nicm 298: enum utf8_state more;
1.9 nicm 299: size_t i;
300:
301: while (src < end) {
1.23 nicm 302: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
303: while (++src < end && more == UTF8_MORE)
1.19 nicm 304: more = utf8_append(&ud, *src);
1.23 nicm 305: if (more == UTF8_DONE) {
1.9 nicm 306: /* UTF-8 character finished. */
1.19 nicm 307: for (i = 0; i < ud.size; i++)
308: *dst++ = ud.data[i];
1.9 nicm 309: continue;
310: }
1.23 nicm 311: /* Not a complete, valid UTF-8 character. */
312: src -= ud.have;
1.9 nicm 313: }
1.41 nicm 314: if (src[0] == '$' && src < end - 1) {
1.42 nicm 315: if (isalpha((u_char)src[1]) ||
316: src[1] == '_' ||
317: src[1] == '{')
1.41 nicm 318: *dst++ = '\\';
319: *dst++ = '$';
320: } else if (src < end - 1)
1.9 nicm 321: dst = vis(dst, src[0], flag, src[1]);
322: else if (src < end)
323: dst = vis(dst, src[0], flag, '\0');
324: src++;
325: }
326: *dst = '\0';
327: return (dst - start);
1.35 nicm 328: }
329:
/*
 * Same as utf8_strvis but for a \0-terminated src and with the buffer
 * allocated here. The result is stored in *dst, shrunk to fit, and its
 * length (not counting the \0) is returned. Caller frees *dst.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen;
	char	*buf;
	int	 len;

	/* Measure the input once; both calls below need the length. */
	srclen = strlen(src);

	/* Worst case is four output bytes for each input byte. */
	buf = xreallocarray(NULL, 4, srclen + 1);
	len = utf8_strvis(buf, src, srclen, flag);

	*dst = xrealloc(buf, len + 1);
	return (len);
}
343:
344: /* Same as utf8_strvis but allocate the buffer. */
345: int
346: utf8_stravisx(char **dst, const char *src, size_t srclen, int flag)
347: {
348: char *buf;
349: int len;
350:
351: buf = xreallocarray(NULL, 4, srclen + 1);
352: len = utf8_strvis(buf, src, srclen, flag);
1.35 nicm 353:
354: *dst = xrealloc(buf, len + 1);
355: return (len);
1.38 nicm 356: }
357:
358: /* Does this string contain anything that isn't valid UTF-8? */
359: int
360: utf8_isvalid(const char *s)
361: {
1.47 nicm 362: struct utf8_data ud;
363: const char *end;
364: enum utf8_state more;
1.38 nicm 365:
366: end = s + strlen(s);
367: while (s < end) {
368: if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
369: while (++s < end && more == UTF8_MORE)
370: more = utf8_append(&ud, *s);
371: if (more == UTF8_DONE)
372: continue;
373: return (0);
374: }
375: if (*s < 0x20 || *s > 0x7e)
376: return (0);
377: s++;
378: }
379: return (1);
1.16 nicm 380: }
381:
382: /*
383: * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
384: * the returned string. Anything not valid printable ASCII or UTF-8 is
385: * stripped.
386: */
387: char *
388: utf8_sanitize(const char *src)
389: {
1.47 nicm 390: char *dst = NULL;
391: size_t n = 0;
392: enum utf8_state more;
393: struct utf8_data ud;
394: u_int i;
1.16 nicm 395:
396: while (*src != '\0') {
397: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 398: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
399: while (*++src != '\0' && more == UTF8_MORE)
1.19 nicm 400: more = utf8_append(&ud, *src);
1.23 nicm 401: if (more == UTF8_DONE) {
1.19 nicm 402: dst = xreallocarray(dst, n + ud.width,
1.16 nicm 403: sizeof *dst);
1.19 nicm 404: for (i = 0; i < ud.width; i++)
1.16 nicm 405: dst[n++] = '_';
406: continue;
407: }
1.19 nicm 408: src -= ud.have;
1.16 nicm 409: }
410: if (*src > 0x1f && *src < 0x7f)
1.21 nicm 411: dst[n++] = *src;
1.23 nicm 412: else
413: dst[n++] = '_';
1.16 nicm 414: src++;
415: }
416: dst = xreallocarray(dst, n + 1, sizeof *dst);
417: dst[n] = '\0';
418: return (dst);
1.34 nicm 419: }
420:
421: /* Get UTF-8 buffer length. */
422: size_t
423: utf8_strlen(const struct utf8_data *s)
424: {
425: size_t i;
426:
427: for (i = 0; s[i].size != 0; i++)
428: /* nothing */;
429: return (i);
430: }
431:
432: /* Get UTF-8 string width. */
433: u_int
434: utf8_strwidth(const struct utf8_data *s, ssize_t n)
435: {
436: ssize_t i;
1.47 nicm 437: u_int width = 0;
1.34 nicm 438:
439: for (i = 0; s[i].size != 0; i++) {
440: if (n != -1 && n == i)
441: break;
442: width += s[i].width;
443: }
444: return (width);
1.11 nicm 445: }
446:
447: /*
448: * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
449: * Caller frees.
450: */
451: struct utf8_data *
452: utf8_fromcstr(const char *src)
453: {
1.47 nicm 454: struct utf8_data *dst = NULL;
455: size_t n = 0;
1.23 nicm 456: enum utf8_state more;
1.11 nicm 457:
458: while (*src != '\0') {
1.12 nicm 459: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 460: if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
461: while (*++src != '\0' && more == UTF8_MORE)
1.11 nicm 462: more = utf8_append(&dst[n], *src);
1.23 nicm 463: if (more == UTF8_DONE) {
1.11 nicm 464: n++;
465: continue;
466: }
467: src -= dst[n].have;
468: }
1.23 nicm 469: utf8_set(&dst[n], *src);
470: n++;
1.11 nicm 471: src++;
472: }
1.12 nicm 473: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.11 nicm 474: dst[n].size = 0;
475: return (dst);
476: }
477:
478: /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
479: char *
480: utf8_tocstr(struct utf8_data *src)
481: {
1.47 nicm 482: char *dst = NULL;
483: size_t n = 0;
1.11 nicm 484:
485: for(; src->size != 0; src++) {
1.12 nicm 486: dst = xreallocarray(dst, n + src->size, 1);
1.11 nicm 487: memcpy(dst + n, src->data, src->size);
488: n += src->size;
489: }
1.12 nicm 490: dst = xreallocarray(dst, n + 1, 1);
1.11 nicm 491: dst[n] = '\0';
492: return (dst);
493: }
494:
495: /* Get width of UTF-8 string. */
496: u_int
497: utf8_cstrwidth(const char *s)
498: {
499: struct utf8_data tmp;
500: u_int width;
1.23 nicm 501: enum utf8_state more;
1.11 nicm 502:
503: width = 0;
504: while (*s != '\0') {
1.23 nicm 505: if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
506: while (*++s != '\0' && more == UTF8_MORE)
1.11 nicm 507: more = utf8_append(&tmp, *s);
1.23 nicm 508: if (more == UTF8_DONE) {
1.11 nicm 509: width += tmp.width;
510: continue;
511: }
512: s -= tmp.have;
513: }
1.23 nicm 514: if (*s > 0x1f && *s != 0x7f)
1.21 nicm 515: width++;
1.11 nicm 516: s++;
517: }
518: return (width);
1.18 nicm 519: }
520:
1.44 nicm 521: /* Pad UTF-8 string to width on the left. Caller frees. */
1.18 nicm 522: char *
523: utf8_padcstr(const char *s, u_int width)
524: {
525: size_t slen;
526: char *out;
1.47 nicm 527: u_int n, i;
1.18 nicm 528:
529: n = utf8_cstrwidth(s);
530: if (n >= width)
531: return (xstrdup(s));
532:
533: slen = strlen(s);
534: out = xmalloc(slen + 1 + (width - n));
535: memcpy(out, s, slen);
536: for (i = n; i < width; i++)
537: out[slen++] = ' ';
538: out[slen] = '\0';
1.44 nicm 539: return (out);
540: }
541:
542: /* Pad UTF-8 string to width on the right. Caller frees. */
543: char *
544: utf8_rpadcstr(const char *s, u_int width)
545: {
546: size_t slen;
547: char *out;
1.47 nicm 548: u_int n, i;
1.44 nicm 549:
550: n = utf8_cstrwidth(s);
551: if (n >= width)
552: return (xstrdup(s));
553:
554: slen = strlen(s);
555: out = xmalloc(slen + 1 + (width - n));
556: for (i = 0; i < width - n; i++)
557: out[i] = ' ';
558: memcpy(out + i, s, slen);
559: out[i + slen] = '\0';
1.11 nicm 560: return (out);
1.43 nicm 561: }
562:
563: int
564: utf8_cstrhas(const char *s, const struct utf8_data *ud)
565: {
566: struct utf8_data *copy, *loop;
567: int found = 0;
568:
569: copy = utf8_fromcstr(s);
570: for (loop = copy; loop->size != 0; loop++) {
571: if (loop->size != ud->size)
572: continue;
573: if (memcmp(loop->data, ud->data, loop->size) == 0) {
574: found = 1;
575: break;
576: }
577: }
578: free(copy);
579:
580: return (found);
1.1 nicm 581: }