Annotation of src/usr.bin/tmux/utf8.c, Revision 1.55
1.55 ! nicm 1: /* $OpenBSD: utf8.c,v 1.54 2020/06/09 08:34:33 nicm Exp $ */
1.1 nicm 2:
3: /*
1.26 nicm 4: * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
1.1 nicm 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15: * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16: * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18:
19: #include <sys/types.h>
20:
1.41 nicm 21: #include <ctype.h>
1.30 nicm 22: #include <errno.h>
1.11 nicm 23: #include <stdlib.h>
1.1 nicm 24: #include <string.h>
1.9 nicm 25: #include <vis.h>
1.28 nicm 26: #include <wchar.h>
1.1 nicm 27:
28: #include "tmux.h"
29:
1.47 nicm 30: struct utf8_item {
1.54 nicm 31: RB_ENTRY(utf8_item) index_entry;
32: u_int index;
1.45 nicm 33:
1.54 nicm 34: RB_ENTRY(utf8_item) data_entry;
1.45 nicm 35: char data[UTF8_SIZE];
36: u_char size;
37: };
38:
39: static int
1.54 nicm 40: utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
1.45 nicm 41: {
1.47 nicm 42: if (ui1->size < ui2->size)
1.45 nicm 43: return (-1);
1.47 nicm 44: if (ui1->size > ui2->size)
1.45 nicm 45: return (1);
1.47 nicm 46: return (memcmp(ui1->data, ui2->data, ui1->size));
1.45 nicm 47: }
1.54 nicm 48: RB_HEAD(utf8_data_tree, utf8_item);
49: RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp);
50: static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree);
1.45 nicm 51:
1.54 nicm 52: static int
53: utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
54: {
55: if (ui1->index < ui2->index)
56: return (-1);
57: if (ui1->index > ui2->index)
58: return (1);
59: return (0);
60: }
61: RB_HEAD(utf8_index_tree, utf8_item);
62: RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp);
63: static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree);
64:
65: static u_int utf8_next_index;
1.45 nicm 66:
/*
 * Packing of a utf8_char as used by these macros: the low 24 bits are left
 * for the payload, bits 24-28 hold the size in bytes and the bits from 29
 * upwards hold the width plus one.
 *
 * UTF8_GET_WIDTH takes the packed character as its argument; it previously
 * named its parameter "flags" but expanded to whatever "uc" happened to be in
 * scope at the call site.
 */
#define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f)
#define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1)

#define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24)
#define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29)
1.45 nicm 72:
1.54 nicm 73: /* Get a UTF-8 item from data. */
1.47 nicm 74: static struct utf8_item *
1.54 nicm 75: utf8_item_by_data(const char *data, size_t size)
1.45 nicm 76: {
1.47 nicm 77: struct utf8_item ui;
78:
79: memcpy(ui.data, data, size);
80: ui.size = size;
1.45 nicm 81:
1.54 nicm 82: return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui));
1.47 nicm 83: }
1.45 nicm 84:
1.54 nicm 85: /* Get a UTF-8 item from data. */
86: static struct utf8_item *
87: utf8_item_by_index(u_int index)
1.47 nicm 88: {
1.54 nicm 89: struct utf8_item ui;
90:
91: ui.index = index;
92:
93: return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui));
1.45 nicm 94: }
95:
1.47 nicm 96: /* Add a UTF-8 item. */
1.45 nicm 97: static int
1.54 nicm 98: utf8_put_item(const char *data, size_t size, u_int *index)
1.45 nicm 99: {
1.47 nicm 100: struct utf8_item *ui;
1.45 nicm 101:
1.54 nicm 102: ui = utf8_item_by_data(data, size);
1.47 nicm 103: if (ui != NULL) {
1.54 nicm 104: log_debug("%s: found %.*s = %u", __func__, (int)size, data,
105: *index);
106: *index = ui->index;
1.45 nicm 107: return (0);
108: }
109:
1.54 nicm 110: if (utf8_next_index == 0xffffff + 1)
1.47 nicm 111: return (-1);
112:
1.54 nicm 113: ui = xcalloc(1, sizeof *ui);
114: ui->index = utf8_next_index++;
115: RB_INSERT(utf8_index_tree, &utf8_index_tree, ui);
116:
1.47 nicm 117: memcpy(ui->data, data, size);
118: ui->size = size;
1.54 nicm 119: RB_INSERT(utf8_data_tree, &utf8_data_tree, ui);
1.45 nicm 120:
1.54 nicm 121: log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index);
122: *index = ui->index;
1.45 nicm 123: return (0);
124: }
125:
1.47 nicm 126: /* Get UTF-8 character from data. */
127: enum utf8_state
128: utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
1.45 nicm 129: {
1.54 nicm 130: u_int index;
1.45 nicm 131:
1.52 nicm 132: if (ud->width > 2)
1.55 ! nicm 133: fatalx("invalid UTF-8 width: %u", ud->width);
1.45 nicm 134:
1.52 nicm 135: if (ud->size > UTF8_SIZE)
1.45 nicm 136: goto fail;
1.53 nicm 137: if (ud->size <= 3) {
1.54 nicm 138: index = (((utf8_char)ud->data[2] << 16)|
1.53 nicm 139: ((utf8_char)ud->data[1] << 8)|
140: ((utf8_char)ud->data[0]));
1.54 nicm 141: } else if (utf8_put_item(ud->data, ud->size, &index) != 0)
1.53 nicm 142: goto fail;
1.54 nicm 143: *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index;
1.53 nicm 144: log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size,
145: (int)ud->size, ud->data, *uc);
1.47 nicm 146: return (UTF8_DONE);
1.45 nicm 147:
148: fail:
1.52 nicm 149: if (ud->width == 0)
1.53 nicm 150: *uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0);
1.52 nicm 151: else if (ud->width == 1)
1.53 nicm 152: *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20;
1.47 nicm 153: else
1.53 nicm 154: *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020;
1.47 nicm 155: return (UTF8_ERROR);
1.45 nicm 156: }
157:
1.47 nicm 158: /* Get UTF-8 data from character. */
1.45 nicm 159: void
1.47 nicm 160: utf8_to_data(utf8_char uc, struct utf8_data *ud)
1.45 nicm 161: {
1.47 nicm 162: struct utf8_item *ui;
1.54 nicm 163: u_int index;
1.45 nicm 164:
165: memset(ud, 0, sizeof *ud);
1.53 nicm 166: ud->size = ud->have = UTF8_GET_SIZE(uc);
167: ud->width = UTF8_GET_WIDTH(uc);
1.45 nicm 168:
169: if (ud->size <= 3) {
1.53 nicm 170: ud->data[2] = (uc >> 16);
171: ud->data[1] = ((uc >> 8) & 0xff);
172: ud->data[0] = (uc & 0xff);
173: } else {
1.54 nicm 174: index = (uc & 0xffffff);
175: if ((ui = utf8_item_by_index(index)) == NULL)
1.53 nicm 176: memset(ud->data, ' ', ud->size);
1.54 nicm 177: else
1.53 nicm 178: memcpy(ud->data, ui->data, ud->size);
1.45 nicm 179: }
180:
1.53 nicm 181: log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size,
182: (int)ud->size, ud->data);
1.45 nicm 183: }
184:
1.47 nicm 185: /* Get UTF-8 character from a single ASCII character. */
1.46 nicm 186: u_int
1.52 nicm 187: utf8_build_one(u_char ch)
1.45 nicm 188: {
1.53 nicm 189: return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch);
1.45 nicm 190: }
1.29 nicm 191:
1.11 nicm 192: /* Set a single character. */
193: void
1.19 nicm 194: utf8_set(struct utf8_data *ud, u_char ch)
1.11 nicm 195: {
1.33 nicm 196: static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
1.20 nicm 197:
1.33 nicm 198: memcpy(ud, &empty, sizeof *ud);
1.19 nicm 199: *ud->data = ch;
1.20 nicm 200: }
201:
202: /* Copy UTF-8 character. */
203: void
204: utf8_copy(struct utf8_data *to, const struct utf8_data *from)
205: {
206: u_int i;
207:
208: memcpy(to, from, sizeof *to);
209:
210: for (i = to->size; i < sizeof to->data; i++)
211: to->data[i] = '\0';
1.11 nicm 212: }
213:
1.47 nicm 214: /* Get width of Unicode character. */
1.48 nicm 215: static enum utf8_state
216: utf8_width(struct utf8_data *ud, int *width)
1.47 nicm 217: {
1.48 nicm 218: wchar_t wc;
1.47 nicm 219:
1.48 nicm 220: switch (mbtowc(&wc, ud->data, ud->size)) {
221: case -1:
222: log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
223: errno);
224: mbtowc(NULL, NULL, MB_CUR_MAX);
225: return (UTF8_ERROR);
226: case 0:
227: return (UTF8_ERROR);
228: }
229: *width = wcwidth(wc);
230: if (*width < 0 || *width > 0xff) {
231: log_debug("UTF-8 %.*s, wcwidth() %d", (int)ud->size, ud->data,
232: *width);
233: return (UTF8_ERROR);
1.47 nicm 234: }
1.48 nicm 235: return (UTF8_DONE);
1.47 nicm 236: }
237:
1.4 nicm 238: /*
239: * Open UTF-8 sequence.
240: *
241: * 11000010-11011111 C2-DF start of 2-byte sequence
242: * 11100000-11101111 E0-EF start of 3-byte sequence
243: * 11110000-11110100 F0-F4 start of 4-byte sequence
244: */
1.23 nicm 245: enum utf8_state
1.19 nicm 246: utf8_open(struct utf8_data *ud, u_char ch)
1.4 nicm 247: {
1.19 nicm 248: memset(ud, 0, sizeof *ud);
1.4 nicm 249: if (ch >= 0xc2 && ch <= 0xdf)
1.19 nicm 250: ud->size = 2;
1.4 nicm 251: else if (ch >= 0xe0 && ch <= 0xef)
1.19 nicm 252: ud->size = 3;
1.4 nicm 253: else if (ch >= 0xf0 && ch <= 0xf4)
1.19 nicm 254: ud->size = 4;
1.4 nicm 255: else
1.23 nicm 256: return (UTF8_ERROR);
1.19 nicm 257: utf8_append(ud, ch);
1.23 nicm 258: return (UTF8_MORE);
1.4 nicm 259: }
260:
1.23 nicm 261: /* Append character to UTF-8, closing if finished. */
262: enum utf8_state
1.19 nicm 263: utf8_append(struct utf8_data *ud, u_char ch)
1.4 nicm 264: {
1.29 nicm 265: int width;
266:
1.19 nicm 267: if (ud->have >= ud->size)
1.4 nicm 268: fatalx("UTF-8 character overflow");
1.19 nicm 269: if (ud->size > sizeof ud->data)
1.4 nicm 270: fatalx("UTF-8 character size too large");
271:
1.21 nicm 272: if (ud->have != 0 && (ch & 0xc0) != 0x80)
273: ud->width = 0xff;
274:
1.19 nicm 275: ud->data[ud->have++] = ch;
276: if (ud->have != ud->size)
1.23 nicm 277: return (UTF8_MORE);
1.4 nicm 278:
1.21 nicm 279: if (ud->width == 0xff)
1.23 nicm 280: return (UTF8_ERROR);
1.48 nicm 281: if (utf8_width(ud, &width) != UTF8_DONE)
1.29 nicm 282: return (UTF8_ERROR);
283: ud->width = width;
284:
1.23 nicm 285: return (UTF8_DONE);
1.9 nicm 286: }
287:
288: /*
289: * Encode len characters from src into dst, which is guaranteed to have four
290: * bytes available for each character from src (for \abc or UTF-8) plus space
291: * for \0.
292: */
293: int
294: utf8_strvis(char *dst, const char *src, size_t len, int flag)
295: {
1.19 nicm 296: struct utf8_data ud;
1.47 nicm 297: const char *start = dst, *end = src + len;
1.23 nicm 298: enum utf8_state more;
1.9 nicm 299: size_t i;
300:
301: while (src < end) {
1.23 nicm 302: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
303: while (++src < end && more == UTF8_MORE)
1.19 nicm 304: more = utf8_append(&ud, *src);
1.23 nicm 305: if (more == UTF8_DONE) {
1.9 nicm 306: /* UTF-8 character finished. */
1.19 nicm 307: for (i = 0; i < ud.size; i++)
308: *dst++ = ud.data[i];
1.9 nicm 309: continue;
310: }
1.23 nicm 311: /* Not a complete, valid UTF-8 character. */
312: src -= ud.have;
1.9 nicm 313: }
1.41 nicm 314: if (src[0] == '$' && src < end - 1) {
1.42 nicm 315: if (isalpha((u_char)src[1]) ||
316: src[1] == '_' ||
317: src[1] == '{')
1.41 nicm 318: *dst++ = '\\';
319: *dst++ = '$';
320: } else if (src < end - 1)
1.9 nicm 321: dst = vis(dst, src[0], flag, src[1]);
322: else if (src < end)
323: dst = vis(dst, src[0], flag, '\0');
324: src++;
325: }
326: *dst = '\0';
327: return (dst - start);
1.35 nicm 328: }
329:
/*
 * Same as utf8_strvis but allocate the buffer. A worst case buffer of four
 * bytes per input byte (and for the terminator) is allocated, then trimmed
 * to the encoded length. The result is stored in *dst and its length is
 * returned.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen = strlen(src);
	char	*out;
	int	 outlen;

	out = xreallocarray(NULL, 4, srclen + 1);
	outlen = utf8_strvis(out, src, srclen, flag);

	*dst = xrealloc(out, outlen + 1);
	return (outlen);
}
343:
344: /* Does this string contain anything that isn't valid UTF-8? */
345: int
346: utf8_isvalid(const char *s)
347: {
1.47 nicm 348: struct utf8_data ud;
349: const char *end;
350: enum utf8_state more;
1.38 nicm 351:
352: end = s + strlen(s);
353: while (s < end) {
354: if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
355: while (++s < end && more == UTF8_MORE)
356: more = utf8_append(&ud, *s);
357: if (more == UTF8_DONE)
358: continue;
359: return (0);
360: }
361: if (*s < 0x20 || *s > 0x7e)
362: return (0);
363: s++;
364: }
365: return (1);
1.16 nicm 366: }
367:
368: /*
369: * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
370: * the returned string. Anything not valid printable ASCII or UTF-8 is
371: * stripped.
372: */
373: char *
374: utf8_sanitize(const char *src)
375: {
1.47 nicm 376: char *dst = NULL;
377: size_t n = 0;
378: enum utf8_state more;
379: struct utf8_data ud;
380: u_int i;
1.16 nicm 381:
382: while (*src != '\0') {
383: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 384: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
385: while (*++src != '\0' && more == UTF8_MORE)
1.19 nicm 386: more = utf8_append(&ud, *src);
1.23 nicm 387: if (more == UTF8_DONE) {
1.19 nicm 388: dst = xreallocarray(dst, n + ud.width,
1.16 nicm 389: sizeof *dst);
1.19 nicm 390: for (i = 0; i < ud.width; i++)
1.16 nicm 391: dst[n++] = '_';
392: continue;
393: }
1.19 nicm 394: src -= ud.have;
1.16 nicm 395: }
396: if (*src > 0x1f && *src < 0x7f)
1.21 nicm 397: dst[n++] = *src;
1.23 nicm 398: else
399: dst[n++] = '_';
1.16 nicm 400: src++;
401: }
402: dst = xreallocarray(dst, n + 1, sizeof *dst);
403: dst[n] = '\0';
404: return (dst);
1.34 nicm 405: }
406:
407: /* Get UTF-8 buffer length. */
408: size_t
409: utf8_strlen(const struct utf8_data *s)
410: {
411: size_t i;
412:
413: for (i = 0; s[i].size != 0; i++)
414: /* nothing */;
415: return (i);
416: }
417:
418: /* Get UTF-8 string width. */
419: u_int
420: utf8_strwidth(const struct utf8_data *s, ssize_t n)
421: {
422: ssize_t i;
1.47 nicm 423: u_int width = 0;
1.34 nicm 424:
425: for (i = 0; s[i].size != 0; i++) {
426: if (n != -1 && n == i)
427: break;
428: width += s[i].width;
429: }
430: return (width);
1.11 nicm 431: }
432:
433: /*
434: * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
435: * Caller frees.
436: */
437: struct utf8_data *
438: utf8_fromcstr(const char *src)
439: {
1.47 nicm 440: struct utf8_data *dst = NULL;
441: size_t n = 0;
1.23 nicm 442: enum utf8_state more;
1.11 nicm 443:
444: while (*src != '\0') {
1.12 nicm 445: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 446: if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
447: while (*++src != '\0' && more == UTF8_MORE)
1.11 nicm 448: more = utf8_append(&dst[n], *src);
1.23 nicm 449: if (more == UTF8_DONE) {
1.11 nicm 450: n++;
451: continue;
452: }
453: src -= dst[n].have;
454: }
1.23 nicm 455: utf8_set(&dst[n], *src);
456: n++;
1.11 nicm 457: src++;
458: }
1.12 nicm 459: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.11 nicm 460: dst[n].size = 0;
461: return (dst);
462: }
463:
464: /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
465: char *
466: utf8_tocstr(struct utf8_data *src)
467: {
1.47 nicm 468: char *dst = NULL;
469: size_t n = 0;
1.11 nicm 470:
471: for(; src->size != 0; src++) {
1.12 nicm 472: dst = xreallocarray(dst, n + src->size, 1);
1.11 nicm 473: memcpy(dst + n, src->data, src->size);
474: n += src->size;
475: }
1.12 nicm 476: dst = xreallocarray(dst, n + 1, 1);
1.11 nicm 477: dst[n] = '\0';
478: return (dst);
479: }
480:
481: /* Get width of UTF-8 string. */
482: u_int
483: utf8_cstrwidth(const char *s)
484: {
485: struct utf8_data tmp;
486: u_int width;
1.23 nicm 487: enum utf8_state more;
1.11 nicm 488:
489: width = 0;
490: while (*s != '\0') {
1.23 nicm 491: if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
492: while (*++s != '\0' && more == UTF8_MORE)
1.11 nicm 493: more = utf8_append(&tmp, *s);
1.23 nicm 494: if (more == UTF8_DONE) {
1.11 nicm 495: width += tmp.width;
496: continue;
497: }
498: s -= tmp.have;
499: }
1.23 nicm 500: if (*s > 0x1f && *s != 0x7f)
1.21 nicm 501: width++;
1.11 nicm 502: s++;
503: }
504: return (width);
1.18 nicm 505: }
506:
1.44 nicm 507: /* Pad UTF-8 string to width on the left. Caller frees. */
1.18 nicm 508: char *
509: utf8_padcstr(const char *s, u_int width)
510: {
511: size_t slen;
512: char *out;
1.47 nicm 513: u_int n, i;
1.18 nicm 514:
515: n = utf8_cstrwidth(s);
516: if (n >= width)
517: return (xstrdup(s));
518:
519: slen = strlen(s);
520: out = xmalloc(slen + 1 + (width - n));
521: memcpy(out, s, slen);
522: for (i = n; i < width; i++)
523: out[slen++] = ' ';
524: out[slen] = '\0';
1.44 nicm 525: return (out);
526: }
527:
528: /* Pad UTF-8 string to width on the right. Caller frees. */
529: char *
530: utf8_rpadcstr(const char *s, u_int width)
531: {
532: size_t slen;
533: char *out;
1.47 nicm 534: u_int n, i;
1.44 nicm 535:
536: n = utf8_cstrwidth(s);
537: if (n >= width)
538: return (xstrdup(s));
539:
540: slen = strlen(s);
541: out = xmalloc(slen + 1 + (width - n));
542: for (i = 0; i < width - n; i++)
543: out[i] = ' ';
544: memcpy(out + i, s, slen);
545: out[i + slen] = '\0';
1.11 nicm 546: return (out);
1.43 nicm 547: }
548:
549: int
550: utf8_cstrhas(const char *s, const struct utf8_data *ud)
551: {
552: struct utf8_data *copy, *loop;
553: int found = 0;
554:
555: copy = utf8_fromcstr(s);
556: for (loop = copy; loop->size != 0; loop++) {
557: if (loop->size != ud->size)
558: continue;
559: if (memcmp(loop->data, ud->data, loop->size) == 0) {
560: found = 1;
561: break;
562: }
563: }
564: free(copy);
565:
566: return (found);
1.1 nicm 567: }