Annotation of src/usr.bin/tmux/utf8.c, Revision 1.62
1.62 ! nicm 1: /* $OpenBSD: utf8.c,v 1.61 2023/06/30 21:55:09 nicm Exp $ */
1.1 nicm 2:
3: /*
1.26 nicm 4: * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
1.1 nicm 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15: * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16: * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18:
19: #include <sys/types.h>
20:
1.41 nicm 21: #include <ctype.h>
1.30 nicm 22: #include <errno.h>
1.11 nicm 23: #include <stdlib.h>
1.1 nicm 24: #include <string.h>
1.9 nicm 25: #include <vis.h>
1.28 nicm 26: #include <wchar.h>
1.1 nicm 27:
28: #include "tmux.h"
29:
1.47 nicm 30: struct utf8_item {
1.54 nicm 31: RB_ENTRY(utf8_item) index_entry;
32: u_int index;
1.45 nicm 33:
1.54 nicm 34: RB_ENTRY(utf8_item) data_entry;
1.45 nicm 35: char data[UTF8_SIZE];
36: u_char size;
37: };
38:
39: static int
1.54 nicm 40: utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
1.45 nicm 41: {
1.47 nicm 42: if (ui1->size < ui2->size)
1.45 nicm 43: return (-1);
1.47 nicm 44: if (ui1->size > ui2->size)
1.45 nicm 45: return (1);
1.47 nicm 46: return (memcmp(ui1->data, ui2->data, ui1->size));
1.45 nicm 47: }
1.54 nicm 48: RB_HEAD(utf8_data_tree, utf8_item);
49: RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp);
50: static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree);
1.45 nicm 51:
1.54 nicm 52: static int
53: utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
54: {
55: if (ui1->index < ui2->index)
56: return (-1);
57: if (ui1->index > ui2->index)
58: return (1);
59: return (0);
60: }
61: RB_HEAD(utf8_index_tree, utf8_item);
62: RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp);
63: static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree);
64:
65: static u_int utf8_next_index;
1.45 nicm 66:
1.53 nicm 67: #define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f)
1.58 nicm 68: #define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1)
1.53 nicm 69:
70: #define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24)
71: #define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29)
1.45 nicm 72:
1.54 nicm 73: /* Get a UTF-8 item from data. */
1.47 nicm 74: static struct utf8_item *
1.62 ! nicm 75: utf8_item_by_data(const u_char *data, size_t size)
1.45 nicm 76: {
1.47 nicm 77: struct utf8_item ui;
78:
79: memcpy(ui.data, data, size);
80: ui.size = size;
1.45 nicm 81:
1.54 nicm 82: return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui));
1.47 nicm 83: }
1.45 nicm 84:
1.54 nicm 85: /* Get a UTF-8 item from data. */
86: static struct utf8_item *
87: utf8_item_by_index(u_int index)
1.47 nicm 88: {
1.54 nicm 89: struct utf8_item ui;
90:
91: ui.index = index;
92:
93: return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui));
1.45 nicm 94: }
95:
1.47 nicm 96: /* Add a UTF-8 item. */
1.45 nicm 97: static int
1.61 nicm 98: utf8_put_item(const u_char *data, size_t size, u_int *index)
1.45 nicm 99: {
1.47 nicm 100: struct utf8_item *ui;
1.45 nicm 101:
1.54 nicm 102: ui = utf8_item_by_data(data, size);
1.47 nicm 103: if (ui != NULL) {
1.57 nicm 104: *index = ui->index;
1.54 nicm 105: log_debug("%s: found %.*s = %u", __func__, (int)size, data,
106: *index);
1.45 nicm 107: return (0);
108: }
109:
1.54 nicm 110: if (utf8_next_index == 0xffffff + 1)
1.47 nicm 111: return (-1);
112:
1.54 nicm 113: ui = xcalloc(1, sizeof *ui);
114: ui->index = utf8_next_index++;
115: RB_INSERT(utf8_index_tree, &utf8_index_tree, ui);
116:
1.47 nicm 117: memcpy(ui->data, data, size);
118: ui->size = size;
1.54 nicm 119: RB_INSERT(utf8_data_tree, &utf8_data_tree, ui);
1.45 nicm 120:
1.57 nicm 121: *index = ui->index;
1.54 nicm 122: log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index);
1.45 nicm 123: return (0);
124: }
125:
1.47 nicm 126: /* Get UTF-8 character from data. */
127: enum utf8_state
128: utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
1.45 nicm 129: {
1.54 nicm 130: u_int index;
1.45 nicm 131:
1.52 nicm 132: if (ud->width > 2)
1.55 nicm 133: fatalx("invalid UTF-8 width: %u", ud->width);
1.45 nicm 134:
1.52 nicm 135: if (ud->size > UTF8_SIZE)
1.45 nicm 136: goto fail;
1.53 nicm 137: if (ud->size <= 3) {
1.54 nicm 138: index = (((utf8_char)ud->data[2] << 16)|
1.53 nicm 139: ((utf8_char)ud->data[1] << 8)|
140: ((utf8_char)ud->data[0]));
1.54 nicm 141: } else if (utf8_put_item(ud->data, ud->size, &index) != 0)
1.53 nicm 142: goto fail;
1.54 nicm 143: *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index;
1.53 nicm 144: log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size,
145: (int)ud->size, ud->data, *uc);
1.47 nicm 146: return (UTF8_DONE);
1.45 nicm 147:
148: fail:
1.52 nicm 149: if (ud->width == 0)
1.53 nicm 150: *uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0);
1.52 nicm 151: else if (ud->width == 1)
1.53 nicm 152: *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20;
1.47 nicm 153: else
1.53 nicm 154: *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020;
1.47 nicm 155: return (UTF8_ERROR);
1.45 nicm 156: }
157:
1.47 nicm 158: /* Get UTF-8 data from character. */
1.45 nicm 159: void
1.47 nicm 160: utf8_to_data(utf8_char uc, struct utf8_data *ud)
1.45 nicm 161: {
1.47 nicm 162: struct utf8_item *ui;
1.54 nicm 163: u_int index;
1.45 nicm 164:
165: memset(ud, 0, sizeof *ud);
1.53 nicm 166: ud->size = ud->have = UTF8_GET_SIZE(uc);
167: ud->width = UTF8_GET_WIDTH(uc);
1.45 nicm 168:
169: if (ud->size <= 3) {
1.53 nicm 170: ud->data[2] = (uc >> 16);
171: ud->data[1] = ((uc >> 8) & 0xff);
172: ud->data[0] = (uc & 0xff);
173: } else {
1.54 nicm 174: index = (uc & 0xffffff);
175: if ((ui = utf8_item_by_index(index)) == NULL)
1.53 nicm 176: memset(ud->data, ' ', ud->size);
1.54 nicm 177: else
1.53 nicm 178: memcpy(ud->data, ui->data, ud->size);
1.45 nicm 179: }
180:
1.53 nicm 181: log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size,
182: (int)ud->size, ud->data);
1.45 nicm 183: }
184:
1.47 nicm 185: /* Get UTF-8 character from a single ASCII character. */
1.46 nicm 186: u_int
1.52 nicm 187: utf8_build_one(u_char ch)
1.45 nicm 188: {
1.53 nicm 189: return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch);
1.45 nicm 190: }
1.29 nicm 191:
1.11 nicm 192: /* Set a single character. */
193: void
1.19 nicm 194: utf8_set(struct utf8_data *ud, u_char ch)
1.11 nicm 195: {
1.33 nicm 196: static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
1.20 nicm 197:
1.33 nicm 198: memcpy(ud, &empty, sizeof *ud);
1.19 nicm 199: *ud->data = ch;
1.20 nicm 200: }
201:
202: /* Copy UTF-8 character. */
203: void
204: utf8_copy(struct utf8_data *to, const struct utf8_data *from)
205: {
206: u_int i;
207:
208: memcpy(to, from, sizeof *to);
209:
210: for (i = to->size; i < sizeof to->data; i++)
211: to->data[i] = '\0';
1.11 nicm 212: }
213:
1.47 nicm 214: /* Get width of Unicode character. */
1.48 nicm 215: static enum utf8_state
216: utf8_width(struct utf8_data *ud, int *width)
1.47 nicm 217: {
1.48 nicm 218: wchar_t wc;
1.47 nicm 219:
1.48 nicm 220: switch (mbtowc(&wc, ud->data, ud->size)) {
221: case -1:
222: log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
223: errno);
224: mbtowc(NULL, NULL, MB_CUR_MAX);
225: return (UTF8_ERROR);
226: case 0:
227: return (UTF8_ERROR);
228: }
1.60 nicm 229: log_debug("UTF-8 %.*s is %08X", (int)ud->size, ud->data, (u_int)wc);
1.48 nicm 230: *width = wcwidth(wc);
1.60 nicm 231: log_debug("wcwidth(%08X) returned %d", (u_int)wc, *width);
232: if (*width < 0) {
233: /*
234: * C1 control characters are nonprintable, so they are always
235: * zero width.
236: */
237: *width = (wc >= 0x80 && wc <= 0x9f) ? 0 : 1;
238: }
1.59 nicm 239: if (*width >= 0 && *width <= 0xff)
240: return (UTF8_DONE);
241: return (UTF8_ERROR);
1.47 nicm 242: }
243:
1.4 nicm 244: /*
245: * Open UTF-8 sequence.
246: *
247: * 11000010-11011111 C2-DF start of 2-byte sequence
248: * 11100000-11101111 E0-EF start of 3-byte sequence
249: * 11110000-11110100 F0-F4 start of 4-byte sequence
250: */
1.23 nicm 251: enum utf8_state
1.19 nicm 252: utf8_open(struct utf8_data *ud, u_char ch)
1.4 nicm 253: {
1.19 nicm 254: memset(ud, 0, sizeof *ud);
1.4 nicm 255: if (ch >= 0xc2 && ch <= 0xdf)
1.19 nicm 256: ud->size = 2;
1.4 nicm 257: else if (ch >= 0xe0 && ch <= 0xef)
1.19 nicm 258: ud->size = 3;
1.4 nicm 259: else if (ch >= 0xf0 && ch <= 0xf4)
1.19 nicm 260: ud->size = 4;
1.4 nicm 261: else
1.23 nicm 262: return (UTF8_ERROR);
1.19 nicm 263: utf8_append(ud, ch);
1.23 nicm 264: return (UTF8_MORE);
1.4 nicm 265: }
266:
1.23 nicm 267: /* Append character to UTF-8, closing if finished. */
268: enum utf8_state
1.19 nicm 269: utf8_append(struct utf8_data *ud, u_char ch)
1.4 nicm 270: {
1.29 nicm 271: int width;
272:
1.19 nicm 273: if (ud->have >= ud->size)
1.4 nicm 274: fatalx("UTF-8 character overflow");
1.19 nicm 275: if (ud->size > sizeof ud->data)
1.4 nicm 276: fatalx("UTF-8 character size too large");
277:
1.21 nicm 278: if (ud->have != 0 && (ch & 0xc0) != 0x80)
279: ud->width = 0xff;
280:
1.19 nicm 281: ud->data[ud->have++] = ch;
282: if (ud->have != ud->size)
1.23 nicm 283: return (UTF8_MORE);
1.4 nicm 284:
1.21 nicm 285: if (ud->width == 0xff)
1.23 nicm 286: return (UTF8_ERROR);
1.48 nicm 287: if (utf8_width(ud, &width) != UTF8_DONE)
1.29 nicm 288: return (UTF8_ERROR);
289: ud->width = width;
290:
1.23 nicm 291: return (UTF8_DONE);
1.9 nicm 292: }
293:
294: /*
295: * Encode len characters from src into dst, which is guaranteed to have four
296: * bytes available for each character from src (for \abc or UTF-8) plus space
297: * for \0.
298: */
299: int
300: utf8_strvis(char *dst, const char *src, size_t len, int flag)
301: {
1.19 nicm 302: struct utf8_data ud;
1.47 nicm 303: const char *start = dst, *end = src + len;
1.23 nicm 304: enum utf8_state more;
1.9 nicm 305: size_t i;
306:
307: while (src < end) {
1.23 nicm 308: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
309: while (++src < end && more == UTF8_MORE)
1.19 nicm 310: more = utf8_append(&ud, *src);
1.23 nicm 311: if (more == UTF8_DONE) {
1.9 nicm 312: /* UTF-8 character finished. */
1.19 nicm 313: for (i = 0; i < ud.size; i++)
314: *dst++ = ud.data[i];
1.9 nicm 315: continue;
316: }
1.23 nicm 317: /* Not a complete, valid UTF-8 character. */
318: src -= ud.have;
1.9 nicm 319: }
1.41 nicm 320: if (src[0] == '$' && src < end - 1) {
1.42 nicm 321: if (isalpha((u_char)src[1]) ||
322: src[1] == '_' ||
323: src[1] == '{')
1.41 nicm 324: *dst++ = '\\';
325: *dst++ = '$';
326: } else if (src < end - 1)
1.9 nicm 327: dst = vis(dst, src[0], flag, src[1]);
328: else if (src < end)
329: dst = vis(dst, src[0], flag, '\0');
330: src++;
331: }
332: *dst = '\0';
333: return (dst - start);
1.35 nicm 334: }
335:
/*
 * Same as utf8_strvis but allocate the buffer. src must be NUL-terminated.
 * The encoded string is stored in *dst and its length, not counting the
 * terminator, is returned. Caller frees *dst.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen;
	char	*buf;
	int	 len;

	/* Measure the source once; it sizes the buffer and bounds the walk. */
	srclen = strlen(src);

	/* Four bytes for each source byte plus the terminator. */
	buf = xreallocarray(NULL, 4, srclen + 1);
	len = utf8_strvis(buf, src, srclen, flag);

	/* Trim the buffer down to what was actually used. */
	*dst = xrealloc(buf, len + 1);
	return (len);
}
349:
350: /* Same as utf8_strvis but allocate the buffer. */
351: int
352: utf8_stravisx(char **dst, const char *src, size_t srclen, int flag)
353: {
354: char *buf;
355: int len;
356:
357: buf = xreallocarray(NULL, 4, srclen + 1);
358: len = utf8_strvis(buf, src, srclen, flag);
1.35 nicm 359:
360: *dst = xrealloc(buf, len + 1);
361: return (len);
1.38 nicm 362: }
363:
364: /* Does this string contain anything that isn't valid UTF-8? */
365: int
366: utf8_isvalid(const char *s)
367: {
1.47 nicm 368: struct utf8_data ud;
369: const char *end;
370: enum utf8_state more;
1.38 nicm 371:
372: end = s + strlen(s);
373: while (s < end) {
374: if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
375: while (++s < end && more == UTF8_MORE)
376: more = utf8_append(&ud, *s);
377: if (more == UTF8_DONE)
378: continue;
379: return (0);
380: }
381: if (*s < 0x20 || *s > 0x7e)
382: return (0);
383: s++;
384: }
385: return (1);
1.16 nicm 386: }
387:
388: /*
389: * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
390: * the returned string. Anything not valid printable ASCII or UTF-8 is
391: * stripped.
392: */
393: char *
394: utf8_sanitize(const char *src)
395: {
1.47 nicm 396: char *dst = NULL;
397: size_t n = 0;
398: enum utf8_state more;
399: struct utf8_data ud;
400: u_int i;
1.16 nicm 401:
402: while (*src != '\0') {
403: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 404: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
405: while (*++src != '\0' && more == UTF8_MORE)
1.19 nicm 406: more = utf8_append(&ud, *src);
1.23 nicm 407: if (more == UTF8_DONE) {
1.19 nicm 408: dst = xreallocarray(dst, n + ud.width,
1.16 nicm 409: sizeof *dst);
1.19 nicm 410: for (i = 0; i < ud.width; i++)
1.16 nicm 411: dst[n++] = '_';
412: continue;
413: }
1.19 nicm 414: src -= ud.have;
1.16 nicm 415: }
416: if (*src > 0x1f && *src < 0x7f)
1.21 nicm 417: dst[n++] = *src;
1.23 nicm 418: else
419: dst[n++] = '_';
1.16 nicm 420: src++;
421: }
422: dst = xreallocarray(dst, n + 1, sizeof *dst);
423: dst[n] = '\0';
424: return (dst);
1.34 nicm 425: }
426:
427: /* Get UTF-8 buffer length. */
428: size_t
429: utf8_strlen(const struct utf8_data *s)
430: {
431: size_t i;
432:
433: for (i = 0; s[i].size != 0; i++)
434: /* nothing */;
435: return (i);
436: }
437:
438: /* Get UTF-8 string width. */
439: u_int
440: utf8_strwidth(const struct utf8_data *s, ssize_t n)
441: {
442: ssize_t i;
1.47 nicm 443: u_int width = 0;
1.34 nicm 444:
445: for (i = 0; s[i].size != 0; i++) {
446: if (n != -1 && n == i)
447: break;
448: width += s[i].width;
449: }
450: return (width);
1.11 nicm 451: }
452:
453: /*
454: * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
455: * Caller frees.
456: */
457: struct utf8_data *
458: utf8_fromcstr(const char *src)
459: {
1.47 nicm 460: struct utf8_data *dst = NULL;
461: size_t n = 0;
1.23 nicm 462: enum utf8_state more;
1.11 nicm 463:
464: while (*src != '\0') {
1.12 nicm 465: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 466: if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
467: while (*++src != '\0' && more == UTF8_MORE)
1.11 nicm 468: more = utf8_append(&dst[n], *src);
1.23 nicm 469: if (more == UTF8_DONE) {
1.11 nicm 470: n++;
471: continue;
472: }
473: src -= dst[n].have;
474: }
1.23 nicm 475: utf8_set(&dst[n], *src);
476: n++;
1.11 nicm 477: src++;
478: }
1.12 nicm 479: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.11 nicm 480: dst[n].size = 0;
481: return (dst);
482: }
483:
484: /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
485: char *
486: utf8_tocstr(struct utf8_data *src)
487: {
1.47 nicm 488: char *dst = NULL;
489: size_t n = 0;
1.11 nicm 490:
491: for(; src->size != 0; src++) {
1.12 nicm 492: dst = xreallocarray(dst, n + src->size, 1);
1.11 nicm 493: memcpy(dst + n, src->data, src->size);
494: n += src->size;
495: }
1.12 nicm 496: dst = xreallocarray(dst, n + 1, 1);
1.11 nicm 497: dst[n] = '\0';
498: return (dst);
499: }
500:
501: /* Get width of UTF-8 string. */
502: u_int
503: utf8_cstrwidth(const char *s)
504: {
505: struct utf8_data tmp;
506: u_int width;
1.23 nicm 507: enum utf8_state more;
1.11 nicm 508:
509: width = 0;
510: while (*s != '\0') {
1.23 nicm 511: if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
512: while (*++s != '\0' && more == UTF8_MORE)
1.11 nicm 513: more = utf8_append(&tmp, *s);
1.23 nicm 514: if (more == UTF8_DONE) {
1.11 nicm 515: width += tmp.width;
516: continue;
517: }
518: s -= tmp.have;
519: }
1.23 nicm 520: if (*s > 0x1f && *s != 0x7f)
1.21 nicm 521: width++;
1.11 nicm 522: s++;
523: }
524: return (width);
1.18 nicm 525: }
526:
1.44 nicm 527: /* Pad UTF-8 string to width on the left. Caller frees. */
1.18 nicm 528: char *
529: utf8_padcstr(const char *s, u_int width)
530: {
531: size_t slen;
532: char *out;
1.47 nicm 533: u_int n, i;
1.18 nicm 534:
535: n = utf8_cstrwidth(s);
536: if (n >= width)
537: return (xstrdup(s));
538:
539: slen = strlen(s);
540: out = xmalloc(slen + 1 + (width - n));
541: memcpy(out, s, slen);
542: for (i = n; i < width; i++)
543: out[slen++] = ' ';
544: out[slen] = '\0';
1.44 nicm 545: return (out);
546: }
547:
548: /* Pad UTF-8 string to width on the right. Caller frees. */
549: char *
550: utf8_rpadcstr(const char *s, u_int width)
551: {
552: size_t slen;
553: char *out;
1.47 nicm 554: u_int n, i;
1.44 nicm 555:
556: n = utf8_cstrwidth(s);
557: if (n >= width)
558: return (xstrdup(s));
559:
560: slen = strlen(s);
561: out = xmalloc(slen + 1 + (width - n));
562: for (i = 0; i < width - n; i++)
563: out[i] = ' ';
564: memcpy(out + i, s, slen);
565: out[i + slen] = '\0';
1.11 nicm 566: return (out);
1.43 nicm 567: }
568:
569: int
570: utf8_cstrhas(const char *s, const struct utf8_data *ud)
571: {
572: struct utf8_data *copy, *loop;
573: int found = 0;
574:
575: copy = utf8_fromcstr(s);
576: for (loop = copy; loop->size != 0; loop++) {
577: if (loop->size != ud->size)
578: continue;
579: if (memcmp(loop->data, ud->data, loop->size) == 0) {
580: found = 1;
581: break;
582: }
583: }
584: free(copy);
585:
586: return (found);
1.1 nicm 587: }