Annotation of src/usr.bin/tmux/utf8.c, Revision 1.53
1.53 ! nicm 1: /* $OpenBSD: utf8.c,v 1.52 2020/06/02 20:10:23 nicm Exp $ */
1.1 nicm 2:
3: /*
1.26 nicm 4: * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
1.1 nicm 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15: * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16: * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18:
19: #include <sys/types.h>
20:
1.41 nicm 21: #include <ctype.h>
1.30 nicm 22: #include <errno.h>
1.11 nicm 23: #include <stdlib.h>
1.1 nicm 24: #include <string.h>
1.9 nicm 25: #include <vis.h>
1.28 nicm 26: #include <wchar.h>
1.1 nicm 27:
28: #include "tmux.h"
29:
1.47 nicm 30: struct utf8_item {
31: u_int offset;
32: RB_ENTRY(utf8_item) entry;
1.45 nicm 33:
34: char data[UTF8_SIZE];
35: u_char size;
36: };
1.47 nicm 37: RB_HEAD(utf8_tree, utf8_item);
1.45 nicm 38:
39: static int
1.47 nicm 40: utf8_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
1.45 nicm 41: {
1.47 nicm 42: if (ui1->size < ui2->size)
1.45 nicm 43: return (-1);
1.47 nicm 44: if (ui1->size > ui2->size)
1.45 nicm 45: return (1);
1.47 nicm 46: return (memcmp(ui1->data, ui2->data, ui1->size));
1.45 nicm 47: }
1.47 nicm 48: RB_GENERATE_STATIC(utf8_tree, utf8_item, entry, utf8_cmp);
49: static struct utf8_tree utf8_tree = RB_INITIALIZER(utf8_tree);
1.45 nicm 50:
1.47 nicm 51: static struct utf8_item *utf8_list;
52: static u_int utf8_list_size;
53: static u_int utf8_list_used;
1.45 nicm 54:
/*
 * Layout of a packed utf8_char:
 *
 *   bits  0-23  up to three data bytes stored inline, or an offset into the
 *               item list for longer sequences
 *   bits 24-28  size of the sequence in bytes
 *   bits 29-31  display width plus one
 *
 * The GET macros must only refer to their own argument so they work no matter
 * what the caller's variable is called.
 */
#define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f)
#define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1)

#define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24)
#define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29)
1.45 nicm 60:
1.47 nicm 61: /* Get a UTF-8 item by offset. */
62: static struct utf8_item *
63: utf8_get_item(const char *data, size_t size)
1.45 nicm 64: {
1.47 nicm 65: struct utf8_item ui;
66:
67: memcpy(ui.data, data, size);
68: ui.size = size;
1.45 nicm 69:
1.47 nicm 70: return (RB_FIND(utf8_tree, &utf8_tree, &ui));
71: }
1.45 nicm 72:
1.47 nicm 73: /* Expand UTF-8 list. */
74: static int
75: utf8_expand_list(void)
76: {
77: if (utf8_list_size == 0xffffff)
78: return (-1);
79: if (utf8_list_size == 0)
80: utf8_list_size = 256;
81: else if (utf8_list_size > 0x7fffff)
82: utf8_list_size = 0xffffff;
83: else
84: utf8_list_size *= 2;
85: utf8_list = xreallocarray(utf8_list, utf8_list_size, sizeof *utf8_list);
86: return (0);
1.45 nicm 87: }
88:
1.47 nicm 89: /* Add a UTF-8 item. */
1.45 nicm 90: static int
1.47 nicm 91: utf8_put_item(const char *data, size_t size, u_int *offset)
1.45 nicm 92: {
1.47 nicm 93: struct utf8_item *ui;
1.45 nicm 94:
1.47 nicm 95: ui = utf8_get_item(data, size);
96: if (ui != NULL) {
97: *offset = ui->offset;
1.45 nicm 98: log_debug("%s: have %.*s at %u", __func__, (int)size, data,
1.47 nicm 99: *offset);
1.45 nicm 100: return (0);
101: }
102:
1.47 nicm 103: if (utf8_list_used == utf8_list_size && utf8_expand_list() != 0)
104: return (-1);
105: *offset = utf8_list_used++;
106:
107: ui = &utf8_list[*offset];
108: ui->offset = *offset;
109: memcpy(ui->data, data, size);
110: ui->size = size;
111: RB_INSERT(utf8_tree, &utf8_tree, ui);
1.45 nicm 112:
1.47 nicm 113: log_debug("%s: added %.*s at %u", __func__, (int)size, data, *offset);
1.45 nicm 114: return (0);
115: }
116:
1.47 nicm 117: /* Get UTF-8 character from data. */
118: enum utf8_state
119: utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
1.45 nicm 120: {
1.53 ! nicm 121: u_int offset;
1.45 nicm 122:
1.52 nicm 123: if (ud->width > 2)
1.49 nicm 124: fatalx("invalid UTF-8 width");
1.45 nicm 125:
1.52 nicm 126: if (ud->size > UTF8_SIZE)
1.45 nicm 127: goto fail;
1.53 ! nicm 128: if (ud->size <= 3) {
! 129: offset = (((utf8_char)ud->data[2] << 16)|
! 130: ((utf8_char)ud->data[1] << 8)|
! 131: ((utf8_char)ud->data[0]));
! 132: } else if (utf8_put_item(ud->data, ud->size, &offset) != 0)
! 133: goto fail;
! 134: *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|offset;
! 135: log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size,
! 136: (int)ud->size, ud->data, *uc);
1.47 nicm 137: return (UTF8_DONE);
1.45 nicm 138:
139: fail:
1.52 nicm 140: if (ud->width == 0)
1.53 ! nicm 141: *uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0);
1.52 nicm 142: else if (ud->width == 1)
1.53 ! nicm 143: *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20;
1.47 nicm 144: else
1.53 ! nicm 145: *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020;
1.47 nicm 146: return (UTF8_ERROR);
1.45 nicm 147: }
148:
1.47 nicm 149: /* Get UTF-8 data from character. */
1.45 nicm 150: void
1.47 nicm 151: utf8_to_data(utf8_char uc, struct utf8_data *ud)
1.45 nicm 152: {
1.47 nicm 153: struct utf8_item *ui;
154: u_int offset;
1.45 nicm 155:
156: memset(ud, 0, sizeof *ud);
1.53 ! nicm 157: ud->size = ud->have = UTF8_GET_SIZE(uc);
! 158: ud->width = UTF8_GET_WIDTH(uc);
1.45 nicm 159:
160: if (ud->size <= 3) {
1.53 ! nicm 161: ud->data[2] = (uc >> 16);
! 162: ud->data[1] = ((uc >> 8) & 0xff);
! 163: ud->data[0] = (uc & 0xff);
! 164: } else {
! 165: offset = (uc & 0xffffff);
! 166: if (offset >= utf8_list_used)
! 167: memset(ud->data, ' ', ud->size);
! 168: else {
! 169: ui = &utf8_list[offset];
! 170: memcpy(ud->data, ui->data, ud->size);
! 171: }
1.45 nicm 172: }
173:
1.53 ! nicm 174: log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size,
! 175: (int)ud->size, ud->data);
1.45 nicm 176: }
177:
1.47 nicm 178: /* Get UTF-8 character from a single ASCII character. */
1.46 nicm 179: u_int
1.52 nicm 180: utf8_build_one(u_char ch)
1.45 nicm 181: {
1.53 ! nicm 182: return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch);
1.45 nicm 183: }
1.29 nicm 184:
1.11 nicm 185: /* Set a single character. */
186: void
1.19 nicm 187: utf8_set(struct utf8_data *ud, u_char ch)
1.11 nicm 188: {
1.33 nicm 189: static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
1.20 nicm 190:
1.33 nicm 191: memcpy(ud, &empty, sizeof *ud);
1.19 nicm 192: *ud->data = ch;
1.20 nicm 193: }
194:
195: /* Copy UTF-8 character. */
196: void
197: utf8_copy(struct utf8_data *to, const struct utf8_data *from)
198: {
199: u_int i;
200:
201: memcpy(to, from, sizeof *to);
202:
203: for (i = to->size; i < sizeof to->data; i++)
204: to->data[i] = '\0';
1.11 nicm 205: }
206:
1.47 nicm 207: /* Get width of Unicode character. */
1.48 nicm 208: static enum utf8_state
209: utf8_width(struct utf8_data *ud, int *width)
1.47 nicm 210: {
1.48 nicm 211: wchar_t wc;
1.47 nicm 212:
1.48 nicm 213: switch (mbtowc(&wc, ud->data, ud->size)) {
214: case -1:
215: log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
216: errno);
217: mbtowc(NULL, NULL, MB_CUR_MAX);
218: return (UTF8_ERROR);
219: case 0:
220: return (UTF8_ERROR);
221: }
222: *width = wcwidth(wc);
223: if (*width < 0 || *width > 0xff) {
224: log_debug("UTF-8 %.*s, wcwidth() %d", (int)ud->size, ud->data,
225: *width);
226: return (UTF8_ERROR);
1.47 nicm 227: }
1.48 nicm 228: return (UTF8_DONE);
1.47 nicm 229: }
230:
1.4 nicm 231: /*
232: * Open UTF-8 sequence.
233: *
234: * 11000010-11011111 C2-DF start of 2-byte sequence
235: * 11100000-11101111 E0-EF start of 3-byte sequence
236: * 11110000-11110100 F0-F4 start of 4-byte sequence
237: */
1.23 nicm 238: enum utf8_state
1.19 nicm 239: utf8_open(struct utf8_data *ud, u_char ch)
1.4 nicm 240: {
1.19 nicm 241: memset(ud, 0, sizeof *ud);
1.4 nicm 242: if (ch >= 0xc2 && ch <= 0xdf)
1.19 nicm 243: ud->size = 2;
1.4 nicm 244: else if (ch >= 0xe0 && ch <= 0xef)
1.19 nicm 245: ud->size = 3;
1.4 nicm 246: else if (ch >= 0xf0 && ch <= 0xf4)
1.19 nicm 247: ud->size = 4;
1.4 nicm 248: else
1.23 nicm 249: return (UTF8_ERROR);
1.19 nicm 250: utf8_append(ud, ch);
1.23 nicm 251: return (UTF8_MORE);
1.4 nicm 252: }
253:
1.23 nicm 254: /* Append character to UTF-8, closing if finished. */
255: enum utf8_state
1.19 nicm 256: utf8_append(struct utf8_data *ud, u_char ch)
1.4 nicm 257: {
1.29 nicm 258: int width;
259:
1.19 nicm 260: if (ud->have >= ud->size)
1.4 nicm 261: fatalx("UTF-8 character overflow");
1.19 nicm 262: if (ud->size > sizeof ud->data)
1.4 nicm 263: fatalx("UTF-8 character size too large");
264:
1.21 nicm 265: if (ud->have != 0 && (ch & 0xc0) != 0x80)
266: ud->width = 0xff;
267:
1.19 nicm 268: ud->data[ud->have++] = ch;
269: if (ud->have != ud->size)
1.23 nicm 270: return (UTF8_MORE);
1.4 nicm 271:
1.21 nicm 272: if (ud->width == 0xff)
1.23 nicm 273: return (UTF8_ERROR);
1.48 nicm 274: if (utf8_width(ud, &width) != UTF8_DONE)
1.29 nicm 275: return (UTF8_ERROR);
276: ud->width = width;
277:
1.23 nicm 278: return (UTF8_DONE);
1.9 nicm 279: }
280:
281: /*
282: * Encode len characters from src into dst, which is guaranteed to have four
283: * bytes available for each character from src (for \abc or UTF-8) plus space
284: * for \0.
285: */
286: int
287: utf8_strvis(char *dst, const char *src, size_t len, int flag)
288: {
1.19 nicm 289: struct utf8_data ud;
1.47 nicm 290: const char *start = dst, *end = src + len;
1.23 nicm 291: enum utf8_state more;
1.9 nicm 292: size_t i;
293:
294: while (src < end) {
1.23 nicm 295: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
296: while (++src < end && more == UTF8_MORE)
1.19 nicm 297: more = utf8_append(&ud, *src);
1.23 nicm 298: if (more == UTF8_DONE) {
1.9 nicm 299: /* UTF-8 character finished. */
1.19 nicm 300: for (i = 0; i < ud.size; i++)
301: *dst++ = ud.data[i];
1.9 nicm 302: continue;
303: }
1.23 nicm 304: /* Not a complete, valid UTF-8 character. */
305: src -= ud.have;
1.9 nicm 306: }
1.41 nicm 307: if (src[0] == '$' && src < end - 1) {
1.42 nicm 308: if (isalpha((u_char)src[1]) ||
309: src[1] == '_' ||
310: src[1] == '{')
1.41 nicm 311: *dst++ = '\\';
312: *dst++ = '$';
313: } else if (src < end - 1)
1.9 nicm 314: dst = vis(dst, src[0], flag, src[1]);
315: else if (src < end)
316: dst = vis(dst, src[0], flag, '\0');
317: src++;
318: }
319: *dst = '\0';
320: return (dst - start);
1.35 nicm 321: }
322:
/*
 * Same as utf8_strvis but allocate the buffer. Four bytes per source byte
 * (including the terminator) are reserved up front and the buffer is trimmed
 * to the encoded length afterwards. The result is stored in *dst and its
 * length is returned.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen = strlen(src);
	char	*encoded;
	int	 len;

	encoded = xreallocarray(NULL, 4, srclen + 1);
	len = utf8_strvis(encoded, src, srclen, flag);

	*dst = xrealloc(encoded, len + 1);
	return (len);
}
336:
337: /* Does this string contain anything that isn't valid UTF-8? */
338: int
339: utf8_isvalid(const char *s)
340: {
1.47 nicm 341: struct utf8_data ud;
342: const char *end;
343: enum utf8_state more;
1.38 nicm 344:
345: end = s + strlen(s);
346: while (s < end) {
347: if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
348: while (++s < end && more == UTF8_MORE)
349: more = utf8_append(&ud, *s);
350: if (more == UTF8_DONE)
351: continue;
352: return (0);
353: }
354: if (*s < 0x20 || *s > 0x7e)
355: return (0);
356: s++;
357: }
358: return (1);
1.16 nicm 359: }
360:
361: /*
362: * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
363: * the returned string. Anything not valid printable ASCII or UTF-8 is
364: * stripped.
365: */
366: char *
367: utf8_sanitize(const char *src)
368: {
1.47 nicm 369: char *dst = NULL;
370: size_t n = 0;
371: enum utf8_state more;
372: struct utf8_data ud;
373: u_int i;
1.16 nicm 374:
375: while (*src != '\0') {
376: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 377: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
378: while (*++src != '\0' && more == UTF8_MORE)
1.19 nicm 379: more = utf8_append(&ud, *src);
1.23 nicm 380: if (more == UTF8_DONE) {
1.19 nicm 381: dst = xreallocarray(dst, n + ud.width,
1.16 nicm 382: sizeof *dst);
1.19 nicm 383: for (i = 0; i < ud.width; i++)
1.16 nicm 384: dst[n++] = '_';
385: continue;
386: }
1.19 nicm 387: src -= ud.have;
1.16 nicm 388: }
389: if (*src > 0x1f && *src < 0x7f)
1.21 nicm 390: dst[n++] = *src;
1.23 nicm 391: else
392: dst[n++] = '_';
1.16 nicm 393: src++;
394: }
395: dst = xreallocarray(dst, n + 1, sizeof *dst);
396: dst[n] = '\0';
397: return (dst);
1.34 nicm 398: }
399:
400: /* Get UTF-8 buffer length. */
401: size_t
402: utf8_strlen(const struct utf8_data *s)
403: {
404: size_t i;
405:
406: for (i = 0; s[i].size != 0; i++)
407: /* nothing */;
408: return (i);
409: }
410:
411: /* Get UTF-8 string width. */
412: u_int
413: utf8_strwidth(const struct utf8_data *s, ssize_t n)
414: {
415: ssize_t i;
1.47 nicm 416: u_int width = 0;
1.34 nicm 417:
418: for (i = 0; s[i].size != 0; i++) {
419: if (n != -1 && n == i)
420: break;
421: width += s[i].width;
422: }
423: return (width);
1.11 nicm 424: }
425:
426: /*
427: * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
428: * Caller frees.
429: */
430: struct utf8_data *
431: utf8_fromcstr(const char *src)
432: {
1.47 nicm 433: struct utf8_data *dst = NULL;
434: size_t n = 0;
1.23 nicm 435: enum utf8_state more;
1.11 nicm 436:
437: while (*src != '\0') {
1.12 nicm 438: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 439: if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
440: while (*++src != '\0' && more == UTF8_MORE)
1.11 nicm 441: more = utf8_append(&dst[n], *src);
1.23 nicm 442: if (more == UTF8_DONE) {
1.11 nicm 443: n++;
444: continue;
445: }
446: src -= dst[n].have;
447: }
1.23 nicm 448: utf8_set(&dst[n], *src);
449: n++;
1.11 nicm 450: src++;
451: }
1.12 nicm 452: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.11 nicm 453: dst[n].size = 0;
454: return (dst);
455: }
456:
457: /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
458: char *
459: utf8_tocstr(struct utf8_data *src)
460: {
1.47 nicm 461: char *dst = NULL;
462: size_t n = 0;
1.11 nicm 463:
464: for(; src->size != 0; src++) {
1.12 nicm 465: dst = xreallocarray(dst, n + src->size, 1);
1.11 nicm 466: memcpy(dst + n, src->data, src->size);
467: n += src->size;
468: }
1.12 nicm 469: dst = xreallocarray(dst, n + 1, 1);
1.11 nicm 470: dst[n] = '\0';
471: return (dst);
472: }
473:
474: /* Get width of UTF-8 string. */
475: u_int
476: utf8_cstrwidth(const char *s)
477: {
478: struct utf8_data tmp;
479: u_int width;
1.23 nicm 480: enum utf8_state more;
1.11 nicm 481:
482: width = 0;
483: while (*s != '\0') {
1.23 nicm 484: if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
485: while (*++s != '\0' && more == UTF8_MORE)
1.11 nicm 486: more = utf8_append(&tmp, *s);
1.23 nicm 487: if (more == UTF8_DONE) {
1.11 nicm 488: width += tmp.width;
489: continue;
490: }
491: s -= tmp.have;
492: }
1.23 nicm 493: if (*s > 0x1f && *s != 0x7f)
1.21 nicm 494: width++;
1.11 nicm 495: s++;
496: }
497: return (width);
1.18 nicm 498: }
499:
1.44 nicm 500: /* Pad UTF-8 string to width on the left. Caller frees. */
1.18 nicm 501: char *
502: utf8_padcstr(const char *s, u_int width)
503: {
504: size_t slen;
505: char *out;
1.47 nicm 506: u_int n, i;
1.18 nicm 507:
508: n = utf8_cstrwidth(s);
509: if (n >= width)
510: return (xstrdup(s));
511:
512: slen = strlen(s);
513: out = xmalloc(slen + 1 + (width - n));
514: memcpy(out, s, slen);
515: for (i = n; i < width; i++)
516: out[slen++] = ' ';
517: out[slen] = '\0';
1.44 nicm 518: return (out);
519: }
520:
521: /* Pad UTF-8 string to width on the right. Caller frees. */
522: char *
523: utf8_rpadcstr(const char *s, u_int width)
524: {
525: size_t slen;
526: char *out;
1.47 nicm 527: u_int n, i;
1.44 nicm 528:
529: n = utf8_cstrwidth(s);
530: if (n >= width)
531: return (xstrdup(s));
532:
533: slen = strlen(s);
534: out = xmalloc(slen + 1 + (width - n));
535: for (i = 0; i < width - n; i++)
536: out[i] = ' ';
537: memcpy(out + i, s, slen);
538: out[i + slen] = '\0';
1.11 nicm 539: return (out);
1.43 nicm 540: }
541:
542: int
543: utf8_cstrhas(const char *s, const struct utf8_data *ud)
544: {
545: struct utf8_data *copy, *loop;
546: int found = 0;
547:
548: copy = utf8_fromcstr(s);
549: for (loop = copy; loop->size != 0; loop++) {
550: if (loop->size != ud->size)
551: continue;
552: if (memcmp(loop->data, ud->data, loop->size) == 0) {
553: found = 1;
554: break;
555: }
556: }
557: free(copy);
558:
559: return (found);
1.1 nicm 560: }