Annotation of src/usr.bin/tmux/utf8.c, Revision 1.47
1.47 ! nicm 1: /* $OpenBSD: utf8.c,v 1.46 2020/05/25 15:02:25 nicm Exp $ */
1.1 nicm 2:
3: /*
1.26 nicm 4: * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
1.1 nicm 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15: * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16: * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18:
19: #include <sys/types.h>
20:
1.41 nicm 21: #include <ctype.h>
1.30 nicm 22: #include <errno.h>
1.11 nicm 23: #include <stdlib.h>
1.1 nicm 24: #include <string.h>
1.9 nicm 25: #include <vis.h>
1.28 nicm 26: #include <wchar.h>
1.1 nicm 27:
28: #include "tmux.h"
29:
1.47 ! nicm 30: struct utf8_item {
! 31: u_int offset;
! 32: RB_ENTRY(utf8_item) entry;
1.45 nicm 33:
34: char data[UTF8_SIZE];
35: u_char size;
36: };
1.47 ! nicm 37: RB_HEAD(utf8_tree, utf8_item);
1.45 nicm 38:
39: static int
1.47 ! nicm 40: utf8_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
1.45 nicm 41: {
1.47 ! nicm 42: if (ui1->size < ui2->size)
1.45 nicm 43: return (-1);
1.47 ! nicm 44: if (ui1->size > ui2->size)
1.45 nicm 45: return (1);
1.47 ! nicm 46: return (memcmp(ui1->data, ui2->data, ui1->size));
1.45 nicm 47: }
1.47 ! nicm 48: RB_GENERATE_STATIC(utf8_tree, utf8_item, entry, utf8_cmp);
! 49: static struct utf8_tree utf8_tree = RB_INITIALIZER(utf8_tree);
1.45 nicm 50:
1.47 ! nicm 51: static struct utf8_item *utf8_list;
! 52: static u_int utf8_list_size;
! 53: static u_int utf8_list_used;
1.45 nicm 54:
1.47 ! nicm 55: union utf8_map {
! 56: utf8_char uc;
1.45 nicm 57: struct {
58: u_char flags;
1.47 ! nicm 59: #define UTF8_FLAG_SIZE 0x1f
! 60: #define UTF8_FLAG_WIDTH2 0x20
1.45 nicm 61:
62: u_char data[3];
63: };
64: } __packed;
65:
1.47 ! nicm 66: static const union utf8_map utf8_space1 = {
1.45 nicm 67: .flags = 1,
68: .data = " "
69: };
1.47 ! nicm 70: static const union utf8_map utf8_space2 = {
! 71: .flags = UTF8_FLAG_WIDTH2|2,
1.45 nicm 72: .data = " "
73: };
74:
1.47 ! nicm 75: /* Get a UTF-8 item by offset. */
! 76: static struct utf8_item *
! 77: utf8_get_item(const char *data, size_t size)
1.45 nicm 78: {
1.47 ! nicm 79: struct utf8_item ui;
! 80:
! 81: memcpy(ui.data, data, size);
! 82: ui.size = size;
1.45 nicm 83:
1.47 ! nicm 84: return (RB_FIND(utf8_tree, &utf8_tree, &ui));
! 85: }
1.45 nicm 86:
1.47 ! nicm 87: /* Expand UTF-8 list. */
! 88: static int
! 89: utf8_expand_list(void)
! 90: {
! 91: if (utf8_list_size == 0xffffff)
! 92: return (-1);
! 93: if (utf8_list_size == 0)
! 94: utf8_list_size = 256;
! 95: else if (utf8_list_size > 0x7fffff)
! 96: utf8_list_size = 0xffffff;
! 97: else
! 98: utf8_list_size *= 2;
! 99: utf8_list = xreallocarray(utf8_list, utf8_list_size, sizeof *utf8_list);
! 100: return (0);
1.45 nicm 101: }
102:
1.47 ! nicm 103: /* Add a UTF-8 item. */
1.45 nicm 104: static int
1.47 ! nicm 105: utf8_put_item(const char *data, size_t size, u_int *offset)
1.45 nicm 106: {
1.47 ! nicm 107: struct utf8_item *ui;
1.45 nicm 108:
1.47 ! nicm 109: ui = utf8_get_item(data, size);
! 110: if (ui != NULL) {
! 111: *offset = ui->offset;
1.45 nicm 112: log_debug("%s: have %.*s at %u", __func__, (int)size, data,
1.47 ! nicm 113: *offset);
1.45 nicm 114: return (0);
115: }
116:
1.47 ! nicm 117: if (utf8_list_used == utf8_list_size && utf8_expand_list() != 0)
! 118: return (-1);
! 119: *offset = utf8_list_used++;
! 120:
! 121: ui = &utf8_list[*offset];
! 122: ui->offset = *offset;
! 123: memcpy(ui->data, data, size);
! 124: ui->size = size;
! 125: RB_INSERT(utf8_tree, &utf8_tree, ui);
1.45 nicm 126:
1.47 ! nicm 127: log_debug("%s: added %.*s at %u", __func__, (int)size, data, *offset);
1.45 nicm 128: return (0);
129: }
130:
1.47 ! nicm 131: /* Get UTF-8 character from data. */
! 132: enum utf8_state
! 133: utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
1.45 nicm 134: {
1.47 ! nicm 135: union utf8_map m = { .uc = 0 };
! 136: u_int offset;
1.45 nicm 137:
138: if (ud->width != 1 && ud->width != 2)
1.47 ! nicm 139: return (utf8_space1.uc);
1.45 nicm 140:
1.47 ! nicm 141: if (ud->size > UTF8_FLAG_SIZE)
1.45 nicm 142: goto fail;
1.47 ! nicm 143: if (ud->size == 1)
! 144: return (utf8_build_one(ud->data[0], 1));
1.45 nicm 145:
1.47 ! nicm 146: m.flags = ud->size;
1.45 nicm 147: if (ud->width == 2)
1.47 ! nicm 148: m.flags |= UTF8_FLAG_WIDTH2;
1.45 nicm 149:
1.47 ! nicm 150: if (ud->size <= 3)
! 151: memcpy(m.data, ud->data, ud->size);
! 152: else {
! 153: if (utf8_put_item(ud->data, ud->size, &offset) != 0)
! 154: goto fail;
! 155: m.data[0] = (offset & 0xff);
! 156: m.data[1] = (offset >> 8) & 0xff;
! 157: m.data[2] = (offset >> 16);
1.45 nicm 158: }
1.47 ! nicm 159: *uc = m.uc;
! 160: return (UTF8_DONE);
1.45 nicm 161:
162: fail:
163: if (ud->width == 1)
1.47 ! nicm 164: *uc = utf8_space1.uc;
! 165: else
! 166: *uc = utf8_space2.uc;
! 167: return (UTF8_ERROR);
1.45 nicm 168: }
169:
1.47 ! nicm 170: /* Get UTF-8 data from character. */
1.45 nicm 171: void
1.47 ! nicm 172: utf8_to_data(utf8_char uc, struct utf8_data *ud)
1.45 nicm 173: {
1.47 ! nicm 174: union utf8_map m = { .uc = uc };
! 175: struct utf8_item *ui;
! 176: u_int offset;
1.45 nicm 177:
178: memset(ud, 0, sizeof *ud);
1.47 ! nicm 179: ud->size = ud->have = (m.flags & UTF8_FLAG_SIZE);
! 180: if (m.flags & UTF8_FLAG_WIDTH2)
1.45 nicm 181: ud->width = 2;
182: else
183: ud->width = 1;
184:
185: if (ud->size <= 3) {
186: memcpy(ud->data, m.data, ud->size);
187: return;
188: }
189:
1.47 ! nicm 190: offset = ((u_int)m.data[2] << 16)|((u_int)m.data[1] << 8)|m.data[0];
! 191: if (offset >= utf8_list_used)
1.45 nicm 192: memset(ud->data, ' ', ud->size);
193: else {
1.47 ! nicm 194: ui = &utf8_list[offset];
! 195: memcpy(ud->data, ui->data, ud->size);
1.45 nicm 196: }
197: }
198:
1.47 ! nicm 199: /* Get UTF-8 character from a single ASCII character. */
1.46 nicm 200: u_int
1.47 ! nicm 201: utf8_build_one(char c, u_int width)
1.45 nicm 202: {
1.47 ! nicm 203: union utf8_map m = { .flags = 1, .data[0] = c };
1.45 nicm 204:
205: if (width == 2)
1.47 ! nicm 206: m.flags |= UTF8_FLAG_WIDTH2;
! 207: return (m.uc);
1.45 nicm 208: }
1.29 nicm 209:
1.11 nicm 210: /* Set a single character. */
211: void
1.19 nicm 212: utf8_set(struct utf8_data *ud, u_char ch)
1.11 nicm 213: {
1.33 nicm 214: static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
1.20 nicm 215:
1.33 nicm 216: memcpy(ud, &empty, sizeof *ud);
1.19 nicm 217: *ud->data = ch;
1.20 nicm 218: }
219:
220: /* Copy UTF-8 character. */
221: void
222: utf8_copy(struct utf8_data *to, const struct utf8_data *from)
223: {
224: u_int i;
225:
226: memcpy(to, from, sizeof *to);
227:
228: for (i = to->size; i < sizeof to->data; i++)
229: to->data[i] = '\0';
1.11 nicm 230: }
231:
/*
 * Get width of Unicode character using wcwidth(). Returns the width, or -1
 * (after logging) if wcwidth() gives a value outside 0-255.
 */
static int
utf8_width(wchar_t wc)
{
	int	w = wcwidth(wc);

	if (w >= 0 && w <= 0xff)
		return (w);

	log_debug("Unicode %04lx, wcwidth() %d", (long)wc, w);
	return (-1);
}
! 245:
1.4 nicm 246: /*
247: * Open UTF-8 sequence.
248: *
249: * 11000010-11011111 C2-DF start of 2-byte sequence
250: * 11100000-11101111 E0-EF start of 3-byte sequence
251: * 11110000-11110100 F0-F4 start of 4-byte sequence
252: */
1.23 nicm 253: enum utf8_state
1.19 nicm 254: utf8_open(struct utf8_data *ud, u_char ch)
1.4 nicm 255: {
1.19 nicm 256: memset(ud, 0, sizeof *ud);
1.4 nicm 257: if (ch >= 0xc2 && ch <= 0xdf)
1.19 nicm 258: ud->size = 2;
1.4 nicm 259: else if (ch >= 0xe0 && ch <= 0xef)
1.19 nicm 260: ud->size = 3;
1.4 nicm 261: else if (ch >= 0xf0 && ch <= 0xf4)
1.19 nicm 262: ud->size = 4;
1.4 nicm 263: else
1.23 nicm 264: return (UTF8_ERROR);
1.19 nicm 265: utf8_append(ud, ch);
1.23 nicm 266: return (UTF8_MORE);
1.4 nicm 267: }
268:
1.23 nicm 269: /* Append character to UTF-8, closing if finished. */
270: enum utf8_state
1.19 nicm 271: utf8_append(struct utf8_data *ud, u_char ch)
1.4 nicm 272: {
1.29 nicm 273: wchar_t wc;
274: int width;
275:
1.19 nicm 276: if (ud->have >= ud->size)
1.4 nicm 277: fatalx("UTF-8 character overflow");
1.19 nicm 278: if (ud->size > sizeof ud->data)
1.4 nicm 279: fatalx("UTF-8 character size too large");
280:
1.21 nicm 281: if (ud->have != 0 && (ch & 0xc0) != 0x80)
282: ud->width = 0xff;
283:
1.19 nicm 284: ud->data[ud->have++] = ch;
285: if (ud->have != ud->size)
1.23 nicm 286: return (UTF8_MORE);
1.4 nicm 287:
1.21 nicm 288: if (ud->width == 0xff)
1.23 nicm 289: return (UTF8_ERROR);
1.29 nicm 290:
291: if (utf8_combine(ud, &wc) != UTF8_DONE)
292: return (UTF8_ERROR);
293: if ((width = utf8_width(wc)) < 0)
294: return (UTF8_ERROR);
295: ud->width = width;
296:
1.23 nicm 297: return (UTF8_DONE);
1.1 nicm 298: }
299:
1.28 nicm 300: /* Combine UTF-8 into Unicode. */
1.29 nicm 301: enum utf8_state
302: utf8_combine(const struct utf8_data *ud, wchar_t *wc)
1.1 nicm 303: {
1.29 nicm 304: switch (mbtowc(wc, ud->data, ud->size)) {
305: case -1:
1.30 nicm 306: log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
307: errno);
1.29 nicm 308: mbtowc(NULL, NULL, MB_CUR_MAX);
309: return (UTF8_ERROR);
310: case 0:
311: return (UTF8_ERROR);
312: default:
313: return (UTF8_DONE);
314: }
1.15 nicm 315: }
316:
1.28 nicm 317: /* Split Unicode into UTF-8. */
1.23 nicm 318: enum utf8_state
1.28 nicm 319: utf8_split(wchar_t wc, struct utf8_data *ud)
1.15 nicm 320: {
1.29 nicm 321: char s[MB_LEN_MAX];
322: int slen;
1.28 nicm 323:
324: slen = wctomb(s, wc);
325: if (slen <= 0 || slen > (int)sizeof ud->data)
1.23 nicm 326: return (UTF8_ERROR);
1.28 nicm 327:
328: memcpy(ud->data, s, slen);
329: ud->size = slen;
330:
331: ud->width = utf8_width(wc);
1.23 nicm 332: return (UTF8_DONE);
1.9 nicm 333: }
334:
335: /*
336: * Encode len characters from src into dst, which is guaranteed to have four
337: * bytes available for each character from src (for \abc or UTF-8) plus space
338: * for \0.
339: */
340: int
341: utf8_strvis(char *dst, const char *src, size_t len, int flag)
342: {
1.19 nicm 343: struct utf8_data ud;
1.47 ! nicm 344: const char *start = dst, *end = src + len;
1.23 nicm 345: enum utf8_state more;
1.9 nicm 346: size_t i;
347:
348: while (src < end) {
1.23 nicm 349: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
350: while (++src < end && more == UTF8_MORE)
1.19 nicm 351: more = utf8_append(&ud, *src);
1.23 nicm 352: if (more == UTF8_DONE) {
1.9 nicm 353: /* UTF-8 character finished. */
1.19 nicm 354: for (i = 0; i < ud.size; i++)
355: *dst++ = ud.data[i];
1.9 nicm 356: continue;
357: }
1.23 nicm 358: /* Not a complete, valid UTF-8 character. */
359: src -= ud.have;
1.9 nicm 360: }
1.41 nicm 361: if (src[0] == '$' && src < end - 1) {
1.42 nicm 362: if (isalpha((u_char)src[1]) ||
363: src[1] == '_' ||
364: src[1] == '{')
1.41 nicm 365: *dst++ = '\\';
366: *dst++ = '$';
367: } else if (src < end - 1)
1.9 nicm 368: dst = vis(dst, src[0], flag, src[1]);
369: else if (src < end)
370: dst = vis(dst, src[0], flag, '\0');
371: src++;
372: }
373: *dst = '\0';
374: return (dst - start);
1.35 nicm 375: }
376:
/*
 * Same as utf8_strvis but allocate the buffer. The result is stored in *dst,
 * trimmed to the encoded length plus the terminator; the length is returned.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen = strlen(src);
	char	*tmp;
	int	 used;

	/* Worst case is four output bytes per input byte. */
	tmp = xreallocarray(NULL, 4, srclen + 1);
	used = utf8_strvis(tmp, src, srclen, flag);

	*dst = xrealloc(tmp, used + 1);
	return (used);
}
390:
391: /* Does this string contain anything that isn't valid UTF-8? */
392: int
393: utf8_isvalid(const char *s)
394: {
1.47 ! nicm 395: struct utf8_data ud;
! 396: const char *end;
! 397: enum utf8_state more;
1.38 nicm 398:
399: end = s + strlen(s);
400: while (s < end) {
401: if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
402: while (++s < end && more == UTF8_MORE)
403: more = utf8_append(&ud, *s);
404: if (more == UTF8_DONE)
405: continue;
406: return (0);
407: }
408: if (*s < 0x20 || *s > 0x7e)
409: return (0);
410: s++;
411: }
412: return (1);
1.16 nicm 413: }
414:
415: /*
416: * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
417: * the returned string. Anything not valid printable ASCII or UTF-8 is
418: * stripped.
419: */
420: char *
421: utf8_sanitize(const char *src)
422: {
1.47 ! nicm 423: char *dst = NULL;
! 424: size_t n = 0;
! 425: enum utf8_state more;
! 426: struct utf8_data ud;
! 427: u_int i;
1.16 nicm 428:
429: while (*src != '\0') {
430: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 431: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
432: while (*++src != '\0' && more == UTF8_MORE)
1.19 nicm 433: more = utf8_append(&ud, *src);
1.23 nicm 434: if (more == UTF8_DONE) {
1.19 nicm 435: dst = xreallocarray(dst, n + ud.width,
1.16 nicm 436: sizeof *dst);
1.19 nicm 437: for (i = 0; i < ud.width; i++)
1.16 nicm 438: dst[n++] = '_';
439: continue;
440: }
1.19 nicm 441: src -= ud.have;
1.16 nicm 442: }
443: if (*src > 0x1f && *src < 0x7f)
1.21 nicm 444: dst[n++] = *src;
1.23 nicm 445: else
446: dst[n++] = '_';
1.16 nicm 447: src++;
448: }
449: dst = xreallocarray(dst, n + 1, sizeof *dst);
450: dst[n] = '\0';
451: return (dst);
1.34 nicm 452: }
453:
454: /* Get UTF-8 buffer length. */
455: size_t
456: utf8_strlen(const struct utf8_data *s)
457: {
458: size_t i;
459:
460: for (i = 0; s[i].size != 0; i++)
461: /* nothing */;
462: return (i);
463: }
464:
465: /* Get UTF-8 string width. */
466: u_int
467: utf8_strwidth(const struct utf8_data *s, ssize_t n)
468: {
469: ssize_t i;
1.47 ! nicm 470: u_int width = 0;
1.34 nicm 471:
472: for (i = 0; s[i].size != 0; i++) {
473: if (n != -1 && n == i)
474: break;
475: width += s[i].width;
476: }
477: return (width);
1.11 nicm 478: }
479:
480: /*
481: * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
482: * Caller frees.
483: */
484: struct utf8_data *
485: utf8_fromcstr(const char *src)
486: {
1.47 ! nicm 487: struct utf8_data *dst = NULL;
! 488: size_t n = 0;
1.23 nicm 489: enum utf8_state more;
1.11 nicm 490:
491: while (*src != '\0') {
1.12 nicm 492: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 493: if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
494: while (*++src != '\0' && more == UTF8_MORE)
1.11 nicm 495: more = utf8_append(&dst[n], *src);
1.23 nicm 496: if (more == UTF8_DONE) {
1.11 nicm 497: n++;
498: continue;
499: }
500: src -= dst[n].have;
501: }
1.23 nicm 502: utf8_set(&dst[n], *src);
503: n++;
1.11 nicm 504: src++;
505: }
1.12 nicm 506: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.11 nicm 507: dst[n].size = 0;
508: return (dst);
509: }
510:
511: /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
512: char *
513: utf8_tocstr(struct utf8_data *src)
514: {
1.47 ! nicm 515: char *dst = NULL;
! 516: size_t n = 0;
1.11 nicm 517:
518: for(; src->size != 0; src++) {
1.12 nicm 519: dst = xreallocarray(dst, n + src->size, 1);
1.11 nicm 520: memcpy(dst + n, src->data, src->size);
521: n += src->size;
522: }
1.12 nicm 523: dst = xreallocarray(dst, n + 1, 1);
1.11 nicm 524: dst[n] = '\0';
525: return (dst);
526: }
527:
528: /* Get width of UTF-8 string. */
529: u_int
530: utf8_cstrwidth(const char *s)
531: {
532: struct utf8_data tmp;
533: u_int width;
1.23 nicm 534: enum utf8_state more;
1.11 nicm 535:
536: width = 0;
537: while (*s != '\0') {
1.23 nicm 538: if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
539: while (*++s != '\0' && more == UTF8_MORE)
1.11 nicm 540: more = utf8_append(&tmp, *s);
1.23 nicm 541: if (more == UTF8_DONE) {
1.11 nicm 542: width += tmp.width;
543: continue;
544: }
545: s -= tmp.have;
546: }
1.23 nicm 547: if (*s > 0x1f && *s != 0x7f)
1.21 nicm 548: width++;
1.11 nicm 549: s++;
550: }
551: return (width);
1.18 nicm 552: }
553:
1.44 nicm 554: /* Pad UTF-8 string to width on the left. Caller frees. */
1.18 nicm 555: char *
556: utf8_padcstr(const char *s, u_int width)
557: {
558: size_t slen;
559: char *out;
1.47 ! nicm 560: u_int n, i;
1.18 nicm 561:
562: n = utf8_cstrwidth(s);
563: if (n >= width)
564: return (xstrdup(s));
565:
566: slen = strlen(s);
567: out = xmalloc(slen + 1 + (width - n));
568: memcpy(out, s, slen);
569: for (i = n; i < width; i++)
570: out[slen++] = ' ';
571: out[slen] = '\0';
1.44 nicm 572: return (out);
573: }
574:
575: /* Pad UTF-8 string to width on the right. Caller frees. */
576: char *
577: utf8_rpadcstr(const char *s, u_int width)
578: {
579: size_t slen;
580: char *out;
1.47 ! nicm 581: u_int n, i;
1.44 nicm 582:
583: n = utf8_cstrwidth(s);
584: if (n >= width)
585: return (xstrdup(s));
586:
587: slen = strlen(s);
588: out = xmalloc(slen + 1 + (width - n));
589: for (i = 0; i < width - n; i++)
590: out[i] = ' ';
591: memcpy(out + i, s, slen);
592: out[i + slen] = '\0';
1.11 nicm 593: return (out);
1.43 nicm 594: }
595:
596: int
597: utf8_cstrhas(const char *s, const struct utf8_data *ud)
598: {
599: struct utf8_data *copy, *loop;
600: int found = 0;
601:
602: copy = utf8_fromcstr(s);
603: for (loop = copy; loop->size != 0; loop++) {
604: if (loop->size != ud->size)
605: continue;
606: if (memcmp(loop->data, ud->data, loop->size) == 0) {
607: found = 1;
608: break;
609: }
610: }
611: free(copy);
612:
613: return (found);
1.1 nicm 614: }