Annotation of src/usr.bin/tmux/utf8.c, Revision 1.52
1.52 ! nicm 1: /* $OpenBSD: utf8.c,v 1.51 2020/06/02 17:17:44 nicm Exp $ */
1.1 nicm 2:
3: /*
1.26 nicm 4: * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
1.1 nicm 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15: * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16: * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18:
19: #include <sys/types.h>
20:
1.41 nicm 21: #include <ctype.h>
1.30 nicm 22: #include <errno.h>
1.11 nicm 23: #include <stdlib.h>
1.1 nicm 24: #include <string.h>
1.9 nicm 25: #include <vis.h>
1.28 nicm 26: #include <wchar.h>
1.1 nicm 27:
28: #include "tmux.h"
29:
1.47 nicm 30: struct utf8_item {
31: u_int offset;
32: RB_ENTRY(utf8_item) entry;
1.45 nicm 33:
34: char data[UTF8_SIZE];
35: u_char size;
36: };
1.47 nicm 37: RB_HEAD(utf8_tree, utf8_item);
1.45 nicm 38:
39: static int
1.47 nicm 40: utf8_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
1.45 nicm 41: {
1.47 nicm 42: if (ui1->size < ui2->size)
1.45 nicm 43: return (-1);
1.47 nicm 44: if (ui1->size > ui2->size)
1.45 nicm 45: return (1);
1.47 nicm 46: return (memcmp(ui1->data, ui2->data, ui1->size));
1.45 nicm 47: }
1.47 nicm 48: RB_GENERATE_STATIC(utf8_tree, utf8_item, entry, utf8_cmp);
49: static struct utf8_tree utf8_tree = RB_INITIALIZER(utf8_tree);
1.45 nicm 50:
1.47 nicm 51: static struct utf8_item *utf8_list;
52: static u_int utf8_list_size;
53: static u_int utf8_list_used;
1.45 nicm 54:
1.47 nicm 55: union utf8_map {
56: utf8_char uc;
1.45 nicm 57: struct {
58: u_char flags;
59: u_char data[3];
60: };
61: } __packed;
62:
/*
 * Layout of the flags byte: the low five bits hold the size in bytes and the
 * bits above them hold the width plus one. Every macro argument is
 * parenthesized so that expressions may be passed safely.
 */
#define UTF8_GET_SIZE(flags) ((flags) & 0x1f)
#define UTF8_GET_WIDTH(flags) (((flags) >> 5) - 1)

#define UTF8_SET_SIZE(size) (size)
#define UTF8_SET_WIDTH(width) (((width) + 1) << 5)
! 68:
! 69: static const union utf8_map utf8_space0 = {
! 70: .flags = UTF8_SET_WIDTH(0)|UTF8_SET_SIZE(0),
! 71: .data = ""
! 72: };
1.47 nicm 73: static const union utf8_map utf8_space1 = {
1.52 ! nicm 74: .flags = UTF8_SET_WIDTH(1)|UTF8_SET_SIZE(1),
1.45 nicm 75: .data = " "
76: };
1.47 nicm 77: static const union utf8_map utf8_space2 = {
1.52 ! nicm 78: .flags = UTF8_SET_WIDTH(2)|UTF8_SET_SIZE(2),
1.45 nicm 79: .data = " "
80: };
81:
1.47 nicm 82: /* Get a UTF-8 item by offset. */
83: static struct utf8_item *
84: utf8_get_item(const char *data, size_t size)
1.45 nicm 85: {
1.47 nicm 86: struct utf8_item ui;
87:
88: memcpy(ui.data, data, size);
89: ui.size = size;
1.45 nicm 90:
1.47 nicm 91: return (RB_FIND(utf8_tree, &utf8_tree, &ui));
92: }
1.45 nicm 93:
1.47 nicm 94: /* Expand UTF-8 list. */
95: static int
96: utf8_expand_list(void)
97: {
98: if (utf8_list_size == 0xffffff)
99: return (-1);
100: if (utf8_list_size == 0)
101: utf8_list_size = 256;
102: else if (utf8_list_size > 0x7fffff)
103: utf8_list_size = 0xffffff;
104: else
105: utf8_list_size *= 2;
106: utf8_list = xreallocarray(utf8_list, utf8_list_size, sizeof *utf8_list);
107: return (0);
1.45 nicm 108: }
109:
1.47 nicm 110: /* Add a UTF-8 item. */
1.45 nicm 111: static int
1.47 nicm 112: utf8_put_item(const char *data, size_t size, u_int *offset)
1.45 nicm 113: {
1.47 nicm 114: struct utf8_item *ui;
1.45 nicm 115:
1.47 nicm 116: ui = utf8_get_item(data, size);
117: if (ui != NULL) {
118: *offset = ui->offset;
1.45 nicm 119: log_debug("%s: have %.*s at %u", __func__, (int)size, data,
1.47 nicm 120: *offset);
1.45 nicm 121: return (0);
122: }
123:
1.47 nicm 124: if (utf8_list_used == utf8_list_size && utf8_expand_list() != 0)
125: return (-1);
126: *offset = utf8_list_used++;
127:
128: ui = &utf8_list[*offset];
129: ui->offset = *offset;
130: memcpy(ui->data, data, size);
131: ui->size = size;
132: RB_INSERT(utf8_tree, &utf8_tree, ui);
1.45 nicm 133:
1.47 nicm 134: log_debug("%s: added %.*s at %u", __func__, (int)size, data, *offset);
1.45 nicm 135: return (0);
136: }
137:
1.47 nicm 138: /* Get UTF-8 character from data. */
139: enum utf8_state
140: utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
1.45 nicm 141: {
1.47 nicm 142: union utf8_map m = { .uc = 0 };
143: u_int offset;
1.45 nicm 144:
1.52 ! nicm 145: if (ud->width > 2)
1.49 nicm 146: fatalx("invalid UTF-8 width");
1.45 nicm 147:
1.52 ! nicm 148: if (ud->size > UTF8_SIZE)
1.45 nicm 149: goto fail;
1.52 ! nicm 150: m.flags = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width);
1.47 nicm 151: if (ud->size <= 3)
152: memcpy(m.data, ud->data, ud->size);
153: else {
154: if (utf8_put_item(ud->data, ud->size, &offset) != 0)
155: goto fail;
156: m.data[0] = (offset & 0xff);
157: m.data[1] = (offset >> 8) & 0xff;
158: m.data[2] = (offset >> 16);
1.45 nicm 159: }
1.51 nicm 160: *uc = htonl(m.uc);
1.47 nicm 161: return (UTF8_DONE);
1.45 nicm 162:
163: fail:
1.52 ! nicm 164: if (ud->width == 0)
! 165: *uc = htonl(utf8_space0.uc);
! 166: else if (ud->width == 1)
1.51 nicm 167: *uc = htonl(utf8_space1.uc);
1.47 nicm 168: else
1.51 nicm 169: *uc = htonl(utf8_space2.uc);
1.47 nicm 170: return (UTF8_ERROR);
1.45 nicm 171: }
172:
1.47 nicm 173: /* Get UTF-8 data from character. */
1.45 nicm 174: void
1.47 nicm 175: utf8_to_data(utf8_char uc, struct utf8_data *ud)
1.45 nicm 176: {
1.51 nicm 177: union utf8_map m = { .uc = ntohl(uc) };
1.47 nicm 178: struct utf8_item *ui;
179: u_int offset;
1.45 nicm 180:
181: memset(ud, 0, sizeof *ud);
1.52 ! nicm 182: ud->size = ud->have = UTF8_GET_SIZE(m.flags);
! 183: ud->width = UTF8_GET_WIDTH(m.flags);
1.45 nicm 184:
185: if (ud->size <= 3) {
186: memcpy(ud->data, m.data, ud->size);
187: return;
188: }
189:
1.47 nicm 190: offset = ((u_int)m.data[2] << 16)|((u_int)m.data[1] << 8)|m.data[0];
191: if (offset >= utf8_list_used)
1.45 nicm 192: memset(ud->data, ' ', ud->size);
193: else {
1.47 nicm 194: ui = &utf8_list[offset];
195: memcpy(ud->data, ui->data, ud->size);
1.45 nicm 196: }
197: }
198:
1.47 nicm 199: /* Get UTF-8 character from a single ASCII character. */
1.46 nicm 200: u_int
1.52 ! nicm 201: utf8_build_one(u_char ch)
1.45 nicm 202: {
1.52 ! nicm 203: union utf8_map m;
1.45 nicm 204:
1.52 ! nicm 205: m.flags = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1);
! 206: m.data[0] = ch;
1.51 nicm 207: return (htonl(m.uc));
1.45 nicm 208: }
1.29 nicm 209:
1.11 nicm 210: /* Set a single character. */
211: void
1.19 nicm 212: utf8_set(struct utf8_data *ud, u_char ch)
1.11 nicm 213: {
1.33 nicm 214: static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
1.20 nicm 215:
1.33 nicm 216: memcpy(ud, &empty, sizeof *ud);
1.19 nicm 217: *ud->data = ch;
1.20 nicm 218: }
219:
220: /* Copy UTF-8 character. */
221: void
222: utf8_copy(struct utf8_data *to, const struct utf8_data *from)
223: {
224: u_int i;
225:
226: memcpy(to, from, sizeof *to);
227:
228: for (i = to->size; i < sizeof to->data; i++)
229: to->data[i] = '\0';
1.11 nicm 230: }
231:
1.47 nicm 232: /* Get width of Unicode character. */
1.48 nicm 233: static enum utf8_state
234: utf8_width(struct utf8_data *ud, int *width)
1.47 nicm 235: {
1.48 nicm 236: wchar_t wc;
1.47 nicm 237:
1.48 nicm 238: switch (mbtowc(&wc, ud->data, ud->size)) {
239: case -1:
240: log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
241: errno);
242: mbtowc(NULL, NULL, MB_CUR_MAX);
243: return (UTF8_ERROR);
244: case 0:
245: return (UTF8_ERROR);
246: }
247: *width = wcwidth(wc);
248: if (*width < 0 || *width > 0xff) {
249: log_debug("UTF-8 %.*s, wcwidth() %d", (int)ud->size, ud->data,
250: *width);
251: return (UTF8_ERROR);
1.47 nicm 252: }
1.48 nicm 253: return (UTF8_DONE);
1.47 nicm 254: }
255:
1.4 nicm 256: /*
257: * Open UTF-8 sequence.
258: *
259: * 11000010-11011111 C2-DF start of 2-byte sequence
260: * 11100000-11101111 E0-EF start of 3-byte sequence
261: * 11110000-11110100 F0-F4 start of 4-byte sequence
262: */
1.23 nicm 263: enum utf8_state
1.19 nicm 264: utf8_open(struct utf8_data *ud, u_char ch)
1.4 nicm 265: {
1.19 nicm 266: memset(ud, 0, sizeof *ud);
1.4 nicm 267: if (ch >= 0xc2 && ch <= 0xdf)
1.19 nicm 268: ud->size = 2;
1.4 nicm 269: else if (ch >= 0xe0 && ch <= 0xef)
1.19 nicm 270: ud->size = 3;
1.4 nicm 271: else if (ch >= 0xf0 && ch <= 0xf4)
1.19 nicm 272: ud->size = 4;
1.4 nicm 273: else
1.23 nicm 274: return (UTF8_ERROR);
1.19 nicm 275: utf8_append(ud, ch);
1.23 nicm 276: return (UTF8_MORE);
1.4 nicm 277: }
278:
1.23 nicm 279: /* Append character to UTF-8, closing if finished. */
280: enum utf8_state
1.19 nicm 281: utf8_append(struct utf8_data *ud, u_char ch)
1.4 nicm 282: {
1.29 nicm 283: int width;
284:
1.19 nicm 285: if (ud->have >= ud->size)
1.4 nicm 286: fatalx("UTF-8 character overflow");
1.19 nicm 287: if (ud->size > sizeof ud->data)
1.4 nicm 288: fatalx("UTF-8 character size too large");
289:
1.21 nicm 290: if (ud->have != 0 && (ch & 0xc0) != 0x80)
291: ud->width = 0xff;
292:
1.19 nicm 293: ud->data[ud->have++] = ch;
294: if (ud->have != ud->size)
1.23 nicm 295: return (UTF8_MORE);
1.4 nicm 296:
1.21 nicm 297: if (ud->width == 0xff)
1.23 nicm 298: return (UTF8_ERROR);
1.48 nicm 299: if (utf8_width(ud, &width) != UTF8_DONE)
1.29 nicm 300: return (UTF8_ERROR);
301: ud->width = width;
302:
1.23 nicm 303: return (UTF8_DONE);
1.9 nicm 304: }
305:
306: /*
307: * Encode len characters from src into dst, which is guaranteed to have four
308: * bytes available for each character from src (for \abc or UTF-8) plus space
309: * for \0.
310: */
311: int
312: utf8_strvis(char *dst, const char *src, size_t len, int flag)
313: {
1.19 nicm 314: struct utf8_data ud;
1.47 nicm 315: const char *start = dst, *end = src + len;
1.23 nicm 316: enum utf8_state more;
1.9 nicm 317: size_t i;
318:
319: while (src < end) {
1.23 nicm 320: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
321: while (++src < end && more == UTF8_MORE)
1.19 nicm 322: more = utf8_append(&ud, *src);
1.23 nicm 323: if (more == UTF8_DONE) {
1.9 nicm 324: /* UTF-8 character finished. */
1.19 nicm 325: for (i = 0; i < ud.size; i++)
326: *dst++ = ud.data[i];
1.9 nicm 327: continue;
328: }
1.23 nicm 329: /* Not a complete, valid UTF-8 character. */
330: src -= ud.have;
1.9 nicm 331: }
1.41 nicm 332: if (src[0] == '$' && src < end - 1) {
1.42 nicm 333: if (isalpha((u_char)src[1]) ||
334: src[1] == '_' ||
335: src[1] == '{')
1.41 nicm 336: *dst++ = '\\';
337: *dst++ = '$';
338: } else if (src < end - 1)
1.9 nicm 339: dst = vis(dst, src[0], flag, src[1]);
340: else if (src < end)
341: dst = vis(dst, src[0], flag, '\0');
342: src++;
343: }
344: *dst = '\0';
345: return (dst - start);
1.35 nicm 346: }
347:
/*
 * Same as utf8_strvis but the output buffer is allocated here and handed
 * back through *dst, trimmed to the encoded length plus the terminator.
 * Returns the encoded length.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen = strlen(src);
	char	*buf;
	int	 len;

	/* Four output bytes for each input byte, counting the terminator. */
	buf = xreallocarray(NULL, 4, srclen + 1);
	len = utf8_strvis(buf, src, srclen, flag);

	*dst = xrealloc(buf, len + 1);
	return (len);
}
361:
362: /* Does this string contain anything that isn't valid UTF-8? */
363: int
364: utf8_isvalid(const char *s)
365: {
1.47 nicm 366: struct utf8_data ud;
367: const char *end;
368: enum utf8_state more;
1.38 nicm 369:
370: end = s + strlen(s);
371: while (s < end) {
372: if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
373: while (++s < end && more == UTF8_MORE)
374: more = utf8_append(&ud, *s);
375: if (more == UTF8_DONE)
376: continue;
377: return (0);
378: }
379: if (*s < 0x20 || *s > 0x7e)
380: return (0);
381: s++;
382: }
383: return (1);
1.16 nicm 384: }
385:
386: /*
387: * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
388: * the returned string. Anything not valid printable ASCII or UTF-8 is
389: * stripped.
390: */
391: char *
392: utf8_sanitize(const char *src)
393: {
1.47 nicm 394: char *dst = NULL;
395: size_t n = 0;
396: enum utf8_state more;
397: struct utf8_data ud;
398: u_int i;
1.16 nicm 399:
400: while (*src != '\0') {
401: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 402: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
403: while (*++src != '\0' && more == UTF8_MORE)
1.19 nicm 404: more = utf8_append(&ud, *src);
1.23 nicm 405: if (more == UTF8_DONE) {
1.19 nicm 406: dst = xreallocarray(dst, n + ud.width,
1.16 nicm 407: sizeof *dst);
1.19 nicm 408: for (i = 0; i < ud.width; i++)
1.16 nicm 409: dst[n++] = '_';
410: continue;
411: }
1.19 nicm 412: src -= ud.have;
1.16 nicm 413: }
414: if (*src > 0x1f && *src < 0x7f)
1.21 nicm 415: dst[n++] = *src;
1.23 nicm 416: else
417: dst[n++] = '_';
1.16 nicm 418: src++;
419: }
420: dst = xreallocarray(dst, n + 1, sizeof *dst);
421: dst[n] = '\0';
422: return (dst);
1.34 nicm 423: }
424:
425: /* Get UTF-8 buffer length. */
426: size_t
427: utf8_strlen(const struct utf8_data *s)
428: {
429: size_t i;
430:
431: for (i = 0; s[i].size != 0; i++)
432: /* nothing */;
433: return (i);
434: }
435:
436: /* Get UTF-8 string width. */
437: u_int
438: utf8_strwidth(const struct utf8_data *s, ssize_t n)
439: {
440: ssize_t i;
1.47 nicm 441: u_int width = 0;
1.34 nicm 442:
443: for (i = 0; s[i].size != 0; i++) {
444: if (n != -1 && n == i)
445: break;
446: width += s[i].width;
447: }
448: return (width);
1.11 nicm 449: }
450:
451: /*
452: * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
453: * Caller frees.
454: */
455: struct utf8_data *
456: utf8_fromcstr(const char *src)
457: {
1.47 nicm 458: struct utf8_data *dst = NULL;
459: size_t n = 0;
1.23 nicm 460: enum utf8_state more;
1.11 nicm 461:
462: while (*src != '\0') {
1.12 nicm 463: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 464: if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
465: while (*++src != '\0' && more == UTF8_MORE)
1.11 nicm 466: more = utf8_append(&dst[n], *src);
1.23 nicm 467: if (more == UTF8_DONE) {
1.11 nicm 468: n++;
469: continue;
470: }
471: src -= dst[n].have;
472: }
1.23 nicm 473: utf8_set(&dst[n], *src);
474: n++;
1.11 nicm 475: src++;
476: }
1.12 nicm 477: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.11 nicm 478: dst[n].size = 0;
479: return (dst);
480: }
481:
482: /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
483: char *
484: utf8_tocstr(struct utf8_data *src)
485: {
1.47 nicm 486: char *dst = NULL;
487: size_t n = 0;
1.11 nicm 488:
489: for(; src->size != 0; src++) {
1.12 nicm 490: dst = xreallocarray(dst, n + src->size, 1);
1.11 nicm 491: memcpy(dst + n, src->data, src->size);
492: n += src->size;
493: }
1.12 nicm 494: dst = xreallocarray(dst, n + 1, 1);
1.11 nicm 495: dst[n] = '\0';
496: return (dst);
497: }
498:
499: /* Get width of UTF-8 string. */
500: u_int
501: utf8_cstrwidth(const char *s)
502: {
503: struct utf8_data tmp;
504: u_int width;
1.23 nicm 505: enum utf8_state more;
1.11 nicm 506:
507: width = 0;
508: while (*s != '\0') {
1.23 nicm 509: if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
510: while (*++s != '\0' && more == UTF8_MORE)
1.11 nicm 511: more = utf8_append(&tmp, *s);
1.23 nicm 512: if (more == UTF8_DONE) {
1.11 nicm 513: width += tmp.width;
514: continue;
515: }
516: s -= tmp.have;
517: }
1.23 nicm 518: if (*s > 0x1f && *s != 0x7f)
1.21 nicm 519: width++;
1.11 nicm 520: s++;
521: }
522: return (width);
1.18 nicm 523: }
524:
1.44 nicm 525: /* Pad UTF-8 string to width on the left. Caller frees. */
1.18 nicm 526: char *
527: utf8_padcstr(const char *s, u_int width)
528: {
529: size_t slen;
530: char *out;
1.47 nicm 531: u_int n, i;
1.18 nicm 532:
533: n = utf8_cstrwidth(s);
534: if (n >= width)
535: return (xstrdup(s));
536:
537: slen = strlen(s);
538: out = xmalloc(slen + 1 + (width - n));
539: memcpy(out, s, slen);
540: for (i = n; i < width; i++)
541: out[slen++] = ' ';
542: out[slen] = '\0';
1.44 nicm 543: return (out);
544: }
545:
546: /* Pad UTF-8 string to width on the right. Caller frees. */
547: char *
548: utf8_rpadcstr(const char *s, u_int width)
549: {
550: size_t slen;
551: char *out;
1.47 nicm 552: u_int n, i;
1.44 nicm 553:
554: n = utf8_cstrwidth(s);
555: if (n >= width)
556: return (xstrdup(s));
557:
558: slen = strlen(s);
559: out = xmalloc(slen + 1 + (width - n));
560: for (i = 0; i < width - n; i++)
561: out[i] = ' ';
562: memcpy(out + i, s, slen);
563: out[i + slen] = '\0';
1.11 nicm 564: return (out);
1.43 nicm 565: }
566:
567: int
568: utf8_cstrhas(const char *s, const struct utf8_data *ud)
569: {
570: struct utf8_data *copy, *loop;
571: int found = 0;
572:
573: copy = utf8_fromcstr(s);
574: for (loop = copy; loop->size != 0; loop++) {
575: if (loop->size != ud->size)
576: continue;
577: if (memcmp(loop->data, ud->data, loop->size) == 0) {
578: found = 1;
579: break;
580: }
581: }
582: free(copy);
583:
584: return (found);
1.1 nicm 585: }