Annotation of src/usr.bin/tmux/utf8.c, Revision 1.49
1.49 ! nicm 1: /* $OpenBSD: utf8.c,v 1.48 2020/05/25 18:57:25 nicm Exp $ */
1.1 nicm 2:
3: /*
1.26 nicm 4: * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
1.1 nicm 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15: * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16: * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18:
19: #include <sys/types.h>
20:
1.41 nicm 21: #include <ctype.h>
1.30 nicm 22: #include <errno.h>
1.11 nicm 23: #include <stdlib.h>
1.1 nicm 24: #include <string.h>
1.9 nicm 25: #include <vis.h>
1.28 nicm 26: #include <wchar.h>
1.1 nicm 27:
28: #include "tmux.h"
29:
1.47 nicm 30: struct utf8_item {
31: u_int offset;
32: RB_ENTRY(utf8_item) entry;
1.45 nicm 33:
34: char data[UTF8_SIZE];
35: u_char size;
36: };
1.47 nicm 37: RB_HEAD(utf8_tree, utf8_item);
1.45 nicm 38:
39: static int
1.47 nicm 40: utf8_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
1.45 nicm 41: {
1.47 nicm 42: if (ui1->size < ui2->size)
1.45 nicm 43: return (-1);
1.47 nicm 44: if (ui1->size > ui2->size)
1.45 nicm 45: return (1);
1.47 nicm 46: return (memcmp(ui1->data, ui2->data, ui1->size));
1.45 nicm 47: }
1.47 nicm 48: RB_GENERATE_STATIC(utf8_tree, utf8_item, entry, utf8_cmp);
49: static struct utf8_tree utf8_tree = RB_INITIALIZER(utf8_tree);
1.45 nicm 50:
1.47 nicm 51: static struct utf8_item *utf8_list;
52: static u_int utf8_list_size;
53: static u_int utf8_list_used;
1.45 nicm 54:
1.47 nicm 55: union utf8_map {
56: utf8_char uc;
1.45 nicm 57: struct {
58: u_char flags;
1.47 nicm 59: #define UTF8_FLAG_SIZE 0x1f
60: #define UTF8_FLAG_WIDTH2 0x20
1.45 nicm 61:
62: u_char data[3];
63: };
64: } __packed;
65:
1.47 nicm 66: static const union utf8_map utf8_space1 = {
1.45 nicm 67: .flags = 1,
68: .data = " "
69: };
1.47 nicm 70: static const union utf8_map utf8_space2 = {
71: .flags = UTF8_FLAG_WIDTH2|2,
1.45 nicm 72: .data = " "
73: };
74:
1.47 nicm 75: /* Get a UTF-8 item by offset. */
76: static struct utf8_item *
77: utf8_get_item(const char *data, size_t size)
1.45 nicm 78: {
1.47 nicm 79: struct utf8_item ui;
80:
81: memcpy(ui.data, data, size);
82: ui.size = size;
1.45 nicm 83:
1.47 nicm 84: return (RB_FIND(utf8_tree, &utf8_tree, &ui));
85: }
1.45 nicm 86:
1.47 nicm 87: /* Expand UTF-8 list. */
88: static int
89: utf8_expand_list(void)
90: {
91: if (utf8_list_size == 0xffffff)
92: return (-1);
93: if (utf8_list_size == 0)
94: utf8_list_size = 256;
95: else if (utf8_list_size > 0x7fffff)
96: utf8_list_size = 0xffffff;
97: else
98: utf8_list_size *= 2;
99: utf8_list = xreallocarray(utf8_list, utf8_list_size, sizeof *utf8_list);
100: return (0);
1.45 nicm 101: }
102:
1.47 nicm 103: /* Add a UTF-8 item. */
1.45 nicm 104: static int
1.47 nicm 105: utf8_put_item(const char *data, size_t size, u_int *offset)
1.45 nicm 106: {
1.47 nicm 107: struct utf8_item *ui;
1.45 nicm 108:
1.47 nicm 109: ui = utf8_get_item(data, size);
110: if (ui != NULL) {
111: *offset = ui->offset;
1.45 nicm 112: log_debug("%s: have %.*s at %u", __func__, (int)size, data,
1.47 nicm 113: *offset);
1.45 nicm 114: return (0);
115: }
116:
1.47 nicm 117: if (utf8_list_used == utf8_list_size && utf8_expand_list() != 0)
118: return (-1);
119: *offset = utf8_list_used++;
120:
121: ui = &utf8_list[*offset];
122: ui->offset = *offset;
123: memcpy(ui->data, data, size);
124: ui->size = size;
125: RB_INSERT(utf8_tree, &utf8_tree, ui);
1.45 nicm 126:
1.47 nicm 127: log_debug("%s: added %.*s at %u", __func__, (int)size, data, *offset);
1.45 nicm 128: return (0);
129: }
130:
1.47 nicm 131: /* Get UTF-8 character from data. */
132: enum utf8_state
133: utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
1.45 nicm 134: {
1.47 nicm 135: union utf8_map m = { .uc = 0 };
136: u_int offset;
1.45 nicm 137:
138: if (ud->width != 1 && ud->width != 2)
1.49 ! nicm 139: fatalx("invalid UTF-8 width");
! 140: if (ud->size == 0)
! 141: fatalx("invalid UTF-8 size");
1.45 nicm 142:
1.47 nicm 143: if (ud->size > UTF8_FLAG_SIZE)
1.45 nicm 144: goto fail;
1.49 ! nicm 145: if (ud->size == 1) {
! 146: *uc = utf8_build_one(ud->data[0], 1);
! 147: return (UTF8_DONE);
! 148: }
1.45 nicm 149:
1.47 nicm 150: m.flags = ud->size;
1.45 nicm 151: if (ud->width == 2)
1.47 nicm 152: m.flags |= UTF8_FLAG_WIDTH2;
1.45 nicm 153:
1.47 nicm 154: if (ud->size <= 3)
155: memcpy(m.data, ud->data, ud->size);
156: else {
157: if (utf8_put_item(ud->data, ud->size, &offset) != 0)
158: goto fail;
159: m.data[0] = (offset & 0xff);
160: m.data[1] = (offset >> 8) & 0xff;
161: m.data[2] = (offset >> 16);
1.45 nicm 162: }
1.47 nicm 163: *uc = m.uc;
164: return (UTF8_DONE);
1.45 nicm 165:
166: fail:
167: if (ud->width == 1)
1.47 nicm 168: *uc = utf8_space1.uc;
169: else
170: *uc = utf8_space2.uc;
171: return (UTF8_ERROR);
1.45 nicm 172: }
173:
1.47 nicm 174: /* Get UTF-8 data from character. */
1.45 nicm 175: void
1.47 nicm 176: utf8_to_data(utf8_char uc, struct utf8_data *ud)
1.45 nicm 177: {
1.47 nicm 178: union utf8_map m = { .uc = uc };
179: struct utf8_item *ui;
180: u_int offset;
1.45 nicm 181:
182: memset(ud, 0, sizeof *ud);
1.47 nicm 183: ud->size = ud->have = (m.flags & UTF8_FLAG_SIZE);
184: if (m.flags & UTF8_FLAG_WIDTH2)
1.45 nicm 185: ud->width = 2;
186: else
187: ud->width = 1;
188:
189: if (ud->size <= 3) {
190: memcpy(ud->data, m.data, ud->size);
191: return;
192: }
193:
1.47 nicm 194: offset = ((u_int)m.data[2] << 16)|((u_int)m.data[1] << 8)|m.data[0];
195: if (offset >= utf8_list_used)
1.45 nicm 196: memset(ud->data, ' ', ud->size);
197: else {
1.47 nicm 198: ui = &utf8_list[offset];
199: memcpy(ud->data, ui->data, ud->size);
1.45 nicm 200: }
201: }
202:
1.47 nicm 203: /* Get UTF-8 character from a single ASCII character. */
1.46 nicm 204: u_int
1.47 nicm 205: utf8_build_one(char c, u_int width)
1.45 nicm 206: {
1.47 nicm 207: union utf8_map m = { .flags = 1, .data[0] = c };
1.45 nicm 208:
209: if (width == 2)
1.47 nicm 210: m.flags |= UTF8_FLAG_WIDTH2;
211: return (m.uc);
1.45 nicm 212: }
1.29 nicm 213:
1.11 nicm 214: /* Set a single character. */
215: void
1.19 nicm 216: utf8_set(struct utf8_data *ud, u_char ch)
1.11 nicm 217: {
1.33 nicm 218: static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
1.20 nicm 219:
1.33 nicm 220: memcpy(ud, &empty, sizeof *ud);
1.19 nicm 221: *ud->data = ch;
1.20 nicm 222: }
223:
224: /* Copy UTF-8 character. */
225: void
226: utf8_copy(struct utf8_data *to, const struct utf8_data *from)
227: {
228: u_int i;
229:
230: memcpy(to, from, sizeof *to);
231:
232: for (i = to->size; i < sizeof to->data; i++)
233: to->data[i] = '\0';
1.11 nicm 234: }
235:
1.47 nicm 236: /* Get width of Unicode character. */
1.48 nicm 237: static enum utf8_state
238: utf8_width(struct utf8_data *ud, int *width)
1.47 nicm 239: {
1.48 nicm 240: wchar_t wc;
1.47 nicm 241:
1.48 nicm 242: switch (mbtowc(&wc, ud->data, ud->size)) {
243: case -1:
244: log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
245: errno);
246: mbtowc(NULL, NULL, MB_CUR_MAX);
247: return (UTF8_ERROR);
248: case 0:
249: return (UTF8_ERROR);
250: }
251: *width = wcwidth(wc);
252: if (*width < 0 || *width > 0xff) {
253: log_debug("UTF-8 %.*s, wcwidth() %d", (int)ud->size, ud->data,
254: *width);
255: return (UTF8_ERROR);
1.47 nicm 256: }
1.48 nicm 257: return (UTF8_DONE);
1.47 nicm 258: }
259:
1.4 nicm 260: /*
261: * Open UTF-8 sequence.
262: *
263: * 11000010-11011111 C2-DF start of 2-byte sequence
264: * 11100000-11101111 E0-EF start of 3-byte sequence
265: * 11110000-11110100 F0-F4 start of 4-byte sequence
266: */
1.23 nicm 267: enum utf8_state
1.19 nicm 268: utf8_open(struct utf8_data *ud, u_char ch)
1.4 nicm 269: {
1.19 nicm 270: memset(ud, 0, sizeof *ud);
1.4 nicm 271: if (ch >= 0xc2 && ch <= 0xdf)
1.19 nicm 272: ud->size = 2;
1.4 nicm 273: else if (ch >= 0xe0 && ch <= 0xef)
1.19 nicm 274: ud->size = 3;
1.4 nicm 275: else if (ch >= 0xf0 && ch <= 0xf4)
1.19 nicm 276: ud->size = 4;
1.4 nicm 277: else
1.23 nicm 278: return (UTF8_ERROR);
1.19 nicm 279: utf8_append(ud, ch);
1.23 nicm 280: return (UTF8_MORE);
1.4 nicm 281: }
282:
1.23 nicm 283: /* Append character to UTF-8, closing if finished. */
284: enum utf8_state
1.19 nicm 285: utf8_append(struct utf8_data *ud, u_char ch)
1.4 nicm 286: {
1.29 nicm 287: int width;
288:
1.19 nicm 289: if (ud->have >= ud->size)
1.4 nicm 290: fatalx("UTF-8 character overflow");
1.19 nicm 291: if (ud->size > sizeof ud->data)
1.4 nicm 292: fatalx("UTF-8 character size too large");
293:
1.21 nicm 294: if (ud->have != 0 && (ch & 0xc0) != 0x80)
295: ud->width = 0xff;
296:
1.19 nicm 297: ud->data[ud->have++] = ch;
298: if (ud->have != ud->size)
1.23 nicm 299: return (UTF8_MORE);
1.4 nicm 300:
1.21 nicm 301: if (ud->width == 0xff)
1.23 nicm 302: return (UTF8_ERROR);
1.48 nicm 303: if (utf8_width(ud, &width) != UTF8_DONE)
1.29 nicm 304: return (UTF8_ERROR);
305: ud->width = width;
306:
1.23 nicm 307: return (UTF8_DONE);
1.9 nicm 308: }
309:
310: /*
311: * Encode len characters from src into dst, which is guaranteed to have four
312: * bytes available for each character from src (for \abc or UTF-8) plus space
313: * for \0.
314: */
315: int
316: utf8_strvis(char *dst, const char *src, size_t len, int flag)
317: {
1.19 nicm 318: struct utf8_data ud;
1.47 nicm 319: const char *start = dst, *end = src + len;
1.23 nicm 320: enum utf8_state more;
1.9 nicm 321: size_t i;
322:
323: while (src < end) {
1.23 nicm 324: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
325: while (++src < end && more == UTF8_MORE)
1.19 nicm 326: more = utf8_append(&ud, *src);
1.23 nicm 327: if (more == UTF8_DONE) {
1.9 nicm 328: /* UTF-8 character finished. */
1.19 nicm 329: for (i = 0; i < ud.size; i++)
330: *dst++ = ud.data[i];
1.9 nicm 331: continue;
332: }
1.23 nicm 333: /* Not a complete, valid UTF-8 character. */
334: src -= ud.have;
1.9 nicm 335: }
1.41 nicm 336: if (src[0] == '$' && src < end - 1) {
1.42 nicm 337: if (isalpha((u_char)src[1]) ||
338: src[1] == '_' ||
339: src[1] == '{')
1.41 nicm 340: *dst++ = '\\';
341: *dst++ = '$';
342: } else if (src < end - 1)
1.9 nicm 343: dst = vis(dst, src[0], flag, src[1]);
344: else if (src < end)
345: dst = vis(dst, src[0], flag, '\0');
346: src++;
347: }
348: *dst = '\0';
349: return (dst - start);
1.35 nicm 350: }
351:
/*
 * Same as utf8_strvis but allocate the buffer. Four bytes are reserved for
 * every byte of src (and for the terminator), then the buffer is trimmed to
 * the encoded length and handed back through *dst. Returns that length.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen = strlen(src);
	char	*buf;
	int	 len;

	buf = xreallocarray(NULL, 4, srclen + 1);
	len = utf8_strvis(buf, src, srclen, flag);

	*dst = xrealloc(buf, len + 1);
	return (len);
}
365:
366: /* Does this string contain anything that isn't valid UTF-8? */
367: int
368: utf8_isvalid(const char *s)
369: {
1.47 nicm 370: struct utf8_data ud;
371: const char *end;
372: enum utf8_state more;
1.38 nicm 373:
374: end = s + strlen(s);
375: while (s < end) {
376: if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
377: while (++s < end && more == UTF8_MORE)
378: more = utf8_append(&ud, *s);
379: if (more == UTF8_DONE)
380: continue;
381: return (0);
382: }
383: if (*s < 0x20 || *s > 0x7e)
384: return (0);
385: s++;
386: }
387: return (1);
1.16 nicm 388: }
389:
390: /*
391: * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
392: * the returned string. Anything not valid printable ASCII or UTF-8 is
393: * stripped.
394: */
395: char *
396: utf8_sanitize(const char *src)
397: {
1.47 nicm 398: char *dst = NULL;
399: size_t n = 0;
400: enum utf8_state more;
401: struct utf8_data ud;
402: u_int i;
1.16 nicm 403:
404: while (*src != '\0') {
405: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 406: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
407: while (*++src != '\0' && more == UTF8_MORE)
1.19 nicm 408: more = utf8_append(&ud, *src);
1.23 nicm 409: if (more == UTF8_DONE) {
1.19 nicm 410: dst = xreallocarray(dst, n + ud.width,
1.16 nicm 411: sizeof *dst);
1.19 nicm 412: for (i = 0; i < ud.width; i++)
1.16 nicm 413: dst[n++] = '_';
414: continue;
415: }
1.19 nicm 416: src -= ud.have;
1.16 nicm 417: }
418: if (*src > 0x1f && *src < 0x7f)
1.21 nicm 419: dst[n++] = *src;
1.23 nicm 420: else
421: dst[n++] = '_';
1.16 nicm 422: src++;
423: }
424: dst = xreallocarray(dst, n + 1, sizeof *dst);
425: dst[n] = '\0';
426: return (dst);
1.34 nicm 427: }
428:
429: /* Get UTF-8 buffer length. */
430: size_t
431: utf8_strlen(const struct utf8_data *s)
432: {
433: size_t i;
434:
435: for (i = 0; s[i].size != 0; i++)
436: /* nothing */;
437: return (i);
438: }
439:
440: /* Get UTF-8 string width. */
441: u_int
442: utf8_strwidth(const struct utf8_data *s, ssize_t n)
443: {
444: ssize_t i;
1.47 nicm 445: u_int width = 0;
1.34 nicm 446:
447: for (i = 0; s[i].size != 0; i++) {
448: if (n != -1 && n == i)
449: break;
450: width += s[i].width;
451: }
452: return (width);
1.11 nicm 453: }
454:
455: /*
456: * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
457: * Caller frees.
458: */
459: struct utf8_data *
460: utf8_fromcstr(const char *src)
461: {
1.47 nicm 462: struct utf8_data *dst = NULL;
463: size_t n = 0;
1.23 nicm 464: enum utf8_state more;
1.11 nicm 465:
466: while (*src != '\0') {
1.12 nicm 467: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 468: if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
469: while (*++src != '\0' && more == UTF8_MORE)
1.11 nicm 470: more = utf8_append(&dst[n], *src);
1.23 nicm 471: if (more == UTF8_DONE) {
1.11 nicm 472: n++;
473: continue;
474: }
475: src -= dst[n].have;
476: }
1.23 nicm 477: utf8_set(&dst[n], *src);
478: n++;
1.11 nicm 479: src++;
480: }
1.12 nicm 481: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.11 nicm 482: dst[n].size = 0;
483: return (dst);
484: }
485:
486: /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
487: char *
488: utf8_tocstr(struct utf8_data *src)
489: {
1.47 nicm 490: char *dst = NULL;
491: size_t n = 0;
1.11 nicm 492:
493: for(; src->size != 0; src++) {
1.12 nicm 494: dst = xreallocarray(dst, n + src->size, 1);
1.11 nicm 495: memcpy(dst + n, src->data, src->size);
496: n += src->size;
497: }
1.12 nicm 498: dst = xreallocarray(dst, n + 1, 1);
1.11 nicm 499: dst[n] = '\0';
500: return (dst);
501: }
502:
503: /* Get width of UTF-8 string. */
504: u_int
505: utf8_cstrwidth(const char *s)
506: {
507: struct utf8_data tmp;
508: u_int width;
1.23 nicm 509: enum utf8_state more;
1.11 nicm 510:
511: width = 0;
512: while (*s != '\0') {
1.23 nicm 513: if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
514: while (*++s != '\0' && more == UTF8_MORE)
1.11 nicm 515: more = utf8_append(&tmp, *s);
1.23 nicm 516: if (more == UTF8_DONE) {
1.11 nicm 517: width += tmp.width;
518: continue;
519: }
520: s -= tmp.have;
521: }
1.23 nicm 522: if (*s > 0x1f && *s != 0x7f)
1.21 nicm 523: width++;
1.11 nicm 524: s++;
525: }
526: return (width);
1.18 nicm 527: }
528:
1.44 nicm 529: /* Pad UTF-8 string to width on the left. Caller frees. */
1.18 nicm 530: char *
531: utf8_padcstr(const char *s, u_int width)
532: {
533: size_t slen;
534: char *out;
1.47 nicm 535: u_int n, i;
1.18 nicm 536:
537: n = utf8_cstrwidth(s);
538: if (n >= width)
539: return (xstrdup(s));
540:
541: slen = strlen(s);
542: out = xmalloc(slen + 1 + (width - n));
543: memcpy(out, s, slen);
544: for (i = n; i < width; i++)
545: out[slen++] = ' ';
546: out[slen] = '\0';
1.44 nicm 547: return (out);
548: }
549:
550: /* Pad UTF-8 string to width on the right. Caller frees. */
551: char *
552: utf8_rpadcstr(const char *s, u_int width)
553: {
554: size_t slen;
555: char *out;
1.47 nicm 556: u_int n, i;
1.44 nicm 557:
558: n = utf8_cstrwidth(s);
559: if (n >= width)
560: return (xstrdup(s));
561:
562: slen = strlen(s);
563: out = xmalloc(slen + 1 + (width - n));
564: for (i = 0; i < width - n; i++)
565: out[i] = ' ';
566: memcpy(out + i, s, slen);
567: out[i + slen] = '\0';
1.11 nicm 568: return (out);
1.43 nicm 569: }
570:
571: int
572: utf8_cstrhas(const char *s, const struct utf8_data *ud)
573: {
574: struct utf8_data *copy, *loop;
575: int found = 0;
576:
577: copy = utf8_fromcstr(s);
578: for (loop = copy; loop->size != 0; loop++) {
579: if (loop->size != ud->size)
580: continue;
581: if (memcmp(loop->data, ud->data, loop->size) == 0) {
582: found = 1;
583: break;
584: }
585: }
586: free(copy);
587:
588: return (found);
1.1 nicm 589: }