Annotation of src/usr.bin/tmux/utf8.c, Revision 1.51
1.51 ! nicm 1: /* $OpenBSD: utf8.c,v 1.50 2020/06/02 11:29:00 nicm Exp $ */
1.1 nicm 2:
3: /*
1.26 nicm 4: * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
1.1 nicm 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15: * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16: * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18:
19: #include <sys/types.h>
20:
1.41 nicm 21: #include <ctype.h>
1.30 nicm 22: #include <errno.h>
1.11 nicm 23: #include <stdlib.h>
1.1 nicm 24: #include <string.h>
1.9 nicm 25: #include <vis.h>
1.28 nicm 26: #include <wchar.h>
1.1 nicm 27:
28: #include "tmux.h"
29:
1.47 nicm 30: struct utf8_item {
31: u_int offset;
32: RB_ENTRY(utf8_item) entry;
1.45 nicm 33:
34: char data[UTF8_SIZE];
35: u_char size;
36: };
1.47 nicm 37: RB_HEAD(utf8_tree, utf8_item);
1.45 nicm 38:
39: static int
1.47 nicm 40: utf8_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
1.45 nicm 41: {
1.47 nicm 42: if (ui1->size < ui2->size)
1.45 nicm 43: return (-1);
1.47 nicm 44: if (ui1->size > ui2->size)
1.45 nicm 45: return (1);
1.47 nicm 46: return (memcmp(ui1->data, ui2->data, ui1->size));
1.45 nicm 47: }
1.47 nicm 48: RB_GENERATE_STATIC(utf8_tree, utf8_item, entry, utf8_cmp);
49: static struct utf8_tree utf8_tree = RB_INITIALIZER(utf8_tree);
1.45 nicm 50:
1.47 nicm 51: static struct utf8_item *utf8_list;
52: static u_int utf8_list_size;
53: static u_int utf8_list_used;
1.45 nicm 54:
1.47 nicm 55: union utf8_map {
56: utf8_char uc;
1.45 nicm 57: struct {
58: u_char flags;
1.47 nicm 59: #define UTF8_FLAG_SIZE 0x1f
60: #define UTF8_FLAG_WIDTH2 0x20
1.45 nicm 61:
62: u_char data[3];
63: };
64: } __packed;
65:
1.47 nicm 66: static const union utf8_map utf8_space1 = {
1.45 nicm 67: .flags = 1,
68: .data = " "
69: };
1.47 nicm 70: static const union utf8_map utf8_space2 = {
71: .flags = UTF8_FLAG_WIDTH2|2,
1.45 nicm 72: .data = " "
73: };
74:
1.47 nicm 75: /* Get a UTF-8 item by offset. */
76: static struct utf8_item *
77: utf8_get_item(const char *data, size_t size)
1.45 nicm 78: {
1.47 nicm 79: struct utf8_item ui;
80:
81: memcpy(ui.data, data, size);
82: ui.size = size;
1.45 nicm 83:
1.47 nicm 84: return (RB_FIND(utf8_tree, &utf8_tree, &ui));
85: }
1.45 nicm 86:
1.47 nicm 87: /* Expand UTF-8 list. */
88: static int
89: utf8_expand_list(void)
90: {
91: if (utf8_list_size == 0xffffff)
92: return (-1);
93: if (utf8_list_size == 0)
94: utf8_list_size = 256;
95: else if (utf8_list_size > 0x7fffff)
96: utf8_list_size = 0xffffff;
97: else
98: utf8_list_size *= 2;
99: utf8_list = xreallocarray(utf8_list, utf8_list_size, sizeof *utf8_list);
100: return (0);
1.45 nicm 101: }
102:
1.47 nicm 103: /* Add a UTF-8 item. */
1.45 nicm 104: static int
1.47 nicm 105: utf8_put_item(const char *data, size_t size, u_int *offset)
1.45 nicm 106: {
1.47 nicm 107: struct utf8_item *ui;
1.45 nicm 108:
1.47 nicm 109: ui = utf8_get_item(data, size);
110: if (ui != NULL) {
111: *offset = ui->offset;
1.45 nicm 112: log_debug("%s: have %.*s at %u", __func__, (int)size, data,
1.47 nicm 113: *offset);
1.45 nicm 114: return (0);
115: }
116:
1.47 nicm 117: if (utf8_list_used == utf8_list_size && utf8_expand_list() != 0)
118: return (-1);
119: *offset = utf8_list_used++;
120:
121: ui = &utf8_list[*offset];
122: ui->offset = *offset;
123: memcpy(ui->data, data, size);
124: ui->size = size;
125: RB_INSERT(utf8_tree, &utf8_tree, ui);
1.45 nicm 126:
1.47 nicm 127: log_debug("%s: added %.*s at %u", __func__, (int)size, data, *offset);
1.45 nicm 128: return (0);
129: }
130:
1.47 nicm 131: /* Get UTF-8 character from data. */
132: enum utf8_state
133: utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
1.45 nicm 134: {
1.47 nicm 135: union utf8_map m = { .uc = 0 };
136: u_int offset;
1.45 nicm 137:
1.50 nicm 138: if (ud->width == 0)
139: goto fail;
1.45 nicm 140: if (ud->width != 1 && ud->width != 2)
1.49 nicm 141: fatalx("invalid UTF-8 width");
142: if (ud->size == 0)
143: fatalx("invalid UTF-8 size");
1.45 nicm 144:
1.47 nicm 145: if (ud->size > UTF8_FLAG_SIZE)
1.45 nicm 146: goto fail;
1.49 nicm 147: if (ud->size == 1) {
148: *uc = utf8_build_one(ud->data[0], 1);
149: return (UTF8_DONE);
150: }
1.45 nicm 151:
1.47 nicm 152: m.flags = ud->size;
1.45 nicm 153: if (ud->width == 2)
1.47 nicm 154: m.flags |= UTF8_FLAG_WIDTH2;
1.45 nicm 155:
1.47 nicm 156: if (ud->size <= 3)
157: memcpy(m.data, ud->data, ud->size);
158: else {
159: if (utf8_put_item(ud->data, ud->size, &offset) != 0)
160: goto fail;
161: m.data[0] = (offset & 0xff);
162: m.data[1] = (offset >> 8) & 0xff;
163: m.data[2] = (offset >> 16);
1.45 nicm 164: }
1.51 ! nicm 165: *uc = htonl(m.uc);
1.47 nicm 166: return (UTF8_DONE);
1.45 nicm 167:
168: fail:
169: if (ud->width == 1)
1.51 ! nicm 170: *uc = htonl(utf8_space1.uc);
1.47 nicm 171: else
1.51 ! nicm 172: *uc = htonl(utf8_space2.uc);
1.47 nicm 173: return (UTF8_ERROR);
1.45 nicm 174: }
175:
1.47 nicm 176: /* Get UTF-8 data from character. */
1.45 nicm 177: void
1.47 nicm 178: utf8_to_data(utf8_char uc, struct utf8_data *ud)
1.45 nicm 179: {
1.51 ! nicm 180: union utf8_map m = { .uc = ntohl(uc) };
1.47 nicm 181: struct utf8_item *ui;
182: u_int offset;
1.45 nicm 183:
184: memset(ud, 0, sizeof *ud);
1.47 nicm 185: ud->size = ud->have = (m.flags & UTF8_FLAG_SIZE);
186: if (m.flags & UTF8_FLAG_WIDTH2)
1.45 nicm 187: ud->width = 2;
188: else
189: ud->width = 1;
190:
191: if (ud->size <= 3) {
192: memcpy(ud->data, m.data, ud->size);
193: return;
194: }
195:
1.47 nicm 196: offset = ((u_int)m.data[2] << 16)|((u_int)m.data[1] << 8)|m.data[0];
197: if (offset >= utf8_list_used)
1.45 nicm 198: memset(ud->data, ' ', ud->size);
199: else {
1.47 nicm 200: ui = &utf8_list[offset];
201: memcpy(ud->data, ui->data, ud->size);
1.45 nicm 202: }
203: }
204:
1.47 nicm 205: /* Get UTF-8 character from a single ASCII character. */
1.46 nicm 206: u_int
1.47 nicm 207: utf8_build_one(char c, u_int width)
1.45 nicm 208: {
1.47 nicm 209: union utf8_map m = { .flags = 1, .data[0] = c };
1.45 nicm 210:
211: if (width == 2)
1.47 nicm 212: m.flags |= UTF8_FLAG_WIDTH2;
1.51 ! nicm 213: return (htonl(m.uc));
1.45 nicm 214: }
1.29 nicm 215:
1.11 nicm 216: /* Set a single character. */
217: void
1.19 nicm 218: utf8_set(struct utf8_data *ud, u_char ch)
1.11 nicm 219: {
1.33 nicm 220: static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
1.20 nicm 221:
1.33 nicm 222: memcpy(ud, &empty, sizeof *ud);
1.19 nicm 223: *ud->data = ch;
1.20 nicm 224: }
225:
226: /* Copy UTF-8 character. */
227: void
228: utf8_copy(struct utf8_data *to, const struct utf8_data *from)
229: {
230: u_int i;
231:
232: memcpy(to, from, sizeof *to);
233:
234: for (i = to->size; i < sizeof to->data; i++)
235: to->data[i] = '\0';
1.11 nicm 236: }
237:
1.47 nicm 238: /* Get width of Unicode character. */
1.48 nicm 239: static enum utf8_state
240: utf8_width(struct utf8_data *ud, int *width)
1.47 nicm 241: {
1.48 nicm 242: wchar_t wc;
1.47 nicm 243:
1.48 nicm 244: switch (mbtowc(&wc, ud->data, ud->size)) {
245: case -1:
246: log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
247: errno);
248: mbtowc(NULL, NULL, MB_CUR_MAX);
249: return (UTF8_ERROR);
250: case 0:
251: return (UTF8_ERROR);
252: }
253: *width = wcwidth(wc);
254: if (*width < 0 || *width > 0xff) {
255: log_debug("UTF-8 %.*s, wcwidth() %d", (int)ud->size, ud->data,
256: *width);
257: return (UTF8_ERROR);
1.47 nicm 258: }
1.48 nicm 259: return (UTF8_DONE);
1.47 nicm 260: }
261:
1.4 nicm 262: /*
263: * Open UTF-8 sequence.
264: *
265: * 11000010-11011111 C2-DF start of 2-byte sequence
266: * 11100000-11101111 E0-EF start of 3-byte sequence
267: * 11110000-11110100 F0-F4 start of 4-byte sequence
268: */
1.23 nicm 269: enum utf8_state
1.19 nicm 270: utf8_open(struct utf8_data *ud, u_char ch)
1.4 nicm 271: {
1.19 nicm 272: memset(ud, 0, sizeof *ud);
1.4 nicm 273: if (ch >= 0xc2 && ch <= 0xdf)
1.19 nicm 274: ud->size = 2;
1.4 nicm 275: else if (ch >= 0xe0 && ch <= 0xef)
1.19 nicm 276: ud->size = 3;
1.4 nicm 277: else if (ch >= 0xf0 && ch <= 0xf4)
1.19 nicm 278: ud->size = 4;
1.4 nicm 279: else
1.23 nicm 280: return (UTF8_ERROR);
1.19 nicm 281: utf8_append(ud, ch);
1.23 nicm 282: return (UTF8_MORE);
1.4 nicm 283: }
284:
1.23 nicm 285: /* Append character to UTF-8, closing if finished. */
286: enum utf8_state
1.19 nicm 287: utf8_append(struct utf8_data *ud, u_char ch)
1.4 nicm 288: {
1.29 nicm 289: int width;
290:
1.19 nicm 291: if (ud->have >= ud->size)
1.4 nicm 292: fatalx("UTF-8 character overflow");
1.19 nicm 293: if (ud->size > sizeof ud->data)
1.4 nicm 294: fatalx("UTF-8 character size too large");
295:
1.21 nicm 296: if (ud->have != 0 && (ch & 0xc0) != 0x80)
297: ud->width = 0xff;
298:
1.19 nicm 299: ud->data[ud->have++] = ch;
300: if (ud->have != ud->size)
1.23 nicm 301: return (UTF8_MORE);
1.4 nicm 302:
1.21 nicm 303: if (ud->width == 0xff)
1.23 nicm 304: return (UTF8_ERROR);
1.48 nicm 305: if (utf8_width(ud, &width) != UTF8_DONE)
1.29 nicm 306: return (UTF8_ERROR);
307: ud->width = width;
308:
1.23 nicm 309: return (UTF8_DONE);
1.9 nicm 310: }
311:
312: /*
313: * Encode len characters from src into dst, which is guaranteed to have four
314: * bytes available for each character from src (for \abc or UTF-8) plus space
315: * for \0.
316: */
317: int
318: utf8_strvis(char *dst, const char *src, size_t len, int flag)
319: {
1.19 nicm 320: struct utf8_data ud;
1.47 nicm 321: const char *start = dst, *end = src + len;
1.23 nicm 322: enum utf8_state more;
1.9 nicm 323: size_t i;
324:
325: while (src < end) {
1.23 nicm 326: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
327: while (++src < end && more == UTF8_MORE)
1.19 nicm 328: more = utf8_append(&ud, *src);
1.23 nicm 329: if (more == UTF8_DONE) {
1.9 nicm 330: /* UTF-8 character finished. */
1.19 nicm 331: for (i = 0; i < ud.size; i++)
332: *dst++ = ud.data[i];
1.9 nicm 333: continue;
334: }
1.23 nicm 335: /* Not a complete, valid UTF-8 character. */
336: src -= ud.have;
1.9 nicm 337: }
1.41 nicm 338: if (src[0] == '$' && src < end - 1) {
1.42 nicm 339: if (isalpha((u_char)src[1]) ||
340: src[1] == '_' ||
341: src[1] == '{')
1.41 nicm 342: *dst++ = '\\';
343: *dst++ = '$';
344: } else if (src < end - 1)
1.9 nicm 345: dst = vis(dst, src[0], flag, src[1]);
346: else if (src < end)
347: dst = vis(dst, src[0], flag, '\0');
348: src++;
349: }
350: *dst = '\0';
351: return (dst - start);
1.35 nicm 352: }
353:
/*
 * Same as utf8_strvis but allocate the buffer. A buffer of four bytes per
 * source byte (including the terminator) is allocated, filled, then resized
 * to the encoded length plus one and stored in *dst. Returns the encoded
 * length.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen = strlen(src);
	char	*buf;
	int	 len;

	buf = xreallocarray(NULL, 4, srclen + 1);
	len = utf8_strvis(buf, src, srclen, flag);

	*dst = xrealloc(buf, len + 1);
	return (len);
}
367:
368: /* Does this string contain anything that isn't valid UTF-8? */
369: int
370: utf8_isvalid(const char *s)
371: {
1.47 nicm 372: struct utf8_data ud;
373: const char *end;
374: enum utf8_state more;
1.38 nicm 375:
376: end = s + strlen(s);
377: while (s < end) {
378: if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
379: while (++s < end && more == UTF8_MORE)
380: more = utf8_append(&ud, *s);
381: if (more == UTF8_DONE)
382: continue;
383: return (0);
384: }
385: if (*s < 0x20 || *s > 0x7e)
386: return (0);
387: s++;
388: }
389: return (1);
1.16 nicm 390: }
391:
392: /*
393: * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
394: * the returned string. Anything not valid printable ASCII or UTF-8 is
395: * stripped.
396: */
397: char *
398: utf8_sanitize(const char *src)
399: {
1.47 nicm 400: char *dst = NULL;
401: size_t n = 0;
402: enum utf8_state more;
403: struct utf8_data ud;
404: u_int i;
1.16 nicm 405:
406: while (*src != '\0') {
407: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 408: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
409: while (*++src != '\0' && more == UTF8_MORE)
1.19 nicm 410: more = utf8_append(&ud, *src);
1.23 nicm 411: if (more == UTF8_DONE) {
1.19 nicm 412: dst = xreallocarray(dst, n + ud.width,
1.16 nicm 413: sizeof *dst);
1.19 nicm 414: for (i = 0; i < ud.width; i++)
1.16 nicm 415: dst[n++] = '_';
416: continue;
417: }
1.19 nicm 418: src -= ud.have;
1.16 nicm 419: }
420: if (*src > 0x1f && *src < 0x7f)
1.21 nicm 421: dst[n++] = *src;
1.23 nicm 422: else
423: dst[n++] = '_';
1.16 nicm 424: src++;
425: }
426: dst = xreallocarray(dst, n + 1, sizeof *dst);
427: dst[n] = '\0';
428: return (dst);
1.34 nicm 429: }
430:
431: /* Get UTF-8 buffer length. */
432: size_t
433: utf8_strlen(const struct utf8_data *s)
434: {
435: size_t i;
436:
437: for (i = 0; s[i].size != 0; i++)
438: /* nothing */;
439: return (i);
440: }
441:
442: /* Get UTF-8 string width. */
443: u_int
444: utf8_strwidth(const struct utf8_data *s, ssize_t n)
445: {
446: ssize_t i;
1.47 nicm 447: u_int width = 0;
1.34 nicm 448:
449: for (i = 0; s[i].size != 0; i++) {
450: if (n != -1 && n == i)
451: break;
452: width += s[i].width;
453: }
454: return (width);
1.11 nicm 455: }
456:
457: /*
458: * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
459: * Caller frees.
460: */
461: struct utf8_data *
462: utf8_fromcstr(const char *src)
463: {
1.47 nicm 464: struct utf8_data *dst = NULL;
465: size_t n = 0;
1.23 nicm 466: enum utf8_state more;
1.11 nicm 467:
468: while (*src != '\0') {
1.12 nicm 469: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 470: if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
471: while (*++src != '\0' && more == UTF8_MORE)
1.11 nicm 472: more = utf8_append(&dst[n], *src);
1.23 nicm 473: if (more == UTF8_DONE) {
1.11 nicm 474: n++;
475: continue;
476: }
477: src -= dst[n].have;
478: }
1.23 nicm 479: utf8_set(&dst[n], *src);
480: n++;
1.11 nicm 481: src++;
482: }
1.12 nicm 483: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.11 nicm 484: dst[n].size = 0;
485: return (dst);
486: }
487:
488: /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
489: char *
490: utf8_tocstr(struct utf8_data *src)
491: {
1.47 nicm 492: char *dst = NULL;
493: size_t n = 0;
1.11 nicm 494:
495: for(; src->size != 0; src++) {
1.12 nicm 496: dst = xreallocarray(dst, n + src->size, 1);
1.11 nicm 497: memcpy(dst + n, src->data, src->size);
498: n += src->size;
499: }
1.12 nicm 500: dst = xreallocarray(dst, n + 1, 1);
1.11 nicm 501: dst[n] = '\0';
502: return (dst);
503: }
504:
505: /* Get width of UTF-8 string. */
506: u_int
507: utf8_cstrwidth(const char *s)
508: {
509: struct utf8_data tmp;
510: u_int width;
1.23 nicm 511: enum utf8_state more;
1.11 nicm 512:
513: width = 0;
514: while (*s != '\0') {
1.23 nicm 515: if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
516: while (*++s != '\0' && more == UTF8_MORE)
1.11 nicm 517: more = utf8_append(&tmp, *s);
1.23 nicm 518: if (more == UTF8_DONE) {
1.11 nicm 519: width += tmp.width;
520: continue;
521: }
522: s -= tmp.have;
523: }
1.23 nicm 524: if (*s > 0x1f && *s != 0x7f)
1.21 nicm 525: width++;
1.11 nicm 526: s++;
527: }
528: return (width);
1.18 nicm 529: }
530:
1.44 nicm 531: /* Pad UTF-8 string to width on the left. Caller frees. */
1.18 nicm 532: char *
533: utf8_padcstr(const char *s, u_int width)
534: {
535: size_t slen;
536: char *out;
1.47 nicm 537: u_int n, i;
1.18 nicm 538:
539: n = utf8_cstrwidth(s);
540: if (n >= width)
541: return (xstrdup(s));
542:
543: slen = strlen(s);
544: out = xmalloc(slen + 1 + (width - n));
545: memcpy(out, s, slen);
546: for (i = n; i < width; i++)
547: out[slen++] = ' ';
548: out[slen] = '\0';
1.44 nicm 549: return (out);
550: }
551:
552: /* Pad UTF-8 string to width on the right. Caller frees. */
553: char *
554: utf8_rpadcstr(const char *s, u_int width)
555: {
556: size_t slen;
557: char *out;
1.47 nicm 558: u_int n, i;
1.44 nicm 559:
560: n = utf8_cstrwidth(s);
561: if (n >= width)
562: return (xstrdup(s));
563:
564: slen = strlen(s);
565: out = xmalloc(slen + 1 + (width - n));
566: for (i = 0; i < width - n; i++)
567: out[i] = ' ';
568: memcpy(out + i, s, slen);
569: out[i + slen] = '\0';
1.11 nicm 570: return (out);
1.43 nicm 571: }
572:
573: int
574: utf8_cstrhas(const char *s, const struct utf8_data *ud)
575: {
576: struct utf8_data *copy, *loop;
577: int found = 0;
578:
579: copy = utf8_fromcstr(s);
580: for (loop = copy; loop->size != 0; loop++) {
581: if (loop->size != ud->size)
582: continue;
583: if (memcmp(loop->data, ud->data, loop->size) == 0) {
584: found = 1;
585: break;
586: }
587: }
588: free(copy);
589:
590: return (found);
1.1 nicm 591: }