Annotation of src/usr.bin/tmux/utf8.c, Revision 1.64
1.64 ! nicm 1: /* $OpenBSD: utf8.c,v 1.63 2023/09/01 14:29:11 nicm Exp $ */
1.1 nicm 2:
3: /*
1.26 nicm 4: * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
1.1 nicm 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15: * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16: * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18:
19: #include <sys/types.h>
20:
1.41 nicm 21: #include <ctype.h>
1.30 nicm 22: #include <errno.h>
1.11 nicm 23: #include <stdlib.h>
1.1 nicm 24: #include <string.h>
1.9 nicm 25: #include <vis.h>
1.1 nicm 26:
27: #include "tmux.h"
28:
/*
 * Code points that utf8_width() always reports as two columns wide, without
 * asking wcwidth(). The table is searched with bsearch() through
 * utf8_in_table(), so the entries must stay in ascending order.
 */
static const wchar_t utf8_force_wide[] = {
	0x0261D, 0x026F9, 0x0270A, 0x0270B, 0x0270C, 0x0270D,
	0x1F1E6, 0x1F1E7, 0x1F1E8, 0x1F1E9, 0x1F1EA, 0x1F1EB, 0x1F1EC, 0x1F1ED,
	0x1F1EE, 0x1F1EF, 0x1F1F0, 0x1F1F1, 0x1F1F2, 0x1F1F3, 0x1F1F4, 0x1F1F5,
	0x1F1F6, 0x1F1F7, 0x1F1F8, 0x1F1F9, 0x1F1FA, 0x1F1FB, 0x1F1FC, 0x1F1FD,
	0x1F1FE, 0x1F1FF,
	0x1F385, 0x1F3C2, 0x1F3C3, 0x1F3C4, 0x1F3C7, 0x1F3CA, 0x1F3CB, 0x1F3CC,
	0x1F3FB, 0x1F3FC, 0x1F3FD, 0x1F3FE, 0x1F3FF,
	0x1F442, 0x1F443, 0x1F446, 0x1F447, 0x1F448, 0x1F449, 0x1F44A, 0x1F44B,
	0x1F44C, 0x1F44D, 0x1F44E, 0x1F44F, 0x1F450,
	0x1F466, 0x1F467, 0x1F468, 0x1F469, 0x1F46B, 0x1F46C, 0x1F46D, 0x1F46E,
	0x1F470, 0x1F471, 0x1F472, 0x1F473, 0x1F474, 0x1F475, 0x1F476, 0x1F477,
	0x1F478, 0x1F47C,
	0x1F481, 0x1F482, 0x1F483, 0x1F485, 0x1F486, 0x1F487, 0x1F48F, 0x1F491,
	0x1F4AA,
	0x1F574, 0x1F575, 0x1F57A, 0x1F590, 0x1F595, 0x1F596,
	0x1F645, 0x1F646, 0x1F647, 0x1F64B, 0x1F64C, 0x1F64D, 0x1F64E, 0x1F64F,
	0x1F6A3, 0x1F6B4, 0x1F6B5, 0x1F6B6, 0x1F6C0, 0x1F6CC,
	0x1F90C, 0x1F90F, 0x1F918, 0x1F919, 0x1F91A, 0x1F91B, 0x1F91C, 0x1F91D,
	0x1F91E, 0x1F91F,
	0x1F926, 0x1F930, 0x1F931, 0x1F932, 0x1F933, 0x1F934, 0x1F935, 0x1F936,
	0x1F937, 0x1F938, 0x1F939, 0x1F93D, 0x1F93E,
	0x1F977, 0x1F9B5, 0x1F9B6, 0x1F9B8, 0x1F9B9, 0x1F9BB, 0x1F9CD, 0x1F9CE,
	0x1F9CF,
	0x1F9D1, 0x1F9D2, 0x1F9D3, 0x1F9D4, 0x1F9D5, 0x1F9D6, 0x1F9D7, 0x1F9D8,
	0x1F9D9, 0x1F9DA, 0x1F9DB, 0x1F9DC, 0x1F9DD,
	0x1FAC3, 0x1FAC4, 0x1FAC5,
	0x1FAF0, 0x1FAF1, 0x1FAF2, 0x1FAF3, 0x1FAF4, 0x1FAF5, 0x1FAF6, 0x1FAF7,
	0x1FAF8
};
! 193:
1.47 nicm 194: struct utf8_item {
1.54 nicm 195: RB_ENTRY(utf8_item) index_entry;
196: u_int index;
1.45 nicm 197:
1.54 nicm 198: RB_ENTRY(utf8_item) data_entry;
1.45 nicm 199: char data[UTF8_SIZE];
200: u_char size;
201: };
202:
203: static int
1.54 nicm 204: utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
1.45 nicm 205: {
1.47 nicm 206: if (ui1->size < ui2->size)
1.45 nicm 207: return (-1);
1.47 nicm 208: if (ui1->size > ui2->size)
1.45 nicm 209: return (1);
1.47 nicm 210: return (memcmp(ui1->data, ui2->data, ui1->size));
1.45 nicm 211: }
1.54 nicm 212: RB_HEAD(utf8_data_tree, utf8_item);
213: RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp);
214: static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree);
1.45 nicm 215:
1.54 nicm 216: static int
217: utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
218: {
219: if (ui1->index < ui2->index)
220: return (-1);
221: if (ui1->index > ui2->index)
222: return (1);
223: return (0);
224: }
225: RB_HEAD(utf8_index_tree, utf8_item);
226: RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp);
227: static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree);
228:
229: static u_int utf8_next_index;
1.45 nicm 230:
/*
 * Fields of a packed utf8_char. The low 24 bits hold either the raw bytes or
 * an item index (see utf8_from_data() and utf8_to_data()). The size in bytes
 * is in the five bits starting at bit 24. The width is stored plus one in the
 * bits from bit 29 upwards.
 */
#define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24)
#define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f)

#define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29)
#define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1)
1.45 nicm 236:
1.54 nicm 237: /* Get a UTF-8 item from data. */
1.47 nicm 238: static struct utf8_item *
1.62 nicm 239: utf8_item_by_data(const u_char *data, size_t size)
1.45 nicm 240: {
1.47 nicm 241: struct utf8_item ui;
242:
243: memcpy(ui.data, data, size);
244: ui.size = size;
1.45 nicm 245:
1.54 nicm 246: return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui));
1.47 nicm 247: }
1.45 nicm 248:
1.54 nicm 249: /* Get a UTF-8 item from data. */
250: static struct utf8_item *
251: utf8_item_by_index(u_int index)
1.47 nicm 252: {
1.54 nicm 253: struct utf8_item ui;
254:
255: ui.index = index;
256:
257: return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui));
1.45 nicm 258: }
259:
1.47 nicm 260: /* Add a UTF-8 item. */
1.45 nicm 261: static int
1.61 nicm 262: utf8_put_item(const u_char *data, size_t size, u_int *index)
1.45 nicm 263: {
1.47 nicm 264: struct utf8_item *ui;
1.45 nicm 265:
1.54 nicm 266: ui = utf8_item_by_data(data, size);
1.47 nicm 267: if (ui != NULL) {
1.57 nicm 268: *index = ui->index;
1.54 nicm 269: log_debug("%s: found %.*s = %u", __func__, (int)size, data,
270: *index);
1.45 nicm 271: return (0);
272: }
273:
1.54 nicm 274: if (utf8_next_index == 0xffffff + 1)
1.47 nicm 275: return (-1);
276:
1.54 nicm 277: ui = xcalloc(1, sizeof *ui);
278: ui->index = utf8_next_index++;
279: RB_INSERT(utf8_index_tree, &utf8_index_tree, ui);
280:
1.47 nicm 281: memcpy(ui->data, data, size);
282: ui->size = size;
1.54 nicm 283: RB_INSERT(utf8_data_tree, &utf8_data_tree, ui);
1.45 nicm 284:
1.57 nicm 285: *index = ui->index;
1.54 nicm 286: log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index);
1.45 nicm 287: return (0);
288: }
289:
/*
 * bsearch() comparison callback for tables of wchar_t. Returns -1, 0 or 1
 * according to whether the first value is below, equal to or above the second.
 */
static int
utf8_table_cmp(const void *vp1, const void *vp2)
{
	const wchar_t	lhs = *(const wchar_t *)vp1;
	const wchar_t	rhs = *(const wchar_t *)vp2;

	return ((lhs > rhs) - (lhs < rhs));
}
! 301:
! 302: /* Check if character in table. */
! 303: int
! 304: utf8_in_table(wchar_t find, const wchar_t *table, u_int count)
! 305: {
! 306: wchar_t *found;
! 307:
! 308: found = bsearch(&find, table, count, sizeof *table, utf8_table_cmp);
! 309: return (found != NULL);
! 310: }
! 311:
1.47 nicm 312: /* Get UTF-8 character from data. */
313: enum utf8_state
314: utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
1.45 nicm 315: {
1.54 nicm 316: u_int index;
1.45 nicm 317:
1.52 nicm 318: if (ud->width > 2)
1.55 nicm 319: fatalx("invalid UTF-8 width: %u", ud->width);
1.45 nicm 320:
1.52 nicm 321: if (ud->size > UTF8_SIZE)
1.45 nicm 322: goto fail;
1.53 nicm 323: if (ud->size <= 3) {
1.54 nicm 324: index = (((utf8_char)ud->data[2] << 16)|
1.63 nicm 325: ((utf8_char)ud->data[1] << 8)|
326: ((utf8_char)ud->data[0]));
1.54 nicm 327: } else if (utf8_put_item(ud->data, ud->size, &index) != 0)
1.53 nicm 328: goto fail;
1.54 nicm 329: *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index;
1.53 nicm 330: log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size,
331: (int)ud->size, ud->data, *uc);
1.47 nicm 332: return (UTF8_DONE);
1.45 nicm 333:
334: fail:
1.52 nicm 335: if (ud->width == 0)
1.53 nicm 336: *uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0);
1.52 nicm 337: else if (ud->width == 1)
1.53 nicm 338: *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20;
1.47 nicm 339: else
1.53 nicm 340: *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020;
1.47 nicm 341: return (UTF8_ERROR);
1.45 nicm 342: }
343:
1.47 nicm 344: /* Get UTF-8 data from character. */
1.45 nicm 345: void
1.47 nicm 346: utf8_to_data(utf8_char uc, struct utf8_data *ud)
1.45 nicm 347: {
1.47 nicm 348: struct utf8_item *ui;
1.54 nicm 349: u_int index;
1.45 nicm 350:
351: memset(ud, 0, sizeof *ud);
1.53 nicm 352: ud->size = ud->have = UTF8_GET_SIZE(uc);
353: ud->width = UTF8_GET_WIDTH(uc);
1.45 nicm 354:
355: if (ud->size <= 3) {
1.53 nicm 356: ud->data[2] = (uc >> 16);
357: ud->data[1] = ((uc >> 8) & 0xff);
358: ud->data[0] = (uc & 0xff);
359: } else {
1.54 nicm 360: index = (uc & 0xffffff);
361: if ((ui = utf8_item_by_index(index)) == NULL)
1.53 nicm 362: memset(ud->data, ' ', ud->size);
1.54 nicm 363: else
1.53 nicm 364: memcpy(ud->data, ui->data, ud->size);
1.45 nicm 365: }
366:
1.53 nicm 367: log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size,
368: (int)ud->size, ud->data);
1.45 nicm 369: }
370:
1.47 nicm 371: /* Get UTF-8 character from a single ASCII character. */
1.46 nicm 372: u_int
1.52 nicm 373: utf8_build_one(u_char ch)
1.45 nicm 374: {
1.53 nicm 375: return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch);
1.45 nicm 376: }
1.29 nicm 377:
1.11 nicm 378: /* Set a single character. */
379: void
1.19 nicm 380: utf8_set(struct utf8_data *ud, u_char ch)
1.11 nicm 381: {
1.33 nicm 382: static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
1.20 nicm 383:
1.33 nicm 384: memcpy(ud, &empty, sizeof *ud);
1.19 nicm 385: *ud->data = ch;
1.20 nicm 386: }
387:
388: /* Copy UTF-8 character. */
389: void
390: utf8_copy(struct utf8_data *to, const struct utf8_data *from)
391: {
392: u_int i;
393:
394: memcpy(to, from, sizeof *to);
395:
396: for (i = to->size; i < sizeof to->data; i++)
397: to->data[i] = '\0';
1.11 nicm 398: }
399:
1.47 nicm 400: /* Get width of Unicode character. */
1.48 nicm 401: static enum utf8_state
402: utf8_width(struct utf8_data *ud, int *width)
1.47 nicm 403: {
1.48 nicm 404: wchar_t wc;
1.47 nicm 405:
1.64 ! nicm 406: if (utf8_towc(ud, &wc) != UTF8_DONE)
1.48 nicm 407: return (UTF8_ERROR);
1.64 ! nicm 408: if (utf8_in_table(wc, utf8_force_wide, nitems(utf8_force_wide))) {
! 409: *width = 2;
! 410: return (UTF8_DONE);
1.48 nicm 411: }
1.64 ! nicm 412:
1.48 nicm 413: *width = wcwidth(wc);
1.63 nicm 414: log_debug("wcwidth(%05X) returned %d", (u_int)wc, *width);
1.60 nicm 415: if (*width < 0) {
416: /*
417: * C1 control characters are nonprintable, so they are always
418: * zero width.
419: */
420: *width = (wc >= 0x80 && wc <= 0x9f) ? 0 : 1;
421: }
1.59 nicm 422: if (*width >= 0 && *width <= 0xff)
423: return (UTF8_DONE);
424: return (UTF8_ERROR);
1.64 ! nicm 425: }
! 426:
! 427: /* Convert UTF-8 character to wide character. */
! 428: enum utf8_state
! 429: utf8_towc(const struct utf8_data *ud, wchar_t *wc)
! 430: {
! 431: switch (mbtowc(wc, ud->data, ud->size)) {
! 432: case -1:
! 433: log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
! 434: errno);
! 435: mbtowc(NULL, NULL, MB_CUR_MAX);
! 436: return (UTF8_ERROR);
! 437: case 0:
! 438: return (UTF8_ERROR);
! 439: }
! 440: log_debug("UTF-8 %.*s is %05X", (int)ud->size, ud->data, (u_int)*wc);
! 441: return (UTF8_DONE);
1.47 nicm 442: }
443:
1.4 nicm 444: /*
445: * Open UTF-8 sequence.
446: *
447: * 11000010-11011111 C2-DF start of 2-byte sequence
448: * 11100000-11101111 E0-EF start of 3-byte sequence
449: * 11110000-11110100 F0-F4 start of 4-byte sequence
450: */
1.23 nicm 451: enum utf8_state
1.19 nicm 452: utf8_open(struct utf8_data *ud, u_char ch)
1.4 nicm 453: {
1.19 nicm 454: memset(ud, 0, sizeof *ud);
1.4 nicm 455: if (ch >= 0xc2 && ch <= 0xdf)
1.19 nicm 456: ud->size = 2;
1.4 nicm 457: else if (ch >= 0xe0 && ch <= 0xef)
1.19 nicm 458: ud->size = 3;
1.4 nicm 459: else if (ch >= 0xf0 && ch <= 0xf4)
1.19 nicm 460: ud->size = 4;
1.4 nicm 461: else
1.23 nicm 462: return (UTF8_ERROR);
1.19 nicm 463: utf8_append(ud, ch);
1.23 nicm 464: return (UTF8_MORE);
1.4 nicm 465: }
466:
1.23 nicm 467: /* Append character to UTF-8, closing if finished. */
468: enum utf8_state
1.19 nicm 469: utf8_append(struct utf8_data *ud, u_char ch)
1.4 nicm 470: {
1.29 nicm 471: int width;
472:
1.19 nicm 473: if (ud->have >= ud->size)
1.4 nicm 474: fatalx("UTF-8 character overflow");
1.19 nicm 475: if (ud->size > sizeof ud->data)
1.4 nicm 476: fatalx("UTF-8 character size too large");
477:
1.21 nicm 478: if (ud->have != 0 && (ch & 0xc0) != 0x80)
479: ud->width = 0xff;
480:
1.19 nicm 481: ud->data[ud->have++] = ch;
482: if (ud->have != ud->size)
1.23 nicm 483: return (UTF8_MORE);
1.4 nicm 484:
1.21 nicm 485: if (ud->width == 0xff)
1.23 nicm 486: return (UTF8_ERROR);
1.48 nicm 487: if (utf8_width(ud, &width) != UTF8_DONE)
1.29 nicm 488: return (UTF8_ERROR);
489: ud->width = width;
490:
1.23 nicm 491: return (UTF8_DONE);
1.9 nicm 492: }
493:
494: /*
495: * Encode len characters from src into dst, which is guaranteed to have four
496: * bytes available for each character from src (for \abc or UTF-8) plus space
497: * for \0.
498: */
499: int
500: utf8_strvis(char *dst, const char *src, size_t len, int flag)
501: {
1.19 nicm 502: struct utf8_data ud;
1.47 nicm 503: const char *start = dst, *end = src + len;
1.23 nicm 504: enum utf8_state more;
1.9 nicm 505: size_t i;
506:
507: while (src < end) {
1.23 nicm 508: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
509: while (++src < end && more == UTF8_MORE)
1.19 nicm 510: more = utf8_append(&ud, *src);
1.23 nicm 511: if (more == UTF8_DONE) {
1.9 nicm 512: /* UTF-8 character finished. */
1.19 nicm 513: for (i = 0; i < ud.size; i++)
514: *dst++ = ud.data[i];
1.9 nicm 515: continue;
516: }
1.23 nicm 517: /* Not a complete, valid UTF-8 character. */
518: src -= ud.have;
1.9 nicm 519: }
1.41 nicm 520: if (src[0] == '$' && src < end - 1) {
1.42 nicm 521: if (isalpha((u_char)src[1]) ||
522: src[1] == '_' ||
523: src[1] == '{')
1.41 nicm 524: *dst++ = '\\';
525: *dst++ = '$';
526: } else if (src < end - 1)
1.9 nicm 527: dst = vis(dst, src[0], flag, src[1]);
528: else if (src < end)
529: dst = vis(dst, src[0], flag, '\0');
530: src++;
531: }
532: *dst = '\0';
533: return (dst - start);
1.35 nicm 534: }
535:
/*
 * Same as utf8_strvis but for a \0 terminated string, with the output buffer
 * allocated here and returned through *dst. Returns the length of the result.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen = strlen(src);
	char	*buf;
	int	 len;

	/* Worst case is four output bytes per input byte, plus the \0. */
	buf = xreallocarray(NULL, 4, srclen + 1);
	len = utf8_strvis(buf, src, srclen, flag);

	/* Give back whatever was not needed. */
	*dst = xrealloc(buf, len + 1);
	return (len);
}
549:
/*
 * Same as utf8_strvis, taking an explicit source length, with the output
 * buffer allocated here and returned through *dst. Returns the length of the
 * result.
 */
int
utf8_stravisx(char **dst, const char *src, size_t srclen, int flag)
{
	char	*buf;
	int	 len;

	/* Worst case is four output bytes per input byte, plus the \0. */
	buf = xreallocarray(NULL, 4, srclen + 1);
	len = utf8_strvis(buf, src, srclen, flag);

	/* Give back whatever was not needed. */
	*dst = xrealloc(buf, len + 1);
	return (len);
}
563:
564: /* Does this string contain anything that isn't valid UTF-8? */
565: int
566: utf8_isvalid(const char *s)
567: {
1.47 nicm 568: struct utf8_data ud;
569: const char *end;
570: enum utf8_state more;
1.38 nicm 571:
572: end = s + strlen(s);
573: while (s < end) {
574: if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
575: while (++s < end && more == UTF8_MORE)
576: more = utf8_append(&ud, *s);
577: if (more == UTF8_DONE)
578: continue;
579: return (0);
580: }
581: if (*s < 0x20 || *s > 0x7e)
582: return (0);
583: s++;
584: }
585: return (1);
1.16 nicm 586: }
587:
588: /*
589: * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
590: * the returned string. Anything not valid printable ASCII or UTF-8 is
591: * stripped.
592: */
593: char *
594: utf8_sanitize(const char *src)
595: {
1.47 nicm 596: char *dst = NULL;
597: size_t n = 0;
598: enum utf8_state more;
599: struct utf8_data ud;
600: u_int i;
1.16 nicm 601:
602: while (*src != '\0') {
603: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 604: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
605: while (*++src != '\0' && more == UTF8_MORE)
1.19 nicm 606: more = utf8_append(&ud, *src);
1.23 nicm 607: if (more == UTF8_DONE) {
1.19 nicm 608: dst = xreallocarray(dst, n + ud.width,
1.16 nicm 609: sizeof *dst);
1.19 nicm 610: for (i = 0; i < ud.width; i++)
1.16 nicm 611: dst[n++] = '_';
612: continue;
613: }
1.19 nicm 614: src -= ud.have;
1.16 nicm 615: }
616: if (*src > 0x1f && *src < 0x7f)
1.21 nicm 617: dst[n++] = *src;
1.23 nicm 618: else
619: dst[n++] = '_';
1.16 nicm 620: src++;
621: }
622: dst = xreallocarray(dst, n + 1, sizeof *dst);
623: dst[n] = '\0';
624: return (dst);
1.34 nicm 625: }
626:
627: /* Get UTF-8 buffer length. */
628: size_t
629: utf8_strlen(const struct utf8_data *s)
630: {
631: size_t i;
632:
633: for (i = 0; s[i].size != 0; i++)
634: /* nothing */;
635: return (i);
636: }
637:
638: /* Get UTF-8 string width. */
639: u_int
640: utf8_strwidth(const struct utf8_data *s, ssize_t n)
641: {
642: ssize_t i;
1.47 nicm 643: u_int width = 0;
1.34 nicm 644:
645: for (i = 0; s[i].size != 0; i++) {
646: if (n != -1 && n == i)
647: break;
648: width += s[i].width;
649: }
650: return (width);
1.11 nicm 651: }
652:
653: /*
654: * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
655: * Caller frees.
656: */
657: struct utf8_data *
658: utf8_fromcstr(const char *src)
659: {
1.47 nicm 660: struct utf8_data *dst = NULL;
661: size_t n = 0;
1.23 nicm 662: enum utf8_state more;
1.11 nicm 663:
664: while (*src != '\0') {
1.12 nicm 665: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 666: if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
667: while (*++src != '\0' && more == UTF8_MORE)
1.11 nicm 668: more = utf8_append(&dst[n], *src);
1.23 nicm 669: if (more == UTF8_DONE) {
1.11 nicm 670: n++;
671: continue;
672: }
673: src -= dst[n].have;
674: }
1.23 nicm 675: utf8_set(&dst[n], *src);
676: n++;
1.11 nicm 677: src++;
678: }
1.12 nicm 679: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.11 nicm 680: dst[n].size = 0;
681: return (dst);
682: }
683:
684: /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
685: char *
686: utf8_tocstr(struct utf8_data *src)
687: {
1.47 nicm 688: char *dst = NULL;
689: size_t n = 0;
1.11 nicm 690:
691: for(; src->size != 0; src++) {
1.12 nicm 692: dst = xreallocarray(dst, n + src->size, 1);
1.11 nicm 693: memcpy(dst + n, src->data, src->size);
694: n += src->size;
695: }
1.12 nicm 696: dst = xreallocarray(dst, n + 1, 1);
1.11 nicm 697: dst[n] = '\0';
698: return (dst);
699: }
700:
701: /* Get width of UTF-8 string. */
702: u_int
703: utf8_cstrwidth(const char *s)
704: {
705: struct utf8_data tmp;
706: u_int width;
1.23 nicm 707: enum utf8_state more;
1.11 nicm 708:
709: width = 0;
710: while (*s != '\0') {
1.23 nicm 711: if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
712: while (*++s != '\0' && more == UTF8_MORE)
1.11 nicm 713: more = utf8_append(&tmp, *s);
1.23 nicm 714: if (more == UTF8_DONE) {
1.11 nicm 715: width += tmp.width;
716: continue;
717: }
718: s -= tmp.have;
719: }
1.23 nicm 720: if (*s > 0x1f && *s != 0x7f)
1.21 nicm 721: width++;
1.11 nicm 722: s++;
723: }
724: return (width);
1.18 nicm 725: }
726:
1.44 nicm 727: /* Pad UTF-8 string to width on the left. Caller frees. */
1.18 nicm 728: char *
729: utf8_padcstr(const char *s, u_int width)
730: {
731: size_t slen;
732: char *out;
1.47 nicm 733: u_int n, i;
1.18 nicm 734:
735: n = utf8_cstrwidth(s);
736: if (n >= width)
737: return (xstrdup(s));
738:
739: slen = strlen(s);
740: out = xmalloc(slen + 1 + (width - n));
741: memcpy(out, s, slen);
742: for (i = n; i < width; i++)
743: out[slen++] = ' ';
744: out[slen] = '\0';
1.44 nicm 745: return (out);
746: }
747:
748: /* Pad UTF-8 string to width on the right. Caller frees. */
749: char *
750: utf8_rpadcstr(const char *s, u_int width)
751: {
752: size_t slen;
753: char *out;
1.47 nicm 754: u_int n, i;
1.44 nicm 755:
756: n = utf8_cstrwidth(s);
757: if (n >= width)
758: return (xstrdup(s));
759:
760: slen = strlen(s);
761: out = xmalloc(slen + 1 + (width - n));
762: for (i = 0; i < width - n; i++)
763: out[i] = ' ';
764: memcpy(out + i, s, slen);
765: out[i + slen] = '\0';
1.11 nicm 766: return (out);
1.43 nicm 767: }
768:
769: int
770: utf8_cstrhas(const char *s, const struct utf8_data *ud)
771: {
772: struct utf8_data *copy, *loop;
773: int found = 0;
774:
775: copy = utf8_fromcstr(s);
776: for (loop = copy; loop->size != 0; loop++) {
777: if (loop->size != ud->size)
778: continue;
779: if (memcmp(loop->data, ud->data, loop->size) == 0) {
780: found = 1;
781: break;
782: }
783: }
784: free(copy);
785:
786: return (found);
1.1 nicm 787: }