[BACK]Return to utf8.c CVS log [TXT][DIR] Up to [local] / src / usr.bin / tmux

Annotation of src/usr.bin/tmux/utf8.c, Revision 1.64

1.64    ! nicm        1: /* $OpenBSD: utf8.c,v 1.63 2023/09/01 14:29:11 nicm Exp $ */
1.1       nicm        2:
                      3: /*
1.26      nicm        4:  * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
1.1       nicm        5:  *
                      6:  * Permission to use, copy, modify, and distribute this software for any
                      7:  * purpose with or without fee is hereby granted, provided that the above
                      8:  * copyright notice and this permission notice appear in all copies.
                      9:  *
                     10:  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
                     11:  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
                     12:  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
                     13:  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
                     14:  * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
                     15:  * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
                     16:  * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
                     17:  */
                     18:
                     19: #include <sys/types.h>
                     20:
1.41      nicm       21: #include <ctype.h>
1.30      nicm       22: #include <errno.h>
1.11      nicm       23: #include <stdlib.h>
1.1       nicm       24: #include <string.h>
1.9       nicm       25: #include <vis.h>
1.1       nicm       26:
                     27: #include "tmux.h"
                     28:
/*
 * Code points that utf8_width() always reports as two columns wide, without
 * consulting wcwidth(). The table is searched with bsearch() by
 * utf8_in_table(), so entries must stay in ascending order.
 */
static const wchar_t utf8_force_wide[] = {
	0x0261D, 0x026F9, 0x0270A, 0x0270B, 0x0270C, 0x0270D,

	0x1F1E6, 0x1F1E7, 0x1F1E8, 0x1F1E9, 0x1F1EA, 0x1F1EB, 0x1F1EC,
	0x1F1ED, 0x1F1EE, 0x1F1EF, 0x1F1F0, 0x1F1F1, 0x1F1F2, 0x1F1F3,
	0x1F1F4, 0x1F1F5, 0x1F1F6, 0x1F1F7, 0x1F1F8, 0x1F1F9, 0x1F1FA,
	0x1F1FB, 0x1F1FC, 0x1F1FD, 0x1F1FE, 0x1F1FF,

	0x1F385,
	0x1F3C2, 0x1F3C3, 0x1F3C4, 0x1F3C7, 0x1F3CA, 0x1F3CB, 0x1F3CC,
	0x1F3FB, 0x1F3FC, 0x1F3FD, 0x1F3FE, 0x1F3FF,

	0x1F442, 0x1F443, 0x1F446, 0x1F447, 0x1F448, 0x1F449, 0x1F44A,
	0x1F44B, 0x1F44C, 0x1F44D, 0x1F44E, 0x1F44F, 0x1F450,
	0x1F466, 0x1F467, 0x1F468, 0x1F469, 0x1F46B, 0x1F46C, 0x1F46D,
	0x1F46E,
	0x1F470, 0x1F471, 0x1F472, 0x1F473, 0x1F474, 0x1F475, 0x1F476,
	0x1F477, 0x1F478, 0x1F47C,
	0x1F481, 0x1F482, 0x1F483, 0x1F485, 0x1F486, 0x1F487, 0x1F48F,
	0x1F491, 0x1F4AA,

	0x1F574, 0x1F575, 0x1F57A, 0x1F590, 0x1F595, 0x1F596,

	0x1F645, 0x1F646, 0x1F647, 0x1F64B, 0x1F64C, 0x1F64D, 0x1F64E,
	0x1F64F,
	0x1F6A3, 0x1F6B4, 0x1F6B5, 0x1F6B6, 0x1F6C0, 0x1F6CC,

	0x1F90C, 0x1F90F, 0x1F918, 0x1F919, 0x1F91A, 0x1F91B, 0x1F91C,
	0x1F91D, 0x1F91E, 0x1F91F,
	0x1F926, 0x1F930, 0x1F931, 0x1F932, 0x1F933, 0x1F934, 0x1F935,
	0x1F936, 0x1F937, 0x1F938, 0x1F939, 0x1F93D, 0x1F93E,
	0x1F977, 0x1F9B5, 0x1F9B6, 0x1F9B8, 0x1F9B9, 0x1F9BB, 0x1F9CD,
	0x1F9CE, 0x1F9CF,
	0x1F9D1, 0x1F9D2, 0x1F9D3, 0x1F9D4, 0x1F9D5, 0x1F9D6, 0x1F9D7,
	0x1F9D8, 0x1F9D9, 0x1F9DA, 0x1F9DB, 0x1F9DC, 0x1F9DD,

	0x1FAC3, 0x1FAC4, 0x1FAC5,
	0x1FAF0, 0x1FAF1, 0x1FAF2, 0x1FAF3, 0x1FAF4, 0x1FAF5, 0x1FAF6,
	0x1FAF7, 0x1FAF8
};
        !           193:
1.47      nicm      194: struct utf8_item {
1.54      nicm      195:        RB_ENTRY(utf8_item)     index_entry;
                    196:        u_int                   index;
1.45      nicm      197:
1.54      nicm      198:        RB_ENTRY(utf8_item)     data_entry;
1.45      nicm      199:        char                    data[UTF8_SIZE];
                    200:        u_char                  size;
                    201: };
                    202:
                    203: static int
1.54      nicm      204: utf8_data_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
1.45      nicm      205: {
1.47      nicm      206:        if (ui1->size < ui2->size)
1.45      nicm      207:                return (-1);
1.47      nicm      208:        if (ui1->size > ui2->size)
1.45      nicm      209:                return (1);
1.47      nicm      210:        return (memcmp(ui1->data, ui2->data, ui1->size));
1.45      nicm      211: }
1.54      nicm      212: RB_HEAD(utf8_data_tree, utf8_item);
                    213: RB_GENERATE_STATIC(utf8_data_tree, utf8_item, data_entry, utf8_data_cmp);
                    214: static struct utf8_data_tree utf8_data_tree = RB_INITIALIZER(utf8_data_tree);
1.45      nicm      215:
1.54      nicm      216: static int
                    217: utf8_index_cmp(struct utf8_item *ui1, struct utf8_item *ui2)
                    218: {
                    219:        if (ui1->index < ui2->index)
                    220:                return (-1);
                    221:        if (ui1->index > ui2->index)
                    222:                return (1);
                    223:        return (0);
                    224: }
                    225: RB_HEAD(utf8_index_tree, utf8_item);
                    226: RB_GENERATE_STATIC(utf8_index_tree, utf8_item, index_entry, utf8_index_cmp);
                    227: static struct utf8_index_tree utf8_index_tree = RB_INITIALIZER(utf8_index_tree);
                    228:
                    229: static u_int utf8_next_index;
1.45      nicm      230:
1.53      nicm      231: #define UTF8_GET_SIZE(uc) (((uc) >> 24) & 0x1f)
1.58      nicm      232: #define UTF8_GET_WIDTH(uc) (((uc) >> 29) - 1)
1.53      nicm      233:
                    234: #define UTF8_SET_SIZE(size) (((utf8_char)(size)) << 24)
                    235: #define UTF8_SET_WIDTH(width) ((((utf8_char)(width)) + 1) << 29)
1.45      nicm      236:
1.54      nicm      237: /* Get a UTF-8 item from data. */
1.47      nicm      238: static struct utf8_item *
1.62      nicm      239: utf8_item_by_data(const u_char *data, size_t size)
1.45      nicm      240: {
1.47      nicm      241:        struct utf8_item        ui;
                    242:
                    243:        memcpy(ui.data, data, size);
                    244:        ui.size = size;
1.45      nicm      245:
1.54      nicm      246:        return (RB_FIND(utf8_data_tree, &utf8_data_tree, &ui));
1.47      nicm      247: }
1.45      nicm      248:
1.54      nicm      249: /* Get a UTF-8 item from data. */
                    250: static struct utf8_item *
                    251: utf8_item_by_index(u_int index)
1.47      nicm      252: {
1.54      nicm      253:        struct utf8_item        ui;
                    254:
                    255:        ui.index = index;
                    256:
                    257:        return (RB_FIND(utf8_index_tree, &utf8_index_tree, &ui));
1.45      nicm      258: }
                    259:
1.47      nicm      260: /* Add a UTF-8 item. */
1.45      nicm      261: static int
1.61      nicm      262: utf8_put_item(const u_char *data, size_t size, u_int *index)
1.45      nicm      263: {
1.47      nicm      264:        struct utf8_item        *ui;
1.45      nicm      265:
1.54      nicm      266:        ui = utf8_item_by_data(data, size);
1.47      nicm      267:        if (ui != NULL) {
1.57      nicm      268:                *index = ui->index;
1.54      nicm      269:                log_debug("%s: found %.*s = %u", __func__, (int)size, data,
                    270:                    *index);
1.45      nicm      271:                return (0);
                    272:        }
                    273:
1.54      nicm      274:        if (utf8_next_index == 0xffffff + 1)
1.47      nicm      275:                return (-1);
                    276:
1.54      nicm      277:        ui = xcalloc(1, sizeof *ui);
                    278:        ui->index = utf8_next_index++;
                    279:        RB_INSERT(utf8_index_tree, &utf8_index_tree, ui);
                    280:
1.47      nicm      281:        memcpy(ui->data, data, size);
                    282:        ui->size = size;
1.54      nicm      283:        RB_INSERT(utf8_data_tree, &utf8_data_tree, ui);
1.45      nicm      284:
1.57      nicm      285:        *index = ui->index;
1.54      nicm      286:        log_debug("%s: added %.*s = %u", __func__, (int)size, data, *index);
1.45      nicm      287:        return (0);
                    288: }
                    289:
/*
 * bsearch() comparison callback for tables of wchar_t: returns -1, 0 or 1
 * when the first value is less than, equal to or greater than the second.
 */
static int
utf8_table_cmp(const void *vp1, const void *vp2)
{
	const wchar_t	lhs = *(const wchar_t *)vp1;
	const wchar_t	rhs = *(const wchar_t *)vp2;

	return ((lhs > rhs) - (lhs < rhs));
}
        !           301:
        !           302: /* Check if character in table. */
        !           303: int
        !           304: utf8_in_table(wchar_t find, const wchar_t *table, u_int count)
        !           305: {
        !           306:        wchar_t *found;
        !           307:
        !           308:        found = bsearch(&find, table, count, sizeof *table, utf8_table_cmp);
        !           309:        return (found != NULL);
        !           310: }
        !           311:
1.47      nicm      312: /* Get UTF-8 character from data. */
                    313: enum utf8_state
                    314: utf8_from_data(const struct utf8_data *ud, utf8_char *uc)
1.45      nicm      315: {
1.54      nicm      316:        u_int   index;
1.45      nicm      317:
1.52      nicm      318:        if (ud->width > 2)
1.55      nicm      319:                fatalx("invalid UTF-8 width: %u", ud->width);
1.45      nicm      320:
1.52      nicm      321:        if (ud->size > UTF8_SIZE)
1.45      nicm      322:                goto fail;
1.53      nicm      323:        if (ud->size <= 3) {
1.54      nicm      324:                index = (((utf8_char)ud->data[2] << 16)|
1.63      nicm      325:                          ((utf8_char)ud->data[1] << 8)|
                    326:                          ((utf8_char)ud->data[0]));
1.54      nicm      327:        } else if (utf8_put_item(ud->data, ud->size, &index) != 0)
1.53      nicm      328:                goto fail;
1.54      nicm      329:        *uc = UTF8_SET_SIZE(ud->size)|UTF8_SET_WIDTH(ud->width)|index;
1.53      nicm      330:        log_debug("%s: (%d %d %.*s) -> %08x", __func__, ud->width, ud->size,
                    331:            (int)ud->size, ud->data, *uc);
1.47      nicm      332:        return (UTF8_DONE);
1.45      nicm      333:
                    334: fail:
1.52      nicm      335:        if (ud->width == 0)
1.53      nicm      336:                *uc = UTF8_SET_SIZE(0)|UTF8_SET_WIDTH(0);
1.52      nicm      337:        else if (ud->width == 1)
1.53      nicm      338:                *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x20;
1.47      nicm      339:        else
1.53      nicm      340:                *uc = UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|0x2020;
1.47      nicm      341:        return (UTF8_ERROR);
1.45      nicm      342: }
                    343:
1.47      nicm      344: /* Get UTF-8 data from character. */
1.45      nicm      345: void
1.47      nicm      346: utf8_to_data(utf8_char uc, struct utf8_data *ud)
1.45      nicm      347: {
1.47      nicm      348:        struct utf8_item        *ui;
1.54      nicm      349:        u_int                    index;
1.45      nicm      350:
                    351:        memset(ud, 0, sizeof *ud);
1.53      nicm      352:        ud->size = ud->have = UTF8_GET_SIZE(uc);
                    353:        ud->width = UTF8_GET_WIDTH(uc);
1.45      nicm      354:
                    355:        if (ud->size <= 3) {
1.53      nicm      356:                ud->data[2] = (uc >> 16);
                    357:                ud->data[1] = ((uc >> 8) & 0xff);
                    358:                ud->data[0] = (uc & 0xff);
                    359:        } else {
1.54      nicm      360:                index = (uc & 0xffffff);
                    361:                if ((ui = utf8_item_by_index(index)) == NULL)
1.53      nicm      362:                        memset(ud->data, ' ', ud->size);
1.54      nicm      363:                else
1.53      nicm      364:                        memcpy(ud->data, ui->data, ud->size);
1.45      nicm      365:        }
                    366:
1.53      nicm      367:        log_debug("%s: %08x -> (%d %d %.*s)", __func__, uc, ud->width, ud->size,
                    368:            (int)ud->size, ud->data);
1.45      nicm      369: }
                    370:
1.47      nicm      371: /* Get UTF-8 character from a single ASCII character. */
1.46      nicm      372: u_int
1.52      nicm      373: utf8_build_one(u_char ch)
1.45      nicm      374: {
1.53      nicm      375:        return (UTF8_SET_SIZE(1)|UTF8_SET_WIDTH(1)|ch);
1.45      nicm      376: }
1.29      nicm      377:
1.11      nicm      378: /* Set a single character. */
                    379: void
1.19      nicm      380: utf8_set(struct utf8_data *ud, u_char ch)
1.11      nicm      381: {
1.33      nicm      382:        static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
1.20      nicm      383:
1.33      nicm      384:        memcpy(ud, &empty, sizeof *ud);
1.19      nicm      385:        *ud->data = ch;
1.20      nicm      386: }
                    387:
                    388: /* Copy UTF-8 character. */
                    389: void
                    390: utf8_copy(struct utf8_data *to, const struct utf8_data *from)
                    391: {
                    392:        u_int   i;
                    393:
                    394:        memcpy(to, from, sizeof *to);
                    395:
                    396:        for (i = to->size; i < sizeof to->data; i++)
                    397:                to->data[i] = '\0';
1.11      nicm      398: }
                    399:
1.47      nicm      400: /* Get width of Unicode character. */
1.48      nicm      401: static enum utf8_state
                    402: utf8_width(struct utf8_data *ud, int *width)
1.47      nicm      403: {
1.48      nicm      404:        wchar_t wc;
1.47      nicm      405:
1.64    ! nicm      406:        if (utf8_towc(ud, &wc) != UTF8_DONE)
1.48      nicm      407:                return (UTF8_ERROR);
1.64    ! nicm      408:        if (utf8_in_table(wc, utf8_force_wide, nitems(utf8_force_wide))) {
        !           409:                *width = 2;
        !           410:                return (UTF8_DONE);
1.48      nicm      411:        }
1.64    ! nicm      412:
1.48      nicm      413:        *width = wcwidth(wc);
1.63      nicm      414:        log_debug("wcwidth(%05X) returned %d", (u_int)wc, *width);
1.60      nicm      415:        if (*width < 0) {
                    416:                /*
                    417:                 * C1 control characters are nonprintable, so they are always
                    418:                 * zero width.
                    419:                 */
                    420:                *width = (wc >= 0x80 && wc <= 0x9f) ? 0 : 1;
                    421:        }
1.59      nicm      422:        if (*width >= 0 && *width <= 0xff)
                    423:                return (UTF8_DONE);
                    424:        return (UTF8_ERROR);
1.64    ! nicm      425: }
        !           426:
        !           427: /* Convert UTF-8 character to wide character. */
        !           428: enum utf8_state
        !           429: utf8_towc(const struct utf8_data *ud, wchar_t *wc)
        !           430: {
        !           431:        switch (mbtowc(wc, ud->data, ud->size)) {
        !           432:        case -1:
        !           433:                log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
        !           434:                    errno);
        !           435:                mbtowc(NULL, NULL, MB_CUR_MAX);
        !           436:                return (UTF8_ERROR);
        !           437:        case 0:
        !           438:                return (UTF8_ERROR);
        !           439:        }
        !           440:        log_debug("UTF-8 %.*s is %05X", (int)ud->size, ud->data, (u_int)*wc);
        !           441:        return (UTF8_DONE);
1.47      nicm      442: }
                    443:
1.4       nicm      444: /*
                    445:  * Open UTF-8 sequence.
                    446:  *
                    447:  * 11000010-11011111 C2-DF start of 2-byte sequence
                    448:  * 11100000-11101111 E0-EF start of 3-byte sequence
                    449:  * 11110000-11110100 F0-F4 start of 4-byte sequence
                    450:  */
1.23      nicm      451: enum utf8_state
1.19      nicm      452: utf8_open(struct utf8_data *ud, u_char ch)
1.4       nicm      453: {
1.19      nicm      454:        memset(ud, 0, sizeof *ud);
1.4       nicm      455:        if (ch >= 0xc2 && ch <= 0xdf)
1.19      nicm      456:                ud->size = 2;
1.4       nicm      457:        else if (ch >= 0xe0 && ch <= 0xef)
1.19      nicm      458:                ud->size = 3;
1.4       nicm      459:        else if (ch >= 0xf0 && ch <= 0xf4)
1.19      nicm      460:                ud->size = 4;
1.4       nicm      461:        else
1.23      nicm      462:                return (UTF8_ERROR);
1.19      nicm      463:        utf8_append(ud, ch);
1.23      nicm      464:        return (UTF8_MORE);
1.4       nicm      465: }
                    466:
1.23      nicm      467: /* Append character to UTF-8, closing if finished. */
                    468: enum utf8_state
1.19      nicm      469: utf8_append(struct utf8_data *ud, u_char ch)
1.4       nicm      470: {
1.29      nicm      471:        int     width;
                    472:
1.19      nicm      473:        if (ud->have >= ud->size)
1.4       nicm      474:                fatalx("UTF-8 character overflow");
1.19      nicm      475:        if (ud->size > sizeof ud->data)
1.4       nicm      476:                fatalx("UTF-8 character size too large");
                    477:
1.21      nicm      478:        if (ud->have != 0 && (ch & 0xc0) != 0x80)
                    479:                ud->width = 0xff;
                    480:
1.19      nicm      481:        ud->data[ud->have++] = ch;
                    482:        if (ud->have != ud->size)
1.23      nicm      483:                return (UTF8_MORE);
1.4       nicm      484:
1.21      nicm      485:        if (ud->width == 0xff)
1.23      nicm      486:                return (UTF8_ERROR);
1.48      nicm      487:        if (utf8_width(ud, &width) != UTF8_DONE)
1.29      nicm      488:                return (UTF8_ERROR);
                    489:        ud->width = width;
                    490:
1.23      nicm      491:        return (UTF8_DONE);
1.9       nicm      492: }
                    493:
                    494: /*
                    495:  * Encode len characters from src into dst, which is guaranteed to have four
                    496:  * bytes available for each character from src (for \abc or UTF-8) plus space
                    497:  * for \0.
                    498:  */
                    499: int
                    500: utf8_strvis(char *dst, const char *src, size_t len, int flag)
                    501: {
1.19      nicm      502:        struct utf8_data         ud;
1.47      nicm      503:        const char              *start = dst, *end = src + len;
1.23      nicm      504:        enum utf8_state          more;
1.9       nicm      505:        size_t                   i;
                    506:
                    507:        while (src < end) {
1.23      nicm      508:                if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
                    509:                        while (++src < end && more == UTF8_MORE)
1.19      nicm      510:                                more = utf8_append(&ud, *src);
1.23      nicm      511:                        if (more == UTF8_DONE) {
1.9       nicm      512:                                /* UTF-8 character finished. */
1.19      nicm      513:                                for (i = 0; i < ud.size; i++)
                    514:                                        *dst++ = ud.data[i];
1.9       nicm      515:                                continue;
                    516:                        }
1.23      nicm      517:                        /* Not a complete, valid UTF-8 character. */
                    518:                        src -= ud.have;
1.9       nicm      519:                }
1.41      nicm      520:                if (src[0] == '$' && src < end - 1) {
1.42      nicm      521:                        if (isalpha((u_char)src[1]) ||
                    522:                            src[1] == '_' ||
                    523:                            src[1] == '{')
1.41      nicm      524:                                *dst++ = '\\';
                    525:                        *dst++ = '$';
                    526:                } else if (src < end - 1)
1.9       nicm      527:                        dst = vis(dst, src[0], flag, src[1]);
                    528:                else if (src < end)
                    529:                        dst = vis(dst, src[0], flag, '\0');
                    530:                src++;
                    531:        }
                    532:        *dst = '\0';
                    533:        return (dst - start);
1.35      nicm      534: }
                    535:
                    536: /* Same as utf8_strvis but allocate the buffer. */
                    537: int
                    538: utf8_stravis(char **dst, const char *src, int flag)
                    539: {
                    540:        char    *buf;
                    541:        int      len;
                    542:
                    543:        buf = xreallocarray(NULL, 4, strlen(src) + 1);
                    544:        len = utf8_strvis(buf, src, strlen(src), flag);
1.56      nicm      545:
                    546:        *dst = xrealloc(buf, len + 1);
                    547:        return (len);
                    548: }
                    549:
                    550: /* Same as utf8_strvis but allocate the buffer. */
                    551: int
                    552: utf8_stravisx(char **dst, const char *src, size_t srclen, int flag)
                    553: {
                    554:        char    *buf;
                    555:        int      len;
                    556:
                    557:        buf = xreallocarray(NULL, 4, srclen + 1);
                    558:        len = utf8_strvis(buf, src, srclen, flag);
1.35      nicm      559:
                    560:        *dst = xrealloc(buf, len + 1);
                    561:        return (len);
1.38      nicm      562: }
                    563:
                    564: /* Does this string contain anything that isn't valid UTF-8? */
                    565: int
                    566: utf8_isvalid(const char *s)
                    567: {
1.47      nicm      568:        struct utf8_data ud;
                    569:        const char      *end;
                    570:        enum utf8_state  more;
1.38      nicm      571:
                    572:        end = s + strlen(s);
                    573:        while (s < end) {
                    574:                if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
                    575:                        while (++s < end && more == UTF8_MORE)
                    576:                                more = utf8_append(&ud, *s);
                    577:                        if (more == UTF8_DONE)
                    578:                                continue;
                    579:                        return (0);
                    580:                }
                    581:                if (*s < 0x20 || *s > 0x7e)
                    582:                        return (0);
                    583:                s++;
                    584:        }
                    585:        return (1);
1.16      nicm      586: }
                    587:
                    588: /*
                    589:  * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
                    590:  * the returned string. Anything not valid printable ASCII or UTF-8 is
                    591:  * stripped.
                    592:  */
                    593: char *
                    594: utf8_sanitize(const char *src)
                    595: {
1.47      nicm      596:        char            *dst = NULL;
                    597:        size_t           n = 0;
                    598:        enum utf8_state  more;
                    599:        struct utf8_data ud;
                    600:        u_int            i;
1.16      nicm      601:
                    602:        while (*src != '\0') {
                    603:                dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23      nicm      604:                if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
                    605:                        while (*++src != '\0' && more == UTF8_MORE)
1.19      nicm      606:                                more = utf8_append(&ud, *src);
1.23      nicm      607:                        if (more == UTF8_DONE) {
1.19      nicm      608:                                dst = xreallocarray(dst, n + ud.width,
1.16      nicm      609:                                    sizeof *dst);
1.19      nicm      610:                                for (i = 0; i < ud.width; i++)
1.16      nicm      611:                                        dst[n++] = '_';
                    612:                                continue;
                    613:                        }
1.19      nicm      614:                        src -= ud.have;
1.16      nicm      615:                }
                    616:                if (*src > 0x1f && *src < 0x7f)
1.21      nicm      617:                        dst[n++] = *src;
1.23      nicm      618:                else
                    619:                        dst[n++] = '_';
1.16      nicm      620:                src++;
                    621:        }
                    622:        dst = xreallocarray(dst, n + 1, sizeof *dst);
                    623:        dst[n] = '\0';
                    624:        return (dst);
1.34      nicm      625: }
                    626:
                    627: /*
                          * Get UTF-8 buffer length: count the entries in s that come before the
                          * first entry whose size is 0. That terminating entry is not counted.
                          */
                    628: size_t
                    629: utf8_strlen(const struct utf8_data *s)
                    630: {
                    631:        size_t  i;
                    632:
                    633:        for (i = 0; s[i].size != 0; i++)
                    634:                /* nothing: the loop header does the counting */;
                    635:        return (i);
                    636: }
                    637:
                    638: /*
                          * Get UTF-8 string width: add up the width field of the entries in s.
                          * Stops at the terminating entry (size == 0) or once n entries have been
                          * added, whichever comes first. Passing n as -1 disables the limit.
                          */
                    639: u_int
                    640: utf8_strwidth(const struct utf8_data *s, ssize_t n)
                    641: {
                    642:        ssize_t i;
1.47      nicm      643:        u_int   width = 0;
1.34      nicm      644:
                    645:        for (i = 0; s[i].size != 0; i++) {
                                        /* i entries are already summed; stop once that reaches n. */
                    646:                if (n != -1 && n == i)
                    647:                        break;
                    648:                width += s[i].width;
                    649:        }
                    650:        return (width);
1.11      nicm      651: }
                    652:
                    653: /*
                    654:  * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
                    655:  * Caller frees.
                          *
                          * When utf8_open() reports UTF8_MORE for a byte, the following bytes are
                          * passed to utf8_append(); if that ends with UTF8_DONE the sequence becomes
                          * one entry. Otherwise src is rewound and the byte is stored on its own
                          * with utf8_set(), as is any byte for which utf8_open() does not report
                          * UTF8_MORE. An empty string yields only the terminating entry.
                    656:  */
                    657: struct utf8_data *
                    658: utf8_fromcstr(const char *src)
                    659: {
1.47      nicm      660:        struct utf8_data        *dst = NULL;
                    661:        size_t                   n = 0;
1.23      nicm      662:        enum utf8_state          more;
1.11      nicm      663:
                    664:        while (*src != '\0') {
                                        /* Make room for entry n before it is filled in. */
1.12      nicm      665:                dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23      nicm      666:                if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
                                                /*
                                                 * src is advanced before the state is tested, so
                                                 * when this loop stops src is already one past the
                                                 * last byte handed to utf8_append().
                                                 */
                    667:                        while (*++src != '\0' && more == UTF8_MORE)
1.11      nicm      668:                                more = utf8_append(&dst[n], *src);
1.23      nicm      669:                        if (more == UTF8_DONE) {
1.11      nicm      670:                                n++;
                    671:                                continue;
                    672:                        }
                                                /*
                                                 * Sequence did not finish with UTF8_DONE: step back
                                                 * by dst[n].have (presumably the number of bytes
                                                 * consumed so far - confirm against utf8_open() and
                                                 * utf8_append()) so the first byte is handled below.
                                                 */
                    673:                        src -= dst[n].have;
                    674:                }
                                        /* Store the current byte as an entry of its own. */
1.23      nicm      675:                utf8_set(&dst[n], *src);
                    676:                n++;
1.11      nicm      677:                src++;
                    678:        }
                                /* Append the terminating entry that marks the end of the buffer. */
1.12      nicm      679:        dst = xreallocarray(dst, n + 1, sizeof *dst);
1.11      nicm      680:        dst[n].size = 0;
                    681:        return (dst);
                    682: }
                    683:
                    684: /*
                          * Convert from a buffer of UTF-8 characters into a string. Caller frees.
                          *
                          * The data bytes of each entry (size bytes per entry) are concatenated in
                          * order until an entry with size == 0 is reached, then a NUL is appended.
                          * A buffer holding only the terminating entry gives an empty string.
                          */
                    685: char *
                    686: utf8_tocstr(struct utf8_data *src)
                    687: {
1.47      nicm      688:        char    *dst = NULL;
                    689:        size_t   n = 0;
1.11      nicm      690:
                    691:        for(; src->size != 0; src++) {
                                        /* Grow by exactly this entry's bytes; n is the length so far. */
1.12      nicm      692:                dst = xreallocarray(dst, n + src->size, 1);
1.11      nicm      693:                memcpy(dst + n, src->data, src->size);
                    694:                n += src->size;
                    695:        }
                                /* One more byte for the terminating NUL. */
1.12      nicm      696:        dst = xreallocarray(dst, n + 1, 1);
1.11      nicm      697:        dst[n] = '\0';
                    698:        return (dst);
                    699: }
                    700:
                    701: /*
                          * Get width of UTF-8 string: the sum of tmp.width for every sequence that
                          * utf8_open()/utf8_append() finish with UTF8_DONE, plus one for every
                          * other byte that passes the "> 0x1f and != 0x7f" test below. Bytes that
                          * fail that test add nothing.
                          */
                    702: u_int
                    703: utf8_cstrwidth(const char *s)
                    704: {
                    705:        struct utf8_data        tmp;
                    706:        u_int                   width;
1.23      nicm      707:        enum utf8_state         more;
1.11      nicm      708:
                    709:        width = 0;
                    710:        while (*s != '\0') {
1.23      nicm      711:                if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
                                                /*
                                                 * s is advanced before the state is tested, so when
                                                 * this loop stops s is already one past the last
                                                 * byte handed to utf8_append().
                                                 */
                    712:                        while (*++s != '\0' && more == UTF8_MORE)
1.11      nicm      713:                                more = utf8_append(&tmp, *s);
1.23      nicm      714:                        if (more == UTF8_DONE) {
1.11      nicm      715:                                width += tmp.width;
                    716:                                continue;
                    717:                        }
                                                /*
                                                 * Sequence did not finish with UTF8_DONE: step back
                                                 * by tmp.have (presumably the number of bytes
                                                 * consumed so far - confirm against utf8_open() and
                                                 * utf8_append()) and treat the first byte alone.
                                                 */
                    718:                        s -= tmp.have;
                    719:                }
                                        /*
                                         * NOTE(review): *s is a plain char, so whether a byte
                                         * >= 0x80 satisfies "> 0x1f" depends on the platform's
                                         * char signedness - confirm the intended width for stray
                                         * high bytes.
                                         */
1.23      nicm      720:                if (*s > 0x1f && *s != 0x7f)
1.21      nicm      721:                        width++;
1.11      nicm      722:                s++;
                    723:        }
                    724:        return (width);
1.18      nicm      725: }
                    726:
1.44      nicm      727: /*
                          * Pad UTF-8 string to width, keeping the text on the left: spaces are
                          * appended after s until its width as measured by utf8_cstrwidth()
                          * reaches width. If s is already at least that wide, a plain copy is
                          * returned. Caller frees.
                          */
1.18      nicm      728: char *
                    729: utf8_padcstr(const char *s, u_int width)
                    730: {
                    731:        size_t   slen;
                    732:        char    *out;
1.47      nicm      733:        u_int    n, i;
1.18      nicm      734:
                    735:        n = utf8_cstrwidth(s);
                    736:        if (n >= width)
                    737:                return (xstrdup(s));
                    738:
                    739:        slen = strlen(s);
                                /* Bytes of s, plus (width - n) spaces, plus the terminating NUL. */
                    740:        out = xmalloc(slen + 1 + (width - n));
                    741:        memcpy(out, s, slen);
                                /* slen doubles as the write position while the spaces are added. */
                    742:        for (i = n; i < width; i++)
                    743:                out[slen++] = ' ';
                    744:        out[slen] = '\0';
1.44      nicm      745:        return (out);
                    746: }
                    747:
                    748: /*
                          * Pad UTF-8 string to width, keeping the text on the right: spaces are
                          * written before s until its width as measured by utf8_cstrwidth()
                          * reaches width. If s is already at least that wide, a plain copy is
                          * returned. Caller frees.
                          */
                    749: char *
                    750: utf8_rpadcstr(const char *s, u_int width)
                    751: {
                    752:        size_t   slen;
                    753:        char    *out;
1.47      nicm      754:        u_int    n, i;
1.44      nicm      755:
                    756:        n = utf8_cstrwidth(s);
                    757:        if (n >= width)
                    758:                return (xstrdup(s));
                    759:
                    760:        slen = strlen(s);
                                /* Bytes of s, plus (width - n) spaces, plus the terminating NUL. */
                    761:        out = xmalloc(slen + 1 + (width - n));
                    762:        for (i = 0; i < width - n; i++)
                    763:                out[i] = ' ';
                                /* i is now the number of spaces written, i.e. where the text starts. */
                    764:        memcpy(out + i, s, slen);
                    765:        out[i + slen] = '\0';
1.11      nicm      766:        return (out);
1.43      nicm      767: }
                    768:
                         /*
                          * Check whether the string s contains the character ud. s is converted
                          * with utf8_fromcstr() and each resulting entry is compared with ud by
                          * size and then by data bytes. Returns 1 on the first match, 0 if none.
                          */
                    769: int
                    770: utf8_cstrhas(const char *s, const struct utf8_data *ud)
                    771: {
                    772:        struct utf8_data        *copy, *loop;
                    773:        int                      found = 0;
                    774:
                    775:        copy = utf8_fromcstr(s);
                    776:        for (loop = copy; loop->size != 0; loop++) {
                                        /* Entries of a different size are skipped without comparing bytes. */
                    777:                if (loop->size != ud->size)
                    778:                        continue;
                    779:                if (memcmp(loop->data, ud->data, loop->size) == 0) {
                    780:                        found = 1;
                    781:                        break;
                    782:                }
                    783:        }
                                /* The converted buffer is only needed for the search. */
                    784:        free(copy);
                    785:
                    786:        return (found);
1.1       nicm      787: }