Annotation of src/usr.bin/tmux/utf8.c, Revision 1.34
1.34 ! nicm 1: /* $OpenBSD: utf8.c,v 1.33 2016/05/27 22:57:27 nicm Exp $ */
1.1 nicm 2:
3: /*
1.26 nicm 4: * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
1.1 nicm 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15: * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16: * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18:
19: #include <sys/types.h>
20:
1.30 nicm 21: #include <errno.h>
1.11 nicm 22: #include <stdlib.h>
1.1 nicm 23: #include <string.h>
1.9 nicm 24: #include <vis.h>
1.28 nicm 25: #include <wchar.h>
1.1 nicm 26:
27: #include "tmux.h"
28:
1.29 nicm 29: static int utf8_width(wchar_t);
30:
1.11 nicm 31: /* Set a single character. */
32: void
1.19 nicm 33: utf8_set(struct utf8_data *ud, u_char ch)
1.11 nicm 34: {
1.33 nicm 35: static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
1.20 nicm 36:
1.33 nicm 37: memcpy(ud, &empty, sizeof *ud);
1.19 nicm 38: *ud->data = ch;
1.20 nicm 39: }
40:
41: /* Copy UTF-8 character. */
42: void
43: utf8_copy(struct utf8_data *to, const struct utf8_data *from)
44: {
45: u_int i;
46:
47: memcpy(to, from, sizeof *to);
48:
49: for (i = to->size; i < sizeof to->data; i++)
50: to->data[i] = '\0';
1.11 nicm 51: }
52:
1.4 nicm 53: /*
54: * Open UTF-8 sequence.
55: *
56: * 11000010-11011111 C2-DF start of 2-byte sequence
57: * 11100000-11101111 E0-EF start of 3-byte sequence
58: * 11110000-11110100 F0-F4 start of 4-byte sequence
59: */
1.23 nicm 60: enum utf8_state
1.19 nicm 61: utf8_open(struct utf8_data *ud, u_char ch)
1.4 nicm 62: {
1.19 nicm 63: memset(ud, 0, sizeof *ud);
1.4 nicm 64: if (ch >= 0xc2 && ch <= 0xdf)
1.19 nicm 65: ud->size = 2;
1.4 nicm 66: else if (ch >= 0xe0 && ch <= 0xef)
1.19 nicm 67: ud->size = 3;
1.4 nicm 68: else if (ch >= 0xf0 && ch <= 0xf4)
1.19 nicm 69: ud->size = 4;
1.4 nicm 70: else
1.23 nicm 71: return (UTF8_ERROR);
1.19 nicm 72: utf8_append(ud, ch);
1.23 nicm 73: return (UTF8_MORE);
1.4 nicm 74: }
75:
1.23 nicm 76: /* Append character to UTF-8, closing if finished. */
77: enum utf8_state
1.19 nicm 78: utf8_append(struct utf8_data *ud, u_char ch)
1.4 nicm 79: {
1.29 nicm 80: wchar_t wc;
81: int width;
82:
1.19 nicm 83: if (ud->have >= ud->size)
1.4 nicm 84: fatalx("UTF-8 character overflow");
1.19 nicm 85: if (ud->size > sizeof ud->data)
1.4 nicm 86: fatalx("UTF-8 character size too large");
87:
1.21 nicm 88: if (ud->have != 0 && (ch & 0xc0) != 0x80)
89: ud->width = 0xff;
90:
1.19 nicm 91: ud->data[ud->have++] = ch;
92: if (ud->have != ud->size)
1.23 nicm 93: return (UTF8_MORE);
1.4 nicm 94:
1.21 nicm 95: if (ud->width == 0xff)
1.23 nicm 96: return (UTF8_ERROR);
1.29 nicm 97:
98: if (utf8_combine(ud, &wc) != UTF8_DONE)
99: return (UTF8_ERROR);
100: if ((width = utf8_width(wc)) < 0)
101: return (UTF8_ERROR);
102: ud->width = width;
103:
1.23 nicm 104: return (UTF8_DONE);
1.1 nicm 105: }
106:
1.28 nicm 107: /* Get width of Unicode character. */
1.29 nicm 108: static int
1.28 nicm 109: utf8_width(wchar_t wc)
1.17 nicm 110: {
1.29 nicm 111: int width;
1.17 nicm 112:
1.28 nicm 113: width = wcwidth(wc);
1.30 nicm 114: if (width < 0 || width > 0xff) {
115: log_debug("Unicode %04x, wcwidth() %d", wc, width);
1.29 nicm 116: return (-1);
1.30 nicm 117: }
1.28 nicm 118: return (width);
1.17 nicm 119: }
120:
1.28 nicm 121: /* Combine UTF-8 into Unicode. */
1.29 nicm 122: enum utf8_state
123: utf8_combine(const struct utf8_data *ud, wchar_t *wc)
1.1 nicm 124: {
1.29 nicm 125: switch (mbtowc(wc, ud->data, ud->size)) {
126: case -1:
1.30 nicm 127: log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
128: errno);
1.29 nicm 129: mbtowc(NULL, NULL, MB_CUR_MAX);
130: return (UTF8_ERROR);
131: case 0:
132: return (UTF8_ERROR);
133: default:
134: return (UTF8_DONE);
135: }
1.15 nicm 136: }
137:
1.28 nicm 138: /* Split Unicode into UTF-8. */
1.23 nicm 139: enum utf8_state
1.28 nicm 140: utf8_split(wchar_t wc, struct utf8_data *ud)
1.15 nicm 141: {
1.29 nicm 142: char s[MB_LEN_MAX];
143: int slen;
1.28 nicm 144:
145: slen = wctomb(s, wc);
146: if (slen <= 0 || slen > (int)sizeof ud->data)
1.23 nicm 147: return (UTF8_ERROR);
1.28 nicm 148:
149: memcpy(ud->data, s, slen);
150: ud->size = slen;
151:
152: ud->width = utf8_width(wc);
1.23 nicm 153: return (UTF8_DONE);
1.9 nicm 154: }
155:
156: /*
157: * Encode len characters from src into dst, which is guaranteed to have four
158: * bytes available for each character from src (for \abc or UTF-8) plus space
159: * for \0.
160: */
161: int
162: utf8_strvis(char *dst, const char *src, size_t len, int flag)
163: {
1.19 nicm 164: struct utf8_data ud;
1.9 nicm 165: const char *start, *end;
1.23 nicm 166: enum utf8_state more;
1.9 nicm 167: size_t i;
168:
169: start = dst;
170: end = src + len;
171:
172: while (src < end) {
1.23 nicm 173: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
174: while (++src < end && more == UTF8_MORE)
1.19 nicm 175: more = utf8_append(&ud, *src);
1.23 nicm 176: if (more == UTF8_DONE) {
1.9 nicm 177: /* UTF-8 character finished. */
1.19 nicm 178: for (i = 0; i < ud.size; i++)
179: *dst++ = ud.data[i];
1.9 nicm 180: continue;
181: }
1.23 nicm 182: /* Not a complete, valid UTF-8 character. */
183: src -= ud.have;
1.9 nicm 184: }
185: if (src < end - 1)
186: dst = vis(dst, src[0], flag, src[1]);
187: else if (src < end)
188: dst = vis(dst, src[0], flag, '\0');
189: src++;
190: }
191:
192: *dst = '\0';
193: return (dst - start);
1.16 nicm 194: }
195:
196: /*
197: * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
198: * the returned string. Anything not valid printable ASCII or UTF-8 is
199: * stripped.
200: */
201: char *
202: utf8_sanitize(const char *src)
203: {
204: char *dst;
205: size_t n;
1.23 nicm 206: enum utf8_state more;
1.19 nicm 207: struct utf8_data ud;
1.16 nicm 208: u_int i;
209:
210: dst = NULL;
211:
212: n = 0;
213: while (*src != '\0') {
214: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 215: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
216: while (*++src != '\0' && more == UTF8_MORE)
1.19 nicm 217: more = utf8_append(&ud, *src);
1.23 nicm 218: if (more == UTF8_DONE) {
1.19 nicm 219: dst = xreallocarray(dst, n + ud.width,
1.16 nicm 220: sizeof *dst);
1.19 nicm 221: for (i = 0; i < ud.width; i++)
1.16 nicm 222: dst[n++] = '_';
223: continue;
224: }
1.19 nicm 225: src -= ud.have;
1.16 nicm 226: }
227: if (*src > 0x1f && *src < 0x7f)
1.21 nicm 228: dst[n++] = *src;
1.23 nicm 229: else
230: dst[n++] = '_';
1.16 nicm 231: src++;
232: }
233:
234: dst = xreallocarray(dst, n + 1, sizeof *dst);
235: dst[n] = '\0';
236: return (dst);
1.34 ! nicm 237: }
! 238:
! 239: /* Get UTF-8 buffer length. */
! 240: size_t
! 241: utf8_strlen(const struct utf8_data *s)
! 242: {
! 243: size_t i;
! 244:
! 245: for (i = 0; s[i].size != 0; i++)
! 246: /* nothing */;
! 247: return (i);
! 248: }
! 249:
! 250: /* Get UTF-8 string width. */
! 251: u_int
! 252: utf8_strwidth(const struct utf8_data *s, ssize_t n)
! 253: {
! 254: ssize_t i;
! 255: u_int width;
! 256:
! 257: width = 0;
! 258: for (i = 0; s[i].size != 0; i++) {
! 259: if (n != -1 && n == i)
! 260: break;
! 261: width += s[i].width;
! 262: }
! 263: return (width);
1.11 nicm 264: }
265:
266: /*
267: * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
268: * Caller frees.
269: */
270: struct utf8_data *
271: utf8_fromcstr(const char *src)
272: {
273: struct utf8_data *dst;
274: size_t n;
1.23 nicm 275: enum utf8_state more;
1.11 nicm 276:
277: dst = NULL;
278:
279: n = 0;
280: while (*src != '\0') {
1.12 nicm 281: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 282: if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
283: while (*++src != '\0' && more == UTF8_MORE)
1.11 nicm 284: more = utf8_append(&dst[n], *src);
1.23 nicm 285: if (more == UTF8_DONE) {
1.11 nicm 286: n++;
287: continue;
288: }
289: src -= dst[n].have;
290: }
1.23 nicm 291: utf8_set(&dst[n], *src);
292: n++;
1.11 nicm 293: src++;
294: }
295:
1.12 nicm 296: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.11 nicm 297: dst[n].size = 0;
298: return (dst);
299: }
300:
301: /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
302: char *
303: utf8_tocstr(struct utf8_data *src)
304: {
305: char *dst;
306: size_t n;
307:
308: dst = NULL;
309:
310: n = 0;
311: for(; src->size != 0; src++) {
1.12 nicm 312: dst = xreallocarray(dst, n + src->size, 1);
1.11 nicm 313: memcpy(dst + n, src->data, src->size);
314: n += src->size;
315: }
316:
1.12 nicm 317: dst = xreallocarray(dst, n + 1, 1);
1.11 nicm 318: dst[n] = '\0';
319: return (dst);
320: }
321:
322: /* Get width of UTF-8 string. */
323: u_int
324: utf8_cstrwidth(const char *s)
325: {
326: struct utf8_data tmp;
327: u_int width;
1.23 nicm 328: enum utf8_state more;
1.11 nicm 329:
330: width = 0;
331: while (*s != '\0') {
1.23 nicm 332: if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
333: while (*++s != '\0' && more == UTF8_MORE)
1.11 nicm 334: more = utf8_append(&tmp, *s);
1.23 nicm 335: if (more == UTF8_DONE) {
1.11 nicm 336: width += tmp.width;
337: continue;
338: }
339: s -= tmp.have;
340: }
1.23 nicm 341: if (*s > 0x1f && *s != 0x7f)
1.21 nicm 342: width++;
1.11 nicm 343: s++;
344: }
345: return (width);
346: }
347:
348: /* Trim UTF-8 string to width. Caller frees. */
349: char *
350: utf8_trimcstr(const char *s, u_int width)
351: {
352: struct utf8_data *tmp, *next;
353: char *out;
354: u_int at;
355:
356: tmp = utf8_fromcstr(s);
357:
358: at = 0;
359: for (next = tmp; next->size != 0; next++) {
360: if (at + next->width > width) {
361: next->size = 0;
362: break;
363: }
364: at += next->width;
365: }
366:
367: out = utf8_tocstr(tmp);
1.27 nicm 368: free(tmp);
369: return (out);
370: }
371:
372: /* Trim UTF-8 string to width. Caller frees. */
373: char *
374: utf8_rtrimcstr(const char *s, u_int width)
375: {
376: struct utf8_data *tmp, *next, *end;
377: char *out;
378: u_int at;
379:
380: tmp = utf8_fromcstr(s);
381:
382: for (end = tmp; end->size != 0; end++)
383: /* nothing */;
384: if (end == tmp) {
385: free(tmp);
386: return (xstrdup(""));
387: }
388: next = end - 1;
389:
390: at = 0;
391: for (;;)
392: {
393: if (at + next->width > width) {
394: next++;
395: break;
396: }
397: at += next->width;
398:
399: if (next == tmp)
400: break;
401: next--;
402: }
403:
404: out = utf8_tocstr(next);
1.11 nicm 405: free(tmp);
1.18 nicm 406: return (out);
407: }
408:
409: /* Pad UTF-8 string to width. Caller frees. */
410: char *
411: utf8_padcstr(const char *s, u_int width)
412: {
413: size_t slen;
414: char *out;
415: u_int n, i;
416:
417: n = utf8_cstrwidth(s);
418: if (n >= width)
419: return (xstrdup(s));
420:
421: slen = strlen(s);
422: out = xmalloc(slen + 1 + (width - n));
423: memcpy(out, s, slen);
424: for (i = n; i < width; i++)
425: out[slen++] = ' ';
426: out[slen] = '\0';
1.11 nicm 427: return (out);
1.1 nicm 428: }