Annotation of src/usr.bin/tmux/utf8.c, Revision 1.38
1.38 ! nicm 1: /* $OpenBSD: utf8.c,v 1.37 2017/05/31 17:56:48 nicm Exp $ */
1.1 nicm 2:
3: /*
1.26 nicm 4: * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
1.1 nicm 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15: * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16: * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18:
19: #include <sys/types.h>
20:
1.30 nicm 21: #include <errno.h>
1.11 nicm 22: #include <stdlib.h>
1.1 nicm 23: #include <string.h>
1.9 nicm 24: #include <vis.h>
1.28 nicm 25: #include <wchar.h>
1.1 nicm 26:
27: #include "tmux.h"
28:
1.29 nicm 29: static int utf8_width(wchar_t);
30:
1.11 nicm 31: /* Set a single character. */
32: void
1.19 nicm 33: utf8_set(struct utf8_data *ud, u_char ch)
1.11 nicm 34: {
1.33 nicm 35: static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
1.20 nicm 36:
1.33 nicm 37: memcpy(ud, &empty, sizeof *ud);
1.19 nicm 38: *ud->data = ch;
1.20 nicm 39: }
40:
41: /* Copy UTF-8 character. */
42: void
43: utf8_copy(struct utf8_data *to, const struct utf8_data *from)
44: {
45: u_int i;
46:
47: memcpy(to, from, sizeof *to);
48:
49: for (i = to->size; i < sizeof to->data; i++)
50: to->data[i] = '\0';
1.11 nicm 51: }
52:
1.4 nicm 53: /*
54: * Open UTF-8 sequence.
55: *
56: * 11000010-11011111 C2-DF start of 2-byte sequence
57: * 11100000-11101111 E0-EF start of 3-byte sequence
58: * 11110000-11110100 F0-F4 start of 4-byte sequence
59: */
1.23 nicm 60: enum utf8_state
1.19 nicm 61: utf8_open(struct utf8_data *ud, u_char ch)
1.4 nicm 62: {
1.19 nicm 63: memset(ud, 0, sizeof *ud);
1.4 nicm 64: if (ch >= 0xc2 && ch <= 0xdf)
1.19 nicm 65: ud->size = 2;
1.4 nicm 66: else if (ch >= 0xe0 && ch <= 0xef)
1.19 nicm 67: ud->size = 3;
1.4 nicm 68: else if (ch >= 0xf0 && ch <= 0xf4)
1.19 nicm 69: ud->size = 4;
1.4 nicm 70: else
1.23 nicm 71: return (UTF8_ERROR);
1.19 nicm 72: utf8_append(ud, ch);
1.23 nicm 73: return (UTF8_MORE);
1.4 nicm 74: }
75:
1.23 nicm 76: /* Append character to UTF-8, closing if finished. */
77: enum utf8_state
1.19 nicm 78: utf8_append(struct utf8_data *ud, u_char ch)
1.4 nicm 79: {
1.29 nicm 80: wchar_t wc;
81: int width;
82:
1.19 nicm 83: if (ud->have >= ud->size)
1.4 nicm 84: fatalx("UTF-8 character overflow");
1.19 nicm 85: if (ud->size > sizeof ud->data)
1.4 nicm 86: fatalx("UTF-8 character size too large");
87:
1.21 nicm 88: if (ud->have != 0 && (ch & 0xc0) != 0x80)
89: ud->width = 0xff;
90:
1.19 nicm 91: ud->data[ud->have++] = ch;
92: if (ud->have != ud->size)
1.23 nicm 93: return (UTF8_MORE);
1.4 nicm 94:
1.21 nicm 95: if (ud->width == 0xff)
1.23 nicm 96: return (UTF8_ERROR);
1.29 nicm 97:
98: if (utf8_combine(ud, &wc) != UTF8_DONE)
99: return (UTF8_ERROR);
100: if ((width = utf8_width(wc)) < 0)
101: return (UTF8_ERROR);
102: ud->width = width;
103:
1.23 nicm 104: return (UTF8_DONE);
1.1 nicm 105: }
106:
/*
 * Ask wcwidth() for the width of a Unicode character. A negative result, or
 * one larger than 0xff, is logged and reported to the caller as -1.
 */
static int
utf8_width(wchar_t wc)
{
	int	cols;

	cols = wcwidth(wc);
	if (cols >= 0 && cols <= 0xff)
		return (cols);

	log_debug("Unicode %04lx, wcwidth() %d", (long)wc, cols);
	return (-1);
}
120:
1.28 nicm 121: /* Combine UTF-8 into Unicode. */
1.29 nicm 122: enum utf8_state
123: utf8_combine(const struct utf8_data *ud, wchar_t *wc)
1.1 nicm 124: {
1.29 nicm 125: switch (mbtowc(wc, ud->data, ud->size)) {
126: case -1:
1.30 nicm 127: log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
128: errno);
1.29 nicm 129: mbtowc(NULL, NULL, MB_CUR_MAX);
130: return (UTF8_ERROR);
131: case 0:
132: return (UTF8_ERROR);
133: default:
134: return (UTF8_DONE);
135: }
1.15 nicm 136: }
137:
1.28 nicm 138: /* Split Unicode into UTF-8. */
1.23 nicm 139: enum utf8_state
1.28 nicm 140: utf8_split(wchar_t wc, struct utf8_data *ud)
1.15 nicm 141: {
1.29 nicm 142: char s[MB_LEN_MAX];
143: int slen;
1.28 nicm 144:
145: slen = wctomb(s, wc);
146: if (slen <= 0 || slen > (int)sizeof ud->data)
1.23 nicm 147: return (UTF8_ERROR);
1.28 nicm 148:
149: memcpy(ud->data, s, slen);
150: ud->size = slen;
151:
152: ud->width = utf8_width(wc);
1.23 nicm 153: return (UTF8_DONE);
1.9 nicm 154: }
155:
156: /*
157: * Encode len characters from src into dst, which is guaranteed to have four
158: * bytes available for each character from src (for \abc or UTF-8) plus space
159: * for \0.
160: */
161: int
162: utf8_strvis(char *dst, const char *src, size_t len, int flag)
163: {
1.19 nicm 164: struct utf8_data ud;
1.9 nicm 165: const char *start, *end;
1.23 nicm 166: enum utf8_state more;
1.9 nicm 167: size_t i;
168:
169: start = dst;
170: end = src + len;
171:
172: while (src < end) {
1.23 nicm 173: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
174: while (++src < end && more == UTF8_MORE)
1.19 nicm 175: more = utf8_append(&ud, *src);
1.23 nicm 176: if (more == UTF8_DONE) {
1.9 nicm 177: /* UTF-8 character finished. */
1.19 nicm 178: for (i = 0; i < ud.size; i++)
179: *dst++ = ud.data[i];
1.9 nicm 180: continue;
181: }
1.23 nicm 182: /* Not a complete, valid UTF-8 character. */
183: src -= ud.have;
1.9 nicm 184: }
185: if (src < end - 1)
186: dst = vis(dst, src[0], flag, src[1]);
187: else if (src < end)
188: dst = vis(dst, src[0], flag, '\0');
189: src++;
190: }
191:
192: *dst = '\0';
193: return (dst - start);
1.35 nicm 194: }
195:
/*
 * Like utf8_strvis, but the output buffer is allocated here and handed back
 * through *dst. The buffer starts at four bytes per source byte (plus one)
 * and is resized to fit the result. Returns the encoded length.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen;
	char	*out;
	int	 n;

	srclen = strlen(src);

	out = xreallocarray(NULL, 4, srclen + 1);
	n = utf8_strvis(out, src, srclen, flag);

	*dst = xrealloc(out, n + 1);
	return (n);
}
! 209:
! 210: /* Does this string contain anything that isn't valid UTF-8? */
! 211: int
! 212: utf8_isvalid(const char *s)
! 213: {
! 214: struct utf8_data ud;
! 215: const char *end;
! 216: enum utf8_state more;
! 217: size_t i;
! 218:
! 219: end = s + strlen(s);
! 220: while (s < end) {
! 221: if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
! 222: while (++s < end && more == UTF8_MORE)
! 223: more = utf8_append(&ud, *s);
! 224: if (more == UTF8_DONE)
! 225: continue;
! 226: return (0);
! 227: }
! 228: if (*s < 0x20 || *s > 0x7e)
! 229: return (0);
! 230: s++;
! 231: }
! 232: return (1);
1.16 nicm 233: }
234:
235: /*
236: * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
237: * the returned string. Anything not valid printable ASCII or UTF-8 is
238: * stripped.
239: */
240: char *
241: utf8_sanitize(const char *src)
242: {
243: char *dst;
244: size_t n;
1.23 nicm 245: enum utf8_state more;
1.19 nicm 246: struct utf8_data ud;
1.16 nicm 247: u_int i;
248:
249: dst = NULL;
250:
251: n = 0;
252: while (*src != '\0') {
253: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 254: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
255: while (*++src != '\0' && more == UTF8_MORE)
1.19 nicm 256: more = utf8_append(&ud, *src);
1.23 nicm 257: if (more == UTF8_DONE) {
1.19 nicm 258: dst = xreallocarray(dst, n + ud.width,
1.16 nicm 259: sizeof *dst);
1.19 nicm 260: for (i = 0; i < ud.width; i++)
1.16 nicm 261: dst[n++] = '_';
262: continue;
263: }
1.19 nicm 264: src -= ud.have;
1.16 nicm 265: }
266: if (*src > 0x1f && *src < 0x7f)
1.21 nicm 267: dst[n++] = *src;
1.23 nicm 268: else
269: dst[n++] = '_';
1.16 nicm 270: src++;
271: }
272:
273: dst = xreallocarray(dst, n + 1, sizeof *dst);
274: dst[n] = '\0';
275: return (dst);
1.34 nicm 276: }
277:
278: /* Get UTF-8 buffer length. */
279: size_t
280: utf8_strlen(const struct utf8_data *s)
281: {
282: size_t i;
283:
284: for (i = 0; s[i].size != 0; i++)
285: /* nothing */;
286: return (i);
287: }
288:
289: /* Get UTF-8 string width. */
290: u_int
291: utf8_strwidth(const struct utf8_data *s, ssize_t n)
292: {
293: ssize_t i;
294: u_int width;
295:
296: width = 0;
297: for (i = 0; s[i].size != 0; i++) {
298: if (n != -1 && n == i)
299: break;
300: width += s[i].width;
301: }
302: return (width);
1.11 nicm 303: }
304:
305: /*
306: * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
307: * Caller frees.
308: */
309: struct utf8_data *
310: utf8_fromcstr(const char *src)
311: {
312: struct utf8_data *dst;
313: size_t n;
1.23 nicm 314: enum utf8_state more;
1.11 nicm 315:
316: dst = NULL;
317:
318: n = 0;
319: while (*src != '\0') {
1.12 nicm 320: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 321: if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
322: while (*++src != '\0' && more == UTF8_MORE)
1.11 nicm 323: more = utf8_append(&dst[n], *src);
1.23 nicm 324: if (more == UTF8_DONE) {
1.11 nicm 325: n++;
326: continue;
327: }
328: src -= dst[n].have;
329: }
1.23 nicm 330: utf8_set(&dst[n], *src);
331: n++;
1.11 nicm 332: src++;
333: }
334:
1.12 nicm 335: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.11 nicm 336: dst[n].size = 0;
337: return (dst);
338: }
339:
340: /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
341: char *
342: utf8_tocstr(struct utf8_data *src)
343: {
344: char *dst;
345: size_t n;
346:
347: dst = NULL;
348:
349: n = 0;
350: for(; src->size != 0; src++) {
1.12 nicm 351: dst = xreallocarray(dst, n + src->size, 1);
1.11 nicm 352: memcpy(dst + n, src->data, src->size);
353: n += src->size;
354: }
355:
1.12 nicm 356: dst = xreallocarray(dst, n + 1, 1);
1.11 nicm 357: dst[n] = '\0';
358: return (dst);
359: }
360:
361: /* Get width of UTF-8 string. */
362: u_int
363: utf8_cstrwidth(const char *s)
364: {
365: struct utf8_data tmp;
366: u_int width;
1.23 nicm 367: enum utf8_state more;
1.11 nicm 368:
369: width = 0;
370: while (*s != '\0') {
1.23 nicm 371: if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
372: while (*++s != '\0' && more == UTF8_MORE)
1.11 nicm 373: more = utf8_append(&tmp, *s);
1.23 nicm 374: if (more == UTF8_DONE) {
1.11 nicm 375: width += tmp.width;
376: continue;
377: }
378: s -= tmp.have;
379: }
1.23 nicm 380: if (*s > 0x1f && *s != 0x7f)
1.21 nicm 381: width++;
1.11 nicm 382: s++;
383: }
384: return (width);
385: }
386:
387: /* Trim UTF-8 string to width. Caller frees. */
388: char *
389: utf8_trimcstr(const char *s, u_int width)
390: {
391: struct utf8_data *tmp, *next;
392: char *out;
393: u_int at;
394:
395: tmp = utf8_fromcstr(s);
396:
397: at = 0;
398: for (next = tmp; next->size != 0; next++) {
399: if (at + next->width > width) {
400: next->size = 0;
401: break;
402: }
403: at += next->width;
404: }
405:
406: out = utf8_tocstr(tmp);
1.27 nicm 407: free(tmp);
408: return (out);
409: }
410:
411: /* Trim UTF-8 string to width. Caller frees. */
412: char *
413: utf8_rtrimcstr(const char *s, u_int width)
414: {
415: struct utf8_data *tmp, *next, *end;
416: char *out;
417: u_int at;
418:
419: tmp = utf8_fromcstr(s);
420:
421: for (end = tmp; end->size != 0; end++)
422: /* nothing */;
423: if (end == tmp) {
424: free(tmp);
425: return (xstrdup(""));
426: }
427: next = end - 1;
428:
429: at = 0;
1.37 nicm 430: for (;;) {
1.27 nicm 431: if (at + next->width > width) {
432: next++;
433: break;
434: }
435: at += next->width;
436:
437: if (next == tmp)
438: break;
439: next--;
440: }
441:
442: out = utf8_tocstr(next);
1.11 nicm 443: free(tmp);
1.18 nicm 444: return (out);
445: }
446:
447: /* Pad UTF-8 string to width. Caller frees. */
448: char *
449: utf8_padcstr(const char *s, u_int width)
450: {
451: size_t slen;
452: char *out;
453: u_int n, i;
454:
455: n = utf8_cstrwidth(s);
456: if (n >= width)
457: return (xstrdup(s));
458:
459: slen = strlen(s);
460: out = xmalloc(slen + 1 + (width - n));
461: memcpy(out, s, slen);
462: for (i = n; i < width; i++)
463: out[slen++] = ' ';
464: out[slen] = '\0';
1.11 nicm 465: return (out);
1.1 nicm 466: }