Annotation of src/usr.bin/tmux/utf8.c, Revision 1.42
1.42 ! nicm 1: /* $OpenBSD: utf8.c,v 1.41 2019/05/23 14:03:44 nicm Exp $ */
1.1 nicm 2:
3: /*
1.26 nicm 4: * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
1.1 nicm 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15: * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16: * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18:
19: #include <sys/types.h>
20:
1.41 nicm 21: #include <ctype.h>
1.30 nicm 22: #include <errno.h>
1.11 nicm 23: #include <stdlib.h>
1.1 nicm 24: #include <string.h>
1.9 nicm 25: #include <vis.h>
1.28 nicm 26: #include <wchar.h>
1.1 nicm 27:
28: #include "tmux.h"
29:
1.29 nicm 30: static int utf8_width(wchar_t);
31:
1.11 nicm 32: /* Set a single character. */
33: void
1.19 nicm 34: utf8_set(struct utf8_data *ud, u_char ch)
1.11 nicm 35: {
1.33 nicm 36: static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
1.20 nicm 37:
1.33 nicm 38: memcpy(ud, &empty, sizeof *ud);
1.19 nicm 39: *ud->data = ch;
1.20 nicm 40: }
41:
42: /* Copy UTF-8 character. */
43: void
44: utf8_copy(struct utf8_data *to, const struct utf8_data *from)
45: {
46: u_int i;
47:
48: memcpy(to, from, sizeof *to);
49:
50: for (i = to->size; i < sizeof to->data; i++)
51: to->data[i] = '\0';
1.11 nicm 52: }
53:
1.4 nicm 54: /*
55: * Open UTF-8 sequence.
56: *
57: * 11000010-11011111 C2-DF start of 2-byte sequence
58: * 11100000-11101111 E0-EF start of 3-byte sequence
59: * 11110000-11110100 F0-F4 start of 4-byte sequence
60: */
1.23 nicm 61: enum utf8_state
1.19 nicm 62: utf8_open(struct utf8_data *ud, u_char ch)
1.4 nicm 63: {
1.19 nicm 64: memset(ud, 0, sizeof *ud);
1.4 nicm 65: if (ch >= 0xc2 && ch <= 0xdf)
1.19 nicm 66: ud->size = 2;
1.4 nicm 67: else if (ch >= 0xe0 && ch <= 0xef)
1.19 nicm 68: ud->size = 3;
1.4 nicm 69: else if (ch >= 0xf0 && ch <= 0xf4)
1.19 nicm 70: ud->size = 4;
1.4 nicm 71: else
1.23 nicm 72: return (UTF8_ERROR);
1.19 nicm 73: utf8_append(ud, ch);
1.23 nicm 74: return (UTF8_MORE);
1.4 nicm 75: }
76:
1.23 nicm 77: /* Append character to UTF-8, closing if finished. */
78: enum utf8_state
1.19 nicm 79: utf8_append(struct utf8_data *ud, u_char ch)
1.4 nicm 80: {
1.29 nicm 81: wchar_t wc;
82: int width;
83:
1.19 nicm 84: if (ud->have >= ud->size)
1.4 nicm 85: fatalx("UTF-8 character overflow");
1.19 nicm 86: if (ud->size > sizeof ud->data)
1.4 nicm 87: fatalx("UTF-8 character size too large");
88:
1.21 nicm 89: if (ud->have != 0 && (ch & 0xc0) != 0x80)
90: ud->width = 0xff;
91:
1.19 nicm 92: ud->data[ud->have++] = ch;
93: if (ud->have != ud->size)
1.23 nicm 94: return (UTF8_MORE);
1.4 nicm 95:
1.21 nicm 96: if (ud->width == 0xff)
1.23 nicm 97: return (UTF8_ERROR);
1.29 nicm 98:
99: if (utf8_combine(ud, &wc) != UTF8_DONE)
100: return (UTF8_ERROR);
101: if ((width = utf8_width(wc)) < 0)
102: return (UTF8_ERROR);
103: ud->width = width;
104:
1.23 nicm 105: return (UTF8_DONE);
1.1 nicm 106: }
107:
/*
 * Return the column width wcwidth() reports for wc. A result outside 0..0xff
 * (including wcwidth()'s negative return) is logged and reported as -1.
 */
static int
utf8_width(wchar_t wc)
{
	int	cells;

	cells = wcwidth(wc);
	if (cells >= 0 && cells <= 0xff)
		return (cells);

	log_debug("Unicode %04lx, wcwidth() %d", (long)wc, cells);
	return (-1);
}
121:
1.28 nicm 122: /* Combine UTF-8 into Unicode. */
1.29 nicm 123: enum utf8_state
124: utf8_combine(const struct utf8_data *ud, wchar_t *wc)
1.1 nicm 125: {
1.29 nicm 126: switch (mbtowc(wc, ud->data, ud->size)) {
127: case -1:
1.30 nicm 128: log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
129: errno);
1.29 nicm 130: mbtowc(NULL, NULL, MB_CUR_MAX);
131: return (UTF8_ERROR);
132: case 0:
133: return (UTF8_ERROR);
134: default:
135: return (UTF8_DONE);
136: }
1.15 nicm 137: }
138:
1.28 nicm 139: /* Split Unicode into UTF-8. */
1.23 nicm 140: enum utf8_state
1.28 nicm 141: utf8_split(wchar_t wc, struct utf8_data *ud)
1.15 nicm 142: {
1.29 nicm 143: char s[MB_LEN_MAX];
144: int slen;
1.28 nicm 145:
146: slen = wctomb(s, wc);
147: if (slen <= 0 || slen > (int)sizeof ud->data)
1.23 nicm 148: return (UTF8_ERROR);
1.28 nicm 149:
150: memcpy(ud->data, s, slen);
151: ud->size = slen;
152:
153: ud->width = utf8_width(wc);
1.23 nicm 154: return (UTF8_DONE);
1.9 nicm 155: }
156:
157: /*
158: * Encode len characters from src into dst, which is guaranteed to have four
159: * bytes available for each character from src (for \abc or UTF-8) plus space
160: * for \0.
161: */
162: int
163: utf8_strvis(char *dst, const char *src, size_t len, int flag)
164: {
1.19 nicm 165: struct utf8_data ud;
1.9 nicm 166: const char *start, *end;
1.23 nicm 167: enum utf8_state more;
1.9 nicm 168: size_t i;
169:
170: start = dst;
171: end = src + len;
172:
173: while (src < end) {
1.23 nicm 174: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
175: while (++src < end && more == UTF8_MORE)
1.19 nicm 176: more = utf8_append(&ud, *src);
1.23 nicm 177: if (more == UTF8_DONE) {
1.9 nicm 178: /* UTF-8 character finished. */
1.19 nicm 179: for (i = 0; i < ud.size; i++)
180: *dst++ = ud.data[i];
1.9 nicm 181: continue;
182: }
1.23 nicm 183: /* Not a complete, valid UTF-8 character. */
184: src -= ud.have;
1.9 nicm 185: }
1.41 nicm 186: if (src[0] == '$' && src < end - 1) {
1.42 ! nicm 187: if (isalpha((u_char)src[1]) ||
! 188: src[1] == '_' ||
! 189: src[1] == '{')
1.41 nicm 190: *dst++ = '\\';
191: *dst++ = '$';
192: } else if (src < end - 1)
1.9 nicm 193: dst = vis(dst, src[0], flag, src[1]);
194: else if (src < end)
195: dst = vis(dst, src[0], flag, '\0');
196: src++;
197: }
198:
199: *dst = '\0';
200: return (dst - start);
1.35 nicm 201: }
202:
/*
 * Same as utf8_strvis() but the output buffer is allocated here: four bytes
 * per source byte (plus four) are reserved, the string is encoded, and the
 * buffer is then resized to the encoded length plus the terminator.
 *
 * The buffer is stored in *dst and the encoded length is returned.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen;
	char	*out;
	int	 used;

	srclen = strlen(src);

	out = xreallocarray(NULL, 4, srclen + 1);
	used = utf8_strvis(out, src, srclen, flag);

	*dst = xrealloc(out, used + 1);
	return (used);
}
216:
217: /* Does this string contain anything that isn't valid UTF-8? */
218: int
219: utf8_isvalid(const char *s)
220: {
221: struct utf8_data ud;
222: const char *end;
223: enum utf8_state more;
224:
225: end = s + strlen(s);
226: while (s < end) {
227: if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
228: while (++s < end && more == UTF8_MORE)
229: more = utf8_append(&ud, *s);
230: if (more == UTF8_DONE)
231: continue;
232: return (0);
233: }
234: if (*s < 0x20 || *s > 0x7e)
235: return (0);
236: s++;
237: }
238: return (1);
1.16 nicm 239: }
240:
241: /*
242: * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
243: * the returned string. Anything not valid printable ASCII or UTF-8 is
244: * stripped.
245: */
246: char *
247: utf8_sanitize(const char *src)
248: {
249: char *dst;
250: size_t n;
1.23 nicm 251: enum utf8_state more;
1.19 nicm 252: struct utf8_data ud;
1.16 nicm 253: u_int i;
254:
255: dst = NULL;
256:
257: n = 0;
258: while (*src != '\0') {
259: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 260: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
261: while (*++src != '\0' && more == UTF8_MORE)
1.19 nicm 262: more = utf8_append(&ud, *src);
1.23 nicm 263: if (more == UTF8_DONE) {
1.19 nicm 264: dst = xreallocarray(dst, n + ud.width,
1.16 nicm 265: sizeof *dst);
1.19 nicm 266: for (i = 0; i < ud.width; i++)
1.16 nicm 267: dst[n++] = '_';
268: continue;
269: }
1.19 nicm 270: src -= ud.have;
1.16 nicm 271: }
272: if (*src > 0x1f && *src < 0x7f)
1.21 nicm 273: dst[n++] = *src;
1.23 nicm 274: else
275: dst[n++] = '_';
1.16 nicm 276: src++;
277: }
278:
279: dst = xreallocarray(dst, n + 1, sizeof *dst);
280: dst[n] = '\0';
281: return (dst);
1.34 nicm 282: }
283:
284: /* Get UTF-8 buffer length. */
285: size_t
286: utf8_strlen(const struct utf8_data *s)
287: {
288: size_t i;
289:
290: for (i = 0; s[i].size != 0; i++)
291: /* nothing */;
292: return (i);
293: }
294:
295: /* Get UTF-8 string width. */
296: u_int
297: utf8_strwidth(const struct utf8_data *s, ssize_t n)
298: {
299: ssize_t i;
300: u_int width;
301:
302: width = 0;
303: for (i = 0; s[i].size != 0; i++) {
304: if (n != -1 && n == i)
305: break;
306: width += s[i].width;
307: }
308: return (width);
1.11 nicm 309: }
310:
311: /*
312: * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
313: * Caller frees.
314: */
315: struct utf8_data *
316: utf8_fromcstr(const char *src)
317: {
318: struct utf8_data *dst;
319: size_t n;
1.23 nicm 320: enum utf8_state more;
1.11 nicm 321:
322: dst = NULL;
323:
324: n = 0;
325: while (*src != '\0') {
1.12 nicm 326: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 327: if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
328: while (*++src != '\0' && more == UTF8_MORE)
1.11 nicm 329: more = utf8_append(&dst[n], *src);
1.23 nicm 330: if (more == UTF8_DONE) {
1.11 nicm 331: n++;
332: continue;
333: }
334: src -= dst[n].have;
335: }
1.23 nicm 336: utf8_set(&dst[n], *src);
337: n++;
1.11 nicm 338: src++;
339: }
340:
1.12 nicm 341: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.11 nicm 342: dst[n].size = 0;
343: return (dst);
344: }
345:
346: /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
347: char *
348: utf8_tocstr(struct utf8_data *src)
349: {
350: char *dst;
351: size_t n;
352:
353: dst = NULL;
354:
355: n = 0;
356: for(; src->size != 0; src++) {
1.12 nicm 357: dst = xreallocarray(dst, n + src->size, 1);
1.11 nicm 358: memcpy(dst + n, src->data, src->size);
359: n += src->size;
360: }
361:
1.12 nicm 362: dst = xreallocarray(dst, n + 1, 1);
1.11 nicm 363: dst[n] = '\0';
364: return (dst);
365: }
366:
367: /* Get width of UTF-8 string. */
368: u_int
369: utf8_cstrwidth(const char *s)
370: {
371: struct utf8_data tmp;
372: u_int width;
1.23 nicm 373: enum utf8_state more;
1.11 nicm 374:
375: width = 0;
376: while (*s != '\0') {
1.23 nicm 377: if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
378: while (*++s != '\0' && more == UTF8_MORE)
1.11 nicm 379: more = utf8_append(&tmp, *s);
1.23 nicm 380: if (more == UTF8_DONE) {
1.11 nicm 381: width += tmp.width;
382: continue;
383: }
384: s -= tmp.have;
385: }
1.23 nicm 386: if (*s > 0x1f && *s != 0x7f)
1.21 nicm 387: width++;
1.11 nicm 388: s++;
389: }
390: return (width);
1.18 nicm 391: }
392:
393: /* Pad UTF-8 string to width. Caller frees. */
394: char *
395: utf8_padcstr(const char *s, u_int width)
396: {
397: size_t slen;
398: char *out;
399: u_int n, i;
400:
401: n = utf8_cstrwidth(s);
402: if (n >= width)
403: return (xstrdup(s));
404:
405: slen = strlen(s);
406: out = xmalloc(slen + 1 + (width - n));
407: memcpy(out, s, slen);
408: for (i = n; i < width; i++)
409: out[slen++] = ' ';
410: out[slen] = '\0';
1.11 nicm 411: return (out);
1.1 nicm 412: }