Annotation of src/usr.bin/tmux/utf8.c, Revision 1.41
1.41 ! nicm 1: /* $OpenBSD: utf8.c,v 1.40 2019/03/18 20:53:33 nicm Exp $ */
1.1 nicm 2:
3: /*
1.26 nicm 4: * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
1.1 nicm 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15: * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16: * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18:
19: #include <sys/types.h>
20:
1.41 ! nicm 21: #include <ctype.h>
1.30 nicm 22: #include <errno.h>
1.11 nicm 23: #include <stdlib.h>
1.1 nicm 24: #include <string.h>
1.9 nicm 25: #include <vis.h>
1.28 nicm 26: #include <wchar.h>
1.1 nicm 27:
28: #include "tmux.h"
29:
/* File-local helper: display width of one Unicode character, -1 if unusable. */
static int	utf8_width(wchar_t wc);
31:
1.11 nicm 32: /* Set a single character. */
33: void
1.19 nicm 34: utf8_set(struct utf8_data *ud, u_char ch)
1.11 nicm 35: {
1.33 nicm 36: static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
1.20 nicm 37:
1.33 nicm 38: memcpy(ud, &empty, sizeof *ud);
1.19 nicm 39: *ud->data = ch;
1.20 nicm 40: }
41:
42: /* Copy UTF-8 character. */
43: void
44: utf8_copy(struct utf8_data *to, const struct utf8_data *from)
45: {
46: u_int i;
47:
48: memcpy(to, from, sizeof *to);
49:
50: for (i = to->size; i < sizeof to->data; i++)
51: to->data[i] = '\0';
1.11 nicm 52: }
53:
1.4 nicm 54: /*
55: * Open UTF-8 sequence.
56: *
57: * 11000010-11011111 C2-DF start of 2-byte sequence
58: * 11100000-11101111 E0-EF start of 3-byte sequence
59: * 11110000-11110100 F0-F4 start of 4-byte sequence
60: */
1.23 nicm 61: enum utf8_state
1.19 nicm 62: utf8_open(struct utf8_data *ud, u_char ch)
1.4 nicm 63: {
1.19 nicm 64: memset(ud, 0, sizeof *ud);
1.4 nicm 65: if (ch >= 0xc2 && ch <= 0xdf)
1.19 nicm 66: ud->size = 2;
1.4 nicm 67: else if (ch >= 0xe0 && ch <= 0xef)
1.19 nicm 68: ud->size = 3;
1.4 nicm 69: else if (ch >= 0xf0 && ch <= 0xf4)
1.19 nicm 70: ud->size = 4;
1.4 nicm 71: else
1.23 nicm 72: return (UTF8_ERROR);
1.19 nicm 73: utf8_append(ud, ch);
1.23 nicm 74: return (UTF8_MORE);
1.4 nicm 75: }
76:
1.23 nicm 77: /* Append character to UTF-8, closing if finished. */
78: enum utf8_state
1.19 nicm 79: utf8_append(struct utf8_data *ud, u_char ch)
1.4 nicm 80: {
1.29 nicm 81: wchar_t wc;
82: int width;
83:
1.19 nicm 84: if (ud->have >= ud->size)
1.4 nicm 85: fatalx("UTF-8 character overflow");
1.19 nicm 86: if (ud->size > sizeof ud->data)
1.4 nicm 87: fatalx("UTF-8 character size too large");
88:
1.21 nicm 89: if (ud->have != 0 && (ch & 0xc0) != 0x80)
90: ud->width = 0xff;
91:
1.19 nicm 92: ud->data[ud->have++] = ch;
93: if (ud->have != ud->size)
1.23 nicm 94: return (UTF8_MORE);
1.4 nicm 95:
1.21 nicm 96: if (ud->width == 0xff)
1.23 nicm 97: return (UTF8_ERROR);
1.29 nicm 98:
99: if (utf8_combine(ud, &wc) != UTF8_DONE)
100: return (UTF8_ERROR);
101: if ((width = utf8_width(wc)) < 0)
102: return (UTF8_ERROR);
103: ud->width = width;
104:
1.23 nicm 105: return (UTF8_DONE);
1.1 nicm 106: }
107:
/*
 * Width of a Unicode character as reported by wcwidth(). A result outside
 * 0-0xff (including wcwidth()'s own negative result) is logged and turned
 * into -1.
 */
static int
utf8_width(wchar_t wc)
{
	int	cells = wcwidth(wc);

	if (cells >= 0 && cells <= 0xff)
		return (cells);

	log_debug("Unicode %04lx, wcwidth() %d", (long)wc, cells);
	return (-1);
}
121:
1.28 nicm 122: /* Combine UTF-8 into Unicode. */
1.29 nicm 123: enum utf8_state
124: utf8_combine(const struct utf8_data *ud, wchar_t *wc)
1.1 nicm 125: {
1.29 nicm 126: switch (mbtowc(wc, ud->data, ud->size)) {
127: case -1:
1.30 nicm 128: log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
129: errno);
1.29 nicm 130: mbtowc(NULL, NULL, MB_CUR_MAX);
131: return (UTF8_ERROR);
132: case 0:
133: return (UTF8_ERROR);
134: default:
135: return (UTF8_DONE);
136: }
1.15 nicm 137: }
138:
1.28 nicm 139: /* Split Unicode into UTF-8. */
1.23 nicm 140: enum utf8_state
1.28 nicm 141: utf8_split(wchar_t wc, struct utf8_data *ud)
1.15 nicm 142: {
1.29 nicm 143: char s[MB_LEN_MAX];
144: int slen;
1.28 nicm 145:
146: slen = wctomb(s, wc);
147: if (slen <= 0 || slen > (int)sizeof ud->data)
1.23 nicm 148: return (UTF8_ERROR);
1.28 nicm 149:
150: memcpy(ud->data, s, slen);
151: ud->size = slen;
152:
153: ud->width = utf8_width(wc);
1.23 nicm 154: return (UTF8_DONE);
1.9 nicm 155: }
156:
157: /*
158: * Encode len characters from src into dst, which is guaranteed to have four
159: * bytes available for each character from src (for \abc or UTF-8) plus space
160: * for \0.
161: */
162: int
163: utf8_strvis(char *dst, const char *src, size_t len, int flag)
164: {
1.19 nicm 165: struct utf8_data ud;
1.9 nicm 166: const char *start, *end;
1.23 nicm 167: enum utf8_state more;
1.9 nicm 168: size_t i;
169:
170: start = dst;
171: end = src + len;
172:
173: while (src < end) {
1.23 nicm 174: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
175: while (++src < end && more == UTF8_MORE)
1.19 nicm 176: more = utf8_append(&ud, *src);
1.23 nicm 177: if (more == UTF8_DONE) {
1.9 nicm 178: /* UTF-8 character finished. */
1.19 nicm 179: for (i = 0; i < ud.size; i++)
180: *dst++ = ud.data[i];
1.9 nicm 181: continue;
182: }
1.23 nicm 183: /* Not a complete, valid UTF-8 character. */
184: src -= ud.have;
1.9 nicm 185: }
1.41 ! nicm 186: if (src[0] == '$' && src < end - 1) {
! 187: if (isalpha((u_char)src[1]) || src[1] == '_')
! 188: *dst++ = '\\';
! 189: *dst++ = '$';
! 190: } else if (src < end - 1)
1.9 nicm 191: dst = vis(dst, src[0], flag, src[1]);
192: else if (src < end)
193: dst = vis(dst, src[0], flag, '\0');
194: src++;
195: }
196:
197: *dst = '\0';
198: return (dst - start);
1.35 nicm 199: }
200:
/*
 * Like utf8_strvis() but allocates the output. A buffer of four bytes per
 * input byte (including the terminator) is encoded into, then shrunk to fit
 * and stored in *dst. Returns the encoded length without the terminator.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen = strlen(src);
	char	*out;
	int	 used;

	out = xreallocarray(NULL, 4, srclen + 1);
	used = utf8_strvis(out, src, srclen, flag);

	*dst = xrealloc(out, used + 1);
	return (used);
}
214:
215: /* Does this string contain anything that isn't valid UTF-8? */
216: int
217: utf8_isvalid(const char *s)
218: {
219: struct utf8_data ud;
220: const char *end;
221: enum utf8_state more;
222:
223: end = s + strlen(s);
224: while (s < end) {
225: if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
226: while (++s < end && more == UTF8_MORE)
227: more = utf8_append(&ud, *s);
228: if (more == UTF8_DONE)
229: continue;
230: return (0);
231: }
232: if (*s < 0x20 || *s > 0x7e)
233: return (0);
234: s++;
235: }
236: return (1);
1.16 nicm 237: }
238:
239: /*
240: * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
241: * the returned string. Anything not valid printable ASCII or UTF-8 is
242: * stripped.
243: */
244: char *
245: utf8_sanitize(const char *src)
246: {
247: char *dst;
248: size_t n;
1.23 nicm 249: enum utf8_state more;
1.19 nicm 250: struct utf8_data ud;
1.16 nicm 251: u_int i;
252:
253: dst = NULL;
254:
255: n = 0;
256: while (*src != '\0') {
257: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 258: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
259: while (*++src != '\0' && more == UTF8_MORE)
1.19 nicm 260: more = utf8_append(&ud, *src);
1.23 nicm 261: if (more == UTF8_DONE) {
1.19 nicm 262: dst = xreallocarray(dst, n + ud.width,
1.16 nicm 263: sizeof *dst);
1.19 nicm 264: for (i = 0; i < ud.width; i++)
1.16 nicm 265: dst[n++] = '_';
266: continue;
267: }
1.19 nicm 268: src -= ud.have;
1.16 nicm 269: }
270: if (*src > 0x1f && *src < 0x7f)
1.21 nicm 271: dst[n++] = *src;
1.23 nicm 272: else
273: dst[n++] = '_';
1.16 nicm 274: src++;
275: }
276:
277: dst = xreallocarray(dst, n + 1, sizeof *dst);
278: dst[n] = '\0';
279: return (dst);
1.34 nicm 280: }
281:
282: /* Get UTF-8 buffer length. */
283: size_t
284: utf8_strlen(const struct utf8_data *s)
285: {
286: size_t i;
287:
288: for (i = 0; s[i].size != 0; i++)
289: /* nothing */;
290: return (i);
291: }
292:
293: /* Get UTF-8 string width. */
294: u_int
295: utf8_strwidth(const struct utf8_data *s, ssize_t n)
296: {
297: ssize_t i;
298: u_int width;
299:
300: width = 0;
301: for (i = 0; s[i].size != 0; i++) {
302: if (n != -1 && n == i)
303: break;
304: width += s[i].width;
305: }
306: return (width);
1.11 nicm 307: }
308:
309: /*
310: * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
311: * Caller frees.
312: */
313: struct utf8_data *
314: utf8_fromcstr(const char *src)
315: {
316: struct utf8_data *dst;
317: size_t n;
1.23 nicm 318: enum utf8_state more;
1.11 nicm 319:
320: dst = NULL;
321:
322: n = 0;
323: while (*src != '\0') {
1.12 nicm 324: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 325: if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
326: while (*++src != '\0' && more == UTF8_MORE)
1.11 nicm 327: more = utf8_append(&dst[n], *src);
1.23 nicm 328: if (more == UTF8_DONE) {
1.11 nicm 329: n++;
330: continue;
331: }
332: src -= dst[n].have;
333: }
1.23 nicm 334: utf8_set(&dst[n], *src);
335: n++;
1.11 nicm 336: src++;
337: }
338:
1.12 nicm 339: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.11 nicm 340: dst[n].size = 0;
341: return (dst);
342: }
343:
344: /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
345: char *
346: utf8_tocstr(struct utf8_data *src)
347: {
348: char *dst;
349: size_t n;
350:
351: dst = NULL;
352:
353: n = 0;
354: for(; src->size != 0; src++) {
1.12 nicm 355: dst = xreallocarray(dst, n + src->size, 1);
1.11 nicm 356: memcpy(dst + n, src->data, src->size);
357: n += src->size;
358: }
359:
1.12 nicm 360: dst = xreallocarray(dst, n + 1, 1);
1.11 nicm 361: dst[n] = '\0';
362: return (dst);
363: }
364:
365: /* Get width of UTF-8 string. */
366: u_int
367: utf8_cstrwidth(const char *s)
368: {
369: struct utf8_data tmp;
370: u_int width;
1.23 nicm 371: enum utf8_state more;
1.11 nicm 372:
373: width = 0;
374: while (*s != '\0') {
1.23 nicm 375: if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
376: while (*++s != '\0' && more == UTF8_MORE)
1.11 nicm 377: more = utf8_append(&tmp, *s);
1.23 nicm 378: if (more == UTF8_DONE) {
1.11 nicm 379: width += tmp.width;
380: continue;
381: }
382: s -= tmp.have;
383: }
1.23 nicm 384: if (*s > 0x1f && *s != 0x7f)
1.21 nicm 385: width++;
1.11 nicm 386: s++;
387: }
388: return (width);
1.18 nicm 389: }
390:
391: /* Pad UTF-8 string to width. Caller frees. */
392: char *
393: utf8_padcstr(const char *s, u_int width)
394: {
395: size_t slen;
396: char *out;
397: u_int n, i;
398:
399: n = utf8_cstrwidth(s);
400: if (n >= width)
401: return (xstrdup(s));
402:
403: slen = strlen(s);
404: out = xmalloc(slen + 1 + (width - n));
405: memcpy(out, s, slen);
406: for (i = n; i < width; i++)
407: out[slen++] = ' ';
408: out[slen] = '\0';
1.11 nicm 409: return (out);
1.1 nicm 410: }