Annotation of src/usr.bin/tmux/utf8.c, Revision 1.39
1.39 ! nicm 1: /* $OpenBSD: utf8.c,v 1.38 2017/06/04 09:02:36 nicm Exp $ */
1.1 nicm 2:
3: /*
1.26 nicm 4: * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
1.1 nicm 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15: * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16: * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18:
19: #include <sys/types.h>
20:
1.30 nicm 21: #include <errno.h>
1.11 nicm 22: #include <stdlib.h>
1.1 nicm 23: #include <string.h>
1.9 nicm 24: #include <vis.h>
1.28 nicm 25: #include <wchar.h>
1.1 nicm 26:
27: #include "tmux.h"
28:
1.29 nicm 29: static int utf8_width(wchar_t);
30:
1.11 nicm 31: /* Set a single character. */
32: void
1.19 nicm 33: utf8_set(struct utf8_data *ud, u_char ch)
1.11 nicm 34: {
1.33 nicm 35: static const struct utf8_data empty = { { 0 }, 1, 1, 1 };
1.20 nicm 36:
1.33 nicm 37: memcpy(ud, &empty, sizeof *ud);
1.19 nicm 38: *ud->data = ch;
1.20 nicm 39: }
40:
41: /* Copy UTF-8 character. */
42: void
43: utf8_copy(struct utf8_data *to, const struct utf8_data *from)
44: {
45: u_int i;
46:
47: memcpy(to, from, sizeof *to);
48:
49: for (i = to->size; i < sizeof to->data; i++)
50: to->data[i] = '\0';
1.11 nicm 51: }
52:
1.4 nicm 53: /*
54: * Open UTF-8 sequence.
55: *
56: * 11000010-11011111 C2-DF start of 2-byte sequence
57: * 11100000-11101111 E0-EF start of 3-byte sequence
58: * 11110000-11110100 F0-F4 start of 4-byte sequence
59: */
1.23 nicm 60: enum utf8_state
1.19 nicm 61: utf8_open(struct utf8_data *ud, u_char ch)
1.4 nicm 62: {
1.19 nicm 63: memset(ud, 0, sizeof *ud);
1.4 nicm 64: if (ch >= 0xc2 && ch <= 0xdf)
1.19 nicm 65: ud->size = 2;
1.4 nicm 66: else if (ch >= 0xe0 && ch <= 0xef)
1.19 nicm 67: ud->size = 3;
1.4 nicm 68: else if (ch >= 0xf0 && ch <= 0xf4)
1.19 nicm 69: ud->size = 4;
1.4 nicm 70: else
1.23 nicm 71: return (UTF8_ERROR);
1.19 nicm 72: utf8_append(ud, ch);
1.23 nicm 73: return (UTF8_MORE);
1.4 nicm 74: }
75:
1.23 nicm 76: /* Append character to UTF-8, closing if finished. */
77: enum utf8_state
1.19 nicm 78: utf8_append(struct utf8_data *ud, u_char ch)
1.4 nicm 79: {
1.29 nicm 80: wchar_t wc;
81: int width;
82:
1.19 nicm 83: if (ud->have >= ud->size)
1.4 nicm 84: fatalx("UTF-8 character overflow");
1.19 nicm 85: if (ud->size > sizeof ud->data)
1.4 nicm 86: fatalx("UTF-8 character size too large");
87:
1.21 nicm 88: if (ud->have != 0 && (ch & 0xc0) != 0x80)
89: ud->width = 0xff;
90:
1.19 nicm 91: ud->data[ud->have++] = ch;
92: if (ud->have != ud->size)
1.23 nicm 93: return (UTF8_MORE);
1.4 nicm 94:
1.21 nicm 95: if (ud->width == 0xff)
1.23 nicm 96: return (UTF8_ERROR);
1.29 nicm 97:
98: if (utf8_combine(ud, &wc) != UTF8_DONE)
99: return (UTF8_ERROR);
100: if ((width = utf8_width(wc)) < 0)
101: return (UTF8_ERROR);
102: ud->width = width;
103:
1.23 nicm 104: return (UTF8_DONE);
1.1 nicm 105: }
106:
/*
 * Return the column width wcwidth() reports for wc. If that value is
 * negative or larger than 0xff, log it and return -1 instead.
 */
static int
utf8_width(wchar_t wc)
{
	int	cells;

	cells = wcwidth(wc);
	if (cells >= 0 && cells <= 0xff)
		return (cells);

	log_debug("Unicode %04lx, wcwidth() %d", (long)wc, cells);
	return (-1);
}
120:
1.28 nicm 121: /* Combine UTF-8 into Unicode. */
1.29 nicm 122: enum utf8_state
123: utf8_combine(const struct utf8_data *ud, wchar_t *wc)
1.1 nicm 124: {
1.29 nicm 125: switch (mbtowc(wc, ud->data, ud->size)) {
126: case -1:
1.30 nicm 127: log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
128: errno);
1.29 nicm 129: mbtowc(NULL, NULL, MB_CUR_MAX);
130: return (UTF8_ERROR);
131: case 0:
132: return (UTF8_ERROR);
133: default:
134: return (UTF8_DONE);
135: }
1.15 nicm 136: }
137:
1.28 nicm 138: /* Split Unicode into UTF-8. */
1.23 nicm 139: enum utf8_state
1.28 nicm 140: utf8_split(wchar_t wc, struct utf8_data *ud)
1.15 nicm 141: {
1.29 nicm 142: char s[MB_LEN_MAX];
143: int slen;
1.28 nicm 144:
145: slen = wctomb(s, wc);
146: if (slen <= 0 || slen > (int)sizeof ud->data)
1.23 nicm 147: return (UTF8_ERROR);
1.28 nicm 148:
149: memcpy(ud->data, s, slen);
150: ud->size = slen;
151:
152: ud->width = utf8_width(wc);
1.23 nicm 153: return (UTF8_DONE);
1.9 nicm 154: }
155:
156: /*
157: * Encode len characters from src into dst, which is guaranteed to have four
158: * bytes available for each character from src (for \abc or UTF-8) plus space
159: * for \0.
160: */
161: int
162: utf8_strvis(char *dst, const char *src, size_t len, int flag)
163: {
1.19 nicm 164: struct utf8_data ud;
1.9 nicm 165: const char *start, *end;
1.23 nicm 166: enum utf8_state more;
1.9 nicm 167: size_t i;
168:
169: start = dst;
170: end = src + len;
171:
172: while (src < end) {
1.23 nicm 173: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
174: while (++src < end && more == UTF8_MORE)
1.19 nicm 175: more = utf8_append(&ud, *src);
1.23 nicm 176: if (more == UTF8_DONE) {
1.9 nicm 177: /* UTF-8 character finished. */
1.19 nicm 178: for (i = 0; i < ud.size; i++)
179: *dst++ = ud.data[i];
1.9 nicm 180: continue;
181: }
1.23 nicm 182: /* Not a complete, valid UTF-8 character. */
183: src -= ud.have;
1.9 nicm 184: }
185: if (src < end - 1)
186: dst = vis(dst, src[0], flag, src[1]);
187: else if (src < end)
188: dst = vis(dst, src[0], flag, '\0');
189: src++;
190: }
191:
192: *dst = '\0';
193: return (dst - start);
1.35 nicm 194: }
195:
/*
 * Like utf8_strvis() but allocates the output. A worst-case buffer of four
 * bytes per input byte (including the terminator) is allocated, filled, and
 * then resized to the length actually used plus one. The buffer is stored
 * in *dst and the encoded length (without the \0) is returned.
 */
int
utf8_stravis(char **dst, const char *src, int flag)
{
	size_t	 srclen;
	char	*out;
	int	 used;

	srclen = strlen(src);

	out = xreallocarray(NULL, 4, srclen + 1);
	used = utf8_strvis(out, src, srclen, flag);

	*dst = xrealloc(out, used + 1);
	return (used);
}
209:
210: /* Does this string contain anything that isn't valid UTF-8? */
211: int
212: utf8_isvalid(const char *s)
213: {
214: struct utf8_data ud;
215: const char *end;
216: enum utf8_state more;
217:
218: end = s + strlen(s);
219: while (s < end) {
220: if ((more = utf8_open(&ud, *s)) == UTF8_MORE) {
221: while (++s < end && more == UTF8_MORE)
222: more = utf8_append(&ud, *s);
223: if (more == UTF8_DONE)
224: continue;
225: return (0);
226: }
227: if (*s < 0x20 || *s > 0x7e)
228: return (0);
229: s++;
230: }
231: return (1);
1.16 nicm 232: }
233:
234: /*
235: * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
236: * the returned string. Anything not valid printable ASCII or UTF-8 is
237: * stripped.
238: */
239: char *
240: utf8_sanitize(const char *src)
241: {
242: char *dst;
243: size_t n;
1.23 nicm 244: enum utf8_state more;
1.19 nicm 245: struct utf8_data ud;
1.16 nicm 246: u_int i;
247:
248: dst = NULL;
249:
250: n = 0;
251: while (*src != '\0') {
252: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 253: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
254: while (*++src != '\0' && more == UTF8_MORE)
1.19 nicm 255: more = utf8_append(&ud, *src);
1.23 nicm 256: if (more == UTF8_DONE) {
1.19 nicm 257: dst = xreallocarray(dst, n + ud.width,
1.16 nicm 258: sizeof *dst);
1.19 nicm 259: for (i = 0; i < ud.width; i++)
1.16 nicm 260: dst[n++] = '_';
261: continue;
262: }
1.19 nicm 263: src -= ud.have;
1.16 nicm 264: }
265: if (*src > 0x1f && *src < 0x7f)
1.21 nicm 266: dst[n++] = *src;
1.23 nicm 267: else
268: dst[n++] = '_';
1.16 nicm 269: src++;
270: }
271:
272: dst = xreallocarray(dst, n + 1, sizeof *dst);
273: dst[n] = '\0';
274: return (dst);
1.34 nicm 275: }
276:
277: /* Get UTF-8 buffer length. */
278: size_t
279: utf8_strlen(const struct utf8_data *s)
280: {
281: size_t i;
282:
283: for (i = 0; s[i].size != 0; i++)
284: /* nothing */;
285: return (i);
286: }
287:
288: /* Get UTF-8 string width. */
289: u_int
290: utf8_strwidth(const struct utf8_data *s, ssize_t n)
291: {
292: ssize_t i;
293: u_int width;
294:
295: width = 0;
296: for (i = 0; s[i].size != 0; i++) {
297: if (n != -1 && n == i)
298: break;
299: width += s[i].width;
300: }
301: return (width);
1.11 nicm 302: }
303:
304: /*
305: * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
306: * Caller frees.
307: */
308: struct utf8_data *
309: utf8_fromcstr(const char *src)
310: {
311: struct utf8_data *dst;
312: size_t n;
1.23 nicm 313: enum utf8_state more;
1.11 nicm 314:
315: dst = NULL;
316:
317: n = 0;
318: while (*src != '\0') {
1.12 nicm 319: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 320: if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
321: while (*++src != '\0' && more == UTF8_MORE)
1.11 nicm 322: more = utf8_append(&dst[n], *src);
1.23 nicm 323: if (more == UTF8_DONE) {
1.11 nicm 324: n++;
325: continue;
326: }
327: src -= dst[n].have;
328: }
1.23 nicm 329: utf8_set(&dst[n], *src);
330: n++;
1.11 nicm 331: src++;
332: }
333:
1.12 nicm 334: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.11 nicm 335: dst[n].size = 0;
336: return (dst);
337: }
338:
339: /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
340: char *
341: utf8_tocstr(struct utf8_data *src)
342: {
343: char *dst;
344: size_t n;
345:
346: dst = NULL;
347:
348: n = 0;
349: for(; src->size != 0; src++) {
1.12 nicm 350: dst = xreallocarray(dst, n + src->size, 1);
1.11 nicm 351: memcpy(dst + n, src->data, src->size);
352: n += src->size;
353: }
354:
1.12 nicm 355: dst = xreallocarray(dst, n + 1, 1);
1.11 nicm 356: dst[n] = '\0';
357: return (dst);
358: }
359:
360: /* Get width of UTF-8 string. */
361: u_int
362: utf8_cstrwidth(const char *s)
363: {
364: struct utf8_data tmp;
365: u_int width;
1.23 nicm 366: enum utf8_state more;
1.11 nicm 367:
368: width = 0;
369: while (*s != '\0') {
1.23 nicm 370: if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
371: while (*++s != '\0' && more == UTF8_MORE)
1.11 nicm 372: more = utf8_append(&tmp, *s);
1.23 nicm 373: if (more == UTF8_DONE) {
1.11 nicm 374: width += tmp.width;
375: continue;
376: }
377: s -= tmp.have;
378: }
1.23 nicm 379: if (*s > 0x1f && *s != 0x7f)
1.21 nicm 380: width++;
1.11 nicm 381: s++;
382: }
383: return (width);
384: }
385:
386: /* Trim UTF-8 string to width. Caller frees. */
387: char *
388: utf8_trimcstr(const char *s, u_int width)
389: {
390: struct utf8_data *tmp, *next;
391: char *out;
392: u_int at;
393:
394: tmp = utf8_fromcstr(s);
395:
396: at = 0;
397: for (next = tmp; next->size != 0; next++) {
398: if (at + next->width > width) {
399: next->size = 0;
400: break;
401: }
402: at += next->width;
403: }
404:
405: out = utf8_tocstr(tmp);
1.27 nicm 406: free(tmp);
407: return (out);
408: }
409:
410: /* Trim UTF-8 string to width. Caller frees. */
411: char *
412: utf8_rtrimcstr(const char *s, u_int width)
413: {
414: struct utf8_data *tmp, *next, *end;
415: char *out;
416: u_int at;
417:
418: tmp = utf8_fromcstr(s);
419:
420: for (end = tmp; end->size != 0; end++)
421: /* nothing */;
422: if (end == tmp) {
423: free(tmp);
424: return (xstrdup(""));
425: }
426: next = end - 1;
427:
428: at = 0;
1.37 nicm 429: for (;;) {
1.27 nicm 430: if (at + next->width > width) {
431: next++;
432: break;
433: }
434: at += next->width;
435:
436: if (next == tmp)
437: break;
438: next--;
439: }
440:
441: out = utf8_tocstr(next);
1.11 nicm 442: free(tmp);
1.18 nicm 443: return (out);
444: }
445:
446: /* Pad UTF-8 string to width. Caller frees. */
447: char *
448: utf8_padcstr(const char *s, u_int width)
449: {
450: size_t slen;
451: char *out;
452: u_int n, i;
453:
454: n = utf8_cstrwidth(s);
455: if (n >= width)
456: return (xstrdup(s));
457:
458: slen = strlen(s);
459: out = xmalloc(slen + 1 + (width - n));
460: memcpy(out, s, slen);
461: for (i = n; i < width; i++)
462: out[slen++] = ' ';
463: out[slen] = '\0';
1.11 nicm 464: return (out);
1.1 nicm 465: }