Annotation of src/usr.bin/tmux/utf8.c, Revision 1.29
1.29 ! nicm 1: /* $OpenBSD: utf8.c,v 1.28 2016/03/01 12:02:08 nicm Exp $ */
1.1 nicm 2:
3: /*
1.26 nicm 4: * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
1.1 nicm 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15: * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16: * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18:
19: #include <sys/types.h>
20:
1.11 nicm 21: #include <stdlib.h>
1.1 nicm 22: #include <string.h>
1.9 nicm 23: #include <vis.h>
1.28 nicm 24: #include <wchar.h>
1.1 nicm 25:
26: #include "tmux.h"
27:
1.29 ! nicm 28: static int utf8_width(wchar_t);
! 29:
1.11 nicm 30: /* Set a single character. */
31: void
1.19 nicm 32: utf8_set(struct utf8_data *ud, u_char ch)
1.11 nicm 33: {
1.20 nicm 34: u_int i;
35:
1.19 nicm 36: *ud->data = ch;
1.25 nicm 37: ud->have = 1;
1.19 nicm 38: ud->size = 1;
1.11 nicm 39:
1.19 nicm 40: ud->width = 1;
1.20 nicm 41:
42: for (i = ud->size; i < sizeof ud->data; i++)
43: ud->data[i] = '\0';
44: }
45:
46: /* Copy UTF-8 character. */
47: void
48: utf8_copy(struct utf8_data *to, const struct utf8_data *from)
49: {
50: u_int i;
51:
52: memcpy(to, from, sizeof *to);
53:
54: for (i = to->size; i < sizeof to->data; i++)
55: to->data[i] = '\0';
1.11 nicm 56: }
57:
1.4 nicm 58: /*
59: * Open UTF-8 sequence.
60: *
61: * 11000010-11011111 C2-DF start of 2-byte sequence
62: * 11100000-11101111 E0-EF start of 3-byte sequence
63: * 11110000-11110100 F0-F4 start of 4-byte sequence
64: */
1.23 nicm 65: enum utf8_state
1.19 nicm 66: utf8_open(struct utf8_data *ud, u_char ch)
1.4 nicm 67: {
1.19 nicm 68: memset(ud, 0, sizeof *ud);
1.4 nicm 69: if (ch >= 0xc2 && ch <= 0xdf)
1.19 nicm 70: ud->size = 2;
1.4 nicm 71: else if (ch >= 0xe0 && ch <= 0xef)
1.19 nicm 72: ud->size = 3;
1.4 nicm 73: else if (ch >= 0xf0 && ch <= 0xf4)
1.19 nicm 74: ud->size = 4;
1.4 nicm 75: else
1.23 nicm 76: return (UTF8_ERROR);
1.19 nicm 77: utf8_append(ud, ch);
1.23 nicm 78: return (UTF8_MORE);
1.4 nicm 79: }
80:
1.23 nicm 81: /* Append character to UTF-8, closing if finished. */
82: enum utf8_state
1.19 nicm 83: utf8_append(struct utf8_data *ud, u_char ch)
1.4 nicm 84: {
1.29 ! nicm 85: wchar_t wc;
! 86: int width;
! 87:
1.19 nicm 88: if (ud->have >= ud->size)
1.4 nicm 89: fatalx("UTF-8 character overflow");
1.19 nicm 90: if (ud->size > sizeof ud->data)
1.4 nicm 91: fatalx("UTF-8 character size too large");
92:
1.21 nicm 93: if (ud->have != 0 && (ch & 0xc0) != 0x80)
94: ud->width = 0xff;
95:
1.19 nicm 96: ud->data[ud->have++] = ch;
97: if (ud->have != ud->size)
1.23 nicm 98: return (UTF8_MORE);
1.4 nicm 99:
1.21 nicm 100: if (ud->width == 0xff)
1.23 nicm 101: return (UTF8_ERROR);
1.29 ! nicm 102:
! 103: if (utf8_combine(ud, &wc) != UTF8_DONE)
! 104: return (UTF8_ERROR);
! 105: if ((width = utf8_width(wc)) < 0)
! 106: return (UTF8_ERROR);
! 107: ud->width = width;
! 108:
1.23 nicm 109: return (UTF8_DONE);
1.1 nicm 110: }
111:
/*
 * Get the display width of a wide character as reported by wcwidth().
 * Returns -1 if wcwidth() gives a negative result or one larger than 0xff.
 */
static int
utf8_width(wchar_t wc)
{
	const int	cells = wcwidth(wc);

	return ((cells >= 0 && cells <= 0xff) ? cells : -1);
}
123:
1.28 nicm 124: /* Combine UTF-8 into Unicode. */
1.29 ! nicm 125: enum utf8_state
! 126: utf8_combine(const struct utf8_data *ud, wchar_t *wc)
1.1 nicm 127: {
1.29 ! nicm 128: switch (mbtowc(wc, ud->data, ud->size)) {
! 129: case -1:
! 130: mbtowc(NULL, NULL, MB_CUR_MAX);
! 131: return (UTF8_ERROR);
! 132: case 0:
! 133: return (UTF8_ERROR);
! 134: default:
! 135: return (UTF8_DONE);
! 136: }
1.15 nicm 137: }
138:
1.28 nicm 139: /* Split Unicode into UTF-8. */
1.23 nicm 140: enum utf8_state
1.28 nicm 141: utf8_split(wchar_t wc, struct utf8_data *ud)
1.15 nicm 142: {
1.29 ! nicm 143: char s[MB_LEN_MAX];
! 144: int slen;
1.28 nicm 145:
146: slen = wctomb(s, wc);
147: if (slen <= 0 || slen > (int)sizeof ud->data)
1.23 nicm 148: return (UTF8_ERROR);
1.28 nicm 149:
150: memcpy(ud->data, s, slen);
151: ud->size = slen;
152:
153: ud->width = utf8_width(wc);
1.23 nicm 154: return (UTF8_DONE);
1.9 nicm 155: }
156:
157: /*
158: * Encode len characters from src into dst, which is guaranteed to have four
159: * bytes available for each character from src (for \abc or UTF-8) plus space
160: * for \0.
161: */
162: int
163: utf8_strvis(char *dst, const char *src, size_t len, int flag)
164: {
1.19 nicm 165: struct utf8_data ud;
1.9 nicm 166: const char *start, *end;
1.23 nicm 167: enum utf8_state more;
1.9 nicm 168: size_t i;
169:
170: start = dst;
171: end = src + len;
172:
173: while (src < end) {
1.23 nicm 174: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
175: while (++src < end && more == UTF8_MORE)
1.19 nicm 176: more = utf8_append(&ud, *src);
1.23 nicm 177: if (more == UTF8_DONE) {
1.9 nicm 178: /* UTF-8 character finished. */
1.19 nicm 179: for (i = 0; i < ud.size; i++)
180: *dst++ = ud.data[i];
1.9 nicm 181: continue;
182: }
1.23 nicm 183: /* Not a complete, valid UTF-8 character. */
184: src -= ud.have;
1.9 nicm 185: }
186: if (src < end - 1)
187: dst = vis(dst, src[0], flag, src[1]);
188: else if (src < end)
189: dst = vis(dst, src[0], flag, '\0');
190: src++;
191: }
192:
193: *dst = '\0';
194: return (dst - start);
1.16 nicm 195: }
196:
197: /*
198: * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
199: * the returned string. Anything not valid printable ASCII or UTF-8 is
200: * stripped.
201: */
202: char *
203: utf8_sanitize(const char *src)
204: {
205: char *dst;
206: size_t n;
1.23 nicm 207: enum utf8_state more;
1.19 nicm 208: struct utf8_data ud;
1.16 nicm 209: u_int i;
210:
211: dst = NULL;
212:
213: n = 0;
214: while (*src != '\0') {
215: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 216: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
217: while (*++src != '\0' && more == UTF8_MORE)
1.19 nicm 218: more = utf8_append(&ud, *src);
1.23 nicm 219: if (more == UTF8_DONE) {
1.19 nicm 220: dst = xreallocarray(dst, n + ud.width,
1.16 nicm 221: sizeof *dst);
1.19 nicm 222: for (i = 0; i < ud.width; i++)
1.16 nicm 223: dst[n++] = '_';
224: continue;
225: }
1.19 nicm 226: src -= ud.have;
1.16 nicm 227: }
228: if (*src > 0x1f && *src < 0x7f)
1.21 nicm 229: dst[n++] = *src;
1.23 nicm 230: else
231: dst[n++] = '_';
1.16 nicm 232: src++;
233: }
234:
235: dst = xreallocarray(dst, n + 1, sizeof *dst);
236: dst[n] = '\0';
237: return (dst);
1.11 nicm 238: }
239:
240: /*
241: * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
242: * Caller frees.
243: */
244: struct utf8_data *
245: utf8_fromcstr(const char *src)
246: {
247: struct utf8_data *dst;
248: size_t n;
1.23 nicm 249: enum utf8_state more;
1.11 nicm 250:
251: dst = NULL;
252:
253: n = 0;
254: while (*src != '\0') {
1.12 nicm 255: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 256: if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
257: while (*++src != '\0' && more == UTF8_MORE)
1.11 nicm 258: more = utf8_append(&dst[n], *src);
1.23 nicm 259: if (more == UTF8_DONE) {
1.11 nicm 260: n++;
261: continue;
262: }
263: src -= dst[n].have;
264: }
1.23 nicm 265: utf8_set(&dst[n], *src);
266: n++;
1.11 nicm 267: src++;
268: }
269:
1.12 nicm 270: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.11 nicm 271: dst[n].size = 0;
272: return (dst);
273: }
274:
275: /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
276: char *
277: utf8_tocstr(struct utf8_data *src)
278: {
279: char *dst;
280: size_t n;
281:
282: dst = NULL;
283:
284: n = 0;
285: for(; src->size != 0; src++) {
1.12 nicm 286: dst = xreallocarray(dst, n + src->size, 1);
1.11 nicm 287: memcpy(dst + n, src->data, src->size);
288: n += src->size;
289: }
290:
1.12 nicm 291: dst = xreallocarray(dst, n + 1, 1);
1.11 nicm 292: dst[n] = '\0';
293: return (dst);
294: }
295:
296: /* Get width of UTF-8 string. */
297: u_int
298: utf8_cstrwidth(const char *s)
299: {
300: struct utf8_data tmp;
301: u_int width;
1.23 nicm 302: enum utf8_state more;
1.11 nicm 303:
304: width = 0;
305: while (*s != '\0') {
1.23 nicm 306: if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
307: while (*++s != '\0' && more == UTF8_MORE)
1.11 nicm 308: more = utf8_append(&tmp, *s);
1.23 nicm 309: if (more == UTF8_DONE) {
1.11 nicm 310: width += tmp.width;
311: continue;
312: }
313: s -= tmp.have;
314: }
1.23 nicm 315: if (*s > 0x1f && *s != 0x7f)
1.21 nicm 316: width++;
1.11 nicm 317: s++;
318: }
319: return (width);
320: }
321:
322: /* Trim UTF-8 string to width. Caller frees. */
323: char *
324: utf8_trimcstr(const char *s, u_int width)
325: {
326: struct utf8_data *tmp, *next;
327: char *out;
328: u_int at;
329:
330: tmp = utf8_fromcstr(s);
331:
332: at = 0;
333: for (next = tmp; next->size != 0; next++) {
334: if (at + next->width > width) {
335: next->size = 0;
336: break;
337: }
338: at += next->width;
339: }
340:
341: out = utf8_tocstr(tmp);
1.27 nicm 342: free(tmp);
343: return (out);
344: }
345:
346: /* Trim UTF-8 string to width. Caller frees. */
347: char *
348: utf8_rtrimcstr(const char *s, u_int width)
349: {
350: struct utf8_data *tmp, *next, *end;
351: char *out;
352: u_int at;
353:
354: tmp = utf8_fromcstr(s);
355:
356: for (end = tmp; end->size != 0; end++)
357: /* nothing */;
358: if (end == tmp) {
359: free(tmp);
360: return (xstrdup(""));
361: }
362: next = end - 1;
363:
364: at = 0;
365: for (;;)
366: {
367: if (at + next->width > width) {
368: next++;
369: break;
370: }
371: at += next->width;
372:
373: if (next == tmp)
374: break;
375: next--;
376: }
377:
378: out = utf8_tocstr(next);
1.11 nicm 379: free(tmp);
1.18 nicm 380: return (out);
381: }
382:
383: /* Pad UTF-8 string to width. Caller frees. */
384: char *
385: utf8_padcstr(const char *s, u_int width)
386: {
387: size_t slen;
388: char *out;
389: u_int n, i;
390:
391: n = utf8_cstrwidth(s);
392: if (n >= width)
393: return (xstrdup(s));
394:
395: slen = strlen(s);
396: out = xmalloc(slen + 1 + (width - n));
397: memcpy(out, s, slen);
398: for (i = n; i < width; i++)
399: out[slen++] = ' ';
400: out[slen] = '\0';
1.11 nicm 401: return (out);
1.1 nicm 402: }