Annotation of src/usr.bin/tmux/utf8.c, Revision 1.30
1.30 ! nicm 1: /* $OpenBSD: utf8.c,v 1.29 2016/03/02 15:36:03 nicm Exp $ */
1.1 nicm 2:
3: /*
1.26 nicm 4: * Copyright (c) 2008 Nicholas Marriott <nicholas.marriott@gmail.com>
1.1 nicm 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF MIND, USE, DATA OR PROFITS, WHETHER
15: * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
16: * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18:
19: #include <sys/types.h>
20:
1.30 ! nicm 21: #include <errno.h>
1.11 nicm 22: #include <stdlib.h>
1.1 nicm 23: #include <string.h>
1.9 nicm 24: #include <vis.h>
1.28 nicm 25: #include <wchar.h>
1.1 nicm 26:
27: #include "tmux.h"
28:
1.29 nicm 29: static int utf8_width(wchar_t);
30:
1.11 nicm 31: /* Set a single character. */
32: void
1.19 nicm 33: utf8_set(struct utf8_data *ud, u_char ch)
1.11 nicm 34: {
1.20 nicm 35: u_int i;
36:
1.19 nicm 37: *ud->data = ch;
1.25 nicm 38: ud->have = 1;
1.19 nicm 39: ud->size = 1;
1.11 nicm 40:
1.19 nicm 41: ud->width = 1;
1.20 nicm 42:
43: for (i = ud->size; i < sizeof ud->data; i++)
44: ud->data[i] = '\0';
45: }
46:
47: /* Copy UTF-8 character. */
48: void
49: utf8_copy(struct utf8_data *to, const struct utf8_data *from)
50: {
51: u_int i;
52:
53: memcpy(to, from, sizeof *to);
54:
55: for (i = to->size; i < sizeof to->data; i++)
56: to->data[i] = '\0';
1.11 nicm 57: }
58:
1.4 nicm 59: /*
60: * Open UTF-8 sequence.
61: *
62: * 11000010-11011111 C2-DF start of 2-byte sequence
63: * 11100000-11101111 E0-EF start of 3-byte sequence
64: * 11110000-11110100 F0-F4 start of 4-byte sequence
65: */
1.23 nicm 66: enum utf8_state
1.19 nicm 67: utf8_open(struct utf8_data *ud, u_char ch)
1.4 nicm 68: {
1.19 nicm 69: memset(ud, 0, sizeof *ud);
1.4 nicm 70: if (ch >= 0xc2 && ch <= 0xdf)
1.19 nicm 71: ud->size = 2;
1.4 nicm 72: else if (ch >= 0xe0 && ch <= 0xef)
1.19 nicm 73: ud->size = 3;
1.4 nicm 74: else if (ch >= 0xf0 && ch <= 0xf4)
1.19 nicm 75: ud->size = 4;
1.4 nicm 76: else
1.23 nicm 77: return (UTF8_ERROR);
1.19 nicm 78: utf8_append(ud, ch);
1.23 nicm 79: return (UTF8_MORE);
1.4 nicm 80: }
81:
1.23 nicm 82: /* Append character to UTF-8, closing if finished. */
83: enum utf8_state
1.19 nicm 84: utf8_append(struct utf8_data *ud, u_char ch)
1.4 nicm 85: {
1.29 nicm 86: wchar_t wc;
87: int width;
88:
1.19 nicm 89: if (ud->have >= ud->size)
1.4 nicm 90: fatalx("UTF-8 character overflow");
1.19 nicm 91: if (ud->size > sizeof ud->data)
1.4 nicm 92: fatalx("UTF-8 character size too large");
93:
1.21 nicm 94: if (ud->have != 0 && (ch & 0xc0) != 0x80)
95: ud->width = 0xff;
96:
1.19 nicm 97: ud->data[ud->have++] = ch;
98: if (ud->have != ud->size)
1.23 nicm 99: return (UTF8_MORE);
1.4 nicm 100:
1.21 nicm 101: if (ud->width == 0xff)
1.23 nicm 102: return (UTF8_ERROR);
1.29 nicm 103:
104: if (utf8_combine(ud, &wc) != UTF8_DONE)
105: return (UTF8_ERROR);
106: if ((width = utf8_width(wc)) < 0)
107: return (UTF8_ERROR);
108: ud->width = width;
109:
1.23 nicm 110: return (UTF8_DONE);
1.1 nicm 111: }
112:
/*
 * Get width of Unicode character.
 *
 * Returns the value reported by wcwidth(), or -1 (after logging) when that
 * value is negative or larger than 0xff.
 */
static int
utf8_width(wchar_t wc)
{
	int	width;

	width = wcwidth(wc);
	if (width < 0 || width > 0xff) {
		/*
		 * %x expects an unsigned int; wchar_t is not guaranteed to
		 * be one, so convert explicitly.
		 */
		log_debug("Unicode %04x, wcwidth() %d", (unsigned int)wc,
		    width);
		return (-1);
	}
	return (width);
}
126:
1.28 nicm 127: /* Combine UTF-8 into Unicode. */
1.29 nicm 128: enum utf8_state
129: utf8_combine(const struct utf8_data *ud, wchar_t *wc)
1.1 nicm 130: {
1.29 nicm 131: switch (mbtowc(wc, ud->data, ud->size)) {
132: case -1:
1.30 ! nicm 133: log_debug("UTF-8 %.*s, mbtowc() %d", (int)ud->size, ud->data,
! 134: errno);
1.29 nicm 135: mbtowc(NULL, NULL, MB_CUR_MAX);
136: return (UTF8_ERROR);
137: case 0:
138: return (UTF8_ERROR);
139: default:
140: return (UTF8_DONE);
141: }
1.15 nicm 142: }
143:
1.28 nicm 144: /* Split Unicode into UTF-8. */
1.23 nicm 145: enum utf8_state
1.28 nicm 146: utf8_split(wchar_t wc, struct utf8_data *ud)
1.15 nicm 147: {
1.29 nicm 148: char s[MB_LEN_MAX];
149: int slen;
1.28 nicm 150:
151: slen = wctomb(s, wc);
152: if (slen <= 0 || slen > (int)sizeof ud->data)
1.23 nicm 153: return (UTF8_ERROR);
1.28 nicm 154:
155: memcpy(ud->data, s, slen);
156: ud->size = slen;
157:
158: ud->width = utf8_width(wc);
1.23 nicm 159: return (UTF8_DONE);
1.9 nicm 160: }
161:
162: /*
163: * Encode len characters from src into dst, which is guaranteed to have four
164: * bytes available for each character from src (for \abc or UTF-8) plus space
165: * for \0.
166: */
167: int
168: utf8_strvis(char *dst, const char *src, size_t len, int flag)
169: {
1.19 nicm 170: struct utf8_data ud;
1.9 nicm 171: const char *start, *end;
1.23 nicm 172: enum utf8_state more;
1.9 nicm 173: size_t i;
174:
175: start = dst;
176: end = src + len;
177:
178: while (src < end) {
1.23 nicm 179: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
180: while (++src < end && more == UTF8_MORE)
1.19 nicm 181: more = utf8_append(&ud, *src);
1.23 nicm 182: if (more == UTF8_DONE) {
1.9 nicm 183: /* UTF-8 character finished. */
1.19 nicm 184: for (i = 0; i < ud.size; i++)
185: *dst++ = ud.data[i];
1.9 nicm 186: continue;
187: }
1.23 nicm 188: /* Not a complete, valid UTF-8 character. */
189: src -= ud.have;
1.9 nicm 190: }
191: if (src < end - 1)
192: dst = vis(dst, src[0], flag, src[1]);
193: else if (src < end)
194: dst = vis(dst, src[0], flag, '\0');
195: src++;
196: }
197:
198: *dst = '\0';
199: return (dst - start);
1.16 nicm 200: }
201:
202: /*
203: * Sanitize a string, changing any UTF-8 characters to '_'. Caller should free
204: * the returned string. Anything not valid printable ASCII or UTF-8 is
205: * stripped.
206: */
207: char *
208: utf8_sanitize(const char *src)
209: {
210: char *dst;
211: size_t n;
1.23 nicm 212: enum utf8_state more;
1.19 nicm 213: struct utf8_data ud;
1.16 nicm 214: u_int i;
215:
216: dst = NULL;
217:
218: n = 0;
219: while (*src != '\0') {
220: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 221: if ((more = utf8_open(&ud, *src)) == UTF8_MORE) {
222: while (*++src != '\0' && more == UTF8_MORE)
1.19 nicm 223: more = utf8_append(&ud, *src);
1.23 nicm 224: if (more == UTF8_DONE) {
1.19 nicm 225: dst = xreallocarray(dst, n + ud.width,
1.16 nicm 226: sizeof *dst);
1.19 nicm 227: for (i = 0; i < ud.width; i++)
1.16 nicm 228: dst[n++] = '_';
229: continue;
230: }
1.19 nicm 231: src -= ud.have;
1.16 nicm 232: }
233: if (*src > 0x1f && *src < 0x7f)
1.21 nicm 234: dst[n++] = *src;
1.23 nicm 235: else
236: dst[n++] = '_';
1.16 nicm 237: src++;
238: }
239:
240: dst = xreallocarray(dst, n + 1, sizeof *dst);
241: dst[n] = '\0';
242: return (dst);
1.11 nicm 243: }
244:
245: /*
246: * Convert a string into a buffer of UTF-8 characters. Terminated by size == 0.
247: * Caller frees.
248: */
249: struct utf8_data *
250: utf8_fromcstr(const char *src)
251: {
252: struct utf8_data *dst;
253: size_t n;
1.23 nicm 254: enum utf8_state more;
1.11 nicm 255:
256: dst = NULL;
257:
258: n = 0;
259: while (*src != '\0') {
1.12 nicm 260: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.23 nicm 261: if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) {
262: while (*++src != '\0' && more == UTF8_MORE)
1.11 nicm 263: more = utf8_append(&dst[n], *src);
1.23 nicm 264: if (more == UTF8_DONE) {
1.11 nicm 265: n++;
266: continue;
267: }
268: src -= dst[n].have;
269: }
1.23 nicm 270: utf8_set(&dst[n], *src);
271: n++;
1.11 nicm 272: src++;
273: }
274:
1.12 nicm 275: dst = xreallocarray(dst, n + 1, sizeof *dst);
1.11 nicm 276: dst[n].size = 0;
277: return (dst);
278: }
279:
280: /* Convert from a buffer of UTF-8 characters into a string. Caller frees. */
281: char *
282: utf8_tocstr(struct utf8_data *src)
283: {
284: char *dst;
285: size_t n;
286:
287: dst = NULL;
288:
289: n = 0;
290: for(; src->size != 0; src++) {
1.12 nicm 291: dst = xreallocarray(dst, n + src->size, 1);
1.11 nicm 292: memcpy(dst + n, src->data, src->size);
293: n += src->size;
294: }
295:
1.12 nicm 296: dst = xreallocarray(dst, n + 1, 1);
1.11 nicm 297: dst[n] = '\0';
298: return (dst);
299: }
300:
301: /* Get width of UTF-8 string. */
302: u_int
303: utf8_cstrwidth(const char *s)
304: {
305: struct utf8_data tmp;
306: u_int width;
1.23 nicm 307: enum utf8_state more;
1.11 nicm 308:
309: width = 0;
310: while (*s != '\0') {
1.23 nicm 311: if ((more = utf8_open(&tmp, *s)) == UTF8_MORE) {
312: while (*++s != '\0' && more == UTF8_MORE)
1.11 nicm 313: more = utf8_append(&tmp, *s);
1.23 nicm 314: if (more == UTF8_DONE) {
1.11 nicm 315: width += tmp.width;
316: continue;
317: }
318: s -= tmp.have;
319: }
1.23 nicm 320: if (*s > 0x1f && *s != 0x7f)
1.21 nicm 321: width++;
1.11 nicm 322: s++;
323: }
324: return (width);
325: }
326:
327: /* Trim UTF-8 string to width. Caller frees. */
328: char *
329: utf8_trimcstr(const char *s, u_int width)
330: {
331: struct utf8_data *tmp, *next;
332: char *out;
333: u_int at;
334:
335: tmp = utf8_fromcstr(s);
336:
337: at = 0;
338: for (next = tmp; next->size != 0; next++) {
339: if (at + next->width > width) {
340: next->size = 0;
341: break;
342: }
343: at += next->width;
344: }
345:
346: out = utf8_tocstr(tmp);
1.27 nicm 347: free(tmp);
348: return (out);
349: }
350:
351: /* Trim UTF-8 string to width. Caller frees. */
352: char *
353: utf8_rtrimcstr(const char *s, u_int width)
354: {
355: struct utf8_data *tmp, *next, *end;
356: char *out;
357: u_int at;
358:
359: tmp = utf8_fromcstr(s);
360:
361: for (end = tmp; end->size != 0; end++)
362: /* nothing */;
363: if (end == tmp) {
364: free(tmp);
365: return (xstrdup(""));
366: }
367: next = end - 1;
368:
369: at = 0;
370: for (;;)
371: {
372: if (at + next->width > width) {
373: next++;
374: break;
375: }
376: at += next->width;
377:
378: if (next == tmp)
379: break;
380: next--;
381: }
382:
383: out = utf8_tocstr(next);
1.11 nicm 384: free(tmp);
1.18 nicm 385: return (out);
386: }
387:
388: /* Pad UTF-8 string to width. Caller frees. */
389: char *
390: utf8_padcstr(const char *s, u_int width)
391: {
392: size_t slen;
393: char *out;
394: u_int n, i;
395:
396: n = utf8_cstrwidth(s);
397: if (n >= width)
398: return (xstrdup(s));
399:
400: slen = strlen(s);
401: out = xmalloc(slen + 1 + (width - n));
402: memcpy(out, s, slen);
403: for (i = n; i < width; i++)
404: out[slen++] = ' ';
405: out[slen] = '\0';
1.11 nicm 406: return (out);
1.1 nicm 407: }