version 1.46, 2020/05/25 15:02:25 |
version 1.47, 2020/05/25 18:19:29 |
|
|
|
|
#include "tmux.h" |
#include "tmux.h" |
|
|
static int utf8_width(wchar_t); |
struct utf8_item { |
|
u_int offset; |
|
RB_ENTRY(utf8_item) entry; |
|
|
struct utf8_big_item { |
|
u_int index; |
|
RB_ENTRY(utf8_big_item) entry; |
|
|
|
char data[UTF8_SIZE]; |
char data[UTF8_SIZE]; |
u_char size; |
u_char size; |
}; |
}; |
RB_HEAD(utf8_big_tree, utf8_big_item); |
RB_HEAD(utf8_tree, utf8_item); |
|
|
static int |
static int |
utf8_big_cmp(struct utf8_big_item *bi1, struct utf8_big_item *bi2) |
utf8_cmp(struct utf8_item *ui1, struct utf8_item *ui2) |
{ |
{ |
if (bi1->size < bi2->size) |
if (ui1->size < ui2->size) |
return (-1); |
return (-1); |
if (bi1->size > bi2->size) |
if (ui1->size > ui2->size) |
return (1); |
return (1); |
return (memcmp(bi1->data, bi2->data, bi1->size)); |
return (memcmp(ui1->data, ui2->data, ui1->size)); |
} |
} |
RB_GENERATE_STATIC(utf8_big_tree, utf8_big_item, entry, utf8_big_cmp); |
RB_GENERATE_STATIC(utf8_tree, utf8_item, entry, utf8_cmp); |
static struct utf8_big_tree utf8_big_tree = RB_INITIALIZER(utf8_big_tree); |
static struct utf8_tree utf8_tree = RB_INITIALIZER(utf8_tree); |
|
|
static struct utf8_big_item *utf8_big_list; |
static struct utf8_item *utf8_list; |
static u_int utf8_big_list_size; |
static u_int utf8_list_size; |
static u_int utf8_big_list_used; |
static u_int utf8_list_used; |
|
|
union utf8_big_map { |
union utf8_map { |
u_int value; |
utf8_char uc; |
struct { |
struct { |
u_char flags; |
u_char flags; |
#define UTF8_BIG_SIZE 0x1f |
#define UTF8_FLAG_SIZE 0x1f |
#define UTF8_BIG_WIDTH2 0x20 |
#define UTF8_FLAG_WIDTH2 0x20 |
|
|
u_char data[3]; |
u_char data[3]; |
}; |
}; |
} __packed; |
} __packed; |
|
|
static const union utf8_big_map utf8_big_space1 = { |
static const union utf8_map utf8_space1 = { |
.flags = 1, |
.flags = 1, |
.data = " " |
.data = " " |
}; |
}; |
static const union utf8_big_map utf8_big_space2 = { |
static const union utf8_map utf8_space2 = { |
.flags = UTF8_BIG_WIDTH2|2, |
.flags = UTF8_FLAG_WIDTH2|2, |
.data = " " |
.data = " " |
}; |
}; |
|
|
/* Get a big item by index. */ |
/* Get a UTF-8 item by offset. */ |
static struct utf8_big_item * |
static struct utf8_item * |
utf8_get_big_item(const char *data, size_t size) |
utf8_get_item(const char *data, size_t size) |
{ |
{ |
struct utf8_big_item bi; |
struct utf8_item ui; |
|
|
memcpy(bi.data, data, size); |
memcpy(ui.data, data, size); |
bi.size = size; |
ui.size = size; |
|
|
return (RB_FIND(utf8_big_tree, &utf8_big_tree, &bi)); |
return (RB_FIND(utf8_tree, &utf8_tree, &ui)); |
} |
} |
|
|
/* Add a big item. */ |
/* Expand UTF-8 list. */ |
static int |
static int |
utf8_put_big_item(const char *data, size_t size, u_int *index) |
utf8_expand_list(void) |
{ |
{ |
struct utf8_big_item *bi; |
if (utf8_list_size == 0xffffff) |
|
return (-1); |
|
if (utf8_list_size == 0) |
|
utf8_list_size = 256; |
|
else if (utf8_list_size > 0x7fffff) |
|
utf8_list_size = 0xffffff; |
|
else |
|
utf8_list_size *= 2; |
|
utf8_list = xreallocarray(utf8_list, utf8_list_size, sizeof *utf8_list); |
|
return (0); |
|
} |
|
|
bi = utf8_get_big_item(data, size); |
/* Add a UTF-8 item. */ |
if (bi != NULL) { |
static int |
*index = bi->index; |
utf8_put_item(const char *data, size_t size, u_int *offset) |
|
{ |
|
struct utf8_item *ui; |
|
|
|
ui = utf8_get_item(data, size); |
|
if (ui != NULL) { |
|
*offset = ui->offset; |
log_debug("%s: have %.*s at %u", __func__, (int)size, data, |
log_debug("%s: have %.*s at %u", __func__, (int)size, data, |
*index); |
*offset); |
return (0); |
return (0); |
} |
} |
|
|
if (utf8_big_list_used == utf8_big_list_size) { |
if (utf8_list_used == utf8_list_size && utf8_expand_list() != 0) |
if (utf8_big_list_size == 0xffffff) |
return (-1); |
return (-1); |
*offset = utf8_list_used++; |
if (utf8_big_list_size == 0) |
|
utf8_big_list_size = 256; |
|
else if (utf8_big_list_size > 0x7fffff) |
|
utf8_big_list_size = 0xffffff; |
|
else |
|
utf8_big_list_size *= 2; |
|
utf8_big_list = xreallocarray(utf8_big_list, utf8_big_list_size, |
|
sizeof *utf8_big_list); |
|
} |
|
*index = utf8_big_list_used++; |
|
|
|
bi = &utf8_big_list[*index]; |
ui = &utf8_list[*offset]; |
bi->index = *index; |
ui->offset = *offset; |
memcpy(bi->data, data, size); |
memcpy(ui->data, data, size); |
bi->size = size; |
ui->size = size; |
RB_INSERT(utf8_big_tree, &utf8_big_tree, bi); |
RB_INSERT(utf8_tree, &utf8_tree, ui); |
|
|
log_debug("%s: added %.*s at %u", __func__, (int)size, data, *index); |
log_debug("%s: added %.*s at %u", __func__, (int)size, data, *offset); |
return (0); |
return (0); |
} |
} |
|
|
/* Get UTF-8 as index into buffer. */ |
/* Get UTF-8 character from data. */ |
u_int |
enum utf8_state |
utf8_map_big(const struct utf8_data *ud) |
utf8_from_data(const struct utf8_data *ud, utf8_char *uc) |
{ |
{ |
union utf8_big_map m = { .value = 0 }; |
union utf8_map m = { .uc = 0 }; |
u_int o; |
u_int offset; |
const char *data = ud->data; |
|
size_t size = ud->size; |
|
|
|
if (ud->width != 1 && ud->width != 2) |
if (ud->width != 1 && ud->width != 2) |
return (utf8_big_space1.value); |
return (utf8_space1.uc); |
|
|
if (size > UTF8_BIG_SIZE) |
if (ud->size > UTF8_FLAG_SIZE) |
goto fail; |
goto fail; |
if (size == 1) |
if (ud->size == 1) |
return (utf8_set_big(data[0], 1)); |
return (utf8_build_one(ud->data[0], 1)); |
|
|
m.flags = size; |
m.flags = ud->size; |
if (ud->width == 2) |
if (ud->width == 2) |
m.flags |= UTF8_BIG_WIDTH2; |
m.flags |= UTF8_FLAG_WIDTH2; |
|
|
if (size <= 3) { |
if (ud->size <= 3) |
memcpy(&m.data, data, size); |
memcpy(m.data, ud->data, ud->size); |
return (m.value); |
else { |
|
if (utf8_put_item(ud->data, ud->size, &offset) != 0) |
|
goto fail; |
|
m.data[0] = (offset & 0xff); |
|
m.data[1] = (offset >> 8) & 0xff; |
|
m.data[2] = (offset >> 16); |
} |
} |
|
*uc = m.uc; |
|
return (UTF8_DONE); |
|
|
if (utf8_put_big_item(data, size, &o) != 0) |
|
goto fail; |
|
m.data[0] = (o & 0xff); |
|
m.data[1] = (o >> 8) & 0xff; |
|
m.data[2] = (o >> 16); |
|
return (m.value); |
|
|
|
fail: |
fail: |
if (ud->width == 1) |
if (ud->width == 1) |
return (utf8_big_space1.value); |
*uc = utf8_space1.uc; |
return (utf8_big_space2.value); |
else |
|
*uc = utf8_space2.uc; |
|
return (UTF8_ERROR); |
} |
} |
|
|
/* Get UTF-8 from index into buffer. */ |
/* Get UTF-8 data from character. */ |
void |
void |
utf8_get_big(u_int v, struct utf8_data *ud) |
utf8_to_data(utf8_char uc, struct utf8_data *ud) |
{ |
{ |
union utf8_big_map m = { .value = v }; |
union utf8_map m = { .uc = uc }; |
struct utf8_big_item *bi; |
struct utf8_item *ui; |
u_int o; |
u_int offset; |
|
|
memset(ud, 0, sizeof *ud); |
memset(ud, 0, sizeof *ud); |
ud->size = ud->have = (m.flags & UTF8_BIG_SIZE); |
ud->size = ud->have = (m.flags & UTF8_FLAG_SIZE); |
if (m.flags & UTF8_BIG_WIDTH2) |
if (m.flags & UTF8_FLAG_WIDTH2) |
ud->width = 2; |
ud->width = 2; |
else |
else |
ud->width = 1; |
ud->width = 1; |
|
|
return; |
return; |
} |
} |
|
|
o = ((u_int)m.data[2] << 16)|((u_int)m.data[1] << 8)|m.data[0]; |
offset = ((u_int)m.data[2] << 16)|((u_int)m.data[1] << 8)|m.data[0]; |
if (o >= utf8_big_list_used) |
if (offset >= utf8_list_used) |
memset(ud->data, ' ', ud->size); |
memset(ud->data, ' ', ud->size); |
else { |
else { |
bi = &utf8_big_list[o]; |
ui = &utf8_list[offset]; |
memcpy(ud->data, bi->data, ud->size); |
memcpy(ud->data, ui->data, ud->size); |
} |
} |
} |
} |
|
|
/* Get big value for UTF-8 single character. */ |
/* Get UTF-8 character from a single ASCII character. */ |
u_int |
u_int |
utf8_set_big(char c, u_int width) |
utf8_build_one(char c, u_int width) |
{ |
{ |
union utf8_big_map m = { .flags = 1, .data[0] = c }; |
union utf8_map m = { .flags = 1, .data[0] = c }; |
|
|
if (width == 2) |
if (width == 2) |
m.flags |= UTF8_BIG_WIDTH2; |
m.flags |= UTF8_FLAG_WIDTH2; |
return (m.value); |
return (m.uc); |
} |
} |
|
|
/* Set a single character. */ |
/* Set a single character. */ |
|
|
to->data[i] = '\0'; |
to->data[i] = '\0'; |
} |
} |
|
|
|
/* Get width of Unicode character. */ |
|
static int |
|
utf8_width(wchar_t wc) |
|
{ |
|
int width; |
|
|
|
width = wcwidth(wc); |
|
if (width < 0 || width > 0xff) { |
|
log_debug("Unicode %04lx, wcwidth() %d", (long)wc, width); |
|
return (-1); |
|
} |
|
return (width); |
|
} |
|
|
/* |
/* |
* Open UTF-8 sequence. |
* Open UTF-8 sequence. |
* |
* |
|
|
return (UTF8_DONE); |
return (UTF8_DONE); |
} |
} |
|
|
/* Get width of Unicode character. */ |
|
static int |
|
utf8_width(wchar_t wc) |
|
{ |
|
int width; |
|
|
|
width = wcwidth(wc); |
|
if (width < 0 || width > 0xff) { |
|
log_debug("Unicode %04lx, wcwidth() %d", (long)wc, width); |
|
return (-1); |
|
} |
|
return (width); |
|
} |
|
|
|
/* Combine UTF-8 into Unicode. */ |
/* Combine UTF-8 into Unicode. */ |
enum utf8_state |
enum utf8_state |
utf8_combine(const struct utf8_data *ud, wchar_t *wc) |
utf8_combine(const struct utf8_data *ud, wchar_t *wc) |
|
|
utf8_strvis(char *dst, const char *src, size_t len, int flag) |
utf8_strvis(char *dst, const char *src, size_t len, int flag) |
{ |
{ |
struct utf8_data ud; |
struct utf8_data ud; |
const char *start, *end; |
const char *start = dst, *end = src + len; |
enum utf8_state more; |
enum utf8_state more; |
size_t i; |
size_t i; |
|
|
start = dst; |
|
end = src + len; |
|
|
|
while (src < end) { |
while (src < end) { |
if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { |
if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { |
while (++src < end && more == UTF8_MORE) |
while (++src < end && more == UTF8_MORE) |
|
|
dst = vis(dst, src[0], flag, '\0'); |
dst = vis(dst, src[0], flag, '\0'); |
src++; |
src++; |
} |
} |
|
|
*dst = '\0'; |
*dst = '\0'; |
return (dst - start); |
return (dst - start); |
} |
} |
|
|
int |
int |
utf8_isvalid(const char *s) |
utf8_isvalid(const char *s) |
{ |
{ |
struct utf8_data ud; |
struct utf8_data ud; |
const char *end; |
const char *end; |
enum utf8_state more; |
enum utf8_state more; |
|
|
end = s + strlen(s); |
end = s + strlen(s); |
while (s < end) { |
while (s < end) { |
|
|
char * |
char * |
utf8_sanitize(const char *src) |
utf8_sanitize(const char *src) |
{ |
{ |
char *dst; |
char *dst = NULL; |
size_t n; |
size_t n = 0; |
enum utf8_state more; |
enum utf8_state more; |
struct utf8_data ud; |
struct utf8_data ud; |
u_int i; |
u_int i; |
|
|
dst = NULL; |
|
|
|
n = 0; |
|
while (*src != '\0') { |
while (*src != '\0') { |
dst = xreallocarray(dst, n + 1, sizeof *dst); |
dst = xreallocarray(dst, n + 1, sizeof *dst); |
if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { |
if ((more = utf8_open(&ud, *src)) == UTF8_MORE) { |
|
|
dst[n++] = '_'; |
dst[n++] = '_'; |
src++; |
src++; |
} |
} |
|
|
dst = xreallocarray(dst, n + 1, sizeof *dst); |
dst = xreallocarray(dst, n + 1, sizeof *dst); |
dst[n] = '\0'; |
dst[n] = '\0'; |
return (dst); |
return (dst); |
|
|
utf8_strwidth(const struct utf8_data *s, ssize_t n) |
utf8_strwidth(const struct utf8_data *s, ssize_t n) |
{ |
{ |
ssize_t i; |
ssize_t i; |
u_int width; |
u_int width = 0; |
|
|
width = 0; |
|
for (i = 0; s[i].size != 0; i++) { |
for (i = 0; s[i].size != 0; i++) { |
if (n != -1 && n == i) |
if (n != -1 && n == i) |
break; |
break; |
|
|
struct utf8_data * |
struct utf8_data * |
utf8_fromcstr(const char *src) |
utf8_fromcstr(const char *src) |
{ |
{ |
struct utf8_data *dst; |
struct utf8_data *dst = NULL; |
size_t n; |
size_t n = 0; |
enum utf8_state more; |
enum utf8_state more; |
|
|
dst = NULL; |
|
|
|
n = 0; |
|
while (*src != '\0') { |
while (*src != '\0') { |
dst = xreallocarray(dst, n + 1, sizeof *dst); |
dst = xreallocarray(dst, n + 1, sizeof *dst); |
if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) { |
if ((more = utf8_open(&dst[n], *src)) == UTF8_MORE) { |
|
|
n++; |
n++; |
src++; |
src++; |
} |
} |
|
|
dst = xreallocarray(dst, n + 1, sizeof *dst); |
dst = xreallocarray(dst, n + 1, sizeof *dst); |
dst[n].size = 0; |
dst[n].size = 0; |
return (dst); |
return (dst); |
|
|
char * |
char * |
utf8_tocstr(struct utf8_data *src) |
utf8_tocstr(struct utf8_data *src) |
{ |
{ |
char *dst; |
char *dst = NULL; |
size_t n; |
size_t n = 0; |
|
|
dst = NULL; |
|
|
|
n = 0; |
|
for(; src->size != 0; src++) { |
for(; src->size != 0; src++) { |
dst = xreallocarray(dst, n + src->size, 1); |
dst = xreallocarray(dst, n + src->size, 1); |
memcpy(dst + n, src->data, src->size); |
memcpy(dst + n, src->data, src->size); |
n += src->size; |
n += src->size; |
} |
} |
|
|
dst = xreallocarray(dst, n + 1, 1); |
dst = xreallocarray(dst, n + 1, 1); |
dst[n] = '\0'; |
dst[n] = '\0'; |
return (dst); |
return (dst); |
|
|
{ |
{ |
size_t slen; |
size_t slen; |
char *out; |
char *out; |
u_int n, i; |
u_int n, i; |
|
|
n = utf8_cstrwidth(s); |
n = utf8_cstrwidth(s); |
if (n >= width) |
if (n >= width) |
|
|
{ |
{ |
size_t slen; |
size_t slen; |
char *out; |
char *out; |
u_int n, i; |
u_int n, i; |
|
|
n = utf8_cstrwidth(s); |
n = utf8_cstrwidth(s); |
if (n >= width) |
if (n >= width) |