version 1.18, 2015/11/12 12:43:36 |
version 1.19, 2015/11/12 22:04:37 |
|
|
|
|
/* Set a single character. */ |
/* Set a single character. */ |
void |
void |
utf8_set(struct utf8_data *utf8data, u_char ch) |
utf8_set(struct utf8_data *ud, u_char ch) |
{ |
{ |
*utf8data->data = ch; |
*ud->data = ch; |
utf8data->size = 1; |
ud->size = 1; |
|
|
utf8data->width = 1; |
ud->width = 1; |
} |
} |
|
|
/* |
/* |
|
|
* Returns 1 if more UTF-8 to come, 0 if not UTF-8. |
* Returns 1 if more UTF-8 to come, 0 if not UTF-8. |
*/ |
*/ |
int |
int |
utf8_open(struct utf8_data *utf8data, u_char ch) |
utf8_open(struct utf8_data *ud, u_char ch) |
{ |
{ |
memset(utf8data, 0, sizeof *utf8data); |
memset(ud, 0, sizeof *ud); |
if (ch >= 0xc2 && ch <= 0xdf) |
if (ch >= 0xc2 && ch <= 0xdf) |
utf8data->size = 2; |
ud->size = 2; |
else if (ch >= 0xe0 && ch <= 0xef) |
else if (ch >= 0xe0 && ch <= 0xef) |
utf8data->size = 3; |
ud->size = 3; |
else if (ch >= 0xf0 && ch <= 0xf4) |
else if (ch >= 0xf0 && ch <= 0xf4) |
utf8data->size = 4; |
ud->size = 4; |
else |
else |
return (0); |
return (0); |
utf8_append(utf8data, ch); |
utf8_append(ud, ch); |
return (1); |
return (1); |
} |
} |
|
|
|
|
* Returns 1 if more UTF-8 data to come, 0 if finished. |
* Returns 1 if more UTF-8 data to come, 0 if finished. |
*/ |
*/ |
int |
int |
utf8_append(struct utf8_data *utf8data, u_char ch) |
utf8_append(struct utf8_data *ud, u_char ch) |
{ |
{ |
/* XXX this should do validity checks too! */ |
/* XXX this should do validity checks too! */ |
|
|
if (utf8data->have >= utf8data->size) |
if (ud->have >= ud->size) |
fatalx("UTF-8 character overflow"); |
fatalx("UTF-8 character overflow"); |
if (utf8data->size > sizeof utf8data->data) |
if (ud->size > sizeof ud->data) |
fatalx("UTF-8 character size too large"); |
fatalx("UTF-8 character size too large"); |
|
|
utf8data->data[utf8data->have++] = ch; |
ud->data[ud->have++] = ch; |
if (utf8data->have != utf8data->size) |
if (ud->have != ud->size) |
return (1); |
return (1); |
|
|
utf8data->width = utf8_width(utf8_combine(utf8data)); |
ud->width = utf8_width(utf8_combine(ud)); |
return (0); |
return (0); |
} |
} |
|
|
|
|
|
|
/* Combine UTF-8 into 32-bit Unicode. */ |
/* Combine UTF-8 into 32-bit Unicode. */ |
u_int |
u_int |
utf8_combine(const struct utf8_data *utf8data) |
utf8_combine(const struct utf8_data *ud) |
{ |
{ |
u_int value; |
u_int value; |
|
|
value = 0xff; |
value = 0xff; |
switch (utf8data->size) { |
switch (ud->size) { |
case 1: |
case 1: |
value = utf8data->data[0]; |
value = ud->data[0]; |
break; |
break; |
case 2: |
case 2: |
value = utf8data->data[1] & 0x3f; |
value = ud->data[1] & 0x3f; |
value |= (utf8data->data[0] & 0x1f) << 6; |
value |= (ud->data[0] & 0x1f) << 6; |
break; |
break; |
case 3: |
case 3: |
value = utf8data->data[2] & 0x3f; |
value = ud->data[2] & 0x3f; |
value |= (utf8data->data[1] & 0x3f) << 6; |
value |= (ud->data[1] & 0x3f) << 6; |
value |= (utf8data->data[0] & 0xf) << 12; |
value |= (ud->data[0] & 0xf) << 12; |
break; |
break; |
case 4: |
case 4: |
value = utf8data->data[3] & 0x3f; |
value = ud->data[3] & 0x3f; |
value |= (utf8data->data[2] & 0x3f) << 6; |
value |= (ud->data[2] & 0x3f) << 6; |
value |= (utf8data->data[1] & 0x3f) << 12; |
value |= (ud->data[1] & 0x3f) << 12; |
value |= (utf8data->data[0] & 0x7) << 18; |
value |= (ud->data[0] & 0x7) << 18; |
break; |
break; |
} |
} |
return (value); |
return (value); |
|
|
|
|
/* Split 32-bit Unicode into UTF-8. */ |
/* Split 32-bit Unicode into UTF-8. */ |
int |
int |
utf8_split(u_int uc, struct utf8_data *utf8data) |
utf8_split(u_int uc, struct utf8_data *ud) |
{ |
{ |
if (uc < 0x7f) { |
if (uc < 0x7f) { |
utf8data->size = 1; |
ud->size = 1; |
utf8data->data[0] = uc; |
ud->data[0] = uc; |
} else if (uc < 0x7ff) { |
} else if (uc < 0x7ff) { |
utf8data->size = 2; |
ud->size = 2; |
utf8data->data[0] = 0xc0 | ((uc >> 6) & 0x1f); |
ud->data[0] = 0xc0 | ((uc >> 6) & 0x1f); |
utf8data->data[1] = 0x80 | (uc & 0x3f); |
ud->data[1] = 0x80 | (uc & 0x3f); |
} else if (uc < 0xffff) { |
} else if (uc < 0xffff) { |
utf8data->size = 3; |
ud->size = 3; |
utf8data->data[0] = 0xe0 | ((uc >> 12) & 0xf); |
ud->data[0] = 0xe0 | ((uc >> 12) & 0xf); |
utf8data->data[1] = 0x80 | ((uc >> 6) & 0x3f); |
ud->data[1] = 0x80 | ((uc >> 6) & 0x3f); |
utf8data->data[2] = 0x80 | (uc & 0x3f); |
ud->data[2] = 0x80 | (uc & 0x3f); |
} else if (uc < 0x1fffff) { |
} else if (uc < 0x1fffff) { |
utf8data->size = 4; |
ud->size = 4; |
utf8data->data[0] = 0xf0 | ((uc >> 18) & 0x7); |
ud->data[0] = 0xf0 | ((uc >> 18) & 0x7); |
utf8data->data[1] = 0x80 | ((uc >> 12) & 0x3f); |
ud->data[1] = 0x80 | ((uc >> 12) & 0x3f); |
utf8data->data[2] = 0x80 | ((uc >> 6) & 0x3f); |
ud->data[2] = 0x80 | ((uc >> 6) & 0x3f); |
utf8data->data[3] = 0x80 | (uc & 0x3f); |
ud->data[3] = 0x80 | (uc & 0x3f); |
} else |
} else |
return (-1); |
return (-1); |
utf8data->width = utf8_width(uc); |
ud->width = utf8_width(uc); |
return (0); |
return (0); |
} |
} |
|
|
|
|
int |
int |
utf8_strvis(char *dst, const char *src, size_t len, int flag) |
utf8_strvis(char *dst, const char *src, size_t len, int flag) |
{ |
{ |
struct utf8_data utf8data; |
struct utf8_data ud; |
const char *start, *end; |
const char *start, *end; |
int more; |
int more; |
size_t i; |
size_t i; |
|
|
end = src + len; |
end = src + len; |
|
|
while (src < end) { |
while (src < end) { |
if (utf8_open(&utf8data, *src)) { |
if (utf8_open(&ud, *src)) { |
more = 1; |
more = 1; |
while (++src < end && more) |
while (++src < end && more) |
more = utf8_append(&utf8data, *src); |
more = utf8_append(&ud, *src); |
if (!more) { |
if (!more) { |
/* UTF-8 character finished. */ |
/* UTF-8 character finished. */ |
for (i = 0; i < utf8data.size; i++) |
for (i = 0; i < ud.size; i++) |
*dst++ = utf8data.data[i]; |
*dst++ = ud.data[i]; |
continue; |
continue; |
} else if (utf8data.have > 0) { |
} else if (ud.have > 0) { |
/* Not a complete UTF-8 character. */ |
/* Not a complete UTF-8 character. */ |
src -= utf8data.have; |
src -= ud.have; |
} |
} |
} |
} |
if (src < end - 1) |
if (src < end - 1) |
|
|
char *dst; |
char *dst; |
size_t n; |
size_t n; |
int more; |
int more; |
struct utf8_data utf8data; |
struct utf8_data ud; |
u_int i; |
u_int i; |
|
|
dst = NULL; |
dst = NULL; |
|
|
n = 0; |
n = 0; |
while (*src != '\0') { |
while (*src != '\0') { |
dst = xreallocarray(dst, n + 1, sizeof *dst); |
dst = xreallocarray(dst, n + 1, sizeof *dst); |
if (utf8_open(&utf8data, *src)) { |
if (utf8_open(&ud, *src)) { |
more = 1; |
more = 1; |
while (*++src != '\0' && more) |
while (*++src != '\0' && more) |
more = utf8_append(&utf8data, *src); |
more = utf8_append(&ud, *src); |
if (!more) { |
if (!more) { |
dst = xreallocarray(dst, n + utf8data.width, |
dst = xreallocarray(dst, n + ud.width, |
sizeof *dst); |
sizeof *dst); |
for (i = 0; i < utf8data.width; i++) |
for (i = 0; i < ud.width; i++) |
dst[n++] = '_'; |
dst[n++] = '_'; |
continue; |
continue; |
} |
} |
src -= utf8data.have; |
src -= ud.have; |
} |
} |
if (*src > 0x1f && *src < 0x7f) |
if (*src > 0x1f && *src < 0x7f) |
dst[n] = *src; |
dst[n] = *src; |