=================================================================== RCS file: /cvsrepo/anoncvs/cvs/src/usr.bin/awk/run.c,v retrieving revision 1.74 retrieving revision 1.75 diff -c -r1.74 -r1.75 *** src/usr.bin/awk/run.c 2022/09/21 01:42:59 1.74 --- src/usr.bin/awk/run.c 2023/09/17 14:49:44 1.75 *************** *** 1,4 **** ! /* $OpenBSD: run.c,v 1.74 2022/09/21 01:42:59 millert Exp $ */ /**************************************************************** Copyright (C) Lucent Technologies 1997 All Rights Reserved --- 1,4 ---- ! /* $OpenBSD: run.c,v 1.75 2023/09/17 14:49:44 millert Exp $ */ /**************************************************************** Copyright (C) Lucent Technologies 1997 All Rights Reserved *************** *** 27,33 **** #include #include #include - #include #include #include #include --- 27,32 ---- *************** *** 41,48 **** --- 40,49 ---- #include "awk.h" #include "awkgram.tab.h" + static void stdinit(void); static void flush_all(void); + static char *wide_char_to_byte_str(int rune, size_t *outlen); #if 1 #define tempfree(x) do { if (istemp(x)) tfree(x); } while (/*CONSTCOND*/0) *************** *** 580,590 **** --- 581,805 ---- } + /* ======== utf-8 code ========== */ + + /* + * Awk strings can contain ascii, random 8-bit items (eg Latin-1), + * or utf-8. u8_isutf tests whether a string starts with a valid + * utf-8 sequence, and returns 0 if not (e.g., high bit set). + * u8_nextlen returns length of next valid sequence, which is + * 1 for ascii, 2..4 for utf-8, or 1 for high bit non-utf. + * u8_strlen returns length of string in valid utf-8 sequences + * and/or high-bit bytes. Conversion functions go between byte + * number and character number. + * + * In theory, this behaves the same as before for non-utf8 bytes. + * + * Limited checking! This is a potential security hole. + */ + + /* is s the beginning of a valid utf-8 string? 
*/ + /* return length 1..4 if yes, 0 if no */ + int u8_isutf(const char *s) + { + int n, ret; + unsigned char c; + + c = s[0]; + if (c < 128) + return 1; /* what if it's 0? */ + + n = strlen(s); + if (n >= 2 && ((c>>5) & 0x7) == 0x6 && (s[1] & 0xC0) == 0x80) { + ret = 2; /* 110xxxxx 10xxxxxx */ + } else if (n >= 3 && ((c>>4) & 0xF) == 0xE && (s[1] & 0xC0) == 0x80 + && (s[2] & 0xC0) == 0x80) { + ret = 3; /* 1110xxxx 10xxxxxx 10xxxxxx */ + } else if (n >= 4 && ((c>>3) & 0x1F) == 0x1E && (s[1] & 0xC0) == 0x80 + && (s[2] & 0xC0) == 0x80 && (s[3] & 0xC0) == 0x80) { + ret = 4; /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ + } else { + ret = 0; + } + return ret; + } + + /* Convert (prefix of) utf8 string to utf-32 rune. */ + /* Sets *rune to the value, returns the length. */ + /* No error checking: watch out. */ + int u8_rune(int *rune, const char *s) + { + int n, ret; + unsigned char c; + + c = s[0]; + if (c < 128) { + *rune = c; + return 1; + } + + n = strlen(s); + if (n >= 2 && ((c>>5) & 0x7) == 0x6 && (s[1] & 0xC0) == 0x80) { + *rune = ((c & 0x1F) << 6) | (s[1] & 0x3F); /* 110xxxxx 10xxxxxx */ + ret = 2; + } else if (n >= 3 && ((c>>4) & 0xF) == 0xE && (s[1] & 0xC0) == 0x80 + && (s[2] & 0xC0) == 0x80) { + *rune = ((c & 0xF) << 12) | ((s[1] & 0x3F) << 6) | (s[2] & 0x3F); + /* 1110xxxx 10xxxxxx 10xxxxxx */ + ret = 3; + } else if (n >= 4 && ((c>>3) & 0x1F) == 0x1E && (s[1] & 0xC0) == 0x80 + && (s[2] & 0xC0) == 0x80 && (s[3] & 0xC0) == 0x80) { + *rune = ((c & 0x7) << 18) | ((s[1] & 0x3F) << 12) | ((s[2] & 0x3F) << 6) | (s[3] & 0x3F); + /* 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx */ + ret = 4; + } else { + *rune = c; + ret = 1; + } + return ret; /* returns one byte if sequence doesn't look like utf */ + } + + /* return length of next sequence: 1 for ascii or random, 2..4 for valid utf8 */ + int u8_nextlen(const char *s) + { + int len; + + len = u8_isutf(s); + if (len == 0) + len = 1; + return len; + } + + /* return number of utf characters or single non-utf bytes */ + int 
u8_strlen(const char *s) + { + int i, len, n, totlen; + unsigned char c; + + n = strlen(s); + totlen = 0; + for (i = 0; i < n; i += len) { + c = s[i]; + if (c < 128) { + len = 1; + } else { + len = u8_nextlen(&s[i]); + } + totlen++; + if (i > n) + FATAL("bad utf count [%s] n=%d i=%d\n", s, n, i); + } + return totlen; + } + + /* convert utf-8 char number in a string to its byte offset */ + int u8_char2byte(const char *s, int charnum) + { + int n; + int bytenum = 0; + + while (charnum > 0) { + n = u8_nextlen(s); + s += n; + bytenum += n; + charnum--; + } + return bytenum; + } + + /* convert byte offset in s to utf-8 char number that starts there */ + int u8_byte2char(const char *s, int bytenum) + { + int i, len, b; + int charnum = 0; /* BUG: what origin? */ + /* should be 0 to match start==0 which means no match */ + + b = strlen(s); + if (bytenum > b) { + return -1; /* ??? */ + } + for (i = 0; i <= bytenum; i += len) { + len = u8_nextlen(s+i); + charnum++; + } + return charnum; + } + + /* runetochar() adapted from rune.c in the Plan 9 distribution */ + + enum + { + Runeerror = 128, /* from somewhere else */ + Runemax = 0x10FFFF, + + Bit1 = 7, + Bitx = 6, + Bit2 = 5, + Bit3 = 4, + Bit4 = 3, + Bit5 = 2, + + T1 = ((1<<(Bit1+1))-1) ^ 0xFF, /* 0000 0000 */ + Tx = ((1<<(Bitx+1))-1) ^ 0xFF, /* 1000 0000 */ + T2 = ((1<<(Bit2+1))-1) ^ 0xFF, /* 1100 0000 */ + T3 = ((1<<(Bit3+1))-1) ^ 0xFF, /* 1110 0000 */ + T4 = ((1<<(Bit4+1))-1) ^ 0xFF, /* 1111 0000 */ + T5 = ((1<<(Bit5+1))-1) ^ 0xFF, /* 1111 1000 */ + + Rune1 = (1<<(Bit1+0*Bitx))-1, /* 0000 0000 0000 0000 0111 1111 */ + Rune2 = (1<<(Bit2+1*Bitx))-1, /* 0000 0000 0000 0111 1111 1111 */ + Rune3 = (1<<(Bit3+2*Bitx))-1, /* 0000 0000 1111 1111 1111 1111 */ + Rune4 = (1<<(Bit4+3*Bitx))-1, /* 0011 1111 1111 1111 1111 1111 */ + + Maskx = (1<<Bitx)-1, /* 0011 1111 */ + Testx = Maskx ^ 0xFF, /* 1100 0000 */ + + }; + + int runetochar(char *str, int c) + { + /* one character sequence 00000-0007F => 00-7F */ + if (c <= Rune1) { + str[0] = c; + return 1; + } + + /* two character sequence 00080-007FF => T2 Tx */ + if (c <= Rune2) { + str[0] = T2 | (c >> 1*Bitx); + str[1] = Tx | (c & Maskx); 
+ return 2; + } + + /* three character sequence 00800-0FFFF => T3 Tx Tx */ + if (c > Runemax) + c = Runeerror; + if (c <= Rune3) { + str[0] = T3 | (c >> 2*Bitx); + str[1] = Tx | ((c >> 1*Bitx) & Maskx); + str[2] = Tx | (c & Maskx); + return 3; + } + + /* four character sequence 010000-1FFFFF => T4 Tx Tx Tx */ + str[0] = T4 | (c >> 3*Bitx); + str[1] = Tx | ((c >> 2*Bitx) & Maskx); + str[2] = Tx | ((c >> 1*Bitx) & Maskx); + str[3] = Tx | (c & Maskx); + return 4; + } + + + /* ========== end of utf8 code =========== */ + + + Cell *matchop(Node **a, int n) /* ~ and match() */ { Cell *x, *y; char *s, *t; int i; + int cstart, cpatlen, len; fa *pfa; int (*mf)(fa *, const char *) = match, mode = 0; *************** *** 605,613 **** } tempfree(x); if (n == MATCHFCN) { ! int start = patbeg - s + 1; ! if (patlen < 0) ! start = 0; setfval(rstartloc, (Awkfloat) start); setfval(rlengthloc, (Awkfloat) patlen); x = gettemp(); --- 820,840 ---- } tempfree(x); if (n == MATCHFCN) { ! int start = patbeg - s + 1; /* origin 1 */ ! if (patlen < 0) { ! start = 0; /* not found */ ! } else { ! cstart = u8_byte2char(s, start-1); ! cpatlen = 0; ! for (i = 0; i < patlen; i += len) { ! len = u8_nextlen(patbeg+i); ! cpatlen++; ! } ! ! start = cstart; ! patlen = cpatlen; ! } ! setfval(rstartloc, (Awkfloat) start); setfval(rlengthloc, (Awkfloat) patlen); x = gettemp(); *************** *** 658,667 **** --- 885,899 ---- int i; Cell *x, *y; Awkfloat j; + bool x_is_nan, y_is_nan; x = execute(a[0]); y = execute(a[1]); + x_is_nan = isnan(x->fval); + y_is_nan = isnan(y->fval); if (x->tval&NUM && y->tval&NUM) { + if ((x_is_nan || y_is_nan) && n != NE) + return(False); j = x->fval - y->fval; i = j<0? -1: (j>0? 1: 0); } else { *************** *** 674,680 **** else return(False); case LE: if (i<=0) return(True); else return(False); ! 
case NE: if (i!=0) return(True); else return(False); case EQ: if (i == 0) return(True); else return(False); --- 906,913 ---- else return(False); case LE: if (i<=0) return(True); else return(False); ! case NE: if (x_is_nan && y_is_nan) return(True); ! else if (i!=0) return(True); else return(False); case EQ: if (i == 0) return(True); else return(False); *************** *** 743,748 **** --- 976,982 ---- Cell *substr(Node **a, int nnn) /* substr(a[0], a[1], a[2]) */ { int k, m, n; + int mb, nb; char *s; int temp; Cell *x, *y, *z = NULL; *************** *** 778,789 **** n = 0; else if (n > k - m) n = k - m; DPRINTF("substr: m=%d, n=%d, s=%s\n", m, n, s); y = gettemp(); ! temp = s[n+m-1]; /* with thanks to John Linderman */ ! s[n+m-1] = '\0'; ! setsval(y, s + m - 1); ! s[n+m-1] = temp; tempfree(x); return(y); } --- 1012,1027 ---- n = 0; else if (n > k - m) n = k - m; + /* m is start, n is length from there */ DPRINTF("substr: m=%d, n=%d, s=%s\n", m, n, s); y = gettemp(); ! mb = u8_char2byte(s, m-1); /* byte offset of start char in s */ ! nb = u8_char2byte(s, m-1+n); /* byte offset of end+1 char in s */ ! ! temp = s[nb]; /* with thanks to John Linderman */ ! s[nb] = '\0'; ! setsval(y, s + mb); ! s[nb] = temp; tempfree(x); return(y); } *************** *** 804,810 **** for (q = p1, p2 = s2; *p2 != '\0' && *q == *p2; q++, p2++) continue; if (*p2 == '\0') { ! v = (Awkfloat) (p1 - s1 + 1); /* origin 1 */ break; } } --- 1042,1056 ---- for (q = p1, p2 = s2; *p2 != '\0' && *q == *p2; q++, p2++) continue; if (*p2 == '\0') { ! /* v = (Awkfloat) (p1 - s1 + 1); origin 1 */ ! ! /* should be a function: used in match() as well */ ! int i, len; ! v = 0; ! for (i = 0; i < p1-s1+1; i += len) { ! len = u8_nextlen(s1+i); ! v++; ! 
} break; } } *************** *** 814,819 **** --- 1060,1077 ---- return(z); } + int has_utf8(char *s) /* return 1 if s contains any utf-8 (2 bytes or more) character */ + { + int n; + + for (n = 0; *s != 0; s += n) { + n = u8_nextlen(s); + if (n > 1) + return 1; + } + return 0; + } + #define MAXNUMSIZE 50 int format(char **pbuf, int *pbufsize, const char *s, Node *a) /* printf-like conversions */ *************** *** 856,862 **** s += 2; continue; } - /* have to be real careful in case this is a huge number, eg, %100000d */ fmtwd = atoi(s+1); if (fmtwd < 0) fmtwd = -fmtwd; --- 1114,1119 ---- *************** *** 929,935 **** n = fmtwd; adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format5"); switch (flag) { ! case '?': snprintf(p, BUFSZ(p), "%s", fmt); /* unknown, so dump it too */ t = getsval(x); n = strlen(t); if (fmtwd > n) --- 1186,1193 ---- n = fmtwd; adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format5"); switch (flag) { ! case '?': ! snprintf(p, BUFSZ(p), "%s", fmt); /* unknown, so dump it too */ t = getsval(x); n = strlen(t); if (fmtwd > n) *************** *** 943,971 **** case 'f': snprintf(p, BUFSZ(p), fmt, getfval(x)); break; case 'd': snprintf(p, BUFSZ(p), fmt, (intmax_t) getfval(x)); break; case 'u': snprintf(p, BUFSZ(p), fmt, (uintmax_t) getfval(x)); break; ! case 's': t = getsval(x); n = strlen(t); ! if (fmtwd > n) ! n = fmtwd; ! if (!adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format7")) ! FATAL("huge string/format (%d chars) in printf %.30s... ran format() out of memory", n, t); ! snprintf(p, BUFSZ(p), fmt, t); break; ! case 'c': if (isnum(x)) { ! if ((int)getfval(x)) ! snprintf(p, BUFSZ(p), fmt, (int) getfval(x)); ! else { *p++ = '\0'; /* explicit null byte */ *p = '\0'; /* next output will start here */ } ! 
} else snprintf(p, BUFSZ(p), fmt, getsval(x)[0]); break; default: FATAL("can't happen: bad conversion %c in format()", flag); } tempfree(x); p += strlen(p); s++; --- 1201,1376 ---- case 'f': snprintf(p, BUFSZ(p), fmt, getfval(x)); break; case 'd': snprintf(p, BUFSZ(p), fmt, (intmax_t) getfval(x)); break; case 'u': snprintf(p, BUFSZ(p), fmt, (uintmax_t) getfval(x)); break; ! ! case 's': { t = getsval(x); n = strlen(t); ! /* if simple format or no utf-8 in the string, sprintf works */ ! if (!has_utf8(t) || strcmp(fmt,"%s") == 0) { ! if (fmtwd > n) ! n = fmtwd; ! if (!adjbuf(&buf, &bufsize, 1+n+p-buf, recsize, &p, "format7")) ! FATAL("huge string/format (%d chars) in printf %.30s..." \ ! " ran format() out of memory", n, t); ! snprintf(p, BUFSZ(p), fmt, t); ! break; ! } ! ! /* get here if string has utf-8 chars and fmt is not plain %s */ ! /* "%-w.ps", where -, w and .p are all optional */ ! /* '0' before the w is a flag character */ ! /* fmt points at % */ ! int ljust = 0, wid = 0, prec = n, pad = 0; ! char *f = fmt+1; ! if (f[0] == '-') { ! ljust = 1; ! f++; ! } ! // flags '0' and '+' are recognized but skipped ! if (f[0] == '0') { ! f++; ! if (f[0] == '+') ! f++; ! } ! if (f[0] == '+') { ! f++; ! if (f[0] == '0') ! f++; ! } ! if (isdigit((uschar)f[0])) { /* there is a wid */ ! wid = strtol(f, &f, 10); ! } ! if (f[0] == '.') { /* there is a .prec */ ! prec = strtol(++f, &f, 10); ! } ! if (prec > u8_strlen(t)) ! prec = u8_strlen(t); ! pad = wid>prec ? wid - prec : 0; // has to be >= 0 ! int i, k, n; ! ! if (ljust) { // print prec chars from t, then pad blanks ! n = u8_char2byte(t, prec); ! for (k = 0; k < n; k++) { ! //putchar(t[k]); ! *p++ = t[k]; ! } ! for (i = 0; i < pad; i++) { ! //printf(" "); ! *p++ = ' '; ! } ! } else { // print pad blanks, then prec chars from t ! for (i = 0; i < pad; i++) { ! //printf(" "); ! *p++ = ' '; ! } ! n = u8_char2byte(t, prec); ! for (k = 0; k < n; k++) { ! //putchar(t[k]); ! *p++ = t[k]; ! } ! } ! *p = 0; break; ! } ! ! 
case 'c': { ! /* ! * If a numeric value is given, awk should just turn ! * it into a character and print it: ! * BEGIN { printf("%c\n", 65) } ! * prints "A". ! * ! * But what if the numeric value is > 128 and ! * represents a valid Unicode code point?!? We do ! * our best to convert it back into UTF-8. If we ! * can't, we output the encoding of the Unicode ! * "invalid character", 0xFFFD. ! */ if (isnum(x)) { ! int charval = (int) getfval(x); ! ! if (charval != 0) { ! if (charval < 128) ! snprintf(p, BUFSZ(p), fmt, charval); ! else { ! // possible unicode character ! size_t count; ! char *bs = wide_char_to_byte_str(charval, &count); ! ! if (bs == NULL) { // invalid character ! // use unicode invalid character, 0xFFFD ! bs = "\357\277\275"; ! count = 3; ! } ! t = bs; ! n = count; ! goto format_percent_c; ! } ! } else { *p++ = '\0'; /* explicit null byte */ *p = '\0'; /* next output will start here */ } ! break; ! } ! t = getsval(x); ! n = u8_nextlen(t); ! format_percent_c: ! if (n < 2) { /* not utf8 */ snprintf(p, BUFSZ(p), fmt, getsval(x)[0]); + break; + } + + // utf8 character, almost same song and dance as for %s + int ljust = 0, wid = 0, prec = n, pad = 0; + char *f = fmt+1; + if (f[0] == '-') { + ljust = 1; + f++; + } + // flags '0' and '+' are recognized but skipped + if (f[0] == '0') { + f++; + if (f[0] == '+') + f++; + } + if (f[0] == '+') { + f++; + if (f[0] == '0') + f++; + } + if (isdigit((uschar)f[0])) { /* there is a wid */ + wid = strtol(f, &f, 10); + } + if (f[0] == '.') { /* there is a .prec */ + prec = strtol(++f, &f, 10); + } + if (prec > 1) // %c --> only one character + prec = 1; + pad = wid>prec ? 
wid - prec : 0; // has to be >= 0 + int i; + + if (ljust) { // print one char from t, then pad blanks + for (int i = 0; i < n; i++) + *p++ = t[i]; + for (i = 0; i < pad; i++) { + //printf(" "); + *p++ = ' '; + } + } else { // print pad blanks, then prec chars from t + for (i = 0; i < pad; i++) { + //printf(" "); + *p++ = ' '; + } + for (int i = 0; i < n; i++) + *p++ = t[i]; + } + *p = 0; break; + } default: FATAL("can't happen: bad conversion %c in format()", flag); } + tempfree(x); p += strlen(p); s++; *************** *** 1265,1271 **** char *origfs = NULL; int sep; char temp, num[50]; ! int n, tempstat, arg3type; double result; y = execute(a[0]); /* source string */ --- 1670,1676 ---- char *origfs = NULL; int sep; char temp, num[50]; ! int j, n, tempstat, arg3type; double result; y = execute(a[0]); /* source string */ *************** *** 1274,1293 **** FATAL("out of space in split"); tempfree(y); arg3type = ptoi(a[3]); ! if (a[2] == NULL) /* fs string */ fs = getsval(fsloc); ! else if (arg3type == STRING) { /* split(str,arr,"string") */ x = execute(a[2]); fs = origfs = strdup(getsval(x)); if (fs == NULL) FATAL("out of space in split"); tempfree(x); ! } else if (arg3type == REGEXPR) fs = "(regexpr)"; /* split(str,arr,/regexpr/) */ ! else FATAL("illegal type of split"); sep = *fs; ap = execute(a[1]); /* array name */ freesymtab(ap); DPRINTF("split: s=|%s|, a=%s, sep=|%s|\n", s, NN(ap->nval), fs); ap->tval &= ~STR; --- 1679,1700 ---- FATAL("out of space in split"); tempfree(y); arg3type = ptoi(a[3]); ! if (a[2] == NULL) { /* BUG: CSV should override implicit fs but not explicit */ fs = getsval(fsloc); ! } else if (arg3type == STRING) { /* split(str,arr,"string") */ x = execute(a[2]); fs = origfs = strdup(getsval(x)); if (fs == NULL) FATAL("out of space in split"); tempfree(x); ! } else if (arg3type == REGEXPR) { fs = "(regexpr)"; /* split(str,arr,/regexpr/) */ ! 
} else { FATAL("illegal type of split"); + } sep = *fs; ap = execute(a[1]); /* array name */ + /* BUG 7/26/22: this appears not to reset array: see C1/asplit */ freesymtab(ap); DPRINTF("split: s=|%s|, a=%s, sep=|%s|\n", s, NN(ap->nval), fs); ap->tval &= ~STR; *************** *** 1341,1347 **** setsymtab(num, s, 0.0, STR, (Array *) ap->sval); spdone: pfa = NULL; ! } else if (sep == ' ') { for (n = 0; ; ) { #define ISWS(c) ((c) == ' ' || (c) == '\t' || (c) == '\n') while (ISWS(*s)) --- 1748,1788 ---- setsymtab(num, s, 0.0, STR, (Array *) ap->sval); spdone: pfa = NULL; ! ! } else if (a[2] == NULL && CSV) { /* CSV only if no explicit separator */ ! char *newt = (char *) malloc(strlen(s)); /* for building new string; reuse for each field */ ! for (;;) { ! char *fr = newt; ! n++; ! if (*s == '"' ) { /* start of "..." */ ! for (s++ ; *s != '\0'; ) { ! if (*s == '"' && s[1] != '\0' && s[1] == '"') { ! s += 2; /* doubled quote */ ! *fr++ = '"'; ! } else if (*s == '"' && (s[1] == '\0' || s[1] == ',')) { ! s++; /* skip over closing quote */ ! break; ! } else { ! *fr++ = *s++; ! } ! } ! *fr++ = 0; ! } else { /* unquoted field */ ! while (*s != ',' && *s != '\0') ! *fr++ = *s++; ! *fr++ = 0; ! } ! snprintf(num, sizeof(num), "%d", n); ! if (is_number(newt, &result)) ! setsymtab(num, newt, result, STR|NUM, (Array *) ap->sval); ! else ! setsymtab(num, newt, 0.0, STR, (Array *) ap->sval); ! if (*s++ == '\0') ! break; ! } ! free(newt); ! ! } else if (!CSV && sep == ' ') { /* usual case: split on white space */ for (n = 0; ; ) { #define ISWS(c) ((c) == ' ' || (c) == '\t' || (c) == '\n') while (ISWS(*s)) *************** *** 1364,1382 **** if (*s != '\0') s++; } } else if (sep == 0) { /* new: split(s, a, "") => 1 char/elem */ ! for (n = 0; *s != '\0'; s++) { ! char buf[2]; n++; snprintf(num, sizeof(num), "%d", n); ! buf[0] = *s; ! 
buf[1] = '\0'; if (isdigit((uschar)buf[0])) setsymtab(num, buf, atof(buf), STR|NUM, (Array *) ap->sval); else setsymtab(num, buf, 0.0, STR, (Array *) ap->sval); } ! } else if (*s != '\0') { for (;;) { n++; t = s; --- 1805,1829 ---- if (*s != '\0') s++; } + } else if (sep == 0) { /* new: split(s, a, "") => 1 char/elem */ ! for (n = 0; *s != '\0'; s += u8_nextlen(s)) { ! char buf[10]; n++; snprintf(num, sizeof(num), "%d", n); ! ! for (j = 0; j < u8_nextlen(s); j++) { ! buf[j] = s[j]; ! } ! buf[j] = '\0'; ! if (isdigit((uschar)buf[0])) setsymtab(num, buf, atof(buf), STR|NUM, (Array *) ap->sval); else setsymtab(num, buf, 0.0, STR, (Array *) ap->sval); } ! ! } else if (*s != '\0') { /* some random single character */ for (;;) { n++; t = s; *************** *** 1535,1540 **** --- 1982,1988 ---- size_t n = 0; wchar_t wc; size_t sz = MB_CUR_MAX; + int unused; if (sz == 1) { buf = tostring(s); *************** *** 1554,1560 **** * doesn't work.) * Increment said variable to avoid a different warning. */ ! int unused = wctomb(NULL, L'\0'); unused++; ps = s; --- 2002,2008 ---- * doesn't work.) * Increment said variable to avoid a different warning. */ ! unused = wctomb(NULL, L'\0'); unused++; ps = s; *************** *** 1629,1635 **** if (isarr(x)) u = ((Array *) x->sval)->nelem; /* GROT. should be function*/ else ! u = strlen(getsval(x)); break; case FLOG: errno = 0; --- 2077,2083 ---- if (isarr(x)) u = ((Array *) x->sval)->nelem; /* GROT. should be function*/ else ! 
u = u8_strlen(getsval(x)); break; case FLOG: errno = 0; *************** *** 2401,2404 **** --- 2849,2890 ---- *pb_ptr = pb; *sptr_ptr = sptr; + } + + static char *wide_char_to_byte_str(int rune, size_t *outlen) + { + static char buf[5]; + int len; + + if (rune < 0 || rune > 0x10FFFF) + return NULL; + + memset(buf, 0, sizeof(buf)); + + len = 0; + if (rune <= 0x0000007F) { + buf[len++] = rune; + } else if (rune <= 0x000007FF) { + // 110xxxxx 10xxxxxx + buf[len++] = 0xC0 | (rune >> 6); + buf[len++] = 0x80 | (rune & 0x3F); + } else if (rune <= 0x0000FFFF) { + // 1110xxxx 10xxxxxx 10xxxxxx + buf[len++] = 0xE0 | (rune >> 12); + buf[len++] = 0x80 | ((rune >> 6) & 0x3F); + buf[len++] = 0x80 | (rune & 0x3F); + + } else { + // 0x00010000 - 0x10FFFF + // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx + buf[len++] = 0xF0 | (rune >> 18); + buf[len++] = 0x80 | ((rune >> 12) & 0x3F); + buf[len++] = 0x80 | ((rune >> 6) & 0x3F); + buf[len++] = 0x80 | (rune & 0x3F); + } + + *outlen = len; + buf[len++] = '\0'; + + return buf; }