=================================================================== RCS file: /cvsrepo/anoncvs/cvs/src/usr.bin/awk/b.c,v retrieving revision 1.37 retrieving revision 1.38 diff -u -r1.37 -r1.38 --- src/usr.bin/awk/b.c 2021/07/08 21:26:39 1.37 +++ src/usr.bin/awk/b.c 2023/09/17 14:49:44 1.38 @@ -1,4 +1,4 @@ -/* $OpenBSD: b.c,v 1.37 2021/07/08 21:26:39 millert Exp $ */ +/* $OpenBSD: b.c,v 1.38 2023/09/17 14:49:44 millert Exp $ */ /**************************************************************** Copyright (C) Lucent Technologies 1997 All Rights Reserved @@ -81,6 +81,41 @@ fa *fatab[NFA]; int nfatab = 0; /* entries in fatab */ + +/* utf-8 mechanism: + + For most of Awk, utf-8 strings just "work", since they look like + null-terminated sequences of 8-bit bytes. + + Functions like length(), index(), and substr() have to operate + in units of utf-8 characters. The u8_* functions in run.c + handle this. + + Regular expressions are more complicated, since the basic + mechanism of the goto table used 8-bit byte indices into the + gototab entries to compute the next state. Unicode is a lot + bigger, so the gototab entries are now structs with a character + and a next state, and there is a linear search of the characters + to find the state. (Yes, this is slower, by a significant + amount. Tough.) + + Throughout the RE mechanism in b.c, utf-8 characters are + converted to their utf-32 value. This mostly shows up in + cclenter, which expands character class ranges like a-z and now + alpha-omega. The size of a gototab array is still about 256. + This should be dynamic, but for now things work ok for a single + code page of Unicode, which is the most likely case. + + The code changes are localized in run.c and b.c. I have added a + handful of functions to somewhat better hide the implementation, + but a lot more could be done. + + */ + +static int get_gototab(fa*, int, int); +static int set_gototab(fa*, int, int, int); +extern int u8_rune(int *, const uschar *); + static int * intalloc(size_t n, const char *f) { @@ -113,7 +148,7 @@ static void resize_state(fa *f, int state) { - unsigned int **p; + gtt **p; uschar *p2; int **p3; int i, new_count; @@ -123,7 +158,7 @@ new_count = state + 10; /* needs to be tuned */ - p = (unsigned int **) reallocarray(f->gototab, new_count, sizeof(f->gototab[0])); + p = (gtt **) reallocarray(f->gototab, new_count, sizeof(f->gototab[0])); if (p == NULL) goto out; f->gototab = p; @@ -139,12 +174,13 @@ f->posns = p3; for (i = f->state_count; i < new_count; ++i) { - f->gototab[i] = (unsigned int *) calloc(NCHARS, sizeof(**f->gototab)); + f->gototab[i] = (gtt *) calloc(NCHARS, sizeof(**f->gototab)); if (f->gototab[i] == NULL) goto out; f->out[i] = 0; f->posns[i] = NULL; } + f->gototab_len = NCHARS; /* should be variable, growable */ f->state_count = new_count; return; out: @@ -239,7 +275,7 @@ if ((f->posns[2])[1] == f->accept) f->out[2] = 1; for (i = 0; i < NCHARS; i++) - f->gototab[2][i] = 0; + set_gototab(f, 2, 0, 0); /* f->gototab[2][i] = 0; */ f->curstat = cgoto(f, 2, HAT); if (anchor) { *f->posns[2] = k-1; /* leave out position 0 */ @@ -308,13 +344,13 @@ /* in the parsing of regular expressions, metacharacters like . have */ /* to be seen literally; \056 is not a metacharacter. */ -int hexstr(const uschar **pp) /* find and eval hex string at pp, return new p */ +int hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return new p */ { /* only pick up one 8-bit byte (2 chars) */ const uschar *p; int n = 0; int i; - for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) { + for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) { if (isdigit(*p)) n = 16 * n + *p - '0'; else if (*p >= 'a' && *p <= 'f') @@ -334,24 +370,28 @@ const uschar *p = *pp; int c; - if ((c = *p++) == 't') +/* BUG: should advance by utf-8 char even if makes no sense */ + + if ((c = *p++) == 't') { c = '\t'; - else if (c == 'n') + } else if (c == 'n') { c = '\n'; - else if (c == 'f') + } else if (c == 'f') { c = '\f'; - else if (c == 'r') + } else if (c == 'r') { c = '\r'; - else if (c == 'b') + } else if (c == 'b') { c = '\b'; - else if (c == 'v') + } else if (c == 'v') { c = '\v'; - else if (c == 'a') + } else if (c == 'a') { c = '\a'; - else if (c == '\\') + } else if (c == '\\') { c = '\\'; - else if (c == 'x') { /* hexadecimal goo follows */ - c = hexstr(&p); /* this adds a null if number is invalid */ + } else if (c == 'x') { /* 2 hex digits follow */ + c = hexstr(&p, 2); /* this adds a null if number is invalid */ + } else if (c == 'u') { /* unicode char number up to 8 hex digits */ + c = hexstr(&p, 8); } else if (isoctdigit(c)) { /* \d \dd \ddd */ int n = c - '0'; if (isoctdigit(*p)) { @@ -366,50 +406,67 @@ return c; } -char *cclenter(const char *argp) /* add a character class */ +int *cclenter(const char *argp) /* add a character class */ { int i, c, c2; - const uschar *op, *p = (const uschar *) argp; - uschar *bp; - static uschar *buf = NULL; + int n; + const uschar *p = (const uschar *) argp; + int *bp, *retp; + static int *buf = NULL; static int bufsz = 100; - op = p; - if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL) + if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL) FATAL("out of space for character class [%.10s...] 1", p); bp = buf; - for (i = 0; (c = *p++) != 0; ) { + for (i = 0; *p != 0; ) { + n = u8_rune(&c, p); + p += n; if (c == '\\') { c = quoted(&p); } else if (c == '-' && i > 0 && bp[-1] != 0) { if (*p != 0) { c = bp[-1]; - c2 = *p++; + /* c2 = *p++; */ + n = u8_rune(&c2, p); + p += n; if (c2 == '\\') - c2 = quoted(&p); + c2 = quoted(&p); /* BUG: sets p, has to be u8 size */ if (c > c2) { /* empty; ignore */ bp--; i--; continue; } while (c < c2) { - if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1")) - FATAL("out of space for character class [%.10s...] 2", p); + if (i >= bufsz) { + buf = (int *) reallocarray(buf, bufsz, sizeof(int) * 2); + if (buf == NULL) + FATAL("out of space for character class [%.10s...] 2", p); + bufsz *= 2; + bp = buf + i; + } *bp++ = ++c; i++; } continue; } } - if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2")) - FATAL("out of space for character class [%.10s...] 3", p); + if (i >= bufsz) { + buf = (int *) reallocarray(buf, bufsz, sizeof(int) * 2); + if (buf == NULL) + FATAL("out of space for character class [%.10s...] 2", p); + bufsz *= 2; + bp = buf + i; + } *bp++ = c; i++; } *bp = 0; - DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); - xfree(op); - return (char *) tostring((char *) buf); + /* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */ + /* xfree(op); BUG: what are we freeing here? */ + retp = (int *) calloc(bp-buf+1, sizeof(int)); + for (i = 0; i < bp-buf+1; i++) + retp[i] = buf[i]; + return retp; } void overflo(const char *s) @@ -532,9 +589,9 @@ } } -int member(int c, const char *sarg) /* is c in s? */ +int member(int c, int *sarg) /* is c in s? */ { - const uschar *s = (const uschar *) sarg; + int *s = (int *) sarg; while (*s) if (c == *s++) @@ -542,11 +599,41 @@ return(0); } +static int get_gototab(fa *f, int state, int ch) /* hide gototab inplementation */ +{ + int i; + for (i = 0; i < f->gototab_len; i++) { + if (f->gototab[state][i].ch == 0) + break; + if (f->gototab[state][i].ch == ch) + return f->gototab[state][i].state; + } + return 0; +} + +static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplementation */ +{ + int i; + for (i = 0; i < f->gototab_len; i++) { + if (f->gototab[state][i].ch == 0 || f->gototab[state][i].ch == ch) { + f->gototab[state][i].ch = ch; + f->gototab[state][i].state = val; + return val; + } + } + overflo(__func__); + return val; /* not used anywhere at the moment */ +} + int match(fa *f, const char *p0) /* shortest match ? */ { int s, ns; + int n; + int rune; const uschar *p = (const uschar *) p0; + /* return pmatch(f, p0); does it matter whether longest or shortest? */ + s = f->initstat; assert (s < f->state_count); @@ -554,19 +641,25 @@ return(1); do { /* assert(*p < NCHARS); */ - if ((ns = f->gototab[s][*p]) != 0) + n = u8_rune(&rune, p); + if ((ns = get_gototab(f, s, rune)) != 0) s = ns; else - s = cgoto(f, s, *p); + s = cgoto(f, s, rune); if (f->out[s]) return(1); - } while (*p++ != 0); + if (*p == 0) + break; + p += n; + } while (1); /* was *p++ != 0 */ return(0); } int pmatch(fa *f, const char *p0) /* longest match, for sub */ { int s, ns; + int n; + int rune; const uschar *p = (const uschar *) p0; const uschar *q; @@ -581,10 +674,11 @@ if (f->out[s]) /* final state */ patlen = q-p; /* assert(*q < NCHARS); */ - if ((ns = f->gototab[s][*q]) != 0) + n = u8_rune(&rune, q); + if ((ns = get_gototab(f, s, rune)) != 0) s = ns; else - s = cgoto(f, s, *q); + s = cgoto(f, s, rune); assert(s < f->state_count); @@ -596,7 +690,11 @@ else goto nextin; /* no match */ } - } while (*q++ != 0); + if (*q == 0) + break; + q += n; + } while (1); + q++; /* was *q++ */ if (f->out[s]) patlen = q-p-1; /* don't count $ */ if (patlen >= 0) { @@ -605,13 +703,19 @@ } nextin: s = 2; - } while (*p++); + if (*p == 0) + break; + n = u8_rune(&rune, p); + p += n; + } while (1); /* was *p++ */ return (0); } int nematch(fa *f, const char *p0) /* non-empty match, for sub */ { int s, ns; + int n; + int rune; const uschar *p = (const uschar *) p0; const uschar *q; @@ -626,10 +730,11 @@ if (f->out[s]) /* final state */ patlen = q-p; /* assert(*q < NCHARS); */ - if ((ns = f->gototab[s][*q]) != 0) + n = u8_rune(&rune, q); + if ((ns = get_gototab(f, s, rune)) != 0) s = ns; else - s = cgoto(f, s, *q); + s = cgoto(f, s, rune); if (s == 1) { /* no transition */ if (patlen > 0) { patbeg = (const char *) p; @@ -637,7 +742,11 @@ } else goto nnextin; /* no nonempty match */ } - } while (*q++ != 0); + if (*q == 0) + break; + q += n; + } while (1); + q++; if (f->out[s]) patlen = q-p-1; /* don't count $ */ if (patlen > 0 ) { @@ -651,7 +760,36 @@ return (0); } +static int getrune(FILE *fp, char **pbuf, int *pbufsize, int quantum, + int *curpos, int *lastpos) +{ + int c = 0; + char *buf = *pbuf; + static const int max_bytes = 4; // max multiple bytes in UTF-8 is 4 + int i, rune; + uschar private_buf[max_bytes + 1]; + for (i = 0; i <= max_bytes; i++) { + if (++*curpos == *lastpos) { + if (*lastpos == *pbufsize) + if (!adjbuf((char **) pbuf, pbufsize, *pbufsize+1, quantum, 0, "getrune")) + FATAL("stream '%.30s...' too long", buf); + buf[(*lastpos)++] = (c = getc(fp)) != EOF ? c : 0; + private_buf[i] = c; + } + if (c == 0 || c < 128 || (c >> 6) == 4) { // 10xxxxxx starts a new character + ungetc(c, fp); + private_buf[i] = 0; + break; + } + } + + u8_rune(& rune, private_buf); + + return rune; +} + + /* * NAME * fnematch @@ -672,6 +810,7 @@ char *buf = *pbuf; int bufsize = *pbufsize; int c, i, j, k, ns, s; + int rune; s = pfa->initstat; patlen = 0; @@ -695,12 +834,19 @@ buf[k++] = (c = getc(f)) != EOF ? c : 0; } c = (uschar)buf[j]; - /* assert(c < NCHARS); */ + if (c < 128) + rune = c; + else { + j--; + k--; + ungetc(c, f); + rune = getrune(f, &buf, &bufsize, quantum, &j, &k); + } - if ((ns = pfa->gototab[s][c]) != 0) + if ((ns = get_gototab(pfa, s, rune)) != 0) s = ns; else - s = cgoto(pfa, s, c); + s = cgoto(pfa, s, rune); if (pfa->out[s]) { /* final state */ patlen = j - i + 1; @@ -1026,6 +1172,8 @@ return 0; } +extern int u8_rune(int *, const uschar *); /* run.c; should be in header file */ + int relex(void) /* lexical analyzer for reparse */ { int c, n; @@ -1043,6 +1191,12 @@ rescan: starttok = prestr; + if ((n = u8_rune(&rlxval, prestr)) > 1) { + prestr += n; + starttok = prestr; + return CHAR; + } + switch (c = *prestr++) { case '|': return OR; case '*': return STAR; @@ -1080,10 +1234,15 @@ } else cflag = 0; - n = 2 * strlen((const char *) prestr)+1; + n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2. what value? */ if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1")) FATAL("out of space for reg expr %.10s...", lastre); for (; ; ) { + if ((n = u8_rune(&rlxval, prestr)) > 1) { + for (i = 0; i < n; i++) + *bp++ = *prestr++; + continue; + } if ((c = *prestr++) == '\\') { *bp++ = '\\'; if ((c = *prestr++) == '\0') @@ -1250,7 +1409,7 @@ int *p, *q; int i, j, k; - assert(c == HAT || c < NCHARS); + /* assert(c == HAT || c < NCHARS); BUG: seg fault if disable test */ while (f->accept >= maxsetvec) { /* guessing here! */ resizesetvec(__func__); } @@ -1266,8 +1425,8 @@ || (k == DOT && c != 0 && c != HAT) || (k == ALL && c != 0) || (k == EMPTYRE && c != 0) - || (k == CCL && member(c, (char *) f->re[p[i]].lval.up)) - || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) { + || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp)) + || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) { q = f->re[p[i]].lfollow; for (j = 1; j <= *q; j++) { if (q[j] >= maxsetvec) { @@ -1299,7 +1458,7 @@ goto different; /* setvec is state i */ if (c != HAT) - f->gototab[s][c] = i; + set_gototab(f, s, c, i); return i; different:; } @@ -1308,13 +1467,13 @@ ++(f->curstat); resize_state(f, f->curstat); for (i = 0; i < NCHARS; i++) - f->gototab[f->curstat][i] = 0; + set_gototab(f, f->curstat, 0, 0); xfree(f->posns[f->curstat]); p = intalloc(setcnt + 1, __func__); f->posns[f->curstat] = p; if (c != HAT) - f->gototab[s][c] = f->curstat; + set_gototab(f, s, c, f->curstat); for (i = 0; i <= setcnt; i++) p[i] = tmpset[i]; if (setvec[f->accept])