=================================================================== RCS file: /cvsrepo/anoncvs/cvs/src/usr.bin/awk/b.c,v retrieving revision 1.37 retrieving revision 1.38 diff -c -r1.37 -r1.38 *** src/usr.bin/awk/b.c 2021/07/08 21:26:39 1.37 --- src/usr.bin/awk/b.c 2023/09/17 14:49:44 1.38 *************** *** 1,4 **** ! /* $OpenBSD: b.c,v 1.37 2021/07/08 21:26:39 millert Exp $ */ /**************************************************************** Copyright (C) Lucent Technologies 1997 All Rights Reserved --- 1,4 ---- ! /* $OpenBSD: b.c,v 1.38 2023/09/17 14:49:44 millert Exp $ */ /**************************************************************** Copyright (C) Lucent Technologies 1997 All Rights Reserved *************** *** 81,86 **** --- 81,121 ---- fa *fatab[NFA]; int nfatab = 0; /* entries in fatab */ + + /* utf-8 mechanism: + + For most of Awk, utf-8 strings just "work", since they look like + null-terminated sequences of 8-bit bytes. + + Functions like length(), index(), and substr() have to operate + in units of utf-8 characters. The u8_* functions in run.c + handle this. + + Regular expressions are more complicated, since the basic + mechanism of the goto table used 8-bit byte indices into the + gototab entries to compute the next state. Unicode is a lot + bigger, so the gototab entries are now structs with a character + and a next state, and there is a linear search of the characters + to find the state. (Yes, this is slower, by a significant + amount. Tough.) + + Throughout the RE mechanism in b.c, utf-8 characters are + converted to their utf-32 value. This mostly shows up in + cclenter, which expands character class ranges like a-z and now + alpha-omega. The size of a gototab array is still about 256. + This should be dynamic, but for now things work ok for a single + code page of Unicode, which is the most likely case. + + The code changes are localized in run.c and b.c. I have added a + handful of functions to somewhat better hide the implementation, + but a lot more could be done. + + */ + + static int get_gototab(fa*, int, int); + static int set_gototab(fa*, int, int, int); + extern int u8_rune(int *, const uschar *); + static int * intalloc(size_t n, const char *f) { *************** *** 113,119 **** static void resize_state(fa *f, int state) { ! unsigned int **p; uschar *p2; int **p3; int i, new_count; --- 148,154 ---- static void resize_state(fa *f, int state) { ! gtt **p; uschar *p2; int **p3; int i, new_count; *************** *** 123,129 **** new_count = state + 10; /* needs to be tuned */ ! p = (unsigned int **) reallocarray(f->gototab, new_count, sizeof(f->gototab[0])); if (p == NULL) goto out; f->gototab = p; --- 158,164 ---- new_count = state + 10; /* needs to be tuned */ ! p = (gtt **) reallocarray(f->gototab, new_count, sizeof(f->gototab[0])); if (p == NULL) goto out; f->gototab = p; *************** *** 139,150 **** f->posns = p3; for (i = f->state_count; i < new_count; ++i) { ! f->gototab[i] = (unsigned int *) calloc(NCHARS, sizeof(**f->gototab)); if (f->gototab[i] == NULL) goto out; f->out[i] = 0; f->posns[i] = NULL; } f->state_count = new_count; return; out: --- 174,186 ---- f->posns = p3; for (i = f->state_count; i < new_count; ++i) { ! f->gototab[i] = (gtt *) calloc(NCHARS, sizeof(**f->gototab)); if (f->gototab[i] == NULL) goto out; f->out[i] = 0; f->posns[i] = NULL; } + f->gototab_len = NCHARS; /* should be variable, growable */ f->state_count = new_count; return; out: *************** *** 239,245 **** if ((f->posns[2])[1] == f->accept) f->out[2] = 1; for (i = 0; i < NCHARS; i++) ! f->gototab[2][i] = 0; f->curstat = cgoto(f, 2, HAT); if (anchor) { *f->posns[2] = k-1; /* leave out position 0 */ --- 275,281 ---- if ((f->posns[2])[1] == f->accept) f->out[2] = 1; for (i = 0; i < NCHARS; i++) ! set_gototab(f, 2, 0, 0); /* f->gototab[2][i] = 0; */ f->curstat = cgoto(f, 2, HAT); if (anchor) { *f->posns[2] = k-1; /* leave out position 0 */ *************** *** 308,320 **** /* in the parsing of regular expressions, metacharacters like . have */ /* to be seen literally; \056 is not a metacharacter. */ ! int hexstr(const uschar **pp) /* find and eval hex string at pp, return new p */ { /* only pick up one 8-bit byte (2 chars) */ const uschar *p; int n = 0; int i; ! for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) { if (isdigit(*p)) n = 16 * n + *p - '0'; else if (*p >= 'a' && *p <= 'f') --- 344,356 ---- /* in the parsing of regular expressions, metacharacters like . have */ /* to be seen literally; \056 is not a metacharacter. */ ! int hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return new p */ { /* only pick up one 8-bit byte (2 chars) */ const uschar *p; int n = 0; int i; ! for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) { if (isdigit(*p)) n = 16 * n + *p - '0'; else if (*p >= 'a' && *p <= 'f') *************** *** 334,357 **** const uschar *p = *pp; int c; ! if ((c = *p++) == 't') c = '\t'; ! else if (c == 'n') c = '\n'; ! else if (c == 'f') c = '\f'; ! else if (c == 'r') c = '\r'; ! else if (c == 'b') c = '\b'; ! else if (c == 'v') c = '\v'; ! else if (c == 'a') c = '\a'; ! else if (c == '\\') c = '\\'; ! else if (c == 'x') { /* hexadecimal goo follows */ ! c = hexstr(&p); /* this adds a null if number is invalid */ } else if (isoctdigit(c)) { /* \d \dd \ddd */ int n = c - '0'; if (isoctdigit(*p)) { --- 370,397 ---- const uschar *p = *pp; int c; ! /* BUG: should advance by utf-8 char even if makes no sense */ ! ! if ((c = *p++) == 't') { c = '\t'; ! } else if (c == 'n') { c = '\n'; ! } else if (c == 'f') { c = '\f'; ! } else if (c == 'r') { c = '\r'; ! } else if (c == 'b') { c = '\b'; ! } else if (c == 'v') { c = '\v'; ! } else if (c == 'a') { c = '\a'; ! } else if (c == '\\') { c = '\\'; ! } else if (c == 'x') { /* 2 hex digits follow */ ! c = hexstr(&p, 2); /* this adds a null if number is invalid */ ! } else if (c == 'u') { /* unicode char number up to 8 hex digits */ ! c = hexstr(&p, 8); } else if (isoctdigit(c)) { /* \d \dd \ddd */ int n = c - '0'; if (isoctdigit(*p)) { *************** *** 366,415 **** return c; } ! char *cclenter(const char *argp) /* add a character class */ { int i, c, c2; ! const uschar *op, *p = (const uschar *) argp; ! uschar *bp; ! static uschar *buf = NULL; static int bufsz = 100; ! op = p; ! if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL) FATAL("out of space for character class [%.10s...] 1", p); bp = buf; ! for (i = 0; (c = *p++) != 0; ) { if (c == '\\') { c = quoted(&p); } else if (c == '-' && i > 0 && bp[-1] != 0) { if (*p != 0) { c = bp[-1]; ! c2 = *p++; if (c2 == '\\') ! c2 = quoted(&p); if (c > c2) { /* empty; ignore */ bp--; i--; continue; } while (c < c2) { ! if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1")) ! FATAL("out of space for character class [%.10s...] 2", p); *bp++ = ++c; i++; } continue; } } ! if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2")) ! FATAL("out of space for character class [%.10s...] 3", p); *bp++ = c; i++; } *bp = 0; ! DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); ! xfree(op); ! return (char *) tostring((char *) buf); } void overflo(const char *s) --- 406,472 ---- return c; } ! int *cclenter(const char *argp) /* add a character class */ { int i, c, c2; ! int n; ! const uschar *p = (const uschar *) argp; ! int *bp, *retp; ! static int *buf = NULL; static int bufsz = 100; ! if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL) FATAL("out of space for character class [%.10s...] 1", p); bp = buf; ! for (i = 0; *p != 0; ) { ! n = u8_rune(&c, p); ! p += n; if (c == '\\') { c = quoted(&p); } else if (c == '-' && i > 0 && bp[-1] != 0) { if (*p != 0) { c = bp[-1]; ! /* c2 = *p++; */ ! n = u8_rune(&c2, p); ! p += n; if (c2 == '\\') ! c2 = quoted(&p); /* BUG: sets p, has to be u8 size */ if (c > c2) { /* empty; ignore */ bp--; i--; continue; } while (c < c2) { ! if (i >= bufsz) { ! buf = (int *) reallocarray(buf, bufsz, sizeof(int) * 2); ! if (buf == NULL) ! FATAL("out of space for character class [%.10s...] 2", p); ! bufsz *= 2; ! bp = buf + i; ! } *bp++ = ++c; i++; } continue; } } ! if (i >= bufsz) { ! buf = (int *) reallocarray(buf, bufsz, sizeof(int) * 2); ! if (buf == NULL) ! FATAL("out of space for character class [%.10s...] 2", p); ! bufsz *= 2; ! bp = buf + i; ! } *bp++ = c; i++; } *bp = 0; ! /* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */ ! /* xfree(op); BUG: what are we freeing here? */ ! retp = (int *) calloc(bp-buf+1, sizeof(int)); ! for (i = 0; i < bp-buf+1; i++) ! retp[i] = buf[i]; ! return retp; } void overflo(const char *s) *************** *** 532,540 **** } } ! int member(int c, const char *sarg) /* is c in s? */ { ! const uschar *s = (const uschar *) sarg; while (*s) if (c == *s++) --- 589,597 ---- } } ! int member(int c, int *sarg) /* is c in s? */ { ! int *s = (int *) sarg; while (*s) if (c == *s++) *************** *** 542,552 **** --- 599,639 ---- return(0); } + static int get_gototab(fa *f, int state, int ch) /* hide gototab inplementation */ + { + int i; + for (i = 0; i < f->gototab_len; i++) { + if (f->gototab[state][i].ch == 0) + break; + if (f->gototab[state][i].ch == ch) + return f->gototab[state][i].state; + } + return 0; + } + + static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab inplementation */ + { + int i; + for (i = 0; i < f->gototab_len; i++) { + if (f->gototab[state][i].ch == 0 || f->gototab[state][i].ch == ch) { + f->gototab[state][i].ch = ch; + f->gototab[state][i].state = val; + return val; + } + } + overflo(__func__); + return val; /* not used anywhere at the moment */ + } + int match(fa *f, const char *p0) /* shortest match ? */ { int s, ns; + int n; + int rune; const uschar *p = (const uschar *) p0; + /* return pmatch(f, p0); does it matter whether longest or shortest? */ + s = f->initstat; assert (s < f->state_count); *************** *** 554,572 **** return(1); do { /* assert(*p < NCHARS); */ ! if ((ns = f->gototab[s][*p]) != 0) s = ns; else ! s = cgoto(f, s, *p); if (f->out[s]) return(1); ! } while (*p++ != 0); return(0); } int pmatch(fa *f, const char *p0) /* longest match, for sub */ { int s, ns; const uschar *p = (const uschar *) p0; const uschar *q; --- 641,665 ---- return(1); do { /* assert(*p < NCHARS); */ ! n = u8_rune(&rune, p); ! if ((ns = get_gototab(f, s, rune)) != 0) s = ns; else ! s = cgoto(f, s, rune); if (f->out[s]) return(1); ! if (*p == 0) ! break; ! p += n; ! } while (1); /* was *p++ != 0 */ return(0); } int pmatch(fa *f, const char *p0) /* longest match, for sub */ { int s, ns; + int n; + int rune; const uschar *p = (const uschar *) p0; const uschar *q; *************** *** 581,590 **** if (f->out[s]) /* final state */ patlen = q-p; /* assert(*q < NCHARS); */ ! if ((ns = f->gototab[s][*q]) != 0) s = ns; else ! s = cgoto(f, s, *q); assert(s < f->state_count); --- 674,684 ---- if (f->out[s]) /* final state */ patlen = q-p; /* assert(*q < NCHARS); */ ! n = u8_rune(&rune, q); ! if ((ns = get_gototab(f, s, rune)) != 0) s = ns; else ! s = cgoto(f, s, rune); assert(s < f->state_count); *************** *** 596,602 **** else goto nextin; /* no match */ } ! } while (*q++ != 0); if (f->out[s]) patlen = q-p-1; /* don't count $ */ if (patlen >= 0) { --- 690,700 ---- else goto nextin; /* no match */ } ! if (*q == 0) ! break; ! q += n; ! } while (1); ! q++; /* was *q++ */ if (f->out[s]) patlen = q-p-1; /* don't count $ */ if (patlen >= 0) { *************** *** 605,617 **** } nextin: s = 2; ! } while (*p++); return (0); } int nematch(fa *f, const char *p0) /* non-empty match, for sub */ { int s, ns; const uschar *p = (const uschar *) p0; const uschar *q; --- 703,721 ---- } nextin: s = 2; ! if (*p == 0) ! break; ! n = u8_rune(&rune, p); ! p += n; ! } while (1); /* was *p++ */ return (0); } int nematch(fa *f, const char *p0) /* non-empty match, for sub */ { int s, ns; + int n; + int rune; const uschar *p = (const uschar *) p0; const uschar *q; *************** *** 626,635 **** if (f->out[s]) /* final state */ patlen = q-p; /* assert(*q < NCHARS); */ ! if ((ns = f->gototab[s][*q]) != 0) s = ns; else ! s = cgoto(f, s, *q); if (s == 1) { /* no transition */ if (patlen > 0) { patbeg = (const char *) p; --- 730,740 ---- if (f->out[s]) /* final state */ patlen = q-p; /* assert(*q < NCHARS); */ ! n = u8_rune(&rune, q); ! if ((ns = get_gototab(f, s, rune)) != 0) s = ns; else ! s = cgoto(f, s, rune); if (s == 1) { /* no transition */ if (patlen > 0) { patbeg = (const char *) p; *************** *** 637,643 **** } else goto nnextin; /* no nonempty match */ } ! } while (*q++ != 0); if (f->out[s]) patlen = q-p-1; /* don't count $ */ if (patlen > 0 ) { --- 742,752 ---- } else goto nnextin; /* no nonempty match */ } ! if (*q == 0) ! break; ! q += n; ! } while (1); ! q++; if (f->out[s]) patlen = q-p-1; /* don't count $ */ if (patlen > 0 ) { *************** *** 651,657 **** --- 760,795 ---- return (0); } + static int getrune(FILE *fp, char **pbuf, int *pbufsize, int quantum, + int *curpos, int *lastpos) + { + int c = 0; + char *buf = *pbuf; + static const int max_bytes = 4; // max multiple bytes in UTF-8 is 4 + int i, rune; + uschar private_buf[max_bytes + 1]; + for (i = 0; i <= max_bytes; i++) { + if (++*curpos == *lastpos) { + if (*lastpos == *pbufsize) + if (!adjbuf((char **) pbuf, pbufsize, *pbufsize+1, quantum, 0, "getrune")) + FATAL("stream '%.30s...' too long", buf); + buf[(*lastpos)++] = (c = getc(fp)) != EOF ? c : 0; + private_buf[i] = c; + } + if (c == 0 || c < 128 || (c >> 6) == 4) { // 10xxxxxx starts a new character + ungetc(c, fp); + private_buf[i] = 0; + break; + } + } + + u8_rune(& rune, private_buf); + + return rune; + } + + /* * NAME * fnematch *************** *** 672,677 **** --- 810,816 ---- char *buf = *pbuf; int bufsize = *pbufsize; int c, i, j, k, ns, s; + int rune; s = pfa->initstat; patlen = 0; *************** *** 695,706 **** buf[k++] = (c = getc(f)) != EOF ? c : 0; } c = (uschar)buf[j]; ! /* assert(c < NCHARS); */ ! if ((ns = pfa->gototab[s][c]) != 0) s = ns; else ! s = cgoto(pfa, s, c); if (pfa->out[s]) { /* final state */ patlen = j - i + 1; --- 834,852 ---- buf[k++] = (c = getc(f)) != EOF ? c : 0; } c = (uschar)buf[j]; ! if (c < 128) ! rune = c; ! else { ! j--; ! k--; ! ungetc(c, f); ! rune = getrune(f, &buf, &bufsize, quantum, &j, &k); ! } ! if ((ns = get_gototab(pfa, s, rune)) != 0) s = ns; else ! s = cgoto(pfa, s, rune); if (pfa->out[s]) { /* final state */ patlen = j - i + 1; *************** *** 1026,1031 **** --- 1172,1179 ---- return 0; } + extern int u8_rune(int *, const uschar *); /* run.c; should be in header file */ + int relex(void) /* lexical analyzer for reparse */ { int c, n; *************** *** 1043,1048 **** --- 1191,1202 ---- rescan: starttok = prestr; + if ((n = u8_rune(&rlxval, prestr)) > 1) { + prestr += n; + starttok = prestr; + return CHAR; + } + switch (c = *prestr++) { case '|': return OR; case '*': return STAR; *************** *** 1080,1089 **** } else cflag = 0; ! n = 2 * strlen((const char *) prestr)+1; if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1")) FATAL("out of space for reg expr %.10s...", lastre); for (; ; ) { if ((c = *prestr++) == '\\') { *bp++ = '\\'; if ((c = *prestr++) == '\0') --- 1234,1248 ---- } else cflag = 0; ! n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2. what value? */ if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1")) FATAL("out of space for reg expr %.10s...", lastre); for (; ; ) { + if ((n = u8_rune(&rlxval, prestr)) > 1) { + for (i = 0; i < n; i++) + *bp++ = *prestr++; + continue; + } if ((c = *prestr++) == '\\') { *bp++ = '\\'; if ((c = *prestr++) == '\0') *************** *** 1250,1256 **** int *p, *q; int i, j, k; ! assert(c == HAT || c < NCHARS); while (f->accept >= maxsetvec) { /* guessing here! */ resizesetvec(__func__); } --- 1409,1415 ---- int *p, *q; int i, j, k; ! /* assert(c == HAT || c < NCHARS); BUG: seg fault if disable test */ while (f->accept >= maxsetvec) { /* guessing here! */ resizesetvec(__func__); } *************** *** 1266,1273 **** || (k == DOT && c != 0 && c != HAT) || (k == ALL && c != 0) || (k == EMPTYRE && c != 0) ! || (k == CCL && member(c, (char *) f->re[p[i]].lval.up)) ! || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) { q = f->re[p[i]].lfollow; for (j = 1; j <= *q; j++) { if (q[j] >= maxsetvec) { --- 1425,1432 ---- || (k == DOT && c != 0 && c != HAT) || (k == ALL && c != 0) || (k == EMPTYRE && c != 0) ! || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp)) ! || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) { q = f->re[p[i]].lfollow; for (j = 1; j <= *q; j++) { if (q[j] >= maxsetvec) { *************** *** 1299,1305 **** goto different; /* setvec is state i */ if (c != HAT) ! f->gototab[s][c] = i; return i; different:; } --- 1458,1464 ---- goto different; /* setvec is state i */ if (c != HAT) ! set_gototab(f, s, c, i); return i; different:; } *************** *** 1308,1320 **** ++(f->curstat); resize_state(f, f->curstat); for (i = 0; i < NCHARS; i++) ! f->gototab[f->curstat][i] = 0; xfree(f->posns[f->curstat]); p = intalloc(setcnt + 1, __func__); f->posns[f->curstat] = p; if (c != HAT) ! f->gototab[s][c] = f->curstat; for (i = 0; i <= setcnt; i++) p[i] = tmpset[i]; if (setvec[f->accept]) --- 1467,1479 ---- ++(f->curstat); resize_state(f, f->curstat); for (i = 0; i < NCHARS; i++) ! set_gototab(f, f->curstat, 0, 0); xfree(f->posns[f->curstat]); p = intalloc(setcnt + 1, __func__); f->posns[f->curstat] = p; if (c != HAT) ! set_gototab(f, s, c, f->curstat); for (i = 0; i <= setcnt; i++) p[i] = tmpset[i]; if (setvec[f->accept])