=================================================================== RCS file: /cvsrepo/anoncvs/cvs/src/usr.bin/awk/b.c,v retrieving revision 1.47 retrieving revision 1.48 diff -u -r1.47 -r1.48 --- src/usr.bin/awk/b.c 2023/11/15 18:56:53 1.47 +++ src/usr.bin/awk/b.c 2023/11/22 01:01:21 1.48 @@ -1,4 +1,4 @@ -/* $OpenBSD: b.c,v 1.47 2023/11/15 18:56:53 millert Exp $ */ +/* $OpenBSD: b.c,v 1.48 2023/11/22 01:01:21 millert Exp $ */ /**************************************************************** Copyright (C) Lucent Technologies 1997 All Rights Reserved @@ -770,59 +770,6 @@ #define MAX_UTF_BYTES 4 // UTF-8 is up to 4 bytes long -// Read one rune at a time from the given FILE*. Return both -// the bytes and the actual rune. - -struct runedata { - int rune; - size_t len; - char bytes[6]; -}; - -struct runedata getrune(FILE *fp) -{ - struct runedata result; - int c, i, next; - - memset(&result, 0, sizeof(result)); - - c = getc(fp); - if (c == EOF) - return result; // result.rune == 0 --> EOF - else if (c < 128 || awk_mb_cur_max == 1) { - result.bytes[0] = c; - result.len = 1; - result.rune = c; - - return result; - } - - // need to get bytes and fill things in - result.bytes[0] = c; - result.len = 1; - - next = 1; - for (i = 1; i < MAX_UTF_BYTES; i++) { - c = getc(fp); - if (c == EOF) - break; - result.bytes[next++] = c; - result.len++; - } - - // put back any extra input bytes - int actual_len = u8_nextlen(result.bytes); - while (result.len > actual_len) { - ungetc(result.bytes[--result.len], fp); - } - - result.bytes[result.len] = '\0'; - (void) u8_rune(& result.rune, (uschar *) result.bytes); - - return result; -} - - /* * NAME * fnematch @@ -840,60 +787,76 @@ bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum) { - char *buf = *pbuf; + char *i, *j, *k, *buf = *pbuf; int bufsize = *pbufsize; - int i, j, k, ns, s; - struct runedata r; + int c, n, ns, s; s = pfa->initstat; patlen = 0; /* - * All indices relative to buf. - * i <= j <= k <= bufsize + * buf <= i <= j <= k <= buf+bufsize * - * i: origin of active substring (first byte of first character) - * j: current character (last byte of current character) - * k: destination of next getc() + * i: origin of active substring + * j: current character + * k: destination of the next getc */ - i = -1, k = 0; - do { - j = i++; - do { - r = getrune(f); - if (r.len == 0) { - r.len = 1; // store NUL byte for EOF + + i = j = k = buf; + + do { + /* + * Call u8_rune with at least MAX_UTF_BYTES ahead in + * the buffer until EOF interferes. + */ + if (k - j < MAX_UTF_BYTES) { + if (k + MAX_UTF_BYTES > buf + bufsize) { + adjbuf(&buf, &bufsize, + bufsize + MAX_UTF_BYTES, + quantum, 0, "fnematch"); } - j += r.len; - if (j >= bufsize) { - if (!adjbuf(&buf, &bufsize, j+1, quantum, 0, "fnematch")) - FATAL("stream '%.30s...' too long", buf); + for (n = MAX_UTF_BYTES ; n > 0; n--) { + *k++ = (c = getc(f)) != EOF ? c : 0; + if (c == EOF) { + if (ferror(f)) + FATAL("fnematch: getc error"); + break; + } } - memcpy(buf + k, r.bytes, r.len); - k += r.len; + } - if ((ns = get_gototab(pfa, s, r.rune)) != 0) - s = ns; - else - s = cgoto(pfa, s, r.rune); + j += u8_rune(&c, (uschar *)j); - if (pfa->out[s]) { /* final state */ - patlen = j - i + 1; - if (r.rune == 0) /* don't count $ */ - patlen--; - } - } while (buf[j] && s != 1); + if ((ns = get_gototab(pfa, s, c)) != 0) + s = ns; + else + s = cgoto(pfa, s, c); + + if (pfa->out[s]) { /* final state */ + patbeg = i; + patlen = j - i; + if (c == 0) /* don't count $ */ + patlen--; + } + + if (c && s != 1) + continue; /* origin i still viable, next j */ + if (patlen) + break; /* best match found */ + + /* no match at origin i, next i and start over */ + i += u8_rune(&c, (uschar *)i); + if (c == 0) + break; /* no match */ + j = i; s = 2; - if (r.len > 1) - i += r.len - 1; // i incremented around the loop - } while (buf[i] && !patlen); + } while (1); /* adjbuf() may have relocated a resized buffer. Inform the world. */ *pbuf = buf; *pbufsize = bufsize; if (patlen) { - patbeg = buf + i; /* * Under no circumstances is the last character fed to * the automaton part of the match. It is EOF's nullbyte, @@ -905,10 +868,11 @@ * (except for EOF's nullbyte, if present) and null * terminate the buffer. */ - for (; r.len > 0; r.len--) - if (buf[--k] && ungetc(buf[k], f) == EOF) - FATAL("unable to ungetc '%c'", buf[k]); - buf[k-patlen] = '\0'; + do + if (*--k && ungetc(*k, f) == EOF) + FATAL("unable to ungetc '%c'", *k); + while (k > patbeg + patlen); + *k = '\0'; return true; } else