version 1.42, 2023/09/21 17:19:06 |
version 1.43, 2023/10/06 22:29:24 |
|
|
fa *fatab[NFA]; |
fa *fatab[NFA]; |
int nfatab = 0; /* entries in fatab */ |
int nfatab = 0; /* entries in fatab */ |
|
|
|
extern int u8_nextlen(const char *s); |
|
|
|
|
/* utf-8 mechanism: |
/* utf-8 mechanism: |
|
|
For most of Awk, utf-8 strings just "work", since they look like |
For most of Awk, utf-8 strings just "work", since they look like |
|
|
return (0); |
return (0); |
} |
} |
|
|
static int getrune(FILE *fp, char **pbuf, int *pbufsize, int quantum, |
|
int *curpos, int *lastpos) |
#define MAX_UTF_BYTES 4 // UTF-8 is up to 4 bytes long |
|
|
|
// Read one rune at a time from the given FILE*. Return both |
|
// the bytes and the actual rune. |
|
|
|
struct runedata { |
|
int rune; |
|
size_t len; |
|
char bytes[6]; |
|
}; |
|
|
|
struct runedata getrune(FILE *fp) |
{ |
{ |
int c = 0; |
struct runedata result; |
char *buf = *pbuf; |
int c, i, next; |
static const int max_bytes = 4; // max multiple bytes in UTF-8 is 4 |
|
int i, rune; |
|
uschar private_buf[max_bytes + 1]; |
|
|
|
for (i = 0; i <= max_bytes; i++) { |
memset(&result, 0, sizeof(result)); |
if (++*curpos == *lastpos) { |
|
if (*lastpos == *pbufsize) |
c = getc(fp); |
if (!adjbuf((char **) pbuf, pbufsize, *pbufsize+1, quantum, 0, "getrune")) |
if (c == EOF) |
FATAL("stream '%.30s...' too long", buf); |
return result; // result.rune == 0 --> EOF |
buf[(*lastpos)++] = (c = getc(fp)) != EOF ? c : 0; |
else if (c < 128 || awk_mb_cur_max == 1) { |
private_buf[i] = c; |
result.bytes[0] = c; |
} |
result.len = 1; |
if (c == 0 || c < 128 || (c >> 6) == 4) { // 10xxxxxx starts a new character |
result.rune = c; |
ungetc(c, fp); |
|
private_buf[i] = 0; |
return result; |
|
} |
|
|
|
// need to get bytes and fill things in |
|
result.bytes[0] = c; |
|
result.len = 1; |
|
|
|
next = 1; |
|
for (i = 1; i < MAX_UTF_BYTES; i++) { |
|
c = getc(fp); |
|
if (c == EOF) |
break; |
break; |
} |
result.bytes[next++] = c; |
|
result.len++; |
} |
} |
|
|
u8_rune(& rune, private_buf); |
// put back any extra input bytes |
|
int actual_len = u8_nextlen(result.bytes); |
|
while (result.len > actual_len) { |
|
ungetc(result.bytes[--result.len], fp); |
|
} |
|
|
return rune; |
result.bytes[result.len] = '\0'; |
|
(void) u8_rune(& result.rune, (uschar *) result.bytes); |
|
|
|
return result; |
} |
} |
|
|
|
|
|
|
{ |
{ |
char *buf = *pbuf; |
char *buf = *pbuf; |
int bufsize = *pbufsize; |
int bufsize = *pbufsize; |
int c, i, j, k, ns, s; |
int i, j, k, ns, s; |
int rune; |
struct runedata r; |
|
|
s = pfa->initstat; |
s = pfa->initstat; |
patlen = 0; |
patlen = 0; |
|
|
* All indices relative to buf. |
* All indices relative to buf. |
* i <= j <= k <= bufsize |
* i <= j <= k <= bufsize |
* |
* |
* i: origin of active substring |
* i: origin of active substring (first byte of first character) |
* j: current character |
* j: current character (last byte of current character) |
* k: destination of next getc() |
* k: destination of next getc() |
*/ |
*/ |
i = -1, k = 0; |
i = -1, k = 0; |
do { |
do { |
j = i++; |
j = i++; |
do { |
do { |
if (++j == k) { |
r = getrune(f); |
if (k == bufsize) |
if ((++j + r.len) >= k) { |
|
if (k >= bufsize) |
if (!adjbuf(&buf, &bufsize, bufsize+1, quantum, 0, "fnematch")) |
if (!adjbuf(&buf, &bufsize, bufsize+1, quantum, 0, "fnematch")) |
FATAL("stream '%.30s...' too long", buf); |
FATAL("stream '%.30s...' too long", buf); |
buf[k++] = (c = getc(f)) != EOF ? c : 0; |
|
} |
} |
c = (uschar)buf[j]; |
memcpy(buf + k, r.bytes, r.len); |
if (c < 128 || awk_mb_cur_max == 1) |
j += r.len - 1; // incremented next time around the loop |
rune = c; |
k += r.len; |
else { |
|
j--; |
|
k--; |
|
ungetc(c, f); |
|
rune = getrune(f, &buf, &bufsize, quantum, &j, &k); |
|
} |
|
|
|
if ((ns = get_gototab(pfa, s, rune)) != 0) |
if ((ns = get_gototab(pfa, s, r.rune)) != 0) |
s = ns; |
s = ns; |
else |
else |
s = cgoto(pfa, s, rune); |
s = cgoto(pfa, s, r.rune); |
|
|
if (pfa->out[s]) { /* final state */ |
if (pfa->out[s]) { /* final state */ |
patlen = j - i + 1; |
patlen = j - i + 1; |
if (c == 0) /* don't count $ */ |
if (r.rune == 0) /* don't count $ */ |
patlen--; |
patlen--; |
} |
} |
} while (buf[j] && s != 1); |
} while (buf[j] && s != 1); |
s = 2; |
s = 2; |
|
if (r.len > 1) |
|
i += r.len - 1; // i incremented around the loop |
} while (buf[i] && !patlen); |
} while (buf[i] && !patlen); |
|
|
/* adjbuf() may have relocated a resized buffer. Inform the world. */ |
/* adjbuf() may have relocated a resized buffer. Inform the world. */ |
|
|
* (except for EOF's nullbyte, if present) and null |
* (except for EOF's nullbyte, if present) and null |
* terminate the buffer. |
* terminate the buffer. |
*/ |
*/ |
do |
do { |
if (buf[--k] && ungetc(buf[k], f) == EOF) |
int ii; |
FATAL("unable to ungetc '%c'", buf[k]); |
for (ii = r.len; ii > 0; ii--) |
while (k > i + patlen); |
if (buf[--k] && ungetc(buf[k], f) == EOF) |
|
FATAL("unable to ungetc '%c'", buf[k]); |
|
} while (k > i + patlen); |
buf[k] = '\0'; |
buf[k] = '\0'; |
return true; |
return true; |
} |
} |