version 1.37, 2021/07/08 21:26:39 |
version 1.38, 2023/09/17 14:49:44 |
|
|
fa *fatab[NFA]; |
fa *fatab[NFA]; |
int nfatab = 0; /* entries in fatab */ |
int nfatab = 0; /* entries in fatab */ |
|
|
|
|
|
/* utf-8 mechanism: |
|
|
|
For most of Awk, utf-8 strings just "work", since they look like |
|
null-terminated sequences of 8-bit bytes. |
|
|
|
Functions like length(), index(), and substr() have to operate |
|
in units of utf-8 characters. The u8_* functions in run.c |
|
handle this. |
|
|
|
Regular expressions are more complicated, since the basic |
|
mechanism of the goto table used 8-bit byte indices into the |
|
gototab entries to compute the next state. Unicode is a lot |
|
bigger, so the gototab entries are now structs with a character |
|
and a next state, and there is a linear search of the characters |
|
to find the state. (Yes, this is slower, by a significant |
|
amount. Tough.) |
|
|
|
Throughout the RE mechanism in b.c, utf-8 characters are |
|
converted to their utf-32 value. This mostly shows up in |
|
cclenter, which expands character class ranges like a-z and now |
|
alpha-omega. The size of a gototab array is still about 256. |
|
This should be dynamic, but for now things work ok for a single |
|
code page of Unicode, which is the most likely case. |
|
|
|
The code changes are localized in run.c and b.c. I have added a |
|
handful of functions to somewhat better hide the implementation, |
|
but a lot more could be done. |
|
|
|
*/ |
|
|
|
static int get_gototab(fa*, int, int); |
|
static int set_gototab(fa*, int, int, int); |
|
extern int u8_rune(int *, const uschar *); |
|
|
static int * |
static int * |
intalloc(size_t n, const char *f) |
intalloc(size_t n, const char *f) |
{ |
{ |
|
|
static void |
static void |
resize_state(fa *f, int state) |
resize_state(fa *f, int state) |
{ |
{ |
unsigned int **p; |
gtt **p; |
uschar *p2; |
uschar *p2; |
int **p3; |
int **p3; |
int i, new_count; |
int i, new_count; |
|
|
|
|
new_count = state + 10; /* needs to be tuned */ |
new_count = state + 10; /* needs to be tuned */ |
|
|
p = (unsigned int **) reallocarray(f->gototab, new_count, sizeof(f->gototab[0])); |
p = (gtt **) reallocarray(f->gototab, new_count, sizeof(f->gototab[0])); |
if (p == NULL) |
if (p == NULL) |
goto out; |
goto out; |
f->gototab = p; |
f->gototab = p; |
|
|
f->posns = p3; |
f->posns = p3; |
|
|
for (i = f->state_count; i < new_count; ++i) { |
for (i = f->state_count; i < new_count; ++i) { |
f->gototab[i] = (unsigned int *) calloc(NCHARS, sizeof(**f->gototab)); |
f->gototab[i] = (gtt *) calloc(NCHARS, sizeof(**f->gototab)); |
if (f->gototab[i] == NULL) |
if (f->gototab[i] == NULL) |
goto out; |
goto out; |
f->out[i] = 0; |
f->out[i] = 0; |
f->posns[i] = NULL; |
f->posns[i] = NULL; |
} |
} |
|
f->gototab_len = NCHARS; /* should be variable, growable */ |
f->state_count = new_count; |
f->state_count = new_count; |
return; |
return; |
out: |
out: |
|
|
if ((f->posns[2])[1] == f->accept) |
if ((f->posns[2])[1] == f->accept) |
f->out[2] = 1; |
f->out[2] = 1; |
for (i = 0; i < NCHARS; i++) |
for (i = 0; i < NCHARS; i++) |
f->gototab[2][i] = 0; |
set_gototab(f, 2, 0, 0); /* f->gototab[2][i] = 0; */ |
f->curstat = cgoto(f, 2, HAT); |
f->curstat = cgoto(f, 2, HAT); |
if (anchor) { |
if (anchor) { |
*f->posns[2] = k-1; /* leave out position 0 */ |
*f->posns[2] = k-1; /* leave out position 0 */ |
|
|
/* in the parsing of regular expressions, metacharacters like . have */ |
/* in the parsing of regular expressions, metacharacters like . have */ |
/* to be seen literally; \056 is not a metacharacter. */ |
/* to be seen literally; \056 is not a metacharacter. */ |
|
|
int hexstr(const uschar **pp) /* find and eval hex string at pp, return new p */ |
int hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return new p */ |
{ /* only pick up one 8-bit byte (2 chars) */ |
{ /* only pick up one 8-bit byte (2 chars) */ |
const uschar *p; |
const uschar *p; |
int n = 0; |
int n = 0; |
int i; |
int i; |
|
|
for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) { |
for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) { |
if (isdigit(*p)) |
if (isdigit(*p)) |
n = 16 * n + *p - '0'; |
n = 16 * n + *p - '0'; |
else if (*p >= 'a' && *p <= 'f') |
else if (*p >= 'a' && *p <= 'f') |
|
|
const uschar *p = *pp; |
const uschar *p = *pp; |
int c; |
int c; |
|
|
if ((c = *p++) == 't') |
/* BUG: should advance by utf-8 char even if makes no sense */ |
|
|
|
if ((c = *p++) == 't') { |
c = '\t'; |
c = '\t'; |
else if (c == 'n') |
} else if (c == 'n') { |
c = '\n'; |
c = '\n'; |
else if (c == 'f') |
} else if (c == 'f') { |
c = '\f'; |
c = '\f'; |
else if (c == 'r') |
} else if (c == 'r') { |
c = '\r'; |
c = '\r'; |
else if (c == 'b') |
} else if (c == 'b') { |
c = '\b'; |
c = '\b'; |
else if (c == 'v') |
} else if (c == 'v') { |
c = '\v'; |
c = '\v'; |
else if (c == 'a') |
} else if (c == 'a') { |
c = '\a'; |
c = '\a'; |
else if (c == '\\') |
} else if (c == '\\') { |
c = '\\'; |
c = '\\'; |
else if (c == 'x') { /* hexadecimal goo follows */ |
} else if (c == 'x') { /* 2 hex digits follow */ |
c = hexstr(&p); /* this adds a null if number is invalid */ |
c = hexstr(&p, 2); /* this adds a null if number is invalid */ |
|
} else if (c == 'u') { /* unicode char number up to 8 hex digits */ |
|
c = hexstr(&p, 8); |
} else if (isoctdigit(c)) { /* \d \dd \ddd */ |
} else if (isoctdigit(c)) { /* \d \dd \ddd */ |
int n = c - '0'; |
int n = c - '0'; |
if (isoctdigit(*p)) { |
if (isoctdigit(*p)) { |
|
|
return c; |
return c; |
} |
} |
|
|
char *cclenter(const char *argp) /* add a character class */ |
int *cclenter(const char *argp) /* add a character class */ |
{ |
{ |
int i, c, c2; |
int i, c, c2; |
const uschar *op, *p = (const uschar *) argp; |
int n; |
uschar *bp; |
const uschar *p = (const uschar *) argp; |
static uschar *buf = NULL; |
int *bp, *retp; |
|
static int *buf = NULL; |
static int bufsz = 100; |
static int bufsz = 100; |
|
|
op = p; |
if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL) |
if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL) |
|
FATAL("out of space for character class [%.10s...] 1", p); |
FATAL("out of space for character class [%.10s...] 1", p); |
bp = buf; |
bp = buf; |
for (i = 0; (c = *p++) != 0; ) { |
for (i = 0; *p != 0; ) { |
|
n = u8_rune(&c, p); |
|
p += n; |
if (c == '\\') { |
if (c == '\\') { |
c = quoted(&p); |
c = quoted(&p); |
} else if (c == '-' && i > 0 && bp[-1] != 0) { |
} else if (c == '-' && i > 0 && bp[-1] != 0) { |
if (*p != 0) { |
if (*p != 0) { |
c = bp[-1]; |
c = bp[-1]; |
c2 = *p++; |
/* c2 = *p++; */ |
|
n = u8_rune(&c2, p); |
|
p += n; |
if (c2 == '\\') |
if (c2 == '\\') |
c2 = quoted(&p); |
c2 = quoted(&p); /* BUG: sets p, has to be u8 size */ |
if (c > c2) { /* empty; ignore */ |
if (c > c2) { /* empty; ignore */ |
bp--; |
bp--; |
i--; |
i--; |
continue; |
continue; |
} |
} |
while (c < c2) { |
while (c < c2) { |
if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1")) |
if (i >= bufsz) { |
FATAL("out of space for character class [%.10s...] 2", p); |
buf = (int *) reallocarray(buf, bufsz, sizeof(int) * 2); |
|
if (buf == NULL) |
|
FATAL("out of space for character class [%.10s...] 2", p); |
|
bufsz *= 2; |
|
bp = buf + i; |
|
} |
*bp++ = ++c; |
*bp++ = ++c; |
i++; |
i++; |
} |
} |
continue; |
continue; |
} |
} |
} |
} |
if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2")) |
if (i >= bufsz) { |
FATAL("out of space for character class [%.10s...] 3", p); |
buf = (int *) reallocarray(buf, bufsz, sizeof(int) * 2); |
|
if (buf == NULL) |
|
FATAL("out of space for character class [%.10s...] 2", p); |
|
bufsz *= 2; |
|
bp = buf + i; |
|
} |
*bp++ = c; |
*bp++ = c; |
i++; |
i++; |
} |
} |
*bp = 0; |
*bp = 0; |
DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); |
/* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */ |
xfree(op); |
/* xfree(op); BUG: what are we freeing here? */ |
return (char *) tostring((char *) buf); |
retp = (int *) calloc(bp-buf+1, sizeof(int)); |
|
for (i = 0; i < bp-buf+1; i++) |
|
retp[i] = buf[i]; |
|
return retp; |
} |
} |
|
|
void overflo(const char *s) |
void overflo(const char *s) |
|
|
} |
} |
} |
} |
|
|
int member(int c, const char *sarg) /* is c in s? */ |
int member(int c, int *sarg) /* is c in s? */ |
{ |
{ |
const uschar *s = (const uschar *) sarg; |
int *s = (int *) sarg; |
|
|
while (*s) |
while (*s) |
if (c == *s++) |
if (c == *s++) |
|
|
return(0); |
return(0); |
} |
} |
|
|
|
/*
 * get_gototab: look up the cached transition out of `state` on character
 * `ch`, hiding the layout of the goto table from callers.
 *
 * The row f->gototab[state] is scanned in order over at most
 * f->gototab_len slots.  A slot whose ch field is 0 ends the scan, so
 * nothing stored after such a slot is ever seen.
 *
 * Returns the state recorded for `ch`, or 0 when no slot for `ch` is found
 * (this includes ch == 0, which always hits the end-of-scan test first).
 */
static int get_gototab(fa *f, int state, int ch)
{
	int slot = 0;

	while (slot < f->gototab_len && f->gototab[state][slot].ch != 0) {
		if (f->gototab[state][slot].ch == ch)
			return f->gototab[state][slot].state;
		slot++;
	}
	return 0;
}
|
|
|
/*
 * set_gototab: record that `state` goes to `val` on character `ch`,
 * hiding the layout of the goto table from callers.
 *
 * The row f->gototab[state] is scanned in order over at most
 * f->gototab_len slots; the first slot that is either unused (ch field 0)
 * or already holds `ch` is (over)written with the pair (ch, val).
 * Note that storing ch == 0 leaves the chosen slot looking unused.
 *
 * If every slot holds some other character, overflo() is called with this
 * function's name.
 *
 * Returns val; the original author notes the return value is not used by
 * any caller at the moment.
 */
static int set_gototab(fa *f, int state, int ch, int val)
{
	int slot;

	for (slot = 0; slot < f->gototab_len; slot++) {
		/* skip slots that are occupied by a different character */
		if (f->gototab[state][slot].ch != 0 && f->gototab[state][slot].ch != ch)
			continue;

		f->gototab[state][slot].ch = ch;
		f->gototab[state][slot].state = val;
		return val;
	}

	/* row is full */
	overflo(__func__);
	return val;
}
|
|
int match(fa *f, const char *p0) /* shortest match ? */ |
int match(fa *f, const char *p0) /* shortest match ? */ |
{ |
{ |
int s, ns; |
int s, ns; |
|
int n; |
|
int rune; |
const uschar *p = (const uschar *) p0; |
const uschar *p = (const uschar *) p0; |
|
|
|
/* return pmatch(f, p0); does it matter whether longest or shortest? */ |
|
|
s = f->initstat; |
s = f->initstat; |
assert (s < f->state_count); |
assert (s < f->state_count); |
|
|
|
|
return(1); |
return(1); |
do { |
do { |
/* assert(*p < NCHARS); */ |
/* assert(*p < NCHARS); */ |
if ((ns = f->gototab[s][*p]) != 0) |
n = u8_rune(&rune, p); |
|
if ((ns = get_gototab(f, s, rune)) != 0) |
s = ns; |
s = ns; |
else |
else |
s = cgoto(f, s, *p); |
s = cgoto(f, s, rune); |
if (f->out[s]) |
if (f->out[s]) |
return(1); |
return(1); |
} while (*p++ != 0); |
if (*p == 0) |
|
break; |
|
p += n; |
|
} while (1); /* was *p++ != 0 */ |
return(0); |
return(0); |
} |
} |
|
|
int pmatch(fa *f, const char *p0) /* longest match, for sub */ |
int pmatch(fa *f, const char *p0) /* longest match, for sub */ |
{ |
{ |
int s, ns; |
int s, ns; |
|
int n; |
|
int rune; |
const uschar *p = (const uschar *) p0; |
const uschar *p = (const uschar *) p0; |
const uschar *q; |
const uschar *q; |
|
|
|
|
if (f->out[s]) /* final state */ |
if (f->out[s]) /* final state */ |
patlen = q-p; |
patlen = q-p; |
/* assert(*q < NCHARS); */ |
/* assert(*q < NCHARS); */ |
if ((ns = f->gototab[s][*q]) != 0) |
n = u8_rune(&rune, q); |
|
if ((ns = get_gototab(f, s, rune)) != 0) |
s = ns; |
s = ns; |
else |
else |
s = cgoto(f, s, *q); |
s = cgoto(f, s, rune); |
|
|
assert(s < f->state_count); |
assert(s < f->state_count); |
|
|
|
|
else |
else |
goto nextin; /* no match */ |
goto nextin; /* no match */ |
} |
} |
} while (*q++ != 0); |
if (*q == 0) |
|
break; |
|
q += n; |
|
} while (1); |
|
q++; /* was *q++ */ |
if (f->out[s]) |
if (f->out[s]) |
patlen = q-p-1; /* don't count $ */ |
patlen = q-p-1; /* don't count $ */ |
if (patlen >= 0) { |
if (patlen >= 0) { |
|
|
} |
} |
nextin: |
nextin: |
s = 2; |
s = 2; |
} while (*p++); |
if (*p == 0) |
|
break; |
|
n = u8_rune(&rune, p); |
|
p += n; |
|
} while (1); /* was *p++ */ |
return (0); |
return (0); |
} |
} |
|
|
int nematch(fa *f, const char *p0) /* non-empty match, for sub */ |
int nematch(fa *f, const char *p0) /* non-empty match, for sub */ |
{ |
{ |
int s, ns; |
int s, ns; |
|
int n; |
|
int rune; |
const uschar *p = (const uschar *) p0; |
const uschar *p = (const uschar *) p0; |
const uschar *q; |
const uschar *q; |
|
|
|
|
if (f->out[s]) /* final state */ |
if (f->out[s]) /* final state */ |
patlen = q-p; |
patlen = q-p; |
/* assert(*q < NCHARS); */ |
/* assert(*q < NCHARS); */ |
if ((ns = f->gototab[s][*q]) != 0) |
n = u8_rune(&rune, q); |
|
if ((ns = get_gototab(f, s, rune)) != 0) |
s = ns; |
s = ns; |
else |
else |
s = cgoto(f, s, *q); |
s = cgoto(f, s, rune); |
if (s == 1) { /* no transition */ |
if (s == 1) { /* no transition */ |
if (patlen > 0) { |
if (patlen > 0) { |
patbeg = (const char *) p; |
patbeg = (const char *) p; |
|
|
} else |
} else |
goto nnextin; /* no nonempty match */ |
goto nnextin; /* no nonempty match */ |
} |
} |
} while (*q++ != 0); |
if (*q == 0) |
|
break; |
|
q += n; |
|
} while (1); |
|
q++; |
if (f->out[s]) |
if (f->out[s]) |
patlen = q-p-1; /* don't count $ */ |
patlen = q-p-1; /* don't count $ */ |
if (patlen > 0 ) { |
if (patlen > 0 ) { |
|
|
return (0); |
return (0); |
} |
} |
|
|
|
static int getrune(FILE *fp, char **pbuf, int *pbufsize, int quantum, |
|
int *curpos, int *lastpos) |
|
{ |
|
int c = 0; |
|
char *buf = *pbuf; |
|
static const int max_bytes = 4; // max multiple bytes in UTF-8 is 4 |
|
int i, rune; |
|
uschar private_buf[max_bytes + 1]; |
|
|
|
for (i = 0; i <= max_bytes; i++) { |
|
if (++*curpos == *lastpos) { |
|
if (*lastpos == *pbufsize) |
|
if (!adjbuf((char **) pbuf, pbufsize, *pbufsize+1, quantum, 0, "getrune")) |
|
FATAL("stream '%.30s...' too long", buf); |
|
buf[(*lastpos)++] = (c = getc(fp)) != EOF ? c : 0; |
|
private_buf[i] = c; |
|
} |
|
if (c == 0 || c < 128 || (c >> 6) == 4) { // 10xxxxxx starts a new character |
|
ungetc(c, fp); |
|
private_buf[i] = 0; |
|
break; |
|
} |
|
} |
|
|
|
u8_rune(& rune, private_buf); |
|
|
|
return rune; |
|
} |
|
|
|
|
/* |
/* |
* NAME |
* NAME |
* fnematch |
* fnematch |
|
|
char *buf = *pbuf; |
char *buf = *pbuf; |
int bufsize = *pbufsize; |
int bufsize = *pbufsize; |
int c, i, j, k, ns, s; |
int c, i, j, k, ns, s; |
|
int rune; |
|
|
s = pfa->initstat; |
s = pfa->initstat; |
patlen = 0; |
patlen = 0; |
|
|
buf[k++] = (c = getc(f)) != EOF ? c : 0; |
buf[k++] = (c = getc(f)) != EOF ? c : 0; |
} |
} |
c = (uschar)buf[j]; |
c = (uschar)buf[j]; |
/* assert(c < NCHARS); */ |
if (c < 128) |
|
rune = c; |
|
else { |
|
j--; |
|
k--; |
|
ungetc(c, f); |
|
rune = getrune(f, &buf, &bufsize, quantum, &j, &k); |
|
} |
|
|
if ((ns = pfa->gototab[s][c]) != 0) |
if ((ns = get_gototab(pfa, s, rune)) != 0) |
s = ns; |
s = ns; |
else |
else |
s = cgoto(pfa, s, c); |
s = cgoto(pfa, s, rune); |
|
|
if (pfa->out[s]) { /* final state */ |
if (pfa->out[s]) { /* final state */ |
patlen = j - i + 1; |
patlen = j - i + 1; |
|
|
return 0; |
return 0; |
} |
} |
|
|
|
extern int u8_rune(int *, const uschar *); /* run.c; should be in header file */ |
|
|
int relex(void) /* lexical analyzer for reparse */ |
int relex(void) /* lexical analyzer for reparse */ |
{ |
{ |
int c, n; |
int c, n; |
|
|
rescan: |
rescan: |
starttok = prestr; |
starttok = prestr; |
|
|
|
if ((n = u8_rune(&rlxval, prestr)) > 1) { |
|
prestr += n; |
|
starttok = prestr; |
|
return CHAR; |
|
} |
|
|
switch (c = *prestr++) { |
switch (c = *prestr++) { |
case '|': return OR; |
case '|': return OR; |
case '*': return STAR; |
case '*': return STAR; |
|
|
} |
} |
else |
else |
cflag = 0; |
cflag = 0; |
n = 2 * strlen((const char *) prestr)+1; |
n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2. what value? */ |
if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1")) |
if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1")) |
FATAL("out of space for reg expr %.10s...", lastre); |
FATAL("out of space for reg expr %.10s...", lastre); |
for (; ; ) { |
for (; ; ) { |
|
if ((n = u8_rune(&rlxval, prestr)) > 1) { |
|
for (i = 0; i < n; i++) |
|
*bp++ = *prestr++; |
|
continue; |
|
} |
if ((c = *prestr++) == '\\') { |
if ((c = *prestr++) == '\\') { |
*bp++ = '\\'; |
*bp++ = '\\'; |
if ((c = *prestr++) == '\0') |
if ((c = *prestr++) == '\0') |
|
|
int *p, *q; |
int *p, *q; |
int i, j, k; |
int i, j, k; |
|
|
assert(c == HAT || c < NCHARS); |
/* assert(c == HAT || c < NCHARS); BUG: seg fault if disable test */ |
while (f->accept >= maxsetvec) { /* guessing here! */ |
while (f->accept >= maxsetvec) { /* guessing here! */ |
resizesetvec(__func__); |
resizesetvec(__func__); |
} |
} |
|
|
|| (k == DOT && c != 0 && c != HAT) |
|| (k == DOT && c != 0 && c != HAT) |
|| (k == ALL && c != 0) |
|| (k == ALL && c != 0) |
|| (k == EMPTYRE && c != 0) |
|| (k == EMPTYRE && c != 0) |
|| (k == CCL && member(c, (char *) f->re[p[i]].lval.up)) |
|| (k == CCL && member(c, (int *) f->re[p[i]].lval.rp)) |
|| (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) { |
|| (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) { |
q = f->re[p[i]].lfollow; |
q = f->re[p[i]].lfollow; |
for (j = 1; j <= *q; j++) { |
for (j = 1; j <= *q; j++) { |
if (q[j] >= maxsetvec) { |
if (q[j] >= maxsetvec) { |
|
|
goto different; |
goto different; |
/* setvec is state i */ |
/* setvec is state i */ |
if (c != HAT) |
if (c != HAT) |
f->gototab[s][c] = i; |
set_gototab(f, s, c, i); |
return i; |
return i; |
different:; |
different:; |
} |
} |
|
|
++(f->curstat); |
++(f->curstat); |
resize_state(f, f->curstat); |
resize_state(f, f->curstat); |
for (i = 0; i < NCHARS; i++) |
for (i = 0; i < NCHARS; i++) |
f->gototab[f->curstat][i] = 0; |
set_gototab(f, f->curstat, 0, 0); |
xfree(f->posns[f->curstat]); |
xfree(f->posns[f->curstat]); |
p = intalloc(setcnt + 1, __func__); |
p = intalloc(setcnt + 1, __func__); |
|
|
f->posns[f->curstat] = p; |
f->posns[f->curstat] = p; |
if (c != HAT) |
if (c != HAT) |
f->gototab[s][c] = f->curstat; |
set_gototab(f, s, c, f->curstat); |
for (i = 0; i <= setcnt; i++) |
for (i = 0; i <= setcnt; i++) |
p[i] = tmpset[i]; |
p[i] = tmpset[i]; |
if (setvec[f->accept]) |
if (setvec[f->accept]) |