src/usr.bin/awk/b.c - diff

Return to b.c CVS log

Up to [local] / src / usr.bin / awk

Diff for /src/usr.bin/awk/b.c between version 1.37 and 1.38

version 1.37, 2021/07/08 21:26:39

version 1.38, 2023/09/17 14:49:44

Line 81

fa *fatab[NFA];

int nfatab = 0; /* entries in fatab */

/* utf-8 mechanism:

For most of Awk, utf-8 strings just "work", since they look like

null-terminated sequences of 8-bit bytes.

Functions like length(), index(), and substr() have to operate

in units of utf-8 characters. The u8_* functions in run.c

handle this.

Regular expressions are more complicated, since the basic

mechanism of the goto table used 8-bit byte indices into the

gototab entries to compute the next state. Unicode is a lot

bigger, so the gototab entries are now structs with a character

and a next state, and there is a linear search of the characters

to find the state. (Yes, this is slower, by a significant

amount. Tough.)

Throughout the RE mechanism in b.c, utf-8 characters are

converted to their utf-32 value. This mostly shows up in

cclenter, which expands character class ranges like a-z and now

alpha-omega. The size of a gototab array is still about 256.

This should be dynamic, but for now things work ok for a single

code page of Unicode, which is the most likely case.

The code changes are localized in run.c and b.c. I have added a

handful of functions to somewhat better hide the implementation,

but a lot more could be done.

*/

static int get_gototab(fa*, int, int);

static int set_gototab(fa*, int, int, int);

extern int u8_rune(int *, const uschar *);

static int *

intalloc(size_t n, const char *f)

{

Line 113

Line 148

static void

resize_state(fa *f, int state)

{

unsigned int **p;

gtt **p;

uschar *p2;

int **p3;

int i, new_count;

Line 123

Line 158

new_count = state + 10; /* needs to be tuned */

p = (unsigned int **) reallocarray(f->gototab, new_count, sizeof(f->gototab[0]));

p = (gtt **) reallocarray(f->gototab, new_count, sizeof(f->gototab[0]));

if (p == NULL)

goto out;

f->gototab = p;

Line 139

Line 174

f->posns = p3;

for (i = f->state_count; i < new_count; ++i) {

f->gototab[i] = (unsigned int *) calloc(NCHARS, sizeof(**f->gototab));

f->gototab[i] = (gtt *) calloc(NCHARS, sizeof(**f->gototab));

if (f->gototab[i] == NULL)

goto out;

f->out[i] = 0;

f->posns[i] = NULL;

}

f->gototab_len = NCHARS; /* should be variable, growable */

f->state_count = new_count;

return;

out:

Line 239

Line 275

if ((f->posns[2])[1] == f->accept)

f->out[2] = 1;

for (i = 0; i < NCHARS; i++)

f->gototab[2][i] = 0;

set_gototab(f, 2, 0, 0); /* f->gototab[2][i] = 0; */

f->curstat = cgoto(f, 2, HAT);

if (anchor) {

*f->posns[2] = k-1; /* leave out position 0 */

Line 308

Line 344

/* in the parsing of regular expressions, metacharacters like . have */

/* to be seen literally; \056 is not a metacharacter. */

int hexstr(const uschar **pp) /* find and eval hex string at pp, return new p */

int hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return new p */

{ /* only pick up one 8-bit byte (2 chars) */

const uschar *p;

int n = 0;

int i;

for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {

for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {

if (isdigit(*p))

n = 16 * n + *p - '0';

else if (*p >= 'a' && *p <= 'f')

Line 334

Line 370

const uschar *p = *pp;

int c;

if ((c = *p++) == 't')

/* BUG: should advance by utf-8 char even if makes no sense */

if ((c = *p++) == 't') {

c = '\t';

else if (c == 'n')

} else if (c == 'n') {

c = '\n';

else if (c == 'f')

} else if (c == 'f') {

c = '\f';

else if (c == 'r')

} else if (c == 'r') {

c = '\r';

else if (c == 'b')

} else if (c == 'b') {

c = '\b';

else if (c == 'v')

} else if (c == 'v') {

c = '\v';

else if (c == 'a')

} else if (c == 'a') {

c = '\a';

else if (c == '\\')

} else if (c == '\\') {

c = '\\';

else if (c == 'x') { /* hexadecimal goo follows */

} else if (c == 'x') { /* 2 hex digits follow */

c = hexstr(&p); /* this adds a null if number is invalid */

c = hexstr(&p, 2); /* this adds a null if number is invalid */

} else if (c == 'u') { /* unicode char number up to 8 hex digits */

c = hexstr(&p, 8);

} else if (isoctdigit(c)) { /* \d \dd \ddd */

int n = c - '0';

if (isoctdigit(*p)) {

Line 366

Line 406

return c;

}

char *cclenter(const char *argp) /* add a character class */

int *cclenter(const char *argp) /* add a character class */

{

int i, c, c2;

const uschar *op, *p = (const uschar *) argp;

int n;

uschar *bp;

const uschar *p = (const uschar *) argp;

static uschar *buf = NULL;

int *bp, *retp;

static int *buf = NULL;

static int bufsz = 100;

op = p;

if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)

if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)

FATAL("out of space for character class [%.10s...] 1", p);

bp = buf;

for (i = 0; (c = *p++) != 0; ) {

for (i = 0; *p != 0; ) {

n = u8_rune(&c, p);

p += n;

if (c == '\\') {

c = quoted(&p);

} else if (c == '-' && i > 0 && bp[-1] != 0) {

if (*p != 0) {

c = bp[-1];

c2 = *p++;

/* c2 = *p++; */

n = u8_rune(&c2, p);

p += n;

if (c2 == '\\')

c2 = quoted(&p);

c2 = quoted(&p); /* BUG: sets p, has to be u8 size */

if (c > c2) { /* empty; ignore */

bp--;

i--;

continue;

}

while (c < c2) {

if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))

if (i >= bufsz) {

FATAL("out of space for character class [%.10s...] 2", p);

buf = (int *) reallocarray(buf, bufsz, sizeof(int) * 2);

if (buf == NULL)

FATAL("out of space for character class [%.10s...] 2", p);

bufsz *= 2;

bp = buf + i;

}

*bp++ = ++c;

i++;

}

continue;

}

if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))

if (i >= bufsz) {

FATAL("out of space for character class [%.10s...] 3", p);

buf = (int *) reallocarray(buf, bufsz, sizeof(int) * 2);

if (buf == NULL)

FATAL("out of space for character class [%.10s...] 2", p);

bufsz *= 2;

bp = buf + i;

}

*bp++ = c;

i++;

}

*bp = 0;

DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf);

/* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */

xfree(op);

/* xfree(op); BUG: what are we freeing here? */

return (char *) tostring((char *) buf);

retp = (int *) calloc(bp-buf+1, sizeof(int));

for (i = 0; i < bp-buf+1; i++)

retp[i] = buf[i];

return retp;

}

void overflo(const char *s)

Line 532

Line 589

}

int member(int c, const char *sarg) /* is c in s? */

int member(int c, int *sarg) /* is c in s? */

{

const uschar *s = (const uschar *) sarg;

int *s = (int *) sarg;

while (*s)

if (c == *s++)

Line 542

Line 599

return(0);

}

/*
 * get_gototab: look up the cached transition out of `state` on character
 * `ch`.
 *
 * The row f->gototab[state] is searched linearly from the front. An entry
 * whose ch field is 0 ends the search, so entries after it are never
 * consulted. Returns the next state stored with `ch`, or 0 when no entry
 * for `ch` is found within the first f->gototab_len slots.
 */
static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */
{
	int slot = 0;

	while (slot < f->gototab_len && f->gototab[state][slot].ch != 0) {
		if (f->gototab[state][slot].ch == ch)
			return f->gototab[state][slot].state;
		slot++;
	}
	return 0;
}

/*
 * set_gototab: record that reading character `ch` in `state` leads to
 * state `val`.
 *
 * The row f->gototab[state] is scanned from the front; the first slot that
 * is either unused (ch == 0) or already holds `ch` is (over)written with
 * the pair (ch, val), and val is returned.
 *
 * If every one of the f->gototab_len slots holds some other character the
 * row is full and overflo() is called.
 * NOTE(review): overflo() is presumably fatal; the trailing return only
 * matters if it ever returns -- confirm against its definition.
 */
static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */
{
	int i;

	for (i = 0; i < f->gototab_len; i++) {
		if (f->gototab[state][i].ch == 0 || f->gototab[state][i].ch == ch) {
			f->gototab[state][i].ch = ch;
			f->gototab[state][i].state = val;
			return val;
		}
	}
	/* only reached when the whole row was scanned without finding a slot */
	overflo(__func__);
	return val; /* not used anywhere at the moment */
}

int match(fa *f, const char *p0) /* shortest match ? */

{

int s, ns;

int n;

int rune;

const uschar *p = (const uschar *) p0;

/* return pmatch(f, p0); does it matter whether longest or shortest? */

s = f->initstat;

assert (s < f->state_count);

Line 554

Line 641

return(1);

do {

/* assert(*p < NCHARS); */

if ((ns = f->gototab[s][*p]) != 0)

n = u8_rune(&rune, p);

if ((ns = get_gototab(f, s, rune)) != 0)

s = ns;

else

s = cgoto(f, s, *p);

s = cgoto(f, s, rune);

if (f->out[s])

return(1);

} while (*p++ != 0);

if (*p == 0)

break;

p += n;

} while (1); /* was *p++ != 0 */

return(0);

}

int pmatch(fa *f, const char *p0) /* longest match, for sub */

{

int s, ns;

int n;

int rune;

const uschar *p = (const uschar *) p0;

const uschar *q;

Line 581

Line 674

if (f->out[s]) /* final state */

patlen = q-p;

/* assert(*q < NCHARS); */

if ((ns = f->gototab[s][*q]) != 0)

n = u8_rune(&rune, q);

if ((ns = get_gototab(f, s, rune)) != 0)

s = ns;

else

s = cgoto(f, s, *q);

s = cgoto(f, s, rune);

assert(s < f->state_count);

Line 596

Line 690

else

goto nextin; /* no match */

}

} while (*q++ != 0);

if (*q == 0)

break;

q += n;

} while (1);

q++; /* was *q++ */

if (f->out[s])

patlen = q-p-1; /* don't count $ */

if (patlen >= 0) {

Line 605

Line 703

}

nextin:

s = 2;

} while (*p++);

if (*p == 0)

break;

n = u8_rune(&rune, p);

p += n;

} while (1); /* was *p++ */

return (0);

}

int nematch(fa *f, const char *p0) /* non-empty match, for sub */

{

int s, ns;

int n;

int rune;

const uschar *p = (const uschar *) p0;

const uschar *q;

Line 626

Line 730

if (f->out[s]) /* final state */

patlen = q-p;

/* assert(*q < NCHARS); */

if ((ns = f->gototab[s][*q]) != 0)

n = u8_rune(&rune, q);

if ((ns = get_gototab(f, s, rune)) != 0)

s = ns;

else

s = cgoto(f, s, *q);

s = cgoto(f, s, rune);

if (s == 1) { /* no transition */

if (patlen > 0) {

patbeg = (const char *) p;

Line 637

Line 742

} else

goto nnextin; /* no nonempty match */

}

} while (*q++ != 0);

if (*q == 0)

break;

q += n;

} while (1);

q++;

if (f->out[s])

patlen = q-p-1; /* don't count $ */

if (patlen > 0 ) {

Line 651

Line 760

return (0);

}

/*
 * getrune: collect the bytes of one multi-byte character and return it as
 * a single rune.
 *
 * fp        stream to read further bytes from
 * pbuf      address of the caller's input buffer; adjbuf() is handed this
 *           pointer so it can grow the buffer when it is full
 * pbufsize  address of the buffer's current size
 * quantum   growth increment passed through to adjbuf()
 * curpos    index of the byte being examined; advanced once per iteration
 * lastpos   index one past the last byte stored in the buffer; advanced
 *           for every byte read from fp
 *
 * Each byte read is stored both in the caller's buffer and in a private
 * scratch array. Collection stops at a byte that is 0 or below 128
 * (EOF included): that byte is pushed back with ungetc() and the scratch
 * array is terminated in its place. The scratch array is then decoded
 * with u8_rune() and the resulting rune is returned.
 */
static int getrune(FILE *fp, char **pbuf, int *pbufsize, int quantum,
		   int *curpos, int *lastpos)
{
	int c = 0;
	char *buf = *pbuf;
	static const int max_bytes = 4;	// max multiple bytes in UTF-8 is 4
	int i, rune;
	uschar private_buf[max_bytes + 1];

	for (i = 0; i <= max_bytes; i++) {
		if (++*curpos == *lastpos) {
			if (*lastpos == *pbufsize) {
				if (!adjbuf((char **) pbuf, pbufsize, *pbufsize+1, quantum, 0, "getrune"))
					FATAL("stream '%.30s...' too long", buf);
				/* adjbuf was given pbuf and may have moved the
				 * buffer; refresh the local copy before storing */
				buf = *pbuf;
			}
			buf[(*lastpos)++] = (c = getc(fp)) != EOF ? c : 0;
			private_buf[i] = c;
		}
		/*
		 * NOTE(review): (c >> 6) == 4 cannot hold for any byte value
		 * (255 >> 6 == 3). In UTF-8 a continuation byte is 10xxxxxx
		 * (c >> 6 == 2) and a multi-byte lead byte is 11xxxxxx
		 * (c >> 6 == 3), so a following multi-byte character is not
		 * detected here. Left unchanged pending confirmation of the
		 * intended test.
		 */
		if (c == 0 || c < 128 || (c >> 6) == 4) {
			ungetc(c, fp);
			private_buf[i] = 0;
			break;
		}
	}

	u8_rune(& rune, private_buf);

	return rune;
}

* NAME

* fnematch

Line 672

Line 810

char *buf = *pbuf;

int bufsize = *pbufsize;

int c, i, j, k, ns, s;

int rune;

s = pfa->initstat;

patlen = 0;

Line 695

Line 834

buf[k++] = (c = getc(f)) != EOF ? c : 0;

}

c = (uschar)buf[j];

/* assert(c < NCHARS); */

if (c < 128)

rune = c;

else {

j--;

k--;

ungetc(c, f);

rune = getrune(f, &buf, &bufsize, quantum, &j, &k);

}

if ((ns = pfa->gototab[s][c]) != 0)

if ((ns = get_gototab(pfa, s, rune)) != 0)

s = ns;

else

s = cgoto(pfa, s, c);

s = cgoto(pfa, s, rune);

if (pfa->out[s]) { /* final state */

patlen = j - i + 1;

Line 1026

Line 1172

return 0;

}

extern int u8_rune(int *, const uschar *); /* run.c; should be in header file */

int relex(void) /* lexical analyzer for reparse */

{

int c, n;

Line 1043

Line 1191

rescan:

starttok = prestr;

if ((n = u8_rune(&rlxval, prestr)) > 1) {

prestr += n;

starttok = prestr;

return CHAR;

}

switch (c = *prestr++) {

case '|': return OR;

case '*': return STAR;

Line 1080

Line 1234

}

else

cflag = 0;

n = 2 * strlen((const char *) prestr)+1;

n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2. what value? */

if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))

FATAL("out of space for reg expr %.10s...", lastre);

for (; ; ) {

if ((n = u8_rune(&rlxval, prestr)) > 1) {

for (i = 0; i < n; i++)

*bp++ = *prestr++;

continue;

}

if ((c = *prestr++) == '\\') {

*bp++ = '\\';

if ((c = *prestr++) == '\0')

Line 1250

Line 1409

int *p, *q;

int i, j, k;

assert(c == HAT || c < NCHARS);

/* assert(c == HAT || c < NCHARS); BUG: seg fault if disable test */

while (f->accept >= maxsetvec) { /* guessing here! */

resizesetvec(__func__);

}

Line 1266

Line 1425

|| (k == DOT && c != 0 && c != HAT)

|| (k == ALL && c != 0)

|| (k == EMPTYRE && c != 0)

|| (k == CCL && member(c, (char *) f->re[p[i]].lval.up))

|| (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))

|| (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {

|| (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {

q = f->re[p[i]].lfollow;

for (j = 1; j <= *q; j++) {

if (q[j] >= maxsetvec) {

Line 1299

Line 1458

goto different;

/* setvec is state i */

if (c != HAT)

f->gototab[s][c] = i;

set_gototab(f, s, c, i);

return i;

different:;

}

Line 1308

Line 1467

++(f->curstat);

resize_state(f, f->curstat);

for (i = 0; i < NCHARS; i++)

f->gototab[f->curstat][i] = 0;

set_gototab(f, f->curstat, 0, 0);

xfree(f->posns[f->curstat]);

p = intalloc(setcnt + 1, __func__);

f->posns[f->curstat] = p;

if (c != HAT)

f->gototab[s][c] = f->curstat;

set_gototab(f, s, c, f->curstat);

for (i = 0; i <= setcnt; i++)

p[i] = tmpset[i];

if (setvec[f->accept])