Annotation of src/usr.bin/awk/b.c, Revision 1.52
1.52 ! millert 1: /* $OpenBSD: b.c,v 1.51 2024/04/25 18:33:53 millert Exp $ */
1.1 tholo 2: /****************************************************************
1.5 kstailey 3: Copyright (C) Lucent Technologies 1997
1.1 tholo 4: All Rights Reserved
5:
6: Permission to use, copy, modify, and distribute this software and
7: its documentation for any purpose and without fee is hereby
8: granted, provided that the above copyright notice appear in all
9: copies and that both that the copyright notice and this
10: permission notice and warranty disclaimer appear in supporting
1.5 kstailey 11: documentation, and that the name Lucent Technologies or any of
12: its entities not be used in advertising or publicity pertaining
13: to distribution of the software without specific, written prior
14: permission.
15:
16: LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17: INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18: IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19: SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20: WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21: IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22: ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23: THIS SOFTWARE.
1.1 tholo 24: ****************************************************************/
25:
1.15 millert 26: /* lasciate ogne speranza, voi ch'intrate. */
1.1 tholo 27:
28: #define DEBUG
29:
30: #include <ctype.h>
1.23 millert 31: #include <limits.h>
1.1 tholo 32: #include <stdio.h>
33: #include <string.h>
34: #include <stdlib.h>
35: #include "awk.h"
1.34 millert 36: #include "awkgram.tab.h"
1.1 tholo 37:
38: #define MAXLIN 22
39:
1.7 millert 40: #define type(v) (v)->nobj /* badly overloaded here */
41: #define info(v) (v)->ntype /* badly overloaded here */
1.1 tholo 42: #define left(v) (v)->narg[0]
43: #define right(v) (v)->narg[1]
44: #define parent(v) (v)->nnext
45:
46: #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
1.15 millert 47: #define ELEAF case EMPTYRE: /* empty string in regexp */
1.1 tholo 48: #define UNARY case STAR: case PLUS: case QUEST:
49:
50: /* encoding in tree Nodes:
1.15 millert 51: leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
1.1 tholo 52: left is index, right contains value or pointer to value
53: unary (STAR, PLUS, QUEST): left is child, right is null
54: binary (CAT, OR): left and right are children
55: parent contains pointer to parent
56: */
57:
58:
59: int *setvec;
60: int *tmpset;
61: int maxsetvec = 0;
62:
63: int rtok; /* next token in current re */
1.4 millert 64: int rlxval;
1.28 millert 65: static const uschar *rlxstr;
66: static const uschar *prestr; /* current position in current re */
67: static const uschar *lastre; /* origin of last re */
68: static const uschar *lastatom; /* origin of last Atom */
69: static const uschar *starttok;
70: static const uschar *basestr; /* starts with original, replaced during
1.23 millert 71: repetition processing */
1.28 millert 72: static const uschar *firstbasestr;
1.1 tholo 73:
1.4 millert 74: static int setcnt;
75: static int poscnt;
1.1 tholo 76:
1.28 millert 77: const char *patbeg;
1.1 tholo 78: int patlen;
79:
1.27 millert 80: #define NFA 128 /* cache this many dynamic fa's */
1.1 tholo 81: fa *fatab[NFA];
82: int nfatab = 0; /* entries in fatab */
83:
1.43 millert 84: extern int u8_nextlen(const char *s);
85:
1.38 millert 86:
87: /* utf-8 mechanism:
88:
89: For most of Awk, utf-8 strings just "work", since they look like
90: null-terminated sequences of 8-bit bytes.
91:
92: Functions like length(), index(), and substr() have to operate
93: in units of utf-8 characters. The u8_* functions in run.c
94: handle this.
95:
96: Regular expressions are more complicated, since the basic
97: mechanism of the goto table used 8-bit byte indices into the
98: gototab entries to compute the next state. Unicode is a lot
99: bigger, so the gototab entries are now structs with a character
1.49 millert 100: and a next state. These are sorted by code point and binary
101: searched.
1.38 millert 102:
103: Throughout the RE mechanism in b.c, utf-8 characters are
104: converted to their utf-32 value. This mostly shows up in
105: cclenter, which expands character class ranges like a-z and now
106: alpha-omega. The size of a gototab array is still about 256.
107: This should be dynamic, but for now things work ok for a single
108: code page of Unicode, which is the most likely case.
109:
110: The code changes are localized in run.c and b.c. I have added a
111: handful of functions to somewhat better hide the implementation,
112: but a lot more could be done.
113:
114: */
115:
1.49 millert 116: static int entry_cmp(const void *l, const void *r);
1.38 millert 117: static int get_gototab(fa*, int, int);
118: static int set_gototab(fa*, int, int, int);
1.49 millert 119: static void clear_gototab(fa*, int);
1.50 millert 120: extern int u8_rune(int *, const char *);
1.38 millert 121:
1.27 millert 122: static int *
123: intalloc(size_t n, const char *f)
124: {
1.35 millert 125: int *p = (int *) calloc(n, sizeof(int));
1.27 millert 126: if (p == NULL)
127: overflo(f);
128: return p;
129: }
130:
131: static void
132: allocsetvec(const char *f)
133: {
134: maxsetvec = MAXLIN;
1.35 millert 135: setvec = (int *) reallocarray(setvec, maxsetvec, sizeof(*setvec));
136: tmpset = (int *) reallocarray(tmpset, maxsetvec, sizeof(*tmpset));
1.27 millert 137: if (setvec == NULL || tmpset == NULL)
138: overflo(f);
139: }
140:
141: static void
142: resizesetvec(const char *f)
143: {
1.35 millert 144: setvec = (int *) reallocarray(setvec, maxsetvec, 4 * sizeof(*setvec));
145: tmpset = (int *) reallocarray(tmpset, maxsetvec, 4 * sizeof(*tmpset));
1.27 millert 146: if (setvec == NULL || tmpset == NULL)
147: overflo(f);
148: maxsetvec *= 4;
149: }
150:
151: static void
152: resize_state(fa *f, int state)
153: {
1.49 millert 154: gtt *p;
1.35 millert 155: uschar *p2;
156: int **p3;
1.27 millert 157: int i, new_count;
158:
159: if (++state < f->state_count)
160: return;
161:
162: new_count = state + 10; /* needs to be tuned */
163:
1.49 millert 164: p = (gtt *) reallocarray(f->gototab, new_count, sizeof(gtt));
1.27 millert 165: if (p == NULL)
166: goto out;
167: f->gototab = p;
168:
1.35 millert 169: p2 = (uschar *) reallocarray(f->out, new_count, sizeof(f->out[0]));
170: if (p2 == NULL)
1.27 millert 171: goto out;
1.35 millert 172: f->out = p2;
1.27 millert 173:
1.35 millert 174: p3 = (int **) reallocarray(f->posns, new_count, sizeof(f->posns[0]));
175: if (p3 == NULL)
1.27 millert 176: goto out;
1.35 millert 177: f->posns = p3;
1.27 millert 178:
179: for (i = f->state_count; i < new_count; ++i) {
1.49 millert 180: f->gototab[i].entries = (gtte *) calloc(NCHARS, sizeof(gtte));
181: if (f->gototab[i].entries == NULL)
1.27 millert 182: goto out;
1.49 millert 183: f->gototab[i].allocated = NCHARS;
184: f->gototab[i].inuse = 0;
185: f->out[i] = 0;
1.27 millert 186: f->posns[i] = NULL;
187: }
188: f->state_count = new_count;
189: return;
190: out:
191: overflo(__func__);
192: }
193:
1.29 millert 194: fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */
1.1 tholo 195: {
196: int i, use, nuse;
197: fa *pfa;
1.6 millert 198: static int now = 1;
1.1 tholo 199:
1.25 millert 200: if (setvec == NULL) { /* first time through any RE */
1.27 millert 201: allocsetvec(__func__);
1.1 tholo 202: }
203:
1.29 millert 204: if (compile_time != RUNNING) /* a constant for sure */
1.1 tholo 205: return mkdfa(s, anchor);
206: for (i = 0; i < nfatab; i++) /* is it there already? */
207: if (fatab[i]->anchor == anchor
1.11 millert 208: && strcmp((const char *) fatab[i]->restr, s) == 0) {
1.6 millert 209: fatab[i]->use = now++;
1.1 tholo 210: return fatab[i];
1.10 millert 211: }
1.1 tholo 212: pfa = mkdfa(s, anchor);
213: if (nfatab < NFA) { /* room for another */
214: fatab[nfatab] = pfa;
1.6 millert 215: fatab[nfatab]->use = now++;
1.1 tholo 216: nfatab++;
217: return pfa;
218: }
219: use = fatab[0]->use; /* replace least-recently used */
220: nuse = 0;
221: for (i = 1; i < nfatab; i++)
222: if (fatab[i]->use < use) {
223: use = fatab[i]->use;
224: nuse = i;
225: }
226: freefa(fatab[nuse]);
227: fatab[nuse] = pfa;
1.6 millert 228: pfa->use = now++;
1.1 tholo 229: return pfa;
230: }
231:
1.29 millert 232: fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */
1.30 millert 233: /* anchor = true for anchored matches, else false */
1.1 tholo 234: {
235: Node *p, *p1;
236: fa *f;
237:
1.28 millert 238: firstbasestr = (const uschar *) s;
1.23 millert 239: basestr = firstbasestr;
1.1 tholo 240: p = reparse(s);
241: p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
242: /* put ALL STAR in front of reg. exp. */
243: p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
244: /* put FINAL after reg. exp. */
245:
246: poscnt = 0;
247: penter(p1); /* enter parent pointers and leaf indices */
1.35 millert 248: if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
1.27 millert 249: overflo(__func__);
1.1 tholo 250: f->accept = poscnt-1; /* penter has computed number of positions in re */
251: cfoll(f, p1); /* set up follow sets */
252: freetr(p1);
1.27 millert 253: resize_state(f, 1);
254: f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
255: f->posns[1] = intalloc(1, __func__);
1.1 tholo 256: *f->posns[1] = 0;
257: f->initstat = makeinit(f, anchor);
258: f->anchor = anchor;
1.10 millert 259: f->restr = (uschar *) tostring(s);
1.23 millert 260: if (firstbasestr != basestr) {
261: if (basestr)
262: xfree(basestr);
263: }
1.1 tholo 264: return f;
265: }
266:
1.29 millert 267: int makeinit(fa *f, bool anchor)
1.1 tholo 268: {
269: int i, k;
270:
271: f->curstat = 2;
272: f->out[2] = 0;
273: k = *(f->re[0].lfollow);
1.25 millert 274: xfree(f->posns[2]);
1.27 millert 275: f->posns[2] = intalloc(k + 1, __func__);
1.30 millert 276: for (i = 0; i <= k; i++) {
1.1 tholo 277: (f->posns[2])[i] = (f->re[0].lfollow)[i];
278: }
279: if ((f->posns[2])[1] == f->accept)
280: f->out[2] = 1;
1.49 millert 281: clear_gototab(f, 2);
1.1 tholo 282: f->curstat = cgoto(f, 2, HAT);
283: if (anchor) {
284: *f->posns[2] = k-1; /* leave out position 0 */
1.30 millert 285: for (i = 0; i < k; i++) {
1.1 tholo 286: (f->posns[0])[i] = (f->posns[2])[i];
287: }
288:
289: f->out[0] = f->out[2];
290: if (f->curstat != 2)
291: --(*f->posns[f->curstat]);
292: }
293: return f->curstat;
294: }
295:
296: void penter(Node *p) /* set up parent pointers and leaf indices */
297: {
298: switch (type(p)) {
1.15 millert 299: ELEAF
1.1 tholo 300: LEAF
1.7 millert 301: info(p) = poscnt;
1.1 tholo 302: poscnt++;
303: break;
304: UNARY
305: penter(left(p));
306: parent(left(p)) = p;
307: break;
308: case CAT:
309: case OR:
310: penter(left(p));
311: penter(right(p));
312: parent(left(p)) = p;
313: parent(right(p)) = p;
314: break;
1.31 millert 315: case ZERO:
316: break;
1.1 tholo 317: default: /* can't happen */
1.8 millert 318: FATAL("can't happen: unknown type %d in penter", type(p));
1.1 tholo 319: break;
320: }
321: }
322:
323: void freetr(Node *p) /* free parse tree */
324: {
325: switch (type(p)) {
1.15 millert 326: ELEAF
1.1 tholo 327: LEAF
328: xfree(p);
329: break;
330: UNARY
1.31 millert 331: case ZERO:
1.1 tholo 332: freetr(left(p));
333: xfree(p);
334: break;
335: case CAT:
336: case OR:
337: freetr(left(p));
338: freetr(right(p));
339: xfree(p);
340: break;
341: default: /* can't happen */
1.8 millert 342: FATAL("can't happen: unknown type %d in freetr", type(p));
1.1 tholo 343: break;
344: }
345: }
346:
347: /* in the parsing of regular expressions, metacharacters like . have */
348: /* to be seen literally; \056 is not a metacharacter. */
349:
1.38 millert 350: int hexstr(const uschar **pp, int max) /* find and eval hex string at pp, return new p */
1.2 millert 351: { /* only pick up one 8-bit byte (2 chars) */
1.28 millert 352: const uschar *p;
1.1 tholo 353: int n = 0;
1.2 millert 354: int i;
1.1 tholo 355:
1.38 millert 356: for (i = 0, p = *pp; i < max && isxdigit(*p); i++, p++) {
1.1 tholo 357: if (isdigit(*p))
358: n = 16 * n + *p - '0';
359: else if (*p >= 'a' && *p <= 'f')
360: n = 16 * n + *p - 'a' + 10;
361: else if (*p >= 'A' && *p <= 'F')
362: n = 16 * n + *p - 'A' + 10;
363: }
1.28 millert 364: *pp = p;
1.1 tholo 365: return n;
366: }
367:
1.2 millert 368: #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
1.1 tholo 369:
1.28 millert 370: int quoted(const uschar **pp) /* pick up next thing after a \\ */
1.1 tholo 371: /* and increment *pp */
372: {
1.28 millert 373: const uschar *p = *pp;
1.1 tholo 374: int c;
375:
1.38 millert 376: /* BUG: should advance by utf-8 char even if makes no sense */
377:
378: if ((c = *p++) == 't') {
1.1 tholo 379: c = '\t';
1.38 millert 380: } else if (c == 'n') {
1.1 tholo 381: c = '\n';
1.38 millert 382: } else if (c == 'f') {
1.1 tholo 383: c = '\f';
1.38 millert 384: } else if (c == 'r') {
1.1 tholo 385: c = '\r';
1.38 millert 386: } else if (c == 'b') {
1.1 tholo 387: c = '\b';
1.38 millert 388: } else if (c == 'v') {
1.25 millert 389: c = '\v';
1.38 millert 390: } else if (c == 'a') {
1.25 millert 391: c = '\a';
1.38 millert 392: } else if (c == '\\') {
1.1 tholo 393: c = '\\';
1.38 millert 394: } else if (c == 'x') { /* 2 hex digits follow */
395: c = hexstr(&p, 2); /* this adds a null if number is invalid */
396: } else if (c == 'u') { /* unicode char number up to 8 hex digits */
397: c = hexstr(&p, 8);
1.1 tholo 398: } else if (isoctdigit(c)) { /* \d \dd \ddd */
399: int n = c - '0';
400: if (isoctdigit(*p)) {
401: n = 8 * n + *p++ - '0';
402: if (isoctdigit(*p))
403: n = 8 * n + *p++ - '0';
404: }
405: c = n;
406: } /* else */
407: /* c = c; */
408: *pp = p;
409: return c;
410: }
411:
1.38 millert 412: int *cclenter(const char *argp) /* add a character class */
1.1 tholo 413: {
414: int i, c, c2;
1.38 millert 415: int n;
416: const uschar *p = (const uschar *) argp;
417: int *bp, *retp;
418: static int *buf = NULL;
1.5 kstailey 419: static int bufsz = 100;
1.1 tholo 420:
1.38 millert 421: if (buf == NULL && (buf = (int *) calloc(bufsz, sizeof(int))) == NULL)
1.8 millert 422: FATAL("out of space for character class [%.10s...] 1", p);
1.5 kstailey 423: bp = buf;
1.38 millert 424: for (i = 0; *p != 0; ) {
1.50 millert 425: n = u8_rune(&c, (const char *) p);
1.38 millert 426: p += n;
1.1 tholo 427: if (c == '\\') {
1.17 millert 428: c = quoted(&p);
1.5 kstailey 429: } else if (c == '-' && i > 0 && bp[-1] != 0) {
1.1 tholo 430: if (*p != 0) {
1.5 kstailey 431: c = bp[-1];
1.38 millert 432: /* c2 = *p++; */
1.50 millert 433: n = u8_rune(&c2, (const char *) p);
1.38 millert 434: p += n;
1.1 tholo 435: if (c2 == '\\')
1.38 millert 436: c2 = quoted(&p); /* BUG: sets p, has to be u8 size */
1.1 tholo 437: if (c > c2) { /* empty; ignore */
1.5 kstailey 438: bp--;
1.1 tholo 439: i--;
440: continue;
441: }
442: while (c < c2) {
1.38 millert 443: if (i >= bufsz) {
1.45 millert 444: buf = (int *) reallocarray(buf, bufsz, 2 * sizeof(int));
1.38 millert 445: if (buf == NULL)
446: FATAL("out of space for character class [%.10s...] 2", p);
447: bufsz *= 2;
448: bp = buf + i;
449: }
1.5 kstailey 450: *bp++ = ++c;
1.1 tholo 451: i++;
452: }
453: continue;
454: }
455: }
1.38 millert 456: if (i >= bufsz) {
1.45 millert 457: buf = (int *) reallocarray(buf, bufsz, 2 * sizeof(int));
1.38 millert 458: if (buf == NULL)
459: FATAL("out of space for character class [%.10s...] 2", p);
460: bufsz *= 2;
461: bp = buf + i;
462: }
1.5 kstailey 463: *bp++ = c;
1.1 tholo 464: i++;
465: }
1.5 kstailey 466: *bp = 0;
1.38 millert 467: /* DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf); BUG: can't print array of int */
468: /* xfree(op); BUG: what are we freeing here? */
469: retp = (int *) calloc(bp-buf+1, sizeof(int));
470: for (i = 0; i < bp-buf+1; i++)
471: retp[i] = buf[i];
472: return retp;
1.1 tholo 473: }
474:
1.11 millert 475: void overflo(const char *s)
1.1 tholo 476: {
1.27 millert 477: FATAL("regular expression too big: out of space in %.30s...", s);
1.1 tholo 478: }
479:
480: void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
481: {
482: int i;
1.4 millert 483: int *p;
1.1 tholo 484:
485: switch (type(v)) {
1.15 millert 486: ELEAF
1.1 tholo 487: LEAF
1.7 millert 488: f->re[info(v)].ltype = type(v);
489: f->re[info(v)].lval.np = right(v);
1.1 tholo 490: while (f->accept >= maxsetvec) { /* guessing here! */
1.27 millert 491: resizesetvec(__func__);
1.1 tholo 492: }
493: for (i = 0; i <= f->accept; i++)
494: setvec[i] = 0;
495: setcnt = 0;
496: follow(v); /* computes setvec and setcnt */
1.27 millert 497: p = intalloc(setcnt + 1, __func__);
1.7 millert 498: f->re[info(v)].lfollow = p;
1.1 tholo 499: *p = setcnt;
500: for (i = f->accept; i >= 0; i--)
501: if (setvec[i] == 1)
502: *++p = i;
503: break;
504: UNARY
505: cfoll(f,left(v));
506: break;
507: case CAT:
508: case OR:
509: cfoll(f,left(v));
510: cfoll(f,right(v));
511: break;
1.31 millert 512: case ZERO:
513: break;
1.1 tholo 514: default: /* can't happen */
1.8 millert 515: FATAL("can't happen: unknown type %d in cfoll", type(v));
1.1 tholo 516: }
517: }
518:
519: int first(Node *p) /* collects initially active leaves of p into setvec */
1.15 millert 520: /* returns 0 if p matches empty string */
1.1 tholo 521: {
1.4 millert 522: int b, lp;
1.1 tholo 523:
524: switch (type(p)) {
1.15 millert 525: ELEAF
1.1 tholo 526: LEAF
1.7 millert 527: lp = info(p); /* look for high-water mark of subscripts */
1.1 tholo 528: while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
1.27 millert 529: resizesetvec(__func__);
1.1 tholo 530: }
1.15 millert 531: if (type(p) == EMPTYRE) {
532: setvec[lp] = 0;
533: return(0);
534: }
1.1 tholo 535: if (setvec[lp] != 1) {
536: setvec[lp] = 1;
537: setcnt++;
538: }
1.41 millert 539: if (type(p) == CCL && (*(int *) right(p)) == 0)
1.1 tholo 540: return(0); /* empty CCL */
1.30 millert 541: return(1);
1.1 tholo 542: case PLUS:
1.30 millert 543: if (first(left(p)) == 0)
544: return(0);
1.1 tholo 545: return(1);
546: case STAR:
547: case QUEST:
548: first(left(p));
549: return(0);
550: case CAT:
551: if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
552: return(1);
553: case OR:
554: b = first(right(p));
555: if (first(left(p)) == 0 || b == 0) return(0);
556: return(1);
1.31 millert 557: case ZERO:
558: return 0;
1.1 tholo 559: }
1.8 millert 560: FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
1.1 tholo 561: return(-1);
562: }
563:
564: void follow(Node *v) /* collects leaves that can follow v into setvec */
565: {
566: Node *p;
567:
568: if (type(v) == FINAL)
569: return;
570: p = parent(v);
571: switch (type(p)) {
572: case STAR:
573: case PLUS:
574: first(v);
575: follow(p);
576: return;
577:
578: case OR:
579: case QUEST:
580: follow(p);
581: return;
582:
583: case CAT:
584: if (v == left(p)) { /* v is left child of p */
585: if (first(right(p)) == 0) {
586: follow(p);
587: return;
588: }
589: } else /* v is right child */
590: follow(p);
591: return;
592: }
593: }
594:
1.38 millert 595: int member(int c, int *sarg) /* is c in s? */
1.1 tholo 596: {
1.38 millert 597: int *s = (int *) sarg;
1.10 millert 598:
1.1 tholo 599: while (*s)
600: if (c == *s++)
601: return(1);
602: return(0);
603: }
604:
1.49 millert 605: static void resize_gototab(fa *f, int state)
606: {
607: size_t new_size = f->gototab[state].allocated * 2;
608: gtte *p = (gtte *) realloc(f->gototab[state].entries, new_size * sizeof(gtte));
609: if (p == NULL)
610: overflo(__func__);
611:
612: // need to initialized the new memory to zero
613: size_t orig_size = f->gototab[state].allocated; // 2nd half of new mem is this size
614: memset(p + orig_size, 0, orig_size * sizeof(gtte)); // clean it out
615:
1.51 millert 616: f->gototab[state].allocated = new_size; // update gototab info
1.49 millert 617: f->gototab[state].entries = p;
618: }
619:
1.51 millert 620: static int get_gototab(fa *f, int state, int ch) /* hide gototab implementation */
1.38 millert 621: {
1.49 millert 622: gtte key;
623: gtte *item;
624:
625: key.ch = ch;
626: key.state = 0; /* irrelevant */
1.50 millert 627: item = (gtte *) bsearch(& key, f->gototab[state].entries,
1.49 millert 628: f->gototab[state].inuse, sizeof(gtte),
629: entry_cmp);
630:
631: if (item == NULL)
632: return 0;
633: else
634: return item->state;
1.38 millert 635: }
636:
1.49 millert 637: static int entry_cmp(const void *l, const void *r)
1.44 millert 638: {
1.49 millert 639: const gtte *left, *right;
640:
641: left = (const gtte *) l;
642: right = (const gtte *) r;
643:
644: return left->ch - right->ch;
1.44 millert 645: }
646:
1.51 millert 647: static int set_gototab(fa *f, int state, int ch, int val) /* hide gototab implementation */
1.38 millert 648: {
1.49 millert 649: if (f->gototab[state].inuse == 0) {
650: f->gototab[state].entries[0].ch = ch;
651: f->gototab[state].entries[0].state = val;
652: f->gototab[state].inuse++;
653: return val;
1.52 ! millert 654: } else if ((unsigned)ch > f->gototab[state].entries[f->gototab[state].inuse-1].ch) {
1.49 millert 655: // not seen yet, insert and return
656: gtt *tab = & f->gototab[state];
657: if (tab->inuse + 1 >= tab->allocated)
658: resize_gototab(f, state);
659:
1.51 millert 660: f->gototab[state].entries[f->gototab[state].inuse].ch = ch;
661: f->gototab[state].entries[f->gototab[state].inuse].state = val;
1.49 millert 662: f->gototab[state].inuse++;
663: return val;
664: } else {
665: // maybe we have it, maybe we don't
666: gtte key;
667: gtte *item;
668:
669: key.ch = ch;
670: key.state = 0; /* irrelevant */
1.50 millert 671: item = (gtte *) bsearch(& key, f->gototab[state].entries,
1.49 millert 672: f->gototab[state].inuse, sizeof(gtte),
673: entry_cmp);
674:
675: if (item != NULL) {
676: // we have it, update state and return
677: item->state = val;
678: return item->state;
679: }
680: // otherwise, fall through to insert and reallocate.
681: }
682:
683: gtt *tab = & f->gototab[state];
684: if (tab->inuse + 1 >= tab->allocated)
685: resize_gototab(f, state);
686: f->gototab[state].entries[tab->inuse].ch = ch;
687: f->gototab[state].entries[tab->inuse].state = val;
1.51 millert 688: ++tab->inuse;
1.49 millert 689:
690: qsort(f->gototab[state].entries,
691: f->gototab[state].inuse, sizeof(gtte), entry_cmp);
692:
1.38 millert 693: return val; /* not used anywhere at the moment */
694: }
695:
1.49 millert 696: static void clear_gototab(fa *f, int state)
697: {
698: memset(f->gototab[state].entries, 0,
699: f->gototab[state].allocated * sizeof(gtte));
700: f->gototab[state].inuse = 0;
701: }
702:
1.11 millert 703: int match(fa *f, const char *p0) /* shortest match ? */
1.1 tholo 704: {
705: int s, ns;
1.38 millert 706: int n;
707: int rune;
1.28 millert 708: const uschar *p = (const uschar *) p0;
1.1 tholo 709:
1.38 millert 710: /* return pmatch(f, p0); does it matter whether longest or shortest? */
711:
1.27 millert 712: s = f->initstat;
713: assert (s < f->state_count);
714:
1.1 tholo 715: if (f->out[s])
716: return(1);
717: do {
1.15 millert 718: /* assert(*p < NCHARS); */
1.50 millert 719: n = u8_rune(&rune, (const char *) p);
1.38 millert 720: if ((ns = get_gototab(f, s, rune)) != 0)
1.1 tholo 721: s = ns;
722: else
1.38 millert 723: s = cgoto(f, s, rune);
1.1 tholo 724: if (f->out[s])
725: return(1);
1.38 millert 726: if (*p == 0)
727: break;
728: p += n;
729: } while (1); /* was *p++ != 0 */
1.1 tholo 730: return(0);
731: }
732:
1.11 millert 733: int pmatch(fa *f, const char *p0) /* longest match, for sub */
1.1 tholo 734: {
735: int s, ns;
1.38 millert 736: int n;
737: int rune;
1.28 millert 738: const uschar *p = (const uschar *) p0;
739: const uschar *q;
1.1 tholo 740:
1.27 millert 741: s = f->initstat;
742: assert(s < f->state_count);
743:
1.28 millert 744: patbeg = (const char *)p;
1.1 tholo 745: patlen = -1;
746: do {
747: q = p;
748: do {
749: if (f->out[s]) /* final state */
750: patlen = q-p;
1.15 millert 751: /* assert(*q < NCHARS); */
1.50 millert 752: n = u8_rune(&rune, (const char *) q);
1.38 millert 753: if ((ns = get_gototab(f, s, rune)) != 0)
1.1 tholo 754: s = ns;
755: else
1.38 millert 756: s = cgoto(f, s, rune);
1.27 millert 757:
758: assert(s < f->state_count);
759:
1.9 deraadt 760: if (s == 1) { /* no transition */
1.1 tholo 761: if (patlen >= 0) {
1.28 millert 762: patbeg = (const char *) p;
1.1 tholo 763: return(1);
764: }
765: else
766: goto nextin; /* no match */
1.9 deraadt 767: }
1.38 millert 768: if (*q == 0)
769: break;
770: q += n;
771: } while (1);
772: q++; /* was *q++ */
1.1 tholo 773: if (f->out[s])
774: patlen = q-p-1; /* don't count $ */
775: if (patlen >= 0) {
1.28 millert 776: patbeg = (const char *) p;
1.1 tholo 777: return(1);
778: }
779: nextin:
780: s = 2;
1.38 millert 781: if (*p == 0)
782: break;
1.50 millert 783: n = u8_rune(&rune, (const char *) p);
1.38 millert 784: p += n;
785: } while (1); /* was *p++ */
1.1 tholo 786: return (0);
787: }
788:
1.11 millert 789: int nematch(fa *f, const char *p0) /* non-empty match, for sub */
1.1 tholo 790: {
791: int s, ns;
1.38 millert 792: int n;
793: int rune;
1.28 millert 794: const uschar *p = (const uschar *) p0;
795: const uschar *q;
1.1 tholo 796:
1.27 millert 797: s = f->initstat;
798: assert(s < f->state_count);
799:
1.28 millert 800: patbeg = (const char *)p;
1.1 tholo 801: patlen = -1;
802: while (*p) {
803: q = p;
804: do {
805: if (f->out[s]) /* final state */
806: patlen = q-p;
1.15 millert 807: /* assert(*q < NCHARS); */
1.50 millert 808: n = u8_rune(&rune, (const char *) q);
1.38 millert 809: if ((ns = get_gototab(f, s, rune)) != 0)
1.1 tholo 810: s = ns;
811: else
1.38 millert 812: s = cgoto(f, s, rune);
1.9 deraadt 813: if (s == 1) { /* no transition */
1.1 tholo 814: if (patlen > 0) {
1.28 millert 815: patbeg = (const char *) p;
1.1 tholo 816: return(1);
817: } else
818: goto nnextin; /* no nonempty match */
1.9 deraadt 819: }
1.38 millert 820: if (*q == 0)
821: break;
822: q += n;
823: } while (1);
824: q++;
1.1 tholo 825: if (f->out[s])
826: patlen = q-p-1; /* don't count $ */
827: if (patlen > 0 ) {
1.28 millert 828: patbeg = (const char *) p;
1.1 tholo 829: return(1);
830: }
831: nnextin:
832: s = 2;
833: p++;
834: }
835: return (0);
836: }
837:
1.43 millert 838:
1.26 millert 839: /*
840: * NAME
841: * fnematch
842: *
843: * DESCRIPTION
844: * A stream-fed version of nematch which transfers characters to a
845: * null-terminated buffer. All characters up to and including the last
846: * character of the matching text or EOF are placed in the buffer. If
847: * a match is found, patbeg and patlen are set appropriately.
848: *
849: * RETURN VALUES
1.29 millert 850: * false No match found.
851: * true Match found.
1.26 millert 852: */
853:
1.29 millert 854: bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
1.26 millert 855: {
1.48 millert 856: char *i, *j, *k, *buf = *pbuf;
1.26 millert 857: int bufsize = *pbufsize;
1.48 millert 858: int c, n, ns, s;
1.26 millert 859:
860: s = pfa->initstat;
861: patlen = 0;
862:
863: /*
1.48 millert 864: * buf <= i <= j <= k <= buf+bufsize
1.26 millert 865: *
1.48 millert 866: * i: origin of active substring
867: * j: current character
868: * k: destination of the next getc
1.26 millert 869: */
1.48 millert 870:
871: i = j = k = buf;
872:
873: do {
874: /*
1.51 millert 875: * Call u8_rune with at least awk_mb_cur_max ahead in
1.48 millert 876: * the buffer until EOF interferes.
877: */
1.52 ! millert 878: if (k - j < (int)awk_mb_cur_max) {
1.51 millert 879: if (k + awk_mb_cur_max > buf + bufsize) {
880: char *obuf = buf;
1.48 millert 881: adjbuf(&buf, &bufsize,
1.51 millert 882: bufsize + awk_mb_cur_max,
1.48 millert 883: quantum, 0, "fnematch");
1.51 millert 884:
885: /* buf resized, maybe moved. update pointers */
886: *pbufsize = bufsize;
887: if (obuf != buf) {
888: i = buf + (i - obuf);
889: j = buf + (j - obuf);
890: k = buf + (k - obuf);
891: *pbuf = buf;
892: if (patlen)
893: patbeg = buf + (patbeg - obuf);
894: }
1.46 millert 895: }
1.51 millert 896: for (n = awk_mb_cur_max ; n > 0; n--) {
1.48 millert 897: *k++ = (c = getc(f)) != EOF ? c : 0;
898: if (c == EOF) {
899: if (ferror(f))
900: FATAL("fnematch: getc error");
901: break;
902: }
1.38 millert 903: }
1.48 millert 904: }
1.26 millert 905:
1.50 millert 906: j += u8_rune(&c, j);
1.48 millert 907:
908: if ((ns = get_gototab(pfa, s, c)) != 0)
909: s = ns;
910: else
911: s = cgoto(pfa, s, c);
1.26 millert 912:
1.48 millert 913: if (pfa->out[s]) { /* final state */
914: patbeg = i;
915: patlen = j - i;
916: if (c == 0) /* don't count $ */
917: patlen--;
918: }
919:
920: if (c && s != 1)
921: continue; /* origin i still viable, next j */
922: if (patlen)
923: break; /* best match found */
924:
925: /* no match at origin i, next i and start over */
1.50 millert 926: i += u8_rune(&c, i);
1.48 millert 927: if (c == 0)
928: break; /* no match */
929: j = i;
1.26 millert 930: s = 2;
1.48 millert 931: } while (1);
1.26 millert 932:
933: if (patlen) {
934: /*
935: * Under no circumstances is the last character fed to
936: * the automaton part of the match. It is EOF's nullbyte,
937: * or it sent the automaton into a state with no further
938: * transitions available (s==1), or both. Room for a
939: * terminating nullbyte is guaranteed.
940: *
941: * ungetc any chars after the end of matching text
942: * (except for EOF's nullbyte, if present) and null
943: * terminate the buffer.
944: */
1.48 millert 945: do
946: if (*--k && ungetc(*k, f) == EOF)
947: FATAL("unable to ungetc '%c'", *k);
948: while (k > patbeg + patlen);
949: *k = '\0';
1.29 millert 950: return true;
1.26 millert 951: }
952: else
1.29 millert 953: return false;
1.26 millert 954: }
955:
1.11 millert 956: Node *reparse(const char *p) /* parses regular expression pointed to by p */
1.1 tholo 957: { /* uses relex() to scan regular expression */
958: Node *np;
959:
1.32 millert 960: DPRINTF("reparse <%s>\n", p);
1.28 millert 961: lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */
1.1 tholo 962: rtok = relex();
1.11 millert 963: /* GNU compatibility: an empty regexp matches anything */
1.15 millert 964: if (rtok == '\0') {
1.11 millert 965: /* FATAL("empty regular expression"); previous */
1.15 millert 966: return(op2(EMPTYRE, NIL, NIL));
967: }
1.1 tholo 968: np = regexp();
969: if (rtok != '\0')
1.8 millert 970: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 971: return(np);
972: }
973:
974: Node *regexp(void) /* top-level parse of reg expr */
975: {
976: return (alt(concat(primary())));
977: }
978:
979: Node *primary(void)
980: {
981: Node *np;
1.23 millert 982: int savelastatom;
1.1 tholo 983:
984: switch (rtok) {
985: case CHAR:
1.23 millert 986: lastatom = starttok;
1.7 millert 987: np = op2(CHAR, NIL, itonp(rlxval));
1.1 tholo 988: rtok = relex();
989: return (unary(np));
990: case ALL:
991: rtok = relex();
992: return (unary(op2(ALL, NIL, NIL)));
1.15 millert 993: case EMPTYRE:
994: rtok = relex();
1.23 millert 995: return (unary(op2(EMPTYRE, NIL, NIL)));
1.1 tholo 996: case DOT:
1.23 millert 997: lastatom = starttok;
1.1 tholo 998: rtok = relex();
999: return (unary(op2(DOT, NIL, NIL)));
1000: case CCL:
1.28 millert 1001: np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
1.23 millert 1002: lastatom = starttok;
1.1 tholo 1003: rtok = relex();
1004: return (unary(np));
1005: case NCCL:
1.28 millert 1006: np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
1.23 millert 1007: lastatom = starttok;
1.1 tholo 1008: rtok = relex();
1009: return (unary(np));
1010: case '^':
1011: rtok = relex();
1.7 millert 1012: return (unary(op2(CHAR, NIL, itonp(HAT))));
1.1 tholo 1013: case '$':
1014: rtok = relex();
1015: return (unary(op2(CHAR, NIL, NIL)));
1016: case '(':
1.23 millert 1017: lastatom = starttok;
1018: savelastatom = starttok - basestr; /* Retain over recursion */
1.1 tholo 1019: rtok = relex();
1020: if (rtok == ')') { /* special pleading for () */
1021: rtok = relex();
1.42 millert 1022: return unary(op2(CCL, NIL, (Node *) cclenter("")));
1.1 tholo 1023: }
1024: np = regexp();
1025: if (rtok == ')') {
1.23 millert 1026: lastatom = basestr + savelastatom; /* Restore */
1.1 tholo 1027: rtok = relex();
1028: return (unary(np));
1029: }
1030: else
1.8 millert 1031: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 1032: default:
1.8 millert 1033: FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1.1 tholo 1034: }
1035: return 0; /*NOTREACHED*/
1036: }
1037:
1038: Node *concat(Node *np)
1039: {
1040: switch (rtok) {
1.23 millert 1041: case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1.1 tholo 1042: return (concat(op2(CAT, np, primary())));
1.23 millert 1043: case EMPTYRE:
1044: rtok = relex();
1.42 millert 1045: return (concat(op2(CAT, op2(CCL, NIL, (Node *) cclenter("")),
1.23 millert 1046: primary())));
1.1 tholo 1047: }
1048: return (np);
1049: }
1050:
1051: Node *alt(Node *np)
1052: {
1053: if (rtok == OR) {
1054: rtok = relex();
1055: return (alt(op2(OR, np, concat(primary()))));
1056: }
1057: return (np);
1058: }
1059:
1060: Node *unary(Node *np)
1061: {
1062: switch (rtok) {
1063: case STAR:
1064: rtok = relex();
1065: return (unary(op2(STAR, np, NIL)));
1066: case PLUS:
1067: rtok = relex();
1068: return (unary(op2(PLUS, np, NIL)));
1069: case QUEST:
1070: rtok = relex();
1071: return (unary(op2(QUEST, np, NIL)));
1.31 millert 1072: case ZERO:
1073: rtok = relex();
1074: return (unary(op2(ZERO, np, NIL)));
1.1 tholo 1075: default:
1076: return (np);
1077: }
1078: }
1079:
1.11 millert 1080: /*
1081: * Character class definitions conformant to the POSIX locale as
1082: * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
1083: * and operating character sets are both ASCII (ISO646) or supersets
1084: * thereof.
1085: *
1086: * Note that to avoid overflowing the temporary buffer used in
1087: * relex(), the expanded character class (prior to range expansion)
1088: * must be less than twice the size of their full name.
1089: */
1.12 millert 1090:
1091: /* Because isblank doesn't show up in any of the header files on any
1092: * system i use, it's defined here. if some other locale has a richer
1093: * definition of "blank", define HAS_ISBLANK and provide your own
1094: * version.
1095: * the parentheses here are an attempt to find a path through the maze
1096: * of macro definition and/or function and/or version provided. thanks
1097: * to nelson beebe for the suggestion; let's see if it works everywhere.
1098: */
1099:
1100: #ifndef HAS_ISBLANK
1101:
1.17 millert 1102: int (xisblank)(int c)
1.12 millert 1103: {
1104: return c==' ' || c=='\t';
1105: }
1106:
1107: #endif
1108:
1.31 millert 1109: static const struct charclass {
1.11 millert 1110: const char *cc_name;
1111: int cc_namelen;
1.12 millert 1112: int (*cc_func)(int);
1.11 millert 1113: } charclasses[] = {
1.12 millert 1114: { "alnum", 5, isalnum },
1115: { "alpha", 5, isalpha },
1.17 millert 1116: #ifndef HAS_ISBLANK
1.21 millert 1117: { "blank", 5, xisblank },
1.17 millert 1118: #else
1.12 millert 1119: { "blank", 5, isblank },
1.17 millert 1120: #endif
1.12 millert 1121: { "cntrl", 5, iscntrl },
1122: { "digit", 5, isdigit },
1123: { "graph", 5, isgraph },
1124: { "lower", 5, islower },
1125: { "print", 5, isprint },
1126: { "punct", 5, ispunct },
1127: { "space", 5, isspace },
1128: { "upper", 5, isupper },
1129: { "xdigit", 6, isxdigit },
1.11 millert 1130: { NULL, 0, NULL },
1131: };
1132:
1.23 millert 1133: #define REPEAT_SIMPLE 0
1134: #define REPEAT_PLUS_APPENDED 1
1135: #define REPEAT_WITH_Q 2
1136: #define REPEAT_ZERO 3
1137:
1138: static int
1139: replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1140: int atomlen, int firstnum, int secondnum, int special_case)
1141: {
1142: int i, j;
1.26 millert 1143: uschar *buf = NULL;
1.23 millert 1144: int ret = 1;
1.31 millert 1145: int init_q = (firstnum == 0); /* first added char will be ? */
1.23 millert 1146: int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */
1147: int prefix_length = reptok - basestr; /* prefix includes first rep */
1.28 millert 1148: int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */
1.23 millert 1149: int size = prefix_length + suffix_length;
1150:
1151: if (firstnum > 1) { /* add room for reps 2 through firstnum */
1152: size += atomlen*(firstnum-1);
1153: }
1154:
1155: /* Adjust size of buffer for special cases */
1156: if (special_case == REPEAT_PLUS_APPENDED) {
1157: size++; /* for the final + */
1158: } else if (special_case == REPEAT_WITH_Q) {
1.36 millert 1159: size += init_q + (atomlen+1)* (n_q_reps-init_q);
1.23 millert 1160: } else if (special_case == REPEAT_ZERO) {
1161: size += 2; /* just a null ERE: () */
1162: }
1.35 millert 1163: if ((buf = (uschar *) malloc(size + 1)) == NULL)
1.23 millert 1164: FATAL("out of space in reg expr %.10s..", lastre);
1165: memcpy(buf, basestr, prefix_length); /* copy prefix */
1166: j = prefix_length;
1167: if (special_case == REPEAT_ZERO) {
1168: j -= atomlen;
1169: buf[j++] = '(';
1170: buf[j++] = ')';
1171: }
1.30 millert 1172: for (i = 1; i < firstnum; i++) { /* copy x reps */
1.23 millert 1173: memcpy(&buf[j], atom, atomlen);
1174: j += atomlen;
1175: }
1176: if (special_case == REPEAT_PLUS_APPENDED) {
1177: buf[j++] = '+';
1178: } else if (special_case == REPEAT_WITH_Q) {
1.29 millert 1179: if (init_q)
1180: buf[j++] = '?';
1.30 millert 1181: for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */
1.23 millert 1182: memcpy(&buf[j], atom, atomlen);
1183: j += atomlen;
1184: buf[j++] = '?';
1185: }
1186: }
1187: memcpy(&buf[j], reptok+reptoklen, suffix_length);
1.36 millert 1188: j += suffix_length;
1189: buf[j] = '\0';
1.23 millert 1190: /* free old basestr */
1191: if (firstbasestr != basestr) {
1192: if (basestr)
1193: xfree(basestr);
1194: }
1195: basestr = buf;
1196: prestr = buf + prefix_length;
1197: if (special_case == REPEAT_ZERO) {
1198: prestr -= atomlen;
1199: ret++;
1200: }
1201: return ret;
1202: }
1203:
1204: static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
1205: int atomlen, int firstnum, int secondnum)
1206: {
1207: /*
1208: In general, the repetition specifier or "bound" is replaced here
1209: by an equivalent ERE string, repeating the immediately previous atom
1210: and appending ? and + as needed. Note that the first copy of the
1211: atom is left in place, except in the special_case of a zero-repeat
1212: (i.e., {0}).
1213: */
1214: if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */
1215: if (firstnum < 2) {
1216: /* 0 or 1: should be handled before you get here */
1217: FATAL("internal error");
1218: } else {
1219: return replace_repeat(reptok, reptoklen, atom, atomlen,
1220: firstnum, secondnum, REPEAT_PLUS_APPENDED);
1221: }
1222: } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */
1223: if (firstnum == 0) { /* {0} or {0,0} */
1224: /* This case is unusual because the resulting
1225: replacement string might actually be SMALLER than
1226: the original ERE */
1227: return replace_repeat(reptok, reptoklen, atom, atomlen,
1228: firstnum, secondnum, REPEAT_ZERO);
1229: } else { /* (firstnum >= 1) */
1230: return replace_repeat(reptok, reptoklen, atom, atomlen,
1231: firstnum, secondnum, REPEAT_SIMPLE);
1232: }
1233: } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */
1234: /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
1235: return replace_repeat(reptok, reptoklen, atom, atomlen,
1236: firstnum, secondnum, REPEAT_WITH_Q);
1237: } else { /* Error - shouldn't be here (n>m) */
1238: FATAL("internal error");
1239: }
1240: return 0;
1241: }
1.11 millert 1242:
1.1 tholo 1243: int relex(void) /* lexical analyzer for reparse */
1244: {
1.5 kstailey 1245: int c, n;
1.1 tholo 1246: int cflag;
1.25 millert 1247: static uschar *buf = NULL;
1.5 kstailey 1248: static int bufsz = 100;
1.10 millert 1249: uschar *bp;
1.31 millert 1250: const struct charclass *cc;
1.12 millert 1251: int i;
1.29 millert 1252: int num, m;
1253: bool commafound, digitfound;
1.23 millert 1254: const uschar *startreptok;
1.24 millert 1255: static int parens = 0;
1.23 millert 1256:
1257: rescan:
1258: starttok = prestr;
1.1 tholo 1259:
1.50 millert 1260: if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1.38 millert 1261: prestr += n;
1262: starttok = prestr;
1263: return CHAR;
1264: }
1265:
1.1 tholo 1266: switch (c = *prestr++) {
1267: case '|': return OR;
1268: case '*': return STAR;
1269: case '+': return PLUS;
1270: case '?': return QUEST;
1271: case '.': return DOT;
1272: case '\0': prestr--; return '\0';
1273: case '^':
1274: case '$':
1.24 millert 1275: return c;
1.1 tholo 1276: case '(':
1.24 millert 1277: parens++;
1278: return c;
1.1 tholo 1279: case ')':
1.24 millert 1280: if (parens) {
1281: parens--;
1282: return c;
1283: }
1284: /* unmatched close parenthesis; per POSIX, treat as literal */
1285: rlxval = c;
1286: return CHAR;
1.1 tholo 1287: case '\\':
1.17 millert 1288: rlxval = quoted(&prestr);
1.1 tholo 1289: return CHAR;
1290: default:
1291: rlxval = c;
1292: return CHAR;
1.25 millert 1293: case '[':
1.35 millert 1294: if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1.8 millert 1295: FATAL("out of space in reg expr %.10s..", lastre);
1.5 kstailey 1296: bp = buf;
1.1 tholo 1297: if (*prestr == '^') {
1298: cflag = 1;
1299: prestr++;
1300: }
1301: else
1302: cflag = 0;
1.38 millert 1303: n = 5 * strlen((const char *) prestr)+1; /* BUG: was 2. what value? */
1.15 millert 1304: if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1.8 millert 1305: FATAL("out of space for reg expr %.10s...", lastre);
1.1 tholo 1306: for (; ; ) {
1.50 millert 1307: if ((n = u8_rune(&rlxval, (const char *) prestr)) > 1) {
1.38 millert 1308: for (i = 0; i < n; i++)
1309: *bp++ = *prestr++;
1310: continue;
1311: }
1.1 tholo 1312: if ((c = *prestr++) == '\\') {
1.5 kstailey 1313: *bp++ = '\\';
1.1 tholo 1314: if ((c = *prestr++) == '\0')
1.8 millert 1315: FATAL("nonterminated character class %.20s...", lastre);
1.5 kstailey 1316: *bp++ = c;
1.10 millert 1317: /* } else if (c == '\n') { */
1318: /* FATAL("newline in character class %.20s...", lastre); */
1.11 millert 1319: } else if (c == '[' && *prestr == ':') {
1320: /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1321: for (cc = charclasses; cc->cc_name; cc++)
1322: if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1323: break;
1324: if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1325: prestr[2 + cc->cc_namelen] == ']') {
1326: prestr += cc->cc_namelen + 3;
1.22 millert 1327: /*
1328: * BUG: We begin at 1, instead of 0, since we
1329: * would otherwise prematurely terminate the
1330: * string for classes like [[:cntrl:]]. This
1331: * means that we can't match the NUL character,
1332: * not without first adapting the entire
1333: * program to track each string's length.
1334: */
1.23 millert 1335: for (i = 1; i <= UCHAR_MAX; i++) {
1.37 millert 1336: if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "relex2"))
1.12 millert 1337: FATAL("out of space for reg expr %.10s...", lastre);
1338: if (cc->cc_func(i)) {
1.32 millert 1339: /* escape backslash */
1340: if (i == '\\') {
1341: *bp++ = '\\';
1342: n++;
1343: }
1344:
1.12 millert 1345: *bp++ = i;
1346: n++;
1347: }
1348: }
1.11 millert 1349: } else
1350: *bp++ = c;
1.23 millert 1351: } else if (c == '[' && *prestr == '.') {
1352: char collate_char;
1353: prestr++;
1354: collate_char = *prestr++;
1355: if (*prestr == '.' && prestr[1] == ']') {
1356: prestr += 2;
1357: /* Found it: map via locale TBD: for
1358: now, simply return this char. This
1359: is sufficient to pass conformance
1360: test awk.ex 156
1361: */
1362: if (*prestr == ']') {
1363: prestr++;
1364: rlxval = collate_char;
1365: return CHAR;
1366: }
1367: }
1368: } else if (c == '[' && *prestr == '=') {
1369: char equiv_char;
1370: prestr++;
1371: equiv_char = *prestr++;
1372: if (*prestr == '=' && prestr[1] == ']') {
1373: prestr += 2;
1374: /* Found it: map via locale TBD: for now
1375: simply return this char. This is
1376: sufficient to pass conformance test
1377: awk.ex 156
1378: */
1379: if (*prestr == ']') {
1380: prestr++;
1381: rlxval = equiv_char;
1382: return CHAR;
1383: }
1384: }
1.1 tholo 1385: } else if (c == '\0') {
1.8 millert 1386: FATAL("nonterminated character class %.20s", lastre);
1.5 kstailey 1387: } else if (bp == buf) { /* 1st char is special */
1388: *bp++ = c;
1.1 tholo 1389: } else if (c == ']') {
1.5 kstailey 1390: *bp++ = 0;
1.10 millert 1391: rlxstr = (uschar *) tostring((char *) buf);
1.1 tholo 1392: if (cflag == 0)
1393: return CCL;
1394: else
1395: return NCCL;
1396: } else
1.5 kstailey 1397: *bp++ = c;
1.1 tholo 1398: }
1.23 millert 1399: break;
1400: case '{':
1401: if (isdigit(*(prestr))) {
1402: num = 0; /* Process as a repetition */
1403: n = -1; m = -1;
1.29 millert 1404: commafound = false;
1405: digitfound = false;
1.23 millert 1406: startreptok = prestr-1;
1407: /* Remember start of previous atom here ? */
1408: } else { /* just a { char, not a repetition */
1409: rlxval = c;
1410: return CHAR;
1411: }
1412: for (; ; ) {
1413: if ((c = *prestr++) == '}') {
1414: if (commafound) {
1415: if (digitfound) { /* {n,m} */
1416: m = num;
1.30 millert 1417: if (m < n)
1.23 millert 1418: FATAL("illegal repetition expression: class %.20s",
1419: lastre);
1.30 millert 1420: if (n == 0 && m == 1) {
1.23 millert 1421: return QUEST;
1422: }
1423: } else { /* {n,} */
1.30 millert 1424: if (n == 0)
1425: return STAR;
1426: else if (n == 1)
1427: return PLUS;
1.23 millert 1428: }
1429: } else {
1430: if (digitfound) { /* {n} same as {n,n} */
1431: n = num;
1432: m = num;
1433: } else { /* {} */
1434: FATAL("illegal repetition expression: class %.20s",
1435: lastre);
1436: }
1437: }
1438: if (repeat(starttok, prestr-starttok, lastatom,
1439: startreptok - lastatom, n, m) > 0) {
1.30 millert 1440: if (n == 0 && m == 0) {
1.31 millert 1441: return ZERO;
1.23 millert 1442: }
1443: /* must rescan input for next token */
1444: goto rescan;
1445: }
1446: /* Failed to replace: eat up {...} characters
1447: and treat like just PLUS */
1448: return PLUS;
1449: } else if (c == '\0') {
1450: FATAL("nonterminated character class %.20s",
1451: lastre);
1452: } else if (isdigit(c)) {
1453: num = 10 * num + c - '0';
1.29 millert 1454: digitfound = true;
1.23 millert 1455: } else if (c == ',') {
1456: if (commafound)
1457: FATAL("illegal repetition expression: class %.20s",
1458: lastre);
1459: /* looking for {n,} or {n,m} */
1.29 millert 1460: commafound = true;
1.23 millert 1461: n = num;
1.29 millert 1462: digitfound = false; /* reset */
1.23 millert 1463: num = 0;
1464: } else {
1465: FATAL("illegal repetition expression: class %.20s",
1466: lastre);
1467: }
1468: }
1469: break;
1.1 tholo 1470: }
1471: }
1472:
1473: int cgoto(fa *f, int s, int c)
1474: {
1.27 millert 1475: int *p, *q;
1.1 tholo 1476: int i, j, k;
1477:
1.38 millert 1478: /* assert(c == HAT || c < NCHARS); BUG: seg fault if disable test */
1.1 tholo 1479: while (f->accept >= maxsetvec) { /* guessing here! */
1.27 millert 1480: resizesetvec(__func__);
1.1 tholo 1481: }
1482: for (i = 0; i <= f->accept; i++)
1483: setvec[i] = 0;
1484: setcnt = 0;
1.27 millert 1485: resize_state(f, s);
1.1 tholo 1486: /* compute positions of gototab[s,c] into setvec */
1487: p = f->posns[s];
1488: for (i = 1; i <= *p; i++) {
1489: if ((k = f->re[p[i]].ltype) != FINAL) {
1.7 millert 1490: if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1.1 tholo 1491: || (k == DOT && c != 0 && c != HAT)
1492: || (k == ALL && c != 0)
1.15 millert 1493: || (k == EMPTYRE && c != 0)
1.38 millert 1494: || (k == CCL && member(c, (int *) f->re[p[i]].lval.rp))
1495: || (k == NCCL && !member(c, (int *) f->re[p[i]].lval.rp) && c != 0 && c != HAT)) {
1.1 tholo 1496: q = f->re[p[i]].lfollow;
1497: for (j = 1; j <= *q; j++) {
1498: if (q[j] >= maxsetvec) {
1.27 millert 1499: resizesetvec(__func__);
1.1 tholo 1500: }
1501: if (setvec[q[j]] == 0) {
1502: setcnt++;
1503: setvec[q[j]] = 1;
1504: }
1505: }
1506: }
1507: }
1508: }
1509: /* determine if setvec is a previous state */
1510: tmpset[0] = setcnt;
1511: j = 1;
1512: for (i = f->accept; i >= 0; i--)
1513: if (setvec[i]) {
1514: tmpset[j++] = i;
1515: }
1.27 millert 1516: resize_state(f, f->curstat > s ? f->curstat : s);
1.1 tholo 1517: /* tmpset == previous state? */
1518: for (i = 1; i <= f->curstat; i++) {
1519: p = f->posns[i];
1520: if ((k = tmpset[0]) != p[0])
1521: goto different;
1522: for (j = 1; j <= k; j++)
1523: if (tmpset[j] != p[j])
1524: goto different;
1525: /* setvec is state i */
1.30 millert 1526: if (c != HAT)
1.38 millert 1527: set_gototab(f, s, c, i);
1.1 tholo 1528: return i;
1529: different:;
1530: }
1531:
1532: /* add tmpset to current set of states */
1.27 millert 1533: ++(f->curstat);
1534: resize_state(f, f->curstat);
1.49 millert 1535: clear_gototab(f, f->curstat);
1.1 tholo 1536: xfree(f->posns[f->curstat]);
1.27 millert 1537: p = intalloc(setcnt + 1, __func__);
1.1 tholo 1538:
1539: f->posns[f->curstat] = p;
1.30 millert 1540: if (c != HAT)
1.38 millert 1541: set_gototab(f, s, c, f->curstat);
1.1 tholo 1542: for (i = 0; i <= setcnt; i++)
1543: p[i] = tmpset[i];
1544: if (setvec[f->accept])
1545: f->out[f->curstat] = 1;
1546: else
1547: f->out[f->curstat] = 0;
1548: return f->curstat;
1549: }
1550:
1551:
1552: void freefa(fa *f) /* free a finite automaton */
1553: {
1554: int i;
1555:
1556: if (f == NULL)
1557: return;
1.27 millert 1558: for (i = 0; i < f->state_count; i++)
1.49 millert 1559: xfree(f->gototab[i].entries);
1560: xfree(f->gototab);
1.1 tholo 1561: for (i = 0; i <= f->curstat; i++)
1562: xfree(f->posns[i]);
1563: for (i = 0; i <= f->accept; i++) {
1564: xfree(f->re[i].lfollow);
1565: if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1.30 millert 1566: xfree(f->re[i].lval.np);
1.1 tholo 1567: }
1568: xfree(f->restr);
1.27 millert 1569: xfree(f->out);
1570: xfree(f->posns);
1571: xfree(f->gototab);
1.1 tholo 1572: xfree(f);
1573: }