Annotation of src/usr.bin/awk/b.c, Revision 1.36
1.36 ! millert 1: /* $OpenBSD: b.c,v 1.35 2020/12/09 20:00:11 millert Exp $ */
1.1 tholo 2: /****************************************************************
1.5 kstailey 3: Copyright (C) Lucent Technologies 1997
1.1 tholo 4: All Rights Reserved
5:
6: Permission to use, copy, modify, and distribute this software and
7: its documentation for any purpose and without fee is hereby
8: granted, provided that the above copyright notice appear in all
9: copies and that both that the copyright notice and this
10: permission notice and warranty disclaimer appear in supporting
1.5 kstailey 11: documentation, and that the name Lucent Technologies or any of
12: its entities not be used in advertising or publicity pertaining
13: to distribution of the software without specific, written prior
14: permission.
15:
16: LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17: INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18: IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19: SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20: WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21: IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22: ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23: THIS SOFTWARE.
1.1 tholo 24: ****************************************************************/
25:
1.15 millert 26: /* lasciate ogne speranza, voi ch'intrate. */
1.1 tholo 27:
28: #define DEBUG
29:
30: #include <ctype.h>
1.23 millert 31: #include <limits.h>
1.1 tholo 32: #include <stdio.h>
33: #include <string.h>
34: #include <stdlib.h>
35: #include "awk.h"
1.34 millert 36: #include "awkgram.tab.h"
1.1 tholo 37:
38: #define MAXLIN 22
39:
1.7 millert 40: #define type(v) (v)->nobj /* badly overloaded here */
41: #define info(v) (v)->ntype /* badly overloaded here */
1.1 tholo 42: #define left(v) (v)->narg[0]
43: #define right(v) (v)->narg[1]
44: #define parent(v) (v)->nnext
45:
46: #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
1.15 millert 47: #define ELEAF case EMPTYRE: /* empty string in regexp */
1.1 tholo 48: #define UNARY case STAR: case PLUS: case QUEST:
49:
50: /* encoding in tree Nodes:
1.15 millert 51: leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
1.1 tholo 52: left is index, right contains value or pointer to value
53: unary (STAR, PLUS, QUEST): left is child, right is null
54: binary (CAT, OR): left and right are children
55: parent contains pointer to parent
56: */
57:
58:
59: int *setvec;
60: int *tmpset;
61: int maxsetvec = 0;
62:
63: int rtok; /* next token in current re */
1.4 millert 64: int rlxval;
1.28 millert 65: static const uschar *rlxstr;
66: static const uschar *prestr; /* current position in current re */
67: static const uschar *lastre; /* origin of last re */
68: static const uschar *lastatom; /* origin of last Atom */
69: static const uschar *starttok;
70: static const uschar *basestr; /* starts with original, replaced during
1.23 millert 71: repetition processing */
1.28 millert 72: static const uschar *firstbasestr;
1.1 tholo 73:
1.4 millert 74: static int setcnt;
75: static int poscnt;
1.1 tholo 76:
1.28 millert 77: const char *patbeg;
1.1 tholo 78: int patlen;
79:
1.27 millert 80: #define NFA 128 /* cache this many dynamic fa's */
1.1 tholo 81: fa *fatab[NFA];
82: int nfatab = 0; /* entries in fatab */
83:
1.27 millert 84: static int *
85: intalloc(size_t n, const char *f)
86: {
1.35 millert 87: int *p = (int *) calloc(n, sizeof(int));
1.27 millert 88: if (p == NULL)
89: overflo(f);
90: return p;
91: }
92:
93: static void
94: allocsetvec(const char *f)
95: {
96: maxsetvec = MAXLIN;
1.35 millert 97: setvec = (int *) reallocarray(setvec, maxsetvec, sizeof(*setvec));
98: tmpset = (int *) reallocarray(tmpset, maxsetvec, sizeof(*tmpset));
1.27 millert 99: if (setvec == NULL || tmpset == NULL)
100: overflo(f);
101: }
102:
103: static void
104: resizesetvec(const char *f)
105: {
1.35 millert 106: setvec = (int *) reallocarray(setvec, maxsetvec, 4 * sizeof(*setvec));
107: tmpset = (int *) reallocarray(tmpset, maxsetvec, 4 * sizeof(*tmpset));
1.27 millert 108: if (setvec == NULL || tmpset == NULL)
109: overflo(f);
110: maxsetvec *= 4;
111: }
112:
113: static void
114: resize_state(fa *f, int state)
115: {
1.35 millert 116: unsigned int **p;
117: uschar *p2;
118: int **p3;
1.27 millert 119: int i, new_count;
120:
121: if (++state < f->state_count)
122: return;
123:
124: new_count = state + 10; /* needs to be tuned */
125:
1.35 millert 126: p = (unsigned int **) reallocarray(f->gototab, new_count, sizeof(f->gototab[0]));
1.27 millert 127: if (p == NULL)
128: goto out;
129: f->gototab = p;
130:
1.35 millert 131: p2 = (uschar *) reallocarray(f->out, new_count, sizeof(f->out[0]));
132: if (p2 == NULL)
1.27 millert 133: goto out;
1.35 millert 134: f->out = p2;
1.27 millert 135:
1.35 millert 136: p3 = (int **) reallocarray(f->posns, new_count, sizeof(f->posns[0]));
137: if (p3 == NULL)
1.27 millert 138: goto out;
1.35 millert 139: f->posns = p3;
1.27 millert 140:
141: for (i = f->state_count; i < new_count; ++i) {
1.35 millert 142: f->gototab[i] = (unsigned int *) calloc(NCHARS, sizeof(**f->gototab));
1.27 millert 143: if (f->gototab[i] == NULL)
144: goto out;
145: f->out[i] = 0;
146: f->posns[i] = NULL;
147: }
148: f->state_count = new_count;
149: return;
150: out:
151: overflo(__func__);
152: }
153:
1.29 millert 154: fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */
1.1 tholo 155: {
156: int i, use, nuse;
157: fa *pfa;
1.6 millert 158: static int now = 1;
1.1 tholo 159:
1.25 millert 160: if (setvec == NULL) { /* first time through any RE */
1.27 millert 161: allocsetvec(__func__);
1.1 tholo 162: }
163:
1.29 millert 164: if (compile_time != RUNNING) /* a constant for sure */
1.1 tholo 165: return mkdfa(s, anchor);
166: for (i = 0; i < nfatab; i++) /* is it there already? */
167: if (fatab[i]->anchor == anchor
1.11 millert 168: && strcmp((const char *) fatab[i]->restr, s) == 0) {
1.6 millert 169: fatab[i]->use = now++;
1.1 tholo 170: return fatab[i];
1.10 millert 171: }
1.1 tholo 172: pfa = mkdfa(s, anchor);
173: if (nfatab < NFA) { /* room for another */
174: fatab[nfatab] = pfa;
1.6 millert 175: fatab[nfatab]->use = now++;
1.1 tholo 176: nfatab++;
177: return pfa;
178: }
179: use = fatab[0]->use; /* replace least-recently used */
180: nuse = 0;
181: for (i = 1; i < nfatab; i++)
182: if (fatab[i]->use < use) {
183: use = fatab[i]->use;
184: nuse = i;
185: }
186: freefa(fatab[nuse]);
187: fatab[nuse] = pfa;
1.6 millert 188: pfa->use = now++;
1.1 tholo 189: return pfa;
190: }
191:
1.29 millert 192: fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */
1.30 millert 193: /* anchor = true for anchored matches, else false */
1.1 tholo 194: {
195: Node *p, *p1;
196: fa *f;
197:
1.28 millert 198: firstbasestr = (const uschar *) s;
1.23 millert 199: basestr = firstbasestr;
1.1 tholo 200: p = reparse(s);
201: p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
202: /* put ALL STAR in front of reg. exp. */
203: p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
204: /* put FINAL after reg. exp. */
205:
206: poscnt = 0;
207: penter(p1); /* enter parent pointers and leaf indices */
1.35 millert 208: if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
1.27 millert 209: overflo(__func__);
1.1 tholo 210: f->accept = poscnt-1; /* penter has computed number of positions in re */
211: cfoll(f, p1); /* set up follow sets */
212: freetr(p1);
1.27 millert 213: resize_state(f, 1);
214: f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
215: f->posns[1] = intalloc(1, __func__);
1.1 tholo 216: *f->posns[1] = 0;
217: f->initstat = makeinit(f, anchor);
218: f->anchor = anchor;
1.10 millert 219: f->restr = (uschar *) tostring(s);
1.23 millert 220: if (firstbasestr != basestr) {
221: if (basestr)
222: xfree(basestr);
223: }
1.1 tholo 224: return f;
225: }
226:
1.29 millert 227: int makeinit(fa *f, bool anchor)
1.1 tholo 228: {
229: int i, k;
230:
231: f->curstat = 2;
232: f->out[2] = 0;
233: k = *(f->re[0].lfollow);
1.25 millert 234: xfree(f->posns[2]);
1.27 millert 235: f->posns[2] = intalloc(k + 1, __func__);
1.30 millert 236: for (i = 0; i <= k; i++) {
1.1 tholo 237: (f->posns[2])[i] = (f->re[0].lfollow)[i];
238: }
239: if ((f->posns[2])[1] == f->accept)
240: f->out[2] = 1;
1.30 millert 241: for (i = 0; i < NCHARS; i++)
1.1 tholo 242: f->gototab[2][i] = 0;
243: f->curstat = cgoto(f, 2, HAT);
244: if (anchor) {
245: *f->posns[2] = k-1; /* leave out position 0 */
1.30 millert 246: for (i = 0; i < k; i++) {
1.1 tholo 247: (f->posns[0])[i] = (f->posns[2])[i];
248: }
249:
250: f->out[0] = f->out[2];
251: if (f->curstat != 2)
252: --(*f->posns[f->curstat]);
253: }
254: return f->curstat;
255: }
256:
257: void penter(Node *p) /* set up parent pointers and leaf indices */
258: {
259: switch (type(p)) {
1.15 millert 260: ELEAF
1.1 tholo 261: LEAF
1.7 millert 262: info(p) = poscnt;
1.1 tholo 263: poscnt++;
264: break;
265: UNARY
266: penter(left(p));
267: parent(left(p)) = p;
268: break;
269: case CAT:
270: case OR:
271: penter(left(p));
272: penter(right(p));
273: parent(left(p)) = p;
274: parent(right(p)) = p;
275: break;
1.31 millert 276: case ZERO:
277: break;
1.1 tholo 278: default: /* can't happen */
1.8 millert 279: FATAL("can't happen: unknown type %d in penter", type(p));
1.1 tholo 280: break;
281: }
282: }
283:
284: void freetr(Node *p) /* free parse tree */
285: {
286: switch (type(p)) {
1.15 millert 287: ELEAF
1.1 tholo 288: LEAF
289: xfree(p);
290: break;
291: UNARY
1.31 millert 292: case ZERO:
1.1 tholo 293: freetr(left(p));
294: xfree(p);
295: break;
296: case CAT:
297: case OR:
298: freetr(left(p));
299: freetr(right(p));
300: xfree(p);
301: break;
302: default: /* can't happen */
1.8 millert 303: FATAL("can't happen: unknown type %d in freetr", type(p));
1.1 tholo 304: break;
305: }
306: }
307:
308: /* in the parsing of regular expressions, metacharacters like . have */
309: /* to be seen literally; \056 is not a metacharacter. */
310:
1.28 millert 311: int hexstr(const uschar **pp) /* find and eval hex string at pp, return new p */
1.2 millert 312: { /* only pick up one 8-bit byte (2 chars) */
1.28 millert 313: const uschar *p;
1.1 tholo 314: int n = 0;
1.2 millert 315: int i;
1.1 tholo 316:
1.28 millert 317: for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {
1.1 tholo 318: if (isdigit(*p))
319: n = 16 * n + *p - '0';
320: else if (*p >= 'a' && *p <= 'f')
321: n = 16 * n + *p - 'a' + 10;
322: else if (*p >= 'A' && *p <= 'F')
323: n = 16 * n + *p - 'A' + 10;
324: }
1.28 millert 325: *pp = p;
1.1 tholo 326: return n;
327: }
328:
1.2 millert 329: #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
1.1 tholo 330:
1.28 millert 331: int quoted(const uschar **pp) /* pick up next thing after a \\ */
1.1 tholo 332: /* and increment *pp */
333: {
1.28 millert 334: const uschar *p = *pp;
1.1 tholo 335: int c;
336:
337: if ((c = *p++) == 't')
338: c = '\t';
339: else if (c == 'n')
340: c = '\n';
341: else if (c == 'f')
342: c = '\f';
343: else if (c == 'r')
344: c = '\r';
345: else if (c == 'b')
346: c = '\b';
1.25 millert 347: else if (c == 'v')
348: c = '\v';
1.20 millert 349: else if (c == 'a')
1.25 millert 350: c = '\a';
1.1 tholo 351: else if (c == '\\')
352: c = '\\';
353: else if (c == 'x') { /* hexadecimal goo follows */
354: c = hexstr(&p); /* this adds a null if number is invalid */
355: } else if (isoctdigit(c)) { /* \d \dd \ddd */
356: int n = c - '0';
357: if (isoctdigit(*p)) {
358: n = 8 * n + *p++ - '0';
359: if (isoctdigit(*p))
360: n = 8 * n + *p++ - '0';
361: }
362: c = n;
363: } /* else */
364: /* c = c; */
365: *pp = p;
366: return c;
367: }
368:
1.11 millert 369: char *cclenter(const char *argp) /* add a character class */
1.1 tholo 370: {
371: int i, c, c2;
1.28 millert 372: const uschar *op, *p = (const uschar *) argp;
373: uschar *bp;
1.25 millert 374: static uschar *buf = NULL;
1.5 kstailey 375: static int bufsz = 100;
1.1 tholo 376:
377: op = p;
1.35 millert 378: if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1.8 millert 379: FATAL("out of space for character class [%.10s...] 1", p);
1.5 kstailey 380: bp = buf;
381: for (i = 0; (c = *p++) != 0; ) {
1.1 tholo 382: if (c == '\\') {
1.17 millert 383: c = quoted(&p);
1.5 kstailey 384: } else if (c == '-' && i > 0 && bp[-1] != 0) {
1.1 tholo 385: if (*p != 0) {
1.5 kstailey 386: c = bp[-1];
1.1 tholo 387: c2 = *p++;
388: if (c2 == '\\')
1.17 millert 389: c2 = quoted(&p);
1.1 tholo 390: if (c > c2) { /* empty; ignore */
1.5 kstailey 391: bp--;
1.1 tholo 392: i--;
393: continue;
394: }
395: while (c < c2) {
1.15 millert 396: if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
1.8 millert 397: FATAL("out of space for character class [%.10s...] 2", p);
1.5 kstailey 398: *bp++ = ++c;
1.1 tholo 399: i++;
400: }
401: continue;
402: }
403: }
1.15 millert 404: if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
1.8 millert 405: FATAL("out of space for character class [%.10s...] 3", p);
1.5 kstailey 406: *bp++ = c;
1.1 tholo 407: i++;
408: }
1.5 kstailey 409: *bp = 0;
1.32 millert 410: DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf);
1.1 tholo 411: xfree(op);
1.10 millert 412: return (char *) tostring((char *) buf);
1.1 tholo 413: }
414:
1.11 millert 415: void overflo(const char *s)
1.1 tholo 416: {
1.27 millert 417: FATAL("regular expression too big: out of space in %.30s...", s);
1.1 tholo 418: }
419:
420: void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
421: {
422: int i;
1.4 millert 423: int *p;
1.1 tholo 424:
425: switch (type(v)) {
1.15 millert 426: ELEAF
1.1 tholo 427: LEAF
1.7 millert 428: f->re[info(v)].ltype = type(v);
429: f->re[info(v)].lval.np = right(v);
1.1 tholo 430: while (f->accept >= maxsetvec) { /* guessing here! */
1.27 millert 431: resizesetvec(__func__);
1.1 tholo 432: }
433: for (i = 0; i <= f->accept; i++)
434: setvec[i] = 0;
435: setcnt = 0;
436: follow(v); /* computes setvec and setcnt */
1.27 millert 437: p = intalloc(setcnt + 1, __func__);
1.7 millert 438: f->re[info(v)].lfollow = p;
1.1 tholo 439: *p = setcnt;
440: for (i = f->accept; i >= 0; i--)
441: if (setvec[i] == 1)
442: *++p = i;
443: break;
444: UNARY
445: cfoll(f,left(v));
446: break;
447: case CAT:
448: case OR:
449: cfoll(f,left(v));
450: cfoll(f,right(v));
451: break;
1.31 millert 452: case ZERO:
453: break;
1.1 tholo 454: default: /* can't happen */
1.8 millert 455: FATAL("can't happen: unknown type %d in cfoll", type(v));
1.1 tholo 456: }
457: }
458:
459: int first(Node *p) /* collects initially active leaves of p into setvec */
1.15 millert 460: /* returns 0 if p matches empty string */
1.1 tholo 461: {
1.4 millert 462: int b, lp;
1.1 tholo 463:
464: switch (type(p)) {
1.15 millert 465: ELEAF
1.1 tholo 466: LEAF
1.7 millert 467: lp = info(p); /* look for high-water mark of subscripts */
1.1 tholo 468: while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
1.27 millert 469: resizesetvec(__func__);
1.1 tholo 470: }
1.15 millert 471: if (type(p) == EMPTYRE) {
472: setvec[lp] = 0;
473: return(0);
474: }
1.1 tholo 475: if (setvec[lp] != 1) {
476: setvec[lp] = 1;
477: setcnt++;
478: }
479: if (type(p) == CCL && (*(char *) right(p)) == '\0')
480: return(0); /* empty CCL */
1.30 millert 481: return(1);
1.1 tholo 482: case PLUS:
1.30 millert 483: if (first(left(p)) == 0)
484: return(0);
1.1 tholo 485: return(1);
486: case STAR:
487: case QUEST:
488: first(left(p));
489: return(0);
490: case CAT:
491: if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
492: return(1);
493: case OR:
494: b = first(right(p));
495: if (first(left(p)) == 0 || b == 0) return(0);
496: return(1);
1.31 millert 497: case ZERO:
498: return 0;
1.1 tholo 499: }
1.8 millert 500: FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
1.1 tholo 501: return(-1);
502: }
503:
504: void follow(Node *v) /* collects leaves that can follow v into setvec */
505: {
506: Node *p;
507:
508: if (type(v) == FINAL)
509: return;
510: p = parent(v);
511: switch (type(p)) {
512: case STAR:
513: case PLUS:
514: first(v);
515: follow(p);
516: return;
517:
518: case OR:
519: case QUEST:
520: follow(p);
521: return;
522:
523: case CAT:
524: if (v == left(p)) { /* v is left child of p */
525: if (first(right(p)) == 0) {
526: follow(p);
527: return;
528: }
529: } else /* v is right child */
530: follow(p);
531: return;
532: }
533: }
534:
1.11 millert 535: int member(int c, const char *sarg) /* is c in s? */
1.1 tholo 536: {
1.28 millert 537: const uschar *s = (const uschar *) sarg;
1.10 millert 538:
1.1 tholo 539: while (*s)
540: if (c == *s++)
541: return(1);
542: return(0);
543: }
544:
1.11 millert 545: int match(fa *f, const char *p0) /* shortest match ? */
1.1 tholo 546: {
547: int s, ns;
1.28 millert 548: const uschar *p = (const uschar *) p0;
1.1 tholo 549:
1.27 millert 550: s = f->initstat;
551: assert (s < f->state_count);
552:
1.1 tholo 553: if (f->out[s])
554: return(1);
555: do {
1.15 millert 556: /* assert(*p < NCHARS); */
1.1 tholo 557: if ((ns = f->gototab[s][*p]) != 0)
558: s = ns;
559: else
560: s = cgoto(f, s, *p);
561: if (f->out[s])
562: return(1);
563: } while (*p++ != 0);
564: return(0);
565: }
566:
1.11 millert 567: int pmatch(fa *f, const char *p0) /* longest match, for sub */
1.1 tholo 568: {
569: int s, ns;
1.28 millert 570: const uschar *p = (const uschar *) p0;
571: const uschar *q;
1.1 tholo 572:
1.27 millert 573: s = f->initstat;
574: assert(s < f->state_count);
575:
1.28 millert 576: patbeg = (const char *)p;
1.1 tholo 577: patlen = -1;
578: do {
579: q = p;
580: do {
581: if (f->out[s]) /* final state */
582: patlen = q-p;
1.15 millert 583: /* assert(*q < NCHARS); */
1.1 tholo 584: if ((ns = f->gototab[s][*q]) != 0)
585: s = ns;
586: else
587: s = cgoto(f, s, *q);
1.27 millert 588:
589: assert(s < f->state_count);
590:
1.9 deraadt 591: if (s == 1) { /* no transition */
1.1 tholo 592: if (patlen >= 0) {
1.28 millert 593: patbeg = (const char *) p;
1.1 tholo 594: return(1);
595: }
596: else
597: goto nextin; /* no match */
1.9 deraadt 598: }
1.1 tholo 599: } while (*q++ != 0);
600: if (f->out[s])
601: patlen = q-p-1; /* don't count $ */
602: if (patlen >= 0) {
1.28 millert 603: patbeg = (const char *) p;
1.1 tholo 604: return(1);
605: }
606: nextin:
607: s = 2;
1.27 millert 608: } while (*p++);
1.1 tholo 609: return (0);
610: }
611:
1.11 millert 612: int nematch(fa *f, const char *p0) /* non-empty match, for sub */
1.1 tholo 613: {
614: int s, ns;
1.28 millert 615: const uschar *p = (const uschar *) p0;
616: const uschar *q;
1.1 tholo 617:
1.27 millert 618: s = f->initstat;
619: assert(s < f->state_count);
620:
1.28 millert 621: patbeg = (const char *)p;
1.1 tholo 622: patlen = -1;
623: while (*p) {
624: q = p;
625: do {
626: if (f->out[s]) /* final state */
627: patlen = q-p;
1.15 millert 628: /* assert(*q < NCHARS); */
1.1 tholo 629: if ((ns = f->gototab[s][*q]) != 0)
630: s = ns;
631: else
632: s = cgoto(f, s, *q);
1.9 deraadt 633: if (s == 1) { /* no transition */
1.1 tholo 634: if (patlen > 0) {
1.28 millert 635: patbeg = (const char *) p;
1.1 tholo 636: return(1);
637: } else
638: goto nnextin; /* no nonempty match */
1.9 deraadt 639: }
1.1 tholo 640: } while (*q++ != 0);
641: if (f->out[s])
642: patlen = q-p-1; /* don't count $ */
643: if (patlen > 0 ) {
1.28 millert 644: patbeg = (const char *) p;
1.1 tholo 645: return(1);
646: }
647: nnextin:
648: s = 2;
649: p++;
650: }
651: return (0);
652: }
653:
1.26 millert 654:
655: /*
656: * NAME
657: * fnematch
658: *
659: * DESCRIPTION
660: * A stream-fed version of nematch which transfers characters to a
661: * null-terminated buffer. All characters up to and including the last
662: * character of the matching text or EOF are placed in the buffer. If
663: * a match is found, patbeg and patlen are set appropriately.
664: *
665: * RETURN VALUES
1.29 millert 666: * false No match found.
667: * true Match found.
1.26 millert 668: */
669:
1.29 millert 670: bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
1.26 millert 671: {
672: char *buf = *pbuf;
673: int bufsize = *pbufsize;
674: int c, i, j, k, ns, s;
675:
676: s = pfa->initstat;
677: patlen = 0;
678:
679: /*
680: * All indices relative to buf.
681: * i <= j <= k <= bufsize
682: *
683: * i: origin of active substring
684: * j: current character
685: * k: destination of next getc()
686: */
687: i = -1, k = 0;
688: do {
689: j = i++;
690: do {
691: if (++j == k) {
692: if (k == bufsize)
693: if (!adjbuf(&buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
694: FATAL("stream '%.30s...' too long", buf);
695: buf[k++] = (c = getc(f)) != EOF ? c : 0;
696: }
1.33 millert 697: c = (uschar)buf[j];
1.26 millert 698: /* assert(c < NCHARS); */
699:
700: if ((ns = pfa->gototab[s][c]) != 0)
701: s = ns;
702: else
703: s = cgoto(pfa, s, c);
704:
705: if (pfa->out[s]) { /* final state */
706: patlen = j - i + 1;
707: if (c == 0) /* don't count $ */
708: patlen--;
709: }
710: } while (buf[j] && s != 1);
711: s = 2;
712: } while (buf[i] && !patlen);
713:
714: /* adjbuf() may have relocated a resized buffer. Inform the world. */
715: *pbuf = buf;
716: *pbufsize = bufsize;
717:
718: if (patlen) {
719: patbeg = buf + i;
720: /*
721: * Under no circumstances is the last character fed to
722: * the automaton part of the match. It is EOF's nullbyte,
723: * or it sent the automaton into a state with no further
724: * transitions available (s==1), or both. Room for a
725: * terminating nullbyte is guaranteed.
726: *
727: * ungetc any chars after the end of matching text
728: * (except for EOF's nullbyte, if present) and null
729: * terminate the buffer.
730: */
731: do
732: if (buf[--k] && ungetc(buf[k], f) == EOF)
733: FATAL("unable to ungetc '%c'", buf[k]);
734: while (k > i + patlen);
1.30 millert 735: buf[k] = '\0';
1.29 millert 736: return true;
1.26 millert 737: }
738: else
1.29 millert 739: return false;
1.26 millert 740: }
741:
1.11 millert 742: Node *reparse(const char *p) /* parses regular expression pointed to by p */
1.1 tholo 743: { /* uses relex() to scan regular expression */
744: Node *np;
745:
1.32 millert 746: DPRINTF("reparse <%s>\n", p);
1.28 millert 747: lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */
1.1 tholo 748: rtok = relex();
1.11 millert 749: /* GNU compatibility: an empty regexp matches anything */
1.15 millert 750: if (rtok == '\0') {
1.11 millert 751: /* FATAL("empty regular expression"); previous */
1.15 millert 752: return(op2(EMPTYRE, NIL, NIL));
753: }
1.1 tholo 754: np = regexp();
755: if (rtok != '\0')
1.8 millert 756: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 757: return(np);
758: }
759:
760: Node *regexp(void) /* top-level parse of reg expr */
761: {
762: return (alt(concat(primary())));
763: }
764:
765: Node *primary(void)
766: {
767: Node *np;
1.23 millert 768: int savelastatom;
1.1 tholo 769:
770: switch (rtok) {
771: case CHAR:
1.23 millert 772: lastatom = starttok;
1.7 millert 773: np = op2(CHAR, NIL, itonp(rlxval));
1.1 tholo 774: rtok = relex();
775: return (unary(np));
776: case ALL:
777: rtok = relex();
778: return (unary(op2(ALL, NIL, NIL)));
1.15 millert 779: case EMPTYRE:
780: rtok = relex();
1.23 millert 781: return (unary(op2(EMPTYRE, NIL, NIL)));
1.1 tholo 782: case DOT:
1.23 millert 783: lastatom = starttok;
1.1 tholo 784: rtok = relex();
785: return (unary(op2(DOT, NIL, NIL)));
786: case CCL:
1.28 millert 787: np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
1.23 millert 788: lastatom = starttok;
1.1 tholo 789: rtok = relex();
790: return (unary(np));
791: case NCCL:
1.28 millert 792: np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
1.23 millert 793: lastatom = starttok;
1.1 tholo 794: rtok = relex();
795: return (unary(np));
796: case '^':
797: rtok = relex();
1.7 millert 798: return (unary(op2(CHAR, NIL, itonp(HAT))));
1.1 tholo 799: case '$':
800: rtok = relex();
801: return (unary(op2(CHAR, NIL, NIL)));
802: case '(':
1.23 millert 803: lastatom = starttok;
804: savelastatom = starttok - basestr; /* Retain over recursion */
1.1 tholo 805: rtok = relex();
806: if (rtok == ')') { /* special pleading for () */
807: rtok = relex();
808: return unary(op2(CCL, NIL, (Node *) tostring("")));
809: }
810: np = regexp();
811: if (rtok == ')') {
1.23 millert 812: lastatom = basestr + savelastatom; /* Restore */
1.1 tholo 813: rtok = relex();
814: return (unary(np));
815: }
816: else
1.8 millert 817: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 818: default:
1.8 millert 819: FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1.1 tholo 820: }
821: return 0; /*NOTREACHED*/
822: }
823:
824: Node *concat(Node *np)
825: {
826: switch (rtok) {
1.23 millert 827: case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1.1 tholo 828: return (concat(op2(CAT, np, primary())));
1.23 millert 829: case EMPTYRE:
830: rtok = relex();
831: return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
832: primary())));
1.1 tholo 833: }
834: return (np);
835: }
836:
837: Node *alt(Node *np)
838: {
839: if (rtok == OR) {
840: rtok = relex();
841: return (alt(op2(OR, np, concat(primary()))));
842: }
843: return (np);
844: }
845:
846: Node *unary(Node *np)
847: {
848: switch (rtok) {
849: case STAR:
850: rtok = relex();
851: return (unary(op2(STAR, np, NIL)));
852: case PLUS:
853: rtok = relex();
854: return (unary(op2(PLUS, np, NIL)));
855: case QUEST:
856: rtok = relex();
857: return (unary(op2(QUEST, np, NIL)));
1.31 millert 858: case ZERO:
859: rtok = relex();
860: return (unary(op2(ZERO, np, NIL)));
1.1 tholo 861: default:
862: return (np);
863: }
864: }
865:
1.11 millert 866: /*
867: * Character class definitions conformant to the POSIX locale as
868: * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
869: * and operating character sets are both ASCII (ISO646) or supersets
870: * thereof.
871: *
872: * Note that to avoid overflowing the temporary buffer used in
873: * relex(), the expanded character class (prior to range expansion)
874: * must be less than twice the size of their full name.
875: */
1.12 millert 876:
877: /* Because isblank doesn't show up in any of the header files on any
878: * system i use, it's defined here. if some other locale has a richer
879: * definition of "blank", define HAS_ISBLANK and provide your own
880: * version.
881: * the parentheses here are an attempt to find a path through the maze
882: * of macro definition and/or function and/or version provided. thanks
883: * to nelson beebe for the suggestion; let's see if it works everywhere.
884: */
885:
886: #ifndef HAS_ISBLANK
887:
1.17 millert 888: int (xisblank)(int c)
1.12 millert 889: {
890: return c==' ' || c=='\t';
891: }
892:
893: #endif
894:
1.31 millert 895: static const struct charclass {
1.11 millert 896: const char *cc_name;
897: int cc_namelen;
1.12 millert 898: int (*cc_func)(int);
1.11 millert 899: } charclasses[] = {
1.12 millert 900: { "alnum", 5, isalnum },
901: { "alpha", 5, isalpha },
1.17 millert 902: #ifndef HAS_ISBLANK
1.21 millert 903: { "blank", 5, xisblank },
1.17 millert 904: #else
1.12 millert 905: { "blank", 5, isblank },
1.17 millert 906: #endif
1.12 millert 907: { "cntrl", 5, iscntrl },
908: { "digit", 5, isdigit },
909: { "graph", 5, isgraph },
910: { "lower", 5, islower },
911: { "print", 5, isprint },
912: { "punct", 5, ispunct },
913: { "space", 5, isspace },
914: { "upper", 5, isupper },
915: { "xdigit", 6, isxdigit },
1.11 millert 916: { NULL, 0, NULL },
917: };
918:
1.23 millert 919: #define REPEAT_SIMPLE 0
920: #define REPEAT_PLUS_APPENDED 1
921: #define REPEAT_WITH_Q 2
922: #define REPEAT_ZERO 3
923:
924: static int
925: replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
926: int atomlen, int firstnum, int secondnum, int special_case)
927: {
928: int i, j;
1.26 millert 929: uschar *buf = NULL;
1.23 millert 930: int ret = 1;
1.31 millert 931: int init_q = (firstnum == 0); /* first added char will be ? */
1.23 millert 932: int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */
933: int prefix_length = reptok - basestr; /* prefix includes first rep */
1.28 millert 934: int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */
1.23 millert 935: int size = prefix_length + suffix_length;
936:
937: if (firstnum > 1) { /* add room for reps 2 through firstnum */
938: size += atomlen*(firstnum-1);
939: }
940:
941: /* Adjust size of buffer for special cases */
942: if (special_case == REPEAT_PLUS_APPENDED) {
943: size++; /* for the final + */
944: } else if (special_case == REPEAT_WITH_Q) {
1.36 ! millert 945: size += init_q + (atomlen+1)* (n_q_reps-init_q);
1.23 millert 946: } else if (special_case == REPEAT_ZERO) {
947: size += 2; /* just a null ERE: () */
948: }
1.35 millert 949: if ((buf = (uschar *) malloc(size + 1)) == NULL)
1.23 millert 950: FATAL("out of space in reg expr %.10s..", lastre);
951: memcpy(buf, basestr, prefix_length); /* copy prefix */
952: j = prefix_length;
953: if (special_case == REPEAT_ZERO) {
954: j -= atomlen;
955: buf[j++] = '(';
956: buf[j++] = ')';
957: }
1.30 millert 958: for (i = 1; i < firstnum; i++) { /* copy x reps */
1.23 millert 959: memcpy(&buf[j], atom, atomlen);
960: j += atomlen;
961: }
962: if (special_case == REPEAT_PLUS_APPENDED) {
963: buf[j++] = '+';
964: } else if (special_case == REPEAT_WITH_Q) {
1.29 millert 965: if (init_q)
966: buf[j++] = '?';
1.30 millert 967: for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */
1.23 millert 968: memcpy(&buf[j], atom, atomlen);
969: j += atomlen;
970: buf[j++] = '?';
971: }
972: }
973: memcpy(&buf[j], reptok+reptoklen, suffix_length);
1.36 ! millert 974: j += suffix_length;
! 975: buf[j] = '\0';
1.23 millert 976: /* free old basestr */
977: if (firstbasestr != basestr) {
978: if (basestr)
979: xfree(basestr);
980: }
981: basestr = buf;
982: prestr = buf + prefix_length;
983: if (special_case == REPEAT_ZERO) {
984: prestr -= atomlen;
985: ret++;
986: }
987: return ret;
988: }
989:
990: static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
991: int atomlen, int firstnum, int secondnum)
992: {
993: /*
994: In general, the repetition specifier or "bound" is replaced here
995: by an equivalent ERE string, repeating the immediately previous atom
996: and appending ? and + as needed. Note that the first copy of the
997: atom is left in place, except in the special_case of a zero-repeat
998: (i.e., {0}).
999: */
1000: if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */
1001: if (firstnum < 2) {
1002: /* 0 or 1: should be handled before you get here */
1003: FATAL("internal error");
1004: } else {
1005: return replace_repeat(reptok, reptoklen, atom, atomlen,
1006: firstnum, secondnum, REPEAT_PLUS_APPENDED);
1007: }
1008: } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */
1009: if (firstnum == 0) { /* {0} or {0,0} */
1010: /* This case is unusual because the resulting
1011: replacement string might actually be SMALLER than
1012: the original ERE */
1013: return replace_repeat(reptok, reptoklen, atom, atomlen,
1014: firstnum, secondnum, REPEAT_ZERO);
1015: } else { /* (firstnum >= 1) */
1016: return replace_repeat(reptok, reptoklen, atom, atomlen,
1017: firstnum, secondnum, REPEAT_SIMPLE);
1018: }
1019: } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */
1020: /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
1021: return replace_repeat(reptok, reptoklen, atom, atomlen,
1022: firstnum, secondnum, REPEAT_WITH_Q);
1023: } else { /* Error - shouldn't be here (n>m) */
1024: FATAL("internal error");
1025: }
1026: return 0;
1027: }
1.11 millert 1028:
1.1 tholo 1029: int relex(void) /* lexical analyzer for reparse */
1030: {
1.5 kstailey 1031: int c, n;
1.1 tholo 1032: int cflag;
1.25 millert 1033: static uschar *buf = NULL;
1.5 kstailey 1034: static int bufsz = 100;
1.10 millert 1035: uschar *bp;
1.31 millert 1036: const struct charclass *cc;
1.12 millert 1037: int i;
1.29 millert 1038: int num, m;
1039: bool commafound, digitfound;
1.23 millert 1040: const uschar *startreptok;
1.24 millert 1041: static int parens = 0;
1.23 millert 1042:
1043: rescan:
1044: starttok = prestr;
1.1 tholo 1045:
1046: switch (c = *prestr++) {
1047: case '|': return OR;
1048: case '*': return STAR;
1049: case '+': return PLUS;
1050: case '?': return QUEST;
1051: case '.': return DOT;
1052: case '\0': prestr--; return '\0';
1053: case '^':
1054: case '$':
1.24 millert 1055: return c;
1.1 tholo 1056: case '(':
1.24 millert 1057: parens++;
1058: return c;
1.1 tholo 1059: case ')':
1.24 millert 1060: if (parens) {
1061: parens--;
1062: return c;
1063: }
1064: /* unmatched close parenthesis; per POSIX, treat as literal */
1065: rlxval = c;
1066: return CHAR;
1.1 tholo 1067: case '\\':
1.17 millert 1068: rlxval = quoted(&prestr);
1.1 tholo 1069: return CHAR;
1070: default:
1071: rlxval = c;
1072: return CHAR;
1.25 millert 1073: case '[':
1.35 millert 1074: if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1.8 millert 1075: FATAL("out of space in reg expr %.10s..", lastre);
1.5 kstailey 1076: bp = buf;
1.1 tholo 1077: if (*prestr == '^') {
1078: cflag = 1;
1079: prestr++;
1080: }
1081: else
1082: cflag = 0;
1.11 millert 1083: n = 2 * strlen((const char *) prestr)+1;
1.15 millert 1084: if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1.8 millert 1085: FATAL("out of space for reg expr %.10s...", lastre);
1.1 tholo 1086: for (; ; ) {
1087: if ((c = *prestr++) == '\\') {
1.5 kstailey 1088: *bp++ = '\\';
1.1 tholo 1089: if ((c = *prestr++) == '\0')
1.8 millert 1090: FATAL("nonterminated character class %.20s...", lastre);
1.5 kstailey 1091: *bp++ = c;
1.10 millert 1092: /* } else if (c == '\n') { */
1093: /* FATAL("newline in character class %.20s...", lastre); */
1.11 millert 1094: } else if (c == '[' && *prestr == ':') {
1095: /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1096: for (cc = charclasses; cc->cc_name; cc++)
1097: if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1098: break;
1099: if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1100: prestr[2 + cc->cc_namelen] == ']') {
1101: prestr += cc->cc_namelen + 3;
1.22 millert 1102: /*
1103: * BUG: We begin at 1, instead of 0, since we
1104: * would otherwise prematurely terminate the
1105: * string for classes like [[:cntrl:]]. This
1106: * means that we can't match the NUL character,
1107: * not without first adapting the entire
1108: * program to track each string's length.
1109: */
1.23 millert 1110: for (i = 1; i <= UCHAR_MAX; i++) {
1.15 millert 1111: if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
1.12 millert 1112: FATAL("out of space for reg expr %.10s...", lastre);
1113: if (cc->cc_func(i)) {
1.32 millert 1114: /* escape backslash */
1115: if (i == '\\') {
1116: *bp++ = '\\';
1117: n++;
1118: }
1119:
1.12 millert 1120: *bp++ = i;
1121: n++;
1122: }
1123: }
1.11 millert 1124: } else
1125: *bp++ = c;
1.23 millert 1126: } else if (c == '[' && *prestr == '.') {
1127: char collate_char;
1128: prestr++;
1129: collate_char = *prestr++;
1130: if (*prestr == '.' && prestr[1] == ']') {
1131: prestr += 2;
1132: /* Found it: map via locale TBD: for
1133: now, simply return this char. This
1134: is sufficient to pass conformance
1135: test awk.ex 156
1136: */
1137: if (*prestr == ']') {
1138: prestr++;
1139: rlxval = collate_char;
1140: return CHAR;
1141: }
1142: }
1143: } else if (c == '[' && *prestr == '=') {
1144: char equiv_char;
1145: prestr++;
1146: equiv_char = *prestr++;
1147: if (*prestr == '=' && prestr[1] == ']') {
1148: prestr += 2;
1149: /* Found it: map via locale TBD: for now
1150: simply return this char. This is
1151: sufficient to pass conformance test
1152: awk.ex 156
1153: */
1154: if (*prestr == ']') {
1155: prestr++;
1156: rlxval = equiv_char;
1157: return CHAR;
1158: }
1159: }
1.1 tholo 1160: } else if (c == '\0') {
1.8 millert 1161: FATAL("nonterminated character class %.20s", lastre);
1.5 kstailey 1162: } else if (bp == buf) { /* 1st char is special */
1163: *bp++ = c;
1.1 tholo 1164: } else if (c == ']') {
1.5 kstailey 1165: *bp++ = 0;
1.10 millert 1166: rlxstr = (uschar *) tostring((char *) buf);
1.1 tholo 1167: if (cflag == 0)
1168: return CCL;
1169: else
1170: return NCCL;
1171: } else
1.5 kstailey 1172: *bp++ = c;
1.1 tholo 1173: }
1.23 millert 1174: break;
1175: case '{':
1176: if (isdigit(*(prestr))) {
1177: num = 0; /* Process as a repetition */
1178: n = -1; m = -1;
1.29 millert 1179: commafound = false;
1180: digitfound = false;
1.23 millert 1181: startreptok = prestr-1;
1182: /* Remember start of previous atom here ? */
1183: } else { /* just a { char, not a repetition */
1184: rlxval = c;
1185: return CHAR;
1186: }
1187: for (; ; ) {
1188: if ((c = *prestr++) == '}') {
1189: if (commafound) {
1190: if (digitfound) { /* {n,m} */
1191: m = num;
1.30 millert 1192: if (m < n)
1.23 millert 1193: FATAL("illegal repetition expression: class %.20s",
1194: lastre);
1.30 millert 1195: if (n == 0 && m == 1) {
1.23 millert 1196: return QUEST;
1197: }
1198: } else { /* {n,} */
1.30 millert 1199: if (n == 0)
1200: return STAR;
1201: else if (n == 1)
1202: return PLUS;
1.23 millert 1203: }
1204: } else {
1205: if (digitfound) { /* {n} same as {n,n} */
1206: n = num;
1207: m = num;
1208: } else { /* {} */
1209: FATAL("illegal repetition expression: class %.20s",
1210: lastre);
1211: }
1212: }
1213: if (repeat(starttok, prestr-starttok, lastatom,
1214: startreptok - lastatom, n, m) > 0) {
1.30 millert 1215: if (n == 0 && m == 0) {
1.31 millert 1216: return ZERO;
1.23 millert 1217: }
1218: /* must rescan input for next token */
1219: goto rescan;
1220: }
1221: /* Failed to replace: eat up {...} characters
1222: and treat like just PLUS */
1223: return PLUS;
1224: } else if (c == '\0') {
1225: FATAL("nonterminated character class %.20s",
1226: lastre);
1227: } else if (isdigit(c)) {
1228: num = 10 * num + c - '0';
1.29 millert 1229: digitfound = true;
1.23 millert 1230: } else if (c == ',') {
1231: if (commafound)
1232: FATAL("illegal repetition expression: class %.20s",
1233: lastre);
1234: /* looking for {n,} or {n,m} */
1.29 millert 1235: commafound = true;
1.23 millert 1236: n = num;
1.29 millert 1237: digitfound = false; /* reset */
1.23 millert 1238: num = 0;
1239: } else {
1240: FATAL("illegal repetition expression: class %.20s",
1241: lastre);
1242: }
1243: }
1244: break;
1.1 tholo 1245: }
1246: }
1247:
1248: int cgoto(fa *f, int s, int c)
1249: {
1.27 millert 1250: int *p, *q;
1.1 tholo 1251: int i, j, k;
1252:
1.12 millert 1253: assert(c == HAT || c < NCHARS);
1.1 tholo 1254: while (f->accept >= maxsetvec) { /* guessing here! */
1.27 millert 1255: resizesetvec(__func__);
1.1 tholo 1256: }
1257: for (i = 0; i <= f->accept; i++)
1258: setvec[i] = 0;
1259: setcnt = 0;
1.27 millert 1260: resize_state(f, s);
1.1 tholo 1261: /* compute positions of gototab[s,c] into setvec */
1262: p = f->posns[s];
1263: for (i = 1; i <= *p; i++) {
1264: if ((k = f->re[p[i]].ltype) != FINAL) {
1.7 millert 1265: if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1.1 tholo 1266: || (k == DOT && c != 0 && c != HAT)
1267: || (k == ALL && c != 0)
1.15 millert 1268: || (k == EMPTYRE && c != 0)
1.10 millert 1269: || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
1270: || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
1.1 tholo 1271: q = f->re[p[i]].lfollow;
1272: for (j = 1; j <= *q; j++) {
1273: if (q[j] >= maxsetvec) {
1.27 millert 1274: resizesetvec(__func__);
1.1 tholo 1275: }
1276: if (setvec[q[j]] == 0) {
1277: setcnt++;
1278: setvec[q[j]] = 1;
1279: }
1280: }
1281: }
1282: }
1283: }
1284: /* determine if setvec is a previous state */
1285: tmpset[0] = setcnt;
1286: j = 1;
1287: for (i = f->accept; i >= 0; i--)
1288: if (setvec[i]) {
1289: tmpset[j++] = i;
1290: }
1.27 millert 1291: resize_state(f, f->curstat > s ? f->curstat : s);
1.1 tholo 1292: /* tmpset == previous state? */
1293: for (i = 1; i <= f->curstat; i++) {
1294: p = f->posns[i];
1295: if ((k = tmpset[0]) != p[0])
1296: goto different;
1297: for (j = 1; j <= k; j++)
1298: if (tmpset[j] != p[j])
1299: goto different;
1300: /* setvec is state i */
1.30 millert 1301: if (c != HAT)
1302: f->gototab[s][c] = i;
1.1 tholo 1303: return i;
1304: different:;
1305: }
1306:
1307: /* add tmpset to current set of states */
1.27 millert 1308: ++(f->curstat);
1309: resize_state(f, f->curstat);
1.1 tholo 1310: for (i = 0; i < NCHARS; i++)
1311: f->gototab[f->curstat][i] = 0;
1312: xfree(f->posns[f->curstat]);
1.27 millert 1313: p = intalloc(setcnt + 1, __func__);
1.1 tholo 1314:
1315: f->posns[f->curstat] = p;
1.30 millert 1316: if (c != HAT)
1317: f->gototab[s][c] = f->curstat;
1.1 tholo 1318: for (i = 0; i <= setcnt; i++)
1319: p[i] = tmpset[i];
1320: if (setvec[f->accept])
1321: f->out[f->curstat] = 1;
1322: else
1323: f->out[f->curstat] = 0;
1324: return f->curstat;
1325: }
1326:
1327:
1328: void freefa(fa *f) /* free a finite automaton */
1329: {
1330: int i;
1331:
1332: if (f == NULL)
1333: return;
1.27 millert 1334: for (i = 0; i < f->state_count; i++)
1335: xfree(f->gototab[i])
1.1 tholo 1336: for (i = 0; i <= f->curstat; i++)
1337: xfree(f->posns[i]);
1338: for (i = 0; i <= f->accept; i++) {
1339: xfree(f->re[i].lfollow);
1340: if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1.30 millert 1341: xfree(f->re[i].lval.np);
1.1 tholo 1342: }
1343: xfree(f->restr);
1.27 millert 1344: xfree(f->out);
1345: xfree(f->posns);
1346: xfree(f->gototab);
1.1 tholo 1347: xfree(f);
1348: }