Annotation of src/usr.bin/awk/b.c, Revision 1.35
1.35 ! millert 1: /* $OpenBSD: b.c,v 1.34 2020/07/30 17:45:44 millert Exp $ */
1.1 tholo 2: /****************************************************************
1.5 kstailey 3: Copyright (C) Lucent Technologies 1997
1.1 tholo 4: All Rights Reserved
5:
6: Permission to use, copy, modify, and distribute this software and
7: its documentation for any purpose and without fee is hereby
8: granted, provided that the above copyright notice appear in all
9: copies and that both that the copyright notice and this
10: permission notice and warranty disclaimer appear in supporting
1.5 kstailey 11: documentation, and that the name Lucent Technologies or any of
12: its entities not be used in advertising or publicity pertaining
13: to distribution of the software without specific, written prior
14: permission.
15:
16: LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17: INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18: IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19: SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20: WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21: IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22: ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23: THIS SOFTWARE.
1.1 tholo 24: ****************************************************************/
25:
1.15 millert 26: /* lasciate ogne speranza, voi ch'intrate. */
1.1 tholo 27:
28: #define DEBUG
29:
30: #include <ctype.h>
1.23 millert 31: #include <limits.h>
1.1 tholo 32: #include <stdio.h>
33: #include <string.h>
34: #include <stdlib.h>
35: #include "awk.h"
1.34 millert 36: #include "awkgram.tab.h"
1.1 tholo 37:
38: #define MAXLIN 22
39:
1.7 millert 40: #define type(v) (v)->nobj /* badly overloaded here */
41: #define info(v) (v)->ntype /* badly overloaded here */
1.1 tholo 42: #define left(v) (v)->narg[0]
43: #define right(v) (v)->narg[1]
44: #define parent(v) (v)->nnext
45:
46: #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
1.15 millert 47: #define ELEAF case EMPTYRE: /* empty string in regexp */
1.1 tholo 48: #define UNARY case STAR: case PLUS: case QUEST:
49:
50: /* encoding in tree Nodes:
1.15 millert 51: leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
1.1 tholo 52: left is index, right contains value or pointer to value
53: unary (STAR, PLUS, QUEST): left is child, right is null
54: binary (CAT, OR): left and right are children
55: parent contains pointer to parent
56: */
57:
58:
59: int *setvec;
60: int *tmpset;
61: int maxsetvec = 0;
62:
63: int rtok; /* next token in current re */
1.4 millert 64: int rlxval;
1.28 millert 65: static const uschar *rlxstr;
66: static const uschar *prestr; /* current position in current re */
67: static const uschar *lastre; /* origin of last re */
68: static const uschar *lastatom; /* origin of last Atom */
69: static const uschar *starttok;
70: static const uschar *basestr; /* starts with original, replaced during
1.23 millert 71: repetition processing */
1.28 millert 72: static const uschar *firstbasestr;
1.1 tholo 73:
1.4 millert 74: static int setcnt;
75: static int poscnt;
1.1 tholo 76:
1.28 millert 77: const char *patbeg;
1.1 tholo 78: int patlen;
79:
1.27 millert 80: #define NFA 128 /* cache this many dynamic fa's */
1.1 tholo 81: fa *fatab[NFA];
82: int nfatab = 0; /* entries in fatab */
83:
1.27 millert 84: static int *
85: intalloc(size_t n, const char *f)
86: {
1.35 ! millert 87: int *p = (int *) calloc(n, sizeof(int));
1.27 millert 88: if (p == NULL)
89: overflo(f);
90: return p;
91: }
92:
93: static void
94: allocsetvec(const char *f)
95: {
96: maxsetvec = MAXLIN;
1.35 ! millert 97: setvec = (int *) reallocarray(setvec, maxsetvec, sizeof(*setvec));
! 98: tmpset = (int *) reallocarray(tmpset, maxsetvec, sizeof(*tmpset));
1.27 millert 99: if (setvec == NULL || tmpset == NULL)
100: overflo(f);
101: }
102:
103: static void
104: resizesetvec(const char *f)
105: {
1.35 ! millert 106: setvec = (int *) reallocarray(setvec, maxsetvec, 4 * sizeof(*setvec));
! 107: tmpset = (int *) reallocarray(tmpset, maxsetvec, 4 * sizeof(*tmpset));
1.27 millert 108: if (setvec == NULL || tmpset == NULL)
109: overflo(f);
110: maxsetvec *= 4;
111: }
112:
113: static void
114: resize_state(fa *f, int state)
115: {
1.35 ! millert 116: unsigned int **p;
! 117: uschar *p2;
! 118: int **p3;
1.27 millert 119: int i, new_count;
120:
121: if (++state < f->state_count)
122: return;
123:
124: new_count = state + 10; /* needs to be tuned */
125:
1.35 ! millert 126: p = (unsigned int **) reallocarray(f->gototab, new_count, sizeof(f->gototab[0]));
1.27 millert 127: if (p == NULL)
128: goto out;
129: f->gototab = p;
130:
1.35 ! millert 131: p2 = (uschar *) reallocarray(f->out, new_count, sizeof(f->out[0]));
! 132: if (p2 == NULL)
1.27 millert 133: goto out;
1.35 ! millert 134: f->out = p2;
1.27 millert 135:
1.35 ! millert 136: p3 = (int **) reallocarray(f->posns, new_count, sizeof(f->posns[0]));
! 137: if (p3 == NULL)
1.27 millert 138: goto out;
1.35 ! millert 139: f->posns = p3;
1.27 millert 140:
141: for (i = f->state_count; i < new_count; ++i) {
1.35 ! millert 142: f->gototab[i] = (unsigned int *) calloc(NCHARS, sizeof(**f->gototab));
1.27 millert 143: if (f->gototab[i] == NULL)
144: goto out;
145: f->out[i] = 0;
146: f->posns[i] = NULL;
147: }
148: f->state_count = new_count;
149: return;
150: out:
151: overflo(__func__);
152: }
153:
1.29 millert 154: fa *makedfa(const char *s, bool anchor) /* returns dfa for reg expr s */
1.1 tholo 155: {
156: int i, use, nuse;
157: fa *pfa;
1.6 millert 158: static int now = 1;
1.1 tholo 159:
1.25 millert 160: if (setvec == NULL) { /* first time through any RE */
1.27 millert 161: allocsetvec(__func__);
1.1 tholo 162: }
163:
1.29 millert 164: if (compile_time != RUNNING) /* a constant for sure */
1.1 tholo 165: return mkdfa(s, anchor);
166: for (i = 0; i < nfatab; i++) /* is it there already? */
167: if (fatab[i]->anchor == anchor
1.11 millert 168: && strcmp((const char *) fatab[i]->restr, s) == 0) {
1.6 millert 169: fatab[i]->use = now++;
1.1 tholo 170: return fatab[i];
1.10 millert 171: }
1.1 tholo 172: pfa = mkdfa(s, anchor);
173: if (nfatab < NFA) { /* room for another */
174: fatab[nfatab] = pfa;
1.6 millert 175: fatab[nfatab]->use = now++;
1.1 tholo 176: nfatab++;
177: return pfa;
178: }
179: use = fatab[0]->use; /* replace least-recently used */
180: nuse = 0;
181: for (i = 1; i < nfatab; i++)
182: if (fatab[i]->use < use) {
183: use = fatab[i]->use;
184: nuse = i;
185: }
186: freefa(fatab[nuse]);
187: fatab[nuse] = pfa;
1.6 millert 188: pfa->use = now++;
1.1 tholo 189: return pfa;
190: }
191:
1.29 millert 192: fa *mkdfa(const char *s, bool anchor) /* does the real work of making a dfa */
1.30 millert 193: /* anchor = true for anchored matches, else false */
1.1 tholo 194: {
195: Node *p, *p1;
196: fa *f;
197:
1.28 millert 198: firstbasestr = (const uschar *) s;
1.23 millert 199: basestr = firstbasestr;
1.1 tholo 200: p = reparse(s);
201: p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
202: /* put ALL STAR in front of reg. exp. */
203: p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
204: /* put FINAL after reg. exp. */
205:
206: poscnt = 0;
207: penter(p1); /* enter parent pointers and leaf indices */
1.35 ! millert 208: if ((f = (fa *) calloc(1, sizeof(fa) + poscnt * sizeof(rrow))) == NULL)
1.27 millert 209: overflo(__func__);
1.1 tholo 210: f->accept = poscnt-1; /* penter has computed number of positions in re */
211: cfoll(f, p1); /* set up follow sets */
212: freetr(p1);
1.27 millert 213: resize_state(f, 1);
214: f->posns[0] = intalloc(*(f->re[0].lfollow), __func__);
215: f->posns[1] = intalloc(1, __func__);
1.1 tholo 216: *f->posns[1] = 0;
217: f->initstat = makeinit(f, anchor);
218: f->anchor = anchor;
1.10 millert 219: f->restr = (uschar *) tostring(s);
1.23 millert 220: if (firstbasestr != basestr) {
221: if (basestr)
222: xfree(basestr);
223: }
1.1 tholo 224: return f;
225: }
226:
1.29 millert 227: int makeinit(fa *f, bool anchor)
1.1 tholo 228: {
229: int i, k;
230:
231: f->curstat = 2;
232: f->out[2] = 0;
233: k = *(f->re[0].lfollow);
1.25 millert 234: xfree(f->posns[2]);
1.27 millert 235: f->posns[2] = intalloc(k + 1, __func__);
1.30 millert 236: for (i = 0; i <= k; i++) {
1.1 tholo 237: (f->posns[2])[i] = (f->re[0].lfollow)[i];
238: }
239: if ((f->posns[2])[1] == f->accept)
240: f->out[2] = 1;
1.30 millert 241: for (i = 0; i < NCHARS; i++)
1.1 tholo 242: f->gototab[2][i] = 0;
243: f->curstat = cgoto(f, 2, HAT);
244: if (anchor) {
245: *f->posns[2] = k-1; /* leave out position 0 */
1.30 millert 246: for (i = 0; i < k; i++) {
1.1 tholo 247: (f->posns[0])[i] = (f->posns[2])[i];
248: }
249:
250: f->out[0] = f->out[2];
251: if (f->curstat != 2)
252: --(*f->posns[f->curstat]);
253: }
254: return f->curstat;
255: }
256:
257: void penter(Node *p) /* set up parent pointers and leaf indices */
258: {
259: switch (type(p)) {
1.15 millert 260: ELEAF
1.1 tholo 261: LEAF
1.7 millert 262: info(p) = poscnt;
1.1 tholo 263: poscnt++;
264: break;
265: UNARY
266: penter(left(p));
267: parent(left(p)) = p;
268: break;
269: case CAT:
270: case OR:
271: penter(left(p));
272: penter(right(p));
273: parent(left(p)) = p;
274: parent(right(p)) = p;
275: break;
1.31 millert 276: case ZERO:
277: break;
1.1 tholo 278: default: /* can't happen */
1.8 millert 279: FATAL("can't happen: unknown type %d in penter", type(p));
1.1 tholo 280: break;
281: }
282: }
283:
284: void freetr(Node *p) /* free parse tree */
285: {
286: switch (type(p)) {
1.15 millert 287: ELEAF
1.1 tholo 288: LEAF
289: xfree(p);
290: break;
291: UNARY
1.31 millert 292: case ZERO:
1.1 tholo 293: freetr(left(p));
294: xfree(p);
295: break;
296: case CAT:
297: case OR:
298: freetr(left(p));
299: freetr(right(p));
300: xfree(p);
301: break;
302: default: /* can't happen */
1.8 millert 303: FATAL("can't happen: unknown type %d in freetr", type(p));
1.1 tholo 304: break;
305: }
306: }
307:
308: /* in the parsing of regular expressions, metacharacters like . have */
309: /* to be seen literally; \056 is not a metacharacter. */
310:
1.28 millert 311: int hexstr(const uschar **pp) /* find and eval hex string at pp, return new p */
1.2 millert 312: { /* only pick up one 8-bit byte (2 chars) */
1.28 millert 313: const uschar *p;
1.1 tholo 314: int n = 0;
1.2 millert 315: int i;
1.1 tholo 316:
1.28 millert 317: for (i = 0, p = *pp; i < 2 && isxdigit(*p); i++, p++) {
1.1 tholo 318: if (isdigit(*p))
319: n = 16 * n + *p - '0';
320: else if (*p >= 'a' && *p <= 'f')
321: n = 16 * n + *p - 'a' + 10;
322: else if (*p >= 'A' && *p <= 'F')
323: n = 16 * n + *p - 'A' + 10;
324: }
1.28 millert 325: *pp = p;
1.1 tholo 326: return n;
327: }
328:
1.2 millert 329: #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
1.1 tholo 330:
1.28 millert 331: int quoted(const uschar **pp) /* pick up next thing after a \\ */
1.1 tholo 332: /* and increment *pp */
333: {
1.28 millert 334: const uschar *p = *pp;
1.1 tholo 335: int c;
336:
337: if ((c = *p++) == 't')
338: c = '\t';
339: else if (c == 'n')
340: c = '\n';
341: else if (c == 'f')
342: c = '\f';
343: else if (c == 'r')
344: c = '\r';
345: else if (c == 'b')
346: c = '\b';
1.25 millert 347: else if (c == 'v')
348: c = '\v';
1.20 millert 349: else if (c == 'a')
1.25 millert 350: c = '\a';
1.1 tholo 351: else if (c == '\\')
352: c = '\\';
353: else if (c == 'x') { /* hexadecimal goo follows */
354: c = hexstr(&p); /* this adds a null if number is invalid */
355: } else if (isoctdigit(c)) { /* \d \dd \ddd */
356: int n = c - '0';
357: if (isoctdigit(*p)) {
358: n = 8 * n + *p++ - '0';
359: if (isoctdigit(*p))
360: n = 8 * n + *p++ - '0';
361: }
362: c = n;
363: } /* else */
364: /* c = c; */
365: *pp = p;
366: return c;
367: }
368:
1.11 millert 369: char *cclenter(const char *argp) /* add a character class */
1.1 tholo 370: {
371: int i, c, c2;
1.28 millert 372: const uschar *op, *p = (const uschar *) argp;
373: uschar *bp;
1.25 millert 374: static uschar *buf = NULL;
1.5 kstailey 375: static int bufsz = 100;
1.1 tholo 376:
377: op = p;
1.35 ! millert 378: if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1.8 millert 379: FATAL("out of space for character class [%.10s...] 1", p);
1.5 kstailey 380: bp = buf;
381: for (i = 0; (c = *p++) != 0; ) {
1.1 tholo 382: if (c == '\\') {
1.17 millert 383: c = quoted(&p);
1.5 kstailey 384: } else if (c == '-' && i > 0 && bp[-1] != 0) {
1.1 tholo 385: if (*p != 0) {
1.5 kstailey 386: c = bp[-1];
1.1 tholo 387: c2 = *p++;
388: if (c2 == '\\')
1.17 millert 389: c2 = quoted(&p);
1.1 tholo 390: if (c > c2) { /* empty; ignore */
1.5 kstailey 391: bp--;
1.1 tholo 392: i--;
393: continue;
394: }
395: while (c < c2) {
1.15 millert 396: if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
1.8 millert 397: FATAL("out of space for character class [%.10s...] 2", p);
1.5 kstailey 398: *bp++ = ++c;
1.1 tholo 399: i++;
400: }
401: continue;
402: }
403: }
1.15 millert 404: if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
1.8 millert 405: FATAL("out of space for character class [%.10s...] 3", p);
1.5 kstailey 406: *bp++ = c;
1.1 tholo 407: i++;
408: }
1.5 kstailey 409: *bp = 0;
1.32 millert 410: DPRINTF("cclenter: in = |%s|, out = |%s|\n", op, buf);
1.1 tholo 411: xfree(op);
1.10 millert 412: return (char *) tostring((char *) buf);
1.1 tholo 413: }
414:
1.11 millert 415: void overflo(const char *s)
1.1 tholo 416: {
1.27 millert 417: FATAL("regular expression too big: out of space in %.30s...", s);
1.1 tholo 418: }
419:
420: void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
421: {
422: int i;
1.4 millert 423: int *p;
1.1 tholo 424:
425: switch (type(v)) {
1.15 millert 426: ELEAF
1.1 tholo 427: LEAF
1.7 millert 428: f->re[info(v)].ltype = type(v);
429: f->re[info(v)].lval.np = right(v);
1.1 tholo 430: while (f->accept >= maxsetvec) { /* guessing here! */
1.27 millert 431: resizesetvec(__func__);
1.1 tholo 432: }
433: for (i = 0; i <= f->accept; i++)
434: setvec[i] = 0;
435: setcnt = 0;
436: follow(v); /* computes setvec and setcnt */
1.27 millert 437: p = intalloc(setcnt + 1, __func__);
1.7 millert 438: f->re[info(v)].lfollow = p;
1.1 tholo 439: *p = setcnt;
440: for (i = f->accept; i >= 0; i--)
441: if (setvec[i] == 1)
442: *++p = i;
443: break;
444: UNARY
445: cfoll(f,left(v));
446: break;
447: case CAT:
448: case OR:
449: cfoll(f,left(v));
450: cfoll(f,right(v));
451: break;
1.31 millert 452: case ZERO:
453: break;
1.1 tholo 454: default: /* can't happen */
1.8 millert 455: FATAL("can't happen: unknown type %d in cfoll", type(v));
1.1 tholo 456: }
457: }
458:
459: int first(Node *p) /* collects initially active leaves of p into setvec */
1.15 millert 460: /* returns 0 if p matches empty string */
1.1 tholo 461: {
1.4 millert 462: int b, lp;
1.1 tholo 463:
464: switch (type(p)) {
1.15 millert 465: ELEAF
1.1 tholo 466: LEAF
1.7 millert 467: lp = info(p); /* look for high-water mark of subscripts */
1.1 tholo 468: while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
1.27 millert 469: resizesetvec(__func__);
1.1 tholo 470: }
1.15 millert 471: if (type(p) == EMPTYRE) {
472: setvec[lp] = 0;
473: return(0);
474: }
1.1 tholo 475: if (setvec[lp] != 1) {
476: setvec[lp] = 1;
477: setcnt++;
478: }
479: if (type(p) == CCL && (*(char *) right(p)) == '\0')
480: return(0); /* empty CCL */
1.30 millert 481: return(1);
1.1 tholo 482: case PLUS:
1.30 millert 483: if (first(left(p)) == 0)
484: return(0);
1.1 tholo 485: return(1);
486: case STAR:
487: case QUEST:
488: first(left(p));
489: return(0);
490: case CAT:
491: if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
492: return(1);
493: case OR:
494: b = first(right(p));
495: if (first(left(p)) == 0 || b == 0) return(0);
496: return(1);
1.31 millert 497: case ZERO:
498: return 0;
1.1 tholo 499: }
1.8 millert 500: FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
1.1 tholo 501: return(-1);
502: }
503:
504: void follow(Node *v) /* collects leaves that can follow v into setvec */
505: {
506: Node *p;
507:
508: if (type(v) == FINAL)
509: return;
510: p = parent(v);
511: switch (type(p)) {
512: case STAR:
513: case PLUS:
514: first(v);
515: follow(p);
516: return;
517:
518: case OR:
519: case QUEST:
520: follow(p);
521: return;
522:
523: case CAT:
524: if (v == left(p)) { /* v is left child of p */
525: if (first(right(p)) == 0) {
526: follow(p);
527: return;
528: }
529: } else /* v is right child */
530: follow(p);
531: return;
532: }
533: }
534:
1.11 millert 535: int member(int c, const char *sarg) /* is c in s? */
1.1 tholo 536: {
1.28 millert 537: const uschar *s = (const uschar *) sarg;
1.10 millert 538:
1.1 tholo 539: while (*s)
540: if (c == *s++)
541: return(1);
542: return(0);
543: }
544:
1.11 millert 545: int match(fa *f, const char *p0) /* shortest match ? */
1.1 tholo 546: {
547: int s, ns;
1.28 millert 548: const uschar *p = (const uschar *) p0;
1.1 tholo 549:
1.27 millert 550: s = f->initstat;
551: assert (s < f->state_count);
552:
1.1 tholo 553: if (f->out[s])
554: return(1);
555: do {
1.15 millert 556: /* assert(*p < NCHARS); */
1.1 tholo 557: if ((ns = f->gototab[s][*p]) != 0)
558: s = ns;
559: else
560: s = cgoto(f, s, *p);
561: if (f->out[s])
562: return(1);
563: } while (*p++ != 0);
564: return(0);
565: }
566:
1.11 millert 567: int pmatch(fa *f, const char *p0) /* longest match, for sub */
1.1 tholo 568: {
569: int s, ns;
1.28 millert 570: const uschar *p = (const uschar *) p0;
571: const uschar *q;
1.1 tholo 572:
1.27 millert 573: s = f->initstat;
574: assert(s < f->state_count);
575:
1.28 millert 576: patbeg = (const char *)p;
1.1 tholo 577: patlen = -1;
578: do {
579: q = p;
580: do {
581: if (f->out[s]) /* final state */
582: patlen = q-p;
1.15 millert 583: /* assert(*q < NCHARS); */
1.1 tholo 584: if ((ns = f->gototab[s][*q]) != 0)
585: s = ns;
586: else
587: s = cgoto(f, s, *q);
1.27 millert 588:
589: assert(s < f->state_count);
590:
1.9 deraadt 591: if (s == 1) { /* no transition */
1.1 tholo 592: if (patlen >= 0) {
1.28 millert 593: patbeg = (const char *) p;
1.1 tholo 594: return(1);
595: }
596: else
597: goto nextin; /* no match */
1.9 deraadt 598: }
1.1 tholo 599: } while (*q++ != 0);
600: if (f->out[s])
601: patlen = q-p-1; /* don't count $ */
602: if (patlen >= 0) {
1.28 millert 603: patbeg = (const char *) p;
1.1 tholo 604: return(1);
605: }
606: nextin:
607: s = 2;
1.27 millert 608: } while (*p++);
1.1 tholo 609: return (0);
610: }
611:
1.11 millert 612: int nematch(fa *f, const char *p0) /* non-empty match, for sub */
1.1 tholo 613: {
614: int s, ns;
1.28 millert 615: const uschar *p = (const uschar *) p0;
616: const uschar *q;
1.1 tholo 617:
1.27 millert 618: s = f->initstat;
619: assert(s < f->state_count);
620:
1.28 millert 621: patbeg = (const char *)p;
1.1 tholo 622: patlen = -1;
623: while (*p) {
624: q = p;
625: do {
626: if (f->out[s]) /* final state */
627: patlen = q-p;
1.15 millert 628: /* assert(*q < NCHARS); */
1.1 tholo 629: if ((ns = f->gototab[s][*q]) != 0)
630: s = ns;
631: else
632: s = cgoto(f, s, *q);
1.9 deraadt 633: if (s == 1) { /* no transition */
1.1 tholo 634: if (patlen > 0) {
1.28 millert 635: patbeg = (const char *) p;
1.1 tholo 636: return(1);
637: } else
638: goto nnextin; /* no nonempty match */
1.9 deraadt 639: }
1.1 tholo 640: } while (*q++ != 0);
641: if (f->out[s])
642: patlen = q-p-1; /* don't count $ */
643: if (patlen > 0 ) {
1.28 millert 644: patbeg = (const char *) p;
1.1 tholo 645: return(1);
646: }
647: nnextin:
648: s = 2;
649: p++;
650: }
651: return (0);
652: }
653:
1.26 millert 654:
655: /*
656: * NAME
657: * fnematch
658: *
659: * DESCRIPTION
660: * A stream-fed version of nematch which transfers characters to a
661: * null-terminated buffer. All characters up to and including the last
662: * character of the matching text or EOF are placed in the buffer. If
663: * a match is found, patbeg and patlen are set appropriately.
664: *
665: * RETURN VALUES
1.29 millert 666: * false No match found.
667: * true Match found.
1.26 millert 668: */
669:
1.29 millert 670: bool fnematch(fa *pfa, FILE *f, char **pbuf, int *pbufsize, int quantum)
1.26 millert 671: {
672: char *buf = *pbuf;
673: int bufsize = *pbufsize;
674: int c, i, j, k, ns, s;
675:
676: s = pfa->initstat;
677: patlen = 0;
678:
679: /*
680: * All indices relative to buf.
681: * i <= j <= k <= bufsize
682: *
683: * i: origin of active substring
684: * j: current character
685: * k: destination of next getc()
686: */
687: i = -1, k = 0;
688: do {
689: j = i++;
690: do {
691: if (++j == k) {
692: if (k == bufsize)
693: if (!adjbuf(&buf, &bufsize, bufsize+1, quantum, 0, "fnematch"))
694: FATAL("stream '%.30s...' too long", buf);
695: buf[k++] = (c = getc(f)) != EOF ? c : 0;
696: }
1.33 millert 697: c = (uschar)buf[j];
1.26 millert 698: /* assert(c < NCHARS); */
699:
700: if ((ns = pfa->gototab[s][c]) != 0)
701: s = ns;
702: else
703: s = cgoto(pfa, s, c);
704:
705: if (pfa->out[s]) { /* final state */
706: patlen = j - i + 1;
707: if (c == 0) /* don't count $ */
708: patlen--;
709: }
710: } while (buf[j] && s != 1);
711: s = 2;
712: } while (buf[i] && !patlen);
713:
714: /* adjbuf() may have relocated a resized buffer. Inform the world. */
715: *pbuf = buf;
716: *pbufsize = bufsize;
717:
718: if (patlen) {
719: patbeg = buf + i;
720: /*
721: * Under no circumstances is the last character fed to
722: * the automaton part of the match. It is EOF's nullbyte,
723: * or it sent the automaton into a state with no further
724: * transitions available (s==1), or both. Room for a
725: * terminating nullbyte is guaranteed.
726: *
727: * ungetc any chars after the end of matching text
728: * (except for EOF's nullbyte, if present) and null
729: * terminate the buffer.
730: */
731: do
732: if (buf[--k] && ungetc(buf[k], f) == EOF)
733: FATAL("unable to ungetc '%c'", buf[k]);
734: while (k > i + patlen);
1.30 millert 735: buf[k] = '\0';
1.29 millert 736: return true;
1.26 millert 737: }
738: else
1.29 millert 739: return false;
1.26 millert 740: }
741:
1.11 millert 742: Node *reparse(const char *p) /* parses regular expression pointed to by p */
1.1 tholo 743: { /* uses relex() to scan regular expression */
744: Node *np;
745:
1.32 millert 746: DPRINTF("reparse <%s>\n", p);
1.28 millert 747: lastre = prestr = (const uschar *) p; /* prestr points to string to be parsed */
1.1 tholo 748: rtok = relex();
1.11 millert 749: /* GNU compatibility: an empty regexp matches anything */
1.15 millert 750: if (rtok == '\0') {
1.11 millert 751: /* FATAL("empty regular expression"); previous */
1.15 millert 752: return(op2(EMPTYRE, NIL, NIL));
753: }
1.1 tholo 754: np = regexp();
755: if (rtok != '\0')
1.8 millert 756: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 757: return(np);
758: }
759:
760: Node *regexp(void) /* top-level parse of reg expr */
761: {
762: return (alt(concat(primary())));
763: }
764:
765: Node *primary(void)
766: {
767: Node *np;
1.23 millert 768: int savelastatom;
1.1 tholo 769:
770: switch (rtok) {
771: case CHAR:
1.23 millert 772: lastatom = starttok;
1.7 millert 773: np = op2(CHAR, NIL, itonp(rlxval));
1.1 tholo 774: rtok = relex();
775: return (unary(np));
776: case ALL:
777: rtok = relex();
778: return (unary(op2(ALL, NIL, NIL)));
1.15 millert 779: case EMPTYRE:
780: rtok = relex();
1.23 millert 781: return (unary(op2(EMPTYRE, NIL, NIL)));
1.1 tholo 782: case DOT:
1.23 millert 783: lastatom = starttok;
1.1 tholo 784: rtok = relex();
785: return (unary(op2(DOT, NIL, NIL)));
786: case CCL:
1.28 millert 787: np = op2(CCL, NIL, (Node*) cclenter((const char *) rlxstr));
1.23 millert 788: lastatom = starttok;
1.1 tholo 789: rtok = relex();
790: return (unary(np));
791: case NCCL:
1.28 millert 792: np = op2(NCCL, NIL, (Node *) cclenter((const char *) rlxstr));
1.23 millert 793: lastatom = starttok;
1.1 tholo 794: rtok = relex();
795: return (unary(np));
796: case '^':
797: rtok = relex();
1.7 millert 798: return (unary(op2(CHAR, NIL, itonp(HAT))));
1.1 tholo 799: case '$':
800: rtok = relex();
801: return (unary(op2(CHAR, NIL, NIL)));
802: case '(':
1.23 millert 803: lastatom = starttok;
804: savelastatom = starttok - basestr; /* Retain over recursion */
1.1 tholo 805: rtok = relex();
806: if (rtok == ')') { /* special pleading for () */
807: rtok = relex();
808: return unary(op2(CCL, NIL, (Node *) tostring("")));
809: }
810: np = regexp();
811: if (rtok == ')') {
1.23 millert 812: lastatom = basestr + savelastatom; /* Restore */
1.1 tholo 813: rtok = relex();
814: return (unary(np));
815: }
816: else
1.8 millert 817: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 818: default:
1.8 millert 819: FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1.1 tholo 820: }
821: return 0; /*NOTREACHED*/
822: }
823:
824: Node *concat(Node *np)
825: {
826: switch (rtok) {
1.23 millert 827: case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
1.1 tholo 828: return (concat(op2(CAT, np, primary())));
1.23 millert 829: case EMPTYRE:
830: rtok = relex();
831: return (concat(op2(CAT, op2(CCL, NIL, (Node *) tostring("")),
832: primary())));
1.1 tholo 833: }
834: return (np);
835: }
836:
837: Node *alt(Node *np)
838: {
839: if (rtok == OR) {
840: rtok = relex();
841: return (alt(op2(OR, np, concat(primary()))));
842: }
843: return (np);
844: }
845:
846: Node *unary(Node *np)
847: {
848: switch (rtok) {
849: case STAR:
850: rtok = relex();
851: return (unary(op2(STAR, np, NIL)));
852: case PLUS:
853: rtok = relex();
854: return (unary(op2(PLUS, np, NIL)));
855: case QUEST:
856: rtok = relex();
857: return (unary(op2(QUEST, np, NIL)));
1.31 millert 858: case ZERO:
859: rtok = relex();
860: return (unary(op2(ZERO, np, NIL)));
1.1 tholo 861: default:
862: return (np);
863: }
864: }
865:
1.11 millert 866: /*
867: * Character class definitions conformant to the POSIX locale as
868: * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
869: * and operating character sets are both ASCII (ISO646) or supersets
870: * thereof.
871: *
872: * Note that to avoid overflowing the temporary buffer used in
873: * relex(), the expanded character class (prior to range expansion)
874: * must be less than twice the size of their full name.
875: */
1.12 millert 876:
877: /* Because isblank doesn't show up in any of the header files on any
878: * system i use, it's defined here. if some other locale has a richer
879: * definition of "blank", define HAS_ISBLANK and provide your own
880: * version.
881: * the parentheses here are an attempt to find a path through the maze
882: * of macro definition and/or function and/or version provided. thanks
883: * to nelson beebe for the suggestion; let's see if it works everywhere.
884: */
885:
886: #ifndef HAS_ISBLANK
887:
1.17 millert 888: int (xisblank)(int c)
1.12 millert 889: {
890: return c==' ' || c=='\t';
891: }
892:
893: #endif
894:
1.31 millert 895: static const struct charclass {
1.11 millert 896: const char *cc_name;
897: int cc_namelen;
1.12 millert 898: int (*cc_func)(int);
1.11 millert 899: } charclasses[] = {
1.12 millert 900: { "alnum", 5, isalnum },
901: { "alpha", 5, isalpha },
1.17 millert 902: #ifndef HAS_ISBLANK
1.21 millert 903: { "blank", 5, xisblank },
1.17 millert 904: #else
1.12 millert 905: { "blank", 5, isblank },
1.17 millert 906: #endif
1.12 millert 907: { "cntrl", 5, iscntrl },
908: { "digit", 5, isdigit },
909: { "graph", 5, isgraph },
910: { "lower", 5, islower },
911: { "print", 5, isprint },
912: { "punct", 5, ispunct },
913: { "space", 5, isspace },
914: { "upper", 5, isupper },
915: { "xdigit", 6, isxdigit },
1.11 millert 916: { NULL, 0, NULL },
917: };
918:
1.23 millert 919: #define REPEAT_SIMPLE 0
920: #define REPEAT_PLUS_APPENDED 1
921: #define REPEAT_WITH_Q 2
922: #define REPEAT_ZERO 3
923:
924: static int
925: replace_repeat(const uschar *reptok, int reptoklen, const uschar *atom,
926: int atomlen, int firstnum, int secondnum, int special_case)
927: {
928: int i, j;
1.26 millert 929: uschar *buf = NULL;
1.23 millert 930: int ret = 1;
1.31 millert 931: int init_q = (firstnum == 0); /* first added char will be ? */
1.23 millert 932: int n_q_reps = secondnum-firstnum; /* m>n, so reduce until {1,m-n} left */
933: int prefix_length = reptok - basestr; /* prefix includes first rep */
1.28 millert 934: int suffix_length = strlen((const char *) reptok) - reptoklen; /* string after rep specifier */
1.23 millert 935: int size = prefix_length + suffix_length;
936:
937: if (firstnum > 1) { /* add room for reps 2 through firstnum */
938: size += atomlen*(firstnum-1);
939: }
940:
941: /* Adjust size of buffer for special cases */
942: if (special_case == REPEAT_PLUS_APPENDED) {
943: size++; /* for the final + */
944: } else if (special_case == REPEAT_WITH_Q) {
945: size += init_q + (atomlen+1)* n_q_reps;
946: } else if (special_case == REPEAT_ZERO) {
947: size += 2; /* just a null ERE: () */
948: }
1.35 ! millert 949: if ((buf = (uschar *) malloc(size + 1)) == NULL)
1.23 millert 950: FATAL("out of space in reg expr %.10s..", lastre);
951: memcpy(buf, basestr, prefix_length); /* copy prefix */
952: j = prefix_length;
953: if (special_case == REPEAT_ZERO) {
954: j -= atomlen;
955: buf[j++] = '(';
956: buf[j++] = ')';
957: }
1.30 millert 958: for (i = 1; i < firstnum; i++) { /* copy x reps */
1.23 millert 959: memcpy(&buf[j], atom, atomlen);
960: j += atomlen;
961: }
962: if (special_case == REPEAT_PLUS_APPENDED) {
963: buf[j++] = '+';
964: } else if (special_case == REPEAT_WITH_Q) {
1.29 millert 965: if (init_q)
966: buf[j++] = '?';
1.30 millert 967: for (i = init_q; i < n_q_reps; i++) { /* copy x? reps */
1.23 millert 968: memcpy(&buf[j], atom, atomlen);
969: j += atomlen;
970: buf[j++] = '?';
971: }
972: }
973: memcpy(&buf[j], reptok+reptoklen, suffix_length);
974: if (special_case == REPEAT_ZERO) {
975: buf[j+suffix_length] = '\0';
976: } else {
977: buf[size] = '\0';
978: }
979: /* free old basestr */
980: if (firstbasestr != basestr) {
981: if (basestr)
982: xfree(basestr);
983: }
984: basestr = buf;
985: prestr = buf + prefix_length;
986: if (special_case == REPEAT_ZERO) {
987: prestr -= atomlen;
988: ret++;
989: }
990: return ret;
991: }
992:
993: static int repeat(const uschar *reptok, int reptoklen, const uschar *atom,
994: int atomlen, int firstnum, int secondnum)
995: {
996: /*
997: In general, the repetition specifier or "bound" is replaced here
998: by an equivalent ERE string, repeating the immediately previous atom
999: and appending ? and + as needed. Note that the first copy of the
1000: atom is left in place, except in the special_case of a zero-repeat
1001: (i.e., {0}).
1002: */
1003: if (secondnum < 0) { /* means {n,} -> repeat n-1 times followed by PLUS */
1004: if (firstnum < 2) {
1005: /* 0 or 1: should be handled before you get here */
1006: FATAL("internal error");
1007: } else {
1008: return replace_repeat(reptok, reptoklen, atom, atomlen,
1009: firstnum, secondnum, REPEAT_PLUS_APPENDED);
1010: }
1011: } else if (firstnum == secondnum) { /* {n} or {n,n} -> simply repeat n-1 times */
1012: if (firstnum == 0) { /* {0} or {0,0} */
1013: /* This case is unusual because the resulting
1014: replacement string might actually be SMALLER than
1015: the original ERE */
1016: return replace_repeat(reptok, reptoklen, atom, atomlen,
1017: firstnum, secondnum, REPEAT_ZERO);
1018: } else { /* (firstnum >= 1) */
1019: return replace_repeat(reptok, reptoklen, atom, atomlen,
1020: firstnum, secondnum, REPEAT_SIMPLE);
1021: }
1022: } else if (firstnum < secondnum) { /* {n,m} -> repeat n-1 times then alternate */
1023: /* x{n,m} => xx...x{1, m-n+1} => xx...x?x?x?..x? */
1024: return replace_repeat(reptok, reptoklen, atom, atomlen,
1025: firstnum, secondnum, REPEAT_WITH_Q);
1026: } else { /* Error - shouldn't be here (n>m) */
1027: FATAL("internal error");
1028: }
1029: return 0;
1030: }
1.11 millert 1031:
1.1 tholo 1032: int relex(void) /* lexical analyzer for reparse */
1033: {
1.5 kstailey 1034: int c, n;
1.1 tholo 1035: int cflag;
1.25 millert 1036: static uschar *buf = NULL;
1.5 kstailey 1037: static int bufsz = 100;
1.10 millert 1038: uschar *bp;
1.31 millert 1039: const struct charclass *cc;
1.12 millert 1040: int i;
1.29 millert 1041: int num, m;
1042: bool commafound, digitfound;
1.23 millert 1043: const uschar *startreptok;
1.24 millert 1044: static int parens = 0;
1.23 millert 1045:
1046: rescan:
1047: starttok = prestr;
1.1 tholo 1048:
1049: switch (c = *prestr++) {
1050: case '|': return OR;
1051: case '*': return STAR;
1052: case '+': return PLUS;
1053: case '?': return QUEST;
1054: case '.': return DOT;
1055: case '\0': prestr--; return '\0';
1056: case '^':
1057: case '$':
1.24 millert 1058: return c;
1.1 tholo 1059: case '(':
1.24 millert 1060: parens++;
1061: return c;
1.1 tholo 1062: case ')':
1.24 millert 1063: if (parens) {
1064: parens--;
1065: return c;
1066: }
1067: /* unmatched close parenthesis; per POSIX, treat as literal */
1068: rlxval = c;
1069: return CHAR;
1.1 tholo 1070: case '\\':
1.17 millert 1071: rlxval = quoted(&prestr);
1.1 tholo 1072: return CHAR;
1073: default:
1074: rlxval = c;
1075: return CHAR;
1.25 millert 1076: case '[':
1.35 ! millert 1077: if (buf == NULL && (buf = (uschar *) malloc(bufsz)) == NULL)
1.8 millert 1078: FATAL("out of space in reg expr %.10s..", lastre);
1.5 kstailey 1079: bp = buf;
1.1 tholo 1080: if (*prestr == '^') {
1081: cflag = 1;
1082: prestr++;
1083: }
1084: else
1085: cflag = 0;
1.11 millert 1086: n = 2 * strlen((const char *) prestr)+1;
1.15 millert 1087: if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1.8 millert 1088: FATAL("out of space for reg expr %.10s...", lastre);
1.1 tholo 1089: for (; ; ) {
1090: if ((c = *prestr++) == '\\') {
1.5 kstailey 1091: *bp++ = '\\';
1.1 tholo 1092: if ((c = *prestr++) == '\0')
1.8 millert 1093: FATAL("nonterminated character class %.20s...", lastre);
1.5 kstailey 1094: *bp++ = c;
1.10 millert 1095: /* } else if (c == '\n') { */
1096: /* FATAL("newline in character class %.20s...", lastre); */
1.11 millert 1097: } else if (c == '[' && *prestr == ':') {
1098: /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
1099: for (cc = charclasses; cc->cc_name; cc++)
1100: if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
1101: break;
1102: if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
1103: prestr[2 + cc->cc_namelen] == ']') {
1104: prestr += cc->cc_namelen + 3;
1.22 millert 1105: /*
1106: * BUG: We begin at 1, instead of 0, since we
1107: * would otherwise prematurely terminate the
1108: * string for classes like [[:cntrl:]]. This
1109: * means that we can't match the NUL character,
1110: * not without first adapting the entire
1111: * program to track each string's length.
1112: */
1.23 millert 1113: for (i = 1; i <= UCHAR_MAX; i++) {
1.15 millert 1114: if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
1.12 millert 1115: FATAL("out of space for reg expr %.10s...", lastre);
1116: if (cc->cc_func(i)) {
1.32 millert 1117: /* escape backslash */
1118: if (i == '\\') {
1119: *bp++ = '\\';
1120: n++;
1121: }
1122:
1.12 millert 1123: *bp++ = i;
1124: n++;
1125: }
1126: }
1.11 millert 1127: } else
1128: *bp++ = c;
1.23 millert 1129: } else if (c == '[' && *prestr == '.') {
1130: char collate_char;
1131: prestr++;
1132: collate_char = *prestr++;
1133: if (*prestr == '.' && prestr[1] == ']') {
1134: prestr += 2;
1135: /* Found it: map via locale TBD: for
1136: now, simply return this char. This
1137: is sufficient to pass conformance
1138: test awk.ex 156
1139: */
1140: if (*prestr == ']') {
1141: prestr++;
1142: rlxval = collate_char;
1143: return CHAR;
1144: }
1145: }
1146: } else if (c == '[' && *prestr == '=') {
1147: char equiv_char;
1148: prestr++;
1149: equiv_char = *prestr++;
1150: if (*prestr == '=' && prestr[1] == ']') {
1151: prestr += 2;
1152: /* Found it: map via locale TBD: for now
1153: simply return this char. This is
1154: sufficient to pass conformance test
1155: awk.ex 156
1156: */
1157: if (*prestr == ']') {
1158: prestr++;
1159: rlxval = equiv_char;
1160: return CHAR;
1161: }
1162: }
1.1 tholo 1163: } else if (c == '\0') {
1.8 millert 1164: FATAL("nonterminated character class %.20s", lastre);
1.5 kstailey 1165: } else if (bp == buf) { /* 1st char is special */
1166: *bp++ = c;
1.1 tholo 1167: } else if (c == ']') {
1.5 kstailey 1168: *bp++ = 0;
1.10 millert 1169: rlxstr = (uschar *) tostring((char *) buf);
1.1 tholo 1170: if (cflag == 0)
1171: return CCL;
1172: else
1173: return NCCL;
1174: } else
1.5 kstailey 1175: *bp++ = c;
1.1 tholo 1176: }
1.23 millert 1177: break;
1178: case '{':
1179: if (isdigit(*(prestr))) {
1180: num = 0; /* Process as a repetition */
1181: n = -1; m = -1;
1.29 millert 1182: commafound = false;
1183: digitfound = false;
1.23 millert 1184: startreptok = prestr-1;
1185: /* Remember start of previous atom here ? */
1186: } else { /* just a { char, not a repetition */
1187: rlxval = c;
1188: return CHAR;
1189: }
1190: for (; ; ) {
1191: if ((c = *prestr++) == '}') {
1192: if (commafound) {
1193: if (digitfound) { /* {n,m} */
1194: m = num;
1.30 millert 1195: if (m < n)
1.23 millert 1196: FATAL("illegal repetition expression: class %.20s",
1197: lastre);
1.30 millert 1198: if (n == 0 && m == 1) {
1.23 millert 1199: return QUEST;
1200: }
1201: } else { /* {n,} */
1.30 millert 1202: if (n == 0)
1203: return STAR;
1204: else if (n == 1)
1205: return PLUS;
1.23 millert 1206: }
1207: } else {
1208: if (digitfound) { /* {n} same as {n,n} */
1209: n = num;
1210: m = num;
1211: } else { /* {} */
1212: FATAL("illegal repetition expression: class %.20s",
1213: lastre);
1214: }
1215: }
1216: if (repeat(starttok, prestr-starttok, lastatom,
1217: startreptok - lastatom, n, m) > 0) {
1.30 millert 1218: if (n == 0 && m == 0) {
1.31 millert 1219: return ZERO;
1.23 millert 1220: }
1221: /* must rescan input for next token */
1222: goto rescan;
1223: }
1224: /* Failed to replace: eat up {...} characters
1225: and treat like just PLUS */
1226: return PLUS;
1227: } else if (c == '\0') {
1228: FATAL("nonterminated character class %.20s",
1229: lastre);
1230: } else if (isdigit(c)) {
1231: num = 10 * num + c - '0';
1.29 millert 1232: digitfound = true;
1.23 millert 1233: } else if (c == ',') {
1234: if (commafound)
1235: FATAL("illegal repetition expression: class %.20s",
1236: lastre);
1237: /* looking for {n,} or {n,m} */
1.29 millert 1238: commafound = true;
1.23 millert 1239: n = num;
1.29 millert 1240: digitfound = false; /* reset */
1.23 millert 1241: num = 0;
1242: } else {
1243: FATAL("illegal repetition expression: class %.20s",
1244: lastre);
1245: }
1246: }
1247: break;
1.1 tholo 1248: }
1249: }
1250:
1251: int cgoto(fa *f, int s, int c)
1252: {
1.27 millert 1253: int *p, *q;
1.1 tholo 1254: int i, j, k;
1255:
1.12 millert 1256: assert(c == HAT || c < NCHARS);
1.1 tholo 1257: while (f->accept >= maxsetvec) { /* guessing here! */
1.27 millert 1258: resizesetvec(__func__);
1.1 tholo 1259: }
1260: for (i = 0; i <= f->accept; i++)
1261: setvec[i] = 0;
1262: setcnt = 0;
1.27 millert 1263: resize_state(f, s);
1.1 tholo 1264: /* compute positions of gototab[s,c] into setvec */
1265: p = f->posns[s];
1266: for (i = 1; i <= *p; i++) {
1267: if ((k = f->re[p[i]].ltype) != FINAL) {
1.7 millert 1268: if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1.1 tholo 1269: || (k == DOT && c != 0 && c != HAT)
1270: || (k == ALL && c != 0)
1.15 millert 1271: || (k == EMPTYRE && c != 0)
1.10 millert 1272: || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
1273: || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
1.1 tholo 1274: q = f->re[p[i]].lfollow;
1275: for (j = 1; j <= *q; j++) {
1276: if (q[j] >= maxsetvec) {
1.27 millert 1277: resizesetvec(__func__);
1.1 tholo 1278: }
1279: if (setvec[q[j]] == 0) {
1280: setcnt++;
1281: setvec[q[j]] = 1;
1282: }
1283: }
1284: }
1285: }
1286: }
1287: /* determine if setvec is a previous state */
1288: tmpset[0] = setcnt;
1289: j = 1;
1290: for (i = f->accept; i >= 0; i--)
1291: if (setvec[i]) {
1292: tmpset[j++] = i;
1293: }
1.27 millert 1294: resize_state(f, f->curstat > s ? f->curstat : s);
1.1 tholo 1295: /* tmpset == previous state? */
1296: for (i = 1; i <= f->curstat; i++) {
1297: p = f->posns[i];
1298: if ((k = tmpset[0]) != p[0])
1299: goto different;
1300: for (j = 1; j <= k; j++)
1301: if (tmpset[j] != p[j])
1302: goto different;
1303: /* setvec is state i */
1.30 millert 1304: if (c != HAT)
1305: f->gototab[s][c] = i;
1.1 tholo 1306: return i;
1307: different:;
1308: }
1309:
1310: /* add tmpset to current set of states */
1.27 millert 1311: ++(f->curstat);
1312: resize_state(f, f->curstat);
1.1 tholo 1313: for (i = 0; i < NCHARS; i++)
1314: f->gototab[f->curstat][i] = 0;
1315: xfree(f->posns[f->curstat]);
1.27 millert 1316: p = intalloc(setcnt + 1, __func__);
1.1 tholo 1317:
1318: f->posns[f->curstat] = p;
1.30 millert 1319: if (c != HAT)
1320: f->gototab[s][c] = f->curstat;
1.1 tholo 1321: for (i = 0; i <= setcnt; i++)
1322: p[i] = tmpset[i];
1323: if (setvec[f->accept])
1324: f->out[f->curstat] = 1;
1325: else
1326: f->out[f->curstat] = 0;
1327: return f->curstat;
1328: }
1329:
1330:
1331: void freefa(fa *f) /* free a finite automaton */
1332: {
1333: int i;
1334:
1335: if (f == NULL)
1336: return;
1.27 millert 1337: for (i = 0; i < f->state_count; i++)
1338: xfree(f->gototab[i])
1.1 tholo 1339: for (i = 0; i <= f->curstat; i++)
1340: xfree(f->posns[i]);
1341: for (i = 0; i <= f->accept; i++) {
1342: xfree(f->re[i].lfollow);
1343: if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
1.30 millert 1344: xfree(f->re[i].lval.np);
1.1 tholo 1345: }
1346: xfree(f->restr);
1.27 millert 1347: xfree(f->out);
1348: xfree(f->posns);
1349: xfree(f->gototab);
1.1 tholo 1350: xfree(f);
1351: }