Annotation of src/usr.bin/awk/b.c, Revision 1.11
1.11 ! millert 1: /* $OpenBSD: b.c,v 1.10 2001/09/08 00:12:40 millert Exp $ */
1.1 tholo 2: /****************************************************************
1.5 kstailey 3: Copyright (C) Lucent Technologies 1997
1.1 tholo 4: All Rights Reserved
5:
6: Permission to use, copy, modify, and distribute this software and
7: its documentation for any purpose and without fee is hereby
8: granted, provided that the above copyright notice appear in all
9: copies and that both that the copyright notice and this
10: permission notice and warranty disclaimer appear in supporting
1.5 kstailey 11: documentation, and that the name Lucent Technologies or any of
12: its entities not be used in advertising or publicity pertaining
13: to distribution of the software without specific, written prior
14: permission.
15:
16: LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17: INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18: IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19: SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20: WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21: IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22: ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23: THIS SOFTWARE.
1.1 tholo 24: ****************************************************************/
25:
26: /* lasciate ogne speranza, voi ch'entrate. */
27:
28: #define DEBUG
29:
30: #include <ctype.h>
31: #include <stdio.h>
32: #include <string.h>
33: #include <stdlib.h>
34: #include "awk.h"
1.5 kstailey 35: #include "ytab.h"
1.1 tholo 36:
1.10 millert 37: #define HAT (NCHARS-2) /* matches ^ in regular expr */
1.1 tholo 38: /* NCHARS is 2**n */
39: #define MAXLIN 22
40:
1.7 millert 41: #define type(v) (v)->nobj /* badly overloaded here */
42: #define info(v) (v)->ntype /* badly overloaded here */
1.1 tholo 43: #define left(v) (v)->narg[0]
44: #define right(v) (v)->narg[1]
45: #define parent(v) (v)->nnext
46:
47: #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
48: #define UNARY case STAR: case PLUS: case QUEST:
49:
50: /* encoding in tree Nodes:
51: leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL):
52: left is index, right contains value or pointer to value
53: unary (STAR, PLUS, QUEST): left is child, right is null
54: binary (CAT, OR): left and right are children
55: parent contains pointer to parent
56: */
57:
58:
59: int *setvec;
60: int *tmpset;
61: int maxsetvec = 0;
62:
63: int rtok; /* next token in current re */
1.4 millert 64: int rlxval;
1.10 millert 65: static uschar *rlxstr;
66: static uschar *prestr; /* current position in current re */
67: static uschar *lastre; /* origin of last re */
1.1 tholo 68:
1.4 millert 69: static int setcnt;
70: static int poscnt;
1.1 tholo 71:
72: char *patbeg;
73: int patlen;
74:
75: #define NFA 20 /* cache this many dynamic fa's */
76: fa *fatab[NFA];
77: int nfatab = 0; /* entries in fatab */
78:
1.11 ! millert 79: fa *makedfa(const char *s, int anchor) /* returns dfa for reg expr s */
1.1 tholo 80: {
81: int i, use, nuse;
82: fa *pfa;
1.6 millert 83: static int now = 1;
1.1 tholo 84:
85: if (setvec == 0) { /* first time through any RE */
86: maxsetvec = MAXLIN;
87: setvec = (int *) malloc(maxsetvec * sizeof(int));
88: tmpset = (int *) malloc(maxsetvec * sizeof(int));
89: if (setvec == 0 || tmpset == 0)
90: overflo("out of space initializing makedfa");
91: }
92:
93: if (compile_time) /* a constant for sure */
94: return mkdfa(s, anchor);
95: for (i = 0; i < nfatab; i++) /* is it there already? */
96: if (fatab[i]->anchor == anchor
1.11 ! millert 97: && strcmp((const char *) fatab[i]->restr, s) == 0) {
1.6 millert 98: fatab[i]->use = now++;
1.1 tholo 99: return fatab[i];
1.10 millert 100: }
1.1 tholo 101: pfa = mkdfa(s, anchor);
102: if (nfatab < NFA) { /* room for another */
103: fatab[nfatab] = pfa;
1.6 millert 104: fatab[nfatab]->use = now++;
1.1 tholo 105: nfatab++;
106: return pfa;
107: }
108: use = fatab[0]->use; /* replace least-recently used */
109: nuse = 0;
110: for (i = 1; i < nfatab; i++)
111: if (fatab[i]->use < use) {
112: use = fatab[i]->use;
113: nuse = i;
114: }
115: freefa(fatab[nuse]);
116: fatab[nuse] = pfa;
1.6 millert 117: pfa->use = now++;
1.1 tholo 118: return pfa;
119: }
120:
1.11 ! millert 121: fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */
1.1 tholo 122: /* anchor = 1 for anchored matches, else 0 */
123: {
124: Node *p, *p1;
125: fa *f;
126:
127: p = reparse(s);
128: p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
129: /* put ALL STAR in front of reg. exp. */
130: p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
131: /* put FINAL after reg. exp. */
132:
133: poscnt = 0;
134: penter(p1); /* enter parent pointers and leaf indices */
135: if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL)
136: overflo("out of space for fa");
137: f->accept = poscnt-1; /* penter has computed number of positions in re */
138: cfoll(f, p1); /* set up follow sets */
139: freetr(p1);
1.4 millert 140: if ((f->posns[0] = (int *) calloc(1, *(f->re[0].lfollow)*sizeof(int))) == NULL)
1.1 tholo 141: overflo("out of space in makedfa");
1.4 millert 142: if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL)
1.1 tholo 143: overflo("out of space in makedfa");
144: *f->posns[1] = 0;
145: f->initstat = makeinit(f, anchor);
146: f->anchor = anchor;
1.10 millert 147: f->restr = (uschar *) tostring(s);
1.1 tholo 148: return f;
149: }
150:
151: int makeinit(fa *f, int anchor)
152: {
153: int i, k;
154:
155: f->curstat = 2;
156: f->out[2] = 0;
157: f->reset = 0;
158: k = *(f->re[0].lfollow);
159: xfree(f->posns[2]);
1.4 millert 160: if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
1.1 tholo 161: overflo("out of space in makeinit");
162: for (i=0; i <= k; i++) {
163: (f->posns[2])[i] = (f->re[0].lfollow)[i];
164: }
165: if ((f->posns[2])[1] == f->accept)
166: f->out[2] = 1;
167: for (i=0; i < NCHARS; i++)
168: f->gototab[2][i] = 0;
169: f->curstat = cgoto(f, 2, HAT);
170: if (anchor) {
171: *f->posns[2] = k-1; /* leave out position 0 */
172: for (i=0; i < k; i++) {
173: (f->posns[0])[i] = (f->posns[2])[i];
174: }
175:
176: f->out[0] = f->out[2];
177: if (f->curstat != 2)
178: --(*f->posns[f->curstat]);
179: }
180: return f->curstat;
181: }
182:
183: void penter(Node *p) /* set up parent pointers and leaf indices */
184: {
185: switch (type(p)) {
186: LEAF
1.7 millert 187: info(p) = poscnt;
1.1 tholo 188: poscnt++;
189: break;
190: UNARY
191: penter(left(p));
192: parent(left(p)) = p;
193: break;
194: case CAT:
195: case OR:
196: penter(left(p));
197: penter(right(p));
198: parent(left(p)) = p;
199: parent(right(p)) = p;
200: break;
201: default: /* can't happen */
1.8 millert 202: FATAL("can't happen: unknown type %d in penter", type(p));
1.1 tholo 203: break;
204: }
205: }
206:
207: void freetr(Node *p) /* free parse tree */
208: {
209: switch (type(p)) {
210: LEAF
211: xfree(p);
212: break;
213: UNARY
214: freetr(left(p));
215: xfree(p);
216: break;
217: case CAT:
218: case OR:
219: freetr(left(p));
220: freetr(right(p));
221: xfree(p);
222: break;
223: default: /* can't happen */
1.8 millert 224: FATAL("can't happen: unknown type %d in freetr", type(p));
1.1 tholo 225: break;
226: }
227: }
228:
229: /* in the parsing of regular expressions, metacharacters like . have */
230: /* to be seen literally; \056 is not a metacharacter. */
231:
232: int hexstr(char **pp) /* find and eval hex string at pp, return new p */
1.2 millert 233: { /* only pick up one 8-bit byte (2 chars) */
1.10 millert 234: uschar *p;
1.1 tholo 235: int n = 0;
1.2 millert 236: int i;
1.1 tholo 237:
1.10 millert 238: for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) {
1.1 tholo 239: if (isdigit(*p))
240: n = 16 * n + *p - '0';
241: else if (*p >= 'a' && *p <= 'f')
242: n = 16 * n + *p - 'a' + 10;
243: else if (*p >= 'A' && *p <= 'F')
244: n = 16 * n + *p - 'A' + 10;
245: }
1.10 millert 246: *pp = (char *) p;
1.1 tholo 247: return n;
248: }
249:
1.2 millert 250: #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
1.1 tholo 251:
252: int quoted(char **pp) /* pick up next thing after a \\ */
253: /* and increment *pp */
254: {
255: char *p = *pp;
256: int c;
257:
258: if ((c = *p++) == 't')
259: c = '\t';
260: else if (c == 'n')
261: c = '\n';
262: else if (c == 'f')
263: c = '\f';
264: else if (c == 'r')
265: c = '\r';
266: else if (c == 'b')
267: c = '\b';
268: else if (c == '\\')
269: c = '\\';
270: else if (c == 'x') { /* hexadecimal goo follows */
271: c = hexstr(&p); /* this adds a null if number is invalid */
272: } else if (isoctdigit(c)) { /* \d \dd \ddd */
273: int n = c - '0';
274: if (isoctdigit(*p)) {
275: n = 8 * n + *p++ - '0';
276: if (isoctdigit(*p))
277: n = 8 * n + *p++ - '0';
278: }
279: c = n;
280: } /* else */
281: /* c = c; */
282: *pp = p;
283: return c;
284: }
285:
1.11 ! millert 286: char *cclenter(const char *argp) /* add a character class */
1.1 tholo 287: {
288: int i, c, c2;
1.10 millert 289: uschar *p = (uschar *) argp;
290: uschar *op, *bp;
291: static uschar *buf = 0;
1.5 kstailey 292: static int bufsz = 100;
1.1 tholo 293:
294: op = p;
1.10 millert 295: if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
1.8 millert 296: FATAL("out of space for character class [%.10s...] 1", p);
1.5 kstailey 297: bp = buf;
298: for (i = 0; (c = *p++) != 0; ) {
1.1 tholo 299: if (c == '\\') {
1.10 millert 300: c = quoted((char **) &p);
1.5 kstailey 301: } else if (c == '-' && i > 0 && bp[-1] != 0) {
1.1 tholo 302: if (*p != 0) {
1.5 kstailey 303: c = bp[-1];
1.1 tholo 304: c2 = *p++;
305: if (c2 == '\\')
1.10 millert 306: c2 = quoted((char **) &p);
1.1 tholo 307: if (c > c2) { /* empty; ignore */
1.5 kstailey 308: bp--;
1.1 tholo 309: i--;
310: continue;
311: }
312: while (c < c2) {
1.10 millert 313: if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, 0))
1.8 millert 314: FATAL("out of space for character class [%.10s...] 2", p);
1.5 kstailey 315: *bp++ = ++c;
1.1 tholo 316: i++;
317: }
318: continue;
319: }
320: }
1.10 millert 321: if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, 0))
1.8 millert 322: FATAL("out of space for character class [%.10s...] 3", p);
1.5 kstailey 323: *bp++ = c;
1.1 tholo 324: i++;
325: }
1.5 kstailey 326: *bp = 0;
327: dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
1.1 tholo 328: xfree(op);
1.10 millert 329: return (char *) tostring((char *) buf);
1.1 tholo 330: }
331:
1.11 ! millert 332: void overflo(const char *s)
1.1 tholo 333: {
1.8 millert 334: FATAL("regular expression too big: %.30s...", s);
1.1 tholo 335: }
336:
337: void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
338: {
339: int i;
1.4 millert 340: int *p;
1.1 tholo 341:
342: switch (type(v)) {
343: LEAF
1.7 millert 344: f->re[info(v)].ltype = type(v);
345: f->re[info(v)].lval.np = right(v);
1.1 tholo 346: while (f->accept >= maxsetvec) { /* guessing here! */
347: maxsetvec *= 4;
348: setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
349: tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
1.5 kstailey 350: if (setvec == 0 || tmpset == 0)
1.1 tholo 351: overflo("out of space in cfoll()");
352: }
353: for (i = 0; i <= f->accept; i++)
354: setvec[i] = 0;
355: setcnt = 0;
356: follow(v); /* computes setvec and setcnt */
1.4 millert 357: if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
1.1 tholo 358: overflo("out of space building follow set");
1.7 millert 359: f->re[info(v)].lfollow = p;
1.1 tholo 360: *p = setcnt;
361: for (i = f->accept; i >= 0; i--)
362: if (setvec[i] == 1)
363: *++p = i;
364: break;
365: UNARY
366: cfoll(f,left(v));
367: break;
368: case CAT:
369: case OR:
370: cfoll(f,left(v));
371: cfoll(f,right(v));
372: break;
373: default: /* can't happen */
1.8 millert 374: FATAL("can't happen: unknown type %d in cfoll", type(v));
1.1 tholo 375: }
376: }
377:
378: int first(Node *p) /* collects initially active leaves of p into setvec */
379: /* returns 1 if p matches empty string */
380: {
1.4 millert 381: int b, lp;
1.1 tholo 382:
383: switch (type(p)) {
384: LEAF
1.7 millert 385: lp = info(p); /* look for high-water mark of subscripts */
1.1 tholo 386: while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
387: maxsetvec *= 4;
388: setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
389: tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
1.7 millert 390: if (setvec == 0 || tmpset == 0)
1.1 tholo 391: overflo("out of space in first()");
392: }
393: if (setvec[lp] != 1) {
394: setvec[lp] = 1;
395: setcnt++;
396: }
397: if (type(p) == CCL && (*(char *) right(p)) == '\0')
398: return(0); /* empty CCL */
399: else return(1);
400: case PLUS:
401: if (first(left(p)) == 0) return(0);
402: return(1);
403: case STAR:
404: case QUEST:
405: first(left(p));
406: return(0);
407: case CAT:
408: if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
409: return(1);
410: case OR:
411: b = first(right(p));
412: if (first(left(p)) == 0 || b == 0) return(0);
413: return(1);
414: }
1.8 millert 415: FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
1.1 tholo 416: return(-1);
417: }
418:
419: void follow(Node *v) /* collects leaves that can follow v into setvec */
420: {
421: Node *p;
422:
423: if (type(v) == FINAL)
424: return;
425: p = parent(v);
426: switch (type(p)) {
427: case STAR:
428: case PLUS:
429: first(v);
430: follow(p);
431: return;
432:
433: case OR:
434: case QUEST:
435: follow(p);
436: return;
437:
438: case CAT:
439: if (v == left(p)) { /* v is left child of p */
440: if (first(right(p)) == 0) {
441: follow(p);
442: return;
443: }
444: } else /* v is right child */
445: follow(p);
446: return;
447: }
448: }
449:
1.11 ! millert 450: int member(int c, const char *sarg) /* is c in s? */
1.1 tholo 451: {
1.10 millert 452: uschar *s = (uschar *) sarg;
453:
1.1 tholo 454: while (*s)
455: if (c == *s++)
456: return(1);
457: return(0);
458: }
459:
1.11 ! millert 460: int match(fa *f, const char *p0) /* shortest match ? */
1.1 tholo 461: {
462: int s, ns;
463: uschar *p = (uschar *) p0;
464:
465: s = f->reset ? makeinit(f,0) : f->initstat;
466: if (f->out[s])
467: return(1);
468: do {
469: if ((ns = f->gototab[s][*p]) != 0)
470: s = ns;
471: else
472: s = cgoto(f, s, *p);
473: if (f->out[s])
474: return(1);
475: } while (*p++ != 0);
476: return(0);
477: }
478:
1.11 ! millert 479: int pmatch(fa *f, const char *p0) /* longest match, for sub */
1.1 tholo 480: {
481: int s, ns;
482: uschar *p = (uschar *) p0;
483: uschar *q;
484: int i, k;
485:
486: s = f->reset ? makeinit(f,1) : f->initstat;
487: patbeg = (char *) p;
488: patlen = -1;
489: do {
490: q = p;
491: do {
492: if (f->out[s]) /* final state */
493: patlen = q-p;
494: if ((ns = f->gototab[s][*q]) != 0)
495: s = ns;
496: else
497: s = cgoto(f, s, *q);
1.9 deraadt 498: if (s == 1) { /* no transition */
1.1 tholo 499: if (patlen >= 0) {
500: patbeg = (char *) p;
501: return(1);
502: }
503: else
504: goto nextin; /* no match */
1.9 deraadt 505: }
1.1 tholo 506: } while (*q++ != 0);
507: if (f->out[s])
508: patlen = q-p-1; /* don't count $ */
509: if (patlen >= 0) {
510: patbeg = (char *) p;
511: return(1);
512: }
513: nextin:
514: s = 2;
515: if (f->reset) {
516: for (i = 2; i <= f->curstat; i++)
517: xfree(f->posns[i]);
518: k = *f->posns[0];
1.4 millert 519: if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
1.1 tholo 520: overflo("out of space in pmatch");
521: for (i = 0; i <= k; i++)
522: (f->posns[2])[i] = (f->posns[0])[i];
523: f->initstat = f->curstat = 2;
524: f->out[2] = f->out[0];
525: for (i = 0; i < NCHARS; i++)
526: f->gototab[2][i] = 0;
527: }
528: } while (*p++ != 0);
529: return (0);
530: }
531:
1.11 ! millert 532: int nematch(fa *f, const char *p0) /* non-empty match, for sub */
1.1 tholo 533: {
534: int s, ns;
535: uschar *p = (uschar *) p0;
536: uschar *q;
537: int i, k;
538:
539: s = f->reset ? makeinit(f,1) : f->initstat;
540: patlen = -1;
541: while (*p) {
542: q = p;
543: do {
544: if (f->out[s]) /* final state */
545: patlen = q-p;
546: if ((ns = f->gototab[s][*q]) != 0)
547: s = ns;
548: else
549: s = cgoto(f, s, *q);
1.9 deraadt 550: if (s == 1) { /* no transition */
1.1 tholo 551: if (patlen > 0) {
552: patbeg = (char *) p;
553: return(1);
554: } else
555: goto nnextin; /* no nonempty match */
1.9 deraadt 556: }
1.1 tholo 557: } while (*q++ != 0);
558: if (f->out[s])
559: patlen = q-p-1; /* don't count $ */
560: if (patlen > 0 ) {
561: patbeg = (char *) p;
562: return(1);
563: }
564: nnextin:
565: s = 2;
566: if (f->reset) {
567: for (i = 2; i <= f->curstat; i++)
568: xfree(f->posns[i]);
569: k = *f->posns[0];
1.4 millert 570: if ((f->posns[2] = (int *) calloc(1, (k+1)*sizeof(int))) == NULL)
1.1 tholo 571: overflo("out of state space");
572: for (i = 0; i <= k; i++)
573: (f->posns[2])[i] = (f->posns[0])[i];
574: f->initstat = f->curstat = 2;
575: f->out[2] = f->out[0];
576: for (i = 0; i < NCHARS; i++)
577: f->gototab[2][i] = 0;
578: }
579: p++;
580: }
581: return (0);
582: }
583:
1.11 ! millert 584: Node *reparse(const char *p) /* parses regular expression pointed to by p */
1.1 tholo 585: { /* uses relex() to scan regular expression */
586: Node *np;
587:
588: dprintf( ("reparse <%s>\n", p) );
1.10 millert 589: lastre = prestr = (uschar *) p; /* prestr points to string to be parsed */
1.1 tholo 590: rtok = relex();
1.11 ! millert 591: /* GNU compatibility: an empty regexp matches anything */
1.1 tholo 592: if (rtok == '\0')
1.11 ! millert 593: /* FATAL("empty regular expression"); previous */
! 594: return(op2(ALL, NIL, NIL));
1.1 tholo 595: np = regexp();
596: if (rtok != '\0')
1.8 millert 597: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 598: return(np);
599: }
600:
601: Node *regexp(void) /* top-level parse of reg expr */
602: {
603: return (alt(concat(primary())));
604: }
605:
606: Node *primary(void)
607: {
608: Node *np;
609:
610: switch (rtok) {
611: case CHAR:
1.7 millert 612: np = op2(CHAR, NIL, itonp(rlxval));
1.1 tholo 613: rtok = relex();
614: return (unary(np));
615: case ALL:
616: rtok = relex();
617: return (unary(op2(ALL, NIL, NIL)));
618: case DOT:
619: rtok = relex();
620: return (unary(op2(DOT, NIL, NIL)));
621: case CCL:
1.10 millert 622: np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
1.1 tholo 623: rtok = relex();
624: return (unary(np));
625: case NCCL:
1.10 millert 626: np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
1.1 tholo 627: rtok = relex();
628: return (unary(np));
629: case '^':
630: rtok = relex();
1.7 millert 631: return (unary(op2(CHAR, NIL, itonp(HAT))));
1.1 tholo 632: case '$':
633: rtok = relex();
634: return (unary(op2(CHAR, NIL, NIL)));
635: case '(':
636: rtok = relex();
637: if (rtok == ')') { /* special pleading for () */
638: rtok = relex();
639: return unary(op2(CCL, NIL, (Node *) tostring("")));
640: }
641: np = regexp();
642: if (rtok == ')') {
643: rtok = relex();
644: return (unary(np));
645: }
646: else
1.8 millert 647: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 648: default:
1.8 millert 649: FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1.1 tholo 650: }
651: return 0; /*NOTREACHED*/
652: }
653:
654: Node *concat(Node *np)
655: {
656: switch (rtok) {
657: case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
658: return (concat(op2(CAT, np, primary())));
659: }
660: return (np);
661: }
662:
663: Node *alt(Node *np)
664: {
665: if (rtok == OR) {
666: rtok = relex();
667: return (alt(op2(OR, np, concat(primary()))));
668: }
669: return (np);
670: }
671:
672: Node *unary(Node *np)
673: {
674: switch (rtok) {
675: case STAR:
676: rtok = relex();
677: return (unary(op2(STAR, np, NIL)));
678: case PLUS:
679: rtok = relex();
680: return (unary(op2(PLUS, np, NIL)));
681: case QUEST:
682: rtok = relex();
683: return (unary(op2(QUEST, np, NIL)));
684: default:
685: return (np);
686: }
687: }
688:
1.11 ! millert 689: /*
! 690: * Character class definitions conformant to the POSIX locale as
! 691: * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
! 692: * and operating character sets are both ASCII (ISO646) or supersets
! 693: * thereof.
! 694: *
! 695: * Note that to avoid overflowing the temporary buffer used in
! 696: * relex(), the expanded character class (prior to range expansion)
! 697: * must be less than twice the size of their full name.
! 698: */
! 699: struct charclass {
! 700: const char *cc_name;
! 701: int cc_namelen;
! 702: const char *cc_expand;
! 703: } charclasses[] = {
! 704: { "alnum", 5, "0-9A-Za-z" },
! 705: { "alpha", 5, "A-Za-z" },
! 706: { "blank", 5, " \t" },
! 707: { "cntrl", 5, "\000-\037\177" },
! 708: { "digit", 5, "0-9" },
! 709: { "graph", 5, "\041-\176" },
! 710: { "lower", 5, "a-z" },
! 711: { "print", 5, " \041-\176" },
! 712: { "punct", 5, "\041-\057\072-\100\133-\140\173-\176" },
! 713: { "space", 5, " \f\n\r\t\v" },
! 714: { "upper", 5, "A-Z" },
! 715: { "xdigit", 6, "0-9A-Fa-f" },
! 716: { NULL, 0, NULL },
! 717: };
! 718:
! 719:
1.1 tholo 720: int relex(void) /* lexical analyzer for reparse */
721: {
1.5 kstailey 722: int c, n;
1.1 tholo 723: int cflag;
1.10 millert 724: static uschar *buf = 0;
1.5 kstailey 725: static int bufsz = 100;
1.10 millert 726: uschar *bp;
1.11 ! millert 727: struct charclass *cc;
! 728: const uschar *p;
1.1 tholo 729:
730: switch (c = *prestr++) {
731: case '|': return OR;
732: case '*': return STAR;
733: case '+': return PLUS;
734: case '?': return QUEST;
735: case '.': return DOT;
736: case '\0': prestr--; return '\0';
737: case '^':
738: case '$':
739: case '(':
740: case ')':
741: return c;
742: case '\\':
1.10 millert 743: rlxval = quoted((char **) &prestr);
1.1 tholo 744: return CHAR;
745: default:
746: rlxval = c;
747: return CHAR;
748: case '[':
1.10 millert 749: if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
1.8 millert 750: FATAL("out of space in reg expr %.10s..", lastre);
1.5 kstailey 751: bp = buf;
1.1 tholo 752: if (*prestr == '^') {
753: cflag = 1;
754: prestr++;
755: }
756: else
757: cflag = 0;
1.11 ! millert 758: n = 2 * strlen((const char *) prestr)+1;
1.10 millert 759: if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, 0))
1.8 millert 760: FATAL("out of space for reg expr %.10s...", lastre);
1.1 tholo 761: for (; ; ) {
762: if ((c = *prestr++) == '\\') {
1.5 kstailey 763: *bp++ = '\\';
1.1 tholo 764: if ((c = *prestr++) == '\0')
1.8 millert 765: FATAL("nonterminated character class %.20s...", lastre);
1.5 kstailey 766: *bp++ = c;
1.10 millert 767: /* } else if (c == '\n') { */
768: /* FATAL("newline in character class %.20s...", lastre); */
1.11 ! millert 769: } else if (c == '[' && *prestr == ':') {
! 770: /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
! 771: for (cc = charclasses; cc->cc_name; cc++)
! 772: if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
! 773: break;
! 774: if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
! 775: prestr[2 + cc->cc_namelen] == ']') {
! 776: prestr += cc->cc_namelen + 3;
! 777: for (p = (const uschar *) cc->cc_expand; *p; p++)
! 778: *bp++ = *p;
! 779: } else
! 780: *bp++ = c;
1.1 tholo 781: } else if (c == '\0') {
1.8 millert 782: FATAL("nonterminated character class %.20s", lastre);
1.5 kstailey 783: } else if (bp == buf) { /* 1st char is special */
784: *bp++ = c;
1.1 tholo 785: } else if (c == ']') {
1.5 kstailey 786: *bp++ = 0;
1.10 millert 787: rlxstr = (uschar *) tostring((char *) buf);
1.1 tholo 788: if (cflag == 0)
789: return CCL;
790: else
791: return NCCL;
792: } else
1.5 kstailey 793: *bp++ = c;
1.1 tholo 794: }
795: }
796: }
797:
798: int cgoto(fa *f, int s, int c)
799: {
800: int i, j, k;
1.4 millert 801: int *p, *q;
1.1 tholo 802:
1.10 millert 803: if (c < 0 || c > 255)
1.8 millert 804: FATAL("can't happen: neg char %d in cgoto", c);
1.1 tholo 805: while (f->accept >= maxsetvec) { /* guessing here! */
806: maxsetvec *= 4;
807: setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
808: tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
1.7 millert 809: if (setvec == 0 || tmpset == 0)
1.1 tholo 810: overflo("out of space in cgoto()");
811: }
812: for (i = 0; i <= f->accept; i++)
813: setvec[i] = 0;
814: setcnt = 0;
815: /* compute positions of gototab[s,c] into setvec */
816: p = f->posns[s];
817: for (i = 1; i <= *p; i++) {
818: if ((k = f->re[p[i]].ltype) != FINAL) {
1.7 millert 819: if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1.1 tholo 820: || (k == DOT && c != 0 && c != HAT)
821: || (k == ALL && c != 0)
1.10 millert 822: || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
823: || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
1.1 tholo 824: q = f->re[p[i]].lfollow;
825: for (j = 1; j <= *q; j++) {
826: if (q[j] >= maxsetvec) {
827: maxsetvec *= 4;
828: setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
829: tmpset = (int *) realloc(setvec, maxsetvec * sizeof(int));
830: if (setvec == 0 || tmpset == 0)
831: overflo("cgoto overflow");
832: }
833: if (setvec[q[j]] == 0) {
834: setcnt++;
835: setvec[q[j]] = 1;
836: }
837: }
838: }
839: }
840: }
841: /* determine if setvec is a previous state */
842: tmpset[0] = setcnt;
843: j = 1;
844: for (i = f->accept; i >= 0; i--)
845: if (setvec[i]) {
846: tmpset[j++] = i;
847: }
848: /* tmpset == previous state? */
849: for (i = 1; i <= f->curstat; i++) {
850: p = f->posns[i];
851: if ((k = tmpset[0]) != p[0])
852: goto different;
853: for (j = 1; j <= k; j++)
854: if (tmpset[j] != p[j])
855: goto different;
856: /* setvec is state i */
857: f->gototab[s][c] = i;
858: return i;
859: different:;
860: }
861:
862: /* add tmpset to current set of states */
863: if (f->curstat >= NSTATES-1) {
864: f->curstat = 2;
865: f->reset = 1;
866: for (i = 2; i < NSTATES; i++)
867: xfree(f->posns[i]);
868: } else
869: ++(f->curstat);
870: for (i = 0; i < NCHARS; i++)
871: f->gototab[f->curstat][i] = 0;
872: xfree(f->posns[f->curstat]);
1.4 millert 873: if ((p = (int *) calloc(1, (setcnt+1)*sizeof(int))) == NULL)
1.1 tholo 874: overflo("out of space in cgoto");
875:
876: f->posns[f->curstat] = p;
877: f->gototab[s][c] = f->curstat;
878: for (i = 0; i <= setcnt; i++)
879: p[i] = tmpset[i];
880: if (setvec[f->accept])
881: f->out[f->curstat] = 1;
882: else
883: f->out[f->curstat] = 0;
884: return f->curstat;
885: }
886:
887:
888: void freefa(fa *f) /* free a finite automaton */
889: {
890: int i;
891:
892: if (f == NULL)
893: return;
894: for (i = 0; i <= f->curstat; i++)
895: xfree(f->posns[i]);
896: for (i = 0; i <= f->accept; i++) {
897: xfree(f->re[i].lfollow);
898: if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
899: xfree((f->re[i].lval.np));
900: }
901: xfree(f->restr);
902: xfree(f);
903: }