Annotation of src/usr.bin/awk/b.c, Revision 1.13
1.13 ! pvalchev 1: /* $OpenBSD: b.c,v 1.12 2004/12/30 01:52:48 millert Exp $ */
1.1 tholo 2: /****************************************************************
1.5 kstailey 3: Copyright (C) Lucent Technologies 1997
1.1 tholo 4: All Rights Reserved
5:
6: Permission to use, copy, modify, and distribute this software and
7: its documentation for any purpose and without fee is hereby
8: granted, provided that the above copyright notice appear in all
9: copies and that both that the copyright notice and this
10: permission notice and warranty disclaimer appear in supporting
1.5 kstailey 11: documentation, and that the name Lucent Technologies or any of
12: its entities not be used in advertising or publicity pertaining
13: to distribution of the software without specific, written prior
14: permission.
15:
16: LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17: INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18: IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19: SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20: WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21: IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22: ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23: THIS SOFTWARE.
1.1 tholo 24: ****************************************************************/
25:
26: /* lasciate ogne speranza, voi ch'entrate. */
27:
28: #define DEBUG
29:
30: #include <ctype.h>
31: #include <stdio.h>
32: #include <string.h>
33: #include <stdlib.h>
34: #include "awk.h"
1.5 kstailey 35: #include "ytab.h"
1.1 tholo 36:
1.12 millert 37: #define HAT (NCHARS+2) /* matches ^ in regular expr */
1.1 tholo 38: /* NCHARS is 2**n */
39: #define MAXLIN 22
40:
1.7 millert 41: #define type(v) (v)->nobj /* badly overloaded here */
42: #define info(v) (v)->ntype /* badly overloaded here */
1.1 tholo 43: #define left(v) (v)->narg[0]
44: #define right(v) (v)->narg[1]
45: #define parent(v) (v)->nnext
46:
47: #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
48: #define UNARY case STAR: case PLUS: case QUEST:
49:
50: /* encoding in tree Nodes:
51: leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL):
52: left is index, right contains value or pointer to value
53: unary (STAR, PLUS, QUEST): left is child, right is null
54: binary (CAT, OR): left and right are children
55: parent contains pointer to parent
56: */
57:
58:
59: int *setvec;
60: int *tmpset;
61: int maxsetvec = 0;
62:
63: int rtok; /* next token in current re */
1.4 millert 64: int rlxval;
1.10 millert 65: static uschar *rlxstr;
66: static uschar *prestr; /* current position in current re */
67: static uschar *lastre; /* origin of last re */
1.1 tholo 68:
1.4 millert 69: static int setcnt;
70: static int poscnt;
1.1 tholo 71:
72: char *patbeg;
73: int patlen;
74:
75: #define NFA 20 /* cache this many dynamic fa's */
76: fa *fatab[NFA];
77: int nfatab = 0; /* entries in fatab */
78:
1.11 millert 79: fa *makedfa(const char *s, int anchor) /* returns dfa for reg expr s */
1.1 tholo 80: {
81: int i, use, nuse;
82: fa *pfa;
1.6 millert 83: static int now = 1;
1.1 tholo 84:
85: if (setvec == 0) { /* first time through any RE */
86: maxsetvec = MAXLIN;
87: setvec = (int *) malloc(maxsetvec * sizeof(int));
88: tmpset = (int *) malloc(maxsetvec * sizeof(int));
89: if (setvec == 0 || tmpset == 0)
90: overflo("out of space initializing makedfa");
91: }
92:
93: if (compile_time) /* a constant for sure */
94: return mkdfa(s, anchor);
95: for (i = 0; i < nfatab; i++) /* is it there already? */
96: if (fatab[i]->anchor == anchor
1.11 millert 97: && strcmp((const char *) fatab[i]->restr, s) == 0) {
1.6 millert 98: fatab[i]->use = now++;
1.1 tholo 99: return fatab[i];
1.10 millert 100: }
1.1 tholo 101: pfa = mkdfa(s, anchor);
102: if (nfatab < NFA) { /* room for another */
103: fatab[nfatab] = pfa;
1.6 millert 104: fatab[nfatab]->use = now++;
1.1 tholo 105: nfatab++;
106: return pfa;
107: }
108: use = fatab[0]->use; /* replace least-recently used */
109: nuse = 0;
110: for (i = 1; i < nfatab; i++)
111: if (fatab[i]->use < use) {
112: use = fatab[i]->use;
113: nuse = i;
114: }
115: freefa(fatab[nuse]);
116: fatab[nuse] = pfa;
1.6 millert 117: pfa->use = now++;
1.1 tholo 118: return pfa;
119: }
120:
1.11 millert 121: fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */
1.1 tholo 122: /* anchor = 1 for anchored matches, else 0 */
123: {
124: Node *p, *p1;
125: fa *f;
126:
127: p = reparse(s);
128: p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
129: /* put ALL STAR in front of reg. exp. */
130: p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
131: /* put FINAL after reg. exp. */
132:
133: poscnt = 0;
134: penter(p1); /* enter parent pointers and leaf indices */
135: if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL)
136: overflo("out of space for fa");
137: f->accept = poscnt-1; /* penter has computed number of positions in re */
138: cfoll(f, p1); /* set up follow sets */
139: freetr(p1);
1.13 ! pvalchev 140: if ((f->posns[0] = (int *) calloc(*(f->re[0].lfollow), sizeof(int))) == NULL)
1.1 tholo 141: overflo("out of space in makedfa");
1.4 millert 142: if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL)
1.1 tholo 143: overflo("out of space in makedfa");
144: *f->posns[1] = 0;
145: f->initstat = makeinit(f, anchor);
146: f->anchor = anchor;
1.10 millert 147: f->restr = (uschar *) tostring(s);
1.1 tholo 148: return f;
149: }
150:
151: int makeinit(fa *f, int anchor)
152: {
153: int i, k;
154:
155: f->curstat = 2;
156: f->out[2] = 0;
157: f->reset = 0;
158: k = *(f->re[0].lfollow);
159: xfree(f->posns[2]);
1.13 ! pvalchev 160: if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL)
1.1 tholo 161: overflo("out of space in makeinit");
162: for (i=0; i <= k; i++) {
163: (f->posns[2])[i] = (f->re[0].lfollow)[i];
164: }
165: if ((f->posns[2])[1] == f->accept)
166: f->out[2] = 1;
167: for (i=0; i < NCHARS; i++)
168: f->gototab[2][i] = 0;
169: f->curstat = cgoto(f, 2, HAT);
170: if (anchor) {
171: *f->posns[2] = k-1; /* leave out position 0 */
172: for (i=0; i < k; i++) {
173: (f->posns[0])[i] = (f->posns[2])[i];
174: }
175:
176: f->out[0] = f->out[2];
177: if (f->curstat != 2)
178: --(*f->posns[f->curstat]);
179: }
180: return f->curstat;
181: }
182:
183: void penter(Node *p) /* set up parent pointers and leaf indices */
184: {
185: switch (type(p)) {
186: LEAF
1.7 millert 187: info(p) = poscnt;
1.1 tholo 188: poscnt++;
189: break;
190: UNARY
191: penter(left(p));
192: parent(left(p)) = p;
193: break;
194: case CAT:
195: case OR:
196: penter(left(p));
197: penter(right(p));
198: parent(left(p)) = p;
199: parent(right(p)) = p;
200: break;
201: default: /* can't happen */
1.8 millert 202: FATAL("can't happen: unknown type %d in penter", type(p));
1.1 tholo 203: break;
204: }
205: }
206:
207: void freetr(Node *p) /* free parse tree */
208: {
209: switch (type(p)) {
210: LEAF
211: xfree(p);
212: break;
213: UNARY
214: freetr(left(p));
215: xfree(p);
216: break;
217: case CAT:
218: case OR:
219: freetr(left(p));
220: freetr(right(p));
221: xfree(p);
222: break;
223: default: /* can't happen */
1.8 millert 224: FATAL("can't happen: unknown type %d in freetr", type(p));
1.1 tholo 225: break;
226: }
227: }
228:
229: /* in the parsing of regular expressions, metacharacters like . have */
230: /* to be seen literally; \056 is not a metacharacter. */
231:
232: int hexstr(char **pp) /* find and eval hex string at pp, return new p */
1.2 millert 233: { /* only pick up one 8-bit byte (2 chars) */
1.10 millert 234: uschar *p;
1.1 tholo 235: int n = 0;
1.2 millert 236: int i;
1.1 tholo 237:
1.10 millert 238: for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) {
1.1 tholo 239: if (isdigit(*p))
240: n = 16 * n + *p - '0';
241: else if (*p >= 'a' && *p <= 'f')
242: n = 16 * n + *p - 'a' + 10;
243: else if (*p >= 'A' && *p <= 'F')
244: n = 16 * n + *p - 'A' + 10;
245: }
1.10 millert 246: *pp = (char *) p;
1.1 tholo 247: return n;
248: }
249:
1.2 millert 250: #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
1.1 tholo 251:
252: int quoted(char **pp) /* pick up next thing after a \\ */
253: /* and increment *pp */
254: {
255: char *p = *pp;
256: int c;
257:
258: if ((c = *p++) == 't')
259: c = '\t';
260: else if (c == 'n')
261: c = '\n';
262: else if (c == 'f')
263: c = '\f';
264: else if (c == 'r')
265: c = '\r';
266: else if (c == 'b')
267: c = '\b';
268: else if (c == '\\')
269: c = '\\';
270: else if (c == 'x') { /* hexadecimal goo follows */
271: c = hexstr(&p); /* this adds a null if number is invalid */
272: } else if (isoctdigit(c)) { /* \d \dd \ddd */
273: int n = c - '0';
274: if (isoctdigit(*p)) {
275: n = 8 * n + *p++ - '0';
276: if (isoctdigit(*p))
277: n = 8 * n + *p++ - '0';
278: }
279: c = n;
280: } /* else */
281: /* c = c; */
282: *pp = p;
283: return c;
284: }
285:
1.11 millert 286: char *cclenter(const char *argp) /* add a character class */
1.1 tholo 287: {
288: int i, c, c2;
1.10 millert 289: uschar *p = (uschar *) argp;
290: uschar *op, *bp;
291: static uschar *buf = 0;
1.5 kstailey 292: static int bufsz = 100;
1.1 tholo 293:
294: op = p;
1.10 millert 295: if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
1.8 millert 296: FATAL("out of space for character class [%.10s...] 1", p);
1.5 kstailey 297: bp = buf;
298: for (i = 0; (c = *p++) != 0; ) {
1.1 tholo 299: if (c == '\\') {
1.10 millert 300: c = quoted((char **) &p);
1.5 kstailey 301: } else if (c == '-' && i > 0 && bp[-1] != 0) {
1.1 tholo 302: if (*p != 0) {
1.5 kstailey 303: c = bp[-1];
1.1 tholo 304: c2 = *p++;
305: if (c2 == '\\')
1.10 millert 306: c2 = quoted((char **) &p);
1.1 tholo 307: if (c > c2) { /* empty; ignore */
1.5 kstailey 308: bp--;
1.1 tholo 309: i--;
310: continue;
311: }
312: while (c < c2) {
1.10 millert 313: if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, 0))
1.8 millert 314: FATAL("out of space for character class [%.10s...] 2", p);
1.5 kstailey 315: *bp++ = ++c;
1.1 tholo 316: i++;
317: }
318: continue;
319: }
320: }
1.10 millert 321: if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, 0))
1.8 millert 322: FATAL("out of space for character class [%.10s...] 3", p);
1.5 kstailey 323: *bp++ = c;
1.1 tholo 324: i++;
325: }
1.5 kstailey 326: *bp = 0;
327: dprintf( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
1.1 tholo 328: xfree(op);
1.10 millert 329: return (char *) tostring((char *) buf);
1.1 tholo 330: }
331:
1.11 millert 332: void overflo(const char *s)
1.1 tholo 333: {
1.8 millert 334: FATAL("regular expression too big: %.30s...", s);
1.1 tholo 335: }
336:
337: void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
338: {
339: int i;
1.4 millert 340: int *p;
1.1 tholo 341:
342: switch (type(v)) {
343: LEAF
1.7 millert 344: f->re[info(v)].ltype = type(v);
345: f->re[info(v)].lval.np = right(v);
1.1 tholo 346: while (f->accept >= maxsetvec) { /* guessing here! */
347: maxsetvec *= 4;
348: setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
349: tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
1.5 kstailey 350: if (setvec == 0 || tmpset == 0)
1.1 tholo 351: overflo("out of space in cfoll()");
352: }
353: for (i = 0; i <= f->accept; i++)
354: setvec[i] = 0;
355: setcnt = 0;
356: follow(v); /* computes setvec and setcnt */
1.13 ! pvalchev 357: if ((p = (int *) calloc(setcnt+1, sizeof(int))) == NULL)
1.1 tholo 358: overflo("out of space building follow set");
1.7 millert 359: f->re[info(v)].lfollow = p;
1.1 tholo 360: *p = setcnt;
361: for (i = f->accept; i >= 0; i--)
362: if (setvec[i] == 1)
363: *++p = i;
364: break;
365: UNARY
366: cfoll(f,left(v));
367: break;
368: case CAT:
369: case OR:
370: cfoll(f,left(v));
371: cfoll(f,right(v));
372: break;
373: default: /* can't happen */
1.8 millert 374: FATAL("can't happen: unknown type %d in cfoll", type(v));
1.1 tholo 375: }
376: }
377:
378: int first(Node *p) /* collects initially active leaves of p into setvec */
379: /* returns 1 if p matches empty string */
380: {
1.4 millert 381: int b, lp;
1.1 tholo 382:
383: switch (type(p)) {
384: LEAF
1.7 millert 385: lp = info(p); /* look for high-water mark of subscripts */
1.1 tholo 386: while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
387: maxsetvec *= 4;
388: setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
389: tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
1.7 millert 390: if (setvec == 0 || tmpset == 0)
1.1 tholo 391: overflo("out of space in first()");
392: }
393: if (setvec[lp] != 1) {
394: setvec[lp] = 1;
395: setcnt++;
396: }
397: if (type(p) == CCL && (*(char *) right(p)) == '\0')
398: return(0); /* empty CCL */
399: else return(1);
400: case PLUS:
401: if (first(left(p)) == 0) return(0);
402: return(1);
403: case STAR:
404: case QUEST:
405: first(left(p));
406: return(0);
407: case CAT:
408: if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
409: return(1);
410: case OR:
411: b = first(right(p));
412: if (first(left(p)) == 0 || b == 0) return(0);
413: return(1);
414: }
1.8 millert 415: FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
1.1 tholo 416: return(-1);
417: }
418:
419: void follow(Node *v) /* collects leaves that can follow v into setvec */
420: {
421: Node *p;
422:
423: if (type(v) == FINAL)
424: return;
425: p = parent(v);
426: switch (type(p)) {
427: case STAR:
428: case PLUS:
429: first(v);
430: follow(p);
431: return;
432:
433: case OR:
434: case QUEST:
435: follow(p);
436: return;
437:
438: case CAT:
439: if (v == left(p)) { /* v is left child of p */
440: if (first(right(p)) == 0) {
441: follow(p);
442: return;
443: }
444: } else /* v is right child */
445: follow(p);
446: return;
447: }
448: }
449:
1.11 millert 450: int member(int c, const char *sarg) /* is c in s? */
1.1 tholo 451: {
1.10 millert 452: uschar *s = (uschar *) sarg;
453:
1.1 tholo 454: while (*s)
455: if (c == *s++)
456: return(1);
457: return(0);
458: }
459:
1.11 millert 460: int match(fa *f, const char *p0) /* shortest match ? */
1.1 tholo 461: {
462: int s, ns;
463: uschar *p = (uschar *) p0;
464:
465: s = f->reset ? makeinit(f,0) : f->initstat;
466: if (f->out[s])
467: return(1);
468: do {
1.12 millert 469: assert(*p < NCHARS);
1.1 tholo 470: if ((ns = f->gototab[s][*p]) != 0)
471: s = ns;
472: else
473: s = cgoto(f, s, *p);
474: if (f->out[s])
475: return(1);
476: } while (*p++ != 0);
477: return(0);
478: }
479:
1.11 millert 480: int pmatch(fa *f, const char *p0) /* longest match, for sub */
1.1 tholo 481: {
482: int s, ns;
483: uschar *p = (uschar *) p0;
484: uschar *q;
485: int i, k;
486:
1.12 millert 487: /* s = f->reset ? makeinit(f,1) : f->initstat; */
488: if (f->reset) {
489: f->initstat = s = makeinit(f,1);
490: } else {
491: s = f->initstat;
492: }
1.1 tholo 493: patbeg = (char *) p;
494: patlen = -1;
495: do {
496: q = p;
497: do {
498: if (f->out[s]) /* final state */
499: patlen = q-p;
1.12 millert 500: assert(*q < NCHARS);
1.1 tholo 501: if ((ns = f->gototab[s][*q]) != 0)
502: s = ns;
503: else
504: s = cgoto(f, s, *q);
1.9 deraadt 505: if (s == 1) { /* no transition */
1.1 tholo 506: if (patlen >= 0) {
507: patbeg = (char *) p;
508: return(1);
509: }
510: else
511: goto nextin; /* no match */
1.9 deraadt 512: }
1.1 tholo 513: } while (*q++ != 0);
514: if (f->out[s])
515: patlen = q-p-1; /* don't count $ */
516: if (patlen >= 0) {
517: patbeg = (char *) p;
518: return(1);
519: }
520: nextin:
521: s = 2;
522: if (f->reset) {
523: for (i = 2; i <= f->curstat; i++)
524: xfree(f->posns[i]);
525: k = *f->posns[0];
1.13 ! pvalchev 526: if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL)
1.1 tholo 527: overflo("out of space in pmatch");
528: for (i = 0; i <= k; i++)
529: (f->posns[2])[i] = (f->posns[0])[i];
530: f->initstat = f->curstat = 2;
531: f->out[2] = f->out[0];
532: for (i = 0; i < NCHARS; i++)
533: f->gototab[2][i] = 0;
534: }
535: } while (*p++ != 0);
536: return (0);
537: }
538:
1.11 millert 539: int nematch(fa *f, const char *p0) /* non-empty match, for sub */
1.1 tholo 540: {
541: int s, ns;
542: uschar *p = (uschar *) p0;
543: uschar *q;
544: int i, k;
545:
1.12 millert 546: /* s = f->reset ? makeinit(f,1) : f->initstat; */
547: if (f->reset) {
548: f->initstat = s = makeinit(f,1);
549: } else {
550: s = f->initstat;
551: }
1.1 tholo 552: patlen = -1;
553: while (*p) {
554: q = p;
555: do {
556: if (f->out[s]) /* final state */
557: patlen = q-p;
1.12 millert 558: assert(*q < NCHARS);
1.1 tholo 559: if ((ns = f->gototab[s][*q]) != 0)
560: s = ns;
561: else
562: s = cgoto(f, s, *q);
1.9 deraadt 563: if (s == 1) { /* no transition */
1.1 tholo 564: if (patlen > 0) {
565: patbeg = (char *) p;
566: return(1);
567: } else
568: goto nnextin; /* no nonempty match */
1.9 deraadt 569: }
1.1 tholo 570: } while (*q++ != 0);
571: if (f->out[s])
572: patlen = q-p-1; /* don't count $ */
573: if (patlen > 0 ) {
574: patbeg = (char *) p;
575: return(1);
576: }
577: nnextin:
578: s = 2;
579: if (f->reset) {
580: for (i = 2; i <= f->curstat; i++)
581: xfree(f->posns[i]);
582: k = *f->posns[0];
1.13 ! pvalchev 583: if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL)
1.1 tholo 584: overflo("out of state space");
585: for (i = 0; i <= k; i++)
586: (f->posns[2])[i] = (f->posns[0])[i];
587: f->initstat = f->curstat = 2;
588: f->out[2] = f->out[0];
589: for (i = 0; i < NCHARS; i++)
590: f->gototab[2][i] = 0;
591: }
592: p++;
593: }
594: return (0);
595: }
596:
1.11 millert 597: Node *reparse(const char *p) /* parses regular expression pointed to by p */
1.1 tholo 598: { /* uses relex() to scan regular expression */
599: Node *np;
600:
601: dprintf( ("reparse <%s>\n", p) );
1.10 millert 602: lastre = prestr = (uschar *) p; /* prestr points to string to be parsed */
1.1 tholo 603: rtok = relex();
1.11 millert 604: /* GNU compatibility: an empty regexp matches anything */
1.1 tholo 605: if (rtok == '\0')
1.11 millert 606: /* FATAL("empty regular expression"); previous */
607: return(op2(ALL, NIL, NIL));
1.1 tholo 608: np = regexp();
609: if (rtok != '\0')
1.8 millert 610: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 611: return(np);
612: }
613:
614: Node *regexp(void) /* top-level parse of reg expr */
615: {
616: return (alt(concat(primary())));
617: }
618:
619: Node *primary(void)
620: {
621: Node *np;
622:
623: switch (rtok) {
624: case CHAR:
1.7 millert 625: np = op2(CHAR, NIL, itonp(rlxval));
1.1 tholo 626: rtok = relex();
627: return (unary(np));
628: case ALL:
629: rtok = relex();
630: return (unary(op2(ALL, NIL, NIL)));
631: case DOT:
632: rtok = relex();
633: return (unary(op2(DOT, NIL, NIL)));
634: case CCL:
1.10 millert 635: np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
1.1 tholo 636: rtok = relex();
637: return (unary(np));
638: case NCCL:
1.10 millert 639: np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
1.1 tholo 640: rtok = relex();
641: return (unary(np));
642: case '^':
643: rtok = relex();
1.7 millert 644: return (unary(op2(CHAR, NIL, itonp(HAT))));
1.1 tholo 645: case '$':
646: rtok = relex();
647: return (unary(op2(CHAR, NIL, NIL)));
648: case '(':
649: rtok = relex();
650: if (rtok == ')') { /* special pleading for () */
651: rtok = relex();
652: return unary(op2(CCL, NIL, (Node *) tostring("")));
653: }
654: np = regexp();
655: if (rtok == ')') {
656: rtok = relex();
657: return (unary(np));
658: }
659: else
1.8 millert 660: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 661: default:
1.8 millert 662: FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1.1 tholo 663: }
664: return 0; /*NOTREACHED*/
665: }
666:
667: Node *concat(Node *np)
668: {
669: switch (rtok) {
670: case CHAR: case DOT: case ALL: case CCL: case NCCL: case '$': case '(':
671: return (concat(op2(CAT, np, primary())));
672: }
673: return (np);
674: }
675:
676: Node *alt(Node *np)
677: {
678: if (rtok == OR) {
679: rtok = relex();
680: return (alt(op2(OR, np, concat(primary()))));
681: }
682: return (np);
683: }
684:
685: Node *unary(Node *np)
686: {
687: switch (rtok) {
688: case STAR:
689: rtok = relex();
690: return (unary(op2(STAR, np, NIL)));
691: case PLUS:
692: rtok = relex();
693: return (unary(op2(PLUS, np, NIL)));
694: case QUEST:
695: rtok = relex();
696: return (unary(op2(QUEST, np, NIL)));
697: default:
698: return (np);
699: }
700: }
701:
1.11 millert 702: /*
703: * Character class definitions conformant to the POSIX locale as
704: * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
705: * and operating character sets are both ASCII (ISO646) or supersets
706: * thereof.
707: *
708: * Note that to avoid overflowing the temporary buffer used in
709: * relex(), the expanded character class (prior to range expansion)
710: * must be less than twice the size of their full name.
711: */
1.12 millert 712:
713: /* Because isblank doesn't show up in any of the header files on any
714: * system i use, it's defined here. if some other locale has a richer
715: * definition of "blank", define HAS_ISBLANK and provide your own
716: * version.
717: * the parentheses here are an attempt to find a path through the maze
718: * of macro definition and/or function and/or version provided. thanks
719: * to nelson beebe for the suggestion; let's see if it works everywhere.
720: */
721:
722: #ifndef HAS_ISBLANK
723:
724: int (isblank)(int c)
725: {
726: return c==' ' || c=='\t';
727: }
728:
729: #endif
730:
1.11 millert 731: struct charclass {
732: const char *cc_name;
733: int cc_namelen;
1.12 millert 734: int (*cc_func)(int);
1.11 millert 735: } charclasses[] = {
1.12 millert 736: { "alnum", 5, isalnum },
737: { "alpha", 5, isalpha },
738: { "blank", 5, isblank },
739: { "cntrl", 5, iscntrl },
740: { "digit", 5, isdigit },
741: { "graph", 5, isgraph },
742: { "lower", 5, islower },
743: { "print", 5, isprint },
744: { "punct", 5, ispunct },
745: { "space", 5, isspace },
746: { "upper", 5, isupper },
747: { "xdigit", 6, isxdigit },
1.11 millert 748: { NULL, 0, NULL },
749: };
750:
751:
1.1 tholo 752: int relex(void) /* lexical analyzer for reparse */
753: {
1.5 kstailey 754: int c, n;
1.1 tholo 755: int cflag;
1.10 millert 756: static uschar *buf = 0;
1.5 kstailey 757: static int bufsz = 100;
1.10 millert 758: uschar *bp;
1.11 millert 759: struct charclass *cc;
1.12 millert 760: int i;
1.1 tholo 761:
762: switch (c = *prestr++) {
763: case '|': return OR;
764: case '*': return STAR;
765: case '+': return PLUS;
766: case '?': return QUEST;
767: case '.': return DOT;
768: case '\0': prestr--; return '\0';
769: case '^':
770: case '$':
771: case '(':
772: case ')':
773: return c;
774: case '\\':
1.10 millert 775: rlxval = quoted((char **) &prestr);
1.1 tholo 776: return CHAR;
777: default:
778: rlxval = c;
779: return CHAR;
780: case '[':
1.10 millert 781: if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
1.8 millert 782: FATAL("out of space in reg expr %.10s..", lastre);
1.5 kstailey 783: bp = buf;
1.1 tholo 784: if (*prestr == '^') {
785: cflag = 1;
786: prestr++;
787: }
788: else
789: cflag = 0;
1.11 millert 790: n = 2 * strlen((const char *) prestr)+1;
1.10 millert 791: if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, 0))
1.8 millert 792: FATAL("out of space for reg expr %.10s...", lastre);
1.1 tholo 793: for (; ; ) {
794: if ((c = *prestr++) == '\\') {
1.5 kstailey 795: *bp++ = '\\';
1.1 tholo 796: if ((c = *prestr++) == '\0')
1.8 millert 797: FATAL("nonterminated character class %.20s...", lastre);
1.5 kstailey 798: *bp++ = c;
1.10 millert 799: /* } else if (c == '\n') { */
800: /* FATAL("newline in character class %.20s...", lastre); */
1.11 millert 801: } else if (c == '[' && *prestr == ':') {
802: /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
803: for (cc = charclasses; cc->cc_name; cc++)
804: if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
805: break;
806: if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
807: prestr[2 + cc->cc_namelen] == ']') {
808: prestr += cc->cc_namelen + 3;
1.12 millert 809: for (i = 0; i < NCHARS; i++) {
810: if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, 0))
811: FATAL("out of space for reg expr %.10s...", lastre);
812: if (cc->cc_func(i)) {
813: *bp++ = i;
814: n++;
815: }
816: }
1.11 millert 817: } else
818: *bp++ = c;
1.1 tholo 819: } else if (c == '\0') {
1.8 millert 820: FATAL("nonterminated character class %.20s", lastre);
1.5 kstailey 821: } else if (bp == buf) { /* 1st char is special */
822: *bp++ = c;
1.1 tholo 823: } else if (c == ']') {
1.5 kstailey 824: *bp++ = 0;
1.10 millert 825: rlxstr = (uschar *) tostring((char *) buf);
1.1 tholo 826: if (cflag == 0)
827: return CCL;
828: else
829: return NCCL;
830: } else
1.5 kstailey 831: *bp++ = c;
1.1 tholo 832: }
833: }
834: }
835:
836: int cgoto(fa *f, int s, int c)
837: {
838: int i, j, k;
1.4 millert 839: int *p, *q;
1.1 tholo 840:
1.12 millert 841: assert(c == HAT || c < NCHARS);
1.1 tholo 842: while (f->accept >= maxsetvec) { /* guessing here! */
843: maxsetvec *= 4;
844: setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
845: tmpset = (int *) realloc(tmpset, maxsetvec * sizeof(int));
1.7 millert 846: if (setvec == 0 || tmpset == 0)
1.1 tholo 847: overflo("out of space in cgoto()");
848: }
849: for (i = 0; i <= f->accept; i++)
850: setvec[i] = 0;
851: setcnt = 0;
852: /* compute positions of gototab[s,c] into setvec */
853: p = f->posns[s];
854: for (i = 1; i <= *p; i++) {
855: if ((k = f->re[p[i]].ltype) != FINAL) {
1.7 millert 856: if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1.1 tholo 857: || (k == DOT && c != 0 && c != HAT)
858: || (k == ALL && c != 0)
1.10 millert 859: || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
860: || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
1.1 tholo 861: q = f->re[p[i]].lfollow;
862: for (j = 1; j <= *q; j++) {
863: if (q[j] >= maxsetvec) {
864: maxsetvec *= 4;
865: setvec = (int *) realloc(setvec, maxsetvec * sizeof(int));
866: tmpset = (int *) realloc(setvec, maxsetvec * sizeof(int));
867: if (setvec == 0 || tmpset == 0)
868: overflo("cgoto overflow");
869: }
870: if (setvec[q[j]] == 0) {
871: setcnt++;
872: setvec[q[j]] = 1;
873: }
874: }
875: }
876: }
877: }
878: /* determine if setvec is a previous state */
879: tmpset[0] = setcnt;
880: j = 1;
881: for (i = f->accept; i >= 0; i--)
882: if (setvec[i]) {
883: tmpset[j++] = i;
884: }
885: /* tmpset == previous state? */
886: for (i = 1; i <= f->curstat; i++) {
887: p = f->posns[i];
888: if ((k = tmpset[0]) != p[0])
889: goto different;
890: for (j = 1; j <= k; j++)
891: if (tmpset[j] != p[j])
892: goto different;
893: /* setvec is state i */
894: f->gototab[s][c] = i;
895: return i;
896: different:;
897: }
898:
899: /* add tmpset to current set of states */
900: if (f->curstat >= NSTATES-1) {
901: f->curstat = 2;
902: f->reset = 1;
903: for (i = 2; i < NSTATES; i++)
904: xfree(f->posns[i]);
905: } else
906: ++(f->curstat);
907: for (i = 0; i < NCHARS; i++)
908: f->gototab[f->curstat][i] = 0;
909: xfree(f->posns[f->curstat]);
1.13 ! pvalchev 910: if ((p = (int *) calloc(setcnt+1, sizeof(int))) == NULL)
1.1 tholo 911: overflo("out of space in cgoto");
912:
913: f->posns[f->curstat] = p;
914: f->gototab[s][c] = f->curstat;
915: for (i = 0; i <= setcnt; i++)
916: p[i] = tmpset[i];
917: if (setvec[f->accept])
918: f->out[f->curstat] = 1;
919: else
920: f->out[f->curstat] = 0;
921: return f->curstat;
922: }
923:
924:
925: void freefa(fa *f) /* free a finite automaton */
926: {
927: int i;
928:
929: if (f == NULL)
930: return;
931: for (i = 0; i <= f->curstat; i++)
932: xfree(f->posns[i]);
933: for (i = 0; i <= f->accept; i++) {
934: xfree(f->re[i].lfollow);
935: if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
936: xfree((f->re[i].lval.np));
937: }
938: xfree(f->restr);
939: xfree(f);
940: }