Annotation of src/usr.bin/awk/b.c, Revision 1.22
1.22 ! millert 1: /* $OpenBSD: b.c,v 1.21 2020/06/10 21:00:01 millert Exp $ */
1.1 tholo 2: /****************************************************************
1.5 kstailey 3: Copyright (C) Lucent Technologies 1997
1.1 tholo 4: All Rights Reserved
5:
6: Permission to use, copy, modify, and distribute this software and
7: its documentation for any purpose and without fee is hereby
8: granted, provided that the above copyright notice appear in all
9: copies and that both that the copyright notice and this
10: permission notice and warranty disclaimer appear in supporting
1.5 kstailey 11: documentation, and that the name Lucent Technologies or any of
12: its entities not be used in advertising or publicity pertaining
13: to distribution of the software without specific, written prior
14: permission.
15:
16: LUCENT DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
17: INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS.
18: IN NO EVENT SHALL LUCENT OR ANY OF ITS ENTITIES BE LIABLE FOR ANY
19: SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20: WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
21: IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION,
22: ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF
23: THIS SOFTWARE.
1.1 tholo 24: ****************************************************************/
25:
1.15 millert 26: /* lasciate ogne speranza, voi ch'intrate. */
1.1 tholo 27:
28: #define DEBUG
29:
30: #include <ctype.h>
31: #include <stdio.h>
32: #include <string.h>
33: #include <stdlib.h>
34: #include "awk.h"
1.5 kstailey 35: #include "ytab.h"
1.1 tholo 36:
1.12 millert 37: #define HAT (NCHARS+2) /* matches ^ in regular expr */
1.1 tholo 38: /* NCHARS is 2**n */
39: #define MAXLIN 22
40:
1.7 millert 41: #define type(v) (v)->nobj /* badly overloaded here */
42: #define info(v) (v)->ntype /* badly overloaded here */
1.1 tholo 43: #define left(v) (v)->narg[0]
44: #define right(v) (v)->narg[1]
45: #define parent(v) (v)->nnext
46:
47: #define LEAF case CCL: case NCCL: case CHAR: case DOT: case FINAL: case ALL:
1.15 millert 48: #define ELEAF case EMPTYRE: /* empty string in regexp */
1.1 tholo 49: #define UNARY case STAR: case PLUS: case QUEST:
50:
51: /* encoding in tree Nodes:
1.15 millert 52: leaf (CCL, NCCL, CHAR, DOT, FINAL, ALL, EMPTYRE):
1.1 tholo 53: left is index, right contains value or pointer to value
54: unary (STAR, PLUS, QUEST): left is child, right is null
55: binary (CAT, OR): left and right are children
56: parent contains pointer to parent
57: */
58:
59:
60: int *setvec;
61: int *tmpset;
62: int maxsetvec = 0;
63:
64: int rtok; /* next token in current re */
1.4 millert 65: int rlxval;
1.10 millert 66: static uschar *rlxstr;
67: static uschar *prestr; /* current position in current re */
68: static uschar *lastre; /* origin of last re */
1.1 tholo 69:
1.4 millert 70: static int setcnt;
71: static int poscnt;
1.1 tholo 72:
73: char *patbeg;
74: int patlen;
75:
76: #define NFA 20 /* cache this many dynamic fa's */
77: fa *fatab[NFA];
78: int nfatab = 0; /* entries in fatab */
79:
1.11 millert 80: fa *makedfa(const char *s, int anchor) /* returns dfa for reg expr s */
1.1 tholo 81: {
82: int i, use, nuse;
83: fa *pfa;
1.6 millert 84: static int now = 1;
1.1 tholo 85:
86: if (setvec == 0) { /* first time through any RE */
87: maxsetvec = MAXLIN;
1.14 deraadt 88: setvec = (int *) calloc(maxsetvec, sizeof(int));
89: tmpset = (int *) calloc(maxsetvec, sizeof(int));
1.1 tholo 90: if (setvec == 0 || tmpset == 0)
91: overflo("out of space initializing makedfa");
92: }
93:
94: if (compile_time) /* a constant for sure */
95: return mkdfa(s, anchor);
96: for (i = 0; i < nfatab; i++) /* is it there already? */
97: if (fatab[i]->anchor == anchor
1.11 millert 98: && strcmp((const char *) fatab[i]->restr, s) == 0) {
1.6 millert 99: fatab[i]->use = now++;
1.1 tholo 100: return fatab[i];
1.10 millert 101: }
1.1 tholo 102: pfa = mkdfa(s, anchor);
103: if (nfatab < NFA) { /* room for another */
104: fatab[nfatab] = pfa;
1.6 millert 105: fatab[nfatab]->use = now++;
1.1 tholo 106: nfatab++;
107: return pfa;
108: }
109: use = fatab[0]->use; /* replace least-recently used */
110: nuse = 0;
111: for (i = 1; i < nfatab; i++)
112: if (fatab[i]->use < use) {
113: use = fatab[i]->use;
114: nuse = i;
115: }
116: freefa(fatab[nuse]);
117: fatab[nuse] = pfa;
1.6 millert 118: pfa->use = now++;
1.1 tholo 119: return pfa;
120: }
121:
1.11 millert 122: fa *mkdfa(const char *s, int anchor) /* does the real work of making a dfa */
1.1 tholo 123: /* anchor = 1 for anchored matches, else 0 */
124: {
125: Node *p, *p1;
126: fa *f;
127:
128: p = reparse(s);
129: p1 = op2(CAT, op2(STAR, op2(ALL, NIL, NIL), NIL), p);
130: /* put ALL STAR in front of reg. exp. */
131: p1 = op2(CAT, p1, op2(FINAL, NIL, NIL));
132: /* put FINAL after reg. exp. */
133:
134: poscnt = 0;
135: penter(p1); /* enter parent pointers and leaf indices */
136: if ((f = (fa *) calloc(1, sizeof(fa) + poscnt*sizeof(rrow))) == NULL)
137: overflo("out of space for fa");
138: f->accept = poscnt-1; /* penter has computed number of positions in re */
139: cfoll(f, p1); /* set up follow sets */
140: freetr(p1);
1.13 pvalchev 141: if ((f->posns[0] = (int *) calloc(*(f->re[0].lfollow), sizeof(int))) == NULL)
1.1 tholo 142: overflo("out of space in makedfa");
1.4 millert 143: if ((f->posns[1] = (int *) calloc(1, sizeof(int))) == NULL)
1.1 tholo 144: overflo("out of space in makedfa");
145: *f->posns[1] = 0;
146: f->initstat = makeinit(f, anchor);
147: f->anchor = anchor;
1.10 millert 148: f->restr = (uschar *) tostring(s);
1.1 tholo 149: return f;
150: }
151:
152: int makeinit(fa *f, int anchor)
153: {
154: int i, k;
155:
156: f->curstat = 2;
157: f->out[2] = 0;
158: f->reset = 0;
159: k = *(f->re[0].lfollow);
160: xfree(f->posns[2]);
1.13 pvalchev 161: if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL)
1.1 tholo 162: overflo("out of space in makeinit");
163: for (i=0; i <= k; i++) {
164: (f->posns[2])[i] = (f->re[0].lfollow)[i];
165: }
166: if ((f->posns[2])[1] == f->accept)
167: f->out[2] = 1;
168: for (i=0; i < NCHARS; i++)
169: f->gototab[2][i] = 0;
170: f->curstat = cgoto(f, 2, HAT);
171: if (anchor) {
172: *f->posns[2] = k-1; /* leave out position 0 */
173: for (i=0; i < k; i++) {
174: (f->posns[0])[i] = (f->posns[2])[i];
175: }
176:
177: f->out[0] = f->out[2];
178: if (f->curstat != 2)
179: --(*f->posns[f->curstat]);
180: }
181: return f->curstat;
182: }
183:
184: void penter(Node *p) /* set up parent pointers and leaf indices */
185: {
186: switch (type(p)) {
1.15 millert 187: ELEAF
1.1 tholo 188: LEAF
1.7 millert 189: info(p) = poscnt;
1.1 tholo 190: poscnt++;
191: break;
192: UNARY
193: penter(left(p));
194: parent(left(p)) = p;
195: break;
196: case CAT:
197: case OR:
198: penter(left(p));
199: penter(right(p));
200: parent(left(p)) = p;
201: parent(right(p)) = p;
202: break;
203: default: /* can't happen */
1.8 millert 204: FATAL("can't happen: unknown type %d in penter", type(p));
1.1 tholo 205: break;
206: }
207: }
208:
209: void freetr(Node *p) /* free parse tree */
210: {
211: switch (type(p)) {
1.15 millert 212: ELEAF
1.1 tholo 213: LEAF
214: xfree(p);
215: break;
216: UNARY
217: freetr(left(p));
218: xfree(p);
219: break;
220: case CAT:
221: case OR:
222: freetr(left(p));
223: freetr(right(p));
224: xfree(p);
225: break;
226: default: /* can't happen */
1.8 millert 227: FATAL("can't happen: unknown type %d in freetr", type(p));
1.1 tholo 228: break;
229: }
230: }
231:
232: /* in the parsing of regular expressions, metacharacters like . have */
233: /* to be seen literally; \056 is not a metacharacter. */
234:
1.17 millert 235: int hexstr(uschar **pp) /* find and eval hex string at pp, return new p */
1.2 millert 236: { /* only pick up one 8-bit byte (2 chars) */
1.10 millert 237: uschar *p;
1.1 tholo 238: int n = 0;
1.2 millert 239: int i;
1.1 tholo 240:
1.10 millert 241: for (i = 0, p = (uschar *) *pp; i < 2 && isxdigit(*p); i++, p++) {
1.1 tholo 242: if (isdigit(*p))
243: n = 16 * n + *p - '0';
244: else if (*p >= 'a' && *p <= 'f')
245: n = 16 * n + *p - 'a' + 10;
246: else if (*p >= 'A' && *p <= 'F')
247: n = 16 * n + *p - 'A' + 10;
248: }
1.17 millert 249: *pp = (uschar *) p;
1.1 tholo 250: return n;
251: }
252:
1.2 millert 253: #define isoctdigit(c) ((c) >= '0' && (c) <= '7') /* multiple use of arg */
1.1 tholo 254:
1.17 millert 255: int quoted(uschar **pp) /* pick up next thing after a \\ */
1.1 tholo 256: /* and increment *pp */
257: {
1.17 millert 258: uschar *p = *pp;
1.1 tholo 259: int c;
260:
261: if ((c = *p++) == 't')
262: c = '\t';
1.20 millert 263: else if (c == 'v')
264: c = '\v';
1.1 tholo 265: else if (c == 'n')
266: c = '\n';
267: else if (c == 'f')
268: c = '\f';
269: else if (c == 'r')
270: c = '\r';
271: else if (c == 'b')
272: c = '\b';
1.20 millert 273: else if (c == 'a')
274: c = '\007';
1.1 tholo 275: else if (c == '\\')
276: c = '\\';
277: else if (c == 'x') { /* hexadecimal goo follows */
278: c = hexstr(&p); /* this adds a null if number is invalid */
279: } else if (isoctdigit(c)) { /* \d \dd \ddd */
280: int n = c - '0';
281: if (isoctdigit(*p)) {
282: n = 8 * n + *p++ - '0';
283: if (isoctdigit(*p))
284: n = 8 * n + *p++ - '0';
285: }
286: c = n;
287: } /* else */
288: /* c = c; */
289: *pp = p;
290: return c;
291: }
292:
1.11 millert 293: char *cclenter(const char *argp) /* add a character class */
1.1 tholo 294: {
295: int i, c, c2;
1.10 millert 296: uschar *p = (uschar *) argp;
297: uschar *op, *bp;
298: static uschar *buf = 0;
1.5 kstailey 299: static int bufsz = 100;
1.1 tholo 300:
301: op = p;
1.10 millert 302: if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
1.8 millert 303: FATAL("out of space for character class [%.10s...] 1", p);
1.5 kstailey 304: bp = buf;
305: for (i = 0; (c = *p++) != 0; ) {
1.1 tholo 306: if (c == '\\') {
1.17 millert 307: c = quoted(&p);
1.5 kstailey 308: } else if (c == '-' && i > 0 && bp[-1] != 0) {
1.1 tholo 309: if (*p != 0) {
1.5 kstailey 310: c = bp[-1];
1.1 tholo 311: c2 = *p++;
312: if (c2 == '\\')
1.17 millert 313: c2 = quoted(&p);
1.1 tholo 314: if (c > c2) { /* empty; ignore */
1.5 kstailey 315: bp--;
1.1 tholo 316: i--;
317: continue;
318: }
319: while (c < c2) {
1.15 millert 320: if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter1"))
1.8 millert 321: FATAL("out of space for character class [%.10s...] 2", p);
1.5 kstailey 322: *bp++ = ++c;
1.1 tholo 323: i++;
324: }
325: continue;
326: }
327: }
1.15 millert 328: if (!adjbuf((char **) &buf, &bufsz, bp-buf+2, 100, (char **) &bp, "cclenter2"))
1.8 millert 329: FATAL("out of space for character class [%.10s...] 3", p);
1.5 kstailey 330: *bp++ = c;
1.1 tholo 331: i++;
332: }
1.5 kstailey 333: *bp = 0;
1.19 deraadt 334: DPRINTF( ("cclenter: in = |%s|, out = |%s|\n", op, buf) );
1.1 tholo 335: xfree(op);
1.10 millert 336: return (char *) tostring((char *) buf);
1.1 tholo 337: }
338:
1.11 millert 339: void overflo(const char *s)
1.1 tholo 340: {
1.8 millert 341: FATAL("regular expression too big: %.30s...", s);
1.1 tholo 342: }
343:
344: void cfoll(fa *f, Node *v) /* enter follow set of each leaf of vertex v into lfollow[leaf] */
345: {
346: int i;
1.4 millert 347: int *p;
1.1 tholo 348:
349: switch (type(v)) {
1.15 millert 350: ELEAF
1.1 tholo 351: LEAF
1.7 millert 352: f->re[info(v)].ltype = type(v);
353: f->re[info(v)].lval.np = right(v);
1.1 tholo 354: while (f->accept >= maxsetvec) { /* guessing here! */
1.18 deraadt 355: setvec = reallocarray(setvec, maxsetvec,
356: 4 * sizeof(int));
357: tmpset = reallocarray(tmpset, maxsetvec,
358: 4 * sizeof(int));
1.5 kstailey 359: if (setvec == 0 || tmpset == 0)
1.1 tholo 360: overflo("out of space in cfoll()");
1.18 deraadt 361: maxsetvec *= 4;
1.1 tholo 362: }
363: for (i = 0; i <= f->accept; i++)
364: setvec[i] = 0;
365: setcnt = 0;
366: follow(v); /* computes setvec and setcnt */
1.13 pvalchev 367: if ((p = (int *) calloc(setcnt+1, sizeof(int))) == NULL)
1.1 tholo 368: overflo("out of space building follow set");
1.7 millert 369: f->re[info(v)].lfollow = p;
1.1 tholo 370: *p = setcnt;
371: for (i = f->accept; i >= 0; i--)
372: if (setvec[i] == 1)
373: *++p = i;
374: break;
375: UNARY
376: cfoll(f,left(v));
377: break;
378: case CAT:
379: case OR:
380: cfoll(f,left(v));
381: cfoll(f,right(v));
382: break;
383: default: /* can't happen */
1.8 millert 384: FATAL("can't happen: unknown type %d in cfoll", type(v));
1.1 tholo 385: }
386: }
387:
388: int first(Node *p) /* collects initially active leaves of p into setvec */
1.15 millert 389: /* returns 0 if p matches empty string */
1.1 tholo 390: {
1.4 millert 391: int b, lp;
1.1 tholo 392:
393: switch (type(p)) {
1.15 millert 394: ELEAF
1.1 tholo 395: LEAF
1.7 millert 396: lp = info(p); /* look for high-water mark of subscripts */
1.1 tholo 397: while (setcnt >= maxsetvec || lp >= maxsetvec) { /* guessing here! */
1.18 deraadt 398: setvec = reallocarray(setvec, maxsetvec,
399: 4 * sizeof(int));
400: tmpset = reallocarray(tmpset, maxsetvec,
401: 4 * sizeof(int));
1.7 millert 402: if (setvec == 0 || tmpset == 0)
1.1 tholo 403: overflo("out of space in first()");
1.18 deraadt 404: maxsetvec *= 4;
1.1 tholo 405: }
1.15 millert 406: if (type(p) == EMPTYRE) {
407: setvec[lp] = 0;
408: return(0);
409: }
1.1 tholo 410: if (setvec[lp] != 1) {
411: setvec[lp] = 1;
412: setcnt++;
413: }
414: if (type(p) == CCL && (*(char *) right(p)) == '\0')
415: return(0); /* empty CCL */
416: else return(1);
417: case PLUS:
418: if (first(left(p)) == 0) return(0);
419: return(1);
420: case STAR:
421: case QUEST:
422: first(left(p));
423: return(0);
424: case CAT:
425: if (first(left(p)) == 0 && first(right(p)) == 0) return(0);
426: return(1);
427: case OR:
428: b = first(right(p));
429: if (first(left(p)) == 0 || b == 0) return(0);
430: return(1);
431: }
1.8 millert 432: FATAL("can't happen: unknown type %d in first", type(p)); /* can't happen */
1.1 tholo 433: return(-1);
434: }
435:
436: void follow(Node *v) /* collects leaves that can follow v into setvec */
437: {
438: Node *p;
439:
440: if (type(v) == FINAL)
441: return;
442: p = parent(v);
443: switch (type(p)) {
444: case STAR:
445: case PLUS:
446: first(v);
447: follow(p);
448: return;
449:
450: case OR:
451: case QUEST:
452: follow(p);
453: return;
454:
455: case CAT:
456: if (v == left(p)) { /* v is left child of p */
457: if (first(right(p)) == 0) {
458: follow(p);
459: return;
460: }
461: } else /* v is right child */
462: follow(p);
463: return;
464: }
465: }
466:
1.11 millert 467: int member(int c, const char *sarg) /* is c in s? */
1.1 tholo 468: {
1.10 millert 469: uschar *s = (uschar *) sarg;
470:
1.1 tholo 471: while (*s)
472: if (c == *s++)
473: return(1);
474: return(0);
475: }
476:
1.11 millert 477: int match(fa *f, const char *p0) /* shortest match ? */
1.1 tholo 478: {
479: int s, ns;
480: uschar *p = (uschar *) p0;
481:
482: s = f->reset ? makeinit(f,0) : f->initstat;
483: if (f->out[s])
484: return(1);
485: do {
1.15 millert 486: /* assert(*p < NCHARS); */
1.1 tholo 487: if ((ns = f->gototab[s][*p]) != 0)
488: s = ns;
489: else
490: s = cgoto(f, s, *p);
491: if (f->out[s])
492: return(1);
493: } while (*p++ != 0);
494: return(0);
495: }
496:
1.11 millert 497: int pmatch(fa *f, const char *p0) /* longest match, for sub */
1.1 tholo 498: {
499: int s, ns;
500: uschar *p = (uschar *) p0;
501: uschar *q;
502: int i, k;
503:
1.12 millert 504: /* s = f->reset ? makeinit(f,1) : f->initstat; */
505: if (f->reset) {
506: f->initstat = s = makeinit(f,1);
507: } else {
508: s = f->initstat;
509: }
1.1 tholo 510: patbeg = (char *) p;
511: patlen = -1;
512: do {
513: q = p;
514: do {
515: if (f->out[s]) /* final state */
516: patlen = q-p;
1.15 millert 517: /* assert(*q < NCHARS); */
1.1 tholo 518: if ((ns = f->gototab[s][*q]) != 0)
519: s = ns;
520: else
521: s = cgoto(f, s, *q);
1.9 deraadt 522: if (s == 1) { /* no transition */
1.1 tholo 523: if (patlen >= 0) {
524: patbeg = (char *) p;
525: return(1);
526: }
527: else
528: goto nextin; /* no match */
1.9 deraadt 529: }
1.1 tholo 530: } while (*q++ != 0);
531: if (f->out[s])
532: patlen = q-p-1; /* don't count $ */
533: if (patlen >= 0) {
534: patbeg = (char *) p;
535: return(1);
536: }
537: nextin:
538: s = 2;
539: if (f->reset) {
540: for (i = 2; i <= f->curstat; i++)
541: xfree(f->posns[i]);
542: k = *f->posns[0];
1.13 pvalchev 543: if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL)
1.1 tholo 544: overflo("out of space in pmatch");
545: for (i = 0; i <= k; i++)
546: (f->posns[2])[i] = (f->posns[0])[i];
547: f->initstat = f->curstat = 2;
548: f->out[2] = f->out[0];
549: for (i = 0; i < NCHARS; i++)
550: f->gototab[2][i] = 0;
551: }
552: } while (*p++ != 0);
553: return (0);
554: }
555:
1.11 millert 556: int nematch(fa *f, const char *p0) /* non-empty match, for sub */
1.1 tholo 557: {
558: int s, ns;
559: uschar *p = (uschar *) p0;
560: uschar *q;
561: int i, k;
562:
1.12 millert 563: /* s = f->reset ? makeinit(f,1) : f->initstat; */
564: if (f->reset) {
565: f->initstat = s = makeinit(f,1);
566: } else {
567: s = f->initstat;
568: }
1.1 tholo 569: patlen = -1;
570: while (*p) {
571: q = p;
572: do {
573: if (f->out[s]) /* final state */
574: patlen = q-p;
1.15 millert 575: /* assert(*q < NCHARS); */
1.1 tholo 576: if ((ns = f->gototab[s][*q]) != 0)
577: s = ns;
578: else
579: s = cgoto(f, s, *q);
1.9 deraadt 580: if (s == 1) { /* no transition */
1.1 tholo 581: if (patlen > 0) {
582: patbeg = (char *) p;
583: return(1);
584: } else
585: goto nnextin; /* no nonempty match */
1.9 deraadt 586: }
1.1 tholo 587: } while (*q++ != 0);
588: if (f->out[s])
589: patlen = q-p-1; /* don't count $ */
590: if (patlen > 0 ) {
591: patbeg = (char *) p;
592: return(1);
593: }
594: nnextin:
595: s = 2;
596: if (f->reset) {
597: for (i = 2; i <= f->curstat; i++)
598: xfree(f->posns[i]);
599: k = *f->posns[0];
1.13 pvalchev 600: if ((f->posns[2] = (int *) calloc(k+1, sizeof(int))) == NULL)
1.1 tholo 601: overflo("out of state space");
602: for (i = 0; i <= k; i++)
603: (f->posns[2])[i] = (f->posns[0])[i];
604: f->initstat = f->curstat = 2;
605: f->out[2] = f->out[0];
606: for (i = 0; i < NCHARS; i++)
607: f->gototab[2][i] = 0;
608: }
609: p++;
610: }
611: return (0);
612: }
613:
1.11 millert 614: Node *reparse(const char *p) /* parses regular expression pointed to by p */
1.1 tholo 615: { /* uses relex() to scan regular expression */
616: Node *np;
617:
1.19 deraadt 618: DPRINTF( ("reparse <%s>\n", p) );
1.10 millert 619: lastre = prestr = (uschar *) p; /* prestr points to string to be parsed */
1.1 tholo 620: rtok = relex();
1.11 millert 621: /* GNU compatibility: an empty regexp matches anything */
1.15 millert 622: if (rtok == '\0') {
1.11 millert 623: /* FATAL("empty regular expression"); previous */
1.15 millert 624: return(op2(EMPTYRE, NIL, NIL));
625: }
1.1 tholo 626: np = regexp();
627: if (rtok != '\0')
1.8 millert 628: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 629: return(np);
630: }
631:
632: Node *regexp(void) /* top-level parse of reg expr */
633: {
634: return (alt(concat(primary())));
635: }
636:
637: Node *primary(void)
638: {
639: Node *np;
640:
641: switch (rtok) {
642: case CHAR:
1.7 millert 643: np = op2(CHAR, NIL, itonp(rlxval));
1.1 tholo 644: rtok = relex();
645: return (unary(np));
646: case ALL:
647: rtok = relex();
648: return (unary(op2(ALL, NIL, NIL)));
1.15 millert 649: case EMPTYRE:
650: rtok = relex();
651: return (unary(op2(ALL, NIL, NIL)));
1.1 tholo 652: case DOT:
653: rtok = relex();
654: return (unary(op2(DOT, NIL, NIL)));
655: case CCL:
1.10 millert 656: np = op2(CCL, NIL, (Node*) cclenter((char *) rlxstr));
1.1 tholo 657: rtok = relex();
658: return (unary(np));
659: case NCCL:
1.10 millert 660: np = op2(NCCL, NIL, (Node *) cclenter((char *) rlxstr));
1.1 tholo 661: rtok = relex();
662: return (unary(np));
663: case '^':
664: rtok = relex();
1.7 millert 665: return (unary(op2(CHAR, NIL, itonp(HAT))));
1.1 tholo 666: case '$':
667: rtok = relex();
668: return (unary(op2(CHAR, NIL, NIL)));
669: case '(':
670: rtok = relex();
671: if (rtok == ')') { /* special pleading for () */
672: rtok = relex();
673: return unary(op2(CCL, NIL, (Node *) tostring("")));
674: }
675: np = regexp();
676: if (rtok == ')') {
677: rtok = relex();
678: return (unary(np));
679: }
680: else
1.8 millert 681: FATAL("syntax error in regular expression %s at %s", lastre, prestr);
1.1 tholo 682: default:
1.8 millert 683: FATAL("illegal primary in regular expression %s at %s", lastre, prestr);
1.1 tholo 684: }
685: return 0; /*NOTREACHED*/
686: }
687:
688: Node *concat(Node *np)
689: {
690: switch (rtok) {
1.15 millert 691: case CHAR: case DOT: case ALL: case EMPTYRE: case CCL: case NCCL: case '$': case '(':
1.1 tholo 692: return (concat(op2(CAT, np, primary())));
693: }
694: return (np);
695: }
696:
697: Node *alt(Node *np)
698: {
699: if (rtok == OR) {
700: rtok = relex();
701: return (alt(op2(OR, np, concat(primary()))));
702: }
703: return (np);
704: }
705:
706: Node *unary(Node *np)
707: {
708: switch (rtok) {
709: case STAR:
710: rtok = relex();
711: return (unary(op2(STAR, np, NIL)));
712: case PLUS:
713: rtok = relex();
714: return (unary(op2(PLUS, np, NIL)));
715: case QUEST:
716: rtok = relex();
717: return (unary(op2(QUEST, np, NIL)));
718: default:
719: return (np);
720: }
721: }
722:
1.11 millert 723: /*
724: * Character class definitions conformant to the POSIX locale as
725: * defined in IEEE P1003.1 draft 7 of June 2001, assuming the source
726: * and operating character sets are both ASCII (ISO646) or supersets
727: * thereof.
728: *
729: * Note that to avoid overflowing the temporary buffer used in
730: * relex(), the expanded character class (prior to range expansion)
731: * must be less than twice the size of their full name.
732: */
1.12 millert 733:
734: /* Because isblank doesn't show up in any of the header files on any
735: * system i use, it's defined here. if some other locale has a richer
736: * definition of "blank", define HAS_ISBLANK and provide your own
737: * version.
738: * the parentheses here are an attempt to find a path through the maze
739: * of macro definition and/or function and/or version provided. thanks
740: * to nelson beebe for the suggestion; let's see if it works everywhere.
741: */
742:
743: #ifndef HAS_ISBLANK
744:
1.17 millert 745: int (xisblank)(int c)
1.12 millert 746: {
747: return c==' ' || c=='\t';
748: }
749:
750: #endif
751:
1.11 millert 752: struct charclass {
753: const char *cc_name;
754: int cc_namelen;
1.12 millert 755: int (*cc_func)(int);
1.11 millert 756: } charclasses[] = {
1.12 millert 757: { "alnum", 5, isalnum },
758: { "alpha", 5, isalpha },
1.17 millert 759: #ifndef HAS_ISBLANK
1.21 millert 760: { "blank", 5, xisblank },
1.17 millert 761: #else
1.12 millert 762: { "blank", 5, isblank },
1.17 millert 763: #endif
1.12 millert 764: { "cntrl", 5, iscntrl },
765: { "digit", 5, isdigit },
766: { "graph", 5, isgraph },
767: { "lower", 5, islower },
768: { "print", 5, isprint },
769: { "punct", 5, ispunct },
770: { "space", 5, isspace },
771: { "upper", 5, isupper },
772: { "xdigit", 6, isxdigit },
1.11 millert 773: { NULL, 0, NULL },
774: };
775:
776:
1.1 tholo 777: int relex(void) /* lexical analyzer for reparse */
778: {
1.5 kstailey 779: int c, n;
1.1 tholo 780: int cflag;
1.10 millert 781: static uschar *buf = 0;
1.5 kstailey 782: static int bufsz = 100;
1.10 millert 783: uschar *bp;
1.11 millert 784: struct charclass *cc;
1.12 millert 785: int i;
1.1 tholo 786:
787: switch (c = *prestr++) {
788: case '|': return OR;
789: case '*': return STAR;
790: case '+': return PLUS;
791: case '?': return QUEST;
792: case '.': return DOT;
793: case '\0': prestr--; return '\0';
794: case '^':
795: case '$':
796: case '(':
797: case ')':
798: return c;
799: case '\\':
1.17 millert 800: rlxval = quoted(&prestr);
1.1 tholo 801: return CHAR;
802: default:
803: rlxval = c;
804: return CHAR;
805: case '[':
1.10 millert 806: if (buf == 0 && (buf = (uschar *) malloc(bufsz)) == NULL)
1.8 millert 807: FATAL("out of space in reg expr %.10s..", lastre);
1.5 kstailey 808: bp = buf;
1.1 tholo 809: if (*prestr == '^') {
810: cflag = 1;
811: prestr++;
812: }
813: else
814: cflag = 0;
1.11 millert 815: n = 2 * strlen((const char *) prestr)+1;
1.15 millert 816: if (!adjbuf((char **) &buf, &bufsz, n, n, (char **) &bp, "relex1"))
1.8 millert 817: FATAL("out of space for reg expr %.10s...", lastre);
1.1 tholo 818: for (; ; ) {
819: if ((c = *prestr++) == '\\') {
1.5 kstailey 820: *bp++ = '\\';
1.1 tholo 821: if ((c = *prestr++) == '\0')
1.8 millert 822: FATAL("nonterminated character class %.20s...", lastre);
1.5 kstailey 823: *bp++ = c;
1.10 millert 824: /* } else if (c == '\n') { */
825: /* FATAL("newline in character class %.20s...", lastre); */
1.11 millert 826: } else if (c == '[' && *prestr == ':') {
827: /* POSIX char class names, Dag-Erling Smorgrav, des@ofug.org */
828: for (cc = charclasses; cc->cc_name; cc++)
829: if (strncmp((const char *) prestr + 1, (const char *) cc->cc_name, cc->cc_namelen) == 0)
830: break;
831: if (cc->cc_name != NULL && prestr[1 + cc->cc_namelen] == ':' &&
832: prestr[2 + cc->cc_namelen] == ']') {
833: prestr += cc->cc_namelen + 3;
1.22 ! millert 834: /*
! 835: * BUG: We begin at 1, instead of 0, since we
! 836: * would otherwise prematurely terminate the
! 837: * string for classes like [[:cntrl:]]. This
! 838: * means that we can't match the NUL character,
! 839: * not without first adapting the entire
! 840: * program to track each string's length.
! 841: */
! 842: for (i = 1; i < NCHARS; i++) {
1.15 millert 843: if (!adjbuf((char **) &buf, &bufsz, bp-buf+1, 100, (char **) &bp, "relex2"))
1.12 millert 844: FATAL("out of space for reg expr %.10s...", lastre);
845: if (cc->cc_func(i)) {
846: *bp++ = i;
847: n++;
848: }
849: }
1.11 millert 850: } else
851: *bp++ = c;
1.1 tholo 852: } else if (c == '\0') {
1.8 millert 853: FATAL("nonterminated character class %.20s", lastre);
1.5 kstailey 854: } else if (bp == buf) { /* 1st char is special */
855: *bp++ = c;
1.1 tholo 856: } else if (c == ']') {
1.5 kstailey 857: *bp++ = 0;
1.10 millert 858: rlxstr = (uschar *) tostring((char *) buf);
1.1 tholo 859: if (cflag == 0)
860: return CCL;
861: else
862: return NCCL;
863: } else
1.5 kstailey 864: *bp++ = c;
1.1 tholo 865: }
866: }
867: }
868:
869: int cgoto(fa *f, int s, int c)
870: {
871: int i, j, k;
1.4 millert 872: int *p, *q;
1.1 tholo 873:
1.12 millert 874: assert(c == HAT || c < NCHARS);
1.1 tholo 875: while (f->accept >= maxsetvec) { /* guessing here! */
1.18 deraadt 876: setvec = reallocarray(setvec, maxsetvec, 4 * sizeof(int));
877: tmpset = reallocarray(tmpset, maxsetvec, 4 * sizeof(int));
1.7 millert 878: if (setvec == 0 || tmpset == 0)
1.1 tholo 879: overflo("out of space in cgoto()");
1.18 deraadt 880: maxsetvec *= 4;
1.1 tholo 881: }
882: for (i = 0; i <= f->accept; i++)
883: setvec[i] = 0;
884: setcnt = 0;
885: /* compute positions of gototab[s,c] into setvec */
886: p = f->posns[s];
887: for (i = 1; i <= *p; i++) {
888: if ((k = f->re[p[i]].ltype) != FINAL) {
1.7 millert 889: if ((k == CHAR && c == ptoi(f->re[p[i]].lval.np))
1.1 tholo 890: || (k == DOT && c != 0 && c != HAT)
891: || (k == ALL && c != 0)
1.15 millert 892: || (k == EMPTYRE && c != 0)
1.10 millert 893: || (k == CCL && member(c, (char *) f->re[p[i]].lval.up))
894: || (k == NCCL && !member(c, (char *) f->re[p[i]].lval.up) && c != 0 && c != HAT)) {
1.1 tholo 895: q = f->re[p[i]].lfollow;
896: for (j = 1; j <= *q; j++) {
897: if (q[j] >= maxsetvec) {
1.18 deraadt 898: setvec = reallocarray(setvec,
899: maxsetvec, 4 * sizeof(int));
900: tmpset = reallocarray(tmpset,
901: maxsetvec, 4 * sizeof(int));
1.1 tholo 902: if (setvec == 0 || tmpset == 0)
903: overflo("cgoto overflow");
1.18 deraadt 904: maxsetvec *= 4;
1.1 tholo 905: }
906: if (setvec[q[j]] == 0) {
907: setcnt++;
908: setvec[q[j]] = 1;
909: }
910: }
911: }
912: }
913: }
914: /* determine if setvec is a previous state */
915: tmpset[0] = setcnt;
916: j = 1;
917: for (i = f->accept; i >= 0; i--)
918: if (setvec[i]) {
919: tmpset[j++] = i;
920: }
921: /* tmpset == previous state? */
922: for (i = 1; i <= f->curstat; i++) {
923: p = f->posns[i];
924: if ((k = tmpset[0]) != p[0])
925: goto different;
926: for (j = 1; j <= k; j++)
927: if (tmpset[j] != p[j])
928: goto different;
929: /* setvec is state i */
930: f->gototab[s][c] = i;
931: return i;
932: different:;
933: }
934:
935: /* add tmpset to current set of states */
936: if (f->curstat >= NSTATES-1) {
937: f->curstat = 2;
938: f->reset = 1;
939: for (i = 2; i < NSTATES; i++)
940: xfree(f->posns[i]);
941: } else
942: ++(f->curstat);
943: for (i = 0; i < NCHARS; i++)
944: f->gototab[f->curstat][i] = 0;
945: xfree(f->posns[f->curstat]);
1.13 pvalchev 946: if ((p = (int *) calloc(setcnt+1, sizeof(int))) == NULL)
1.1 tholo 947: overflo("out of space in cgoto");
948:
949: f->posns[f->curstat] = p;
950: f->gototab[s][c] = f->curstat;
951: for (i = 0; i <= setcnt; i++)
952: p[i] = tmpset[i];
953: if (setvec[f->accept])
954: f->out[f->curstat] = 1;
955: else
956: f->out[f->curstat] = 0;
957: return f->curstat;
958: }
959:
960:
961: void freefa(fa *f) /* free a finite automaton */
962: {
963: int i;
964:
965: if (f == NULL)
966: return;
967: for (i = 0; i <= f->curstat; i++)
968: xfree(f->posns[i]);
969: for (i = 0; i <= f->accept; i++) {
970: xfree(f->re[i].lfollow);
971: if (f->re[i].ltype == CCL || f->re[i].ltype == NCCL)
972: xfree((f->re[i].lval.np));
973: }
974: xfree(f->restr);
975: xfree(f);
976: }